Coverage Report

Created: 2025-07-23 06:32

/src/aom/third_party/SVT-AV1/convolve_avx2.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#ifndef THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
13
#define THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
14
15
#include "EbMemory_AVX2.h"
16
#include "EbMemory_SSE4_1.h"
17
#include "synonyms.h"
18
19
#include "aom_dsp/aom_filter.h"
20
#include "aom_dsp/x86/convolve_avx2.h"
21
#include "aom_dsp/x86/mem_sse2.h"
22
23
static inline void populate_coeffs_4tap_avx2(const __m128i coeffs_128,
24
188k
                                             __m256i coeffs[2]) {
25
188k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
26
27
  // coeffs 2 3 2 3 2 3 2 3
28
188k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
29
  // coeffs 4 5 4 5 4 5 4 5
30
188k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
31
188k
}
convolve_2d_avx2.c:populate_coeffs_4tap_avx2
Line
Count
Source
24
42.6k
                                             __m256i coeffs[2]) {
25
42.6k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
26
27
  // coeffs 2 3 2 3 2 3 2 3
28
42.6k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
29
  // coeffs 4 5 4 5 4 5 4 5
30
42.6k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
31
42.6k
}
convolve_avx2.c:populate_coeffs_4tap_avx2
Line
Count
Source
24
146k
                                             __m256i coeffs[2]) {
25
146k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
26
27
  // coeffs 2 3 2 3 2 3 2 3
28
146k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
29
  // coeffs 4 5 4 5 4 5 4 5
30
146k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
31
146k
}
32
33
static inline void populate_coeffs_6tap_avx2(const __m128i coeffs_128,
34
1.13M
                                             __m256i coeffs[3]) {
35
1.13M
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
36
37
  // coeffs 1 2 1 2 1 2 1 2
38
1.13M
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
39
  // coeffs 3 4 3 4 3 4 3 4
40
1.13M
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
41
  // coeffs 5 6 5 6 5 6 5 6
42
1.13M
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
43
1.13M
}
convolve_2d_avx2.c:populate_coeffs_6tap_avx2
Line
Count
Source
34
657k
                                             __m256i coeffs[3]) {
35
657k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
36
37
  // coeffs 1 2 1 2 1 2 1 2
38
657k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
39
  // coeffs 3 4 3 4 3 4 3 4
40
657k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
41
  // coeffs 5 6 5 6 5 6 5 6
42
657k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
43
657k
}
convolve_avx2.c:populate_coeffs_6tap_avx2
Line
Count
Source
34
478k
                                             __m256i coeffs[3]) {
35
478k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
36
37
  // coeffs 1 2 1 2 1 2 1 2
38
478k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
39
  // coeffs 3 4 3 4 3 4 3 4
40
478k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
41
  // coeffs 5 6 5 6 5 6 5 6
42
478k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
43
478k
}
44
45
static inline void populate_coeffs_8tap_avx2(const __m128i coeffs_128,
46
79.8k
                                             __m256i coeffs[4]) {
47
79.8k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
48
49
  // coeffs 0 1 0 1 0 1 0 1
50
79.8k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
51
  // coeffs 2 3 2 3 2 3 2 3
52
79.8k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
53
  // coeffs 4 5 4 5 4 5 4 5
54
79.8k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
55
  // coeffs 6 7 6 7 6 7 6 7
56
79.8k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
57
79.8k
}
convolve_2d_avx2.c:populate_coeffs_8tap_avx2
Line
Count
Source
46
56.4k
                                             __m256i coeffs[4]) {
47
56.4k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
48
49
  // coeffs 0 1 0 1 0 1 0 1
50
56.4k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
51
  // coeffs 2 3 2 3 2 3 2 3
52
56.4k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
53
  // coeffs 4 5 4 5 4 5 4 5
54
56.4k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
55
  // coeffs 6 7 6 7 6 7 6 7
56
56.4k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
57
56.4k
}
convolve_avx2.c:populate_coeffs_8tap_avx2
Line
Count
Source
46
23.3k
                                             __m256i coeffs[4]) {
47
23.3k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
48
49
  // coeffs 0 1 0 1 0 1 0 1
50
23.3k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
51
  // coeffs 2 3 2 3 2 3 2 3
52
23.3k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
53
  // coeffs 4 5 4 5 4 5 4 5
54
23.3k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
55
  // coeffs 6 7 6 7 6 7 6 7
56
23.3k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
57
23.3k
}
58
59
static inline void prepare_half_coeffs_2tap_ssse3(
60
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
61
89.2k
    __m128i *const coeffs /* [1] */) {
62
89.2k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
63
89.2k
      filter_params, subpel_q4 & SUBPEL_MASK);
64
89.2k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
65
66
  // right shift all filter co-efficients by 1 to reduce the bits required.
67
  // This extra right shift will be taken care of at the end while rounding
68
  // the result.
69
  // Since all filter co-efficients are even, this change will not affect the
70
  // end result
71
89.2k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
72
89.2k
                            _mm_set1_epi16((short)0xffff)));
73
74
89.2k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
75
76
  // coeffs 3 4 3 4 3 4 3 4
77
89.2k
  *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
78
89.2k
}
convolve_2d_avx2.c:prepare_half_coeffs_2tap_ssse3
Line
Count
Source
61
59.6k
    __m128i *const coeffs /* [1] */) {
62
59.6k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
63
59.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
64
59.6k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
65
66
  // right shift all filter co-efficients by 1 to reduce the bits required.
67
  // This extra right shift will be taken care of at the end while rounding
68
  // the result.
69
  // Since all filter co-efficients are even, this change will not affect the
70
  // end result
71
59.6k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
72
59.6k
                            _mm_set1_epi16((short)0xffff)));
73
74
59.6k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
75
76
  // coeffs 3 4 3 4 3 4 3 4
77
59.6k
  *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
78
59.6k
}
convolve_avx2.c:prepare_half_coeffs_2tap_ssse3
Line
Count
Source
61
29.5k
    __m128i *const coeffs /* [1] */) {
62
29.5k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
63
29.5k
      filter_params, subpel_q4 & SUBPEL_MASK);
64
29.5k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
65
66
  // right shift all filter co-efficients by 1 to reduce the bits required.
67
  // This extra right shift will be taken care of at the end while rounding
68
  // the result.
69
  // Since all filter co-efficients are even, this change will not affect the
70
  // end result
71
29.5k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
72
29.5k
                            _mm_set1_epi16((short)0xffff)));
73
74
29.5k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
75
76
  // coeffs 3 4 3 4 3 4 3 4
77
29.5k
  *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
78
29.5k
}
79
80
static inline void prepare_half_coeffs_4tap_ssse3(
81
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
82
985k
    __m128i *const coeffs /* [2] */) {
83
985k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
84
985k
      filter_params, subpel_q4 & SUBPEL_MASK);
85
985k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
86
87
  // right shift all filter co-efficients by 1 to reduce the bits required.
88
  // This extra right shift will be taken care of at the end while rounding
89
  // the result.
90
  // Since all filter co-efficients are even, this change will not affect the
91
  // end result
92
985k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
93
985k
                            _mm_set1_epi16((short)0xffff)));
94
95
985k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
96
97
  // coeffs 2 3 2 3 2 3 2 3
98
985k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
99
  // coeffs 4 5 4 5 4 5 4 5
100
985k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
101
985k
}
convolve_2d_avx2.c:prepare_half_coeffs_4tap_ssse3
Line
Count
Source
82
583k
    __m128i *const coeffs /* [2] */) {
83
583k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
84
583k
      filter_params, subpel_q4 & SUBPEL_MASK);
85
583k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
86
87
  // right shift all filter co-efficients by 1 to reduce the bits required.
88
  // This extra right shift will be taken care of at the end while rounding
89
  // the result.
90
  // Since all filter co-efficients are even, this change will not affect the
91
  // end result
92
583k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
93
583k
                            _mm_set1_epi16((short)0xffff)));
94
95
583k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
96
97
  // coeffs 2 3 2 3 2 3 2 3
98
583k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
99
  // coeffs 4 5 4 5 4 5 4 5
100
583k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
101
583k
}
convolve_avx2.c:prepare_half_coeffs_4tap_ssse3
Line
Count
Source
82
402k
    __m128i *const coeffs /* [2] */) {
83
402k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
84
402k
      filter_params, subpel_q4 & SUBPEL_MASK);
85
402k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
86
87
  // right shift all filter co-efficients by 1 to reduce the bits required.
88
  // This extra right shift will be taken care of at the end while rounding
89
  // the result.
90
  // Since all filter co-efficients are even, this change will not affect the
91
  // end result
92
402k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
93
402k
                            _mm_set1_epi16((short)0xffff)));
94
95
402k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
96
97
  // coeffs 2 3 2 3 2 3 2 3
98
402k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
99
  // coeffs 4 5 4 5 4 5 4 5
100
402k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
101
402k
}
102
103
static inline void prepare_half_coeffs_6tap_ssse3(
104
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
105
89.9k
    __m128i *const coeffs /* [3] */) {
106
89.9k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
107
89.9k
      filter_params, subpel_q4 & SUBPEL_MASK);
108
89.9k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
109
110
  // right shift all filter co-efficients by 1 to reduce the bits required.
111
  // This extra right shift will be taken care of at the end while rounding
112
  // the result.
113
  // Since all filter co-efficients are even, this change will not affect the
114
  // end result
115
89.9k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
116
89.9k
                            _mm_set1_epi16((short)0xffff)));
117
118
89.9k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
119
120
  // coeffs 1 2 1 2 1 2 1 2
121
89.9k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
122
  // coeffs 3 4 3 4 3 4 3 4
123
89.9k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
124
  // coeffs 5 6 5 6 5 6 5 6
125
89.9k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0C0Au));
126
89.9k
}
Unexecuted instantiation: convolve_2d_avx2.c:prepare_half_coeffs_6tap_ssse3
convolve_avx2.c:prepare_half_coeffs_6tap_ssse3
Line
Count
Source
105
89.9k
    __m128i *const coeffs /* [3] */) {
106
89.9k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
107
89.9k
      filter_params, subpel_q4 & SUBPEL_MASK);
108
89.9k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
109
110
  // right shift all filter co-efficients by 1 to reduce the bits required.
111
  // This extra right shift will be taken care of at the end while rounding
112
  // the result.
113
  // Since all filter co-efficients are even, this change will not affect the
114
  // end result
115
89.9k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
116
89.9k
                            _mm_set1_epi16((short)0xffff)));
117
118
89.9k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
119
120
  // coeffs 1 2 1 2 1 2 1 2
121
89.9k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
122
  // coeffs 3 4 3 4 3 4 3 4
123
89.9k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
124
  // coeffs 5 6 5 6 5 6 5 6
125
89.9k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0C0Au));
126
89.9k
}
127
128
static inline void prepare_half_coeffs_8tap_ssse3(
129
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
130
5.00k
    __m128i *const coeffs /* [4] */) {
131
5.00k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
132
5.00k
      filter_params, subpel_q4 & SUBPEL_MASK);
133
5.00k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
134
135
  // right shift all filter co-efficients by 1 to reduce the bits required.
136
  // This extra right shift will be taken care of at the end while rounding
137
  // the result.
138
  // Since all filter co-efficients are even, this change will not affect the
139
  // end result
140
5.00k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
141
5.00k
                            _mm_set1_epi16((short)0xffff)));
142
143
5.00k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
144
145
  // coeffs 0 1 0 1 0 1 0 1
146
5.00k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
147
  // coeffs 2 3 2 3 2 3 2 3
148
5.00k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
149
  // coeffs 4 5 4 5 4 5 4 5
150
5.00k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
151
  // coeffs 6 7 6 7 6 7 6 7
152
5.00k
  coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
153
5.00k
}
Unexecuted instantiation: convolve_2d_avx2.c:prepare_half_coeffs_8tap_ssse3
convolve_avx2.c:prepare_half_coeffs_8tap_ssse3
Line
Count
Source
130
5.00k
    __m128i *const coeffs /* [4] */) {
131
5.00k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
132
5.00k
      filter_params, subpel_q4 & SUBPEL_MASK);
133
5.00k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
134
135
  // right shift all filter co-efficients by 1 to reduce the bits required.
136
  // This extra right shift will be taken care of at the end while rounding
137
  // the result.
138
  // Since all filter co-efficients are even, this change will not affect the
139
  // end result
140
5.00k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
141
5.00k
                            _mm_set1_epi16((short)0xffff)));
142
143
5.00k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
144
145
  // coeffs 0 1 0 1 0 1 0 1
146
5.00k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
147
  // coeffs 2 3 2 3 2 3 2 3
148
5.00k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
149
  // coeffs 4 5 4 5 4 5 4 5
150
5.00k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
151
  // coeffs 6 7 6 7 6 7 6 7
152
5.00k
  coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
153
5.00k
}
154
155
static inline void prepare_half_coeffs_2tap_avx2(
156
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
157
24.1k
    __m256i *const coeffs /* [1] */) {
158
24.1k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
159
24.1k
      filter_params, subpel_q4 & SUBPEL_MASK);
160
24.1k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
161
24.1k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
162
163
  // right shift all filter co-efficients by 1 to reduce the bits required.
164
  // This extra right shift will be taken care of at the end while rounding
165
  // the result.
166
  // Since all filter co-efficients are even, this change will not affect the
167
  // end result
168
24.1k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
169
24.1k
                            _mm_set1_epi16((short)0xffff)));
170
171
24.1k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
172
173
  // coeffs 3 4 3 4 3 4 3 4
174
24.1k
  *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
175
24.1k
}
convolve_2d_avx2.c:prepare_half_coeffs_2tap_avx2
Line
Count
Source
157
14.7k
    __m256i *const coeffs /* [1] */) {
158
14.7k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
159
14.7k
      filter_params, subpel_q4 & SUBPEL_MASK);
160
14.7k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
161
14.7k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
162
163
  // right shift all filter co-efficients by 1 to reduce the bits required.
164
  // This extra right shift will be taken care of at the end while rounding
165
  // the result.
166
  // Since all filter co-efficients are even, this change will not affect the
167
  // end result
168
14.7k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
169
14.7k
                            _mm_set1_epi16((short)0xffff)));
170
171
14.7k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
172
173
  // coeffs 3 4 3 4 3 4 3 4
174
14.7k
  *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
175
14.7k
}
convolve_avx2.c:prepare_half_coeffs_2tap_avx2
Line
Count
Source
157
9.37k
    __m256i *const coeffs /* [1] */) {
158
9.37k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
159
9.37k
      filter_params, subpel_q4 & SUBPEL_MASK);
160
9.37k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
161
9.37k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
162
163
  // right shift all filter co-efficients by 1 to reduce the bits required.
164
  // This extra right shift will be taken care of at the end while rounding
165
  // the result.
166
  // Since all filter co-efficients are even, this change will not affect the
167
  // end result
168
9.37k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
169
9.37k
                            _mm_set1_epi16((short)0xffff)));
170
171
9.37k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
172
173
  // coeffs 3 4 3 4 3 4 3 4
174
9.37k
  *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
175
9.37k
}
176
177
static inline void prepare_half_coeffs_4tap_avx2(
178
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
179
188k
    __m256i *const coeffs /* [2] */) {
180
188k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
181
188k
      filter_params, subpel_q4 & SUBPEL_MASK);
182
188k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
183
184
  // right shift all filter co-efficients by 1 to reduce the bits required.
185
  // This extra right shift will be taken care of at the end while rounding
186
  // the result.
187
  // Since all filter co-efficients are even, this change will not affect the
188
  // end result
189
188k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
190
188k
                            _mm_set1_epi16((short)0xffff)));
191
188k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
192
188k
  populate_coeffs_4tap_avx2(coeffs_1, coeffs);
193
188k
}
convolve_2d_avx2.c:prepare_half_coeffs_4tap_avx2
Line
Count
Source
179
42.6k
    __m256i *const coeffs /* [2] */) {
180
42.6k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
181
42.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
182
42.6k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
183
184
  // right shift all filter co-efficients by 1 to reduce the bits required.
185
  // This extra right shift will be taken care of at the end while rounding
186
  // the result.
187
  // Since all filter co-efficients are even, this change will not affect the
188
  // end result
189
42.6k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
190
42.6k
                            _mm_set1_epi16((short)0xffff)));
191
42.6k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
192
42.6k
  populate_coeffs_4tap_avx2(coeffs_1, coeffs);
193
42.6k
}
convolve_avx2.c:prepare_half_coeffs_4tap_avx2
Line
Count
Source
179
146k
    __m256i *const coeffs /* [2] */) {
180
146k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
181
146k
      filter_params, subpel_q4 & SUBPEL_MASK);
182
146k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
183
184
  // right shift all filter co-efficients by 1 to reduce the bits required.
185
  // This extra right shift will be taken care of at the end while rounding
186
  // the result.
187
  // Since all filter co-efficients are even, this change will not affect the
188
  // end result
189
146k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
190
146k
                            _mm_set1_epi16((short)0xffff)));
191
146k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
192
146k
  populate_coeffs_4tap_avx2(coeffs_1, coeffs);
193
146k
}
194
195
static inline void prepare_half_coeffs_6tap_avx2(
196
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
197
1.13M
    __m256i *const coeffs /* [3] */) {
198
1.13M
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
199
1.13M
      filter_params, subpel_q4 & SUBPEL_MASK);
200
1.13M
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
201
202
  // right shift all filter co-efficients by 1 to reduce the bits required.
203
  // This extra right shift will be taken care of at the end while rounding
204
  // the result.
205
  // Since all filter co-efficients are even, this change will not affect the
206
  // end result
207
1.13M
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
208
1.13M
                            _mm_set1_epi16((short)0xffff)));
209
1.13M
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
210
1.13M
  populate_coeffs_6tap_avx2(coeffs_1, coeffs);
211
1.13M
}
convolve_2d_avx2.c:prepare_half_coeffs_6tap_avx2
Line
Count
Source
197
657k
    __m256i *const coeffs /* [3] */) {
198
657k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
199
657k
      filter_params, subpel_q4 & SUBPEL_MASK);
200
657k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
201
202
  // right shift all filter co-efficients by 1 to reduce the bits required.
203
  // This extra right shift will be taken care of at the end while rounding
204
  // the result.
205
  // Since all filter co-efficients are even, this change will not affect the
206
  // end result
207
657k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
208
657k
                            _mm_set1_epi16((short)0xffff)));
209
657k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
210
657k
  populate_coeffs_6tap_avx2(coeffs_1, coeffs);
211
657k
}
convolve_avx2.c:prepare_half_coeffs_6tap_avx2
Line
Count
Source
197
478k
    __m256i *const coeffs /* [3] */) {
198
478k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
199
478k
      filter_params, subpel_q4 & SUBPEL_MASK);
200
478k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
201
202
  // right shift all filter co-efficients by 1 to reduce the bits required.
203
  // This extra right shift will be taken care of at the end while rounding
204
  // the result.
205
  // Since all filter co-efficients are even, this change will not affect the
206
  // end result
207
478k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
208
478k
                            _mm_set1_epi16((short)0xffff)));
209
478k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
210
478k
  populate_coeffs_6tap_avx2(coeffs_1, coeffs);
211
478k
}
212
213
static inline void prepare_half_coeffs_8tap_avx2(
214
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
215
79.8k
    __m256i *const coeffs /* [4] */) {
216
79.8k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
217
79.8k
      filter_params, subpel_q4 & SUBPEL_MASK);
218
79.8k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
219
220
  // right shift all filter co-efficients by 1 to reduce the bits required.
221
  // This extra right shift will be taken care of at the end while rounding
222
  // the result.
223
  // Since all filter co-efficients are even, this change will not affect the
224
  // end result
225
79.8k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
226
79.8k
                            _mm_set1_epi16((short)0xffff)));
227
79.8k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
228
79.8k
  populate_coeffs_8tap_avx2(coeffs_1, coeffs);
229
79.8k
}
convolve_2d_avx2.c:prepare_half_coeffs_8tap_avx2
Line
Count
Source
215
56.4k
    __m256i *const coeffs /* [4] */) {
216
56.4k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
217
56.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
218
56.4k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
219
220
  // right shift all filter co-efficients by 1 to reduce the bits required.
221
  // This extra right shift will be taken care of at the end while rounding
222
  // the result.
223
  // Since all filter co-efficients are even, this change will not affect the
224
  // end result
225
56.4k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
226
56.4k
                            _mm_set1_epi16((short)0xffff)));
227
56.4k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
228
56.4k
  populate_coeffs_8tap_avx2(coeffs_1, coeffs);
229
56.4k
}
convolve_avx2.c:prepare_half_coeffs_8tap_avx2
Line
Count
Source
215
23.3k
    __m256i *const coeffs /* [4] */) {
216
23.3k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
217
23.3k
      filter_params, subpel_q4 & SUBPEL_MASK);
218
23.3k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
219
220
  // right shift all filter co-efficients by 1 to reduce the bits required.
221
  // This extra right shift will be taken care of at the end while rounding
222
  // the result.
223
  // Since all filter co-efficients are even, this change will not affect the
224
  // end result
225
23.3k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
226
23.3k
                            _mm_set1_epi16((short)0xffff)));
227
23.3k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
228
23.3k
  populate_coeffs_8tap_avx2(coeffs_1, coeffs);
229
23.3k
}
230
231
static inline void prepare_coeffs_2tap_sse2(
232
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
233
26.8k
    __m128i *const coeffs /* [1] */) {
234
26.8k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
235
26.8k
      filter_params, subpel_q4 & SUBPEL_MASK);
236
237
26.8k
  const __m128i coeff = _mm_cvtsi32_si128(loadu_int32(filter + 3));
238
239
  // coeffs 3 4 3 4 3 4 3 4
240
26.8k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
241
26.8k
}
convolve_2d_avx2.c:prepare_coeffs_2tap_sse2
Line
Count
Source
233
26.8k
    __m128i *const coeffs /* [1] */) {
234
26.8k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
235
26.8k
      filter_params, subpel_q4 & SUBPEL_MASK);
236
237
26.8k
  const __m128i coeff = _mm_cvtsi32_si128(loadu_int32(filter + 3));
238
239
  // coeffs 3 4 3 4 3 4 3 4
240
26.8k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
241
26.8k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_2tap_sse2
242
243
static inline void prepare_coeffs_4tap_sse2(
244
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
245
67.4k
    __m128i *const coeffs /* [2] */) {
246
67.4k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
247
67.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
248
249
67.4k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
250
251
  // coeffs 2 3 2 3 2 3 2 3
252
67.4k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x55);
253
  // coeffs 4 5 4 5 4 5 4 5
254
67.4k
  coeffs[1] = _mm_shuffle_epi32(coeff, 0xaa);
255
67.4k
}
convolve_2d_avx2.c:prepare_coeffs_4tap_sse2
Line
Count
Source
245
67.4k
    __m128i *const coeffs /* [2] */) {
246
67.4k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
247
67.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
248
249
67.4k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
250
251
  // coeffs 2 3 2 3 2 3 2 3
252
67.4k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x55);
253
  // coeffs 4 5 4 5 4 5 4 5
254
67.4k
  coeffs[1] = _mm_shuffle_epi32(coeff, 0xaa);
255
67.4k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_4tap_sse2
256
257
static inline void prepare_coeffs_6tap_ssse3(
258
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
259
46.6k
    __m128i *const coeffs /* [3] */) {
260
46.6k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
261
46.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
262
46.6k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
263
264
  // coeffs 1 2 1 2 1 2 1 2
265
46.6k
  coeffs[0] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x05040302u));
266
  // coeffs 3 4 3 4 3 4 3 4
267
46.6k
  coeffs[1] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x09080706u));
268
  // coeffs 5 6 5 6 5 6 5 6
269
46.6k
  coeffs[2] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x0D0C0B0Au));
270
46.6k
}
convolve_2d_avx2.c:prepare_coeffs_6tap_ssse3
Line
Count
Source
259
46.6k
    __m128i *const coeffs /* [3] */) {
260
46.6k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
261
46.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
262
46.6k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
263
264
  // coeffs 1 2 1 2 1 2 1 2
265
46.6k
  coeffs[0] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x05040302u));
266
  // coeffs 3 4 3 4 3 4 3 4
267
46.6k
  coeffs[1] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x09080706u));
268
  // coeffs 5 6 5 6 5 6 5 6
269
46.6k
  coeffs[2] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x0D0C0B0Au));
270
46.6k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_6tap_ssse3
271
272
static inline void prepare_coeffs_8tap_sse2(
273
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
274
2.52k
    __m128i *const coeffs /* [4] */) {
275
2.52k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
276
2.52k
      filter_params, subpel_q4 & SUBPEL_MASK);
277
278
2.52k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
279
280
  // coeffs 0 1 0 1 0 1 0 1
281
2.52k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
282
  // coeffs 2 3 2 3 2 3 2 3
283
2.52k
  coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
284
  // coeffs 4 5 4 5 4 5 4 5
285
2.52k
  coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
286
  // coeffs 6 7 6 7 6 7 6 7
287
2.52k
  coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
288
2.52k
}
convolve_2d_avx2.c:prepare_coeffs_8tap_sse2
Line
Count
Source
274
2.52k
    __m128i *const coeffs /* [4] */) {
275
2.52k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
276
2.52k
      filter_params, subpel_q4 & SUBPEL_MASK);
277
278
2.52k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
279
280
  // coeffs 0 1 0 1 0 1 0 1
281
2.52k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
282
  // coeffs 2 3 2 3 2 3 2 3
283
2.52k
  coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
284
  // coeffs 4 5 4 5 4 5 4 5
285
2.52k
  coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
286
  // coeffs 6 7 6 7 6 7 6 7
287
2.52k
  coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
288
2.52k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_8tap_sse2
289
290
static inline void prepare_coeffs_2tap_avx2(
291
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
292
24.2k
    __m256i *const coeffs /* [1] */) {
293
24.2k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
294
24.2k
      filter_params, subpel_q4 & SUBPEL_MASK);
295
296
24.2k
  const __m128i coeff_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
297
24.2k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
298
299
  // coeffs 3 4 3 4 3 4 3 4
300
24.2k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
301
24.2k
}
convolve_2d_avx2.c:prepare_coeffs_2tap_avx2
Line
Count
Source
292
24.2k
    __m256i *const coeffs /* [1] */) {
293
24.2k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
294
24.2k
      filter_params, subpel_q4 & SUBPEL_MASK);
295
296
24.2k
  const __m128i coeff_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
297
24.2k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
298
299
  // coeffs 3 4 3 4 3 4 3 4
300
24.2k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
301
24.2k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_2tap_avx2
302
303
static inline void prepare_coeffs_4tap_avx2(
304
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
305
642k
    __m256i *const coeffs /* [2] */) {
306
642k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
307
642k
      filter_params, subpel_q4 & SUBPEL_MASK);
308
309
642k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
310
642k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
311
312
  // coeffs 2 3 2 3 2 3 2 3
313
642k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
314
  // coeffs 4 5 4 5 4 5 4 5
315
642k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
316
642k
}
convolve_2d_avx2.c:prepare_coeffs_4tap_avx2
Line
Count
Source
305
642k
    __m256i *const coeffs /* [2] */) {
306
642k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
307
642k
      filter_params, subpel_q4 & SUBPEL_MASK);
308
309
642k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
310
642k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
311
312
  // coeffs 2 3 2 3 2 3 2 3
313
642k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
314
  // coeffs 4 5 4 5 4 5 4 5
315
642k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
316
642k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_4tap_avx2
317
318
static inline void prepare_coeffs_6tap_avx2(
319
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
320
530k
    __m256i *const coeffs /* [3]*/) {
321
530k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
322
530k
      filter_params, subpel_q4 & SUBPEL_MASK);
323
530k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
324
530k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeffs_8);
325
326
  // coeffs 1 2 1 2 1 2 1 2
327
530k
  coeffs[0] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x05040302u));
328
  // coeffs 3 4 3 4 3 4 3 4
329
530k
  coeffs[1] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x09080706u));
330
  // coeffs 5 6 5 6 5 6 5 6
331
530k
  coeffs[2] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x0D0C0B0Au));
332
530k
}
convolve_2d_avx2.c:prepare_coeffs_6tap_avx2
Line
Count
Source
320
530k
    __m256i *const coeffs /* [3]*/) {
321
530k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
322
530k
      filter_params, subpel_q4 & SUBPEL_MASK);
323
530k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
324
530k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeffs_8);
325
326
  // coeffs 1 2 1 2 1 2 1 2
327
530k
  coeffs[0] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x05040302u));
328
  // coeffs 3 4 3 4 3 4 3 4
329
530k
  coeffs[1] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x09080706u));
330
  // coeffs 5 6 5 6 5 6 5 6
331
530k
  coeffs[2] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x0D0C0B0Au));
332
530k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_6tap_avx2
333
334
static inline void prepare_coeffs_8tap_avx2(
335
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
336
49.6k
    __m256i *const coeffs /* [4] */) {
337
49.6k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
338
49.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
339
340
49.6k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
341
49.6k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
342
343
  // coeffs 0 1 0 1 0 1 0 1
344
49.6k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
345
  // coeffs 2 3 2 3 2 3 2 3
346
49.6k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
347
  // coeffs 4 5 4 5 4 5 4 5
348
49.6k
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
349
  // coeffs 6 7 6 7 6 7 6 7
350
49.6k
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
351
49.6k
}
convolve_2d_avx2.c:prepare_coeffs_8tap_avx2
Line
Count
Source
336
49.6k
    __m256i *const coeffs /* [4] */) {
337
49.6k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
338
49.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
339
340
49.6k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
341
49.6k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
342
343
  // coeffs 0 1 0 1 0 1 0 1
344
49.6k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
345
  // coeffs 2 3 2 3 2 3 2 3
346
49.6k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
347
  // coeffs 4 5 4 5 4 5 4 5
348
49.6k
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
349
  // coeffs 6 7 6 7 6 7 6 7
350
49.6k
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
351
49.6k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_8tap_avx2
352
353
static inline void load_16bit_5rows_avx2(const int16_t *const src,
354
                                         const ptrdiff_t stride,
355
0
                                         __m256i dst[5]) {
356
0
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
357
0
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
358
0
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
359
0
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
360
0
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
361
0
}
Unexecuted instantiation: convolve_2d_avx2.c:load_16bit_5rows_avx2
Unexecuted instantiation: convolve_avx2.c:load_16bit_5rows_avx2
362
363
static inline void load_16bit_7rows_avx2(const int16_t *const src,
364
                                         const ptrdiff_t stride,
365
74.3k
                                         __m256i dst[7]) {
366
74.3k
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
367
74.3k
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
368
74.3k
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
369
74.3k
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
370
74.3k
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
371
74.3k
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
372
74.3k
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
373
74.3k
}
convolve_2d_avx2.c:load_16bit_7rows_avx2
Line
Count
Source
365
74.3k
                                         __m256i dst[7]) {
366
74.3k
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
367
74.3k
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
368
74.3k
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
369
74.3k
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
370
74.3k
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
371
74.3k
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
372
74.3k
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
373
74.3k
}
Unexecuted instantiation: convolve_avx2.c:load_16bit_7rows_avx2
374
375
static AOM_FORCE_INLINE void load_16bit_8rows_avx2(const int16_t *const src,
376
                                                   const ptrdiff_t stride,
377
418
                                                   __m256i dst[8]) {
378
418
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
379
418
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
380
418
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
381
418
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
382
418
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
383
418
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
384
418
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
385
418
  dst[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
386
418
}
convolve_2d_avx2.c:load_16bit_8rows_avx2
Line
Count
Source
377
418
                                                   __m256i dst[8]) {
378
418
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
379
418
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
380
418
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
381
418
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
382
418
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
383
418
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
384
418
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
385
418
  dst[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
386
418
}
Unexecuted instantiation: convolve_avx2.c:load_16bit_8rows_avx2
387
388
static AOM_FORCE_INLINE void loadu_unpack_16bit_5rows_avx2(
389
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[5],
390
146k
    __m256i ss_256[5], __m256i tt_256[5]) {
391
146k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
392
146k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
393
146k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
394
146k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
395
146k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
396
397
146k
  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
398
146k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
399
146k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
400
146k
  ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
401
402
146k
  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
403
146k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
404
146k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
405
146k
  tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
406
146k
}
convolve_2d_avx2.c:loadu_unpack_16bit_5rows_avx2
Line
Count
Source
390
146k
    __m256i ss_256[5], __m256i tt_256[5]) {
391
146k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
392
146k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
393
146k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
394
146k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
395
146k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
396
397
146k
  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
398
146k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
399
146k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
400
146k
  ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
401
402
146k
  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
403
146k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
404
146k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
405
146k
  tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
406
146k
}
Unexecuted instantiation: convolve_avx2.c:loadu_unpack_16bit_5rows_avx2
407
408
static AOM_FORCE_INLINE void loadu_unpack_16bit_3rows_avx2(
409
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[3],
410
21.3k
    __m256i ss_256[3], __m256i tt_256[3]) {
411
21.3k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
412
21.3k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
413
21.3k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
414
415
21.3k
  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
416
21.3k
  ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
417
418
21.3k
  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
419
21.3k
  tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
420
21.3k
}
convolve_2d_avx2.c:loadu_unpack_16bit_3rows_avx2
Line
Count
Source
410
21.3k
    __m256i ss_256[3], __m256i tt_256[3]) {
411
21.3k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
412
21.3k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
413
21.3k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
414
415
21.3k
  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
416
21.3k
  ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
417
418
21.3k
  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
419
21.3k
  tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
420
21.3k
}
Unexecuted instantiation: convolve_avx2.c:loadu_unpack_16bit_3rows_avx2
421
422
static inline void convolve_8tap_unpack_avx2(const __m256i s[6],
423
150k
                                             __m256i ss[7]) {
424
150k
  ss[0] = _mm256_unpacklo_epi16(s[0], s[1]);
425
150k
  ss[1] = _mm256_unpacklo_epi16(s[2], s[3]);
426
150k
  ss[2] = _mm256_unpacklo_epi16(s[4], s[5]);
427
150k
  ss[4] = _mm256_unpackhi_epi16(s[0], s[1]);
428
150k
  ss[5] = _mm256_unpackhi_epi16(s[2], s[3]);
429
150k
  ss[6] = _mm256_unpackhi_epi16(s[4], s[5]);
430
150k
}
convolve_2d_avx2.c:convolve_8tap_unpack_avx2
Line
Count
Source
423
150k
                                             __m256i ss[7]) {
424
150k
  ss[0] = _mm256_unpacklo_epi16(s[0], s[1]);
425
150k
  ss[1] = _mm256_unpacklo_epi16(s[2], s[3]);
426
150k
  ss[2] = _mm256_unpacklo_epi16(s[4], s[5]);
427
150k
  ss[4] = _mm256_unpackhi_epi16(s[0], s[1]);
428
150k
  ss[5] = _mm256_unpackhi_epi16(s[2], s[3]);
429
150k
  ss[6] = _mm256_unpackhi_epi16(s[4], s[5]);
430
150k
}
Unexecuted instantiation: convolve_avx2.c:convolve_8tap_unpack_avx2
431
432
static inline __m128i convolve_2tap_ssse3(const __m128i ss[1],
433
482k
                                          const __m128i coeffs[1]) {
434
482k
  return _mm_maddubs_epi16(ss[0], coeffs[0]);
435
482k
}
convolve_2d_avx2.c:convolve_2tap_ssse3
Line
Count
Source
433
348k
                                          const __m128i coeffs[1]) {
434
348k
  return _mm_maddubs_epi16(ss[0], coeffs[0]);
435
348k
}
convolve_avx2.c:convolve_2tap_ssse3
Line
Count
Source
433
133k
                                          const __m128i coeffs[1]) {
434
133k
  return _mm_maddubs_epi16(ss[0], coeffs[0]);
435
133k
}
436
437
static inline __m128i convolve_4tap_ssse3(const __m128i ss[2],
438
4.24M
                                          const __m128i coeffs[2]) {
439
4.24M
  const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
440
4.24M
  const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
441
4.24M
  return _mm_add_epi16(res_23, res_45);
442
4.24M
}
convolve_2d_avx2.c:convolve_4tap_ssse3
Line
Count
Source
438
3.18M
                                          const __m128i coeffs[2]) {
439
3.18M
  const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
440
3.18M
  const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
441
3.18M
  return _mm_add_epi16(res_23, res_45);
442
3.18M
}
convolve_avx2.c:convolve_4tap_ssse3
Line
Count
Source
438
1.05M
                                          const __m128i coeffs[2]) {
439
1.05M
  const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
440
1.05M
  const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
441
1.05M
  return _mm_add_epi16(res_23, res_45);
442
1.05M
}
443
444
static inline __m128i convolve_6tap_ssse3(const __m128i ss[3],
445
466k
                                          const __m128i coeffs[3]) {
446
466k
  const __m128i res_12 = _mm_maddubs_epi16(ss[0], coeffs[0]);
447
466k
  const __m128i res_34 = _mm_maddubs_epi16(ss[1], coeffs[1]);
448
466k
  const __m128i res_56 = _mm_maddubs_epi16(ss[2], coeffs[2]);
449
466k
  const __m128i res_1256 = _mm_add_epi16(res_12, res_56);
450
466k
  return _mm_add_epi16(res_1256, res_34);
451
466k
}
Unexecuted instantiation: convolve_2d_avx2.c:convolve_6tap_ssse3
convolve_avx2.c:convolve_6tap_ssse3
Line
Count
Source
445
466k
                                          const __m128i coeffs[3]) {
446
466k
  const __m128i res_12 = _mm_maddubs_epi16(ss[0], coeffs[0]);
447
466k
  const __m128i res_34 = _mm_maddubs_epi16(ss[1], coeffs[1]);
448
466k
  const __m128i res_56 = _mm_maddubs_epi16(ss[2], coeffs[2]);
449
466k
  const __m128i res_1256 = _mm_add_epi16(res_12, res_56);
450
466k
  return _mm_add_epi16(res_1256, res_34);
451
466k
}
452
453
static inline __m128i convolve_8tap_ssse3(const __m128i ss[4],
454
25.5k
                                          const __m128i coeffs[4]) {
455
25.5k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
456
25.5k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
457
25.5k
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
458
25.5k
  const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
459
25.5k
  const __m128i res_0145 = _mm_add_epi16(res_01, res_45);
460
25.5k
  const __m128i res_2367 = _mm_add_epi16(res_23, res_67);
461
25.5k
  return _mm_add_epi16(res_0145, res_2367);
462
25.5k
}
Unexecuted instantiation: convolve_2d_avx2.c:convolve_8tap_ssse3
convolve_avx2.c:convolve_8tap_ssse3
Line
Count
Source
454
25.5k
                                          const __m128i coeffs[4]) {
455
25.5k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
456
25.5k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
457
25.5k
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
458
25.5k
  const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
459
25.5k
  const __m128i res_0145 = _mm_add_epi16(res_01, res_45);
460
25.5k
  const __m128i res_2367 = _mm_add_epi16(res_23, res_67);
461
25.5k
  return _mm_add_epi16(res_0145, res_2367);
462
25.5k
}
463
464
static inline __m256i convolve_2tap_avx2(const __m256i ss[1],
465
1.68M
                                         const __m256i coeffs[1]) {
466
1.68M
  return _mm256_maddubs_epi16(ss[0], coeffs[0]);
467
1.68M
}
convolve_2d_avx2.c:convolve_2tap_avx2
Line
Count
Source
465
956k
                                         const __m256i coeffs[1]) {
466
956k
  return _mm256_maddubs_epi16(ss[0], coeffs[0]);
467
956k
}
convolve_avx2.c:convolve_2tap_avx2
Line
Count
Source
465
724k
                                         const __m256i coeffs[1]) {
466
724k
  return _mm256_maddubs_epi16(ss[0], coeffs[0]);
467
724k
}
468
469
static inline __m256i convolve_4tap_avx2(const __m256i ss[2],
470
2.26M
                                         const __m256i coeffs[2]) {
471
2.26M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
472
2.26M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
473
2.26M
  return _mm256_add_epi16(res_23, res_45);
474
2.26M
}
convolve_2d_avx2.c:convolve_4tap_avx2
Line
Count
Source
470
1.27M
                                         const __m256i coeffs[2]) {
471
1.27M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
472
1.27M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
473
1.27M
  return _mm256_add_epi16(res_23, res_45);
474
1.27M
}
convolve_avx2.c:convolve_4tap_avx2
Line
Count
Source
470
993k
                                         const __m256i coeffs[2]) {
471
993k
  const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
472
993k
  const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
473
993k
  return _mm256_add_epi16(res_23, res_45);
474
993k
}
475
476
static inline __m256i convolve_6tap_avx2(const __m256i ss[3],
477
20.2M
                                         const __m256i coeffs[3]) {
478
20.2M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
479
20.2M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
480
20.2M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
481
20.2M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
482
20.2M
  return _mm256_add_epi16(res_0145, res_23);
483
20.2M
}
convolve_2d_avx2.c:convolve_6tap_avx2
Line
Count
Source
477
11.8M
                                         const __m256i coeffs[3]) {
478
11.8M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
479
11.8M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
480
11.8M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
481
11.8M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
482
11.8M
  return _mm256_add_epi16(res_0145, res_23);
483
11.8M
}
convolve_avx2.c:convolve_6tap_avx2
Line
Count
Source
477
8.35M
                                         const __m256i coeffs[3]) {
478
8.35M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
479
8.35M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
480
8.35M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
481
8.35M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
482
8.35M
  return _mm256_add_epi16(res_0145, res_23);
483
8.35M
}
484
485
static inline __m256i convolve_8tap_avx2(const __m256i ss[4],
486
3.98M
                                         const __m256i coeffs[4]) {
487
3.98M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
488
3.98M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
489
3.98M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
490
3.98M
  const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
491
3.98M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
492
3.98M
  const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
493
3.98M
  return _mm256_add_epi16(res_0145, res_2367);
494
3.98M
}
convolve_2d_avx2.c:convolve_8tap_avx2
Line
Count
Source
486
2.88M
                                         const __m256i coeffs[4]) {
487
2.88M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
488
2.88M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
489
2.88M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
490
2.88M
  const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
491
2.88M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
492
2.88M
  const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
493
2.88M
  return _mm256_add_epi16(res_0145, res_2367);
494
2.88M
}
convolve_avx2.c:convolve_8tap_avx2
Line
Count
Source
486
1.10M
                                         const __m256i coeffs[4]) {
487
1.10M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
488
1.10M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
489
1.10M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
490
1.10M
  const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
491
1.10M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
492
1.10M
  const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
493
1.10M
  return _mm256_add_epi16(res_0145, res_2367);
494
1.10M
}
495
496
static inline __m128i convolve16_2tap_sse2(const __m128i ss[1],
497
127k
                                           const __m128i coeffs[1]) {
498
127k
  return _mm_madd_epi16(ss[0], coeffs[0]);
499
127k
}
convolve_2d_avx2.c:convolve16_2tap_sse2
Line
Count
Source
497
127k
                                           const __m128i coeffs[1]) {
498
127k
  return _mm_madd_epi16(ss[0], coeffs[0]);
499
127k
}
Unexecuted instantiation: convolve_avx2.c:convolve16_2tap_sse2
500
501
static inline __m128i convolve16_4tap_sse2(const __m128i ss[2],
502
115k
                                           const __m128i coeffs[2]) {
503
115k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
504
115k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
505
115k
  return _mm_add_epi32(res_01, res_23);
506
115k
}
convolve_2d_avx2.c:convolve16_4tap_sse2
Line
Count
Source
502
115k
                                           const __m128i coeffs[2]) {
503
115k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
504
115k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
505
115k
  return _mm_add_epi32(res_01, res_23);
506
115k
}
Unexecuted instantiation: convolve_avx2.c:convolve16_4tap_sse2
507
508
static inline __m128i convolve16_6tap_sse2(const __m128i ss[3],
509
186k
                                           const __m128i coeffs[3]) {
510
186k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
511
186k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
512
186k
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
513
186k
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
514
186k
  return _mm_add_epi32(res_0123, res_45);
515
186k
}
convolve_2d_avx2.c:convolve16_6tap_sse2
Line
Count
Source
509
186k
                                           const __m128i coeffs[3]) {
510
186k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
511
186k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
512
186k
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
513
186k
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
514
186k
  return _mm_add_epi32(res_0123, res_45);
515
186k
}
Unexecuted instantiation: convolve_avx2.c:convolve16_6tap_sse2
516
517
static inline __m128i convolve16_8tap_sse2(const __m128i ss[4],
518
10.0k
                                           const __m128i coeffs[4]) {
519
10.0k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
520
10.0k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
521
10.0k
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
522
10.0k
  const __m128i res_67 = _mm_madd_epi16(ss[3], coeffs[3]);
523
10.0k
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
524
10.0k
  const __m128i res_4567 = _mm_add_epi32(res_45, res_67);
525
10.0k
  return _mm_add_epi32(res_0123, res_4567);
526
10.0k
}
convolve_2d_avx2.c:convolve16_8tap_sse2
Line
Count
Source
518
10.0k
                                           const __m128i coeffs[4]) {
519
10.0k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
520
10.0k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
521
10.0k
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
522
10.0k
  const __m128i res_67 = _mm_madd_epi16(ss[3], coeffs[3]);
523
10.0k
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
524
10.0k
  const __m128i res_4567 = _mm_add_epi32(res_45, res_67);
525
10.0k
  return _mm_add_epi32(res_0123, res_4567);
526
10.0k
}
Unexecuted instantiation: convolve_avx2.c:convolve16_8tap_sse2
527
528
static inline __m256i convolve16_2tap_avx2(const __m256i ss[1],
529
1.40M
                                           const __m256i coeffs[1]) {
530
1.40M
  return _mm256_madd_epi16(ss[0], coeffs[0]);
531
1.40M
}
convolve_2d_avx2.c:convolve16_2tap_avx2
Line
Count
Source
529
1.40M
                                           const __m256i coeffs[1]) {
530
1.40M
  return _mm256_madd_epi16(ss[0], coeffs[0]);
531
1.40M
}
Unexecuted instantiation: convolve_avx2.c:convolve16_2tap_avx2
532
533
static inline __m256i convolve16_4tap_avx2(const __m256i ss[2],
534
5.24M
                                           const __m256i coeffs[2]) {
535
5.24M
  const __m256i res_1 = _mm256_madd_epi16(ss[0], coeffs[0]);
536
5.24M
  const __m256i res_2 = _mm256_madd_epi16(ss[1], coeffs[1]);
537
5.24M
  return _mm256_add_epi32(res_1, res_2);
538
5.24M
}
convolve_2d_avx2.c:convolve16_4tap_avx2
Line
Count
Source
534
5.24M
                                           const __m256i coeffs[2]) {
535
5.24M
  const __m256i res_1 = _mm256_madd_epi16(ss[0], coeffs[0]);
536
5.24M
  const __m256i res_2 = _mm256_madd_epi16(ss[1], coeffs[1]);
537
5.24M
  return _mm256_add_epi32(res_1, res_2);
538
5.24M
}
Unexecuted instantiation: convolve_avx2.c:convolve16_4tap_avx2
539
540
static inline __m256i convolve16_6tap_avx2(const __m256i ss[3],
541
16.1M
                                           const __m256i coeffs[3]) {
542
16.1M
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
543
16.1M
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
544
16.1M
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
545
16.1M
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
546
16.1M
  return _mm256_add_epi32(res_0123, res_45);
547
16.1M
}
convolve_2d_avx2.c:convolve16_6tap_avx2
Line
Count
Source
541
16.1M
                                           const __m256i coeffs[3]) {
542
16.1M
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
543
16.1M
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
544
16.1M
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
545
16.1M
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
546
16.1M
  return _mm256_add_epi32(res_0123, res_45);
547
16.1M
}
Unexecuted instantiation: convolve_avx2.c:convolve16_6tap_avx2
548
549
static inline __m256i convolve16_8tap_avx2(const __m256i ss[4],
550
4.27M
                                           const __m256i coeffs[4]) {
551
4.27M
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
552
4.27M
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
553
4.27M
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
554
4.27M
  const __m256i res_67 = _mm256_madd_epi16(ss[3], coeffs[3]);
555
4.27M
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
556
4.27M
  const __m256i res_4567 = _mm256_add_epi32(res_45, res_67);
557
4.27M
  return _mm256_add_epi32(res_0123, res_4567);
558
4.27M
}
convolve_2d_avx2.c:convolve16_8tap_avx2
Line
Count
Source
550
4.27M
                                           const __m256i coeffs[4]) {
551
4.27M
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
552
4.27M
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
553
4.27M
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
554
4.27M
  const __m256i res_67 = _mm256_madd_epi16(ss[3], coeffs[3]);
555
4.27M
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
556
4.27M
  const __m256i res_4567 = _mm256_add_epi32(res_45, res_67);
557
4.27M
  return _mm256_add_epi32(res_0123, res_4567);
558
4.27M
}
Unexecuted instantiation: convolve_avx2.c:convolve16_8tap_avx2
559
560
static inline __m256i x_convolve_4tap_avx2(const __m256i data,
561
                                           const __m256i coeffs[2],
562
1.27M
                                           const __m256i filt[2]) {
563
1.27M
  __m256i ss[2];
564
565
1.27M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
566
1.27M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
567
568
1.27M
  return convolve_4tap_avx2(ss, coeffs);
569
1.27M
}
convolve_2d_avx2.c:x_convolve_4tap_avx2
Line
Count
Source
562
1.27M
                                           const __m256i filt[2]) {
563
1.27M
  __m256i ss[2];
564
565
1.27M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
566
1.27M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
567
568
1.27M
  return convolve_4tap_avx2(ss, coeffs);
569
1.27M
}
Unexecuted instantiation: convolve_avx2.c:x_convolve_4tap_avx2
570
571
static inline __m256i x_convolve_6tap_avx2(const __m256i data,
572
                                           const __m256i coeffs[3],
573
16.4M
                                           const __m256i filt[3]) {
574
16.4M
  __m256i ss[3];
575
576
16.4M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
577
16.4M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
578
16.4M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
579
580
16.4M
  return convolve_6tap_avx2(ss, coeffs);
581
16.4M
}
convolve_2d_avx2.c:x_convolve_6tap_avx2
Line
Count
Source
573
11.8M
                                           const __m256i filt[3]) {
574
11.8M
  __m256i ss[3];
575
576
11.8M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
577
11.8M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
578
11.8M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
579
580
11.8M
  return convolve_6tap_avx2(ss, coeffs);
581
11.8M
}
convolve_avx2.c:x_convolve_6tap_avx2
Line
Count
Source
573
4.55M
                                           const __m256i filt[3]) {
574
4.55M
  __m256i ss[3];
575
576
4.55M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
577
4.55M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
578
4.55M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
579
580
4.55M
  return convolve_6tap_avx2(ss, coeffs);
581
4.55M
}
582
583
static inline __m256i x_convolve_8tap_avx2(const __m256i data,
584
                                           const __m256i coeffs[4],
585
3.40M
                                           const __m256i filt[4]) {
586
3.40M
  __m256i ss[4];
587
588
3.40M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
589
3.40M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
590
3.40M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
591
3.40M
  ss[3] = _mm256_shuffle_epi8(data, filt[3]);
592
593
3.40M
  return convolve_8tap_avx2(ss, coeffs);
594
3.40M
}
convolve_2d_avx2.c:x_convolve_8tap_avx2
Line
Count
Source
585
2.88M
                                           const __m256i filt[4]) {
586
2.88M
  __m256i ss[4];
587
588
2.88M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
589
2.88M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
590
2.88M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
591
2.88M
  ss[3] = _mm256_shuffle_epi8(data, filt[3]);
592
593
2.88M
  return convolve_8tap_avx2(ss, coeffs);
594
2.88M
}
convolve_avx2.c:x_convolve_8tap_avx2
Line
Count
Source
585
523k
                                           const __m256i filt[4]) {
586
523k
  __m256i ss[4];
587
588
523k
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
589
523k
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
590
523k
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
591
523k
  ss[3] = _mm256_shuffle_epi8(data, filt[3]);
592
593
523k
  return convolve_8tap_avx2(ss, coeffs);
594
523k
}
595
596
5.74M
static inline __m256i sr_y_round_avx2(const __m256i src) {
597
5.74M
  const __m256i round = _mm256_set1_epi16(32);
598
5.74M
  const __m256i dst = _mm256_add_epi16(src, round);
599
5.74M
  return _mm256_srai_epi16(dst, FILTER_BITS - 1);
600
5.74M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_avx2
convolve_avx2.c:sr_y_round_avx2
Line
Count
Source
596
5.74M
static inline __m256i sr_y_round_avx2(const __m256i src) {
597
5.74M
  const __m256i round = _mm256_set1_epi16(32);
598
5.74M
  const __m256i dst = _mm256_add_epi16(src, round);
599
5.74M
  return _mm256_srai_epi16(dst, FILTER_BITS - 1);
600
5.74M
}
601
602
3.53M
static inline __m128i xy_x_round_sse2(const __m128i src) {
603
3.53M
  const __m128i round = _mm_set1_epi16(2);
604
3.53M
  const __m128i dst = _mm_add_epi16(src, round);
605
3.53M
  return _mm_srai_epi16(dst, 2);
606
3.53M
}
convolve_2d_avx2.c:xy_x_round_sse2
Line
Count
Source
602
3.53M
static inline __m128i xy_x_round_sse2(const __m128i src) {
603
3.53M
  const __m128i round = _mm_set1_epi16(2);
604
3.53M
  const __m128i dst = _mm_add_epi16(src, round);
605
3.53M
  return _mm_srai_epi16(dst, 2);
606
3.53M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_sse2
607
608
16.9M
static inline __m256i xy_x_round_avx2(const __m256i src) {
609
16.9M
  const __m256i round = _mm256_set1_epi16(2);
610
16.9M
  const __m256i dst = _mm256_add_epi16(src, round);
611
16.9M
  return _mm256_srai_epi16(dst, 2);
612
16.9M
}
convolve_2d_avx2.c:xy_x_round_avx2
Line
Count
Source
608
16.9M
static inline __m256i xy_x_round_avx2(const __m256i src) {
609
16.9M
  const __m256i round = _mm256_set1_epi16(2);
610
16.9M
  const __m256i dst = _mm256_add_epi16(src, round);
611
16.9M
  return _mm256_srai_epi16(dst, 2);
612
16.9M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_avx2
613
614
static inline void xy_x_round_store_2x2_sse2(const __m128i res,
615
625k
                                             int16_t *const dst) {
616
625k
  const __m128i d = xy_x_round_sse2(res);
617
625k
  _mm_storel_epi64((__m128i *)dst, d);
618
625k
}
convolve_2d_avx2.c:xy_x_round_store_2x2_sse2
Line
Count
Source
615
625k
                                             int16_t *const dst) {
616
625k
  const __m128i d = xy_x_round_sse2(res);
617
625k
  _mm_storel_epi64((__m128i *)dst, d);
618
625k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_2x2_sse2
619
620
static inline void xy_x_round_store_4x2_sse2(const __m128i res,
621
2.70M
                                             int16_t *const dst) {
622
2.70M
  const __m128i d = xy_x_round_sse2(res);
623
2.70M
  _mm_storeu_si128((__m128i *)dst, d);
624
2.70M
}
convolve_2d_avx2.c:xy_x_round_store_4x2_sse2
Line
Count
Source
621
2.70M
                                             int16_t *const dst) {
622
2.70M
  const __m128i d = xy_x_round_sse2(res);
623
2.70M
  _mm_storeu_si128((__m128i *)dst, d);
624
2.70M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_4x2_sse2
625
626
static inline void xy_x_round_store_8x2_sse2(const __m128i res[2],
627
103k
                                             int16_t *const dst) {
628
103k
  __m128i r[2];
629
630
103k
  r[0] = xy_x_round_sse2(res[0]);
631
103k
  r[1] = xy_x_round_sse2(res[1]);
632
103k
  _mm_storeu_si128((__m128i *)dst, r[0]);
633
103k
  _mm_storeu_si128((__m128i *)(dst + 8), r[1]);
634
103k
}
convolve_2d_avx2.c:xy_x_round_store_8x2_sse2
Line
Count
Source
627
103k
                                             int16_t *const dst) {
628
103k
  __m128i r[2];
629
630
103k
  r[0] = xy_x_round_sse2(res[0]);
631
103k
  r[1] = xy_x_round_sse2(res[1]);
632
103k
  _mm_storeu_si128((__m128i *)dst, r[0]);
633
103k
  _mm_storeu_si128((__m128i *)(dst + 8), r[1]);
634
103k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_8x2_sse2
635
636
static inline void xy_x_round_store_8x2_avx2(const __m256i res,
637
2.64M
                                             int16_t *const dst) {
638
2.64M
  const __m256i d = xy_x_round_avx2(res);
639
2.64M
  _mm256_storeu_si256((__m256i *)dst, d);
640
2.64M
}
convolve_2d_avx2.c:xy_x_round_store_8x2_avx2
Line
Count
Source
637
2.64M
                                             int16_t *const dst) {
638
2.64M
  const __m256i d = xy_x_round_avx2(res);
639
2.64M
  _mm256_storeu_si256((__m256i *)dst, d);
640
2.64M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_8x2_avx2
641
642
static inline void xy_x_round_store_32_avx2(const __m256i res[2],
643
1.82M
                                            int16_t *const dst) {
644
1.82M
  __m256i r[2];
645
646
1.82M
  r[0] = xy_x_round_avx2(res[0]);
647
1.82M
  r[1] = xy_x_round_avx2(res[1]);
648
1.82M
  const __m256i d0 =
649
1.82M
      _mm256_inserti128_si256(r[0], _mm256_castsi256_si128(r[1]), 1);
650
1.82M
  const __m256i d1 =
651
1.82M
      _mm256_inserti128_si256(r[1], _mm256_extracti128_si256(r[0], 1), 0);
652
1.82M
  _mm256_storeu_si256((__m256i *)dst, d0);
653
1.82M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
654
1.82M
}
convolve_2d_avx2.c:xy_x_round_store_32_avx2
Line
Count
Source
643
1.82M
                                            int16_t *const dst) {
644
1.82M
  __m256i r[2];
645
646
1.82M
  r[0] = xy_x_round_avx2(res[0]);
647
1.82M
  r[1] = xy_x_round_avx2(res[1]);
648
1.82M
  const __m256i d0 =
649
1.82M
      _mm256_inserti128_si256(r[0], _mm256_castsi256_si128(r[1]), 1);
650
1.82M
  const __m256i d1 =
651
1.82M
      _mm256_inserti128_si256(r[1], _mm256_extracti128_si256(r[0], 1), 0);
652
1.82M
  _mm256_storeu_si256((__m256i *)dst, d0);
653
1.82M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
654
1.82M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_32_avx2
655
656
439k
static inline __m128i xy_y_round_sse2(const __m128i src) {
657
439k
  const __m128i round = _mm_set1_epi32(1024);
658
439k
  const __m128i dst = _mm_add_epi32(src, round);
659
439k
  return _mm_srai_epi32(dst, 11);
660
439k
}
convolve_2d_avx2.c:xy_y_round_sse2
Line
Count
Source
656
439k
static inline __m128i xy_y_round_sse2(const __m128i src) {
657
439k
  const __m128i round = _mm_set1_epi32(1024);
658
439k
  const __m128i dst = _mm_add_epi32(src, round);
659
439k
  return _mm_srai_epi32(dst, 11);
660
439k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_sse2
661
662
34.3k
static inline __m128i xy_y_round_half_pel_sse2(const __m128i src) {
663
34.3k
  const __m128i round = _mm_set1_epi16(16);
664
34.3k
  const __m128i dst = _mm_add_epi16(src, round);
665
34.3k
  return _mm_srai_epi16(dst, 5);
666
34.3k
}
convolve_2d_avx2.c:xy_y_round_half_pel_sse2
Line
Count
Source
662
34.3k
static inline __m128i xy_y_round_half_pel_sse2(const __m128i src) {
663
34.3k
  const __m128i round = _mm_set1_epi16(16);
664
34.3k
  const __m128i dst = _mm_add_epi16(src, round);
665
34.3k
  return _mm_srai_epi16(dst, 5);
666
34.3k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_half_pel_sse2
667
668
27.0M
static inline __m256i xy_y_round_avx2(const __m256i src) {
669
27.0M
  const __m256i round = _mm256_set1_epi32(1024);
670
27.0M
  const __m256i dst = _mm256_add_epi32(src, round);
671
27.0M
  return _mm256_srai_epi32(dst, 11);
672
27.0M
}
convolve_2d_avx2.c:xy_y_round_avx2
Line
Count
Source
668
27.0M
static inline __m256i xy_y_round_avx2(const __m256i src) {
669
27.0M
  const __m256i round = _mm256_set1_epi32(1024);
670
27.0M
  const __m256i dst = _mm256_add_epi32(src, round);
671
27.0M
  return _mm256_srai_epi32(dst, 11);
672
27.0M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_avx2
673
674
12.7M
static inline __m256i xy_y_round_16_avx2(const __m256i r[2]) {
675
12.7M
  const __m256i r0 = xy_y_round_avx2(r[0]);
676
12.7M
  const __m256i r1 = xy_y_round_avx2(r[1]);
677
12.7M
  return _mm256_packs_epi32(r0, r1);
678
12.7M
}
convolve_2d_avx2.c:xy_y_round_16_avx2
Line
Count
Source
674
12.7M
static inline __m256i xy_y_round_16_avx2(const __m256i r[2]) {
675
12.7M
  const __m256i r0 = xy_y_round_avx2(r[0]);
676
12.7M
  const __m256i r1 = xy_y_round_avx2(r[1]);
677
12.7M
  return _mm256_packs_epi32(r0, r1);
678
12.7M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_16_avx2
679
680
427k
static inline __m256i xy_y_round_half_pel_avx2(const __m256i src) {
681
427k
  const __m256i round = _mm256_set1_epi16(16);
682
427k
  const __m256i dst = _mm256_add_epi16(src, round);
683
427k
  return _mm256_srai_epi16(dst, 5);
684
427k
}
convolve_2d_avx2.c:xy_y_round_half_pel_avx2
Line
Count
Source
680
427k
static inline __m256i xy_y_round_half_pel_avx2(const __m256i src) {
681
427k
  const __m256i round = _mm256_set1_epi16(16);
682
427k
  const __m256i dst = _mm256_add_epi16(src, round);
683
427k
  return _mm256_srai_epi16(dst, 5);
684
427k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_half_pel_avx2
685
686
static inline void pack_store_2x2_sse2(const __m128i res, uint8_t *const dst,
687
602k
                                       const ptrdiff_t stride) {
688
602k
  const __m128i d = _mm_packus_epi16(res, res);
689
602k
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
690
602k
  *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
691
602k
}
convolve_2d_avx2.c:pack_store_2x2_sse2
Line
Count
Source
687
331k
                                       const ptrdiff_t stride) {
688
331k
  const __m128i d = _mm_packus_epi16(res, res);
689
331k
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
690
331k
  *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
691
331k
}
convolve_avx2.c:pack_store_2x2_sse2
Line
Count
Source
687
271k
                                       const ptrdiff_t stride) {
688
271k
  const __m128i d = _mm_packus_epi16(res, res);
689
271k
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
690
271k
  *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
691
271k
}
692
693
static inline void pack_store_4x2_sse2(const __m128i res, uint8_t *const dst,
694
1.41M
                                       const ptrdiff_t stride) {
695
1.41M
  const __m128i d = _mm_packus_epi16(res, res);
696
1.41M
  store_u8_4x2_sse2(d, dst, stride);
697
1.41M
}
convolve_2d_avx2.c:pack_store_4x2_sse2
Line
Count
Source
694
85.7k
                                       const ptrdiff_t stride) {
695
85.7k
  const __m128i d = _mm_packus_epi16(res, res);
696
85.7k
  store_u8_4x2_sse2(d, dst, stride);
697
85.7k
}
convolve_avx2.c:pack_store_4x2_sse2
Line
Count
Source
694
1.33M
                                       const ptrdiff_t stride) {
695
1.33M
  const __m128i d = _mm_packus_epi16(res, res);
696
1.33M
  store_u8_4x2_sse2(d, dst, stride);
697
1.33M
}
698
699
static inline void pack_store_4x2_avx2(const __m256i res, uint8_t *const dst,
700
1.48M
                                       const ptrdiff_t stride) {
701
1.48M
  const __m256i d = _mm256_packus_epi16(res, res);
702
1.48M
  const __m128i d0 = _mm256_castsi256_si128(d);
703
1.48M
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
704
705
1.48M
  xx_storel_32(dst, d0);
706
1.48M
  xx_storel_32(dst + stride, d1);
707
1.48M
}
convolve_2d_avx2.c:pack_store_4x2_avx2
Line
Count
Source
700
1.48M
                                       const ptrdiff_t stride) {
701
1.48M
  const __m256i d = _mm256_packus_epi16(res, res);
702
1.48M
  const __m128i d0 = _mm256_castsi256_si128(d);
703
1.48M
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
704
705
1.48M
  xx_storel_32(dst, d0);
706
1.48M
  xx_storel_32(dst + stride, d1);
707
1.48M
}
Unexecuted instantiation: convolve_avx2.c:pack_store_4x2_avx2
708
709
static inline void pack_store_8x2_avx2(const __m256i res, uint8_t *const dst,
710
3.04M
                                       const ptrdiff_t stride) {
711
3.04M
  const __m256i d = _mm256_packus_epi16(res, res);
712
3.04M
  const __m128i d0 = _mm256_castsi256_si128(d);
713
3.04M
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
714
3.04M
  _mm_storel_epi64((__m128i *)dst, d0);
715
3.04M
  _mm_storel_epi64((__m128i *)(dst + stride), d1);
716
3.04M
}
convolve_2d_avx2.c:pack_store_8x2_avx2
Line
Count
Source
710
1.64M
                                       const ptrdiff_t stride) {
711
1.64M
  const __m256i d = _mm256_packus_epi16(res, res);
712
1.64M
  const __m128i d0 = _mm256_castsi256_si128(d);
713
1.64M
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
714
1.64M
  _mm_storel_epi64((__m128i *)dst, d0);
715
1.64M
  _mm_storel_epi64((__m128i *)(dst + stride), d1);
716
1.64M
}
convolve_avx2.c:pack_store_8x2_avx2
Line
Count
Source
710
1.40M
                                       const ptrdiff_t stride) {
711
1.40M
  const __m256i d = _mm256_packus_epi16(res, res);
712
1.40M
  const __m128i d0 = _mm256_castsi256_si128(d);
713
1.40M
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
714
1.40M
  _mm_storel_epi64((__m128i *)dst, d0);
715
1.40M
  _mm_storel_epi64((__m128i *)(dst + stride), d1);
716
1.40M
}
717
718
static inline void pack_store_16x2_avx2(const __m256i res0, const __m256i res1,
719
                                        uint8_t *const dst,
720
1.16M
                                        const ptrdiff_t stride) {
721
1.16M
  const __m256i d = _mm256_packus_epi16(res0, res1);
722
1.16M
  storeu_u8_16x2_avx2(d, dst, stride);
723
1.16M
}
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_16x2_avx2
convolve_avx2.c:pack_store_16x2_avx2
Line
Count
Source
720
1.16M
                                        const ptrdiff_t stride) {
721
1.16M
  const __m256i d = _mm256_packus_epi16(res0, res1);
722
1.16M
  storeu_u8_16x2_avx2(d, dst, stride);
723
1.16M
}
724
725
static inline void xy_y_pack_store_16x2_avx2(const __m256i res0,
726
                                             const __m256i res1,
727
                                             uint8_t *const dst,
728
1.22M
                                             const ptrdiff_t stride) {
729
1.22M
  const __m256i t = _mm256_packus_epi16(res0, res1);
730
1.22M
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
731
1.22M
  storeu_u8_16x2_avx2(d, dst, stride);
732
1.22M
}
convolve_2d_avx2.c:xy_y_pack_store_16x2_avx2
Line
Count
Source
728
1.22M
                                             const ptrdiff_t stride) {
729
1.22M
  const __m256i t = _mm256_packus_epi16(res0, res1);
730
1.22M
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
731
1.22M
  storeu_u8_16x2_avx2(d, dst, stride);
732
1.22M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_pack_store_16x2_avx2
733
734
static inline void pack_store_32_avx2(const __m256i res0, const __m256i res1,
735
0
                                      uint8_t *const dst) {
736
0
  const __m256i t = _mm256_packus_epi16(res0, res1);
737
0
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
738
0
  _mm256_storeu_si256((__m256i *)dst, d);
739
0
}
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_32_avx2
Unexecuted instantiation: convolve_avx2.c:pack_store_32_avx2
740
741
static inline void xy_y_round_store_2x2_sse2(const __m128i res,
742
                                             uint8_t *const dst,
743
325k
                                             const ptrdiff_t stride) {
744
325k
  const __m128i r = xy_y_round_sse2(res);
745
325k
  const __m128i rr = _mm_packs_epi32(r, r);
746
325k
  pack_store_2x2_sse2(rr, dst, stride);
747
325k
}
convolve_2d_avx2.c:xy_y_round_store_2x2_sse2
Line
Count
Source
743
325k
                                             const ptrdiff_t stride) {
744
325k
  const __m128i r = xy_y_round_sse2(res);
745
325k
  const __m128i rr = _mm_packs_epi32(r, r);
746
325k
  pack_store_2x2_sse2(rr, dst, stride);
747
325k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_2x2_sse2
748
749
static inline void xy_y_round_store_4x2_avx2(const __m256i res,
750
                                             uint8_t *const dst,
751
1.48M
                                             const ptrdiff_t stride) {
752
1.48M
  const __m256i r = xy_y_round_avx2(res);
753
1.48M
  const __m256i rr = _mm256_packs_epi32(r, r);
754
1.48M
  pack_store_4x2_avx2(rr, dst, stride);
755
1.48M
}
convolve_2d_avx2.c:xy_y_round_store_4x2_avx2
Line
Count
Source
751
1.48M
                                             const ptrdiff_t stride) {
752
1.48M
  const __m256i r = xy_y_round_avx2(res);
753
1.48M
  const __m256i rr = _mm256_packs_epi32(r, r);
754
1.48M
  pack_store_4x2_avx2(rr, dst, stride);
755
1.48M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_4x2_avx2
756
757
static inline void xy_y_pack_store_32_avx2(const __m256i res0,
758
                                           const __m256i res1,
759
4.56M
                                           uint8_t *const dst) {
760
4.56M
  const __m256i d = _mm256_packus_epi16(res0, res1);
761
  // d = _mm256_permute4x64_epi64(d, 0xD8);
762
4.56M
  _mm256_storeu_si256((__m256i *)dst, d);
763
4.56M
}
convolve_2d_avx2.c:xy_y_pack_store_32_avx2
Line
Count
Source
759
4.56M
                                           uint8_t *const dst) {
760
4.56M
  const __m256i d = _mm256_packus_epi16(res0, res1);
761
  // d = _mm256_permute4x64_epi64(d, 0xD8);
762
4.56M
  _mm256_storeu_si256((__m256i *)dst, d);
763
4.56M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_pack_store_32_avx2
764
765
static inline void xy_y_round_store_32_avx2(const __m256i r0[2],
766
                                            const __m256i r1[2],
767
4.38M
                                            uint8_t *const dst) {
768
4.38M
  const __m256i ra = xy_y_round_16_avx2(r0);
769
4.38M
  const __m256i rb = xy_y_round_16_avx2(r1);
770
4.38M
  xy_y_pack_store_32_avx2(ra, rb, dst);
771
4.38M
}
convolve_2d_avx2.c:xy_y_round_store_32_avx2
Line
Count
Source
767
4.38M
                                            uint8_t *const dst) {
768
4.38M
  const __m256i ra = xy_y_round_16_avx2(r0);
769
4.38M
  const __m256i rb = xy_y_round_16_avx2(r1);
770
4.38M
  xy_y_pack_store_32_avx2(ra, rb, dst);
771
4.38M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_32_avx2
772
773
static inline void convolve_store_32_avx2(const __m256i res0,
774
                                          const __m256i res1,
775
3.72M
                                          uint8_t *const dst) {
776
3.72M
  const __m256i d = _mm256_packus_epi16(res0, res1);
777
3.72M
  _mm256_storeu_si256((__m256i *)dst, d);
778
3.72M
}
Unexecuted instantiation: convolve_2d_avx2.c:convolve_store_32_avx2
convolve_avx2.c:convolve_store_32_avx2
Line
Count
Source
775
3.72M
                                          uint8_t *const dst) {
776
3.72M
  const __m256i d = _mm256_packus_epi16(res0, res1);
777
3.72M
  _mm256_storeu_si256((__m256i *)dst, d);
778
3.72M
}
779
780
838k
static inline __m128i sr_x_round_sse2(const __m128i src) {
781
838k
  const __m128i round = _mm_set1_epi16(34);
782
838k
  const __m128i dst = _mm_add_epi16(src, round);
783
838k
  return _mm_srai_epi16(dst, 6);
784
838k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_sse2
convolve_avx2.c:sr_x_round_sse2
Line
Count
Source
780
838k
static inline __m128i sr_x_round_sse2(const __m128i src) {
781
838k
  const __m128i round = _mm_set1_epi16(34);
782
838k
  const __m128i dst = _mm_add_epi16(src, round);
783
838k
  return _mm_srai_epi16(dst, 6);
784
838k
}
785
786
5.90M
static inline __m256i sr_x_round_avx2(const __m256i src) {
787
5.90M
  const __m256i round = _mm256_set1_epi16(34);
788
5.90M
  const __m256i dst = _mm256_add_epi16(src, round);
789
5.90M
  return _mm256_srai_epi16(dst, 6);
790
5.90M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_avx2
convolve_avx2.c:sr_x_round_avx2
Line
Count
Source
786
5.90M
static inline __m256i sr_x_round_avx2(const __m256i src) {
787
5.90M
  const __m256i round = _mm256_set1_epi16(34);
788
5.90M
  const __m256i dst = _mm256_add_epi16(src, round);
789
5.90M
  return _mm256_srai_epi16(dst, 6);
790
5.90M
}
791
792
845k
static inline __m128i sr_y_round_sse2(const __m128i src) {
793
845k
  const __m128i round = _mm_set1_epi16(32);
794
845k
  const __m128i dst = _mm_add_epi16(src, round);
795
845k
  return _mm_srai_epi16(dst, FILTER_BITS - 1);
796
845k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_sse2
convolve_avx2.c:sr_y_round_sse2
Line
Count
Source
792
845k
static inline __m128i sr_y_round_sse2(const __m128i src) {
793
845k
  const __m128i round = _mm_set1_epi16(32);
794
845k
  const __m128i dst = _mm_add_epi16(src, round);
795
845k
  return _mm_srai_epi16(dst, FILTER_BITS - 1);
796
845k
}
797
798
static inline void sr_x_round_store_8x2_avx2(const __m256i res,
799
                                             uint8_t *const dst,
800
655k
                                             const ptrdiff_t dst_stride) {
801
655k
  const __m256i r = sr_x_round_avx2(res);
802
655k
  pack_store_8x2_avx2(r, dst, dst_stride);
803
655k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_store_8x2_avx2
convolve_avx2.c:sr_x_round_store_8x2_avx2
Line
Count
Source
800
655k
                                             const ptrdiff_t dst_stride) {
801
655k
  const __m256i r = sr_x_round_avx2(res);
802
655k
  pack_store_8x2_avx2(r, dst, dst_stride);
803
655k
}
804
805
static inline void sr_x_round_store_16x2_avx2(const __m256i res[2],
806
                                              uint8_t *const dst,
807
548k
                                              const ptrdiff_t dst_stride) {
808
548k
  __m256i r[2];
809
810
548k
  r[0] = sr_x_round_avx2(res[0]);
811
548k
  r[1] = sr_x_round_avx2(res[1]);
812
548k
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
813
548k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_store_16x2_avx2
convolve_avx2.c:sr_x_round_store_16x2_avx2
Line
Count
Source
807
548k
                                              const ptrdiff_t dst_stride) {
808
548k
  __m256i r[2];
809
810
548k
  r[0] = sr_x_round_avx2(res[0]);
811
548k
  r[1] = sr_x_round_avx2(res[1]);
812
548k
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
813
548k
}
814
815
static inline void sr_x_round_store_32_avx2(const __m256i res[2],
816
1.84M
                                            uint8_t *const dst) {
817
1.84M
  __m256i r[2];
818
819
1.84M
  r[0] = sr_x_round_avx2(res[0]);
820
1.84M
  r[1] = sr_x_round_avx2(res[1]);
821
1.84M
  convolve_store_32_avx2(r[0], r[1], dst);
822
1.84M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_store_32_avx2
convolve_avx2.c:sr_x_round_store_32_avx2
Line
Count
Source
816
1.84M
                                            uint8_t *const dst) {
817
1.84M
  __m256i r[2];
818
819
1.84M
  r[0] = sr_x_round_avx2(res[0]);
820
1.84M
  r[1] = sr_x_round_avx2(res[1]);
821
1.84M
  convolve_store_32_avx2(r[0], r[1], dst);
822
1.84M
}
823
824
static inline void sr_y_round_store_8x2_avx2(const __m256i res,
825
                                             uint8_t *const dst,
826
748k
                                             const ptrdiff_t dst_stride) {
827
748k
  const __m256i r = sr_y_round_avx2(res);
828
748k
  pack_store_8x2_avx2(r, dst, dst_stride);
829
748k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_store_8x2_avx2
convolve_avx2.c:sr_y_round_store_8x2_avx2
Line
Count
Source
826
748k
                                             const ptrdiff_t dst_stride) {
827
748k
  const __m256i r = sr_y_round_avx2(res);
828
748k
  pack_store_8x2_avx2(r, dst, dst_stride);
829
748k
}
830
831
static inline void sr_y_round_store_16x2_avx2(const __m256i res[2],
832
                                              uint8_t *const dst,
833
618k
                                              const ptrdiff_t dst_stride) {
834
618k
  __m256i r[2];
835
836
618k
  r[0] = sr_y_round_avx2(res[0]);
837
618k
  r[1] = sr_y_round_avx2(res[1]);
838
618k
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
839
618k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_store_16x2_avx2
convolve_avx2.c:sr_y_round_store_16x2_avx2
Line
Count
Source
833
618k
                                              const ptrdiff_t dst_stride) {
834
618k
  __m256i r[2];
835
836
618k
  r[0] = sr_y_round_avx2(res[0]);
837
618k
  r[1] = sr_y_round_avx2(res[1]);
838
618k
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
839
618k
}
840
841
static inline void sr_y_2tap_32_avg_avx2(const uint8_t *const src,
842
                                         const __m256i s0, __m256i *const s1,
843
84.8k
                                         uint8_t *const dst) {
844
84.8k
  *s1 = _mm256_loadu_si256((__m256i *)src);
845
84.8k
  const __m256i d = _mm256_avg_epu8(s0, *s1);
846
84.8k
  _mm256_storeu_si256((__m256i *)dst, d);
847
84.8k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_2tap_32_avg_avx2
convolve_avx2.c:sr_y_2tap_32_avg_avx2
Line
Count
Source
843
84.8k
                                         uint8_t *const dst) {
844
84.8k
  *s1 = _mm256_loadu_si256((__m256i *)src);
845
84.8k
  const __m256i d = _mm256_avg_epu8(s0, *s1);
846
84.8k
  _mm256_storeu_si256((__m256i *)dst, d);
847
84.8k
}
848
849
static inline void sr_x_2tap_32_avg_avx2(const uint8_t *const src,
850
166k
                                         uint8_t *const dst) {
851
166k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
852
166k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
853
166k
  const __m256i d = _mm256_avg_epu8(s0, s1);
854
166k
  _mm256_storeu_si256((__m256i *)dst, d);
855
166k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_2tap_32_avg_avx2
convolve_avx2.c:sr_x_2tap_32_avg_avx2
Line
Count
Source
850
166k
                                         uint8_t *const dst) {
851
166k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
852
166k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
853
166k
  const __m256i d = _mm256_avg_epu8(s0, s1);
854
166k
  _mm256_storeu_si256((__m256i *)dst, d);
855
166k
}
856
857
static inline __m128i x_convolve_2tap_2x2_sse4_1(const uint8_t *const src,
858
                                                 const ptrdiff_t stride,
859
33.8k
                                                 const __m128i coeffs[1]) {
860
33.8k
  const __m128i sfl =
861
33.8k
      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
862
33.8k
  const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
863
33.8k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
864
33.8k
  return convolve_2tap_ssse3(&ss, coeffs);
865
33.8k
}
convolve_2d_avx2.c:x_convolve_2tap_2x2_sse4_1
Line
Count
Source
859
28.0k
                                                 const __m128i coeffs[1]) {
860
28.0k
  const __m128i sfl =
861
28.0k
      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
862
28.0k
  const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
863
28.0k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
864
28.0k
  return convolve_2tap_ssse3(&ss, coeffs);
865
28.0k
}
convolve_avx2.c:x_convolve_2tap_2x2_sse4_1
Line
Count
Source
859
5.74k
                                                 const __m128i coeffs[1]) {
860
5.74k
  const __m128i sfl =
861
5.74k
      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
862
5.74k
  const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
863
5.74k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
864
5.74k
  return convolve_2tap_ssse3(&ss, coeffs);
865
5.74k
}
866
867
static inline __m128i x_convolve_2tap_4x2_ssse3(const uint8_t *const src,
868
                                                const ptrdiff_t stride,
869
139k
                                                const __m128i coeffs[1]) {
870
139k
  const __m128i sfl =
871
139k
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
872
139k
  const __m128i s_128 = load_u8_8x2_sse2(src, stride);
873
139k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
874
139k
  return convolve_2tap_ssse3(&ss, coeffs);
875
139k
}
convolve_2d_avx2.c:x_convolve_2tap_4x2_ssse3
Line
Count
Source
869
113k
                                                const __m128i coeffs[1]) {
870
113k
  const __m128i sfl =
871
113k
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
872
113k
  const __m128i s_128 = load_u8_8x2_sse2(src, stride);
873
113k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
874
113k
  return convolve_2tap_ssse3(&ss, coeffs);
875
113k
}
convolve_avx2.c:x_convolve_2tap_4x2_ssse3
Line
Count
Source
869
25.4k
                                                const __m128i coeffs[1]) {
870
25.4k
  const __m128i sfl =
871
25.4k
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
872
25.4k
  const __m128i s_128 = load_u8_8x2_sse2(src, stride);
873
25.4k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
874
25.4k
  return convolve_2tap_ssse3(&ss, coeffs);
875
25.4k
}
876
877
static inline void x_convolve_2tap_8x2_ssse3(const uint8_t *const src,
878
                                             const ptrdiff_t stride,
879
                                             const __m128i coeffs[1],
880
124k
                                             __m128i r[2]) {
881
124k
  __m128i ss[2];
882
124k
  const __m128i s00 = _mm_loadu_si128((__m128i *)src);
883
124k
  const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
884
124k
  const __m128i s01 = _mm_srli_si128(s00, 1);
885
124k
  const __m128i s11 = _mm_srli_si128(s10, 1);
886
124k
  ss[0] = _mm_unpacklo_epi8(s00, s01);
887
124k
  ss[1] = _mm_unpacklo_epi8(s10, s11);
888
889
124k
  r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
890
124k
  r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
891
124k
}
convolve_2d_avx2.c:x_convolve_2tap_8x2_ssse3
Line
Count
Source
880
103k
                                             __m128i r[2]) {
881
103k
  __m128i ss[2];
882
103k
  const __m128i s00 = _mm_loadu_si128((__m128i *)src);
883
103k
  const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
884
103k
  const __m128i s01 = _mm_srli_si128(s00, 1);
885
103k
  const __m128i s11 = _mm_srli_si128(s10, 1);
886
103k
  ss[0] = _mm_unpacklo_epi8(s00, s01);
887
103k
  ss[1] = _mm_unpacklo_epi8(s10, s11);
888
889
103k
  r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
890
103k
  r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
891
103k
}
convolve_avx2.c:x_convolve_2tap_8x2_ssse3
Line
Count
Source
880
21.1k
                                             __m128i r[2]) {
881
21.1k
  __m128i ss[2];
882
21.1k
  const __m128i s00 = _mm_loadu_si128((__m128i *)src);
883
21.1k
  const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
884
21.1k
  const __m128i s01 = _mm_srli_si128(s00, 1);
885
21.1k
  const __m128i s11 = _mm_srli_si128(s10, 1);
886
21.1k
  ss[0] = _mm_unpacklo_epi8(s00, s01);
887
21.1k
  ss[1] = _mm_unpacklo_epi8(s10, s11);
888
889
21.1k
  r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
890
21.1k
  r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
891
21.1k
}
892
893
static inline __m256i x_convolve_2tap_8x2_avx2(const uint8_t *const src,
894
                                               const ptrdiff_t stride,
895
0
                                               const __m256i coeffs[1]) {
896
0
  __m128i s_128[2][2];
897
0
  __m256i s_256[2];
898
0
899
0
  s_128[0][0] = _mm_loadu_si128((__m128i *)src);
900
0
  s_128[1][0] = _mm_loadu_si128((__m128i *)(src + stride));
901
0
  s_128[0][1] = _mm_srli_si128(s_128[0][0], 1);
902
0
  s_128[1][1] = _mm_srli_si128(s_128[1][0], 1);
903
0
  s_256[0] = _mm256_setr_m128i(s_128[0][0], s_128[1][0]);
904
0
  s_256[1] = _mm256_setr_m128i(s_128[0][1], s_128[1][1]);
905
0
  const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
906
0
  return convolve_2tap_avx2(&ss, coeffs);
907
0
}
Unexecuted instantiation: convolve_2d_avx2.c:x_convolve_2tap_8x2_avx2
Unexecuted instantiation: convolve_avx2.c:x_convolve_2tap_8x2_avx2
908
909
static inline void x_convolve_2tap_16x2_avx2(const uint8_t *const src,
910
                                             const ptrdiff_t stride,
911
                                             const __m256i coeffs[1],
912
73.8k
                                             __m256i r[2]) {
913
73.8k
  const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
914
73.8k
  const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
915
73.8k
  const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
916
73.8k
  const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
917
73.8k
  r[0] = convolve_2tap_avx2(&s0, coeffs);
918
73.8k
  r[1] = convolve_2tap_avx2(&s1, coeffs);
919
73.8k
}
convolve_2d_avx2.c:x_convolve_2tap_16x2_avx2
Line
Count
Source
912
59.3k
                                             __m256i r[2]) {
913
59.3k
  const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
914
59.3k
  const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
915
59.3k
  const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
916
59.3k
  const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
917
59.3k
  r[0] = convolve_2tap_avx2(&s0, coeffs);
918
59.3k
  r[1] = convolve_2tap_avx2(&s1, coeffs);
919
59.3k
}
convolve_avx2.c:x_convolve_2tap_16x2_avx2
Line
Count
Source
912
14.4k
                                             __m256i r[2]) {
913
14.4k
  const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
914
14.4k
  const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
915
14.4k
  const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
916
14.4k
  const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
917
14.4k
  r[0] = convolve_2tap_avx2(&s0, coeffs);
918
14.4k
  r[1] = convolve_2tap_avx2(&s1, coeffs);
919
14.4k
}
920
921
static inline void x_convolve_2tap_32_avx2(const uint8_t *const src,
922
                                           const __m256i coeffs[1],
923
168k
                                           __m256i r[2]) {
924
168k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
925
168k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
926
168k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
927
168k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
928
929
168k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
930
168k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
931
168k
}
Unexecuted instantiation: convolve_2d_avx2.c:x_convolve_2tap_32_avx2
convolve_avx2.c:x_convolve_2tap_32_avx2
Line
Count
Source
923
168k
                                           __m256i r[2]) {
924
168k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
925
168k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
926
168k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
927
168k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
928
929
168k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
930
168k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
931
168k
}
932
933
static inline __m128i x_convolve_4tap_2x2_ssse3(const uint8_t *const src,
934
                                                const ptrdiff_t stride,
935
738k
                                                const __m128i coeffs[2]) {
936
738k
  const __m128i sfl0 =
937
738k
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
938
738k
  const __m128i sfl1 =
939
738k
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
940
738k
  const __m128i s = load_u8_8x2_sse2(src, stride);
941
738k
  __m128i ss[2];
942
943
738k
  ss[0] = _mm_shuffle_epi8(s, sfl0);
944
738k
  ss[1] = _mm_shuffle_epi8(s, sfl1);
945
738k
  return convolve_4tap_ssse3(ss, coeffs);
946
738k
}
convolve_2d_avx2.c:x_convolve_4tap_2x2_ssse3
Line
Count
Source
935
597k
                                                const __m128i coeffs[2]) {
936
597k
  const __m128i sfl0 =
937
597k
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
938
597k
  const __m128i sfl1 =
939
597k
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
940
597k
  const __m128i s = load_u8_8x2_sse2(src, stride);
941
597k
  __m128i ss[2];
942
943
597k
  ss[0] = _mm_shuffle_epi8(s, sfl0);
944
597k
  ss[1] = _mm_shuffle_epi8(s, sfl1);
945
597k
  return convolve_4tap_ssse3(ss, coeffs);
946
597k
}
convolve_avx2.c:x_convolve_4tap_2x2_ssse3
Line
Count
Source
935
141k
                                                const __m128i coeffs[2]) {
936
141k
  const __m128i sfl0 =
937
141k
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
938
141k
  const __m128i sfl1 =
939
141k
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
940
141k
  const __m128i s = load_u8_8x2_sse2(src, stride);
941
141k
  __m128i ss[2];
942
943
141k
  ss[0] = _mm_shuffle_epi8(s, sfl0);
944
141k
  ss[1] = _mm_shuffle_epi8(s, sfl1);
945
141k
  return convolve_4tap_ssse3(ss, coeffs);
946
141k
}
947
948
static inline __m128i x_convolve_4tap_4x2_ssse3(const uint8_t *const src,
949
                                                const ptrdiff_t stride,
950
3.21M
                                                const __m128i coeffs[2]) {
951
3.21M
  const __m128i s = load_u8_8x2_sse2(src, stride);
952
3.21M
  const __m128i sfl0 =
953
3.21M
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
954
3.21M
  const __m128i sfl1 =
955
3.21M
      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
956
3.21M
  __m128i ss[2];
957
958
3.21M
  ss[0] = _mm_shuffle_epi8(s, sfl0);
959
3.21M
  ss[1] = _mm_shuffle_epi8(s, sfl1);
960
3.21M
  return convolve_4tap_ssse3(ss, coeffs);
961
3.21M
}
convolve_2d_avx2.c:x_convolve_4tap_4x2_ssse3
Line
Count
Source
950
2.58M
                                                const __m128i coeffs[2]) {
951
2.58M
  const __m128i s = load_u8_8x2_sse2(src, stride);
952
2.58M
  const __m128i sfl0 =
953
2.58M
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
954
2.58M
  const __m128i sfl1 =
955
2.58M
      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
956
2.58M
  __m128i ss[2];
957
958
2.58M
  ss[0] = _mm_shuffle_epi8(s, sfl0);
959
2.58M
  ss[1] = _mm_shuffle_epi8(s, sfl1);
960
2.58M
  return convolve_4tap_ssse3(ss, coeffs);
961
2.58M
}
convolve_avx2.c:x_convolve_4tap_4x2_ssse3
Line
Count
Source
950
623k
                                                const __m128i coeffs[2]) {
951
623k
  const __m128i s = load_u8_8x2_sse2(src, stride);
952
623k
  const __m128i sfl0 =
953
623k
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
954
623k
  const __m128i sfl1 =
955
623k
      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
956
623k
  __m128i ss[2];
957
958
623k
  ss[0] = _mm_shuffle_epi8(s, sfl0);
959
623k
  ss[1] = _mm_shuffle_epi8(s, sfl1);
960
623k
  return convolve_4tap_ssse3(ss, coeffs);
961
623k
}
962
963
static inline __m256i x_convolve_4tap_8x2_avx2(const uint8_t *const src,
964
                                               const ptrdiff_t stride,
965
                                               const __m256i coeffs[2],
966
324k
                                               const __m256i filt[2]) {
967
324k
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
968
324k
  return x_convolve_4tap_avx2(s_256, coeffs, filt);
969
324k
}
convolve_2d_avx2.c:x_convolve_4tap_8x2_avx2
Line
Count
Source
966
324k
                                               const __m256i filt[2]) {
967
324k
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
968
324k
  return x_convolve_4tap_avx2(s_256, coeffs, filt);
969
324k
}
Unexecuted instantiation: convolve_avx2.c:x_convolve_4tap_8x2_avx2
970
971
static inline void x_convolve_4tap_16x2_avx2(const uint8_t *const src,
972
                                             const int32_t src_stride,
973
                                             const __m256i coeffs[2],
974
                                             const __m256i filt[2],
975
93.1k
                                             __m256i r[2]) {
976
93.1k
  r[0] = x_convolve_4tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
977
93.1k
  r[1] = x_convolve_4tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
978
93.1k
}
convolve_2d_avx2.c:x_convolve_4tap_16x2_avx2
Line
Count
Source
975
93.1k
                                             __m256i r[2]) {
976
93.1k
  r[0] = x_convolve_4tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
977
93.1k
  r[1] = x_convolve_4tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
978
93.1k
}
Unexecuted instantiation: convolve_avx2.c:x_convolve_4tap_16x2_avx2
979
980
static inline void x_convolve_4tap_32_avx2(const uint8_t *const src,
981
                                           const __m256i coeffs[2],
982
                                           const __m256i filt[2],
983
475k
                                           __m256i r[2]) {
984
475k
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
985
475k
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
986
987
475k
  r[0] = x_convolve_4tap_avx2(s0_256, coeffs, filt);
988
475k
  r[1] = x_convolve_4tap_avx2(s1_256, coeffs, filt);
989
475k
}
convolve_2d_avx2.c:x_convolve_4tap_32_avx2
Line
Count
Source
983
475k
                                           __m256i r[2]) {
984
475k
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
985
475k
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
986
987
475k
  r[0] = x_convolve_4tap_avx2(s0_256, coeffs, filt);
988
475k
  r[1] = x_convolve_4tap_avx2(s1_256, coeffs, filt);
989
475k
}
Unexecuted instantiation: convolve_avx2.c:x_convolve_4tap_32_avx2
990
991
static inline __m128i x_convolve_6tap_2x2_ssse3(const uint8_t *const src,
992
                                                const ptrdiff_t stride,
993
0
                                                const __m128i coeffs[3]) {
994
0
  const __m128i sfl0 =
995
0
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
996
0
  const __m128i sfl1 =
997
0
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
998
0
  const __m128i sfl2 =
999
0
      _mm_setr_epi8(4, 5, 5, 6, 12, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0);
1000
1001
0
  const __m128i s = load_u8_8x2_sse2(src, stride);
1002
0
  __m128i ss[3];
1003
1004
0
  ss[0] = _mm_shuffle_epi8(s, sfl0);
1005
0
  ss[1] = _mm_shuffle_epi8(s, sfl1);
1006
0
  ss[2] = _mm_shuffle_epi8(s, sfl2);
1007
0
  return convolve_6tap_ssse3(ss, coeffs);
1008
0
}
Unexecuted instantiation: convolve_2d_avx2.c:x_convolve_6tap_2x2_ssse3
Unexecuted instantiation: convolve_avx2.c:x_convolve_6tap_2x2_ssse3
1009
1010
static inline __m128i x_convolve_6tap_4x2_ssse3(const uint8_t *const src,
1011
                                                const ptrdiff_t stride,
1012
0
                                                const __m128i coeffs[3]) {
1013
0
  const __m128i s = load_u8_8x2_sse2(src, stride);
1014
0
  const __m128i sfl0 =
1015
0
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
1016
0
  const __m128i sfl1 =
1017
0
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
1018
0
  const __m128i sfl2 =
1019
0
      _mm_setr_epi8(4, 5, 5, 6, 12, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0);
1020
0
  __m128i ss[3];
1021
1022
0
  ss[0] = _mm_shuffle_epi8(s, sfl0);
1023
0
  ss[1] = _mm_shuffle_epi8(s, sfl1);
1024
0
  ss[2] = _mm_shuffle_epi8(s, sfl2);
1025
0
  return convolve_6tap_ssse3(ss, coeffs);
1026
0
}
Unexecuted instantiation: convolve_2d_avx2.c:x_convolve_6tap_4x2_ssse3
Unexecuted instantiation: convolve_avx2.c:x_convolve_6tap_4x2_ssse3
1027
1028
static inline __m256i x_convolve_6tap_8x2_avx2(const uint8_t *const src,
1029
                                               const ptrdiff_t stride,
1030
                                               const __m256i coeffs[3],
1031
7.19M
                                               const __m256i filt[3]) {
1032
7.19M
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1033
7.19M
  return x_convolve_6tap_avx2(s_256, coeffs, filt);
1034
7.19M
}
convolve_2d_avx2.c:x_convolve_6tap_8x2_avx2
Line
Count
Source
1031
5.54M
                                               const __m256i filt[3]) {
1032
5.54M
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1033
5.54M
  return x_convolve_6tap_avx2(s_256, coeffs, filt);
1034
5.54M
}
convolve_avx2.c:x_convolve_6tap_8x2_avx2
Line
Count
Source
1031
1.64M
                                               const __m256i filt[3]) {
1032
1.64M
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1033
1.64M
  return x_convolve_6tap_avx2(s_256, coeffs, filt);
1034
1.64M
}
1035
1036
static inline void x_convolve_6tap_16x2_avx2(const uint8_t *const src,
1037
                                             const int32_t src_stride,
1038
                                             const __m256i coeffs[3],
1039
                                             const __m256i filt[3],
1040
2.09M
                                             __m256i r[2]) {
1041
2.09M
  r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1042
2.09M
  r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1043
2.09M
}
convolve_2d_avx2.c:x_convolve_6tap_16x2_avx2
Line
Count
Source
1040
1.58M
                                             __m256i r[2]) {
1041
1.58M
  r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1042
1.58M
  r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1043
1.58M
}
convolve_avx2.c:x_convolve_6tap_16x2_avx2
Line
Count
Source
1040
509k
                                             __m256i r[2]) {
1041
509k
  r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1042
509k
  r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1043
509k
}
1044
1045
static inline void x_convolve_6tap_32_avx2(const uint8_t *const src,
1046
                                           const __m256i coeffs[3],
1047
                                           const __m256i filt[3],
1048
4.61M
                                           __m256i r[2]) {
1049
4.61M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1050
4.61M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1051
1052
4.61M
  r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
1053
4.61M
  r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
1054
4.61M
}
convolve_2d_avx2.c:x_convolve_6tap_32_avx2
Line
Count
Source
1048
3.16M
                                           __m256i r[2]) {
1049
3.16M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1050
3.16M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1051
1052
3.16M
  r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
1053
3.16M
  r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
1054
3.16M
}
convolve_avx2.c:x_convolve_6tap_32_avx2
Line
Count
Source
1048
1.45M
                                           __m256i r[2]) {
1049
1.45M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1050
1.45M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1051
1052
1.45M
  r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
1053
1.45M
  r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
1054
1.45M
}
1055
1056
static inline __m256i x_convolve_8tap_8x2_avx2(const uint8_t *const src,
1057
                                               const ptrdiff_t stride,
1058
                                               const __m256i coeffs[4],
1059
388k
                                               const __m256i filt[4]) {
1060
388k
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1061
388k
  return x_convolve_8tap_avx2(s_256, coeffs, filt);
1062
388k
}
convolve_2d_avx2.c:x_convolve_8tap_8x2_avx2
Line
Count
Source
1059
313k
                                               const __m256i filt[4]) {
1060
313k
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1061
313k
  return x_convolve_8tap_avx2(s_256, coeffs, filt);
1062
313k
}
convolve_avx2.c:x_convolve_8tap_8x2_avx2
Line
Count
Source
1059
75.0k
                                               const __m256i filt[4]) {
1060
75.0k
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1061
75.0k
  return x_convolve_8tap_avx2(s_256, coeffs, filt);
1062
75.0k
}
1063
1064
static AOM_FORCE_INLINE void x_convolve_8tap_16x2_avx2(const uint8_t *const src,
1065
                                                       const int32_t src_stride,
1066
                                                       const __m256i coeffs[4],
1067
                                                       const __m256i filt[4],
1068
113k
                                                       __m256i r[2]) {
1069
113k
  r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1070
113k
  r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1071
113k
}
convolve_2d_avx2.c:x_convolve_8tap_16x2_avx2
Line
Count
Source
1068
88.8k
                                                       __m256i r[2]) {
1069
88.8k
  r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1070
88.8k
  r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1071
88.8k
}
convolve_avx2.c:x_convolve_8tap_16x2_avx2
Line
Count
Source
1068
24.2k
                                                       __m256i r[2]) {
1069
24.2k
  r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1070
24.2k
  r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1071
24.2k
}
1072
1073
static AOM_FORCE_INLINE void x_convolve_8tap_32_avx2(const uint8_t *const src,
1074
                                                     const __m256i coeffs[4],
1075
                                                     const __m256i filt[4],
1076
1.50M
                                                     __m256i r[2]) {
1077
1.50M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1078
1.50M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1079
1080
1.50M
  r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
1081
1.50M
  r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
1082
1.50M
}
convolve_2d_avx2.c:x_convolve_8tap_32_avx2
Line
Count
Source
1076
1.28M
                                                     __m256i r[2]) {
1077
1.28M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1078
1.28M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1079
1080
1.28M
  r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
1081
1.28M
  r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
1082
1.28M
}
convolve_avx2.c:x_convolve_8tap_32_avx2
Line
Count
Source
1076
224k
                                                     __m256i r[2]) {
1077
224k
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1078
224k
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1079
1080
224k
  r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
1081
224k
  r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
1082
224k
}
1083
1084
static inline __m128i y_convolve_2tap_2x2_ssse3(const uint8_t *const src,
1085
                                                const ptrdiff_t stride,
1086
                                                const __m128i coeffs[1],
1087
4.02k
                                                __m128i s_16[2]) {
1088
4.02k
  __m128i s_128[2];
1089
1090
4.02k
  s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src + stride));
1091
4.02k
  s_128[0] = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1092
4.02k
  s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src + 2 * stride));
1093
4.02k
  s_128[1] = _mm_unpacklo_epi16(s_16[1], s_16[0]);
1094
4.02k
  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1095
4.02k
  return convolve_2tap_ssse3(&ss, coeffs);
1096
4.02k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_2x2_ssse3
convolve_avx2.c:y_convolve_2tap_2x2_ssse3
Line
Count
Source
1087
4.02k
                                                __m128i s_16[2]) {
1088
4.02k
  __m128i s_128[2];
1089
1090
4.02k
  s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src + stride));
1091
4.02k
  s_128[0] = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1092
4.02k
  s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src + 2 * stride));
1093
4.02k
  s_128[1] = _mm_unpacklo_epi16(s_16[1], s_16[0]);
1094
4.02k
  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1095
4.02k
  return convolve_2tap_ssse3(&ss, coeffs);
1096
4.02k
}
1097
1098
static inline __m128i y_convolve_2tap_4x2_ssse3(const uint8_t *const src,
1099
                                                const ptrdiff_t stride,
1100
                                                const __m128i coeffs[1],
1101
19.5k
                                                __m128i s_32[2]) {
1102
19.5k
  __m128i s_128[2];
1103
1104
19.5k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1105
19.5k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1106
19.5k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1107
19.5k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1108
19.5k
  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1109
19.5k
  return convolve_2tap_ssse3(&ss, coeffs);
1110
19.5k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_4x2_ssse3
convolve_avx2.c:y_convolve_2tap_4x2_ssse3
Line
Count
Source
1101
19.5k
                                                __m128i s_32[2]) {
1102
19.5k
  __m128i s_128[2];
1103
1104
19.5k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1105
19.5k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1106
19.5k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1107
19.5k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1108
19.5k
  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1109
19.5k
  return convolve_2tap_ssse3(&ss, coeffs);
1110
19.5k
}
1111
1112
static inline __m256i y_convolve_2tap_8x2_avx2(const uint8_t *const src,
1113
                                               const ptrdiff_t stride,
1114
                                               const __m256i coeffs[1],
1115
0
                                               __m128i s_64[2]) {
1116
0
  __m256i s_256[2];
1117
0
1118
0
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + stride));
1119
0
  s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
1120
0
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1121
0
  s_256[1] = _mm256_setr_m128i(s_64[1], s_64[0]);
1122
0
  const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1123
0
  return convolve_2tap_avx2(&ss, coeffs);
1124
0
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_8x2_avx2
Unexecuted instantiation: convolve_avx2.c:y_convolve_2tap_8x2_avx2
1125
1126
static inline void y_convolve_2tap_16x2_avx2(const uint8_t *const src,
1127
                                             const ptrdiff_t stride,
1128
                                             const __m256i coeffs[1],
1129
16.8k
                                             __m128i s_128[2], __m256i r[2]) {
1130
16.8k
  __m256i s_256[2];
1131
1132
16.8k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + stride));
1133
16.8k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1134
16.8k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1135
16.8k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1136
16.8k
  const __m256i ss0 = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1137
16.8k
  const __m256i ss1 = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1138
16.8k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1139
16.8k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1140
16.8k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_16x2_avx2
convolve_avx2.c:y_convolve_2tap_16x2_avx2
Line
Count
Source
1129
16.8k
                                             __m128i s_128[2], __m256i r[2]) {
1130
16.8k
  __m256i s_256[2];
1131
1132
16.8k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + stride));
1133
16.8k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1134
16.8k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1135
16.8k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1136
16.8k
  const __m256i ss0 = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1137
16.8k
  const __m256i ss1 = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1138
16.8k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1139
16.8k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1140
16.8k
}
1141
1142
static inline void y_convolve_2tap_32_avx2(const uint8_t *const src,
1143
                                           const __m256i coeffs[1],
1144
                                           const __m256i s0, __m256i *const s1,
1145
162k
                                           __m256i r[2]) {
1146
162k
  *s1 = _mm256_loadu_si256((__m256i *)src);
1147
162k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, *s1);
1148
162k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, *s1);
1149
162k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1150
162k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1151
162k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_32_avx2
convolve_avx2.c:y_convolve_2tap_32_avx2
Line
Count
Source
1145
162k
                                           __m256i r[2]) {
1146
162k
  *s1 = _mm256_loadu_si256((__m256i *)src);
1147
162k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, *s1);
1148
162k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, *s1);
1149
162k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1150
162k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1151
162k
}
1152
1153
static inline __m128i y_convolve_4tap_2x2_ssse3(const uint8_t *const src,
1154
                                                const ptrdiff_t stride,
1155
                                                const __m128i coeffs[2],
1156
                                                __m128i s_16[4],
1157
43.3k
                                                __m128i ss_128[2]) {
1158
43.3k
  s_16[3] = _mm_cvtsi32_si128(loadu_int16(src + stride));
1159
43.3k
  const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1160
43.3k
  s_16[2] = _mm_cvtsi32_si128(loadu_int16(src + 2 * stride));
1161
43.3k
  const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[2]);
1162
43.3k
  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1163
43.3k
  return convolve_4tap_ssse3(ss_128, coeffs);
1164
43.3k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_2x2_ssse3
convolve_avx2.c:y_convolve_4tap_2x2_ssse3
Line
Count
Source
1157
43.3k
                                                __m128i ss_128[2]) {
1158
43.3k
  s_16[3] = _mm_cvtsi32_si128(loadu_int16(src + stride));
1159
43.3k
  const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1160
43.3k
  s_16[2] = _mm_cvtsi32_si128(loadu_int16(src + 2 * stride));
1161
43.3k
  const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[2]);
1162
43.3k
  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1163
43.3k
  return convolve_4tap_ssse3(ss_128, coeffs);
1164
43.3k
}
1165
1166
static inline __m128i y_convolve_4tap_4x2_ssse3(const uint8_t *const src,
1167
                                                const ptrdiff_t stride,
1168
                                                const __m128i coeffs[2],
1169
                                                __m128i s_32[4],
1170
249k
                                                __m128i ss_128[2]) {
1171
249k
  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1172
249k
  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1173
249k
  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1174
249k
  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1175
249k
  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1176
249k
  return convolve_4tap_ssse3(ss_128, coeffs);
1177
249k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_4x2_ssse3
convolve_avx2.c:y_convolve_4tap_4x2_ssse3
Line
Count
Source
1170
249k
                                                __m128i ss_128[2]) {
1171
249k
  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1172
249k
  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1173
249k
  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1174
249k
  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1175
249k
  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1176
249k
  return convolve_4tap_ssse3(ss_128, coeffs);
1177
249k
}
1178
1179
static inline __m256i y_convolve_4tap_8x2_avx2(const uint8_t *const src,
1180
                                               const ptrdiff_t stride,
1181
                                               const __m256i coeffs[2],
1182
                                               __m128i s_64[4],
1183
193k
                                               __m256i ss_256[2]) {
1184
193k
  s_64[3] = _mm_loadl_epi64((__m128i *)(src + stride));
1185
193k
  const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1186
193k
  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1187
193k
  const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[2]);
1188
193k
  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1189
193k
  return convolve_4tap_avx2(ss_256, coeffs);
1190
193k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_8x2_avx2
convolve_avx2.c:y_convolve_4tap_8x2_avx2
Line
Count
Source
1183
193k
                                               __m256i ss_256[2]) {
1184
193k
  s_64[3] = _mm_loadl_epi64((__m128i *)(src + stride));
1185
193k
  const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1186
193k
  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1187
193k
  const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[2]);
1188
193k
  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1189
193k
  return convolve_4tap_avx2(ss_256, coeffs);
1190
193k
}
1191
1192
static inline void y_convolve_4tap_16x2_avx2(const uint8_t *const src,
1193
                                             const ptrdiff_t stride,
1194
                                             const __m256i coeffs[2],
1195
                                             __m128i s_128[4],
1196
109k
                                             __m256i ss_256[4], __m256i r[2]) {
1197
109k
  s_128[3] = _mm_loadu_si128((__m128i *)(src + stride));
1198
109k
  const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1199
109k
  s_128[2] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1200
109k
  const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[2]);
1201
109k
  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1202
109k
  ss_256[3] = _mm256_unpackhi_epi8(src23, src34);
1203
109k
  r[0] = convolve_4tap_avx2(ss_256, coeffs);
1204
109k
  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1205
109k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_16x2_avx2
convolve_avx2.c:y_convolve_4tap_16x2_avx2
Line
Count
Source
1196
109k
                                             __m256i ss_256[4], __m256i r[2]) {
1197
109k
  s_128[3] = _mm_loadu_si128((__m128i *)(src + stride));
1198
109k
  const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1199
109k
  s_128[2] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1200
109k
  const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[2]);
1201
109k
  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1202
109k
  ss_256[3] = _mm256_unpackhi_epi8(src23, src34);
1203
109k
  r[0] = convolve_4tap_avx2(ss_256, coeffs);
1204
109k
  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1205
109k
}
1206
1207
static inline __m128i y_convolve_6tap_2x2_ssse3(const uint8_t *const src,
1208
                                                const ptrdiff_t stride,
1209
                                                const __m128i coeffs[3],
1210
                                                __m128i s_16[6],
1211
71.4k
                                                __m128i ss_128[3]) {
1212
71.4k
  s_16[5] = _mm_cvtsi32_si128(loadu_int16(src + 3 * stride));
1213
71.4k
  const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
1214
71.4k
  s_16[4] = _mm_cvtsi32_si128(loadu_int16(src + 4 * stride));
1215
71.4k
  const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[4]);
1216
71.4k
  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1217
71.4k
  return convolve_6tap_ssse3(ss_128, coeffs);
1218
71.4k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_2x2_ssse3
convolve_avx2.c:y_convolve_6tap_2x2_ssse3
Line
Count
Source
1211
71.4k
                                                __m128i ss_128[3]) {
1212
71.4k
  s_16[5] = _mm_cvtsi32_si128(loadu_int16(src + 3 * stride));
1213
71.4k
  const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
1214
71.4k
  s_16[4] = _mm_cvtsi32_si128(loadu_int16(src + 4 * stride));
1215
71.4k
  const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[4]);
1216
71.4k
  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1217
71.4k
  return convolve_6tap_ssse3(ss_128, coeffs);
1218
71.4k
}
1219
1220
static inline void y_convolve_4tap_32x2_avx2(
1221
    const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[2],
1222
145k
    __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], __m256i r[4]) {
1223
145k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
1224
145k
  ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1225
145k
  ss_256[3] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1226
145k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
1227
145k
  tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[2]);
1228
145k
  tt_256[3] = _mm256_unpackhi_epi8(s_256[3], s_256[2]);
1229
145k
  r[0] = convolve_4tap_avx2(ss_256 + 0, coeffs);
1230
145k
  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1231
145k
  r[2] = convolve_4tap_avx2(tt_256 + 0, coeffs);
1232
145k
  r[3] = convolve_4tap_avx2(tt_256 + 2, coeffs);
1233
145k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_32x2_avx2
convolve_avx2.c:y_convolve_4tap_32x2_avx2
Line
Count
Source
1222
145k
    __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], __m256i r[4]) {
1223
145k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
1224
145k
  ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1225
145k
  ss_256[3] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1226
145k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
1227
145k
  tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[2]);
1228
145k
  tt_256[3] = _mm256_unpackhi_epi8(s_256[3], s_256[2]);
1229
145k
  r[0] = convolve_4tap_avx2(ss_256 + 0, coeffs);
1230
145k
  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1231
145k
  r[2] = convolve_4tap_avx2(tt_256 + 0, coeffs);
1232
145k
  r[3] = convolve_4tap_avx2(tt_256 + 2, coeffs);
1233
145k
}
1234
1235
static inline __m128i y_convolve_6tap_4x2_ssse3(const uint8_t *const src,
1236
                                                const ptrdiff_t stride,
1237
                                                const __m128i coeffs[3],
1238
                                                __m128i s_32[6],
1239
395k
                                                __m128i ss_128[3]) {
1240
395k
  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 3 * stride));
1241
395k
  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1242
395k
  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 4 * stride));
1243
395k
  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1244
395k
  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1245
395k
  return convolve_6tap_ssse3(ss_128, coeffs);
1246
395k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_4x2_ssse3
convolve_avx2.c:y_convolve_6tap_4x2_ssse3
Line
Count
Source
1239
395k
                                                __m128i ss_128[3]) {
1240
395k
  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 3 * stride));
1241
395k
  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1242
395k
  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 4 * stride));
1243
395k
  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1244
395k
  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1245
395k
  return convolve_6tap_ssse3(ss_128, coeffs);
1246
395k
}
1247
1248
static inline __m256i y_convolve_6tap_8x2_avx2(const uint8_t *const src,
1249
                                               const ptrdiff_t stride,
1250
                                               const __m256i coeffs[3],
1251
                                               __m128i s_64[6],
1252
529k
                                               __m256i ss_256[3]) {
1253
529k
  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 3 * stride));
1254
529k
  const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
1255
529k
  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 4 * stride));
1256
529k
  const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[4]);
1257
529k
  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1258
529k
  return convolve_6tap_avx2(ss_256, coeffs);
1259
529k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_8x2_avx2
convolve_avx2.c:y_convolve_6tap_8x2_avx2
Line
Count
Source
1252
529k
                                               __m256i ss_256[3]) {
1253
529k
  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 3 * stride));
1254
529k
  const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
1255
529k
  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 4 * stride));
1256
529k
  const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[4]);
1257
529k
  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1258
529k
  return convolve_6tap_avx2(ss_256, coeffs);
1259
529k
}
1260
1261
static inline void y_convolve_6tap_16x2_avx2(const uint8_t *const src,
1262
                                             const ptrdiff_t stride,
1263
                                             const __m256i coeffs[3],
1264
                                             __m128i s_128[6],
1265
466k
                                             __m256i ss_256[6], __m256i r[2]) {
1266
466k
  s_128[5] = _mm_loadu_si128((__m128i *)(src + 3 * stride));
1267
466k
  const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
1268
466k
  s_128[4] = _mm_loadu_si128((__m128i *)(src + 4 * stride));
1269
466k
  const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[4]);
1270
466k
  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1271
466k
  ss_256[5] = _mm256_unpackhi_epi8(src45, src56);
1272
466k
  r[0] = convolve_6tap_avx2(ss_256, coeffs);
1273
466k
  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1274
466k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_16x2_avx2
convolve_avx2.c:y_convolve_6tap_16x2_avx2
Line
Count
Source
1265
466k
                                             __m256i ss_256[6], __m256i r[2]) {
1266
466k
  s_128[5] = _mm_loadu_si128((__m128i *)(src + 3 * stride));
1267
466k
  const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
1268
466k
  s_128[4] = _mm_loadu_si128((__m128i *)(src + 4 * stride));
1269
466k
  const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[4]);
1270
466k
  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1271
466k
  ss_256[5] = _mm256_unpackhi_epi8(src45, src56);
1272
466k
  r[0] = convolve_6tap_avx2(ss_256, coeffs);
1273
466k
  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1274
466k
}
1275
1276
static inline void y_convolve_6tap_32x2_avx2(
1277
    const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[3],
1278
587k
    __m256i s_256[6], __m256i ss_256[6], __m256i tt_256[6], __m256i r[4]) {
1279
587k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1280
587k
  ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
1281
587k
  ss_256[5] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
1282
587k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1283
587k
  tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[4]);
1284
587k
  tt_256[5] = _mm256_unpackhi_epi8(s_256[5], s_256[4]);
1285
587k
  r[0] = convolve_6tap_avx2(ss_256 + 0, coeffs);
1286
587k
  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1287
587k
  r[2] = convolve_6tap_avx2(tt_256 + 0, coeffs);
1288
587k
  r[3] = convolve_6tap_avx2(tt_256 + 3, coeffs);
1289
587k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_32x2_avx2
convolve_avx2.c:y_convolve_6tap_32x2_avx2
Line
Count
Source
1278
587k
    __m256i s_256[6], __m256i ss_256[6], __m256i tt_256[6], __m256i r[4]) {
1279
587k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1280
587k
  ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
1281
587k
  ss_256[5] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
1282
587k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1283
587k
  tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[4]);
1284
587k
  tt_256[5] = _mm256_unpackhi_epi8(s_256[5], s_256[4]);
1285
587k
  r[0] = convolve_6tap_avx2(ss_256 + 0, coeffs);
1286
587k
  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1287
587k
  r[2] = convolve_6tap_avx2(tt_256 + 0, coeffs);
1288
587k
  r[3] = convolve_6tap_avx2(tt_256 + 3, coeffs);
1289
587k
}
1290
1291
static inline __m128i y_convolve_8tap_2x2_ssse3(const uint8_t *const src,
1292
                                                const ptrdiff_t stride,
1293
                                                const __m128i coeffs[4],
1294
                                                __m128i s_16[8],
1295
5.41k
                                                __m128i ss_128[4]) {
1296
5.41k
  s_16[7] = _mm_cvtsi32_si128(loadu_int16(src + 7 * stride));
1297
5.41k
  const __m128i src67 = _mm_unpacklo_epi16(s_16[6], s_16[7]);
1298
5.41k
  s_16[6] = _mm_cvtsi32_si128(loadu_int16(src + 8 * stride));
1299
5.41k
  const __m128i src78 = _mm_unpacklo_epi16(s_16[7], s_16[6]);
1300
5.41k
  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1301
5.41k
  return convolve_8tap_ssse3(ss_128, coeffs);
1302
5.41k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_2x2_ssse3
convolve_avx2.c:y_convolve_8tap_2x2_ssse3
Line
Count
Source
1295
5.41k
                                                __m128i ss_128[4]) {
1296
5.41k
  s_16[7] = _mm_cvtsi32_si128(loadu_int16(src + 7 * stride));
1297
5.41k
  const __m128i src67 = _mm_unpacklo_epi16(s_16[6], s_16[7]);
1298
5.41k
  s_16[6] = _mm_cvtsi32_si128(loadu_int16(src + 8 * stride));
1299
5.41k
  const __m128i src78 = _mm_unpacklo_epi16(s_16[7], s_16[6]);
1300
5.41k
  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1301
5.41k
  return convolve_8tap_ssse3(ss_128, coeffs);
1302
5.41k
}
1303
1304
static inline __m128i y_convolve_8tap_4x2_ssse3(const uint8_t *const src,
1305
                                                const ptrdiff_t stride,
1306
                                                const __m128i coeffs[4],
1307
                                                __m128i s_32[8],
1308
20.1k
                                                __m128i ss_128[4]) {
1309
20.1k
  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * stride));
1310
20.1k
  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1311
20.1k
  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * stride));
1312
20.1k
  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1313
20.1k
  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1314
20.1k
  return convolve_8tap_ssse3(ss_128, coeffs);
1315
20.1k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_4x2_ssse3
convolve_avx2.c:y_convolve_8tap_4x2_ssse3
Line
Count
Source
1308
20.1k
                                                __m128i ss_128[4]) {
1309
20.1k
  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * stride));
1310
20.1k
  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1311
20.1k
  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * stride));
1312
20.1k
  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1313
20.1k
  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1314
20.1k
  return convolve_8tap_ssse3(ss_128, coeffs);
1315
20.1k
}
1316
1317
static inline __m256i y_convolve_8tap_8x2_avx2(const uint8_t *const src,
1318
                                               const ptrdiff_t stride,
1319
                                               const __m256i coeffs[4],
1320
                                               __m128i s_64[8],
1321
24.8k
                                               __m256i ss_256[4]) {
1322
24.8k
  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * stride));
1323
24.8k
  const __m256i src67 = _mm256_setr_m128i(s_64[6], s_64[7]);
1324
24.8k
  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * stride));
1325
24.8k
  const __m256i src78 = _mm256_setr_m128i(s_64[7], s_64[6]);
1326
24.8k
  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1327
24.8k
  return convolve_8tap_avx2(ss_256, coeffs);
1328
24.8k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_8x2_avx2
convolve_avx2.c:y_convolve_8tap_8x2_avx2
Line
Count
Source
1321
24.8k
                                               __m256i ss_256[4]) {
1322
24.8k
  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * stride));
1323
24.8k
  const __m256i src67 = _mm256_setr_m128i(s_64[6], s_64[7]);
1324
24.8k
  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * stride));
1325
24.8k
  const __m256i src78 = _mm256_setr_m128i(s_64[7], s_64[6]);
1326
24.8k
  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1327
24.8k
  return convolve_8tap_avx2(ss_256, coeffs);
1328
24.8k
}
1329
1330
static inline void y_convolve_8tap_16x2_avx2(const uint8_t *const src,
1331
                                             const ptrdiff_t stride,
1332
                                             const __m256i coeffs[4],
1333
                                             __m128i s_128[8],
1334
26.2k
                                             __m256i ss_256[8], __m256i r[2]) {
1335
26.2k
  s_128[7] = _mm_loadu_si128((__m128i *)(src + 7 * stride));
1336
26.2k
  const __m256i src67 = _mm256_setr_m128i(s_128[6], s_128[7]);
1337
26.2k
  s_128[6] = _mm_loadu_si128((__m128i *)(src + 8 * stride));
1338
26.2k
  const __m256i src78 = _mm256_setr_m128i(s_128[7], s_128[6]);
1339
26.2k
  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1340
26.2k
  ss_256[7] = _mm256_unpackhi_epi8(src67, src78);
1341
26.2k
  r[0] = convolve_8tap_avx2(ss_256, coeffs);
1342
26.2k
  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1343
26.2k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_16x2_avx2
convolve_avx2.c:y_convolve_8tap_16x2_avx2
Line
Count
Source
1334
26.2k
                                             __m256i ss_256[8], __m256i r[2]) {
1335
26.2k
  s_128[7] = _mm_loadu_si128((__m128i *)(src + 7 * stride));
1336
26.2k
  const __m256i src67 = _mm256_setr_m128i(s_128[6], s_128[7]);
1337
26.2k
  s_128[6] = _mm_loadu_si128((__m128i *)(src + 8 * stride));
1338
26.2k
  const __m256i src78 = _mm256_setr_m128i(s_128[7], s_128[6]);
1339
26.2k
  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1340
26.2k
  ss_256[7] = _mm256_unpackhi_epi8(src67, src78);
1341
26.2k
  r[0] = convolve_8tap_avx2(ss_256, coeffs);
1342
26.2k
  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1343
26.2k
}
1344
1345
static inline void y_convolve_8tap_32x2_avx2(
1346
    const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1347
125k
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1348
125k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1349
125k
  ss_256[3] = _mm256_unpacklo_epi8(s_256[6], s_256[7]);
1350
125k
  ss_256[7] = _mm256_unpackhi_epi8(s_256[6], s_256[7]);
1351
125k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1352
125k
  tt_256[3] = _mm256_unpacklo_epi8(s_256[7], s_256[6]);
1353
125k
  tt_256[7] = _mm256_unpackhi_epi8(s_256[7], s_256[6]);
1354
125k
  r[0] = convolve_8tap_avx2(ss_256 + 0, coeffs);
1355
125k
  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1356
125k
  r[2] = convolve_8tap_avx2(tt_256 + 0, coeffs);
1357
125k
  r[3] = convolve_8tap_avx2(tt_256 + 4, coeffs);
1358
125k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_32x2_avx2
convolve_avx2.c:y_convolve_8tap_32x2_avx2
Line
Count
Source
1347
125k
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1348
125k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1349
125k
  ss_256[3] = _mm256_unpacklo_epi8(s_256[6], s_256[7]);
1350
125k
  ss_256[7] = _mm256_unpackhi_epi8(s_256[6], s_256[7]);
1351
125k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1352
125k
  tt_256[3] = _mm256_unpacklo_epi8(s_256[7], s_256[6]);
1353
125k
  tt_256[7] = _mm256_unpackhi_epi8(s_256[7], s_256[6]);
1354
125k
  r[0] = convolve_8tap_avx2(ss_256 + 0, coeffs);
1355
125k
  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1356
125k
  r[2] = convolve_8tap_avx2(tt_256 + 0, coeffs);
1357
125k
  r[3] = convolve_8tap_avx2(tt_256 + 4, coeffs);
1358
125k
}
1359
1360
static inline void xy_x_convolve_2tap_32_avx2(const uint8_t *const src,
1361
                                              const __m256i coeffs[1],
1362
418k
                                              __m256i r[2]) {
1363
418k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
1364
418k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
1365
418k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
1366
418k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
1367
1368
418k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1369
418k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1370
418k
}
convolve_2d_avx2.c:xy_x_convolve_2tap_32_avx2
Line
Count
Source
1362
418k
                                              __m256i r[2]) {
1363
418k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
1364
418k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
1365
418k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
1366
418k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
1367
1368
418k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1369
418k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1370
418k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_convolve_2tap_32_avx2
1371
1372
static inline void xy_x_2tap_32_avx2(const uint8_t *const src,
1373
                                     const __m256i coeffs[1],
1374
418k
                                     int16_t *const dst) {
1375
418k
  __m256i r[2];
1376
1377
418k
  xy_x_convolve_2tap_32_avx2(src, coeffs, r);
1378
418k
  const __m256i d0 = xy_x_round_avx2(r[0]);
1379
418k
  const __m256i d1 = xy_x_round_avx2(r[1]);
1380
418k
  _mm256_storeu_si256((__m256i *)dst, d0);
1381
418k
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1382
418k
}
convolve_2d_avx2.c:xy_x_2tap_32_avx2
Line
Count
Source
1374
418k
                                     int16_t *const dst) {
1375
418k
  __m256i r[2];
1376
1377
418k
  xy_x_convolve_2tap_32_avx2(src, coeffs, r);
1378
418k
  const __m256i d0 = xy_x_round_avx2(r[0]);
1379
418k
  const __m256i d1 = xy_x_round_avx2(r[1]);
1380
418k
  _mm256_storeu_si256((__m256i *)dst, d0);
1381
418k
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1382
418k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_2tap_32_avx2
1383
1384
static inline void xy_x_4tap_32_avx2(const uint8_t *const src,
1385
                                     const __m256i coeffs[2],
1386
                                     const __m256i filt[2],
1387
475k
                                     int16_t *const dst) {
1388
475k
  __m256i r[2];
1389
1390
475k
  x_convolve_4tap_32_avx2(src, coeffs, filt, r);
1391
475k
  const __m256i d0 = xy_x_round_avx2(r[0]);
1392
475k
  const __m256i d1 = xy_x_round_avx2(r[1]);
1393
475k
  _mm256_storeu_si256((__m256i *)dst, d0);
1394
475k
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1395
475k
}
convolve_2d_avx2.c:xy_x_4tap_32_avx2
Line
Count
Source
1387
475k
                                     int16_t *const dst) {
1388
475k
  __m256i r[2];
1389
1390
475k
  x_convolve_4tap_32_avx2(src, coeffs, filt, r);
1391
475k
  const __m256i d0 = xy_x_round_avx2(r[0]);
1392
475k
  const __m256i d1 = xy_x_round_avx2(r[1]);
1393
475k
  _mm256_storeu_si256((__m256i *)dst, d0);
1394
475k
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1395
475k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_4tap_32_avx2
1396
1397
static inline void xy_x_6tap_32_avx2(const uint8_t *const src,
1398
                                     const __m256i coeffs[3],
1399
                                     const __m256i filt[3],
1400
3.16M
                                     int16_t *const dst) {
1401
3.16M
  __m256i r[2];
1402
1403
3.16M
  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
1404
3.16M
  const __m256i d0 = xy_x_round_avx2(r[0]);
1405
3.16M
  const __m256i d1 = xy_x_round_avx2(r[1]);
1406
3.16M
  _mm256_storeu_si256((__m256i *)dst, d0);
1407
3.16M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1408
3.16M
}
convolve_2d_avx2.c:xy_x_6tap_32_avx2
Line
Count
Source
1400
3.16M
                                     int16_t *const dst) {
1401
3.16M
  __m256i r[2];
1402
1403
3.16M
  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
1404
3.16M
  const __m256i d0 = xy_x_round_avx2(r[0]);
1405
3.16M
  const __m256i d1 = xy_x_round_avx2(r[1]);
1406
3.16M
  _mm256_storeu_si256((__m256i *)dst, d0);
1407
3.16M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1408
3.16M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_6tap_32_avx2
1409
1410
static inline void xy_x_8tap_32_avx2(const uint8_t *const src,
1411
                                     const __m256i coeffs[4],
1412
                                     const __m256i filt[4],
1413
1.28M
                                     int16_t *const dst) {
1414
1.28M
  __m256i r[2];
1415
1416
1.28M
  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
1417
1.28M
  const __m256i d0 = xy_x_round_avx2(r[0]);
1418
1.28M
  const __m256i d1 = xy_x_round_avx2(r[1]);
1419
1.28M
  _mm256_storeu_si256((__m256i *)dst, d0);
1420
1.28M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1421
1.28M
}
convolve_2d_avx2.c:xy_x_8tap_32_avx2
Line
Count
Source
1413
1.28M
                                     int16_t *const dst) {
1414
1.28M
  __m256i r[2];
1415
1416
1.28M
  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
1417
1.28M
  const __m256i d0 = xy_x_round_avx2(r[0]);
1418
1.28M
  const __m256i d1 = xy_x_round_avx2(r[1]);
1419
1.28M
  _mm256_storeu_si256((__m256i *)dst, d0);
1420
1.28M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1421
1.28M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_8tap_32_avx2
1422
1423
static inline __m128i xy_y_convolve_2tap_2x2_sse2(const int16_t *const src,
1424
                                                  __m128i s_32[2],
1425
13.2k
                                                  const __m128i coeffs[1]) {
1426
13.2k
  __m128i s_128[2];
1427
1428
13.2k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1429
13.2k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1430
13.2k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1431
13.2k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1432
13.2k
  const __m128i ss = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1433
13.2k
  return convolve16_2tap_sse2(&ss, coeffs);
1434
13.2k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_2x2_sse2
Line
Count
Source
1425
13.2k
                                                  const __m128i coeffs[1]) {
1426
13.2k
  __m128i s_128[2];
1427
1428
13.2k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1429
13.2k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1430
13.2k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1431
13.2k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1432
13.2k
  const __m128i ss = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1433
13.2k
  return convolve16_2tap_sse2(&ss, coeffs);
1434
13.2k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_2x2_sse2
1435
1436
static inline __m128i xy_y_convolve_2tap_2x2_half_pel_sse2(
1437
5.72k
    const int16_t *const src, __m128i s_32[2]) {
1438
5.72k
  __m128i s_128[2];
1439
1440
5.72k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1441
5.72k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1442
5.72k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1443
5.72k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1444
5.72k
  return _mm_add_epi16(s_128[0], s_128[1]);
1445
5.72k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_2x2_half_pel_sse2
Line
Count
Source
1437
5.72k
    const int16_t *const src, __m128i s_32[2]) {
1438
5.72k
  __m128i s_128[2];
1439
1440
5.72k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1441
5.72k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1442
5.72k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1443
5.72k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1444
5.72k
  return _mm_add_epi16(s_128[0], s_128[1]);
1445
5.72k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_2x2_half_pel_sse2
1446
1447
static inline void xy_y_convolve_2tap_4x2_sse2(const int16_t *const src,
1448
                                               __m128i s_64[2],
1449
                                               const __m128i coeffs[1],
1450
57.1k
                                               __m128i r[2]) {
1451
57.1k
  __m128i s_128[2];
1452
1453
57.1k
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1454
57.1k
  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1455
57.1k
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1456
57.1k
  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1457
57.1k
  const __m128i ss0 = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1458
57.1k
  const __m128i ss1 = _mm_unpackhi_epi16(s_128[0], s_128[1]);
1459
57.1k
  r[0] = convolve16_2tap_sse2(&ss0, coeffs);
1460
57.1k
  r[1] = convolve16_2tap_sse2(&ss1, coeffs);
1461
57.1k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_4x2_sse2
Line
Count
Source
1450
57.1k
                                               __m128i r[2]) {
1451
57.1k
  __m128i s_128[2];
1452
1453
57.1k
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1454
57.1k
  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1455
57.1k
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1456
57.1k
  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1457
57.1k
  const __m128i ss0 = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1458
57.1k
  const __m128i ss1 = _mm_unpackhi_epi16(s_128[0], s_128[1]);
1459
57.1k
  r[0] = convolve16_2tap_sse2(&ss0, coeffs);
1460
57.1k
  r[1] = convolve16_2tap_sse2(&ss1, coeffs);
1461
57.1k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_4x2_sse2
1462
1463
static inline __m128i xy_y_convolve_2tap_4x2_half_pel_sse2(
1464
28.5k
    const int16_t *const src, __m128i s_64[2]) {
1465
28.5k
  __m128i s_128[2];
1466
1467
28.5k
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1468
28.5k
  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1469
28.5k
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1470
28.5k
  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1471
28.5k
  return _mm_add_epi16(s_128[0], s_128[1]);
1472
28.5k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_4x2_half_pel_sse2
Line
Count
Source
1464
28.5k
    const int16_t *const src, __m128i s_64[2]) {
1465
28.5k
  __m128i s_128[2];
1466
1467
28.5k
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1468
28.5k
  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1469
28.5k
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1470
28.5k
  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1471
28.5k
  return _mm_add_epi16(s_128[0], s_128[1]);
1472
28.5k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_4x2_half_pel_sse2
1473
1474
static inline void xy_y_convolve_2tap_16_avx2(const __m256i s0,
1475
                                              const __m256i s1,
1476
                                              const __m256i coeffs[1],
1477
704k
                                              __m256i r[2]) {
1478
704k
  const __m256i ss0 = _mm256_unpacklo_epi16(s0, s1);
1479
704k
  const __m256i ss1 = _mm256_unpackhi_epi16(s0, s1);
1480
704k
  r[0] = convolve16_2tap_avx2(&ss0, coeffs);
1481
704k
  r[1] = convolve16_2tap_avx2(&ss1, coeffs);
1482
704k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_16_avx2
Line
Count
Source
1477
704k
                                              __m256i r[2]) {
1478
704k
  const __m256i ss0 = _mm256_unpacklo_epi16(s0, s1);
1479
704k
  const __m256i ss1 = _mm256_unpackhi_epi16(s0, s1);
1480
704k
  r[0] = convolve16_2tap_avx2(&ss0, coeffs);
1481
704k
  r[1] = convolve16_2tap_avx2(&ss1, coeffs);
1482
704k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_16_avx2
1483
1484
// 2-tap vertical step producing two output rows of 8 samples.
//
// src    - int16_t buffer; 8 samples are read at src + 8 and 8 more at
//          src + 16 (presumably consecutive rows of an 8-wide buffer).
// s_128  - carried state. On entry s_128[0] must hold the group preceding
//          src + 8. On return s_128[1] holds [src + 8] and s_128[0] holds
//          [src + 16].
// coeffs - forwarded to xy_y_convolve_2tap_16_avx2().
// r      - output of xy_y_convolve_2tap_16_avx2() for the two row pairs.
//
// The first 256-bit vector is {entry s_128[0], [src + 8]} and the second is
// {[src + 8], [src + 16]}, so each 128-bit lane pairs one group with the next.
static inline void xy_y_convolve_2tap_8x2_avx2(const int16_t *const src,
1485
                                               __m128i s_128[2],
1486
                                               const __m256i coeffs[1],
1487
49.2k
                                               __m256i r[2]) {
1488
49.2k
  __m256i s_256[2];
1489
49.2k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1490
49.2k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1491
49.2k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1492
49.2k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1493
49.2k
  xy_y_convolve_2tap_16_avx2(s_256[0], s_256[1], coeffs, r);
1494
49.2k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_8x2_avx2
Line
Count
Source
1487
49.2k
                                               __m256i r[2]) {
1488
49.2k
  __m256i s_256[2];
1489
49.2k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1490
49.2k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1491
49.2k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1492
49.2k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1493
49.2k
  xy_y_convolve_2tap_16_avx2(s_256[0], s_256[1], coeffs, r);
1494
49.2k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_8x2_avx2
1495
1496
// Half-pel 2-tap vertical step for two rows of 8 samples: instead of a
// coefficient multiply, adjacent groups are simply added.
//
// src   - int16_t buffer; 8 samples are read at src + 8 and at src + 16.
// s_128 - carried state. On entry s_128[0] must hold the group preceding
//         src + 8. On return s_128[1] holds [src + 8] and s_128[0] holds
//         [src + 16].
//
// Returns a vector whose low 128 bits are (entry s_128[0]) + [src + 8] and
// whose high 128 bits are [src + 8] + [src + 16], as 16-bit lane sums. No
// rounding is applied here.
static inline __m256i xy_y_convolve_2tap_8x2_half_pel_avx2(
1497
31.4k
    const int16_t *const src, __m128i s_128[2]) {
1498
31.4k
  __m256i s_256[2];
1499
31.4k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1500
31.4k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1501
31.4k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1502
31.4k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1503
31.4k
  return _mm256_add_epi16(s_256[0], s_256[1]);
1504
31.4k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_8x2_half_pel_avx2
Line
Count
Source
1497
31.4k
    const int16_t *const src, __m128i s_128[2]) {
1498
31.4k
  __m256i s_256[2];
1499
31.4k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1500
31.4k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1501
31.4k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1502
31.4k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1503
31.4k
  return _mm256_add_epi16(s_256[0], s_256[1]);
1504
31.4k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_8x2_half_pel_avx2
1505
1506
// Half-pel 2-tap vertical step for two rows of 16 samples.
//
// src   - int16_t buffer; 16 samples are read at src + 16 and at src + 32.
// s_256 - carried state. On entry s_256[0] must hold the vector preceding
//         src + 16. On return s_256[1] holds [src + 16] and s_256[0] holds
//         [src + 32].
// r     - output: r[0] = (entry s_256[0]) + [src + 16],
//         r[1] = [src + 16] + [src + 32], as 16-bit lane sums without
//         rounding.
static inline void xy_y_convolve_2tap_16x2_half_pel_avx2(
1507
18.5k
    const int16_t *const src, __m256i s_256[2], __m256i r[2]) {
1508
18.5k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1509
18.5k
  r[0] = _mm256_add_epi16(s_256[0], s_256[1]);
1510
18.5k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1511
18.5k
  r[1] = _mm256_add_epi16(s_256[1], s_256[0]);
1512
18.5k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_16x2_half_pel_avx2
Line
Count
Source
1507
18.5k
    const int16_t *const src, __m256i s_256[2], __m256i r[2]) {
1508
18.5k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1509
18.5k
  r[0] = _mm256_add_epi16(s_256[0], s_256[1]);
1510
18.5k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1511
18.5k
  r[1] = _mm256_add_epi16(s_256[1], s_256[0]);
1512
18.5k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_16x2_half_pel_avx2
1513
1514
// Packs two vectors of sixteen 16-bit values to 8 bits and stores them.
//
// r      - r[0] and r[1] each hold sixteen 16-bit values.
// dst    - destination pointer, forwarded to storeu_u8_16x2_avx2().
// stride - forwarded to storeu_u8_16x2_avx2() (presumably the distance
//          between the two stored rows -- the helper is defined elsewhere).
static inline void xy_y_store_16x2_avx2(const __m256i r[2], uint8_t *const dst,
1515
0
                                        const ptrdiff_t stride) {
1516
0
  // Saturating pack to unsigned bytes; it works per 128-bit lane, so the
  // result is ordered r0.lo, r1.lo | r0.hi, r1.hi.
  const __m256i t = _mm256_packus_epi16(r[0], r[1]);
1517
0
  // 0xD8 selects 64-bit quarters 0, 2, 1, 3: all of r[0] lands in the low
  // 128 bits and all of r[1] in the high 128 bits.
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
1518
0
  storeu_u8_16x2_avx2(d, dst, stride);
1519
0
}
Unexecuted instantiation: convolve_2d_avx2.c:xy_y_store_16x2_avx2
Unexecuted instantiation: convolve_avx2.c:xy_y_store_16x2_avx2
1520
1521
// 2-tap vertical step producing two output rows of 16 samples.
//
// src    - int16_t buffer; 16 samples are read at src + 16 and at src + 32.
// s      - carried state. On entry s[0] must hold the vector preceding
//          src + 16. On return s[1] holds [src + 16] and s[0] holds
//          [src + 32].
// coeffs - forwarded to xy_y_convolve_2tap_16_avx2().
// r      - output: r[0..1] come from the pair (entry s[0], [src + 16]) and
//          r[2..3] from the pair ([src + 16], [src + 32]).
static inline void xy_y_convolve_2tap_16x2_avx2(const int16_t *const src,
1522
                                                __m256i s[2],
1523
                                                const __m256i coeffs[1],
1524
31.2k
                                                __m256i r[4]) {
1525
31.2k
  s[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1526
31.2k
  xy_y_convolve_2tap_16_avx2(s[0], s[1], coeffs, r + 0);
1527
31.2k
  s[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1528
31.2k
  xy_y_convolve_2tap_16_avx2(s[1], s[0], coeffs, r + 2);
1529
31.2k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_16x2_avx2
Line
Count
Source
1524
31.2k
                                                __m256i r[4]) {
1525
31.2k
  s[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1526
31.2k
  xy_y_convolve_2tap_16_avx2(s[0], s[1], coeffs, r + 0);
1527
31.2k
  s[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1528
31.2k
  xy_y_convolve_2tap_16_avx2(s[1], s[0], coeffs, r + 2);
1529
31.2k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_16x2_avx2
1530
1531
// 2-tap vertical step for one output row of 32 samples.
//
// src    - int16_t buffer; 32 samples are read starting at src (two 256-bit
//          unaligned loads at src and src + 16).
// s0     - the two vectors to pair with the newly loaded ones (presumably the
//          previous row); read only.
// s1     - output: receives the 32 samples loaded from src.
// coeffs - forwarded to xy_y_convolve_2tap_16_avx2().
// r      - output: r[0..1] come from (s0[0], s1[0]) and r[2..3] from
//          (s0[1], s1[1]).
static inline void xy_y_convolve_2tap_32_avx2(const int16_t *const src,
1532
                                              const __m256i s0[2],
1533
                                              __m256i s1[2],
1534
                                              const __m256i coeffs[1],
1535
223k
                                              __m256i r[4]) {
1536
223k
  s1[0] = _mm256_loadu_si256((__m256i *)src);
1537
223k
  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1538
223k
  xy_y_convolve_2tap_16_avx2(s0[0], s1[0], coeffs, r + 0);
1539
223k
  xy_y_convolve_2tap_16_avx2(s0[1], s1[1], coeffs, r + 2);
1540
223k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_32_avx2
Line
Count
Source
1535
223k
                                              __m256i r[4]) {
1536
223k
  s1[0] = _mm256_loadu_si256((__m256i *)src);
1537
223k
  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1538
223k
  xy_y_convolve_2tap_16_avx2(s0[0], s1[0], coeffs, r + 0);
1539
223k
  xy_y_convolve_2tap_16_avx2(s0[1], s1[1], coeffs, r + 2);
1540
223k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_32_avx2
1541
1542
// Convolves one 32-sample row with xy_y_convolve_2tap_32_avx2() and hands the
// four result vectors to xy_y_round_store_32_avx2() for output to dst.
//
// src, s0, s1, coeffs - forwarded unchanged to xy_y_convolve_2tap_32_avx2();
//                       s1 receives the 32 samples loaded from src.
// dst                 - destination passed to xy_y_round_store_32_avx2()
//                       (defined elsewhere; presumably rounds, packs to 8 bits
//                       and stores 32 pixels).
static inline void xy_y_convolve_2tap_32_all_avx2(const int16_t *const src,
1543
                                                  const __m256i s0[2],
1544
                                                  __m256i s1[2],
1545
                                                  const __m256i coeffs[1],
1546
223k
                                                  uint8_t *const dst) {
1547
223k
  __m256i r[4];
1548
1549
223k
  xy_y_convolve_2tap_32_avx2(src, s0, s1, coeffs, r);
1550
223k
  xy_y_round_store_32_avx2(r + 0, r + 2, dst);
1551
223k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_32_all_avx2
Line
Count
Source
1546
223k
                                                  uint8_t *const dst) {
1547
223k
  __m256i r[4];
1548
1549
223k
  xy_y_convolve_2tap_32_avx2(src, s0, s1, coeffs, r);
1550
223k
  xy_y_round_store_32_avx2(r + 0, r + 2, dst);
1551
223k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_32_all_avx2
1552
1553
// Half-pel 2-tap vertical step for one row of 32 samples: the newly loaded
// row is added lane-wise to the supplied one.
//
// src - int16_t buffer; 32 samples are read starting at src.
// s0  - the two vectors to add to the loaded ones (presumably the previous
//       row); read only.
// s1  - output: receives the 32 samples loaded from src.
// r   - output: r[0] = s0[0] + s1[0], r[1] = s0[1] + s1[1] (16-bit lanes,
//       no rounding).
static inline void xy_y_convolve_2tap_half_pel_32_avx2(const int16_t *const src,
1554
                                                       const __m256i s0[2],
1555
                                                       __m256i s1[2],
1556
179k
                                                       __m256i r[2]) {
1557
179k
  s1[0] = _mm256_loadu_si256((__m256i *)src);
1558
179k
  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1559
179k
  r[0] = _mm256_add_epi16(s0[0], s1[0]);
1560
179k
  r[1] = _mm256_add_epi16(s0[1], s1[1]);
1561
179k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_half_pel_32_avx2
Line
Count
Source
1556
179k
                                                       __m256i r[2]) {
1557
179k
  s1[0] = _mm256_loadu_si256((__m256i *)src);
1558
179k
  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1559
179k
  r[0] = _mm256_add_epi16(s0[0], s1[0]);
1560
179k
  r[1] = _mm256_add_epi16(s0[1], s1[1]);
1561
179k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_half_pel_32_avx2
1562
1563
// Half-pel 2-tap vertical step for one row of 32 samples, followed by rounding
// and output to dst.
//
// src, s0, s1 - forwarded to xy_y_convolve_2tap_half_pel_32_avx2(); s1
//               receives the 32 samples loaded from src.
// dst         - destination passed to xy_y_pack_store_32_avx2().
//
// xy_y_round_half_pel_avx2() and xy_y_pack_store_32_avx2() are defined
// elsewhere; presumably they round/shift the sums and pack them to 8-bit
// pixels respectively.
static inline void xy_y_convolve_2tap_half_pel_32_all_avx2(
1564
    const int16_t *const src, const __m256i s0[2], __m256i s1[2],
1565
179k
    uint8_t *const dst) {
1566
179k
  __m256i r[2];
1567
1568
179k
  xy_y_convolve_2tap_half_pel_32_avx2(src, s0, s1, r);
1569
179k
  r[0] = xy_y_round_half_pel_avx2(r[0]);
1570
179k
  r[1] = xy_y_round_half_pel_avx2(r[1]);
1571
179k
  xy_y_pack_store_32_avx2(r[0], r[1], dst);
1572
179k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_half_pel_32_all_avx2
Line
Count
Source
1565
179k
    uint8_t *const dst) {
1566
179k
  __m256i r[2];
1567
1568
179k
  xy_y_convolve_2tap_half_pel_32_avx2(src, s0, s1, r);
1569
179k
  r[0] = xy_y_round_half_pel_avx2(r[0]);
1570
179k
  r[1] = xy_y_round_half_pel_avx2(r[1]);
1571
179k
  xy_y_pack_store_32_avx2(r[0], r[1], dst);
1572
179k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_half_pel_32_all_avx2
1573
1574
// 4-tap vertical step for a 2-wide, 2-row output.
//
// src    - int16_t buffer; 32 bits (two samples) are read at src + 6 and at
//          src + 8 (presumably rows 3 and 4 of a 2-wide buffer).
// s_32   - carried state. On entry s_32[2] must hold the 2-sample group
//          preceding src + 6. On return s_32[3] holds [src + 6] and s_32[2]
//          holds [src + 8]. s_32[0] and s_32[1] are not touched here.
// ss_128 - carried interleaved state. ss_128[0] must be valid on entry;
//          ss_128[1] is built here, and afterwards ss_128[0] is replaced by
//          ss_128[1] so it is ready for the next call.
// coeffs - forwarded to convolve16_4tap_sse2() (defined elsewhere).
//
// Returns the result of convolve16_4tap_sse2(ss_128, coeffs).
static inline __m128i xy_y_convolve_4tap_2x2_sse2(const int16_t *const src,
1575
                                                  __m128i s_32[4],
1576
                                                  __m128i ss_128[2],
1577
115k
                                                  const __m128i coeffs[2]) {
1578
115k
  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + 3 * 2));
1579
115k
  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1580
115k
  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 4 * 2));
1581
115k
  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1582
115k
  ss_128[1] = _mm_unpacklo_epi16(src23, src34);
1583
115k
  const __m128i r = convolve16_4tap_sse2(ss_128, coeffs);
1584
115k
  ss_128[0] = ss_128[1];
1585
115k
  return r;
1586
115k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_2x2_sse2
Line
Count
Source
1577
115k
                                                  const __m128i coeffs[2]) {
1578
115k
  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + 3 * 2));
1579
115k
  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1580
115k
  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 4 * 2));
1581
115k
  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1582
115k
  ss_128[1] = _mm_unpacklo_epi16(src23, src34);
1583
115k
  const __m128i r = convolve16_4tap_sse2(ss_128, coeffs);
1584
115k
  ss_128[0] = ss_128[1];
1585
115k
  return r;
1586
115k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_2x2_sse2
1587
1588
// 4-tap vertical step for a 4-wide, 2-row output.
//
// src    - int16_t buffer; 4 samples are read at src + 12 and at src + 16
//          (presumably rows 3 and 4 of a 4-wide buffer).
// s_64   - carried state. On entry s_64[2] must hold the group preceding
//          src + 12. On return s_64[3] holds [src + 12] and s_64[2] holds
//          [src + 16]. s_64[0] and s_64[1] are not touched here.
// ss_256 - carried interleaved state. ss_256[0] must be valid on entry;
//          ss_256[1] is built here, and afterwards ss_256[0] is replaced by
//          ss_256[1] for the next call.
// coeffs - forwarded to convolve16_4tap_avx2() (defined elsewhere).
//
// Returns the result of convolve16_4tap_avx2(ss_256, coeffs). Each 128-bit
// lane of the interleaved input pairs one group with the following one.
static inline __m256i xy_y_convolve_4tap_4x2_avx2(const int16_t *const src,
1589
                                                  __m128i s_64[4],
1590
                                                  __m256i ss_256[2],
1591
597k
                                                  const __m256i coeffs[2]) {
1592
597k
  __m256i s_256[2];
1593
597k
  s_64[3] = _mm_loadl_epi64((__m128i *)(src + 3 * 4));
1594
597k
  s_256[0] = _mm256_setr_m128i(s_64[2], s_64[3]);
1595
597k
  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 4 * 4));
1596
597k
  s_256[1] = _mm256_setr_m128i(s_64[3], s_64[2]);
1597
597k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1598
597k
  const __m256i r = convolve16_4tap_avx2(ss_256, coeffs);
1599
597k
  ss_256[0] = ss_256[1];
1600
597k
  return r;
1601
597k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_4x2_avx2
Line
Count
Source
1591
597k
                                                  const __m256i coeffs[2]) {
1592
597k
  __m256i s_256[2];
1593
597k
  s_64[3] = _mm_loadl_epi64((__m128i *)(src + 3 * 4));
1594
597k
  s_256[0] = _mm256_setr_m128i(s_64[2], s_64[3]);
1595
597k
  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 4 * 4));
1596
597k
  s_256[1] = _mm256_setr_m128i(s_64[3], s_64[2]);
1597
597k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1598
597k
  const __m256i r = convolve16_4tap_avx2(ss_256, coeffs);
1599
597k
  ss_256[0] = ss_256[1];
1600
597k
  return r;
1601
597k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_4x2_avx2
1602
1603
// Applies convolve16_4tap_avx2() to two groups of interleaved inputs.
//
// ss     - at least four vectors: ss[0..1] feed the first call, and the
//          second call starts at ss + 2. The callers in this file fill the
//          first pair from low-half interleaves and the second from high-half
//          interleaves.
// coeffs - forwarded to convolve16_4tap_avx2() (defined elsewhere).
// r      - output: r[0] from ss, r[1] from ss + 2.
static inline void xy_y_convolve_4tap_16_avx2(const __m256i *const ss,
1604
                                              const __m256i coeffs[2],
1605
2.32M
                                              __m256i r[2]) {
1606
2.32M
  r[0] = convolve16_4tap_avx2(ss, coeffs);
1607
2.32M
  r[1] = convolve16_4tap_avx2(ss + 2, coeffs);
1608
2.32M
}
convolve_2d_avx2.c:xy_y_convolve_4tap_16_avx2
Line
Count
Source
1605
2.32M
                                              __m256i r[2]) {
1606
2.32M
  r[0] = convolve16_4tap_avx2(ss, coeffs);
1607
2.32M
  r[1] = convolve16_4tap_avx2(ss + 2, coeffs);
1608
2.32M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_16_avx2
1609
1610
// 4-tap vertical step producing two output rows of 8 samples.
//
// src    - int16_t buffer; two overlapping 256-bit loads read 16 samples at
//          src + 16 and 16 samples at src + 24 (presumably rows 2-3 and rows
//          3-4 of an 8-wide buffer, so each 128-bit lane pairs a row with the
//          next one).
// ss_256 - carried interleaved state. ss_256[0] and ss_256[2] must be valid
//          on entry; ss_256[1] and ss_256[3] are built here from the low- and
//          high-half interleaves of the loaded vectors. After the convolution
//          the new entries are moved down into [0] and [2] for the next call.
// coeffs - forwarded to xy_y_convolve_4tap_16_avx2().
// r      - output of xy_y_convolve_4tap_16_avx2().
static inline void xy_y_convolve_4tap_8x2_avx2(const int16_t *const src,
1611
                                               __m256i ss_256[4],
1612
                                               const __m256i coeffs[2],
1613
381k
                                               __m256i r[2]) {
1614
381k
  __m256i s_256[2];
1615
381k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1616
381k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1617
381k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1618
381k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1619
381k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1620
381k
  ss_256[0] = ss_256[1];
1621
381k
  ss_256[2] = ss_256[3];
1622
381k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_8x2_avx2
Line
Count
Source
1613
381k
                                               __m256i r[2]) {
1614
381k
  __m256i s_256[2];
1615
381k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1616
381k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1617
381k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1618
381k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1619
381k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1620
381k
  ss_256[0] = ss_256[1];
1621
381k
  ss_256[2] = ss_256[3];
1622
381k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_8x2_avx2
1623
1624
// Half-pel 4-tap vertical step producing two output rows of 8 samples. The
// outer inputs (s_256[0], s_256[3]) and the inner inputs (s_256[1], s_256[2])
// are summed first and the two sums go through the 2-tap kernel -- presumably
// relying on the half-pel filter being symmetric (confirm against the caller's
// coefficient setup).
//
// src    - int16_t buffer; 16 samples are read at src + 16 and at src + 24.
// coeffs - forwarded to xy_y_convolve_2tap_16_avx2().
// s_256  - carried state. s_256[0] and s_256[1] must be valid on entry;
//          s_256[2] and s_256[3] are loaded here and then moved down into
//          [0] and [1] for the next call.
// r      - output of xy_y_convolve_2tap_16_avx2().
static inline void xy_y_convolve_4tap_8x2_half_pel_avx2(
1625
    const int16_t *const src, const __m256i coeffs[1], __m256i s_256[4],
1626
69.5k
    __m256i r[2]) {
1627
69.5k
  __m256i a_256[2];
1628
69.5k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1629
69.5k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1630
69.5k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1631
69.5k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1632
69.5k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r);
1633
69.5k
  s_256[0] = s_256[2];
1634
69.5k
  s_256[1] = s_256[3];
1635
69.5k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_8x2_half_pel_avx2
Line
Count
Source
1626
69.5k
    __m256i r[2]) {
1627
69.5k
  __m256i a_256[2];
1628
69.5k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1629
69.5k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1630
69.5k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1631
69.5k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1632
69.5k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r);
1633
69.5k
  s_256[0] = s_256[2];
1634
69.5k
  s_256[1] = s_256[3];
1635
69.5k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_8x2_half_pel_avx2
1636
1637
// 4-tap vertical step producing two output rows of 16 samples, with a fixed
// spacing of 16 samples between loads.
//
// src    - int16_t buffer; 16 samples are read at src + 48 and at src + 64.
// s_256  - carried state. On entry s_256[2] must hold the vector preceding
//          src + 48. On return s_256[3] holds [src + 48] and s_256[2] holds
//          [src + 64]. s_256[0] and s_256[1] are not touched here.
// ss_256 - interleaved state for the first output row; [0] and [2] must be
//          valid on entry, [1] and [3] are built here.
// tt_256 - interleaved state for the second output row, same layout.
// coeffs - forwarded to xy_y_convolve_4tap_16_avx2().
// r      - output: r[0..1] from ss_256, r[2..3] from tt_256.
//
// After the convolutions the newest interleaved pairs are moved down into
// slots [0] and [2] of ss_256 and tt_256 for the next call.
static inline void xy_y_convolve_4tap_16x2_avx2(
1638
    const int16_t *const src, __m256i s_256[4], __m256i ss_256[4],
1639
188k
    __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4]) {
1640
188k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1641
188k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1642
188k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1643
188k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1644
188k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1645
188k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1646
188k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1647
188k
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1648
188k
  ss_256[0] = ss_256[1];
1649
188k
  ss_256[2] = ss_256[3];
1650
188k
  tt_256[0] = tt_256[1];
1651
188k
  tt_256[2] = tt_256[3];
1652
188k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_16x2_avx2
Line
Count
Source
1639
188k
    __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4]) {
1640
188k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1641
188k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1642
188k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1643
188k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1644
188k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1645
188k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1646
188k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1647
188k
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1648
188k
  ss_256[0] = ss_256[1];
1649
188k
  ss_256[2] = ss_256[3];
1650
188k
  tt_256[0] = tt_256[1];
1651
188k
  tt_256[2] = tt_256[3];
1652
188k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_16x2_avx2
1653
1654
// 4-tap vertical step producing two output rows for one 16-sample column,
// with a caller-supplied spacing between loads. Despite the "32x2" name, each
// call loads and convolves 16 samples per row; presumably the caller invokes
// it once per 16-sample half of a wider block.
//
// src    - int16_t buffer; 16 samples are read at src + 3 * stride and at
//          src + 4 * stride.
// stride - distance between the loads, in int16_t elements.
// s_256  - carried state. On entry s_256[2] must hold the vector preceding
//          src + 3 * stride. On return s_256[3] holds [src + 3 * stride] and
//          s_256[2] holds [src + 4 * stride].
// ss_256 - interleaved state for the first output row; [0] and [2] must be
//          valid on entry, [1] and [3] are built here.
// tt_256 - interleaved state for the second output row, same layout.
// coeffs - forwarded to xy_y_convolve_4tap_16_avx2().
// r      - output: r[0..1] from ss_256, r[2..3] from tt_256.
static inline void xy_y_convolve_4tap_32x2_avx2(
1655
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[4],
1656
    __m256i ss_256[4], __m256i tt_256[4], const __m256i coeffs[2],
1657
407k
    __m256i r[4]) {
1658
407k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1659
407k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1660
407k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1661
407k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1662
407k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1663
407k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1664
407k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1665
407k
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1666
407k
  // Shift the interleaved history down for the next call.
  ss_256[0] = ss_256[1];
1667
407k
  ss_256[2] = ss_256[3];
1668
407k
  tt_256[0] = tt_256[1];
1669
407k
  tt_256[2] = tt_256[3];
1670
407k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_32x2_avx2
Line
Count
Source
1657
407k
    __m256i r[4]) {
1658
407k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1659
407k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1660
407k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1661
407k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1662
407k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1663
407k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1664
407k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1665
407k
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1666
407k
  ss_256[0] = ss_256[1];
1667
407k
  ss_256[2] = ss_256[3];
1668
407k
  tt_256[0] = tt_256[1];
1669
407k
  tt_256[2] = tt_256[3];
1670
407k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_32x2_avx2
1671
1672
// Half-pel 4-tap vertical step producing two output rows of 16 samples. For
// each output row the outer and inner input pairs are summed and the sums go
// through the 2-tap kernel -- presumably relying on a symmetric half-pel
// filter (confirm against the caller's coefficient setup).
//
// NOTE(review): the name lacks an underscore before "avx2", unlike its
// siblings; it is kept as is because callers reference it.
//
// src    - int16_t buffer; 16 samples are read at src + 48 and at src + 64.
// s_256  - carried state of five vectors. s_256[0..2] must be valid on entry;
//          s_256[3] and s_256[4] are loaded here. On return s_256[0..2] hold
//          the entry values of s_256[2], s_256[3] and s_256[4].
// coeffs - forwarded to xy_y_convolve_2tap_16_avx2().
// r      - output: r[0..1] from (s0 + s3, s1 + s2), r[2..3] from
//          (s1 + s4, s2 + s3).
static inline void xy_y_convolve_4tap_16x2_half_pelavx2(
1673
    const int16_t *const src, __m256i s_256[5], const __m256i coeffs[1],
1674
38.7k
    __m256i r[4]) {
1675
38.7k
  __m256i a_256[2];
1676
1677
38.7k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1678
38.7k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1679
1680
38.7k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1681
38.7k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1682
38.7k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 0);
1683
1684
38.7k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1685
38.7k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[3]);
1686
38.7k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 2);
1687
1688
38.7k
  s_256[0] = s_256[2];
1689
38.7k
  s_256[1] = s_256[3];
1690
38.7k
  s_256[2] = s_256[4];
1691
38.7k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_16x2_half_pelavx2
Line
Count
Source
1674
38.7k
    __m256i r[4]) {
1675
38.7k
  __m256i a_256[2];
1676
1677
38.7k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1678
38.7k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1679
1680
38.7k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1681
38.7k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1682
38.7k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 0);
1683
1684
38.7k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1685
38.7k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[3]);
1686
38.7k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 2);
1687
1688
38.7k
  s_256[0] = s_256[2];
1689
38.7k
  s_256[1] = s_256[3];
1690
38.7k
  s_256[2] = s_256[4];
1691
38.7k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_16x2_half_pelavx2
1692
1693
// 6-tap vertical step for a 2-wide, 2-row output.
//
// src    - int16_t buffer; 32 bits (two samples) are read at src + 10 and at
//          src + 12 (presumably rows 5 and 6 of a 2-wide buffer).
// s_32   - carried state. On entry s_32[4] must hold the group preceding
//          src + 10. On return s_32[5] holds [src + 10] and s_32[4] holds
//          [src + 12]. s_32[0..3] are not touched here.
// ss_128 - carried interleaved state. ss_128[0] and ss_128[1] must be valid
//          on entry; ss_128[2] is built here. Afterwards the history is
//          shifted down by one slot for the next call.
// coeffs - forwarded to convolve16_6tap_sse2() (defined elsewhere).
//
// Returns the result of convolve16_6tap_sse2(ss_128, coeffs).
static inline __m128i xy_y_convolve_6tap_2x2_sse2(const int16_t *const src,
1694
                                                  __m128i s_32[6],
1695
                                                  __m128i ss_128[3],
1696
186k
                                                  const __m128i coeffs[3]) {
1697
186k
  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 5 * 2));
1698
186k
  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1699
186k
  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 6 * 2));
1700
186k
  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1701
186k
  ss_128[2] = _mm_unpacklo_epi16(src45, src56);
1702
186k
  const __m128i r = convolve16_6tap_sse2(ss_128, coeffs);
1703
186k
  ss_128[0] = ss_128[1];
1704
186k
  ss_128[1] = ss_128[2];
1705
186k
  return r;
1706
186k
}
convolve_2d_avx2.c:xy_y_convolve_6tap_2x2_sse2
Line
Count
Source
1696
186k
                                                  const __m128i coeffs[3]) {
1697
186k
  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 5 * 2));
1698
186k
  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1699
186k
  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 6 * 2));
1700
186k
  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1701
186k
  ss_128[2] = _mm_unpacklo_epi16(src45, src56);
1702
186k
  const __m128i r = convolve16_6tap_sse2(ss_128, coeffs);
1703
186k
  ss_128[0] = ss_128[1];
1704
186k
  ss_128[1] = ss_128[2];
1705
186k
  return r;
1706
186k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_2x2_sse2
1707
1708
// 6-tap vertical step for a 4-wide, 2-row output.
//
// src    - int16_t buffer; 4 samples are read at src + 20 and at src + 24
//          (presumably rows 5 and 6 of a 4-wide buffer).
// s_64   - carried state. On entry s_64[4] must hold the group preceding
//          src + 20. On return s_64[5] holds [src + 20] and s_64[4] holds
//          [src + 24]. s_64[0..3] are not touched here.
// ss_256 - carried interleaved state. ss_256[0] and ss_256[1] must be valid
//          on entry; ss_256[2] is built here. Afterwards the history is
//          shifted down by one slot for the next call.
// coeffs - forwarded to convolve16_6tap_avx2() (defined elsewhere).
//
// Returns the result of convolve16_6tap_avx2(ss_256, coeffs).
static inline __m256i xy_y_convolve_6tap_4x2_avx2(const int16_t *const src,
1709
                                                  __m128i s_64[6],
1710
                                                  __m256i ss_256[3],
1711
835k
                                                  const __m256i coeffs[3]) {
1712
835k
  __m256i s_256[2];
1713
835k
  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 5 * 4));
1714
835k
  s_256[0] = _mm256_setr_m128i(s_64[4], s_64[5]);
1715
835k
  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 6 * 4));
1716
835k
  s_256[1] = _mm256_setr_m128i(s_64[5], s_64[4]);
1717
835k
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1718
835k
  const __m256i r = convolve16_6tap_avx2(ss_256, coeffs);
1719
835k
  ss_256[0] = ss_256[1];
1720
835k
  ss_256[1] = ss_256[2];
1721
835k
  return r;
1722
835k
}
convolve_2d_avx2.c:xy_y_convolve_6tap_4x2_avx2
Line
Count
Source
1711
835k
                                                  const __m256i coeffs[3]) {
1712
835k
  __m256i s_256[2];
1713
835k
  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 5 * 4));
1714
835k
  s_256[0] = _mm256_setr_m128i(s_64[4], s_64[5]);
1715
835k
  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 6 * 4));
1716
835k
  s_256[1] = _mm256_setr_m128i(s_64[5], s_64[4]);
1717
835k
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1718
835k
  const __m256i r = convolve16_6tap_avx2(ss_256, coeffs);
1719
835k
  ss_256[0] = ss_256[1];
1720
835k
  ss_256[1] = ss_256[2];
1721
835k
  return r;
1722
835k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_4x2_avx2
1723
1724
// Applies convolve16_6tap_avx2() to two groups of interleaved inputs.
//
// ss     - six vectors: the first call starts at ss (slots 0..2) and the
//          second at ss + 3 (slots 3..5). The callers in this file fill the
//          first group from low-half interleaves and the second from
//          high-half interleaves.
// coeffs - forwarded to convolve16_6tap_avx2() (defined elsewhere).
// r      - output: r[0] from ss, r[1] from ss + 3.
static inline void xy_y_convolve_6tap_16_avx2(const __m256i ss[6],
1725
                                              const __m256i coeffs[3],
1726
7.65M
                                              __m256i r[2]) {
1727
7.65M
  r[0] = convolve16_6tap_avx2(ss, coeffs);
1728
7.65M
  r[1] = convolve16_6tap_avx2(ss + 3, coeffs);
1729
7.65M
}
convolve_2d_avx2.c:xy_y_convolve_6tap_16_avx2
Line
Count
Source
1726
7.65M
                                              __m256i r[2]) {
1727
7.65M
  r[0] = convolve16_6tap_avx2(ss, coeffs);
1728
7.65M
  r[1] = convolve16_6tap_avx2(ss + 3, coeffs);
1729
7.65M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_16_avx2
1730
1731
// 6-tap vertical step producing two output rows of 8 samples.
//
// src    - int16_t buffer; two overlapping 256-bit loads read 16 samples at
//          src + 32 and 16 samples at src + 40 (presumably rows 4-5 and rows
//          5-6 of an 8-wide buffer, so each 128-bit lane pairs a row with the
//          next one).
// ss_256 - carried interleaved state. Slots [0], [1], [3] and [4] must be
//          valid on entry; [2] and [5] are built here from the low- and
//          high-half interleaves of the loaded vectors. After the convolution
//          both three-slot groups are shifted down by one for the next call.
// coeffs - forwarded to xy_y_convolve_6tap_16_avx2().
// r      - output of xy_y_convolve_6tap_16_avx2().
static inline void xy_y_convolve_6tap_8x2_avx2(const int16_t *const src,
1732
                                               __m256i ss_256[6],
1733
                                               const __m256i coeffs[3],
1734
809k
                                               __m256i r[2]) {
1735
809k
  __m256i s_256[2];
1736
809k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1737
809k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1738
809k
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1739
809k
  ss_256[5] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1740
809k
  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r);
1741
809k
  ss_256[0] = ss_256[1];
1742
809k
  ss_256[1] = ss_256[2];
1743
809k
  ss_256[3] = ss_256[4];
1744
809k
  ss_256[4] = ss_256[5];
1745
809k
}
convolve_2d_avx2.c:xy_y_convolve_6tap_8x2_avx2
Line
Count
Source
1734
809k
                                               __m256i r[2]) {
1735
809k
  __m256i s_256[2];
1736
809k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1737
809k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1738
809k
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1739
809k
  ss_256[5] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1740
809k
  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r);
1741
809k
  ss_256[0] = ss_256[1];
1742
809k
  ss_256[1] = ss_256[2];
1743
809k
  ss_256[3] = ss_256[4];
1744
809k
  ss_256[4] = ss_256[5];
1745
809k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_8x2_avx2
1746
1747
// Half-pel 6-tap vertical step producing two output rows of 8 samples. The
// six inputs are reduced to a 4-tap problem: (s0 + s5) and (s1 + s4) are
// summed and interleaved with each other, while s2 and s3 are interleaved
// directly -- presumably relying on a symmetric half-pel filter (confirm
// against the caller's coefficient setup).
//
// src    - int16_t buffer; 16 samples are read at src + 32 and at src + 40.
// coeffs - forwarded to xy_y_convolve_4tap_16_avx2().
// s_256  - carried state of six vectors. s_256[0..3] must be valid on entry;
//          s_256[4] and s_256[5] are loaded here. On return s_256[0..3] hold
//          the entry values of s_256[2], s_256[3] and the two new loads.
// r      - output of xy_y_convolve_4tap_16_avx2().
static inline void xy_y_convolve_6tap_8x2_half_pel_avx2(
1748
    const int16_t *const src, const __m256i coeffs[2], __m256i s_256[6],
1749
246k
    __m256i r[2]) {
1750
246k
  __m256i a_256[2], ss_256[4];
1751
246k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1752
246k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1753
246k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1754
246k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1755
246k
  // Slots [0..1] carry the low-half interleaves, [2..3] the high-half ones,
  // matching the ss / ss + 2 split used by xy_y_convolve_4tap_16_avx2().
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1756
246k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1757
246k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1758
246k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1759
246k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1760
246k
  s_256[0] = s_256[2];
1761
246k
  s_256[1] = s_256[3];
1762
246k
  s_256[2] = s_256[4];
1763
246k
  s_256[3] = s_256[5];
1764
246k
}
convolve_2d_avx2.c:xy_y_convolve_6tap_8x2_half_pel_avx2
Line
Count
Source
1749
246k
    __m256i r[2]) {
1750
246k
  __m256i a_256[2], ss_256[4];
1751
246k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1752
246k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1753
246k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1754
246k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1755
246k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1756
246k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1757
246k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1758
246k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1759
246k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1760
246k
  s_256[0] = s_256[2];
1761
246k
  s_256[1] = s_256[3];
1762
246k
  s_256[2] = s_256[4];
1763
246k
  s_256[3] = s_256[5];
1764
246k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_8x2_half_pel_avx2
1765
1766
// 6-tap vertical step producing two output rows of 16 samples, with a
// caller-supplied spacing between loads.
//
// src    - int16_t buffer; 16 samples are read at src + 5 * stride and at
//          src + 6 * stride.
// stride - distance between the loads, in int16_t elements.
// s_256  - carried state. On entry s_256[4] must hold the vector preceding
//          src + 5 * stride. On return s_256[5] holds [src + 5 * stride] and
//          s_256[4] holds [src + 6 * stride]. s_256[0..3] are not touched
//          here.
// ss_256 - interleaved state for the first output row; slots [0], [1], [3]
//          and [4] must be valid on entry, [2] and [5] are built here.
// tt_256 - interleaved state for the second output row, same layout.
// coeffs - forwarded to xy_y_convolve_6tap_16_avx2().
// r      - output: r[0..1] from ss_256, r[2..3] from tt_256.
//
// After the convolutions both three-slot groups of ss_256 and tt_256 are
// shifted down by one slot for the next call.
static inline void xy_y_convolve_6tap_16x2_avx2(
1767
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6],
1768
    __m256i ss_256[6], __m256i tt_256[6], const __m256i coeffs[3],
1769
3.42M
    __m256i r[4]) {
1770
3.42M
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1771
3.42M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
1772
3.42M
  ss_256[5] = _mm256_unpackhi_epi16(s_256[4], s_256[5]);
1773
3.42M
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1774
3.42M
  tt_256[2] = _mm256_unpacklo_epi16(s_256[5], s_256[4]);
1775
3.42M
  tt_256[5] = _mm256_unpackhi_epi16(s_256[5], s_256[4]);
1776
1777
3.42M
  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r + 0);
1778
3.42M
  xy_y_convolve_6tap_16_avx2(tt_256, coeffs, r + 2);
1779
1780
3.42M
  ss_256[0] = ss_256[1];
1781
3.42M
  ss_256[1] = ss_256[2];
1782
3.42M
  ss_256[3] = ss_256[4];
1783
3.42M
  ss_256[4] = ss_256[5];
1784
1785
3.42M
  tt_256[0] = tt_256[1];
1786
3.42M
  tt_256[1] = tt_256[2];
1787
3.42M
  tt_256[3] = tt_256[4];
1788
3.42M
  tt_256[4] = tt_256[5];
1789
3.42M
}
convolve_2d_avx2.c:xy_y_convolve_6tap_16x2_avx2
Line
Count
Source
1769
3.42M
    __m256i r[4]) {
1770
3.42M
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1771
3.42M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
1772
3.42M
  ss_256[5] = _mm256_unpackhi_epi16(s_256[4], s_256[5]);
1773
3.42M
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1774
3.42M
  tt_256[2] = _mm256_unpacklo_epi16(s_256[5], s_256[4]);
1775
3.42M
  tt_256[5] = _mm256_unpackhi_epi16(s_256[5], s_256[4]);
1776
1777
3.42M
  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r + 0);
1778
3.42M
  xy_y_convolve_6tap_16_avx2(tt_256, coeffs, r + 2);
1779
1780
3.42M
  ss_256[0] = ss_256[1];
1781
3.42M
  ss_256[1] = ss_256[2];
1782
3.42M
  ss_256[3] = ss_256[4];
1783
3.42M
  ss_256[4] = ss_256[5];
1784
1785
3.42M
  tt_256[0] = tt_256[1];
1786
3.42M
  tt_256[1] = tt_256[2];
1787
3.42M
  tt_256[3] = tt_256[4];
1788
3.42M
  tt_256[4] = tt_256[5];
1789
3.42M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_16x2_avx2
1790
1791
// Half-pel 6-tap vertical step producing two output rows of 16 samples. Each
// output row is reduced to a 4-tap problem by summing the outermost pair and
// the next-inner pair of inputs and interleaving the two sums, while the two
// centre inputs are interleaved directly -- presumably relying on a symmetric
// half-pel filter (confirm against the caller's coefficient setup).
//
// src    - int16_t buffer; 16 samples are read at src + 5 * stride and at
//          src + 6 * stride.
// stride - distance between the loads, in int16_t elements.
// s_256  - carried state of six vectors. s_256[0..4] must be valid on entry
//          (call them v0..v4); v5 and v6 are loaded here. On return
//          s_256[0..4] hold v2, v3, v4, v5, v6, and s_256[5] holds v5.
// ss_256 - scratch for the interleaved inputs; overwritten for each row.
// coeffs - forwarded to xy_y_convolve_4tap_16_avx2().
// r      - output: r[0..1] for the first row (v0..v5), r[2..3] for the
//          second row (v1..v6).
//
// The statement order in the second half matters: the state shift is
// interleaved with the sums so every read sees the intended old or new value.
static inline void xy_y_convolve_6tap_16x2_half_pel_avx2(
1792
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6],
1793
229k
    __m256i ss_256[4], const __m256i coeffs[2], __m256i r[4]) {
1794
229k
  __m256i a_256[2];
1795
1796
229k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1797
229k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1798
229k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1799
229k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1800
229k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1801
229k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1802
229k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1803
229k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1804
1805
229k
  // Second row: a_256[1] = v2 + v5, computed before s_256[2] is overwritten.
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[5]);
1806
229k
  s_256[0] = s_256[2];
1807
229k
  s_256[2] = s_256[4];
1808
229k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1809
229k
  // a_256[0] = v1 + v6, computed before s_256[1] is overwritten.
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1810
229k
  s_256[1] = s_256[3];
1811
229k
  s_256[3] = s_256[5];
1812
229k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1813
229k
  // s_256[1] and s_256[2] now hold v3 and v4, the centre pair of this row.
  ss_256[1] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
1814
229k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1815
229k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
1816
229k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1817
229k
}
convolve_2d_avx2.c:xy_y_convolve_6tap_16x2_half_pel_avx2
Line
Count
Source
1793
229k
    __m256i ss_256[4], const __m256i coeffs[2], __m256i r[4]) {
1794
229k
  __m256i a_256[2];
1795
1796
229k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1797
229k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1798
229k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1799
229k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1800
229k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1801
229k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1802
229k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1803
229k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1804
1805
229k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[5]);
1806
229k
  s_256[0] = s_256[2];
1807
229k
  s_256[2] = s_256[4];
1808
229k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1809
229k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1810
229k
  s_256[1] = s_256[3];
1811
229k
  s_256[3] = s_256[5];
1812
229k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1813
229k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
1814
229k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1815
229k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
1816
229k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1817
229k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_16x2_half_pel_avx2
1818
1819
static inline __m128i xy_y_convolve_8tap_2x2_sse2(const int16_t *const src,
1820
                                                  __m128i s_32[8],
1821
                                                  __m128i ss_128[4],
1822
10.0k
                                                  const __m128i coeffs[4]) {
1823
10.0k
  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * 2));
1824
10.0k
  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1825
10.0k
  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * 2));
1826
10.0k
  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1827
10.0k
  ss_128[3] = _mm_unpacklo_epi16(src67, src78);
1828
10.0k
  const __m128i r = convolve16_8tap_sse2(ss_128, coeffs);
1829
10.0k
  ss_128[0] = ss_128[1];
1830
10.0k
  ss_128[1] = ss_128[2];
1831
10.0k
  ss_128[2] = ss_128[3];
1832
10.0k
  return r;
1833
10.0k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_2x2_sse2
Line
Count
Source
1822
10.0k
                                                  const __m128i coeffs[4]) {
1823
10.0k
  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * 2));
1824
10.0k
  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1825
10.0k
  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * 2));
1826
10.0k
  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1827
10.0k
  ss_128[3] = _mm_unpacklo_epi16(src67, src78);
1828
10.0k
  const __m128i r = convolve16_8tap_sse2(ss_128, coeffs);
1829
10.0k
  ss_128[0] = ss_128[1];
1830
10.0k
  ss_128[1] = ss_128[2];
1831
10.0k
  ss_128[2] = ss_128[3];
1832
10.0k
  return r;
1833
10.0k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_2x2_sse2
1834
1835
static inline __m256i xy_y_convolve_8tap_4x2_avx2(const int16_t *const src,
1836
                                                  __m128i s_64[8],
1837
                                                  __m256i ss_256[4],
1838
47.8k
                                                  const __m256i coeffs[4]) {
1839
47.8k
  __m256i s_256[2];
1840
47.8k
  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * 4));
1841
47.8k
  s_256[0] = _mm256_setr_m128i(s_64[6], s_64[7]);
1842
47.8k
  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * 4));
1843
47.8k
  s_256[1] = _mm256_setr_m128i(s_64[7], s_64[6]);
1844
47.8k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1845
47.8k
  const __m256i r = convolve16_8tap_avx2(ss_256, coeffs);
1846
47.8k
  ss_256[0] = ss_256[1];
1847
47.8k
  ss_256[1] = ss_256[2];
1848
47.8k
  ss_256[2] = ss_256[3];
1849
47.8k
  return r;
1850
47.8k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_4x2_avx2
Line
Count
Source
1838
47.8k
                                                  const __m256i coeffs[4]) {
1839
47.8k
  __m256i s_256[2];
1840
47.8k
  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * 4));
1841
47.8k
  s_256[0] = _mm256_setr_m128i(s_64[6], s_64[7]);
1842
47.8k
  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * 4));
1843
47.8k
  s_256[1] = _mm256_setr_m128i(s_64[7], s_64[6]);
1844
47.8k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1845
47.8k
  const __m256i r = convolve16_8tap_avx2(ss_256, coeffs);
1846
47.8k
  ss_256[0] = ss_256[1];
1847
47.8k
  ss_256[1] = ss_256[2];
1848
47.8k
  ss_256[2] = ss_256[3];
1849
47.8k
  return r;
1850
47.8k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_4x2_avx2
1851
1852
static inline void xy_y_convolve_8tap_16_avx2(const __m256i *const ss,
1853
                                              const __m256i coeffs[4],
1854
2.11M
                                              __m256i r[2]) {
1855
2.11M
  r[0] = convolve16_8tap_avx2(ss, coeffs);
1856
2.11M
  r[1] = convolve16_8tap_avx2(ss + 4, coeffs);
1857
2.11M
}
convolve_2d_avx2.c:xy_y_convolve_8tap_16_avx2
Line
Count
Source
1854
2.11M
                                              __m256i r[2]) {
1855
2.11M
  r[0] = convolve16_8tap_avx2(ss, coeffs);
1856
2.11M
  r[1] = convolve16_8tap_avx2(ss + 4, coeffs);
1857
2.11M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_16_avx2
1858
1859
static inline void xy_y_convolve_8tap_8x2_avx2(const int16_t *const src,
1860
                                               __m256i ss_256[8],
1861
                                               const __m256i coeffs[4],
1862
33.8k
                                               __m256i r[2]) {
1863
33.8k
  __m256i s_256[2];
1864
33.8k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1865
33.8k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1866
33.8k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1867
33.8k
  ss_256[7] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1868
33.8k
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r);
1869
33.8k
  ss_256[0] = ss_256[1];
1870
33.8k
  ss_256[1] = ss_256[2];
1871
33.8k
  ss_256[2] = ss_256[3];
1872
33.8k
  ss_256[4] = ss_256[5];
1873
33.8k
  ss_256[5] = ss_256[6];
1874
33.8k
  ss_256[6] = ss_256[7];
1875
33.8k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_8x2_avx2
Line
Count
Source
1862
33.8k
                                               __m256i r[2]) {
1863
33.8k
  __m256i s_256[2];
1864
33.8k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1865
33.8k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1866
33.8k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1867
33.8k
  ss_256[7] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1868
33.8k
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r);
1869
33.8k
  ss_256[0] = ss_256[1];
1870
33.8k
  ss_256[1] = ss_256[2];
1871
33.8k
  ss_256[2] = ss_256[3];
1872
33.8k
  ss_256[4] = ss_256[5];
1873
33.8k
  ss_256[5] = ss_256[6];
1874
33.8k
  ss_256[6] = ss_256[7];
1875
33.8k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_8x2_avx2
1876
1877
static inline void xy_y_convolve_8tap_8x2_half_pel_avx2(
1878
    const int16_t *const src, const __m256i coeffs[2], __m256i s_256[8],
1879
19.7k
    __m256i r[2]) {
1880
19.7k
  __m256i a_256[4], ss_256[4];
1881
1882
19.7k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1883
19.7k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1884
19.7k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1885
19.7k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1886
19.7k
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1887
19.7k
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1888
19.7k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1889
19.7k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1890
19.7k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1891
19.7k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1892
19.7k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1893
19.7k
  s_256[0] = s_256[2];
1894
19.7k
  s_256[1] = s_256[3];
1895
19.7k
  s_256[2] = s_256[4];
1896
19.7k
  s_256[3] = s_256[5];
1897
19.7k
  s_256[4] = s_256[6];
1898
19.7k
  s_256[5] = s_256[7];
1899
19.7k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_8x2_half_pel_avx2
Line
Count
Source
1879
19.7k
    __m256i r[2]) {
1880
19.7k
  __m256i a_256[4], ss_256[4];
1881
1882
19.7k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1883
19.7k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1884
19.7k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1885
19.7k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1886
19.7k
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1887
19.7k
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1888
19.7k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1889
19.7k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1890
19.7k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1891
19.7k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1892
19.7k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1893
19.7k
  s_256[0] = s_256[2];
1894
19.7k
  s_256[1] = s_256[3];
1895
19.7k
  s_256[2] = s_256[4];
1896
19.7k
  s_256[3] = s_256[5];
1897
19.7k
  s_256[4] = s_256[6];
1898
19.7k
  s_256[5] = s_256[7];
1899
19.7k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_8x2_half_pel_avx2
1900
1901
static AOM_FORCE_INLINE void xy_y_convolve_8tap_16x2_avx2(
1902
    const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1903
1.04M
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1904
1.04M
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1905
1.04M
  ss_256[3] = _mm256_unpacklo_epi16(s_256[6], s_256[7]);
1906
1.04M
  ss_256[7] = _mm256_unpackhi_epi16(s_256[6], s_256[7]);
1907
1.04M
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1908
1.04M
  tt_256[3] = _mm256_unpacklo_epi16(s_256[7], s_256[6]);
1909
1.04M
  tt_256[7] = _mm256_unpackhi_epi16(s_256[7], s_256[6]);
1910
1911
1.04M
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r + 0);
1912
1.04M
  xy_y_convolve_8tap_16_avx2(tt_256, coeffs, r + 2);
1913
1914
1.04M
  ss_256[0] = ss_256[1];
1915
1.04M
  ss_256[1] = ss_256[2];
1916
1.04M
  ss_256[2] = ss_256[3];
1917
1.04M
  ss_256[4] = ss_256[5];
1918
1.04M
  ss_256[5] = ss_256[6];
1919
1.04M
  ss_256[6] = ss_256[7];
1920
1921
1.04M
  tt_256[0] = tt_256[1];
1922
1.04M
  tt_256[1] = tt_256[2];
1923
1.04M
  tt_256[2] = tt_256[3];
1924
1.04M
  tt_256[4] = tt_256[5];
1925
1.04M
  tt_256[5] = tt_256[6];
1926
1.04M
  tt_256[6] = tt_256[7];
1927
1.04M
}
convolve_2d_avx2.c:xy_y_convolve_8tap_16x2_avx2
Line
Count
Source
1903
1.04M
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1904
1.04M
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1905
1.04M
  ss_256[3] = _mm256_unpacklo_epi16(s_256[6], s_256[7]);
1906
1.04M
  ss_256[7] = _mm256_unpackhi_epi16(s_256[6], s_256[7]);
1907
1.04M
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1908
1.04M
  tt_256[3] = _mm256_unpacklo_epi16(s_256[7], s_256[6]);
1909
1.04M
  tt_256[7] = _mm256_unpackhi_epi16(s_256[7], s_256[6]);
1910
1911
1.04M
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r + 0);
1912
1.04M
  xy_y_convolve_8tap_16_avx2(tt_256, coeffs, r + 2);
1913
1914
1.04M
  ss_256[0] = ss_256[1];
1915
1.04M
  ss_256[1] = ss_256[2];
1916
1.04M
  ss_256[2] = ss_256[3];
1917
1.04M
  ss_256[4] = ss_256[5];
1918
1.04M
  ss_256[5] = ss_256[6];
1919
1.04M
  ss_256[6] = ss_256[7];
1920
1921
1.04M
  tt_256[0] = tt_256[1];
1922
1.04M
  tt_256[1] = tt_256[2];
1923
1.04M
  tt_256[2] = tt_256[3];
1924
1.04M
  tt_256[4] = tt_256[5];
1925
1.04M
  tt_256[5] = tt_256[6];
1926
1.04M
  tt_256[6] = tt_256[7];
1927
1.04M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_16x2_avx2
1928
1929
static inline void xy_y_convolve_8tap_16x2_half_pel_avx2(
1930
    const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1931
14.5k
    __m256i s_256[8], __m256i r[4]) {
1932
14.5k
  __m256i a_256[4], ss_256[4];
1933
14.5k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1934
1935
14.5k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1936
14.5k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1937
14.5k
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1938
14.5k
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1939
14.5k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1940
14.5k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1941
14.5k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1942
14.5k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1943
1944
14.5k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1945
1946
14.5k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[7]);
1947
14.5k
  a_256[2] = _mm256_add_epi16(s_256[3], s_256[6]);
1948
14.5k
  a_256[3] = _mm256_add_epi16(s_256[4], s_256[5]);
1949
14.5k
  s_256[0] = s_256[2];
1950
14.5k
  s_256[2] = s_256[4];
1951
14.5k
  s_256[4] = s_256[6];
1952
14.5k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1953
1954
14.5k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[6]);
1955
14.5k
  s_256[1] = s_256[3];
1956
14.5k
  s_256[3] = s_256[5];
1957
14.5k
  s_256[5] = s_256[7];
1958
14.5k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1959
14.5k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1960
14.5k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1961
14.5k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1962
1963
14.5k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1964
14.5k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_16x2_half_pel_avx2
Line
Count
Source
1931
14.5k
    __m256i s_256[8], __m256i r[4]) {
1932
14.5k
  __m256i a_256[4], ss_256[4];
1933
14.5k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1934
1935
14.5k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1936
14.5k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1937
14.5k
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1938
14.5k
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1939
14.5k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1940
14.5k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1941
14.5k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1942
14.5k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1943
1944
14.5k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1945
1946
14.5k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[7]);
1947
14.5k
  a_256[2] = _mm256_add_epi16(s_256[3], s_256[6]);
1948
14.5k
  a_256[3] = _mm256_add_epi16(s_256[4], s_256[5]);
1949
14.5k
  s_256[0] = s_256[2];
1950
14.5k
  s_256[2] = s_256[4];
1951
14.5k
  s_256[4] = s_256[6];
1952
14.5k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1953
1954
14.5k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[6]);
1955
14.5k
  s_256[1] = s_256[3];
1956
14.5k
  s_256[3] = s_256[5];
1957
14.5k
  s_256[5] = s_256[7];
1958
14.5k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1959
14.5k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1960
14.5k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1961
14.5k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1962
1963
14.5k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1964
14.5k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_16x2_half_pel_avx2
1965
1966
static inline void xy_y_round_store_8x2_avx2(const __m256i res[2],
1967
                                             uint8_t *const dst,
1968
1.61M
                                             const ptrdiff_t stride) {
1969
1.61M
  const __m256i r = xy_y_round_16_avx2(res);
1970
1.61M
  pack_store_8x2_avx2(r, dst, stride);
1971
1.61M
}
convolve_2d_avx2.c:xy_y_round_store_8x2_avx2
Line
Count
Source
1968
1.61M
                                             const ptrdiff_t stride) {
1969
1.61M
  const __m256i r = xy_y_round_16_avx2(res);
1970
1.61M
  pack_store_8x2_avx2(r, dst, stride);
1971
1.61M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_8x2_avx2
1972
1973
static inline void xy_y_round_store_16x2_avx2(const __m256i res[4],
1974
                                              uint8_t *const dst,
1975
1.20M
                                              const ptrdiff_t stride) {
1976
1.20M
  const __m256i r0 = xy_y_round_16_avx2(res + 0);
1977
1.20M
  const __m256i r1 = xy_y_round_16_avx2(res + 2);
1978
1.20M
  xy_y_pack_store_16x2_avx2(r0, r1, dst, stride);
1979
1.20M
}
convolve_2d_avx2.c:xy_y_round_store_16x2_avx2
Line
Count
Source
1975
1.20M
                                              const ptrdiff_t stride) {
1976
1.20M
  const __m256i r0 = xy_y_round_16_avx2(res + 0);
1977
1.20M
  const __m256i r1 = xy_y_round_16_avx2(res + 2);
1978
1.20M
  xy_y_pack_store_16x2_avx2(r0, r1, dst, stride);
1979
1.20M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_16x2_avx2
1980
1981
static inline void sr_y_round_store_32_avx2(const __m256i res[2],
1982
1.87M
                                            uint8_t *const dst) {
1983
1.87M
  __m256i r[2];
1984
1985
1.87M
  r[0] = sr_y_round_avx2(res[0]);
1986
1.87M
  r[1] = sr_y_round_avx2(res[1]);
1987
1.87M
  convolve_store_32_avx2(r[0], r[1], dst);
1988
1.87M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_store_32_avx2
convolve_avx2.c:sr_y_round_store_32_avx2
Line
Count
Source
1982
1.87M
                                            uint8_t *const dst) {
1983
1.87M
  __m256i r[2];
1984
1985
1.87M
  r[0] = sr_y_round_avx2(res[0]);
1986
1.87M
  r[1] = sr_y_round_avx2(res[1]);
1987
1.87M
  convolve_store_32_avx2(r[0], r[1], dst);
1988
1.87M
}
1989
1990
static inline void sr_y_round_store_32x2_avx2(const __m256i res[4],
1991
                                              uint8_t *const dst,
1992
857k
                                              const int32_t dst_stride) {
1993
857k
  sr_y_round_store_32_avx2(res, dst);
1994
857k
  sr_y_round_store_32_avx2(res + 2, dst + dst_stride);
1995
857k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_store_32x2_avx2
convolve_avx2.c:sr_y_round_store_32x2_avx2
Line
Count
Source
1992
857k
                                              const int32_t dst_stride) {
1993
857k
  sr_y_round_store_32_avx2(res, dst);
1994
857k
  sr_y_round_store_32_avx2(res + 2, dst + dst_stride);
1995
857k
}
1996
1997
static inline void sr_y_2tap_32_avx2(const uint8_t *const src,
1998
                                     const __m256i coeffs[1], const __m256i s0,
1999
162k
                                     __m256i *const s1, uint8_t *const dst) {
2000
162k
  __m256i r[2];
2001
162k
  y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
2002
162k
  sr_y_round_store_32_avx2(r, dst);
2003
162k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_2tap_32_avx2
convolve_avx2.c:sr_y_2tap_32_avx2
Line
Count
Source
1999
162k
                                     __m256i *const s1, uint8_t *const dst) {
2000
162k
  __m256i r[2];
2001
162k
  y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
2002
162k
  sr_y_round_store_32_avx2(r, dst);
2003
162k
}
2004
2005
static AOM_FORCE_INLINE void av1_convolve_y_sr_specialized_avx2(
2006
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
2007
    int32_t w, int32_t h, const InterpFilterParams *filter_params_y,
2008
625k
    const int32_t subpel_y_q4) {
2009
625k
  int32_t x, y;
2010
625k
  __m128i coeffs_128[4];
2011
625k
  __m256i coeffs_256[4];
2012
2013
625k
  int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
2014
2015
625k
  if (vert_tap == 2) {
2016
    // vert_filt as 2 tap
2017
37.2k
    const uint8_t *src_ptr = src;
2018
2019
37.2k
    y = h;
2020
2021
37.2k
    if (subpel_y_q4 != 8) {
2022
18.5k
      if (w <= 8) {
2023
13.7k
        prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4,
2024
13.7k
                                       coeffs_128);
2025
2026
13.7k
        if (w == 2) {
2027
2.09k
          __m128i s_16[2];
2028
2029
2.09k
          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2030
2031
4.02k
          do {
2032
4.02k
            const __m128i res = y_convolve_2tap_2x2_ssse3(src_ptr, src_stride,
2033
4.02k
                                                          coeffs_128, s_16);
2034
4.02k
            const __m128i r = sr_y_round_sse2(res);
2035
4.02k
            pack_store_2x2_sse2(r, dst, dst_stride);
2036
4.02k
            src_ptr += 2 * src_stride;
2037
4.02k
            dst += 2 * dst_stride;
2038
4.02k
            y -= 2;
2039
4.02k
          } while (y);
2040
11.6k
        } else if (w == 4) {
2041
6.53k
          __m128i s_32[2];
2042
2043
6.53k
          s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2044
2045
19.5k
          do {
2046
19.5k
            const __m128i res = y_convolve_2tap_4x2_ssse3(src_ptr, src_stride,
2047
19.5k
                                                          coeffs_128, s_32);
2048
19.5k
            const __m128i r = sr_y_round_sse2(res);
2049
19.5k
            pack_store_4x2_sse2(r, dst, dst_stride);
2050
19.5k
            src_ptr += 2 * src_stride;
2051
19.5k
            dst += 2 * dst_stride;
2052
19.5k
            y -= 2;
2053
19.5k
          } while (y);
2054
6.53k
        } else {
2055
5.10k
          __m128i s_64[2], s_128[2];
2056
2057
5.10k
          assert(w == 8);
2058
2059
5.10k
          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2060
2061
18.4k
          do {
2062
            // Note: Faster than binding to AVX2 registers.
2063
18.4k
            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2064
18.4k
            s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
2065
18.4k
            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2066
18.4k
            s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
2067
18.4k
            const __m128i ss0 = _mm_unpacklo_epi8(s_128[0], s_128[1]);
2068
18.4k
            const __m128i ss1 = _mm_unpackhi_epi8(s_128[0], s_128[1]);
2069
18.4k
            const __m128i res0 = convolve_2tap_ssse3(&ss0, coeffs_128);
2070
18.4k
            const __m128i res1 = convolve_2tap_ssse3(&ss1, coeffs_128);
2071
18.4k
            const __m128i r0 = sr_y_round_sse2(res0);
2072
18.4k
            const __m128i r1 = sr_y_round_sse2(res1);
2073
18.4k
            const __m128i d = _mm_packus_epi16(r0, r1);
2074
18.4k
            _mm_storel_epi64((__m128i *)dst, d);
2075
18.4k
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2076
18.4k
            src_ptr += 2 * src_stride;
2077
18.4k
            dst += 2 * dst_stride;
2078
18.4k
            y -= 2;
2079
18.4k
          } while (y);
2080
5.10k
        }
2081
13.7k
      } else {
2082
4.83k
        prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2083
2084
4.83k
        if (w == 16) {
2085
2.66k
          __m128i s_128[2];
2086
2087
2.66k
          s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2088
2089
16.8k
          do {
2090
16.8k
            __m256i r[2];
2091
2092
16.8k
            y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2093
16.8k
                                      r);
2094
16.8k
            sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2095
16.8k
            src_ptr += 2 * src_stride;
2096
16.8k
            dst += 2 * dst_stride;
2097
16.8k
            y -= 2;
2098
16.8k
          } while (y);
2099
2.66k
        } else if (w == 32) {
2100
1.17k
          __m256i s_256[2];
2101
2102
1.17k
          s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2103
2104
14.5k
          do {
2105
14.5k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0],
2106
14.5k
                              &s_256[1], dst);
2107
14.5k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1],
2108
14.5k
                              &s_256[0], dst + dst_stride);
2109
14.5k
            src_ptr += 2 * src_stride;
2110
14.5k
            dst += 2 * dst_stride;
2111
14.5k
            y -= 2;
2112
14.5k
          } while (y);
2113
1.17k
        } else if (w == 64) {
2114
845
          __m256i s_256[2][2];
2115
2116
845
          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2117
845
          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2118
2119
19.0k
          do {
2120
19.0k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2121
19.0k
                              &s_256[1][0], dst);
2122
19.0k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 32, coeffs_256,
2123
19.0k
                              s_256[0][1], &s_256[1][1], dst + 32);
2124
19.0k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2125
19.0k
                              &s_256[0][0], dst + dst_stride);
2126
19.0k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 32, coeffs_256,
2127
19.0k
                              s_256[1][1], &s_256[0][1], dst + dst_stride + 32);
2128
2129
19.0k
            src_ptr += 2 * src_stride;
2130
19.0k
            dst += 2 * dst_stride;
2131
19.0k
            y -= 2;
2132
19.0k
          } while (y);
2133
845
        } else {
2134
151
          __m256i s_256[2][4];
2135
2136
151
          assert(w == 128);
2137
2138
151
          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2139
151
          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2140
151
          s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2141
151
          s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2142
2143
7.16k
          do {
2144
7.16k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2145
7.16k
                              &s_256[1][0], dst);
2146
7.16k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 1 * 32, coeffs_256,
2147
7.16k
                              s_256[0][1], &s_256[1][1], dst + 1 * 32);
2148
7.16k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 2 * 32, coeffs_256,
2149
7.16k
                              s_256[0][2], &s_256[1][2], dst + 2 * 32);
2150
7.16k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 3 * 32, coeffs_256,
2151
7.16k
                              s_256[0][3], &s_256[1][3], dst + 3 * 32);
2152
2153
7.16k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2154
7.16k
                              &s_256[0][0], dst + dst_stride);
2155
7.16k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32, coeffs_256,
2156
7.16k
                              s_256[1][1], &s_256[0][1],
2157
7.16k
                              dst + dst_stride + 1 * 32);
2158
7.16k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32, coeffs_256,
2159
7.16k
                              s_256[1][2], &s_256[0][2],
2160
7.16k
                              dst + dst_stride + 2 * 32);
2161
7.16k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32, coeffs_256,
2162
7.16k
                              s_256[1][3], &s_256[0][3],
2163
7.16k
                              dst + dst_stride + 3 * 32);
2164
2165
7.16k
            src_ptr += 2 * src_stride;
2166
7.16k
            dst += 2 * dst_stride;
2167
7.16k
            y -= 2;
2168
7.16k
          } while (y);
2169
151
        }
2170
4.83k
      }
2171
18.7k
    } else {
2172
      // average to get half pel
2173
18.7k
      if (w <= 8) {
2174
16.0k
        if (w == 2) {
2175
3.53k
          __m128i s_16[2];
2176
2177
3.53k
          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2178
2179
6.72k
          do {
2180
6.72k
            s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + src_stride));
2181
6.72k
            const __m128i d0 = _mm_avg_epu8(s_16[0], s_16[1]);
2182
6.72k
            *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d0);
2183
6.72k
            s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
2184
6.72k
            const __m128i d1 = _mm_avg_epu8(s_16[1], s_16[0]);
2185
6.72k
            *(int16_t *)(dst + dst_stride) = (int16_t)_mm_cvtsi128_si32(d1);
2186
6.72k
            src_ptr += 2 * src_stride;
2187
6.72k
            dst += 2 * dst_stride;
2188
6.72k
            y -= 2;
2189
6.72k
          } while (y);
2190
12.4k
        } else if (w == 4) {
2191
7.98k
          __m128i s_32[2];
2192
2193
7.98k
          s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2194
2195
22.2k
          do {
2196
22.2k
            s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + src_stride));
2197
22.2k
            const __m128i d0 = _mm_avg_epu8(s_32[0], s_32[1]);
2198
22.2k
            xx_storel_32(dst, d0);
2199
22.2k
            s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2200
22.2k
            const __m128i d1 = _mm_avg_epu8(s_32[1], s_32[0]);
2201
22.2k
            xx_storel_32(dst + dst_stride, d1);
2202
22.2k
            src_ptr += 2 * src_stride;
2203
22.2k
            dst += 2 * dst_stride;
2204
22.2k
            y -= 2;
2205
22.2k
          } while (y);
2206
7.98k
        } else {
2207
4.50k
          __m128i s_64[2];
2208
2209
4.50k
          assert(w == 8);
2210
2211
4.50k
          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2212
2213
14.4k
          do {
2214
            // Note: Faster than binding to AVX2 registers.
2215
14.4k
            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2216
14.4k
            const __m128i d0 = _mm_avg_epu8(s_64[0], s_64[1]);
2217
14.4k
            _mm_storel_epi64((__m128i *)dst, d0);
2218
14.4k
            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2219
14.4k
            const __m128i d1 = _mm_avg_epu8(s_64[1], s_64[0]);
2220
14.4k
            _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
2221
14.4k
            src_ptr += 2 * src_stride;
2222
14.4k
            dst += 2 * dst_stride;
2223
14.4k
            y -= 2;
2224
14.4k
          } while (y);
2225
4.50k
        }
2226
16.0k
      } else if (w == 16) {
2227
1.70k
        __m128i s_128[2];
2228
2229
1.70k
        s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2230
2231
9.96k
        do {
2232
9.96k
          s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
2233
9.96k
          const __m128i d0 = _mm_avg_epu8(s_128[0], s_128[1]);
2234
9.96k
          _mm_storeu_si128((__m128i *)dst, d0);
2235
9.96k
          s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2236
9.96k
          const __m128i d1 = _mm_avg_epu8(s_128[1], s_128[0]);
2237
9.96k
          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
2238
9.96k
          src_ptr += 2 * src_stride;
2239
9.96k
          dst += 2 * dst_stride;
2240
9.96k
          y -= 2;
2241
9.96k
        } while (y);
2242
1.70k
      } else if (w == 32) {
2243
623
        __m256i s_256[2];
2244
2245
623
        s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2246
2247
8.19k
        do {
2248
8.19k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0], &s_256[1], dst);
2249
8.19k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1], &s_256[0],
2250
8.19k
                                dst + dst_stride);
2251
8.19k
          src_ptr += 2 * src_stride;
2252
8.19k
          dst += 2 * dst_stride;
2253
8.19k
          y -= 2;
2254
8.19k
        } while (y);
2255
623
      } else if (w == 64) {
2256
207
        __m256i s_256[2][2];
2257
2258
207
        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2259
207
        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2260
2261
4.37k
        do {
2262
4.37k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2263
4.37k
                                dst);
2264
4.37k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 32, s_256[0][1],
2265
4.37k
                                &s_256[1][1], dst + 32);
2266
2267
4.37k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2268
4.37k
                                &s_256[0][0], dst + dst_stride);
2269
4.37k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 32, s_256[1][1],
2270
4.37k
                                &s_256[0][1], dst + dst_stride + 32);
2271
2272
4.37k
          src_ptr += 2 * src_stride;
2273
4.37k
          dst += 2 * dst_stride;
2274
4.37k
          y -= 2;
2275
4.37k
        } while (y);
2276
207
      } else {
2277
150
        __m256i s_256[2][4];
2278
2279
150
        assert(w == 128);
2280
2281
150
        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2282
150
        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2283
150
        s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2284
150
        s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2285
2286
6.36k
        do {
2287
6.36k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2288
6.36k
                                dst);
2289
6.36k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 1 * 32, s_256[0][1],
2290
6.36k
                                &s_256[1][1], dst + 1 * 32);
2291
6.36k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 2 * 32, s_256[0][2],
2292
6.36k
                                &s_256[1][2], dst + 2 * 32);
2293
6.36k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 3 * 32, s_256[0][3],
2294
6.36k
                                &s_256[1][3], dst + 3 * 32);
2295
2296
6.36k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2297
6.36k
                                &s_256[0][0], dst + dst_stride);
2298
6.36k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 1 * 32, s_256[1][1],
2299
6.36k
                                &s_256[0][1], dst + dst_stride + 1 * 32);
2300
6.36k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 2 * 32, s_256[1][2],
2301
6.36k
                                &s_256[0][2], dst + dst_stride + 2 * 32);
2302
6.36k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 3 * 32, s_256[1][3],
2303
6.36k
                                &s_256[0][3], dst + dst_stride + 3 * 32);
2304
2305
6.36k
          src_ptr += 2 * src_stride;
2306
6.36k
          dst += 2 * dst_stride;
2307
6.36k
          y -= 2;
2308
6.36k
        } while (y);
2309
150
      }
2310
18.7k
    }
2311
588k
  } else if (vert_tap == 4) {
2312
    // vert_filt as 4 tap
2313
296k
    const uint8_t *src_ptr = src - src_stride;
2314
2315
296k
    y = h;
2316
2317
296k
    if (w <= 4) {
2318
150k
      prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2319
2320
150k
      if (w == 2) {
2321
26.5k
        __m128i s_16[4], ss_128[2];
2322
2323
26.5k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2324
26.5k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2325
26.5k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2326
2327
26.5k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2328
26.5k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2329
2330
26.5k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2331
2332
43.3k
        do {
2333
43.3k
          src_ptr += 2 * src_stride;
2334
43.3k
          const __m128i res = y_convolve_4tap_2x2_ssse3(
2335
43.3k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2336
43.3k
          const __m128i r = sr_y_round_sse2(res);
2337
43.3k
          pack_store_2x2_sse2(r, dst, dst_stride);
2338
2339
43.3k
          ss_128[0] = ss_128[1];
2340
43.3k
          dst += 2 * dst_stride;
2341
43.3k
          y -= 2;
2342
43.3k
        } while (y);
2343
124k
      } else {
2344
124k
        __m128i s_32[4], ss_128[2];
2345
2346
124k
        assert(w == 4);
2347
2348
124k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2349
124k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2350
124k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2351
2352
124k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2353
124k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2354
2355
124k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2356
2357
249k
        do {
2358
249k
          src_ptr += 2 * src_stride;
2359
249k
          const __m128i res = y_convolve_4tap_4x2_ssse3(
2360
249k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2361
249k
          const __m128i r = sr_y_round_sse2(res);
2362
249k
          pack_store_4x2_sse2(r, dst, dst_stride);
2363
2364
249k
          ss_128[0] = ss_128[1];
2365
249k
          dst += 2 * dst_stride;
2366
249k
          y -= 2;
2367
249k
        } while (y);
2368
124k
      }
2369
150k
    } else {
2370
146k
      prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2371
2372
146k
      if (w == 8) {
2373
97.1k
        __m128i s_64[4];
2374
97.1k
        __m256i ss_256[2];
2375
2376
97.1k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2377
97.1k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2378
97.1k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2379
2380
        // Load lines a and b. Line a to lower 128, line b to upper 128
2381
97.1k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2382
97.1k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2383
2384
97.1k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2385
2386
193k
        do {
2387
193k
          src_ptr += 2 * src_stride;
2388
193k
          const __m256i res = y_convolve_4tap_8x2_avx2(
2389
193k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2390
193k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2391
2392
193k
          ss_256[0] = ss_256[1];
2393
193k
          dst += 2 * dst_stride;
2394
193k
          y -= 2;
2395
193k
        } while (y);
2396
97.1k
      } else if (w == 16) {
2397
44.5k
        __m128i s_128[4];
2398
44.5k
        __m256i ss_256[4], r[2];
2399
2400
44.5k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2401
44.5k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2402
44.5k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2403
2404
        // Load lines a and b. Line a to lower 128, line b to upper 128
2405
44.5k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2406
44.5k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2407
2408
44.5k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2409
44.5k
        ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
2410
2411
109k
        do {
2412
109k
          src_ptr += 2 * src_stride;
2413
109k
          y_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2414
109k
                                    ss_256, r);
2415
109k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2416
2417
109k
          ss_256[0] = ss_256[1];
2418
109k
          ss_256[2] = ss_256[3];
2419
109k
          dst += 2 * dst_stride;
2420
109k
          y -= 2;
2421
109k
        } while (y);
2422
44.5k
      } else if (w == 32) {
2423
        // AV1 standard won't have 32x4 case.
2424
        // This only favors some optimization feature which
2425
        // subsamples 32x8 to 32x4 and triggers 4-tap filter.
2426
2427
3.15k
        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2428
2429
3.15k
        s_256[0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * src_stride));
2430
3.15k
        s_256[1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * src_stride));
2431
3.15k
        s_256[2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * src_stride));
2432
2433
3.15k
        ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2434
3.15k
        ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2435
2436
3.15k
        tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2437
3.15k
        tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2438
2439
26.5k
        do {
2440
26.5k
          src_ptr += 2 * src_stride;
2441
26.5k
          y_convolve_4tap_32x2_avx2(src_ptr, src_stride, coeffs_256, s_256,
2442
26.5k
                                    ss_256, tt_256, r);
2443
26.5k
          sr_y_round_store_32x2_avx2(r, dst, dst_stride);
2444
2445
26.5k
          ss_256[0] = ss_256[1];
2446
26.5k
          ss_256[2] = ss_256[3];
2447
2448
26.5k
          tt_256[0] = tt_256[1];
2449
26.5k
          tt_256[2] = tt_256[3];
2450
26.5k
          dst += 2 * dst_stride;
2451
26.5k
          y -= 2;
2452
26.5k
        } while (y);
2453
3.15k
      } else {
2454
1.24k
        assert(!(w % 32));
2455
2456
1.24k
        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2457
1.24k
        x = 0;
2458
3.01k
        do {
2459
3.01k
          const uint8_t *s = src_ptr + x;
2460
3.01k
          uint8_t *d = dst + x;
2461
3.01k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2462
3.01k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2463
3.01k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2464
2465
3.01k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2466
3.01k
          ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2467
2468
3.01k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2469
3.01k
          tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2470
2471
3.01k
          y = h;
2472
118k
          do {
2473
118k
            s += 2 * src_stride;
2474
118k
            y_convolve_4tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2475
118k
                                      tt_256, r);
2476
118k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2477
2478
118k
            ss_256[0] = ss_256[1];
2479
118k
            ss_256[2] = ss_256[3];
2480
2481
118k
            tt_256[0] = tt_256[1];
2482
118k
            tt_256[2] = tt_256[3];
2483
118k
            d += 2 * dst_stride;
2484
118k
            y -= 2;
2485
118k
          } while (y);
2486
3.01k
          x += 32;
2487
3.01k
        } while (x < w);
2488
1.24k
      }
2489
146k
    }
2490
296k
  } else if (vert_tap == 6) {
2491
    // vert_filt as 6 tap
2492
276k
    const uint8_t *src_ptr = src - 2 * src_stride;
2493
2494
276k
    if (w <= 4) {
2495
89.9k
      prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2496
2497
89.9k
      y = h;
2498
2499
89.9k
      if (w == 2) {
2500
17.8k
        __m128i s_16[6], ss_128[3];
2501
2502
17.8k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2503
17.8k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2504
17.8k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2505
17.8k
        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2506
17.8k
        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2507
2508
17.8k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2509
17.8k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2510
17.8k
        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2511
17.8k
        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2512
2513
17.8k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2514
17.8k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2515
2516
71.4k
        do {
2517
71.4k
          src_ptr += 2 * src_stride;
2518
71.4k
          const __m128i res = y_convolve_6tap_2x2_ssse3(
2519
71.4k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2520
71.4k
          const __m128i r = sr_y_round_sse2(res);
2521
71.4k
          pack_store_2x2_sse2(r, dst, dst_stride);
2522
2523
71.4k
          ss_128[0] = ss_128[1];
2524
71.4k
          ss_128[1] = ss_128[2];
2525
71.4k
          dst += 2 * dst_stride;
2526
71.4k
          y -= 2;
2527
71.4k
        } while (y);
2528
72.0k
      } else {
2529
72.0k
        __m128i s_32[6], ss_128[3];
2530
2531
72.0k
        assert(w == 4);
2532
2533
72.0k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2534
72.0k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2535
72.0k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2536
72.0k
        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2537
72.0k
        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2538
2539
72.0k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2540
72.0k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2541
72.0k
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2542
72.0k
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2543
2544
72.0k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2545
72.0k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2546
2547
395k
        do {
2548
395k
          src_ptr += 2 * src_stride;
2549
395k
          const __m128i res = y_convolve_6tap_4x2_ssse3(
2550
395k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2551
395k
          const __m128i r = sr_y_round_sse2(res);
2552
395k
          pack_store_4x2_sse2(r, dst, dst_stride);
2553
2554
395k
          ss_128[0] = ss_128[1];
2555
395k
          ss_128[1] = ss_128[2];
2556
395k
          dst += 2 * dst_stride;
2557
395k
          y -= 2;
2558
395k
        } while (y);
2559
72.0k
      }
2560
186k
    } else {
2561
186k
      prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2562
2563
186k
      if (w == 8) {
2564
92.4k
        __m128i s_64[6];
2565
92.4k
        __m256i ss_256[3];
2566
2567
92.4k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2568
92.4k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2569
92.4k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2570
92.4k
        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2571
92.4k
        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2572
2573
        // Load lines a and b. Line a to lower 128, line b to upper 128
2574
92.4k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2575
92.4k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2576
92.4k
        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2577
92.4k
        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2578
2579
92.4k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2580
92.4k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2581
2582
92.4k
        y = h;
2583
529k
        do {
2584
529k
          src_ptr += 2 * src_stride;
2585
529k
          const __m256i res = y_convolve_6tap_8x2_avx2(
2586
529k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2587
529k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2588
2589
529k
          ss_256[0] = ss_256[1];
2590
529k
          ss_256[1] = ss_256[2];
2591
529k
          dst += 2 * dst_stride;
2592
529k
          y -= 2;
2593
529k
        } while (y);
2594
93.6k
      } else if (w == 16) {
2595
64.6k
        __m128i s_128[6];
2596
64.6k
        __m256i ss_256[6], r[2];
2597
2598
64.6k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2599
64.6k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2600
64.6k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2601
64.6k
        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2602
64.6k
        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2603
2604
        // Load lines a and b. Line a to lower 128, line b to upper 128
2605
64.6k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2606
64.6k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2607
64.6k
        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2608
64.6k
        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2609
2610
64.6k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2611
64.6k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2612
2613
64.6k
        ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
2614
64.6k
        ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
2615
2616
64.6k
        y = h;
2617
466k
        do {
2618
466k
          src_ptr += 2 * src_stride;
2619
466k
          y_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2620
466k
                                    ss_256, r);
2621
466k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2622
2623
466k
          ss_256[0] = ss_256[1];
2624
466k
          ss_256[1] = ss_256[2];
2625
2626
466k
          ss_256[3] = ss_256[4];
2627
466k
          ss_256[4] = ss_256[5];
2628
466k
          dst += 2 * dst_stride;
2629
466k
          y -= 2;
2630
466k
        } while (y);
2631
64.6k
      } else {
2632
28.9k
        __m256i s_256[6], ss_256[6], tt_256[6], r[4];
2633
2634
28.9k
        assert(!(w % 32));
2635
2636
29.0k
        x = 0;
2637
34.8k
        do {
2638
34.8k
          const uint8_t *s = src_ptr + x;
2639
34.8k
          uint8_t *d = dst + x;
2640
2641
34.8k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2642
34.8k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2643
34.8k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2644
34.8k
          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2645
34.8k
          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2646
2647
34.8k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2648
34.8k
          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2649
34.8k
          ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2650
34.8k
          ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2651
2652
34.8k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2653
34.8k
          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2654
34.8k
          tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2655
34.8k
          tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2656
2657
34.8k
          y = h;
2658
587k
          do {
2659
587k
            s += 2 * src_stride;
2660
587k
            y_convolve_6tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2661
587k
                                      tt_256, r);
2662
587k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2663
2664
587k
            ss_256[0] = ss_256[1];
2665
587k
            ss_256[1] = ss_256[2];
2666
587k
            ss_256[3] = ss_256[4];
2667
587k
            ss_256[4] = ss_256[5];
2668
2669
587k
            tt_256[0] = tt_256[1];
2670
587k
            tt_256[1] = tt_256[2];
2671
587k
            tt_256[3] = tt_256[4];
2672
587k
            tt_256[4] = tt_256[5];
2673
587k
            d += 2 * dst_stride;
2674
587k
            y -= 2;
2675
587k
          } while (y);
2676
2677
34.8k
          x += 32;
2678
34.8k
        } while (x < w);
2679
29.0k
      }
2680
186k
    }
2681
276k
  } else if (vert_tap == 8) {
2682
    // vert_filt as 8 tap
2683
15.2k
    const uint8_t *src_ptr = src - 3 * src_stride;
2684
2685
15.2k
    if (w <= 4) {
2686
5.00k
      prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2687
2688
5.00k
      y = h;
2689
2690
5.00k
      if (w == 2) {
2691
1.35k
        __m128i s_16[8], ss_128[4];
2692
2693
1.35k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2694
1.35k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2695
1.35k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2696
1.35k
        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2697
1.35k
        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2698
1.35k
        s_16[5] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 5 * src_stride));
2699
1.35k
        s_16[6] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 6 * src_stride));
2700
2701
1.35k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2702
1.35k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2703
1.35k
        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2704
1.35k
        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2705
1.35k
        const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
2706
1.35k
        const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
2707
2708
1.35k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2709
1.35k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2710
1.35k
        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2711
2712
5.41k
        do {
2713
5.41k
          const __m128i res = y_convolve_8tap_2x2_ssse3(
2714
5.41k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2715
5.41k
          const __m128i r = sr_y_round_sse2(res);
2716
5.41k
          pack_store_2x2_sse2(r, dst, dst_stride);
2717
5.41k
          ss_128[0] = ss_128[1];
2718
5.41k
          ss_128[1] = ss_128[2];
2719
5.41k
          ss_128[2] = ss_128[3];
2720
5.41k
          src_ptr += 2 * src_stride;
2721
5.41k
          dst += 2 * dst_stride;
2722
5.41k
          y -= 2;
2723
5.41k
        } while (y);
2724
3.65k
      } else {
2725
3.65k
        __m128i s_32[8], ss_128[4];
2726
2727
3.65k
        assert(w == 4);
2728
2729
3.65k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2730
3.65k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2731
3.65k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2732
3.65k
        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2733
3.65k
        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2734
3.65k
        s_32[5] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 5 * src_stride));
2735
3.65k
        s_32[6] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 6 * src_stride));
2736
2737
3.65k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2738
3.65k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2739
3.65k
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2740
3.65k
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2741
3.65k
        const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
2742
3.65k
        const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
2743
2744
3.65k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2745
3.65k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2746
3.65k
        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2747
2748
20.1k
        do {
2749
20.1k
          const __m128i res = y_convolve_8tap_4x2_ssse3(
2750
20.1k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2751
20.1k
          const __m128i r = sr_y_round_sse2(res);
2752
20.1k
          pack_store_4x2_sse2(r, dst, dst_stride);
2753
20.1k
          ss_128[0] = ss_128[1];
2754
20.1k
          ss_128[1] = ss_128[2];
2755
20.1k
          ss_128[2] = ss_128[3];
2756
20.1k
          src_ptr += 2 * src_stride;
2757
20.1k
          dst += 2 * dst_stride;
2758
20.1k
          y -= 2;
2759
20.1k
        } while (y);
2760
3.65k
      }
2761
10.2k
    } else {
2762
10.2k
      prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2763
2764
10.2k
      if (w == 8) {
2765
4.09k
        __m128i s_64[8];
2766
4.09k
        __m256i ss_256[4];
2767
2768
4.09k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2769
4.09k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2770
4.09k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2771
4.09k
        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2772
4.09k
        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2773
4.09k
        s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
2774
4.09k
        s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
2775
2776
        // Load lines a and b. Line a to lower 128, line b to upper 128
2777
4.09k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2778
4.09k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2779
4.09k
        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2780
4.09k
        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2781
4.09k
        const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
2782
4.09k
        const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
2783
2784
4.09k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2785
4.09k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2786
4.09k
        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2787
2788
4.09k
        y = h;
2789
24.8k
        do {
2790
24.8k
          const __m256i res = y_convolve_8tap_8x2_avx2(
2791
24.8k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2792
24.8k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2793
24.8k
          ss_256[0] = ss_256[1];
2794
24.8k
          ss_256[1] = ss_256[2];
2795
24.8k
          ss_256[2] = ss_256[3];
2796
24.8k
          src_ptr += 2 * src_stride;
2797
24.8k
          dst += 2 * dst_stride;
2798
24.8k
          y -= 2;
2799
24.8k
        } while (y);
2800
6.13k
      } else if (w == 16) {
2801
3.03k
        __m128i s_128[8];
2802
3.03k
        __m256i ss_256[8], r[2];
2803
2804
3.03k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2805
3.03k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2806
3.03k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2807
3.03k
        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2808
3.03k
        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2809
3.03k
        s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
2810
3.03k
        s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
2811
2812
        // Load lines a and b. Line a to lower 128, line b to upper 128
2813
3.03k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2814
3.03k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2815
3.03k
        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2816
3.03k
        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2817
3.03k
        const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
2818
3.03k
        const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
2819
2820
3.03k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2821
3.03k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2822
3.03k
        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2823
2824
3.03k
        ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
2825
3.03k
        ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
2826
3.03k
        ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
2827
2828
3.03k
        y = h;
2829
26.2k
        do {
2830
26.2k
          y_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2831
26.2k
                                    ss_256, r);
2832
26.2k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2833
2834
26.2k
          ss_256[0] = ss_256[1];
2835
26.2k
          ss_256[1] = ss_256[2];
2836
26.2k
          ss_256[2] = ss_256[3];
2837
2838
26.2k
          ss_256[4] = ss_256[5];
2839
26.2k
          ss_256[5] = ss_256[6];
2840
26.2k
          ss_256[6] = ss_256[7];
2841
26.2k
          src_ptr += 2 * src_stride;
2842
26.2k
          dst += 2 * dst_stride;
2843
26.2k
          y -= 2;
2844
26.2k
        } while (y);
2845
3.09k
      } else {
2846
3.09k
        __m256i s_256[8], ss_256[8], tt_256[8], r[4];
2847
2848
3.09k
        assert(!(w % 32));
2849
2850
3.09k
        x = 0;
2851
4.70k
        do {
2852
4.70k
          const uint8_t *s = src_ptr + x;
2853
4.70k
          uint8_t *d = dst + x;
2854
2855
4.70k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2856
4.70k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2857
4.70k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2858
4.70k
          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2859
4.70k
          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2860
4.70k
          s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
2861
4.70k
          s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
2862
2863
4.70k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2864
4.70k
          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2865
4.70k
          ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
2866
4.70k
          ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2867
4.70k
          ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2868
4.70k
          ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
2869
2870
4.70k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2871
4.70k
          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2872
4.70k
          tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
2873
4.70k
          tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2874
4.70k
          tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2875
4.70k
          tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
2876
2877
4.70k
          y = h;
2878
125k
          do {
2879
125k
            y_convolve_8tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2880
125k
                                      tt_256, r);
2881
125k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2882
2883
125k
            ss_256[0] = ss_256[1];
2884
125k
            ss_256[1] = ss_256[2];
2885
125k
            ss_256[2] = ss_256[3];
2886
125k
            ss_256[4] = ss_256[5];
2887
125k
            ss_256[5] = ss_256[6];
2888
125k
            ss_256[6] = ss_256[7];
2889
2890
125k
            tt_256[0] = tt_256[1];
2891
125k
            tt_256[1] = tt_256[2];
2892
125k
            tt_256[2] = tt_256[3];
2893
125k
            tt_256[4] = tt_256[5];
2894
125k
            tt_256[5] = tt_256[6];
2895
125k
            tt_256[6] = tt_256[7];
2896
125k
            s += 2 * src_stride;
2897
125k
            d += 2 * dst_stride;
2898
125k
            y -= 2;
2899
125k
          } while (y);
2900
2901
4.70k
          x += 32;
2902
4.70k
        } while (x < w);
2903
3.09k
      }
2904
10.2k
    }
2905
15.2k
  }
2906
625k
}
Unexecuted instantiation: convolve_2d_avx2.c:av1_convolve_y_sr_specialized_avx2
convolve_avx2.c:av1_convolve_y_sr_specialized_avx2
Line
Count
Source
2008
625k
    const int32_t subpel_y_q4) {
2009
625k
  int32_t x, y;
2010
625k
  __m128i coeffs_128[4];
2011
625k
  __m256i coeffs_256[4];
2012
2013
625k
  int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
2014
2015
625k
  if (vert_tap == 2) {
2016
    // vert_filt as 2 tap
2017
37.2k
    const uint8_t *src_ptr = src;
2018
2019
37.2k
    y = h;
2020
2021
37.2k
    if (subpel_y_q4 != 8) {
2022
18.5k
      if (w <= 8) {
2023
13.7k
        prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4,
2024
13.7k
                                       coeffs_128);
2025
2026
13.7k
        if (w == 2) {
2027
2.09k
          __m128i s_16[2];
2028
2029
2.09k
          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2030
2031
4.02k
          do {
2032
4.02k
            const __m128i res = y_convolve_2tap_2x2_ssse3(src_ptr, src_stride,
2033
4.02k
                                                          coeffs_128, s_16);
2034
4.02k
            const __m128i r = sr_y_round_sse2(res);
2035
4.02k
            pack_store_2x2_sse2(r, dst, dst_stride);
2036
4.02k
            src_ptr += 2 * src_stride;
2037
4.02k
            dst += 2 * dst_stride;
2038
4.02k
            y -= 2;
2039
4.02k
          } while (y);
2040
11.6k
        } else if (w == 4) {
2041
6.53k
          __m128i s_32[2];
2042
2043
6.53k
          s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2044
2045
19.5k
          do {
2046
19.5k
            const __m128i res = y_convolve_2tap_4x2_ssse3(src_ptr, src_stride,
2047
19.5k
                                                          coeffs_128, s_32);
2048
19.5k
            const __m128i r = sr_y_round_sse2(res);
2049
19.5k
            pack_store_4x2_sse2(r, dst, dst_stride);
2050
19.5k
            src_ptr += 2 * src_stride;
2051
19.5k
            dst += 2 * dst_stride;
2052
19.5k
            y -= 2;
2053
19.5k
          } while (y);
2054
6.53k
        } else {
2055
5.10k
          __m128i s_64[2], s_128[2];
2056
2057
5.10k
          assert(w == 8);
2058
2059
5.10k
          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2060
2061
18.4k
          do {
2062
            // Note: Faster than binding to AVX2 registers.
2063
18.4k
            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2064
18.4k
            s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
2065
18.4k
            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2066
18.4k
            s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
2067
18.4k
            const __m128i ss0 = _mm_unpacklo_epi8(s_128[0], s_128[1]);
2068
18.4k
            const __m128i ss1 = _mm_unpackhi_epi8(s_128[0], s_128[1]);
2069
18.4k
            const __m128i res0 = convolve_2tap_ssse3(&ss0, coeffs_128);
2070
18.4k
            const __m128i res1 = convolve_2tap_ssse3(&ss1, coeffs_128);
2071
18.4k
            const __m128i r0 = sr_y_round_sse2(res0);
2072
18.4k
            const __m128i r1 = sr_y_round_sse2(res1);
2073
18.4k
            const __m128i d = _mm_packus_epi16(r0, r1);
2074
18.4k
            _mm_storel_epi64((__m128i *)dst, d);
2075
18.4k
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2076
18.4k
            src_ptr += 2 * src_stride;
2077
18.4k
            dst += 2 * dst_stride;
2078
18.4k
            y -= 2;
2079
18.4k
          } while (y);
2080
5.10k
        }
2081
13.7k
      } else {
2082
4.83k
        prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2083
2084
4.83k
        if (w == 16) {
2085
2.66k
          __m128i s_128[2];
2086
2087
2.66k
          s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2088
2089
16.8k
          do {
2090
16.8k
            __m256i r[2];
2091
2092
16.8k
            y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2093
16.8k
                                      r);
2094
16.8k
            sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2095
16.8k
            src_ptr += 2 * src_stride;
2096
16.8k
            dst += 2 * dst_stride;
2097
16.8k
            y -= 2;
2098
16.8k
          } while (y);
2099
2.66k
        } else if (w == 32) {
2100
1.17k
          __m256i s_256[2];
2101
2102
1.17k
          s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2103
2104
14.5k
          do {
2105
14.5k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0],
2106
14.5k
                              &s_256[1], dst);
2107
14.5k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1],
2108
14.5k
                              &s_256[0], dst + dst_stride);
2109
14.5k
            src_ptr += 2 * src_stride;
2110
14.5k
            dst += 2 * dst_stride;
2111
14.5k
            y -= 2;
2112
14.5k
          } while (y);
2113
1.17k
        } else if (w == 64) {
2114
845
          __m256i s_256[2][2];
2115
2116
845
          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2117
845
          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2118
2119
19.0k
          do {
2120
19.0k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2121
19.0k
                              &s_256[1][0], dst);
2122
19.0k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 32, coeffs_256,
2123
19.0k
                              s_256[0][1], &s_256[1][1], dst + 32);
2124
19.0k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2125
19.0k
                              &s_256[0][0], dst + dst_stride);
2126
19.0k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 32, coeffs_256,
2127
19.0k
                              s_256[1][1], &s_256[0][1], dst + dst_stride + 32);
2128
2129
19.0k
            src_ptr += 2 * src_stride;
2130
19.0k
            dst += 2 * dst_stride;
2131
19.0k
            y -= 2;
2132
19.0k
          } while (y);
2133
845
        } else {
2134
151
          __m256i s_256[2][4];
2135
2136
151
          assert(w == 128);
2137
2138
151
          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2139
151
          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2140
151
          s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2141
151
          s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2142
2143
7.16k
          do {
2144
7.16k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2145
7.16k
                              &s_256[1][0], dst);
2146
7.16k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 1 * 32, coeffs_256,
2147
7.16k
                              s_256[0][1], &s_256[1][1], dst + 1 * 32);
2148
7.16k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 2 * 32, coeffs_256,
2149
7.16k
                              s_256[0][2], &s_256[1][2], dst + 2 * 32);
2150
7.16k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 3 * 32, coeffs_256,
2151
7.16k
                              s_256[0][3], &s_256[1][3], dst + 3 * 32);
2152
2153
7.16k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2154
7.16k
                              &s_256[0][0], dst + dst_stride);
2155
7.16k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32, coeffs_256,
2156
7.16k
                              s_256[1][1], &s_256[0][1],
2157
7.16k
                              dst + dst_stride + 1 * 32);
2158
7.16k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32, coeffs_256,
2159
7.16k
                              s_256[1][2], &s_256[0][2],
2160
7.16k
                              dst + dst_stride + 2 * 32);
2161
7.16k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32, coeffs_256,
2162
7.16k
                              s_256[1][3], &s_256[0][3],
2163
7.16k
                              dst + dst_stride + 3 * 32);
2164
2165
7.16k
            src_ptr += 2 * src_stride;
2166
7.16k
            dst += 2 * dst_stride;
2167
7.16k
            y -= 2;
2168
7.16k
          } while (y);
2169
151
        }
2170
4.83k
      }
2171
18.7k
    } else {
2172
      // average to get half pel
2173
18.7k
      if (w <= 8) {
2174
16.0k
        if (w == 2) {
2175
3.53k
          __m128i s_16[2];
2176
2177
3.53k
          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2178
2179
6.72k
          do {
2180
6.72k
            s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + src_stride));
2181
6.72k
            const __m128i d0 = _mm_avg_epu8(s_16[0], s_16[1]);
2182
6.72k
            *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d0);
2183
6.72k
            s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
2184
6.72k
            const __m128i d1 = _mm_avg_epu8(s_16[1], s_16[0]);
2185
6.72k
            *(int16_t *)(dst + dst_stride) = (int16_t)_mm_cvtsi128_si32(d1);
2186
6.72k
            src_ptr += 2 * src_stride;
2187
6.72k
            dst += 2 * dst_stride;
2188
6.72k
            y -= 2;
2189
6.72k
          } while (y);
2190
12.4k
        } else if (w == 4) {
2191
7.98k
          __m128i s_32[2];
2192
2193
7.98k
          s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2194
2195
22.2k
          do {
2196
22.2k
            s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + src_stride));
2197
22.2k
            const __m128i d0 = _mm_avg_epu8(s_32[0], s_32[1]);
2198
22.2k
            xx_storel_32(dst, d0);
2199
22.2k
            s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2200
22.2k
            const __m128i d1 = _mm_avg_epu8(s_32[1], s_32[0]);
2201
22.2k
            xx_storel_32(dst + dst_stride, d1);
2202
22.2k
            src_ptr += 2 * src_stride;
2203
22.2k
            dst += 2 * dst_stride;
2204
22.2k
            y -= 2;
2205
22.2k
          } while (y);
2206
7.98k
        } else {
2207
4.50k
          __m128i s_64[2];
2208
2209
4.50k
          assert(w == 8);
2210
2211
4.50k
          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2212
2213
14.4k
          do {
2214
            // Note: Faster than binding to AVX2 registers.
2215
14.4k
            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2216
14.4k
            const __m128i d0 = _mm_avg_epu8(s_64[0], s_64[1]);
2217
14.4k
            _mm_storel_epi64((__m128i *)dst, d0);
2218
14.4k
            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2219
14.4k
            const __m128i d1 = _mm_avg_epu8(s_64[1], s_64[0]);
2220
14.4k
            _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
2221
14.4k
            src_ptr += 2 * src_stride;
2222
14.4k
            dst += 2 * dst_stride;
2223
14.4k
            y -= 2;
2224
14.4k
          } while (y);
2225
4.50k
        }
2226
16.0k
      } else if (w == 16) {
2227
1.70k
        __m128i s_128[2];
2228
2229
1.70k
        s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2230
2231
9.96k
        do {
2232
9.96k
          s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
2233
9.96k
          const __m128i d0 = _mm_avg_epu8(s_128[0], s_128[1]);
2234
9.96k
          _mm_storeu_si128((__m128i *)dst, d0);
2235
9.96k
          s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2236
9.96k
          const __m128i d1 = _mm_avg_epu8(s_128[1], s_128[0]);
2237
9.96k
          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
2238
9.96k
          src_ptr += 2 * src_stride;
2239
9.96k
          dst += 2 * dst_stride;
2240
9.96k
          y -= 2;
2241
9.96k
        } while (y);
2242
1.70k
      } else if (w == 32) {
2243
623
        __m256i s_256[2];
2244
2245
623
        s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2246
2247
8.19k
        do {
2248
8.19k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0], &s_256[1], dst);
2249
8.19k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1], &s_256[0],
2250
8.19k
                                dst + dst_stride);
2251
8.19k
          src_ptr += 2 * src_stride;
2252
8.19k
          dst += 2 * dst_stride;
2253
8.19k
          y -= 2;
2254
8.19k
        } while (y);
2255
623
      } else if (w == 64) {
2256
207
        __m256i s_256[2][2];
2257
2258
207
        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2259
207
        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2260
2261
4.37k
        do {
2262
4.37k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2263
4.37k
                                dst);
2264
4.37k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 32, s_256[0][1],
2265
4.37k
                                &s_256[1][1], dst + 32);
2266
2267
4.37k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2268
4.37k
                                &s_256[0][0], dst + dst_stride);
2269
4.37k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 32, s_256[1][1],
2270
4.37k
                                &s_256[0][1], dst + dst_stride + 32);
2271
2272
4.37k
          src_ptr += 2 * src_stride;
2273
4.37k
          dst += 2 * dst_stride;
2274
4.37k
          y -= 2;
2275
4.37k
        } while (y);
2276
207
      } else {
2277
150
        __m256i s_256[2][4];
2278
2279
150
        assert(w == 128);
2280
2281
150
        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2282
150
        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2283
150
        s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2284
150
        s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2285
2286
6.36k
        do {
2287
6.36k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2288
6.36k
                                dst);
2289
6.36k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 1 * 32, s_256[0][1],
2290
6.36k
                                &s_256[1][1], dst + 1 * 32);
2291
6.36k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 2 * 32, s_256[0][2],
2292
6.36k
                                &s_256[1][2], dst + 2 * 32);
2293
6.36k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 3 * 32, s_256[0][3],
2294
6.36k
                                &s_256[1][3], dst + 3 * 32);
2295
2296
6.36k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2297
6.36k
                                &s_256[0][0], dst + dst_stride);
2298
6.36k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 1 * 32, s_256[1][1],
2299
6.36k
                                &s_256[0][1], dst + dst_stride + 1 * 32);
2300
6.36k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 2 * 32, s_256[1][2],
2301
6.36k
                                &s_256[0][2], dst + dst_stride + 2 * 32);
2302
6.36k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 3 * 32, s_256[1][3],
2303
6.36k
                                &s_256[0][3], dst + dst_stride + 3 * 32);
2304
2305
6.36k
          src_ptr += 2 * src_stride;
2306
6.36k
          dst += 2 * dst_stride;
2307
6.36k
          y -= 2;
2308
6.36k
        } while (y);
2309
150
      }
2310
18.7k
    }
2311
588k
  } else if (vert_tap == 4) {
2312
    // vert_filt as 4 tap
2313
296k
    const uint8_t *src_ptr = src - src_stride;
2314
2315
296k
    y = h;
2316
2317
296k
    if (w <= 4) {
2318
150k
      prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2319
2320
150k
      if (w == 2) {
2321
26.5k
        __m128i s_16[4], ss_128[2];
2322
2323
26.5k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2324
26.5k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2325
26.5k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2326
2327
26.5k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2328
26.5k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2329
2330
26.5k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2331
2332
43.3k
        do {
2333
43.3k
          src_ptr += 2 * src_stride;
2334
43.3k
          const __m128i res = y_convolve_4tap_2x2_ssse3(
2335
43.3k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2336
43.3k
          const __m128i r = sr_y_round_sse2(res);
2337
43.3k
          pack_store_2x2_sse2(r, dst, dst_stride);
2338
2339
43.3k
          ss_128[0] = ss_128[1];
2340
43.3k
          dst += 2 * dst_stride;
2341
43.3k
          y -= 2;
2342
43.3k
        } while (y);
2343
124k
      } else {
2344
124k
        __m128i s_32[4], ss_128[2];
2345
2346
124k
        assert(w == 4);
2347
2348
124k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2349
124k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2350
124k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2351
2352
124k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2353
124k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2354
2355
124k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2356
2357
249k
        do {
2358
249k
          src_ptr += 2 * src_stride;
2359
249k
          const __m128i res = y_convolve_4tap_4x2_ssse3(
2360
249k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2361
249k
          const __m128i r = sr_y_round_sse2(res);
2362
249k
          pack_store_4x2_sse2(r, dst, dst_stride);
2363
2364
249k
          ss_128[0] = ss_128[1];
2365
249k
          dst += 2 * dst_stride;
2366
249k
          y -= 2;
2367
249k
        } while (y);
2368
124k
      }
2369
150k
    } else {
2370
146k
      prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2371
2372
146k
      if (w == 8) {
2373
97.1k
        __m128i s_64[4];
2374
97.1k
        __m256i ss_256[2];
2375
2376
97.1k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2377
97.1k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2378
97.1k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2379
2380
        // Load lines a and b. Line a to lower 128, line b to upper 128
2381
97.1k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2382
97.1k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2383
2384
97.1k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2385
2386
193k
        do {
2387
193k
          src_ptr += 2 * src_stride;
2388
193k
          const __m256i res = y_convolve_4tap_8x2_avx2(
2389
193k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2390
193k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2391
2392
193k
          ss_256[0] = ss_256[1];
2393
193k
          dst += 2 * dst_stride;
2394
193k
          y -= 2;
2395
193k
        } while (y);
2396
97.1k
      } else if (w == 16) {
2397
44.5k
        __m128i s_128[4];
2398
44.5k
        __m256i ss_256[4], r[2];
2399
2400
44.5k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2401
44.5k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2402
44.5k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2403
2404
        // Load lines a and b. Line a to lower 128, line b to upper 128
2405
44.5k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2406
44.5k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2407
2408
44.5k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2409
44.5k
        ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
2410
2411
109k
        do {
2412
109k
          src_ptr += 2 * src_stride;
2413
109k
          y_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2414
109k
                                    ss_256, r);
2415
109k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2416
2417
109k
          ss_256[0] = ss_256[1];
2418
109k
          ss_256[2] = ss_256[3];
2419
109k
          dst += 2 * dst_stride;
2420
109k
          y -= 2;
2421
109k
        } while (y);
2422
44.5k
      } else if (w == 32) {
2423
        // The AV1 standard has no 32x4 case.
2424
        // This branch only serves an optimization feature that
2425
        // subsamples 32x8 to 32x4 and triggers the 4-tap filter.
2426
2427
3.15k
        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2428
2429
3.15k
        s_256[0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * src_stride));
2430
3.15k
        s_256[1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * src_stride));
2431
3.15k
        s_256[2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * src_stride));
2432
2433
3.15k
        ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2434
3.15k
        ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2435
2436
3.15k
        tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2437
3.15k
        tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2438
2439
26.5k
        do {
2440
26.5k
          src_ptr += 2 * src_stride;
2441
26.5k
          y_convolve_4tap_32x2_avx2(src_ptr, src_stride, coeffs_256, s_256,
2442
26.5k
                                    ss_256, tt_256, r);
2443
26.5k
          sr_y_round_store_32x2_avx2(r, dst, dst_stride);
2444
2445
26.5k
          ss_256[0] = ss_256[1];
2446
26.5k
          ss_256[2] = ss_256[3];
2447
2448
26.5k
          tt_256[0] = tt_256[1];
2449
26.5k
          tt_256[2] = tt_256[3];
2450
26.5k
          dst += 2 * dst_stride;
2451
26.5k
          y -= 2;
2452
26.5k
        } while (y);
2453
3.15k
      } else {
2454
1.24k
        assert(!(w % 32));
2455
2456
1.24k
        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2457
1.24k
        x = 0;
2458
3.01k
        do {
2459
3.01k
          const uint8_t *s = src_ptr + x;
2460
3.01k
          uint8_t *d = dst + x;
2461
3.01k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2462
3.01k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2463
3.01k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2464
2465
3.01k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2466
3.01k
          ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2467
2468
3.01k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2469
3.01k
          tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2470
2471
3.01k
          y = h;
2472
118k
          do {
2473
118k
            s += 2 * src_stride;
2474
118k
            y_convolve_4tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2475
118k
                                      tt_256, r);
2476
118k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2477
2478
118k
            ss_256[0] = ss_256[1];
2479
118k
            ss_256[2] = ss_256[3];
2480
2481
118k
            tt_256[0] = tt_256[1];
2482
118k
            tt_256[2] = tt_256[3];
2483
118k
            d += 2 * dst_stride;
2484
118k
            y -= 2;
2485
118k
          } while (y);
2486
3.01k
          x += 32;
2487
3.01k
        } while (x < w);
2488
1.24k
      }
2489
146k
    }
2490
296k
  } else if (vert_tap == 6) {
2491
    // vert_filt as 6 tap
2492
276k
    const uint8_t *src_ptr = src - 2 * src_stride;
2493
2494
276k
    if (w <= 4) {
2495
89.9k
      prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2496
2497
89.9k
      y = h;
2498
2499
89.9k
      if (w == 2) {
2500
17.8k
        __m128i s_16[6], ss_128[3];
2501
2502
17.8k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2503
17.8k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2504
17.8k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2505
17.8k
        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2506
17.8k
        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2507
2508
17.8k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2509
17.8k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2510
17.8k
        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2511
17.8k
        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2512
2513
17.8k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2514
17.8k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2515
2516
71.4k
        do {
2517
71.4k
          src_ptr += 2 * src_stride;
2518
71.4k
          const __m128i res = y_convolve_6tap_2x2_ssse3(
2519
71.4k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2520
71.4k
          const __m128i r = sr_y_round_sse2(res);
2521
71.4k
          pack_store_2x2_sse2(r, dst, dst_stride);
2522
2523
71.4k
          ss_128[0] = ss_128[1];
2524
71.4k
          ss_128[1] = ss_128[2];
2525
71.4k
          dst += 2 * dst_stride;
2526
71.4k
          y -= 2;
2527
71.4k
        } while (y);
2528
72.0k
      } else {
2529
72.0k
        __m128i s_32[6], ss_128[3];
2530
2531
72.0k
        assert(w == 4);
2532
2533
72.0k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2534
72.0k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2535
72.0k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2536
72.0k
        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2537
72.0k
        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2538
2539
72.0k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2540
72.0k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2541
72.0k
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2542
72.0k
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2543
2544
72.0k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2545
72.0k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2546
2547
395k
        do {
2548
395k
          src_ptr += 2 * src_stride;
2549
395k
          const __m128i res = y_convolve_6tap_4x2_ssse3(
2550
395k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2551
395k
          const __m128i r = sr_y_round_sse2(res);
2552
395k
          pack_store_4x2_sse2(r, dst, dst_stride);
2553
2554
395k
          ss_128[0] = ss_128[1];
2555
395k
          ss_128[1] = ss_128[2];
2556
395k
          dst += 2 * dst_stride;
2557
395k
          y -= 2;
2558
395k
        } while (y);
2559
72.0k
      }
2560
186k
    } else {
2561
186k
      prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2562
2563
186k
      if (w == 8) {
2564
92.4k
        __m128i s_64[6];
2565
92.4k
        __m256i ss_256[3];
2566
2567
92.4k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2568
92.4k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2569
92.4k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2570
92.4k
        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2571
92.4k
        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2572
2573
        // Load lines a and b. Line a to lower 128, line b to upper 128
2574
92.4k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2575
92.4k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2576
92.4k
        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2577
92.4k
        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2578
2579
92.4k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2580
92.4k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2581
2582
92.4k
        y = h;
2583
529k
        do {
2584
529k
          src_ptr += 2 * src_stride;
2585
529k
          const __m256i res = y_convolve_6tap_8x2_avx2(
2586
529k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2587
529k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2588
2589
529k
          ss_256[0] = ss_256[1];
2590
529k
          ss_256[1] = ss_256[2];
2591
529k
          dst += 2 * dst_stride;
2592
529k
          y -= 2;
2593
529k
        } while (y);
2594
93.6k
      } else if (w == 16) {
2595
64.6k
        __m128i s_128[6];
2596
64.6k
        __m256i ss_256[6], r[2];
2597
2598
64.6k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2599
64.6k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2600
64.6k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2601
64.6k
        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2602
64.6k
        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2603
2604
        // Load lines a and b. Line a to lower 128, line b to upper 128
2605
64.6k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2606
64.6k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2607
64.6k
        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2608
64.6k
        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2609
2610
64.6k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2611
64.6k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2612
2613
64.6k
        ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
2614
64.6k
        ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
2615
2616
64.6k
        y = h;
2617
466k
        do {
2618
466k
          src_ptr += 2 * src_stride;
2619
466k
          y_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2620
466k
                                    ss_256, r);
2621
466k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2622
2623
466k
          ss_256[0] = ss_256[1];
2624
466k
          ss_256[1] = ss_256[2];
2625
2626
466k
          ss_256[3] = ss_256[4];
2627
466k
          ss_256[4] = ss_256[5];
2628
466k
          dst += 2 * dst_stride;
2629
466k
          y -= 2;
2630
466k
        } while (y);
2631
64.6k
      } else {
2632
28.9k
        __m256i s_256[6], ss_256[6], tt_256[6], r[4];
2633
2634
28.9k
        assert(!(w % 32));
2635
2636
29.0k
        x = 0;
2637
34.8k
        do {
2638
34.8k
          const uint8_t *s = src_ptr + x;
2639
34.8k
          uint8_t *d = dst + x;
2640
2641
34.8k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2642
34.8k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2643
34.8k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2644
34.8k
          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2645
34.8k
          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2646
2647
34.8k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2648
34.8k
          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2649
34.8k
          ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2650
34.8k
          ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2651
2652
34.8k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2653
34.8k
          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2654
34.8k
          tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2655
34.8k
          tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2656
2657
34.8k
          y = h;
2658
587k
          do {
2659
587k
            s += 2 * src_stride;
2660
587k
            y_convolve_6tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2661
587k
                                      tt_256, r);
2662
587k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2663
2664
587k
            ss_256[0] = ss_256[1];
2665
587k
            ss_256[1] = ss_256[2];
2666
587k
            ss_256[3] = ss_256[4];
2667
587k
            ss_256[4] = ss_256[5];
2668
2669
587k
            tt_256[0] = tt_256[1];
2670
587k
            tt_256[1] = tt_256[2];
2671
587k
            tt_256[3] = tt_256[4];
2672
587k
            tt_256[4] = tt_256[5];
2673
587k
            d += 2 * dst_stride;
2674
587k
            y -= 2;
2675
587k
          } while (y);
2676
2677
34.8k
          x += 32;
2678
34.8k
        } while (x < w);
2679
29.0k
      }
2680
186k
    }
2681
276k
  } else if (vert_tap == 8) {
2682
    // vert_filt as 8 tap
2683
15.2k
    const uint8_t *src_ptr = src - 3 * src_stride;
2684
2685
15.2k
    if (w <= 4) {
2686
5.00k
      prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2687
2688
5.00k
      y = h;
2689
2690
5.00k
      if (w == 2) {
2691
1.35k
        __m128i s_16[8], ss_128[4];
2692
2693
1.35k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2694
1.35k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2695
1.35k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2696
1.35k
        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2697
1.35k
        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2698
1.35k
        s_16[5] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 5 * src_stride));
2699
1.35k
        s_16[6] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 6 * src_stride));
2700
2701
1.35k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2702
1.35k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2703
1.35k
        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2704
1.35k
        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2705
1.35k
        const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
2706
1.35k
        const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
2707
2708
1.35k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2709
1.35k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2710
1.35k
        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2711
2712
5.41k
        do {
2713
5.41k
          const __m128i res = y_convolve_8tap_2x2_ssse3(
2714
5.41k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2715
5.41k
          const __m128i r = sr_y_round_sse2(res);
2716
5.41k
          pack_store_2x2_sse2(r, dst, dst_stride);
2717
5.41k
          ss_128[0] = ss_128[1];
2718
5.41k
          ss_128[1] = ss_128[2];
2719
5.41k
          ss_128[2] = ss_128[3];
2720
5.41k
          src_ptr += 2 * src_stride;
2721
5.41k
          dst += 2 * dst_stride;
2722
5.41k
          y -= 2;
2723
5.41k
        } while (y);
2724
3.65k
      } else {
2725
3.65k
        __m128i s_32[8], ss_128[4];
2726
2727
3.65k
        assert(w == 4);
2728
2729
3.65k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2730
3.65k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2731
3.65k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2732
3.65k
        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2733
3.65k
        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2734
3.65k
        s_32[5] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 5 * src_stride));
2735
3.65k
        s_32[6] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 6 * src_stride));
2736
2737
3.65k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2738
3.65k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2739
3.65k
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2740
3.65k
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2741
3.65k
        const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
2742
3.65k
        const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
2743
2744
3.65k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2745
3.65k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2746
3.65k
        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2747
2748
20.1k
        do {
2749
20.1k
          const __m128i res = y_convolve_8tap_4x2_ssse3(
2750
20.1k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2751
20.1k
          const __m128i r = sr_y_round_sse2(res);
2752
20.1k
          pack_store_4x2_sse2(r, dst, dst_stride);
2753
20.1k
          ss_128[0] = ss_128[1];
2754
20.1k
          ss_128[1] = ss_128[2];
2755
20.1k
          ss_128[2] = ss_128[3];
2756
20.1k
          src_ptr += 2 * src_stride;
2757
20.1k
          dst += 2 * dst_stride;
2758
20.1k
          y -= 2;
2759
20.1k
        } while (y);
2760
3.65k
      }
2761
10.2k
    } else {
2762
10.2k
      prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2763
2764
10.2k
      if (w == 8) {
2765
4.09k
        __m128i s_64[8];
2766
4.09k
        __m256i ss_256[4];
2767
2768
4.09k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2769
4.09k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2770
4.09k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2771
4.09k
        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2772
4.09k
        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2773
4.09k
        s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
2774
4.09k
        s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
2775
2776
        // Load lines a and b. Line a to lower 128, line b to upper 128
2777
4.09k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2778
4.09k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2779
4.09k
        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2780
4.09k
        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2781
4.09k
        const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
2782
4.09k
        const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
2783
2784
4.09k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2785
4.09k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2786
4.09k
        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2787
2788
4.09k
        y = h;
2789
24.8k
        do {
2790
24.8k
          const __m256i res = y_convolve_8tap_8x2_avx2(
2791
24.8k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2792
24.8k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2793
24.8k
          ss_256[0] = ss_256[1];
2794
24.8k
          ss_256[1] = ss_256[2];
2795
24.8k
          ss_256[2] = ss_256[3];
2796
24.8k
          src_ptr += 2 * src_stride;
2797
24.8k
          dst += 2 * dst_stride;
2798
24.8k
          y -= 2;
2799
24.8k
        } while (y);
2800
6.13k
      } else if (w == 16) {
2801
3.03k
        __m128i s_128[8];
2802
3.03k
        __m256i ss_256[8], r[2];
2803
2804
3.03k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2805
3.03k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2806
3.03k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2807
3.03k
        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2808
3.03k
        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2809
3.03k
        s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
2810
3.03k
        s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
2811
2812
        // Load lines a and b. Line a to lower 128, line b to upper 128
2813
3.03k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2814
3.03k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2815
3.03k
        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2816
3.03k
        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2817
3.03k
        const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
2818
3.03k
        const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
2819
2820
3.03k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2821
3.03k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2822
3.03k
        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2823
2824
3.03k
        ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
2825
3.03k
        ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
2826
3.03k
        ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
2827
2828
3.03k
        y = h;
2829
26.2k
        do {
2830
26.2k
          y_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2831
26.2k
                                    ss_256, r);
2832
26.2k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2833
2834
26.2k
          ss_256[0] = ss_256[1];
2835
26.2k
          ss_256[1] = ss_256[2];
2836
26.2k
          ss_256[2] = ss_256[3];
2837
2838
26.2k
          ss_256[4] = ss_256[5];
2839
26.2k
          ss_256[5] = ss_256[6];
2840
26.2k
          ss_256[6] = ss_256[7];
2841
26.2k
          src_ptr += 2 * src_stride;
2842
26.2k
          dst += 2 * dst_stride;
2843
26.2k
          y -= 2;
2844
26.2k
        } while (y);
2845
3.09k
      } else {
2846
3.09k
        __m256i s_256[8], ss_256[8], tt_256[8], r[4];
2847
2848
3.09k
        assert(!(w % 32));
2849
2850
3.09k
        x = 0;
2851
4.70k
        do {
2852
4.70k
          const uint8_t *s = src_ptr + x;
2853
4.70k
          uint8_t *d = dst + x;
2854
2855
4.70k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2856
4.70k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2857
4.70k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2858
4.70k
          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2859
4.70k
          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2860
4.70k
          s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
2861
4.70k
          s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
2862
2863
4.70k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2864
4.70k
          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2865
4.70k
          ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
2866
4.70k
          ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2867
4.70k
          ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2868
4.70k
          ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
2869
2870
4.70k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2871
4.70k
          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2872
4.70k
          tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
2873
4.70k
          tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2874
4.70k
          tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2875
4.70k
          tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
2876
2877
4.70k
          y = h;
2878
125k
          do {
2879
125k
            y_convolve_8tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2880
125k
                                      tt_256, r);
2881
125k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2882
2883
125k
            ss_256[0] = ss_256[1];
2884
125k
            ss_256[1] = ss_256[2];
2885
125k
            ss_256[2] = ss_256[3];
2886
125k
            ss_256[4] = ss_256[5];
2887
125k
            ss_256[5] = ss_256[6];
2888
125k
            ss_256[6] = ss_256[7];
2889
2890
125k
            tt_256[0] = tt_256[1];
2891
125k
            tt_256[1] = tt_256[2];
2892
125k
            tt_256[2] = tt_256[3];
2893
125k
            tt_256[4] = tt_256[5];
2894
125k
            tt_256[5] = tt_256[6];
2895
125k
            tt_256[6] = tt_256[7];
2896
125k
            s += 2 * src_stride;
2897
125k
            d += 2 * dst_stride;
2898
125k
            y -= 2;
2899
125k
          } while (y);
2900
2901
4.70k
          x += 32;
2902
4.70k
        } while (x < w);
2903
3.09k
      }
2904
10.2k
    }
2905
15.2k
  }
2906
625k
}
2907
2908
// Runs the 2-tap horizontal filter on one 32-byte-wide segment of a row and
// writes the rounded result to dst. Callers in this file step src/dst in
// multiples of 32 to cover wider blocks.
//   src    - first input byte of the segment
//   coeffs - filter coefficients, forwarded to x_convolve_2tap_32_avx2()
//   dst    - output location for the segment
static inline void sr_x_2tap_32_avx2(const uint8_t *const src,
                                     const __m256i coeffs[1],
                                     uint8_t *const dst) {
  // Intermediate filter output; both vectors are filled by the convolve
  // helper and consumed by the round/store helper.
  __m256i filtered[2];

  x_convolve_2tap_32_avx2(src, coeffs, filtered);
  sr_x_round_store_32_avx2(filtered, dst);
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_2tap_32_avx2
convolve_avx2.c:sr_x_2tap_32_avx2
Line
Count
Source
2910
168k
                                     uint8_t *const dst) {
2911
168k
  __m256i r[2];
2912
2913
168k
  x_convolve_2tap_32_avx2(src, coeffs, r);
2914
168k
  sr_x_round_store_32_avx2(r, dst);
2915
168k
}
2916
2917
// Runs the 6-tap horizontal filter on one 32-byte-wide segment of a row and
// writes the rounded result to dst. Callers in this file step src/dst in
// multiples of 32 to cover wider blocks.
//   src    - first input byte of the segment (callers pass src - 2 based
//            pointers)
//   coeffs - filter coefficients, forwarded to x_convolve_6tap_32_avx2()
//   filt   - byte-shuffle masks, forwarded to x_convolve_6tap_32_avx2()
//   dst    - output location for the segment
static inline void sr_x_6tap_32_avx2(const uint8_t *const src,
                                     const __m256i coeffs[3],
                                     const __m256i filt[3],
                                     uint8_t *const dst) {
  // Intermediate filter output; both vectors are filled by the convolve
  // helper and consumed by the round/store helper.
  __m256i filtered[2];

  x_convolve_6tap_32_avx2(src, coeffs, filt, filtered);
  sr_x_round_store_32_avx2(filtered, dst);
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_6tap_32_avx2
convolve_avx2.c:sr_x_6tap_32_avx2
Line
Count
Source
2920
1.45M
                                     uint8_t *const dst) {
2921
1.45M
  __m256i r[2];
2922
2923
1.45M
  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
2924
1.45M
  sr_x_round_store_32_avx2(r, dst);
2925
1.45M
}
2926
2927
// Runs the 8-tap horizontal filter on one 32-byte-wide segment of a row and
// writes the rounded result to dst. Callers in this file step src/dst in
// multiples of 32 to cover wider blocks.
//   src    - first input byte of the segment (callers pass src - 3 based
//            pointers)
//   coeffs - filter coefficients, forwarded to x_convolve_8tap_32_avx2()
//   filt   - byte-shuffle masks, forwarded to x_convolve_8tap_32_avx2()
//   dst    - output location for the segment
static AOM_FORCE_INLINE void sr_x_8tap_32_avx2(const uint8_t *const src,
                                               const __m256i coeffs[4],
                                               const __m256i filt[4],
                                               uint8_t *const dst) {
  // Intermediate filter output; both vectors are filled by the convolve
  // helper and consumed by the round/store helper.
  __m256i filtered[2];

  x_convolve_8tap_32_avx2(src, coeffs, filt, filtered);
  sr_x_round_store_32_avx2(filtered, dst);
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_8tap_32_avx2
convolve_avx2.c:sr_x_8tap_32_avx2
Line
Count
Source
2930
224k
                                               uint8_t *const dst) {
2931
224k
  __m256i r[2];
2932
2933
224k
  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
2934
224k
  sr_x_round_store_32_avx2(r, dst);
2935
224k
}
2936
2937
// Horizontal convolution of a w x h block of 8-bit pixels from src to dst,
// dispatching on the tap count reported by get_filter_tap() and on the block
// width.
//
//   src, src_stride  - input pixels and row pitch. The 4/6/8-tap paths read
//                      starting 1/2/3 bytes to the left of src.
//   dst, dst_stride  - output pixels and row pitch.
//   w, h             - block size. The widths handled are 2, 4, 8, 16, 32,
//                      64 and 128 (the 4-tap wide path accepts any multiple
//                      of 16; the 6/8-tap paths start at w == 8). Every
//                      path for w <= 16 except the wide 4-tap loop (which
//                      also covers w == 16) consumes two rows per iteration
//                      and loops until the row counter reaches zero, so h
//                      must be even there.
//   filter_params_x  - interpolation filter description.
//   subpel_x_q4      - sub-pixel phase; with a 2-tap filter the value 8 is
//                      handled as a plain average of horizontally adjacent
//                      pixels.
//   conv_params      - only inspected by the asserts (round_0 must be 3).
//
// If get_filter_tap() returns anything other than 2, 4, 6 or 8 the function
// returns without writing dst.
static AOM_FORCE_INLINE void av1_convolve_x_sr_specialized_avx2(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
    const int32_t subpel_x_q4, ConvolveParams *conv_params) {
  int32_t y = h;
  __m128i coeffs_128[4];
  __m256i coeffs_256[4];

  assert(conv_params->round_0 == 3);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
  (void)conv_params;

  const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);

  if (horz_tap == 2) {
    // horz_filt as 2 tap
    const uint8_t *src_ptr = src;

    if (subpel_x_q4 != 8) {
      if (w <= 8) {
        prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4,
                                       coeffs_128);

        if (w == 2) {
          do {
            const __m128i res =
                x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
            const __m128i r = sr_x_round_sse2(res);
            pack_store_2x2_sse2(r, dst, dst_stride);
            src_ptr += 2 * src_stride;
            dst += 2 * dst_stride;
            y -= 2;
          } while (y);
        } else if (w == 4) {
          do {
            const __m128i res =
                x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
            const __m128i r = sr_x_round_sse2(res);
            pack_store_4x2_sse2(r, dst, dst_stride);
            src_ptr += 2 * src_stride;
            dst += 2 * dst_stride;
            y -= 2;
          } while (y);
        } else {
          assert(w == 8);

          do {
            __m128i res[2];

            x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, res);
            res[0] = sr_x_round_sse2(res[0]);
            res[1] = sr_x_round_sse2(res[1]);
            const __m128i d = _mm_packus_epi16(res[0], res[1]);
            _mm_storel_epi64((__m128i *)dst, d);
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);

            src_ptr += 2 * src_stride;
            dst += 2 * dst_stride;
            y -= 2;
          } while (y);
        }
      } else {
        prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);

        if (w == 16) {
          do {
            __m256i r[2];

            x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
            sr_x_round_store_16x2_avx2(r, dst, dst_stride);
            src_ptr += 2 * src_stride;
            dst += 2 * dst_stride;
            y -= 2;
          } while (y);
        } else if (w == 32) {
          do {
            sr_x_2tap_32_avx2(src_ptr, coeffs_256, dst);
            src_ptr += src_stride;
            dst += dst_stride;
          } while (--y);
        } else if (w == 64) {
          do {
            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
            src_ptr += src_stride;
            dst += dst_stride;
          } while (--y);
        } else {
          assert(w == 128);

          do {
            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
            sr_x_2tap_32_avx2(src_ptr + 2 * 32, coeffs_256, dst + 2 * 32);
            sr_x_2tap_32_avx2(src_ptr + 3 * 32, coeffs_256, dst + 3 * 32);
            src_ptr += src_stride;
            dst += dst_stride;
          } while (--y);
        }
      }
    } else {
      // average to get half pel
      if (w == 2) {
        do {
          __m128i s_128;

          s_128 = load_u8_4x2_sse4_1(src_ptr, src_stride);
          const __m128i s1 = _mm_srli_si128(s_128, 1);
          const __m128i d = _mm_avg_epu8(s_128, s1);
          // The two output pixels of the first row are the low 16 bits of d,
          // those of the second row are 16-bit lane 2. Write them byte by
          // byte (x86 is little-endian, so low byte first) rather than
          // through a uint16_t* cast of the uint8_t buffer, which would be a
          // type-punned and possibly misaligned store.
          const uint32_t row0 = (uint32_t)_mm_cvtsi128_si32(d);
          const uint32_t row1 = (uint32_t)_mm_extract_epi16(d, 2);
          dst[0] = (uint8_t)row0;
          dst[1] = (uint8_t)(row0 >> 8);
          dst[dst_stride] = (uint8_t)row1;
          dst[dst_stride + 1] = (uint8_t)(row1 >> 8);

          src_ptr += 2 * src_stride;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else if (w == 4) {
        do {
          __m128i s_128;

          s_128 = load_u8_8x2_sse2(src_ptr, src_stride);
          const __m128i s1 = _mm_srli_si128(s_128, 1);
          const __m128i d = _mm_avg_epu8(s_128, s1);
          xx_storel_32(dst, d);
          // Second row lives in 32-bit lane 2 (bytes 8..11). Shift it down
          // and reuse the same store helper as the first row instead of
          // writing through an int32_t* cast of the uint8_t buffer.
          xx_storel_32(dst + dst_stride, _mm_srli_si128(d, 8));

          src_ptr += 2 * src_stride;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else if (w == 8) {
        do {
          const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
          const __m128i s10 =
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
          const __m128i s01 = _mm_srli_si128(s00, 1);
          const __m128i s11 = _mm_srli_si128(s10, 1);
          const __m128i d0 = _mm_avg_epu8(s00, s01);
          const __m128i d1 = _mm_avg_epu8(s10, s11);
          _mm_storel_epi64((__m128i *)dst, d0);
          _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);

          src_ptr += 2 * src_stride;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else if (w == 16) {
        do {
          const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
          const __m128i s01 = _mm_loadu_si128((__m128i *)(src_ptr + 1));
          const __m128i s10 =
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
          const __m128i s11 =
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1));
          const __m128i d0 = _mm_avg_epu8(s00, s01);
          const __m128i d1 = _mm_avg_epu8(s10, s11);
          _mm_storeu_si128((__m128i *)dst, d0);
          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);

          src_ptr += 2 * src_stride;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else if (w == 32) {
        do {
          sr_x_2tap_32_avg_avx2(src_ptr, dst);
          src_ptr += src_stride;
          dst += dst_stride;
        } while (--y);
      } else if (w == 64) {
        do {
          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
          src_ptr += src_stride;
          dst += dst_stride;
        } while (--y);
      } else {
        assert(w == 128);

        do {
          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
          sr_x_2tap_32_avg_avx2(src_ptr + 2 * 32, dst + 2 * 32);
          sr_x_2tap_32_avg_avx2(src_ptr + 3 * 32, dst + 3 * 32);
          src_ptr += src_stride;
          dst += dst_stride;
        } while (--y);
      }
    }
  } else if (horz_tap == 4) {
    // horz_filt as 4 tap
    const uint8_t *src_ptr = src - 1;

    prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);

    if (w == 2) {
      do {
        const __m128i res =
            x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
        const __m128i r = sr_x_round_sse2(res);
        pack_store_2x2_sse2(r, dst, dst_stride);
        src_ptr += 2 * src_stride;
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    } else if (w == 4) {
      do {
        const __m128i res =
            x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
        const __m128i r = sr_x_round_sse2(res);
        pack_store_4x2_sse2(r, dst, dst_stride);
        src_ptr += 2 * src_stride;
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    } else if (w == 8) {
      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
      // rewrite this for better performance later.
      __m256i filt_256[2];
      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);

      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
      for (int i = 0; i < h; i += 2) {
        // Row i goes to the low 128 bits, row i + 1 to the high 128 bits.
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);

        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
        res_16b = sr_x_round_avx2(res_16b);

        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);

        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
      }
    } else {
      assert(!(w % 16));
      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
      // rewrite this for better performance later.
      __m256i filt_256[2];
      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);

      for (int i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
          // 18 19 20 21 22 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b =
              convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
          res_16b = sr_x_round_avx2(res_16b);

          /* rounding code */
          // 8 bit conversion and saturation to uint8
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
        }
      }
    }
  } else {
    __m256i filt_256[4];

    filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
    filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
    filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);

    if (horz_tap == 6) {
      // horz_filt as 6 tap
      const uint8_t *src_ptr = src - 2;

      prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);

      if (w == 8) {
        do {
          const __m256i res = x_convolve_6tap_8x2_avx2(src_ptr, src_stride,
                                                       coeffs_256, filt_256);
          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
          src_ptr += 2 * src_stride;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else if (w == 16) {
        do {
          __m256i r[2];

          x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
                                    r);
          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
          src_ptr += 2 * src_stride;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else if (w == 32) {
        do {
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
          src_ptr += src_stride;
          dst += dst_stride;
        } while (--y);
      } else if (w == 64) {
        do {
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
          sr_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
          src_ptr += src_stride;
          dst += dst_stride;
        } while (--y);
      } else {
        assert(w == 128);

        do {
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
          sr_x_6tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
                            dst + 1 * 32);
          sr_x_6tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
                            dst + 2 * 32);
          sr_x_6tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
                            dst + 3 * 32);
          src_ptr += src_stride;
          dst += dst_stride;
        } while (--y);
      }
    } else if (horz_tap == 8) {
      // horz_filt as 8 tap
      const uint8_t *src_ptr = src - 3;

      filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);

      prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);

      if (w == 8) {
        do {
          const __m256i res = x_convolve_8tap_8x2_avx2(src_ptr, src_stride,
                                                       coeffs_256, filt_256);
          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
          src_ptr += 2 * src_stride;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else if (w == 16) {
        do {
          __m256i r[2];

          x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
                                    r);
          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
          src_ptr += 2 * src_stride;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else if (w == 32) {
        do {
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
          src_ptr += src_stride;
          dst += dst_stride;
        } while (--y);
      } else if (w == 64) {
        do {
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
          sr_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
          src_ptr += src_stride;
          dst += dst_stride;
        } while (--y);
      } else {
        assert(w == 128);

        do {
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
          sr_x_8tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
                            dst + 1 * 32);
          sr_x_8tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
                            dst + 2 * 32);
          sr_x_8tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
                            dst + 3 * 32);
          src_ptr += src_stride;
          dst += dst_stride;
        } while (--y);
      }
    }
  }
}
Unexecuted instantiation: convolve_2d_avx2.c:av1_convolve_x_sr_specialized_avx2
convolve_avx2.c:av1_convolve_x_sr_specialized_avx2
Line
Count
Source
2940
601k
    const int32_t subpel_x_q4, ConvolveParams *conv_params) {
2941
601k
  int32_t y = h;
2942
601k
  __m128i coeffs_128[4];
2943
601k
  __m256i coeffs_256[4];
2944
2945
601k
  assert(conv_params->round_0 == 3);
2946
601k
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
2947
601k
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
2948
601k
  (void)conv_params;
2949
2950
601k
  const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);
2951
2952
601k
  if (horz_tap == 2) {
2953
    // horz_filt as 2 tap
2954
44.2k
    const uint8_t *src_ptr = src;
2955
2956
44.2k
    if (subpel_x_q4 != 8) {
2957
20.3k
      if (w <= 8) {
2958
15.8k
        prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4,
2959
15.8k
                                       coeffs_128);
2960
2961
15.8k
        if (w == 2) {
2962
5.74k
          do {
2963
5.74k
            const __m128i res =
2964
5.74k
                x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
2965
5.74k
            const __m128i r = sr_x_round_sse2(res);
2966
5.74k
            pack_store_2x2_sse2(r, dst, dst_stride);
2967
5.74k
            src_ptr += 2 * src_stride;
2968
5.74k
            dst += 2 * dst_stride;
2969
5.74k
            y -= 2;
2970
5.74k
          } while (y);
2971
12.7k
        } else if (w == 4) {
2972
25.4k
          do {
2973
25.4k
            const __m128i res =
2974
25.4k
                x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
2975
25.4k
            const __m128i r = sr_x_round_sse2(res);
2976
25.4k
            pack_store_4x2_sse2(r, dst, dst_stride);
2977
25.4k
            src_ptr += 2 * src_stride;
2978
25.4k
            dst += 2 * dst_stride;
2979
25.4k
            y -= 2;
2980
25.4k
          } while (y);
2981
7.28k
        } else {
2982
5.45k
          assert(w == 8);
2983
2984
21.1k
          do {
2985
21.1k
            __m128i res[2];
2986
2987
21.1k
            x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, res);
2988
21.1k
            res[0] = sr_x_round_sse2(res[0]);
2989
21.1k
            res[1] = sr_x_round_sse2(res[1]);
2990
21.1k
            const __m128i d = _mm_packus_epi16(res[0], res[1]);
2991
21.1k
            _mm_storel_epi64((__m128i *)dst, d);
2992
21.1k
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2993
2994
21.1k
            src_ptr += 2 * src_stride;
2995
21.1k
            dst += 2 * dst_stride;
2996
21.1k
            y -= 2;
2997
21.1k
          } while (y);
2998
5.45k
        }
2999
15.8k
      } else {
3000
4.54k
        prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3001
3002
4.54k
        if (w == 16) {
3003
14.4k
          do {
3004
14.4k
            __m256i r[2];
3005
3006
14.4k
            x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
3007
14.4k
            sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3008
14.4k
            src_ptr += 2 * src_stride;
3009
14.4k
            dst += 2 * dst_stride;
3010
14.4k
            y -= 2;
3011
14.4k
          } while (y);
3012
2.67k
        } else if (w == 32) {
3013
22.9k
          do {
3014
22.9k
            sr_x_2tap_32_avx2(src_ptr, coeffs_256, dst);
3015
22.9k
            src_ptr += src_stride;
3016
22.9k
            dst += dst_stride;
3017
22.9k
          } while (--y);
3018
945
        } else if (w == 64) {
3019
32.6k
          do {
3020
32.6k
            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
3021
32.6k
            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
3022
32.6k
            src_ptr += src_stride;
3023
32.6k
            dst += dst_stride;
3024
32.6k
          } while (--y);
3025
719
        } else {
3026
206
          assert(w == 128);
3027
3028
20.0k
          do {
3029
20.0k
            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
3030
20.0k
            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
3031
20.0k
            sr_x_2tap_32_avx2(src_ptr + 2 * 32, coeffs_256, dst + 2 * 32);
3032
20.0k
            sr_x_2tap_32_avx2(src_ptr + 3 * 32, coeffs_256, dst + 3 * 32);
3033
20.0k
            src_ptr += src_stride;
3034
20.0k
            dst += dst_stride;
3035
20.0k
          } while (--y);
3036
206
        }
3037
4.54k
      }
3038
23.8k
    } else {
3039
      // average to get half pel
3040
23.8k
      if (w == 2) {
3041
6.61k
        do {
3042
6.61k
          __m128i s_128;
3043
3044
6.61k
          s_128 = load_u8_4x2_sse4_1(src_ptr, src_stride);
3045
6.61k
          const __m128i s1 = _mm_srli_si128(s_128, 1);
3046
6.61k
          const __m128i d = _mm_avg_epu8(s_128, s1);
3047
6.61k
          *(uint16_t *)dst = (uint16_t)_mm_cvtsi128_si32(d);
3048
6.61k
          *(uint16_t *)(dst + dst_stride) = _mm_extract_epi16(d, 2);
3049
3050
6.61k
          src_ptr += 2 * src_stride;
3051
6.61k
          dst += 2 * dst_stride;
3052
6.61k
          y -= 2;
3053
6.61k
        } while (y);
3054
20.6k
      } else if (w == 4) {
3055
27.1k
        do {
3056
27.1k
          __m128i s_128;
3057
3058
27.1k
          s_128 = load_u8_8x2_sse2(src_ptr, src_stride);
3059
27.1k
          const __m128i s1 = _mm_srli_si128(s_128, 1);
3060
27.1k
          const __m128i d = _mm_avg_epu8(s_128, s1);
3061
27.1k
          xx_storel_32(dst, d);
3062
27.1k
          *(int32_t *)(dst + dst_stride) = _mm_extract_epi32(d, 2);
3063
3064
27.1k
          src_ptr += 2 * src_stride;
3065
27.1k
          dst += 2 * dst_stride;
3066
27.1k
          y -= 2;
3067
27.1k
        } while (y);
3068
11.1k
      } else if (w == 8) {
3069
23.2k
        do {
3070
23.2k
          const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
3071
23.2k
          const __m128i s10 =
3072
23.2k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
3073
23.2k
          const __m128i s01 = _mm_srli_si128(s00, 1);
3074
23.2k
          const __m128i s11 = _mm_srli_si128(s10, 1);
3075
23.2k
          const __m128i d0 = _mm_avg_epu8(s00, s01);
3076
23.2k
          const __m128i d1 = _mm_avg_epu8(s10, s11);
3077
23.2k
          _mm_storel_epi64((__m128i *)dst, d0);
3078
23.2k
          _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
3079
3080
23.2k
          src_ptr += 2 * src_stride;
3081
23.2k
          dst += 2 * dst_stride;
3082
23.2k
          y -= 2;
3083
23.2k
        } while (y);
3084
6.24k
      } else if (w == 16) {
3085
19.4k
        do {
3086
19.4k
          const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
3087
19.4k
          const __m128i s01 = _mm_loadu_si128((__m128i *)(src_ptr + 1));
3088
19.4k
          const __m128i s10 =
3089
19.4k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
3090
19.4k
          const __m128i s11 =
3091
19.4k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1));
3092
19.4k
          const __m128i d0 = _mm_avg_epu8(s00, s01);
3093
19.4k
          const __m128i d1 = _mm_avg_epu8(s10, s11);
3094
19.4k
          _mm_storeu_si128((__m128i *)dst, d0);
3095
19.4k
          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
3096
3097
19.4k
          src_ptr += 2 * src_stride;
3098
19.4k
          dst += 2 * dst_stride;
3099
19.4k
          y -= 2;
3100
19.4k
        } while (y);
3101
3.01k
      } else if (w == 32) {
3102
30.5k
        do {
3103
30.5k
          sr_x_2tap_32_avg_avx2(src_ptr, dst);
3104
30.5k
          src_ptr += src_stride;
3105
30.5k
          dst += dst_stride;
3106
30.5k
        } while (--y);
3107
1.22k
      } else if (w == 64) {
3108
21.5k
        do {
3109
21.5k
          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
3110
21.5k
          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
3111
21.5k
          src_ptr += src_stride;
3112
21.5k
          dst += dst_stride;
3113
21.5k
        } while (--y);
3114
458
      } else {
3115
217
        assert(w == 128);
3116
3117
23.2k
        do {
3118
23.2k
          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
3119
23.2k
          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
3120
23.2k
          sr_x_2tap_32_avg_avx2(src_ptr + 2 * 32, dst + 2 * 32);
3121
23.2k
          sr_x_2tap_32_avg_avx2(src_ptr + 3 * 32, dst + 3 * 32);
3122
23.2k
          src_ptr += src_stride;
3123
23.2k
          dst += dst_stride;
3124
23.2k
        } while (--y);
3125
217
      }
3126
23.8k
    }
3127
556k
  } else if (horz_tap == 4) {
3128
    // horz_filt as 4 tap
3129
251k
    const uint8_t *src_ptr = src - 1;
3130
3131
251k
    prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
3132
3133
251k
    if (w == 2) {
3134
141k
      do {
3135
141k
        const __m128i res =
3136
141k
            x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
3137
141k
        const __m128i r = sr_x_round_sse2(res);
3138
141k
        pack_store_2x2_sse2(r, dst, dst_stride);
3139
141k
        src_ptr += 2 * src_stride;
3140
141k
        dst += 2 * dst_stride;
3141
141k
        y -= 2;
3142
141k
      } while (y);
3143
198k
    } else if (w == 4) {
3144
623k
      do {
3145
623k
        const __m128i res =
3146
623k
            x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
3147
623k
        const __m128i r = sr_x_round_sse2(res);
3148
623k
        pack_store_4x2_sse2(r, dst, dst_stride);
3149
623k
        src_ptr += 2 * src_stride;
3150
623k
        dst += 2 * dst_stride;
3151
623k
        y -= 2;
3152
623k
      } while (y);
3153
180k
    } else if (w == 8) {
3154
      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
3155
      // rewrite this for better performance later.
3156
11.3k
      __m256i filt_256[2];
3157
11.3k
      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
3158
3159
11.3k
      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3160
11.3k
      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3161
49.5k
      for (int i = 0; i < h; i += 2) {
3162
38.2k
        const __m256i data = _mm256_permute2x128_si256(
3163
38.2k
            _mm256_castsi128_si256(
3164
38.2k
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
3165
38.2k
            _mm256_castsi128_si256(_mm_loadu_si128(
3166
38.2k
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
3167
38.2k
            0x20);
3168
3169
38.2k
        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
3170
38.2k
        res_16b = sr_x_round_avx2(res_16b);
3171
3172
38.2k
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
3173
3174
38.2k
        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
3175
38.2k
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
3176
3177
38.2k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
3178
38.2k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
3179
38.2k
      }
3180
11.3k
    } else {
3181
6.11k
      assert(!(w % 16));
3182
      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
3183
      // rewrite this for better performance later.
3184
6.11k
      __m256i filt_256[2];
3185
6.11k
      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
3186
6.11k
      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3187
6.11k
      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3188
3189
134k
      for (int i = 0; i < h; ++i) {
3190
551k
        for (int j = 0; j < w; j += 16) {
3191
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
3192
          // 18 19 20 21 22 23
3193
423k
          const __m256i data = _mm256_inserti128_si256(
3194
423k
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
3195
423k
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
3196
423k
              1);
3197
3198
423k
          __m256i res_16b =
3199
423k
              convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
3200
423k
          res_16b = sr_x_round_avx2(res_16b);
3201
3202
          /* rounding code */
3203
          // 8 bit conversion and saturation to uint8
3204
423k
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
3205
3206
          // Store values into the destination buffer
3207
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
3208
423k
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
3209
423k
          __m128i res = _mm256_castsi256_si128(res_8b);
3210
423k
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
3211
423k
        }
3212
128k
      }
3213
6.11k
    }
3214
305k
  } else {
3215
305k
    __m256i filt_256[4];
3216
3217
305k
    filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3218
305k
    filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3219
305k
    filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
3220
3221
305k
    if (horz_tap == 6) {
3222
      // horz_filt as 6 tap
3223
292k
      const uint8_t *src_ptr = src - 2;
3224
3225
292k
      prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3226
3227
292k
      if (w == 8) {
3228
628k
        do {
3229
628k
          const __m256i res = x_convolve_6tap_8x2_avx2(src_ptr, src_stride,
3230
628k
                                                       coeffs_256, filt_256);
3231
628k
          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
3232
628k
          src_ptr += 2 * src_stride;
3233
628k
          dst += 2 * dst_stride;
3234
628k
          y -= 2;
3235
628k
        } while (y);
3236
163k
      } else if (w == 16) {
3237
509k
        do {
3238
509k
          __m256i r[2];
3239
3240
509k
          x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
3241
509k
                                    r);
3242
509k
          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3243
509k
          src_ptr += 2 * src_stride;
3244
509k
          dst += 2 * dst_stride;
3245
509k
          y -= 2;
3246
509k
        } while (y);
3247
98.0k
      } else if (w == 32) {
3248
475k
        do {
3249
475k
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3250
475k
          src_ptr += src_stride;
3251
475k
          dst += dst_stride;
3252
475k
        } while (--y);
3253
24.5k
      } else if (w == 64) {
3254
229k
        do {
3255
229k
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3256
229k
          sr_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
3257
229k
          src_ptr += src_stride;
3258
229k
          dst += dst_stride;
3259
229k
        } while (--y);
3260
4.82k
      } else {
3261
1.09k
        assert(w == 128);
3262
3263
129k
        do {
3264
129k
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3265
129k
          sr_x_6tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
3266
129k
                            dst + 1 * 32);
3267
129k
          sr_x_6tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
3268
129k
                            dst + 2 * 32);
3269
129k
          sr_x_6tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
3270
129k
                            dst + 3 * 32);
3271
129k
          src_ptr += src_stride;
3272
129k
          dst += dst_stride;
3273
129k
        } while (--y);
3274
1.10k
      }
3275
292k
    } else if (horz_tap == 8) {
3276
      // horz_filt as 8 tap
3277
13.1k
      const uint8_t *src_ptr = src - 3;
3278
3279
13.1k
      filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);
3280
3281
13.1k
      prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3282
3283
13.1k
      if (w == 8) {
3284
26.5k
        do {
3285
26.5k
          const __m256i res = x_convolve_8tap_8x2_avx2(src_ptr, src_stride,
3286
26.5k
                                                       coeffs_256, filt_256);
3287
26.5k
          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
3288
26.5k
          src_ptr += 2 * src_stride;
3289
26.5k
          dst += 2 * dst_stride;
3290
26.5k
          y -= 2;
3291
26.5k
        } while (y);
3292
6.62k
      } else if (w == 16) {
3293
24.2k
        do {
3294
24.2k
          __m256i r[2];
3295
3296
24.2k
          x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
3297
24.2k
                                    r);
3298
24.2k
          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3299
24.2k
          src_ptr += 2 * src_stride;
3300
24.2k
          dst += 2 * dst_stride;
3301
24.2k
          y -= 2;
3302
24.2k
        } while (y);
3303
3.98k
      } else if (w == 32) {
3304
37.5k
        do {
3305
37.5k
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3306
37.5k
          src_ptr += src_stride;
3307
37.5k
          dst += dst_stride;
3308
37.5k
        } while (--y);
3309
1.52k
      } else if (w == 64) {
3310
39.3k
        do {
3311
39.3k
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3312
39.3k
          sr_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
3313
39.3k
          src_ptr += src_stride;
3314
39.3k
          dst += dst_stride;
3315
39.3k
        } while (--y);
3316
826
      } else {
3317
288
        assert(w == 128);
3318
3319
27.1k
        do {
3320
27.1k
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3321
27.1k
          sr_x_8tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
3322
27.1k
                            dst + 1 * 32);
3323
27.1k
          sr_x_8tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
3324
27.1k
                            dst + 2 * 32);
3325
27.1k
          sr_x_8tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
3326
27.1k
                            dst + 3 * 32);
3327
27.1k
          src_ptr += src_stride;
3328
27.1k
          dst += dst_stride;
3329
27.1k
        } while (--y);
3330
288
      }
3331
13.1k
    }
3332
305k
  }
3333
601k
}
3334
3335
#endif  // THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_