Coverage Report

Created: 2026-03-08 06:51

/src/aom/third_party/SVT-AV1/convolve_avx2.h
 Line| Count|Source
    1|      |/*
    2|      | * Copyright (c) 2018, Alliance for Open Media. All rights reserved
    3|      | *
    4|      | * This source code is subject to the terms of the BSD 2 Clause License and
    5|      | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
    6|      | * was not distributed with this source code in the LICENSE file, you can
    7|      | * obtain it at www.aomedia.org/license/software. If the Alliance for Open
    8|      | * Media Patent License 1.0 was not distributed with this source code in the
    9|      | * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
   10|      | */
   11|      |
   12|      |#ifndef THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
   13|      |#define THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
   14|      |
   15|      |#include "EbMemory_AVX2.h"
   16|      |#include "EbMemory_SSE4_1.h"
   17|      |#include "synonyms.h"
   18|      |
   19|      |#include "aom_dsp/aom_filter.h"
   20|      |#include "aom_dsp/x86/convolve_avx2.h"
   21|      |#include "aom_dsp/x86/mem_sse2.h"
   22|      |
   23|      |static inline void populate_coeffs_4tap_avx2(const __m128i coeffs_128,
   24|  176k|                                             __m256i coeffs[2]) {
   25|  176k|  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
   26|      |
   27|      |  // coeffs 2 3 2 3 2 3 2 3
   28|  176k|  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
   29|      |  // coeffs 4 5 4 5 4 5 4 5
   30|  176k|  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
   31|  176k|}
   32|      |
   33|      |static inline void populate_coeffs_6tap_avx2(const __m128i coeffs_128,
   34|  201k|                                             __m256i coeffs[3]) {
   35|  201k|  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
   36|      |
   37|      |  // coeffs 1 2 1 2 1 2 1 2
   38|  201k|  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
   39|      |  // coeffs 3 4 3 4 3 4 3 4
   40|  201k|  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
   41|      |  // coeffs 5 6 5 6 5 6 5 6
   42|  201k|  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
   43|  201k|}
   44|      |
   45|      |static inline void populate_coeffs_8tap_avx2(const __m128i coeffs_128,
   46| 11.5k|                                             __m256i coeffs[4]) {
   47| 11.5k|  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
   48|      |
   49|      |  // coeffs 0 1 0 1 0 1 0 1
   50| 11.5k|  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
   51|      |  // coeffs 2 3 2 3 2 3 2 3
   52| 11.5k|  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
   53|      |  // coeffs 4 5 4 5 4 5 4 5
   54| 11.5k|  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
   55|      |  // coeffs 6 7 6 7 6 7 6 7
   56| 11.5k|  coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
   57| 11.5k|}
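
Note: the populate_coeffs_* helpers above rely on the halved taps (see the prepare_half_coeffs_* functions below) fitting in the low byte of each 16-bit lane, so a _mm256_set1_epi16 mask such as 0x0604 replicates the byte pair (tap 2, tap 3) into every lane for _mm256_maddubs_epi16. A standalone harness of mine (not part of this header; the tap values are made-up even examples) showing the effect of the 0x0604 mask on the 128-bit half:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  /* Pretend these are already-halved 16-bit taps; each fits in one byte. */
  const short taps[8] = {0, 1, -3, 63, 4, -1, 0, 0};
  const __m128i c128 = _mm_loadu_si128((const __m128i *)taps);
  /* Mask 0x0604 picks byte 4 (low byte of taps[2]) and byte 6 (low byte
   * of taps[3]) for every 16-bit lane, mirroring coeffs[0] above. */
  const __m128i pair = _mm_shuffle_epi8(c128, _mm_set1_epi16(0x0604));
  signed char out[16];
  _mm_storeu_si128((__m128i *)out, pair);
  for (int i = 0; i < 16; i += 2) printf("(%d,%d) ", out[i], out[i + 1]);
  printf("\n"); /* prints (-3,63) eight times */
  return 0;
}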
   58|      |
   59|      |static inline void prepare_half_coeffs_2tap_ssse3(
   60|      |    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
   61| 11.0k|    __m128i *const coeffs /* [1] */) {
   62| 11.0k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
   63| 11.0k|      filter_params, subpel_q4 & SUBPEL_MASK);
   64| 11.0k|  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
   65|      |
   66|      |  // Right shift all filter coefficients by 1 to reduce the bits required.
   67|      |  // This extra right shift will be taken care of at the end while rounding
   68|      |  // the result.
   69|      |  // Since all filter coefficients are even, this change will not affect the
   70|      |  // end result.
   71| 11.0k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
   72| 11.0k|                            _mm_set1_epi16((short)0xffff)));
   73|      |
   74| 11.0k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
   75|      |
   76|      |  // coeffs 3 4 3 4 3 4 3 4
   77| 11.0k|  *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
   78| 11.0k|}
   79|      |
   80|      |static inline void prepare_half_coeffs_4tap_ssse3(
   81|      |    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
   82|  166k|    __m128i *const coeffs /* [2] */) {
   83|  166k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
   84|  166k|      filter_params, subpel_q4 & SUBPEL_MASK);
   85|  166k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
   86|      |
   87|      |  // Right shift all filter coefficients by 1 to reduce the bits required.
   88|      |  // This extra right shift will be taken care of at the end while rounding
   89|      |  // the result.
   90|      |  // Since all filter coefficients are even, this change will not affect the
   91|      |  // end result.
   92|  166k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
   93|  166k|                            _mm_set1_epi16((short)0xffff)));
   94|      |
   95|  166k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
   96|      |
   97|      |  // coeffs 2 3 2 3 2 3 2 3
   98|  166k|  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
   99|      |  // coeffs 4 5 4 5 4 5 4 5
  100|  166k|  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
  101|  166k|}
  102|      |
  103|      |static inline void prepare_half_coeffs_6tap_ssse3(
  104|      |    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
  105| 96.7k|    __m128i *const coeffs /* [3] */) {
  106| 96.7k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  107| 96.7k|      filter_params, subpel_q4 & SUBPEL_MASK);
  108| 96.7k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  109|      |
  110|      |  // Right shift all filter coefficients by 1 to reduce the bits required.
  111|      |  // This extra right shift will be taken care of at the end while rounding
  112|      |  // the result.
  113|      |  // Since all filter coefficients are even, this change will not affect the
  114|      |  // end result.
  115| 96.7k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  116| 96.7k|                            _mm_set1_epi16((short)0xffff)));
  117|      |
  118| 96.7k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  119|      |
  120|      |  // coeffs 1 2 1 2 1 2 1 2
  121| 96.7k|  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
  122|      |  // coeffs 3 4 3 4 3 4 3 4
  123| 96.7k|  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
  124|      |  // coeffs 5 6 5 6 5 6 5 6
  125| 96.7k|  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0C0Au));
  126| 96.7k|}
  127|      |
  128|      |static inline void prepare_half_coeffs_8tap_ssse3(
  129|      |    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
  130| 7.23k|    __m128i *const coeffs /* [4] */) {
  131| 7.23k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  132| 7.23k|      filter_params, subpel_q4 & SUBPEL_MASK);
  133| 7.23k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  134|      |
  135|      |  // Right shift all filter coefficients by 1 to reduce the bits required.
  136|      |  // This extra right shift will be taken care of at the end while rounding
  137|      |  // the result.
  138|      |  // Since all filter coefficients are even, this change will not affect the
  139|      |  // end result.
  140| 7.23k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  141| 7.23k|                            _mm_set1_epi16((short)0xffff)));
  142|      |
  143| 7.23k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  144|      |
  145|      |  // coeffs 0 1 0 1 0 1 0 1
  146| 7.23k|  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
  147|      |  // coeffs 2 3 2 3 2 3 2 3
  148| 7.23k|  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
  149|      |  // coeffs 4 5 4 5 4 5 4 5
  150| 7.23k|  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
  151|      |  // coeffs 6 7 6 7 6 7 6 7
  152| 7.23k|  coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
  153| 7.23k|}
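
Note: the comment block repeated in the prepare_half_coeffs_* helpers compresses to a small arithmetic fact: AV1 subpel taps are all even and sum to 128, so halving them is exact and the final rounding shift shrinks from 7 bits (bias 64) to 6 bits (bias 32), which is what sr_y_round_* does further down. A standalone check of that equivalence (my own sketch, not library code; the tap row is a genuine even 8-tap example, the pixels arbitrary):

#include <assert.h>
#include <stdint.h>

static int convolve_full(const int16_t t[8], const uint8_t p[8]) {
  int32_t acc = 0;
  for (int k = 0; k < 8; ++k) acc += t[k] * p[k];
  return (acc + 64) >> 7; /* FILTER_BITS = 7, bias 64 */
}

static int convolve_halved(const int16_t t[8], const uint8_t p[8]) {
  int32_t acc = 0;
  for (int k = 0; k < 8; ++k) acc += (t[k] >> 1) * p[k]; /* exact: taps even */
  return (acc + 32) >> 6; /* compensated round, as in sr_y_round_* */
}

int main(void) {
  const int16_t t[8] = {0, 2, -6, 126, 8, -2, 0, 0}; /* even taps, sum 128 */
  const uint8_t p[8] = {10, 20, 30, 40, 50, 60, 70, 80};
  assert(convolve_full(t, p) == convolve_halved(t, p));
  return 0;
}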
  154|      |
  155|      |static inline void prepare_half_coeffs_2tap_avx2(
  156|      |    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
  157| 3.81k|    __m256i *const coeffs /* [1] */) {
  158| 3.81k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  159| 3.81k|      filter_params, subpel_q4 & SUBPEL_MASK);
  160| 3.81k|  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
  161| 3.81k|  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
  162|      |
  163|      |  // Right shift all filter coefficients by 1 to reduce the bits required.
  164|      |  // This extra right shift will be taken care of at the end while rounding
  165|      |  // the result.
  166|      |  // Since all filter coefficients are even, this change will not affect the
  167|      |  // end result.
  168| 3.81k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  169| 3.81k|                            _mm_set1_epi16((short)0xffff)));
  170|      |
  171| 3.81k|  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
  172|      |
  173|      |  // coeffs 3 4 3 4 3 4 3 4
  174| 3.81k|  *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
  175| 3.81k|}
  176|      |
  177|      |static inline void prepare_half_coeffs_4tap_avx2(
  178|      |    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
  179|  176k|    __m256i *const coeffs /* [2] */) {
  180|  176k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  181|  176k|      filter_params, subpel_q4 & SUBPEL_MASK);
  182|  176k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  183|      |
  184|      |  // Right shift all filter coefficients by 1 to reduce the bits required.
  185|      |  // This extra right shift will be taken care of at the end while rounding
  186|      |  // the result.
  187|      |  // Since all filter coefficients are even, this change will not affect the
  188|      |  // end result.
  189|  176k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  190|  176k|                            _mm_set1_epi16((short)0xffff)));
  191|  176k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  192|  176k|  populate_coeffs_4tap_avx2(coeffs_1, coeffs);
  193|  176k|}
  194|      |
  195|      |static inline void prepare_half_coeffs_6tap_avx2(
  196|      |    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
  197|  201k|    __m256i *const coeffs /* [3] */) {
  198|  201k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  199|  201k|      filter_params, subpel_q4 & SUBPEL_MASK);
  200|  201k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  201|      |
  202|      |  // Right shift all filter coefficients by 1 to reduce the bits required.
  203|      |  // This extra right shift will be taken care of at the end while rounding
  204|      |  // the result.
  205|      |  // Since all filter coefficients are even, this change will not affect the
  206|      |  // end result.
  207|  201k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  208|  201k|                            _mm_set1_epi16((short)0xffff)));
  209|  201k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  210|  201k|  populate_coeffs_6tap_avx2(coeffs_1, coeffs);
  211|  201k|}
  212|      |
  213|      |static inline void prepare_half_coeffs_8tap_avx2(
  214|      |    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
  215| 11.5k|    __m256i *const coeffs /* [4] */) {
  216| 11.5k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  217| 11.5k|      filter_params, subpel_q4 & SUBPEL_MASK);
  218| 11.5k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  219|      |
  220|      |  // Right shift all filter coefficients by 1 to reduce the bits required.
  221|      |  // This extra right shift will be taken care of at the end while rounding
  222|      |  // the result.
  223|      |  // Since all filter coefficients are even, this change will not affect the
  224|      |  // end result.
  225| 11.5k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  226| 11.5k|                            _mm_set1_epi16((short)0xffff)));
  227| 11.5k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  228| 11.5k|  populate_coeffs_8tap_avx2(coeffs_1, coeffs);
  229| 11.5k|}
  230|      |
  231|      |static inline void prepare_coeffs_2tap_sse2(
  232|      |    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
  233|     0|    __m128i *const coeffs /* [1] */) {
  234|     0|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  235|     0|      filter_params, subpel_q4 & SUBPEL_MASK);
  236|     0|
  237|     0|  const __m128i coeff = _mm_cvtsi32_si128(loadu_int32(filter + 3));
  238|     0|
  239|     0|  // coeffs 3 4 3 4 3 4 3 4
  240|     0|  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
  241|     0|}
  242|      |
  243|      |static inline void prepare_coeffs_4tap_sse2(
  244|      |    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
  245|     0|    __m128i *const coeffs /* [2] */) {
  246|     0|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  247|     0|      filter_params, subpel_q4 & SUBPEL_MASK);
  248|     0|
  249|     0|  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
  250|     0|
  251|     0|  // coeffs 2 3 2 3 2 3 2 3
  252|     0|  coeffs[0] = _mm_shuffle_epi32(coeff, 0x55);
  253|     0|  // coeffs 4 5 4 5 4 5 4 5
  254|     0|  coeffs[1] = _mm_shuffle_epi32(coeff, 0xaa);
  255|     0|}
  256|      |
  257|      |static inline void prepare_coeffs_6tap_ssse3(
  258|      |    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
  259|     0|    __m128i *const coeffs /* [3] */) {
  260|     0|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  261|     0|      filter_params, subpel_q4 & SUBPEL_MASK);
  262|     0|  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
  263|     0|
  264|     0|  // coeffs 1 2 1 2 1 2 1 2
  265|     0|  coeffs[0] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x05040302u));
  266|     0|  // coeffs 3 4 3 4 3 4 3 4
  267|     0|  coeffs[1] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x09080706u));
  268|     0|  // coeffs 5 6 5 6 5 6 5 6
  269|     0|  coeffs[2] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x0D0C0B0Au));
  270|     0|}
  271|      |
  272|      |static inline void prepare_coeffs_8tap_sse2(
  273|      |    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
  274|     0|    __m128i *const coeffs /* [4] */) {
  275|     0|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  276|     0|      filter_params, subpel_q4 & SUBPEL_MASK);
  277|     0|
  278|     0|  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
  279|     0|
  280|     0|  // coeffs 0 1 0 1 0 1 0 1
  281|     0|  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
  282|     0|  // coeffs 2 3 2 3 2 3 2 3
  283|     0|  coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
  284|     0|  // coeffs 4 5 4 5 4 5 4 5
  285|     0|  coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
  286|     0|  // coeffs 6 7 6 7 6 7 6 7
  287|     0|  coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
  288|     0|}
  289|      |
  290|      |static inline void prepare_coeffs_2tap_avx2(
  291|      |    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
  292|     0|    __m256i *const coeffs /* [1] */) {
  293|     0|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  294|     0|      filter_params, subpel_q4 & SUBPEL_MASK);
  295|     0|
  296|     0|  const __m128i coeff_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
  297|     0|  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
  298|     0|
  299|     0|  // coeffs 3 4 3 4 3 4 3 4
  300|     0|  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
  301|     0|}
  302|      |
  303|      |static inline void prepare_coeffs_4tap_avx2(
  304|      |    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
  305|     0|    __m256i *const coeffs /* [2] */) {
  306|     0|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  307|     0|      filter_params, subpel_q4 & SUBPEL_MASK);
  308|     0|
  309|     0|  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
  310|     0|  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
  311|     0|
  312|     0|  // coeffs 2 3 2 3 2 3 2 3
  313|     0|  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
  314|     0|  // coeffs 4 5 4 5 4 5 4 5
  315|     0|  coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
  316|     0|}
  317|      |
  318|      |static inline void prepare_coeffs_6tap_avx2(
  319|      |    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
  320|     0|    __m256i *const coeffs /* [3] */) {
  321|     0|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  322|     0|      filter_params, subpel_q4 & SUBPEL_MASK);
  323|     0|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  324|     0|  const __m256i coeff = _mm256_broadcastsi128_si256(coeffs_8);
  325|     0|
  326|     0|  // coeffs 1 2 1 2 1 2 1 2
  327|     0|  coeffs[0] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x05040302u));
  328|     0|  // coeffs 3 4 3 4 3 4 3 4
  329|     0|  coeffs[1] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x09080706u));
  330|     0|  // coeffs 5 6 5 6 5 6 5 6
  331|     0|  coeffs[2] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x0D0C0B0Au));
  332|     0|}
  333|      |
  334|      |static inline void prepare_coeffs_8tap_avx2(
  335|      |    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
  336|     0|    __m256i *const coeffs /* [4] */) {
  337|     0|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  338|     0|      filter_params, subpel_q4 & SUBPEL_MASK);
  339|     0|
  340|     0|  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
  341|     0|  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
  342|     0|
  343|     0|  // coeffs 0 1 0 1 0 1 0 1
  344|     0|  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
  345|     0|  // coeffs 2 3 2 3 2 3 2 3
  346|     0|  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
  347|     0|  // coeffs 4 5 4 5 4 5 4 5
  348|     0|  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
  349|     0|  // coeffs 6 7 6 7 6 7 6 7
  350|     0|  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
  351|     0|}
  352|      |
  353|      |static inline void load_16bit_5rows_avx2(const int16_t *const src,
  354|      |                                         const ptrdiff_t stride,
  355|     0|                                         __m256i dst[5]) {
  356|     0|  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
  357|     0|  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
  358|     0|  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
  359|     0|  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
  360|     0|  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
  361|     0|}
  362|      |
  363|      |static inline void load_16bit_7rows_avx2(const int16_t *const src,
  364|      |                                         const ptrdiff_t stride,
  365|     0|                                         __m256i dst[7]) {
  366|     0|  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
  367|     0|  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
  368|     0|  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
  369|     0|  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
  370|     0|  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
  371|     0|  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
  372|     0|  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
  373|     0|}
  374|      |
  375|      |static AOM_FORCE_INLINE void load_16bit_8rows_avx2(const int16_t *const src,
  376|      |                                                   const ptrdiff_t stride,
  377|     0|                                                   __m256i dst[8]) {
  378|     0|  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
  379|     0|  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
  380|     0|  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
  381|     0|  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
  382|     0|  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
  383|     0|  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
  384|     0|  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
  385|     0|  dst[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
  386|     0|}
  387|      |
  388|      |static AOM_FORCE_INLINE void loadu_unpack_16bit_5rows_avx2(
  389|      |    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[5],
  390|     0|    __m256i ss_256[5], __m256i tt_256[5]) {
  391|     0|  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
  392|     0|  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
  393|     0|  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
  394|     0|  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
  395|     0|  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
  396|     0|
  397|     0|  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  398|     0|  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
  399|     0|  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
  400|     0|  ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
  401|     0|
  402|     0|  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
  403|     0|  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
  404|     0|  tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
  405|     0|  tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
  406|     0|}
  407|      |
  408|      |static AOM_FORCE_INLINE void loadu_unpack_16bit_3rows_avx2(
  409|      |    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[3],
  410|     0|    __m256i ss_256[3], __m256i tt_256[3]) {
  411|     0|  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
  412|     0|  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
  413|     0|  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
  414|     0|
  415|     0|  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  416|     0|  ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
  417|     0|
  418|     0|  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
  419|     0|  tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
  420|     0|}
  421|      |
  422|      |static inline void convolve_8tap_unpack_avx2(const __m256i s[6],
  423|     0|                                             __m256i ss[7]) {
  424|     0|  ss[0] = _mm256_unpacklo_epi16(s[0], s[1]);
  425|     0|  ss[1] = _mm256_unpacklo_epi16(s[2], s[3]);
  426|     0|  ss[2] = _mm256_unpacklo_epi16(s[4], s[5]);
  427|     0|  ss[4] = _mm256_unpackhi_epi16(s[0], s[1]);
  428|     0|  ss[5] = _mm256_unpackhi_epi16(s[2], s[3]);
  429|     0|  ss[6] = _mm256_unpackhi_epi16(s[4], s[5]);
  430|     0|}
  431|      |
  432|      |static inline __m128i convolve_2tap_ssse3(const __m128i ss[1],
  433| 45.3k|                                          const __m128i coeffs[1]) {
  434| 45.3k|  return _mm_maddubs_epi16(ss[0], coeffs[0]);
  435| 45.3k|}
  436|      |
  437|      |static inline __m128i convolve_4tap_ssse3(const __m128i ss[2],
  438|  329k|                                          const __m128i coeffs[2]) {
  439|  329k|  const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  440|  329k|  const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  441|  329k|  return _mm_add_epi16(res_23, res_45);
  442|  329k|}
  443|      |
  444|      |static inline __m128i convolve_6tap_ssse3(const __m128i ss[3],
  445|  500k|                                          const __m128i coeffs[3]) {
  446|  500k|  const __m128i res_12 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  447|  500k|  const __m128i res_34 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  448|  500k|  const __m128i res_56 = _mm_maddubs_epi16(ss[2], coeffs[2]);
  449|  500k|  const __m128i res_1256 = _mm_add_epi16(res_12, res_56);
  450|  500k|  return _mm_add_epi16(res_1256, res_34);
  451|  500k|}
  452|      |
  453|      |static inline __m128i convolve_8tap_ssse3(const __m128i ss[4],
  454| 36.6k|                                          const __m128i coeffs[4]) {
  455| 36.6k|  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  456| 36.6k|  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  457| 36.6k|  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
  458| 36.6k|  const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
  459| 36.6k|  const __m128i res_0145 = _mm_add_epi16(res_01, res_45);
  460| 36.6k|  const __m128i res_2367 = _mm_add_epi16(res_23, res_67);
  461| 36.6k|  return _mm_add_epi16(res_0145, res_2367);
  462| 36.6k|}
  463|      |
  464|      |static inline __m256i convolve_2tap_avx2(const __m256i ss[1],
  465|  268k|                                         const __m256i coeffs[1]) {
  466|  268k|  return _mm256_maddubs_epi16(ss[0], coeffs[0]);
  467|  268k|}
  468|      |
  469|      |static inline __m256i convolve_4tap_avx2(const __m256i ss[2],
  470| 1.32M|                                         const __m256i coeffs[2]) {
  471| 1.32M|  const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  472| 1.32M|  const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  473| 1.32M|  return _mm256_add_epi16(res_23, res_45);
  474| 1.32M|}
  475|      |
  476|      |static inline __m256i convolve_6tap_avx2(const __m256i ss[3],
  477| 4.20M|                                         const __m256i coeffs[3]) {
  478| 4.20M|  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  479| 4.20M|  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  480| 4.20M|  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
  481| 4.20M|  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
  482| 4.20M|  return _mm256_add_epi16(res_0145, res_23);
  483| 4.20M|}
  484|      |
  485|      |static inline __m256i convolve_8tap_avx2(const __m256i ss[4],
  486|  296k|                                         const __m256i coeffs[4]) {
  487|  296k|  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  488|  296k|  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  489|  296k|  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
  490|  296k|  const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
  491|  296k|  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
  492|  296k|  const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
  493|  296k|  return _mm256_add_epi16(res_0145, res_2367);
  494|  296k|}
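
Note: each _mm*_maddubs_epi16 call above multiplies unsigned pixel bytes by the signed halved-tap bytes interleaved with them and adds within each pair (saturating to int16; the halved taps keep in-range video inputs clear of saturation). A scalar model of one convolve_6tap lane, my own illustration rather than library code:

#include <assert.h>
#include <stdint.h>

/* p: six pixels already interleaved into tap pairs; t: halved taps. */
static int16_t convolve_6tap_model(const uint8_t p[6], const int8_t t[6]) {
  const int16_t r01 = (int16_t)(p[0] * t[0] + p[1] * t[1]); /* maddubs #1 */
  const int16_t r23 = (int16_t)(p[2] * t[2] + p[3] * t[3]); /* maddubs #2 */
  const int16_t r45 = (int16_t)(p[4] * t[4] + p[5] * t[5]); /* maddubs #3 */
  return (int16_t)((int16_t)(r01 + r45) + r23); /* same add tree as above */
}

int main(void) {
  const uint8_t p[6] = {10, 20, 30, 40, 50, 60};
  const int8_t t[6] = {1, -3, 63, 4, -1, 0}; /* example halved 6-tap row */
  int32_t direct = 0;
  for (int k = 0; k < 6; ++k) direct += p[k] * t[k];
  assert(convolve_6tap_model(p, t) == (int16_t)direct);
  return 0;
}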
  495|      |
  496|      |static inline __m128i convolve16_2tap_sse2(const __m128i ss[1],
  497|     0|                                           const __m128i coeffs[1]) {
  498|     0|  return _mm_madd_epi16(ss[0], coeffs[0]);
  499|     0|}
  500|      |
  501|      |static inline __m128i convolve16_4tap_sse2(const __m128i ss[2],
  502|     0|                                           const __m128i coeffs[2]) {
  503|     0|  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
  504|     0|  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
  505|     0|  return _mm_add_epi32(res_01, res_23);
  506|     0|}
  507|      |
  508|      |static inline __m128i convolve16_6tap_sse2(const __m128i ss[3],
  509|     0|                                           const __m128i coeffs[3]) {
  510|     0|  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
  511|     0|  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
  512|     0|  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
  513|     0|  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
  514|     0|  return _mm_add_epi32(res_0123, res_45);
  515|     0|}
  516|      |
  517|      |static inline __m128i convolve16_8tap_sse2(const __m128i ss[4],
  518|     0|                                           const __m128i coeffs[4]) {
  519|     0|  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
  520|     0|  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
  521|     0|  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
  522|     0|  const __m128i res_67 = _mm_madd_epi16(ss[3], coeffs[3]);
  523|     0|  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
  524|     0|  const __m128i res_4567 = _mm_add_epi32(res_45, res_67);
  525|     0|  return _mm_add_epi32(res_0123, res_4567);
  526|     0|}
  527|      |
  528|      |static inline __m256i convolve16_2tap_avx2(const __m256i ss[1],
  529|     0|                                           const __m256i coeffs[1]) {
  530|     0|  return _mm256_madd_epi16(ss[0], coeffs[0]);
  531|     0|}
  532|      |
  533|      |static inline __m256i convolve16_4tap_avx2(const __m256i ss[2],
  534|     0|                                           const __m256i coeffs[2]) {
  535|     0|  const __m256i res_1 = _mm256_madd_epi16(ss[0], coeffs[0]);
  536|     0|  const __m256i res_2 = _mm256_madd_epi16(ss[1], coeffs[1]);
  537|     0|  return _mm256_add_epi32(res_1, res_2);
  538|     0|}
  539|      |
  540|      |static inline __m256i convolve16_6tap_avx2(const __m256i ss[3],
  541|     0|                                           const __m256i coeffs[3]) {
  542|     0|  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
  543|     0|  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
  544|     0|  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
  545|     0|  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
  546|     0|  return _mm256_add_epi32(res_0123, res_45);
  547|     0|}
  548|      |
  549|      |static inline __m256i convolve16_8tap_avx2(const __m256i ss[4],
  550|     0|                                           const __m256i coeffs[4]) {
  551|     0|  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
  552|     0|  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
  553|     0|  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
  554|     0|  const __m256i res_67 = _mm256_madd_epi16(ss[3], coeffs[3]);
  555|     0|  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
  556|     0|  const __m256i res_4567 = _mm256_add_epi32(res_45, res_67);
  557|     0|  return _mm256_add_epi32(res_0123, res_4567);
  558|     0|}
  559|      |
  560|      |static inline __m256i x_convolve_4tap_avx2(const __m256i data,
  561|      |                                           const __m256i coeffs[2],
  562|     0|                                           const __m256i filt[2]) {
  563|     0|  __m256i ss[2];
  564|     0|
  565|     0|  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
  566|     0|  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
  567|     0|
  568|     0|  return convolve_4tap_avx2(ss, coeffs);
  569|     0|}
  570|      |
  571|      |static inline __m256i x_convolve_6tap_avx2(const __m256i data,
  572|      |                                           const __m256i coeffs[3],
  573|     0|                                           const __m256i filt[3]) {
  574|     0|  __m256i ss[3];
  575|     0|
  576|     0|  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
  577|     0|  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
  578|     0|  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
  579|     0|
  580|     0|  return convolve_6tap_avx2(ss, coeffs);
  581|     0|}
  582|      |
  583|      |static inline __m256i x_convolve_8tap_avx2(const __m256i data,
  584|      |                                           const __m256i coeffs[4],
  585|     0|                                           const __m256i filt[4]) {
  586|     0|  __m256i ss[4];
  587|     0|
  588|     0|  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
  589|     0|  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
  590|     0|  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
  591|     0|  ss[3] = _mm256_shuffle_epi8(data, filt[3]);
  592|     0|
  593|     0|  return convolve_8tap_avx2(ss, coeffs);
  594|     0|}
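
Note: the filt[] vectors supplied by callers of x_convolve_*_avx2 are byte-shuffle masks that expand one row load into the sliding (p[i], p[i+1]) pairs the maddubs-based helpers expect. An illustrative 128-bit mask of my own (an assumption for exposition; the real masks come from the caller and cover both 256-bit lanes):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  unsigned char row[16];
  for (int i = 0; i < 16; ++i) row[i] = (unsigned char)(10 * i);
  /* Hypothetical first-pair mask: gathers bytes (i, i+1) for 8 outputs. */
  const __m128i filt0 = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4,
                                      4, 5, 5, 6, 6, 7, 7, 8);
  const __m128i s = _mm_loadu_si128((const __m128i *)row);
  unsigned char out[16];
  _mm_storeu_si128((__m128i *)out, _mm_shuffle_epi8(s, filt0));
  for (int i = 0; i < 16; i += 2)
    printf("(%u,%u) ", out[i], out[i + 1]); /* (0,10) (10,20) (20,30) ... */
  printf("\n");
  return 0;
}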
  595|      |
  596| 6.09M|static inline __m256i sr_y_round_avx2(const __m256i src) {
  597| 6.09M|  const __m256i round = _mm256_set1_epi16(32);
  598| 6.09M|  const __m256i dst = _mm256_add_epi16(src, round);
  599| 6.09M|  return _mm256_srai_epi16(dst, FILTER_BITS - 1);
  600| 6.09M|}
  601|      |
  602|     0|static inline __m128i xy_x_round_sse2(const __m128i src) {
  603|     0|  const __m128i round = _mm_set1_epi16(2);
  604|     0|  const __m128i dst = _mm_add_epi16(src, round);
  605|     0|  return _mm_srai_epi16(dst, 2);
  606|     0|}
  607|      |
  608|     0|static inline __m256i xy_x_round_avx2(const __m256i src) {
  609|     0|  const __m256i round = _mm256_set1_epi16(2);
  610|     0|  const __m256i dst = _mm256_add_epi16(src, round);
  611|     0|  return _mm256_srai_epi16(dst, 2);
  612|     0|}
  613|      |
  614|      |static inline void xy_x_round_store_2x2_sse2(const __m128i res,
  615|     0|                                             int16_t *const dst) {
  616|     0|  const __m128i d = xy_x_round_sse2(res);
  617|     0|  _mm_storel_epi64((__m128i *)dst, d);
  618|     0|}
  619|      |
  620|      |static inline void xy_x_round_store_4x2_sse2(const __m128i res,
  621|     0|                                             int16_t *const dst) {
  622|     0|  const __m128i d = xy_x_round_sse2(res);
  623|     0|  _mm_storeu_si128((__m128i *)dst, d);
  624|     0|}
  625|      |
  626|      |static inline void xy_x_round_store_8x2_sse2(const __m128i res[2],
  627|     0|                                             int16_t *const dst) {
  628|     0|  __m128i r[2];
  629|     0|
  630|     0|  r[0] = xy_x_round_sse2(res[0]);
  631|     0|  r[1] = xy_x_round_sse2(res[1]);
  632|     0|  _mm_storeu_si128((__m128i *)dst, r[0]);
  633|     0|  _mm_storeu_si128((__m128i *)(dst + 8), r[1]);
  634|     0|}
  635|      |
  636|      |static inline void xy_x_round_store_8x2_avx2(const __m256i res,
  637|     0|                                             int16_t *const dst) {
  638|     0|  const __m256i d = xy_x_round_avx2(res);
  639|     0|  _mm256_storeu_si256((__m256i *)dst, d);
  640|     0|}
  641|      |
  642|      |static inline void xy_x_round_store_32_avx2(const __m256i res[2],
  643|     0|                                            int16_t *const dst) {
  644|     0|  __m256i r[2];
  645|     0|
  646|     0|  r[0] = xy_x_round_avx2(res[0]);
  647|     0|  r[1] = xy_x_round_avx2(res[1]);
  648|     0|  const __m256i d0 =
  649|     0|      _mm256_inserti128_si256(r[0], _mm256_castsi256_si128(r[1]), 1);
  650|     0|  const __m256i d1 =
  651|     0|      _mm256_inserti128_si256(r[1], _mm256_extracti128_si256(r[0], 1), 0);
  652|     0|  _mm256_storeu_si256((__m256i *)dst, d0);
  653|     0|  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
  654|     0|}
  655|      |
  656|     0|static inline __m128i xy_y_round_sse2(const __m128i src) {
  657|     0|  const __m128i round = _mm_set1_epi32(1024);
  658|     0|  const __m128i dst = _mm_add_epi32(src, round);
  659|     0|  return _mm_srai_epi32(dst, 11);
  660|     0|}
  661|      |
  662|     0|static inline __m128i xy_y_round_half_pel_sse2(const __m128i src) {
  663|     0|  const __m128i round = _mm_set1_epi16(16);
  664|     0|  const __m128i dst = _mm_add_epi16(src, round);
  665|     0|  return _mm_srai_epi16(dst, 5);
  666|     0|}
  667|      |
  668|     0|static inline __m256i xy_y_round_avx2(const __m256i src) {
  669|     0|  const __m256i round = _mm256_set1_epi32(1024);
  670|     0|  const __m256i dst = _mm256_add_epi32(src, round);
  671|     0|  return _mm256_srai_epi32(dst, 11);
  672|     0|}
  673|      |
  674|     0|static inline __m256i xy_y_round_16_avx2(const __m256i r[2]) {
  675|     0|  const __m256i r0 = xy_y_round_avx2(r[0]);
  676|     0|  const __m256i r1 = xy_y_round_avx2(r[1]);
  677|     0|  return _mm256_packs_epi32(r0, r1);
  678|     0|}
  679|      |
  680|     0|static inline __m256i xy_y_round_half_pel_avx2(const __m256i src) {
  681|     0|  const __m256i round = _mm256_set1_epi16(16);
  682|     0|  const __m256i dst = _mm256_add_epi16(src, round);
  683|     0|  return _mm256_srai_epi16(dst, 5);
  684|     0|}
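
Note: the xy (2-D) helpers above split the total downshift across the two passes: xy_x_round_* drops 2 bits after the horizontal pass and xy_y_round_* drops 11 after the vertical one, 13 in all, i.e. 2 * FILTER_BITS - 1, the missing bit being the one already removed when the horizontal coefficients were halved (assuming, as the pairing of these helpers suggests, that the horizontal pass uses the halved 8-bit coefficients prepared above). A standalone check that the staged rounding equals a single 13-bit round, using ((x >> a) + b) >> c == (x + (b << a)) >> (a + c) for arithmetic shifts:

#include <assert.h>

int main(void) {
  for (int x = -100000; x <= 100000; ++x) {
    int staged = (((x + 2) >> 2) + 1024) >> 11; /* xy_x then xy_y */
    int single = (x + 2 + (1024 << 2)) >> 13;   /* one fused round */
    assert(staged == single);
  }
  return 0;
}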
  685|      |
  686|      |static inline void pack_store_2x2_sse2(const __m128i res, uint8_t *const dst,
  687|  136k|                                       const ptrdiff_t stride) {
  688|  136k|  const __m128i d = _mm_packus_epi16(res, res);
  689|  136k|  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
  690|  136k|  *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
  691|  136k|}
  692|      |
  693|      |static inline void pack_store_4x2_sse2(const __m128i res, uint8_t *const dst,
  694|  761k|                                       const ptrdiff_t stride) {
  695|  761k|  const __m128i d = _mm_packus_epi16(res, res);
  696|  761k|  store_u8_4x2_sse2(d, dst, stride);
  697|  761k|}
  698|      |
  699|      |static inline void pack_store_4x2_avx2(const __m256i res, uint8_t *const dst,
  700|     0|                                       const ptrdiff_t stride) {
  701|     0|  const __m256i d = _mm256_packus_epi16(res, res);
  702|     0|  const __m128i d0 = _mm256_castsi256_si128(d);
  703|     0|  const __m128i d1 = _mm256_extracti128_si256(d, 1);
  704|     0|
  705|     0|  xx_storel_32(dst, d0);
  706|     0|  xx_storel_32(dst + stride, d1);
  707|     0|}
  708|      |
  709|      |static inline void pack_store_8x2_avx2(const __m256i res, uint8_t *const dst,
  710|  830k|                                       const ptrdiff_t stride) {
  711|  830k|  const __m256i d = _mm256_packus_epi16(res, res);
  712|  830k|  const __m128i d0 = _mm256_castsi256_si128(d);
  713|  830k|  const __m128i d1 = _mm256_extracti128_si256(d, 1);
  714|  830k|  _mm_storel_epi64((__m128i *)dst, d0);
  715|  830k|  _mm_storel_epi64((__m128i *)(dst + stride), d1);
  716|  830k|}
  717|      |
  718|      |static inline void pack_store_16x2_avx2(const __m256i res0, const __m256i res1,
  719|      |                                        uint8_t *const dst,
  720|  716k|                                        const ptrdiff_t stride) {
  721|  716k|  const __m256i d = _mm256_packus_epi16(res0, res1);
  722|  716k|  storeu_u8_16x2_avx2(d, dst, stride);
  723|  716k|}
  724|      |
  725|      |static inline void xy_y_pack_store_16x2_avx2(const __m256i res0,
  726|      |                                             const __m256i res1,
  727|      |                                             uint8_t *const dst,
  728|     0|                                             const ptrdiff_t stride) {
  729|     0|  const __m256i t = _mm256_packus_epi16(res0, res1);
  730|     0|  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
  731|     0|  storeu_u8_16x2_avx2(d, dst, stride);
  732|     0|}
  733|      |
  734|      |static inline void pack_store_32_avx2(const __m256i res0, const __m256i res1,
  735|     0|                                      uint8_t *const dst) {
  736|     0|  const __m256i t = _mm256_packus_epi16(res0, res1);
  737|     0|  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
  738|     0|  _mm256_storeu_si256((__m256i *)dst, d);
  739|     0|}
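
Note: the 0xD8 permute in xy_y_pack_store_16x2_avx2 and pack_store_32_avx2 is needed because _mm256_packus_epi16 packs within each 128-bit lane, leaving the two results interleaved by lane; reordering the 64-bit quarters as (0, 2, 1, 3) restores them end to end. A small demonstration of mine:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  short a[16], b[16];
  unsigned char out[32];
  for (int i = 0; i < 16; ++i) { a[i] = (short)i; b[i] = (short)(100 + i); }
  /* Per-lane pack leaves quarters as [a0-7, b0-7, a8-15, b8-15]. */
  const __m256i t = _mm256_packus_epi16(_mm256_loadu_si256((__m256i *)a),
                                        _mm256_loadu_si256((__m256i *)b));
  /* 0xD8 = quarters (0, 2, 1, 3): back to [a0-15, b0-15]. */
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
  _mm256_storeu_si256((__m256i *)out, d);
  for (int i = 0; i < 32; ++i) printf("%d ", out[i]);
  printf("\n"); /* prints 0..15 then 100..115 */
  return 0;
}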
  740|      |
  741|      |static inline void xy_y_round_store_2x2_sse2(const __m128i res,
  742|      |                                             uint8_t *const dst,
  743|     0|                                             const ptrdiff_t stride) {
  744|     0|  const __m128i r = xy_y_round_sse2(res);
  745|     0|  const __m128i rr = _mm_packs_epi32(r, r);
  746|     0|  pack_store_2x2_sse2(rr, dst, stride);
  747|     0|}
  748|      |
  749|      |static inline void xy_y_round_store_4x2_avx2(const __m256i res,
  750|      |                                             uint8_t *const dst,
  751|     0|                                             const ptrdiff_t stride) {
  752|     0|  const __m256i r = xy_y_round_avx2(res);
  753|     0|  const __m256i rr = _mm256_packs_epi32(r, r);
  754|     0|  pack_store_4x2_avx2(rr, dst, stride);
  755|     0|}
  756|      |
  757|      |static inline void xy_y_pack_store_32_avx2(const __m256i res0,
  758|      |                                           const __m256i res1,
  759|     0|                                           uint8_t *const dst) {
  760|     0|  const __m256i d = _mm256_packus_epi16(res0, res1);
  761|     0|  // d = _mm256_permute4x64_epi64(d, 0xD8);
  762|     0|  _mm256_storeu_si256((__m256i *)dst, d);
  763|     0|}
  764|      |
  765|      |static inline void xy_y_round_store_32_avx2(const __m256i r0[2],
  766|      |                                            const __m256i r1[2],
  767|     0|                                            uint8_t *const dst) {
  768|     0|  const __m256i ra = xy_y_round_16_avx2(r0);
  769|     0|  const __m256i rb = xy_y_round_16_avx2(r1);
  770|     0|  xy_y_pack_store_32_avx2(ra, rb, dst);
  771|     0|}
  772|      |
  773|      |static inline void convolve_store_32_avx2(const __m256i res0,
  774|      |                                          const __m256i res1,
  775| 1.91M|                                          uint8_t *const dst) {
  776| 1.91M|  const __m256i d = _mm256_packus_epi16(res0, res1);
  777| 1.91M|  _mm256_storeu_si256((__m256i *)dst, d);
  778| 1.91M|}
  779|      |
  780|     0|static inline __m128i sr_x_round_sse2(const __m128i src) {
  781|     0|  const __m128i round = _mm_set1_epi16(34);
  782|     0|  const __m128i dst = _mm_add_epi16(src, round);
  783|     0|  return _mm_srai_epi16(dst, 6);
  784|     0|}
  785|      |
  786|     0|static inline __m256i sr_x_round_avx2(const __m256i src) {
  787|     0|  const __m256i round = _mm256_set1_epi16(34);
  788|     0|  const __m256i dst = _mm256_add_epi16(src, round);
  789|     0|  return _mm256_srai_epi16(dst, 6);
  790|     0|}
  791|      |
  792|  911k|static inline __m128i sr_y_round_sse2(const __m128i src) {
  793|  911k|  const __m128i round = _mm_set1_epi16(32);
  794|  911k|  const __m128i dst = _mm_add_epi16(src, round);
  795|  911k|  return _mm_srai_epi16(dst, FILTER_BITS - 1);
  796|  911k|}
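
Note: sr_y_round_* uses bias 32 with shift FILTER_BITS - 1 = 6, matching the halved taps. The 34 in sr_x_round_* is consistent with folding a 2-bit first-stage round (bias 2) and a 4-bit final round (bias 8) into one 6-bit shift, again via ((x >> a) + b) >> c == (x + (b << a)) >> (a + c); a standalone check of mine:

#include <assert.h>

int main(void) {
  for (int x = -5000; x <= 5000; ++x) {
    int two_stage = (((x + 2) >> 2) + 8) >> 4;
    int folded = (x + 34) >> 6; /* 34 = 2 + (8 << 2), as in sr_x_round_* */
    assert(two_stage == folded);
  }
  return 0;
}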
  797|      |
  798|      |static inline void sr_x_round_store_8x2_avx2(const __m256i res,
  799|      |                                             uint8_t *const dst,
  800|     0|                                             const ptrdiff_t dst_stride) {
  801|     0|  const __m256i r = sr_x_round_avx2(res);
  802|     0|  pack_store_8x2_avx2(r, dst, dst_stride);
  803|     0|}
  804|      |
  805|      |static inline void sr_x_round_store_16x2_avx2(const __m256i res[2],
  806|      |                                              uint8_t *const dst,
  807|     0|                                              const ptrdiff_t dst_stride) {
  808|     0|  __m256i r[2];
  809|     0|
  810|     0|  r[0] = sr_x_round_avx2(res[0]);
  811|     0|  r[1] = sr_x_round_avx2(res[1]);
  812|     0|  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
  813|     0|}
  814|      |
  815|      |static inline void sr_x_round_store_32_avx2(const __m256i res[2],
  816|     0|                                            uint8_t *const dst) {
  817|     0|  __m256i r[2];
  818|     0|
  819|     0|  r[0] = sr_x_round_avx2(res[0]);
  820|     0|  r[1] = sr_x_round_avx2(res[1]);
  821|     0|  convolve_store_32_avx2(r[0], r[1], dst);
  822|     0|}
  823|      |
  824|      |static inline void sr_y_round_store_8x2_avx2(const __m256i res,
  825|      |                                             uint8_t *const dst,
  826|  830k|                                             const ptrdiff_t dst_stride) {
  827|  830k|  const __m256i r = sr_y_round_avx2(res);
  828|  830k|  pack_store_8x2_avx2(r, dst, dst_stride);
  829|  830k|}
  830|      |
  831|      |static inline void sr_y_round_store_16x2_avx2(const __m256i res[2],
  832|      |                                              uint8_t *const dst,
  833|  716k|                                              const ptrdiff_t dst_stride) {
  834|  716k|  __m256i r[2];
  835|      |
  836|  716k|  r[0] = sr_y_round_avx2(res[0]);
  837|  716k|  r[1] = sr_y_round_avx2(res[1]);
  838|  716k|  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
  839|  716k|}
  840|      |
  841|      |static inline void sr_y_2tap_32_avg_avx2(const uint8_t *const src,
  842|      |                                         const __m256i s0, __m256i *const s1,
  843| 97.4k|                                         uint8_t *const dst) {
  844| 97.4k|  *s1 = _mm256_loadu_si256((__m256i *)src);
  845| 97.4k|  const __m256i d = _mm256_avg_epu8(s0, *s1);
  846| 97.4k|  _mm256_storeu_si256((__m256i *)dst, d);
  847| 97.4k|}
  848|      |
  849|      |static inline void sr_x_2tap_32_avg_avx2(const uint8_t *const src,
  850|     0|                                         uint8_t *const dst) {
  851|     0|  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
  852|     0|  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
  853|     0|  const __m256i d = _mm256_avg_epu8(s0, s1);
  854|     0|  _mm256_storeu_si256((__m256i *)dst, d);
  855|     0|}
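
Note: the avg-based 2-tap fast paths above work because the half-pel bilinear filter is (64, 64): after the 7-bit rounding this is exactly the round-to-nearest average that _mm256_avg_epu8 computes. An exhaustive standalone check:

#include <assert.h>

int main(void) {
  for (int a = 0; a < 256; ++a)
    for (int b = 0; b < 256; ++b)
      assert(((64 * a + 64 * b + 64) >> 7) == ((a + b + 1) >> 1));
  return 0;
}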
  856|      |
  857|      |static inline __m128i x_convolve_2tap_2x2_sse4_1(const uint8_t *const src,
  858|      |                                                 const ptrdiff_t stride,
  859|     0|                                                 const __m128i coeffs[1]) {
  860|     0|  const __m128i sfl =
  861|     0|      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
  862|     0|  const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
  863|     0|  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
  864|     0|  return convolve_2tap_ssse3(&ss, coeffs);
  865|     0|}
  866|      |
  867|      |static inline __m128i x_convolve_2tap_4x2_ssse3(const uint8_t *const src,
  868|      |                                                const ptrdiff_t stride,
  869|     0|                                                const __m128i coeffs[1]) {
  870|     0|  const __m128i sfl =
  871|     0|      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
  872|     0|  const __m128i s_128 = load_u8_8x2_sse2(src, stride);
  873|     0|  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
  874|     0|  return convolve_2tap_ssse3(&ss, coeffs);
  875|     0|}
  876|      |
  877|      |static inline void x_convolve_2tap_8x2_ssse3(const uint8_t *const src,
  878|      |                                             const ptrdiff_t stride,
  879|      |                                             const __m128i coeffs[1],
  880|     0|                                             __m128i r[2]) {
  881|     0|  __m128i ss[2];
  882|     0|  const __m128i s00 = _mm_loadu_si128((__m128i *)src);
  883|     0|  const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
  884|     0|  const __m128i s01 = _mm_srli_si128(s00, 1);
  885|     0|  const __m128i s11 = _mm_srli_si128(s10, 1);
  886|     0|  ss[0] = _mm_unpacklo_epi8(s00, s01);
  887|     0|  ss[1] = _mm_unpacklo_epi8(s10, s11);
  888|     0|
  889|     0|  r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
  890|     0|  r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
  891|     0|}
  892|      |
  893|      |static inline __m256i x_convolve_2tap_8x2_avx2(const uint8_t *const src,
  894|      |                                               const ptrdiff_t stride,
  895|     0|                                               const __m256i coeffs[1]) {
  896|     0|  __m128i s_128[2][2];
  897|     0|  __m256i s_256[2];
  898|     0|
  899|     0|  s_128[0][0] = _mm_loadu_si128((__m128i *)src);
  900|     0|  s_128[1][0] = _mm_loadu_si128((__m128i *)(src + stride));
  901|     0|  s_128[0][1] = _mm_srli_si128(s_128[0][0], 1);
  902|     0|  s_128[1][1] = _mm_srli_si128(s_128[1][0], 1);
  903|     0|  s_256[0] = _mm256_setr_m128i(s_128[0][0], s_128[1][0]);
  904|     0|  s_256[1] = _mm256_setr_m128i(s_128[0][1], s_128[1][1]);
  905|     0|  const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
  906|     0|  return convolve_2tap_avx2(&ss, coeffs);
  907|     0|}
  908|      |
  909|      |static inline void x_convolve_2tap_16x2_avx2(const uint8_t *const src,
  910|      |                                             const ptrdiff_t stride,
  911|      |                                             const __m256i coeffs[1],
  912|     0|                                             __m256i r[2]) {
  913|     0|  const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
  914|     0|  const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
  915|     0|  const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
  916|     0|  const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
  917|     0|  r[0] = convolve_2tap_avx2(&s0, coeffs);
  918|     0|  r[1] = convolve_2tap_avx2(&s1, coeffs);
  919|     0|}
  920|      |
  921|      |static inline void x_convolve_2tap_32_avx2(const uint8_t *const src,
  922|      |                                           const __m256i coeffs[1],
  923|     0|                                           __m256i r[2]) {
  924|     0|  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
  925|     0|  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
  926|     0|  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
  927|     0|  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
  928|     0|
  929|     0|  r[0] = convolve_2tap_avx2(&ss0, coeffs);
  930|     0|  r[1] = convolve_2tap_avx2(&ss1, coeffs);
  931|     0|}
  932|      |
  933|      |static inline __m128i x_convolve_4tap_2x2_ssse3(const uint8_t *const src,
  934|      |                                                const ptrdiff_t stride,
  935|     0|                                                const __m128i coeffs[2]) {
  936|     0|  const __m128i sfl0 =
  937|     0|      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
  938|     0|  const __m128i sfl1 =
  939|     0|      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
  940|     0|  const __m128i s = load_u8_8x2_sse2(src, stride);
  941|     0|  __m128i ss[2];
  942|     0|
  943|     0|  ss[0] = _mm_shuffle_epi8(s, sfl0);
  944|     0|  ss[1] = _mm_shuffle_epi8(s, sfl1);
  945|     0|  return convolve_4tap_ssse3(ss, coeffs);
  946|     0|}
  947|      |
  948|      |static inline __m128i x_convolve_4tap_4x2_ssse3(const uint8_t *const src,
  949|      |                                                const ptrdiff_t stride,
  950|     0|                                                const __m128i coeffs[2]) {
  951|     0|  const __m128i s = load_u8_8x2_sse2(src, stride);
  952|     0|  const __m128i sfl0 =
  953|     0|      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
  954|     0|  const __m128i sfl1 =
  955|     0|      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
  956|     0|  __m128i ss[2];
  957|     0|
  958|     0|  ss[0] = _mm_shuffle_epi8(s, sfl0);
  959|     0|  ss[1] = _mm_shuffle_epi8(s, sfl1);
  960|     0|  return convolve_4tap_ssse3(ss, coeffs);
  961|     0|}
  962|      |
  963|      |static inline __m256i x_convolve_4tap_8x2_avx2(const uint8_t *const src,
  964|      |                                               const ptrdiff_t stride,
  965|      |                                               const __m256i coeffs[2],
  966|     0|                                               const __m256i filt[2]) {
  967|     0|  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
  968|     0|  return x_convolve_4tap_avx2(s_256, coeffs, filt);
  969|     0|}
  970|      |
  971|      |static inline void x_convolve_4tap_16x2_avx2(const uint8_t *const src,
  972|      |                                             const int32_t src_stride,
  973|      |                                             const __m256i coeffs[2],
  974|      |                                             const __m256i filt[2],
  975|     0|                                             __m256i r[2]) {
  976|     0|  r[0] = x_convolve_4tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
  977|     0|  r[1] = x_convolve_4tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
  978|     0|}
  979|      |
  980|      |static inline void x_convolve_4tap_32_avx2(const uint8_t *const src,
  981|      |                                           const __m256i coeffs[2],
  982|      |                                           const __m256i filt[2],
  983|     0|                                           __m256i r[2]) {
  984|     0|  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
  985|     0|  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
  986|     0|
  987|     0|  r[0] = x_convolve_4tap_avx2(s0_256, coeffs, filt);
  988|     0|  r[1] = x_convolve_4tap_avx2(s1_256, coeffs, filt);
  989|     0|}
  990|      |
  991|      |static inline __m128i x_convolve_6tap_2x2_ssse3(const uint8_t *const src,
  992|      |                                                const ptrdiff_t stride,
  993|     0|                                                const __m128i coeffs[3]) {
  994|     0|  const __m128i sfl0 =
  995|     0|      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
  996|     0|  const __m128i sfl1 =
  997|     0|      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
  998|     0|  const __m128i sfl2 =
  999|     0|      _mm_setr_epi8(4, 5, 5, 6, 12, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0);
 1000|     0|
 1001|     0|  const __m128i s = load_u8_8x2_sse2(src, stride);
 1002|     0|  __m128i ss[3];
 1003|     0|
 1004|     0|  ss[0] = _mm_shuffle_epi8(s, sfl0);
 1005|     0|  ss[1] = _mm_shuffle_epi8(s, sfl1);
 1006|     0|  ss[2] = _mm_shuffle_epi8(s, sfl2);
 1007|     0|  return convolve_6tap_ssse3(ss, coeffs);
 1008|     0|}
 1009|      |
 1010|      |static inline __m128i x_convolve_6tap_4x2_ssse3(const uint8_t *const src,
 1011|      |                                                const ptrdiff_t stride,
 1012|     0|                                                const __m128i coeffs[3]) {
 1013|     0|  const __m128i s = load_u8_8x2_sse2(src, stride);
 1014|     0|  const __m128i sfl0 =
 1015|     0|      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
 1016|     0|  const __m128i sfl1 =
 1017|     0|      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
 1018|     0|  const __m128i sfl2 =
 1019|     0|      _mm_setr_epi8(4, 5, 5, 6, 12, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0);
 1020|     0|  __m128i ss[3];
 1021|     0|
 1022|     0|  ss[0] = _mm_shuffle_epi8(s, sfl0);
 1023|     0|  ss[1] = _mm_shuffle_epi8(s, sfl1);
 1024|     0|  ss[2] = _mm_shuffle_epi8(s, sfl2);
 1025|     0|  return convolve_6tap_ssse3(ss, coeffs);
 1026|     0|}
 1027|      |
 1028|      |static inline __m256i x_convolve_6tap_8x2_avx2(const uint8_t *const src,
 1029|      |                                               const ptrdiff_t stride,
 1030|      |                                               const __m256i coeffs[3],
 1031|     0|                                               const __m256i filt[3]) {
 1032|     0|  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
 1033|     0|  return x_convolve_6tap_avx2(s_256, coeffs, filt);
 1034|     0|}
 1035|      |
 1036|      |static inline void x_convolve_6tap_16x2_avx2(const uint8_t *const src,
 1037|      |                                             const int32_t src_stride,
 1038|      |                                             const __m256i coeffs[3],
 1039|      |                                             const __m256i filt[3],
 1040|     0|                                             __m256i r[2]) {
 1041|     0|  r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
 1042|     0|  r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
 1043|     0|}
 1044|      |
 1045|      |static inline void x_convolve_6tap_32_avx2(const uint8_t *const src,
 1046|      |                                           const __m256i coeffs[3],
 1047|      |                                           const __m256i filt[3],
 1048|     0|                                           __m256i r[2]) {
 1049|     0|  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
 1050|     0|  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
 1051|     0|
 1052|     0|  r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
 1053|     0|  r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
 1054|     0|}
 1055|      |
 1056|      |static inline __m256i x_convolve_8tap_8x2_avx2(const uint8_t *const src,
 1057|      |                                               const ptrdiff_t stride,
 1058|      |                                               const __m256i coeffs[4],
 1059|     0|                                               const __m256i filt[4]) {
 1060|     0|  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
 1061|     0|  return x_convolve_8tap_avx2(s_256, coeffs, filt);
 1062|     0|}
 1063|      |
 1064|      |static AOM_FORCE_INLINE void x_convolve_8tap_16x2_avx2(const uint8_t *const src,
 1065|      |                                                       const int32_t src_stride,
 1066|      |                                                       const __m256i coeffs[4],
 1067|      |                                                       const __m256i filt[4],
 1068|     0|                                                       __m256i r[2]) {
 1069|     0|  r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
 1070|     0|  r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
 1071|     0|}
 1072|      |
 1073|      |static AOM_FORCE_INLINE void x_convolve_8tap_32_avx2(const uint8_t *const src,
 1074|      |                                                     const __m256i coeffs[4],
 1075|      |                                                     const __m256i filt[4],
 1076|     0|                                                     __m256i r[2]) {
 1077|     0|  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
 1078|     0|  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
 1079|     0|
 1080|     0|  r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
 1081|     0|  r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
 1082|     0|}
1084
static inline __m128i y_convolve_2tap_2x2_ssse3(const uint8_t *const src,
1085
                                                const ptrdiff_t stride,
1086
                                                const __m128i coeffs[1],
1087
3.50k
                                                __m128i s_16[2]) {
1088
3.50k
  __m128i s_128[2];
1089
1090
3.50k
  s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src + stride));
1091
3.50k
  s_128[0] = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1092
3.50k
  s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src + 2 * stride));
1093
3.50k
  s_128[1] = _mm_unpacklo_epi16(s_16[1], s_16[0]);
1094
3.50k
  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1095
3.50k
  return convolve_2tap_ssse3(&ss, coeffs);
1096
3.50k
}
1097
1098
static inline __m128i y_convolve_2tap_4x2_ssse3(const uint8_t *const src,
1099
                                                const ptrdiff_t stride,
1100
                                                const __m128i coeffs[1],
1101
14.5k
                                                __m128i s_32[2]) {
1102
14.5k
  __m128i s_128[2];
1103
1104
14.5k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1105
14.5k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1106
14.5k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1107
14.5k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1108
14.5k
  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1109
14.5k
  return convolve_2tap_ssse3(&ss, coeffs);
1110
14.5k
}
1111
1112
static inline __m256i y_convolve_2tap_8x2_avx2(const uint8_t *const src,
1113
                                               const ptrdiff_t stride,
1114
                                               const __m256i coeffs[1],
1115
0
                                               __m128i s_64[2]) {
1116
0
  __m256i s_256[2];
1117
0
1118
0
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + stride));
1119
0
  s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
1120
0
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1121
0
  s_256[1] = _mm256_setr_m128i(s_64[1], s_64[0]);
1122
0
  const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1123
0
  return convolve_2tap_avx2(&ss, coeffs);
1124
0
}
1125
1126
static inline void y_convolve_2tap_16x2_avx2(const uint8_t *const src,
1127
                                             const ptrdiff_t stride,
1128
                                             const __m256i coeffs[1],
1129
16.0k
                                             __m128i s_128[2], __m256i r[2]) {
1130
16.0k
  __m256i s_256[2];
1131
1132
16.0k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + stride));
1133
16.0k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1134
16.0k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1135
16.0k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1136
16.0k
  const __m256i ss0 = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1137
16.0k
  const __m256i ss1 = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1138
16.0k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1139
16.0k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1140
16.0k
}
1141
1142
static inline void y_convolve_2tap_32_avx2(const uint8_t *const src,
1143
                                           const __m256i coeffs[1],
1144
                                           const __m256i s0, __m256i *const s1,
1145
118k
                                           __m256i r[2]) {
1146
118k
  *s1 = _mm256_loadu_si256((__m256i *)src);
1147
118k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, *s1);
1148
118k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, *s1);
1149
118k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1150
118k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1151
118k
}
1152
1153
static inline __m128i y_convolve_4tap_2x2_ssse3(const uint8_t *const src,
1154
                                                const ptrdiff_t stride,
1155
                                                const __m128i coeffs[2],
1156
                                                __m128i s_16[4],
1157
52.8k
                                                __m128i ss_128[2]) {
1158
52.8k
  s_16[3] = _mm_cvtsi32_si128(loadu_int16(src + stride));
1159
52.8k
  const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1160
52.8k
  s_16[2] = _mm_cvtsi32_si128(loadu_int16(src + 2 * stride));
1161
52.8k
  const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[2]);
1162
52.8k
  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1163
52.8k
  return convolve_4tap_ssse3(ss_128, coeffs);
1164
52.8k
}
1165
1166
static inline __m128i y_convolve_4tap_4x2_ssse3(const uint8_t *const src,
1167
                                                const ptrdiff_t stride,
1168
                                                const __m128i coeffs[2],
1169
                                                __m128i s_32[4],
1170
276k
                                                __m128i ss_128[2]) {
1171
276k
  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1172
276k
  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1173
276k
  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1174
276k
  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1175
276k
  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1176
276k
  return convolve_4tap_ssse3(ss_128, coeffs);
1177
276k
}
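// Usage sketch (condensed from the 4-tap caller further down in this file):
// s_32[] and ss_128[] form a sliding window over the source rows, primed with
// rows 0..2 before the loop; after each two-row step only one register move
// is needed to advance the window.
//
//   s_32[0..2] = rows 0..2;  ss_128[0] = interleave(rows 0/1, rows 1/2);
//   do {
//     src_ptr += 2 * src_stride;
//     const __m128i res = y_convolve_4tap_4x2_ssse3(src_ptr, src_stride,
//                                                   coeffs_128, s_32, ss_128);
//     pack_store_4x2_sse2(sr_y_round_sse2(res), dst, dst_stride);
//     ss_128[0] = ss_128[1];  // slide the interleaved-row window down 2 rows
//     dst += 2 * dst_stride;
//   } while (y -= 2);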
1178
1179
static inline __m256i y_convolve_4tap_8x2_avx2(const uint8_t *const src,
1180
                                               const ptrdiff_t stride,
1181
                                               const __m256i coeffs[2],
1182
                                               __m128i s_64[4],
1183
232k
                                               __m256i ss_256[2]) {
1184
232k
  s_64[3] = _mm_loadl_epi64((__m128i *)(src + stride));
1185
232k
  const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1186
232k
  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1187
232k
  const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[2]);
1188
232k
  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1189
232k
  return convolve_4tap_avx2(ss_256, coeffs);
1190
232k
}
1191
1192
static inline void y_convolve_4tap_16x2_avx2(const uint8_t *const src,
1193
                                             const ptrdiff_t stride,
1194
                                             const __m256i coeffs[2],
1195
                                             __m128i s_128[4],
1196
144k
                                             __m256i ss_256[4], __m256i r[2]) {
1197
144k
  s_128[3] = _mm_loadu_si128((__m128i *)(src + stride));
1198
144k
  const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1199
144k
  s_128[2] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1200
144k
  const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[2]);
1201
144k
  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1202
144k
  ss_256[3] = _mm256_unpackhi_epi8(src23, src34);
1203
144k
  r[0] = convolve_4tap_avx2(ss_256, coeffs);
1204
144k
  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1205
144k
}
1206
1207
static inline __m128i y_convolve_6tap_2x2_ssse3(const uint8_t *const src,
1208
                                                const ptrdiff_t stride,
1209
                                                const __m128i coeffs[3],
1210
                                                __m128i s_16[6],
1211
74.2k
                                                __m128i ss_128[3]) {
1212
74.2k
  s_16[5] = _mm_cvtsi32_si128(loadu_int16(src + 3 * stride));
1213
74.2k
  const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
1214
74.2k
  s_16[4] = _mm_cvtsi32_si128(loadu_int16(src + 4 * stride));
1215
74.2k
  const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[4]);
1216
74.2k
  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1217
74.2k
  return convolve_6tap_ssse3(ss_128, coeffs);
1218
74.2k
}
1219
1220
static inline void y_convolve_4tap_32x2_avx2(
1221
    const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[2],
1222
200k
    __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], __m256i r[4]) {
1223
200k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
1224
200k
  ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1225
200k
  ss_256[3] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1226
200k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
1227
200k
  tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[2]);
1228
200k
  tt_256[3] = _mm256_unpackhi_epi8(s_256[3], s_256[2]);
1229
200k
  r[0] = convolve_4tap_avx2(ss_256 + 0, coeffs);
1230
200k
  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1231
200k
  r[2] = convolve_4tap_avx2(tt_256 + 0, coeffs);
1232
200k
  r[3] = convolve_4tap_avx2(tt_256 + 2, coeffs);
1233
200k
}
1234
1235
static inline __m128i y_convolve_6tap_4x2_ssse3(const uint8_t *const src,
1236
                                                const ptrdiff_t stride,
1237
                                                const __m128i coeffs[3],
1238
                                                __m128i s_32[6],
1239
426k
                                                __m128i ss_128[3]) {
1240
426k
  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 3 * stride));
1241
426k
  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1242
426k
  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 4 * stride));
1243
426k
  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1244
426k
  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1245
426k
  return convolve_6tap_ssse3(ss_128, coeffs);
1246
426k
}
1247
1248
static inline __m256i y_convolve_6tap_8x2_avx2(const uint8_t *const src,
1249
                                               const ptrdiff_t stride,
1250
                                               const __m256i coeffs[3],
1251
                                               __m128i s_64[6],
1252
563k
                                               __m256i ss_256[3]) {
1253
563k
  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 3 * stride));
1254
563k
  const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
1255
563k
  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 4 * stride));
1256
563k
  const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[4]);
1257
563k
  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1258
563k
  return convolve_6tap_avx2(ss_256, coeffs);
1259
563k
}
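// For 8-pixel columns, two consecutive rows are packed into one __m256i with
// _mm256_setr_m128i (row n in the low lane, row n + 1 in the high lane), so a
// single 256-bit multiply-accumulate produces both output rows per call.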
1260
1261
static inline void y_convolve_6tap_16x2_avx2(const uint8_t *const src,
1262
                                             const ptrdiff_t stride,
1263
                                             const __m256i coeffs[3],
1264
                                             __m128i s_128[6],
1265
527k
                                             __m256i ss_256[6], __m256i r[2]) {
1266
527k
  s_128[5] = _mm_loadu_si128((__m128i *)(src + 3 * stride));
1267
527k
  const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
1268
527k
  s_128[4] = _mm_loadu_si128((__m128i *)(src + 4 * stride));
1269
527k
  const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[4]);
1270
527k
  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1271
527k
  ss_256[5] = _mm256_unpackhi_epi8(src45, src56);
1272
527k
  r[0] = convolve_6tap_avx2(ss_256, coeffs);
1273
527k
  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1274
527k
}
1275
1276
static inline void y_convolve_6tap_32x2_avx2(
1277
    const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[3],
1278
646k
    __m256i s_256[6], __m256i ss_256[6], __m256i tt_256[6], __m256i r[4]) {
1279
646k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1280
646k
  ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
1281
646k
  ss_256[5] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
1282
646k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1283
646k
  tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[4]);
1284
646k
  tt_256[5] = _mm256_unpackhi_epi8(s_256[5], s_256[4]);
1285
646k
  r[0] = convolve_6tap_avx2(ss_256 + 0, coeffs);
1286
646k
  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1287
646k
  r[2] = convolve_6tap_avx2(tt_256 + 0, coeffs);
1288
646k
  r[3] = convolve_6tap_avx2(tt_256 + 3, coeffs);
1289
646k
}
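// The *_32x2 kernels run two interleaved pipelines: ss_256[] holds the row
// pairs feeding the even output row (rows 4/5 here) and tt_256[] the pairs
// shifted down by one row (5/6) feeding the odd output row, so both rows of a
// 32-wide block are filtered per call without re-interleaving shared rows.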
1290
1291
static inline __m128i y_convolve_8tap_2x2_ssse3(const uint8_t *const src,
1292
                                                const ptrdiff_t stride,
1293
                                                const __m128i coeffs[4],
1294
                                                __m128i s_16[8],
1295
6.01k
                                                __m128i ss_128[4]) {
1296
6.01k
  s_16[7] = _mm_cvtsi32_si128(loadu_int16(src + 7 * stride));
1297
6.01k
  const __m128i src67 = _mm_unpacklo_epi16(s_16[6], s_16[7]);
1298
6.01k
  s_16[6] = _mm_cvtsi32_si128(loadu_int16(src + 8 * stride));
1299
6.01k
  const __m128i src78 = _mm_unpacklo_epi16(s_16[7], s_16[6]);
1300
6.01k
  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1301
6.01k
  return convolve_8tap_ssse3(ss_128, coeffs);
1302
6.01k
}
1303
1304
static inline __m128i y_convolve_8tap_4x2_ssse3(const uint8_t *const src,
1305
                                                const ptrdiff_t stride,
1306
                                                const __m128i coeffs[4],
1307
                                                __m128i s_32[8],
1308
30.6k
                                                __m128i ss_128[4]) {
1309
30.6k
  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * stride));
1310
30.6k
  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1311
30.6k
  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * stride));
1312
30.6k
  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1313
30.6k
  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1314
30.6k
  return convolve_8tap_ssse3(ss_128, coeffs);
1315
30.6k
}
1316
1317
static inline __m256i y_convolve_8tap_8x2_avx2(const uint8_t *const src,
1318
                                               const ptrdiff_t stride,
1319
                                               const __m256i coeffs[4],
1320
                                               __m128i s_64[8],
1321
34.1k
                                               __m256i ss_256[4]) {
1322
34.1k
  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * stride));
1323
34.1k
  const __m256i src67 = _mm256_setr_m128i(s_64[6], s_64[7]);
1324
34.1k
  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * stride));
1325
34.1k
  const __m256i src78 = _mm256_setr_m128i(s_64[7], s_64[6]);
1326
34.1k
  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1327
34.1k
  return convolve_8tap_avx2(ss_256, coeffs);
1328
34.1k
}
1329
1330
static inline void y_convolve_8tap_16x2_avx2(const uint8_t *const src,
1331
                                             const ptrdiff_t stride,
1332
                                             const __m256i coeffs[4],
1333
                                             __m128i s_128[8],
1334
28.9k
                                             __m256i ss_256[8], __m256i r[2]) {
1335
28.9k
  s_128[7] = _mm_loadu_si128((__m128i *)(src + 7 * stride));
1336
28.9k
  const __m256i src67 = _mm256_setr_m128i(s_128[6], s_128[7]);
1337
28.9k
  s_128[6] = _mm_loadu_si128((__m128i *)(src + 8 * stride));
1338
28.9k
  const __m256i src78 = _mm256_setr_m128i(s_128[7], s_128[6]);
1339
28.9k
  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1340
28.9k
  ss_256[7] = _mm256_unpackhi_epi8(src67, src78);
1341
28.9k
  r[0] = convolve_8tap_avx2(ss_256, coeffs);
1342
28.9k
  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1343
28.9k
}
1344
1345
static inline void y_convolve_8tap_32x2_avx2(
1346
    const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1347
51.0k
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1348
51.0k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1349
51.0k
  ss_256[3] = _mm256_unpacklo_epi8(s_256[6], s_256[7]);
1350
51.0k
  ss_256[7] = _mm256_unpackhi_epi8(s_256[6], s_256[7]);
1351
51.0k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1352
51.0k
  tt_256[3] = _mm256_unpacklo_epi8(s_256[7], s_256[6]);
1353
51.0k
  tt_256[7] = _mm256_unpackhi_epi8(s_256[7], s_256[6]);
1354
51.0k
  r[0] = convolve_8tap_avx2(ss_256 + 0, coeffs);
1355
51.0k
  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1356
51.0k
  r[2] = convolve_8tap_avx2(tt_256 + 0, coeffs);
1357
51.0k
  r[3] = convolve_8tap_avx2(tt_256 + 4, coeffs);
1358
51.0k
}
1359
1360
static inline void xy_x_convolve_2tap_32_avx2(const uint8_t *const src,
1361
                                              const __m256i coeffs[1],
1362
0
                                              __m256i r[2]) {
1363
0
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
1364
0
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
1365
0
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
1366
0
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
1367
0
1368
0
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1369
0
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1370
0
}
1371
1372
static inline void xy_x_2tap_32_avx2(const uint8_t *const src,
1373
                                     const __m256i coeffs[1],
1374
0
                                     int16_t *const dst) {
1375
0
  __m256i r[2];
1376
0
1377
0
  xy_x_convolve_2tap_32_avx2(src, coeffs, r);
1378
0
  const __m256i d0 = xy_x_round_avx2(r[0]);
1379
0
  const __m256i d1 = xy_x_round_avx2(r[1]);
1380
0
  _mm256_storeu_si256((__m256i *)dst, d0);
1381
0
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1382
0
}
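// The xy_x_* helpers implement the horizontal pass of the separable 2-D (xy)
// filter: they convolve in the byte domain, apply the intermediate rounding
// (xy_x_round_avx2) and store unclipped 16-bit results, which is why dst is
// int16_t *. The xy_y_* vertical pass below then consumes that buffer and
// produces the final uint8_t pixels. A sketch of the two-pass flow, with
// im_block and im_h as illustrative caller-side names:
//
//   int16_t im_block[IM_H * 32];                  // horizontal intermediates
//   for (i = 0; i < im_h; i++)                    // pass 1: horizontal
//     xy_x_8tap_32_avx2(src + i * src_stride, coeffs_x, filt,
//                       im_block + i * 32);
//   // pass 2: vertical, via the xy_y_convolve_* kernels below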
1383
1384
static inline void xy_x_4tap_32_avx2(const uint8_t *const src,
1385
                                     const __m256i coeffs[2],
1386
                                     const __m256i filt[2],
1387
0
                                     int16_t *const dst) {
1388
0
  __m256i r[2];
1389
0
1390
0
  x_convolve_4tap_32_avx2(src, coeffs, filt, r);
1391
0
  const __m256i d0 = xy_x_round_avx2(r[0]);
1392
0
  const __m256i d1 = xy_x_round_avx2(r[1]);
1393
0
  _mm256_storeu_si256((__m256i *)dst, d0);
1394
0
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1395
0
}
1396
1397
static inline void xy_x_6tap_32_avx2(const uint8_t *const src,
1398
                                     const __m256i coeffs[3],
1399
                                     const __m256i filt[3],
1400
0
                                     int16_t *const dst) {
1401
0
  __m256i r[2];
1402
0
1403
0
  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
1404
0
  const __m256i d0 = xy_x_round_avx2(r[0]);
1405
0
  const __m256i d1 = xy_x_round_avx2(r[1]);
1406
0
  _mm256_storeu_si256((__m256i *)dst, d0);
1407
0
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1408
0
}
1409
1410
static inline void xy_x_8tap_32_avx2(const uint8_t *const src,
1411
                                     const __m256i coeffs[4],
1412
                                     const __m256i filt[4],
1413
0
                                     int16_t *const dst) {
1414
0
  __m256i r[2];
1415
0
1416
0
  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
1417
0
  const __m256i d0 = xy_x_round_avx2(r[0]);
1418
0
  const __m256i d1 = xy_x_round_avx2(r[1]);
1419
0
  _mm256_storeu_si256((__m256i *)dst, d0);
1420
0
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1421
0
}
1422
1423
static inline __m128i xy_y_convolve_2tap_2x2_sse2(const int16_t *const src,
1424
                                                  __m128i s_32[2],
1425
0
                                                  const __m128i coeffs[1]) {
1426
0
  __m128i s_128[2];
1427
0
1428
0
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1429
0
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1430
0
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1431
0
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1432
0
  const __m128i ss = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1433
0
  return convolve16_2tap_sse2(&ss, coeffs);
1434
0
}
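// Unlike the byte-domain y_convolve_* kernels above, the xy_y_* kernels read
// 16-bit horizontal intermediates, so rows are combined with epi16 unpacks
// and filtered through the 16-bit convolve16_*tap helpers rather than the
// pmaddubsw path.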
1435
1436
static inline __m128i xy_y_convolve_2tap_2x2_half_pel_sse2(
1437
0
    const int16_t *const src, __m128i s_32[2]) {
1438
0
  __m128i s_128[2];
1439
0
1440
0
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1441
0
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1442
0
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1443
0
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1444
0
  return _mm_add_epi16(s_128[0], s_128[1]);
1445
0
}
1446
1447
static inline void xy_y_convolve_2tap_4x2_sse2(const int16_t *const src,
1448
                                               __m128i s_64[2],
1449
                                               const __m128i coeffs[1],
1450
0
                                               __m128i r[2]) {
1451
0
  __m128i s_128[2];
1452
0
1453
0
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1454
0
  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1455
0
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1456
0
  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1457
0
  const __m128i ss0 = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1458
0
  const __m128i ss1 = _mm_unpackhi_epi16(s_128[0], s_128[1]);
1459
0
  r[0] = convolve16_2tap_sse2(&ss0, coeffs);
1460
0
  r[1] = convolve16_2tap_sse2(&ss1, coeffs);
1461
0
}
1462
1463
static inline __m128i xy_y_convolve_2tap_4x2_half_pel_sse2(
1464
0
    const int16_t *const src, __m128i s_64[2]) {
1465
0
  __m128i s_128[2];
1466
0
1467
0
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1468
0
  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1469
0
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1470
0
  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1471
0
  return _mm_add_epi16(s_128[0], s_128[1]);
1472
0
}
1473
1474
static inline void xy_y_convolve_2tap_16_avx2(const __m256i s0,
1475
                                              const __m256i s1,
1476
                                              const __m256i coeffs[1],
1477
0
                                              __m256i r[2]) {
1478
0
  const __m256i ss0 = _mm256_unpacklo_epi16(s0, s1);
1479
0
  const __m256i ss1 = _mm256_unpackhi_epi16(s0, s1);
1480
0
  r[0] = convolve16_2tap_avx2(&ss0, coeffs);
1481
0
  r[1] = convolve16_2tap_avx2(&ss1, coeffs);
1482
0
}
1483
1484
static inline void xy_y_convolve_2tap_8x2_avx2(const int16_t *const src,
1485
                                               __m128i s_128[2],
1486
                                               const __m256i coeffs[1],
1487
0
                                               __m256i r[2]) {
1488
0
  __m256i s_256[2];
1489
0
  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1490
0
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1491
0
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1492
0
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1493
0
  xy_y_convolve_2tap_16_avx2(s_256[0], s_256[1], coeffs, r);
1494
0
}
1495
1496
static inline __m256i xy_y_convolve_2tap_8x2_half_pel_avx2(
1497
0
    const int16_t *const src, __m128i s_128[2]) {
1498
0
  __m256i s_256[2];
1499
0
  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1500
0
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1501
0
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1502
0
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1503
0
  return _mm256_add_epi16(s_256[0], s_256[1]);
1504
0
}
1505
1506
static inline void xy_y_convolve_2tap_16x2_half_pel_avx2(
1507
0
    const int16_t *const src, __m256i s_256[2], __m256i r[2]) {
1508
0
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1509
0
  r[0] = _mm256_add_epi16(s_256[0], s_256[1]);
1510
0
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1511
0
  r[1] = _mm256_add_epi16(s_256[1], s_256[0]);
1512
0
}
1513
1514
static inline void xy_y_store_16x2_avx2(const __m256i r[2], uint8_t *const dst,
1515
0
                                        const ptrdiff_t stride) {
1516
0
  const __m256i t = _mm256_packus_epi16(r[0], r[1]);
1517
0
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
1518
0
  storeu_u8_16x2_avx2(d, dst, stride);
1519
0
}
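// _mm256_packus_epi16 packs within 128-bit lanes, leaving the two rows
// interleaved as [row0.lo | row1.lo | row0.hi | row1.hi] at qword
// granularity; the 0xD8 permute (qword order 0, 2, 1, 3) restores
// [row0 | row1] so the two 16-byte rows can be stored with one strided pair
// of writes.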
1520
1521
static inline void xy_y_convolve_2tap_16x2_avx2(const int16_t *const src,
1522
                                                __m256i s[2],
1523
                                                const __m256i coeffs[1],
1524
0
                                                __m256i r[4]) {
1525
0
  s[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1526
0
  xy_y_convolve_2tap_16_avx2(s[0], s[1], coeffs, r + 0);
1527
0
  s[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1528
0
  xy_y_convolve_2tap_16_avx2(s[1], s[0], coeffs, r + 2);
1529
0
}
1530
1531
static inline void xy_y_convolve_2tap_32_avx2(const int16_t *const src,
1532
                                              const __m256i s0[2],
1533
                                              __m256i s1[2],
1534
                                              const __m256i coeffs[1],
1535
0
                                              __m256i r[4]) {
1536
0
  s1[0] = _mm256_loadu_si256((__m256i *)src);
1537
0
  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1538
0
  xy_y_convolve_2tap_16_avx2(s0[0], s1[0], coeffs, r + 0);
1539
0
  xy_y_convolve_2tap_16_avx2(s0[1], s1[1], coeffs, r + 2);
1540
0
}
1541
1542
static inline void xy_y_convolve_2tap_32_all_avx2(const int16_t *const src,
1543
                                                  const __m256i s0[2],
1544
                                                  __m256i s1[2],
1545
                                                  const __m256i coeffs[1],
1546
0
                                                  uint8_t *const dst) {
1547
0
  __m256i r[4];
1548
0
1549
0
  xy_y_convolve_2tap_32_avx2(src, s0, s1, coeffs, r);
1550
0
  xy_y_round_store_32_avx2(r + 0, r + 2, dst);
1551
0
}
1552
1553
static inline void xy_y_convolve_2tap_half_pel_32_avx2(const int16_t *const src,
1554
                                                       const __m256i s0[2],
1555
                                                       __m256i s1[2],
1556
0
                                                       __m256i r[2]) {
1557
0
  s1[0] = _mm256_loadu_si256((__m256i *)src);
1558
0
  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1559
0
  r[0] = _mm256_add_epi16(s0[0], s1[0]);
1560
0
  r[1] = _mm256_add_epi16(s0[1], s1[1]);
1561
0
}
1562
1563
static inline void xy_y_convolve_2tap_half_pel_32_all_avx2(
1564
    const int16_t *const src, const __m256i s0[2], __m256i s1[2],
1565
0
    uint8_t *const dst) {
1566
0
  __m256i r[2];
1567
0
1568
0
  xy_y_convolve_2tap_half_pel_32_avx2(src, s0, s1, r);
1569
0
  r[0] = xy_y_round_half_pel_avx2(r[0]);
1570
0
  r[1] = xy_y_round_half_pel_avx2(r[1]);
1571
0
  xy_y_pack_store_32_avx2(r[0], r[1], dst);
1572
0
}
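// At the half-pel position the two taps are equal, so the vertical 2-tap
// filter on 16-bit intermediates degenerates to a plain _mm256_add_epi16
// followed by the dedicated xy_y_round_half_pel_avx2 rounding; no multiplies
// are needed.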
1573
1574
static inline __m128i xy_y_convolve_4tap_2x2_sse2(const int16_t *const src,
1575
                                                  __m128i s_32[4],
1576
                                                  __m128i ss_128[2],
1577
0
                                                  const __m128i coeffs[2]) {
1578
0
  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + 3 * 2));
1579
0
  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1580
0
  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 4 * 2));
1581
0
  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1582
0
  ss_128[1] = _mm_unpacklo_epi16(src23, src34);
1583
0
  const __m128i r = convolve16_4tap_sse2(ss_128, coeffs);
1584
0
  ss_128[0] = ss_128[1];
1585
0
  return r;
1586
0
}
1587
1588
static inline __m256i xy_y_convolve_4tap_4x2_avx2(const int16_t *const src,
1589
                                                  __m128i s_64[4],
1590
                                                  __m256i ss_256[2],
1591
0
                                                  const __m256i coeffs[2]) {
1592
0
  __m256i s_256[2];
1593
0
  s_64[3] = _mm_loadl_epi64((__m128i *)(src + 3 * 4));
1594
0
  s_256[0] = _mm256_setr_m128i(s_64[2], s_64[3]);
1595
0
  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 4 * 4));
1596
0
  s_256[1] = _mm256_setr_m128i(s_64[3], s_64[2]);
1597
0
  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1598
0
  const __m256i r = convolve16_4tap_avx2(ss_256, coeffs);
1599
0
  ss_256[0] = ss_256[1];
1600
0
  return r;
1601
0
}
1602
1603
static inline void xy_y_convolve_4tap_16_avx2(const __m256i *const ss,
1604
                                              const __m256i coeffs[2],
1605
0
                                              __m256i r[2]) {
1606
0
  r[0] = convolve16_4tap_avx2(ss, coeffs);
1607
0
  r[1] = convolve16_4tap_avx2(ss + 2, coeffs);
1608
0
}
1609
1610
static inline void xy_y_convolve_4tap_8x2_avx2(const int16_t *const src,
1611
                                               __m256i ss_256[4],
1612
                                               const __m256i coeffs[2],
1613
0
                                               __m256i r[2]) {
1614
0
  __m256i s_256[2];
1615
0
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1616
0
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1617
0
  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1618
0
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1619
0
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1620
0
  ss_256[0] = ss_256[1];
1621
0
  ss_256[2] = ss_256[3];
1622
0
}
1623
1624
static inline void xy_y_convolve_4tap_8x2_half_pel_avx2(
1625
    const int16_t *const src, const __m256i coeffs[1], __m256i s_256[4],
1626
0
    __m256i r[2]) {
1627
0
  __m256i a_256[2];
1628
0
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1629
0
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1630
0
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1631
0
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1632
0
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r);
1633
0
  s_256[0] = s_256[2];
1634
0
  s_256[1] = s_256[3];
1635
0
}
1636
1637
static inline void xy_y_convolve_4tap_16x2_avx2(
1638
    const int16_t *const src, __m256i s_256[4], __m256i ss_256[4],
1639
0
    __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4]) {
1640
0
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1641
0
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1642
0
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1643
0
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1644
0
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1645
0
  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1646
0
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1647
0
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1648
0
  ss_256[0] = ss_256[1];
1649
0
  ss_256[2] = ss_256[3];
1650
0
  tt_256[0] = tt_256[1];
1651
0
  tt_256[2] = tt_256[3];
1652
0
}
1653
1654
static inline void xy_y_convolve_4tap_32x2_avx2(
1655
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[4],
1656
    __m256i ss_256[4], __m256i tt_256[4], const __m256i coeffs[2],
1657
0
    __m256i r[4]) {
1658
0
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1659
0
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1660
0
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1661
0
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1662
0
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1663
0
  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1664
0
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1665
0
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1666
0
  ss_256[0] = ss_256[1];
1667
0
  ss_256[2] = ss_256[3];
1668
0
  tt_256[0] = tt_256[1];
1669
0
  tt_256[2] = tt_256[3];
1670
0
}
1671
1672
static inline void xy_y_convolve_4tap_16x2_half_pelavx2(
1673
    const int16_t *const src, __m256i s_256[5], const __m256i coeffs[1],
1674
0
    __m256i r[4]) {
1675
0
  __m256i a_256[2];
1676
0
1677
0
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1678
0
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1679
0
1680
0
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1681
0
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1682
0
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 0);
1683
0
1684
0
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1685
0
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[3]);
1686
0
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 2);
1687
0
1688
0
  s_256[0] = s_256[2];
1689
0
  s_256[1] = s_256[3];
1690
0
  s_256[2] = s_256[4];
1691
0
}
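// Note: AV1 sub-pel kernels are symmetric at the half-pel offset (c3 == c0
// and c2 == c1 for 4 taps), which the half-pel variants exploit by folding
// the inputs before multiplying:
//
//   r = c0*s0 + c1*s1 + c2*s2 + c3*s3
//     = c0*(s0 + s3) + c1*(s1 + s2)      // when c3 == c0 and c2 == c1
//
// This halves the multiply count, and it is why the 4-tap half-pel kernels
// reuse xy_y_convolve_2tap_16_avx2 while the 6- and 8-tap half-pel kernels
// below reuse xy_y_convolve_4tap_16_avx2.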
1692
1693
static inline __m128i xy_y_convolve_6tap_2x2_sse2(const int16_t *const src,
1694
                                                  __m128i s_32[6],
1695
                                                  __m128i ss_128[3],
1696
0
                                                  const __m128i coeffs[3]) {
1697
0
  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 5 * 2));
1698
0
  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1699
0
  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 6 * 2));
1700
0
  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1701
0
  ss_128[2] = _mm_unpacklo_epi16(src45, src56);
1702
0
  const __m128i r = convolve16_6tap_sse2(ss_128, coeffs);
1703
0
  ss_128[0] = ss_128[1];
1704
0
  ss_128[1] = ss_128[2];
1705
0
  return r;
1706
0
}
1707
1708
static inline __m256i xy_y_convolve_6tap_4x2_avx2(const int16_t *const src,
1709
                                                  __m128i s_64[6],
1710
                                                  __m256i ss_256[3],
1711
0
                                                  const __m256i coeffs[3]) {
1712
0
  __m256i s_256[2];
1713
0
  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 5 * 4));
1714
0
  s_256[0] = _mm256_setr_m128i(s_64[4], s_64[5]);
1715
0
  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 6 * 4));
1716
0
  s_256[1] = _mm256_setr_m128i(s_64[5], s_64[4]);
1717
0
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1718
0
  const __m256i r = convolve16_6tap_avx2(ss_256, coeffs);
1719
0
  ss_256[0] = ss_256[1];
1720
0
  ss_256[1] = ss_256[2];
1721
0
  return r;
1722
0
}
1723
1724
static inline void xy_y_convolve_6tap_16_avx2(const __m256i ss[6],
1725
                                              const __m256i coeffs[3],
1726
0
                                              __m256i r[2]) {
1727
0
  r[0] = convolve16_6tap_avx2(ss, coeffs);
1728
0
  r[1] = convolve16_6tap_avx2(ss + 3, coeffs);
1729
0
}
1730
1731
static inline void xy_y_convolve_6tap_8x2_avx2(const int16_t *const src,
1732
                                               __m256i ss_256[6],
1733
                                               const __m256i coeffs[3],
1734
0
                                               __m256i r[2]) {
1735
0
  __m256i s_256[2];
1736
0
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1737
0
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1738
0
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1739
0
  ss_256[5] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1740
0
  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r);
1741
0
  ss_256[0] = ss_256[1];
1742
0
  ss_256[1] = ss_256[2];
1743
0
  ss_256[3] = ss_256[4];
1744
0
  ss_256[4] = ss_256[5];
1745
0
}
1746
1747
static inline void xy_y_convolve_6tap_8x2_half_pel_avx2(
1748
    const int16_t *const src, const __m256i coeffs[2], __m256i s_256[6],
1749
0
    __m256i r[2]) {
1750
0
  __m256i a_256[2], ss_256[4];
1751
0
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1752
0
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1753
0
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1754
0
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1755
0
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1756
0
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1757
0
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1758
0
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1759
0
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1760
0
  s_256[0] = s_256[2];
1761
0
  s_256[1] = s_256[3];
1762
0
  s_256[2] = s_256[4];
1763
0
  s_256[3] = s_256[5];
1764
0
}
1765
1766
static inline void xy_y_convolve_6tap_16x2_avx2(
1767
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6],
1768
    __m256i ss_256[6], __m256i tt_256[6], const __m256i coeffs[3],
1769
0
    __m256i r[4]) {
1770
0
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1771
0
  ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
1772
0
  ss_256[5] = _mm256_unpackhi_epi16(s_256[4], s_256[5]);
1773
0
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1774
0
  tt_256[2] = _mm256_unpacklo_epi16(s_256[5], s_256[4]);
1775
0
  tt_256[5] = _mm256_unpackhi_epi16(s_256[5], s_256[4]);
1776
0
1777
0
  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r + 0);
1778
0
  xy_y_convolve_6tap_16_avx2(tt_256, coeffs, r + 2);
1779
0
1780
0
  ss_256[0] = ss_256[1];
1781
0
  ss_256[1] = ss_256[2];
1782
0
  ss_256[3] = ss_256[4];
1783
0
  ss_256[4] = ss_256[5];
1784
0
1785
0
  tt_256[0] = tt_256[1];
1786
0
  tt_256[1] = tt_256[2];
1787
0
  tt_256[3] = tt_256[4];
1788
0
  tt_256[4] = tt_256[5];
1789
0
}
1790
1791
static inline void xy_y_convolve_6tap_16x2_half_pel_avx2(
1792
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6],
1793
0
    __m256i ss_256[4], const __m256i coeffs[2], __m256i r[4]) {
1794
0
  __m256i a_256[2];
1795
0
1796
0
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1797
0
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1798
0
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1799
0
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1800
0
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1801
0
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1802
0
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1803
0
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1804
0
1805
0
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[5]);
1806
0
  s_256[0] = s_256[2];
1807
0
  s_256[2] = s_256[4];
1808
0
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1809
0
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1810
0
  s_256[1] = s_256[3];
1811
0
  s_256[3] = s_256[5];
1812
0
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1813
0
  ss_256[1] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
1814
0
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1815
0
  ss_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
1816
0
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1817
0
}
1818
1819
static inline __m128i xy_y_convolve_8tap_2x2_sse2(const int16_t *const src,
1820
                                                  __m128i s_32[8],
1821
                                                  __m128i ss_128[4],
1822
0
                                                  const __m128i coeffs[4]) {
1823
0
  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * 2));
1824
0
  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1825
0
  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * 2));
1826
0
  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1827
0
  ss_128[3] = _mm_unpacklo_epi16(src67, src78);
1828
0
  const __m128i r = convolve16_8tap_sse2(ss_128, coeffs);
1829
0
  ss_128[0] = ss_128[1];
1830
0
  ss_128[1] = ss_128[2];
1831
0
  ss_128[2] = ss_128[3];
1832
0
  return r;
1833
0
}
1834
1835
static inline __m256i xy_y_convolve_8tap_4x2_avx2(const int16_t *const src,
1836
                                                  __m128i s_64[8],
1837
                                                  __m256i ss_256[4],
1838
0
                                                  const __m256i coeffs[4]) {
1839
0
  __m256i s_256[2];
1840
0
  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * 4));
1841
0
  s_256[0] = _mm256_setr_m128i(s_64[6], s_64[7]);
1842
0
  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * 4));
1843
0
  s_256[1] = _mm256_setr_m128i(s_64[7], s_64[6]);
1844
0
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1845
0
  const __m256i r = convolve16_8tap_avx2(ss_256, coeffs);
1846
0
  ss_256[0] = ss_256[1];
1847
0
  ss_256[1] = ss_256[2];
1848
0
  ss_256[2] = ss_256[3];
1849
0
  return r;
1850
0
}
1851
1852
static inline void xy_y_convolve_8tap_16_avx2(const __m256i *const ss,
1853
                                              const __m256i coeffs[4],
1854
0
                                              __m256i r[2]) {
1855
0
  r[0] = convolve16_8tap_avx2(ss, coeffs);
1856
0
  r[1] = convolve16_8tap_avx2(ss + 4, coeffs);
1857
0
}
1858
1859
static inline void xy_y_convolve_8tap_8x2_avx2(const int16_t *const src,
1860
                                               __m256i ss_256[8],
1861
                                               const __m256i coeffs[4],
1862
0
                                               __m256i r[2]) {
1863
0
  __m256i s_256[2];
1864
0
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1865
0
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1866
0
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1867
0
  ss_256[7] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1868
0
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r);
1869
0
  ss_256[0] = ss_256[1];
1870
0
  ss_256[1] = ss_256[2];
1871
0
  ss_256[2] = ss_256[3];
1872
0
  ss_256[4] = ss_256[5];
1873
0
  ss_256[5] = ss_256[6];
1874
0
  ss_256[6] = ss_256[7];
1875
0
}
1876
1877
static inline void xy_y_convolve_8tap_8x2_half_pel_avx2(
1878
    const int16_t *const src, const __m256i coeffs[2], __m256i s_256[8],
1879
0
    __m256i r[2]) {
1880
0
  __m256i a_256[4], ss_256[4];
1881
0
1882
0
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1883
0
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1884
0
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1885
0
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1886
0
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1887
0
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1888
0
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1889
0
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1890
0
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1891
0
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1892
0
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1893
0
  s_256[0] = s_256[2];
1894
0
  s_256[1] = s_256[3];
1895
0
  s_256[2] = s_256[4];
1896
0
  s_256[3] = s_256[5];
1897
0
  s_256[4] = s_256[6];
1898
0
  s_256[5] = s_256[7];
1899
0
}
1900
1901
static AOM_FORCE_INLINE void xy_y_convolve_8tap_16x2_avx2(
1902
    const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1903
0
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1904
0
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1905
0
  ss_256[3] = _mm256_unpacklo_epi16(s_256[6], s_256[7]);
1906
0
  ss_256[7] = _mm256_unpackhi_epi16(s_256[6], s_256[7]);
1907
0
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1908
0
  tt_256[3] = _mm256_unpacklo_epi16(s_256[7], s_256[6]);
1909
0
  tt_256[7] = _mm256_unpackhi_epi16(s_256[7], s_256[6]);
1910
0
1911
0
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r + 0);
1912
0
  xy_y_convolve_8tap_16_avx2(tt_256, coeffs, r + 2);
1913
0
1914
0
  ss_256[0] = ss_256[1];
1915
0
  ss_256[1] = ss_256[2];
1916
0
  ss_256[2] = ss_256[3];
1917
0
  ss_256[4] = ss_256[5];
1918
0
  ss_256[5] = ss_256[6];
1919
0
  ss_256[6] = ss_256[7];
1920
0
1921
0
  tt_256[0] = tt_256[1];
1922
0
  tt_256[1] = tt_256[2];
1923
0
  tt_256[2] = tt_256[3];
1924
0
  tt_256[4] = tt_256[5];
1925
0
  tt_256[5] = tt_256[6];
1926
0
  tt_256[6] = tt_256[7];
1927
0
}
1928
1929
static inline void xy_y_convolve_8tap_16x2_half_pel_avx2(
1930
    const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1931
0
    __m256i s_256[8], __m256i r[4]) {
1932
0
  __m256i a_256[4], ss_256[4];
1933
0
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1934
0
1935
0
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1936
0
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1937
0
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1938
0
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1939
0
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1940
0
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1941
0
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1942
0
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1943
0
1944
0
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1945
0
1946
0
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[7]);
1947
0
  a_256[2] = _mm256_add_epi16(s_256[3], s_256[6]);
1948
0
  a_256[3] = _mm256_add_epi16(s_256[4], s_256[5]);
1949
0
  s_256[0] = s_256[2];
1950
0
  s_256[2] = s_256[4];
1951
0
  s_256[4] = s_256[6];
1952
0
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1953
0
1954
0
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[6]);
1955
0
  s_256[1] = s_256[3];
1956
0
  s_256[3] = s_256[5];
1957
0
  s_256[5] = s_256[7];
1958
0
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1959
0
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1960
0
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1961
0
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1962
0
1963
0
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1964
0
}
1965
1966
static inline void xy_y_round_store_8x2_avx2(const __m256i res[2],
1967
                                             uint8_t *const dst,
1968
0
                                             const ptrdiff_t stride) {
1969
0
  const __m256i r = xy_y_round_16_avx2(res);
1970
0
  pack_store_8x2_avx2(r, dst, stride);
1971
0
}
1972
1973
static inline void xy_y_round_store_16x2_avx2(const __m256i res[4],
1974
                                              uint8_t *const dst,
1975
0
                                              const ptrdiff_t stride) {
1976
0
  const __m256i r0 = xy_y_round_16_avx2(res + 0);
1977
0
  const __m256i r1 = xy_y_round_16_avx2(res + 2);
1978
0
  xy_y_pack_store_16x2_avx2(r0, r1, dst, stride);
1979
0
}
1980
1981
static inline void sr_y_round_store_32_avx2(const __m256i res[2],
1982
1.91M
                                            uint8_t *const dst) {
1983
1.91M
  __m256i r[2];
1984
1985
1.91M
  r[0] = sr_y_round_avx2(res[0]);
1986
1.91M
  r[1] = sr_y_round_avx2(res[1]);
1987
1.91M
  convolve_store_32_avx2(r[0], r[1], dst);
1988
1.91M
}
1989
1990
static inline void sr_y_round_store_32x2_avx2(const __m256i res[4],
1991
                                              uint8_t *const dst,
1992
898k
                                              const int32_t dst_stride) {
1993
898k
  sr_y_round_store_32_avx2(res, dst);
1994
898k
  sr_y_round_store_32_avx2(res + 2, dst + dst_stride);
1995
898k
}
1996
1997
static inline void sr_y_2tap_32_avx2(const uint8_t *const src,
1998
                                     const __m256i coeffs[1], const __m256i s0,
1999
118k
                                     __m256i *const s1, uint8_t *const dst) {
2000
118k
  __m256i r[2];
2001
118k
  y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
2002
118k
  sr_y_round_store_32_avx2(r, dst);
2003
118k
}
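// The sr_* ("single reference") helpers finish the y-only path:
// sr_y_round_avx2 rounds the 16-bit sums back to pixel precision (also
// compensating for the halved coefficients noted for
// prepare_half_coeffs_2tap_ssse3 at the top of this file), and
// convolve_store_32_avx2 packs two 16-pixel results into one 32-byte store.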
2004
2005
static AOM_FORCE_INLINE void av1_convolve_y_sr_specialized_avx2(
2006
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
2007
    int32_t w, int32_t h, const InterpFilterParams *filter_params_y,
2008
705k
    const int32_t subpel_y_q4) {
2009
705k
  int32_t x, y;
2010
705k
  __m128i coeffs_128[4];
2011
705k
  __m256i coeffs_256[4];
2012
2013
705k
  int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
2014
2015
705k
  if (vert_tap == 2) {
2016
    // vertical filter with 2 taps
2017
45.6k
    const uint8_t *src_ptr = src;
2018
2019
45.6k
    y = h;
2020
2021
45.6k
    if (subpel_y_q4 != 8) {
2022
14.8k
      if (w <= 8) {
2023
11.0k
        prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4,
2024
11.0k
                                       coeffs_128);
2025
2026
11.0k
        if (w == 2) {
2027
1.87k
          __m128i s_16[2];
2028
2029
1.87k
          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2030
2031
3.50k
          do {
2032
3.50k
            const __m128i res = y_convolve_2tap_2x2_ssse3(src_ptr, src_stride,
2033
3.50k
                                                          coeffs_128, s_16);
2034
3.50k
            const __m128i r = sr_y_round_sse2(res);
2035
3.50k
            pack_store_2x2_sse2(r, dst, dst_stride);
2036
3.50k
            src_ptr += 2 * src_stride;
2037
3.50k
            dst += 2 * dst_stride;
2038
3.50k
            y -= 2;
2039
3.50k
          } while (y);
2040
9.16k
        } else if (w == 4) {
2041
5.03k
          __m128i s_32[2];
2042
2043
5.03k
          s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2044
2045
14.5k
          do {
2046
14.5k
            const __m128i res = y_convolve_2tap_4x2_ssse3(src_ptr, src_stride,
2047
14.5k
                                                          coeffs_128, s_32);
2048
14.5k
            const __m128i r = sr_y_round_sse2(res);
2049
14.5k
            pack_store_4x2_sse2(r, dst, dst_stride);
2050
14.5k
            src_ptr += 2 * src_stride;
2051
14.5k
            dst += 2 * dst_stride;
2052
14.5k
            y -= 2;
2053
14.5k
          } while (y);
2054
5.03k
        } else {
2055
4.13k
          __m128i s_64[2], s_128[2];
2056
2057
4.13k
          assert(w == 8);
2058
2059
4.13k
          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2060
2061
13.6k
          do {
2062
            // Note: for this width, staying in 128-bit SSE registers is
            // faster than widening the rows into AVX2 registers.
2063
13.6k
            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2064
13.6k
            s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
2065
13.6k
            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2066
13.6k
            s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
2067
13.6k
            const __m128i ss0 = _mm_unpacklo_epi8(s_128[0], s_128[1]);
2068
13.6k
            const __m128i ss1 = _mm_unpackhi_epi8(s_128[0], s_128[1]);
2069
13.6k
            const __m128i res0 = convolve_2tap_ssse3(&ss0, coeffs_128);
2070
13.6k
            const __m128i res1 = convolve_2tap_ssse3(&ss1, coeffs_128);
2071
13.6k
            const __m128i r0 = sr_y_round_sse2(res0);
2072
13.6k
            const __m128i r1 = sr_y_round_sse2(res1);
2073
13.6k
            const __m128i d = _mm_packus_epi16(r0, r1);
2074
13.6k
            _mm_storel_epi64((__m128i *)dst, d);
2075
13.6k
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2076
13.6k
            src_ptr += 2 * src_stride;
2077
13.6k
            dst += 2 * dst_stride;
2078
13.6k
            y -= 2;
2079
13.6k
          } while (y);
2080
4.13k
        }
2081
11.0k
      } else {
2082
3.81k
        prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2083
2084
3.81k
        if (w == 16) {
2085
2.21k
          __m128i s_128[2];
2086
2087
2.21k
          s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2088
2089
16.0k
          do {
2090
16.0k
            __m256i r[2];
2091
2092
16.0k
            y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2093
16.0k
                                      r);
2094
16.0k
            sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2095
16.0k
            src_ptr += 2 * src_stride;
2096
16.0k
            dst += 2 * dst_stride;
2097
16.0k
            y -= 2;
2098
16.0k
          } while (y);
2099
2.21k
        } else if (w == 32) {
2100
1.01k
          __m256i s_256[2];
2101
2102
1.01k
          s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2103
2104
15.3k
          do {
2105
15.3k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0],
2106
15.3k
                              &s_256[1], dst);
2107
15.3k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1],
2108
15.3k
                              &s_256[0], dst + dst_stride);
2109
15.3k
            src_ptr += 2 * src_stride;
2110
15.3k
            dst += 2 * dst_stride;
2111
15.3k
            y -= 2;
2112
15.3k
          } while (y);
2113
1.01k
        } else if (w == 64) {
2114
488
          __m256i s_256[2][2];
2115
2116
488
          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2117
488
          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2118
2119
12.1k
          do {
2120
12.1k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2121
12.1k
                              &s_256[1][0], dst);
2122
12.1k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 32, coeffs_256,
2123
12.1k
                              s_256[0][1], &s_256[1][1], dst + 32);
2124
12.1k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2125
12.1k
                              &s_256[0][0], dst + dst_stride);
2126
12.1k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 32, coeffs_256,
2127
12.1k
                              s_256[1][1], &s_256[0][1], dst + dst_stride + 32);
2128
2129
12.1k
            src_ptr += 2 * src_stride;
2130
12.1k
            dst += 2 * dst_stride;
2131
12.1k
            y -= 2;
2132
12.1k
          } while (y);
2133
488
        } else {
2134
106
          __m256i s_256[2][4];
2135
2136
106
          assert(w == 128);
2137
2138
106
          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2139
106
          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2140
106
          s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2141
106
          s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2142
2143
4.86k
          do {
2144
4.86k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2145
4.86k
                              &s_256[1][0], dst);
2146
4.86k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 1 * 32, coeffs_256,
2147
4.86k
                              s_256[0][1], &s_256[1][1], dst + 1 * 32);
2148
4.86k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 2 * 32, coeffs_256,
2149
4.86k
                              s_256[0][2], &s_256[1][2], dst + 2 * 32);
2150
4.86k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 3 * 32, coeffs_256,
2151
4.86k
                              s_256[0][3], &s_256[1][3], dst + 3 * 32);
2152
2153
4.86k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2154
4.86k
                              &s_256[0][0], dst + dst_stride);
2155
4.86k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32, coeffs_256,
2156
4.86k
                              s_256[1][1], &s_256[0][1],
2157
4.86k
                              dst + dst_stride + 1 * 32);
2158
4.86k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32, coeffs_256,
2159
4.86k
                              s_256[1][2], &s_256[0][2],
2160
4.86k
                              dst + dst_stride + 2 * 32);
2161
4.86k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32, coeffs_256,
2162
4.86k
                              s_256[1][3], &s_256[0][3],
2163
4.86k
                              dst + dst_stride + 3 * 32);
2164
2165
4.86k
            src_ptr += 2 * src_stride;
2166
4.86k
            dst += 2 * dst_stride;
2167
4.86k
            y -= 2;
2168
4.86k
          } while (y);
2169
106
        }
2170
3.81k
      }
2171
30.8k
    } else {
2172
      // Half-pel offset: both taps are equal, so the 2-tap filter reduces to
      // an unsigned byte average (_mm_avg_epu8 / sr_y_2tap_32_avg_avx2).
2173
30.8k
      if (w <= 8) {
2174
26.7k
        if (w == 2) {
2175
6.13k
          __m128i s_16[2];
2176
2177
6.13k
          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2178
2179
11.7k
          do {
2180
11.7k
            s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + src_stride));
2181
11.7k
            const __m128i d0 = _mm_avg_epu8(s_16[0], s_16[1]);
2182
11.7k
            *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d0);
2183
11.7k
            s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
2184
11.7k
            const __m128i d1 = _mm_avg_epu8(s_16[1], s_16[0]);
2185
11.7k
            *(int16_t *)(dst + dst_stride) = (int16_t)_mm_cvtsi128_si32(d1);
2186
11.7k
            src_ptr += 2 * src_stride;
2187
11.7k
            dst += 2 * dst_stride;
2188
11.7k
            y -= 2;
2189
11.7k
          } while (y);
2190
20.5k
        } else if (w == 4) {
2191
12.8k
          __m128i s_32[2];
2192
2193
12.8k
          s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2194
2195
35.7k
          do {
2196
35.7k
            s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + src_stride));
2197
35.7k
            const __m128i d0 = _mm_avg_epu8(s_32[0], s_32[1]);
2198
35.7k
            xx_storel_32(dst, d0);
2199
35.7k
            s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2200
35.7k
            const __m128i d1 = _mm_avg_epu8(s_32[1], s_32[0]);
2201
35.7k
            xx_storel_32(dst + dst_stride, d1);
2202
35.7k
            src_ptr += 2 * src_stride;
2203
35.7k
            dst += 2 * dst_stride;
2204
35.7k
            y -= 2;
2205
35.7k
          } while (y);
2206
12.8k
        } else {
2207
7.70k
          __m128i s_64[2];
2208
2209
7.70k
          assert(w == 8);
2210
2211
7.70k
          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2212
2213
25.4k
          do {
2214
            // Note: for this width, staying in 128-bit SSE registers is
            // faster than widening the rows into AVX2 registers.
2215
25.4k
            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2216
25.4k
            const __m128i d0 = _mm_avg_epu8(s_64[0], s_64[1]);
2217
25.4k
            _mm_storel_epi64((__m128i *)dst, d0);
2218
25.4k
            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2219
25.4k
            const __m128i d1 = _mm_avg_epu8(s_64[1], s_64[0]);
2220
25.4k
            _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
2221
25.4k
            src_ptr += 2 * src_stride;
2222
25.4k
            dst += 2 * dst_stride;
2223
25.4k
            y -= 2;
2224
25.4k
          } while (y);
2225
7.70k
        }
2226
26.7k
      } else if (w == 16) {
2227
2.69k
        __m128i s_128[2];
2228
2229
2.69k
        s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2230
2231
15.9k
        do {
2232
15.9k
          s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
2233
15.9k
          const __m128i d0 = _mm_avg_epu8(s_128[0], s_128[1]);
2234
15.9k
          _mm_storeu_si128((__m128i *)dst, d0);
2235
15.9k
          s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2236
15.9k
          const __m128i d1 = _mm_avg_epu8(s_128[1], s_128[0]);
2237
15.9k
          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
2238
15.9k
          src_ptr += 2 * src_stride;
2239
15.9k
          dst += 2 * dst_stride;
2240
15.9k
          y -= 2;
2241
15.9k
        } while (y);
2242
2.69k
      } else if (w == 32) {
2243
968
        __m256i s_256[2];
2244
2245
968
        s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2246
2247
12.7k
        do {
2248
12.7k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0], &s_256[1], dst);
2249
12.7k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1], &s_256[0],
2250
12.7k
                                dst + dst_stride);
2251
12.7k
          src_ptr += 2 * src_stride;
2252
12.7k
          dst += 2 * dst_stride;
2253
12.7k
          y -= 2;
2254
12.7k
        } while (y);
2255
968
      } else if (w == 64) {
2256
342
        __m256i s_256[2][2];
2257
2258
342
        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2259
342
        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2260
2261
9.30k
        do {
2262
9.30k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2263
9.30k
                                dst);
2264
9.30k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 32, s_256[0][1],
2265
9.30k
                                &s_256[1][1], dst + 32);
2266
2267
9.30k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2268
9.30k
                                &s_256[0][0], dst + dst_stride);
2269
9.30k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 32, s_256[1][1],
2270
9.30k
                                &s_256[0][1], dst + dst_stride + 32);
2271
2272
9.30k
          src_ptr += 2 * src_stride;
2273
9.30k
          dst += 2 * dst_stride;
2274
9.30k
          y -= 2;
2275
9.30k
        } while (y);
2276
342
      } else {
2277
101
        __m256i s_256[2][4];
2278
2279
101
        assert(w == 128);
2280
2281
101
        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2282
101
        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2283
101
        s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2284
101
        s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2285
2286
4.35k
        do {
2287
4.35k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2288
4.35k
                                dst);
2289
4.35k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 1 * 32, s_256[0][1],
2290
4.35k
                                &s_256[1][1], dst + 1 * 32);
2291
4.35k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 2 * 32, s_256[0][2],
2292
4.35k
                                &s_256[1][2], dst + 2 * 32);
2293
4.35k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 3 * 32, s_256[0][3],
2294
4.35k
                                &s_256[1][3], dst + 3 * 32);
2295
2296
4.35k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2297
4.35k
                                &s_256[0][0], dst + dst_stride);
2298
4.35k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 1 * 32, s_256[1][1],
2299
4.35k
                                &s_256[0][1], dst + dst_stride + 1 * 32);
2300
4.35k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 2 * 32, s_256[1][2],
2301
4.35k
                                &s_256[0][2], dst + dst_stride + 2 * 32);
2302
4.35k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 3 * 32, s_256[1][3],
2303
4.35k
                                &s_256[0][3], dst + dst_stride + 3 * 32);
2304
2305
4.35k
          src_ptr += 2 * src_stride;
2306
4.35k
          dst += 2 * dst_stride;
2307
4.35k
          y -= 2;
2308
4.35k
        } while (y);
2309
101
      }
2310
30.8k
    }
2311
659k
  } else if (vert_tap == 4) {
2312
    // vert_filt as 4 tap
2313
342k
    const uint8_t *src_ptr = src - src_stride;
2314
2315
342k
    y = h;
2316
2317
342k
    if (w <= 4) {
2318
166k
      prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2319
2320
166k
      if (w == 2) {
2321
30.4k
        __m128i s_16[4], ss_128[2];
2322
2323
30.4k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2324
30.4k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2325
30.4k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2326
2327
30.4k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2328
30.4k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2329
2330
30.4k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
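        // ss_128[0] interleaves rows (0, 1) with rows (1, 2) at byte
        // granularity, so each 16-bit lane holds a vertically adjacent pixel
        // pair ready for the byte-wise multiply-add in the y_convolve_*
        // helpers.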
2331
2332
52.8k
        do {
2333
52.8k
          src_ptr += 2 * src_stride;
2334
52.8k
          const __m128i res = y_convolve_4tap_2x2_ssse3(
2335
52.8k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2336
52.8k
          const __m128i r = sr_y_round_sse2(res);
2337
52.8k
          pack_store_2x2_sse2(r, dst, dst_stride);
2338
2339
52.8k
          ss_128[0] = ss_128[1];
2340
52.8k
          dst += 2 * dst_stride;
2341
52.8k
          y -= 2;
2342
52.8k
        } while (y);
2343
135k
      } else {
2344
135k
        __m128i s_32[4], ss_128[2];
2345
2346
135k
        assert(w == 4);
2347
2348
135k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2349
135k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2350
135k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2351
2352
135k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2353
135k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2354
2355
135k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2356
2357
276k
        do {
2358
276k
          src_ptr += 2 * src_stride;
2359
276k
          const __m128i res = y_convolve_4tap_4x2_ssse3(
2360
276k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2361
276k
          const __m128i r = sr_y_round_sse2(res);
2362
276k
          pack_store_4x2_sse2(r, dst, dst_stride);
2363
2364
276k
          ss_128[0] = ss_128[1];
2365
276k
          dst += 2 * dst_stride;
2366
276k
          y -= 2;
2367
276k
        } while (y);
2368
135k
      }
2369
176k
    } else {
2370
176k
      prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2371
2372
176k
      if (w == 8) {
2373
115k
        __m128i s_64[4];
2374
115k
        __m256i ss_256[2];
2375
2376
115k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2377
115k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2378
115k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2379
2380
        // Load lines a and b. Line a to lower 128, line b to upper 128
2381
115k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2382
115k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2383
2384
115k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
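        // With consecutive rows split across the two 128-bit lanes, a single
        // 256-bit multiply-add filters two output rows per iteration; the
        // ss_256[0] = ss_256[1] rotation below slides the 4-tap window down
        // two source rows, so only the newly needed rows are loaded.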
2385
2386
232k
        do {
2387
232k
          src_ptr += 2 * src_stride;
2388
232k
          const __m256i res = y_convolve_4tap_8x2_avx2(
2389
232k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2390
232k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2391
2392
232k
          ss_256[0] = ss_256[1];
2393
232k
          dst += 2 * dst_stride;
2394
232k
          y -= 2;
2395
232k
        } while (y);
2396
115k
      } else if (w == 16) {
2397
55.4k
        __m128i s_128[4];
2398
55.4k
        __m256i ss_256[4], r[2];
2399
2400
55.4k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2401
55.4k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2402
55.4k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2403
2404
        // Load lines a and b. Line a to lower 128, line b to upper 128
2405
55.4k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2406
55.4k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2407
2408
55.4k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2409
55.4k
        ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
2410
2411
144k
        do {
2412
144k
          src_ptr += 2 * src_stride;
2413
144k
          y_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2414
144k
                                    ss_256, r);
2415
144k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2416
2417
144k
          ss_256[0] = ss_256[1];
2418
144k
          ss_256[2] = ss_256[3];
2419
144k
          dst += 2 * dst_stride;
2420
144k
          y -= 2;
2421
144k
        } while (y);
2422
55.4k
      } else if (w == 32) {
2423
        // The AV1 standard has no 32x4 case.
2424
        // This path only serves an optimization feature that
2425
        // subsamples 32x8 to 32x4, which triggers the 4-tap filter.
2426
2427
4.39k
        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2428
2429
4.39k
        s_256[0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * src_stride));
2430
4.39k
        s_256[1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * src_stride));
2431
4.39k
        s_256[2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * src_stride));
2432
2433
4.39k
        ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2434
4.39k
        ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2435
2436
4.39k
        tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2437
4.39k
        tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
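        // No lane trick is possible for 32-wide rows: ss_256 interleaves the
        // even row pairs (0, 1), (2, 3) and tt_256 the odd pairs (1, 2),
        // (3, 4), giving the two output rows of each iteration 4-tap windows
        // offset by one source row.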
2438
2439
33.4k
        do {
2440
33.4k
          src_ptr += 2 * src_stride;
2441
33.4k
          y_convolve_4tap_32x2_avx2(src_ptr, src_stride, coeffs_256, s_256,
2442
33.4k
                                    ss_256, tt_256, r);
2443
33.4k
          sr_y_round_store_32x2_avx2(r, dst, dst_stride);
2444
2445
33.4k
          ss_256[0] = ss_256[1];
2446
33.4k
          ss_256[2] = ss_256[3];
2447
2448
33.4k
          tt_256[0] = tt_256[1];
2449
33.4k
          tt_256[2] = tt_256[3];
2450
33.4k
          dst += 2 * dst_stride;
2451
33.4k
          y -= 2;
2452
33.4k
        } while (y);
2453
4.39k
      } else {
2454
1.57k
        assert(!(w % 32));
2455
2456
1.57k
        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2457
1.57k
        x = 0;
2458
3.91k
        do {
2459
3.91k
          const uint8_t *s = src_ptr + x;
2460
3.91k
          uint8_t *d = dst + x;
2461
3.91k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2462
3.91k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2463
3.91k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2464
2465
3.91k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2466
3.91k
          ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2467
2468
3.91k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2469
3.91k
          tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2470
2471
3.91k
          y = h;
2472
167k
          do {
2473
167k
            s += 2 * src_stride;
2474
167k
            y_convolve_4tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2475
167k
                                      tt_256, r);
2476
167k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2477
2478
167k
            ss_256[0] = ss_256[1];
2479
167k
            ss_256[2] = ss_256[3];
2480
2481
167k
            tt_256[0] = tt_256[1];
2482
167k
            tt_256[2] = tt_256[3];
2483
167k
            d += 2 * dst_stride;
2484
167k
            y -= 2;
2485
167k
          } while (y);
2486
3.91k
          x += 32;
2487
3.91k
        } while (x < w);
2488
1.57k
      }
2489
176k
    }
2490
342k
  } else if (vert_tap == 6) {
2491
    // vert_filt as 6 tap
2492
298k
    const uint8_t *src_ptr = src - 2 * src_stride;
2493
2494
298k
    if (w <= 4) {
2495
96.7k
      prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2496
2497
96.7k
      y = h;
2498
2499
96.7k
      if (w == 2) {
2500
18.5k
        __m128i s_16[6], ss_128[3];
2501
2502
18.5k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2503
18.5k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2504
18.5k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2505
18.5k
        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2506
18.5k
        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2507
2508
18.5k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2509
18.5k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2510
18.5k
        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2511
18.5k
        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2512
2513
18.5k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2514
18.5k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2515
2516
74.2k
        do {
2517
74.2k
          src_ptr += 2 * src_stride;
2518
74.2k
          const __m128i res = y_convolve_6tap_2x2_ssse3(
2519
74.2k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2520
74.2k
          const __m128i r = sr_y_round_sse2(res);
2521
74.2k
          pack_store_2x2_sse2(r, dst, dst_stride);
2522
2523
74.2k
          ss_128[0] = ss_128[1];
2524
74.2k
          ss_128[1] = ss_128[2];
2525
74.2k
          dst += 2 * dst_stride;
2526
74.2k
          y -= 2;
2527
74.2k
        } while (y);
2528
78.2k
      } else {
2529
78.2k
        __m128i s_32[6], ss_128[3];
2530
2531
78.2k
        assert(w == 4);
2532
2533
78.2k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2534
78.2k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2535
78.2k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2536
78.2k
        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2537
78.2k
        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2538
2539
78.2k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2540
78.2k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2541
78.2k
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2542
78.2k
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2543
2544
78.2k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2545
78.2k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2546
2547
426k
        do {
2548
426k
          src_ptr += 2 * src_stride;
2549
426k
          const __m128i res = y_convolve_6tap_4x2_ssse3(
2550
426k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2551
426k
          const __m128i r = sr_y_round_sse2(res);
2552
426k
          pack_store_4x2_sse2(r, dst, dst_stride);
2553
2554
426k
          ss_128[0] = ss_128[1];
2555
426k
          ss_128[1] = ss_128[2];
2556
426k
          dst += 2 * dst_stride;
2557
426k
          y -= 2;
2558
426k
        } while (y);
2559
78.2k
      }
2560
201k
    } else {
2561
201k
      prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2562
2563
201k
      if (w == 8) {
2564
98.7k
        __m128i s_64[6];
2565
98.7k
        __m256i ss_256[3];
2566
2567
98.7k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2568
98.7k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2569
98.7k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2570
98.7k
        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2571
98.7k
        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2572
2573
        // Load lines a and b. Line a to lower 128, line b to upper 128
2574
98.7k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2575
98.7k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2576
98.7k
        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2577
98.7k
        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2578
2579
98.7k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2580
98.7k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2581
2582
98.7k
        y = h;
2583
563k
        do {
2584
563k
          src_ptr += 2 * src_stride;
2585
563k
          const __m256i res = y_convolve_6tap_8x2_avx2(
2586
563k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2587
563k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2588
2589
563k
          ss_256[0] = ss_256[1];
2590
563k
          ss_256[1] = ss_256[2];
2591
563k
          dst += 2 * dst_stride;
2592
563k
          y -= 2;
2593
563k
        } while (y);
2594
102k
      } else if (w == 16) {
2595
71.4k
        __m128i s_128[6];
2596
71.4k
        __m256i ss_256[6], r[2];
2597
2598
71.4k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2599
71.4k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2600
71.4k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2601
71.4k
        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2602
71.4k
        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2603
2604
        // Load lines a and b. Line a to lower 128, line b to upper 128
2605
71.4k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2606
71.4k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2607
71.4k
        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2608
71.4k
        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2609
2610
71.4k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2611
71.4k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2612
2613
71.4k
        ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
2614
71.4k
        ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
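        // 16-wide rows need both byte halves: ss_256[0..2] hold the low-half
        // interleaves and ss_256[3..5] the high-half ones; the rotation below
        // implies the helper deposits the incoming row pair into slots 2 and
        // 5 on each iteration.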
2615
2616
71.4k
        y = h;
2617
527k
        do {
2618
527k
          src_ptr += 2 * src_stride;
2619
527k
          y_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2620
527k
                                    ss_256, r);
2621
527k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2622
2623
527k
          ss_256[0] = ss_256[1];
2624
527k
          ss_256[1] = ss_256[2];
2625
2626
527k
          ss_256[3] = ss_256[4];
2627
527k
          ss_256[4] = ss_256[5];
2628
527k
          dst += 2 * dst_stride;
2629
527k
          y -= 2;
2630
527k
        } while (y);
2631
71.4k
      } else {
2632
31.3k
        __m256i s_256[6], ss_256[6], tt_256[6], r[4];
2633
2634
31.3k
        assert(!(w % 32));
2635
2636
31.3k
        x = 0;
2637
37.5k
        do {
2638
37.5k
          const uint8_t *s = src_ptr + x;
2639
37.5k
          uint8_t *d = dst + x;
2640
2641
37.5k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2642
37.5k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2643
37.5k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2644
37.5k
          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2645
37.5k
          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2646
2647
37.5k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2648
37.5k
          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2649
37.5k
          ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2650
37.5k
          ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2651
2652
37.5k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2653
37.5k
          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2654
37.5k
          tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2655
37.5k
          tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2656
2657
37.5k
          y = h;
2658
646k
          do {
2659
646k
            s += 2 * src_stride;
2660
646k
            y_convolve_6tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2661
646k
                                      tt_256, r);
2662
646k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2663
2664
646k
            ss_256[0] = ss_256[1];
2665
646k
            ss_256[1] = ss_256[2];
2666
646k
            ss_256[3] = ss_256[4];
2667
646k
            ss_256[4] = ss_256[5];
2668
2669
646k
            tt_256[0] = tt_256[1];
2670
646k
            tt_256[1] = tt_256[2];
2671
646k
            tt_256[3] = tt_256[4];
2672
646k
            tt_256[4] = tt_256[5];
2673
646k
            d += 2 * dst_stride;
2674
646k
            y -= 2;
2675
646k
          } while (y);
2676
2677
37.5k
          x += 32;
2678
37.5k
        } while (x < w);
2679
31.3k
      }
2680
201k
    }
2681
298k
  } else if (vert_tap == 8) {
2682
    // vert_filt as 8 tap
2683
18.7k
    const uint8_t *src_ptr = src - 3 * src_stride;
2684
2685
18.7k
    if (w <= 4) {
2686
7.23k
      prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2687
2688
7.23k
      y = h;
2689
2690
7.23k
      if (w == 2) {
2691
1.50k
        __m128i s_16[8], ss_128[4];
2692
2693
1.50k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2694
1.50k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2695
1.50k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2696
1.50k
        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2697
1.50k
        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2698
1.50k
        s_16[5] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 5 * src_stride));
2699
1.50k
        s_16[6] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 6 * src_stride));
2700
2701
1.50k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2702
1.50k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2703
1.50k
        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2704
1.50k
        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2705
1.50k
        const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
2706
1.50k
        const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
2707
2708
1.50k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2709
1.50k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2710
1.50k
        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2711
2712
6.01k
        do {
2713
6.01k
          const __m128i res = y_convolve_8tap_2x2_ssse3(
2714
6.01k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2715
6.01k
          const __m128i r = sr_y_round_sse2(res);
2716
6.01k
          pack_store_2x2_sse2(r, dst, dst_stride);
2717
6.01k
          ss_128[0] = ss_128[1];
2718
6.01k
          ss_128[1] = ss_128[2];
2719
6.01k
          ss_128[2] = ss_128[3];
2720
6.01k
          src_ptr += 2 * src_stride;
2721
6.01k
          dst += 2 * dst_stride;
2722
6.01k
          y -= 2;
2723
6.01k
        } while (y);
2724
5.73k
      } else {
2725
5.73k
        __m128i s_32[8], ss_128[4];
2726
2727
5.73k
        assert(w == 4);
2728
2729
5.73k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2730
5.73k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2731
5.73k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2732
5.73k
        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2733
5.73k
        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2734
5.73k
        s_32[5] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 5 * src_stride));
2735
5.73k
        s_32[6] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 6 * src_stride));
2736
2737
5.73k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2738
5.73k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2739
5.73k
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2740
5.73k
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2741
5.73k
        const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
2742
5.73k
        const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
2743
2744
5.73k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2745
5.73k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2746
5.73k
        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2747
2748
30.6k
        do {
2749
30.6k
          const __m128i res = y_convolve_8tap_4x2_ssse3(
2750
30.6k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2751
30.6k
          const __m128i r = sr_y_round_sse2(res);
2752
30.6k
          pack_store_4x2_sse2(r, dst, dst_stride);
2753
30.6k
          ss_128[0] = ss_128[1];
2754
30.6k
          ss_128[1] = ss_128[2];
2755
30.6k
          ss_128[2] = ss_128[3];
2756
30.6k
          src_ptr += 2 * src_stride;
2757
30.6k
          dst += 2 * dst_stride;
2758
30.6k
          y -= 2;
2759
30.6k
        } while (y);
2760
5.73k
      }
2761
11.5k
    } else {
2762
11.5k
      prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2763
2764
11.5k
      if (w == 8) {
2765
5.84k
        __m128i s_64[8];
2766
5.84k
        __m256i ss_256[4];
2767
2768
5.84k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2769
5.84k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2770
5.84k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2771
5.84k
        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2772
5.84k
        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2773
5.84k
        s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
2774
5.84k
        s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
2775
2776
        // Load lines a and b. Line a to lower 128, line b to upper 128
2777
5.84k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2778
5.84k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2779
5.84k
        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2780
5.84k
        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2781
5.84k
        const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
2782
5.84k
        const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
2783
2784
5.84k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2785
5.84k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2786
5.84k
        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2787
2788
5.84k
        y = h;
2789
34.1k
        do {
2790
34.1k
          const __m256i res = y_convolve_8tap_8x2_avx2(
2791
34.1k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2792
34.1k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2793
34.1k
          ss_256[0] = ss_256[1];
2794
34.1k
          ss_256[1] = ss_256[2];
2795
34.1k
          ss_256[2] = ss_256[3];
2796
34.1k
          src_ptr += 2 * src_stride;
2797
34.1k
          dst += 2 * dst_stride;
2798
34.1k
          y -= 2;
2799
34.1k
        } while (y);
2800
5.84k
      } else if (w == 16) {
2801
3.91k
        __m128i s_128[8];
2802
3.91k
        __m256i ss_256[8], r[2];
2803
2804
3.91k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2805
3.91k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2806
3.91k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2807
3.91k
        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2808
3.91k
        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2809
3.91k
        s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
2810
3.91k
        s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
2811
2812
        // Load lines a and b. Line a to lower 128, line b to upper 128
2813
3.91k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2814
3.91k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2815
3.91k
        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2816
3.91k
        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2817
3.91k
        const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
2818
3.91k
        const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
2819
2820
3.91k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2821
3.91k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2822
3.91k
        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2823
2824
3.91k
        ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
2825
3.91k
        ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
2826
3.91k
        ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
2827
2828
3.91k
        y = h;
2829
28.9k
        do {
2830
28.9k
          y_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2831
28.9k
                                    ss_256, r);
2832
28.9k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2833
2834
28.9k
          ss_256[0] = ss_256[1];
2835
28.9k
          ss_256[1] = ss_256[2];
2836
28.9k
          ss_256[2] = ss_256[3];
2837
2838
28.9k
          ss_256[4] = ss_256[5];
2839
28.9k
          ss_256[5] = ss_256[6];
2840
28.9k
          ss_256[6] = ss_256[7];
2841
28.9k
          src_ptr += 2 * src_stride;
2842
28.9k
          dst += 2 * dst_stride;
2843
28.9k
          y -= 2;
2844
28.9k
        } while (y);
2845
3.91k
      } else {
2846
1.78k
        __m256i s_256[8], ss_256[8], tt_256[8], r[4];
2847
2848
1.78k
        assert(!(w % 32));
2849
2850
1.78k
        x = 0;
2851
2.52k
        do {
2852
2.52k
          const uint8_t *s = src_ptr + x;
2853
2.52k
          uint8_t *d = dst + x;
2854
2855
2.52k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2856
2.52k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2857
2.52k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2858
2.52k
          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2859
2.52k
          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2860
2.52k
          s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
2861
2.52k
          s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
2862
2863
2.52k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2864
2.52k
          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2865
2.52k
          ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
2866
2.52k
          ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2867
2.52k
          ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2868
2.52k
          ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
2869
2870
2.52k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2871
2.52k
          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2872
2.52k
          tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
2873
2.52k
          tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2874
2.52k
          tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2875
2.52k
          tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
2876
2877
2.52k
          y = h;
2878
51.0k
          do {
2879
51.0k
            y_convolve_8tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2880
51.0k
                                      tt_256, r);
2881
51.0k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2882
2883
51.0k
            ss_256[0] = ss_256[1];
2884
51.0k
            ss_256[1] = ss_256[2];
2885
51.0k
            ss_256[2] = ss_256[3];
2886
51.0k
            ss_256[4] = ss_256[5];
2887
51.0k
            ss_256[5] = ss_256[6];
2888
51.0k
            ss_256[6] = ss_256[7];
2889
2890
51.0k
            tt_256[0] = tt_256[1];
2891
51.0k
            tt_256[1] = tt_256[2];
2892
51.0k
            tt_256[2] = tt_256[3];
2893
51.0k
            tt_256[4] = tt_256[5];
2894
51.0k
            tt_256[5] = tt_256[6];
2895
51.0k
            tt_256[6] = tt_256[7];
2896
51.0k
            s += 2 * src_stride;
2897
51.0k
            d += 2 * dst_stride;
2898
51.0k
            y -= 2;
2899
51.0k
          } while (y);
2900
2901
2.52k
          x += 32;
2902
2.52k
        } while (x < w);
2903
1.78k
      }
2904
11.5k
    }
2905
18.7k
  }
2906
705k
}
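
// Reference model (illustrative sketch only, not part of this header):
// assuming AV1's single-stage rounding by FILTER_BITS == 7 (the half-width
// coefficients used above fold their extra shift into this final rounding),
// every specialized vertical path computes the equivalent of:
//
// static void convolve_y_sr_ref(const uint8_t *src, int src_stride,
//                               uint8_t *dst, int dst_stride, int w, int h,
//                               const int16_t *taps, int ntaps) {
//   src -= (ntaps / 2 - 1) * src_stride;  // e.g. the 8-tap path starts
//                                         // 3 rows above the output row
//   for (int y = 0; y < h; ++y) {
//     for (int x = 0; x < w; ++x) {
//       int sum = 0;
//       for (int k = 0; k < ntaps; ++k)
//         sum += taps[k] * src[(y + k) * src_stride + x];
//       const int v = (sum + 64) >> 7;  // round by FILTER_BITS
//       dst[y * dst_stride + x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
//     }
//   }
// }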
2907
2908
static inline void sr_x_2tap_32_avx2(const uint8_t *const src,
2909
                                     const __m256i coeffs[1],
2910
0
                                     uint8_t *const dst) {
2911
0
  __m256i r[2];
2912
0
2913
0
  x_convolve_2tap_32_avx2(src, coeffs, r);
2914
0
  sr_x_round_store_32_avx2(r, dst);
2915
0
}
2916
2917
static inline void sr_x_6tap_32_avx2(const uint8_t *const src,
2918
                                     const __m256i coeffs[3],
2919
                                     const __m256i filt[3],
2920
0
                                     uint8_t *const dst) {
2921
0
  __m256i r[2];
2922
0
2923
0
  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
2924
0
  sr_x_round_store_32_avx2(r, dst);
2925
0
}
2926
2927
static AOM_FORCE_INLINE void sr_x_8tap_32_avx2(const uint8_t *const src,
2928
                                               const __m256i coeffs[4],
2929
                                               const __m256i filt[4],
2930
0
                                               uint8_t *const dst) {
2931
0
  __m256i r[2];
2932
0
2933
0
  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
2934
0
  sr_x_round_store_32_avx2(r, dst);
2935
0
}
2936
2937
static AOM_FORCE_INLINE void av1_convolve_x_sr_specialized_avx2(
2938
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
2939
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
2940
0
    const int32_t subpel_x_q4, ConvolveParams *conv_params) {
2941
0
  int32_t y = h;
2942
0
  __m128i coeffs_128[4];
2943
0
  __m256i coeffs_256[4];
2944
0
2945
0
  assert(conv_params->round_0 == 3);
2946
0
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
2947
0
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
2948
0
  (void)conv_params;
2949
0
2950
0
  const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);
2951
0
2952
0
  if (horz_tap == 2) {
2953
0
    // horz_filt as 2 tap
2954
0
    const uint8_t *src_ptr = src;
2955
0
2956
0
    if (subpel_x_q4 != 8) {
2957
0
      if (w <= 8) {
2958
0
        prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4,
2959
0
                                       coeffs_128);
2960
0
2961
0
        if (w == 2) {
2962
0
          do {
2963
0
            const __m128i res =
2964
0
                x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
2965
0
            const __m128i r = sr_x_round_sse2(res);
2966
0
            pack_store_2x2_sse2(r, dst, dst_stride);
2967
0
            src_ptr += 2 * src_stride;
2968
0
            dst += 2 * dst_stride;
2969
0
            y -= 2;
2970
0
          } while (y);
2971
0
        } else if (w == 4) {
2972
0
          do {
2973
0
            const __m128i res =
2974
0
                x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
2975
0
            const __m128i r = sr_x_round_sse2(res);
2976
0
            pack_store_4x2_sse2(r, dst, dst_stride);
2977
0
            src_ptr += 2 * src_stride;
2978
0
            dst += 2 * dst_stride;
2979
0
            y -= 2;
2980
0
          } while (y);
2981
0
        } else {
2982
0
          assert(w == 8);
2983
0
2984
0
          do {
2985
0
            __m128i res[2];
2986
0
2987
0
            x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, res);
2988
0
            res[0] = sr_x_round_sse2(res[0]);
2989
0
            res[1] = sr_x_round_sse2(res[1]);
2990
0
            const __m128i d = _mm_packus_epi16(res[0], res[1]);
2991
0
            _mm_storel_epi64((__m128i *)dst, d);
2992
0
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2993
0
2994
0
            src_ptr += 2 * src_stride;
2995
0
            dst += 2 * dst_stride;
2996
0
            y -= 2;
2997
0
          } while (y);
2998
0
        }
2999
0
      } else {
3000
0
        prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3001
0
3002
0
        if (w == 16) {
3003
0
          do {
3004
0
            __m256i r[2];
3005
0
3006
0
            x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
3007
0
            sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3008
0
            src_ptr += 2 * src_stride;
3009
0
            dst += 2 * dst_stride;
3010
0
            y -= 2;
3011
0
          } while (y);
3012
0
        } else if (w == 32) {
3013
0
          do {
3014
0
            sr_x_2tap_32_avx2(src_ptr, coeffs_256, dst);
3015
0
            src_ptr += src_stride;
3016
0
            dst += dst_stride;
3017
0
          } while (--y);
3018
0
        } else if (w == 64) {
3019
0
          do {
3020
0
            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
3021
0
            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
3022
0
            src_ptr += src_stride;
3023
0
            dst += dst_stride;
3024
0
          } while (--y);
3025
0
        } else {
3026
0
          assert(w == 128);
3027
0
3028
0
          do {
3029
0
            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
3030
0
            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
3031
0
            sr_x_2tap_32_avx2(src_ptr + 2 * 32, coeffs_256, dst + 2 * 32);
3032
0
            sr_x_2tap_32_avx2(src_ptr + 3 * 32, coeffs_256, dst + 3 * 32);
3033
0
            src_ptr += src_stride;
3034
0
            dst += dst_stride;
3035
0
          } while (--y);
3036
0
        }
3037
0
      }
3038
0
    } else {
3039
0
      // Average adjacent pixels to get the exact half-pel result.
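      // At subpel_x_q4 == 8 the bilinear kernel is (64, 64), so the filtered
      // value (64 * a + 64 * b + 64) >> FILTER_BITS, with FILTER_BITS == 7,
      // reduces to (a + b + 1) >> 1, which _mm_avg_epu8 computes per byte.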
3040
0
      if (w == 2) {
3041
0
        do {
3042
0
          __m128i s_128;
3043
0
3044
0
          s_128 = load_u8_4x2_sse4_1(src_ptr, src_stride);
3045
0
          const __m128i s1 = _mm_srli_si128(s_128, 1);
3046
0
          const __m128i d = _mm_avg_epu8(s_128, s1);
3047
0
          *(uint16_t *)dst = (uint16_t)_mm_cvtsi128_si32(d);
3048
0
          *(uint16_t *)(dst + dst_stride) = _mm_extract_epi16(d, 2);
3049
0
3050
0
          src_ptr += 2 * src_stride;
3051
0
          dst += 2 * dst_stride;
3052
0
          y -= 2;
3053
0
        } while (y);
3054
0
      } else if (w == 4) {
3055
0
        do {
3056
0
          __m128i s_128;
3057
0
3058
0
          s_128 = load_u8_8x2_sse2(src_ptr, src_stride);
3059
0
          const __m128i s1 = _mm_srli_si128(s_128, 1);
3060
0
          const __m128i d = _mm_avg_epu8(s_128, s1);
3061
0
          xx_storel_32(dst, d);
3062
0
          *(int32_t *)(dst + dst_stride) = _mm_extract_epi32(d, 2);
3063
0
3064
0
          src_ptr += 2 * src_stride;
3065
0
          dst += 2 * dst_stride;
3066
0
          y -= 2;
3067
0
        } while (y);
3068
0
      } else if (w == 8) {
3069
0
        do {
3070
0
          const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
3071
0
          const __m128i s10 =
3072
0
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
3073
0
          const __m128i s01 = _mm_srli_si128(s00, 1);
3074
0
          const __m128i s11 = _mm_srli_si128(s10, 1);
3075
0
          const __m128i d0 = _mm_avg_epu8(s00, s01);
3076
0
          const __m128i d1 = _mm_avg_epu8(s10, s11);
3077
0
          _mm_storel_epi64((__m128i *)dst, d0);
3078
0
          _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
3079
0
3080
0
          src_ptr += 2 * src_stride;
3081
0
          dst += 2 * dst_stride;
3082
0
          y -= 2;
3083
0
        } while (y);
3084
0
      } else if (w == 16) {
3085
0
        do {
3086
0
          const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
3087
0
          const __m128i s01 = _mm_loadu_si128((__m128i *)(src_ptr + 1));
3088
0
          const __m128i s10 =
3089
0
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
3090
0
          const __m128i s11 =
3091
0
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1));
3092
0
          const __m128i d0 = _mm_avg_epu8(s00, s01);
3093
0
          const __m128i d1 = _mm_avg_epu8(s10, s11);
3094
0
          _mm_storeu_si128((__m128i *)dst, d0);
3095
0
          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
3096
0
3097
0
          src_ptr += 2 * src_stride;
3098
0
          dst += 2 * dst_stride;
3099
0
          y -= 2;
3100
0
        } while (y);
3101
0
      } else if (w == 32) {
3102
0
        do {
3103
0
          sr_x_2tap_32_avg_avx2(src_ptr, dst);
3104
0
          src_ptr += src_stride;
3105
0
          dst += dst_stride;
3106
0
        } while (--y);
3107
0
      } else if (w == 64) {
3108
0
        do {
3109
0
          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
3110
0
          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
3111
0
          src_ptr += src_stride;
3112
0
          dst += dst_stride;
3113
0
        } while (--y);
3114
0
      } else {
3115
0
        assert(w == 128);
3116
0
3117
0
        do {
3118
0
          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
3119
0
          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
3120
0
          sr_x_2tap_32_avg_avx2(src_ptr + 2 * 32, dst + 2 * 32);
3121
0
          sr_x_2tap_32_avg_avx2(src_ptr + 3 * 32, dst + 3 * 32);
3122
0
          src_ptr += src_stride;
3123
0
          dst += dst_stride;
3124
0
        } while (--y);
3125
0
      }
3126
0
    }
3127
0
  } else if (horz_tap == 4) {
3128
0
    // horz_filt as 4 tap
3129
0
    const uint8_t *src_ptr = src - 1;
3130
0
3131
0
    prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
3132
0
3133
0
    if (w == 2) {
3134
0
      do {
3135
0
        const __m128i res =
3136
0
            x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
3137
0
        const __m128i r = sr_x_round_sse2(res);
3138
0
        pack_store_2x2_sse2(r, dst, dst_stride);
3139
0
        src_ptr += 2 * src_stride;
3140
0
        dst += 2 * dst_stride;
3141
0
        y -= 2;
3142
0
      } while (y);
3143
0
    } else if (w == 4) {
3144
0
      do {
3145
0
        const __m128i res =
3146
0
            x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
3147
0
        const __m128i r = sr_x_round_sse2(res);
3148
0
        pack_store_4x2_sse2(r, dst, dst_stride);
3149
0
        src_ptr += 2 * src_stride;
3150
0
        dst += 2 * dst_stride;
3151
0
        y -= 2;
3152
0
      } while (y);
3153
0
    } else if (w == 8) {
3154
0
      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
3155
0
      // rewrite this for better performance later.
3156
0
      __m256i filt_256[2];
3157
0
      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
3158
0
3159
0
      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3160
0
      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3161
0
      for (int i = 0; i < h; i += 2) {
3162
0
        const __m256i data = _mm256_permute2x128_si256(
3163
0
            _mm256_castsi128_si256(
3164
0
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
3165
0
            _mm256_castsi128_si256(_mm_loadu_si128(
3166
0
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
3167
0
            0x20);
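        // imm 0x20 selects the low 128 bits of each operand: row i lands in
        // the lower lane of `data` and row i + 1 in the upper lane.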
3168
0
3169
0
        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
3170
0
        res_16b = sr_x_round_avx2(res_16b);
3171
0
3172
0
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
3173
0
3174
0
        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
3175
0
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
3176
0
3177
0
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
3178
0
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
3179
0
      }
3180
0
    } else {
3181
0
      assert(!(w % 16));
3182
0
      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
3183
0
      // rewrite this for better performance later.
3184
0
      __m256i filt_256[2];
3185
0
      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
3186
0
      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3187
0
      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3188
0
3189
0
      for (int i = 0; i < h; ++i) {
3190
0
        for (int j = 0; j < w; j += 16) {
3191
0
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
3192
0
          // 18 19 20 21 22 23
3193
0
          const __m256i data = _mm256_inserti128_si256(
3194
0
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
3195
0
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
3196
0
              1);
3197
0
3198
0
          __m256i res_16b =
3199
0
              convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
3200
0
          res_16b = sr_x_round_avx2(res_16b);
3201
0
3202
0
          // Rounding was already applied above by sr_x_round_avx2().
3203
0
          // Convert to 8 bits with unsigned saturation.
3204
0
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
3205
0
3206
0
          // Store values into the destination buffer
3207
0
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
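          // 216 == 0b11011000 reorders the 64-bit lanes to 0, 2, 1, 3 so the
          // 16 distinct output bytes land in the low 128 bits of res_8b.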
3208
0
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
3209
0
          __m128i res = _mm256_castsi256_si128(res_8b);
3210
0
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
3211
0
        }
3212
0
      }
3213
0
    }
3214
0
  } else {
3215
0
    __m256i filt_256[4];
3216
0
3217
0
    filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3218
0
    filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3219
0
    filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
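    // The filtN_global_avx2 tables are byte-shuffle masks: for each output
    // pixel they gather the Nth pair of horizontally adjacent source bytes
    // that feed the byte-wise multiply-adds in the x_convolve_* helpers.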
3220
0
3221
0
    if (horz_tap == 6) {
3222
0
      // horz_filt as 6 tap
3223
0
      const uint8_t *src_ptr = src - 2;
3224
0
3225
0
      prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3226
0
3227
0
      if (w == 8) {
3228
0
        do {
3229
0
          const __m256i res = x_convolve_6tap_8x2_avx2(src_ptr, src_stride,
3230
0
                                                       coeffs_256, filt_256);
3231
0
          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
3232
0
          src_ptr += 2 * src_stride;
3233
0
          dst += 2 * dst_stride;
3234
0
          y -= 2;
3235
0
        } while (y);
3236
0
      } else if (w == 16) {
3237
0
        do {
3238
0
          __m256i r[2];
3239
0
3240
0
          x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
3241
0
                                    r);
3242
0
          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3243
0
          src_ptr += 2 * src_stride;
3244
0
          dst += 2 * dst_stride;
3245
0
          y -= 2;
3246
0
        } while (y);
3247
0
      } else if (w == 32) {
3248
0
        do {
3249
0
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3250
0
          src_ptr += src_stride;
3251
0
          dst += dst_stride;
3252
0
        } while (--y);
3253
0
      } else if (w == 64) {
3254
0
        do {
3255
0
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3256
0
          sr_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
3257
0
          src_ptr += src_stride;
3258
0
          dst += dst_stride;
3259
0
        } while (--y);
3260
0
      } else {
3261
0
        assert(w == 128);
3262
0
3263
0
        do {
3264
0
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3265
0
          sr_x_6tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
3266
0
                            dst + 1 * 32);
3267
0
          sr_x_6tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
3268
0
                            dst + 2 * 32);
3269
0
          sr_x_6tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
3270
0
                            dst + 3 * 32);
3271
0
          src_ptr += src_stride;
3272
0
          dst += dst_stride;
3273
0
        } while (--y);
3274
0
      }
3275
0
    } else if (horz_tap == 8) {
3276
0
      // horz_filt as 8 tap
3277
0
      const uint8_t *src_ptr = src - 3;
3278
0
3279
0
      filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);
3280
0
3281
0
      prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3282
0
3283
0
      if (w == 8) {
3284
0
        do {
3285
0
          const __m256i res = x_convolve_8tap_8x2_avx2(src_ptr, src_stride,
3286
0
                                                       coeffs_256, filt_256);
3287
0
          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
3288
0
          src_ptr += 2 * src_stride;
3289
0
          dst += 2 * dst_stride;
3290
0
          y -= 2;
3291
0
        } while (y);
3292
0
      } else if (w == 16) {
3293
0
        do {
3294
0
          __m256i r[2];
3295
0
3296
0
          x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
3297
0
                                    r);
3298
0
          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3299
0
          src_ptr += 2 * src_stride;
3300
0
          dst += 2 * dst_stride;
3301
0
          y -= 2;
3302
0
        } while (y);
3303
0
      } else if (w == 32) {
3304
0
        do {
3305
0
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3306
0
          src_ptr += src_stride;
3307
0
          dst += dst_stride;
3308
0
        } while (--y);
3309
0
      } else if (w == 64) {
3310
0
        do {
3311
0
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3312
0
          sr_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
3313
0
          src_ptr += src_stride;
3314
0
          dst += dst_stride;
3315
0
        } while (--y);
3316
0
      } else {
3317
0
        assert(w == 128);
3318
0
3319
0
        do {
3320
0
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3321
0
          sr_x_8tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
3322
0
                            dst + 1 * 32);
3323
0
          sr_x_8tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
3324
0
                            dst + 2 * 32);
3325
0
          sr_x_8tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
3326
0
                            dst + 3 * 32);
3327
0
          src_ptr += src_stride;
3328
0
          dst += dst_stride;
3329
0
        } while (--y);
3330
0
      }
3331
0
    }
3332
0
  }
3333
0
}
3334
3335
#endif  // THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_