Coverage Report

Created: 2023-06-07 06:31

/src/aom/av1/common/x86/convolve_avx2.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <immintrin.h>
13
14
#include "config/av1_rtcd.h"
15
16
#include "third_party/SVT-AV1/convolve_avx2.h"
17
18
#include "aom_dsp/aom_dsp_common.h"
19
#include "aom_dsp/x86/convolve_avx2.h"
20
#include "aom_dsp/x86/convolve_common_intrin.h"
21
#include "aom_dsp/x86/synonyms.h"
22
23
static AOM_INLINE void av1_convolve_y_sr_general_avx2(
24
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
25
0
    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) {
26
  // right shift is F-1 because we are already dividing
27
  // filter co-efficients by 2
28
0
  const int right_shift_bits = (FILTER_BITS - 1);
29
0
  __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
30
0
  __m256i right_shift_const = _mm256_set1_epi16((1 << right_shift_bits) >> 1);
31
32
0
  __m256i coeffs[6], s[12];
33
0
  __m128i d[10];
34
35
0
  int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
36
37
0
  if (vert_tap == 6)
38
0
    prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
39
0
  else if (vert_tap == 12) {
40
0
    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs);
41
0
  } else {
42
0
    prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
43
0
  }
44
45
  // vert_filt as 4 tap
46
0
  if (vert_tap == 4) {
47
0
    const int fo_vert = 1;
48
0
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
49
0
    for (int j = 0; j < w; j += 16) {
50
0
      const uint8_t *data = &src_ptr[j];
51
0
      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
52
0
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
53
0
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
54
0
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
55
0
      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
56
57
      // Load lines a and b. Line a to lower 128, line b to upper 128
58
0
      const __m256i src_01a = _mm256_permute2x128_si256(
59
0
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
60
61
0
      const __m256i src_12a = _mm256_permute2x128_si256(
62
0
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
63
64
0
      const __m256i src_23a = _mm256_permute2x128_si256(
65
0
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
66
67
0
      const __m256i src_34a = _mm256_permute2x128_si256(
68
0
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
69
70
0
      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
71
0
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
72
73
0
      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
74
0
      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
75
76
0
      for (i = 0; i < h; i += 2) {
77
0
        data = &src_ptr[i * src_stride + j];
78
0
        d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
79
0
        const __m256i src_45a = _mm256_permute2x128_si256(
80
0
            _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
81
82
0
        d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
83
0
        const __m256i src_56a = _mm256_permute2x128_si256(
84
0
            _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);
85
86
0
        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
87
0
        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
88
89
0
        const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
90
        /* rounding code */
91
        // shift by F - 1
92
0
        const __m256i res_16b_lo = _mm256_sra_epi16(
93
0
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
94
        // 8 bit conversion and saturation to uint8
95
0
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
96
97
0
        if (w - j > 8) {
98
0
          const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);
99
100
          /* rounding code */
101
          // shift by F - 1
102
0
          const __m256i res_16b_hi = _mm256_sra_epi16(
103
0
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
104
          // 8 bit conversion and saturation to uint8
105
0
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
106
107
0
          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
108
109
0
          const __m128i res_0 = _mm256_castsi256_si128(res_a);
110
0
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
111
112
0
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
113
0
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
114
0
                           res_1);
115
0
        } else {
116
0
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
117
0
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
118
0
          if (w - j > 4) {
119
0
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
120
0
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
121
0
                             res_1);
122
0
          } else if (w - j > 2) {
123
0
            xx_storel_32(&dst[i * dst_stride + j], res_0);
124
0
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
125
0
          } else {
126
0
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
127
0
            __m128i *const p_1 =
128
0
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
129
0
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
130
0
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
131
0
          }
132
0
        }
133
0
        s[0] = s[1];
134
0
        s[1] = s[2];
135
136
0
        s[3] = s[4];
137
0
        s[4] = s[5];
138
0
      }
139
0
    }
140
0
  } else if (vert_tap == 6) {
141
0
    const int fo_vert = vert_tap / 2 - 1;
142
0
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
143
144
0
    for (int j = 0; j < w; j += 16) {
145
0
      const uint8_t *data = &src_ptr[j];
146
0
      __m256i src6;
147
148
0
      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
149
0
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
150
0
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
151
0
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
152
      // Load lines a and b. Line a to lower 128, line b to upper 128
153
0
      const __m256i src_01a = _mm256_permute2x128_si256(
154
0
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
155
156
0
      const __m256i src_12a = _mm256_permute2x128_si256(
157
0
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
158
159
0
      const __m256i src_23a = _mm256_permute2x128_si256(
160
0
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
161
162
0
      src6 = _mm256_castsi128_si256(
163
0
          _mm_loadu_si128((__m128i *)(data + 4 * src_stride)));
164
0
      const __m256i src_34a =
165
0
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20);
166
167
0
      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
168
0
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
169
170
0
      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
171
0
      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
172
173
0
      for (i = 0; i < h; i += 2) {
174
0
        data = &src_ptr[i * src_stride + j];
175
0
        const __m256i src_45a = _mm256_permute2x128_si256(
176
0
            src6,
177
0
            _mm256_castsi128_si256(
178
0
                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
179
0
            0x20);
180
181
0
        src6 = _mm256_castsi128_si256(
182
0
            _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
183
0
        const __m256i src_56a = _mm256_permute2x128_si256(
184
0
            _mm256_castsi128_si256(
185
0
                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
186
0
            src6, 0x20);
187
188
0
        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
189
0
        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
190
191
0
        const __m256i res_lo = convolve_lowbd_6tap(s, coeffs);
192
193
        /* rounding code */
194
        // shift by F - 1
195
0
        const __m256i res_16b_lo = _mm256_sra_epi16(
196
0
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
197
        // 8 bit conversion and saturation to uint8
198
0
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
199
200
0
        if (w - j > 8) {
201
0
          const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs);
202
203
          /* rounding code */
204
          // shift by F - 1
205
0
          const __m256i res_16b_hi = _mm256_sra_epi16(
206
0
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
207
          // 8 bit conversion and saturation to uint8
208
0
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
209
210
0
          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
211
212
0
          const __m128i res_0 = _mm256_castsi256_si128(res_a);
213
0
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
214
215
0
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
216
0
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
217
0
                           res_1);
218
0
        } else {
219
0
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
220
0
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
221
0
          if (w - j > 4) {
222
0
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
223
0
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
224
0
                             res_1);
225
0
          } else if (w - j > 2) {
226
0
            xx_storel_32(&dst[i * dst_stride + j], res_0);
227
0
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
228
0
          } else {
229
0
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
230
0
            __m128i *const p_1 =
231
0
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
232
0
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
233
0
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
234
0
          }
235
0
        }
236
0
        s[0] = s[1];
237
0
        s[1] = s[2];
238
0
        s[3] = s[4];
239
0
        s[4] = s[5];
240
0
      }
241
0
    }
242
0
  } else if (vert_tap == 12) {  // vert_tap == 12
243
0
    const int fo_vert = filter_params_y->taps / 2 - 1;
244
0
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
245
0
    const __m256i v_zero = _mm256_setzero_si256();
246
0
    right_shift = _mm_cvtsi32_si128(FILTER_BITS);
247
0
    right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1);
248
249
0
    for (int j = 0; j < w; j += 8) {
250
0
      const uint8_t *data = &src_ptr[j];
251
0
      __m256i src10;
252
253
0
      d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
254
0
      d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
255
0
      d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
256
0
      d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
257
0
      d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
258
0
      d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride));
259
0
      d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
260
0
      d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride));
261
0
      d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
262
0
      d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride));
263
      // Load lines a and b. Line a to lower 128, line b to upper 128
264
0
      const __m256i src_01a = _mm256_permute2x128_si256(
265
0
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
266
267
0
      const __m256i src_12a = _mm256_permute2x128_si256(
268
0
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
269
270
0
      const __m256i src_23a = _mm256_permute2x128_si256(
271
0
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
272
273
0
      const __m256i src_34a = _mm256_permute2x128_si256(
274
0
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
275
276
0
      const __m256i src_45a = _mm256_permute2x128_si256(
277
0
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
278
279
0
      const __m256i src_56a = _mm256_permute2x128_si256(
280
0
          _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20);
281
282
0
      const __m256i src_67a = _mm256_permute2x128_si256(
283
0
          _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20);
284
285
0
      const __m256i src_78a = _mm256_permute2x128_si256(
286
0
          _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20);
287
288
0
      const __m256i src_89a = _mm256_permute2x128_si256(
289
0
          _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20);
290
291
0
      src10 = _mm256_castsi128_si256(
292
0
          _mm_loadl_epi64((__m128i *)(data + 10 * src_stride)));
293
0
      const __m256i src_910a =
294
0
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20);
295
296
0
      const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero);
297
0
      const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, v_zero);
298
0
      const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero);
299
0
      const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero);
300
0
      const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero);
301
0
      const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero);
302
0
      const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero);
303
0
      const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero);
304
0
      const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero);
305
0
      const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero);
306
307
0
      s[0] = _mm256_unpacklo_epi16(src_01, src_12);
308
0
      s[1] = _mm256_unpacklo_epi16(src_23, src_34);
309
0
      s[2] = _mm256_unpacklo_epi16(src_45, src_56);
310
0
      s[3] = _mm256_unpacklo_epi16(src_67, src_78);
311
0
      s[4] = _mm256_unpacklo_epi16(src_89, src_910);
312
313
0
      s[6] = _mm256_unpackhi_epi16(src_01, src_12);
314
0
      s[7] = _mm256_unpackhi_epi16(src_23, src_34);
315
0
      s[8] = _mm256_unpackhi_epi16(src_45, src_56);
316
0
      s[9] = _mm256_unpackhi_epi16(src_67, src_78);
317
0
      s[10] = _mm256_unpackhi_epi16(src_89, src_910);
318
319
0
      for (i = 0; i < h; i += 2) {
320
0
        data = &src_ptr[i * src_stride + j];
321
0
        const __m256i src_1011a = _mm256_permute2x128_si256(
322
0
            src10,
323
0
            _mm256_castsi128_si256(
324
0
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
325
0
            0x20);
326
327
0
        src10 = _mm256_castsi128_si256(
328
0
            _mm_loadl_epi64((__m128i *)(data + 12 * src_stride)));
329
330
0
        const __m256i src_1112a = _mm256_permute2x128_si256(
331
0
            _mm256_castsi128_si256(
332
0
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
333
0
            src10, 0x20);
334
335
0
        const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero);
336
0
        const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero);
337
338
0
        s[5] = _mm256_unpacklo_epi16(src_1011, src_1112);
339
0
        s[11] = _mm256_unpackhi_epi16(src_1011, src_1112);
340
341
0
        const __m256i res_lo = convolve_12taps(s, coeffs);
342
343
0
        const __m256i res_32b_lo = _mm256_sra_epi32(
344
0
            _mm256_add_epi32(res_lo, right_shift_const), right_shift);
345
        // 8 bit conversion and saturation to uint8
346
0
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
347
0
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
348
349
0
        if (w - j > 4) {
350
0
          const __m256i res_hi = convolve_12taps(s + 6, coeffs);
351
352
0
          const __m256i res_32b_hi = _mm256_sra_epi32(
353
0
              _mm256_add_epi32(res_hi, right_shift_const), right_shift);
354
0
          __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi);
355
          // 8 bit conversion and saturation to uint8
356
0
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
357
358
0
          __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi);
359
360
0
          const __m128i res_0 = _mm256_extracti128_si256(res_a, 0);
361
0
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
362
363
0
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
364
0
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
365
0
                           res_1);
366
0
        } else {
367
0
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
368
0
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
369
0
          if (w - j > 2) {
370
0
            *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0);
371
0
            *(int *)&dst[i * dst_stride + j + dst_stride] =
372
0
                _mm_cvtsi128_si32(res_1);
373
0
          } else {
374
0
            *(uint16_t *)&dst[i * dst_stride + j] =
375
0
                (uint16_t)_mm_cvtsi128_si32(res_0);
376
0
            *(uint16_t *)&dst[i * dst_stride + j + dst_stride] =
377
0
                (uint16_t)_mm_cvtsi128_si32(res_1);
378
0
          }
379
0
        }
380
0
        s[0] = s[1];
381
0
        s[1] = s[2];
382
0
        s[2] = s[3];
383
0
        s[3] = s[4];
384
0
        s[4] = s[5];
385
386
0
        s[6] = s[7];
387
0
        s[7] = s[8];
388
0
        s[8] = s[9];
389
0
        s[9] = s[10];
390
0
        s[10] = s[11];
391
0
      }
392
0
    }
393
0
  } else {
394
0
    const int fo_vert = filter_params_y->taps / 2 - 1;
395
0
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
396
397
0
    for (int j = 0; j < w; j += 16) {
398
0
      const uint8_t *data = &src_ptr[j];
399
0
      __m256i src6;
400
401
0
      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
402
0
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
403
0
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
404
0
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
405
0
      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
406
0
      d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
407
      // Load lines a and b. Line a to lower 128, line b to upper 128
408
0
      const __m256i src_01a = _mm256_permute2x128_si256(
409
0
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
410
411
0
      const __m256i src_12a = _mm256_permute2x128_si256(
412
0
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
413
414
0
      const __m256i src_23a = _mm256_permute2x128_si256(
415
0
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
416
417
0
      const __m256i src_34a = _mm256_permute2x128_si256(
418
0
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
419
420
0
      const __m256i src_45a = _mm256_permute2x128_si256(
421
0
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
422
423
0
      src6 = _mm256_castsi128_si256(
424
0
          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
425
0
      const __m256i src_56a =
426
0
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);
427
428
0
      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
429
0
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
430
0
      s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
431
432
0
      s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
433
0
      s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
434
0
      s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
435
436
0
      for (i = 0; i < h; i += 2) {
437
0
        data = &src_ptr[i * src_stride + j];
438
0
        const __m256i src_67a = _mm256_permute2x128_si256(
439
0
            src6,
440
0
            _mm256_castsi128_si256(
441
0
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
442
0
            0x20);
443
444
0
        src6 = _mm256_castsi128_si256(
445
0
            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
446
0
        const __m256i src_78a = _mm256_permute2x128_si256(
447
0
            _mm256_castsi128_si256(
448
0
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
449
0
            src6, 0x20);
450
451
0
        s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
452
0
        s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
453
454
0
        const __m256i res_lo = convolve_lowbd(s, coeffs);
455
456
        /* rounding code */
457
        // shift by F - 1
458
0
        const __m256i res_16b_lo = _mm256_sra_epi16(
459
0
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
460
        // 8 bit conversion and saturation to uint8
461
0
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
462
463
0
        if (w - j > 8) {
464
0
          const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
465
466
          /* rounding code */
467
          // shift by F - 1
468
0
          const __m256i res_16b_hi = _mm256_sra_epi16(
469
0
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
470
          // 8 bit conversion and saturation to uint8
471
0
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
472
473
0
          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
474
475
0
          const __m128i res_0 = _mm256_castsi256_si128(res_a);
476
0
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
477
478
0
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
479
0
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
480
0
                           res_1);
481
0
        } else {
482
0
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
483
0
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
484
0
          if (w - j > 4) {
485
0
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
486
0
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
487
0
                             res_1);
488
0
          } else if (w - j > 2) {
489
0
            xx_storel_32(&dst[i * dst_stride + j], res_0);
490
0
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
491
0
          } else {
492
0
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
493
0
            __m128i *const p_1 =
494
0
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
495
0
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
496
0
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
497
0
          }
498
0
        }
499
0
        s[0] = s[1];
500
0
        s[1] = s[2];
501
0
        s[2] = s[3];
502
503
0
        s[4] = s[5];
504
0
        s[5] = s[6];
505
0
        s[6] = s[7];
506
0
      }
507
0
    }
508
0
  }
509
0
}
510
511
void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride,
512
                            uint8_t *dst, int32_t dst_stride, int32_t w,
513
                            int32_t h,
514
                            const InterpFilterParams *filter_params_y,
515
1.39M
                            const int32_t subpel_y_q4) {
516
1.39M
  const int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
517
518
1.39M
  if (vert_tap == 12) {
519
0
    av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
520
0
                                   filter_params_y, subpel_y_q4);
521
1.39M
  } else {
522
1.39M
    av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
523
1.39M
                                       filter_params_y, subpel_y_q4);
524
1.39M
  }
525
1.39M
}
526
527
static AOM_INLINE void av1_convolve_x_sr_general_avx2(
528
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
529
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
530
0
    ConvolveParams *conv_params) {
531
0
  const int bits = FILTER_BITS - conv_params->round_0;
532
0
  const __m128i round_shift = _mm_cvtsi32_si128(bits);
533
0
  __m256i round_0_const =
534
0
      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
535
0
  __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
536
0
  __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
537
0
  int i, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
538
539
0
  assert(bits >= 0);
540
0
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
541
0
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
542
0
  assert(conv_params->round_0 > 0);
543
544
0
  __m256i coeffs[6], filt[4];
545
0
  filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
546
0
  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
547
548
0
  if (horiz_tap == 6)
549
0
    prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs);
550
0
  else if (horiz_tap == 12) {
551
0
    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs);
552
0
  } else {
553
0
    prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
554
0
  }
555
556
  // horz_filt as 4 tap
557
0
  if (horiz_tap == 4) {
558
0
    const int fo_horiz = 1;
559
0
    const uint8_t *const src_ptr = src - fo_horiz;
560
0
    if (w <= 8) {
561
0
      for (i = 0; i < h; i += 2) {
562
0
        const __m256i data = _mm256_permute2x128_si256(
563
0
            _mm256_castsi128_si256(
564
0
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
565
0
            _mm256_castsi128_si256(_mm_loadu_si128(
566
0
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
567
0
            0x20);
568
569
0
        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
570
571
0
        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
572
0
                                   round_0_shift);
573
574
0
        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
575
0
                                   round_shift);
576
577
        /* rounding code */
578
        // 8 bit conversion and saturation to uint8
579
0
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
580
581
0
        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
582
0
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
583
584
0
        if (w > 4) {
585
0
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
586
0
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
587
0
        } else if (w > 2) {
588
0
          xx_storel_32(&dst[i * dst_stride], res_0);
589
0
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
590
0
        } else {
591
0
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
592
0
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
593
0
          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
594
0
          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
595
0
        }
596
0
      }
597
0
    } else {
598
0
      for (i = 0; i < h; ++i) {
599
0
        for (int j = 0; j < w; j += 16) {
600
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
601
          // 18 19 20 21 22 23
602
0
          const __m256i data = _mm256_inserti128_si256(
603
0
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
604
0
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
605
0
              1);
606
607
0
          __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
608
609
0
          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
610
0
                                     round_0_shift);
611
612
0
          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
613
0
                                     round_shift);
614
615
          /* rounding code */
616
          // 8 bit conversion and saturation to uint8
617
0
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
618
619
          // Store values into the destination buffer
620
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
621
0
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
622
0
          __m128i res = _mm256_castsi256_si128(res_8b);
623
0
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
624
0
        }
625
0
      }
626
0
    }
627
0
  } else if (horiz_tap == 6) {
628
0
    const int fo_horiz = horiz_tap / 2 - 1;
629
0
    const uint8_t *const src_ptr = src - fo_horiz;
630
0
    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
631
0
    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
632
633
0
    if (w <= 8) {
634
0
      for (i = 0; i < h; i += 2) {
635
0
        const __m256i data = _mm256_permute2x128_si256(
636
0
            _mm256_castsi128_si256(
637
0
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
638
0
            _mm256_castsi128_si256(_mm_loadu_si128(
639
0
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
640
0
            0x20);
641
642
0
        __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);
643
644
0
        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
645
0
                                   round_0_shift);
646
647
0
        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
648
0
                                   round_shift);
649
650
        /* rounding code */
651
        // 8 bit conversion and saturation to uint8
652
0
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
653
654
0
        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
655
0
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
656
0
        if (w > 4) {
657
0
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
658
0
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
659
0
        } else if (w > 2) {
660
0
          xx_storel_32(&dst[i * dst_stride], res_0);
661
0
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
662
0
        } else {
663
0
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
664
0
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
665
0
          *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
666
0
          *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
667
0
        }
668
0
      }
669
0
    } else {
670
0
      for (i = 0; i < h; ++i) {
671
0
        for (int j = 0; j < w; j += 16) {
672
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
673
          // 18 19 20 21 22 23
674
0
          const __m256i data = _mm256_inserti128_si256(
675
0
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
676
0
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
677
0
              1);
678
679
0
          __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);
680
681
0
          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
682
0
                                     round_0_shift);
683
684
0
          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
685
0
                                     round_shift);
686
687
          /* rounding code */
688
          // 8 bit conversion and saturation to uint8
689
0
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
690
691
          // Store values into the destination buffer
692
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
693
0
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
694
0
          __m128i res = _mm256_castsi256_si128(res_8b);
695
0
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
696
0
        }
697
0
      }
698
0
    }
699
0
  } else if (horiz_tap == 12) {  // horiz_tap == 12
700
0
    const int fo_horiz = filter_params_x->taps / 2 - 1;
701
0
    const uint8_t *const src_ptr = src - fo_horiz;
702
0
    const __m256i v_zero = _mm256_setzero_si256();
703
0
    round_0_const = _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1);
704
0
    round_const = _mm256_set1_epi32((1 << bits) >> 1);
705
0
    round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
706
0
    __m256i s[6];
707
708
0
    if (w <= 4) {
709
0
      for (i = 0; i < h; i += 2) {
710
0
        const __m256i data = _mm256_permute2x128_si256(
711
0
            _mm256_castsi128_si256(
712
0
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
713
0
            _mm256_castsi128_si256(_mm_loadu_si128(
714
0
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
715
0
            0x20);
716
        // row0 0..7 row1 0..7
717
0
        const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
718
        // row0 8..F row1 8..F
719
0
        const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
720
721
        // row0 00 00 01 01 .. 03 03 row1 00 00 01 01 .. 03 03
722
0
        const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
723
        // row0 04 04 .. 07 07 row1 04 04 .. 07 07
724
0
        const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
725
726
        // row0 08 08 09 09 .. 0B 0B row1 08 08 09 09 .. 0B 0B
727
0
        const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
728
        // row0 0C 0C .. 0F 0F row1 0C 0C .. 0F 0F
729
0
        const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
730
731
        // 00 01 01 02 02 03 03 04 10 11 11 12 12 13 13 14
732
0
        s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
733
        // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
734
0
        s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
735
        // 04 05 05 06 06 07 07 08 14 15 15 16 16 17 17 18
736
0
        s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
737
        // 06 07 07 08 08 09 09 0A 16 17 17 18 18 19 19 1A
738
0
        s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
739
        // 08 09 09 0A 0A 0B 0B 0C 18 19 19 1A 1A 1B 1B 1C
740
0
        s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
741
        // 0A 0B 0B 0C 0C 0D 0D 0E 1A 1B 1B 1C 1C 1D 1D 1E
742
0
        s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
743
744
0
        const __m256i res_lo = convolve_12taps(s, coeffs);
745
746
0
        __m256i res_32b_lo = _mm256_sra_epi32(
747
0
            _mm256_add_epi32(res_lo, round_0_const), round_0_shift);
748
749
        // 00 01 02 03 10 11 12 13
750
0
        res_32b_lo = _mm256_sra_epi32(_mm256_add_epi32(res_32b_lo, round_const),
751
0
                                      round_shift);
752
        // 8 bit conversion and saturation to uint8
753
        // 00 01 02 03 00 01 02 03 10 11 12 13 10 11 12 13
754
0
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
755
        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
756
        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
757
0
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
758
759
        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
760
0
        const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
761
        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
762
0
        const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
763
0
        if (w > 2) {
764
          // 00 01 02 03
765
0
          *(int *)&dst[i * dst_stride] = _mm_cvtsi128_si32(res_0);
766
          // 10 11 12 13
767
0
          *(int *)&dst[i * dst_stride + dst_stride] = _mm_cvtsi128_si32(res_1);
768
0
        } else {
769
          // 00 01
770
0
          *(uint16_t *)&dst[i * dst_stride] =
771
0
              (uint16_t)_mm_cvtsi128_si32(res_0);
772
          // 10 11
773
0
          *(uint16_t *)&dst[i * dst_stride + dst_stride] =
774
0
              (uint16_t)_mm_cvtsi128_si32(res_1);
775
0
        }
776
0
      }
777
0
    } else {
778
0
      for (i = 0; i < h; i++) {
779
0
        for (int j = 0; j < w; j += 8) {
780
0
          const __m256i data = _mm256_permute2x128_si256(
781
0
              _mm256_castsi128_si256(
782
0
                  _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
783
0
              _mm256_castsi128_si256(_mm_loadu_si128(
784
0
                  (__m128i *)(&src_ptr[i * src_stride + j + 4]))),
785
0
              0x20);
786
          // row0 0..7 4..B
787
0
          const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
788
          // row0 8..F C..13
789
0
          const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
790
791
          // row0 00 00 01 01 .. 03 03 04 04 05 05 .. 07 07
792
0
          const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
793
          // row0 04 04 .. 07 07 08 08 .. 0B 0B
794
0
          const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
795
796
          // row0 08 08 09 09 .. 0B 0B 0C 0C 0D 0D .. 0F 0F
797
0
          const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
798
          // row0 0C 0C 0D 0D .. 0F 0F 10 10 11 11 .. 13 13
799
0
          const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
800
801
0
          s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
802
0
          s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
803
0
          s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
804
0
          s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
805
0
          s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
806
0
          s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
807
808
0
          const __m256i res_lo = convolve_12taps(s, coeffs);
809
810
0
          __m256i res_32b_lo = _mm256_sra_epi32(
811
0
              _mm256_add_epi32(res_lo, round_0_const), round_0_shift);
812
813
0
          res_32b_lo = _mm256_sra_epi32(
814
0
              _mm256_add_epi32(res_32b_lo, round_const), round_shift);
815
          // 8 bit conversion and saturation to uint8
816
0
          __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
817
0
          __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
818
0
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
819
0
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
820
0
          *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0);
821
0
          *(int *)&dst[i * dst_stride + j + 4] = _mm_cvtsi128_si32(res_1);
822
0
        }
823
0
      }
824
0
    }
825
0
  } else {
826
0
    const int fo_horiz = filter_params_x->taps / 2 - 1;
827
0
    const uint8_t *const src_ptr = src - fo_horiz;
828
0
    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
829
0
    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
830
831
0
    if (w <= 8) {
832
0
      for (i = 0; i < h; i += 2) {
833
0
        const __m256i data = _mm256_permute2x128_si256(
834
0
            _mm256_castsi128_si256(
835
0
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
836
0
            _mm256_castsi128_si256(_mm_loadu_si128(
837
0
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
838
0
            0x20);
839
840
0
        __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
841
842
0
        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
843
0
                                   round_0_shift);
844
845
0
        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
846
0
                                   round_shift);
847
848
        /* rounding code */
849
        // 8 bit conversion and saturation to uint8
850
0
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
851
852
0
        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
853
0
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
854
0
        if (w > 4) {
855
0
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
856
0
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
857
0
        } else if (w > 2) {
858
0
          xx_storel_32(&dst[i * dst_stride], res_0);
859
0
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
860
0
        } else {
861
0
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
862
0
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
863
0
          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
864
0
          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
865
0
        }
866
0
      }
867
0
    } else {
868
0
      for (i = 0; i < h; ++i) {
869
0
        for (int j = 0; j < w; j += 16) {
870
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
871
          // 18 19 20 21 22 23
872
0
          const __m256i data = _mm256_inserti128_si256(
873
0
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
874
0
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
875
0
              1);
876
877
0
          __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
878
879
0
          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
880
0
                                     round_0_shift);
881
882
0
          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
883
0
                                     round_shift);
884
885
          /* rounding code */
886
          // 8 bit conversion and saturation to uint8
887
0
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
888
889
          // Store values into the destination buffer
890
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
891
0
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
892
0
          __m128i res = _mm256_castsi256_si128(res_8b);
893
0
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
894
0
        }
895
0
      }
896
0
    }
897
0
  }
898
0
}
899
900
void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
901
                            uint8_t *dst, int32_t dst_stride, int32_t w,
902
                            int32_t h,
903
                            const InterpFilterParams *filter_params_x,
904
                            const int32_t subpel_x_q4,
905
1.47M
                            ConvolveParams *conv_params) {
906
1.47M
  const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);
907
908
1.47M
  if (horz_tap == 12) {
909
0
    av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
910
0
                                   filter_params_x, subpel_x_q4, conv_params);
911
1.47M
  } else {
912
1.47M
    av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
913
1.47M
                                       filter_params_x, subpel_x_q4,
914
1.47M
                                       conv_params);
915
1.47M
  }
916
1.47M
}