Coverage Report

Created: 2026-03-31 06:59

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/av1/common/x86/convolve_avx2.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <immintrin.h>
13
14
#include "config/av1_rtcd.h"
15
16
#include "aom_dsp/aom_dsp_common.h"
17
#include "aom_dsp/x86/convolve_avx2.h"
18
#include "aom_dsp/x86/convolve_common_intrin.h"
19
#include "aom_dsp/x86/synonyms.h"
20
21
void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride,
22
                            uint8_t *dst, int32_t dst_stride, int32_t w,
23
                            int32_t h,
24
                            const InterpFilterParams *filter_params_y,
25
486k
                            const int32_t subpel_y_qn) {
26
486k
  __m128i coeffs_128[4];
27
486k
  __m256i coeffs[6];
28
486k
  int x = 0, y = h;
29
30
486k
  int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
31
486k
  assert(vert_tap == 2 || vert_tap == 4 || vert_tap == 6 || vert_tap == 8 ||
32
486k
         vert_tap == 12);
33
486k
  assert(!(w % 2));
34
486k
  assert(!(h % 2));
35
36
486k
  const int fo_vert = vert_tap / 2 - 1;
37
486k
  const uint8_t *const src_ptr = src - fo_vert * src_stride;
38
486k
  const uint8_t *data = src_ptr;
39
486k
  uint8_t *dst_ptr = dst;
40
41
486k
  if (vert_tap == 2) {
42
37.3k
    if (subpel_y_qn != 8) {
43
13.5k
      if (w <= 4) {
44
6.71k
        prepare_coeffs_2t_ssse3(filter_params_y, subpel_y_qn, coeffs_128);
45
6.71k
        __m128i d[2], res;
46
6.71k
        if (w == 2) {
47
1.67k
          d[0] = _mm_cvtsi32_si128(loadu_int16(data));
48
49
3.20k
          do {
50
3.20k
            convolve_y_2tap_2x2_ssse3(data, src_stride, coeffs_128, d, &res);
51
3.20k
            res = round_sr_y_ssse3(res);
52
3.20k
            pack_store_u8_2x2_sse2(res, dst_ptr, dst_stride);
53
54
3.20k
            dst_ptr += 2 * dst_stride;
55
3.20k
            data += 2 * src_stride;
56
3.20k
            y -= 2;
57
3.20k
          } while (y > 0);
58
5.04k
        } else {
59
5.04k
          assert(w == 4);
60
5.04k
          d[0] = _mm_cvtsi32_si128(loadu_int32(data));
61
62
14.8k
          do {
63
14.8k
            convolve_y_2tap_4x2_ssse3(data, src_stride, coeffs_128, d, &res);
64
14.8k
            res = round_sr_y_ssse3(res);
65
14.8k
            pack_store_u8_4x2_sse2(res, dst_ptr, dst_stride);
66
67
14.8k
            dst_ptr += 2 * dst_stride;
68
14.8k
            data += 2 * src_stride;
69
14.8k
            y -= 2;
70
14.8k
          } while (y > 0);
71
5.04k
        }
72
6.84k
      } else {
73
6.84k
        prepare_coeffs_2t_lowbd(filter_params_y, subpel_y_qn, coeffs);
74
75
6.84k
        if (w == 8) {
76
3.84k
          __m128i d[2];
77
3.84k
          d[0] = _mm_loadl_epi64((__m128i *)data);
78
79
12.5k
          do {
80
12.5k
            __m256i res;
81
12.5k
            convolve_y_2tap_8x2_avx2(data, src_stride, coeffs, d, &res);
82
12.5k
            round_pack_store_y_8x2_avx2(res, dst_ptr, dst_stride);
83
84
12.5k
            dst_ptr += 2 * dst_stride;
85
12.5k
            data += 2 * src_stride;
86
12.5k
            y -= 2;
87
88
12.5k
          } while (y > 0);
89
90
3.84k
        } else if (w == 16) {
91
1.86k
          __m128i d[2];
92
1.86k
          d[0] = _mm_loadu_si128((__m128i *)data);
93
94
12.8k
          do {
95
12.8k
            __m256i res[2];
96
12.8k
            convolve_y_2tap_16x2_avx2(data, src_stride, coeffs, d, res);
97
12.8k
            round_pack_store_y_16x2_avx2(res, dst_ptr, dst_stride);
98
99
12.8k
            dst_ptr += 2 * dst_stride;
100
12.8k
            data += 2 * src_stride;
101
12.8k
            y -= 2;
102
12.8k
          } while (y > 0);
103
104
1.86k
        } else {
105
1.13k
          assert(!(w % 32));
106
107
1.13k
          __m256i d[2];
108
1.68k
          do {
109
1.68k
            data = src_ptr + x;
110
1.68k
            dst_ptr = dst + x;
111
1.68k
            y = h;
112
113
1.68k
            d[0] = _mm256_loadu_si256((__m256i *)data);
114
115
39.6k
            do {
116
39.6k
              __m256i res[4];
117
39.6k
              convolve_y_2tap_32x2_avx2(data, src_stride, coeffs, d, res);
118
39.6k
              round_pack_store_y_32x2_avx2(res, dst_ptr, dst_stride);
119
120
39.6k
              dst_ptr += 2 * dst_stride;
121
39.6k
              data += 2 * src_stride;
122
39.6k
              y -= 2;
123
39.6k
            } while (y > 0);
124
125
1.68k
            x += 32;
126
1.68k
          } while (x < w);
127
1.13k
        }
128
6.84k
      }
129
23.7k
    } else {
130
23.7k
      if (w <= 16) {
131
22.7k
        __m128i s[2], res;
132
133
22.7k
        if (w == 2) {
134
4.81k
          s[0] = _mm_cvtsi32_si128(loadu_int16(data));
135
136
9.22k
          do {
137
9.22k
            s[1] = _mm_cvtsi32_si128(loadu_int16(data + src_stride));
138
9.22k
            res = _mm_avg_epu8(s[0], s[1]);
139
9.22k
            xx_storel_16(dst_ptr, res);
140
9.22k
            s[0] = _mm_cvtsi32_si128(loadu_int16(data + 2 * src_stride));
141
9.22k
            res = _mm_avg_epu8(s[1], s[0]);
142
9.22k
            xx_storel_16(dst_ptr + dst_stride, res);
143
144
9.22k
            data += 2 * src_stride;
145
9.22k
            dst_ptr += 2 * dst_stride;
146
9.22k
            y -= 2;
147
9.22k
          } while (y > 0);
148
17.9k
        } else if (w == 4) {
149
9.57k
          s[0] = _mm_cvtsi32_si128(loadu_int32(data));
150
151
26.4k
          do {
152
26.4k
            s[1] = _mm_cvtsi32_si128(loadu_int32(data + src_stride));
153
26.4k
            res = _mm_avg_epu8(s[0], s[1]);
154
26.4k
            xx_storel_32(dst_ptr, res);
155
26.4k
            s[0] = _mm_cvtsi32_si128(loadu_int32(data + 2 * src_stride));
156
26.4k
            res = _mm_avg_epu8(s[1], s[0]);
157
26.4k
            xx_storel_32(dst_ptr + dst_stride, res);
158
159
26.4k
            data += 2 * src_stride;
160
26.4k
            dst_ptr += 2 * dst_stride;
161
26.4k
            y -= 2;
162
26.4k
          } while (y > 0);
163
9.57k
        } else if (w == 8) {
164
6.32k
          s[0] = _mm_loadl_epi64((__m128i *)data);
165
166
21.1k
          do {
167
21.1k
            s[1] = _mm_loadl_epi64((__m128i *)(data + src_stride));
168
21.1k
            res = _mm_avg_epu8(s[0], s[1]);
169
21.1k
            _mm_storel_epi64((__m128i *)dst_ptr, res);
170
21.1k
            s[0] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
171
21.1k
            res = _mm_avg_epu8(s[1], s[0]);
172
21.1k
            _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res);
173
174
21.1k
            data += 2 * src_stride;
175
21.1k
            dst_ptr += 2 * dst_stride;
176
21.1k
            y -= 2;
177
21.1k
          } while (y > 0);
178
6.32k
        } else {
179
2.07k
          assert(w == 16);
180
181
2.07k
          s[0] = _mm_loadu_si128((__m128i *)data);
182
183
11.8k
          do {
184
11.8k
            s[1] = _mm_loadu_si128((__m128i *)(data + src_stride));
185
11.8k
            res = _mm_avg_epu8(s[0], s[1]);
186
11.8k
            _mm_storeu_si128((__m128i *)dst_ptr, res);
187
11.8k
            s[0] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
188
11.8k
            res = _mm_avg_epu8(s[1], s[0]);
189
11.8k
            _mm_storeu_si128((__m128i *)(dst_ptr + dst_stride), res);
190
191
11.8k
            data += 2 * src_stride;
192
11.8k
            dst_ptr += 2 * dst_stride;
193
11.8k
            y -= 2;
194
11.8k
          } while (y > 0);
195
2.07k
        }
196
22.7k
      } else {
197
982
        assert(!(w % 32));
198
199
982
        __m256i s[2], res;
200
1.45k
        do {
201
1.45k
          data = src_ptr + x;
202
1.45k
          dst_ptr = dst + x;
203
1.45k
          y = h;
204
205
1.45k
          s[0] = _mm256_loadu_si256((__m256i *)data);
206
207
37.1k
          do {
208
37.1k
            s[1] = _mm256_loadu_si256((__m256i *)(data + src_stride));
209
37.1k
            res = _mm256_avg_epu8(s[0], s[1]);
210
37.1k
            _mm256_storeu_si256((__m256i *)dst_ptr, res);
211
37.1k
            s[0] = _mm256_loadu_si256((__m256i *)(data + 2 * src_stride));
212
37.1k
            res = _mm256_avg_epu8(s[1], s[0]);
213
37.1k
            _mm256_storeu_si256((__m256i *)(dst_ptr + dst_stride), res);
214
215
37.1k
            data += 2 * src_stride;
216
37.1k
            dst_ptr += 2 * dst_stride;
217
37.1k
            y -= 2;
218
37.1k
          } while (y > 0);
219
220
1.45k
          x += 32;
221
1.45k
        } while (x < w);
222
982
      }
223
23.7k
    }
224
449k
  } else if (vert_tap == 4) {
225
237k
    if (w <= 4) {
226
113k
      prepare_coeffs_4t_ssse3(filter_params_y, subpel_y_qn, coeffs_128);
227
113k
      __m128i d[4], s[2];
228
229
113k
      if (w == 2) {
230
19.9k
        d[0] = _mm_cvtsi32_si128(loadu_int16(data + 0 * src_stride));
231
19.9k
        d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * src_stride));
232
19.9k
        d[2] = _mm_cvtsi32_si128(loadu_int16(data + 2 * src_stride));
233
234
19.9k
        const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]);
235
19.9k
        const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[2]);
236
237
19.9k
        s[0] = _mm_unpacklo_epi8(src_01a, src_12a);
238
33.6k
        do {
239
33.6k
          __m128i res;
240
33.6k
          convolve_y_4tap_2x2_ssse3(data, src_stride, coeffs_128, d, s, &res);
241
33.6k
          res = round_sr_y_ssse3(res);
242
33.6k
          pack_store_u8_2x2_sse2(res, dst_ptr, dst_stride);
243
244
33.6k
          dst_ptr += 2 * dst_stride;
245
33.6k
          data += 2 * src_stride;
246
33.6k
          y -= 2;
247
248
33.6k
          s[0] = s[1];
249
33.6k
        } while (y > 0);
250
251
93.9k
      } else {
252
93.9k
        assert(w == 4);
253
254
93.9k
        d[0] = _mm_cvtsi32_si128(loadu_int32(data + 0 * src_stride));
255
93.9k
        d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * src_stride));
256
93.9k
        d[2] = _mm_cvtsi32_si128(loadu_int32(data + 2 * src_stride));
257
258
93.9k
        const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]);
259
93.9k
        const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[2]);
260
261
93.9k
        s[0] = _mm_unpacklo_epi8(src_01a, src_12a);
262
185k
        do {
263
185k
          __m128i res;
264
185k
          convolve_y_4tap_4x2_ssse3(data, src_stride, coeffs_128, d, s, &res);
265
185k
          res = round_sr_y_ssse3(res);
266
185k
          pack_store_u8_4x2_sse2(res, dst_ptr, dst_stride);
267
268
185k
          dst_ptr += 2 * dst_stride;
269
185k
          data += 2 * src_stride;
270
185k
          y -= 2;
271
272
185k
          s[0] = s[1];
273
185k
        } while (y > 0);
274
93.9k
      }
275
123k
    } else {
276
123k
      prepare_coeffs_4t_lowbd(filter_params_y, subpel_y_qn, coeffs);
277
278
123k
      if (w == 8) {
279
81.8k
        __m128i d[4];
280
81.8k
        __m256i s[2];
281
282
81.8k
        d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
283
81.8k
        d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
284
81.8k
        d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
285
286
81.8k
        const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
287
81.8k
        const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]);
288
289
81.8k
        s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
290
160k
        do {
291
160k
          __m256i res;
292
160k
          convolve_y_4tap_8x2_avx2(data, src_stride, coeffs, d, s, &res);
293
160k
          round_pack_store_y_8x2_avx2(res, dst_ptr, dst_stride);
294
295
160k
          dst_ptr += 2 * dst_stride;
296
160k
          data += 2 * src_stride;
297
160k
          y -= 2;
298
299
160k
          s[0] = s[1];
300
160k
        } while (y > 0);
301
81.8k
      } else if (w == 16) {
302
37.6k
        __m128i d[4];
303
37.6k
        __m256i s[4];
304
305
37.6k
        d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
306
37.6k
        d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
307
37.6k
        d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
308
309
37.6k
        const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
310
37.6k
        const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]);
311
312
37.6k
        s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
313
37.6k
        s[2] = _mm256_unpackhi_epi8(src_01a, src_12a);
314
315
92.8k
        do {
316
92.8k
          __m256i res[2];
317
92.8k
          convolve_y_4tap_16x2_avx2(data, src_stride, coeffs, d, s, res);
318
92.8k
          round_pack_store_y_16x2_avx2(res, dst_ptr, dst_stride);
319
320
92.8k
          dst_ptr += 2 * dst_stride;
321
92.8k
          data += 2 * src_stride;
322
92.8k
          y -= 2;
323
324
92.8k
          s[0] = s[1];
325
92.8k
          s[2] = s[3];
326
92.8k
        } while (y > 0);
327
37.6k
      } else {
328
3.81k
        assert(!(w % 32));
329
330
3.81k
        __m256i d[4], s1[4], s2[4];
331
5.60k
        do {
332
5.60k
          data = src_ptr + x;
333
5.60k
          dst_ptr = dst + x;
334
5.60k
          y = h;
335
336
5.60k
          d[0] = _mm256_loadu_si256((__m256i *)(data + 0 * src_stride));
337
5.60k
          d[1] = _mm256_loadu_si256((__m256i *)(data + 1 * src_stride));
338
5.60k
          d[2] = _mm256_loadu_si256((__m256i *)(data + 2 * src_stride));
339
340
5.60k
          s1[0] = _mm256_unpacklo_epi8(d[0], d[1]);
341
5.60k
          s1[2] = _mm256_unpackhi_epi8(d[0], d[1]);
342
343
5.60k
          s2[0] = _mm256_unpacklo_epi8(d[1], d[2]);
344
5.60k
          s2[2] = _mm256_unpackhi_epi8(d[1], d[2]);
345
346
141k
          do {
347
141k
            __m256i res[4];
348
141k
            convolve_y_4tap_32x2_avx2(data, src_stride, coeffs, d, s1, s2, res);
349
141k
            round_pack_store_y_32x2_avx2(res, dst_ptr, dst_stride);
350
351
141k
            dst_ptr += 2 * dst_stride;
352
141k
            data += 2 * src_stride;
353
141k
            y -= 2;
354
355
141k
            s1[0] = s1[1];
356
141k
            s1[2] = s1[3];
357
358
141k
            s2[0] = s2[1];
359
141k
            s2[2] = s2[3];
360
141k
          } while (y > 0);
361
362
5.60k
          x += 32;
363
5.60k
        } while (x < w);
364
3.81k
      }
365
123k
    }
366
237k
  } else if (vert_tap == 6) {
367
200k
    if (w <= 4) {
368
61.9k
      prepare_coeffs_6t_ssse3(filter_params_y, subpel_y_qn, coeffs_128);
369
370
61.9k
      __m128i d[6], s[3];
371
61.9k
      if (w == 2) {
372
11.6k
        d[0] = _mm_cvtsi32_si128(loadu_int16(data + 0 * src_stride));
373
11.6k
        d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * src_stride));
374
11.6k
        d[2] = _mm_cvtsi32_si128(loadu_int16(data + 2 * src_stride));
375
11.6k
        d[3] = _mm_cvtsi32_si128(loadu_int16(data + 3 * src_stride));
376
11.6k
        d[4] = _mm_cvtsi32_si128(loadu_int16(data + 4 * src_stride));
377
378
11.6k
        const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]);
379
11.6k
        const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[2]);
380
11.6k
        const __m128i src_23a = _mm_unpacklo_epi16(d[2], d[3]);
381
11.6k
        const __m128i src_34a = _mm_unpacklo_epi16(d[3], d[4]);
382
383
11.6k
        s[0] = _mm_unpacklo_epi8(src_01a, src_12a);
384
11.6k
        s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
385
386
46.5k
        do {
387
46.5k
          __m128i res;
388
46.5k
          convolve_y_6tap_2x2_ssse3(data, src_stride, coeffs_128, d, s, &res);
389
46.5k
          res = round_sr_y_ssse3(res);
390
46.5k
          pack_store_u8_2x2_sse2(res, dst_ptr, dst_stride);
391
392
46.5k
          dst_ptr += 2 * dst_stride;
393
46.5k
          data += 2 * src_stride;
394
46.5k
          y -= 2;
395
396
46.5k
          s[0] = s[1];
397
46.5k
          s[1] = s[2];
398
46.5k
        } while (y > 0);
399
400
50.2k
      } else {
401
50.2k
        assert(w == 4);
402
50.2k
        d[0] = _mm_cvtsi32_si128(loadu_int32(data + 0 * src_stride));
403
50.2k
        d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * src_stride));
404
50.2k
        d[2] = _mm_cvtsi32_si128(loadu_int32(data + 2 * src_stride));
405
50.2k
        d[3] = _mm_cvtsi32_si128(loadu_int32(data + 3 * src_stride));
406
50.2k
        d[4] = _mm_cvtsi32_si128(loadu_int32(data + 4 * src_stride));
407
408
50.2k
        const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]);
409
50.2k
        const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[2]);
410
50.2k
        const __m128i src_23a = _mm_unpacklo_epi32(d[2], d[3]);
411
50.2k
        const __m128i src_34a = _mm_unpacklo_epi32(d[3], d[4]);
412
413
50.2k
        s[0] = _mm_unpacklo_epi8(src_01a, src_12a);
414
50.2k
        s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
415
416
274k
        do {
417
274k
          __m128i res;
418
274k
          convolve_y_6tap_4x2_ssse3(data, src_stride, coeffs_128, d, s, &res);
419
274k
          res = round_sr_y_ssse3(res);
420
274k
          pack_store_u8_4x2_sse2(res, dst_ptr, dst_stride);
421
422
274k
          dst_ptr += 2 * dst_stride;
423
274k
          data += 2 * src_stride;
424
274k
          y -= 2;
425
426
274k
          s[0] = s[1];
427
274k
          s[1] = s[2];
428
274k
        } while (y > 0);
429
50.2k
      }
430
138k
    } else {
431
138k
      prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
432
433
138k
      if (w == 8) {
434
66.7k
        __m128i d[6];
435
66.7k
        __m256i s[3];
436
437
66.7k
        d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
438
66.7k
        d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
439
66.7k
        d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
440
66.7k
        d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
441
66.7k
        d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
442
443
66.7k
        const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
444
66.7k
        const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]);
445
66.7k
        const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
446
66.7k
        const __m256i src_34a = _mm256_setr_m128i(d[3], d[4]);
447
448
66.7k
        s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
449
66.7k
        s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
450
451
382k
        do {
452
382k
          __m256i res;
453
382k
          convolve_y_6tap_8x2_avx2(data, src_stride, coeffs, d, s, &res);
454
382k
          round_pack_store_y_8x2_avx2(res, dst_ptr, dst_stride);
455
456
382k
          dst_ptr += 2 * dst_stride;
457
382k
          data += 2 * src_stride;
458
382k
          y -= 2;
459
460
382k
          s[0] = s[1];
461
382k
          s[1] = s[2];
462
382k
        } while (y > 0);
463
464
71.3k
      } else {
465
71.3k
        assert(!(w % 16));
466
467
71.3k
        __m128i d[6];
468
71.3k
        __m256i s[6];
469
101k
        do {
470
101k
          data = src_ptr + x;
471
101k
          dst_ptr = dst + x;
472
101k
          y = h;
473
474
101k
          d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
475
101k
          d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
476
101k
          d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
477
101k
          d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
478
101k
          d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
479
480
101k
          const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
481
101k
          const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]);
482
101k
          const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
483
101k
          const __m256i src_34a = _mm256_setr_m128i(d[3], d[4]);
484
485
101k
          s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
486
101k
          s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
487
488
101k
          s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
489
101k
          s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
490
491
1.24M
          do {
492
1.24M
            __m256i res[2];
493
1.24M
            convolve_y_6tap_16x2_avx2(data, src_stride, coeffs, d, s, res);
494
1.24M
            round_pack_store_y_16x2_avx2(res, dst_ptr, dst_stride);
495
496
1.24M
            dst_ptr += 2 * dst_stride;
497
1.24M
            data += 2 * src_stride;
498
1.24M
            y -= 2;
499
500
1.24M
            s[0] = s[1];
501
1.24M
            s[1] = s[2];
502
503
1.24M
            s[3] = s[4];
504
1.24M
            s[4] = s[5];
505
1.24M
          } while (y > 0);
506
507
101k
          x += 16;
508
101k
        } while (x < w);
509
71.3k
      }
510
138k
    }
511
200k
  } else if (vert_tap == 12) {  // vert_tap == 12
512
0
    __m128i d[12];
513
0
    __m256i s[12];
514
0
    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs);
515
0
    const __m256i v_zero = _mm256_setzero_si256();
516
0
    __m128i right_shift = _mm_cvtsi32_si128(FILTER_BITS);
517
0
    __m256i right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1);
518
519
0
    for (int j = 0; j < w; j += 8) {
520
0
      data = &src_ptr[j];
521
0
      __m256i src10;
522
523
0
      d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
524
0
      d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
525
0
      d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
526
0
      d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
527
0
      d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
528
0
      d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride));
529
0
      d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
530
0
      d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride));
531
0
      d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
532
0
      d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride));
533
      // Load lines a and b. Line a to lower 128, line b to upper 128
534
0
      const __m256i src_01a = _mm256_permute2x128_si256(
535
0
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
536
537
0
      const __m256i src_12a = _mm256_permute2x128_si256(
538
0
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
539
540
0
      const __m256i src_23a = _mm256_permute2x128_si256(
541
0
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
542
543
0
      const __m256i src_34a = _mm256_permute2x128_si256(
544
0
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
545
546
0
      const __m256i src_45a = _mm256_permute2x128_si256(
547
0
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
548
549
0
      const __m256i src_56a = _mm256_permute2x128_si256(
550
0
          _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20);
551
552
0
      const __m256i src_67a = _mm256_permute2x128_si256(
553
0
          _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20);
554
555
0
      const __m256i src_78a = _mm256_permute2x128_si256(
556
0
          _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20);
557
558
0
      const __m256i src_89a = _mm256_permute2x128_si256(
559
0
          _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20);
560
561
0
      src10 = _mm256_castsi128_si256(
562
0
          _mm_loadl_epi64((__m128i *)(data + 10 * src_stride)));
563
0
      const __m256i src_910a =
564
0
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20);
565
566
0
      const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero);
567
0
      const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, v_zero);
568
0
      const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero);
569
0
      const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero);
570
0
      const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero);
571
0
      const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero);
572
0
      const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero);
573
0
      const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero);
574
0
      const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero);
575
0
      const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero);
576
577
0
      s[0] = _mm256_unpacklo_epi16(src_01, src_12);
578
0
      s[1] = _mm256_unpacklo_epi16(src_23, src_34);
579
0
      s[2] = _mm256_unpacklo_epi16(src_45, src_56);
580
0
      s[3] = _mm256_unpacklo_epi16(src_67, src_78);
581
0
      s[4] = _mm256_unpacklo_epi16(src_89, src_910);
582
583
0
      s[6] = _mm256_unpackhi_epi16(src_01, src_12);
584
0
      s[7] = _mm256_unpackhi_epi16(src_23, src_34);
585
0
      s[8] = _mm256_unpackhi_epi16(src_45, src_56);
586
0
      s[9] = _mm256_unpackhi_epi16(src_67, src_78);
587
0
      s[10] = _mm256_unpackhi_epi16(src_89, src_910);
588
589
0
      for (i = 0; i < h; i += 2) {
590
0
        data = &src_ptr[i * src_stride + j];
591
0
        const __m256i src_1011a = _mm256_permute2x128_si256(
592
0
            src10,
593
0
            _mm256_castsi128_si256(
594
0
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
595
0
            0x20);
596
597
0
        src10 = _mm256_castsi128_si256(
598
0
            _mm_loadl_epi64((__m128i *)(data + 12 * src_stride)));
599
600
0
        const __m256i src_1112a = _mm256_permute2x128_si256(
601
0
            _mm256_castsi128_si256(
602
0
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
603
0
            src10, 0x20);
604
605
0
        const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero);
606
0
        const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero);
607
608
0
        s[5] = _mm256_unpacklo_epi16(src_1011, src_1112);
609
0
        s[11] = _mm256_unpackhi_epi16(src_1011, src_1112);
610
611
0
        const __m256i res_lo = convolve_12taps(s, coeffs);
612
613
0
        const __m256i res_32b_lo = _mm256_sra_epi32(
614
0
            _mm256_add_epi32(res_lo, right_shift_const), right_shift);
615
        // 8 bit conversion and saturation to uint8
616
0
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
617
0
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
618
619
0
        if (w - j > 4) {
620
0
          const __m256i res_hi = convolve_12taps(s + 6, coeffs);
621
622
0
          const __m256i res_32b_hi = _mm256_sra_epi32(
623
0
              _mm256_add_epi32(res_hi, right_shift_const), right_shift);
624
0
          __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi);
625
          // 8 bit conversion and saturation to uint8
626
0
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
627
628
0
          __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi);
629
630
0
          const __m128i res_0 = _mm256_extracti128_si256(res_a, 0);
631
0
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
632
633
0
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
634
0
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
635
0
                           res_1);
636
0
        } else {
637
0
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
638
0
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
639
0
          if (w - j > 2) {
640
0
            xx_storel_32(&dst[i * dst_stride + j], res_0);
641
0
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
642
0
          } else {
643
0
            xx_storel_16(&dst[i * dst_stride + j], res_0);
644
0
            xx_storel_16(&dst[i * dst_stride + j + dst_stride], res_1);
645
0
          }
646
0
        }
647
0
        s[0] = s[1];
648
0
        s[1] = s[2];
649
0
        s[2] = s[3];
650
0
        s[3] = s[4];
651
0
        s[4] = s[5];
652
653
0
        s[6] = s[7];
654
0
        s[7] = s[8];
655
0
        s[8] = s[9];
656
0
        s[9] = s[10];
657
0
        s[10] = s[11];
658
0
      }
659
0
    }
660
11.8k
  } else {
661
11.8k
    assert(vert_tap == 8);
662
663
11.8k
    if (w <= 4) {
664
5.08k
      prepare_coeffs_ssse3(filter_params_y, subpel_y_qn, coeffs_128);
665
666
5.08k
      __m128i d[8], s[4], res;
667
5.08k
      if (w == 2) {
668
1.11k
        d[0] = _mm_cvtsi32_si128(loadu_int16(data + 0 * src_stride));
669
1.11k
        d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * src_stride));
670
1.11k
        d[2] = _mm_cvtsi32_si128(loadu_int16(data + 2 * src_stride));
671
1.11k
        d[3] = _mm_cvtsi32_si128(loadu_int16(data + 3 * src_stride));
672
1.11k
        d[4] = _mm_cvtsi32_si128(loadu_int16(data + 4 * src_stride));
673
1.11k
        d[5] = _mm_cvtsi32_si128(loadu_int16(data + 5 * src_stride));
674
1.11k
        d[6] = _mm_cvtsi32_si128(loadu_int16(data + 6 * src_stride));
675
676
1.11k
        const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]);
677
1.11k
        const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[2]);
678
1.11k
        const __m128i src_23a = _mm_unpacklo_epi16(d[2], d[3]);
679
1.11k
        const __m128i src_34a = _mm_unpacklo_epi16(d[3], d[4]);
680
1.11k
        const __m128i src_45a = _mm_unpacklo_epi16(d[4], d[5]);
681
1.11k
        const __m128i src_56a = _mm_unpacklo_epi16(d[5], d[6]);
682
683
1.11k
        s[0] = _mm_unpacklo_epi8(src_01a, src_12a);
684
1.11k
        s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
685
1.11k
        s[2] = _mm_unpacklo_epi8(src_45a, src_56a);
686
687
4.46k
        do {
688
4.46k
          convolve_y_8tap_2x2_ssse3(data, src_stride, coeffs_128, d, s, &res);
689
4.46k
          res = round_sr_y_ssse3(res);
690
4.46k
          pack_store_u8_2x2_sse2(res, dst_ptr, dst_stride);
691
692
4.46k
          dst_ptr += 2 * dst_stride;
693
4.46k
          data += 2 * src_stride;
694
4.46k
          y -= 2;
695
696
4.46k
          s[0] = s[1];
697
4.46k
          s[1] = s[2];
698
4.46k
          s[2] = s[3];
699
4.46k
        } while (y > 0);
700
701
3.97k
      } else {
702
3.97k
        assert(w == 4);
703
704
3.97k
        d[0] = _mm_cvtsi32_si128(loadu_int32(data + 0 * src_stride));
705
3.97k
        d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * src_stride));
706
3.97k
        d[2] = _mm_cvtsi32_si128(loadu_int32(data + 2 * src_stride));
707
3.97k
        d[3] = _mm_cvtsi32_si128(loadu_int32(data + 3 * src_stride));
708
3.97k
        d[4] = _mm_cvtsi32_si128(loadu_int32(data + 4 * src_stride));
709
3.97k
        d[5] = _mm_cvtsi32_si128(loadu_int32(data + 5 * src_stride));
710
3.97k
        d[6] = _mm_cvtsi32_si128(loadu_int32(data + 6 * src_stride));
711
712
3.97k
        const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]);
713
3.97k
        const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[2]);
714
3.97k
        const __m128i src_23a = _mm_unpacklo_epi32(d[2], d[3]);
715
3.97k
        const __m128i src_34a = _mm_unpacklo_epi32(d[3], d[4]);
716
3.97k
        const __m128i src_45a = _mm_unpacklo_epi32(d[4], d[5]);
717
3.97k
        const __m128i src_56a = _mm_unpacklo_epi32(d[5], d[6]);
718
719
3.97k
        s[0] = _mm_unpacklo_epi8(src_01a, src_12a);
720
3.97k
        s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
721
3.97k
        s[2] = _mm_unpacklo_epi8(src_45a, src_56a);
722
723
21.6k
        do {
724
21.6k
          convolve_y_8tap_4x2_ssse3(data, src_stride, coeffs_128, d, s, &res);
725
21.6k
          res = round_sr_y_ssse3(res);
726
21.6k
          pack_store_u8_4x2_sse2(res, dst_ptr, dst_stride);
727
728
21.6k
          dst_ptr += 2 * dst_stride;
729
21.6k
          data += 2 * src_stride;
730
21.6k
          y -= 2;
731
732
21.6k
          s[0] = s[1];
733
21.6k
          s[1] = s[2];
734
21.6k
          s[2] = s[3];
735
21.6k
        } while (y > 0);
736
3.97k
      }
737
6.74k
    } else {
738
6.74k
      prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
739
740
6.74k
      if (w == 8) {
741
3.43k
        __m128i d[8];
742
3.43k
        __m256i s[4];
743
744
3.43k
        d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
745
3.43k
        d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
746
3.43k
        d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
747
3.43k
        d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
748
3.43k
        d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
749
3.43k
        d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride));
750
3.43k
        d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
751
752
3.43k
        const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
753
3.43k
        const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]);
754
3.43k
        const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
755
3.43k
        const __m256i src_34a = _mm256_setr_m128i(d[3], d[4]);
756
3.43k
        const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]);
757
3.43k
        const __m256i src_56a = _mm256_setr_m128i(d[5], d[6]);
758
759
3.43k
        s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
760
3.43k
        s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
761
3.43k
        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
762
763
20.9k
        do {
764
20.9k
          __m256i res;
765
20.9k
          convolve_y_8tap_8x2_avx2(data, src_stride, coeffs, d, s, &res);
766
20.9k
          round_pack_store_y_8x2_avx2(res, dst_ptr, dst_stride);
767
768
20.9k
          dst_ptr += 2 * dst_stride;
769
20.9k
          data += 2 * src_stride;
770
20.9k
          y -= 2;
771
772
20.9k
          s[0] = s[1];
773
20.9k
          s[1] = s[2];
774
20.9k
          s[2] = s[3];
775
20.9k
        } while (y > 0);
776
777
3.43k
      } else {
778
3.31k
        assert(!(w % 16));
779
780
3.31k
        __m128i d[8];
781
3.31k
        __m256i s[8];
782
5.39k
        do {
783
5.39k
          data = src_ptr + x;
784
5.39k
          dst_ptr = dst + x;
785
5.39k
          y = h;
786
787
5.39k
          d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
788
5.39k
          d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
789
5.39k
          d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
790
5.39k
          d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
791
5.39k
          d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
792
5.39k
          d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
793
5.39k
          d[6] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
794
795
5.39k
          const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
796
5.39k
          const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]);
797
5.39k
          const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
798
5.39k
          const __m256i src_34a = _mm256_setr_m128i(d[3], d[4]);
799
5.39k
          const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]);
800
5.39k
          const __m256i src_56a = _mm256_setr_m128i(d[5], d[6]);
801
802
5.39k
          s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
803
5.39k
          s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
804
5.39k
          s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
805
806
5.39k
          s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
807
5.39k
          s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
808
5.39k
          s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
809
810
79.4k
          do {
811
79.4k
            __m256i res[2];
812
79.4k
            convolve_y_8tap_16x2_avx2(data, src_stride, coeffs, d, s, res);
813
79.4k
            round_pack_store_y_16x2_avx2(res, dst_ptr, dst_stride);
814
815
79.4k
            dst_ptr += 2 * dst_stride;
816
79.4k
            data += 2 * src_stride;
817
79.4k
            y -= 2;
818
819
79.4k
            s[0] = s[1];
820
79.4k
            s[1] = s[2];
821
79.4k
            s[2] = s[3];
822
823
79.4k
            s[4] = s[5];
824
79.4k
            s[5] = s[6];
825
79.4k
            s[6] = s[7];
826
79.4k
          } while (y > 0);
827
828
5.39k
          x += 16;
829
5.39k
        } while (x < w);
830
3.31k
      }
831
6.74k
    }
832
11.8k
  }
833
486k
}
834
835
/*
 * Horizontal single-reference ("sr") convolution for 8-bit pixels.
 *
 * Dispatches first on the effective filter tap count (2/4/6/8/12), then on
 * the block width, so each size runs the widest SIMD kernel available:
 *  - widths 2/4 use 128-bit SSSE3 kernels that filter two rows per iteration;
 *  - widths 8/16 use 256-bit kernels, still two rows per iteration;
 *  - widths 32/64/128 process one row per iteration in 32-pixel chunks
 *    (SECOND/THIRD/FOURTH_32_BLK are the 32-byte chunk offsets).
 * The 2-tap path with subpel_x_qn == 8 (the symmetric mid position) needs no
 * multiplies: it reduces to byte averaging of each pixel with its right
 * neighbour via _mm_avg_epu8 / load_avg_store_2tap_32_avx2.
 *
 * src, src_stride:  source pixels and stride (in pixels).
 * dst, dst_stride:  destination pixels and stride (in pixels).
 * w, h:             block size; h is even, w is one of the supported widths.
 * filter_params_x:  horizontal interpolation filter description.
 * subpel_x_qn:      horizontal subpel position (1/16-pel units).
 * conv_params:      supplies round_0/round_1; round_0 feeds the first
 *                   rounding stage, bits = FILTER_BITS - round_0 the second.
 */
void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride, int32_t w,
                            int32_t h,
                            const InterpFilterParams *filter_params_x,
                            const int32_t subpel_x_qn,
                            ConvolveParams *conv_params) {
  const int bits = FILTER_BITS - conv_params->round_0;
  int i, j, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
  assert(conv_params->round_0 > 0);

  assert(horiz_tap == 2 || horiz_tap == 4 || horiz_tap == 6 || horiz_tap == 8 ||
         horiz_tap == 12);
  assert((!(w % 2)) || (w <= 128));
  assert((h % 2) == 0);

  __m256i coeffs[6] = { 0 }, filt[4] = { 0 };
  __m128i coeffs_128[4] = { 0 };

  i = 0;
  // horz_filt as 4 tap
  if (horiz_tap == 4) {
    // since fo_horiz = 1
    const uint8_t *src_ptr = src - 1;
    if (w == 2) {
      // 2-wide: SSSE3 kernel filters two rows per iteration.
      prepare_coeffs_4t_ssse3(filter_params_x, subpel_x_qn, coeffs_128);
      do {
        const __m128i res =
            convolve_x_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
        const __m128i reg = round_sr_x_ssse3(res);
        pack_store_u8_2x2_sse2(reg, dst, dst_stride);
        src_ptr += 2 * src_stride;
        dst += 2 * dst_stride;
        h -= 2;
      } while (h);
    } else if (w == 4) {
      // 4-wide: same two-rows-per-iteration SSSE3 scheme, 4 pixels per row.
      prepare_coeffs_4t_ssse3(filter_params_x, subpel_x_qn, coeffs_128);
      do {
        const __m128i reg =
            convolve_x_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
        const __m128i res = round_sr_x_ssse3(reg);
        pack_store_u8_4x2_sse2(res, dst, dst_stride);
        src_ptr += 2 * src_stride;
        dst += 2 * dst_stride;
        h -= 2;
      } while (h);
    } else if (w == 8) {
      // 8-wide: two rows packed into one 256-bit register (one row per lane).
      // coeffs + 1 skips the unused outer-tap pair of the 8-entry layout
      // produced by prepare_coeffs_lowbd for this 4-tap filter.
      prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
      filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
      filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
      do {
        const __m256i data = _mm256_setr_m128i(
            _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride])),
            _mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride])));

        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);

        res_16b = round_sr_x_avx2(res_16b);

        /* rounding code */
        // 8 bit conversion and saturation to uint8
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        // Split the two lanes back into the two output rows.
        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);

        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        i += 2;
      } while (i < h);
    } else {
      // Wide blocks: one row at a time, 16 output pixels per inner iteration.
      assert(!(w % 16));
      prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
      filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
      filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
      do {
        j = 0;
        do {
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
          // 18 19 20 21 22 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);

          res_16b = round_sr_x_avx2(res_16b);

          /* rounding code */
          // 8 bit conversion and saturation to uint8
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
          j += 16;
        } while (j < w);
        i++;
      } while (i < h);
    }
  } else if (horiz_tap == 6) {
    // since (horiz_tap/2 - 1 == 2)
    const uint8_t *src_ptr = src - 2;
    // 6-tap needs three shuffle masks (filt[0..2]) from the shared table.
    prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs);
    filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
    filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
    if (w == 8) {
      // Two rows per iteration, one row per 128-bit lane.
      do {
        const __m256i data = _mm256_setr_m128i(
            _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride])),
            _mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride])));

        __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);

        res_16b = round_sr_x_avx2(res_16b);

        /* rounding code */
        // 8 bit conversion and saturation to uint8
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        i += 2;
      } while (i < h);
    } else if (w == 16) {
      // 16x2 helper does load + filter + round + store for two rows.
      do {
        __m256i data[2] = { 0 };

        load_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs, filt, data);
        round_pack_store_16x2_avx2(data, dst, dst_stride);
        src_ptr += 2 * src_stride;
        dst += 2 * dst_stride;
        h -= 2;
      } while (h);
    } else if (w == 32) {
      // One full 32-pixel row per iteration.
      do {
        convolve_sr_store_6tap_32_avx2(src_ptr, coeffs, filt, dst);
        src_ptr += src_stride;
        dst += dst_stride;
      } while ((--h) > 0);
    } else if (w == 64) {
      // Two 32-pixel chunks per row.
      do {
        convolve_sr_store_6tap_32_avx2(src_ptr, coeffs, filt, dst);
        convolve_sr_store_6tap_32_avx2(src_ptr + 32, coeffs, filt, dst + 32);
        src_ptr += src_stride;
        dst += dst_stride;
      } while ((--h) > 0);
    } else {
      // Four 32-pixel chunks per row.
      assert(w == 128);

      do {
        convolve_sr_store_6tap_32_avx2(src_ptr, coeffs, filt, dst);
        convolve_sr_store_6tap_32_avx2(src_ptr + SECOND_32_BLK, coeffs, filt,
                                       dst + SECOND_32_BLK);
        convolve_sr_store_6tap_32_avx2(src_ptr + THIRD_32_BLK, coeffs, filt,
                                       dst + THIRD_32_BLK);
        convolve_sr_store_6tap_32_avx2(src_ptr + FOURTH_32_BLK, coeffs, filt,
                                       dst + FOURTH_32_BLK);
        src_ptr += src_stride;
        dst += dst_stride;
      } while ((--h) > 0);
    }
  } else if (horiz_tap == 8) {
    // since (horiz_tap / 2 - 1) == 3
    const uint8_t *src_ptr = src - 3;
    // 8-tap needs all four shuffle masks (filt[0..3]).
    prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
    filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
    filt[1] =
        _mm256_load_si256((__m256i const *)(filt_global_avx2 + SECOND_32_BLK));
    filt[2] =
        _mm256_load_si256((__m256i const *)(filt_global_avx2 + THIRD_32_BLK));
    filt[3] =
        _mm256_load_si256((__m256i const *)(filt_global_avx2 + FOURTH_32_BLK));

    if (w == 8) {
      // Two rows per iteration, one row per 128-bit lane.
      do {
        const __m256i data = _mm256_setr_m128i(
            _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride])),
            _mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride])));

        __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);

        res_16b = round_sr_x_avx2(res_16b);

        /* rounding code */
        // 8 bit conversion and saturation to uint8
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        i += 2;
      } while (i < h);
    } else if (w == 16) {
      // 16x2 helper does load + filter + round + store for two rows.
      do {
        __m256i data[2] = { 0 };

        load_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs, filt, data);
        round_pack_store_16x2_avx2(data, dst, dst_stride);
        src_ptr += 2 * src_stride;
        dst += 2 * dst_stride;
        h -= 2;
      } while (h);
    } else if (w == 32) {
      // One full 32-pixel row per iteration.
      do {
        load_convolve_round_8tap_32_avx2(src_ptr, coeffs, filt, dst);
        src_ptr += src_stride;
        dst += dst_stride;
      } while ((--h) > 0);
    } else if (w == 64) {
      // Two 32-pixel chunks per row.
      do {
        load_convolve_round_8tap_32_avx2(src_ptr, coeffs, filt, dst);
        load_convolve_round_8tap_32_avx2(src_ptr + 32, coeffs, filt, dst + 32);
        src_ptr += src_stride;
        dst += dst_stride;
      } while ((--h) > 0);
    } else {
      // Four 32-pixel chunks per row.
      assert(w == 128);
      do {
        load_convolve_round_8tap_32_avx2(src_ptr, coeffs, filt, dst);
        load_convolve_round_8tap_32_avx2(src_ptr + SECOND_32_BLK, coeffs, filt,
                                         dst + SECOND_32_BLK);
        load_convolve_round_8tap_32_avx2(src_ptr + THIRD_32_BLK, coeffs, filt,
                                         dst + THIRD_32_BLK);
        load_convolve_round_8tap_32_avx2(src_ptr + FOURTH_32_BLK, coeffs, filt,
                                         dst + FOURTH_32_BLK);
        src_ptr += src_stride;
        dst += dst_stride;
      } while ((--h) > 0);
    }
  } else if (horiz_tap == 12) {  // horiz_tap == 12
    // 12-tap path: products are accumulated in 32-bit precision and rounded
    // in two stages (round_0, then bits) before packing down to uint8.
    const int fo_horiz = filter_params_x->taps / 2 - 1;
    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs);
    const __m128i round_shift = _mm_cvtsi32_si128(bits);
    const uint8_t *const src_ptr = src - fo_horiz;
    const __m256i v_zero = _mm256_setzero_si256();
    __m256i round_0_const =
        _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1);
    __m256i round_const = _mm256_set1_epi32((1 << bits) >> 1);
    __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
    __m256i s[6] = { 0 };

    if (w <= 4) {
      // Narrow blocks: two rows per iteration, one row per 128-bit lane.
      do {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);
        // row0 0..7 row1 0..7
        const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
        // row0 8..F row1 8..F
        const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);

        // Duplicate each 16-bit sample so alignr can build the sliding
        // pixel-pair windows the 12-tap kernel consumes.
        // row0 00 00 01 01 .. 03 03 row1 00 00 01 01 .. 03 03
        const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
        // row0 04 04 .. 07 07 row1 04 04 .. 07 07
        const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);

        // row0 08 08 09 09 .. 0B 0B row1 08 08 09 09 .. 0B 0B
        const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
        // row0 0C 0C .. 0F 0F row1 0C 0C .. 0F 0F
        const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);

        // 00 01 01 02 02 03 03 04 10 11 11 12 12 13 13 14
        s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
        // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
        s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
        // 04 05 05 06 06 07 07 08 14 15 15 16 16 17 17 18
        s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
        // 06 07 07 08 08 09 09 0A 16 17 17 18 18 19 19 1A
        s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
        // 08 09 09 0A 0A 0B 0B 0C 18 19 19 1A 1A 1B 1B 1C
        s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
        // 0A 0B 0B 0C 0C 0D 0D 0E 1A 1B 1B 1C 1C 1D 1D 1E
        s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);

        const __m256i res_lo = convolve_12taps(s, coeffs);

        // First rounding stage: round_0.
        __m256i res_32b_lo = _mm256_sra_epi32(
            _mm256_add_epi32(res_lo, round_0_const), round_0_shift);

        // Second rounding stage: bits = FILTER_BITS - round_0.
        // 00 01 02 03 10 12 13 14
        res_32b_lo = _mm256_sra_epi32(_mm256_add_epi32(res_32b_lo, round_const),
                                      round_shift);
        // 8 bit conversion and saturation to uint8
        // 00 01 02 03 00 01 02 03 10 11 12 13 10 11 12 13
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
        const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
        const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
        if (w > 2) {
          // 00 01 02 03
          xx_storel_32(&dst[i * dst_stride], res_0);
          // 10 11 12 13
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
        } else {
          // 00 01
          xx_storel_16(&dst[i * dst_stride], res_0);
          // 10 11
          xx_storel_16(&dst[i * dst_stride + dst_stride], res_1);
        }
        i += 2;
      } while (i < h);
    } else {
      // Wider blocks: one row per iteration, 8 output pixels per inner step
      // (4 from each 128-bit lane of the doubled load).
      assert(!(w % 8));
      do {
        j = 0;
        do {
          const __m256i data = _mm256_permute2x128_si256(
              _mm256_castsi128_si256(
                  _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
              _mm256_castsi128_si256(_mm_loadu_si128(
                  (__m128i *)(&src_ptr[i * src_stride + j + 4]))),
              0x20);
          // row0 0..7 4..B
          const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
          // row0 8..F C..13
          const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);

          // row0 00 00 01 01 .. 03 03 04 04 05 05 .. 07 07
          const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
          // row0 04 04 .. 07 07 08 08 .. 0B 0B
          const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);

          // row0 08 08 09 09 .. 0B 0B 0C 0C 0D 0D .. 0F 0F
          const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
          // row0 0C 0C 0D 0D .. 0F 0F 10 10 11 11 .. 13 13
          const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);

          s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
          s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
          s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
          s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
          s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
          s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);

          const __m256i res_lo = convolve_12taps(s, coeffs);

          // Two-stage rounding: round_0, then bits.
          __m256i res_32b_lo = _mm256_sra_epi32(
              _mm256_add_epi32(res_lo, round_0_const), round_0_shift);

          res_32b_lo = _mm256_sra_epi32(
              _mm256_add_epi32(res_32b_lo, round_const), round_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
          __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          xx_storel_32(&dst[i * dst_stride + j], res_0);
          xx_storel_32(&dst[i * dst_stride + j + 4], res_1);

          j += 8;
        } while (j < w);
        i++;
      } while (i < h);
    }
  } else {
    assert(horiz_tap == 2);
    // since (filter_params_x->taps / 2 - 1) == 0
    const uint8_t *src_ptr = src;
    if (subpel_x_qn != 8) {
      // General 2-tap filtering (non-midpoint subpel position).
      if (w <= 8) {
        prepare_coeffs_2t_ssse3(filter_params_x, subpel_x_qn, coeffs_128);

        if (w == 2) {
          do {
            const __m128i data =
                convolve_x_2tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
            const __m128i reg = round_sr_x_ssse3(data);
            pack_store_u8_2x2_sse2(reg, dst, dst_stride);
            src_ptr += 2 * src_stride;
            dst += 2 * dst_stride;
            h -= 2;
          } while (h);
        } else if (w == 4) {
          do {
            const __m128i data =
                convolve_x_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
            const __m128i reg = round_sr_x_ssse3(data);
            pack_store_u8_4x2_sse2(reg, dst, dst_stride);
            src_ptr += 2 * src_stride;
            dst += 2 * dst_stride;
            h -= 2;
          } while (h);
        } else {
          assert(w == 8);

          do {
            __m128i data[2] = { 0 };

            // data[0]/data[1] hold the two filtered rows; pack them into
            // one register and store low/high 64 bits to the two rows.
            convolve_x_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, data);
            data[0] = round_sr_x_ssse3(data[0]);
            data[1] = round_sr_x_ssse3(data[1]);
            const __m128i reg = _mm_packus_epi16(data[0], data[1]);
            _mm_storel_epi64((__m128i *)dst, reg);
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), reg);

            src_ptr += 2 * src_stride;
            dst += 2 * dst_stride;
            h -= 2;
          } while (h);
        }
      } else {
        prepare_coeffs_2t_lowbd(filter_params_x, subpel_x_qn, coeffs);

        if (w == 16) {
          do {
            __m256i data[2] = { 0 };

            convolve_x_2tap_16x2_avx2(src_ptr, src_stride, coeffs, data);
            round_pack_store_16x2_avx2(data, dst, dst_stride);
            src_ptr += 2 * src_stride;
            dst += 2 * dst_stride;
            h -= 2;
          } while (h);
        } else if (w == 32) {
          do {
            convolve_round_2tap_32_avx2(src_ptr, coeffs, dst);
            src_ptr += src_stride;
            dst += dst_stride;
          } while ((--h) > 0);
        } else if (w == 64) {
          do {
            convolve_round_2tap_32_avx2(src_ptr, coeffs, dst);
            convolve_round_2tap_32_avx2(src_ptr + SECOND_32_BLK, coeffs,
                                        dst + SECOND_32_BLK);
            src_ptr += src_stride;
            dst += dst_stride;
          } while ((--h) > 0);
        } else {
          assert(w == 128);

          do {
            convolve_round_2tap_32_avx2(src_ptr, coeffs, dst);
            convolve_round_2tap_32_avx2(src_ptr + (SECOND_32_BLK), coeffs,
                                        dst + (SECOND_32_BLK));
            convolve_round_2tap_32_avx2(src_ptr + (THIRD_32_BLK), coeffs,
                                        dst + (THIRD_32_BLK));
            convolve_round_2tap_32_avx2(src_ptr + (FOURTH_32_BLK), coeffs,
                                        dst + (FOURTH_32_BLK));
            src_ptr += src_stride;
            dst += dst_stride;
          } while ((--h) > 0);
        }
      }
    } else {
      // subpel_x_qn == 8: equal-weight 2-tap filter; replace multiplies with
      // rounded byte averaging of each pixel and its right neighbour.
      if (w == 2) {
        do {
          __m128i data = load_x_u8_4x2_sse4(src_ptr, src_stride);
          const __m128i reg1 = _mm_srli_si128(data, 1);
          const __m128i reg2 = _mm_avg_epu8(data, reg1);
          xx_storel_16(dst, reg2);
          {
            // Second row's 2 pixels sit in 16-bit element 2 of reg2.
            uint16_t val = (uint16_t)_mm_extract_epi16(reg2, 2);
            memcpy(dst + dst_stride, &val, sizeof(val));
          }
          src_ptr += 2 * src_stride;
          dst += 2 * dst_stride;
          h -= 2;
        } while (h);
      } else if (w == 4) {
        do {
          __m128i data = load_8bit_8x2_to_1_reg_sse2(
              src_ptr, (int)(sizeof(*src_ptr) * src_stride));
          const __m128i reg1 = _mm_srli_si128(data, 1);
          const __m128i reg2 = _mm_avg_epu8(data, reg1);
          xx_storel_32(dst, reg2);
          {
            // Second row's 4 pixels sit in 32-bit element 2 of reg2.
            int32_t val = _mm_extract_epi32(reg2, 2);
            memcpy(dst + dst_stride, &val, sizeof(val));
          }

          src_ptr += 2 * src_stride;
          dst += 2 * dst_stride;
          h -= 2;
        } while (h);
      } else if (w == 8) {
        do {
          // Average each row with itself shifted one byte (pixel + right
          // neighbour), two rows per iteration.
          const __m128i data00 = _mm_loadu_si128((__m128i *)src_ptr);
          const __m128i data10 =
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
          const __m128i data01 = _mm_srli_si128(data00, 1);
          const __m128i data11 = _mm_srli_si128(data10, 1);
          const __m128i reg0 = _mm_avg_epu8(data00, data01);
          const __m128i reg1 = _mm_avg_epu8(data10, data11);
          _mm_storel_epi64((__m128i *)dst, reg0);
          _mm_storel_epi64((__m128i *)(dst + dst_stride), reg1);

          src_ptr += 2 * src_stride;
          dst += 2 * dst_stride;
          h -= 2;
        } while (h);
      } else if (w == 16) {
        do {
          // 16-wide: load the +1 offset directly instead of shifting.
          const __m128i data00 = _mm_loadu_si128((__m128i *)src_ptr);
          const __m128i data01 = _mm_loadu_si128((__m128i *)(src_ptr + 1));
          const __m128i data10 =
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
          const __m128i data11 =
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1));
          const __m128i reg0 = _mm_avg_epu8(data00, data01);
          const __m128i reg1 = _mm_avg_epu8(data10, data11);
          _mm_storeu_si128((__m128i *)dst, reg0);
          _mm_storeu_si128((__m128i *)(dst + dst_stride), reg1);

          src_ptr += 2 * src_stride;
          dst += 2 * dst_stride;
          h -= 2;
        } while (h);
      } else if (w == 32) {
        do {
          load_avg_store_2tap_32_avx2(src_ptr, dst);
          src_ptr += src_stride;
          dst += dst_stride;
        } while ((--h) > 0);
      } else if (w == 64) {
        do {
          load_avg_store_2tap_32_avx2(src_ptr, dst);
          load_avg_store_2tap_32_avx2(src_ptr + (SECOND_32_BLK),
                                      dst + (SECOND_32_BLK));
          src_ptr += src_stride;
          dst += dst_stride;
        } while ((--h) > 0);
      } else {
        assert(w == 128);

        do {
          load_avg_store_2tap_32_avx2(src_ptr, dst);
          load_avg_store_2tap_32_avx2(src_ptr + (SECOND_32_BLK),
                                      dst + (SECOND_32_BLK));
          load_avg_store_2tap_32_avx2(src_ptr + (THIRD_32_BLK),
                                      dst + (THIRD_32_BLK));
          load_avg_store_2tap_32_avx2(src_ptr + (FOURTH_32_BLK),
                                      dst + (FOURTH_32_BLK));
          src_ptr += src_stride;
          dst += dst_stride;
        } while ((--h) > 0);
      }
    }
  }
}