Coverage Report

Created: 2026-06-30 06:53

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/av1/common/x86/convolve_avx2.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <immintrin.h>
13
14
#include "config/av1_rtcd.h"
15
16
#include "aom_dsp/aom_dsp_common.h"
17
#include "aom_dsp/x86/convolve_avx2.h"
18
#include "aom_dsp/x86/convolve_common_intrin.h"
19
#include "aom_dsp/x86/synonyms.h"
20
21
void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride,
22
                            uint8_t *dst, int32_t dst_stride, int32_t w,
23
                            int32_t h,
24
                            const InterpFilterParams *filter_params_y,
25
541k
                            const int32_t subpel_y_qn) {
26
541k
  __m128i coeffs_128[4];
27
541k
  __m256i coeffs[6];
28
541k
  int x = 0, y = h;
29
30
541k
  int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
31
541k
  assert(vert_tap == 2 || vert_tap == 4 || vert_tap == 6 || vert_tap == 8 ||
32
541k
         vert_tap == 12);
33
541k
  assert(!(w % 2));
34
541k
  assert(!(h % 2));
35
36
541k
  const int fo_vert = vert_tap / 2 - 1;
37
541k
  const uint8_t *const src_ptr = src - fo_vert * src_stride;
38
541k
  const uint8_t *data = src_ptr;
39
541k
  uint8_t *dst_ptr = dst;
40
41
541k
  if (vert_tap == 2) {
42
28.2k
    if (subpel_y_qn != 8) {
43
12.8k
      if (w <= 4) {
44
6.18k
        prepare_coeffs_2t_ssse3(filter_params_y, subpel_y_qn, coeffs_128);
45
6.18k
        __m128i d[2], res;
46
6.18k
        if (w == 2) {
47
1.57k
          d[0] = _mm_cvtsi32_si128(loadu_int16(data));
48
49
3.21k
          do {
50
3.21k
            convolve_y_2tap_2x2_ssse3(data, src_stride, coeffs_128, d, &res);
51
3.21k
            res = round_sr_y_ssse3(res);
52
3.21k
            pack_store_u8_2x2_sse2(res, dst_ptr, dst_stride);
53
54
3.21k
            dst_ptr += 2 * dst_stride;
55
3.21k
            data += 2 * src_stride;
56
3.21k
            y -= 2;
57
3.21k
          } while (y > 0);
58
4.60k
        } else {
59
4.60k
          assert(w == 4);
60
4.60k
          d[0] = _mm_cvtsi32_si128(loadu_int32(data));
61
62
13.8k
          do {
63
13.8k
            convolve_y_2tap_4x2_ssse3(data, src_stride, coeffs_128, d, &res);
64
13.8k
            res = round_sr_y_ssse3(res);
65
13.8k
            pack_store_u8_4x2_sse2(res, dst_ptr, dst_stride);
66
67
13.8k
            dst_ptr += 2 * dst_stride;
68
13.8k
            data += 2 * src_stride;
69
13.8k
            y -= 2;
70
13.8k
          } while (y > 0);
71
4.60k
        }
72
6.64k
      } else {
73
6.64k
        prepare_coeffs_2t_lowbd(filter_params_y, subpel_y_qn, coeffs);
74
75
6.64k
        if (w == 8) {
76
3.65k
          __m128i d[2];
77
3.65k
          d[0] = _mm_loadl_epi64((__m128i *)data);
78
79
12.4k
          do {
80
12.4k
            __m256i res;
81
12.4k
            convolve_y_2tap_8x2_avx2(data, src_stride, coeffs, d, &res);
82
12.4k
            round_pack_store_y_8x2_avx2(res, dst_ptr, dst_stride);
83
84
12.4k
            dst_ptr += 2 * dst_stride;
85
12.4k
            data += 2 * src_stride;
86
12.4k
            y -= 2;
87
88
12.4k
          } while (y > 0);
89
90
3.65k
        } else if (w == 16) {
91
1.81k
          __m128i d[2];
92
1.81k
          d[0] = _mm_loadu_si128((__m128i *)data);
93
94
12.8k
          do {
95
12.8k
            __m256i res[2];
96
12.8k
            convolve_y_2tap_16x2_avx2(data, src_stride, coeffs, d, res);
97
12.8k
            round_pack_store_y_16x2_avx2(res, dst_ptr, dst_stride);
98
99
12.8k
            dst_ptr += 2 * dst_stride;
100
12.8k
            data += 2 * src_stride;
101
12.8k
            y -= 2;
102
12.8k
          } while (y > 0);
103
104
1.81k
        } else {
105
1.17k
          assert(!(w % 32));
106
107
1.17k
          __m256i d[2];
108
1.71k
          do {
109
1.71k
            data = src_ptr + x;
110
1.71k
            dst_ptr = dst + x;
111
1.71k
            y = h;
112
113
1.71k
            d[0] = _mm256_loadu_si256((__m256i *)data);
114
115
38.7k
            do {
116
38.7k
              __m256i res[4];
117
38.7k
              convolve_y_2tap_32x2_avx2(data, src_stride, coeffs, d, res);
118
38.7k
              round_pack_store_y_32x2_avx2(res, dst_ptr, dst_stride);
119
120
38.7k
              dst_ptr += 2 * dst_stride;
121
38.7k
              data += 2 * src_stride;
122
38.7k
              y -= 2;
123
38.7k
            } while (y > 0);
124
125
1.71k
            x += 32;
126
1.71k
          } while (x < w);
127
1.17k
        }
128
6.64k
      }
129
15.4k
    } else {
130
15.4k
      if (w <= 16) {
131
14.5k
        __m128i s[2], res;
132
133
14.5k
        if (w == 2) {
134
3.54k
          s[0] = _mm_cvtsi32_si128(loadu_int16(data));
135
136
6.70k
          do {
137
6.70k
            s[1] = _mm_cvtsi32_si128(loadu_int16(data + src_stride));
138
6.70k
            res = _mm_avg_epu8(s[0], s[1]);
139
6.70k
            xx_storel_16(dst_ptr, res);
140
6.70k
            s[0] = _mm_cvtsi32_si128(loadu_int16(data + 2 * src_stride));
141
6.70k
            res = _mm_avg_epu8(s[1], s[0]);
142
6.70k
            xx_storel_16(dst_ptr + dst_stride, res);
143
144
6.70k
            data += 2 * src_stride;
145
6.70k
            dst_ptr += 2 * dst_stride;
146
6.70k
            y -= 2;
147
6.70k
          } while (y > 0);
148
11.0k
        } else if (w == 4) {
149
5.87k
          s[0] = _mm_cvtsi32_si128(loadu_int32(data));
150
151
16.3k
          do {
152
16.3k
            s[1] = _mm_cvtsi32_si128(loadu_int32(data + src_stride));
153
16.3k
            res = _mm_avg_epu8(s[0], s[1]);
154
16.3k
            xx_storel_32(dst_ptr, res);
155
16.3k
            s[0] = _mm_cvtsi32_si128(loadu_int32(data + 2 * src_stride));
156
16.3k
            res = _mm_avg_epu8(s[1], s[0]);
157
16.3k
            xx_storel_32(dst_ptr + dst_stride, res);
158
159
16.3k
            data += 2 * src_stride;
160
16.3k
            dst_ptr += 2 * dst_stride;
161
16.3k
            y -= 2;
162
16.3k
          } while (y > 0);
163
5.87k
        } else if (w == 8) {
164
3.79k
          s[0] = _mm_loadl_epi64((__m128i *)data);
165
166
12.7k
          do {
167
12.7k
            s[1] = _mm_loadl_epi64((__m128i *)(data + src_stride));
168
12.7k
            res = _mm_avg_epu8(s[0], s[1]);
169
12.7k
            _mm_storel_epi64((__m128i *)dst_ptr, res);
170
12.7k
            s[0] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
171
12.7k
            res = _mm_avg_epu8(s[1], s[0]);
172
12.7k
            _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res);
173
174
12.7k
            data += 2 * src_stride;
175
12.7k
            dst_ptr += 2 * dst_stride;
176
12.7k
            y -= 2;
177
12.7k
          } while (y > 0);
178
3.79k
        } else {
179
1.37k
          assert(w == 16);
180
181
1.37k
          s[0] = _mm_loadu_si128((__m128i *)data);
182
183
7.82k
          do {
184
7.82k
            s[1] = _mm_loadu_si128((__m128i *)(data + src_stride));
185
7.82k
            res = _mm_avg_epu8(s[0], s[1]);
186
7.82k
            _mm_storeu_si128((__m128i *)dst_ptr, res);
187
7.82k
            s[0] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
188
7.82k
            res = _mm_avg_epu8(s[1], s[0]);
189
7.82k
            _mm_storeu_si128((__m128i *)(dst_ptr + dst_stride), res);
190
191
7.82k
            data += 2 * src_stride;
192
7.82k
            dst_ptr += 2 * dst_stride;
193
7.82k
            y -= 2;
194
7.82k
          } while (y > 0);
195
1.37k
        }
196
14.5k
      } else {
197
877
        assert(!(w % 32));
198
199
877
        __m256i s[2], res;
200
1.30k
        do {
201
1.30k
          data = src_ptr + x;
202
1.30k
          dst_ptr = dst + x;
203
1.30k
          y = h;
204
205
1.30k
          s[0] = _mm256_loadu_si256((__m256i *)data);
206
207
34.1k
          do {
208
34.1k
            s[1] = _mm256_loadu_si256((__m256i *)(data + src_stride));
209
34.1k
            res = _mm256_avg_epu8(s[0], s[1]);
210
34.1k
            _mm256_storeu_si256((__m256i *)dst_ptr, res);
211
34.1k
            s[0] = _mm256_loadu_si256((__m256i *)(data + 2 * src_stride));
212
34.1k
            res = _mm256_avg_epu8(s[1], s[0]);
213
34.1k
            _mm256_storeu_si256((__m256i *)(dst_ptr + dst_stride), res);
214
215
34.1k
            data += 2 * src_stride;
216
34.1k
            dst_ptr += 2 * dst_stride;
217
34.1k
            y -= 2;
218
34.1k
          } while (y > 0);
219
220
1.30k
          x += 32;
221
1.30k
        } while (x < w);
222
877
      }
223
15.4k
    }
224
512k
  } else if (vert_tap == 4) {
225
272k
    if (w <= 4) {
226
123k
      prepare_coeffs_4t_ssse3(filter_params_y, subpel_y_qn, coeffs_128);
227
123k
      __m128i d[4], s[2];
228
229
123k
      if (w == 2) {
230
22.1k
        d[0] = _mm_cvtsi32_si128(loadu_int16(data + 0 * src_stride));
231
22.1k
        d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * src_stride));
232
22.1k
        d[2] = _mm_cvtsi32_si128(loadu_int16(data + 2 * src_stride));
233
234
22.1k
        const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]);
235
22.1k
        const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[2]);
236
237
22.1k
        s[0] = _mm_unpacklo_epi8(src_01a, src_12a);
238
37.0k
        do {
239
37.0k
          __m128i res;
240
37.0k
          convolve_y_4tap_2x2_ssse3(data, src_stride, coeffs_128, d, s, &res);
241
37.0k
          res = round_sr_y_ssse3(res);
242
37.0k
          pack_store_u8_2x2_sse2(res, dst_ptr, dst_stride);
243
244
37.0k
          dst_ptr += 2 * dst_stride;
245
37.0k
          data += 2 * src_stride;
246
37.0k
          y -= 2;
247
248
37.0k
          s[0] = s[1];
249
37.0k
        } while (y > 0);
250
251
101k
      } else {
252
101k
        assert(w == 4);
253
254
101k
        d[0] = _mm_cvtsi32_si128(loadu_int32(data + 0 * src_stride));
255
101k
        d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * src_stride));
256
101k
        d[2] = _mm_cvtsi32_si128(loadu_int32(data + 2 * src_stride));
257
258
101k
        const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]);
259
101k
        const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[2]);
260
261
101k
        s[0] = _mm_unpacklo_epi8(src_01a, src_12a);
262
199k
        do {
263
199k
          __m128i res;
264
199k
          convolve_y_4tap_4x2_ssse3(data, src_stride, coeffs_128, d, s, &res);
265
199k
          res = round_sr_y_ssse3(res);
266
199k
          pack_store_u8_4x2_sse2(res, dst_ptr, dst_stride);
267
268
199k
          dst_ptr += 2 * dst_stride;
269
199k
          data += 2 * src_stride;
270
199k
          y -= 2;
271
272
199k
          s[0] = s[1];
273
199k
        } while (y > 0);
274
101k
      }
275
148k
    } else {
276
148k
      prepare_coeffs_4t_lowbd(filter_params_y, subpel_y_qn, coeffs);
277
278
148k
      if (w == 8) {
279
96.8k
        __m128i d[4];
280
96.8k
        __m256i s[2];
281
282
96.8k
        d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
283
96.8k
        d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
284
96.8k
        d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
285
286
96.8k
        const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
287
96.8k
        const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]);
288
289
96.8k
        s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
290
180k
        do {
291
180k
          __m256i res;
292
180k
          convolve_y_4tap_8x2_avx2(data, src_stride, coeffs, d, s, &res);
293
180k
          round_pack_store_y_8x2_avx2(res, dst_ptr, dst_stride);
294
295
180k
          dst_ptr += 2 * dst_stride;
296
180k
          data += 2 * src_stride;
297
180k
          y -= 2;
298
299
180k
          s[0] = s[1];
300
180k
        } while (y > 0);
301
96.8k
      } else if (w == 16) {
302
47.9k
        __m128i d[4];
303
47.9k
        __m256i s[4];
304
305
47.9k
        d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
306
47.9k
        d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
307
47.9k
        d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
308
309
47.9k
        const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
310
47.9k
        const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]);
311
312
47.9k
        s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
313
47.9k
        s[2] = _mm256_unpackhi_epi8(src_01a, src_12a);
314
315
111k
        do {
316
111k
          __m256i res[2];
317
111k
          convolve_y_4tap_16x2_avx2(data, src_stride, coeffs, d, s, res);
318
111k
          round_pack_store_y_16x2_avx2(res, dst_ptr, dst_stride);
319
320
111k
          dst_ptr += 2 * dst_stride;
321
111k
          data += 2 * src_stride;
322
111k
          y -= 2;
323
324
111k
          s[0] = s[1];
325
111k
          s[2] = s[3];
326
111k
        } while (y > 0);
327
47.9k
      } else {
328
3.84k
        assert(!(w % 32));
329
330
3.85k
        __m256i d[4], s1[4], s2[4];
331
5.39k
        do {
332
5.39k
          data = src_ptr + x;
333
5.39k
          dst_ptr = dst + x;
334
5.39k
          y = h;
335
336
5.39k
          d[0] = _mm256_loadu_si256((__m256i *)(data + 0 * src_stride));
337
5.39k
          d[1] = _mm256_loadu_si256((__m256i *)(data + 1 * src_stride));
338
5.39k
          d[2] = _mm256_loadu_si256((__m256i *)(data + 2 * src_stride));
339
340
5.39k
          s1[0] = _mm256_unpacklo_epi8(d[0], d[1]);
341
5.39k
          s1[2] = _mm256_unpackhi_epi8(d[0], d[1]);
342
343
5.39k
          s2[0] = _mm256_unpacklo_epi8(d[1], d[2]);
344
5.39k
          s2[2] = _mm256_unpackhi_epi8(d[1], d[2]);
345
346
132k
          do {
347
132k
            __m256i res[4];
348
132k
            convolve_y_4tap_32x2_avx2(data, src_stride, coeffs, d, s1, s2, res);
349
132k
            round_pack_store_y_32x2_avx2(res, dst_ptr, dst_stride);
350
351
132k
            dst_ptr += 2 * dst_stride;
352
132k
            data += 2 * src_stride;
353
132k
            y -= 2;
354
355
132k
            s1[0] = s1[1];
356
132k
            s1[2] = s1[3];
357
358
132k
            s2[0] = s2[1];
359
132k
            s2[2] = s2[3];
360
132k
          } while (y > 0);
361
362
5.39k
          x += 32;
363
5.39k
        } while (x < w);
364
3.85k
      }
365
148k
    }
366
272k
  } else if (vert_tap == 6) {
367
227k
    if (w <= 4) {
368
67.2k
      prepare_coeffs_6t_ssse3(filter_params_y, subpel_y_qn, coeffs_128);
369
370
67.2k
      __m128i d[6], s[3];
371
67.2k
      if (w == 2) {
372
12.3k
        d[0] = _mm_cvtsi32_si128(loadu_int16(data + 0 * src_stride));
373
12.3k
        d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * src_stride));
374
12.3k
        d[2] = _mm_cvtsi32_si128(loadu_int16(data + 2 * src_stride));
375
12.3k
        d[3] = _mm_cvtsi32_si128(loadu_int16(data + 3 * src_stride));
376
12.3k
        d[4] = _mm_cvtsi32_si128(loadu_int16(data + 4 * src_stride));
377
378
12.3k
        const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]);
379
12.3k
        const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[2]);
380
12.3k
        const __m128i src_23a = _mm_unpacklo_epi16(d[2], d[3]);
381
12.3k
        const __m128i src_34a = _mm_unpacklo_epi16(d[3], d[4]);
382
383
12.3k
        s[0] = _mm_unpacklo_epi8(src_01a, src_12a);
384
12.3k
        s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
385
386
49.3k
        do {
387
49.3k
          __m128i res;
388
49.3k
          convolve_y_6tap_2x2_ssse3(data, src_stride, coeffs_128, d, s, &res);
389
49.3k
          res = round_sr_y_ssse3(res);
390
49.3k
          pack_store_u8_2x2_sse2(res, dst_ptr, dst_stride);
391
392
49.3k
          dst_ptr += 2 * dst_stride;
393
49.3k
          data += 2 * src_stride;
394
49.3k
          y -= 2;
395
396
49.3k
          s[0] = s[1];
397
49.3k
          s[1] = s[2];
398
49.3k
        } while (y > 0);
399
400
54.8k
      } else {
401
54.8k
        assert(w == 4);
402
54.8k
        d[0] = _mm_cvtsi32_si128(loadu_int32(data + 0 * src_stride));
403
54.8k
        d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * src_stride));
404
54.8k
        d[2] = _mm_cvtsi32_si128(loadu_int32(data + 2 * src_stride));
405
54.8k
        d[3] = _mm_cvtsi32_si128(loadu_int32(data + 3 * src_stride));
406
54.8k
        d[4] = _mm_cvtsi32_si128(loadu_int32(data + 4 * src_stride));
407
408
54.8k
        const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]);
409
54.8k
        const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[2]);
410
54.8k
        const __m128i src_23a = _mm_unpacklo_epi32(d[2], d[3]);
411
54.8k
        const __m128i src_34a = _mm_unpacklo_epi32(d[3], d[4]);
412
413
54.8k
        s[0] = _mm_unpacklo_epi8(src_01a, src_12a);
414
54.8k
        s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
415
416
299k
        do {
417
299k
          __m128i res;
418
299k
          convolve_y_6tap_4x2_ssse3(data, src_stride, coeffs_128, d, s, &res);
419
299k
          res = round_sr_y_ssse3(res);
420
299k
          pack_store_u8_4x2_sse2(res, dst_ptr, dst_stride);
421
422
299k
          dst_ptr += 2 * dst_stride;
423
299k
          data += 2 * src_stride;
424
299k
          y -= 2;
425
426
299k
          s[0] = s[1];
427
299k
          s[1] = s[2];
428
299k
        } while (y > 0);
429
54.8k
      }
430
159k
    } else {
431
159k
      prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
432
433
159k
      if (w == 8) {
434
72.6k
        __m128i d[6];
435
72.6k
        __m256i s[3];
436
437
72.6k
        d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
438
72.6k
        d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
439
72.6k
        d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
440
72.6k
        d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
441
72.6k
        d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
442
443
72.6k
        const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
444
72.6k
        const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]);
445
72.6k
        const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
446
72.6k
        const __m256i src_34a = _mm256_setr_m128i(d[3], d[4]);
447
448
72.6k
        s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
449
72.6k
        s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
450
451
407k
        do {
452
407k
          __m256i res;
453
407k
          convolve_y_6tap_8x2_avx2(data, src_stride, coeffs, d, s, &res);
454
407k
          round_pack_store_y_8x2_avx2(res, dst_ptr, dst_stride);
455
456
407k
          dst_ptr += 2 * dst_stride;
457
407k
          data += 2 * src_stride;
458
407k
          y -= 2;
459
460
407k
          s[0] = s[1];
461
407k
          s[1] = s[2];
462
407k
        } while (y > 0);
463
464
87.1k
      } else {
465
87.1k
        assert(!(w % 16));
466
467
87.1k
        __m128i d[6];
468
87.1k
        __m256i s[6];
469
129k
        do {
470
129k
          data = src_ptr + x;
471
129k
          dst_ptr = dst + x;
472
129k
          y = h;
473
474
129k
          d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
475
129k
          d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
476
129k
          d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
477
129k
          d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
478
129k
          d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
479
480
129k
          const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
481
129k
          const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]);
482
129k
          const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
483
129k
          const __m256i src_34a = _mm256_setr_m128i(d[3], d[4]);
484
485
129k
          s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
486
129k
          s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
487
488
129k
          s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
489
129k
          s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
490
491
1.68M
          do {
492
1.68M
            __m256i res[2];
493
1.68M
            convolve_y_6tap_16x2_avx2(data, src_stride, coeffs, d, s, res);
494
1.68M
            round_pack_store_y_16x2_avx2(res, dst_ptr, dst_stride);
495
496
1.68M
            dst_ptr += 2 * dst_stride;
497
1.68M
            data += 2 * src_stride;
498
1.68M
            y -= 2;
499
500
1.68M
            s[0] = s[1];
501
1.68M
            s[1] = s[2];
502
503
1.68M
            s[3] = s[4];
504
1.68M
            s[4] = s[5];
505
1.68M
          } while (y > 0);
506
507
129k
          x += 16;
508
129k
        } while (x < w);
509
87.1k
      }
510
159k
    }
511
227k
  } else if (vert_tap == 12) {  // vert_tap == 12
512
0
    __m128i d[12];
513
0
    __m256i s[12];
514
0
    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs);
515
0
    const __m256i v_zero = _mm256_setzero_si256();
516
0
    __m128i right_shift = _mm_cvtsi32_si128(FILTER_BITS);
517
0
    __m256i right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1);
518
519
0
    for (int j = 0; j < w; j += 8) {
520
0
      data = &src_ptr[j];
521
0
      __m256i src10;
522
523
0
      d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
524
0
      d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
525
0
      d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
526
0
      d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
527
0
      d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
528
0
      d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride));
529
0
      d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
530
0
      d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride));
531
0
      d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
532
0
      d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride));
533
      // Load lines a and b. Line a to lower 128, line b to upper 128
534
0
      const __m256i src_01a = _mm256_permute2x128_si256(
535
0
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
536
537
0
      const __m256i src_12a = _mm256_permute2x128_si256(
538
0
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
539
540
0
      const __m256i src_23a = _mm256_permute2x128_si256(
541
0
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
542
543
0
      const __m256i src_34a = _mm256_permute2x128_si256(
544
0
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
545
546
0
      const __m256i src_45a = _mm256_permute2x128_si256(
547
0
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
548
549
0
      const __m256i src_56a = _mm256_permute2x128_si256(
550
0
          _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20);
551
552
0
      const __m256i src_67a = _mm256_permute2x128_si256(
553
0
          _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20);
554
555
0
      const __m256i src_78a = _mm256_permute2x128_si256(
556
0
          _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20);
557
558
0
      const __m256i src_89a = _mm256_permute2x128_si256(
559
0
          _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20);
560
561
0
      src10 = _mm256_castsi128_si256(
562
0
          _mm_loadl_epi64((__m128i *)(data + 10 * src_stride)));
563
0
      const __m256i src_910a =
564
0
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20);
565
566
0
      const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero);
567
0
      const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, v_zero);
568
0
      const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero);
569
0
      const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero);
570
0
      const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero);
571
0
      const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero);
572
0
      const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero);
573
0
      const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero);
574
0
      const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero);
575
0
      const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero);
576
577
0
      s[0] = _mm256_unpacklo_epi16(src_01, src_12);
578
0
      s[1] = _mm256_unpacklo_epi16(src_23, src_34);
579
0
      s[2] = _mm256_unpacklo_epi16(src_45, src_56);
580
0
      s[3] = _mm256_unpacklo_epi16(src_67, src_78);
581
0
      s[4] = _mm256_unpacklo_epi16(src_89, src_910);
582
583
0
      s[6] = _mm256_unpackhi_epi16(src_01, src_12);
584
0
      s[7] = _mm256_unpackhi_epi16(src_23, src_34);
585
0
      s[8] = _mm256_unpackhi_epi16(src_45, src_56);
586
0
      s[9] = _mm256_unpackhi_epi16(src_67, src_78);
587
0
      s[10] = _mm256_unpackhi_epi16(src_89, src_910);
588
589
0
      for (i = 0; i < h; i += 2) {
590
0
        data = &src_ptr[i * src_stride + j];
591
0
        const __m256i src_1011a = _mm256_permute2x128_si256(
592
0
            src10,
593
0
            _mm256_castsi128_si256(
594
0
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
595
0
            0x20);
596
597
0
        src10 = _mm256_castsi128_si256(
598
0
            _mm_loadl_epi64((__m128i *)(data + 12 * src_stride)));
599
600
0
        const __m256i src_1112a = _mm256_permute2x128_si256(
601
0
            _mm256_castsi128_si256(
602
0
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
603
0
            src10, 0x20);
604
605
0
        const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero);
606
0
        const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero);
607
608
0
        s[5] = _mm256_unpacklo_epi16(src_1011, src_1112);
609
0
        s[11] = _mm256_unpackhi_epi16(src_1011, src_1112);
610
611
0
        const __m256i res_lo = convolve_12taps(s, coeffs);
612
613
0
        const __m256i res_32b_lo = _mm256_sra_epi32(
614
0
            _mm256_add_epi32(res_lo, right_shift_const), right_shift);
615
        // 8 bit conversion and saturation to uint8
616
0
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
617
0
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
618
619
0
        if (w - j > 4) {
620
0
          const __m256i res_hi = convolve_12taps(s + 6, coeffs);
621
622
0
          const __m256i res_32b_hi = _mm256_sra_epi32(
623
0
              _mm256_add_epi32(res_hi, right_shift_const), right_shift);
624
0
          __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi);
625
          // 8 bit conversion and saturation to uint8
626
0
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
627
628
0
          __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi);
629
630
0
          const __m128i res_0 = _mm256_extracti128_si256(res_a, 0);
631
0
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
632
633
0
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
634
0
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
635
0
                           res_1);
636
0
        } else {
637
0
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
638
0
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
639
0
          if (w - j > 2) {
640
0
            xx_storel_32(&dst[i * dst_stride + j], res_0);
641
0
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
642
0
          } else {
643
0
            xx_storel_16(&dst[i * dst_stride + j], res_0);
644
0
            xx_storel_16(&dst[i * dst_stride + j + dst_stride], res_1);
645
0
          }
646
0
        }
647
0
        s[0] = s[1];
648
0
        s[1] = s[2];
649
0
        s[2] = s[3];
650
0
        s[3] = s[4];
651
0
        s[4] = s[5];
652
653
0
        s[6] = s[7];
654
0
        s[7] = s[8];
655
0
        s[8] = s[9];
656
0
        s[9] = s[10];
657
0
        s[10] = s[11];
658
0
      }
659
0
    }
660
13.3k
  } else {
661
13.3k
    assert(vert_tap == 8);
662
663
13.4k
    if (w <= 4) {
664
5.89k
      prepare_coeffs_ssse3(filter_params_y, subpel_y_qn, coeffs_128);
665
666
5.89k
      __m128i d[8], s[4], res;
667
5.89k
      if (w == 2) {
668
1.28k
        d[0] = _mm_cvtsi32_si128(loadu_int16(data + 0 * src_stride));
669
1.28k
        d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * src_stride));
670
1.28k
        d[2] = _mm_cvtsi32_si128(loadu_int16(data + 2 * src_stride));
671
1.28k
        d[3] = _mm_cvtsi32_si128(loadu_int16(data + 3 * src_stride));
672
1.28k
        d[4] = _mm_cvtsi32_si128(loadu_int16(data + 4 * src_stride));
673
1.28k
        d[5] = _mm_cvtsi32_si128(loadu_int16(data + 5 * src_stride));
674
1.28k
        d[6] = _mm_cvtsi32_si128(loadu_int16(data + 6 * src_stride));
675
676
1.28k
        const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]);
677
1.28k
        const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[2]);
678
1.28k
        const __m128i src_23a = _mm_unpacklo_epi16(d[2], d[3]);
679
1.28k
        const __m128i src_34a = _mm_unpacklo_epi16(d[3], d[4]);
680
1.28k
        const __m128i src_45a = _mm_unpacklo_epi16(d[4], d[5]);
681
1.28k
        const __m128i src_56a = _mm_unpacklo_epi16(d[5], d[6]);
682
683
1.28k
        s[0] = _mm_unpacklo_epi8(src_01a, src_12a);
684
1.28k
        s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
685
1.28k
        s[2] = _mm_unpacklo_epi8(src_45a, src_56a);
686
687
5.12k
        do {
688
5.12k
          convolve_y_8tap_2x2_ssse3(data, src_stride, coeffs_128, d, s, &res);
689
5.12k
          res = round_sr_y_ssse3(res);
690
5.12k
          pack_store_u8_2x2_sse2(res, dst_ptr, dst_stride);
691
692
5.12k
          dst_ptr += 2 * dst_stride;
693
5.12k
          data += 2 * src_stride;
694
5.12k
          y -= 2;
695
696
5.12k
          s[0] = s[1];
697
5.12k
          s[1] = s[2];
698
5.12k
          s[2] = s[3];
699
5.12k
        } while (y > 0);
700
701
4.61k
      } else {
702
4.61k
        assert(w == 4);
703
704
4.61k
        d[0] = _mm_cvtsi32_si128(loadu_int32(data + 0 * src_stride));
705
4.61k
        d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * src_stride));
706
4.61k
        d[2] = _mm_cvtsi32_si128(loadu_int32(data + 2 * src_stride));
707
4.61k
        d[3] = _mm_cvtsi32_si128(loadu_int32(data + 3 * src_stride));
708
4.61k
        d[4] = _mm_cvtsi32_si128(loadu_int32(data + 4 * src_stride));
709
4.61k
        d[5] = _mm_cvtsi32_si128(loadu_int32(data + 5 * src_stride));
710
4.61k
        d[6] = _mm_cvtsi32_si128(loadu_int32(data + 6 * src_stride));
711
712
4.61k
        const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]);
713
4.61k
        const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[2]);
714
4.61k
        const __m128i src_23a = _mm_unpacklo_epi32(d[2], d[3]);
715
4.61k
        const __m128i src_34a = _mm_unpacklo_epi32(d[3], d[4]);
716
4.61k
        const __m128i src_45a = _mm_unpacklo_epi32(d[4], d[5]);
717
4.61k
        const __m128i src_56a = _mm_unpacklo_epi32(d[5], d[6]);
718
719
4.61k
        s[0] = _mm_unpacklo_epi8(src_01a, src_12a);
720
4.61k
        s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
721
4.61k
        s[2] = _mm_unpacklo_epi8(src_45a, src_56a);
722
723
25.4k
        do {
724
25.4k
          convolve_y_8tap_4x2_ssse3(data, src_stride, coeffs_128, d, s, &res);
725
25.4k
          res = round_sr_y_ssse3(res);
726
25.4k
          pack_store_u8_4x2_sse2(res, dst_ptr, dst_stride);
727
728
25.4k
          dst_ptr += 2 * dst_stride;
729
25.4k
          data += 2 * src_stride;
730
25.4k
          y -= 2;
731
732
25.4k
          s[0] = s[1];
733
25.4k
          s[1] = s[2];
734
25.4k
          s[2] = s[3];
735
25.4k
        } while (y > 0);
736
4.61k
      }
737
7.53k
    } else {
738
7.53k
      prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
739
740
7.53k
      if (w == 8) {
741
3.80k
        __m128i d[8];
742
3.80k
        __m256i s[4];
743
744
3.80k
        d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
745
3.80k
        d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
746
3.80k
        d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
747
3.80k
        d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
748
3.80k
        d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
749
3.80k
        d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride));
750
3.80k
        d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
751
752
3.80k
        const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
753
3.80k
        const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]);
754
3.80k
        const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
755
3.80k
        const __m256i src_34a = _mm256_setr_m128i(d[3], d[4]);
756
3.80k
        const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]);
757
3.80k
        const __m256i src_56a = _mm256_setr_m128i(d[5], d[6]);
758
759
3.80k
        s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
760
3.80k
        s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
761
3.80k
        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
762
763
24.1k
        do {
764
24.1k
          __m256i res;
765
24.1k
          convolve_y_8tap_8x2_avx2(data, src_stride, coeffs, d, s, &res);
766
24.1k
          round_pack_store_y_8x2_avx2(res, dst_ptr, dst_stride);
767
768
24.1k
          dst_ptr += 2 * dst_stride;
769
24.1k
          data += 2 * src_stride;
770
24.1k
          y -= 2;
771
772
24.1k
          s[0] = s[1];
773
24.1k
          s[1] = s[2];
774
24.1k
          s[2] = s[3];
775
24.1k
        } while (y > 0);
776
777
3.80k
      } else {
778
3.73k
        assert(!(w % 16));
779
780
3.73k
        __m128i d[8];
781
3.73k
        __m256i s[8];
782
5.83k
        do {
783
5.83k
          data = src_ptr + x;
784
5.83k
          dst_ptr = dst + x;
785
5.83k
          y = h;
786
787
5.83k
          d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
788
5.83k
          d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
789
5.83k
          d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
790
5.83k
          d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
791
5.83k
          d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
792
5.83k
          d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
793
5.83k
          d[6] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
794
795
5.83k
          const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
796
5.83k
          const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]);
797
5.83k
          const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
798
5.83k
          const __m256i src_34a = _mm256_setr_m128i(d[3], d[4]);
799
5.83k
          const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]);
800
5.83k
          const __m256i src_56a = _mm256_setr_m128i(d[5], d[6]);
801
802
5.83k
          s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
803
5.83k
          s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
804
5.83k
          s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
805
806
5.83k
          s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
807
5.83k
          s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
808
5.83k
          s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
809
810
79.1k
          do {
811
79.1k
            __m256i res[2];
812
79.1k
            convolve_y_8tap_16x2_avx2(data, src_stride, coeffs, d, s, res);
813
79.1k
            round_pack_store_y_16x2_avx2(res, dst_ptr, dst_stride);
814
815
79.1k
            dst_ptr += 2 * dst_stride;
816
79.1k
            data += 2 * src_stride;
817
79.1k
            y -= 2;
818
819
79.1k
            s[0] = s[1];
820
79.1k
            s[1] = s[2];
821
79.1k
            s[2] = s[3];
822
823
79.1k
            s[4] = s[5];
824
79.1k
            s[5] = s[6];
825
79.1k
            s[6] = s[7];
826
79.1k
          } while (y > 0);
827
828
5.83k
          x += 16;
829
5.83k
        } while (x < w);
830
3.73k
      }
831
7.53k
    }
832
13.4k
  }
833
541k
}
834
835
void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
836
                            uint8_t *dst, int32_t dst_stride, int32_t w,
837
                            int32_t h,
838
                            const InterpFilterParams *filter_params_x,
839
                            const int32_t subpel_x_qn,
840
477k
                            ConvolveParams *conv_params) {
841
477k
  const int bits = FILTER_BITS - conv_params->round_0;
842
477k
  int i, j, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
843
844
477k
  assert(bits >= 0);
845
477k
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
846
477k
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
847
477k
  assert(conv_params->round_0 > 0);
848
849
477k
  assert(horiz_tap == 2 || horiz_tap == 4 || horiz_tap == 6 || horiz_tap == 8 ||
850
477k
         horiz_tap == 12);
851
477k
  assert((!(w % 2)) || (w <= 128));
852
477k
  assert((h % 2) == 0);
853
854
477k
  __m256i coeffs[6] = { 0 }, filt[4] = { 0 };
855
477k
  __m128i coeffs_128[4] = { 0 };
856
857
477k
  i = 0;
858
  // horz_filt as 4 tap
859
477k
  if (horiz_tap == 4) {
860
    // since fo_horiz = 1
861
174k
    const uint8_t *src_ptr = src - 1;
862
174k
    if (w == 2) {
863
29.3k
      prepare_coeffs_4t_ssse3(filter_params_x, subpel_x_qn, coeffs_128);
864
74.3k
      do {
865
74.3k
        const __m128i res =
866
74.3k
            convolve_x_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
867
74.3k
        const __m128i reg = round_sr_x_ssse3(res);
868
74.3k
        pack_store_u8_2x2_sse2(reg, dst, dst_stride);
869
74.3k
        src_ptr += 2 * src_stride;
870
74.3k
        dst += 2 * dst_stride;
871
74.3k
        h -= 2;
872
74.3k
      } while (h);
873
145k
    } else if (w == 4) {
874
132k
      prepare_coeffs_4t_ssse3(filter_params_x, subpel_x_qn, coeffs_128);
875
429k
      do {
876
429k
        const __m128i reg =
877
429k
            convolve_x_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
878
429k
        const __m128i res = round_sr_x_ssse3(reg);
879
429k
        pack_store_u8_4x2_sse2(res, dst, dst_stride);
880
429k
        src_ptr += 2 * src_stride;
881
429k
        dst += 2 * dst_stride;
882
429k
        h -= 2;
883
429k
      } while (h);
884
132k
    } else if (w == 8) {
885
6.70k
      prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
886
6.70k
      filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
887
6.70k
      filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
888
24.7k
      do {
889
24.7k
        const __m256i data = _mm256_setr_m128i(
890
24.7k
            _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride])),
891
24.7k
            _mm_loadu_si128(
892
24.7k
                (__m128i *)(&src_ptr[i * src_stride + src_stride])));
893
894
24.7k
        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
895
896
24.7k
        res_16b = round_sr_x_avx2(res_16b);
897
898
        /* rounding code */
899
        // 8 bit conversion and saturation to uint8
900
24.7k
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
901
902
24.7k
        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
903
24.7k
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
904
905
24.7k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
906
24.7k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
907
24.7k
        i += 2;
908
24.7k
      } while (i < h);
909
6.70k
    } else {
910
6.43k
      assert(!(w % 16));
911
6.43k
      prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
912
6.43k
      filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
913
6.43k
      filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
914
163k
      do {
915
163k
        j = 0;
916
602k
        do {
917
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
918
          // 18 19 20 21 22 23
919
602k
          const __m256i data = _mm256_inserti128_si256(
920
602k
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
921
602k
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
922
602k
              1);
923
924
602k
          __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
925
926
602k
          res_16b = round_sr_x_avx2(res_16b);
927
928
          /* rounding code */
929
          // 8 bit conversion and saturation to uint8
930
602k
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
931
932
          // Store values into the destination buffer
933
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
934
602k
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
935
602k
          __m128i res = _mm256_castsi256_si128(res_8b);
936
602k
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
937
602k
          j += 16;
938
602k
        } while (j < w);
939
163k
        i++;
940
163k
      } while (i < h);
941
6.43k
    }
942
302k
  } else if (horiz_tap == 6) {
943
    // since (horiz_tap/2 - 1 == 2)
944
251k
    const uint8_t *src_ptr = src - 2;
945
251k
    prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs);
946
251k
    filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
947
251k
    filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
948
251k
    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
949
251k
    if (w == 8) {
950
473k
      do {
951
473k
        const __m256i data = _mm256_setr_m128i(
952
473k
            _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride])),
953
473k
            _mm_loadu_si128(
954
473k
                (__m128i *)(&src_ptr[i * src_stride + src_stride])));
955
956
473k
        __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);
957
958
473k
        res_16b = round_sr_x_avx2(res_16b);
959
960
        /* rounding code */
961
        // 8 bit conversion and saturation to uint8
962
473k
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
963
964
473k
        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
965
473k
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
966
473k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
967
473k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
968
473k
        i += 2;
969
473k
      } while (i < h);
970
133k
    } else if (w == 16) {
971
433k
      do {
972
433k
        __m256i data[2] = { 0 };
973
974
433k
        load_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs, filt, data);
975
433k
        round_pack_store_16x2_avx2(data, dst, dst_stride);
976
433k
        src_ptr += 2 * src_stride;
977
433k
        dst += 2 * dst_stride;
978
433k
        h -= 2;
979
433k
      } while (h);
980
88.0k
    } else if (w == 32) {
981
453k
      do {
982
453k
        convolve_sr_store_6tap_32_avx2(src_ptr, coeffs, filt, dst);
983
453k
        src_ptr += src_stride;
984
453k
        dst += dst_stride;
985
453k
      } while ((--h) > 0);
986
23.4k
    } else if (w == 64) {
987
241k
      do {
988
241k
        convolve_sr_store_6tap_32_avx2(src_ptr, coeffs, filt, dst);
989
241k
        convolve_sr_store_6tap_32_avx2(src_ptr + 32, coeffs, filt, dst + 32);
990
241k
        src_ptr += src_stride;
991
241k
        dst += dst_stride;
992
241k
      } while ((--h) > 0);
993
4.88k
    } else {
994
1.05k
      assert(w == 128);
995
996
132k
      do {
997
132k
        convolve_sr_store_6tap_32_avx2(src_ptr, coeffs, filt, dst);
998
132k
        convolve_sr_store_6tap_32_avx2(src_ptr + SECOND_32_BLK, coeffs, filt,
999
132k
                                       dst + SECOND_32_BLK);
1000
132k
        convolve_sr_store_6tap_32_avx2(src_ptr + THIRD_32_BLK, coeffs, filt,
1001
132k
                                       dst + THIRD_32_BLK);
1002
132k
        convolve_sr_store_6tap_32_avx2(src_ptr + FOURTH_32_BLK, coeffs, filt,
1003
132k
                                       dst + FOURTH_32_BLK);
1004
132k
        src_ptr += src_stride;
1005
132k
        dst += dst_stride;
1006
132k
      } while ((--h) > 0);
1007
1.17k
    }
1008
251k
  } else if (horiz_tap == 8) {
1009
    // since (horiz_tap / 2 - 1) == 3
1010
12.7k
    const uint8_t *src_ptr = src - 3;
1011
12.7k
    prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
1012
12.7k
    filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
1013
12.7k
    filt[1] =
1014
12.7k
        _mm256_load_si256((__m256i const *)(filt_global_avx2 + SECOND_32_BLK));
1015
12.7k
    filt[2] =
1016
12.7k
        _mm256_load_si256((__m256i const *)(filt_global_avx2 + THIRD_32_BLK));
1017
12.7k
    filt[3] =
1018
12.7k
        _mm256_load_si256((__m256i const *)(filt_global_avx2 + FOURTH_32_BLK));
1019
1020
12.7k
    if (w == 8) {
1021
23.7k
      do {
1022
23.7k
        const __m256i data = _mm256_setr_m128i(
1023
23.7k
            _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride])),
1024
23.7k
            _mm_loadu_si128(
1025
23.7k
                (__m128i *)(&src_ptr[i * src_stride + src_stride])));
1026
1027
23.7k
        __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
1028
1029
23.7k
        res_16b = round_sr_x_avx2(res_16b);
1030
1031
        /* rounding code */
1032
        // 8 bit conversion and saturation to uint8
1033
23.7k
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
1034
1035
23.7k
        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
1036
23.7k
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
1037
23.7k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
1038
23.7k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
1039
23.7k
        i += 2;
1040
23.7k
      } while (i < h);
1041
6.42k
    } else if (w == 16) {
1042
23.5k
      do {
1043
23.5k
        __m256i data[2] = { 0 };
1044
1045
23.5k
        load_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs, filt, data);
1046
23.5k
        round_pack_store_16x2_avx2(data, dst, dst_stride);
1047
23.5k
        src_ptr += 2 * src_stride;
1048
23.5k
        dst += 2 * dst_stride;
1049
23.5k
        h -= 2;
1050
23.5k
      } while (h);
1051
4.13k
    } else if (w == 32) {
1052
35.4k
      do {
1053
35.4k
        load_convolve_round_8tap_32_avx2(src_ptr, coeffs, filt, dst);
1054
35.4k
        src_ptr += src_stride;
1055
35.4k
        dst += dst_stride;
1056
35.4k
      } while ((--h) > 0);
1057
1.43k
    } else if (w == 64) {
1058
27.4k
      do {
1059
27.4k
        load_convolve_round_8tap_32_avx2(src_ptr, coeffs, filt, dst);
1060
27.4k
        load_convolve_round_8tap_32_avx2(src_ptr + 32, coeffs, filt, dst + 32);
1061
27.4k
        src_ptr += src_stride;
1062
27.4k
        dst += dst_stride;
1063
27.4k
      } while ((--h) > 0);
1064
564
    } else {
1065
167
      assert(w == 128);
1066
17.7k
      do {
1067
17.7k
        load_convolve_round_8tap_32_avx2(src_ptr, coeffs, filt, dst);
1068
17.7k
        load_convolve_round_8tap_32_avx2(src_ptr + SECOND_32_BLK, coeffs, filt,
1069
17.7k
                                         dst + SECOND_32_BLK);
1070
17.7k
        load_convolve_round_8tap_32_avx2(src_ptr + THIRD_32_BLK, coeffs, filt,
1071
17.7k
                                         dst + THIRD_32_BLK);
1072
17.7k
        load_convolve_round_8tap_32_avx2(src_ptr + FOURTH_32_BLK, coeffs, filt,
1073
17.7k
                                         dst + FOURTH_32_BLK);
1074
17.7k
        src_ptr += src_stride;
1075
17.7k
        dst += dst_stride;
1076
17.7k
      } while ((--h) > 0);
1077
169
    }
1078
38.7k
  } else if (horiz_tap == 12) {  // horiz_tap == 12
1079
0
    const int fo_horiz = filter_params_x->taps / 2 - 1;
1080
0
    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs);
1081
0
    const __m128i round_shift = _mm_cvtsi32_si128(bits);
1082
0
    const uint8_t *const src_ptr = src - fo_horiz;
1083
0
    const __m256i v_zero = _mm256_setzero_si256();
1084
0
    __m256i round_0_const =
1085
0
        _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1);
1086
0
    __m256i round_const = _mm256_set1_epi32((1 << bits) >> 1);
1087
0
    __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
1088
0
    __m256i s[6] = { 0 };
1089
1090
0
    if (w <= 4) {
1091
0
      do {
1092
0
        const __m256i data = _mm256_permute2x128_si256(
1093
0
            _mm256_castsi128_si256(
1094
0
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
1095
0
            _mm256_castsi128_si256(_mm_loadu_si128(
1096
0
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
1097
0
            0x20);
1098
        // row0 0..7 row1 0..7
1099
0
        const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
1100
        // row0 8..F row1 8..F
1101
0
        const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
1102
1103
        // row0 00 00 01 01 .. 03 03 row1 00 00 01 01 .. 03 03
1104
0
        const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
1105
        // row0 04 04 .. 07 07 row1 04 04 .. 07 07
1106
0
        const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
1107
1108
        // row0 08 08 09 09 .. 0B 0B row1 08 08 09 09 .. 0B 0B
1109
0
        const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
1110
        // row0 0C 0C .. 0F 0F row1 0C 0C .. 0F 0F
1111
0
        const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
1112
1113
        // 00 01 01 02 02 03 03 04 10 11 11 12 12 13 13 14
1114
0
        s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
1115
        // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
1116
0
        s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
1117
        // 04 05 05 06 06 07 07 08 14 15 15 16 16 17 17 18
1118
0
        s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
1119
        // 06 07 07 08 08 09 09 0A 16 17 17 18 18 19 19 1A
1120
0
        s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
1121
        // 08 09 09 0A 0A 0B 0B 0C 18 19 19 1A 1A 1B 1B 1C
1122
0
        s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
1123
        // 0A 0B 0B 0C 0C 0D 0D 0E 1A 1B 1B 1C 1C 1D 1D 1E
1124
0
        s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
1125
1126
0
        const __m256i res_lo = convolve_12taps(s, coeffs);
1127
1128
0
        __m256i res_32b_lo = _mm256_sra_epi32(
1129
0
            _mm256_add_epi32(res_lo, round_0_const), round_0_shift);
1130
1131
        // 00 01 02 03 10 12 13 14
1132
0
        res_32b_lo = _mm256_sra_epi32(_mm256_add_epi32(res_32b_lo, round_const),
1133
0
                                      round_shift);
1134
        // 8 bit conversion and saturation to uint8
1135
        // 00 01 02 03 00 01 02 03 10 11 12 13 10 11 12 13
1136
0
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
1137
        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
1138
        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
1139
0
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
1140
1141
        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
1142
0
        const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
1143
        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
1144
0
        const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
1145
0
        if (w > 2) {
1146
          // 00 01 02 03
1147
0
          xx_storel_32(&dst[i * dst_stride], res_0);
1148
          // 10 11 12 13
1149
0
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
1150
0
        } else {
1151
          // 00 01
1152
0
          xx_storel_16(&dst[i * dst_stride], res_0);
1153
          // 10 11
1154
0
          xx_storel_16(&dst[i * dst_stride + dst_stride], res_1);
1155
0
        }
1156
0
        i += 2;
1157
0
      } while (i < h);
1158
0
    } else {
1159
0
      assert(!(w % 8));
1160
0
      do {
1161
0
        j = 0;
1162
0
        do {
1163
0
          const __m256i data = _mm256_permute2x128_si256(
1164
0
              _mm256_castsi128_si256(
1165
0
                  _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
1166
0
              _mm256_castsi128_si256(_mm_loadu_si128(
1167
0
                  (__m128i *)(&src_ptr[i * src_stride + j + 4]))),
1168
0
              0x20);
1169
          // row0 0..7 4..B
1170
0
          const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
1171
          // row0 8..F C..13
1172
0
          const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
1173
1174
          // row0 00 00 01 01 .. 03 03 04 04 05 05 .. 07 07
1175
0
          const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
1176
          // row0 04 04 .. 07 07 08 08 .. 0B 0B
1177
0
          const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
1178
1179
          // row0 08 08 09 09 .. 0B 0B 0C 0C 0D 0D .. 0F 0F
1180
0
          const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
1181
          // row0 0C 0C 0D 0D .. 0F 0F 10 10 11 11 .. 13 13
1182
0
          const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
1183
1184
0
          s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
1185
0
          s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
1186
0
          s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
1187
0
          s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
1188
0
          s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
1189
0
          s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
1190
1191
0
          const __m256i res_lo = convolve_12taps(s, coeffs);
1192
1193
0
          __m256i res_32b_lo = _mm256_sra_epi32(
1194
0
              _mm256_add_epi32(res_lo, round_0_const), round_0_shift);
1195
1196
0
          res_32b_lo = _mm256_sra_epi32(
1197
0
              _mm256_add_epi32(res_32b_lo, round_const), round_shift);
1198
          // 8 bit conversion and saturation to uint8
1199
0
          __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
1200
0
          __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
1201
0
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
1202
0
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
1203
0
          xx_storel_32(&dst[i * dst_stride + j], res_0);
1204
0
          xx_storel_32(&dst[i * dst_stride + j + 4], res_1);
1205
1206
0
          j += 8;
1207
0
        } while (j < w);
1208
0
        i++;
1209
0
      } while (i < h);
1210
0
    }
1211
38.7k
  } else {
1212
38.7k
    assert(horiz_tap == 2);
1213
    // since (filter_params_x->taps / 2 - 1) == 0
1214
38.7k
    const uint8_t *src_ptr = src;
1215
38.7k
    if (subpel_x_qn != 8) {
1216
16.5k
      if (w <= 8) {
1217
12.8k
        prepare_coeffs_2t_ssse3(filter_params_x, subpel_x_qn, coeffs_128);
1218
1219
12.8k
        if (w == 2) {
1220
4.21k
          do {
1221
4.21k
            const __m128i data =
1222
4.21k
                convolve_x_2tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
1223
4.21k
            const __m128i reg = round_sr_x_ssse3(data);
1224
4.21k
            pack_store_u8_2x2_sse2(reg, dst, dst_stride);
1225
4.21k
            src_ptr += 2 * src_stride;
1226
4.21k
            dst += 2 * dst_stride;
1227
4.21k
            h -= 2;
1228
4.21k
          } while (h);
1229
11.0k
        } else if (w == 4) {
1230
17.7k
          do {
1231
17.7k
            const __m128i data =
1232
17.7k
                convolve_x_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
1233
17.7k
            const __m128i reg = round_sr_x_ssse3(data);
1234
17.7k
            pack_store_u8_4x2_sse2(reg, dst, dst_stride);
1235
17.7k
            src_ptr += 2 * src_stride;
1236
17.7k
            dst += 2 * dst_stride;
1237
17.7k
            h -= 2;
1238
17.7k
          } while (h);
1239
5.98k
        } else {
1240
5.06k
          assert(w == 8);
1241
1242
17.3k
          do {
1243
17.3k
            __m128i data[2] = { 0 };
1244
1245
17.3k
            convolve_x_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, data);
1246
17.3k
            data[0] = round_sr_x_ssse3(data[0]);
1247
17.3k
            data[1] = round_sr_x_ssse3(data[1]);
1248
17.3k
            const __m128i reg = _mm_packus_epi16(data[0], data[1]);
1249
17.3k
            _mm_storel_epi64((__m128i *)dst, reg);
1250
17.3k
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), reg);
1251
1252
17.3k
            src_ptr += 2 * src_stride;
1253
17.3k
            dst += 2 * dst_stride;
1254
17.3k
            h -= 2;
1255
17.3k
          } while (h);
1256
5.06k
        }
1257
12.8k
      } else {
1258
3.74k
        prepare_coeffs_2t_lowbd(filter_params_x, subpel_x_qn, coeffs);
1259
1260
3.74k
        if (w == 16) {
1261
11.8k
          do {
1262
11.8k
            __m256i data[2] = { 0 };
1263
1264
11.8k
            convolve_x_2tap_16x2_avx2(src_ptr, src_stride, coeffs, data);
1265
11.8k
            round_pack_store_16x2_avx2(data, dst, dst_stride);
1266
11.8k
            src_ptr += 2 * src_stride;
1267
11.8k
            dst += 2 * dst_stride;
1268
11.8k
            h -= 2;
1269
11.8k
          } while (h);
1270
2.14k
        } else if (w == 32) {
1271
21.7k
          do {
1272
21.7k
            convolve_round_2tap_32_avx2(src_ptr, coeffs, dst);
1273
21.7k
            src_ptr += src_stride;
1274
21.7k
            dst += dst_stride;
1275
21.7k
          } while ((--h) > 0);
1276
859
        } else if (w == 64) {
1277
25.5k
          do {
1278
25.5k
            convolve_round_2tap_32_avx2(src_ptr, coeffs, dst);
1279
25.5k
            convolve_round_2tap_32_avx2(src_ptr + SECOND_32_BLK, coeffs,
1280
25.5k
                                        dst + SECOND_32_BLK);
1281
25.5k
            src_ptr += src_stride;
1282
25.5k
            dst += dst_stride;
1283
25.5k
          } while ((--h) > 0);
1284
588
        } else {
1285
157
          assert(w == 128);
1286
1287
12.8k
          do {
1288
12.8k
            convolve_round_2tap_32_avx2(src_ptr, coeffs, dst);
1289
12.8k
            convolve_round_2tap_32_avx2(src_ptr + (SECOND_32_BLK), coeffs,
1290
12.8k
                                        dst + (SECOND_32_BLK));
1291
12.8k
            convolve_round_2tap_32_avx2(src_ptr + (THIRD_32_BLK), coeffs,
1292
12.8k
                                        dst + (THIRD_32_BLK));
1293
12.8k
            convolve_round_2tap_32_avx2(src_ptr + (FOURTH_32_BLK), coeffs,
1294
12.8k
                                        dst + (FOURTH_32_BLK));
1295
12.8k
            src_ptr += src_stride;
1296
12.8k
            dst += dst_stride;
1297
12.8k
          } while ((--h) > 0);
1298
157
        }
1299
3.74k
      }
1300
22.1k
    } else {
1301
22.1k
      if (w == 2) {
1302
6.42k
        do {
1303
6.42k
          __m128i data = load_x_u8_4x2_sse4(src_ptr, src_stride);
1304
6.42k
          const __m128i reg1 = _mm_srli_si128(data, 1);
1305
6.42k
          const __m128i reg2 = _mm_avg_epu8(data, reg1);
1306
6.42k
          xx_storel_16(dst, reg2);
1307
6.42k
          {
1308
6.42k
            uint16_t val = (uint16_t)_mm_extract_epi16(reg2, 2);
1309
6.42k
            memcpy(dst + dst_stride, &val, sizeof(val));
1310
6.42k
          }
1311
6.42k
          src_ptr += 2 * src_stride;
1312
6.42k
          dst += 2 * dst_stride;
1313
6.42k
          h -= 2;
1314
6.42k
        } while (h);
1315
19.0k
      } else if (w == 4) {
1316
22.1k
        do {
1317
22.1k
          __m128i data = load_8bit_8x2_to_1_reg_sse2(
1318
22.1k
              src_ptr, (int)(sizeof(*src_ptr) * src_stride));
1319
22.1k
          const __m128i reg1 = _mm_srli_si128(data, 1);
1320
22.1k
          const __m128i reg2 = _mm_avg_epu8(data, reg1);
1321
22.1k
          xx_storel_32(dst, reg2);
1322
22.1k
          {
1323
22.1k
            int32_t val = _mm_extract_epi32(reg2, 2);
1324
22.1k
            memcpy(dst + dst_stride, &val, sizeof(val));
1325
22.1k
          }
1326
1327
22.1k
          src_ptr += 2 * src_stride;
1328
22.1k
          dst += 2 * dst_stride;
1329
22.1k
          h -= 2;
1330
22.1k
        } while (h);
1331
11.5k
      } else if (w == 8) {
1332
25.2k
        do {
1333
25.2k
          const __m128i data00 = _mm_loadu_si128((__m128i *)src_ptr);
1334
25.2k
          const __m128i data10 =
1335
25.2k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
1336
25.2k
          const __m128i data01 = _mm_srli_si128(data00, 1);
1337
25.2k
          const __m128i data11 = _mm_srli_si128(data10, 1);
1338
25.2k
          const __m128i reg0 = _mm_avg_epu8(data00, data01);
1339
25.2k
          const __m128i reg1 = _mm_avg_epu8(data10, data11);
1340
25.2k
          _mm_storel_epi64((__m128i *)dst, reg0);
1341
25.2k
          _mm_storel_epi64((__m128i *)(dst + dst_stride), reg1);
1342
1343
25.2k
          src_ptr += 2 * src_stride;
1344
25.2k
          dst += 2 * dst_stride;
1345
25.2k
          h -= 2;
1346
25.2k
        } while (h);
1347
6.51k
      } else if (w == 16) {
1348
20.2k
        do {
1349
20.2k
          const __m128i data00 = _mm_loadu_si128((__m128i *)src_ptr);
1350
20.2k
          const __m128i data01 = _mm_loadu_si128((__m128i *)(src_ptr + 1));
1351
20.2k
          const __m128i data10 =
1352
20.2k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
1353
20.2k
          const __m128i data11 =
1354
20.2k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1));
1355
20.2k
          const __m128i reg0 = _mm_avg_epu8(data00, data01);
1356
20.2k
          const __m128i reg1 = _mm_avg_epu8(data10, data11);
1357
20.2k
          _mm_storeu_si128((__m128i *)dst, reg0);
1358
20.2k
          _mm_storeu_si128((__m128i *)(dst + dst_stride), reg1);
1359
1360
20.2k
          src_ptr += 2 * src_stride;
1361
20.2k
          dst += 2 * dst_stride;
1362
20.2k
          h -= 2;
1363
20.2k
        } while (h);
1364
3.43k
      } else if (w == 32) {
1365
26.0k
        do {
1366
26.0k
          load_avg_store_2tap_32_avx2(src_ptr, dst);
1367
26.0k
          src_ptr += src_stride;
1368
26.0k
          dst += dst_stride;
1369
26.0k
        } while ((--h) > 0);
1370
1.09k
      } else if (w == 64) {
1371
18.7k
        do {
1372
18.7k
          load_avg_store_2tap_32_avx2(src_ptr, dst);
1373
18.7k
          load_avg_store_2tap_32_avx2(src_ptr + (SECOND_32_BLK),
1374
18.7k
                                      dst + (SECOND_32_BLK));
1375
18.7k
          src_ptr += src_stride;
1376
18.7k
          dst += dst_stride;
1377
18.7k
        } while ((--h) > 0);
1378
378
      } else {
1379
146
        assert(w == 128);
1380
1381
13.2k
        do {
1382
13.2k
          load_avg_store_2tap_32_avx2(src_ptr, dst);
1383
13.2k
          load_avg_store_2tap_32_avx2(src_ptr + (SECOND_32_BLK),
1384
13.2k
                                      dst + (SECOND_32_BLK));
1385
13.2k
          load_avg_store_2tap_32_avx2(src_ptr + (THIRD_32_BLK),
1386
13.2k
                                      dst + (THIRD_32_BLK));
1387
13.2k
          load_avg_store_2tap_32_avx2(src_ptr + (FOURTH_32_BLK),
1388
13.2k
                                      dst + (FOURTH_32_BLK));
1389
13.2k
          src_ptr += src_stride;
1390
13.2k
          dst += dst_stride;
1391
13.2k
        } while ((--h) > 0);
1392
146
      }
1393
22.1k
    }
1394
38.7k
  }
1395
477k
}