Coverage Report

Created: 2026-05-16 06:27

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/av1/common/x86/convolve_avx2.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <immintrin.h>
13
14
#include "config/av1_rtcd.h"
15
16
#include "aom_dsp/aom_dsp_common.h"
17
#include "aom_dsp/x86/convolve_avx2.h"
18
#include "aom_dsp/x86/convolve_common_intrin.h"
19
#include "aom_dsp/x86/synonyms.h"
20
21
void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride,
22
                            uint8_t *dst, int32_t dst_stride, int32_t w,
23
                            int32_t h,
24
                            const InterpFilterParams *filter_params_y,
25
503k
                            const int32_t subpel_y_qn) {
26
503k
  __m128i coeffs_128[4];
27
503k
  __m256i coeffs[6];
28
503k
  int x = 0, y = h;
29
30
503k
  int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
31
503k
  assert(vert_tap == 2 || vert_tap == 4 || vert_tap == 6 || vert_tap == 8 ||
32
503k
         vert_tap == 12);
33
503k
  assert(!(w % 2));
34
503k
  assert(!(h % 2));
35
36
503k
  const int fo_vert = vert_tap / 2 - 1;
37
503k
  const uint8_t *const src_ptr = src - fo_vert * src_stride;
38
503k
  const uint8_t *data = src_ptr;
39
503k
  uint8_t *dst_ptr = dst;
40
41
503k
  if (vert_tap == 2) {
42
31.8k
    if (subpel_y_qn != 8) {
43
13.6k
      if (w <= 4) {
44
6.73k
        prepare_coeffs_2t_ssse3(filter_params_y, subpel_y_qn, coeffs_128);
45
6.73k
        __m128i d[2], res;
46
6.73k
        if (w == 2) {
47
1.75k
          d[0] = _mm_cvtsi32_si128(loadu_int16(data));
48
49
3.55k
          do {
50
3.55k
            convolve_y_2tap_2x2_ssse3(data, src_stride, coeffs_128, d, &res);
51
3.55k
            res = round_sr_y_ssse3(res);
52
3.55k
            pack_store_u8_2x2_sse2(res, dst_ptr, dst_stride);
53
54
3.55k
            dst_ptr += 2 * dst_stride;
55
3.55k
            data += 2 * src_stride;
56
3.55k
            y -= 2;
57
3.55k
          } while (y > 0);
58
4.98k
        } else {
59
4.98k
          assert(w == 4);
60
4.98k
          d[0] = _mm_cvtsi32_si128(loadu_int32(data));
61
62
15.3k
          do {
63
15.3k
            convolve_y_2tap_4x2_ssse3(data, src_stride, coeffs_128, d, &res);
64
15.3k
            res = round_sr_y_ssse3(res);
65
15.3k
            pack_store_u8_4x2_sse2(res, dst_ptr, dst_stride);
66
67
15.3k
            dst_ptr += 2 * dst_stride;
68
15.3k
            data += 2 * src_stride;
69
15.3k
            y -= 2;
70
15.3k
          } while (y > 0);
71
4.98k
        }
72
6.91k
      } else {
73
6.91k
        prepare_coeffs_2t_lowbd(filter_params_y, subpel_y_qn, coeffs);
74
75
6.91k
        if (w == 8) {
76
3.69k
          __m128i d[2];
77
3.69k
          d[0] = _mm_loadl_epi64((__m128i *)data);
78
79
12.7k
          do {
80
12.7k
            __m256i res;
81
12.7k
            convolve_y_2tap_8x2_avx2(data, src_stride, coeffs, d, &res);
82
12.7k
            round_pack_store_y_8x2_avx2(res, dst_ptr, dst_stride);
83
84
12.7k
            dst_ptr += 2 * dst_stride;
85
12.7k
            data += 2 * src_stride;
86
12.7k
            y -= 2;
87
88
12.7k
          } while (y > 0);
89
90
3.69k
        } else if (w == 16) {
91
1.99k
          __m128i d[2];
92
1.99k
          d[0] = _mm_loadu_si128((__m128i *)data);
93
94
13.9k
          do {
95
13.9k
            __m256i res[2];
96
13.9k
            convolve_y_2tap_16x2_avx2(data, src_stride, coeffs, d, res);
97
13.9k
            round_pack_store_y_16x2_avx2(res, dst_ptr, dst_stride);
98
99
13.9k
            dst_ptr += 2 * dst_stride;
100
13.9k
            data += 2 * src_stride;
101
13.9k
            y -= 2;
102
13.9k
          } while (y > 0);
103
104
1.99k
        } else {
105
1.23k
          assert(!(w % 32));
106
107
1.23k
          __m256i d[2];
108
1.79k
          do {
109
1.79k
            data = src_ptr + x;
110
1.79k
            dst_ptr = dst + x;
111
1.79k
            y = h;
112
113
1.79k
            d[0] = _mm256_loadu_si256((__m256i *)data);
114
115
41.0k
            do {
116
41.0k
              __m256i res[4];
117
41.0k
              convolve_y_2tap_32x2_avx2(data, src_stride, coeffs, d, res);
118
41.0k
              round_pack_store_y_32x2_avx2(res, dst_ptr, dst_stride);
119
120
41.0k
              dst_ptr += 2 * dst_stride;
121
41.0k
              data += 2 * src_stride;
122
41.0k
              y -= 2;
123
41.0k
            } while (y > 0);
124
125
1.79k
            x += 32;
126
1.79k
          } while (x < w);
127
1.23k
        }
128
6.91k
      }
129
18.1k
    } else {
130
18.1k
      if (w <= 16) {
131
17.2k
        __m128i s[2], res;
132
133
17.2k
        if (w == 2) {
134
3.93k
          s[0] = _mm_cvtsi32_si128(loadu_int16(data));
135
136
7.46k
          do {
137
7.46k
            s[1] = _mm_cvtsi32_si128(loadu_int16(data + src_stride));
138
7.46k
            res = _mm_avg_epu8(s[0], s[1]);
139
7.46k
            xx_storel_16(dst_ptr, res);
140
7.46k
            s[0] = _mm_cvtsi32_si128(loadu_int16(data + 2 * src_stride));
141
7.46k
            res = _mm_avg_epu8(s[1], s[0]);
142
7.46k
            xx_storel_16(dst_ptr + dst_stride, res);
143
144
7.46k
            data += 2 * src_stride;
145
7.46k
            dst_ptr += 2 * dst_stride;
146
7.46k
            y -= 2;
147
7.46k
          } while (y > 0);
148
13.3k
        } else if (w == 4) {
149
7.09k
          s[0] = _mm_cvtsi32_si128(loadu_int32(data));
150
151
20.1k
          do {
152
20.1k
            s[1] = _mm_cvtsi32_si128(loadu_int32(data + src_stride));
153
20.1k
            res = _mm_avg_epu8(s[0], s[1]);
154
20.1k
            xx_storel_32(dst_ptr, res);
155
20.1k
            s[0] = _mm_cvtsi32_si128(loadu_int32(data + 2 * src_stride));
156
20.1k
            res = _mm_avg_epu8(s[1], s[0]);
157
20.1k
            xx_storel_32(dst_ptr + dst_stride, res);
158
159
20.1k
            data += 2 * src_stride;
160
20.1k
            dst_ptr += 2 * dst_stride;
161
20.1k
            y -= 2;
162
20.1k
          } while (y > 0);
163
7.09k
        } else if (w == 8) {
164
4.69k
          s[0] = _mm_loadl_epi64((__m128i *)data);
165
166
16.1k
          do {
167
16.1k
            s[1] = _mm_loadl_epi64((__m128i *)(data + src_stride));
168
16.1k
            res = _mm_avg_epu8(s[0], s[1]);
169
16.1k
            _mm_storel_epi64((__m128i *)dst_ptr, res);
170
16.1k
            s[0] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
171
16.1k
            res = _mm_avg_epu8(s[1], s[0]);
172
16.1k
            _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res);
173
174
16.1k
            data += 2 * src_stride;
175
16.1k
            dst_ptr += 2 * dst_stride;
176
16.1k
            y -= 2;
177
16.1k
          } while (y > 0);
178
4.69k
        } else {
179
1.51k
          assert(w == 16);
180
181
1.51k
          s[0] = _mm_loadu_si128((__m128i *)data);
182
183
8.59k
          do {
184
8.59k
            s[1] = _mm_loadu_si128((__m128i *)(data + src_stride));
185
8.59k
            res = _mm_avg_epu8(s[0], s[1]);
186
8.59k
            _mm_storeu_si128((__m128i *)dst_ptr, res);
187
8.59k
            s[0] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
188
8.59k
            res = _mm_avg_epu8(s[1], s[0]);
189
8.59k
            _mm_storeu_si128((__m128i *)(dst_ptr + dst_stride), res);
190
191
8.59k
            data += 2 * src_stride;
192
8.59k
            dst_ptr += 2 * dst_stride;
193
8.59k
            y -= 2;
194
8.59k
          } while (y > 0);
195
1.51k
        }
196
17.2k
      } else {
197
944
        assert(!(w % 32));
198
199
944
        __m256i s[2], res;
200
1.34k
        do {
201
1.34k
          data = src_ptr + x;
202
1.34k
          dst_ptr = dst + x;
203
1.34k
          y = h;
204
205
1.34k
          s[0] = _mm256_loadu_si256((__m256i *)data);
206
207
33.0k
          do {
208
33.0k
            s[1] = _mm256_loadu_si256((__m256i *)(data + src_stride));
209
33.0k
            res = _mm256_avg_epu8(s[0], s[1]);
210
33.0k
            _mm256_storeu_si256((__m256i *)dst_ptr, res);
211
33.0k
            s[0] = _mm256_loadu_si256((__m256i *)(data + 2 * src_stride));
212
33.0k
            res = _mm256_avg_epu8(s[1], s[0]);
213
33.0k
            _mm256_storeu_si256((__m256i *)(dst_ptr + dst_stride), res);
214
215
33.0k
            data += 2 * src_stride;
216
33.0k
            dst_ptr += 2 * dst_stride;
217
33.0k
            y -= 2;
218
33.0k
          } while (y > 0);
219
220
1.34k
          x += 32;
221
1.34k
        } while (x < w);
222
944
      }
223
18.1k
    }
224
471k
  } else if (vert_tap == 4) {
225
251k
    if (w <= 4) {
226
118k
      prepare_coeffs_4t_ssse3(filter_params_y, subpel_y_qn, coeffs_128);
227
118k
      __m128i d[4], s[2];
228
229
118k
      if (w == 2) {
230
20.0k
        d[0] = _mm_cvtsi32_si128(loadu_int16(data + 0 * src_stride));
231
20.0k
        d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * src_stride));
232
20.0k
        d[2] = _mm_cvtsi32_si128(loadu_int16(data + 2 * src_stride));
233
234
20.0k
        const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]);
235
20.0k
        const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[2]);
236
237
20.0k
        s[0] = _mm_unpacklo_epi8(src_01a, src_12a);
238
34.1k
        do {
239
34.1k
          __m128i res;
240
34.1k
          convolve_y_4tap_2x2_ssse3(data, src_stride, coeffs_128, d, s, &res);
241
34.1k
          res = round_sr_y_ssse3(res);
242
34.1k
          pack_store_u8_2x2_sse2(res, dst_ptr, dst_stride);
243
244
34.1k
          dst_ptr += 2 * dst_stride;
245
34.1k
          data += 2 * src_stride;
246
34.1k
          y -= 2;
247
248
34.1k
          s[0] = s[1];
249
34.1k
        } while (y > 0);
250
251
98.3k
      } else {
252
98.3k
        assert(w == 4);
253
254
98.3k
        d[0] = _mm_cvtsi32_si128(loadu_int32(data + 0 * src_stride));
255
98.3k
        d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * src_stride));
256
98.3k
        d[2] = _mm_cvtsi32_si128(loadu_int32(data + 2 * src_stride));
257
258
98.3k
        const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]);
259
98.3k
        const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[2]);
260
261
98.3k
        s[0] = _mm_unpacklo_epi8(src_01a, src_12a);
262
194k
        do {
263
194k
          __m128i res;
264
194k
          convolve_y_4tap_4x2_ssse3(data, src_stride, coeffs_128, d, s, &res);
265
194k
          res = round_sr_y_ssse3(res);
266
194k
          pack_store_u8_4x2_sse2(res, dst_ptr, dst_stride);
267
268
194k
          dst_ptr += 2 * dst_stride;
269
194k
          data += 2 * src_stride;
270
194k
          y -= 2;
271
272
194k
          s[0] = s[1];
273
194k
        } while (y > 0);
274
98.3k
      }
275
133k
    } else {
276
133k
      prepare_coeffs_4t_lowbd(filter_params_y, subpel_y_qn, coeffs);
277
278
133k
      if (w == 8) {
279
89.0k
        __m128i d[4];
280
89.0k
        __m256i s[2];
281
282
89.0k
        d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
283
89.0k
        d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
284
89.0k
        d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
285
286
89.0k
        const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
287
89.0k
        const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]);
288
289
89.0k
        s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
290
169k
        do {
291
169k
          __m256i res;
292
169k
          convolve_y_4tap_8x2_avx2(data, src_stride, coeffs, d, s, &res);
293
169k
          round_pack_store_y_8x2_avx2(res, dst_ptr, dst_stride);
294
295
169k
          dst_ptr += 2 * dst_stride;
296
169k
          data += 2 * src_stride;
297
169k
          y -= 2;
298
299
169k
          s[0] = s[1];
300
169k
        } while (y > 0);
301
89.0k
      } else if (w == 16) {
302
41.2k
        __m128i d[4];
303
41.2k
        __m256i s[4];
304
305
41.2k
        d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
306
41.2k
        d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
307
41.2k
        d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
308
309
41.2k
        const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
310
41.2k
        const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]);
311
312
41.2k
        s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
313
41.2k
        s[2] = _mm256_unpackhi_epi8(src_01a, src_12a);
314
315
97.3k
        do {
316
97.3k
          __m256i res[2];
317
97.3k
          convolve_y_4tap_16x2_avx2(data, src_stride, coeffs, d, s, res);
318
97.3k
          round_pack_store_y_16x2_avx2(res, dst_ptr, dst_stride);
319
320
97.3k
          dst_ptr += 2 * dst_stride;
321
97.3k
          data += 2 * src_stride;
322
97.3k
          y -= 2;
323
324
97.3k
          s[0] = s[1];
325
97.3k
          s[2] = s[3];
326
97.3k
        } while (y > 0);
327
41.2k
      } else {
328
3.30k
        assert(!(w % 32));
329
330
3.30k
        __m256i d[4], s1[4], s2[4];
331
4.27k
        do {
332
4.27k
          data = src_ptr + x;
333
4.27k
          dst_ptr = dst + x;
334
4.27k
          y = h;
335
336
4.27k
          d[0] = _mm256_loadu_si256((__m256i *)(data + 0 * src_stride));
337
4.27k
          d[1] = _mm256_loadu_si256((__m256i *)(data + 1 * src_stride));
338
4.27k
          d[2] = _mm256_loadu_si256((__m256i *)(data + 2 * src_stride));
339
340
4.27k
          s1[0] = _mm256_unpacklo_epi8(d[0], d[1]);
341
4.27k
          s1[2] = _mm256_unpackhi_epi8(d[0], d[1]);
342
343
4.27k
          s2[0] = _mm256_unpacklo_epi8(d[1], d[2]);
344
4.27k
          s2[2] = _mm256_unpackhi_epi8(d[1], d[2]);
345
346
87.2k
          do {
347
87.2k
            __m256i res[4];
348
87.2k
            convolve_y_4tap_32x2_avx2(data, src_stride, coeffs, d, s1, s2, res);
349
87.2k
            round_pack_store_y_32x2_avx2(res, dst_ptr, dst_stride);
350
351
87.2k
            dst_ptr += 2 * dst_stride;
352
87.2k
            data += 2 * src_stride;
353
87.2k
            y -= 2;
354
355
87.2k
            s1[0] = s1[1];
356
87.2k
            s1[2] = s1[3];
357
358
87.2k
            s2[0] = s2[1];
359
87.2k
            s2[2] = s2[3];
360
87.2k
          } while (y > 0);
361
362
4.27k
          x += 32;
363
4.27k
        } while (x < w);
364
3.30k
      }
365
133k
    }
366
251k
  } else if (vert_tap == 6) {
367
204k
    if (w <= 4) {
368
63.2k
      prepare_coeffs_6t_ssse3(filter_params_y, subpel_y_qn, coeffs_128);
369
370
63.2k
      __m128i d[6], s[3];
371
63.2k
      if (w == 2) {
372
12.1k
        d[0] = _mm_cvtsi32_si128(loadu_int16(data + 0 * src_stride));
373
12.1k
        d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * src_stride));
374
12.1k
        d[2] = _mm_cvtsi32_si128(loadu_int16(data + 2 * src_stride));
375
12.1k
        d[3] = _mm_cvtsi32_si128(loadu_int16(data + 3 * src_stride));
376
12.1k
        d[4] = _mm_cvtsi32_si128(loadu_int16(data + 4 * src_stride));
377
378
12.1k
        const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]);
379
12.1k
        const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[2]);
380
12.1k
        const __m128i src_23a = _mm_unpacklo_epi16(d[2], d[3]);
381
12.1k
        const __m128i src_34a = _mm_unpacklo_epi16(d[3], d[4]);
382
383
12.1k
        s[0] = _mm_unpacklo_epi8(src_01a, src_12a);
384
12.1k
        s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
385
386
48.7k
        do {
387
48.7k
          __m128i res;
388
48.7k
          convolve_y_6tap_2x2_ssse3(data, src_stride, coeffs_128, d, s, &res);
389
48.7k
          res = round_sr_y_ssse3(res);
390
48.7k
          pack_store_u8_2x2_sse2(res, dst_ptr, dst_stride);
391
392
48.7k
          dst_ptr += 2 * dst_stride;
393
48.7k
          data += 2 * src_stride;
394
48.7k
          y -= 2;
395
396
48.7k
          s[0] = s[1];
397
48.7k
          s[1] = s[2];
398
48.7k
        } while (y > 0);
399
400
51.0k
      } else {
401
51.0k
        assert(w == 4);
402
51.0k
        d[0] = _mm_cvtsi32_si128(loadu_int32(data + 0 * src_stride));
403
51.0k
        d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * src_stride));
404
51.0k
        d[2] = _mm_cvtsi32_si128(loadu_int32(data + 2 * src_stride));
405
51.0k
        d[3] = _mm_cvtsi32_si128(loadu_int32(data + 3 * src_stride));
406
51.0k
        d[4] = _mm_cvtsi32_si128(loadu_int32(data + 4 * src_stride));
407
408
51.0k
        const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]);
409
51.0k
        const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[2]);
410
51.0k
        const __m128i src_23a = _mm_unpacklo_epi32(d[2], d[3]);
411
51.0k
        const __m128i src_34a = _mm_unpacklo_epi32(d[3], d[4]);
412
413
51.0k
        s[0] = _mm_unpacklo_epi8(src_01a, src_12a);
414
51.0k
        s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
415
416
279k
        do {
417
279k
          __m128i res;
418
279k
          convolve_y_6tap_4x2_ssse3(data, src_stride, coeffs_128, d, s, &res);
419
279k
          res = round_sr_y_ssse3(res);
420
279k
          pack_store_u8_4x2_sse2(res, dst_ptr, dst_stride);
421
422
279k
          dst_ptr += 2 * dst_stride;
423
279k
          data += 2 * src_stride;
424
279k
          y -= 2;
425
426
279k
          s[0] = s[1];
427
279k
          s[1] = s[2];
428
279k
        } while (y > 0);
429
51.0k
      }
430
141k
    } else {
431
141k
      prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
432
433
141k
      if (w == 8) {
434
65.7k
        __m128i d[6];
435
65.7k
        __m256i s[3];
436
437
65.7k
        d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
438
65.7k
        d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
439
65.7k
        d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
440
65.7k
        d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
441
65.7k
        d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
442
443
65.7k
        const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
444
65.7k
        const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]);
445
65.7k
        const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
446
65.7k
        const __m256i src_34a = _mm256_setr_m128i(d[3], d[4]);
447
448
65.7k
        s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
449
65.7k
        s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
450
451
370k
        do {
452
370k
          __m256i res;
453
370k
          convolve_y_6tap_8x2_avx2(data, src_stride, coeffs, d, s, &res);
454
370k
          round_pack_store_y_8x2_avx2(res, dst_ptr, dst_stride);
455
456
370k
          dst_ptr += 2 * dst_stride;
457
370k
          data += 2 * src_stride;
458
370k
          y -= 2;
459
460
370k
          s[0] = s[1];
461
370k
          s[1] = s[2];
462
370k
        } while (y > 0);
463
464
75.6k
      } else {
465
75.6k
        assert(!(w % 16));
466
467
75.6k
        __m128i d[6];
468
75.6k
        __m256i s[6];
469
111k
        do {
470
111k
          data = src_ptr + x;
471
111k
          dst_ptr = dst + x;
472
111k
          y = h;
473
474
111k
          d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
475
111k
          d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
476
111k
          d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
477
111k
          d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
478
111k
          d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
479
480
111k
          const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
481
111k
          const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]);
482
111k
          const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
483
111k
          const __m256i src_34a = _mm256_setr_m128i(d[3], d[4]);
484
485
111k
          s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
486
111k
          s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
487
488
111k
          s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
489
111k
          s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
490
491
1.42M
          do {
492
1.42M
            __m256i res[2];
493
1.42M
            convolve_y_6tap_16x2_avx2(data, src_stride, coeffs, d, s, res);
494
1.42M
            round_pack_store_y_16x2_avx2(res, dst_ptr, dst_stride);
495
496
1.42M
            dst_ptr += 2 * dst_stride;
497
1.42M
            data += 2 * src_stride;
498
1.42M
            y -= 2;
499
500
1.42M
            s[0] = s[1];
501
1.42M
            s[1] = s[2];
502
503
1.42M
            s[3] = s[4];
504
1.42M
            s[4] = s[5];
505
1.42M
          } while (y > 0);
506
507
111k
          x += 16;
508
111k
        } while (x < w);
509
75.6k
      }
510
141k
    }
511
204k
  } else if (vert_tap == 12) {  // vert_tap == 12
512
0
    __m128i d[12];
513
0
    __m256i s[12];
514
0
    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs);
515
0
    const __m256i v_zero = _mm256_setzero_si256();
516
0
    __m128i right_shift = _mm_cvtsi32_si128(FILTER_BITS);
517
0
    __m256i right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1);
518
519
0
    for (int j = 0; j < w; j += 8) {
520
0
      data = &src_ptr[j];
521
0
      __m256i src10;
522
523
0
      d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
524
0
      d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
525
0
      d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
526
0
      d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
527
0
      d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
528
0
      d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride));
529
0
      d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
530
0
      d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride));
531
0
      d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
532
0
      d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride));
533
      // Load lines a and b. Line a to lower 128, line b to upper 128
534
0
      const __m256i src_01a = _mm256_permute2x128_si256(
535
0
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
536
537
0
      const __m256i src_12a = _mm256_permute2x128_si256(
538
0
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
539
540
0
      const __m256i src_23a = _mm256_permute2x128_si256(
541
0
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
542
543
0
      const __m256i src_34a = _mm256_permute2x128_si256(
544
0
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
545
546
0
      const __m256i src_45a = _mm256_permute2x128_si256(
547
0
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
548
549
0
      const __m256i src_56a = _mm256_permute2x128_si256(
550
0
          _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20);
551
552
0
      const __m256i src_67a = _mm256_permute2x128_si256(
553
0
          _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20);
554
555
0
      const __m256i src_78a = _mm256_permute2x128_si256(
556
0
          _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20);
557
558
0
      const __m256i src_89a = _mm256_permute2x128_si256(
559
0
          _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20);
560
561
0
      src10 = _mm256_castsi128_si256(
562
0
          _mm_loadl_epi64((__m128i *)(data + 10 * src_stride)));
563
0
      const __m256i src_910a =
564
0
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20);
565
566
0
      const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero);
567
0
      const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, v_zero);
568
0
      const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero);
569
0
      const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero);
570
0
      const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero);
571
0
      const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero);
572
0
      const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero);
573
0
      const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero);
574
0
      const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero);
575
0
      const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero);
576
577
0
      s[0] = _mm256_unpacklo_epi16(src_01, src_12);
578
0
      s[1] = _mm256_unpacklo_epi16(src_23, src_34);
579
0
      s[2] = _mm256_unpacklo_epi16(src_45, src_56);
580
0
      s[3] = _mm256_unpacklo_epi16(src_67, src_78);
581
0
      s[4] = _mm256_unpacklo_epi16(src_89, src_910);
582
583
0
      s[6] = _mm256_unpackhi_epi16(src_01, src_12);
584
0
      s[7] = _mm256_unpackhi_epi16(src_23, src_34);
585
0
      s[8] = _mm256_unpackhi_epi16(src_45, src_56);
586
0
      s[9] = _mm256_unpackhi_epi16(src_67, src_78);
587
0
      s[10] = _mm256_unpackhi_epi16(src_89, src_910);
588
589
0
      for (i = 0; i < h; i += 2) {
590
0
        data = &src_ptr[i * src_stride + j];
591
0
        const __m256i src_1011a = _mm256_permute2x128_si256(
592
0
            src10,
593
0
            _mm256_castsi128_si256(
594
0
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
595
0
            0x20);
596
597
0
        src10 = _mm256_castsi128_si256(
598
0
            _mm_loadl_epi64((__m128i *)(data + 12 * src_stride)));
599
600
0
        const __m256i src_1112a = _mm256_permute2x128_si256(
601
0
            _mm256_castsi128_si256(
602
0
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
603
0
            src10, 0x20);
604
605
0
        const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero);
606
0
        const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero);
607
608
0
        s[5] = _mm256_unpacklo_epi16(src_1011, src_1112);
609
0
        s[11] = _mm256_unpackhi_epi16(src_1011, src_1112);
610
611
0
        const __m256i res_lo = convolve_12taps(s, coeffs);
612
613
0
        const __m256i res_32b_lo = _mm256_sra_epi32(
614
0
            _mm256_add_epi32(res_lo, right_shift_const), right_shift);
615
        // 8 bit conversion and saturation to uint8
616
0
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
617
0
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
618
619
0
        if (w - j > 4) {
620
0
          const __m256i res_hi = convolve_12taps(s + 6, coeffs);
621
622
0
          const __m256i res_32b_hi = _mm256_sra_epi32(
623
0
              _mm256_add_epi32(res_hi, right_shift_const), right_shift);
624
0
          __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi);
625
          // 8 bit conversion and saturation to uint8
626
0
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
627
628
0
          __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi);
629
630
0
          const __m128i res_0 = _mm256_extracti128_si256(res_a, 0);
631
0
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
632
633
0
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
634
0
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
635
0
                           res_1);
636
0
        } else {
637
0
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
638
0
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
639
0
          if (w - j > 2) {
640
0
            xx_storel_32(&dst[i * dst_stride + j], res_0);
641
0
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
642
0
          } else {
643
0
            xx_storel_16(&dst[i * dst_stride + j], res_0);
644
0
            xx_storel_16(&dst[i * dst_stride + j + dst_stride], res_1);
645
0
          }
646
0
        }
647
0
        s[0] = s[1];
648
0
        s[1] = s[2];
649
0
        s[2] = s[3];
650
0
        s[3] = s[4];
651
0
        s[4] = s[5];
652
653
0
        s[6] = s[7];
654
0
        s[7] = s[8];
655
0
        s[8] = s[9];
656
0
        s[9] = s[10];
657
0
        s[10] = s[11];
658
0
      }
659
0
    }
660
15.3k
  } else {
661
15.3k
    assert(vert_tap == 8);
662
663
15.3k
    if (w <= 4) {
664
6.74k
      prepare_coeffs_ssse3(filter_params_y, subpel_y_qn, coeffs_128);
665
666
6.74k
      __m128i d[8], s[4], res;
667
6.74k
      if (w == 2) {
668
1.48k
        d[0] = _mm_cvtsi32_si128(loadu_int16(data + 0 * src_stride));
669
1.48k
        d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * src_stride));
670
1.48k
        d[2] = _mm_cvtsi32_si128(loadu_int16(data + 2 * src_stride));
671
1.48k
        d[3] = _mm_cvtsi32_si128(loadu_int16(data + 3 * src_stride));
672
1.48k
        d[4] = _mm_cvtsi32_si128(loadu_int16(data + 4 * src_stride));
673
1.48k
        d[5] = _mm_cvtsi32_si128(loadu_int16(data + 5 * src_stride));
674
1.48k
        d[6] = _mm_cvtsi32_si128(loadu_int16(data + 6 * src_stride));
675
676
1.48k
        const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]);
677
1.48k
        const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[2]);
678
1.48k
        const __m128i src_23a = _mm_unpacklo_epi16(d[2], d[3]);
679
1.48k
        const __m128i src_34a = _mm_unpacklo_epi16(d[3], d[4]);
680
1.48k
        const __m128i src_45a = _mm_unpacklo_epi16(d[4], d[5]);
681
1.48k
        const __m128i src_56a = _mm_unpacklo_epi16(d[5], d[6]);
682
683
1.48k
        s[0] = _mm_unpacklo_epi8(src_01a, src_12a);
684
1.48k
        s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
685
1.48k
        s[2] = _mm_unpacklo_epi8(src_45a, src_56a);
686
687
5.93k
        do {
688
5.93k
          convolve_y_8tap_2x2_ssse3(data, src_stride, coeffs_128, d, s, &res);
689
5.93k
          res = round_sr_y_ssse3(res);
690
5.93k
          pack_store_u8_2x2_sse2(res, dst_ptr, dst_stride);
691
692
5.93k
          dst_ptr += 2 * dst_stride;
693
5.93k
          data += 2 * src_stride;
694
5.93k
          y -= 2;
695
696
5.93k
          s[0] = s[1];
697
5.93k
          s[1] = s[2];
698
5.93k
          s[2] = s[3];
699
5.93k
        } while (y > 0);
700
701
5.26k
      } else {
702
5.26k
        assert(w == 4);
703
704
5.26k
        d[0] = _mm_cvtsi32_si128(loadu_int32(data + 0 * src_stride));
705
5.26k
        d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * src_stride));
706
5.26k
        d[2] = _mm_cvtsi32_si128(loadu_int32(data + 2 * src_stride));
707
5.26k
        d[3] = _mm_cvtsi32_si128(loadu_int32(data + 3 * src_stride));
708
5.26k
        d[4] = _mm_cvtsi32_si128(loadu_int32(data + 4 * src_stride));
709
5.26k
        d[5] = _mm_cvtsi32_si128(loadu_int32(data + 5 * src_stride));
710
5.26k
        d[6] = _mm_cvtsi32_si128(loadu_int32(data + 6 * src_stride));
711
712
5.26k
        const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]);
713
5.26k
        const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[2]);
714
5.26k
        const __m128i src_23a = _mm_unpacklo_epi32(d[2], d[3]);
715
5.26k
        const __m128i src_34a = _mm_unpacklo_epi32(d[3], d[4]);
716
5.26k
        const __m128i src_45a = _mm_unpacklo_epi32(d[4], d[5]);
717
5.26k
        const __m128i src_56a = _mm_unpacklo_epi32(d[5], d[6]);
718
719
5.26k
        s[0] = _mm_unpacklo_epi8(src_01a, src_12a);
720
5.26k
        s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
721
5.26k
        s[2] = _mm_unpacklo_epi8(src_45a, src_56a);
722
723
30.0k
        do {
724
30.0k
          convolve_y_8tap_4x2_ssse3(data, src_stride, coeffs_128, d, s, &res);
725
30.0k
          res = round_sr_y_ssse3(res);
726
30.0k
          pack_store_u8_4x2_sse2(res, dst_ptr, dst_stride);
727
728
30.0k
          dst_ptr += 2 * dst_stride;
729
30.0k
          data += 2 * src_stride;
730
30.0k
          y -= 2;
731
732
30.0k
          s[0] = s[1];
733
30.0k
          s[1] = s[2];
734
30.0k
          s[2] = s[3];
735
30.0k
        } while (y > 0);
736
5.26k
      }
737
8.64k
    } else {
738
8.64k
      prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
739
740
8.64k
      if (w == 8) {
741
4.40k
        __m128i d[8];
742
4.40k
        __m256i s[4];
743
744
4.40k
        d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
745
4.40k
        d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
746
4.40k
        d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
747
4.40k
        d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
748
4.40k
        d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
749
4.40k
        d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride));
750
4.40k
        d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
751
752
4.40k
        const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
753
4.40k
        const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]);
754
4.40k
        const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
755
4.40k
        const __m256i src_34a = _mm256_setr_m128i(d[3], d[4]);
756
4.40k
        const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]);
757
4.40k
        const __m256i src_56a = _mm256_setr_m128i(d[5], d[6]);
758
759
4.40k
        s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
760
4.40k
        s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
761
4.40k
        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
762
763
30.5k
        do {
764
30.5k
          __m256i res;
765
30.5k
          convolve_y_8tap_8x2_avx2(data, src_stride, coeffs, d, s, &res);
766
30.5k
          round_pack_store_y_8x2_avx2(res, dst_ptr, dst_stride);
767
768
30.5k
          dst_ptr += 2 * dst_stride;
769
30.5k
          data += 2 * src_stride;
770
30.5k
          y -= 2;
771
772
30.5k
          s[0] = s[1];
773
30.5k
          s[1] = s[2];
774
30.5k
          s[2] = s[3];
775
30.5k
        } while (y > 0);
776
777
4.40k
      } else {
778
4.24k
        assert(!(w % 16));
779
780
4.24k
        __m128i d[8];
781
4.24k
        __m256i s[8];
782
6.62k
        do {
783
6.62k
          data = src_ptr + x;
784
6.62k
          dst_ptr = dst + x;
785
6.62k
          y = h;
786
787
6.62k
          d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
788
6.62k
          d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
789
6.62k
          d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
790
6.62k
          d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
791
6.62k
          d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
792
6.62k
          d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
793
6.62k
          d[6] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
794
795
6.62k
          const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
796
6.62k
          const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]);
797
6.62k
          const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
798
6.62k
          const __m256i src_34a = _mm256_setr_m128i(d[3], d[4]);
799
6.62k
          const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]);
800
6.62k
          const __m256i src_56a = _mm256_setr_m128i(d[5], d[6]);
801
802
6.62k
          s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
803
6.62k
          s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
804
6.62k
          s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
805
806
6.62k
          s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
807
6.62k
          s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
808
6.62k
          s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
809
810
94.4k
          do {
811
94.4k
            __m256i res[2];
812
94.4k
            convolve_y_8tap_16x2_avx2(data, src_stride, coeffs, d, s, res);
813
94.4k
            round_pack_store_y_16x2_avx2(res, dst_ptr, dst_stride);
814
815
94.4k
            dst_ptr += 2 * dst_stride;
816
94.4k
            data += 2 * src_stride;
817
94.4k
            y -= 2;
818
819
94.4k
            s[0] = s[1];
820
94.4k
            s[1] = s[2];
821
94.4k
            s[2] = s[3];
822
823
94.4k
            s[4] = s[5];
824
94.4k
            s[5] = s[6];
825
94.4k
            s[6] = s[7];
826
94.4k
          } while (y > 0);
827
828
6.62k
          x += 16;
829
6.62k
        } while (x < w);
830
4.24k
      }
831
8.64k
    }
832
15.3k
  }
833
503k
}
834
835
/*
 * av1_convolve_x_sr_avx2: horizontal filtering of a w x h block of 8-bit
 * pixels read from src (row pitch src_stride) and written to dst (row pitch
 * dst_stride).
 *
 * horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn) selects one of five
 * branches (4, 6, 8, 12, otherwise 2 taps); each branch is further specialised
 * by block width w. Before filtering, every branch backs the source pointer
 * up by (taps / 2 - 1) bytes: 1, 2, 3, fo_horiz and 0 respectively.
 *
 * conv_params is read only by the asserts and, through `bits` and round_0, by
 * the explicit rounding of the 12-tap branch. The other branches delegate
 * rounding to helpers defined outside this listing (round_sr_x_avx2,
 * round_sr_x_ssse3, round_pack_store_16x2_avx2, ...); their exact rounding is
 * not visible here.
 *
 * Some width cases index rows with i, others advance the by-value parameters
 * dst and h directly; a given call only ever runs one of these loops.
 *
 * NOTE: the bare numeric lines interleaved with the statements are the
 * coverage listing's line numbers and hit counts, not C source.
 */
void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
836
                            uint8_t *dst, int32_t dst_stride, int32_t w,
837
                            int32_t h,
838
                            const InterpFilterParams *filter_params_x,
839
                            const int32_t subpel_x_qn,
840
409k
                            ConvolveParams *conv_params) {
841
409k
  const int bits = FILTER_BITS - conv_params->round_0;
842
409k
  int i, j, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
843
844
409k
  assert(bits >= 0);
845
409k
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
846
409k
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
847
409k
  assert(conv_params->round_0 > 0);
848
849
409k
  assert(horiz_tap == 2 || horiz_tap == 4 || horiz_tap == 6 || horiz_tap == 8 ||
850
409k
         horiz_tap == 12);
851
409k
  // NOTE(review): as written this passes when EITHER condition holds (any even
  // w, or any w <= 128 including odd ones). If both are meant to be required
  // the operator should be '&&' -- confirm with the author.
  assert((!(w % 2)) || (w <= 128));
852
409k
  // The two-rows-per-iteration loops below rely on h being even.
  assert((h % 2) == 0);
853
854
409k
  __m256i coeffs[6] = { 0 }, filt[4] = { 0 };
855
409k
  __m128i coeffs_128[4] = { 0 };
856
857
409k
  // i is the row index used by every loop that addresses rows as i * stride.
  i = 0;
858
  // horz_filt as 4 tap
859
409k
  if (horiz_tap == 4) {
860
    // since fo_horiz = 1
861
154k
    const uint8_t *src_ptr = src - 1;
862
154k
    if (w == 2) {
863
24.7k
      prepare_coeffs_4t_ssse3(filter_params_x, subpel_x_qn, coeffs_128);
864
64.2k
      do {
865
64.2k
        const __m128i res =
866
64.2k
            convolve_x_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
867
64.2k
        const __m128i reg = round_sr_x_ssse3(res);
868
64.2k
        pack_store_u8_2x2_sse2(reg, dst, dst_stride);
869
64.2k
        src_ptr += 2 * src_stride;
870
64.2k
        dst += 2 * dst_stride;
871
64.2k
        h -= 2;
872
64.2k
      } while (h);
873
129k
    } else if (w == 4) {
874
116k
      prepare_coeffs_4t_ssse3(filter_params_x, subpel_x_qn, coeffs_128);
875
383k
      do {
876
383k
        const __m128i reg =
877
383k
            convolve_x_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
878
383k
        const __m128i res = round_sr_x_ssse3(reg);
879
383k
        pack_store_u8_4x2_sse2(res, dst, dst_stride);
880
383k
        src_ptr += 2 * src_stride;
881
383k
        dst += 2 * dst_stride;
882
383k
        h -= 2;
883
383k
      } while (h);
884
116k
    } else if (w == 8) {
885
6.99k
      prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
886
6.99k
      filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
887
6.99k
      filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
888
25.9k
      do {
889
25.9k
        // Two rows per iteration: row i in the low 128-bit lane, row i + 1 in
        // the high lane.
        const __m256i data = _mm256_setr_m128i(
890
25.9k
            _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride])),
891
25.9k
            _mm_loadu_si128(
892
25.9k
                (__m128i *)(&src_ptr[i * src_stride + src_stride])));
893
894
25.9k
        // coeffs + 1: presumably skips the leading coefficient pair, which
        // would be zero for a 4-tap kernel -- confirm against
        // prepare_coeffs_lowbd.
        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
895
896
25.9k
        res_16b = round_sr_x_avx2(res_16b);
897
898
        /* rounding code */
899
        // 8 bit conversion and saturation to uint8
900
25.9k
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
901
902
25.9k
        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
903
25.9k
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
904
905
25.9k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
906
25.9k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
907
25.9k
        i += 2;
908
25.9k
      } while (i < h);
909
6.99k
    } else {
910
5.53k
      assert(!(w % 16));
911
5.53k
      prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
912
5.53k
      filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
913
5.53k
      filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
914
121k
      // One row per outer iteration, 16 output pixels per inner iteration.
      do {
915
121k
        j = 0;
916
399k
        do {
917
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
918
          // 18 19 20 21 22 23
919
399k
          const __m256i data = _mm256_inserti128_si256(
920
399k
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
921
399k
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
922
399k
              1);
923
924
399k
          __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
925
926
399k
          res_16b = round_sr_x_avx2(res_16b);
927
928
          /* rounding code */
929
          // 8 bit conversion and saturation to uint8
930
399k
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
931
932
          // Store values into the destination buffer
933
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
934
399k
          // 216 == 0xD8 orders the 64-bit quarters as 0,2,1,3, so the low
          // 128 bits end up holding the packed bytes of both lanes.
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
935
399k
          __m128i res = _mm256_castsi256_si128(res_8b);
936
399k
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
937
399k
          j += 16;
938
399k
        } while (j < w);
939
121k
        i++;
940
121k
      } while (i < h);
941
5.53k
    }
942
255k
  } else if (horiz_tap == 6) {
943
    // since (horiz_tap/2 - 1 == 2)
944
202k
    const uint8_t *src_ptr = src - 2;
945
202k
    prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs);
946
202k
    filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
947
202k
    filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
948
202k
    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
949
202k
    // No w == 2 / w == 4 case in this branch; anything that is not 8, 16, 32
    // or 64 falls through to the assert(w == 128) case.
    if (w == 8) {
950
396k
      do {
951
396k
        const __m256i data = _mm256_setr_m128i(
952
396k
            _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride])),
953
396k
            _mm_loadu_si128(
954
396k
                (__m128i *)(&src_ptr[i * src_stride + src_stride])));
955
956
396k
        __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);
957
958
396k
        res_16b = round_sr_x_avx2(res_16b);
959
960
        /* rounding code */
961
        // 8 bit conversion and saturation to uint8
962
396k
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
963
964
396k
        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
965
396k
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
966
396k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
967
396k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
968
396k
        i += 2;
969
396k
      } while (i < h);
970
109k
    } else if (w == 16) {
971
345k
      do {
972
345k
        __m256i data[2] = { 0 };
973
974
345k
        load_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs, filt, data);
975
345k
        round_pack_store_16x2_avx2(data, dst, dst_stride);
976
345k
        src_ptr += 2 * src_stride;
977
345k
        dst += 2 * dst_stride;
978
345k
        h -= 2;
979
345k
      } while (h);
980
68.9k
    } else if (w == 32) {
981
366k
      // From here on: one row per iteration, one helper call per chunk of the
      // row (presumably 32 pixels each, going by the helper name -- confirm).
      do {
982
366k
        convolve_sr_store_6tap_32_avx2(src_ptr, coeffs, filt, dst);
983
366k
        src_ptr += src_stride;
984
366k
        dst += dst_stride;
985
366k
      } while ((--h) > 0);
986
18.6k
    } else if (w == 64) {
987
210k
      do {
988
210k
        convolve_sr_store_6tap_32_avx2(src_ptr, coeffs, filt, dst);
989
210k
        convolve_sr_store_6tap_32_avx2(src_ptr + 32, coeffs, filt, dst + 32);
990
210k
        src_ptr += src_stride;
991
210k
        dst += dst_stride;
992
210k
      } while ((--h) > 0);
993
4.17k
    } else {
994
1.02k
      assert(w == 128);
995
996
118k
      // SECOND/THIRD/FOURTH_32_BLK are defined elsewhere; presumably the byte
      // offsets 32, 64 and 96 -- confirm.
      do {
997
118k
        convolve_sr_store_6tap_32_avx2(src_ptr, coeffs, filt, dst);
998
118k
        convolve_sr_store_6tap_32_avx2(src_ptr + SECOND_32_BLK, coeffs, filt,
999
118k
                                       dst + SECOND_32_BLK);
1000
118k
        convolve_sr_store_6tap_32_avx2(src_ptr + THIRD_32_BLK, coeffs, filt,
1001
118k
                                       dst + THIRD_32_BLK);
1002
118k
        convolve_sr_store_6tap_32_avx2(src_ptr + FOURTH_32_BLK, coeffs, filt,
1003
118k
                                       dst + FOURTH_32_BLK);
1004
118k
        src_ptr += src_stride;
1005
118k
        dst += dst_stride;
1006
118k
      } while ((--h) > 0);
1007
1.03k
    }
1008
202k
  } else if (horiz_tap == 8) {
1009
    // since (horiz_tap / 2 - 1) == 3
1010
14.9k
    const uint8_t *src_ptr = src - 3;
1011
14.9k
    prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
1012
14.9k
    filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
1013
14.9k
    filt[1] =
1014
14.9k
        _mm256_load_si256((__m256i const *)(filt_global_avx2 + SECOND_32_BLK));
1015
14.9k
    filt[2] =
1016
14.9k
        _mm256_load_si256((__m256i const *)(filt_global_avx2 + THIRD_32_BLK));
1017
14.9k
    filt[3] =
1018
14.9k
        _mm256_load_si256((__m256i const *)(filt_global_avx2 + FOURTH_32_BLK));
1019
1020
14.9k
    // Same width dispatch as the 6-tap branch, using the 8-tap helpers.
    if (w == 8) {
1021
30.3k
      do {
1022
30.3k
        const __m256i data = _mm256_setr_m128i(
1023
30.3k
            _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride])),
1024
30.3k
            _mm_loadu_si128(
1025
30.3k
                (__m128i *)(&src_ptr[i * src_stride + src_stride])));
1026
1027
30.3k
        __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
1028
1029
30.3k
        res_16b = round_sr_x_avx2(res_16b);
1030
1031
        /* rounding code */
1032
        // 8 bit conversion and saturation to uint8
1033
30.3k
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
1034
1035
30.3k
        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
1036
30.3k
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
1037
30.3k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
1038
30.3k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
1039
30.3k
        i += 2;
1040
30.3k
      } while (i < h);
1041
7.70k
    } else if (w == 16) {
1042
28.6k
      do {
1043
28.6k
        __m256i data[2] = { 0 };
1044
1045
28.6k
        load_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs, filt, data);
1046
28.6k
        round_pack_store_16x2_avx2(data, dst, dst_stride);
1047
28.6k
        src_ptr += 2 * src_stride;
1048
28.6k
        dst += 2 * dst_stride;
1049
28.6k
        h -= 2;
1050
28.6k
      } while (h);
1051
4.93k
    } else if (w == 32) {
1052
37.6k
      do {
1053
37.6k
        load_convolve_round_8tap_32_avx2(src_ptr, coeffs, filt, dst);
1054
37.6k
        src_ptr += src_stride;
1055
37.6k
        dst += dst_stride;
1056
37.6k
      } while ((--h) > 0);
1057
1.56k
    } else if (w == 64) {
1058
27.8k
      do {
1059
27.8k
        load_convolve_round_8tap_32_avx2(src_ptr, coeffs, filt, dst);
1060
27.8k
        load_convolve_round_8tap_32_avx2(src_ptr + 32, coeffs, filt, dst + 32);
1061
27.8k
        src_ptr += src_stride;
1062
27.8k
        dst += dst_stride;
1063
27.8k
      } while ((--h) > 0);
1064
557
    } else {
1065
157
      assert(w == 128);
1066
16.0k
      do {
1067
16.0k
        load_convolve_round_8tap_32_avx2(src_ptr, coeffs, filt, dst);
1068
16.0k
        load_convolve_round_8tap_32_avx2(src_ptr + SECOND_32_BLK, coeffs, filt,
1069
16.0k
                                         dst + SECOND_32_BLK);
1070
16.0k
        load_convolve_round_8tap_32_avx2(src_ptr + THIRD_32_BLK, coeffs, filt,
1071
16.0k
                                         dst + THIRD_32_BLK);
1072
16.0k
        load_convolve_round_8tap_32_avx2(src_ptr + FOURTH_32_BLK, coeffs, filt,
1073
16.0k
                                         dst + FOURTH_32_BLK);
1074
16.0k
        src_ptr += src_stride;
1075
16.0k
        dst += dst_stride;
1076
16.0k
      } while ((--h) > 0);
1077
158
    }
1078
38.4k
  } else if (horiz_tap == 12) {  // horiz_tap == 12
1079
0
    // NOTE(review): every line of this 12-tap branch has a hit count of 0 in
    // this coverage run, i.e. it was never exercised here.
    //
    // Unlike the other branches, this one widens the pixels to 16 bits,
    // accumulates in 32 bits and applies both rounding stages explicitly:
    // first by round_0, then by bits (= FILTER_BITS - round_0).
    const int fo_horiz = filter_params_x->taps / 2 - 1;
1080
0
    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs);
1081
0
    const __m128i round_shift = _mm_cvtsi32_si128(bits);
1082
0
    const uint8_t *const src_ptr = src - fo_horiz;
1083
0
    const __m256i v_zero = _mm256_setzero_si256();
1084
0
    __m256i round_0_const =
1085
0
        _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1);
1086
0
    __m256i round_const = _mm256_set1_epi32((1 << bits) >> 1);
1087
0
    __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
1088
0
    __m256i s[6] = { 0 };
1089
1090
0
    if (w <= 4) {
1091
0
      // Two rows per iteration (row i in the low lane, row i + 1 in the high
      // lane), four output pixels per row.
      do {
1092
0
        const __m256i data = _mm256_permute2x128_si256(
1093
0
            _mm256_castsi128_si256(
1094
0
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
1095
0
            _mm256_castsi128_si256(_mm_loadu_si128(
1096
0
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
1097
0
            0x20);
1098
        // row0 0..7 row1 0..7
1099
0
        const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
1100
        // row0 8..F row1 8..F
1101
0
        const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
1102
1103
        // row0 00 00 01 01 .. 03 03 row1 00 00 01 01 .. 03 03
1104
0
        const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
1105
        // row0 04 04 .. 07 07 row1 04 04 .. 07 07
1106
0
        const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
1107
1108
        // row0 08 08 09 09 .. 0B 0B row1 08 08 09 09 .. 0B 0B
1109
0
        const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
1110
        // row0 0C 0C .. 0F 0F row1 0C 0C .. 0F 0F
1111
0
        const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
1112
1113
        // 00 01 01 02 02 03 03 04 10 11 11 12 12 13 13 14
1114
0
        s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
1115
        // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
1116
0
        s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
1117
        // 04 05 05 06 06 07 07 08 14 15 15 16 16 17 17 18
1118
0
        s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
1119
        // 06 07 07 08 08 09 09 0A 16 17 17 18 18 19 19 1A
1120
0
        s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
1121
        // 08 09 09 0A 0A 0B 0B 0C 18 19 19 1A 1A 1B 1B 1C
1122
0
        s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
1123
        // 0A 0B 0B 0C 0C 0D 0D 0E 1A 1B 1B 1C 1C 1D 1D 1E
1124
0
        s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
1125
1126
0
        const __m256i res_lo = convolve_12taps(s, coeffs);
1127
1128
0
        __m256i res_32b_lo = _mm256_sra_epi32(
1129
0
            _mm256_add_epi32(res_lo, round_0_const), round_0_shift);
1130
1131
        // 00 01 02 03 10 11 12 13
1132
0
        res_32b_lo = _mm256_sra_epi32(_mm256_add_epi32(res_32b_lo, round_const),
1133
0
                                      round_shift);
1134
        // 8 bit conversion and saturation to uint8
1135
        // 00 01 02 03 00 01 02 03 10 11 12 13 10 11 12 13
1136
0
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
1137
        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
1138
        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
1139
0
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
1140
1141
        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
1142
0
        const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
1143
        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
1144
0
        const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
1145
0
        if (w > 2) {
1146
          // 00 01 02 03
1147
0
          xx_storel_32(&dst[i * dst_stride], res_0);
1148
          // 10 11 12 13
1149
0
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
1150
0
        } else {
1151
          // 00 01
1152
0
          xx_storel_16(&dst[i * dst_stride], res_0);
1153
          // 10 11
1154
0
          xx_storel_16(&dst[i * dst_stride + dst_stride], res_1);
1155
0
        }
1156
0
        i += 2;
1157
0
      } while (i < h);
1158
0
    } else {
1159
0
      assert(!(w % 8));
1160
0
      // One row per outer iteration, eight output pixels per inner iteration:
      // the low lane is loaded from column j and the high lane from column
      // j + 4 of the same row, and the results are stored at j and j + 4.
      do {
1161
0
        j = 0;
1162
0
        do {
1163
0
          const __m256i data = _mm256_permute2x128_si256(
1164
0
              _mm256_castsi128_si256(
1165
0
                  _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
1166
0
              _mm256_castsi128_si256(_mm_loadu_si128(
1167
0
                  (__m128i *)(&src_ptr[i * src_stride + j + 4]))),
1168
0
              0x20);
1169
          // row0 0..7 4..B
1170
0
          const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
1171
          // row0 8..F C..13
1172
0
          const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
1173
1174
          // row0 00 00 01 01 .. 03 03 04 04 05 05 .. 07 07
1175
0
          const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
1176
          // row0 04 04 .. 07 07 08 08 .. 0B 0B
1177
0
          const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
1178
1179
          // row0 08 08 09 09 .. 0B 0B 0C 0C 0D 0D .. 0F 0F
1180
0
          const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
1181
          // row0 0C 0C 0D 0D .. 0F 0F 10 10 11 11 .. 13 13
1182
0
          const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
1183
1184
0
          s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
1185
0
          s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
1186
0
          s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
1187
0
          s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
1188
0
          s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
1189
0
          s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
1190
1191
0
          const __m256i res_lo = convolve_12taps(s, coeffs);
1192
1193
0
          __m256i res_32b_lo = _mm256_sra_epi32(
1194
0
              _mm256_add_epi32(res_lo, round_0_const), round_0_shift);
1195
1196
0
          res_32b_lo = _mm256_sra_epi32(
1197
0
              _mm256_add_epi32(res_32b_lo, round_const), round_shift);
1198
          // 8 bit conversion and saturation to uint8
1199
0
          __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
1200
0
          __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
1201
0
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
1202
0
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
1203
0
          xx_storel_32(&dst[i * dst_stride + j], res_0);
1204
0
          xx_storel_32(&dst[i * dst_stride + j + 4], res_1);
1205
1206
0
          j += 8;
1207
0
        } while (j < w);
1208
0
        i++;
1209
0
      } while (i < h);
1210
0
    }
1211
38.4k
  } else {
1212
38.4k
    assert(horiz_tap == 2);
1213
    // since (filter_params_x->taps / 2 - 1) == 0
1214
38.4k
    const uint8_t *src_ptr = src;
1215
38.4k
    // subpel_x_qn == 8 is special-cased in the else branch further down as a
    // rounded average of each pixel with its right-hand neighbour
    // (_mm_avg_epu8), with no coefficient multiply; every other position goes
    // through the generic 2-tap helpers.
    if (subpel_x_qn != 8) {
1216
14.9k
      if (w <= 8) {
1217
11.1k
        prepare_coeffs_2t_ssse3(filter_params_x, subpel_x_qn, coeffs_128);
1218
1219
11.1k
        if (w == 2) {
1220
3.11k
          do {
1221
3.11k
            const __m128i data =
1222
3.11k
                convolve_x_2tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
1223
3.11k
            const __m128i reg = round_sr_x_ssse3(data);
1224
3.11k
            pack_store_u8_2x2_sse2(reg, dst, dst_stride);
1225
3.11k
            src_ptr += 2 * src_stride;
1226
3.11k
            dst += 2 * dst_stride;
1227
3.11k
            h -= 2;
1228
3.11k
          } while (h);
1229
9.79k
        } else if (w == 4) {
1230
15.1k
          do {
1231
15.1k
            const __m128i data =
1232
15.1k
                convolve_x_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
1233
15.1k
            const __m128i reg = round_sr_x_ssse3(data);
1234
15.1k
            pack_store_u8_4x2_sse2(reg, dst, dst_stride);
1235
15.1k
            src_ptr += 2 * src_stride;
1236
15.1k
            dst += 2 * dst_stride;
1237
15.1k
            h -= 2;
1238
15.1k
          } while (h);
1239
5.05k
        } else {
1240
4.73k
          assert(w == 8);
1241
1242
17.6k
          do {
1243
17.6k
            __m128i data[2] = { 0 };
1244
1245
17.6k
            convolve_x_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, data);
1246
17.6k
            data[0] = round_sr_x_ssse3(data[0]);
1247
17.6k
            data[1] = round_sr_x_ssse3(data[1]);
1248
17.6k
            // Both rows are packed into one register: low 8 bytes go to the
            // first row, high 8 bytes to the second.
            const __m128i reg = _mm_packus_epi16(data[0], data[1]);
1249
17.6k
            _mm_storel_epi64((__m128i *)dst, reg);
1250
17.6k
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), reg);
1251
1252
17.6k
            src_ptr += 2 * src_stride;
1253
17.6k
            dst += 2 * dst_stride;
1254
17.6k
            h -= 2;
1255
17.6k
          } while (h);
1256
4.73k
        }
1257
11.1k
      } else {
1258
3.77k
        prepare_coeffs_2t_lowbd(filter_params_x, subpel_x_qn, coeffs);
1259
1260
3.77k
        if (w == 16) {
1261
13.6k
          do {
1262
13.6k
            __m256i data[2] = { 0 };
1263
1264
13.6k
            convolve_x_2tap_16x2_avx2(src_ptr, src_stride, coeffs, data);
1265
13.6k
            round_pack_store_16x2_avx2(data, dst, dst_stride);
1266
13.6k
            src_ptr += 2 * src_stride;
1267
13.6k
            dst += 2 * dst_stride;
1268
13.6k
            h -= 2;
1269
13.6k
          } while (h);
1270
2.21k
        } else if (w == 32) {
1271
22.0k
          do {
1272
22.0k
            convolve_round_2tap_32_avx2(src_ptr, coeffs, dst);
1273
22.0k
            src_ptr += src_stride;
1274
22.0k
            dst += dst_stride;
1275
22.0k
          } while ((--h) > 0);
1276
829
        } else if (w == 64) {
1277
24.9k
          do {
1278
24.9k
            convolve_round_2tap_32_avx2(src_ptr, coeffs, dst);
1279
24.9k
            convolve_round_2tap_32_avx2(src_ptr + SECOND_32_BLK, coeffs,
1280
24.9k
                                        dst + SECOND_32_BLK);
1281
24.9k
            src_ptr += src_stride;
1282
24.9k
            dst += dst_stride;
1283
24.9k
          } while ((--h) > 0);
1284
592
        } else {
1285
140
          assert(w == 128);
1286
1287
10.5k
          do {
1288
10.5k
            convolve_round_2tap_32_avx2(src_ptr, coeffs, dst);
1289
10.5k
            convolve_round_2tap_32_avx2(src_ptr + (SECOND_32_BLK), coeffs,
1290
10.5k
                                        dst + (SECOND_32_BLK));
1291
10.5k
            convolve_round_2tap_32_avx2(src_ptr + (THIRD_32_BLK), coeffs,
1292
10.5k
                                        dst + (THIRD_32_BLK));
1293
10.5k
            convolve_round_2tap_32_avx2(src_ptr + (FOURTH_32_BLK), coeffs,
1294
10.5k
                                        dst + (FOURTH_32_BLK));
1295
10.5k
            src_ptr += src_stride;
1296
10.5k
            dst += dst_stride;
1297
10.5k
          } while ((--h) > 0);
1298
140
        }
1299
3.77k
      }
1300
23.5k
    } else {
1301
23.5k
      if (w == 2) {
1302
7.73k
        do {
1303
7.73k
          __m128i data = load_x_u8_4x2_sse4(src_ptr, src_stride);
1304
7.73k
          // Shift by one byte so each pixel lines up with its right neighbour.
          const __m128i reg1 = _mm_srli_si128(data, 1);
1305
7.73k
          const __m128i reg2 = _mm_avg_epu8(data, reg1);
1306
7.73k
          xx_storel_16(dst, reg2);
1307
7.73k
          {
1308
7.73k
            // Second row: 16-bit element 2 of reg2, copied with memcpy so the
            // store makes no alignment assumption about dst.
            uint16_t val = (uint16_t)_mm_extract_epi16(reg2, 2);
1309
7.73k
            memcpy(dst + dst_stride, &val, sizeof(val));
1310
7.73k
          }
1311
7.73k
          src_ptr += 2 * src_stride;
1312
7.73k
          dst += 2 * dst_stride;
1313
7.73k
          h -= 2;
1314
7.73k
        } while (h);
1315
20.0k
      } else if (w == 4) {
1316
23.5k
        do {
1317
23.5k
          __m128i data = load_8bit_8x2_to_1_reg_sse2(
1318
23.5k
              src_ptr, (int)(sizeof(*src_ptr) * src_stride));
1319
23.5k
          const __m128i reg1 = _mm_srli_si128(data, 1);
1320
23.5k
          const __m128i reg2 = _mm_avg_epu8(data, reg1);
1321
23.5k
          xx_storel_32(dst, reg2);
1322
23.5k
          {
1323
23.5k
            // Second row: 32-bit element 2 of reg2.
            int32_t val = _mm_extract_epi32(reg2, 2);
1324
23.5k
            memcpy(dst + dst_stride, &val, sizeof(val));
1325
23.5k
          }
1326
1327
23.5k
          src_ptr += 2 * src_stride;
1328
23.5k
          dst += 2 * dst_stride;
1329
23.5k
          h -= 2;
1330
23.5k
        } while (h);
1331
11.6k
      } else if (w == 8) {
1332
26.7k
        do {
1333
26.7k
          // 16 bytes are loaded per row although only 9 feed the 8 outputs.
          const __m128i data00 = _mm_loadu_si128((__m128i *)src_ptr);
1334
26.7k
          const __m128i data10 =
1335
26.7k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
1336
26.7k
          const __m128i data01 = _mm_srli_si128(data00, 1);
1337
26.7k
          const __m128i data11 = _mm_srli_si128(data10, 1);
1338
26.7k
          const __m128i reg0 = _mm_avg_epu8(data00, data01);
1339
26.7k
          const __m128i reg1 = _mm_avg_epu8(data10, data11);
1340
26.7k
          _mm_storel_epi64((__m128i *)dst, reg0);
1341
26.7k
          _mm_storel_epi64((__m128i *)(dst + dst_stride), reg1);
1342
1343
26.7k
          src_ptr += 2 * src_stride;
1344
26.7k
          dst += 2 * dst_stride;
1345
26.7k
          h -= 2;
1346
26.7k
        } while (h);
1347
6.95k
      } else if (w == 16) {
1348
20.2k
        do {
1349
20.2k
          // Here the right-hand neighbours come from a second unaligned load
          // at src_ptr + 1 rather than from a byte shift.
          const __m128i data00 = _mm_loadu_si128((__m128i *)src_ptr);
1350
20.2k
          const __m128i data01 = _mm_loadu_si128((__m128i *)(src_ptr + 1));
1351
20.2k
          const __m128i data10 =
1352
20.2k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
1353
20.2k
          const __m128i data11 =
1354
20.2k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1));
1355
20.2k
          const __m128i reg0 = _mm_avg_epu8(data00, data01);
1356
20.2k
          const __m128i reg1 = _mm_avg_epu8(data10, data11);
1357
20.2k
          _mm_storeu_si128((__m128i *)dst, reg0);
1358
20.2k
          _mm_storeu_si128((__m128i *)(dst + dst_stride), reg1);
1359
1360
20.2k
          src_ptr += 2 * src_stride;
1361
20.2k
          dst += 2 * dst_stride;
1362
20.2k
          h -= 2;
1363
20.2k
        } while (h);
1364
3.25k
      } else if (w == 32) {
1365
21.5k
        do {
1366
21.5k
          load_avg_store_2tap_32_avx2(src_ptr, dst);
1367
21.5k
          src_ptr += src_stride;
1368
21.5k
          dst += dst_stride;
1369
21.5k
        } while ((--h) > 0);
1370
1.00k
      } else if (w == 64) {
1371
13.5k
        do {
1372
13.5k
          load_avg_store_2tap_32_avx2(src_ptr, dst);
1373
13.5k
          load_avg_store_2tap_32_avx2(src_ptr + (SECOND_32_BLK),
1374
13.5k
                                      dst + (SECOND_32_BLK));
1375
13.5k
          src_ptr += src_stride;
1376
13.5k
          dst += dst_stride;
1377
13.5k
        } while ((--h) > 0);
1378
266
      } else {
1379
144
        assert(w == 128);
1380
1381
12.8k
        do {
1382
12.8k
          load_avg_store_2tap_32_avx2(src_ptr, dst);
1383
12.8k
          load_avg_store_2tap_32_avx2(src_ptr + (SECOND_32_BLK),
1384
12.8k
                                      dst + (SECOND_32_BLK));
1385
12.8k
          load_avg_store_2tap_32_avx2(src_ptr + (THIRD_32_BLK),
1386
12.8k
                                      dst + (THIRD_32_BLK));
1387
12.8k
          load_avg_store_2tap_32_avx2(src_ptr + (FOURTH_32_BLK),
1388
12.8k
                                      dst + (FOURTH_32_BLK));
1389
12.8k
          src_ptr += src_stride;
1390
12.8k
          dst += dst_stride;
1391
12.8k
        } while ((--h) > 0);
1392
144
      }
1393
23.5k
    }
1394
38.4k
  }
1395
409k
}