Coverage Report

Created: 2025-07-23 06:32

/src/aom/third_party/SVT-AV1/convolve_2d_avx2.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#ifndef THIRD_PARTY_SVT_AV1_CONVOLVE_2D_AVX2_H_
13
#define THIRD_PARTY_SVT_AV1_CONVOLVE_2D_AVX2_H_
14
15
#include "convolve_avx2.h"
16
17
static void convolve_2d_sr_hor_2tap_avx2(
18
    const uint8_t *const src, const int32_t src_stride, const int32_t w,
19
    const int32_t h, const InterpFilterParams *const filter_params_x,
20
74.4k
    const int32_t subpel_x_q4, int16_t *const im_block) {
21
74.4k
  const uint8_t *src_ptr = src;
22
74.4k
  int32_t y = h;
23
74.4k
  int16_t *im = im_block;
24
25
74.4k
  if (w <= 8) {
26
59.6k
    __m128i coeffs_128;
27
28
59.6k
    prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4, &coeffs_128);
29
30
59.6k
    if (w == 2) {
31
28.0k
      do {
32
28.0k
        const __m128i r =
33
28.0k
            x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, &coeffs_128);
34
28.0k
        xy_x_round_store_2x2_sse2(r, im);
35
28.0k
        src_ptr += 2 * src_stride;
36
28.0k
        im += 2 * 2;
37
28.0k
        y -= 2;
38
28.0k
      } while (y);
39
50.5k
    } else if (w == 4) {
40
113k
      do {
41
113k
        const __m128i r =
42
113k
            x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, &coeffs_128);
43
113k
        xy_x_round_store_4x2_sse2(r, im);
44
113k
        src_ptr += 2 * src_stride;
45
113k
        im += 2 * 4;
46
113k
        y -= 2;
47
113k
      } while (y);
48
28.2k
    } else {
49
22.3k
      assert(w == 8);
50
51
103k
      do {
52
103k
        __m128i r[2];
53
54
103k
        x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, &coeffs_128, r);
55
103k
        xy_x_round_store_8x2_sse2(r, im);
56
103k
        src_ptr += 2 * src_stride;
57
103k
        im += 2 * 8;
58
103k
        y -= 2;
59
103k
      } while (y);
60
22.3k
    }
61
59.6k
  } else {
62
14.7k
    __m256i coeffs_256;
63
64
14.7k
    prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, &coeffs_256);
65
66
14.7k
    if (w == 16) {
67
59.3k
      do {
68
59.3k
        __m256i r[2];
69
70
59.3k
        x_convolve_2tap_16x2_avx2(src_ptr, src_stride, &coeffs_256, r);
71
59.3k
        xy_x_round_store_32_avx2(r, im);
72
59.3k
        src_ptr += 2 * src_stride;
73
59.3k
        im += 2 * 16;
74
59.3k
        y -= 2;
75
59.3k
      } while (y);
76
9.58k
    } else if (w == 32) {
77
75.5k
      do {
78
75.5k
        xy_x_2tap_32_avx2(src_ptr, &coeffs_256, im);
79
75.5k
        src_ptr += src_stride;
80
75.5k
        im += 32;
81
75.5k
      } while (--y);
82
3.08k
    } else if (w == 64) {
83
87.3k
      do {
84
87.3k
        xy_x_2tap_32_avx2(src_ptr + 0 * 32, &coeffs_256, im + 0 * 32);
85
87.3k
        xy_x_2tap_32_avx2(src_ptr + 1 * 32, &coeffs_256, im + 1 * 32);
86
87.3k
        src_ptr += src_stride;
87
87.3k
        im += 64;
88
87.3k
      } while (--y);
89
1.71k
    } else {
90
418
      assert(w == 128);
91
92
42.1k
      do {
93
42.1k
        xy_x_2tap_32_avx2(src_ptr + 0 * 32, &coeffs_256, im + 0 * 32);
94
42.1k
        xy_x_2tap_32_avx2(src_ptr + 1 * 32, &coeffs_256, im + 1 * 32);
95
42.1k
        xy_x_2tap_32_avx2(src_ptr + 2 * 32, &coeffs_256, im + 2 * 32);
96
42.1k
        xy_x_2tap_32_avx2(src_ptr + 3 * 32, &coeffs_256, im + 3 * 32);
97
42.1k
        src_ptr += src_stride;
98
42.1k
        im += 128;
99
42.1k
      } while (--y);
100
418
    }
101
14.7k
  }
102
74.4k
}
103
104
static void convolve_2d_sr_hor_4tap_ssse3(
105
    const uint8_t *const src, const int32_t src_stride, const int32_t w,
106
    const int32_t h, const InterpFilterParams *const filter_params_x,
107
625k
    const int32_t subpel_x_q4, int16_t *const im_block) {
108
625k
  const uint8_t *src_ptr = src - 1;
109
625k
  int32_t y = h;
110
625k
  int16_t *im = im_block;
111
112
625k
  if (w <= 4) {
113
583k
    __m128i coeffs_128[2];
114
115
583k
    prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
116
583k
    if (w == 2) {
117
597k
      do {
118
597k
        const __m128i r =
119
597k
            x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
120
597k
        xy_x_round_store_2x2_sse2(r, im);
121
597k
        src_ptr += 2 * src_stride;
122
597k
        im += 2 * 2;
123
597k
        y -= 2;
124
597k
      } while (y);
125
466k
    } else if (w == 4) {
126
2.58M
      do {
127
2.58M
        const __m128i r =
128
2.58M
            x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
129
2.58M
        xy_x_round_store_4x2_sse2(r, im);
130
2.58M
        src_ptr += 2 * src_stride;
131
2.58M
        im += 2 * 4;
132
2.58M
        y -= 2;
133
2.58M
      } while (y);
134
466k
    }
135
583k
  } else {
136
    // TODO(chiyotsai@google.com): Add better optimization
137
42.6k
    __m256i coeffs_256[2], filt_256[2];
138
139
42.6k
    prepare_half_coeffs_4tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
140
42.6k
    filt_256[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
141
42.6k
    filt_256[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
142
143
42.6k
    if (w == 8) {
144
138k
      do {
145
138k
        __m256i res =
146
138k
            x_convolve_4tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
147
138k
        xy_x_round_store_8x2_avx2(res, im);
148
149
138k
        src_ptr += 2 * src_stride;
150
138k
        im += 2 * 8;
151
138k
        y -= 2;
152
138k
      } while (y);
153
23.9k
    } else if (w == 16) {
154
93.1k
      do {
155
93.1k
        __m256i r[2];
156
157
93.1k
        x_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
158
93.1k
        xy_x_round_store_32_avx2(r, im);
159
93.1k
        src_ptr += 2 * src_stride;
160
93.1k
        im += 2 * 16;
161
93.1k
        y -= 2;
162
93.1k
      } while (y);
163
12.7k
    } else if (w == 32) {
164
96.9k
      do {
165
96.9k
        xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
166
167
96.9k
        src_ptr += src_stride;
168
96.9k
        im += 32;
169
96.9k
      } while (--y);
170
3.69k
    } else if (w == 64) {
171
96.2k
      do {
172
96.2k
        xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
173
96.2k
        xy_x_4tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
174
96.2k
        src_ptr += src_stride;
175
96.2k
        im += 64;
176
96.2k
      } while (--y);
177
1.78k
    } else {
178
390
      assert(w == 128);
179
180
46.8k
      do {
181
46.8k
        xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
182
46.8k
        xy_x_4tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
183
46.8k
        xy_x_4tap_32_avx2(src_ptr + 64, coeffs_256, filt_256, im + 64);
184
46.8k
        xy_x_4tap_32_avx2(src_ptr + 96, coeffs_256, filt_256, im + 96);
185
46.8k
        src_ptr += src_stride;
186
46.8k
        im += 128;
187
46.8k
      } while (--y);
188
400
    }
189
42.6k
  }
190
625k
}
191
192
static void convolve_2d_sr_hor_6tap_avx2(
193
    const uint8_t *const src, const int32_t src_stride, const int32_t w,
194
    const int32_t h, const InterpFilterParams *const filter_params_x,
195
657k
    const int32_t subpel_x_q4, int16_t *const im_block) {
196
657k
  const uint8_t *src_ptr = src - 2;
197
657k
  int32_t y = h;
198
657k
  int16_t *im = im_block;
199
200
657k
  if (w <= 4) {
201
0
    __m128i coeffs_128[3];
202
203
0
    prepare_half_coeffs_6tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
204
0
    if (w == 2) {
205
0
      do {
206
0
        const __m128i r =
207
0
            x_convolve_6tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
208
0
        xy_x_round_store_2x2_sse2(r, im);
209
0
        src_ptr += 2 * src_stride;
210
0
        im += 2 * 2;
211
0
        y -= 2;
212
0
      } while (y);
213
0
    } else if (w == 4) {
214
0
      do {
215
0
        const __m128i r =
216
0
            x_convolve_6tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
217
0
        xy_x_round_store_4x2_sse2(r, im);
218
0
        src_ptr += 2 * src_stride;
219
0
        im += 2 * 4;
220
0
        y -= 2;
221
0
      } while (y);
222
0
    }
223
657k
  } else {
224
657k
    __m256i coeffs_256[3], filt_256[3];
225
226
657k
    filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
227
657k
    filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
228
657k
    filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
229
230
657k
    prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
231
232
657k
    if (w == 8) {
233
2.37M
      do {
234
2.37M
        const __m256i res =
235
2.37M
            x_convolve_6tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
236
2.37M
        xy_x_round_store_8x2_avx2(res, im);
237
238
2.37M
        src_ptr += 2 * src_stride;
239
2.37M
        im += 2 * 8;
240
2.37M
        y -= 2;
241
2.37M
      } while (y);
242
392k
    } else if (w == 16) {
243
1.58M
      do {
244
1.58M
        __m256i r[2];
245
246
1.58M
        x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
247
1.58M
        xy_x_round_store_32_avx2(r, im);
248
1.58M
        src_ptr += 2 * src_stride;
249
1.58M
        im += 2 * 16;
250
1.58M
        y -= 2;
251
1.58M
      } while (y);
252
204k
    } else if (w == 32) {
253
1.27M
      do {
254
1.27M
        xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
255
1.27M
        src_ptr += src_stride;
256
1.27M
        im += 32;
257
1.27M
      } while (--y);
258
48.7k
    } else if (w == 64) {
259
561k
      do {
260
561k
        xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
261
561k
        xy_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
262
561k
        src_ptr += src_stride;
263
561k
        im += 64;
264
561k
      } while (--y);
265
9.86k
    } else {
266
1.57k
      assert(w == 128);
267
268
193k
      do {
269
193k
        xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
270
193k
        xy_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
271
193k
        xy_x_6tap_32_avx2(src_ptr + 64, coeffs_256, filt_256, im + 64);
272
193k
        xy_x_6tap_32_avx2(src_ptr + 96, coeffs_256, filt_256, im + 96);
273
193k
        src_ptr += src_stride;
274
193k
        im += 128;
275
193k
      } while (--y);
276
1.63k
    }
277
657k
  }
278
657k
}
279
280
static void convolve_2d_sr_hor_8tap_avx2(
281
    const uint8_t *const src, const int32_t src_stride, const int32_t w,
282
    const int32_t h, const InterpFilterParams *const filter_params_x,
283
56.4k
    const int32_t subpel_x_q4, int16_t *const im_block) {
284
56.4k
  const uint8_t *src_ptr = src - 3;
285
56.4k
  int32_t y = h;
286
56.4k
  int16_t *im = im_block;
287
56.4k
  __m256i coeffs_256[4], filt_256[4];
288
289
56.4k
  filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
290
56.4k
  filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
291
56.4k
  filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
292
56.4k
  filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);
293
294
56.4k
  prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
295
296
56.4k
  if (w == 8) {
297
135k
    do {
298
135k
      const __m256i res =
299
135k
          x_convolve_8tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
300
135k
      xy_x_round_store_8x2_avx2(res, im);
301
135k
      src_ptr += 2 * src_stride;
302
135k
      im += 2 * 8;
303
135k
      y -= 2;
304
135k
    } while (y);
305
35.4k
  } else if (w == 16) {
306
88.8k
    do {
307
88.8k
      __m256i r[2];
308
309
88.8k
      x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
310
88.8k
      xy_x_round_store_32_avx2(r, im);
311
88.8k
      src_ptr += 2 * src_stride;
312
88.8k
      im += 2 * 16;
313
88.8k
      y -= 2;
314
88.8k
    } while (y);
315
25.2k
  } else if (w == 32) {
316
430k
    do {
317
430k
      xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
318
430k
      src_ptr += src_stride;
319
430k
      im += 32;
320
430k
    } while (--y);
321
16.7k
  } else if (w == 64) {
322
359k
    do {
323
359k
      xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
324
359k
      xy_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
325
359k
      src_ptr += src_stride;
326
359k
      im += 64;
327
359k
    } while (--y);
328
8.19k
  } else {
329
278
    assert(w == 128);
330
331
33.7k
    do {
332
33.7k
      xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
333
33.7k
      xy_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
334
33.7k
      xy_x_8tap_32_avx2(src_ptr + 64, coeffs_256, filt_256, im + 64);
335
33.7k
      xy_x_8tap_32_avx2(src_ptr + 96, coeffs_256, filt_256, im + 96);
336
33.7k
      src_ptr += src_stride;
337
33.7k
      im += 128;
338
33.7k
    } while (--y);
339
277
  }
340
56.4k
}
341
342
static void convolve_2d_sr_ver_2tap_avx2(
343
    const int16_t *const im_block, const int32_t w, const int32_t h,
344
    const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
345
51.1k
    uint8_t *dst, const int32_t dst_stride) {
346
51.1k
  const int16_t *im = im_block;
347
51.1k
  int32_t y = h;
348
349
51.1k
  if (w <= 4) {
350
26.8k
    __m128i coeffs_128;
351
352
26.8k
    prepare_coeffs_2tap_sse2(filter_params_y, subpel_y_q4, &coeffs_128);
353
354
26.8k
    if (w == 2) {
355
6.76k
      __m128i s_32[2];
356
357
6.76k
      s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);
358
359
13.2k
      do {
360
13.2k
        const __m128i res = xy_y_convolve_2tap_2x2_sse2(im, s_32, &coeffs_128);
361
13.2k
        xy_y_round_store_2x2_sse2(res, dst, dst_stride);
362
13.2k
        im += 2 * 2;
363
13.2k
        dst += 2 * dst_stride;
364
13.2k
        y -= 2;
365
13.2k
      } while (y);
366
20.0k
    } else {
367
20.0k
      __m128i s_64[2], r[2];
368
369
20.0k
      assert(w == 4);
370
371
20.0k
      s_64[0] = _mm_loadl_epi64((__m128i *)im);
372
373
57.1k
      do {
374
57.1k
        xy_y_convolve_2tap_4x2_sse2(im, s_64, &coeffs_128, r);
375
57.1k
        r[0] = xy_y_round_sse2(r[0]);
376
57.1k
        r[1] = xy_y_round_sse2(r[1]);
377
57.1k
        const __m128i rr = _mm_packs_epi32(r[0], r[1]);
378
57.1k
        pack_store_4x2_sse2(rr, dst, dst_stride);
379
57.1k
        im += 2 * 4;
380
57.1k
        dst += 2 * dst_stride;
381
57.1k
        y -= 2;
382
57.1k
      } while (y);
383
20.0k
    }
384
26.8k
  } else {
385
24.2k
    __m256i coeffs_256;
386
387
24.2k
    prepare_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, &coeffs_256);
388
389
24.2k
    if (w == 8) {
390
14.9k
      __m128i s_128[2];
391
14.9k
      __m256i r[2];
392
393
14.9k
      s_128[0] = _mm_loadu_si128((__m128i *)im);
394
395
49.2k
      do {
396
49.2k
        xy_y_convolve_2tap_8x2_avx2(im, s_128, &coeffs_256, r);
397
49.2k
        xy_y_round_store_8x2_avx2(r, dst, dst_stride);
398
49.2k
        im += 2 * 8;
399
49.2k
        dst += 2 * dst_stride;
400
49.2k
        y -= 2;
401
49.2k
      } while (y);
402
14.9k
    } else if (w == 16) {
403
6.22k
      __m256i s_256[2], r[4];
404
405
6.22k
      s_256[0] = _mm256_loadu_si256((__m256i *)im);
406
407
31.2k
      do {
408
31.2k
        xy_y_convolve_2tap_16x2_avx2(im, s_256, &coeffs_256, r);
409
31.2k
        xy_y_round_store_16x2_avx2(r, dst, dst_stride);
410
31.2k
        im += 2 * 16;
411
31.2k
        dst += 2 * dst_stride;
412
31.2k
        y -= 2;
413
31.2k
      } while (y);
414
6.22k
    } else if (w == 32) {
415
1.80k
      __m256i s_256[2][2];
416
417
1.80k
      s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
418
1.80k
      s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
419
420
20.0k
      do {
421
20.0k
        xy_y_convolve_2tap_32_all_avx2(im + 32, s_256[0], s_256[1], &coeffs_256,
422
20.0k
                                       dst);
423
20.0k
        im += 2 * 32;
424
20.0k
        xy_y_convolve_2tap_32_all_avx2(im, s_256[1], s_256[0], &coeffs_256,
425
20.0k
                                       dst + dst_stride);
426
20.0k
        dst += 2 * dst_stride;
427
20.0k
        y -= 2;
428
20.0k
      } while (y);
429
1.80k
    } else if (w == 64) {
430
1.13k
      __m256i s_256[2][4];
431
432
1.13k
      s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
433
1.13k
      s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
434
1.13k
      s_256[0][2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
435
1.13k
      s_256[0][3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));
436
437
26.7k
      do {
438
26.7k
        xy_y_convolve_2tap_32_all_avx2(im + 64, s_256[0] + 0, s_256[1] + 0,
439
26.7k
                                       &coeffs_256, dst);
440
26.7k
        xy_y_convolve_2tap_32_all_avx2(im + 96, s_256[0] + 2, s_256[1] + 2,
441
26.7k
                                       &coeffs_256, dst + 32);
442
26.7k
        im += 2 * 64;
443
26.7k
        xy_y_convolve_2tap_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
444
26.7k
                                       &coeffs_256, dst + dst_stride);
445
26.7k
        xy_y_convolve_2tap_32_all_avx2(im + 32, s_256[1] + 2, s_256[0] + 2,
446
26.7k
                                       &coeffs_256, dst + dst_stride + 32);
447
26.7k
        dst += 2 * dst_stride;
448
26.7k
        y -= 2;
449
26.7k
      } while (y);
450
1.13k
    } else {
451
192
      __m256i s_256[2][8];
452
453
192
      assert(w == 128);
454
455
192
      load_16bit_8rows_avx2(im, 16, s_256[0]);
456
457
9.47k
      do {
458
9.47k
        xy_y_convolve_2tap_32_all_avx2(im + 128, s_256[0] + 0, s_256[1] + 0,
459
9.47k
                                       &coeffs_256, dst);
460
9.47k
        xy_y_convolve_2tap_32_all_avx2(im + 160, s_256[0] + 2, s_256[1] + 2,
461
9.47k
                                       &coeffs_256, dst + 1 * 32);
462
9.47k
        xy_y_convolve_2tap_32_all_avx2(im + 192, s_256[0] + 4, s_256[1] + 4,
463
9.47k
                                       &coeffs_256, dst + 2 * 32);
464
9.47k
        xy_y_convolve_2tap_32_all_avx2(im + 224, s_256[0] + 6, s_256[1] + 6,
465
9.47k
                                       &coeffs_256, dst + 3 * 32);
466
9.47k
        im += 2 * 128;
467
9.47k
        xy_y_convolve_2tap_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
468
9.47k
                                       &coeffs_256, dst + dst_stride);
469
9.47k
        xy_y_convolve_2tap_32_all_avx2(im + 32, s_256[1] + 2, s_256[0] + 2,
470
9.47k
                                       &coeffs_256, dst + dst_stride + 1 * 32);
471
9.47k
        xy_y_convolve_2tap_32_all_avx2(im + 64, s_256[1] + 4, s_256[0] + 4,
472
9.47k
                                       &coeffs_256, dst + dst_stride + 2 * 32);
473
9.47k
        xy_y_convolve_2tap_32_all_avx2(im + 96, s_256[1] + 6, s_256[0] + 6,
474
9.47k
                                       &coeffs_256, dst + dst_stride + 3 * 32);
475
9.47k
        dst += 2 * dst_stride;
476
9.47k
        y -= 2;
477
9.47k
      } while (y);
478
192
    }
479
24.2k
  }
480
51.1k
}
481
482
static void convolve_2d_sr_ver_2tap_half_avx2(
483
    const int16_t *const im_block, const int32_t w, const int32_t h,
484
    const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
485
23.3k
    uint8_t *dst, const int32_t dst_stride) {
486
23.3k
  const int16_t *im = im_block;
487
23.3k
  int32_t y = h;
488
489
23.3k
  (void)filter_params_y;
490
23.3k
  (void)subpel_y_q4;
491
492
23.3k
  if (w == 2) {
493
2.33k
    __m128i s_32[2];
494
495
2.33k
    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);
496
497
5.72k
    do {
498
5.72k
      const __m128i res = xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
499
5.72k
      const __m128i r = xy_y_round_half_pel_sse2(res);
500
5.72k
      pack_store_2x2_sse2(r, dst, dst_stride);
501
5.72k
      im += 2 * 2;
502
5.72k
      dst += 2 * dst_stride;
503
5.72k
      y -= 2;
504
5.72k
    } while (y);
505
20.9k
  } else if (w == 4) {
506
8.16k
    __m128i s_64[2];
507
508
8.16k
    s_64[0] = _mm_loadl_epi64((__m128i *)im);
509
510
28.5k
    do {
511
28.5k
      const __m128i res = xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
512
28.5k
      const __m128i r = xy_y_round_half_pel_sse2(res);
513
28.5k
      pack_store_4x2_sse2(r, dst, dst_stride);
514
28.5k
      im += 2 * 4;
515
28.5k
      dst += 2 * dst_stride;
516
28.5k
      y -= 2;
517
28.5k
    } while (y);
518
12.8k
  } else if (w == 8) {
519
7.37k
    __m128i s_128[2];
520
521
7.37k
    s_128[0] = _mm_loadu_si128((__m128i *)im);
522
523
31.4k
    do {
524
31.4k
      const __m256i res = xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
525
31.4k
      const __m256i r = xy_y_round_half_pel_avx2(res);
526
31.4k
      pack_store_8x2_avx2(r, dst, dst_stride);
527
31.4k
      im += 2 * 8;
528
31.4k
      dst += 2 * dst_stride;
529
31.4k
      y -= 2;
530
31.4k
    } while (y);
531
7.37k
  } else if (w == 16) {
532
3.35k
    __m256i s_256[2], r[2];
533
534
3.35k
    s_256[0] = _mm256_loadu_si256((__m256i *)im);
535
536
18.5k
    do {
537
18.5k
      xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
538
18.5k
      r[0] = xy_y_round_half_pel_avx2(r[0]);
539
18.5k
      r[1] = xy_y_round_half_pel_avx2(r[1]);
540
18.5k
      xy_y_pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
541
18.5k
      im += 2 * 16;
542
18.5k
      dst += 2 * dst_stride;
543
18.5k
      y -= 2;
544
18.5k
    } while (y);
545
3.35k
  } else if (w == 32) {
546
1.28k
    __m256i s_256[2][2];
547
548
1.28k
    s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
549
1.28k
    s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
550
551
14.6k
    do {
552
14.6k
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 32, s_256[0], s_256[1], dst);
553
14.6k
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 2 * 32, s_256[1], s_256[0],
554
14.6k
                                              dst + dst_stride);
555
14.6k
      im += 2 * 32;
556
14.6k
      dst += 2 * dst_stride;
557
14.6k
      y -= 2;
558
14.6k
    } while (y);
559
1.28k
  } else if (w == 64) {
560
574
    __m256i s_256[2][4];
561
562
574
    s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
563
574
    s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
564
574
    s_256[0][2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
565
574
    s_256[0][3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));
566
567
15.1k
    do {
568
15.1k
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 64, s_256[0] + 0,
569
15.1k
                                              s_256[1] + 0, dst);
570
15.1k
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 96, s_256[0] + 2,
571
15.1k
                                              s_256[1] + 2, dst + 32);
572
15.1k
      im += 2 * 64;
573
15.1k
      xy_y_convolve_2tap_half_pel_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
574
15.1k
                                              dst + dst_stride);
575
15.1k
      xy_y_convolve_2tap_half_pel_32_all_avx2(
576
15.1k
          im + 32, s_256[1] + 2, s_256[0] + 2, dst + dst_stride + 32);
577
15.1k
      dst += 2 * dst_stride;
578
15.1k
      y -= 2;
579
15.1k
    } while (y);
580
574
  } else {
581
226
    __m256i s_256[2][8];
582
583
226
    assert(w == 128);
584
585
226
    load_16bit_8rows_avx2(im, 16, s_256[0]);
586
587
11.1k
    do {
588
11.1k
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 128, s_256[0] + 0,
589
11.1k
                                              s_256[1] + 0, dst);
590
11.1k
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 160, s_256[0] + 2,
591
11.1k
                                              s_256[1] + 2, dst + 1 * 32);
592
11.1k
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 192, s_256[0] + 4,
593
11.1k
                                              s_256[1] + 4, dst + 2 * 32);
594
11.1k
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 224, s_256[0] + 6,
595
11.1k
                                              s_256[1] + 6, dst + 3 * 32);
596
11.1k
      im += 2 * 128;
597
11.1k
      xy_y_convolve_2tap_half_pel_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
598
11.1k
                                              dst + dst_stride);
599
11.1k
      xy_y_convolve_2tap_half_pel_32_all_avx2(
600
11.1k
          im + 32, s_256[1] + 2, s_256[0] + 2, dst + dst_stride + 1 * 32);
601
11.1k
      xy_y_convolve_2tap_half_pel_32_all_avx2(
602
11.1k
          im + 64, s_256[1] + 4, s_256[0] + 4, dst + dst_stride + 2 * 32);
603
11.1k
      xy_y_convolve_2tap_half_pel_32_all_avx2(
604
11.1k
          im + 96, s_256[1] + 6, s_256[0] + 6, dst + dst_stride + 3 * 32);
605
11.1k
      dst += 2 * dst_stride;
606
11.1k
      y -= 2;
607
11.1k
    } while (y);
608
226
  }
609
23.3k
}
610
611
static void convolve_2d_sr_ver_4tap_avx2(
612
    const int16_t *const im_block, const int32_t w, const int32_t h,
613
    const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
614
709k
    uint8_t *dst, const int32_t dst_stride) {
615
709k
  const int16_t *im = im_block;
616
709k
  int32_t y = h;
617
618
709k
  if (w == 2) {
619
67.4k
    __m128i coeffs_128[2], s_32[4], ss_128[2];
620
621
67.4k
    prepare_coeffs_4tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
622
623
67.4k
    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
624
67.4k
    s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
625
67.4k
    s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
626
627
67.4k
    const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
628
67.4k
    const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
629
630
67.4k
    ss_128[0] = _mm_unpacklo_epi16(src01, src12);
631
632
115k
    do {
633
115k
      const __m128i res =
634
115k
          xy_y_convolve_4tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
635
115k
      xy_y_round_store_2x2_sse2(res, dst, dst_stride);
636
115k
      im += 2 * 2;
637
115k
      dst += 2 * dst_stride;
638
115k
      y -= 2;
639
115k
    } while (y);
640
642k
  } else {
641
642k
    __m256i coeffs_256[2];
642
643
642k
    prepare_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
644
645
642k
    if (w == 4) {
646
302k
      __m128i s_64[4];
647
302k
      __m256i s_256[2], ss_256[2];
648
649
302k
      s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
650
302k
      s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
651
302k
      s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
652
653
      // Load lines a and b. Line a to lower 128, line b to upper 128
654
302k
      s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
655
302k
      s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
656
657
302k
      ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
658
659
597k
      do {
660
597k
        const __m256i res =
661
597k
            xy_y_convolve_4tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
662
597k
        xy_y_round_store_4x2_avx2(res, dst, dst_stride);
663
597k
        im += 2 * 4;
664
597k
        dst += 2 * dst_stride;
665
597k
        y -= 2;
666
597k
      } while (y);
667
340k
    } else if (w == 8) {
668
237k
      __m256i s_256[4], r[2];
669
670
237k
      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
671
237k
      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
672
673
237k
      if (subpel_y_q4 != 8) {
674
198k
        __m256i ss_256[4];
675
676
198k
        ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
677
198k
        ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
678
679
381k
        do {
680
381k
          xy_y_convolve_4tap_8x2_avx2(im, ss_256, coeffs_256, r);
681
381k
          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
682
381k
          im += 2 * 8;
683
381k
          dst += 2 * dst_stride;
684
381k
          y -= 2;
685
381k
        } while (y);
686
198k
      } else {
687
69.5k
        do {
688
69.5k
          xy_y_convolve_4tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
689
69.5k
          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
690
69.5k
          im += 2 * 8;
691
69.5k
          dst += 2 * dst_stride;
692
69.5k
          y -= 2;
693
69.5k
        } while (y);
694
39.1k
      }
695
237k
    } else if (w == 16) {
696
94.4k
      __m256i s_256[5];
697
698
94.4k
      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
699
94.4k
      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
700
94.4k
      s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
701
702
94.4k
      if (subpel_y_q4 != 8) {
703
75.0k
        __m256i ss_256[4], tt_256[4], r[4];
704
705
75.0k
        ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
706
75.0k
        ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
707
708
75.0k
        tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
709
75.0k
        tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
710
711
188k
        do {
712
188k
          xy_y_convolve_4tap_16x2_avx2(im, s_256, ss_256, tt_256, coeffs_256,
713
188k
                                       r);
714
188k
          xy_y_round_store_16x2_avx2(r, dst, dst_stride);
715
188k
          im += 2 * 16;
716
188k
          dst += 2 * dst_stride;
717
188k
          y -= 2;
718
188k
        } while (y);
719
75.0k
      } else {
720
19.3k
        __m256i r[4];
721
722
38.7k
        do {
723
38.7k
          xy_y_convolve_4tap_16x2_half_pelavx2(im, s_256, coeffs_256, r);
724
38.7k
          xy_y_round_store_16x2_avx2(r, dst, dst_stride);
725
38.7k
          im += 2 * 16;
726
38.7k
          dst += 2 * dst_stride;
727
38.7k
          y -= 2;
728
38.7k
        } while (y);
729
19.3k
      }
730
94.4k
    } else {
731
      /*It's a special condition for OBMC. A/c  to Av1 spec 4-tap won't
732
      support for width(w)>16, but for OBMC while predicting above block
733
      it reduces size block to Wx(h/2), for example, if above block size
734
      is 32x8, we get block size as 32x4 for OBMC.*/
735
7.88k
      int32_t x = 0;
736
737
7.88k
      assert(!(w % 32));
738
739
7.91k
      __m256i s_256[2][4], ss_256[2][4], tt_256[2][4], r0[4], r1[4];
740
10.7k
      do {
741
10.7k
        const int16_t *s = im + x;
742
10.7k
        uint8_t *d = dst + x;
743
744
10.7k
        loadu_unpack_16bit_3rows_avx2(s, w, s_256[0], ss_256[0], tt_256[0]);
745
10.7k
        loadu_unpack_16bit_3rows_avx2(s + 16, w, s_256[1], ss_256[1],
746
10.7k
                                      tt_256[1]);
747
748
10.7k
        y = h;
749
203k
        do {
750
203k
          xy_y_convolve_4tap_32x2_avx2(s, w, s_256[0], ss_256[0], tt_256[0],
751
203k
                                       coeffs_256, r0);
752
203k
          xy_y_convolve_4tap_32x2_avx2(s + 16, w, s_256[1], ss_256[1],
753
203k
                                       tt_256[1], coeffs_256, r1);
754
755
203k
          xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
756
203k
          xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);
757
758
203k
          s += 2 * w;
759
203k
          d += 2 * dst_stride;
760
203k
          y -= 2;
761
203k
        } while (y);
762
763
10.7k
        x += 32;
764
10.7k
      } while (x < w);
765
7.91k
    }
766
642k
  }
767
709k
}
768
769
// Vertical (second) pass of the 2-D sub-pel convolution for a 6-tap y
// filter.  Consumes the 16-bit intermediate rows in im_block (produced by
// the horizontal pass) and writes rounded, clamped 8-bit pixels to dst.
// Every width branch emits two output rows per iteration, so the caller
// must pass an even h.
static void convolve_2d_sr_ver_6tap_avx2(
    const int16_t *const im_block, const int32_t w, const int32_t h,
    const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
    uint8_t *dst, const int32_t dst_stride) {
  const int16_t *im = im_block;
  int32_t y;

  if (w == 2) {
    // 2-wide: each intermediate row is 2 x int16 = 32 bits, so SSE
    // registers suffice.
    __m128i coeffs_128[3], s_32[6], ss_128[3];

    prepare_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);

    // Prime the filter pipeline with the first 5 intermediate rows; the
    // per-iteration helper slides in the new rows as it advances.
    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
    s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
    s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
    s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
    s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));

    // Interleave adjacent rows so each 32-bit lane pairs the two samples
    // one madd step of the filter needs.
    const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
    const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
    const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
    const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);

    ss_128[0] = _mm_unpacklo_epi16(src01, src12);
    ss_128[1] = _mm_unpacklo_epi16(src23, src34);

    y = h;
    do {
      const __m128i res =
          xy_y_convolve_6tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
      xy_y_round_store_2x2_sse2(res, dst, dst_stride);
      im += 2 * 2;
      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
  } else {
    __m256i coeffs_256[3];

    prepare_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);

    if (w == 4) {
      __m128i s_64[6];
      __m256i s_256[6], ss_256[3];

      // Prime with rows 0..4 (4 x int16 = 64 bits per row).
      s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
      s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
      s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
      s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
      s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));

      // Load lines a and b. Line a to lower 128, line b to upper 128
      s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
      s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
      s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
      s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);

      ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
      ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);

      y = h;
      do {
        const __m256i res =
            xy_y_convolve_6tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
        xy_y_round_store_4x2_avx2(res, dst, dst_stride);
        im += 2 * 4;
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    } else if (w == 8) {
      __m256i s_256[6], r[2];

      // One full 8-pixel row pair per 256-bit register.
      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
      s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
      s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
      y = h;

      // subpel_y_q4 == 8 is the symmetric half-pel filter, which has a
      // cheaper dedicated kernel; otherwise take the generic path.
      if (subpel_y_q4 != 8) {
        __m256i ss_256[6];

        ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
        ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);

        ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
        ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);

        // NOTE(review): ss_256[2]/[5] appear to be filled inside the
        // convolve helper as the window slides — confirm in convolve_avx2.h.
        do {
          xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
          im += 2 * 8;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else {
        do {
          xy_y_convolve_6tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
          im += 2 * 8;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      }
    } else if (w == 16) {
      __m256i s_256[6];

      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
      s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
      s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));
      s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 16));
      y = h;

      if (subpel_y_q4 != 8) {
        // ss_256 holds the even-row interleavings, tt_256 the odd-row
        // ones, so the helper can produce two output rows per call.
        __m256i ss_256[6], tt_256[6], r[4];

        ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
        ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
        ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
        ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);

        tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
        tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
        tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
        tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);

        do {
          xy_y_convolve_6tap_16x2_avx2(im, 16, s_256, ss_256, tt_256,
                                       coeffs_256, r);
          xy_y_round_store_16x2_avx2(r, dst, dst_stride);
          im += 2 * 16;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else {
        __m256i ss_256[4], r[4];

        do {
          xy_y_convolve_6tap_16x2_half_pel_avx2(im, 16, s_256, ss_256,
                                                coeffs_256, r);
          xy_y_round_store_16x2_avx2(r, dst, dst_stride);

          im += 2 * 16;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      }
    } else {
      // Widths 32 and up: process 32 columns per outer iteration as two
      // interleaved 16-column lanes (see the im_block layout note in
      // av1_convolve_2d_sr_specialized_avx2).
      int32_t x = 0;

      assert(!(w % 32));

      __m256i s_256[2][6], ss_256[2][6], tt_256[2][6], r0[4], r1[4];

      do {
        const int16_t *s = im + x;
        uint8_t *d = dst + x;

        loadu_unpack_16bit_5rows_avx2(s, w, s_256[0], ss_256[0], tt_256[0]);
        loadu_unpack_16bit_5rows_avx2(s + 16, w, s_256[1], ss_256[1],
                                      tt_256[1]);

        y = h;
        do {
          xy_y_convolve_6tap_16x2_avx2(s, w, s_256[0], ss_256[0], tt_256[0],
                                       coeffs_256, r0);
          xy_y_convolve_6tap_16x2_avx2(s + 16, w, s_256[1], ss_256[1],
                                       tt_256[1], coeffs_256, r1);

          xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
          xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);

          s += 2 * w;
          d += 2 * dst_stride;
          y -= 2;
        } while (y);

        x += 32;
      } while (x < w);
    }
  }
}
950
951
// Vertical (second) pass of the 2-D sub-pel convolution for an 8-tap y
// filter.  Same structure as the 6-tap version: reads 16-bit intermediate
// rows from im_block, writes rounded 8-bit pixels, two output rows per
// loop iteration (h must be even).  The pipeline is primed with 7 rows
// instead of 5 because of the wider filter.
static void convolve_2d_sr_ver_8tap_avx2(
    const int16_t *const im_block, const int32_t w, const int32_t h,
    const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
    uint8_t *dst, const int32_t dst_stride) {
  const int16_t *im = im_block;
  int32_t y;

  if (w == 2) {
    // 2-wide: one row is 2 x int16 = 32 bits; SSE registers suffice.
    __m128i coeffs_128[4], s_32[8], ss_128[4];

    prepare_coeffs_8tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);

    // Prime the pipeline with rows 0..6.
    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
    s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
    s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
    s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
    s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
    s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(im + 5 * 2));
    s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(im + 6 * 2));

    // Pair adjacent rows for the madd-based filter evaluation.
    const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
    const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
    const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
    const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
    const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
    const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);

    ss_128[0] = _mm_unpacklo_epi16(src01, src12);
    ss_128[1] = _mm_unpacklo_epi16(src23, src34);
    ss_128[2] = _mm_unpacklo_epi16(src45, src56);

    y = h;
    do {
      const __m128i res =
          xy_y_convolve_8tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
      xy_y_round_store_2x2_sse2(res, dst, dst_stride);
      im += 2 * 2;
      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
  } else {
    __m256i coeffs_256[4];

    prepare_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);

    if (w == 4) {
      __m128i s_64[8];
      __m256i s_256[8], ss_256[4];

      // Prime with rows 0..6 (4 x int16 = 64 bits per row).
      s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
      s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
      s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
      s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
      s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
      s_64[5] = _mm_loadl_epi64((__m128i *)(im + 5 * 4));
      s_64[6] = _mm_loadl_epi64((__m128i *)(im + 6 * 4));

      // Load lines a and b. Line a to lower 128, line b to upper 128
      s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
      s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
      s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
      s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
      s_256[4] = _mm256_setr_m128i(s_64[4], s_64[5]);
      s_256[5] = _mm256_setr_m128i(s_64[5], s_64[6]);

      ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
      ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
      ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);

      y = h;
      do {
        const __m256i res =
            xy_y_convolve_8tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
        xy_y_round_store_4x2_avx2(res, dst, dst_stride);
        im += 2 * 4;
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    } else if (w == 8) {
      __m256i s_256[8], r[2];

      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
      s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
      s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
      s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 8));
      s_256[5] = _mm256_loadu_si256((__m256i *)(im + 5 * 8));
      y = h;

      // subpel_y_q4 == 8 is the symmetric half-pel filter with a cheaper
      // dedicated kernel.
      if (subpel_y_q4 != 8) {
        __m256i ss_256[8];

        convolve_8tap_unpack_avx2(s_256, ss_256);

        do {
          xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
          im += 2 * 8;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else {
        do {
          xy_y_convolve_8tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
          im += 2 * 8;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      }
    } else if (w == 16) {
      __m256i s_256[8], r[4];

      load_16bit_7rows_avx2(im, 16, s_256);
      y = h;

      if (subpel_y_q4 != 8) {
        // ss_256 interleaves the even row pairs, tt_256 the odd ones, so
        // the helper emits two output rows per call.
        __m256i ss_256[8], tt_256[8];

        convolve_8tap_unpack_avx2(s_256, ss_256);
        convolve_8tap_unpack_avx2(s_256 + 1, tt_256);

        do {
          xy_y_convolve_8tap_16x2_avx2(im, 16, coeffs_256, s_256, ss_256,
                                       tt_256, r);
          xy_y_round_store_16x2_avx2(r, dst, dst_stride);

          im += 2 * 16;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else {
        do {
          xy_y_convolve_8tap_16x2_half_pel_avx2(im, 16, coeffs_256, s_256, r);
          xy_y_round_store_16x2_avx2(r, dst, dst_stride);

          im += 2 * 16;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      }
    } else {
      // Widths 32 and up: two interleaved 16-column lanes per 32-column
      // strip (see the im_block layout note in the dispatcher).
      int32_t x = 0;
      __m256i s_256[2][8], r0[4], r1[4];

      assert(!(w % 32));

      __m256i ss_256[2][8], tt_256[2][8];

      do {
        const int16_t *s = im + x;
        uint8_t *d = dst + x;

        load_16bit_7rows_avx2(s, w, s_256[0]);
        convolve_8tap_unpack_avx2(s_256[0], ss_256[0]);
        convolve_8tap_unpack_avx2(s_256[0] + 1, tt_256[0]);

        load_16bit_7rows_avx2(s + 16, w, s_256[1]);
        convolve_8tap_unpack_avx2(s_256[1], ss_256[1]);
        convolve_8tap_unpack_avx2(s_256[1] + 1, tt_256[1]);

        y = h;
        do {
          xy_y_convolve_8tap_16x2_avx2(s, w, coeffs_256, s_256[0], ss_256[0],
                                       tt_256[0], r0);
          xy_y_convolve_8tap_16x2_avx2(s + 16, w, coeffs_256, s_256[1],
                                       ss_256[1], tt_256[1], r1);
          xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
          xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);

          s += 2 * w;
          d += 2 * dst_stride;
          y -= 2;
        } while (y);

        x += 32;
      } while (x < w);
    }
  }
}
1131
1132
// Horizontal (first) pass kernel: filters w x h source pixels with the
// x filter and writes 16-bit intermediate rows to im_block.
typedef void (*Convolve2dSrHorTapFunc)(
    const uint8_t *const src, const int32_t src_stride, const int32_t w,
    const int32_t h, const InterpFilterParams *const filter_params_x,
    const int32_t subpel_x_q4, int16_t *const im_block);
1136
1137
// Vertical (second) pass kernel: filters the 16-bit intermediate block
// with the y filter and stores rounded 8-bit pixels to dst.
typedef void (*Convolve2dSrVerTapFunc)(
    const int16_t *const im_block, const int32_t w, const int32_t h,
    const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
    uint8_t *dst, const int32_t dst_stride);
1141
1142
// Two-pass 2-D sub-pel convolution dispatcher: runs the horizontal pass
// into a 16-bit intermediate buffer, then the vertical pass into dst,
// picking the specialized kernel for each filter's tap count.
static AOM_FORCE_INLINE void av1_convolve_2d_sr_specialized_avx2(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
    const int32_t subpel_y_q4, ConvolveParams *conv_params) {
  // Horizontal kernels, indexed by tap count (only 2/4/6/8 are valid).
  static const Convolve2dSrHorTapFunc hor_kernels[MAX_FILTER_TAP + 1] = {
    NULL,
    NULL,
    convolve_2d_sr_hor_2tap_avx2,
    NULL,
    convolve_2d_sr_hor_4tap_ssse3,
    NULL,
    convolve_2d_sr_hor_6tap_avx2,
    NULL,
    convolve_2d_sr_hor_8tap_avx2
  };
  // Vertical kernels: index [tap] is the generic kernel and [tap - 1] the
  // half-pel variant.  Only 2-tap has a distinct half-pel entry; the
  // 4/6/8-tap slots alias and branch on subpel_y_q4 internally.
  static const Convolve2dSrVerTapFunc ver_kernels[MAX_FILTER_TAP + 1] = {
    NULL,
    convolve_2d_sr_ver_2tap_half_avx2,
    convolve_2d_sr_ver_2tap_avx2,
    convolve_2d_sr_ver_4tap_avx2,
    convolve_2d_sr_ver_4tap_avx2,
    convolve_2d_sr_ver_6tap_avx2,
    convolve_2d_sr_ver_6tap_avx2,
    convolve_2d_sr_ver_8tap_avx2,
    convolve_2d_sr_ver_8tap_avx2
  };

  // Note: im_block is 8-pixel interlaced for width 32 and up, to avoid data
  //       permutation.
  DECLARE_ALIGNED(32, int16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);

  const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_q4);
  const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_q4);

  // 12-tap filters are not handled by this specialized path.
  assert(tap_x != 12 && tap_y != 12);

  (void)conv_params;  // only referenced by the asserts below (NDEBUG builds)
  assert(conv_params->round_0 == 3);
  assert(conv_params->round_1 == 11);

  // Back the source up so the horizontal pass also covers the rows the
  // vertical filter's top taps will read.
  const uint8_t *const top_src = src - ((tap_y >> 1) - 1) * src_stride;

  // The horizontal pass filters h + tap_y rows; the sum must be even
  // because every kernel processes row pairs.
  const int32_t im_h = h + tap_y;
  assert(!(im_h % 2));

  hor_kernels[tap_x](top_src, src_stride, w, im_h, filter_params_x,
                     subpel_x_q4, im_block);

  // subpel_y_q4 == 8 (exact half-pel) selects the cheaper variant at
  // index tap_y - 1.
  ver_kernels[tap_y - (subpel_y_q4 == 8)](im_block, w, h, filter_params_y,
                                          subpel_y_q4, dst, dst_stride);
}
1198
1199
#endif  // THIRD_PARTY_SVT_AV1_CONVOLVE_2D_AVX2_H_