Coverage Report

Created: 2026-05-16 06:27

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/av1/common/x86/convolve_2d_avx2.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <immintrin.h>
13
14
#include "config/av1_rtcd.h"
15
16
#include "aom_dsp/x86/convolve_avx2.h"
17
#include "aom_dsp/aom_filter.h"
18
#include "aom_dsp/x86/synonyms.h"
19
20
#include "av1/common/convolve.h"
21
22
// 2D separable sub-pixel convolution (8-bit) specialised for narrow blocks.
// av1_convolve_2d_sr_avx2() routes here when w <= 4 and neither filter is
// 12-tap.
//
// The work is done in two passes:
//   1. a horizontal pass (2- or 4-tap) that writes int16_t intermediates into
//      im_block,
//   2. a vertical pass (2-, 4-, 6- or 8-tap) that produces the final pixels.
// Both passes are implemented by CONVOLVE_SR_*_W4 macros that are defined
// outside this file.
//
// NOTE(review): those macros are expanded textually and presumably refer to
// the locals below (i, im_block, im_h, src_ptr, dst_ptr, filt, coeffs_h,
// coeffs_v, round_const_h, round_const_v, ...) by name. Check the macro
// definitions before renaming or re-typing any of them.
static void convolve_2d_sr_w4_avx2(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int32_t subpel_x_qn,
    const int32_t subpel_y_qn, ConvolveParams *conv_params) {
  int i;
  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 4]);
  uint8_t *dst_ptr = dst;

  // The rounding constants below are only valid for this rounding setup.
  assert(conv_params->round_0 == 3);
  assert(conv_params->round_1 == 11);

  // Effective tap counts for the selected sub-pixel positions.
  const int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
  const int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);

  assert(horiz_tap == 2 || horiz_tap == 4);
  assert(vert_tap == 2 || vert_tap == 4 || vert_tap == 6 || vert_tap == 8);

  // Rounding offsets for the horizontal (16-bit) and vertical (32-bit) stages.
  const __m128i round_const_h = _mm_set1_epi16(1 << (conv_params->round_0 - 2));
  const __m256i round_const_v =
      _mm256_set1_epi32(1 << (conv_params->round_1 - 1));

  // The intermediate block needs (vert_tap - 1) extra rows so that the
  // vertical filter has full support for every output row.
  int im_h = h + vert_tap - 1;
  const int fo_vert = vert_tap / 2 - 1;
  const int fo_horiz = horiz_tap / 2 - 1;
  // Step back to the top-left sample covered by the filter footprint.
  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

  __m128i filt[2], coeffs_h[2] = { 0 };
  __m256i coeffs_v[4] = { 0 };

  // Shuffle masks used by the horizontal filter.
  filt[0] = _mm_load_si128((__m128i const *)filt1_global_sse2);
  filt[1] = _mm_load_si128((__m128i const *)filt2_global_sse2);

  // Horizontal coefficients: anything that is not 2-tap is treated as 4-tap.
  switch (horiz_tap) {
    case 2:
      prepare_coeffs_2t_ssse3(filter_params_x, subpel_x_qn, coeffs_h);
      break;
    default:
      prepare_coeffs_4t_ssse3(filter_params_x, subpel_x_qn, coeffs_h);
      break;
  }

  // Vertical coefficients: anything that is not 2/4/6-tap is treated as 8-tap.
  switch (vert_tap) {
    case 2: prepare_coeffs_2t(filter_params_y, subpel_y_qn, coeffs_v); break;
    case 4: prepare_coeffs_4t(filter_params_y, subpel_y_qn, coeffs_v); break;
    case 6: prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v); break;
    default: prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v); break;
  }

  // Pass 1: horizontal filtering into im_block.
  switch (horiz_tap) {
    case 2: {
      CONVOLVE_SR_HOR_FILTER_2TAP_W4
      break;
    }
    default: {
      CONVOLVE_SR_HOR_FILTER_4TAP_W4
      break;
    }
  }

  // Pass 2: vertical filtering from im_block to the destination.
  switch (vert_tap) {
    case 2: {
      CONVOLVE_SR_VER_FILTER_2TAP_W4
      break;
    }
    case 4: {
      CONVOLVE_SR_VER_FILTER_4TAP_W4
      break;
    }
    case 6: {
      CONVOLVE_SR_VER_FILTER_6TAP_W4
      break;
    }
    default: {
      CONVOLVE_SR_VER_FILTER_8TAP_W4
      break;
    }
  }
}
84
85
static void convolve_2d_sr_avx2(const uint8_t *src, int src_stride,
86
                                uint8_t *dst, int dst_stride, int w, int h,
87
                                const InterpFilterParams *filter_params_x,
88
                                const InterpFilterParams *filter_params_y,
89
                                const int subpel_x_qn, const int subpel_y_qn,
90
742k
                                ConvolveParams *conv_params) {
91
742k
  if (filter_params_x->taps > 8) {
92
0
    const int bd = 8;
93
0
    int im_stride = 8, i;
94
0
    const int strip_stride = (MAX_SB_SIZE + MAX_FILTER_TAP) * 8;
95
0
    DECLARE_ALIGNED(
96
0
        32, int16_t,
97
0
        im_block_buf[(MAX_SB_SIZE / 8) * (MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
98
0
    const int bits =
99
0
        FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
100
0
    const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
101
102
0
    assert(conv_params->round_0 > 0);
103
104
0
    const __m256i round_const_h12 = _mm256_set1_epi32(
105
0
        ((1 << (conv_params->round_0)) >> 1) + (1 << (bd + FILTER_BITS - 1)));
106
0
    const __m128i round_shift_h12 = _mm_cvtsi32_si128(conv_params->round_0);
107
108
0
    const __m256i sum_round_v = _mm256_set1_epi32(
109
0
        (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
110
0
    const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
111
112
0
    const __m256i round_const_v = _mm256_set1_epi32(
113
0
        ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
114
0
        ((1 << (offset_bits - conv_params->round_1)) >> 1));
115
0
    const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
116
117
0
    __m256i coeffs_h[6] = { 0 }, coeffs_v[6] = { 0 };
118
119
0
    int horiz_tap = 12;
120
0
    int vert_tap = 12;
121
122
0
    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs_h);
123
0
    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs_v);
124
125
0
    int im_h = h + vert_tap - 1;
126
0
    const int fo_vert = vert_tap / 2 - 1;
127
0
    const int fo_horiz = horiz_tap / 2 - 1;
128
0
    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
129
130
0
    const __m256i v_zero = _mm256_setzero_si256();
131
0
    __m256i s[12];
132
0
    if (w <= 4) {
133
0
      for (i = 0; i < im_h; i += 2) {
134
0
        for (int j = 0; j < w; j += 8) {
135
0
          int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
136
0
          const __m256i data = _mm256_permute2x128_si256(
137
0
              _mm256_castsi128_si256(
138
0
                  _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
139
0
              _mm256_castsi128_si256(_mm_loadu_si128(
140
0
                  (__m128i *)(&src_ptr[i * src_stride + src_stride + j]))),
141
0
              0x20);
142
0
          const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
143
0
          const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
144
0
          const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
145
0
          const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
146
147
0
          const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
148
0
          const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
149
150
0
          s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
151
0
          s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
152
0
          s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
153
0
          s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
154
0
          s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
155
0
          s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
156
157
0
          const __m256i res_lo = convolve_12taps(s, coeffs_h);
158
159
0
          __m256i res_32b_lo = _mm256_sra_epi32(
160
0
              _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12);
161
0
          __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
162
0
          const __m128i res_0 = _mm256_extracti128_si256(res_16b_lo, 0);
163
0
          const __m128i res_1 = _mm256_extracti128_si256(res_16b_lo, 1);
164
0
          if (w > 2) {
165
0
            _mm_storel_epi64((__m128i *)&strip_im_block[i * im_stride], res_0);
166
0
            _mm_storel_epi64(
167
0
                (__m128i *)&strip_im_block[i * im_stride + im_stride], res_1);
168
0
          } else {
169
0
            uint32_t horiz_2;
170
0
            horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_0);
171
0
            strip_im_block[i * im_stride] = (uint16_t)horiz_2;
172
0
            strip_im_block[i * im_stride + 1] = (uint16_t)(horiz_2 >> 16);
173
0
            horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_1);
174
0
            strip_im_block[i * im_stride + im_stride] = (uint16_t)horiz_2;
175
0
            strip_im_block[i * im_stride + im_stride + 1] =
176
0
                (uint16_t)(horiz_2 >> 16);
177
0
          }
178
0
        }
179
0
      }
180
0
    } else {
181
0
      for (i = 0; i < im_h; i++) {
182
0
        for (int j = 0; j < w; j += 8) {
183
0
          int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
184
0
          const __m256i data = _mm256_permute2x128_si256(
185
0
              _mm256_castsi128_si256(
186
0
                  _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
187
0
              _mm256_castsi128_si256(_mm_loadu_si128(
188
0
                  (__m128i *)(&src_ptr[i * src_stride + j + 4]))),
189
0
              0x20);
190
0
          const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
191
0
          const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
192
193
0
          const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
194
0
          const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
195
196
0
          const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
197
0
          const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
198
199
0
          s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
200
0
          s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
201
0
          s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
202
0
          s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
203
0
          s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
204
0
          s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
205
206
0
          const __m256i res_lo = convolve_12taps(s, coeffs_h);
207
208
0
          __m256i res_32b_lo = _mm256_sra_epi32(
209
0
              _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12);
210
211
0
          __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
212
0
          _mm_store_si128((__m128i *)&strip_im_block[i * im_stride],
213
0
                          _mm256_extracti128_si256(
214
0
                              _mm256_permute4x64_epi64(res_16b_lo, 0x88), 0));
215
0
        }
216
0
      }
217
0
    }
218
219
0
    for (int j = 0; j < w; j += 8) {
220
0
      const int16_t *im_block = &im_block_buf[(j / 8) * strip_stride];
221
0
      CONVOLVE_SR_VERTICAL_FILTER_12TAP
222
0
    }
223
742k
  } else {
224
742k
    int im_stride = 8, i;
225
742k
    const int strip_stride = (MAX_SB_SIZE + MAX_FILTER_TAP) * 8;
226
742k
    DECLARE_ALIGNED(
227
742k
        32, int16_t,
228
742k
        im_block_buf[(MAX_SB_SIZE / 8) * (MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
229
230
742k
    assert(conv_params->round_0 == 3);
231
742k
    assert(conv_params->round_1 == 11);
232
233
742k
    const __m256i round_const_h =
234
742k
        _mm256_set1_epi16(1 << (conv_params->round_0 - 2));
235
742k
    const __m256i round_const_v =
236
742k
        _mm256_set1_epi32(1 << (conv_params->round_1 - 1));
237
238
742k
    __m256i filt[4], coeffs_h[4] = { 0 }, coeffs_v[4] = { 0 };
239
240
742k
    int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
241
742k
    int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
242
243
742k
    assert(horiz_tap == 2 || horiz_tap == 4 || horiz_tap == 6 ||
244
742k
           horiz_tap == 8);
245
742k
    assert(vert_tap == 2 || vert_tap == 4 || vert_tap == 6 || vert_tap == 8);
246
247
742k
    if (horiz_tap == 2)
248
17.5k
      prepare_coeffs_2t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
249
724k
    else if (horiz_tap == 4)
250
34.9k
      prepare_coeffs_4t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
251
689k
    else if (horiz_tap == 6)
252
632k
      prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
253
57.5k
    else
254
57.5k
      prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
255
256
742k
    if (vert_tap == 2)
257
17.5k
      prepare_coeffs_2t(filter_params_y, subpel_y_qn, coeffs_v);
258
724k
    else if (vert_tap == 4)
259
346k
      prepare_coeffs_4t(filter_params_y, subpel_y_qn, coeffs_v);
260
378k
    else if (vert_tap == 6)
261
339k
      prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
262
38.8k
    else
263
38.8k
      prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
264
265
742k
    int im_h = h + vert_tap - 1;
266
742k
    const int fo_vert = vert_tap / 2 - 1;
267
742k
    const int fo_horiz = horiz_tap / 2 - 1;
268
742k
    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
269
270
742k
    filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
271
742k
    filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
272
742k
    filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
273
742k
    filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
274
275
742k
    if (subpel_x_qn == 0 && subpel_y_qn == 0) {
276
0
      for (i = 0; i < h; ++i) {
277
0
        for (int j = 0; j < w; j += 8) {
278
0
          _mm_storel_epi64(
279
0
              (__m128i *)&dst[i * dst_stride + j],
280
0
              _mm_loadl_epi64((const __m128i *)&src[i * src_stride + j]));
281
0
        }
282
0
      }
283
0
      return;
284
0
    }
285
286
5.63M
    for (i = 0; i < (im_h - 1); i += 2) {
287
4.89M
      const uint8_t *src_row0 = &src_ptr[i * src_stride];
288
4.89M
      const uint8_t *src_row1 = &src_ptr[(i + 1) * src_stride];
289
18.2M
      for (int j = 0; j < w; j += 8) {
290
13.4M
        int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
291
13.4M
        __m256i data =
292
13.4M
            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src_row0[j]));
293
13.4M
        data = _mm256_inserti128_si256(
294
13.4M
            data, _mm_loadu_si128((__m128i *)&src_row1[j]), 1);
295
296
13.4M
        __m256i res;
297
13.4M
        if (horiz_tap == 2)
298
402k
          res = convolve_lowbd_x_2tap(data, coeffs_h, filt);
299
12.9M
        else if (horiz_tap == 4)
300
797k
          res = convolve_lowbd_x_4tap(data, coeffs_h, filt);
301
12.2M
        else if (horiz_tap == 6)
302
9.91M
          res = convolve_lowbd_x_6tap(data, coeffs_h, filt);
303
2.28M
        else
304
2.28M
          res = convolve_lowbd_x(data, coeffs_h, filt);
305
306
13.4M
        res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2);
307
13.4M
        _mm256_store_si256((__m256i *)&strip_im_block[i * 8], res);
308
13.4M
      }
309
4.89M
    }
310
742k
    {
311
742k
      const uint8_t *src_row0 = &src_ptr[i * src_stride];
312
2.07M
      for (int j = 0; j < w; j += 8) {
313
1.32M
        int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
314
1.32M
        __m256i data_1 =
315
1.32M
            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src_row0[j]));
316
1.32M
        __m256i res;
317
1.32M
        if (horiz_tap == 2)
318
33.4k
          res = convolve_lowbd_x_2tap(data_1, coeffs_h, filt);
319
1.29M
        else if (horiz_tap == 4)
320
64.0k
          res = convolve_lowbd_x_4tap(data_1, coeffs_h, filt);
321
1.23M
        else if (horiz_tap == 6)
322
1.06M
          res = convolve_lowbd_x_6tap(data_1, coeffs_h, filt);
323
170k
        else
324
170k
          res = convolve_lowbd_x(data_1, coeffs_h, filt);
325
326
1.32M
        res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2);
327
1.32M
        _mm_store_si128((__m128i *)&strip_im_block[i * 8],
328
1.32M
                        _mm256_castsi256_si128(res));
329
1.32M
      }
330
742k
    }
331
332
2.07M
    for (int j = 0; j < w; j += 8) {
333
1.32M
      const int16_t *im_block = &im_block_buf[(j / 8) * strip_stride];
334
1.32M
      uint8_t *dst_ptr = dst + j;
335
1.32M
      if (vert_tap == 2) {
336
33.4k
        CONVOLVE_SR_VERTICAL_FILTER_2TAP
337
1.29M
      } else if (vert_tap == 4) {
338
469k
        CONVOLVE_SR_VERTICAL_FILTER_4TAP
339
825k
      } else if (vert_tap == 6) {
340
679k
        CONVOLVE_SR_VERTICAL_FILTER_6TAP
341
679k
      } else {
342
146k
        CONVOLVE_SR_VERTICAL_FILTER_8TAP
343
146k
      }
344
1.32M
    }
345
742k
  }
346
742k
}
347
348
// AVX2 entry point for the 8-bit 2D separable sub-pixel convolution.
//
// Chooses between the two kernels in this file:
//   * convolve_2d_sr_avx2()    - blocks wider than 4 pixels, and every block
//                                for which either direction uses 12 taps;
//   * convolve_2d_sr_w4_avx2() - all remaining (w <= 4) blocks.
// All arguments are forwarded unchanged to the selected kernel.
void av1_convolve_2d_sr_avx2(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int32_t subpel_x_qn,
    const int32_t subpel_y_qn, ConvolveParams *conv_params) {
  const int32_t taps_horiz = get_filter_tap(filter_params_x, subpel_x_qn);
  const int32_t taps_vert = get_filter_tap(filter_params_y, subpel_y_qn);

  // The narrow kernel only supports short filters, so a 12-tap filter in
  // either direction forces the general kernel regardless of width.
  if (w > 4 || taps_horiz == 12 || taps_vert == 12) {
    convolve_2d_sr_avx2(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
    return;
  }

  convolve_2d_sr_w4_avx2(src, src_stride, dst, dst_stride, w, h,
                         filter_params_x, filter_params_y, subpel_x_qn,
                         subpel_y_qn, conv_params);
}