Coverage Report

Created: 2026-06-30 06:53

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/av1/common/x86/convolve_2d_avx2.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <immintrin.h>
13
#include <stdint.h>
14
15
#include "config/av1_rtcd.h"
16
17
#include "aom_dsp/x86/convolve_avx2.h"
18
#include "aom_dsp/aom_filter.h"
19
#include "aom_dsp/x86/synonyms.h"
20
21
#include "av1/common/convolve.h"
22
23
// 2D separable sub-pixel convolution helper for narrow blocks. It is called by
// av1_convolve_2d_sr_avx2() when w <= 4 and neither filter is 12-tap.
//
// src / src_stride:   8-bit source pixels and row stride.
// dst / dst_stride:   8-bit destination pixels and row stride.
// w, h:               block dimensions.
// filter_params_x/_y, subpel_x_qn/_y_qn:
//                     select the horizontal / vertical kernels; passed to
//                     get_filter_tap() and the prepare_coeffs*() helpers.
// conv_params:        only round_0 and round_1 are read here; they are
//                     asserted to be 3 and 11.
//
// NOTE(review): the CONVOLVE_SR_* macros used below are not defined in the
// visible source and presumably reference the locals declared here (i,
// im_block, dst_ptr, filt, coeffs_h, coeffs_v, round_const_h, round_const_v,
// im_h, src_ptr, ...) by name -- check the macro definitions before renaming
// or removing any of them.
static void convolve_2d_sr_w4_avx2(
24
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
25
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
26
    const InterpFilterParams *filter_params_y, const int32_t subpel_x_qn,
27
584k
    const int32_t subpel_y_qn, ConvolveParams *conv_params) {
28
584k
  int i;
29
584k
  // 32-byte aligned intermediate buffer of (MAX_SB_SIZE + MAX_FILTER_TAP) * 4
  // int16 values; presumably written by the horizontal macros and read by the
  // vertical ones.
  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 4]);
30
584k
  uint8_t *dst_ptr = dst;
31
584k
  assert(conv_params->round_0 == 3);
32
584k
  assert(conv_params->round_1 == 11);
33
34
584k
  // Rounding terms: 1 << (round_0 - 2) in every 16-bit lane for the horizontal
  // stage, 1 << (round_1 - 1) in every 32-bit lane for the vertical stage.
  const __m128i round_const_h = _mm_set1_epi16(1 << (conv_params->round_0 - 2));
35
584k
  const __m256i round_const_v =
36
584k
      _mm256_set1_epi32(1 << (conv_params->round_1 - 1));
37
38
584k
  __m128i filt[2], coeffs_h[2] = { 0 };
39
584k
  __m256i coeffs_v[4] = { 0 };
40
41
584k
  // Tap counts of the selected kernels; they pick the code path in each
  // direction below.
  const int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
42
584k
  const int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
43
44
584k
  // Only 2- or 4-tap horizontal kernels are handled by this helper.
  assert(horiz_tap == 2 || horiz_tap == 4);
45
584k
  assert(vert_tap == 2 || vert_tap == 4 || vert_tap == 6 || vert_tap == 8);
46
47
584k
  // Load the horizontal coefficients with the helper matching the tap count.
  if (horiz_tap == 2)
48
15.6k
    prepare_coeffs_2t_ssse3(filter_params_x, subpel_x_qn, coeffs_h);
49
568k
  else
50
568k
    prepare_coeffs_4t_ssse3(filter_params_x, subpel_x_qn, coeffs_h);
51
52
584k
  // Load the vertical coefficients with the helper matching the tap count.
  if (vert_tap == 2)
53
15.6k
    prepare_coeffs_2t(filter_params_y, subpel_y_qn, coeffs_v);
54
568k
  else if (vert_tap == 4)
55
377k
    prepare_coeffs_4t(filter_params_y, subpel_y_qn, coeffs_v);
56
191k
  else if (vert_tap == 6)
57
181k
    prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
58
9.94k
  else
59
9.94k
    prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
60
61
584k
  // im_h: h output rows plus the (vert_tap - 1) extra rows the vertical kernel
  // spans.
  int im_h = h + vert_tap - 1;
62
584k
  // Rows above / columns left of the output position covered by each kernel.
  const int fo_vert = vert_tap / 2 - 1;
63
584k
  const int fo_horiz = horiz_tap / 2 - 1;
64
584k
  // Back the source pointer up by fo_vert rows and fo_horiz columns.
  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
65
66
584k
  // Aligned loads of two 128-bit global tables (presumably byte-shuffle masks
  // used by the horizontal macros -- confirm against their definitions).
  filt[0] = _mm_load_si128((__m128i const *)filt1_global_sse2);
67
584k
  filt[1] = _mm_load_si128((__m128i const *)filt2_global_sse2);
68
69
584k
  // Horizontal stage, selected by tap count (macro body not visible here).
  if (horiz_tap == 2) {
70
15.6k
    CONVOLVE_SR_HOR_FILTER_2TAP_W4
71
568k
  } else {
72
568k
    CONVOLVE_SR_HOR_FILTER_4TAP_W4
73
568k
  }
74
75
584k
  // Vertical stage, selected by tap count (macro body not visible here).
  if (vert_tap == 2) {
76
15.6k
    CONVOLVE_SR_VER_FILTER_2TAP_W4
77
568k
  } else if (vert_tap == 4) {
78
377k
    CONVOLVE_SR_VER_FILTER_4TAP_W4
79
377k
  } else if (vert_tap == 6) {
80
181k
    CONVOLVE_SR_VER_FILTER_6TAP_W4
81
181k
  } else {
82
9.94k
    CONVOLVE_SR_VER_FILTER_8TAP_W4
83
9.94k
  }
84
584k
}
85
86
// General 2D separable sub-pixel convolution for 8-bit pixels: a horizontal
// pass writes 16-bit results into an intermediate buffer, then a vertical pass
// (macro-expanded) consumes that buffer. Called by av1_convolve_2d_sr_avx2()
// when w > 4 or when either filter is 12-tap.
//
// Two paths are selected on filter_params_x->taps:
//   - taps > 8:  12-tap path; both directions are treated as 12-tap.
//   - otherwise: 2/4/6/8-tap path; round_0 == 3 and round_1 == 11 are
//                asserted.
//
// In both paths the intermediate buffer is organised as strips of 8 columns:
// the strip for column j starts at im_block_buf[(j / 8) * strip_stride] and
// its rows are im_stride (8) int16 values apart.
//
// src / src_stride:   8-bit source pixels and row stride.
// dst / dst_stride:   8-bit destination pixels and row stride.
// w, h:               block dimensions.
// filter_params_x/_y, subpel_x_qn/_y_qn:
//                     select the horizontal / vertical kernels.
// conv_params:        only round_0 and round_1 are read in the visible code.
//
// NOTE(review): the CONVOLVE_SR_VERTICAL_FILTER_* macros are not defined in
// the visible source and presumably use the surrounding locals (im_block,
// dst_ptr or dst, im_stride, im_h, coeffs_v, s, the round constants, i, ...)
// by name -- check the macro definitions before renaming or removing locals.
static void convolve_2d_sr_avx2(const uint8_t *src, int src_stride,
87
                                uint8_t *dst, int dst_stride, int w, int h,
88
                                const InterpFilterParams *filter_params_x,
89
                                const InterpFilterParams *filter_params_y,
90
                                const int subpel_x_qn, const int subpel_y_qn,
91
831k
                                ConvolveParams *conv_params) {
92
831k
  // ---- 12-tap path ----
  if (filter_params_x->taps > 8) {
93
0
    // bd feeds the offset / rounding constants below (presumably the pixel bit
    // depth).
    const int bd = 8;
94
0
    int im_stride = 8, i;
95
0
    // int16 elements per 8-column strip of the intermediate buffer.
    const int strip_stride = (MAX_SB_SIZE + MAX_FILTER_TAP) * 8;
96
0
    DECLARE_ALIGNED(
97
0
        32, int16_t,
98
0
        im_block_buf[(MAX_SB_SIZE / 8) * (MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
99
0
    // Shift remaining after both rounding stages have been applied.
    const int bits =
100
0
        FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
101
0
    const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
102
103
0
    assert(conv_params->round_0 > 0);
104
105
0
    // Horizontal stage: half of (1 << round_0) for rounding, plus an offset of
    // 1 << (bd + FILTER_BITS - 1); the shift count is round_0.
    const __m256i round_const_h12 = _mm256_set1_epi32(
106
0
        ((1 << (conv_params->round_0)) >> 1) + (1 << (bd + FILTER_BITS - 1)));
107
0
    const __m128i round_shift_h12 = _mm_cvtsi32_si128(conv_params->round_0);
108
109
0
    // The four constants below are not referenced in the visible code;
    // presumably they are consumed by CONVOLVE_SR_VERTICAL_FILTER_12TAP.
    const __m256i sum_round_v = _mm256_set1_epi32(
110
0
        (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
111
0
    const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
112
113
0
    const __m256i round_const_v = _mm256_set1_epi32(
114
0
        ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
115
0
        ((1 << (offset_bits - conv_params->round_1)) >> 1));
116
0
    const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
117
118
0
    __m256i coeffs_h[6] = { 0 }, coeffs_v[6] = { 0 };
119
120
0
    // Both directions use 12 taps on this path.
    int horiz_tap = 12;
121
0
    int vert_tap = 12;
122
123
0
    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs_h);
124
0
    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs_v);
125
126
0
    // im_h: h output rows plus the (vert_tap - 1) extra rows the vertical
    // kernel spans.
    int im_h = h + vert_tap - 1;
127
0
    const int fo_vert = vert_tap / 2 - 1;
128
0
    const int fo_horiz = horiz_tap / 2 - 1;
129
0
    // Back the source pointer up by fo_vert rows and fo_horiz columns.
    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
130
131
0
    const __m256i v_zero = _mm256_setzero_si256();
132
0
    __m256i s[12];
133
0
    if (w <= 4) {
134
0
      // Narrow blocks: two source rows per iteration (rows i and i + 1).
      for (i = 0; i < im_h; i += 2) {
135
0
        // With w <= 4 this loop runs a single iteration (j == 0) for
        // positive w.
        for (int j = 0; j < w; j += 8) {
136
0
          int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
137
0
          // Row i goes to the low 128-bit lane and row i + 1 to the high lane
          // (control 0x20 selects the low halves of both operands).
          const __m256i data = _mm256_permute2x128_si256(
138
0
              _mm256_castsi128_si256(
139
0
                  _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
140
0
              _mm256_castsi128_si256(_mm_loadu_si128(
141
0
                  (__m128i *)(&src_ptr[i * src_stride + src_stride + j]))),
142
0
              0x20);
143
0
          // Zero-extend the bytes to 16 bits, then duplicate every 16-bit
          // sample by unpacking each vector with itself.
          const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
144
0
          const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
145
0
          const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
146
0
          const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
147
148
0
          const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
149
0
          const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
150
151
0
          // Shifting the duplicated samples by one 16-bit element leaves s[k]
          // holding, per lane, the pairs (p[x + 2k], p[x + 2k + 1]) for output
          // columns x = 0..3.
          s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
152
0
          s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
153
0
          s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
154
0
          s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
155
0
          s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
156
0
          s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
157
158
0
          const __m256i res_lo = convolve_12taps(s, coeffs_h);
159
160
0
          // Add the rounding/offset term, arithmetic-shift by round_0, then
          // narrow to 16 bits with signed saturation.
          __m256i res_32b_lo = _mm256_sra_epi32(
161
0
              _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12);
162
0
          __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
163
0
          // res_0 holds the results for row i, res_1 those for row i + 1.
          const __m128i res_0 = _mm256_extracti128_si256(res_16b_lo, 0);
164
0
          const __m128i res_1 = _mm256_extracti128_si256(res_16b_lo, 1);
165
0
          if (w > 2) {
166
0
            // Store four int16 results (64 bits) for each of the two rows.
            _mm_storel_epi64((__m128i *)&strip_im_block[i * im_stride], res_0);
167
0
            _mm_storel_epi64(
168
0
                (__m128i *)&strip_im_block[i * im_stride + im_stride], res_1);
169
0
          } else {
170
0
            // w <= 2: store only two int16 results per row, split out of the
            // low 32 bits of each result vector.
            uint32_t horiz_2;
171
0
            horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_0);
172
0
            strip_im_block[i * im_stride] = (uint16_t)horiz_2;
173
0
            strip_im_block[i * im_stride + 1] = (uint16_t)(horiz_2 >> 16);
174
0
            horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_1);
175
0
            strip_im_block[i * im_stride + im_stride] = (uint16_t)horiz_2;
176
0
            strip_im_block[i * im_stride + im_stride + 1] =
177
0
                (uint16_t)(horiz_2 >> 16);
178
0
          }
179
0
        }
180
0
      }
181
0
    } else {
182
0
      // Wider blocks: one source row per iteration, 8 output columns per
      // strip.
      for (i = 0; i < im_h; i++) {
183
0
        for (int j = 0; j < w; j += 8) {
184
0
          int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
185
0
          // Low lane is loaded from column j, high lane from column j + 4, so
          // each lane yields 4 of the strip's 8 outputs.
          const __m256i data = _mm256_permute2x128_si256(
186
0
              _mm256_castsi128_si256(
187
0
                  _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
188
0
              _mm256_castsi128_si256(_mm_loadu_si128(
189
0
                  (__m128i *)(&src_ptr[i * src_stride + j + 4]))),
190
0
              0x20);
191
0
          // Same widening / duplication / pairing sequence as the w <= 4
          // branch above.
          const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
192
0
          const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
193
194
0
          const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
195
0
          const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
196
197
0
          const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
198
0
          const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
199
200
0
          s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
201
0
          s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
202
0
          s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
203
0
          s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
204
0
          s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
205
0
          s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
206
207
0
          const __m256i res_lo = convolve_12taps(s, coeffs_h);
208
209
0
          __m256i res_32b_lo = _mm256_sra_epi32(
210
0
              _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12);
211
212
0
          __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
213
0
          // Control 0x88 gathers 64-bit elements 0 and 2 (the four results of
          // each lane) into the low 128 bits, stored as one row of 8 int16
          // values with an aligned store.
          _mm_store_si128((__m128i *)&strip_im_block[i * im_stride],
214
0
                          _mm256_extracti128_si256(
215
0
                              _mm256_permute4x64_epi64(res_16b_lo, 0x88), 0));
216
0
        }
217
0
      }
218
0
    }
219
220
0
    // Vertical stage, one 8-column strip at a time (macro body not visible
    // here).
    for (int j = 0; j < w; j += 8) {
221
0
      const int16_t *im_block = &im_block_buf[(j / 8) * strip_stride];
222
0
      CONVOLVE_SR_VERTICAL_FILTER_12TAP
223
0
    }
224
831k
  // ---- 2/4/6/8-tap path ----
  } else {
225
831k
    int im_stride = 8, i;
226
831k
    // int16 elements per 8-column strip of the intermediate buffer.
    const int strip_stride = (MAX_SB_SIZE + MAX_FILTER_TAP) * 8;
227
831k
    DECLARE_ALIGNED(
228
831k
        32, int16_t,
229
831k
        im_block_buf[(MAX_SB_SIZE / 8) * (MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
230
231
831k
    assert(conv_params->round_0 == 3);
232
831k
    assert(conv_params->round_1 == 11);
233
234
831k
    // Rounding terms: 1 << (round_0 - 2) per 16-bit lane for the horizontal
    // stage, 1 << (round_1 - 1) per 32-bit lane for the vertical stage.
    // NOTE(review): the horizontal stage shifts by 2 rather than round_0 (3);
    // presumably prepare_coeffs_*_lowbd() pre-scales the coefficients --
    // confirm.
    const __m256i round_const_h =
235
831k
        _mm256_set1_epi16(1 << (conv_params->round_0 - 2));
236
831k
    const __m256i round_const_v =
237
831k
        _mm256_set1_epi32(1 << (conv_params->round_1 - 1));
238
239
831k
    __m256i filt[4], coeffs_h[4] = { 0 }, coeffs_v[4] = { 0 };
240
241
831k
    // Tap counts of the selected kernels; they pick the code path in each
    // direction below.
    int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
242
831k
    int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
243
244
831k
    assert(horiz_tap == 2 || horiz_tap == 4 || horiz_tap == 6 ||
245
831k
           horiz_tap == 8);
246
831k
    assert(vert_tap == 2 || vert_tap == 4 || vert_tap == 6 || vert_tap == 8);
247
248
831k
    // Load the horizontal coefficients with the helper matching the tap count.
    if (horiz_tap == 2)
249
14.7k
      prepare_coeffs_2t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
250
816k
    else if (horiz_tap == 4)
251
38.0k
      prepare_coeffs_4t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
252
778k
    else if (horiz_tap == 6)
253
730k
      prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
254
47.6k
    else
255
47.6k
      prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
256
257
831k
    // Load the vertical coefficients with the helper matching the tap count.
    if (vert_tap == 2)
258
14.7k
      prepare_coeffs_2t(filter_params_y, subpel_y_qn, coeffs_v);
259
816k
    else if (vert_tap == 4)
260
402k
      prepare_coeffs_4t(filter_params_y, subpel_y_qn, coeffs_v);
261
414k
    else if (vert_tap == 6)
262
382k
      prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
263
31.7k
    else
264
31.7k
      prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
265
266
831k
    // im_h: h output rows plus the (vert_tap - 1) extra rows the vertical
    // kernel spans.
    int im_h = h + vert_tap - 1;
267
831k
    const int fo_vert = vert_tap / 2 - 1;
268
831k
    const int fo_horiz = horiz_tap / 2 - 1;
269
831k
    // Back the source pointer up by fo_vert rows and fo_horiz columns.
    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
270
271
831k
    // Aligned loads of four 256-bit global tables (presumably byte-shuffle
    // masks for the convolve_lowbd_x*() helpers -- confirm).
    filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
272
831k
    filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
273
831k
    filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
274
831k
    filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
275
276
831k
    // Both sub-pixel offsets are zero: copy src to dst and skip filtering.
    // NOTE(review): each step writes 8 bytes, so for a w that is not a
    // multiple of 8 this would write past column w -- presumably w is a
    // multiple of 8 whenever this branch is reached; confirm against callers.
    if (subpel_x_qn == 0 && subpel_y_qn == 0) {
277
0
      for (i = 0; i < h; ++i) {
278
0
        for (int j = 0; j < w; j += 8) {
279
0
          _mm_storel_epi64(
280
0
              (__m128i *)&dst[i * dst_stride + j],
281
0
              _mm_loadl_epi64((const __m128i *)&src[i * src_stride + j]));
282
0
        }
283
0
      }
284
0
      return;
285
0
    }
286
287
6.15M
    // Horizontal stage, two source rows per iteration (rows i and i + 1).
    for (i = 0; i < (im_h - 1); i += 2) {
288
5.32M
      const uint8_t *src_row0 = &src_ptr[i * src_stride];
289
5.32M
      const uint8_t *src_row1 = &src_ptr[(i + 1) * src_stride];
290
19.5M
      for (int j = 0; j < w; j += 8) {
291
14.2M
        int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
292
14.2M
        // Row i in the low 128-bit lane, row i + 1 in the high lane.
        __m256i data =
293
14.2M
            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src_row0[j]));
294
14.2M
        data = _mm256_inserti128_si256(
295
14.2M
            data, _mm_loadu_si128((__m128i *)&src_row1[j]), 1);
296
297
14.2M
        __m256i res;
298
14.2M
        if (horiz_tap == 2)
299
403k
          res = convolve_lowbd_x_2tap(data, coeffs_h, filt);
300
13.8M
        else if (horiz_tap == 4)
301
973k
          res = convolve_lowbd_x_4tap(data, coeffs_h, filt);
302
12.8M
        else if (horiz_tap == 6)
303
11.0M
          res = convolve_lowbd_x_6tap(data, coeffs_h, filt);
304
1.82M
        else
305
1.82M
          res = convolve_lowbd_x(data, coeffs_h, filt);
306
307
14.2M
        // Round and arithmetic-shift right by 2, then store both rows with one
        // aligned 32-byte store: 8 int16 for row i followed by 8 for row
        // i + 1 (the literal 8 equals im_stride).
        res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2);
308
14.2M
        _mm256_store_si256((__m256i *)&strip_im_block[i * 8], res);
309
14.2M
      }
310
5.32M
    }
311
831k
    // Handle the single row at index i left by the paired loop (when im_h is
    // odd this is the final row, im_h - 1). Only the low 128-bit lane carries
    // data and only 8 int16 values are stored.
    {
312
831k
      const uint8_t *src_row0 = &src_ptr[i * src_stride];
313
2.29M
      for (int j = 0; j < w; j += 8) {
314
1.46M
        int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
315
1.46M
        __m256i data_1 =
316
1.46M
            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src_row0[j]));
317
1.46M
        __m256i res;
318
1.46M
        if (horiz_tap == 2)
319
30.6k
          res = convolve_lowbd_x_2tap(data_1, coeffs_h, filt);
320
1.43M
        else if (horiz_tap == 4)
321
72.2k
          res = convolve_lowbd_x_4tap(data_1, coeffs_h, filt);
322
1.35M
        else if (horiz_tap == 6)
323
1.22M
          res = convolve_lowbd_x_6tap(data_1, coeffs_h, filt);
324
136k
        else
325
136k
          res = convolve_lowbd_x(data_1, coeffs_h, filt);
326
327
1.46M
        res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2);
328
1.46M
        _mm_store_si128((__m128i *)&strip_im_block[i * 8],
329
1.46M
                        _mm256_castsi256_si128(res));
330
1.46M
      }
331
831k
    }
332
333
2.29M
    // Vertical stage, one 8-column strip at a time; dst_ptr points at the
    // strip's first output column (macro bodies not visible here).
    for (int j = 0; j < w; j += 8) {
334
1.45M
      const int16_t *im_block = &im_block_buf[(j / 8) * strip_stride];
335
1.45M
      uint8_t *dst_ptr = dst + j;
336
1.45M
      if (vert_tap == 2) {
337
30.5k
        CONVOLVE_SR_VERTICAL_FILTER_2TAP
338
1.42M
      } else if (vert_tap == 4) {
339
546k
        CONVOLVE_SR_VERTICAL_FILTER_4TAP
340
882k
      } else if (vert_tap == 6) {
341
766k
        CONVOLVE_SR_VERTICAL_FILTER_6TAP
342
766k
      } else {
343
115k
        CONVOLVE_SR_VERTICAL_FILTER_8TAP
344
115k
      }
345
1.45M
    }
346
831k
  }
347
831k
}
348
349
// Entry point for the AVX2 2D separable sub-pixel convolution of 8-bit
// pixels. It only dispatches: all arguments are forwarded unchanged to one of
// the two static helpers in this file.
//
//   - w <= 4 and neither filter is 12-tap -> convolve_2d_sr_w4_avx2()
//   - otherwise                           -> convolve_2d_sr_avx2()
//
// src / src_stride:   8-bit source pixels and row stride.
// dst / dst_stride:   8-bit destination pixels and row stride.
// w, h:               block dimensions.
// filter_params_x/_y, subpel_x_qn/_y_qn:
//                     select the horizontal / vertical kernels; their tap
//                     counts come from get_filter_tap().
// conv_params:        forwarded to the helper (not read here).
void av1_convolve_2d_sr_avx2(
350
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
351
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
352
    const InterpFilterParams *filter_params_y, const int32_t subpel_x_qn,
353
1.41M
    const int32_t subpel_y_qn, ConvolveParams *conv_params) {
354
1.41M
  const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_qn);
355
1.41M
  const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_qn);
356
357
1.41M
  // A 12-tap filter in either direction rules out the narrow-block helper,
  // which asserts at most 4 horizontal and 8 vertical taps.
  const bool use_12tap = (tap_x == 12 || tap_y == 12);
358
1.41M
  if (w <= 4 && !use_12tap) {
359
584k
    convolve_2d_sr_w4_avx2(src, src_stride, dst, dst_stride, w, h,
360
584k
                           filter_params_x, filter_params_y, subpel_x_qn,
361
584k
                           subpel_y_qn, conv_params);
362
831k
  } else {
363
831k
    convolve_2d_sr_avx2(src, src_stride, dst, dst_stride, w, h, filter_params_x,
364
831k
                        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
365
831k
  }
366
1.41M
}