Coverage Report

Created: 2026-03-08 06:51

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/av1/common/x86/convolve_2d_avx2.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <immintrin.h>
13
14
#include "config/av1_rtcd.h"
15
16
#include "aom_dsp/x86/convolve_avx2.h"
17
#include "aom_dsp/aom_filter.h"
18
#include "aom_dsp/x86/synonyms.h"
19
20
#include "av1/common/convolve.h"
21
22
// Narrow-block path used by av1_convolve_2d_sr_avx2() when w <= 4 and no
// 12-tap kernel is involved. Picks a horizontal and a vertical filter stage
// from the tap counts reported by get_filter_tap() and runs them in that
// order.
//
// NOTE(review): the CONVOLVE_SR_*_W4 macros are defined outside this file and
// are expanded in place. They presumably refer to the parameters and locals
// of this function by name (i, im_block, im_h, src_ptr, filt, coeffs_h,
// coeffs_v, round_const_h, round_const_v, dst_ptr, ...), so none of those
// identifiers should be renamed or removed without checking the macros.
static void convolve_2d_sr_w4_avx2(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int32_t subpel_x_qn,
    const int32_t subpel_y_qn, ConvolveParams *conv_params) {
  // This path is only written for one pair of rounding shifts.
  assert(conv_params->round_0 == 3);
  assert(conv_params->round_1 == 11);

  const int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
  const int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
  assert(horiz_tap == 2 || horiz_tap == 4);
  assert(vert_tap == 2 || vert_tap == 4 || vert_tap == 6 || vert_tap == 8);

  // Rounding terms for the two stages, derived from the rounding shifts.
  const __m128i round_const_h = _mm_set1_epi16(1 << (conv_params->round_0 - 2));
  const __m256i round_const_v =
      _mm256_set1_epi32(1 << (conv_params->round_1 - 1));

  // Horizontal coefficients: anything that is not 2-tap uses the 4-tap setup.
  __m128i coeffs_h[2] = { 0 };
  switch (horiz_tap) {
    case 2:
      prepare_coeffs_2t_ssse3(filter_params_x, subpel_x_qn, coeffs_h);
      break;
    default:
      prepare_coeffs_4t_ssse3(filter_params_x, subpel_x_qn, coeffs_h);
      break;
  }

  // Vertical coefficients: anything that is not 2/4/6-tap uses the generic
  // prepare_coeffs() setup.
  __m256i coeffs_v[4] = { 0 };
  switch (vert_tap) {
    case 2: prepare_coeffs_2t(filter_params_y, subpel_y_qn, coeffs_v); break;
    case 4: prepare_coeffs_4t(filter_params_y, subpel_y_qn, coeffs_v); break;
    case 6: prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v); break;
    default: prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v); break;
  }

  // Start reading fo_vert rows above and fo_horiz columns left of src.
  const int fo_vert = vert_tap / 2 - 1;
  const int fo_horiz = horiz_tap / 2 - 1;
  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
  int im_h = h + vert_tap - 1;

  // Load the two global filt*_sse2 tables.
  __m128i filt[2];
  filt[0] = _mm_load_si128((__m128i const *)filt1_global_sse2);
  filt[1] = _mm_load_si128((__m128i const *)filt2_global_sse2);

  // State picked up by the stage macros below (see NOTE above).
  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 4]);
  uint8_t *dst_ptr = dst;
  int i;

  // Horizontal stage.
  switch (horiz_tap) {
    case 2: {
      CONVOLVE_SR_HOR_FILTER_2TAP_W4
    } break;
    default: {
      CONVOLVE_SR_HOR_FILTER_4TAP_W4
    } break;
  }

  // Vertical stage.
  switch (vert_tap) {
    case 2: {
      CONVOLVE_SR_VER_FILTER_2TAP_W4
    } break;
    case 4: {
      CONVOLVE_SR_VER_FILTER_4TAP_W4
    } break;
    case 6: {
      CONVOLVE_SR_VER_FILTER_6TAP_W4
    } break;
    default: {
      CONVOLVE_SR_VER_FILTER_8TAP_W4
    } break;
  }
}
84
85
// Branch of convolve_2d_sr_general_avx2() taken when filter_params_x->taps is
// greater than 8. Both directions are treated as 12-tap.
//
// NOTE(review): the CONVOLVE_SR_*_12TAP macros are defined outside this file
// and are expanded in place. They presumably refer to the parameters and
// locals declared here by name, so the identifiers below are kept exactly as
// the macros expect them; check the macro definitions before renaming any.
static void convolve_2d_sr_general_12tap_avx2(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  const int bd = 8;
  int im_stride = 8, i;
  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);

  // bits: what is left of 2 * FILTER_BITS after both rounding shifts.
  const int bits =
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;

  assert(conv_params->round_0 > 0);

  // First-stage rounding: half of the round_0 step plus a fixed offset of
  // 1 << (bd + FILTER_BITS - 1), together with the matching shift count.
  const __m256i round_const_h12 = _mm256_set1_epi32(
      ((1 << (conv_params->round_0)) >> 1) + (1 << (bd + FILTER_BITS - 1)));
  const __m128i round_shift_h12 = _mm_cvtsi32_si128(conv_params->round_0);

  // Second-stage rounding: 1 << offset_bits plus half of the round_1 step,
  // together with the matching shift count.
  const __m256i sum_round_v = _mm256_set1_epi32(
      (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
  const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);

  // Final rounding by `bits`, with the offset terms subtracted again.
  const __m256i round_const_v = _mm256_set1_epi32(
      ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
      ((1 << (offset_bits - conv_params->round_1)) >> 1));
  const __m128i round_shift_v = _mm_cvtsi32_si128(bits);

  __m256i coeffs_h[6] = { 0 }, coeffs_v[6] = { 0 };

  int horiz_tap = 12;
  int vert_tap = 12;

  prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs_h);
  prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs_v);

  // Start reading fo_vert rows above and fo_horiz columns left of src.
  int im_h = h + vert_tap - 1;
  const int fo_vert = vert_tap / 2 - 1;
  const int fo_horiz = horiz_tap / 2 - 1;
  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

  // Walk the block in steps of 8 columns.
  for (int j = 0; j < w; j += 8) {
    CONVOLVE_SR_HORIZONTAL_FILTER_12TAP
    CONVOLVE_SR_VERTICAL_FILTER_12TAP
  }
}

// Branch of convolve_2d_sr_general_avx2() taken when filter_params_x->taps is
// at most 8. The horizontal and vertical stages are chosen independently from
// the tap counts reported by get_filter_tap().
//
// NOTE(review): the CONVOLVE_SR_* macros are defined outside this file and
// are expanded in place. They presumably refer to the parameters and locals
// declared here by name (i, j, im_block, im_stride, im_h, src_ptr, filt,
// coeffs_h, coeffs_v, round_const_h, round_const_v, dst_ptr, ...); check the
// macro definitions before renaming any of them.
static void convolve_2d_sr_general_8tap_avx2(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  int im_stride = 8, i;
  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);

  // This path is only written for one pair of rounding shifts.
  assert(conv_params->round_0 == 3);
  assert(conv_params->round_1 == 11);

  // Rounding terms for the two stages, derived from the rounding shifts.
  const __m256i round_const_h =
      _mm256_set1_epi16(1 << (conv_params->round_0 - 2));
  const __m256i round_const_v =
      _mm256_set1_epi32(1 << (conv_params->round_1 - 1));

  __m256i filt[4], coeffs_h[4] = { 0 }, coeffs_v[4] = { 0 };

  int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
  int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);

  assert(horiz_tap == 2 || horiz_tap == 4 || horiz_tap == 6 ||
         horiz_tap == 8);
  assert(vert_tap == 2 || vert_tap == 4 || vert_tap == 6 || vert_tap == 8);

  // Horizontal coefficients: anything that is not 2/4/6-tap uses the generic
  // prepare_coeffs_lowbd() setup.
  switch (horiz_tap) {
    case 2:
      prepare_coeffs_2t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
      break;
    case 4:
      prepare_coeffs_4t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
      break;
    case 6:
      prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
      break;
    default:
      prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
      break;
  }

  // Vertical coefficients: anything that is not 2/4/6-tap uses the generic
  // prepare_coeffs() setup.
  switch (vert_tap) {
    case 2: prepare_coeffs_2t(filter_params_y, subpel_y_qn, coeffs_v); break;
    case 4: prepare_coeffs_4t(filter_params_y, subpel_y_qn, coeffs_v); break;
    case 6: prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v); break;
    default: prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v); break;
  }

  // Start reading fo_vert rows above and fo_horiz columns left of src.
  int im_h = h + vert_tap - 1;
  const int fo_vert = vert_tap / 2 - 1;
  const int fo_horiz = horiz_tap / 2 - 1;
  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

  // Load the four global filt*_avx2 tables.
  filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
  filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);

  // Walk the block in steps of 8 columns: horizontal stage first, then the
  // vertical stage writing at column offset j of dst.
  for (int j = 0; j < w; j += 8) {
    switch (horiz_tap) {
      case 2: {
        CONVOLVE_SR_HORIZONTAL_FILTER_2TAP
      } break;
      case 4: {
        CONVOLVE_SR_HORIZONTAL_FILTER_4TAP
      } break;
      case 6: {
        CONVOLVE_SR_HORIZONTAL_FILTER_6TAP
      } break;
      default: {
        CONVOLVE_SR_HORIZONTAL_FILTER_8TAP
      } break;
    }

    uint8_t *dst_ptr = dst + j;
    switch (vert_tap) {
      case 2: {
        CONVOLVE_SR_VERTICAL_FILTER_2TAP
      } break;
      case 4: {
        CONVOLVE_SR_VERTICAL_FILTER_4TAP
      } break;
      case 6: {
        CONVOLVE_SR_VERTICAL_FILTER_6TAP
      } break;
      default: {
        CONVOLVE_SR_VERTICAL_FILTER_8TAP
      } break;
    }
  }
}

// General path used by av1_convolve_2d_sr_avx2() for everything the w <= 4
// path does not take. Selects the 12-tap implementation when
// filter_params_x->taps exceeds 8, and the 2/4/6/8-tap implementation
// otherwise.
static void convolve_2d_sr_general_avx2(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  if (filter_params_x->taps > 8) {
    convolve_2d_sr_general_12tap_avx2(src, src_stride, dst, dst_stride, w, h,
                                      filter_params_x, filter_params_y,
                                      subpel_x_qn, subpel_y_qn, conv_params);
  } else {
    convolve_2d_sr_general_8tap_avx2(src, src_stride, dst, dst_stride, w, h,
                                     filter_params_x, filter_params_y,
                                     subpel_x_qn, subpel_y_qn, conv_params);
  }
}
203
204
// Entry point: forwards all arguments unchanged to one of the two static
// implementations in this file.
//
// The w <= 4 implementation is used only when the block is at most 4 wide
// and neither direction reports a 12-tap kernel from get_filter_tap(); every
// other case goes to the general implementation.
void av1_convolve_2d_sr_avx2(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int32_t subpel_x_qn,
    const int32_t subpel_y_qn, ConvolveParams *conv_params) {
  const int32_t horiz_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  const int32_t vert_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const bool any_12tap = (horiz_taps == 12) || (vert_taps == 12);

  // Wide blocks and 12-tap kernels are not handled by the w <= 4 path.
  if (any_12tap || w > 4) {
    convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
                                filter_params_x, filter_params_y, subpel_x_qn,
                                subpel_y_qn, conv_params);
    return;
  }

  convolve_2d_sr_w4_avx2(src, src_stride, dst, dst_stride, w, h,
                         filter_params_x, filter_params_y, subpel_x_qn,
                         subpel_y_qn, conv_params);
}