Coverage Report

Created: 2026-03-31 06:59

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/av1/common/x86/convolve_2d_avx2.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <immintrin.h>
13
14
#include "config/av1_rtcd.h"
15
16
#include "aom_dsp/x86/convolve_avx2.h"
17
#include "aom_dsp/aom_filter.h"
18
#include "aom_dsp/x86/synonyms.h"
19
20
#include "av1/common/convolve.h"
21
22
// 2D subpel convolution ("sr" variant) for narrow blocks. The dispatcher
// av1_convolve_2d_sr_avx2() routes here only when w <= 4 and neither filter
// resolves to 12 taps.
//
// The actual filtering is emitted by CONVOLVE_SR_*_W4 macros that are not
// defined in this file (presumably aom_dsp/x86/convolve_avx2.h). Judging by
// their names, a horizontal pass is followed by a vertical pass. The macros
// are expanded in place, so they can only see the locals declared here:
// i, im_block, dst_ptr, filt, round_const_h/v, im_h and src_ptr have no other
// visible use and are presumably referenced by name inside the expansions.
// Do not rename or remove them without checking the macro definitions.
//
//   src, src_stride     8-bit source samples and their row stride.
//   dst, dst_stride     8-bit destination buffer and its row stride.
//   w, h                block dimensions; h feeds the im_h computation below.
//   filter_params_x/y   interpolation filter descriptors, one per direction.
//   subpel_x_qn/y_qn    subpel positions forwarded to get_filter_tap() and
//                       the prepare_coeffs_* helpers.
//   conv_params         rounding configuration; round_0 must be 3 and
//                       round_1 must be 11 (asserted).
static void convolve_2d_sr_w4_avx2(
23
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
24
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
25
    const InterpFilterParams *filter_params_y, const int32_t subpel_x_qn,
26
551k
    const int32_t subpel_y_qn, ConvolveParams *conv_params) {
27
551k
  // Not referenced directly below; presumably the row counter used by the
  // CONVOLVE_SR_* expansions.
  int i;
28
551k
  // int16_t scratch buffer declared with a requested alignment of 32 and
  // (MAX_SB_SIZE + MAX_FILTER_TAP) * 4 entries. Presumably receives the
  // horizontal-pass output -- confirm against the macro definitions.
  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 4]);
29
551k
  uint8_t *dst_ptr = dst;
30
551k
  // This path only supports these two rounding shifts; the rounding constants
  // below are derived from them.
  assert(conv_params->round_0 == 3);
31
551k
  assert(conv_params->round_1 == 11);
32
33
551k
  // 1 << (round_0 - 2), replicated to every 16-bit lane (2 when round_0 == 3).
  const __m128i round_const_h = _mm_set1_epi16(1 << (conv_params->round_0 - 2));
34
551k
  // 1 << (round_1 - 1), replicated to every 32-bit lane (1024 when
  // round_1 == 11).
  const __m256i round_const_v =
35
551k
      _mm256_set1_epi32(1 << (conv_params->round_1 - 1));
36
37
551k
  // filt[] is filled by the two loads further down; the coefficient arrays
  // are zero-initialized and then filled by the prepare_coeffs_* helpers.
  __m128i filt[2], coeffs_h[2] = { 0 };
38
551k
  __m256i coeffs_v[4] = { 0 };
39
40
551k
  // Tap counts reported by get_filter_tap() for each direction.
  const int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
41
551k
  const int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
42
43
551k
  // Only 2- or 4-tap horizontal filters are handled on this path.
  assert(horiz_tap == 2 || horiz_tap == 4);
44
551k
  assert(vert_tap == 2 || vert_tap == 4 || vert_tap == 6 || vert_tap == 8);
45
46
551k
  // Horizontal coefficients go through the 128-bit (_ssse3) helpers.
  if (horiz_tap == 2)
47
22.8k
    prepare_coeffs_2t_ssse3(filter_params_x, subpel_x_qn, coeffs_h);
48
528k
  else
49
528k
    prepare_coeffs_4t_ssse3(filter_params_x, subpel_x_qn, coeffs_h);
50
51
551k
  // Vertical coefficients go into 256-bit registers, one helper per tap count.
  if (vert_tap == 2)
52
22.8k
    prepare_coeffs_2t(filter_params_y, subpel_y_qn, coeffs_v);
53
528k
  else if (vert_tap == 4)
54
347k
    prepare_coeffs_4t(filter_params_y, subpel_y_qn, coeffs_v);
55
180k
  else if (vert_tap == 6)
56
170k
    prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
57
10.2k
  else
58
10.2k
    prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
59
60
551k
  // h plus (vert_tap - 1) extra rows for the vertical filter.
  int im_h = h + vert_tap - 1;
61
551k
  // Offsets of (taps / 2 - 1) per direction; src_ptr is src moved back by
  // fo_vert rows and fo_horiz columns.
  const int fo_vert = vert_tap / 2 - 1;
62
551k
  const int fo_horiz = horiz_tap / 2 - 1;
63
551k
  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
64
65
551k
  // Aligned 128-bit loads of two tables defined elsewhere; presumably byte
  // shuffle masks for the horizontal pass -- confirm.
  filt[0] = _mm_load_si128((__m128i const *)filt1_global_sse2);
66
551k
  filt[1] = _mm_load_si128((__m128i const *)filt2_global_sse2);
67
68
551k
  // Horizontal stage, selected by tap count (macro bodies are not visible
  // from this file).
  if (horiz_tap == 2) {
69
22.8k
    CONVOLVE_SR_HOR_FILTER_2TAP_W4
70
528k
  } else {
71
528k
    CONVOLVE_SR_HOR_FILTER_4TAP_W4
72
528k
  }
73
74
551k
  // Vertical stage, selected by tap count.
  if (vert_tap == 2) {
75
22.8k
    CONVOLVE_SR_VER_FILTER_2TAP_W4
76
528k
  } else if (vert_tap == 4) {
77
347k
    CONVOLVE_SR_VER_FILTER_4TAP_W4
78
347k
  } else if (vert_tap == 6) {
79
170k
    CONVOLVE_SR_VER_FILTER_6TAP_W4
80
170k
  } else {
81
10.2k
    CONVOLVE_SR_VER_FILTER_8TAP_W4
82
10.2k
  }
83
551k
}
84
85
// General 2D subpel convolution ("sr" variant). The dispatcher
// av1_convolve_2d_sr_avx2() routes here when w > 4 or when either filter
// resolves to 12 taps.
//
// Two independent code paths:
//   * filter_params_x->taps > 8: a 12-tap path with bd fixed at 8 and
//     rounding constants derived from conv_params->round_0 / round_1.
//   * otherwise: a 2/4/6/8-tap path that requires round_0 == 3 and
//     round_1 == 11 (asserted).
//
// Both paths walk the block in steps of 8 columns (j) and delegate the
// filtering to CONVOLVE_SR_* macros that are not defined in this file
// (presumably aom_dsp/x86/convolve_avx2.h). The macros are expanded in place,
// so locals with no other visible use (i, im_stride, im_block, im_h, src_ptr,
// filt, the round_* / sum_* constants, dst_ptr) are presumably referenced by
// name inside the expansions. Do not rename or remove them without checking
// the macro definitions.
//
//   src, src_stride     8-bit source samples and their row stride.
//   dst, dst_stride     8-bit destination buffer and its row stride.
//   w, h                block dimensions; w bounds the column loop and h
//                       feeds the im_h computation.
//   filter_params_x/y   interpolation filter descriptors, one per direction.
//   subpel_x_qn/y_qn    subpel positions forwarded to the tap/coefficient
//                       helpers.
//   conv_params         rounding configuration (round_0, round_1).
static void convolve_2d_sr_avx2(const uint8_t *src, int src_stride,
86
                                uint8_t *dst, int dst_stride, int w, int h,
87
                                const InterpFilterParams *filter_params_x,
88
                                const InterpFilterParams *filter_params_y,
89
                                const int subpel_x_qn, const int subpel_y_qn,
90
704k
                                ConvolveParams *conv_params) {
91
704k
  // NOTE(review): the 12-tap path is selected from the horizontal filter
  // alone, yet vert_tap is hard-coded to 12 below and filter_params_y is fed
  // to prepare_coeffs_12taps(). This assumes both filters are 12-tap together
  // -- confirm that callers guarantee it.
  if (filter_params_x->taps > 8) {
92
0
    // bd (presumably bit depth) is fixed at 8 on this path.
    const int bd = 8;
93
0
    // im_stride is presumably the row pitch of im_block in int16_t units,
    // matching the "* 8" in the buffer size -- confirm in the macros.
    int im_stride = 8, i;
94
0
    DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
95
0
    // Shift left over after both rounding stages have been applied.
    const int bits =
96
0
        FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
97
0
    const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
98
99
0
    assert(conv_params->round_0 > 0);
100
101
0
    // Half of (1 << round_0) plus 1 << (bd + FILTER_BITS - 1), replicated to
    // every 32-bit lane; round_shift_h12 carries round_0 as a shift count.
    const __m256i round_const_h12 = _mm256_set1_epi32(
102
0
        ((1 << (conv_params->round_0)) >> 1) + (1 << (bd + FILTER_BITS - 1)));
103
0
    const __m128i round_shift_h12 = _mm_cvtsi32_si128(conv_params->round_0);
104
105
0
    // (1 << offset_bits) plus half of (1 << round_1); sum_shift_v carries
    // round_1 as a shift count.
    const __m256i sum_round_v = _mm256_set1_epi32(
106
0
        (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
107
0
    const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
108
109
0
    // Half of (1 << bits) minus 1.5 * (1 << (offset_bits - round_1));
    // presumably removes the offsets injected by the two constants above
    // -- confirm in the macros. round_shift_v carries `bits`.
    const __m256i round_const_v = _mm256_set1_epi32(
110
0
        ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
111
0
        ((1 << (offset_bits - conv_params->round_1)) >> 1));
112
0
    const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
113
114
0
    __m256i coeffs_h[6] = { 0 }, coeffs_v[6] = { 0 };
115
116
0
    // Tap counts are fixed here rather than queried via get_filter_tap().
    int horiz_tap = 12;
117
0
    int vert_tap = 12;
118
119
0
    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs_h);
120
0
    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs_v);
121
122
0
    // h plus (vert_tap - 1) extra rows for the vertical filter.
    int im_h = h + vert_tap - 1;
123
0
    // Offsets of (taps / 2 - 1) per direction; src_ptr is src moved back by
    // fo_vert rows and fo_horiz columns.
    const int fo_vert = vert_tap / 2 - 1;
124
0
    const int fo_horiz = horiz_tap / 2 - 1;
125
0
    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
126
127
0
    // Step across the block 8 columns at a time: horizontal macro first,
    // then vertical macro.
    for (int j = 0; j < w; j += 8) {
128
0
      CONVOLVE_SR_HORIZONTAL_FILTER_12TAP
129
0
      CONVOLVE_SR_VERTICAL_FILTER_12TAP
130
0
    }
131
704k
  } else {
132
704k
    // im_stride is presumably the row pitch of im_block in int16_t units,
    // matching the "* 8" in the buffer size -- confirm in the macros.
    int im_stride = 8, i;
133
704k
    DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
134
135
704k
    // This path only supports these two rounding shifts; the rounding
    // constants below are derived from them.
    assert(conv_params->round_0 == 3);
136
704k
    assert(conv_params->round_1 == 11);
137
138
704k
    // 1 << (round_0 - 2), replicated to every 16-bit lane (2 when
    // round_0 == 3).
    const __m256i round_const_h =
139
704k
        _mm256_set1_epi16(1 << (conv_params->round_0 - 2));
140
704k
    // 1 << (round_1 - 1), replicated to every 32-bit lane (1024 when
    // round_1 == 11).
    const __m256i round_const_v =
141
704k
        _mm256_set1_epi32(1 << (conv_params->round_1 - 1));
142
143
704k
    __m256i filt[4], coeffs_h[4] = { 0 }, coeffs_v[4] = { 0 };
144
145
704k
    // Tap counts reported by get_filter_tap() for each direction.
    int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
146
704k
    int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
147
148
704k
    assert(horiz_tap == 2 || horiz_tap == 4 || horiz_tap == 6 ||
149
704k
           horiz_tap == 8);
150
704k
    assert(vert_tap == 2 || vert_tap == 4 || vert_tap == 6 || vert_tap == 8);
151
152
704k
    // Horizontal coefficients use the *_lowbd helpers, one per tap count.
    if (horiz_tap == 2)
153
20.4k
      prepare_coeffs_2t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
154
684k
    else if (horiz_tap == 4)
155
39.7k
      prepare_coeffs_4t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
156
644k
    else if (horiz_tap == 6)
157
595k
      prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
158
49.4k
    else
159
49.4k
      prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
160
161
704k
    // Vertical coefficients, one helper per tap count.
    if (vert_tap == 2)
162
20.4k
      prepare_coeffs_2t(filter_params_y, subpel_y_qn, coeffs_v);
163
684k
    else if (vert_tap == 4)
164
322k
      prepare_coeffs_4t(filter_params_y, subpel_y_qn, coeffs_v);
165
361k
    else if (vert_tap == 6)
166
325k
      prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
167
36.1k
    else
168
36.1k
      prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
169
170
704k
    // h plus (vert_tap - 1) extra rows for the vertical filter.
    int im_h = h + vert_tap - 1;
171
704k
    // Offsets of (taps / 2 - 1) per direction; src_ptr is src moved back by
    // fo_vert rows and fo_horiz columns.
    const int fo_vert = vert_tap / 2 - 1;
172
704k
    const int fo_horiz = horiz_tap / 2 - 1;
173
704k
    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
174
175
704k
    // Aligned 256-bit loads of four tables defined elsewhere; presumably byte
    // shuffle masks for the horizontal pass -- confirm.
    filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
176
704k
    filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
177
704k
    filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
178
704k
    filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
179
180
1.94M
    // Step across the block 8 columns at a time.
    for (int j = 0; j < w; j += 8) {
181
1.24M
      // Horizontal stage, selected by tap count (macro bodies are not
      // visible from this file).
      if (horiz_tap == 2) {
182
39.8k
        CONVOLVE_SR_HORIZONTAL_FILTER_2TAP
183
1.20M
      } else if (horiz_tap == 4) {
184
73.0k
        CONVOLVE_SR_HORIZONTAL_FILTER_4TAP
185
1.13M
      } else if (horiz_tap == 6) {
186
976k
        CONVOLVE_SR_HORIZONTAL_FILTER_6TAP
187
976k
      } else {
188
154k
        CONVOLVE_SR_HORIZONTAL_FILTER_8TAP
189
154k
      }
190
191
1.24M
      // Destination pointer advanced to column j; not used directly here,
      // so presumably read by the vertical macros.
      uint8_t *dst_ptr = dst + j;
192
1.24M
      // Vertical stage, selected by tap count.
      if (vert_tap == 2) {
193
39.8k
        CONVOLVE_SR_VERTICAL_FILTER_2TAP
194
1.20M
      } else if (vert_tap == 4) {
195
436k
        CONVOLVE_SR_VERTICAL_FILTER_4TAP
196
767k
      } else if (vert_tap == 6) {
197
631k
        CONVOLVE_SR_VERTICAL_FILTER_6TAP
198
631k
      } else {
199
136k
        CONVOLVE_SR_VERTICAL_FILTER_8TAP
200
136k
      }
201
1.24M
    }
202
704k
  }
203
704k
}
204
205
// Public AVX2 entry point for the 2D "sr" convolution. It inspects the tap
// counts reported by get_filter_tap() for both directions and forwards all
// arguments unchanged to one of two file-local implementations:
//   * convolve_2d_sr_avx2()    - when w > 4, or either direction is 12-tap;
//   * convolve_2d_sr_w4_avx2() - otherwise (w <= 4, no 12-tap filter).
void av1_convolve_2d_sr_avx2(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int32_t subpel_x_qn,
    const int32_t subpel_y_qn, ConvolveParams *conv_params) {
  const int32_t horiz_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  const int32_t vert_taps = get_filter_tap(filter_params_y, subpel_y_qn);

  // The narrow-block kernel has no 12-tap support, so anything wide or
  // 12-tap takes the general kernel.
  if (w > 4 || horiz_taps == 12 || vert_taps == 12) {
    convolve_2d_sr_avx2(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
    return;
  }

  convolve_2d_sr_w4_avx2(src, src_stride, dst, dst_stride, w, h,
                         filter_params_x, filter_params_y, subpel_x_qn,
                         subpel_y_qn, conv_params);
}