Coverage Report

Created: 2024-06-18 06:48

/src/aom/av1/common/x86/convolve_2d_avx2.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <immintrin.h>
13
14
#include "config/av1_rtcd.h"
15
16
#include "third_party/SVT-AV1/convolve_2d_avx2.h"
17
18
#include "aom_dsp/x86/convolve_avx2.h"
19
#include "aom_dsp/aom_filter.h"
20
#include "aom_dsp/x86/synonyms.h"
21
22
#include "av1/common/convolve.h"
23
24
/*
 * General AVX2 implementation of the 8-bit 2-D convolution.
 *
 * The block is walked in strips of 8 columns (j advances by 8). For each
 * strip a horizontal-filter macro is expanded, followed by a vertical-filter
 * macro; im_block is the int16_t scratch buffer declared for them.
 *
 * Two paths exist:
 *   - filter_params_x->taps > 8: both directions use the 12-tap macros.
 *   - otherwise: the tap count of each direction is queried with
 *     get_filter_tap() and the 4-, 6- or 8-tap macro is selected (anything
 *     that is not 4 or 6 uses the 8-tap macro).
 *
 * The CONVOLVE_SR_* macros are defined elsewhere and pick up the locals of
 * this function by name, so the names of the locals below must not change.
 *
 * src, src_stride   source pixels and their row stride
 * dst, dst_stride   destination pixels and their row stride
 * w, h              block width and height
 * filter_params_x   horizontal interpolation filter
 * filter_params_y   vertical interpolation filter
 * subpel_x_qn       horizontal sub-pixel position, used to select the kernel
 * subpel_y_qn       vertical sub-pixel position, used to select the kernel
 * conv_params       supplies the round_0 / round_1 shift amounts
 */
static void convolve_2d_sr_general_avx2(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  /* State shared by both the 12-tap and the <= 8-tap path. */
  const int bd = 8;
  int im_stride = 8, i;
  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);

  const int round0 = conv_params->round_0;
  const int round1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round0 - round1;
  const int offset_bits = bd + 2 * FILTER_BITS - round0;

  assert(conv_params->round_0 > 0);

  /* Rounding constants and shift counts for the vertical stage. */
  const int post_round1_offset = 1 << (offset_bits - round1);
  const __m256i sum_round_v =
      _mm256_set1_epi32((1 << offset_bits) + ((1 << round1) >> 1));
  const __m128i sum_shift_v = _mm_cvtsi32_si128(round1);
  const __m256i round_const_v = _mm256_set1_epi32(
      ((1 << bits) >> 1) - post_round1_offset - (post_round1_offset >> 1));
  const __m128i round_shift_v = _mm_cvtsi32_si128(bits);

  if (filter_params_x->taps > 8) {
    /* 12-tap path: the horizontal stage rounds with 32-bit constants. */
    const __m256i round_const_h12 = _mm256_set1_epi32(
        ((1 << round0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
    const __m128i round_shift_h12 = _mm_cvtsi32_si128(round0);

    __m256i coeffs_h[6] = { 0 }, coeffs_v[6] = { 0 };

    int horiz_tap = 12;
    int vert_tap = 12;

    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs_h);
    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs_v);

    /* The intermediate block needs vert_tap - 1 extra rows. */
    int im_h = h + vert_tap - 1;
    const int fo_vert = vert_tap / 2 - 1;
    const int fo_horiz = horiz_tap / 2 - 1;
    /* Step back to the first sample covered by the filter window. */
    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

    for (int j = 0; j < w; j += 8) {
      CONVOLVE_SR_HORIZONTAL_FILTER_12TAP
      CONVOLVE_SR_VERTICAL_FILTER_12TAP
    }
    return;
  }

  /*
   * <= 8-tap path: the horizontal stage uses 16-bit rounding constants and
   * shifts by round_0 - 1.
   */
  const __m256i round_const_h = _mm256_set1_epi16(
      ((1 << (round0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
  const __m128i round_shift_h = _mm_cvtsi32_si128(round0 - 1);

  __m256i filt[4], coeffs_h[4], coeffs_v[4];

  filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
  filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);

  /*
   * NOTE(review): these two calls look redundant with the tap-specific calls
   * further down; they are kept so every coeffs_* entry is written exactly as
   * before. Confirm against the filter macros before removing them.
   */
  prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);

  int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
  int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);

  if (horiz_tap == 6) {
    prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
  } else {
    prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
  }

  if (vert_tap == 6) {
    prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
  } else {
    prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
  }

  /* The intermediate block needs vert_tap - 1 extra rows. */
  int im_h = h + vert_tap - 1;
  const int fo_vert = vert_tap / 2 - 1;
  const int fo_horiz = horiz_tap / 2 - 1;
  /* Step back to the first sample covered by the filter window. */
  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

  for (int j = 0; j < w; j += 8) {
    if (horiz_tap == 4) {
      CONVOLVE_SR_HORIZONTAL_FILTER_4TAP
    } else if (horiz_tap == 6) {
      CONVOLVE_SR_HORIZONTAL_FILTER_6TAP
    } else {
      CONVOLVE_SR_HORIZONTAL_FILTER_8TAP
    }

    if (vert_tap == 4) {
      CONVOLVE_SR_VERTICAL_FILTER_4TAP
    } else if (vert_tap == 6) {
      CONVOLVE_SR_VERTICAL_FILTER_6TAP
    } else {
      CONVOLVE_SR_VERTICAL_FILTER_8TAP
    }
  }
}
140
141
/*
 * AVX2 entry point for the 2-D convolution.
 *
 * Queries the tap count of the horizontal and vertical filters with
 * get_filter_tap() and dispatches:
 *   - neither direction reports 12 taps ->
 *       av1_convolve_2d_sr_specialized_avx2()
 *   - either direction reports 12 taps  ->
 *       convolve_2d_sr_general_avx2()
 *
 * All arguments are forwarded unchanged to the selected routine.
 *
 * NOTE(review): convolve_2d_sr_general_avx2() chooses its 12-tap branch from
 * filter_params_x->taps alone, while this dispatch also sends tap_y == 12
 * with a shorter horizontal filter there. Confirm that combination cannot
 * occur.
 */
void av1_convolve_2d_sr_avx2(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
    const int32_t subpel_y_q4, ConvolveParams *conv_params) {
  const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_q4);
  const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_q4);

  /* Common case: no 12-tap filter involved. */
  if (tap_x != 12 && tap_y != 12) {
    av1_convolve_2d_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
                                        filter_params_x, filter_params_y,
                                        subpel_x_q4, subpel_y_q4, conv_params);
    return;
  }

  convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, filter_params_y, subpel_x_q4,
                              subpel_y_q4, conv_params);
}