/src/aom/av1/common/x86/convolve_2d_avx2.c

Source (jump to first uncovered line)
/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "config/av1_rtcd.h"

#include "third_party/SVT-AV1/convolve_2d_avx2.h"

#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/synonyms.h"

#include "av1/common/convolve.h"

// General AVX2 path for the 8-bit (bd == 8) 2D single-reference convolution.
//
// The function has two branches selected by filter_params_x->taps:
//   * taps > 8  : both directions use the 12-tap coefficient helper and the
//                 12-tap horizontal/vertical filter macros.
//   * otherwise : the tap count of each direction is queried with
//                 get_filter_tap() and the matching 4/6/8-tap macro is used.
//
// In both branches the block is processed in column strips of 8 pixels
// (the `j` loop). The filtering itself is done by the CONVOLVE_SR_* macros,
// which are defined outside this file.
// NOTE(review): the macros presumably pick up the locals declared here by name
// (im_block, im_stride, im_h, i, j, src_ptr, coeffs_*, filt, round_* and
// sum_* constants, dst, dst_stride, w, h) -- confirm before renaming any of
// them.
//
// Parameters:
//   src, src_stride   source pixels and their row stride
//   dst, dst_stride   destination pixels and their row stride
//   w, h              block width and height
//   filter_params_x/y interpolation filter descriptors for each direction
//   subpel_x_qn/y_qn  sub-pixel positions passed to the coefficient helpers
//   conv_params       supplies round_0 and round_1 (round_0 must be > 0)
void av1_convolve_2d_sr_general_avx2(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w, int h,
                                     const InterpFilterParams *filter_params_x,
                                     const InterpFilterParams *filter_params_y,
                                     const int subpel_x_qn,
                                     const int subpel_y_qn,
                                     ConvolveParams *conv_params) {
  if (filter_params_x->taps > 8) {
    // ---- 12-tap branch ----
    const int bd = 8;
    // `i` is not used in the visible code; presumably the row index used
    // inside the filter macros -- TODO confirm.
    int im_stride = 8, i;
    // 32-byte aligned int16_t scratch buffer, 8 entries per row (im_stride).
    DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
    // Shift that remains after the two rounding stages have been applied.
    const int bits =
        FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
    const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;

    assert(conv_params->round_0 > 0);

    // Horizontal stage: 32-bit rounding constant (half of 1 << round_0 plus
    // an offset of 1 << (bd + FILTER_BITS - 1)) and a shift of round_0.
    const __m256i round_const_h12 = _mm256_set1_epi32(
        ((1 << (conv_params->round_0)) >> 1) + (1 << (bd + FILTER_BITS - 1)));
    const __m128i round_shift_h12 = _mm_cvtsi32_si128(conv_params->round_0);

    // Vertical stage: adds 1 << offset_bits plus half of 1 << round_1, then
    // shifts by round_1.
    const __m256i sum_round_v = _mm256_set1_epi32(
        (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
    const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);

    // Final stage: half of 1 << bits for rounding, minus the two offset terms
    // (1 << (offset_bits - round_1)) and half of that value; shift by `bits`.
    const __m256i round_const_v = _mm256_set1_epi32(
        ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
        ((1 << (offset_bits - conv_params->round_1)) >> 1));
    const __m128i round_shift_v = _mm_cvtsi32_si128(bits);

    __m256i coeffs_h[6] = { 0 }, coeffs_v[6] = { 0 };

    // Both directions are treated as 12-tap in this branch, although only
    // filter_params_x->taps was tested above.
    int horiz_tap = 12;
    int vert_tap = 12;

    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs_h);
    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs_v);

    // Number of intermediate rows: h output rows plus (vert_tap - 1) extra.
    int im_h = h + vert_tap - 1;
    const int fo_vert = vert_tap / 2 - 1;
    const int fo_horiz = horiz_tap / 2 - 1;
    // Step back fo_vert rows and fo_horiz columns from src.
    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

    // One 8-column strip per iteration: horizontal pass, then vertical pass.
    for (int j = 0; j < w; j += 8) {
      CONVOLVE_SR_HORIZONTAL_FILTER_12TAP
      CONVOLVE_SR_VERTICAL_FILTER_12TAP
    }
  } else {
    // ---- 4/6/8-tap branch ----
    const int bd = 8;
    // `i` is not used in the visible code; presumably the row index used
    // inside the filter macros -- TODO confirm.
    int im_stride = 8, i;
    // 32-byte aligned int16_t scratch buffer, 8 entries per row (im_stride).
    DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
    // Shift that remains after the two rounding stages have been applied.
    const int bits =
        FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
    const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;

    assert(conv_params->round_0 > 0);

    // Horizontal stage uses 16-bit lanes and a shift of (round_0 - 1), with
    // the offset exponent likewise one lower than in the 12-tap branch.
    // NOTE(review): presumably the *_lowbd coefficient helpers pre-scale the
    // coefficients by one bit to compensate -- confirm against their source.
    const __m256i round_const_h =
        _mm256_set1_epi16(((1 << (conv_params->round_0 - 1)) >> 1) +
                          (1 << (bd + FILTER_BITS - 2)));
    const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);

    // Vertical stage: adds 1 << offset_bits plus half of 1 << round_1, then
    // shifts by round_1.
    const __m256i sum_round_v = _mm256_set1_epi32(
        (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
    const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);

    // Final stage: half of 1 << bits for rounding, minus the two offset terms
    // (1 << (offset_bits - round_1)) and half of that value; shift by `bits`.
    const __m256i round_const_v = _mm256_set1_epi32(
        ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
        ((1 << (offset_bits - conv_params->round_1)) >> 1));
    const __m128i round_shift_v = _mm_cvtsi32_si128(bits);

    __m256i filt[4], coeffs_h[4], coeffs_v[4];

    // NOTE(review): these two calls write the same arrays that the
    // tap-dependent calls below write again with the same arguments; they
    // look redundant -- confirm whether they can be dropped.
    prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
    prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);

    int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
    int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);

    // 6-tap filters get their own coefficient layout; every other tap count
    // uses the generic helper.
    if (horiz_tap == 6)
      prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
    else
      prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);

    if (vert_tap == 6)
      prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
    else
      prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);

    // Number of intermediate rows: h output rows plus (vert_tap - 1) extra.
    int im_h = h + vert_tap - 1;
    const int fo_vert = vert_tap / 2 - 1;
    const int fo_horiz = horiz_tap / 2 - 1;
    // Step back fo_vert rows and fo_horiz columns from src.
    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

    // Load the four global 256-bit tables (aligned loads).
    // NOTE(review): presumably byte-shuffle masks for the horizontal pass --
    // confirm against the definitions of filtN_global_avx2.
    filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
    filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
    filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
    filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);

    // One 8-column strip per iteration: horizontal pass, then vertical pass,
    // each picking the macro that matches its tap count (8-tap is the
    // fallback for anything other than 4 or 6).
    for (int j = 0; j < w; j += 8) {
      if (horiz_tap == 4) {
        CONVOLVE_SR_HORIZONTAL_FILTER_4TAP
      } else if (horiz_tap == 6) {
        CONVOLVE_SR_HORIZONTAL_FILTER_6TAP
      } else {
        CONVOLVE_SR_HORIZONTAL_FILTER_8TAP
      }

      if (vert_tap == 4) {
        CONVOLVE_SR_VERTICAL_FILTER_4TAP
      } else if (vert_tap == 6) {
        CONVOLVE_SR_VERTICAL_FILTER_6TAP
      } else {
        CONVOLVE_SR_VERTICAL_FILTER_8TAP
      }
    }
  }
}

// Entry point for the AVX2 2D single-reference convolution.
//
// Queries the tap count of the horizontal and vertical filters with
// get_filter_tap() and forwards all arguments unchanged to one of two
// implementations:
//   * av1_convolve_2d_sr_general_avx2()     when either direction reports
//                                           12 taps;
//   * av1_convolve_2d_sr_specialized_avx2() in every other case.
void av1_convolve_2d_sr_avx2(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
    const int32_t subpel_y_q4, ConvolveParams *conv_params) {
  // Both tap counts are always evaluated before the decision is taken.
  const int32_t horiz_taps = get_filter_tap(filter_params_x, subpel_x_q4);
  const int32_t vert_taps = get_filter_tap(filter_params_y, subpel_y_q4);

  // Common case first: neither direction is 12-tap.
  if (horiz_taps != 12 && vert_taps != 12) {
    av1_convolve_2d_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
                                        filter_params_x, filter_params_y,
                                        subpel_x_q4, subpel_y_q4, conv_params);
    return;
  }

  // At least one direction uses a 12-tap filter.
  av1_convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
                                  filter_params_x, filter_params_y,
                                  subpel_x_q4, subpel_y_q4, conv_params);
}

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright (c) 2017, Alliance for Open Media. All rights reserved
3		*
4		* This source code is subject to the terms of the BSD 2 Clause License and
5		* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6		* was not distributed with this source code in the LICENSE file, you can
7		* obtain it at www.aomedia.org/license/software. If the Alliance for Open
8		* Media Patent License 1.0 was not distributed with this source code in the
9		* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10		*/
11
12		#include <immintrin.h>
13
14		#include "config/av1_rtcd.h"
15
16		#include "third_party/SVT-AV1/convolve_2d_avx2.h"
17
18		#include "aom_dsp/x86/convolve_avx2.h"
19		#include "aom_dsp/aom_filter.h"
20		#include "aom_dsp/x86/synonyms.h"
21
22		#include "av1/common/convolve.h"
23
24		void av1_convolve_2d_sr_general_avx2(const uint8_t *src, int src_stride,
25		uint8_t *dst, int dst_stride, int w, int h,
26		const InterpFilterParams *filter_params_x,
27		const InterpFilterParams *filter_params_y,
28		const int subpel_x_qn,
29		const int subpel_y_qn,
30	0	ConvolveParams *conv_params) {
31	0	if (filter_params_x->taps > 8) {
32	0	const int bd = 8;
33	0	int im_stride = 8, i;
34	0	DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
35	0	const int bits =
36	0	FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
37	0	const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
38
39	0	assert(conv_params->round_0 > 0);
40
41	0	const __m256i round_const_h12 = _mm256_set1_epi32(
42	0	((1 << (conv_params->round_0)) >> 1) + (1 << (bd + FILTER_BITS - 1)));
43	0	const __m128i round_shift_h12 = _mm_cvtsi32_si128(conv_params->round_0);
44
45	0	const __m256i sum_round_v = _mm256_set1_epi32(
46	0	(1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
47	0	const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
48
49	0	const __m256i round_const_v = _mm256_set1_epi32(
50	0	((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
51	0	((1 << (offset_bits - conv_params->round_1)) >> 1));
52	0	const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
53
54	0	__m256i coeffs_h[6] = { 0 }, coeffs_v[6] = { 0 };
55
56	0	int horiz_tap = 12;
57	0	int vert_tap = 12;
58
59	0	prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs_h);
60	0	prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs_v);
61
62	0	int im_h = h + vert_tap - 1;
63	0	const int fo_vert = vert_tap / 2 - 1;
64	0	const int fo_horiz = horiz_tap / 2 - 1;
65	0	const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
66
67	0	for (int j = 0; j < w; j += 8) {
68	0	CONVOLVE_SR_HORIZONTAL_FILTER_12TAP
69	0	CONVOLVE_SR_VERTICAL_FILTER_12TAP
70	0	}
71	0	} else {
72	0	const int bd = 8;
73	0	int im_stride = 8, i;
74	0	DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
75	0	const int bits =
76	0	FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
77	0	const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
78
79	0	assert(conv_params->round_0 > 0);
80
81	0	const __m256i round_const_h =
82	0	_mm256_set1_epi16(((1 << (conv_params->round_0 - 1)) >> 1) +
83	0	(1 << (bd + FILTER_BITS - 2)));
84	0	const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
85
86	0	const __m256i sum_round_v = _mm256_set1_epi32(
87	0	(1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
88	0	const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
89
90	0	const __m256i round_const_v = _mm256_set1_epi32(
91	0	((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
92	0	((1 << (offset_bits - conv_params->round_1)) >> 1));
93	0	const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
94
95	0	__m256i filt[4], coeffs_h[4], coeffs_v[4];
96
97	0	prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
98	0	prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
99
100	0	int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
101	0	int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
102
103	0	if (horiz_tap == 6)
104	0	prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
105	0	else
106	0	prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
107
108	0	if (vert_tap == 6)
109	0	prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
110	0	else
111	0	prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
112
113	0	int im_h = h + vert_tap - 1;
114	0	const int fo_vert = vert_tap / 2 - 1;
115	0	const int fo_horiz = horiz_tap / 2 - 1;
116	0	const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
117
118	0	filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
119	0	filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
120	0	filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
121	0	filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
122
123	0	for (int j = 0; j < w; j += 8) {
124	0	if (horiz_tap == 4) {
125	0	CONVOLVE_SR_HORIZONTAL_FILTER_4TAP
126	0	} else if (horiz_tap == 6) {
127	0	CONVOLVE_SR_HORIZONTAL_FILTER_6TAP
128	0	} else {
129	0	CONVOLVE_SR_HORIZONTAL_FILTER_8TAP
130	0	}
131
132	0	if (vert_tap == 4) {
133	0	CONVOLVE_SR_VERTICAL_FILTER_4TAP
134	0	} else if (vert_tap == 6) {
135	0	CONVOLVE_SR_VERTICAL_FILTER_6TAP
136	0	} else {
137	0	CONVOLVE_SR_VERTICAL_FILTER_8TAP
138	0	}
139	0	}
140	0	}
141	0	}
142
143		void av1_convolve_2d_sr_avx2(
144		const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
145		int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
146		const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
147	3.87M	const int32_t subpel_y_q4, ConvolveParams *conv_params) {
148	3.87M	const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_q4);
149	3.87M	const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_q4);
150
151	3.87M	const bool use_general = (tap_x == 12 || tap_y == 12);
152	3.87M	if (use_general) {
153	0	av1_convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
154	0	filter_params_x, filter_params_y,
155	0	subpel_x_q4, subpel_y_q4, conv_params);
156	3.87M	} else {
157	3.87M	av1_convolve_2d_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
158	3.87M	filter_params_x, filter_params_y,
159	3.87M	subpel_x_q4, subpel_y_q4, conv_params);
160	3.87M	}
161	3.87M	}

Coverage Report

Created: 2023-06-07 06:31