/src/aom/av1/common/x86/convolve_2d_avx2.c

Source
/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "config/av1_rtcd.h"

#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/synonyms.h"

#include "av1/common/convolve.h"

// 2D separable sub-pixel convolve (horizontal pass, then vertical pass) for
// narrow blocks. av1_convolve_2d_sr_avx2() routes here only when w <= 4 and
// neither filter has 12 taps.
//
// The filtering loops themselves come from the CONVOLVE_SR_HOR_FILTER_*_W4
// and CONVOLVE_SR_VER_FILTER_*_W4 macros, which are not defined in this file
// (presumably aom_dsp/x86/convolve_avx2.h -- TODO confirm). Several locals
// declared below (i, im_block, dst_ptr, round_const_h, round_const_v, filt,
// im_h, src_ptr) are not referenced anywhere else in this function, so they
// are presumably picked up by name inside those macros: do not rename or
// remove them without checking the macro definitions.
//
//   src, src_stride   source pixels and source row pitch
//   dst, dst_stride   destination pixels and destination row pitch
//   w, h              block width and height (w is not referenced directly
//                     here; presumably used by the macros)
//   filter_params_x,  horizontal filter description and sub-pixel phase;
//   subpel_x_qn       must resolve to a 2- or 4-tap kernel (asserted)
//   filter_params_y,  vertical filter description and sub-pixel phase;
//   subpel_y_qn       must resolve to a 2-, 4-, 6- or 8-tap kernel (asserted)
//   conv_params       round_0 must be 3 and round_1 must be 11 (asserted)
static void convolve_2d_sr_w4_avx2(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int32_t subpel_x_qn,
    const int32_t subpel_y_qn, ConvolveParams *conv_params) {
  int i;
  // Intermediate (horizontally filtered) rows: room for
  // MAX_SB_SIZE + MAX_FILTER_TAP rows of 4 int16_t values each.
  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 4]);
  uint8_t *dst_ptr = dst;
  assert(conv_params->round_0 == 3);
  assert(conv_params->round_1 == 11);

  // With the asserted shifts these evaluate to 2 (16-bit lanes) and 1 << 10
  // (32-bit lanes). They are applied inside the filter macros.
  const __m128i round_const_h = _mm_set1_epi16(1 << (conv_params->round_0 - 2));
  const __m256i round_const_v =
      _mm256_set1_epi32(1 << (conv_params->round_1 - 1));

  __m128i filt[2], coeffs_h[2] = { 0 };
  __m256i coeffs_v[4] = { 0 };

  const int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
  const int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);

  assert(horiz_tap == 2 || horiz_tap == 4);
  assert(vert_tap == 2 || vert_tap == 4 || vert_tap == 6 || vert_tap == 8);

  // Horizontal coefficients use the 128-bit (SSSE3) helpers, vertical ones the
  // 256-bit helpers; each helper matches one tap count.
  if (horiz_tap == 2)
    prepare_coeffs_2t_ssse3(filter_params_x, subpel_x_qn, coeffs_h);
  else
    prepare_coeffs_4t_ssse3(filter_params_x, subpel_x_qn, coeffs_h);

  if (vert_tap == 2)
    prepare_coeffs_2t(filter_params_y, subpel_y_qn, coeffs_v);
  else if (vert_tap == 4)
    prepare_coeffs_4t(filter_params_y, subpel_y_qn, coeffs_v);
  else if (vert_tap == 6)
    prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
  else
    prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);

  // A vert_tap-tap vertical filter needs vert_tap - 1 extra input rows, hence
  // the intermediate height. src_ptr is backed up by fo_vert rows and
  // fo_horiz columns so that it points at the first sample covered by the
  // filter window of the top-left output pixel.
  int im_h = h + vert_tap - 1;
  const int fo_vert = vert_tap / 2 - 1;
  const int fo_horiz = horiz_tap / 2 - 1;
  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

  // Aligned loads of the shared shuffle-mask tables (declared elsewhere).
  filt[0] = _mm_load_si128((__m128i const *)filt1_global_sse2);
  filt[1] = _mm_load_si128((__m128i const *)filt2_global_sse2);

  // Horizontal pass (presumably src_ptr -> im_block; see macro definitions).
  if (horiz_tap == 2) {
    CONVOLVE_SR_HOR_FILTER_2TAP_W4
  } else {
    CONVOLVE_SR_HOR_FILTER_4TAP_W4
  }

  // Vertical pass (presumably im_block -> dst_ptr; see macro definitions).
  if (vert_tap == 2) {
    CONVOLVE_SR_VER_FILTER_2TAP_W4
  } else if (vert_tap == 4) {
    CONVOLVE_SR_VER_FILTER_4TAP_W4
  } else if (vert_tap == 6) {
    CONVOLVE_SR_VER_FILTER_6TAP_W4
  } else {
    CONVOLVE_SR_VER_FILTER_8TAP_W4
  }
}

// General AVX2 2D single-reference convolve. A horizontal pass writes 16-bit
// intermediate rows into im_block_buf; a macro-expanded vertical pass then
// consumes them (presumably writing 8-bit pixels to dst -- the
// CONVOLVE_SR_VERTICAL_FILTER_* macros are not defined in this file, TODO
// confirm against aom_dsp/x86/convolve_avx2.h).
//
// Two independent paths, chosen by filter_params_x->taps only:
//   taps > 8   12-tap kernels in both directions, 32-bit accumulation.
//   otherwise  2/4/6/8-tap kernels, 16-bit horizontal accumulation;
//              requires round_0 == 3 and round_1 == 11 (asserted).
//
// im_block_buf layout (both paths): MAX_SB_SIZE / 8 strips, one per group of
// 8 output columns. Each strip holds MAX_SB_SIZE + MAX_FILTER_TAP rows of
// 8 int16_t (row pitch 8, strip pitch strip_stride).
//
// Locals such as round_const_v, coeffs_v, im_block, dst_ptr, sum_round_v,
// sum_shift_v and round_shift_v are not referenced anywhere else in this
// function, so they are presumably picked up by name inside the vertical
// filter macros: do not rename or remove them without checking the macros.
//
//   src, src_stride   source pixels and source row pitch
//   dst, dst_stride   destination pixels and destination row pitch
//   w, h              block width and height
//   filter_params_x/y, subpel_x_qn/y_qn
//                     horizontal / vertical filter descriptions and phases
//   conv_params       supplies the round_0 / round_1 shift amounts
static void convolve_2d_sr_avx2(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_qn, const int subpel_y_qn,
                                ConvolveParams *conv_params) {
  if (filter_params_x->taps > 8) {
    // ---- 12-tap path -------------------------------------------------------
    const int bd = 8;
    int im_stride = 8, i;
    const int strip_stride = (MAX_SB_SIZE + MAX_FILTER_TAP) * 8;
    DECLARE_ALIGNED(
        32, int16_t,
        im_block_buf[(MAX_SB_SIZE / 8) * (MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
    const int bits =
        FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
    const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;

    assert(conv_params->round_0 > 0);

    // Horizontal stage: half of the round_0 shift for rounding, plus a fixed
    // offset of 1 << (bd + FILTER_BITS - 1).
    const __m256i round_const_h12 = _mm256_set1_epi32(
        ((1 << (conv_params->round_0)) >> 1) + (1 << (bd + FILTER_BITS - 1)));
    const __m128i round_shift_h12 = _mm_cvtsi32_si128(conv_params->round_0);

    // Vertical-stage constants; only referenced by the 12-tap vertical macro.
    const __m256i sum_round_v = _mm256_set1_epi32(
        (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
    const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);

    // Final rounding for the `bits` shift, minus two terms derived from
    // offset_bits - round_1 (presumably cancelling the offsets added by the
    // earlier stages -- TODO confirm against the macro).
    const __m256i round_const_v = _mm256_set1_epi32(
        ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
        ((1 << (offset_bits - conv_params->round_1)) >> 1));
    const __m128i round_shift_v = _mm_cvtsi32_si128(bits);

    __m256i coeffs_h[6] = { 0 }, coeffs_v[6] = { 0 };

    // Tap counts are fixed in this path; the same 12-tap helper is used for
    // both directions regardless of filter_params_y->taps.
    int horiz_tap = 12;
    int vert_tap = 12;

    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs_h);
    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs_v);

    // im_h rows of intermediate data feed the vertical filter. src_ptr is
    // backed up by fo_vert rows and fo_horiz columns (5 each here).
    int im_h = h + vert_tap - 1;
    const int fo_vert = vert_tap / 2 - 1;
    const int fo_horiz = horiz_tap / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

    const __m256i v_zero = _mm256_setzero_si256();
    // Only s[0]..s[5] are written in this function.
    __m256i s[12];
    if (w <= 4) {
      // Narrow blocks: two source rows per iteration, row i in the low
      // 128-bit lane and row i + 1 in the high lane. The loop has no separate
      // tail, so when im_h is odd the last iteration also reads and filters
      // source row im_h and stores intermediate row im_h.
      for (i = 0; i < im_h; i += 2) {
        // With w <= 4 this inner loop runs exactly once (j == 0).
        for (int j = 0; j < w; j += 8) {
          int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
          // 0x20 selects the low 128 bits of each operand: 16 bytes of row i
          // followed by 16 bytes of row i + 1.
          const __m256i data = _mm256_permute2x128_si256(
              _mm256_castsi128_si256(
                  _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
              _mm256_castsi128_si256(_mm_loadu_si128(
                  (__m128i *)(&src_ptr[i * src_stride + src_stride + j]))),
              0x20);
          // Zero-extend the bytes to 16 bits, then duplicate every 16-bit
          // sample so the alignr shifts below can form adjacent-pixel pairs.
          const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
          const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
          const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
          const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);

          const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
          const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);

          // Per 128-bit lane, s[n] holds the four 32-bit pairs
          // (p[k + 2n], p[k + 2n + 1]) for k = 0..3, where p[] are the
          // zero-extended pixels loaded into that lane.
          s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
          s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
          s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
          s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
          s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
          s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);

          const __m256i res_lo = convolve_12taps(s, coeffs_h);

          // Add rounding + offset, arithmetic shift by round_0, then pack to
          // int16 with signed saturation.
          __m256i res_32b_lo = _mm256_sra_epi32(
              _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12);
          __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
          // res_0: results for row i, res_1: results for row i + 1.
          const __m128i res_0 = _mm256_extracti128_si256(res_16b_lo, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_16b_lo, 1);
          if (w > 2) {
            // Store 4 int16 values per row.
            _mm_storel_epi64((__m128i *)&strip_im_block[i * im_stride], res_0);
            _mm_storel_epi64(
                (__m128i *)&strip_im_block[i * im_stride + im_stride], res_1);
          } else {
            // Store only 2 int16 values per row, split out of the low 32 bits.
            uint32_t horiz_2;
            horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_0);
            strip_im_block[i * im_stride] = (uint16_t)horiz_2;
            strip_im_block[i * im_stride + 1] = (uint16_t)(horiz_2 >> 16);
            horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_1);
            strip_im_block[i * im_stride + im_stride] = (uint16_t)horiz_2;
            strip_im_block[i * im_stride + im_stride + 1] =
                (uint16_t)(horiz_2 >> 16);
          }
        }
      }
    } else {
      // Wider blocks: one source row per iteration, 8 output columns per
      // strip. The low lane is loaded from column j and the high lane from
      // column j + 4, so each lane yields 4 of the 8 outputs.
      for (i = 0; i < im_h; i++) {
        for (int j = 0; j < w; j += 8) {
          int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
          const __m256i data = _mm256_permute2x128_si256(
              _mm256_castsi128_si256(
                  _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
              _mm256_castsi128_si256(_mm_loadu_si128(
                  (__m128i *)(&src_ptr[i * src_stride + j + 4]))),
              0x20);
          const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
          const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);

          const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
          const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);

          const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
          const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);

          // Same adjacent-pixel pair layout as in the narrow branch above.
          s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
          s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
          s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
          s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
          s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
          s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);

          const __m256i res_lo = convolve_12taps(s, coeffs_h);

          __m256i res_32b_lo = _mm256_sra_epi32(
              _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12);

          // After the pack each lane holds its 4 results twice; 0x88 gathers
          // 64-bit elements 0 and 2 so the low 128 bits hold the 8
          // consecutive outputs for columns j .. j + 7.
          __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
          _mm_store_si128((__m128i *)&strip_im_block[i * im_stride],
                          _mm256_extracti128_si256(
                              _mm256_permute4x64_epi64(res_16b_lo, 0x88), 0));
        }
      }
    }

    // Vertical pass, one 8-column strip at a time.
    for (int j = 0; j < w; j += 8) {
      const int16_t *im_block = &im_block_buf[(j / 8) * strip_stride];
      CONVOLVE_SR_VERTICAL_FILTER_12TAP
    }
  } else {
    // ---- 2/4/6/8-tap path --------------------------------------------------
    int im_stride = 8, i;
    const int strip_stride = (MAX_SB_SIZE + MAX_FILTER_TAP) * 8;
    DECLARE_ALIGNED(
        32, int16_t,
        im_block_buf[(MAX_SB_SIZE / 8) * (MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);

    assert(conv_params->round_0 == 3);
    assert(conv_params->round_1 == 11);

    // With the asserted shifts: 2 per 16-bit lane and 1 << 10 per 32-bit lane.
    const __m256i round_const_h =
        _mm256_set1_epi16(1 << (conv_params->round_0 - 2));
    const __m256i round_const_v =
        _mm256_set1_epi32(1 << (conv_params->round_1 - 1));

    __m256i filt[4], coeffs_h[4] = { 0 }, coeffs_v[4] = { 0 };

    int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
    int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);

    assert(horiz_tap == 2 || horiz_tap == 4 || horiz_tap == 6 ||
           horiz_tap == 8);
    assert(vert_tap == 2 || vert_tap == 4 || vert_tap == 6 || vert_tap == 8);

    // Pick the coefficient-preparation helper matching each tap count.
    if (horiz_tap == 2)
      prepare_coeffs_2t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
    else if (horiz_tap == 4)
      prepare_coeffs_4t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
    else if (horiz_tap == 6)
      prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
    else
      prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);

    if (vert_tap == 2)
      prepare_coeffs_2t(filter_params_y, subpel_y_qn, coeffs_v);
    else if (vert_tap == 4)
      prepare_coeffs_4t(filter_params_y, subpel_y_qn, coeffs_v);
    else if (vert_tap == 6)
      prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
    else
      prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);

    // vert_tap is even (asserted), so im_h is odd whenever h is even; the row
    // loop below relies on that to leave exactly one row for its tail block.
    int im_h = h + vert_tap - 1;
    const int fo_vert = vert_tap / 2 - 1;
    const int fo_horiz = horiz_tap / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

    // Aligned loads of the shared shuffle-mask tables (declared elsewhere).
    filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
    filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
    filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
    filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);

    // Zero sub-pixel offset in both directions: plain copy from src (not
    // src_ptr) to dst, 8 bytes per 8-column step, then return. Note this
    // check runs after the coefficient and mask setup above.
    if (subpel_x_qn == 0 && subpel_y_qn == 0) {
      for (i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 8) {
          _mm_storel_epi64(
              (__m128i *)&dst[i * dst_stride + j],
              _mm_loadl_epi64((const __m128i *)&src[i * src_stride + j]));
        }
      }
      return;
    }

    // Horizontal pass, two source rows per iteration: row i in the low
    // 128-bit lane, row i + 1 in the high lane.
    for (i = 0; i < (im_h - 1); i += 2) {
      const uint8_t *src_row0 = &src_ptr[i * src_stride];
      const uint8_t *src_row1 = &src_ptr[(i + 1) * src_stride];
      for (int j = 0; j < w; j += 8) {
        int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
        __m256i data =
            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src_row0[j]));
        data = _mm256_inserti128_si256(
            data, _mm_loadu_si128((__m128i *)&src_row1[j]), 1);

        __m256i res;
        if (horiz_tap == 2)
          res = convolve_lowbd_x_2tap(data, coeffs_h, filt);
        else if (horiz_tap == 4)
          res = convolve_lowbd_x_4tap(data, coeffs_h, filt);
        else if (horiz_tap == 6)
          res = convolve_lowbd_x_6tap(data, coeffs_h, filt);
        else
          res = convolve_lowbd_x(data, coeffs_h, filt);

        // Rounding shift by 2 (one less than round_0; presumably the *_lowbd
        // helpers pre-scale the coefficients -- TODO confirm). The 32-byte
        // store writes intermediate rows i and i + 1 of this strip at once,
        // since the row pitch is 8 int16_t.
        res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2);
        _mm256_store_si256((__m256i *)&strip_im_block[i * 8], res);
      }
    }
    // Tail: filter the single remaining row. `i` deliberately carries over
    // from the loop above (im_h - 1 when im_h is odd); only the low 128 bits
    // of the result are stored.
    {
      const uint8_t *src_row0 = &src_ptr[i * src_stride];
      for (int j = 0; j < w; j += 8) {
        int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
        __m256i data_1 =
            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src_row0[j]));
        __m256i res;
        if (horiz_tap == 2)
          res = convolve_lowbd_x_2tap(data_1, coeffs_h, filt);
        else if (horiz_tap == 4)
          res = convolve_lowbd_x_4tap(data_1, coeffs_h, filt);
        else if (horiz_tap == 6)
          res = convolve_lowbd_x_6tap(data_1, coeffs_h, filt);
        else
          res = convolve_lowbd_x(data_1, coeffs_h, filt);

        res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2);
        _mm_store_si128((__m128i *)&strip_im_block[i * 8],
                        _mm256_castsi256_si128(res));
      }
    }

    // Vertical pass, one 8-column strip at a time; dst_ptr points at the
    // strip's first destination column.
    for (int j = 0; j < w; j += 8) {
      const int16_t *im_block = &im_block_buf[(j / 8) * strip_stride];
      uint8_t *dst_ptr = dst + j;
      if (vert_tap == 2) {
        CONVOLVE_SR_VERTICAL_FILTER_2TAP
      } else if (vert_tap == 4) {
        CONVOLVE_SR_VERTICAL_FILTER_4TAP
      } else if (vert_tap == 6) {
        CONVOLVE_SR_VERTICAL_FILTER_6TAP
      } else {
        CONVOLVE_SR_VERTICAL_FILTER_8TAP
      }
    }
  }
}

// AVX2 entry point for the 2D single-reference convolve.
//
// Looks up the effective tap count of the horizontal and vertical filters and
// forwards all arguments, unchanged, to one of the two static kernels in this
// file:
//   - convolve_2d_sr_w4_avx2() for blocks at most 4 pixels wide, provided
//     neither direction resolves to a 12-tap filter;
//   - convolve_2d_sr_avx2() for everything else.
void av1_convolve_2d_sr_avx2(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int32_t subpel_x_qn,
    const int32_t subpel_y_qn, ConvolveParams *conv_params) {
  const int32_t x_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  const int32_t y_taps = get_filter_tap(filter_params_y, subpel_y_qn);

  // The narrow kernel is eligible only for w <= 4 with no 12-tap filter.
  const bool narrow_kernel_ok = (w <= 4) && (x_taps != 12) && (y_taps != 12);

  if (!narrow_kernel_ok) {
    convolve_2d_sr_avx2(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
    return;
  }

  convolve_2d_sr_w4_avx2(src, src_stride, dst, dst_stride, w, h,
                         filter_params_x, filter_params_y, subpel_x_qn,
                         subpel_y_qn, conv_params);
}

Coverage Report

Created: 2026-05-16 06:27

Line	Count	Source
1		/*
2		* Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3		*
4		* This source code is subject to the terms of the BSD 2 Clause License and
5		* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6		* was not distributed with this source code in the LICENSE file, you can
7		* obtain it at www.aomedia.org/license/software. If the Alliance for Open
8		* Media Patent License 1.0 was not distributed with this source code in the
9		* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10		*/
11
12		#include <immintrin.h>
13
14		#include "config/av1_rtcd.h"
15
16		#include "aom_dsp/x86/convolve_avx2.h"
17		#include "aom_dsp/aom_filter.h"
18		#include "aom_dsp/x86/synonyms.h"
19
20		#include "av1/common/convolve.h"
21
22		static void convolve_2d_sr_w4_avx2(
23		const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
24		int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
25		const InterpFilterParams *filter_params_y, const int32_t subpel_x_qn,
26	546k	const int32_t subpel_y_qn, ConvolveParams *conv_params) {
27	546k	int i;
28	546k	DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 4]);
29	546k	uint8_t *dst_ptr = dst;
30	546k	assert(conv_params->round_0 == 3);
31	546k	assert(conv_params->round_1 == 11);
32
33	546k	const __m128i round_const_h = _mm_set1_epi16(1 << (conv_params->round_0 - 2));
34	546k	const __m256i round_const_v =
35	546k	_mm256_set1_epi32(1 << (conv_params->round_1 - 1));
36
37	546k	__m128i filt[2], coeffs_h[2] = { 0 };
38	546k	__m256i coeffs_v[4] = { 0 };
39
40	546k	const int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
41	546k	const int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
42
43	546k	assert(horiz_tap == 2 || horiz_tap == 4);
44	546k	assert(vert_tap == 2 || vert_tap == 4 || vert_tap == 6 || vert_tap == 8);
45
46	546k	if (horiz_tap == 2)
47	19.1k	prepare_coeffs_2t_ssse3(filter_params_x, subpel_x_qn, coeffs_h);
48	527k	else
49	527k	prepare_coeffs_4t_ssse3(filter_params_x, subpel_x_qn, coeffs_h);
50
51	546k	if (vert_tap == 2)
52	19.1k	prepare_coeffs_2t(filter_params_y, subpel_y_qn, coeffs_v);
53	527k	else if (vert_tap == 4)
54	346k	prepare_coeffs_4t(filter_params_y, subpel_y_qn, coeffs_v);
55	180k	else if (vert_tap == 6)
56	168k	prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
57	11.6k	else
58	11.6k	prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
59
60	546k	int im_h = h + vert_tap - 1;
61	546k	const int fo_vert = vert_tap / 2 - 1;
62	546k	const int fo_horiz = horiz_tap / 2 - 1;
63	546k	const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
64
65	546k	filt[0] = _mm_load_si128((__m128i const *)filt1_global_sse2);
66	546k	filt[1] = _mm_load_si128((__m128i const *)filt2_global_sse2);
67
68	546k	if (horiz_tap == 2) {
69	19.1k	CONVOLVE_SR_HOR_FILTER_2TAP_W4
70	527k	} else {
71	527k	CONVOLVE_SR_HOR_FILTER_4TAP_W4
72	527k	}
73
74	546k	if (vert_tap == 2) {
75	19.1k	CONVOLVE_SR_VER_FILTER_2TAP_W4
76	527k	} else if (vert_tap == 4) {
77	346k	CONVOLVE_SR_VER_FILTER_4TAP_W4
78	346k	} else if (vert_tap == 6) {
79	168k	CONVOLVE_SR_VER_FILTER_6TAP_W4
80	168k	} else {
81	11.7k	CONVOLVE_SR_VER_FILTER_8TAP_W4
82	11.7k	}
83	546k	}
84
85		static void convolve_2d_sr_avx2(const uint8_t *src, int src_stride,
86		uint8_t *dst, int dst_stride, int w, int h,
87		const InterpFilterParams *filter_params_x,
88		const InterpFilterParams *filter_params_y,
89		const int subpel_x_qn, const int subpel_y_qn,
90	742k	ConvolveParams *conv_params) {
91	742k	if (filter_params_x->taps > 8) {
92	0	const int bd = 8;
93	0	int im_stride = 8, i;
94	0	const int strip_stride = (MAX_SB_SIZE + MAX_FILTER_TAP) * 8;
95	0	DECLARE_ALIGNED(
96	0	32, int16_t,
97	0	im_block_buf[(MAX_SB_SIZE / 8) * (MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
98	0	const int bits =
99	0	FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
100	0	const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
101
102	0	assert(conv_params->round_0 > 0);
103
104	0	const __m256i round_const_h12 = _mm256_set1_epi32(
105	0	((1 << (conv_params->round_0)) >> 1) + (1 << (bd + FILTER_BITS - 1)));
106	0	const __m128i round_shift_h12 = _mm_cvtsi32_si128(conv_params->round_0);
107
108	0	const __m256i sum_round_v = _mm256_set1_epi32(
109	0	(1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
110	0	const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
111
112	0	const __m256i round_const_v = _mm256_set1_epi32(
113	0	((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
114	0	((1 << (offset_bits - conv_params->round_1)) >> 1));
115	0	const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
116
117	0	__m256i coeffs_h[6] = { 0 }, coeffs_v[6] = { 0 };
118
119	0	int horiz_tap = 12;
120	0	int vert_tap = 12;
121
122	0	prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs_h);
123	0	prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs_v);
124
125	0	int im_h = h + vert_tap - 1;
126	0	const int fo_vert = vert_tap / 2 - 1;
127	0	const int fo_horiz = horiz_tap / 2 - 1;
128	0	const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
129
130	0	const __m256i v_zero = _mm256_setzero_si256();
131	0	__m256i s[12];
132	0	if (w <= 4) {
133	0	for (i = 0; i < im_h; i += 2) {
134	0	for (int j = 0; j < w; j += 8) {
135	0	int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
136	0	const __m256i data = _mm256_permute2x128_si256(
137	0	_mm256_castsi128_si256(
138	0	_mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
139	0	_mm256_castsi128_si256(_mm_loadu_si128(
140	0	(__m128i *)(&src_ptr[i * src_stride + src_stride + j]))),
141	0	0x20);
142	0	const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
143	0	const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
144	0	const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
145	0	const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
146
147	0	const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
148	0	const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
149
150	0	s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
151	0	s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
152	0	s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
153	0	s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
154	0	s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
155	0	s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
156
157	0	const __m256i res_lo = convolve_12taps(s, coeffs_h);
158
159	0	__m256i res_32b_lo = _mm256_sra_epi32(
160	0	_mm256_add_epi32(res_lo, round_const_h12), round_shift_h12);
161	0	__m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
162	0	const __m128i res_0 = _mm256_extracti128_si256(res_16b_lo, 0);
163	0	const __m128i res_1 = _mm256_extracti128_si256(res_16b_lo, 1);
164	0	if (w > 2) {
165	0	_mm_storel_epi64((__m128i *)&strip_im_block[i * im_stride], res_0);
166	0	_mm_storel_epi64(
167	0	(__m128i *)&strip_im_block[i * im_stride + im_stride], res_1);
168	0	} else {
169	0	uint32_t horiz_2;
170	0	horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_0);
171	0	strip_im_block[i * im_stride] = (uint16_t)horiz_2;
172	0	strip_im_block[i * im_stride + 1] = (uint16_t)(horiz_2 >> 16);
173	0	horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_1);
174	0	strip_im_block[i * im_stride + im_stride] = (uint16_t)horiz_2;
175	0	strip_im_block[i * im_stride + im_stride + 1] =
176	0	(uint16_t)(horiz_2 >> 16);
177	0	}
178	0	}
179	0	}
180	0	} else {
181	0	for (i = 0; i < im_h; i++) {
182	0	for (int j = 0; j < w; j += 8) {
183	0	int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
184	0	const __m256i data = _mm256_permute2x128_si256(
185	0	_mm256_castsi128_si256(
186	0	_mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
187	0	_mm256_castsi128_si256(_mm_loadu_si128(
188	0	(__m128i *)(&src_ptr[i * src_stride + j + 4]))),
189	0	0x20);
190	0	const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
191	0	const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
192
193	0	const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
194	0	const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
195
196	0	const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
197	0	const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
198
199	0	s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
200	0	s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
201	0	s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
202	0	s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
203	0	s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
204	0	s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
205
206	0	const __m256i res_lo = convolve_12taps(s, coeffs_h);
207
208	0	__m256i res_32b_lo = _mm256_sra_epi32(
209	0	_mm256_add_epi32(res_lo, round_const_h12), round_shift_h12);
210
211	0	__m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
212	0	_mm_store_si128((__m128i *)&strip_im_block[i * im_stride],
213	0	_mm256_extracti128_si256(
214	0	_mm256_permute4x64_epi64(res_16b_lo, 0x88), 0));
215	0	}
216	0	}
217	0	}
218
219	0	for (int j = 0; j < w; j += 8) {
220	0	const int16_t *im_block = &im_block_buf[(j / 8) * strip_stride];
221	0	CONVOLVE_SR_VERTICAL_FILTER_12TAP
222	0	}
223	742k	} else {
224	742k	int im_stride = 8, i;
225	742k	const int strip_stride = (MAX_SB_SIZE + MAX_FILTER_TAP) * 8;
226	742k	DECLARE_ALIGNED(
227	742k	32, int16_t,
228	742k	im_block_buf[(MAX_SB_SIZE / 8) * (MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
229
230	742k	assert(conv_params->round_0 == 3);
231	742k	assert(conv_params->round_1 == 11);
232
233	742k	const __m256i round_const_h =
234	742k	_mm256_set1_epi16(1 << (conv_params->round_0 - 2));
235	742k	const __m256i round_const_v =
236	742k	_mm256_set1_epi32(1 << (conv_params->round_1 - 1));
237
238	742k	__m256i filt[4], coeffs_h[4] = { 0 }, coeffs_v[4] = { 0 };
239
240	742k	int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
241	742k	int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
242
243	742k	assert(horiz_tap == 2 || horiz_tap == 4 || horiz_tap == 6 ||
244	742k	horiz_tap == 8);
245	742k	assert(vert_tap == 2 || vert_tap == 4 || vert_tap == 6 || vert_tap == 8);
246
247	742k	if (horiz_tap == 2)
248	17.5k	prepare_coeffs_2t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
249	724k	else if (horiz_tap == 4)
250	34.9k	prepare_coeffs_4t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
251	689k	else if (horiz_tap == 6)
252	632k	prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
253	57.5k	else
254	57.5k	prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
255
256	742k	if (vert_tap == 2)
257	17.5k	prepare_coeffs_2t(filter_params_y, subpel_y_qn, coeffs_v);
258	724k	else if (vert_tap == 4)
259	346k	prepare_coeffs_4t(filter_params_y, subpel_y_qn, coeffs_v);
260	378k	else if (vert_tap == 6)
261	339k	prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
262	38.8k	else
263	38.8k	prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
264
265	742k	int im_h = h + vert_tap - 1;
266	742k	const int fo_vert = vert_tap / 2 - 1;
267	742k	const int fo_horiz = horiz_tap / 2 - 1;
268	742k	const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
269
270	742k	filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
271	742k	filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
272	742k	filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
273	742k	filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
274
275	742k	if (subpel_x_qn == 0 && subpel_y_qn == 0) {
276	0	for (i = 0; i < h; ++i) {
277	0	for (int j = 0; j < w; j += 8) {
278	0	_mm_storel_epi64(
279	0	(__m128i *)&dst[i * dst_stride + j],
280	0	_mm_loadl_epi64((const __m128i *)&src[i * src_stride + j]));
281	0	}
282	0	}
283	0	return;
284	0	}
285
286	5.63M	for (i = 0; i < (im_h - 1); i += 2) {
287	4.89M	const uint8_t *src_row0 = &src_ptr[i * src_stride];
288	4.89M	const uint8_t *src_row1 = &src_ptr[(i + 1) * src_stride];
289	18.2M	for (int j = 0; j < w; j += 8) {
290	13.4M	int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
291	13.4M	__m256i data =
292	13.4M	_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src_row0[j]));
293	13.4M	data = _mm256_inserti128_si256(
294	13.4M	data, _mm_loadu_si128((__m128i *)&src_row1[j]), 1);
295
296	13.4M	__m256i res;
297	13.4M	if (horiz_tap == 2)
298	402k	res = convolve_lowbd_x_2tap(data, coeffs_h, filt);
299	12.9M	else if (horiz_tap == 4)
300	797k	res = convolve_lowbd_x_4tap(data, coeffs_h, filt);
301	12.2M	else if (horiz_tap == 6)
302	9.91M	res = convolve_lowbd_x_6tap(data, coeffs_h, filt);
303	2.28M	else
304	2.28M	res = convolve_lowbd_x(data, coeffs_h, filt);
305
306	13.4M	res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2);
307	13.4M	_mm256_store_si256((__m256i *)&strip_im_block[i * 8], res);
308	13.4M	}
309	4.89M	}
310	742k	{
311	742k	const uint8_t *src_row0 = &src_ptr[i * src_stride];
312	2.07M	for (int j = 0; j < w; j += 8) {
313	1.32M	int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
314	1.32M	__m256i data_1 =
315	1.32M	_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src_row0[j]));
316	1.32M	__m256i res;
317	1.32M	if (horiz_tap == 2)
318	33.4k	res = convolve_lowbd_x_2tap(data_1, coeffs_h, filt);
319	1.29M	else if (horiz_tap == 4)
320	64.0k	res = convolve_lowbd_x_4tap(data_1, coeffs_h, filt);
321	1.23M	else if (horiz_tap == 6)
322	1.06M	res = convolve_lowbd_x_6tap(data_1, coeffs_h, filt);
323	170k	else
324	170k	res = convolve_lowbd_x(data_1, coeffs_h, filt);
325
326	1.32M	res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2);
327	1.32M	_mm_store_si128((__m128i *)&strip_im_block[i * 8],
328	1.32M	_mm256_castsi256_si128(res));
329	1.32M	}
330	742k	}
331
332	2.07M	for (int j = 0; j < w; j += 8) {
333	1.32M	const int16_t *im_block = &im_block_buf[(j / 8) * strip_stride];
334	1.32M	uint8_t *dst_ptr = dst + j;
335	1.32M	if (vert_tap == 2) {
336	33.4k	CONVOLVE_SR_VERTICAL_FILTER_2TAP
337	1.29M	} else if (vert_tap == 4) {
338	469k	CONVOLVE_SR_VERTICAL_FILTER_4TAP
339	825k	} else if (vert_tap == 6) {
340	679k	CONVOLVE_SR_VERTICAL_FILTER_6TAP
341	679k	} else {
342	146k	CONVOLVE_SR_VERTICAL_FILTER_8TAP
343	146k	}
344	1.32M	}
345	742k	}
346	742k	}
347
348		void av1_convolve_2d_sr_avx2(
349		const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
350		int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
351		const InterpFilterParams *filter_params_y, const int32_t subpel_x_qn,
352	1.28M	const int32_t subpel_y_qn, ConvolveParams *conv_params) {
353	1.28M	const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_qn);
354	1.28M	const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_qn);
355
356	1.28M	const bool use_12tap = (tap_x == 12 || tap_y == 12);
357	1.28M	if (w <= 4 && !use_12tap) {
358	546k	convolve_2d_sr_w4_avx2(src, src_stride, dst, dst_stride, w, h,
359	546k	filter_params_x, filter_params_y, subpel_x_qn,
360	546k	subpel_y_qn, conv_params);
361	742k	} else {
362	742k	convolve_2d_sr_avx2(src, src_stride, dst, dst_stride, w, h, filter_params_x,
363	742k	filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
364	742k	}
365	1.28M	}