/src/aom/av1/common/x86/convolve_2d_avx2.c

Source
/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>
#include <stdint.h>

#include "config/av1_rtcd.h"

#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/synonyms.h"

#include "av1/common/convolve.h"

// 2-D single-reference convolution kernel for narrow blocks.
//
// av1_convolve_2d_sr_avx2() selects this function when w <= 4 and neither
// filter reports 12 taps. The body prepares filter coefficients, rounding
// constants and a source pointer, then expands one CONVOLVE_SR_HOR_FILTER_*
// macro followed by one CONVOLVE_SR_VER_FILTER_* macro; those macros are
// defined outside this file.
//
// NOTE(review): the macros presumably run the horizontal pass into im_block
// and the vertical pass from im_block to dst, referring to the locals below
// (i, im_block, dst_ptr, im_h, src_ptr, filt, coeffs_*, round_const_*) by
// name - confirm against the macro definitions before renaming or removing
// any local that looks unused in this function.
//
// Parameters:
//   src, src_stride    source samples and the distance between source rows.
//   dst, dst_stride    destination buffer and its row distance (only
//                      forwarded here; presumably consumed by the macros).
//   w, h               block width and height; h determines how many
//                      intermediate rows (im_h) are needed.
//   filter_params_x/y  horizontal / vertical interpolation filter sets.
//   subpel_x_qn/y_qn   sub-pixel positions used to pick the filter kernels.
//   conv_params        rounding configuration; round_0 must be 3 and round_1
//                      must be 11 (checked by assert only).
static void convolve_2d_sr_w4_avx2(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int32_t subpel_x_qn,
    const int32_t subpel_y_qn, ConvolveParams *conv_params) {
  int i;
  // 32-byte aligned scratch buffer of 16-bit values, four per row.
  // NOTE(review): presumably holds the horizontally filtered rows - confirm.
  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 4]);
  uint8_t *dst_ptr = dst;
  assert(conv_params->round_0 == 3);
  assert(conv_params->round_1 == 11);

  // Rounding terms: 1 << (round_0 - 2) in every 16-bit lane (2 when
  // round_0 == 3) and 1 << (round_1 - 1) in every 32-bit lane (half of the
  // round_1 shift step).
  const __m128i round_const_h = _mm_set1_epi16(1 << (conv_params->round_0 - 2));
  const __m256i round_const_v =
      _mm256_set1_epi32(1 << (conv_params->round_1 - 1));

  // Only coeffs_h is zero-initialised by this declaration; filt[] is filled
  // in further down, before the filter macros run.
  __m128i filt[2], coeffs_h[2] = { 0 };
  __m256i coeffs_v[4] = { 0 };

  const int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
  const int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);

  // This kernel only supports 2- or 4-tap horizontal filters and 2/4/6/8-tap
  // vertical filters. The checks vanish in builds that compile asserts out.
  assert(horiz_tap == 2 || horiz_tap == 4);
  assert(vert_tap == 2 || vert_tap == 4 || vert_tap == 6 || vert_tap == 8);

  // Horizontal coefficients go into 128-bit registers.
  if (horiz_tap == 2)
    prepare_coeffs_2t_ssse3(filter_params_x, subpel_x_qn, coeffs_h);
  else
    prepare_coeffs_4t_ssse3(filter_params_x, subpel_x_qn, coeffs_h);

  // Vertical coefficients go into 256-bit registers; the final else covers
  // the 8-tap case.
  if (vert_tap == 2)
    prepare_coeffs_2t(filter_params_y, subpel_y_qn, coeffs_v);
  else if (vert_tap == 4)
    prepare_coeffs_4t(filter_params_y, subpel_y_qn, coeffs_v);
  else if (vert_tap == 6)
    prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
  else
    prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);

  // A vert_tap-tap vertical filter needs vert_tap - 1 rows in addition to the
  // h output rows.
  int im_h = h + vert_tap - 1;
  // Number of samples the filter window extends before the current position
  // in each direction.
  const int fo_vert = vert_tap / 2 - 1;
  const int fo_horiz = horiz_tap / 2 - 1;
  // Move the read pointer up fo_vert rows and left fo_horiz columns so the
  // filter window starts at its first tap.
  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

  // Aligned 128-bit loads of two global tables.
  // NOTE(review): presumably byte-shuffle masks used by the horizontal
  // macros - confirm.
  filt[0] = _mm_load_si128((__m128i const *)filt1_global_sse2);
  filt[1] = _mm_load_si128((__m128i const *)filt2_global_sse2);

  // Horizontal stage (macro body defined elsewhere).
  if (horiz_tap == 2) {
    CONVOLVE_SR_HOR_FILTER_2TAP_W4
  } else {
    CONVOLVE_SR_HOR_FILTER_4TAP_W4
  }

  // Vertical stage (macro body defined elsewhere).
  if (vert_tap == 2) {
    CONVOLVE_SR_VER_FILTER_2TAP_W4
  } else if (vert_tap == 4) {
    CONVOLVE_SR_VER_FILTER_4TAP_W4
  } else if (vert_tap == 6) {
    CONVOLVE_SR_VER_FILTER_6TAP_W4
  } else {
    CONVOLVE_SR_VER_FILTER_8TAP_W4
  }
}

// General 2-D single-reference convolution kernel.
//
// av1_convolve_2d_sr_avx2() selects this function when w > 4 or when either
// filter reports 12 taps. Two independent code paths exist:
//   * filter_params_x->taps > 8: a 12-tap path whose horizontal stage is
//     written out with intrinsics below and whose vertical stage is the
//     CONVOLVE_SR_VERTICAL_FILTER_12TAP macro.
//   * otherwise: a 2/4/6/8-tap path using the convolve_lowbd_x* helpers for
//     the horizontal stage and a CONVOLVE_SR_VERTICAL_FILTER_*TAP macro for
//     the vertical stage.
// Both paths write horizontally filtered 16-bit rows into im_block_buf, which
// is organised as strips of 8 columns: strip k starts at
// im_block_buf[k * strip_stride] and its rows are im_stride (8) values apart.
//
// NOTE(review): the vertical macros are defined outside this file and
// presumably refer to locals (i, h, im_block, im_stride, dst, dst_ptr,
// dst_stride, j, coeffs_v, round_const_v, s, sum_round_v, ...) by name -
// confirm before renaming or deleting any local that looks unused here.
// NOTE(review): the branch selector looks only at filter_params_x->taps while
// the public dispatcher tests both tap counts; if only the vertical filter
// were 12-tap, the second path would run with vert_tap == 12 and trip its
// assert. Presumably both filters are always 12-tap together - confirm.
//
// Parameters:
//   src, src_stride    source samples and the distance between source rows.
//   dst, dst_stride    destination samples and the distance between rows.
//   w, h               block width and height.
//   filter_params_x/y  horizontal / vertical interpolation filter sets.
//   subpel_x_qn/y_qn   sub-pixel positions used to pick the filter kernels.
//   conv_params        rounding configuration (round_0, round_1).
static void convolve_2d_sr_avx2(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_qn, const int subpel_y_qn,
                                ConvolveParams *conv_params) {
  if (filter_params_x->taps > 8) {
    // ---- 12-tap path -------------------------------------------------------
    const int bd = 8;
    int im_stride = 8, i;
    const int strip_stride = (MAX_SB_SIZE + MAX_FILTER_TAP) * 8;
    DECLARE_ALIGNED(
        32, int16_t,
        im_block_buf[(MAX_SB_SIZE / 8) * (MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
    // Shift that remains after the round_0 and round_1 shifts have been
    // applied to the 2 * FILTER_BITS of filter gain.
    const int bits =
        FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
    const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;

    assert(conv_params->round_0 > 0);

    // Horizontal stage: half of the round_0 step plus an offset of
    // 1 << (bd + FILTER_BITS - 1), added before the round_0 shift.
    const __m256i round_const_h12 = _mm256_set1_epi32(
        ((1 << (conv_params->round_0)) >> 1) + (1 << (bd + FILTER_BITS - 1)));
    const __m128i round_shift_h12 = _mm_cvtsi32_si128(conv_params->round_0);

    // Constants for the vertical stage; they are not used by any statement in
    // this function, so they are presumably consumed by
    // CONVOLVE_SR_VERTICAL_FILTER_12TAP - confirm.
    const __m256i sum_round_v = _mm256_set1_epi32(
        (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
    const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);

    const __m256i round_const_v = _mm256_set1_epi32(
        ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
        ((1 << (offset_bits - conv_params->round_1)) >> 1));
    const __m128i round_shift_v = _mm_cvtsi32_si128(bits);

    __m256i coeffs_h[6] = { 0 }, coeffs_v[6] = { 0 };

    // Tap counts are fixed on this path rather than queried.
    int horiz_tap = 12;
    int vert_tap = 12;

    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs_h);
    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs_v);

    // h output rows need vert_tap - 1 additional intermediate rows.
    int im_h = h + vert_tap - 1;
    const int fo_vert = vert_tap / 2 - 1;
    const int fo_horiz = horiz_tap / 2 - 1;
    // Step back to the first tap of the filter window in both directions.
    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

    const __m256i v_zero = _mm256_setzero_si256();
    // Only s[0..5] are written in this function; the remaining entries are
    // presumably used by the vertical macro - confirm.
    __m256i s[12];
    if (w <= 4) {
      // Narrow block: filter two source rows per iteration, row i in the low
      // 128-bit lane and row i + 1 in the high lane.
      // NOTE(review): when im_h is odd the last iteration also reads source
      // row im_h and writes intermediate row im_h, one row more than the
      // im_h rows computed above call for.
      for (i = 0; i < im_h; i += 2) {
        for (int j = 0; j < w; j += 8) {
          int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
          // permute2x128 with 0x20: low lane <- first operand's low 128 bits,
          // high lane <- second operand's low 128 bits.
          const __m256i data = _mm256_permute2x128_si256(
              _mm256_castsi128_si256(
                  _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
              _mm256_castsi128_si256(_mm_loadu_si128(
                  (__m128i *)(&src_ptr[i * src_stride + src_stride + j]))),
              0x20);
          // Zero-extend the 16 bytes of each lane to 16-bit samples.
          const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
          const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
          // Duplicate every 16-bit sample into two adjacent positions.
          const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
          const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);

          const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
          const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);

          // Byte shifts of 2 and 10 over the duplicated samples give, in the
          // k-th 32-bit element of s[m], the sample pair (k + 2m, k + 2m + 1)
          // for k = 0..3 in each lane: six pairs covering 12 consecutive
          // samples for each of four output positions.
          s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
          s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
          s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
          s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
          s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
          s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);

          const __m256i res_lo = convolve_12taps(s, coeffs_h);

          // Add the rounding/offset constant, then arithmetic shift by
          // round_0.
          __m256i res_32b_lo = _mm256_sra_epi32(
              _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12);
          // Saturating pack to 16 bits; the low 64 bits of each lane then
          // hold that row's four results.
          __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
          const __m128i res_0 = _mm256_extracti128_si256(res_16b_lo, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_16b_lo, 1);
          if (w > 2) {
            // Store four 16-bit results for row i and for row i + 1.
            _mm_storel_epi64((__m128i *)&strip_im_block[i * im_stride], res_0);
            _mm_storel_epi64(
                (__m128i *)&strip_im_block[i * im_stride + im_stride], res_1);
          } else {
            // w <= 2: store only the first two 16-bit results of each row,
            // taken from the low 32 bits of the packed vector.
            uint32_t horiz_2;
            horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_0);
            strip_im_block[i * im_stride] = (uint16_t)horiz_2;
            strip_im_block[i * im_stride + 1] = (uint16_t)(horiz_2 >> 16);
            horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_1);
            strip_im_block[i * im_stride + im_stride] = (uint16_t)horiz_2;
            strip_im_block[i * im_stride + im_stride + 1] =
                (uint16_t)(horiz_2 >> 16);
          }
        }
      }
    } else {
      // Wider block: one source row per iteration, eight outputs per strip.
      for (i = 0; i < im_h; i++) {
        for (int j = 0; j < w; j += 8) {
          int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
          // Same row in both lanes: the low lane starts at column j and the
          // high lane at column j + 4, so the lanes produce outputs
          // j..j+3 and j+4..j+7 respectively.
          const __m256i data = _mm256_permute2x128_si256(
              _mm256_castsi128_si256(
                  _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
              _mm256_castsi128_si256(_mm_loadu_si128(
                  (__m128i *)(&src_ptr[i * src_stride + j + 4]))),
              0x20);
          // Zero-extend to 16 bits, then duplicate each sample (same
          // preparation as in the w <= 4 branch).
          const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
          const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);

          const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
          const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);

          const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
          const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);

          // s[m] element k holds the sample pair (k + 2m, k + 2m + 1).
          s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
          s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
          s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
          s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
          s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
          s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);

          const __m256i res_lo = convolve_12taps(s, coeffs_h);

          __m256i res_32b_lo = _mm256_sra_epi32(
              _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12);

          __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
          // permute4x64 with 0x88 moves 64-bit elements 0 and 2 (the four
          // results of each lane) into the low 128 bits, giving the eight
          // outputs of this row in order for a single aligned store.
          _mm_store_si128((__m128i *)&strip_im_block[i * im_stride],
                          _mm256_extracti128_si256(
                              _mm256_permute4x64_epi64(res_16b_lo, 0x88), 0));
        }
      }
    }

    // Vertical stage, one 8-column strip at a time (macro defined elsewhere).
    for (int j = 0; j < w; j += 8) {
      const int16_t *im_block = &im_block_buf[(j / 8) * strip_stride];
      CONVOLVE_SR_VERTICAL_FILTER_12TAP
    }
  } else {
    // ---- 2/4/6/8-tap path --------------------------------------------------
    int im_stride = 8, i;
    const int strip_stride = (MAX_SB_SIZE + MAX_FILTER_TAP) * 8;
    DECLARE_ALIGNED(
        32, int16_t,
        im_block_buf[(MAX_SB_SIZE / 8) * (MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);

    // This path is written for these exact rounding settings (assert only).
    assert(conv_params->round_0 == 3);
    assert(conv_params->round_1 == 11);

    // 1 << (round_0 - 2) per 16-bit lane (2 when round_0 == 3) and
    // 1 << (round_1 - 1) per 32-bit lane.
    const __m256i round_const_h =
        _mm256_set1_epi16(1 << (conv_params->round_0 - 2));
    const __m256i round_const_v =
        _mm256_set1_epi32(1 << (conv_params->round_1 - 1));

    // filt[] is filled in further down; only the coefficient arrays are
    // zero-initialised here.
    __m256i filt[4], coeffs_h[4] = { 0 }, coeffs_v[4] = { 0 };

    int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
    int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);

    assert(horiz_tap == 2 || horiz_tap == 4 || horiz_tap == 6 ||
           horiz_tap == 8);
    assert(vert_tap == 2 || vert_tap == 4 || vert_tap == 6 || vert_tap == 8);

    // Pick the coefficient layout matching each tap count; the final else of
    // each chain covers the 8-tap case.
    if (horiz_tap == 2)
      prepare_coeffs_2t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
    else if (horiz_tap == 4)
      prepare_coeffs_4t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
    else if (horiz_tap == 6)
      prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
    else
      prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);

    if (vert_tap == 2)
      prepare_coeffs_2t(filter_params_y, subpel_y_qn, coeffs_v);
    else if (vert_tap == 4)
      prepare_coeffs_4t(filter_params_y, subpel_y_qn, coeffs_v);
    else if (vert_tap == 6)
      prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
    else
      prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);

    // h output rows need vert_tap - 1 additional intermediate rows.
    int im_h = h + vert_tap - 1;
    const int fo_vert = vert_tap / 2 - 1;
    const int fo_horiz = horiz_tap / 2 - 1;
    // Step back to the first tap of the filter window in both directions.
    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

    // Aligned 256-bit loads of four global tables.
    // NOTE(review): presumably byte-shuffle masks for the convolve_lowbd_x*
    // helpers - confirm.
    filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
    filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
    filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
    filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);

    // Integer-position case: no filtering, copy src to dst 8 bytes at a time.
    // Every step writes a full 8 bytes, also when fewer than 8 columns of w
    // remain. This exit is reached only after the setup above has run.
    if (subpel_x_qn == 0 && subpel_y_qn == 0) {
      for (i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 8) {
          _mm_storel_epi64(
              (__m128i *)&dst[i * dst_stride + j],
              _mm_loadl_epi64((const __m128i *)&src[i * src_stride + j]));
        }
      }
      return;
    }

    // Horizontal stage, two rows per iteration: row i goes into the low
    // 128-bit lane and row i + 1 into the high lane.
    for (i = 0; i < (im_h - 1); i += 2) {
      const uint8_t *src_row0 = &src_ptr[i * src_stride];
      const uint8_t *src_row1 = &src_ptr[(i + 1) * src_stride];
      for (int j = 0; j < w; j += 8) {
        int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
        __m256i data =
            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src_row0[j]));
        data = _mm256_inserti128_si256(
            data, _mm_loadu_si128((__m128i *)&src_row1[j]), 1);

        __m256i res;
        if (horiz_tap == 2)
          res = convolve_lowbd_x_2tap(data, coeffs_h, filt);
        else if (horiz_tap == 4)
          res = convolve_lowbd_x_4tap(data, coeffs_h, filt);
        else if (horiz_tap == 6)
          res = convolve_lowbd_x_6tap(data, coeffs_h, filt);
        else
          res = convolve_lowbd_x(data, coeffs_h, filt);

        // Add the rounding constant and arithmetic-shift right by 2.
        // NOTE(review): the shift is 2 rather than round_0 (3); presumably
        // the coefficients are pre-scaled by one bit in the prepare_*_lowbd
        // helpers - confirm.
        res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2);
        // One aligned 256-bit store holds 16 values, i.e. intermediate rows
        // i and i + 1 of this strip (rows are 8 values apart).
        _mm256_store_si256((__m256i *)&strip_im_block[i * 8], res);
      }
    }
    // Remaining single row. i keeps the value it had when the pair loop
    // ended, i.e. the first even index >= im_h - 1; only the low 128 bits
    // (eight results) are kept.
    {
      const uint8_t *src_row0 = &src_ptr[i * src_stride];
      for (int j = 0; j < w; j += 8) {
        int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
        __m256i data_1 =
            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src_row0[j]));
        __m256i res;
        if (horiz_tap == 2)
          res = convolve_lowbd_x_2tap(data_1, coeffs_h, filt);
        else if (horiz_tap == 4)
          res = convolve_lowbd_x_4tap(data_1, coeffs_h, filt);
        else if (horiz_tap == 6)
          res = convolve_lowbd_x_6tap(data_1, coeffs_h, filt);
        else
          res = convolve_lowbd_x(data_1, coeffs_h, filt);

        res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2);
        _mm_store_si128((__m128i *)&strip_im_block[i * 8],
                        _mm256_castsi256_si128(res));
      }
    }

    // Vertical stage, one 8-column strip at a time; dst_ptr points at the
    // strip's first output column. Macro bodies are defined elsewhere.
    for (int j = 0; j < w; j += 8) {
      const int16_t *im_block = &im_block_buf[(j / 8) * strip_stride];
      uint8_t *dst_ptr = dst + j;
      if (vert_tap == 2) {
        CONVOLVE_SR_VERTICAL_FILTER_2TAP
      } else if (vert_tap == 4) {
        CONVOLVE_SR_VERTICAL_FILTER_4TAP
      } else if (vert_tap == 6) {
        CONVOLVE_SR_VERTICAL_FILTER_6TAP
      } else {
        CONVOLVE_SR_VERTICAL_FILTER_8TAP
      }
    }
  }
}

// Public AVX2 entry point for the 2-D single-reference convolution.
//
// Forwards every argument unchanged to one of the two file-local kernels:
//   * convolve_2d_sr_avx2()    when w > 4 or either filter reports 12 taps;
//   * convolve_2d_sr_w4_avx2() otherwise (narrow block, no 12-tap filter).
void av1_convolve_2d_sr_avx2(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int32_t subpel_x_qn,
    const int32_t subpel_y_qn, ConvolveParams *conv_params) {
  const int32_t horiz_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  const int32_t vert_taps = get_filter_tap(filter_params_y, subpel_y_qn);

  // The narrow kernel asserts on tap counts above 8, so any 12-tap filter
  // must take the general kernel even for small widths.
  if (w > 4 || horiz_taps == 12 || vert_taps == 12) {
    convolve_2d_sr_avx2(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
    return;
  }

  convolve_2d_sr_w4_avx2(src, src_stride, dst, dst_stride, w, h,
                         filter_params_x, filter_params_y, subpel_x_qn,
                         subpel_y_qn, conv_params);
}

Coverage Report

Created: 2026-06-30 06:53

Line	Count	Source
1		/*
2		* Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3		*
4		* This source code is subject to the terms of the BSD 2 Clause License and
5		* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6		* was not distributed with this source code in the LICENSE file, you can
7		* obtain it at www.aomedia.org/license/software. If the Alliance for Open
8		* Media Patent License 1.0 was not distributed with this source code in the
9		* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10		*/
11
12		#include <immintrin.h>
13		#include <stdint.h>
14
15		#include "config/av1_rtcd.h"
16
17		#include "aom_dsp/x86/convolve_avx2.h"
18		#include "aom_dsp/aom_filter.h"
19		#include "aom_dsp/x86/synonyms.h"
20
21		#include "av1/common/convolve.h"
22
23		static void convolve_2d_sr_w4_avx2(
24		const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
25		int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
26		const InterpFilterParams *filter_params_y, const int32_t subpel_x_qn,
27	584k	const int32_t subpel_y_qn, ConvolveParams *conv_params) {
28	584k	int i;
29	584k	DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 4]);
30	584k	uint8_t *dst_ptr = dst;
31	584k	assert(conv_params->round_0 == 3);
32	584k	assert(conv_params->round_1 == 11);
33
34	584k	const __m128i round_const_h = _mm_set1_epi16(1 << (conv_params->round_0 - 2));
35	584k	const __m256i round_const_v =
36	584k	_mm256_set1_epi32(1 << (conv_params->round_1 - 1));
37
38	584k	__m128i filt[2], coeffs_h[2] = { 0 };
39	584k	__m256i coeffs_v[4] = { 0 };
40
41	584k	const int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
42	584k	const int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
43
44	584k	assert(horiz_tap == 2 || horiz_tap == 4);
45	584k	assert(vert_tap == 2 || vert_tap == 4 || vert_tap == 6 || vert_tap == 8);
46
47	584k	if (horiz_tap == 2)
48	15.6k	prepare_coeffs_2t_ssse3(filter_params_x, subpel_x_qn, coeffs_h);
49	568k	else
50	568k	prepare_coeffs_4t_ssse3(filter_params_x, subpel_x_qn, coeffs_h);
51
52	584k	if (vert_tap == 2)
53	15.6k	prepare_coeffs_2t(filter_params_y, subpel_y_qn, coeffs_v);
54	568k	else if (vert_tap == 4)
55	377k	prepare_coeffs_4t(filter_params_y, subpel_y_qn, coeffs_v);
56	191k	else if (vert_tap == 6)
57	181k	prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
58	9.94k	else
59	9.94k	prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
60
61	584k	int im_h = h + vert_tap - 1;
62	584k	const int fo_vert = vert_tap / 2 - 1;
63	584k	const int fo_horiz = horiz_tap / 2 - 1;
64	584k	const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
65
66	584k	filt[0] = _mm_load_si128((__m128i const *)filt1_global_sse2);
67	584k	filt[1] = _mm_load_si128((__m128i const *)filt2_global_sse2);
68
69	584k	if (horiz_tap == 2) {
70	15.6k	CONVOLVE_SR_HOR_FILTER_2TAP_W4
71	568k	} else {
72	568k	CONVOLVE_SR_HOR_FILTER_4TAP_W4
73	568k	}
74
75	584k	if (vert_tap == 2) {
76	15.6k	CONVOLVE_SR_VER_FILTER_2TAP_W4
77	568k	} else if (vert_tap == 4) {
78	377k	CONVOLVE_SR_VER_FILTER_4TAP_W4
79	377k	} else if (vert_tap == 6) {
80	181k	CONVOLVE_SR_VER_FILTER_6TAP_W4
81	181k	} else {
82	9.94k	CONVOLVE_SR_VER_FILTER_8TAP_W4
83	9.94k	}
84	584k	}
85
86		static void convolve_2d_sr_avx2(const uint8_t *src, int src_stride,
87		uint8_t *dst, int dst_stride, int w, int h,
88		const InterpFilterParams *filter_params_x,
89		const InterpFilterParams *filter_params_y,
90		const int subpel_x_qn, const int subpel_y_qn,
91	831k	ConvolveParams *conv_params) {
92	831k	if (filter_params_x->taps > 8) {
93	0	const int bd = 8;
94	0	int im_stride = 8, i;
95	0	const int strip_stride = (MAX_SB_SIZE + MAX_FILTER_TAP) * 8;
96	0	DECLARE_ALIGNED(
97	0	32, int16_t,
98	0	im_block_buf[(MAX_SB_SIZE / 8) * (MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
99	0	const int bits =
100	0	FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
101	0	const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
102
103	0	assert(conv_params->round_0 > 0);
104
105	0	const __m256i round_const_h12 = _mm256_set1_epi32(
106	0	((1 << (conv_params->round_0)) >> 1) + (1 << (bd + FILTER_BITS - 1)));
107	0	const __m128i round_shift_h12 = _mm_cvtsi32_si128(conv_params->round_0);
108
109	0	const __m256i sum_round_v = _mm256_set1_epi32(
110	0	(1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
111	0	const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
112
113	0	const __m256i round_const_v = _mm256_set1_epi32(
114	0	((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
115	0	((1 << (offset_bits - conv_params->round_1)) >> 1));
116	0	const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
117
118	0	__m256i coeffs_h[6] = { 0 }, coeffs_v[6] = { 0 };
119
120	0	int horiz_tap = 12;
121	0	int vert_tap = 12;
122
123	0	prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs_h);
124	0	prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs_v);
125
126	0	int im_h = h + vert_tap - 1;
127	0	const int fo_vert = vert_tap / 2 - 1;
128	0	const int fo_horiz = horiz_tap / 2 - 1;
129	0	const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
130
131	0	const __m256i v_zero = _mm256_setzero_si256();
132	0	__m256i s[12];
133	0	if (w <= 4) {
134	0	for (i = 0; i < im_h; i += 2) {
135	0	for (int j = 0; j < w; j += 8) {
136	0	int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
137	0	const __m256i data = _mm256_permute2x128_si256(
138	0	_mm256_castsi128_si256(
139	0	_mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
140	0	_mm256_castsi128_si256(_mm_loadu_si128(
141	0	(__m128i *)(&src_ptr[i * src_stride + src_stride + j]))),
142	0	0x20);
143	0	const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
144	0	const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
145	0	const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
146	0	const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
147
148	0	const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
149	0	const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
150
151	0	s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
152	0	s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
153	0	s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
154	0	s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
155	0	s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
156	0	s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
157
158	0	const __m256i res_lo = convolve_12taps(s, coeffs_h);
159
160	0	__m256i res_32b_lo = _mm256_sra_epi32(
161	0	_mm256_add_epi32(res_lo, round_const_h12), round_shift_h12);
162	0	__m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
163	0	const __m128i res_0 = _mm256_extracti128_si256(res_16b_lo, 0);
164	0	const __m128i res_1 = _mm256_extracti128_si256(res_16b_lo, 1);
165	0	if (w > 2) {
166	0	_mm_storel_epi64((__m128i *)&strip_im_block[i * im_stride], res_0);
167	0	_mm_storel_epi64(
168	0	(__m128i *)&strip_im_block[i * im_stride + im_stride], res_1);
169	0	} else {
170	0	uint32_t horiz_2;
171	0	horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_0);
172	0	strip_im_block[i * im_stride] = (uint16_t)horiz_2;
173	0	strip_im_block[i * im_stride + 1] = (uint16_t)(horiz_2 >> 16);
174	0	horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_1);
175	0	strip_im_block[i * im_stride + im_stride] = (uint16_t)horiz_2;
176	0	strip_im_block[i * im_stride + im_stride + 1] =
177	0	(uint16_t)(horiz_2 >> 16);
178	0	}
179	0	}
180	0	}
181	0	} else {
182	0	for (i = 0; i < im_h; i++) {
183	0	for (int j = 0; j < w; j += 8) {
184	0	int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
185	0	const __m256i data = _mm256_permute2x128_si256(
186	0	_mm256_castsi128_si256(
187	0	_mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
188	0	_mm256_castsi128_si256(_mm_loadu_si128(
189	0	(__m128i *)(&src_ptr[i * src_stride + j + 4]))),
190	0	0x20);
191	0	const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
192	0	const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
193
194	0	const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
195	0	const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
196
197	0	const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
198	0	const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
199
200	0	s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
201	0	s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
202	0	s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
203	0	s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
204	0	s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
205	0	s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
206
207	0	const __m256i res_lo = convolve_12taps(s, coeffs_h);
208
209	0	__m256i res_32b_lo = _mm256_sra_epi32(
210	0	_mm256_add_epi32(res_lo, round_const_h12), round_shift_h12);
211
212	0	__m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
213	0	_mm_store_si128((__m128i *)&strip_im_block[i * im_stride],
214	0	_mm256_extracti128_si256(
215	0	_mm256_permute4x64_epi64(res_16b_lo, 0x88), 0));
216	0	}
217	0	}
218	0	}
219
220	0	for (int j = 0; j < w; j += 8) {
221	0	const int16_t *im_block = &im_block_buf[(j / 8) * strip_stride];
222	0	CONVOLVE_SR_VERTICAL_FILTER_12TAP
223	0	}
224	831k	} else {
225	831k	int im_stride = 8, i;
226	831k	const int strip_stride = (MAX_SB_SIZE + MAX_FILTER_TAP) * 8;
227	831k	DECLARE_ALIGNED(
228	831k	32, int16_t,
229	831k	im_block_buf[(MAX_SB_SIZE / 8) * (MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
230
231	831k	assert(conv_params->round_0 == 3);
232	831k	assert(conv_params->round_1 == 11);
233
234	831k	const __m256i round_const_h =
235	831k	_mm256_set1_epi16(1 << (conv_params->round_0 - 2));
236	831k	const __m256i round_const_v =
237	831k	_mm256_set1_epi32(1 << (conv_params->round_1 - 1));
238
239	831k	__m256i filt[4], coeffs_h[4] = { 0 }, coeffs_v[4] = { 0 };
240
241	831k	int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
242	831k	int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
243
244	831k	assert(horiz_tap == 2 || horiz_tap == 4 || horiz_tap == 6 ||
245	831k	horiz_tap == 8);
246	831k	assert(vert_tap == 2 || vert_tap == 4 || vert_tap == 6 || vert_tap == 8);
247
248	831k	if (horiz_tap == 2)
249	14.7k	prepare_coeffs_2t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
250	816k	else if (horiz_tap == 4)
251	38.0k	prepare_coeffs_4t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
252	778k	else if (horiz_tap == 6)
253	730k	prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
254	47.6k	else
255	47.6k	prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
256
257	831k	if (vert_tap == 2)
258	14.7k	prepare_coeffs_2t(filter_params_y, subpel_y_qn, coeffs_v);
259	816k	else if (vert_tap == 4)
260	402k	prepare_coeffs_4t(filter_params_y, subpel_y_qn, coeffs_v);
261	414k	else if (vert_tap == 6)
262	382k	prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
263	31.7k	else
264	31.7k	prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
265
266	831k	int im_h = h + vert_tap - 1;
267	831k	const int fo_vert = vert_tap / 2 - 1;
268	831k	const int fo_horiz = horiz_tap / 2 - 1;
269	831k	const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
270
271	831k	filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
272	831k	filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
273	831k	filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
274	831k	filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
275
276	831k	if (subpel_x_qn == 0 && subpel_y_qn == 0) {
277	0	for (i = 0; i < h; ++i) {
278	0	for (int j = 0; j < w; j += 8) {
279	0	_mm_storel_epi64(
280	0	(__m128i *)&dst[i * dst_stride + j],
281	0	_mm_loadl_epi64((const __m128i *)&src[i * src_stride + j]));
282	0	}
283	0	}
284	0	return;
285	0	}
286
287	6.15M	for (i = 0; i < (im_h - 1); i += 2) {
288	5.32M	const uint8_t *src_row0 = &src_ptr[i * src_stride];
289	5.32M	const uint8_t *src_row1 = &src_ptr[(i + 1) * src_stride];
290	19.5M	for (int j = 0; j < w; j += 8) {
291	14.2M	int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
292	14.2M	__m256i data =
293	14.2M	_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src_row0[j]));
294	14.2M	data = _mm256_inserti128_si256(
295	14.2M	data, _mm_loadu_si128((__m128i *)&src_row1[j]), 1);
296
297	14.2M	__m256i res;
298	14.2M	if (horiz_tap == 2)
299	403k	res = convolve_lowbd_x_2tap(data, coeffs_h, filt);
300	13.8M	else if (horiz_tap == 4)
301	973k	res = convolve_lowbd_x_4tap(data, coeffs_h, filt);
302	12.8M	else if (horiz_tap == 6)
303	11.0M	res = convolve_lowbd_x_6tap(data, coeffs_h, filt);
304	1.82M	else
305	1.82M	res = convolve_lowbd_x(data, coeffs_h, filt);
306
307	14.2M	res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2);
308	14.2M	_mm256_store_si256((__m256i *)&strip_im_block[i * 8], res);
309	14.2M	}
310	5.32M	}
311	831k	{
312	831k	const uint8_t *src_row0 = &src_ptr[i * src_stride];
313	2.29M	for (int j = 0; j < w; j += 8) {
314	1.46M	int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
315	1.46M	__m256i data_1 =
316	1.46M	_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src_row0[j]));
317	1.46M	__m256i res;
318	1.46M	if (horiz_tap == 2)
319	30.6k	res = convolve_lowbd_x_2tap(data_1, coeffs_h, filt);
320	1.43M	else if (horiz_tap == 4)
321	72.2k	res = convolve_lowbd_x_4tap(data_1, coeffs_h, filt);
322	1.35M	else if (horiz_tap == 6)
323	1.22M	res = convolve_lowbd_x_6tap(data_1, coeffs_h, filt);
324	136k	else
325	136k	res = convolve_lowbd_x(data_1, coeffs_h, filt);
326
327	1.46M	res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2);
328	1.46M	_mm_store_si128((__m128i *)&strip_im_block[i * 8],
329	1.46M	_mm256_castsi256_si128(res));
330	1.46M	}
331	831k	}
332
333	2.29M	for (int j = 0; j < w; j += 8) {
334	1.45M	const int16_t *im_block = &im_block_buf[(j / 8) * strip_stride];
335	1.45M	uint8_t *dst_ptr = dst + j;
336	1.45M	if (vert_tap == 2) {
337	30.5k	CONVOLVE_SR_VERTICAL_FILTER_2TAP
338	1.42M	} else if (vert_tap == 4) {
339	546k	CONVOLVE_SR_VERTICAL_FILTER_4TAP
340	882k	} else if (vert_tap == 6) {
341	766k	CONVOLVE_SR_VERTICAL_FILTER_6TAP
342	766k	} else {
343	115k	CONVOLVE_SR_VERTICAL_FILTER_8TAP
344	115k	}
345	1.45M	}
346	831k	}
347	831k	}
348
349		void av1_convolve_2d_sr_avx2(
350		const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
351		int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
352		const InterpFilterParams *filter_params_y, const int32_t subpel_x_qn,
353	1.41M	const int32_t subpel_y_qn, ConvolveParams *conv_params) {
354	1.41M	const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_qn);
355	1.41M	const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_qn);
356
357	1.41M	const bool use_12tap = (tap_x == 12 || tap_y == 12);
358	1.41M	if (w <= 4 && !use_12tap) {
359	584k	convolve_2d_sr_w4_avx2(src, src_stride, dst, dst_stride, w, h,
360	584k	filter_params_x, filter_params_y, subpel_x_qn,
361	584k	subpel_y_qn, conv_params);
362	831k	} else {
363	831k	convolve_2d_sr_avx2(src, src_stride, dst, dst_stride, w, h, filter_params_x,
364	831k	filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
365	831k	}
366	1.41M	}