/src/aom/aom_dsp/x86/highbd_convolve_sse2.c

Source (jump to first uncovered line)
/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <emmintrin.h>

#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/x86/convolve.h"

// -----------------------------------------------------------------------------

// Vertical 4-tap filter over a 4-sample-wide column of 16-bit pixels.
//
// Only the taps filter[2]..filter[5] are used. Output row r is computed from
// the source rows r + 2 .. r + 5 (counted from src_ptr): the weighted sum is
// rounded by adding 64, shifted right by 7 and clamped to [0, (1 << bd) - 1].
// Rows are emitted in pairs, so height / 2 pairs are written and a trailing
// odd row is left untouched. src_pitch and dst_pitch are in uint16_t units.
static void aom_highbd_filter_block1d4_v4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i round = _mm_set1_epi32(64);
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);

  // Broadcast the tap pairs (2,3) and (4,5) into every 32-bit lane.
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  const __m128i taps_0123 = _mm_unpacklo_epi32(taps, taps);  // 0 1 0 1 2 3 2 3
  const __m128i taps_4567 = _mm_unpackhi_epi32(taps, taps);  // 4 5 4 5 6 7 6 7
  const __m128i taps_23 = _mm_unpackhi_epi64(taps_0123, taps_0123);
  const __m128i taps_45 = _mm_unpacklo_epi64(taps_4567, taps_4567);

  // Each iteration consumes and produces two rows.
  const ptrdiff_t src_step = 2 * src_pitch;
  const ptrdiff_t dst_step = 2 * dst_pitch;
  const uint32_t row_pairs = height / 2;

  // Prime the pipeline with rows 2..4, interleaved so that one madd yields
  // rowA * tapA + rowB * tapB per pixel.
  const __m128i row2 =
      _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch));
  const __m128i row3 =
      _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch));
  __m128i row4 = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
  __m128i rows_23 = _mm_unpacklo_epi16(row2, row3);
  __m128i rows_34 = _mm_unpacklo_epi16(row3, row4);

  for (uint32_t pair = 0; pair < row_pairs; ++pair) {
    const __m128i row5 =
        _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
    const __m128i row6 =
        _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
    const __m128i rows_45 = _mm_unpacklo_epi16(row4, row5);
    const __m128i rows_56 = _mm_unpacklo_epi16(row5, row6);

    // 32-bit sums of the four taps for the first and second output row.
    __m128i sum_first = _mm_add_epi32(_mm_madd_epi16(rows_23, taps_23),
                                      _mm_madd_epi16(rows_45, taps_45));
    __m128i sum_second = _mm_add_epi32(_mm_madd_epi16(rows_34, taps_23),
                                       _mm_madd_epi16(rows_56, taps_45));

    // Round and drop the 7 fractional bits.
    sum_first = _mm_srai_epi32(_mm_add_epi32(sum_first, round), 7);
    sum_second = _mm_srai_epi32(_mm_add_epi32(sum_second, round), 7);

    // Narrow to 16 bits (results land in the low 64 bits) and clamp to the
    // valid pixel range.
    __m128i out_first = _mm_packs_epi32(sum_first, zero);
    __m128i out_second = _mm_packs_epi32(sum_second, zero);
    out_first = _mm_min_epi16(_mm_max_epi16(out_first, zero), pixel_max);
    out_second = _mm_min_epi16(_mm_max_epi16(out_second, zero), pixel_max);

    _mm_storel_epi64((__m128i *)dst_ptr, out_first);
    _mm_storel_epi64((__m128i *)(dst_ptr + dst_pitch), out_second);

    src_ptr += src_step;
    dst_ptr += dst_step;

    // Rows 4..6 of this iteration are rows 2..4 of the next one.
    rows_23 = rows_45;
    rows_34 = rows_56;
    row4 = row6;
  }
}

// Horizontal 4-tap filter over a 4-sample-wide block of 16-bit pixels.
//
// Only the taps filter[2]..filter[5] are used. For each of the `height` rows,
// output x is src[x - 1] * filter[2] + src[x] * filter[3] +
// src[x + 1] * filter[4] + src[x + 2] * filter[5], rounded by adding 64,
// shifted right by 7 and clamped to [0, (1 << bd) - 1]. Each row reads eight
// samples starting at src_ptr - 1. Pitches are in uint16_t units.
static void aom_highbd_filter_block1d4_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i round = _mm_set1_epi32(64);
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);

  // Broadcast the tap pairs (2,3) and (4,5) into every 32-bit lane.
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  const __m128i taps_0123 = _mm_unpacklo_epi32(taps, taps);  // 0 1 0 1 2 3 2 3
  const __m128i taps_4567 = _mm_unpackhi_epi32(taps, taps);  // 4 5 4 5 6 7 6 7
  const __m128i taps_23 = _mm_unpackhi_epi64(taps_0123, taps_0123);
  const __m128i taps_45 = _mm_unpacklo_epi64(taps_4567, taps_4567);

  // The first sample needed for output 0 sits one position to the left.
  const uint16_t *row = src_ptr - 1;

  for (uint32_t y = 0; y < height; ++y) {
    const __m128i samples = _mm_loadu_si128((const __m128i *)row);

    // Build (s[x-1], s[x]) and (s[x+1], s[x+2]) pairs for x = 0..3 by
    // interleaving byte-shifted copies of the row at 32-bit granularity.
    const __m128i pairs_23 =
        _mm_unpacklo_epi32(samples, _mm_srli_si128(samples, 2));
    const __m128i pairs_45 = _mm_unpacklo_epi32(_mm_srli_si128(samples, 4),
                                                _mm_srli_si128(samples, 6));

    __m128i acc = _mm_add_epi32(_mm_madd_epi16(pairs_23, taps_23),
                                _mm_madd_epi16(pairs_45, taps_45));

    // Round and drop the 7 fractional bits.
    acc = _mm_srai_epi32(_mm_add_epi32(acc, round), 7);

    // Narrow to 16 bits and clamp to the valid pixel range.
    __m128i out = _mm_packs_epi32(acc, zero);
    out = _mm_min_epi16(_mm_max_epi16(out, zero), pixel_max);

    _mm_storel_epi64((__m128i *)dst_ptr, out);

    row += src_pitch;
    dst_ptr += dst_pitch;
  }
}

// Vertical 4-tap filter over an 8-sample-wide column of 16-bit pixels.
//
// Only the taps filter[2]..filter[5] are used. Output row r is computed from
// the source rows r + 2 .. r + 5 (counted from src_ptr): the weighted sum is
// rounded by adding 64, shifted right by 7 and clamped to [0, (1 << bd) - 1].
// Rows are emitted in pairs; a trailing odd row is left untouched.
// src_pitch and dst_pitch are in uint16_t units.
//
// Neither src_ptr nor dst_ptr needs to be 16-byte aligned: all 128-bit loads
// and stores use the unaligned forms.
static void aom_highbd_filter_block1d8_v4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i round = _mm_set1_epi32(64);
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);

  // Broadcast the tap pairs (2,3) and (4,5) into every 32-bit lane.
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  const __m128i taps_0123 = _mm_unpacklo_epi32(taps, taps);  // 0 1 0 1 2 3 2 3
  const __m128i taps_4567 = _mm_unpackhi_epi32(taps, taps);  // 4 5 4 5 6 7 6 7
  const __m128i taps_23 = _mm_unpackhi_epi64(taps_0123, taps_0123);
  const __m128i taps_45 = _mm_unpacklo_epi64(taps_4567, taps_4567);

  // Each iteration consumes and produces two rows.
  const ptrdiff_t src_step = 2 * src_pitch;
  const ptrdiff_t dst_step = 2 * dst_pitch;

  // Prime the pipeline with rows 2..4. Rows are interleaved so that a single
  // madd yields rowA * tapA + rowB * tapB per pixel; "lo" covers pixels 0..3
  // and "hi" covers pixels 4..7.
  const __m128i row2 =
      _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_pitch));
  const __m128i row3 =
      _mm_loadu_si128((const __m128i *)(src_ptr + 3 * src_pitch));
  __m128i row4 = _mm_loadu_si128((const __m128i *)(src_ptr + 4 * src_pitch));
  __m128i rows_23_lo = _mm_unpacklo_epi16(row2, row3);
  __m128i rows_23_hi = _mm_unpackhi_epi16(row2, row3);
  __m128i rows_34_lo = _mm_unpacklo_epi16(row3, row4);
  __m128i rows_34_hi = _mm_unpackhi_epi16(row3, row4);

  for (uint32_t rows_left = height; rows_left > 1; rows_left -= 2) {
    const __m128i row5 =
        _mm_loadu_si128((const __m128i *)(src_ptr + 5 * src_pitch));
    const __m128i row6 =
        _mm_loadu_si128((const __m128i *)(src_ptr + 6 * src_pitch));
    const __m128i rows_45_lo = _mm_unpacklo_epi16(row4, row5);
    const __m128i rows_45_hi = _mm_unpackhi_epi16(row4, row5);
    const __m128i rows_56_lo = _mm_unpacklo_epi16(row5, row6);
    const __m128i rows_56_hi = _mm_unpackhi_epi16(row5, row6);

    // 32-bit sums of the four taps: first output row ...
    __m128i first_lo = _mm_add_epi32(_mm_madd_epi16(rows_23_lo, taps_23),
                                     _mm_madd_epi16(rows_45_lo, taps_45));
    __m128i first_hi = _mm_add_epi32(_mm_madd_epi16(rows_23_hi, taps_23),
                                     _mm_madd_epi16(rows_45_hi, taps_45));
    // ... and second output row.
    __m128i second_lo = _mm_add_epi32(_mm_madd_epi16(rows_34_lo, taps_23),
                                      _mm_madd_epi16(rows_56_lo, taps_45));
    __m128i second_hi = _mm_add_epi32(_mm_madd_epi16(rows_34_hi, taps_23),
                                      _mm_madd_epi16(rows_56_hi, taps_45));

    // Round and drop the 7 fractional bits.
    first_lo = _mm_srai_epi32(_mm_add_epi32(first_lo, round), 7);
    first_hi = _mm_srai_epi32(_mm_add_epi32(first_hi, round), 7);
    second_lo = _mm_srai_epi32(_mm_add_epi32(second_lo, round), 7);
    second_hi = _mm_srai_epi32(_mm_add_epi32(second_hi, round), 7);

    // Narrow to 16 bits and clamp to the valid pixel range.
    __m128i out_first = _mm_packs_epi32(first_lo, first_hi);
    __m128i out_second = _mm_packs_epi32(second_lo, second_hi);
    out_first = _mm_min_epi16(_mm_max_epi16(out_first, zero), pixel_max);
    out_second = _mm_min_epi16(_mm_max_epi16(out_second, zero), pixel_max);

    // Unaligned stores: the signature gives no alignment guarantee for
    // dst_ptr or dst_pitch, and an aligned store would fault on a
    // misaligned destination.
    _mm_storeu_si128((__m128i *)dst_ptr, out_first);
    _mm_storeu_si128((__m128i *)(dst_ptr + dst_pitch), out_second);

    src_ptr += src_step;
    dst_ptr += dst_step;

    // Rows 4..6 of this iteration are rows 2..4 of the next one.
    rows_23_lo = rows_45_lo;
    rows_23_hi = rows_45_hi;
    rows_34_lo = rows_56_lo;
    rows_34_hi = rows_56_hi;
    row4 = row6;
  }
}

// Horizontal 4-tap filter over an 8-sample-wide block of 16-bit pixels.
//
// Only the taps filter[2]..filter[5] are used. For each of the `height` rows,
// output x is src[x - 1] * filter[2] + src[x] * filter[3] +
// src[x + 1] * filter[4] + src[x + 2] * filter[5], rounded by adding 64,
// shifted right by 7 and clamped to [0, (1 << bd) - 1]. Each row reads the
// samples src_ptr[-1] .. src_ptr[10]. Pitches are in uint16_t units.
//
// Neither src_ptr nor dst_ptr needs to be 16-byte aligned: all 128-bit loads
// and stores use the unaligned forms.
static void aom_highbd_filter_block1d8_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i round = _mm_set1_epi32(64);
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);

  // Broadcast the tap pairs (2,3) and (4,5) into every 32-bit lane.
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  const __m128i taps_0123 = _mm_unpacklo_epi32(taps, taps);  // 0 1 0 1 2 3 2 3
  const __m128i taps_4567 = _mm_unpackhi_epi32(taps, taps);  // 4 5 4 5 6 7 6 7
  const __m128i taps_23 = _mm_unpackhi_epi64(taps_0123, taps_0123);
  const __m128i taps_45 = _mm_unpacklo_epi64(taps_4567, taps_4567);

  // The first sample needed for output 0 sits one position to the left.
  const uint16_t *row = src_ptr - 1;

  for (uint32_t y = 0; y < height; ++y) {
    const __m128i left = _mm_loadu_si128((const __m128i *)row);         // -1..6
    const __m128i right = _mm_loadu_si128((const __m128i *)(row + 4));  // 3..10

    // Even outputs (x = 0, 2, 4, 6): the (s[x-1], s[x]) pairs are already
    // adjacent in `left`; the (s[x+1], s[x+2]) pairs come from both loads
    // shifted by two samples.
    const __m128i even_45 = _mm_unpacklo_epi64(_mm_srli_si128(left, 4),
                                               _mm_srli_si128(right, 4));
    const __m128i even = _mm_add_epi32(_mm_madd_epi16(left, taps_23),
                                       _mm_madd_epi16(even_45, taps_45));

    // Odd outputs (x = 1, 3, 5, 7): same idea, one sample further right.
    const __m128i odd_23 = _mm_unpacklo_epi64(_mm_srli_si128(left, 2),
                                              _mm_srli_si128(right, 2));
    const __m128i odd_45 = _mm_unpacklo_epi64(_mm_srli_si128(left, 6),
                                              _mm_srli_si128(right, 6));
    const __m128i odd = _mm_add_epi32(_mm_madd_epi16(odd_23, taps_23),
                                      _mm_madd_epi16(odd_45, taps_45));

    // Interleave even/odd sums back into pixel order 0..3 and 4..7.
    __m128i sum_lo = _mm_unpacklo_epi32(even, odd);
    __m128i sum_hi = _mm_unpackhi_epi32(even, odd);

    // Round and drop the 7 fractional bits.
    sum_lo = _mm_srai_epi32(_mm_add_epi32(sum_lo, round), 7);
    sum_hi = _mm_srai_epi32(_mm_add_epi32(sum_hi, round), 7);

    // Narrow to 16 bits and clamp to the valid pixel range.
    __m128i out = _mm_packs_epi32(sum_lo, sum_hi);
    out = _mm_min_epi16(_mm_max_epi16(out, zero), pixel_max);

    // Unaligned store: the signature gives no alignment guarantee for
    // dst_ptr or dst_pitch, and an aligned store would fault on a
    // misaligned destination.
    _mm_storeu_si128((__m128i *)dst_ptr, out);

    row += src_pitch;
    dst_ptr += dst_pitch;
  }
}

// Vertical 4-tap filter over a 16-sample-wide column: applies the 8-wide
// kernel to the left half and then to the right half, with all other
// arguments passed through unchanged.
static void aom_highbd_filter_block1d16_v4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  for (int col = 0; col < 16; col += 8) {
    aom_highbd_filter_block1d8_v4_sse2(src_ptr + col, src_pitch, dst_ptr + col,
                                       dst_pitch, height, filter, bd);
  }
}

// Horizontal 4-tap filter over a 16-sample-wide block: applies the 8-wide
// kernel to the left half and then to the right half, with all other
// arguments passed through unchanged.
static void aom_highbd_filter_block1d16_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  for (int col = 0; col < 16; col += 8) {
    aom_highbd_filter_block1d8_h4_sse2(src_ptr + col, src_pitch, dst_ptr + col,
                                       dst_pitch, height, filter, bd);
  }
}

// Declarations of the 8-tap kernels. They are not defined in this file.
// NOTE(review): highbd_filter8_1dfunction is presumably a function type
// supplied by aom_dsp/x86/convolve.h -- confirm.
// From aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;

// Declarations of the 2-tap (bilinear) kernels, likewise defined elsewhere.
// From aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;

// The two macro invocations below are expected to expand to the public entry
// points whose prototypes are listed here.
// NOTE(review): HIGH_FUN_CONV_1D is not defined in this file (presumably in
// aom_dsp/x86/convolve.h); how it selects between the 8-, 4- and 2-tap
// kernels and the 16/8/4-wide variants should be confirmed there.
// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
//                                      ptrdiff_t src_stride,
//                                      uint8_t *dst,
//                                      ptrdiff_t dst_stride,
//                                      const int16_t *filter_x,
//                                      int x_step_q4,
//                                      const int16_t *filter_y,
//                                      int y_step_q4,
//                                      int w, int h, int bd);
// void aom_highbd_convolve8_vert_sse2(const uint8_t *src,
//                                     ptrdiff_t src_stride,
//                                     uint8_t *dst,
//                                     ptrdiff_t dst_stride,
//                                     const int16_t *filter_x,
//                                     int x_step_q4,
//                                     const int16_t *filter_y,
//                                     int y_step_q4,
//                                     int w, int h, int bd);
// Horizontal: the source expression is passed as plain `src`; the 4-tap
// horizontal kernels in this file step back themselves (src_ptr -= 3).
HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
// Vertical: the source expression is moved up three rows, matching the 4-tap
// vertical kernels in this file, which read rows 2..6 relative to the
// pointer they receive.
HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)

Coverage Report

Created: 2025-06-13 07:07

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright (c) 2018, Alliance for Open Media. All rights reserved.
3		*
4		* This source code is subject to the terms of the BSD 2 Clause License and
5		* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6		* was not distributed with this source code in the LICENSE file, you can
7		* obtain it at www.aomedia.org/license/software. If the Alliance for Open
8		* Media Patent License 1.0 was not distributed with this source code in the
9		* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10		*/
11		#include <emmintrin.h>
12
13		#include "config/aom_dsp_rtcd.h"
14		#include "aom_dsp/x86/convolve.h"
15
16		// -----------------------------------------------------------------------------
17
18		static void aom_highbd_filter_block1d4_v4_sse2(
19		const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
20	0	ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
21	0	__m128i filtersReg;
22	0	__m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
23	0	__m128i srcReg23_lo, srcReg34_lo;
24	0	__m128i srcReg45_lo, srcReg56_lo;
25	0	__m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
26	0	__m128i resReg23_45_lo, resReg34_56_lo;
27	0	__m128i resReg23_45, resReg34_56;
28	0	__m128i addFilterReg64, secondFilters, thirdFilters;
29	0	unsigned int i;
30	0	ptrdiff_t src_stride, dst_stride;
31
32	0	const __m128i max = _mm_set1_epi16((1 << bd) - 1);
33	0	addFilterReg64 = _mm_set1_epi32(64);
34	0	filtersReg = _mm_loadu_si128((const __m128i *)filter);
35
36		// coeffs 0 1 0 1 2 3 2 3
37	0	const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
38		// coeffs 4 5 4 5 6 7 6 7
39	0	const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
40
41	0	secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
42	0	thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
43
44		// multiply the size of the source and destination stride by two
45	0	src_stride = src_pitch << 1;
46	0	dst_stride = dst_pitch << 1;
47
48	0	srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
49	0	srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
50	0	srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3);
51
52	0	srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
53	0	srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4);
54
55	0	for (i = height; i > 1; i -= 2) {
56	0	srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
57	0	srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5);
58
59	0	srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
60	0	srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6);
61
62		// multiply 2 adjacent elements with the filter and add the result
63
64	0	resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters);
65	0	resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters);
66	0	resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters);
67	0	resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters);
68
69	0	resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo);
70	0	resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo);
71
72		// shift by 7 bit each 32 bit
73	0	resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64);
74	0	resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64);
75	0	resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7);
76	0	resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7);
77
78		// shrink to 16 bit each 32 bits, the first lane contain the first
79		// convolve result and the second lane contain the second convolve
80		// result
81	0	resReg23_45 = _mm_packs_epi32(resReg23_45_lo, _mm_setzero_si128());
82	0	resReg34_56 = _mm_packs_epi32(resReg34_56_lo, _mm_setzero_si128());
83
84	0	resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128());
85	0	resReg23_45 = _mm_min_epi16(resReg23_45, max);
86	0	resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128());
87	0	resReg34_56 = _mm_min_epi16(resReg34_56, max);
88
89	0	src_ptr += src_stride;
90
91	0	_mm_storel_epi64((__m128i *)dst_ptr, (resReg23_45));
92	0	_mm_storel_epi64((__m128i *)(dst_ptr + dst_pitch), (resReg34_56));
93
94	0	dst_ptr += dst_stride;
95
96		// save part of the registers for next strides
97	0	srcReg23_lo = srcReg45_lo;
98	0	srcReg34_lo = srcReg56_lo;
99	0	srcReg4 = srcReg6;
100	0	}
101	0	}
102
103		static void aom_highbd_filter_block1d4_h4_sse2(
104		const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
105	0	ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
106	0	__m128i filtersReg;
107	0	__m128i addFilterReg64;
108	0	__m128i secondFilters, thirdFilters;
109	0	__m128i srcRegFilt32b1_1;
110	0	__m128i srcReg32b1;
111	0	unsigned int i;
112	0	src_ptr -= 3;
113	0	addFilterReg64 = _mm_set1_epi32(64);
114	0	filtersReg = _mm_loadu_si128((const __m128i *)filter);
115	0	const __m128i max = _mm_set1_epi16((1 << bd) - 1);
116
117		// coeffs 0 1 0 1 2 3 2 3
118	0	const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
119		// coeffs 4 5 4 5 6 7 6 7
120	0	const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
121
122	0	secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
123	0	thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
124
125	0	for (i = height; i > 0; i -= 1) {
126	0	srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2));
127
128	0	__m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2);
129	0	__m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4);
130	0	__m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6);
131	0	__m128i ss_23 = _mm_unpacklo_epi32(srcReg32b1, ss_3_1);
132	0	__m128i ss_45 = _mm_unpacklo_epi32(ss_4_1, ss_5_1);
133
134	0	ss_23 = _mm_madd_epi16(ss_23, secondFilters);
135	0	ss_45 = _mm_madd_epi16(ss_45, thirdFilters);
136	0	srcRegFilt32b1_1 = _mm_add_epi32(ss_23, ss_45);
137
138		// shift by 7 bit each 32 bit
139	0	srcRegFilt32b1_1 = _mm_add_epi32(srcRegFilt32b1_1, addFilterReg64);
140	0	srcRegFilt32b1_1 = _mm_srai_epi32(srcRegFilt32b1_1, 7);
141
142	0	srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128());
143	0	srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
144	0	srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max);
145
146	0	src_ptr += src_pitch;
147
148	0	_mm_storel_epi64((__m128i *)dst_ptr, srcRegFilt32b1_1);
149
150	0	dst_ptr += dst_pitch;
151	0	}
152	0	}
153
154		static void aom_highbd_filter_block1d8_v4_sse2(
155		const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
156	0	ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
157	0	__m128i filtersReg;
158	0	__m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
159	0	__m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
160	0	__m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
161	0	__m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
162	0	__m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
163	0	__m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
164	0	__m128i resReg23_45, resReg34_56;
165	0	__m128i addFilterReg64, secondFilters, thirdFilters;
166	0	unsigned int i;
167	0	ptrdiff_t src_stride, dst_stride;
168
169	0	const __m128i max = _mm_set1_epi16((1 << bd) - 1);
170	0	addFilterReg64 = _mm_set1_epi32(64);
171	0	filtersReg = _mm_loadu_si128((const __m128i *)filter);
172
173		// coeffs 0 1 0 1 2 3 2 3
174	0	const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
175		// coeffs 4 5 4 5 6 7 6 7
176	0	const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
177
178	0	secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
179	0	thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
180
181		// multiple the size of the source and destination stride by two
182	0	src_stride = src_pitch << 1;
183	0	dst_stride = dst_pitch << 1;
184
185	0	srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
186	0	srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
187	0	srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3);
188	0	srcReg23_hi = _mm_unpackhi_epi16(srcReg2, srcReg3);
189
190	0	srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
191	0	srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4);
192	0	srcReg34_hi = _mm_unpackhi_epi16(srcReg3, srcReg4);
193
194	0	for (i = height; i > 1; i -= 2) {
195	0	srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
196
197	0	srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5);
198	0	srcReg45_hi = _mm_unpackhi_epi16(srcReg4, srcReg5);
199
200	0	srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
201
202	0	srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6);
203	0	srcReg56_hi = _mm_unpackhi_epi16(srcReg5, srcReg6);
204
205		// multiply 2 adjacent elements with the filter and add the result
206
207	0	resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters);
208	0	resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters);
209	0	resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters);
210	0	resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters);
211
212	0	resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo);
213	0	resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo);
214
215		// multiply 2 adjacent elements with the filter and add the result
216
217	0	resReg23_hi = _mm_madd_epi16(srcReg23_hi, secondFilters);
218	0	resReg34_hi = _mm_madd_epi16(srcReg34_hi, secondFilters);
219	0	resReg45_hi = _mm_madd_epi16(srcReg45_hi, thirdFilters);
220	0	resReg56_hi = _mm_madd_epi16(srcReg56_hi, thirdFilters);
221
222	0	resReg23_45_hi = _mm_add_epi32(resReg23_hi, resReg45_hi);
223	0	resReg34_56_hi = _mm_add_epi32(resReg34_hi, resReg56_hi);
224
225		// shift by 7 bit each 32 bit
226	0	resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64);
227	0	resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64);
228	0	resReg23_45_hi = _mm_add_epi32(resReg23_45_hi, addFilterReg64);
229	0	resReg34_56_hi = _mm_add_epi32(resReg34_56_hi, addFilterReg64);
230	0	resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7);
231	0	resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7);
232	0	resReg23_45_hi = _mm_srai_epi32(resReg23_45_hi, 7);
233	0	resReg34_56_hi = _mm_srai_epi32(resReg34_56_hi, 7);
234
235		// shrink to 16 bit each 32 bits, the first lane contain the first
236		// convolve result and the second lane contain the second convolve
237		// result
238	0	resReg23_45 = _mm_packs_epi32(resReg23_45_lo, resReg23_45_hi);
239	0	resReg34_56 = _mm_packs_epi32(resReg34_56_lo, resReg34_56_hi);
240
241	0	resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128());
242	0	resReg23_45 = _mm_min_epi16(resReg23_45, max);
243	0	resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128());
244	0	resReg34_56 = _mm_min_epi16(resReg34_56, max);
245
246	0	src_ptr += src_stride;
247
248	0	_mm_store_si128((__m128i *)dst_ptr, (resReg23_45));
249	0	_mm_store_si128((__m128i *)(dst_ptr + dst_pitch), (resReg34_56));
250
251	0	dst_ptr += dst_stride;
252
253		// save part of the registers for next strides
254	0	srcReg23_lo = srcReg45_lo;
255	0	srcReg23_hi = srcReg45_hi;
256	0	srcReg34_lo = srcReg56_lo;
257	0	srcReg34_hi = srcReg56_hi;
258	0	srcReg4 = srcReg6;
259	0	}
260	0	}
261
262		static void aom_highbd_filter_block1d8_h4_sse2(
263		const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
264	0	ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
265	0	__m128i filtersReg;
266	0	__m128i addFilterReg64;
267	0	__m128i secondFilters, thirdFilters;
268	0	__m128i srcRegFilt32b1_1, srcRegFilt32b1_2;
269	0	__m128i srcReg32b1, srcReg32b2;
270	0	unsigned int i;
271	0	src_ptr -= 3;
272	0	addFilterReg64 = _mm_set1_epi32(64);
273	0	filtersReg = _mm_loadu_si128((const __m128i *)filter);
274	0	const __m128i max = _mm_set1_epi16((1 << bd) - 1);
275
276		// coeffs 0 1 0 1 2 3 2 3
277	0	const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
278		// coeffs 4 5 4 5 6 7 6 7
279	0	const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
280
281	0	secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
282	0	thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
283
284	0	for (i = height; i > 0; i -= 1) {
285	0	srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2));
286	0	srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 6));
287
288	0	__m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4);
289	0	__m128i ss_4_2 = _mm_srli_si128(srcReg32b2, 4);
290	0	__m128i ss_4 = _mm_unpacklo_epi64(ss_4_1, ss_4_2);
291
292	0	__m128i d1 = _mm_madd_epi16(srcReg32b1, secondFilters);
293	0	__m128i d2 = _mm_madd_epi16(ss_4, thirdFilters);
294	0	srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
295
296	0	__m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2);
297	0	__m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6);
298	0	__m128i ss_3_2 = _mm_srli_si128(srcReg32b2, 2);
299	0	__m128i ss_5_2 = _mm_srli_si128(srcReg32b2, 6);
300	0	__m128i ss_3 = _mm_unpacklo_epi64(ss_3_1, ss_3_2);
301	0	__m128i ss_5 = _mm_unpacklo_epi64(ss_5_1, ss_5_2);
302
303	0	d1 = _mm_madd_epi16(ss_3, secondFilters);
304	0	d2 = _mm_madd_epi16(ss_5, thirdFilters);
305	0	srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
306
307	0	__m128i res_lo_1 = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
308	0	__m128i res_hi_1 = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
309
310		// shift by 7 bit each 32 bit
311	0	res_lo_1 = _mm_add_epi32(res_lo_1, addFilterReg64);
312	0	res_hi_1 = _mm_add_epi32(res_hi_1, addFilterReg64);
313	0	res_lo_1 = _mm_srai_epi32(res_lo_1, 7);
314	0	res_hi_1 = _mm_srai_epi32(res_hi_1, 7);
315
316	0	srcRegFilt32b1_1 = _mm_packs_epi32(res_lo_1, res_hi_1);
317
318	0	srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
319	0	srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max);
320
321	0	src_ptr += src_pitch;
322
323	0	_mm_store_si128((__m128i *)dst_ptr, srcRegFilt32b1_1);
324
325	0	dst_ptr += dst_pitch;
326	0	}
327	0	}
328
329		static void aom_highbd_filter_block1d16_v4_sse2(
330		const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
331	0	ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
332	0	aom_highbd_filter_block1d8_v4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch,
333	0	height, filter, bd);
334	0	aom_highbd_filter_block1d8_v4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8),
335	0	dst_pitch, height, filter, bd);
336	0	}
337
338		static void aom_highbd_filter_block1d16_h4_sse2(
339		const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
340	0	ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
341	0	aom_highbd_filter_block1d8_h4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch,
342	0	height, filter, bd);
343	0	aom_highbd_filter_block1d8_h4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8),
344	0	dst_pitch, height, filter, bd);
345	0	}
346
347		// From aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
348		highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
349		highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
350		highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
351		highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
352		highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
353		highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
354
355		// From aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
356		highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
357		highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
358		highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
359		highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
360		highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
361		highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
362
363		// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
364		// ptrdiff_t src_stride,
365		// uint8_t *dst,
366		// ptrdiff_t dst_stride,
367		// const int16_t *filter_x,
368		// int x_step_q4,
369		// const int16_t *filter_y,
370		// int y_step_q4,
371		// int w, int h, int bd);
372		// void aom_highbd_convolve8_vert_sse2(const uint8_t *src,
373		// ptrdiff_t src_stride,
374		// uint8_t *dst,
375		// ptrdiff_t dst_stride,
376		// const int16_t *filter_x,
377		// int x_step_q4,
378		// const int16_t *filter_y,
379		// int y_step_q4,
380		// int w, int h, int bd);
381		HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
382		HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)