/src/libvpx/vpx_dsp/x86/highbd_sad_avx2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2022 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | #include <immintrin.h> |
11 | | #include "./vpx_dsp_rtcd.h" |
12 | | #include "vpx/vpx_integer.h" |
13 | | |
14 | 0 | static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) { |
15 | 0 | const __m256i t0 = _mm256_add_epi32(sums_32, _mm256_srli_si256(sums_32, 8)); |
16 | 0 | const __m256i t1 = _mm256_add_epi32(t0, _mm256_srli_si256(t0, 4)); |
17 | 0 | const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t1), |
18 | 0 | _mm256_extractf128_si256(t1, 1)); |
19 | 0 | return (unsigned int)_mm_cvtsi128_si32(sum); |
20 | 0 | } |
21 | | |
22 | | static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16, |
23 | | const uint16_t *src, int src_stride, |
24 | | uint16_t *ref, int ref_stride, |
25 | 0 | int height) { |
26 | 0 | int i; |
27 | 0 | for (i = 0; i < height; ++i) { |
28 | | // load src and all ref[] |
29 | 0 | const __m256i s0 = _mm256_load_si256((const __m256i *)src); |
30 | 0 | const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); |
31 | 0 | const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); |
32 | 0 | const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); |
33 | 0 | const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); |
34 | 0 | const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); |
35 | 0 | const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32)); |
36 | 0 | const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48)); |
37 | | // absolute differences between every ref[] to src |
38 | 0 | const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); |
39 | 0 | const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); |
40 | 0 | const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(r2, s2)); |
41 | 0 | const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(r3, s3)); |
42 | | // sum every abs diff |
43 | 0 | *sums_16 = |
44 | 0 | _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1)); |
45 | 0 | *sums_16 = |
46 | 0 | _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3)); |
47 | |
|
48 | 0 | src += src_stride; |
49 | 0 | ref += ref_stride; |
50 | 0 | } |
51 | 0 | } |
52 | | |
53 | | static VPX_FORCE_INLINE unsigned int highbd_sad64xN_avx2(const uint8_t *src_ptr, |
54 | | int src_stride, |
55 | | const uint8_t *ref_ptr, |
56 | | int ref_stride, |
57 | 0 | int n) { |
58 | 0 | const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); |
59 | 0 | uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); |
60 | 0 | __m256i sums_32 = _mm256_setzero_si256(); |
61 | 0 | int i; |
62 | |
|
63 | 0 | for (i = 0; i < (n / 2); ++i) { |
64 | 0 | __m256i sums_16 = _mm256_setzero_si256(); |
65 | |
|
66 | 0 | highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); |
67 | | |
68 | | /* sums_16 will outrange after 2 rows, so add current sums_16 to |
69 | | * sums_32*/ |
70 | 0 | sums_32 = _mm256_add_epi32( |
71 | 0 | sums_32, |
72 | 0 | _mm256_add_epi32( |
73 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), |
74 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); |
75 | |
|
76 | 0 | src += src_stride << 1; |
77 | 0 | ref += ref_stride << 1; |
78 | 0 | } |
79 | 0 | return calc_final(sums_32); |
80 | 0 | } |
81 | | |
// Defines vpx_highbd_sad64x<n>_avx2(): SAD of a 64x<n> high-bitdepth block.
#define HIGHBD_SAD64XN(n)                                                     \
  unsigned int vpx_highbd_sad64x##n##_avx2(const uint8_t *src, int src_stride, \
                                           const uint8_t *ref,                \
                                           int ref_stride) {                  \
    return highbd_sad64xN_avx2(src, src_stride, ref, ref_stride, n);          \
  }
88 | | |
// Defines vpx_highbd_sad_skip_64x<n>_avx2(): SAD over every other row
// (strides doubled, n/2 rows), scaled by 2 to approximate the full SAD.
#define HIGHBD_SADSKIP64xN(n)                                              \
  unsigned int vpx_highbd_sad_skip_64x##n##_avx2(                          \
      const uint8_t *src, int src_stride, const uint8_t *ref,              \
      int ref_stride) {                                                    \
    return 2 * highbd_sad64xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
                                   n / 2);                                 \
  }
96 | | |
97 | | static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, |
98 | | const uint16_t *src, int src_stride, |
99 | | uint16_t *ref, int ref_stride, |
100 | 0 | int height) { |
101 | 0 | int i; |
102 | 0 | for (i = 0; i < height; ++i) { |
103 | | // load src and all ref[] |
104 | 0 | const __m256i s0 = _mm256_load_si256((const __m256i *)src); |
105 | 0 | const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); |
106 | 0 | const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); |
107 | 0 | const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); |
108 | | // absolute differences between every ref[] to src |
109 | 0 | const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); |
110 | 0 | const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); |
111 | | // sum every abs diff |
112 | 0 | *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); |
113 | 0 | *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); |
114 | |
|
115 | 0 | src += src_stride; |
116 | 0 | ref += ref_stride; |
117 | 0 | } |
118 | 0 | } |
119 | | |
120 | | static VPX_FORCE_INLINE unsigned int highbd_sad32xN_avx2(const uint8_t *src_ptr, |
121 | | int src_stride, |
122 | | const uint8_t *ref_ptr, |
123 | | int ref_stride, |
124 | 0 | int n) { |
125 | 0 | const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); |
126 | 0 | uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); |
127 | 0 | __m256i sums_32 = _mm256_setzero_si256(); |
128 | 0 | int i; |
129 | |
|
130 | 0 | for (i = 0; i < (n / 8); ++i) { |
131 | 0 | __m256i sums_16 = _mm256_setzero_si256(); |
132 | |
|
133 | 0 | highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); |
134 | | |
135 | | /* sums_16 will outrange after 8 rows, so add current sums_16 to |
136 | | * sums_32*/ |
137 | 0 | sums_32 = _mm256_add_epi32( |
138 | 0 | sums_32, |
139 | 0 | _mm256_add_epi32( |
140 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), |
141 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); |
142 | |
|
143 | 0 | src += src_stride << 3; |
144 | 0 | ref += ref_stride << 3; |
145 | 0 | } |
146 | 0 | return calc_final(sums_32); |
147 | 0 | } |
148 | | |
// Defines vpx_highbd_sad32x<n>_avx2(): SAD of a 32x<n> high-bitdepth block.
#define HIGHBD_SAD32XN(n)                                                     \
  unsigned int vpx_highbd_sad32x##n##_avx2(const uint8_t *src, int src_stride, \
                                           const uint8_t *ref,                \
                                           int ref_stride) {                  \
    return highbd_sad32xN_avx2(src, src_stride, ref, ref_stride, n);          \
  }
155 | | |
// Defines vpx_highbd_sad_skip_32x<n>_avx2(): SAD over every other row
// (strides doubled, n/2 rows), scaled by 2 to approximate the full SAD.
#define HIGHBD_SADSKIP32xN(n)                                              \
  unsigned int vpx_highbd_sad_skip_32x##n##_avx2(                          \
      const uint8_t *src, int src_stride, const uint8_t *ref,              \
      int ref_stride) {                                                    \
    return 2 * highbd_sad32xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
                                   n / 2);                                 \
  }
163 | | |
164 | | static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, |
165 | | const uint16_t *src, int src_stride, |
166 | | uint16_t *ref, int ref_stride, |
167 | 0 | int height) { |
168 | 0 | int i; |
169 | 0 | for (i = 0; i < height; i += 2) { |
170 | | // load src and all ref[] |
171 | 0 | const __m256i s0 = _mm256_load_si256((const __m256i *)src); |
172 | 0 | const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride)); |
173 | 0 | const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); |
174 | 0 | const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); |
175 | | // absolute differences between every ref[] to src |
176 | 0 | const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); |
177 | 0 | const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); |
178 | | // sum every abs diff |
179 | 0 | *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); |
180 | 0 | *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); |
181 | |
|
182 | 0 | src += src_stride << 1; |
183 | 0 | ref += ref_stride << 1; |
184 | 0 | } |
185 | 0 | } |
186 | | |
187 | | static VPX_FORCE_INLINE unsigned int highbd_sad16xN_avx2(const uint8_t *src_ptr, |
188 | | int src_stride, |
189 | | const uint8_t *ref_ptr, |
190 | | int ref_stride, |
191 | 0 | int n) { |
192 | 0 | const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); |
193 | 0 | uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); |
194 | 0 | __m256i sums_32 = _mm256_setzero_si256(); |
195 | 0 | const int height = VPXMIN(16, n); |
196 | 0 | const int num_iters = n / height; |
197 | 0 | int i; |
198 | |
|
199 | 0 | for (i = 0; i < num_iters; ++i) { |
200 | 0 | __m256i sums_16 = _mm256_setzero_si256(); |
201 | |
|
202 | 0 | highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, height); |
203 | | |
204 | | // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 |
205 | 0 | sums_32 = _mm256_add_epi32( |
206 | 0 | sums_32, |
207 | 0 | _mm256_add_epi32( |
208 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), |
209 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); |
210 | |
|
211 | 0 | src += src_stride << 4; |
212 | 0 | ref += ref_stride << 4; |
213 | 0 | } |
214 | 0 | return calc_final(sums_32); |
215 | 0 | } |
216 | | |
// Defines vpx_highbd_sad16x<n>_avx2(): SAD of a 16x<n> high-bitdepth block.
#define HIGHBD_SAD16XN(n)                                                     \
  unsigned int vpx_highbd_sad16x##n##_avx2(const uint8_t *src, int src_stride, \
                                           const uint8_t *ref,                \
                                           int ref_stride) {                  \
    return highbd_sad16xN_avx2(src, src_stride, ref, ref_stride, n);          \
  }
223 | | |
// Defines vpx_highbd_sad_skip_16x<n>_avx2(): SAD over every other row
// (strides doubled, n/2 rows), scaled by 2 to approximate the full SAD.
#define HIGHBD_SADSKIP16xN(n)                                              \
  unsigned int vpx_highbd_sad_skip_16x##n##_avx2(                          \
      const uint8_t *src, int src_stride, const uint8_t *ref,              \
      int ref_stride) {                                                    \
    return 2 * highbd_sad16xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
                                   n / 2);                                 \
  }
231 | | |
// SAD of a 16x16 high-bitdepth block. 16 rows fit in the 16-bit
// accumulator, so the widening to 32 bits happens exactly once.
unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride,
                                      const uint8_t *ref_ptr, int ref_stride) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16);

  {
    const __m256i lo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16));
    const __m256i hi =
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1));
    return calc_final(_mm256_add_epi32(lo, hi));
  }
}
247 | | |
// SAD of a 16x8 high-bitdepth block. 8 rows fit in the 16-bit
// accumulator, so the widening to 32 bits happens exactly once.
unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride,
                                     const uint8_t *ref_ptr, int ref_stride) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8);

  {
    const __m256i lo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16));
    const __m256i hi =
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1));
    return calc_final(_mm256_add_epi32(lo, hi));
  }
}
263 | | |
// Instantiate the plain and skip SAD entry points. 16x16 and 16x8 have
// bespoke implementations below instead of the HIGHBD_SAD16XN macro.
// clang-format off
HIGHBD_SAD64XN(64)
HIGHBD_SADSKIP64xN(64)
HIGHBD_SAD64XN(32)
HIGHBD_SADSKIP64xN(32)
HIGHBD_SAD32XN(64)
HIGHBD_SADSKIP32xN(64)
HIGHBD_SAD32XN(32)
HIGHBD_SADSKIP32xN(32)
HIGHBD_SAD32XN(16)
HIGHBD_SADSKIP32xN(16)
HIGHBD_SAD16XN(32)
HIGHBD_SADSKIP16xN(32)
HIGHBD_SADSKIP16xN(16)
HIGHBD_SADSKIP16xN(8)
// clang-format on
280 | | |
281 | | // AVG ------------------------------------------------------------------------- |
282 | | static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16, |
283 | | const uint16_t *src, |
284 | | int src_stride, uint16_t *ref, |
285 | | int ref_stride, uint16_t *sec, |
286 | 0 | int height) { |
287 | 0 | int i; |
288 | 0 | for (i = 0; i < height; ++i) { |
289 | | // load src and all ref[] |
290 | 0 | const __m256i s0 = _mm256_load_si256((const __m256i *)src); |
291 | 0 | const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); |
292 | 0 | const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); |
293 | 0 | const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); |
294 | 0 | const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); |
295 | 0 | const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); |
296 | 0 | const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32)); |
297 | 0 | const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48)); |
298 | 0 | const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); |
299 | 0 | const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); |
300 | 0 | const __m256i x2 = _mm256_loadu_si256((const __m256i *)(sec + 32)); |
301 | 0 | const __m256i x3 = _mm256_loadu_si256((const __m256i *)(sec + 48)); |
302 | 0 | const __m256i avg0 = _mm256_avg_epu16(r0, x0); |
303 | 0 | const __m256i avg1 = _mm256_avg_epu16(r1, x1); |
304 | 0 | const __m256i avg2 = _mm256_avg_epu16(r2, x2); |
305 | 0 | const __m256i avg3 = _mm256_avg_epu16(r3, x3); |
306 | | // absolute differences between every ref/pred avg to src |
307 | 0 | const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); |
308 | 0 | const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); |
309 | 0 | const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(avg2, s2)); |
310 | 0 | const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(avg3, s3)); |
311 | | // sum every abs diff |
312 | 0 | *sums_16 = |
313 | 0 | _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1)); |
314 | 0 | *sums_16 = |
315 | 0 | _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3)); |
316 | |
|
317 | 0 | src += src_stride; |
318 | 0 | ref += ref_stride; |
319 | 0 | sec += 64; |
320 | 0 | } |
321 | 0 | } |
322 | | |
// Defines vpx_highbd_sad64x<n>_avg_avx2(): SAD of a 64x<n> high-bitdepth
// block measured against the average of ref and second_pred. Works in
// 2-row slices, widening the 16-bit sums into sums_32 after each slice.
#define HIGHBD_SAD64XN_AVG(n)                                                 \
  unsigned int vpx_highbd_sad64x##n##_avg_avx2(                               \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                       \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                             \
    uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);                         \
    __m256i sums_32 = _mm256_setzero_si256();                                 \
    int i;                                                                    \
                                                                              \
    for (i = 0; i < (n / 2); ++i) {                                           \
      __m256i sums_16 = _mm256_setzero_si256();                               \
                                                                              \
      highbd_sad64xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 2); \
                                                                              \
      /* sums_16 will outrange after 2 rows, so add current sums_16 to        \
       * sums_32*/                                                            \
      sums_32 = _mm256_add_epi32(                                             \
          sums_32,                                                            \
          _mm256_add_epi32(                                                   \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),         \
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));  \
                                                                              \
      src += src_stride << 1;                                                 \
      ref += ref_stride << 1;                                                 \
      sec += 64 << 1;                                                         \
    }                                                                         \
    return calc_final(sums_32);                                               \
  }

// 64x64
HIGHBD_SAD64XN_AVG(64)

// 64x32
HIGHBD_SAD64XN_AVG(32)
358 | | |
359 | | static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16, |
360 | | const uint16_t *src, |
361 | | int src_stride, uint16_t *ref, |
362 | | int ref_stride, uint16_t *sec, |
363 | 0 | int height) { |
364 | 0 | int i; |
365 | 0 | for (i = 0; i < height; ++i) { |
366 | | // load src and all ref[] |
367 | 0 | const __m256i s0 = _mm256_load_si256((const __m256i *)src); |
368 | 0 | const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); |
369 | 0 | const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); |
370 | 0 | const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); |
371 | 0 | const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); |
372 | 0 | const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); |
373 | 0 | const __m256i avg0 = _mm256_avg_epu16(r0, x0); |
374 | 0 | const __m256i avg1 = _mm256_avg_epu16(r1, x1); |
375 | | // absolute differences between every ref/pred avg to src |
376 | 0 | const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); |
377 | 0 | const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); |
378 | | // sum every abs diff |
379 | 0 | *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); |
380 | 0 | *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); |
381 | |
|
382 | 0 | src += src_stride; |
383 | 0 | ref += ref_stride; |
384 | 0 | sec += 32; |
385 | 0 | } |
386 | 0 | } |
387 | | |
// Defines vpx_highbd_sad32x<n>_avg_avx2(): SAD of a 32x<n> high-bitdepth
// block measured against the average of ref and second_pred. Works in
// 8-row slices, widening the 16-bit sums into sums_32 after each slice.
#define HIGHBD_SAD32XN_AVG(n)                                                 \
  unsigned int vpx_highbd_sad32x##n##_avg_avx2(                               \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                       \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                             \
    uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);                         \
    __m256i sums_32 = _mm256_setzero_si256();                                 \
    int i;                                                                    \
                                                                              \
    for (i = 0; i < (n / 8); ++i) {                                           \
      __m256i sums_16 = _mm256_setzero_si256();                               \
                                                                              \
      highbd_sad32xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); \
                                                                              \
      /* sums_16 will outrange after 8 rows, so add current sums_16 to        \
       * sums_32*/                                                            \
      sums_32 = _mm256_add_epi32(                                             \
          sums_32,                                                            \
          _mm256_add_epi32(                                                   \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),         \
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));  \
                                                                              \
      src += src_stride << 3;                                                 \
      ref += ref_stride << 3;                                                 \
      sec += 32 << 3;                                                         \
    }                                                                         \
    return calc_final(sums_32);                                               \
  }

// 32x64
HIGHBD_SAD32XN_AVG(64)

// 32x32
HIGHBD_SAD32XN_AVG(32)

// 32x16
HIGHBD_SAD32XN_AVG(16)
426 | | |
427 | | static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16, |
428 | | const uint16_t *src, |
429 | | int src_stride, uint16_t *ref, |
430 | | int ref_stride, uint16_t *sec, |
431 | 0 | int height) { |
432 | 0 | int i; |
433 | 0 | for (i = 0; i < height; i += 2) { |
434 | | // load src and all ref[] |
435 | 0 | const __m256i s0 = _mm256_load_si256((const __m256i *)src); |
436 | 0 | const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride)); |
437 | 0 | const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); |
438 | 0 | const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); |
439 | 0 | const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); |
440 | 0 | const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); |
441 | 0 | const __m256i avg0 = _mm256_avg_epu16(r0, x0); |
442 | 0 | const __m256i avg1 = _mm256_avg_epu16(r1, x1); |
443 | | // absolute differences between every ref[] to src |
444 | 0 | const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); |
445 | 0 | const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); |
446 | | // sum every abs diff |
447 | 0 | *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); |
448 | 0 | *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); |
449 | |
|
450 | 0 | src += src_stride << 1; |
451 | 0 | ref += ref_stride << 1; |
452 | 0 | sec += 32; |
453 | 0 | } |
454 | 0 | } |
455 | | |
// SAD of a 16x32 high-bitdepth block against the average of ref and
// second_pred. Processed as two 16-row halves; the 16-bit accumulator
// is widened into sums_32 after each half.
unsigned int vpx_highbd_sad16x32_avg_avx2(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride,
                                          const uint8_t *second_pred) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
  __m256i sums_32 = _mm256_setzero_si256();
  int half;

  for (half = 0; half < 2; ++half) {
    __m256i sums_16 = _mm256_setzero_si256();

    highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);

    // sums_16 would outrange after 16 rows; widen into sums_32 here.
    {
      const __m256i lo =
          _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16));
      const __m256i hi =
          _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1));
      sums_32 = _mm256_add_epi32(sums_32, _mm256_add_epi32(lo, hi));
    }

    src += src_stride << 4;
    ref += ref_stride << 4;
    sec += 16 << 4;  // 16 rows of 16 contiguous predictor pixels
  }
  return calc_final(sums_32);
}
485 | | |
// SAD of a 16x16 high-bitdepth block against the average of ref and
// second_pred. 16 rows fit in the 16-bit accumulator, so the widening
// to 32 bits happens exactly once.
unsigned int vpx_highbd_sad16x16_avg_avx2(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride,
                                          const uint8_t *second_pred) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);

  {
    const __m256i lo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16));
    const __m256i hi =
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1));
    return calc_final(_mm256_add_epi32(lo, hi));
  }
}
505 | | |
// SAD of a 16x8 high-bitdepth block against the average of ref and
// second_pred. 8 rows fit in the 16-bit accumulator, so the widening
// to 32 bits happens exactly once.
unsigned int vpx_highbd_sad16x8_avg_avx2(const uint8_t *src_ptr, int src_stride,
                                         const uint8_t *ref_ptr, int ref_stride,
                                         const uint8_t *second_pred) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8);

  {
    const __m256i lo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16));
    const __m256i hi =
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1));
    return calc_final(_mm256_add_epi32(lo, hi));
  }
}