/src/aom/av1/common/x86/cdef_block_avx2.c

Source (jump to first uncovered line)
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <string.h>

#include "aom_dsp/aom_simd.h"
#define SIMD_FUNC(name) name##_avx2
#include "av1/common/cdef_block_simd.h"

// Mask used to shuffle the elements present in 256bit register.
const int shuffle_reg_256bit[8] = { 0x0b0a0d0c, 0x07060908, 0x03020504,
                                    0x0f0e0100, 0x0b0a0d0c, 0x07060908,
                                    0x03020504, 0x0f0e0100 };

/* partial A is a 16-bit vector of the form:
[x8 - - x1 | x16 - - x9] and partial B has the form:
[0  y1 - y7 | 0 y9 - y15].
This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
(x7^2+y2^7)*C7 + (x8^2+0^2)*C8 on each 128-bit lane. Here the C1..C8 constants
are in const1 and const2. */
static INLINE __m256i fold_mul_and_sum_avx2(__m256i *partiala,
                                            __m256i *partialb,
                                            const __m256i *const1,
                                            const __m256i *const2) {
  __m256i tmp;
  /* Reverse partial B. */
  *partialb = _mm256_shuffle_epi8(
      *partialb, _mm256_loadu_si256((const __m256i *)shuffle_reg_256bit));

  /* Interleave the x and y values of identical indices and pair x8 with 0. */
  tmp = *partiala;
  *partiala = _mm256_unpacklo_epi16(*partiala, *partialb);
  *partialb = _mm256_unpackhi_epi16(tmp, *partialb);

  /* Square and add the corresponding x and y values. */
  *partiala = _mm256_madd_epi16(*partiala, *partiala);
  *partialb = _mm256_madd_epi16(*partialb, *partialb);
  /* Multiply by constant. */
  *partiala = _mm256_mullo_epi32(*partiala, *const1);
  *partialb = _mm256_mullo_epi32(*partialb, *const2);
  /* Sum all results. */
  *partiala = _mm256_add_epi32(*partiala, *partialb);
  return *partiala;
}

static INLINE __m256i hsum4_avx2(__m256i *x0, __m256i *x1, __m256i *x2,
                                 __m256i *x3) {
  const __m256i t0 = _mm256_unpacklo_epi32(*x0, *x1);
  const __m256i t1 = _mm256_unpacklo_epi32(*x2, *x3);
  const __m256i t2 = _mm256_unpackhi_epi32(*x0, *x1);
  const __m256i t3 = _mm256_unpackhi_epi32(*x2, *x3);

  *x0 = _mm256_unpacklo_epi64(t0, t1);
  *x1 = _mm256_unpackhi_epi64(t0, t1);
  *x2 = _mm256_unpacklo_epi64(t2, t3);
  *x3 = _mm256_unpackhi_epi64(t2, t3);
  return _mm256_add_epi32(_mm256_add_epi32(*x0, *x1),
                          _mm256_add_epi32(*x2, *x3));
}

/* Computes cost for directions 0, 5, 6 and 7. We can call this function again
to compute the remaining directions. */
static INLINE __m256i compute_directions_avx2(__m256i *lines,
                                              int32_t cost_frist_8x8[4],
                                              int32_t cost_second_8x8[4]) {
  __m256i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
  __m256i partial6;
  __m256i tmp;
  /* Partial sums for lines 0 and 1. */
  partial4a = _mm256_slli_si256(lines[0], 14);
  partial4b = _mm256_srli_si256(lines[0], 2);
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[1], 12));
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[1], 4));
  tmp = _mm256_add_epi16(lines[0], lines[1]);
  partial5a = _mm256_slli_si256(tmp, 10);
  partial5b = _mm256_srli_si256(tmp, 6);
  partial7a = _mm256_slli_si256(tmp, 4);
  partial7b = _mm256_srli_si256(tmp, 12);
  partial6 = tmp;

  /* Partial sums for lines 2 and 3. */
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[2], 10));
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[2], 6));
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[3], 8));
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[3], 8));
  tmp = _mm256_add_epi16(lines[2], lines[3]);
  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 8));
  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 8));
  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 6));
  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 10));
  partial6 = _mm256_add_epi16(partial6, tmp);

  /* Partial sums for lines 4 and 5. */
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[4], 6));
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[4], 10));
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[5], 4));
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[5], 12));
  tmp = _mm256_add_epi16(lines[4], lines[5]);
  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 6));
  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 10));
  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 8));
  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 8));
  partial6 = _mm256_add_epi16(partial6, tmp);

  /* Partial sums for lines 6 and 7. */
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[6], 2));
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[6], 14));
  partial4a = _mm256_add_epi16(partial4a, lines[7]);
  tmp = _mm256_add_epi16(lines[6], lines[7]);
  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 4));
  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 12));
  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 10));
  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 6));
  partial6 = _mm256_add_epi16(partial6, tmp);

  const __m256i const_reg_1 =
      _mm256_set_epi32(210, 280, 420, 840, 210, 280, 420, 840);
  const __m256i const_reg_2 =
      _mm256_set_epi32(105, 120, 140, 168, 105, 120, 140, 168);
  const __m256i const_reg_3 = _mm256_set_epi32(210, 420, 0, 0, 210, 420, 0, 0);
  const __m256i const_reg_4 =
      _mm256_set_epi32(105, 105, 105, 140, 105, 105, 105, 140);

  /* Compute costs in terms of partial sums. */
  partial4a =
      fold_mul_and_sum_avx2(&partial4a, &partial4b, &const_reg_1, &const_reg_2);
  partial7a =
      fold_mul_and_sum_avx2(&partial7a, &partial7b, &const_reg_3, &const_reg_4);
  partial5a =
      fold_mul_and_sum_avx2(&partial5a, &partial5b, &const_reg_3, &const_reg_4);
  partial6 = _mm256_madd_epi16(partial6, partial6);
  partial6 = _mm256_mullo_epi32(partial6, _mm256_set1_epi32(105));

  partial4a = hsum4_avx2(&partial4a, &partial5a, &partial6, &partial7a);
  _mm_storeu_si128((__m128i *)cost_frist_8x8,
                   _mm256_castsi256_si128(partial4a));
  _mm_storeu_si128((__m128i *)cost_second_8x8,
                   _mm256_extractf128_si256(partial4a, 1));

  return partial4a;
}

/* transpose and reverse the order of the lines -- equivalent to a 90-degree
counter-clockwise rotation of the pixels. */
static INLINE void array_reverse_transpose_8x8_avx2(__m256i *in, __m256i *res) {
  const __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
  const __m256i tr0_1 = _mm256_unpacklo_epi16(in[2], in[3]);
  const __m256i tr0_2 = _mm256_unpackhi_epi16(in[0], in[1]);
  const __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
  const __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
  const __m256i tr0_5 = _mm256_unpacklo_epi16(in[6], in[7]);
  const __m256i tr0_6 = _mm256_unpackhi_epi16(in[4], in[5]);
  const __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);

  const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1);
  const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_4, tr0_5);
  const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1);
  const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_4, tr0_5);
  const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_2, tr0_3);
  const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7);
  const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_2, tr0_3);
  const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7);

  res[7] = _mm256_unpacklo_epi64(tr1_0, tr1_1);
  res[6] = _mm256_unpackhi_epi64(tr1_0, tr1_1);
  res[5] = _mm256_unpacklo_epi64(tr1_2, tr1_3);
  res[4] = _mm256_unpackhi_epi64(tr1_2, tr1_3);
  res[3] = _mm256_unpacklo_epi64(tr1_4, tr1_5);
  res[2] = _mm256_unpackhi_epi64(tr1_4, tr1_5);
  res[1] = _mm256_unpacklo_epi64(tr1_6, tr1_7);
  res[0] = _mm256_unpackhi_epi64(tr1_6, tr1_7);
}

void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2,
                             int stride, int32_t *var_out_1st,
                             int32_t *var_out_2nd, int coeff_shift,
                             int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
  int32_t cost_first_8x8[8];
  int32_t cost_second_8x8[8];
  // Used to store the best cost for 2 8x8's.
  int32_t best_cost[2] = { 0 };
  // Best direction for 2 8x8's.
  int best_dir[2] = { 0 };

  const __m128i const_coeff_shift_reg = _mm_cvtsi32_si128(coeff_shift);
  const __m256i const_128_reg = _mm256_set1_epi16(128);
  __m256i lines[8];
  for (int i = 0; i < 8; i++) {
    const __m128i src_1 = _mm_loadu_si128((const __m128i *)&img1[i * stride]);
    const __m128i src_2 = _mm_loadu_si128((const __m128i *)&img2[i * stride]);

    lines[i] = _mm256_insertf128_si256(_mm256_castsi128_si256(src_1), src_2, 1);
    lines[i] = _mm256_sub_epi16(
        _mm256_sra_epi16(lines[i], const_coeff_shift_reg), const_128_reg);
  }

  /* Compute "mostly vertical" directions. */
  const __m256i dir47 =
      compute_directions_avx2(lines, cost_first_8x8 + 4, cost_second_8x8 + 4);

  /* Transpose and reverse the order of the lines. */
  array_reverse_transpose_8x8_avx2(lines, lines);

  /* Compute "mostly horizontal" directions. */
  const __m256i dir03 =
      compute_directions_avx2(lines, cost_first_8x8, cost_second_8x8);

  __m256i max = _mm256_max_epi32(dir03, dir47);
  max =
      _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 8),
                                            _mm256_slli_si256(max, 16 - (8))));
  max =
      _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 4),
                                            _mm256_slli_si256(max, 16 - (4))));

  const __m128i first_8x8_output = _mm256_castsi256_si128(max);
  const __m128i second_8x8_output = _mm256_extractf128_si256(max, 1);
  const __m128i cmpeg_res_00 =
      _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir47));
  const __m128i cmpeg_res_01 =
      _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir03));
  const __m128i cmpeg_res_10 =
      _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir47, 1));
  const __m128i cmpeg_res_11 =
      _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir03, 1));
  const __m128i t_first_8x8 = _mm_packs_epi32(cmpeg_res_01, cmpeg_res_00);
  const __m128i t_second_8x8 = _mm_packs_epi32(cmpeg_res_11, cmpeg_res_10);

  best_cost[0] = _mm_cvtsi128_si32(_mm256_castsi256_si128(max));
  best_cost[1] = _mm_cvtsi128_si32(second_8x8_output);
  best_dir[0] = _mm_movemask_epi8(_mm_packs_epi16(t_first_8x8, t_first_8x8));
  best_dir[0] =
      get_msb(best_dir[0] ^ (best_dir[0] - 1));  // Count trailing zeros
  best_dir[1] = _mm_movemask_epi8(_mm_packs_epi16(t_second_8x8, t_second_8x8));
  best_dir[1] =
      get_msb(best_dir[1] ^ (best_dir[1] - 1));  // Count trailing zeros

  /* Difference between the optimal variance and the variance along the
     orthogonal direction. Again, the sum(x^2) terms cancel out. */
  *var_out_1st = best_cost[0] - cost_first_8x8[(best_dir[0] + 4) & 7];
  *var_out_2nd = best_cost[1] - cost_second_8x8[(best_dir[1] + 4) & 7];

  /* We'd normally divide by 840, but dividing by 1024 is close enough
  for what we're going to do with this. */
  *var_out_1st >>= 10;
  *var_out_2nd >>= 10;
  *out_dir_1st_8x8 = best_dir[0];
  *out_dir_2nd_8x8 = best_dir[1];
}

void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride,
                                        const uint8_t *src, int sstride,
                                        int width, int height) {
  int j = 0;
  int remaining_width = width;
  assert(height % 2 == 0);
  assert(height > 0);
  assert(width > 0);

  // Process multiple 32 pixels at a time.
  if (remaining_width > 31) {
    int i = 0;
    do {
      j = 0;
      do {
        __m128i row00 =
            _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + (j + 0)]);
        __m128i row01 = _mm_loadu_si128(
            (const __m128i *)&src[(i + 0) * sstride + (j + 16)]);
        __m128i row10 =
            _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + (j + 0)]);
        __m128i row11 = _mm_loadu_si128(
            (const __m128i *)&src[(i + 1) * sstride + (j + 16)]);
        _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 0)],
                            _mm256_cvtepu8_epi16(row00));
        _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 16)],
                            _mm256_cvtepu8_epi16(row01));
        _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 0)],
                            _mm256_cvtepu8_epi16(row10));
        _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 16)],
                            _mm256_cvtepu8_epi16(row11));
        j += 32;
      } while (j <= width - 32);
      i += 2;
    } while (i < height);
    remaining_width = width & 31;
  }

  // Process 16 pixels at a time.
  if (remaining_width > 15) {
    int i = 0;
    do {
      __m128i row0 =
          _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + j]);
      __m128i row1 =
          _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + j]);
      _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + j],
                          _mm256_cvtepu8_epi16(row0));
      _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + j],
                          _mm256_cvtepu8_epi16(row1));
      i += 2;
    } while (i < height);
    remaining_width = width & 15;
    j += 16;
  }

  // Process 8 pixels at a time.
  if (remaining_width > 7) {
    int i = 0;
    do {
      __m128i row0 =
          _mm_loadl_epi64((const __m128i *)&src[(i + 0) * sstride + j]);
      __m128i row1 =
          _mm_loadl_epi64((const __m128i *)&src[(i + 1) * sstride + j]);
      _mm_storeu_si128((__m128i *)&dst[(i + 0) * dstride + j],
                       _mm_unpacklo_epi8(row0, _mm_setzero_si128()));
      _mm_storeu_si128((__m128i *)&dst[(i + 1) * dstride + j],
                       _mm_unpacklo_epi8(row1, _mm_setzero_si128()));
      i += 2;
    } while (i < height);
    remaining_width = width & 7;
    j += 8;
  }

  // Process 4 pixels at a time.
  if (remaining_width > 3) {
    int i = 0;
    do {
      __m128i row0 =
          _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 0) * sstride + j]));
      __m128i row1 =
          _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 1) * sstride + j]));
      _mm_storel_epi64((__m128i *)&dst[(i + 0) * dstride + j],
                       _mm_unpacklo_epi8(row0, _mm_setzero_si128()));
      _mm_storel_epi64((__m128i *)&dst[(i + 1) * dstride + j],
                       _mm_unpacklo_epi8(row1, _mm_setzero_si128()));
      i += 2;
    } while (i < height);
    remaining_width = width & 3;
    j += 4;
  }

  // Process the remaining pixels.
  if (remaining_width) {
    for (int i = 0; i < height; i++) {
      for (int k = j; k < width; k++) {
        dst[i * dstride + k] = src[i * sstride + k];
      }
    }
  }
}

Coverage Report

Created: 2023-06-07 06:31

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright (c) 2016, Alliance for Open Media. All rights reserved
3		*
4		* This source code is subject to the terms of the BSD 2 Clause License and
5		* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6		* was not distributed with this source code in the LICENSE file, you can
7		* obtain it at www.aomedia.org/license/software. If the Alliance for Open
8		* Media Patent License 1.0 was not distributed with this source code in the
9		* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10		*/
11
12		#include "aom_dsp/aom_simd.h"
13		#define SIMD_FUNC(name) name##_avx2
14		#include "av1/common/cdef_block_simd.h"
15
16		// Mask used to shuffle the elements present in 256bit register.
17		const int shuffle_reg_256bit[8] = { 0x0b0a0d0c, 0x07060908, 0x03020504,
18		0x0f0e0100, 0x0b0a0d0c, 0x07060908,
19		0x03020504, 0x0f0e0100 };
20
21		/* partial A is a 16-bit vector of the form:
22		[x8 - - x1 | x16 - - x9] and partial B has the form:
23		[0 y1 - y7 | 0 y9 - y15].
24		This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
25		(x7^2+y7^2)*C7 + (x8^2+0^2)*C8 on each 128-bit lane. Here the C1..C8 constants
26		are in const1 and const2. */
27		static INLINE __m256i fold_mul_and_sum_avx2(__m256i *partiala,
28		__m256i *partialb,
29		const __m256i *const1,
30	78.7M	const __m256i *const2) {
31	78.7M	__m256i tmp;
32		/* Reverse partial B. */
33	78.7M	*partialb = _mm256_shuffle_epi8(
34	78.7M	*partialb, _mm256_loadu_si256((const __m256i *)shuffle_reg_256bit));
35
36		/* Interleave the x and y values of identical indices and pair x8 with 0. */
37	78.7M	tmp = *partiala;
38	78.7M	*partiala = _mm256_unpacklo_epi16(*partiala, *partialb);
39	78.7M	*partialb = _mm256_unpackhi_epi16(tmp, *partialb);
40
41		/* Square and add the corresponding x and y values. */
42	78.7M	*partiala = _mm256_madd_epi16(*partiala, *partiala);
43	78.7M	*partialb = _mm256_madd_epi16(*partialb, *partialb);
44		/* Multiply by constant. */
45	78.7M	*partiala = _mm256_mullo_epi32(*partiala, *const1);
46	78.7M	*partialb = _mm256_mullo_epi32(*partialb, *const2);
47		/* Sum all results. */
48	78.7M	*partiala = _mm256_add_epi32(*partiala, *partialb);
49	78.7M	return *partiala;
50	78.7M	}
51
52		static INLINE __m256i hsum4_avx2(__m256i *x0, __m256i *x1, __m256i *x2,
53	26.2M	__m256i *x3) {
54	26.2M	const __m256i t0 = _mm256_unpacklo_epi32(*x0, *x1);
55	26.2M	const __m256i t1 = _mm256_unpacklo_epi32(*x2, *x3);
56	26.2M	const __m256i t2 = _mm256_unpackhi_epi32(*x0, *x1);
57	26.2M	const __m256i t3 = _mm256_unpackhi_epi32(*x2, *x3);
58
59	26.2M	*x0 = _mm256_unpacklo_epi64(t0, t1);
60	26.2M	*x1 = _mm256_unpackhi_epi64(t0, t1);
61	26.2M	*x2 = _mm256_unpacklo_epi64(t2, t3);
62	26.2M	*x3 = _mm256_unpackhi_epi64(t2, t3);
63	26.2M	return _mm256_add_epi32(_mm256_add_epi32(*x0, *x1),
64	26.2M	_mm256_add_epi32(*x2, *x3));
65	26.2M	}
66
67		/* Computes cost for directions 0, 5, 6 and 7. We can call this function again
68		to compute the remaining directions. */
69		static INLINE __m256i compute_directions_avx2(__m256i *lines,
70		int32_t cost_frist_8x8[4],
71	26.2M	int32_t cost_second_8x8[4]) {
72	26.2M	__m256i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
73	26.2M	__m256i partial6;
74	26.2M	__m256i tmp;
75		/* Partial sums for lines 0 and 1. */
76	26.2M	partial4a = _mm256_slli_si256(lines[0], 14);
77	26.2M	partial4b = _mm256_srli_si256(lines[0], 2);
78	26.2M	partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[1], 12));
79	26.2M	partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[1], 4));
80	26.2M	tmp = _mm256_add_epi16(lines[0], lines[1]);
81	26.2M	partial5a = _mm256_slli_si256(tmp, 10);
82	26.2M	partial5b = _mm256_srli_si256(tmp, 6);
83	26.2M	partial7a = _mm256_slli_si256(tmp, 4);
84	26.2M	partial7b = _mm256_srli_si256(tmp, 12);
85	26.2M	partial6 = tmp;
86
87		/* Partial sums for lines 2 and 3. */
88	26.2M	partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[2], 10));
89	26.2M	partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[2], 6));
90	26.2M	partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[3], 8));
91	26.2M	partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[3], 8));
92	26.2M	tmp = _mm256_add_epi16(lines[2], lines[3]);
93	26.2M	partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 8));
94	26.2M	partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 8));
95	26.2M	partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 6));
96	26.2M	partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 10));
97	26.2M	partial6 = _mm256_add_epi16(partial6, tmp);
98
99		/* Partial sums for lines 4 and 5. */
100	26.2M	partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[4], 6));
101	26.2M	partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[4], 10));
102	26.2M	partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[5], 4));
103	26.2M	partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[5], 12));
104	26.2M	tmp = _mm256_add_epi16(lines[4], lines[5]);
105	26.2M	partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 6));
106	26.2M	partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 10));
107	26.2M	partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 8));
108	26.2M	partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 8));
109	26.2M	partial6 = _mm256_add_epi16(partial6, tmp);
110
111		/* Partial sums for lines 6 and 7. */
112	26.2M	partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[6], 2));
113	26.2M	partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[6], 14));
114	26.2M	partial4a = _mm256_add_epi16(partial4a, lines[7]);
115	26.2M	tmp = _mm256_add_epi16(lines[6], lines[7]);
116	26.2M	partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 4));
117	26.2M	partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 12));
118	26.2M	partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 10));
119	26.2M	partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 6));
120	26.2M	partial6 = _mm256_add_epi16(partial6, tmp);
121
122	26.2M	const __m256i const_reg_1 =
123	26.2M	_mm256_set_epi32(210, 280, 420, 840, 210, 280, 420, 840);
124	26.2M	const __m256i const_reg_2 =
125	26.2M	_mm256_set_epi32(105, 120, 140, 168, 105, 120, 140, 168);
126	26.2M	const __m256i const_reg_3 = _mm256_set_epi32(210, 420, 0, 0, 210, 420, 0, 0);
127	26.2M	const __m256i const_reg_4 =
128	26.2M	_mm256_set_epi32(105, 105, 105, 140, 105, 105, 105, 140);
129
130		/* Compute costs in terms of partial sums. */
131	26.2M	partial4a =
132	26.2M	fold_mul_and_sum_avx2(&partial4a, &partial4b, &const_reg_1, &const_reg_2);
133	26.2M	partial7a =
134	26.2M	fold_mul_and_sum_avx2(&partial7a, &partial7b, &const_reg_3, &const_reg_4);
135	26.2M	partial5a =
136	26.2M	fold_mul_and_sum_avx2(&partial5a, &partial5b, &const_reg_3, &const_reg_4);
137	26.2M	partial6 = _mm256_madd_epi16(partial6, partial6);
138	26.2M	partial6 = _mm256_mullo_epi32(partial6, _mm256_set1_epi32(105));
139
140	26.2M	partial4a = hsum4_avx2(&partial4a, &partial5a, &partial6, &partial7a);
141	26.2M	_mm_storeu_si128((__m128i *)cost_frist_8x8,
142	26.2M	_mm256_castsi256_si128(partial4a));
143	26.2M	_mm_storeu_si128((__m128i *)cost_second_8x8,
144	26.2M	_mm256_extractf128_si256(partial4a, 1));
145
146	26.2M	return partial4a;
147	26.2M	}
148
149		/* transpose and reverse the order of the lines -- equivalent to a 90-degree
150		counter-clockwise rotation of the pixels. */
151	13.1M	static INLINE void array_reverse_transpose_8x8_avx2(__m256i *in, __m256i *res) {
152	13.1M	const __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
153	13.1M	const __m256i tr0_1 = _mm256_unpacklo_epi16(in[2], in[3]);
154	13.1M	const __m256i tr0_2 = _mm256_unpackhi_epi16(in[0], in[1]);
155	13.1M	const __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
156	13.1M	const __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
157	13.1M	const __m256i tr0_5 = _mm256_unpacklo_epi16(in[6], in[7]);
158	13.1M	const __m256i tr0_6 = _mm256_unpackhi_epi16(in[4], in[5]);
159	13.1M	const __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
160
161	13.1M	const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1);
162	13.1M	const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_4, tr0_5);
163	13.1M	const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1);
164	13.1M	const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_4, tr0_5);
165	13.1M	const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_2, tr0_3);
166	13.1M	const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7);
167	13.1M	const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_2, tr0_3);
168	13.1M	const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7);
169
170	13.1M	res[7] = _mm256_unpacklo_epi64(tr1_0, tr1_1);
171	13.1M	res[6] = _mm256_unpackhi_epi64(tr1_0, tr1_1);
172	13.1M	res[5] = _mm256_unpacklo_epi64(tr1_2, tr1_3);
173	13.1M	res[4] = _mm256_unpackhi_epi64(tr1_2, tr1_3);
174	13.1M	res[3] = _mm256_unpacklo_epi64(tr1_4, tr1_5);
175	13.1M	res[2] = _mm256_unpackhi_epi64(tr1_4, tr1_5);
176	13.1M	res[1] = _mm256_unpacklo_epi64(tr1_6, tr1_7);
177	13.1M	res[0] = _mm256_unpackhi_epi64(tr1_6, tr1_7);
178	13.1M	}
179
180		void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2,
181		int stride, int32_t *var_out_1st,
182		int32_t *var_out_2nd, int coeff_shift,
183	13.0M	int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
184	13.0M	int32_t cost_first_8x8[8];
185	13.0M	int32_t cost_second_8x8[8];
186		// Used to store the best cost for 2 8x8's.
187	13.0M	int32_t best_cost[2] = { 0 };
188		// Best direction for 2 8x8's.
189	13.0M	int best_dir[2] = { 0 };
190
191	13.0M	const __m128i const_coeff_shift_reg = _mm_cvtsi32_si128(coeff_shift);
192	13.0M	const __m256i const_128_reg = _mm256_set1_epi16(128);
193	13.0M	__m256i lines[8];
194	117M	for (int i = 0; i < 8; i++) {
195	104M	const __m128i src_1 = _mm_loadu_si128((const __m128i *)&img1[i * stride]);
196	104M	const __m128i src_2 = _mm_loadu_si128((const __m128i *)&img2[i * stride]);
197
198	104M	lines[i] = _mm256_insertf128_si256(_mm256_castsi128_si256(src_1), src_2, 1);
199	104M	lines[i] = _mm256_sub_epi16(
200	104M	_mm256_sra_epi16(lines[i], const_coeff_shift_reg), const_128_reg);
201	104M	}
202
203		/* Compute "mostly vertical" directions. */
204	13.0M	const __m256i dir47 =
205	13.0M	compute_directions_avx2(lines, cost_first_8x8 + 4, cost_second_8x8 + 4);
206
207		/* Transpose and reverse the order of the lines. */
208	13.0M	array_reverse_transpose_8x8_avx2(lines, lines);
209
210		/* Compute "mostly horizontal" directions. */
211	13.0M	const __m256i dir03 =
212	13.0M	compute_directions_avx2(lines, cost_first_8x8, cost_second_8x8);
213
214	13.0M	__m256i max = _mm256_max_epi32(dir03, dir47);
215	13.0M	max =
216	13.0M	_mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 8),
217	13.0M	_mm256_slli_si256(max, 16 - (8))));
218	13.0M	max =
219	13.0M	_mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 4),
220	13.0M	_mm256_slli_si256(max, 16 - (4))));
221
222	13.0M	const __m128i first_8x8_output = _mm256_castsi256_si128(max);
223	13.0M	const __m128i second_8x8_output = _mm256_extractf128_si256(max, 1);
224	13.0M	const __m128i cmpeg_res_00 =
225	13.0M	_mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir47));
226	13.0M	const __m128i cmpeg_res_01 =
227	13.0M	_mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir03));
228	13.0M	const __m128i cmpeg_res_10 =
229	13.0M	_mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir47, 1));
230	13.0M	const __m128i cmpeg_res_11 =
231	13.0M	_mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir03, 1));
232	13.0M	const __m128i t_first_8x8 = _mm_packs_epi32(cmpeg_res_01, cmpeg_res_00);
233	13.0M	const __m128i t_second_8x8 = _mm_packs_epi32(cmpeg_res_11, cmpeg_res_10);
234
235	13.0M	best_cost[0] = _mm_cvtsi128_si32(_mm256_castsi256_si128(max));
236	13.0M	best_cost[1] = _mm_cvtsi128_si32(second_8x8_output);
237	13.0M	best_dir[0] = _mm_movemask_epi8(_mm_packs_epi16(t_first_8x8, t_first_8x8));
238	13.0M	best_dir[0] =
239	13.0M	get_msb(best_dir[0] ^ (best_dir[0] - 1)); // Count trailing zeros
240	13.0M	best_dir[1] = _mm_movemask_epi8(_mm_packs_epi16(t_second_8x8, t_second_8x8));
241	13.0M	best_dir[1] =
242	13.0M	get_msb(best_dir[1] ^ (best_dir[1] - 1)); // Count trailing zeros
243
244		/* Difference between the optimal variance and the variance along the
245		orthogonal direction. Again, the sum(x^2) terms cancel out. */
246	13.0M	*var_out_1st = best_cost[0] - cost_first_8x8[(best_dir[0] + 4) & 7];
247	13.0M	*var_out_2nd = best_cost[1] - cost_second_8x8[(best_dir[1] + 4) & 7];
248
249		/* We'd normally divide by 840, but dividing by 1024 is close enough
250		for what we're going to do with this. */
251	13.0M	*var_out_1st >>= 10;
252	13.0M	*var_out_2nd >>= 10;
253	13.0M	*out_dir_1st_8x8 = best_dir[0];
254	13.0M	*out_dir_2nd_8x8 = best_dir[1];
255	13.0M	}
256
257		void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride,
258		const uint8_t *src, int sstride,
259	1.12M	int width, int height) {
260	1.12M	int j = 0;
261	1.12M	int remaining_width = width;
262	1.12M	assert(height % 2 == 0);
263	0	assert(height > 0);
264	0	assert(width > 0);
265
266		// Process multiple 32 pixels at a time.
267	1.12M	if (remaining_width > 31) {
268	1.11M	int i = 0;
269	20.2M	do {
270	20.2M	j = 0;
271	33.1M	do {
272	33.1M	__m128i row00 =
273	33.1M	_mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + (j + 0)]);
274	33.1M	__m128i row01 = _mm_loadu_si128(
275	33.1M	(const __m128i *)&src[(i + 0) * sstride + (j + 16)]);
276	33.1M	__m128i row10 =
277	33.1M	_mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + (j + 0)]);
278	33.1M	__m128i row11 = _mm_loadu_si128(
279	33.1M	(const __m128i *)&src[(i + 1) * sstride + (j + 16)]);
280	33.1M	_mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 0)],
281	33.1M	_mm256_cvtepu8_epi16(row00));
282	33.1M	_mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 16)],
283	33.1M	_mm256_cvtepu8_epi16(row01));
284	33.1M	_mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 0)],
285	33.1M	_mm256_cvtepu8_epi16(row10));
286	33.1M	_mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 16)],
287	33.1M	_mm256_cvtepu8_epi16(row11));
288	33.1M	j += 32;
289	33.1M	} while (j <= width - 32);
290	20.2M	i += 2;
291	20.2M	} while (i < height);
292	1.11M	remaining_width = width & 31;
293	1.11M	}
294
295		// Process 16 pixels at a time.
296	1.12M	if (remaining_width > 15) {
297	73.1k	int i = 0;
298	681k	do {
299	681k	__m128i row0 =
300	681k	_mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + j]);
301	681k	__m128i row1 =
302	681k	_mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + j]);
303	681k	_mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + j],
304	681k	_mm256_cvtepu8_epi16(row0));
305	681k	_mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + j],
306	681k	_mm256_cvtepu8_epi16(row1));
307	681k	i += 2;
308	681k	} while (i < height);
309	73.1k	remaining_width = width & 15;
310	73.1k	j += 16;
311	73.1k	}
312
313		// Process 8 pixels at a time.
314	1.12M	if (remaining_width > 7) {
315	909k	int i = 0;
316	19.1M	do {
317	19.1M	__m128i row0 =
318	19.1M	_mm_loadl_epi64((const __m128i *)&src[(i + 0) * sstride + j]);
319	19.1M	__m128i row1 =
320	19.1M	_mm_loadl_epi64((const __m128i *)&src[(i + 1) * sstride + j]);
321	19.1M	_mm_storeu_si128((__m128i *)&dst[(i + 0) * dstride + j],
322	19.1M	_mm_unpacklo_epi8(row0, _mm_setzero_si128()));
323	19.1M	_mm_storeu_si128((__m128i *)&dst[(i + 1) * dstride + j],
324	19.1M	_mm_unpacklo_epi8(row1, _mm_setzero_si128()));
325	19.1M	i += 2;
326	19.1M	} while (i < height);
327	909k	remaining_width = width & 7;
328	909k	j += 8;
329	909k	}
330
331		// Process 4 pixels at a time.
332	1.12M	if (remaining_width > 3) {
333	10.2k	int i = 0;
334	214k	do {
335	214k	__m128i row0 =
336	214k	_mm_cvtsi32_si128(*((const int32_t *)&src[(i + 0) * sstride + j]));
337	214k	__m128i row1 =
338	214k	_mm_cvtsi32_si128(*((const int32_t *)&src[(i + 1) * sstride + j]));
339	214k	_mm_storel_epi64((__m128i *)&dst[(i + 0) * dstride + j],
340	214k	_mm_unpacklo_epi8(row0, _mm_setzero_si128()));
341	214k	_mm_storel_epi64((__m128i *)&dst[(i + 1) * dstride + j],
342	214k	_mm_unpacklo_epi8(row1, _mm_setzero_si128()));
343	214k	i += 2;
344	214k	} while (i < height);
345	10.2k	remaining_width = width & 3;
346	10.2k	j += 4;
347	10.2k	}
348
349		// Process the remaining pixels.
350	1.12M	if (remaining_width) {
351	0	for (int i = 0; i < height; i++) {
352	0	for (int k = j; k < width; k++) {
353	0	dst[i * dstride + k] = src[i * sstride + k];
354	0	}
355	0	}
356	0	}
357	1.12M	}