/src/aom/av1/common/x86/intra_edge_sse4.c

Source
/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <smmintrin.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) {
  if (!strength) return;

  DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = {
    { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 },  // strength 1: 4,8,4
    { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 },  // strength 2: 5,6,5
    { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 }  // strength 3: 2,4,4,4,2
  };

  DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = {
    { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
    { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
  };

  // Extend the first and last samples to simplify the loop for the 5-tap case
  p[-1] = p[0];
  __m128i last = _mm_set1_epi8((char)p[sz - 1]);
  _mm_storeu_si128((__m128i *)&p[sz], last);

  // Adjust input pointer for filter support area
  uint8_t *in = (strength == 3) ? p - 1 : p;

  // Avoid modifying first sample
  uint8_t *out = p + 1;
  int len = sz - 1;

  const int use_3tap_filter = (strength < 3);

  if (use_3tap_filter) {
    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
    __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]);
    __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]);
    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
    __m128i in0 = _mm_lddqu_si128((__m128i *)in);
    while (len > 0) {
      int n_out = (len < 8) ? len : 8;
      __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
      __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
      d0 = _mm_maddubs_epi16(d0, coef0);
      d1 = _mm_maddubs_epi16(d1, coef0);
      d0 = _mm_hadd_epi16(d0, d1);
      __m128i eight = _mm_set1_epi16(8);
      d0 = _mm_add_epi16(d0, eight);
      d0 = _mm_srai_epi16(d0, 4);
      d0 = _mm_packus_epi16(d0, d0);
      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
      __m128i n0 = _mm_set1_epi8(n_out);
      __m128i mask = _mm_cmpgt_epi8(n0, iden);
      out0 = _mm_blendv_epi8(out0, d0, mask);
      _mm_storel_epi64((__m128i *)out, out0);
      __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
      in0 = _mm_alignr_epi8(in1, in0, 8);
      in += 8;
      out += 8;
      len -= n_out;
    }
  } else {  // 5-tap filter
    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
    __m128i two = _mm_set1_epi8(2);
    __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]);
    __m128i shuf_b = _mm_add_epi8(shuf_a, two);
    __m128i shuf_c = _mm_add_epi8(shuf_b, two);
    __m128i shuf_d = _mm_add_epi8(shuf_c, two);
    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
    __m128i in0 = _mm_lddqu_si128((__m128i *)in);
    while (len > 0) {
      int n_out = (len < 8) ? len : 8;
      __m128i d0 = _mm_shuffle_epi8(in0, shuf_a);
      __m128i d1 = _mm_shuffle_epi8(in0, shuf_b);
      __m128i d2 = _mm_shuffle_epi8(in0, shuf_c);
      __m128i d3 = _mm_shuffle_epi8(in0, shuf_d);
      d0 = _mm_maddubs_epi16(d0, coef0);
      d1 = _mm_maddubs_epi16(d1, coef0);
      d2 = _mm_maddubs_epi16(d2, coef0);
      d3 = _mm_maddubs_epi16(d3, coef0);
      d0 = _mm_hadd_epi16(d0, d1);
      d2 = _mm_hadd_epi16(d2, d3);
      d0 = _mm_hadd_epi16(d0, d2);
      __m128i eight = _mm_set1_epi16(8);
      d0 = _mm_add_epi16(d0, eight);
      d0 = _mm_srai_epi16(d0, 4);
      d0 = _mm_packus_epi16(d0, d0);
      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
      __m128i n0 = _mm_set1_epi8(n_out);
      __m128i mask = _mm_cmpgt_epi8(n0, iden);
      out0 = _mm_blendv_epi8(out0, d0, mask);
      _mm_storel_epi64((__m128i *)out, out0);
      __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
      in0 = _mm_alignr_epi8(in1, in0, 8);
      in += 8;
      out += 8;
      len -= n_out;
    }
  }
}

void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) {
  // interpolate half-sample positions
  assert(sz <= 24);

  DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
    { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
  };

  DECLARE_ALIGNED(
      16, static const int8_t,
      v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
                          { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } };

  // Extend first/last samples (upper-left p[-1], last p[sz-1])
  // to support 4-tap filter
  p[-2] = p[-1];
  p[sz] = p[sz - 1];

  uint8_t *in = &p[-2];
  uint8_t *out = &p[-2];

  int n = sz + 1;  // Input length including upper-left sample

  __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
  __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);

  __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
  __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]);
  __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]);

  while (n > 0) {
    __m128i in8 = _mm_alignr_epi8(in16, in0, 8);
    __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
    __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
    __m128i d2 = _mm_shuffle_epi8(in8, shuf0);
    __m128i d3 = _mm_shuffle_epi8(in8, shuf1);
    d0 = _mm_maddubs_epi16(d0, coef0);
    d1 = _mm_maddubs_epi16(d1, coef0);
    d2 = _mm_maddubs_epi16(d2, coef0);
    d3 = _mm_maddubs_epi16(d3, coef0);
    d0 = _mm_hadd_epi16(d0, d1);
    d2 = _mm_hadd_epi16(d2, d3);
    __m128i eight = _mm_set1_epi16(8);
    d0 = _mm_add_epi16(d0, eight);
    d2 = _mm_add_epi16(d2, eight);
    d0 = _mm_srai_epi16(d0, 4);
    d2 = _mm_srai_epi16(d2, 4);
    d0 = _mm_packus_epi16(d0, d2);
    __m128i in1 = _mm_alignr_epi8(in16, in0, 1);
    __m128i out0 = _mm_unpacklo_epi8(in1, d0);
    __m128i out1 = _mm_unpackhi_epi8(in1, d0);
    _mm_storeu_si128((__m128i *)&out[0], out0);
    _mm_storeu_si128((__m128i *)&out[16], out1);
    in0 = in16;
    in16 = _mm_setzero_si128();
    out += 32;
    n -= 16;
  }
}

#if CONFIG_AV1_HIGHBITDEPTH

void av1_highbd_filter_intra_edge_sse4_1(uint16_t *p, int sz, int strength) {
  if (!strength) return;

  DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = {
    { 4, 8, 4, 8, 4, 8, 4, 8 },  // strength 1: 4,8,4
    { 5, 6, 5, 6, 5, 6, 5, 6 },  // strength 2: 5,6,5
    { 2, 4, 2, 4, 2, 4, 2, 4 }   // strength 3: 2,4,4,4,2
  };

  DECLARE_ALIGNED(16, static const int16_t,
                  v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } };

  // Extend the first and last samples to simplify the loop for the 5-tap case
  p[-1] = p[0];
  __m128i last = _mm_set1_epi16(p[sz - 1]);
  _mm_storeu_si128((__m128i *)&p[sz], last);

  // Adjust input pointer for filter support area
  uint16_t *in = (strength == 3) ? p - 1 : p;

  // Avoid modifying first sample
  uint16_t *out = p + 1;
  int len = sz - 1;

  const int use_3tap_filter = (strength < 3);

  if (use_3tap_filter) {
    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
    __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
    __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
    while (len > 0) {
      int n_out = (len < 8) ? len : 8;
      __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
      __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
      __m128i in02 = _mm_add_epi16(in0, in2);
      __m128i d0 = _mm_unpacklo_epi16(in02, in1);
      __m128i d1 = _mm_unpackhi_epi16(in02, in1);
      d0 = _mm_mullo_epi16(d0, coef0);
      d1 = _mm_mullo_epi16(d1, coef0);
      d0 = _mm_hadd_epi16(d0, d1);
      __m128i eight = _mm_set1_epi16(8);
      d0 = _mm_add_epi16(d0, eight);
      d0 = _mm_srli_epi16(d0, 4);
      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
      __m128i n0 = _mm_set1_epi16(n_out);
      __m128i mask = _mm_cmpgt_epi16(n0, iden);
      out0 = _mm_blendv_epi8(out0, d0, mask);
      _mm_storeu_si128((__m128i *)out, out0);
      in += 8;
      in0 = in8;
      in8 = _mm_lddqu_si128((__m128i *)&in[8]);
      out += 8;
      len -= n_out;
    }
  } else {  // 5-tap filter
    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
    __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
    __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
    while (len > 0) {
      int n_out = (len < 8) ? len : 8;
      __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
      __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
      __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
      __m128i in4 = _mm_alignr_epi8(in8, in0, 8);
      __m128i in04 = _mm_add_epi16(in0, in4);
      __m128i in123 = _mm_add_epi16(in1, in2);
      in123 = _mm_add_epi16(in123, in3);
      __m128i d0 = _mm_unpacklo_epi16(in04, in123);
      __m128i d1 = _mm_unpackhi_epi16(in04, in123);
      d0 = _mm_mullo_epi16(d0, coef0);
      d1 = _mm_mullo_epi16(d1, coef0);
      d0 = _mm_hadd_epi16(d0, d1);
      __m128i eight = _mm_set1_epi16(8);
      d0 = _mm_add_epi16(d0, eight);
      d0 = _mm_srli_epi16(d0, 4);
      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
      __m128i n0 = _mm_set1_epi16(n_out);
      __m128i mask = _mm_cmpgt_epi16(n0, iden);
      out0 = _mm_blendv_epi8(out0, d0, mask);
      _mm_storeu_si128((__m128i *)out, out0);
      in += 8;
      in0 = in8;
      in8 = _mm_lddqu_si128((__m128i *)&in[8]);
      out += 8;
      len -= n_out;
    }
  }
}

void av1_highbd_upsample_intra_edge_sse4_1(uint16_t *p, int sz, int bd) {
  // interpolate half-sample positions
  assert(sz <= 24);

  DECLARE_ALIGNED(16, static const int16_t,
                  kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } };

  // Extend first/last samples (upper-left p[-1], last p[sz-1])
  // to support 4-tap filter
  p[-2] = p[-1];
  p[sz] = p[sz - 1];

  uint16_t *in = &p[-2];
  uint16_t *out = in;
  int n = sz + 1;

  __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
  __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
  __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
  __m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]);

  while (n > 0) {
    __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
    __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
    __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
    __m128i sum0 = _mm_add_epi16(in0, in3);
    __m128i sum1 = _mm_add_epi16(in1, in2);
    __m128i d0 = _mm_unpacklo_epi16(sum0, sum1);
    __m128i d1 = _mm_unpackhi_epi16(sum0, sum1);
    __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
    d0 = _mm_madd_epi16(d0, coef0);
    d1 = _mm_madd_epi16(d1, coef0);
    __m128i eight = _mm_set1_epi32(8);
    d0 = _mm_add_epi32(d0, eight);
    d1 = _mm_add_epi32(d1, eight);
    d0 = _mm_srai_epi32(d0, 4);
    d1 = _mm_srai_epi32(d1, 4);
    d0 = _mm_packus_epi32(d0, d1);
    __m128i max0 = _mm_set1_epi16((1 << bd) - 1);
    d0 = _mm_min_epi16(d0, max0);
    __m128i out0 = _mm_unpacklo_epi16(in1, d0);
    __m128i out1 = _mm_unpackhi_epi16(in1, d0);
    _mm_storeu_si128((__m128i *)&out[0], out0);
    _mm_storeu_si128((__m128i *)&out[8], out1);
    in0 = in8;
    in8 = in16;
    in16 = in24;
    in24 = _mm_setzero_si128();
    out += 16;
    n -= 8;
  }
}

#endif  // CONFIG_AV1_HIGHBITDEPTH

Coverage Report

Created: 2025-06-13 07:07

Line	Count	Source
1		/*
2		* Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3		*
4		* This source code is subject to the terms of the BSD 2 Clause License and
5		* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6		* was not distributed with this source code in the LICENSE file, you can
7		* obtain it at www.aomedia.org/license/software. If the Alliance for Open
8		* Media Patent License 1.0 was not distributed with this source code in the
9		* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10		*/
11
12		#include <assert.h>
13		#include <smmintrin.h>
14
15		#include "config/aom_config.h"
16		#include "config/av1_rtcd.h"
17
18	3.57M	void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) {
19	3.57M	if (!strength) return;
20
21	2.39M	DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = {
22	2.39M	{ 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 }, // strength 1: 4,8,4
23	2.39M	{ 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 }, // strength 2: 5,6,5
24	2.39M	{ 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 } // strength 3: 2,4,4,4,2
25	2.39M	};
26
27	2.39M	DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = {
28	2.39M	{ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
29	2.39M	{ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 },
30	2.39M	{ 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 },
31	2.39M	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
32	2.39M	};
33
34		// Extend the first and last samples to simplify the loop for the 5-tap case
35	2.39M	p[-1] = p[0];
36	2.39M	__m128i last = _mm_set1_epi8((char)p[sz - 1]);
37	2.39M	_mm_storeu_si128((__m128i *)&p[sz], last);
38
39		// Adjust input pointer for filter support area
40	2.39M	uint8_t *in = (strength == 3) ? p - 1 : p;
41
42		// Avoid modifying first sample
43	2.39M	uint8_t *out = p + 1;
44	2.39M	int len = sz - 1;
45
46	2.39M	const int use_3tap_filter = (strength < 3);
47
48	2.39M	if (use_3tap_filter) {
49	1.12M	__m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
50	1.12M	__m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]);
51	1.12M	__m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]);
52	1.12M	__m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
53	1.12M	__m128i in0 = _mm_lddqu_si128((__m128i *)in);
54	2.83M	while (len > 0) {
55	1.71M	int n_out = (len < 8) ? len : 8;
56	1.71M	__m128i d0 = _mm_shuffle_epi8(in0, shuf0);
57	1.71M	__m128i d1 = _mm_shuffle_epi8(in0, shuf1);
58	1.71M	d0 = _mm_maddubs_epi16(d0, coef0);
59	1.71M	d1 = _mm_maddubs_epi16(d1, coef0);
60	1.71M	d0 = _mm_hadd_epi16(d0, d1);
61	1.71M	__m128i eight = _mm_set1_epi16(8);
62	1.71M	d0 = _mm_add_epi16(d0, eight);
63	1.71M	d0 = _mm_srai_epi16(d0, 4);
64	1.71M	d0 = _mm_packus_epi16(d0, d0);
65	1.71M	__m128i out0 = _mm_lddqu_si128((__m128i *)out);
66	1.71M	__m128i n0 = _mm_set1_epi8(n_out);
67	1.71M	__m128i mask = _mm_cmpgt_epi8(n0, iden);
68	1.71M	out0 = _mm_blendv_epi8(out0, d0, mask);
69	1.71M	_mm_storel_epi64((__m128i *)out, out0);
70	1.71M	__m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
71	1.71M	in0 = _mm_alignr_epi8(in1, in0, 8);
72	1.71M	in += 8;
73	1.71M	out += 8;
74	1.71M	len -= n_out;
75	1.71M	}
76	1.26M	} else { // 5-tap filter
77	1.26M	__m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
78	1.26M	__m128i two = _mm_set1_epi8(2);
79	1.26M	__m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]);
80	1.26M	__m128i shuf_b = _mm_add_epi8(shuf_a, two);
81	1.26M	__m128i shuf_c = _mm_add_epi8(shuf_b, two);
82	1.26M	__m128i shuf_d = _mm_add_epi8(shuf_c, two);
83	1.26M	__m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
84	1.26M	__m128i in0 = _mm_lddqu_si128((__m128i *)in);
85	6.48M	while (len > 0) {
86	5.22M	int n_out = (len < 8) ? len : 8;
87	5.22M	__m128i d0 = _mm_shuffle_epi8(in0, shuf_a);
88	5.22M	__m128i d1 = _mm_shuffle_epi8(in0, shuf_b);
89	5.22M	__m128i d2 = _mm_shuffle_epi8(in0, shuf_c);
90	5.22M	__m128i d3 = _mm_shuffle_epi8(in0, shuf_d);
91	5.22M	d0 = _mm_maddubs_epi16(d0, coef0);
92	5.22M	d1 = _mm_maddubs_epi16(d1, coef0);
93	5.22M	d2 = _mm_maddubs_epi16(d2, coef0);
94	5.22M	d3 = _mm_maddubs_epi16(d3, coef0);
95	5.22M	d0 = _mm_hadd_epi16(d0, d1);
96	5.22M	d2 = _mm_hadd_epi16(d2, d3);
97	5.22M	d0 = _mm_hadd_epi16(d0, d2);
98	5.22M	__m128i eight = _mm_set1_epi16(8);
99	5.22M	d0 = _mm_add_epi16(d0, eight);
100	5.22M	d0 = _mm_srai_epi16(d0, 4);
101	5.22M	d0 = _mm_packus_epi16(d0, d0);
102	5.22M	__m128i out0 = _mm_lddqu_si128((__m128i *)out);
103	5.22M	__m128i n0 = _mm_set1_epi8(n_out);
104	5.22M	__m128i mask = _mm_cmpgt_epi8(n0, iden);
105	5.22M	out0 = _mm_blendv_epi8(out0, d0, mask);
106	5.22M	_mm_storel_epi64((__m128i *)out, out0);
107	5.22M	__m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
108	5.22M	in0 = _mm_alignr_epi8(in1, in0, 8);
109	5.22M	in += 8;
110	5.22M	out += 8;
111	5.22M	len -= n_out;
112	5.22M	}
113	1.26M	}
114	2.39M	}
115
116	976k	void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) {
117		// interpolate half-sample positions
118	976k	assert(sz <= 24);
119
120	976k	DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
121	976k	{ -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
122	976k	};
123
124	976k	DECLARE_ALIGNED(
125	976k	16, static const int8_t,
126	976k	v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
127	976k	{ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } };
128
129		// Extend first/last samples (upper-left p[-1], last p[sz-1])
130		// to support 4-tap filter
131	976k	p[-2] = p[-1];
132	976k	p[sz] = p[sz - 1];
133
134	976k	uint8_t *in = &p[-2];
135	976k	uint8_t *out = &p[-2];
136
137	976k	int n = sz + 1; // Input length including upper-left sample
138
139	976k	__m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
140	976k	__m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
141
142	976k	__m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
143	976k	__m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]);
144	976k	__m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]);
145
146	2.05M	while (n > 0) {
147	1.08M	__m128i in8 = _mm_alignr_epi8(in16, in0, 8);
148	1.08M	__m128i d0 = _mm_shuffle_epi8(in0, shuf0);
149	1.08M	__m128i d1 = _mm_shuffle_epi8(in0, shuf1);
150	1.08M	__m128i d2 = _mm_shuffle_epi8(in8, shuf0);
151	1.08M	__m128i d3 = _mm_shuffle_epi8(in8, shuf1);
152	1.08M	d0 = _mm_maddubs_epi16(d0, coef0);
153	1.08M	d1 = _mm_maddubs_epi16(d1, coef0);
154	1.08M	d2 = _mm_maddubs_epi16(d2, coef0);
155	1.08M	d3 = _mm_maddubs_epi16(d3, coef0);
156	1.08M	d0 = _mm_hadd_epi16(d0, d1);
157	1.08M	d2 = _mm_hadd_epi16(d2, d3);
158	1.08M	__m128i eight = _mm_set1_epi16(8);
159	1.08M	d0 = _mm_add_epi16(d0, eight);
160	1.08M	d2 = _mm_add_epi16(d2, eight);
161	1.08M	d0 = _mm_srai_epi16(d0, 4);
162	1.08M	d2 = _mm_srai_epi16(d2, 4);
163	1.08M	d0 = _mm_packus_epi16(d0, d2);
164	1.08M	__m128i in1 = _mm_alignr_epi8(in16, in0, 1);
165	1.08M	__m128i out0 = _mm_unpacklo_epi8(in1, d0);
166	1.08M	__m128i out1 = _mm_unpackhi_epi8(in1, d0);
167	1.08M	_mm_storeu_si128((__m128i *)&out[0], out0);
168	1.08M	_mm_storeu_si128((__m128i *)&out[16], out1);
169	1.08M	in0 = in16;
170	1.08M	in16 = _mm_setzero_si128();
171	1.08M	out += 32;
172	1.08M	n -= 16;
173	1.08M	}
174	976k	}
175
176		#if CONFIG_AV1_HIGHBITDEPTH
177
178	3.57M	void av1_highbd_filter_intra_edge_sse4_1(uint16_t *p, int sz, int strength) {
179	3.57M	if (!strength) return;
180
181	2.47M	DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = {
182	2.47M	{ 4, 8, 4, 8, 4, 8, 4, 8 }, // strength 1: 4,8,4
183	2.47M	{ 5, 6, 5, 6, 5, 6, 5, 6 }, // strength 2: 5,6,5
184	2.47M	{ 2, 4, 2, 4, 2, 4, 2, 4 } // strength 3: 2,4,4,4,2
185	2.47M	};
186
187	2.47M	DECLARE_ALIGNED(16, static const int16_t,
188	2.47M	v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } };
189
190		// Extend the first and last samples to simplify the loop for the 5-tap case
191	2.47M	p[-1] = p[0];
192	2.47M	__m128i last = _mm_set1_epi16(p[sz - 1]);
193	2.47M	_mm_storeu_si128((__m128i *)&p[sz], last);
194
195		// Adjust input pointer for filter support area
196	2.47M	uint16_t *in = (strength == 3) ? p - 1 : p;
197
198		// Avoid modifying first sample
199	2.47M	uint16_t *out = p + 1;
200	2.47M	int len = sz - 1;
201
202	2.47M	const int use_3tap_filter = (strength < 3);
203
204	2.47M	if (use_3tap_filter) {
205	1.00M	__m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
206	1.00M	__m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
207	1.00M	__m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
208	1.00M	__m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
209	2.76M	while (len > 0) {
210	1.76M	int n_out = (len < 8) ? len : 8;
211	1.76M	__m128i in1 = _mm_alignr_epi8(in8, in0, 2);
212	1.76M	__m128i in2 = _mm_alignr_epi8(in8, in0, 4);
213	1.76M	__m128i in02 = _mm_add_epi16(in0, in2);
214	1.76M	__m128i d0 = _mm_unpacklo_epi16(in02, in1);
215	1.76M	__m128i d1 = _mm_unpackhi_epi16(in02, in1);
216	1.76M	d0 = _mm_mullo_epi16(d0, coef0);
217	1.76M	d1 = _mm_mullo_epi16(d1, coef0);
218	1.76M	d0 = _mm_hadd_epi16(d0, d1);
219	1.76M	__m128i eight = _mm_set1_epi16(8);
220	1.76M	d0 = _mm_add_epi16(d0, eight);
221	1.76M	d0 = _mm_srli_epi16(d0, 4);
222	1.76M	__m128i out0 = _mm_lddqu_si128((__m128i *)out);
223	1.76M	__m128i n0 = _mm_set1_epi16(n_out);
224	1.76M	__m128i mask = _mm_cmpgt_epi16(n0, iden);
225	1.76M	out0 = _mm_blendv_epi8(out0, d0, mask);
226	1.76M	_mm_storeu_si128((__m128i *)out, out0);
227	1.76M	in += 8;
228	1.76M	in0 = in8;
229	1.76M	in8 = _mm_lddqu_si128((__m128i *)&in[8]);
230	1.76M	out += 8;
231	1.76M	len -= n_out;
232	1.76M	}
233	1.46M	} else { // 5-tap filter
234	1.46M	__m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
235	1.46M	__m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
236	1.46M	__m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
237	1.46M	__m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
238	7.42M	while (len > 0) {
239	5.95M	int n_out = (len < 8) ? len : 8;
240	5.95M	__m128i in1 = _mm_alignr_epi8(in8, in0, 2);
241	5.95M	__m128i in2 = _mm_alignr_epi8(in8, in0, 4);
242	5.95M	__m128i in3 = _mm_alignr_epi8(in8, in0, 6);
243	5.95M	__m128i in4 = _mm_alignr_epi8(in8, in0, 8);
244	5.95M	__m128i in04 = _mm_add_epi16(in0, in4);
245	5.95M	__m128i in123 = _mm_add_epi16(in1, in2);
246	5.95M	in123 = _mm_add_epi16(in123, in3);
247	5.95M	__m128i d0 = _mm_unpacklo_epi16(in04, in123);
248	5.95M	__m128i d1 = _mm_unpackhi_epi16(in04, in123);
249	5.95M	d0 = _mm_mullo_epi16(d0, coef0);
250	5.95M	d1 = _mm_mullo_epi16(d1, coef0);
251	5.95M	d0 = _mm_hadd_epi16(d0, d1);
252	5.95M	__m128i eight = _mm_set1_epi16(8);
253	5.95M	d0 = _mm_add_epi16(d0, eight);
254	5.95M	d0 = _mm_srli_epi16(d0, 4);
255	5.95M	__m128i out0 = _mm_lddqu_si128((__m128i *)out);
256	5.95M	__m128i n0 = _mm_set1_epi16(n_out);
257	5.95M	__m128i mask = _mm_cmpgt_epi16(n0, iden);
258	5.95M	out0 = _mm_blendv_epi8(out0, d0, mask);
259	5.95M	_mm_storeu_si128((__m128i *)out, out0);
260	5.95M	in += 8;
261	5.95M	in0 = in8;
262	5.95M	in8 = _mm_lddqu_si128((__m128i *)&in[8]);
263	5.95M	out += 8;
264	5.95M	len -= n_out;
265	5.95M	}
266	1.46M	}
267	2.47M	}
268
269	847k	void av1_highbd_upsample_intra_edge_sse4_1(uint16_t *p, int sz, int bd) {
270		// interpolate half-sample positions
271	847k	assert(sz <= 24);
272
273	847k	DECLARE_ALIGNED(16, static const int16_t,
274	847k	kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } };
275
276		// Extend first/last samples (upper-left p[-1], last p[sz-1])
277		// to support 4-tap filter
278	847k	p[-2] = p[-1];
279	847k	p[sz] = p[sz - 1];
280
281	847k	uint16_t *in = &p[-2];
282	847k	uint16_t *out = in;
283	847k	int n = sz + 1;
284
285	847k	__m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
286	847k	__m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
287	847k	__m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
288	847k	__m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]);
289
290	2.46M	while (n > 0) {
291	1.62M	__m128i in1 = _mm_alignr_epi8(in8, in0, 2);
292	1.62M	__m128i in2 = _mm_alignr_epi8(in8, in0, 4);
293	1.62M	__m128i in3 = _mm_alignr_epi8(in8, in0, 6);
294	1.62M	__m128i sum0 = _mm_add_epi16(in0, in3);
295	1.62M	__m128i sum1 = _mm_add_epi16(in1, in2);
296	1.62M	__m128i d0 = _mm_unpacklo_epi16(sum0, sum1);
297	1.62M	__m128i d1 = _mm_unpackhi_epi16(sum0, sum1);
298	1.62M	__m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
299	1.62M	d0 = _mm_madd_epi16(d0, coef0);
300	1.62M	d1 = _mm_madd_epi16(d1, coef0);
301	1.62M	__m128i eight = _mm_set1_epi32(8);
302	1.62M	d0 = _mm_add_epi32(d0, eight);
303	1.62M	d1 = _mm_add_epi32(d1, eight);
304	1.62M	d0 = _mm_srai_epi32(d0, 4);
305	1.62M	d1 = _mm_srai_epi32(d1, 4);
306	1.62M	d0 = _mm_packus_epi32(d0, d1);
307	1.62M	__m128i max0 = _mm_set1_epi16((1 << bd) - 1);
308	1.62M	d0 = _mm_min_epi16(d0, max0);
309	1.62M	__m128i out0 = _mm_unpacklo_epi16(in1, d0);
310	1.62M	__m128i out1 = _mm_unpackhi_epi16(in1, d0);
311	1.62M	_mm_storeu_si128((__m128i *)&out[0], out0);
312	1.62M	_mm_storeu_si128((__m128i *)&out[8], out1);
313	1.62M	in0 = in8;
314	1.62M	in8 = in16;
315	1.62M	in16 = in24;
316	1.62M	in24 = _mm_setzero_si128();
317	1.62M	out += 16;
318	1.62M	n -= 8;
319	1.62M	}
320	847k	}
321
322		#endif // CONFIG_AV1_HIGHBITDEPTH