/src/libvpx/vpx_dsp/x86/post_proc_sse2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2018 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #include <assert.h> |
12 | | #include <emmintrin.h> |
13 | | |
14 | | #include <stdio.h> |
15 | | |
16 | | #include "./vpx_dsp_rtcd.h" |
17 | | #include "vpx/vpx_integer.h" |
18 | | #include "vpx_dsp/x86/mem_sse2.h" |
19 | | |
20 | | extern const int16_t vpx_rv[]; |
21 | | |
 | | // Vertical (down-direction) macroblock post-processing filter, SSE2
 | | // version. Processes 8 pixel columns at a time. For each pixel it keeps a
 | | // running vertical sum and sum-of-squares over a 15-row window centered on
 | | // the pixel ([row - 7, row + 7], with the first/last rows replicated at the
 | | // borders). Where the variance proxy (15 * sumsq - sum^2) is below
 | | // `flimit`, the pixel is replaced by a dithered window average
 | | // ((vpx_rv[] + sum + pixel) >> 4); elsewhere it is left unchanged.
22 | | void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows,
23 | 43.1k | int cols, int flimit) {
24 | 43.1k | int col;
25 | 43.1k | const __m128i zero = _mm_setzero_si128();
26 | 43.1k | const __m128i f = _mm_set1_epi32(flimit);
 | | // Circular 8-row history of the *unfiltered* rows (8 x int16 lanes per
 | | // slot). Because the row leaving the window is read from here rather than
 | | // from dst, filtered results can be stored in place without feeding back
 | | // into the filter window.
27 | 43.1k | DECLARE_ALIGNED(16, int16_t, above_context[8 * 8]);
28 | |
29 | | // 8 columns are processed at a time.
30 | | // If rows is less than 8 the bottom border extension fails.
31 | 43.1k | assert(cols % 8 == 0);
32 | 43.1k | assert(rows >= 8);
33 | |
34 | 1.58M | for (col = 0; col < cols; col += 8) {
35 | 1.54M | int row, i;
 | | // Row 0 of this 8-column strip, widened from u8 to s16 lanes below.
36 | 1.54M | __m128i s = _mm_loadl_epi64((__m128i *)dst);
37 | 1.54M | __m128i sum, sumsq_0, sumsq_1;
38 | 1.54M | __m128i tmp_0, tmp_1;
39 | 1.54M | __m128i below_context = _mm_setzero_si128();
40 | |
41 | 1.54M | s = _mm_unpacklo_epi8(s, zero);
42 | |
 | | // Top border: replicate row 0 into all 8 history slots, standing in for
 | | // the 8 (virtual) rows above the image.
43 | 13.8M | for (i = 0; i < 8; ++i) {
44 | 12.3M | _mm_store_si128((__m128i *)above_context + i, s);
45 | 12.3M | }
46 | |
47 | | // sum *= 9
 | | // Row 0 counts 9 times (itself + the 8 replicated border rows).
48 | 1.54M | sum = _mm_slli_epi16(s, 3);
49 | 1.54M | sum = _mm_add_epi16(s, sum);
50 | |
51 | | // sum^2 * 9 == (sum * 9) * sum
 | | // 32-bit squares assembled from the 16-bit mullo/mulhi halves.
52 | 1.54M | tmp_0 = _mm_mullo_epi16(sum, s);
53 | 1.54M | tmp_1 = _mm_mulhi_epi16(sum, s);
54 | |
55 | 1.54M | sumsq_0 = _mm_unpacklo_epi16(tmp_0, tmp_1);
56 | 1.54M | sumsq_1 = _mm_unpackhi_epi16(tmp_0, tmp_1);
57 | |
58 | | // Prime sum/sumsq
 | | // Add rows 1..6 so the window [-8, 6] (15 samples, top-replicated) is
 | | // complete before the first loop iteration slides it to [-7, 7].
59 | 10.8M | for (i = 1; i <= 6; ++i) {
60 | 9.26M | __m128i a = _mm_loadl_epi64((__m128i *)(dst + i * pitch));
61 | 9.26M | a = _mm_unpacklo_epi8(a, zero);
62 | 9.26M | sum = _mm_add_epi16(sum, a);
 | | // a*a fits in 16 bits for pixel values, so zero-extend is enough here.
63 | 9.26M | a = _mm_mullo_epi16(a, a);
64 | 9.26M | sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(a, zero));
65 | 9.26M | sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(a, zero));
66 | 9.26M | }
67 | |
 | | // NOTE(review): the loop runs 8 rows past `rows`, and both the load of
 | | // this_row and the store below touch dst + row * pitch for
 | | // row in [rows, rows + 8). This assumes the caller provides at least 8
 | | // rows of readable/writable border below the visible area (the C
 | | // reference implementation also touches that region) — TODO confirm
 | | // against the frame-buffer border guarantees of the callers.
68 | 749M | for (row = 0; row < rows + 8; row++) {
 | | // Slot (row & 7) holds the unfiltered row that is now 8 above us
 | | // (or the replicated row 0 during the first 8 iterations).
69 | 748M | const __m128i above =
70 | 748M | _mm_load_si128((__m128i *)above_context + (row & 7));
71 | 748M | __m128i this_row = _mm_loadl_epi64((__m128i *)(dst + row * pitch));
72 | 748M | __m128i above_sq, below_sq;
73 | 748M | __m128i mask_0, mask_1;
74 | 748M | __m128i multmp_0, multmp_1;
75 | 748M | __m128i rv;
76 | 748M | __m128i out;
77 | |
78 | 748M | this_row = _mm_unpacklo_epi8(this_row, zero);
79 | |
 | | // Bottom border: once row + 7 reaches `rows`, below_context simply
 | | // keeps its previous value, replicating the last in-bounds row.
80 | 748M | if (row + 7 < rows) {
81 | | // Instead of copying the end context we just stop loading when we get
82 | | // to the last one.
83 | 724M | below_context = _mm_loadl_epi64((__m128i *)(dst + (row + 7) * pitch));
84 | 724M | below_context = _mm_unpacklo_epi8(below_context, zero);
85 | 724M | }
86 | |
 | | // Slide the window one row down: drop the row 8 above, add the row 7
 | | // below. The window is now the 15 rows [row - 7, row + 7].
87 | 748M | sum = _mm_sub_epi16(sum, above);
88 | 748M | sum = _mm_add_epi16(sum, below_context);
89 | |
90 | | // context^2 fits in 16 bits. Don't need to mulhi and combine. Just zero
91 | | // extend. Unfortunately we can't do below_sq - above_sq in 16 bits
92 | | // because x86 does not have unpack with sign extension.
93 | 748M | above_sq = _mm_mullo_epi16(above, above);
94 | 748M | sumsq_0 = _mm_sub_epi32(sumsq_0, _mm_unpacklo_epi16(above_sq, zero));
95 | 748M | sumsq_1 = _mm_sub_epi32(sumsq_1, _mm_unpackhi_epi16(above_sq, zero));
96 | |
97 | 748M | below_sq = _mm_mullo_epi16(below_context, below_context);
98 | 748M | sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(below_sq, zero));
99 | 748M | sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(below_sq, zero));
100 | |
101 | | // sumsq * 16 - sumsq == sumsq * 15
 | | // Variance test, scaled by the window size: 15 * sumsq - sum^2 < flimit.
102 | 748M | mask_0 = _mm_slli_epi32(sumsq_0, 4);
103 | 748M | mask_0 = _mm_sub_epi32(mask_0, sumsq_0);
104 | 748M | mask_1 = _mm_slli_epi32(sumsq_1, 4);
105 | 748M | mask_1 = _mm_sub_epi32(mask_1, sumsq_1);
106 | |
 | | // sum^2 as 32-bit values, again from the 16-bit mullo/mulhi halves.
107 | 748M | multmp_0 = _mm_mullo_epi16(sum, sum);
108 | 748M | multmp_1 = _mm_mulhi_epi16(sum, sum);
109 | |
110 | 748M | mask_0 = _mm_sub_epi32(mask_0, _mm_unpacklo_epi16(multmp_0, multmp_1));
111 | 748M | mask_1 = _mm_sub_epi32(mask_1, _mm_unpackhi_epi16(multmp_0, multmp_1));
112 | |
113 | | // mask - f gives a negative value when mask < f
114 | 748M | mask_0 = _mm_sub_epi32(mask_0, f);
115 | 748M | mask_1 = _mm_sub_epi32(mask_1, f);
116 | |
117 | | // Shift the sign bit down to create a mask
118 | 748M | mask_0 = _mm_srai_epi32(mask_0, 31);
119 | 748M | mask_1 = _mm_srai_epi32(mask_1, 31);
120 | |
 | | // Narrow the two 4x32-bit masks to one 8x16-bit mask (all-ones lanes
 | | // survive packs intact); mask_0 now selects "filter this pixel".
121 | 748M | mask_0 = _mm_packs_epi32(mask_0, mask_1);
122 | |
 | | // 8 consecutive dither values from the vpx_rv table. NOTE(review): the
 | | // unaligned load reads entries (row & 127) .. (row & 127) + 7, so the
 | | // table must extend at least 7 entries past index 127 — TODO confirm
 | | // against the vpx_rv definition.
123 | 748M | rv = _mm_loadu_si128((__m128i const *)(vpx_rv + (row & 127)));
124 | |
 | | // Filtered value: (dither + 15-row sum + center pixel) >> 4, i.e. a
 | | // dithered average of 16 terms. mask_1 is reused here as a temp.
125 | 748M | mask_1 = _mm_add_epi16(rv, sum);
126 | 748M | mask_1 = _mm_add_epi16(mask_1, this_row);
127 | 748M | mask_1 = _mm_srai_epi16(mask_1, 4);
128 | |
 | | // Per-lane select: filtered value where the variance test passed,
 | | // original pixel elsewhere.
129 | 748M | mask_1 = _mm_and_si128(mask_0, mask_1);
130 | 748M | mask_0 = _mm_andnot_si128(mask_0, this_row);
131 | 748M | out = _mm_or_si128(mask_1, mask_0);
132 | |
133 | 748M | _mm_storel_epi64((__m128i *)(dst + row * pitch),
134 | 748M | _mm_packus_epi16(out, zero));
135 | |
 | | // Save the *unfiltered* row into the history; (row + 8) & 7 == row & 7,
 | | // so this overwrites the slot consumed above, and the value will be
 | | // subtracted from the window 8 iterations from now.
136 | 748M | _mm_store_si128((__m128i *)above_context + ((row + 8) & 7), this_row);
137 | 748M | }
138 | |
 | | // Advance to the next 8-column strip.
139 | 1.54M | dst += 8;
140 | 1.54M | }
141 | 43.1k | }