/src/libvpx/vpx_dsp/x86/post_proc_sse2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2018 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #include <assert.h> |
12 | | #include <emmintrin.h> |
13 | | |
14 | | #include <stdio.h> |
15 | | |
16 | | #include "./vpx_dsp_rtcd.h" |
17 | | #include "vpx/vpx_integer.h" |
18 | | #include "vpx_dsp/x86/mem_sse2.h" |
19 | | |
20 | | extern const int16_t vpx_rv[]; |
21 | | |
 | | // Vertical (down-direction) macroblock post-processing filter, SSE2
 | | // version. Processes 8 pixel columns at a time. For each pixel it keeps a
 | | // running vertical sum and sum-of-squares over a 15-row window centered on
 | | // the pixel ([row - 7, row + 7], with the first/last rows replicated at the
 | | // borders). Where the variance proxy (15 * sumsq - sum^2) is below
 | | // `flimit`, the pixel is replaced by a dithered window average
 | | // ((vpx_rv[] + sum + pixel) >> 4); elsewhere it is left unchanged.
22 | | void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows,
23 | 43.1k | int cols, int flimit) {
24 | 43.1k | int col;
25 | 43.1k | const __m128i zero = _mm_setzero_si128();
26 | 43.1k | const __m128i f = _mm_set1_epi32(flimit);
 | | // Circular 8-row history of the *unfiltered* rows (8 x int16 lanes per
 | | // slot). Because the row leaving the window is read from here rather than
 | | // from dst, filtered results can be stored in place without feeding back
 | | // into the filter window.
27 | 43.1k | DECLARE_ALIGNED(16, int16_t, above_context[8 * 8]);
28 | |
29 | | // 8 columns are processed at a time.
30 | | // If rows is less than 8 the bottom border extension fails.
31 | 43.1k | assert(cols % 8 == 0);
32 | 43.1k | assert(rows >= 8);
33 | |
34 | 1.58M | for (col = 0; col < cols; col += 8) {
35 | 1.54M | int row, i;
 | | // Row 0 of this 8-column strip, widened from u8 to s16 lanes below.
36 | 1.54M | __m128i s = _mm_loadl_epi64((__m128i *)dst);
37 | 1.54M | __m128i sum, sumsq_0, sumsq_1;
38 | 1.54M | __m128i tmp_0, tmp_1;
39 | 1.54M | __m128i below_context = _mm_setzero_si128();
40 | |
41 | 1.54M | s = _mm_unpacklo_epi8(s, zero);
42 | |
 | | // Top border: replicate row 0 into all 8 history slots, standing in for
 | | // the 8 (virtual) rows above the image.
43 | 13.8M | for (i = 0; i < 8; ++i) {
44 | 12.3M | _mm_store_si128((__m128i *)above_context + i, s);
45 | 12.3M | }
46 | |
47 | | // sum *= 9
 | | // Row 0 counts 9 times (itself + the 8 replicated border rows).
48 | 1.54M | sum = _mm_slli_epi16(s, 3);
49 | 1.54M | sum = _mm_add_epi16(s, sum);
50 | |
51 | | // sum^2 * 9 == (sum * 9) * sum
 | | // 32-bit squares assembled from the 16-bit mullo/mulhi halves.
52 | 1.54M | tmp_0 = _mm_mullo_epi16(sum, s);
53 | 1.54M | tmp_1 = _mm_mulhi_epi16(sum, s);
54 | |
55 | 1.54M | sumsq_0 = _mm_unpacklo_epi16(tmp_0, tmp_1);
56 | 1.54M | sumsq_1 = _mm_unpackhi_epi16(tmp_0, tmp_1);
57 | |
58 | | // Prime sum/sumsq
 | | // Add rows 1..6 so the window [-8, 6] (15 samples, top-replicated) is
 | | // complete before the first loop iteration slides it to [-7, 7].
59 | 10.8M | for (i = 1; i <= 6; ++i) {
60 | 9.26M | __m128i a = _mm_loadl_epi64((__m128i *)(dst + i * pitch));
61 | 9.26M | a = _mm_unpacklo_epi8(a, zero);
62 | 9.26M | sum = _mm_add_epi16(sum, a);
 | | // a*a fits in 16 bits for pixel values, so zero-extend is enough here.
63 | 9.26M | a = _mm_mullo_epi16(a, a);
64 | 9.26M | sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(a, zero));
65 | 9.26M | sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(a, zero));
66 | 9.26M | }
67 | |
 | | // NOTE(review): the loop runs 8 rows past `rows`, and both the load of
 | | // this_row and the store below touch dst + row * pitch for
 | | // row in [rows, rows + 8). This assumes the caller provides at least 8
 | | // rows of readable/writable border below the visible area (the C
 | | // reference implementation also touches that region) — TODO confirm
 | | // against the frame-buffer border guarantees of the callers.
68 | 749M | for (row = 0; row < rows + 8; row++) {
 | | // Slot (row & 7) holds the unfiltered row that is now 8 above us
 | | // (or the replicated row 0 during the first 8 iterations).
69 | 748M | const __m128i above =
70 | 748M | _mm_load_si128((__m128i *)above_context + (row & 7));
71 | 748M | __m128i this_row = _mm_loadl_epi64((__m128i *)(dst + row * pitch));
72 | 748M | __m128i above_sq, below_sq;
73 | 748M | __m128i mask_0, mask_1;
74 | 748M | __m128i multmp_0, multmp_1;
75 | 748M | __m128i rv;
76 | 748M | __m128i out;
77 | |
78 | 748M | this_row = _mm_unpacklo_epi8(this_row, zero);
79 | |
 | | // Bottom border: once row + 7 reaches `rows`, below_context simply
 | | // keeps its previous value, replicating the last in-bounds row.
80 | 748M | if (row + 7 < rows) {
81 | | // Instead of copying the end context we just stop loading when we get
82 | | // to the last one.
83 | 724M | below_context = _mm_loadl_epi64((__m128i *)(dst + (row + 7) * pitch));
84 | 724M | below_context = _mm_unpacklo_epi8(below_context, zero);
85 | 724M | }
86 | |
 | | // Slide the window one row down: drop the row 8 above, add the row 7
 | | // below. The window is now the 15 rows [row - 7, row + 7].
87 | 748M | sum = _mm_sub_epi16(sum, above);
88 | 748M | sum = _mm_add_epi16(sum, below_context);
89 | |
90 | | // context^2 fits in 16 bits. Don't need to mulhi and combine. Just zero
91 | | // extend. Unfortunately we can't do below_sq - above_sq in 16 bits
92 | | // because x86 does not have unpack with sign extension.
93 | 748M | above_sq = _mm_mullo_epi16(above, above);
94 | 748M | sumsq_0 = _mm_sub_epi32(sumsq_0, _mm_unpacklo_epi16(above_sq, zero));
95 | 748M | sumsq_1 = _mm_sub_epi32(sumsq_1, _mm_unpackhi_epi16(above_sq, zero));
96 | |
97 | 748M | below_sq = _mm_mullo_epi16(below_context, below_context);
98 | 748M | sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(below_sq, zero));
99 | 748M | sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(below_sq, zero));
100 | |
101 | | // sumsq * 16 - sumsq == sumsq * 15
 | | // Variance test, scaled by the window size: 15 * sumsq - sum^2 < flimit.
102 | 748M | mask_0 = _mm_slli_epi32(sumsq_0, 4);
103 | 748M | mask_0 = _mm_sub_epi32(mask_0, sumsq_0);
104 | 748M | mask_1 = _mm_slli_epi32(sumsq_1, 4);
105 | 748M | mask_1 = _mm_sub_epi32(mask_1, sumsq_1);
106 | |
 | | // sum^2 as 32-bit values, again from the 16-bit mullo/mulhi halves.
107 | 748M | multmp_0 = _mm_mullo_epi16(sum, sum);
108 | 748M | multmp_1 = _mm_mulhi_epi16(sum, sum);
109 | |
110 | 748M | mask_0 = _mm_sub_epi32(mask_0, _mm_unpacklo_epi16(multmp_0, multmp_1));
111 | 748M | mask_1 = _mm_sub_epi32(mask_1, _mm_unpackhi_epi16(multmp_0, multmp_1));
112 | |
113 | | // mask - f gives a negative value when mask < f
114 | 748M | mask_0 = _mm_sub_epi32(mask_0, f);
115 | 748M | mask_1 = _mm_sub_epi32(mask_1, f);
116 | |
117 | | // Shift the sign bit down to create a mask
118 | 748M | mask_0 = _mm_srai_epi32(mask_0, 31);
119 | 748M | mask_1 = _mm_srai_epi32(mask_1, 31);
120 | |
 | | // Narrow the two 4x32-bit masks to one 8x16-bit mask (all-ones lanes
 | | // survive packs intact); mask_0 now selects "filter this pixel".
121 | 748M | mask_0 = _mm_packs_epi32(mask_0, mask_1);
122 | |
 | | // 8 consecutive dither values from the vpx_rv table. NOTE(review): the
 | | // unaligned load reads entries (row & 127) .. (row & 127) + 7, so the
 | | // table must extend at least 7 entries past index 127 — TODO confirm
 | | // against the vpx_rv definition.
123 | 748M | rv = _mm_loadu_si128((__m128i const *)(vpx_rv + (row & 127)));
124 | |
 | | // Filtered value: (dither + 15-row sum + center pixel) >> 4, i.e. a
 | | // dithered average of 16 terms. mask_1 is reused here as a temp.
125 | 748M | mask_1 = _mm_add_epi16(rv, sum);
126 | 748M | mask_1 = _mm_add_epi16(mask_1, this_row);
127 | 748M | mask_1 = _mm_srai_epi16(mask_1, 4);
128 | |
 | | // Per-lane select: filtered value where the variance test passed,
 | | // original pixel elsewhere.
129 | 748M | mask_1 = _mm_and_si128(mask_0, mask_1);
130 | 748M | mask_0 = _mm_andnot_si128(mask_0, this_row);
131 | 748M | out = _mm_or_si128(mask_1, mask_0);
132 | |
133 | 748M | _mm_storel_epi64((__m128i *)(dst + row * pitch),
134 | 748M | _mm_packus_epi16(out, zero));
135 | |
 | | // Save the *unfiltered* row into the history; (row + 8) & 7 == row & 7,
 | | // so this overwrites the slot consumed above, and the value will be
 | | // subtracted from the window 8 iterations from now.
136 | 748M | _mm_store_si128((__m128i *)above_context + ((row + 8) & 7), this_row);
137 | 748M | }
138 | |
 | | // Advance to the next 8-column strip.
139 | 1.54M | dst += 8;
140 | 1.54M | }
141 | 43.1k | }