/src/aom/av1/common/x86/intra_edge_sse4.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2017, Alliance for Open Media. All rights reserved |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include <assert.h> |
13 | | #include <smmintrin.h> |
14 | | |
15 | | #include "config/aom_config.h" |
16 | | #include "config/av1_rtcd.h" |
17 | | |
18 | 3.20M | void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) { |
19 | 3.20M | if (!strength) return; |
20 | | |
21 | 2.16M | DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = { |
22 | 2.16M | { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 }, // strength 1: 4,8,4 |
23 | 2.16M | { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 }, // strength 2: 5,6,5 |
24 | 2.16M | { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 } // strength 3: 2,4,4,4,2 |
25 | 2.16M | }; |
26 | | |
27 | 2.16M | DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = { |
28 | 2.16M | { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }, |
29 | 2.16M | { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }, |
30 | 2.16M | { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 }, |
31 | 2.16M | { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, |
32 | 2.16M | }; |
33 | | |
34 | | // Extend the first and last samples to simplify the loop for the 5-tap case |
35 | 2.16M | p[-1] = p[0]; |
36 | 2.16M | __m128i last = _mm_set1_epi8((char)p[sz - 1]); |
37 | 2.16M | _mm_storeu_si128((__m128i *)&p[sz], last); |
38 | | |
39 | | // Adjust input pointer for filter support area |
40 | 2.16M | uint8_t *in = (strength == 3) ? p - 1 : p; |
41 | | |
42 | | // Avoid modifying first sample |
43 | 2.16M | uint8_t *out = p + 1; |
44 | 2.16M | int len = sz - 1; |
45 | | |
46 | 2.16M | const int use_3tap_filter = (strength < 3); |
47 | | |
48 | 2.16M | if (use_3tap_filter) { |
49 | 897k | __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); |
50 | 897k | __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]); |
51 | 897k | __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]); |
52 | 897k | __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]); |
53 | 897k | __m128i in0 = _mm_lddqu_si128((__m128i *)in); |
54 | 2.52M | while (len > 0) { |
55 | 1.62M | int n_out = (len < 8) ? len : 8; |
56 | 1.62M | __m128i d0 = _mm_shuffle_epi8(in0, shuf0); |
57 | 1.62M | __m128i d1 = _mm_shuffle_epi8(in0, shuf1); |
58 | 1.62M | d0 = _mm_maddubs_epi16(d0, coef0); |
59 | 1.62M | d1 = _mm_maddubs_epi16(d1, coef0); |
60 | 1.62M | d0 = _mm_hadd_epi16(d0, d1); |
61 | 1.62M | __m128i eight = _mm_set1_epi16(8); |
62 | 1.62M | d0 = _mm_add_epi16(d0, eight); |
63 | 1.62M | d0 = _mm_srai_epi16(d0, 4); |
64 | 1.62M | d0 = _mm_packus_epi16(d0, d0); |
65 | 1.62M | __m128i out0 = _mm_lddqu_si128((__m128i *)out); |
66 | 1.62M | __m128i n0 = _mm_set1_epi8(n_out); |
67 | 1.62M | __m128i mask = _mm_cmpgt_epi8(n0, iden); |
68 | 1.62M | out0 = _mm_blendv_epi8(out0, d0, mask); |
69 | 1.62M | _mm_storel_epi64((__m128i *)out, out0); |
70 | 1.62M | __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16)); |
71 | 1.62M | in0 = _mm_alignr_epi8(in1, in0, 8); |
72 | 1.62M | in += 8; |
73 | 1.62M | out += 8; |
74 | 1.62M | len -= n_out; |
75 | 1.62M | } |
76 | 1.26M | } else { // 5-tap filter |
77 | 1.26M | __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); |
78 | 1.26M | __m128i two = _mm_set1_epi8(2); |
79 | 1.26M | __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]); |
80 | 1.26M | __m128i shuf_b = _mm_add_epi8(shuf_a, two); |
81 | 1.26M | __m128i shuf_c = _mm_add_epi8(shuf_b, two); |
82 | 1.26M | __m128i shuf_d = _mm_add_epi8(shuf_c, two); |
83 | 1.26M | __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]); |
84 | 1.26M | __m128i in0 = _mm_lddqu_si128((__m128i *)in); |
85 | 6.43M | while (len > 0) { |
86 | 5.16M | int n_out = (len < 8) ? len : 8; |
87 | 5.16M | __m128i d0 = _mm_shuffle_epi8(in0, shuf_a); |
88 | 5.16M | __m128i d1 = _mm_shuffle_epi8(in0, shuf_b); |
89 | 5.16M | __m128i d2 = _mm_shuffle_epi8(in0, shuf_c); |
90 | 5.16M | __m128i d3 = _mm_shuffle_epi8(in0, shuf_d); |
91 | 5.16M | d0 = _mm_maddubs_epi16(d0, coef0); |
92 | 5.16M | d1 = _mm_maddubs_epi16(d1, coef0); |
93 | 5.16M | d2 = _mm_maddubs_epi16(d2, coef0); |
94 | 5.16M | d3 = _mm_maddubs_epi16(d3, coef0); |
95 | 5.16M | d0 = _mm_hadd_epi16(d0, d1); |
96 | 5.16M | d2 = _mm_hadd_epi16(d2, d3); |
97 | 5.16M | d0 = _mm_hadd_epi16(d0, d2); |
98 | 5.16M | __m128i eight = _mm_set1_epi16(8); |
99 | 5.16M | d0 = _mm_add_epi16(d0, eight); |
100 | 5.16M | d0 = _mm_srai_epi16(d0, 4); |
101 | 5.16M | d0 = _mm_packus_epi16(d0, d0); |
102 | 5.16M | __m128i out0 = _mm_lddqu_si128((__m128i *)out); |
103 | 5.16M | __m128i n0 = _mm_set1_epi8(n_out); |
104 | 5.16M | __m128i mask = _mm_cmpgt_epi8(n0, iden); |
105 | 5.16M | out0 = _mm_blendv_epi8(out0, d0, mask); |
106 | 5.16M | _mm_storel_epi64((__m128i *)out, out0); |
107 | 5.16M | __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16)); |
108 | 5.16M | in0 = _mm_alignr_epi8(in1, in0, 8); |
109 | 5.16M | in += 8; |
110 | 5.16M | out += 8; |
111 | 5.16M | len -= n_out; |
112 | 5.16M | } |
113 | 1.26M | } |
114 | 2.16M | } |
115 | | |
// High-bitdepth variant: smooth the 16-bit intra prediction edge p[0..sz-1]
// in place with a 3-tap (strength 1 or 2) or 5-tap (strength 3) filter.
// p[0] is left unmodified; strength 0 is a no-op. Writes padding into p[-1]
// and p[sz..sz+7], so the caller must provide that slack.
void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength) {
  if (!strength) return;

  // Tap pairs for _mm_mullo_epi16 on interleaved (sum, center) words.
  // Strength 3 uses {2,4} because 2,4,4,4,2 is computed below as
  // 2*(in0+in4) + 4*(in1+in2+in3).
  DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = {
    { 4, 8, 4, 8, 4, 8, 4, 8 },  // strength 1: 4,8,4
    { 5, 6, 5, 6, 5, 6, 5, 6 },  // strength 2: 5,6,5
    { 2, 4, 2, 4, 2, 4, 2, 4 }  // strength 3: 2,4,4,4,2
  };

  // Identity ramp 0..7 used to build the partial-store mask for the tail.
  DECLARE_ALIGNED(16, static const int16_t,
                  v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } };

  // Extend the first and last samples to simplify the loop for the 5-tap case
  p[-1] = p[0];
  __m128i last = _mm_set1_epi16(p[sz - 1]);
  _mm_storeu_si128((__m128i *)&p[sz], last);

  // Adjust input pointer for filter support area
  uint16_t *in = (strength == 3) ? p - 1 : p;

  // Avoid modifying first sample
  uint16_t *out = p + 1;
  int len = sz - 1;

  const int use_3tap_filter = (strength < 3);

  if (use_3tap_filter) {
    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
    __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
    __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
    while (len > 0) {
      int n_out = (len < 8) ? len : 8;
      // in1/in2 are the input shifted by 1 and 2 samples; the two outer taps
      // share a coefficient, so they are summed first (in0 + in2).
      __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
      __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
      __m128i in02 = _mm_add_epi16(in0, in2);
      // Interleave (outer sum, center) so mullo+hadd yields one 16-bit
      // result per output sample.
      __m128i d0 = _mm_unpacklo_epi16(in02, in1);
      __m128i d1 = _mm_unpackhi_epi16(in02, in1);
      d0 = _mm_mullo_epi16(d0, coef0);
      d1 = _mm_mullo_epi16(d1, coef0);
      d0 = _mm_hadd_epi16(d0, d1);
      // Round to nearest and divide by 16 (taps sum to 16); logical shift is
      // safe because the sums are non-negative.
      __m128i eight = _mm_set1_epi16(8);
      d0 = _mm_add_epi16(d0, eight);
      d0 = _mm_srli_epi16(d0, 4);
      // Blend so only the first n_out lanes are replaced; the tail keeps the
      // existing samples beyond the edge.
      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
      __m128i n0 = _mm_set1_epi16(n_out);
      __m128i mask = _mm_cmpgt_epi16(n0, iden);
      out0 = _mm_blendv_epi8(out0, d0, mask);
      _mm_storeu_si128((__m128i *)out, out0);
      // Advance the 16-sample sliding window by 8 samples.
      in += 8;
      in0 = in8;
      in8 = _mm_lddqu_si128((__m128i *)&in[8]);
      out += 8;
      len -= n_out;
    }
  } else {  // 5-tap filter
    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
    __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
    __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
    while (len > 0) {
      int n_out = (len < 8) ? len : 8;
      // Five shifted views of the input; taps 2,4,4,4,2 are grouped as
      // 2*(in0+in4) + 4*(in1+in2+in3) using the {2,4} kernel.
      __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
      __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
      __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
      __m128i in4 = _mm_alignr_epi8(in8, in0, 8);
      __m128i in04 = _mm_add_epi16(in0, in4);
      __m128i in123 = _mm_add_epi16(in1, in2);
      in123 = _mm_add_epi16(in123, in3);
      __m128i d0 = _mm_unpacklo_epi16(in04, in123);
      __m128i d1 = _mm_unpackhi_epi16(in04, in123);
      d0 = _mm_mullo_epi16(d0, coef0);
      d1 = _mm_mullo_epi16(d1, coef0);
      d0 = _mm_hadd_epi16(d0, d1);
      // Round and normalize by 16.
      __m128i eight = _mm_set1_epi16(8);
      d0 = _mm_add_epi16(d0, eight);
      d0 = _mm_srli_epi16(d0, 4);
      // Partial store via blend: only the first n_out lanes change.
      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
      __m128i n0 = _mm_set1_epi16(n_out);
      __m128i mask = _mm_cmpgt_epi16(n0, iden);
      out0 = _mm_blendv_epi8(out0, d0, mask);
      _mm_storeu_si128((__m128i *)out, out0);
      in += 8;
      in0 = in8;
      in8 = _mm_lddqu_si128((__m128i *)&in[8]);
      out += 8;
      len -= n_out;
    }
  }
}
206 | | |
// Upsample the 8-bit intra edge by 2x in place: interpolate half-sample
// positions with a 4-tap (-1, 9, 9, -1)/16 filter and interleave them with
// the original samples. Reads p[-1]..p[sz-1] and writes the doubled edge
// starting at p[-2]; the caller must provide slack on both sides.
// NOTE(review): the 16-byte loads at &in[0]/&in[16] can read past the valid
// edge samples — presumably the caller's edge buffer is sized for this
// (sz <= 24); verify against callers.
void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) {
  // interpolate half-sample positions
  assert(sz <= 24);

  // 4-tap taps laid out for _mm_maddubs_epi16, one window per 4-byte group.
  DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
    { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
  };

  // Shuffles gathering the overlapping 4-tap windows for 4 outputs each.
  DECLARE_ALIGNED(
      16, static const int8_t,
      v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
                          { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } };

  // Extend first/last samples (upper-left p[-1], last p[sz-1])
  // to support 4-tap filter
  p[-2] = p[-1];
  p[sz] = p[sz - 1];

  uint8_t *in = &p[-2];
  uint8_t *out = &p[-2];

  int n = sz + 1;  // Input length including upper-left sample

  __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
  __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);

  __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
  __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]);
  __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]);

  while (n > 0) {
    // Process 16 input samples per iteration: low 8 from in0, high 8 from
    // the 8-byte-shifted view in8.
    __m128i in8 = _mm_alignr_epi8(in16, in0, 8);
    __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
    __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
    __m128i d2 = _mm_shuffle_epi8(in8, shuf0);
    __m128i d3 = _mm_shuffle_epi8(in8, shuf1);
    d0 = _mm_maddubs_epi16(d0, coef0);
    d1 = _mm_maddubs_epi16(d1, coef0);
    d2 = _mm_maddubs_epi16(d2, coef0);
    d3 = _mm_maddubs_epi16(d3, coef0);
    d0 = _mm_hadd_epi16(d0, d1);
    d2 = _mm_hadd_epi16(d2, d3);
    // Round to nearest, divide by 16 (|taps| sum to 16), saturate to bytes.
    __m128i eight = _mm_set1_epi16(8);
    d0 = _mm_add_epi16(d0, eight);
    d2 = _mm_add_epi16(d2, eight);
    d0 = _mm_srai_epi16(d0, 4);
    d2 = _mm_srai_epi16(d2, 4);
    d0 = _mm_packus_epi16(d0, d2);
    // Interleave original samples (in1, shifted by one to skip the duplicated
    // leading sample) with the interpolated half-samples.
    __m128i in1 = _mm_alignr_epi8(in16, in0, 1);
    __m128i out0 = _mm_unpacklo_epi8(in1, d0);
    __m128i out1 = _mm_unpackhi_epi8(in1, d0);
    _mm_storeu_si128((__m128i *)&out[0], out0);
    _mm_storeu_si128((__m128i *)&out[16], out1);
    // Slide the window: since sz <= 24 there are at most two iterations, so
    // the refill beyond in16 is just zeros.
    in0 = in16;
    in16 = _mm_setzero_si128();
    out += 32;
    n -= 16;
  }
}
266 | | |
267 | 591k | void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd) { |
268 | | // interpolate half-sample positions |
269 | 591k | assert(sz <= 24); |
270 | | |
271 | 591k | DECLARE_ALIGNED(16, static const int16_t, |
272 | 591k | kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } }; |
273 | | |
274 | | // Extend first/last samples (upper-left p[-1], last p[sz-1]) |
275 | | // to support 4-tap filter |
276 | 591k | p[-2] = p[-1]; |
277 | 591k | p[sz] = p[sz - 1]; |
278 | | |
279 | 591k | uint16_t *in = &p[-2]; |
280 | 591k | uint16_t *out = in; |
281 | 591k | int n = sz + 1; |
282 | | |
283 | 591k | __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); |
284 | 591k | __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); |
285 | 591k | __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]); |
286 | 591k | __m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]); |
287 | | |
288 | 1.68M | while (n > 0) { |
289 | 1.09M | __m128i in1 = _mm_alignr_epi8(in8, in0, 2); |
290 | 1.09M | __m128i in2 = _mm_alignr_epi8(in8, in0, 4); |
291 | 1.09M | __m128i in3 = _mm_alignr_epi8(in8, in0, 6); |
292 | 1.09M | __m128i sum0 = _mm_add_epi16(in0, in3); |
293 | 1.09M | __m128i sum1 = _mm_add_epi16(in1, in2); |
294 | 1.09M | __m128i d0 = _mm_unpacklo_epi16(sum0, sum1); |
295 | 1.09M | __m128i d1 = _mm_unpackhi_epi16(sum0, sum1); |
296 | 1.09M | __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]); |
297 | 1.09M | d0 = _mm_madd_epi16(d0, coef0); |
298 | 1.09M | d1 = _mm_madd_epi16(d1, coef0); |
299 | 1.09M | __m128i eight = _mm_set1_epi32(8); |
300 | 1.09M | d0 = _mm_add_epi32(d0, eight); |
301 | 1.09M | d1 = _mm_add_epi32(d1, eight); |
302 | 1.09M | d0 = _mm_srai_epi32(d0, 4); |
303 | 1.09M | d1 = _mm_srai_epi32(d1, 4); |
304 | 1.09M | d0 = _mm_packus_epi32(d0, d1); |
305 | 1.09M | __m128i max0 = _mm_set1_epi16((1 << bd) - 1); |
306 | 1.09M | d0 = _mm_min_epi16(d0, max0); |
307 | 1.09M | __m128i out0 = _mm_unpacklo_epi16(in1, d0); |
308 | 1.09M | __m128i out1 = _mm_unpackhi_epi16(in1, d0); |
309 | 1.09M | _mm_storeu_si128((__m128i *)&out[0], out0); |
310 | 1.09M | _mm_storeu_si128((__m128i *)&out[8], out1); |
311 | 1.09M | in0 = in8; |
312 | 1.09M | in8 = in16; |
313 | 1.09M | in16 = in24; |
314 | 1.09M | in24 = _mm_setzero_si128(); |
315 | 1.09M | out += 16; |
316 | 1.09M | n -= 8; |
317 | 1.09M | } |
318 | 591k | } |