/src/aom/av1/common/x86/intra_edge_sse4.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2017, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include <assert.h> |
13 | | #include <smmintrin.h> |
14 | | |
15 | | #include "config/aom_config.h" |
16 | | #include "config/av1_rtcd.h" |
17 | | |
18 | 3.57M | void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) { |
19 | 3.57M | if (!strength) return; |
20 | | |
21 | 2.39M | DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = { |
22 | 2.39M | { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 }, // strength 1: 4,8,4 |
23 | 2.39M | { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 }, // strength 2: 5,6,5 |
24 | 2.39M | { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 } // strength 3: 2,4,4,4,2 |
25 | 2.39M | }; |
26 | | |
27 | 2.39M | DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = { |
28 | 2.39M | { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }, |
29 | 2.39M | { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }, |
30 | 2.39M | { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 }, |
31 | 2.39M | { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, |
32 | 2.39M | }; |
33 | | |
34 | | // Extend the first and last samples to simplify the loop for the 5-tap case |
35 | 2.39M | p[-1] = p[0]; |
36 | 2.39M | __m128i last = _mm_set1_epi8((char)p[sz - 1]); |
37 | 2.39M | _mm_storeu_si128((__m128i *)&p[sz], last); |
38 | | |
39 | | // Adjust input pointer for filter support area |
40 | 2.39M | uint8_t *in = (strength == 3) ? p - 1 : p; |
41 | | |
42 | | // Avoid modifying first sample |
43 | 2.39M | uint8_t *out = p + 1; |
44 | 2.39M | int len = sz - 1; |
45 | | |
46 | 2.39M | const int use_3tap_filter = (strength < 3); |
47 | | |
48 | 2.39M | if (use_3tap_filter) { |
49 | 1.12M | __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); |
50 | 1.12M | __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]); |
51 | 1.12M | __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]); |
52 | 1.12M | __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]); |
53 | 1.12M | __m128i in0 = _mm_lddqu_si128((__m128i *)in); |
54 | 2.83M | while (len > 0) { |
55 | 1.71M | int n_out = (len < 8) ? len : 8; |
56 | 1.71M | __m128i d0 = _mm_shuffle_epi8(in0, shuf0); |
57 | 1.71M | __m128i d1 = _mm_shuffle_epi8(in0, shuf1); |
58 | 1.71M | d0 = _mm_maddubs_epi16(d0, coef0); |
59 | 1.71M | d1 = _mm_maddubs_epi16(d1, coef0); |
60 | 1.71M | d0 = _mm_hadd_epi16(d0, d1); |
61 | 1.71M | __m128i eight = _mm_set1_epi16(8); |
62 | 1.71M | d0 = _mm_add_epi16(d0, eight); |
63 | 1.71M | d0 = _mm_srai_epi16(d0, 4); |
64 | 1.71M | d0 = _mm_packus_epi16(d0, d0); |
65 | 1.71M | __m128i out0 = _mm_lddqu_si128((__m128i *)out); |
66 | 1.71M | __m128i n0 = _mm_set1_epi8(n_out); |
67 | 1.71M | __m128i mask = _mm_cmpgt_epi8(n0, iden); |
68 | 1.71M | out0 = _mm_blendv_epi8(out0, d0, mask); |
69 | 1.71M | _mm_storel_epi64((__m128i *)out, out0); |
70 | 1.71M | __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16)); |
71 | 1.71M | in0 = _mm_alignr_epi8(in1, in0, 8); |
72 | 1.71M | in += 8; |
73 | 1.71M | out += 8; |
74 | 1.71M | len -= n_out; |
75 | 1.71M | } |
76 | 1.26M | } else { // 5-tap filter |
77 | 1.26M | __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); |
78 | 1.26M | __m128i two = _mm_set1_epi8(2); |
79 | 1.26M | __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]); |
80 | 1.26M | __m128i shuf_b = _mm_add_epi8(shuf_a, two); |
81 | 1.26M | __m128i shuf_c = _mm_add_epi8(shuf_b, two); |
82 | 1.26M | __m128i shuf_d = _mm_add_epi8(shuf_c, two); |
83 | 1.26M | __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]); |
84 | 1.26M | __m128i in0 = _mm_lddqu_si128((__m128i *)in); |
85 | 6.48M | while (len > 0) { |
86 | 5.22M | int n_out = (len < 8) ? len : 8; |
87 | 5.22M | __m128i d0 = _mm_shuffle_epi8(in0, shuf_a); |
88 | 5.22M | __m128i d1 = _mm_shuffle_epi8(in0, shuf_b); |
89 | 5.22M | __m128i d2 = _mm_shuffle_epi8(in0, shuf_c); |
90 | 5.22M | __m128i d3 = _mm_shuffle_epi8(in0, shuf_d); |
91 | 5.22M | d0 = _mm_maddubs_epi16(d0, coef0); |
92 | 5.22M | d1 = _mm_maddubs_epi16(d1, coef0); |
93 | 5.22M | d2 = _mm_maddubs_epi16(d2, coef0); |
94 | 5.22M | d3 = _mm_maddubs_epi16(d3, coef0); |
95 | 5.22M | d0 = _mm_hadd_epi16(d0, d1); |
96 | 5.22M | d2 = _mm_hadd_epi16(d2, d3); |
97 | 5.22M | d0 = _mm_hadd_epi16(d0, d2); |
98 | 5.22M | __m128i eight = _mm_set1_epi16(8); |
99 | 5.22M | d0 = _mm_add_epi16(d0, eight); |
100 | 5.22M | d0 = _mm_srai_epi16(d0, 4); |
101 | 5.22M | d0 = _mm_packus_epi16(d0, d0); |
102 | 5.22M | __m128i out0 = _mm_lddqu_si128((__m128i *)out); |
103 | 5.22M | __m128i n0 = _mm_set1_epi8(n_out); |
104 | 5.22M | __m128i mask = _mm_cmpgt_epi8(n0, iden); |
105 | 5.22M | out0 = _mm_blendv_epi8(out0, d0, mask); |
106 | 5.22M | _mm_storel_epi64((__m128i *)out, out0); |
107 | 5.22M | __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16)); |
108 | 5.22M | in0 = _mm_alignr_epi8(in1, in0, 8); |
109 | 5.22M | in += 8; |
110 | 5.22M | out += 8; |
111 | 5.22M | len -= n_out; |
112 | 5.22M | } |
113 | 1.26M | } |
114 | 2.39M | } |
115 | | |
116 | 976k | void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) { |
117 | | // interpolate half-sample positions |
118 | 976k | assert(sz <= 24); |
119 | | |
120 | 976k | DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = { |
121 | 976k | { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 } |
122 | 976k | }; |
123 | | |
124 | 976k | DECLARE_ALIGNED( |
125 | 976k | 16, static const int8_t, |
126 | 976k | v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }, |
127 | 976k | { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } }; |
128 | | |
129 | | // Extend first/last samples (upper-left p[-1], last p[sz-1]) |
130 | | // to support 4-tap filter |
131 | 976k | p[-2] = p[-1]; |
132 | 976k | p[sz] = p[sz - 1]; |
133 | | |
134 | 976k | uint8_t *in = &p[-2]; |
135 | 976k | uint8_t *out = &p[-2]; |
136 | | |
137 | 976k | int n = sz + 1; // Input length including upper-left sample |
138 | | |
139 | 976k | __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); |
140 | 976k | __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]); |
141 | | |
142 | 976k | __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]); |
143 | 976k | __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]); |
144 | 976k | __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]); |
145 | | |
146 | 2.05M | while (n > 0) { |
147 | 1.08M | __m128i in8 = _mm_alignr_epi8(in16, in0, 8); |
148 | 1.08M | __m128i d0 = _mm_shuffle_epi8(in0, shuf0); |
149 | 1.08M | __m128i d1 = _mm_shuffle_epi8(in0, shuf1); |
150 | 1.08M | __m128i d2 = _mm_shuffle_epi8(in8, shuf0); |
151 | 1.08M | __m128i d3 = _mm_shuffle_epi8(in8, shuf1); |
152 | 1.08M | d0 = _mm_maddubs_epi16(d0, coef0); |
153 | 1.08M | d1 = _mm_maddubs_epi16(d1, coef0); |
154 | 1.08M | d2 = _mm_maddubs_epi16(d2, coef0); |
155 | 1.08M | d3 = _mm_maddubs_epi16(d3, coef0); |
156 | 1.08M | d0 = _mm_hadd_epi16(d0, d1); |
157 | 1.08M | d2 = _mm_hadd_epi16(d2, d3); |
158 | 1.08M | __m128i eight = _mm_set1_epi16(8); |
159 | 1.08M | d0 = _mm_add_epi16(d0, eight); |
160 | 1.08M | d2 = _mm_add_epi16(d2, eight); |
161 | 1.08M | d0 = _mm_srai_epi16(d0, 4); |
162 | 1.08M | d2 = _mm_srai_epi16(d2, 4); |
163 | 1.08M | d0 = _mm_packus_epi16(d0, d2); |
164 | 1.08M | __m128i in1 = _mm_alignr_epi8(in16, in0, 1); |
165 | 1.08M | __m128i out0 = _mm_unpacklo_epi8(in1, d0); |
166 | 1.08M | __m128i out1 = _mm_unpackhi_epi8(in1, d0); |
167 | 1.08M | _mm_storeu_si128((__m128i *)&out[0], out0); |
168 | 1.08M | _mm_storeu_si128((__m128i *)&out[16], out1); |
169 | 1.08M | in0 = in16; |
170 | 1.08M | in16 = _mm_setzero_si128(); |
171 | 1.08M | out += 32; |
172 | 1.08M | n -= 16; |
173 | 1.08M | } |
174 | 976k | } |
175 | | |
176 | | #if CONFIG_AV1_HIGHBITDEPTH |
177 | | |
178 | 3.57M | void av1_highbd_filter_intra_edge_sse4_1(uint16_t *p, int sz, int strength) { |
179 | 3.57M | if (!strength) return; |
180 | | |
181 | 2.47M | DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = { |
182 | 2.47M | { 4, 8, 4, 8, 4, 8, 4, 8 }, // strength 1: 4,8,4 |
183 | 2.47M | { 5, 6, 5, 6, 5, 6, 5, 6 }, // strength 2: 5,6,5 |
184 | 2.47M | { 2, 4, 2, 4, 2, 4, 2, 4 } // strength 3: 2,4,4,4,2 |
185 | 2.47M | }; |
186 | | |
187 | 2.47M | DECLARE_ALIGNED(16, static const int16_t, |
188 | 2.47M | v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } }; |
189 | | |
190 | | // Extend the first and last samples to simplify the loop for the 5-tap case |
191 | 2.47M | p[-1] = p[0]; |
192 | 2.47M | __m128i last = _mm_set1_epi16(p[sz - 1]); |
193 | 2.47M | _mm_storeu_si128((__m128i *)&p[sz], last); |
194 | | |
195 | | // Adjust input pointer for filter support area |
196 | 2.47M | uint16_t *in = (strength == 3) ? p - 1 : p; |
197 | | |
198 | | // Avoid modifying first sample |
199 | 2.47M | uint16_t *out = p + 1; |
200 | 2.47M | int len = sz - 1; |
201 | | |
202 | 2.47M | const int use_3tap_filter = (strength < 3); |
203 | | |
204 | 2.47M | if (use_3tap_filter) { |
205 | 1.00M | __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); |
206 | 1.00M | __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]); |
207 | 1.00M | __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); |
208 | 1.00M | __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); |
209 | 2.76M | while (len > 0) { |
210 | 1.76M | int n_out = (len < 8) ? len : 8; |
211 | 1.76M | __m128i in1 = _mm_alignr_epi8(in8, in0, 2); |
212 | 1.76M | __m128i in2 = _mm_alignr_epi8(in8, in0, 4); |
213 | 1.76M | __m128i in02 = _mm_add_epi16(in0, in2); |
214 | 1.76M | __m128i d0 = _mm_unpacklo_epi16(in02, in1); |
215 | 1.76M | __m128i d1 = _mm_unpackhi_epi16(in02, in1); |
216 | 1.76M | d0 = _mm_mullo_epi16(d0, coef0); |
217 | 1.76M | d1 = _mm_mullo_epi16(d1, coef0); |
218 | 1.76M | d0 = _mm_hadd_epi16(d0, d1); |
219 | 1.76M | __m128i eight = _mm_set1_epi16(8); |
220 | 1.76M | d0 = _mm_add_epi16(d0, eight); |
221 | 1.76M | d0 = _mm_srli_epi16(d0, 4); |
222 | 1.76M | __m128i out0 = _mm_lddqu_si128((__m128i *)out); |
223 | 1.76M | __m128i n0 = _mm_set1_epi16(n_out); |
224 | 1.76M | __m128i mask = _mm_cmpgt_epi16(n0, iden); |
225 | 1.76M | out0 = _mm_blendv_epi8(out0, d0, mask); |
226 | 1.76M | _mm_storeu_si128((__m128i *)out, out0); |
227 | 1.76M | in += 8; |
228 | 1.76M | in0 = in8; |
229 | 1.76M | in8 = _mm_lddqu_si128((__m128i *)&in[8]); |
230 | 1.76M | out += 8; |
231 | 1.76M | len -= n_out; |
232 | 1.76M | } |
233 | 1.46M | } else { // 5-tap filter |
234 | 1.46M | __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); |
235 | 1.46M | __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]); |
236 | 1.46M | __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); |
237 | 1.46M | __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); |
238 | 7.42M | while (len > 0) { |
239 | 5.95M | int n_out = (len < 8) ? len : 8; |
240 | 5.95M | __m128i in1 = _mm_alignr_epi8(in8, in0, 2); |
241 | 5.95M | __m128i in2 = _mm_alignr_epi8(in8, in0, 4); |
242 | 5.95M | __m128i in3 = _mm_alignr_epi8(in8, in0, 6); |
243 | 5.95M | __m128i in4 = _mm_alignr_epi8(in8, in0, 8); |
244 | 5.95M | __m128i in04 = _mm_add_epi16(in0, in4); |
245 | 5.95M | __m128i in123 = _mm_add_epi16(in1, in2); |
246 | 5.95M | in123 = _mm_add_epi16(in123, in3); |
247 | 5.95M | __m128i d0 = _mm_unpacklo_epi16(in04, in123); |
248 | 5.95M | __m128i d1 = _mm_unpackhi_epi16(in04, in123); |
249 | 5.95M | d0 = _mm_mullo_epi16(d0, coef0); |
250 | 5.95M | d1 = _mm_mullo_epi16(d1, coef0); |
251 | 5.95M | d0 = _mm_hadd_epi16(d0, d1); |
252 | 5.95M | __m128i eight = _mm_set1_epi16(8); |
253 | 5.95M | d0 = _mm_add_epi16(d0, eight); |
254 | 5.95M | d0 = _mm_srli_epi16(d0, 4); |
255 | 5.95M | __m128i out0 = _mm_lddqu_si128((__m128i *)out); |
256 | 5.95M | __m128i n0 = _mm_set1_epi16(n_out); |
257 | 5.95M | __m128i mask = _mm_cmpgt_epi16(n0, iden); |
258 | 5.95M | out0 = _mm_blendv_epi8(out0, d0, mask); |
259 | 5.95M | _mm_storeu_si128((__m128i *)out, out0); |
260 | 5.95M | in += 8; |
261 | 5.95M | in0 = in8; |
262 | 5.95M | in8 = _mm_lddqu_si128((__m128i *)&in[8]); |
263 | 5.95M | out += 8; |
264 | 5.95M | len -= n_out; |
265 | 5.95M | } |
266 | 1.46M | } |
267 | 2.47M | } |
268 | | |
269 | 847k | void av1_highbd_upsample_intra_edge_sse4_1(uint16_t *p, int sz, int bd) { |
270 | | // interpolate half-sample positions |
271 | 847k | assert(sz <= 24); |
272 | | |
273 | 847k | DECLARE_ALIGNED(16, static const int16_t, |
274 | 847k | kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } }; |
275 | | |
276 | | // Extend first/last samples (upper-left p[-1], last p[sz-1]) |
277 | | // to support 4-tap filter |
278 | 847k | p[-2] = p[-1]; |
279 | 847k | p[sz] = p[sz - 1]; |
280 | | |
281 | 847k | uint16_t *in = &p[-2]; |
282 | 847k | uint16_t *out = in; |
283 | 847k | int n = sz + 1; |
284 | | |
285 | 847k | __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); |
286 | 847k | __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); |
287 | 847k | __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]); |
288 | 847k | __m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]); |
289 | | |
290 | 2.46M | while (n > 0) { |
291 | 1.62M | __m128i in1 = _mm_alignr_epi8(in8, in0, 2); |
292 | 1.62M | __m128i in2 = _mm_alignr_epi8(in8, in0, 4); |
293 | 1.62M | __m128i in3 = _mm_alignr_epi8(in8, in0, 6); |
294 | 1.62M | __m128i sum0 = _mm_add_epi16(in0, in3); |
295 | 1.62M | __m128i sum1 = _mm_add_epi16(in1, in2); |
296 | 1.62M | __m128i d0 = _mm_unpacklo_epi16(sum0, sum1); |
297 | 1.62M | __m128i d1 = _mm_unpackhi_epi16(sum0, sum1); |
298 | 1.62M | __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]); |
299 | 1.62M | d0 = _mm_madd_epi16(d0, coef0); |
300 | 1.62M | d1 = _mm_madd_epi16(d1, coef0); |
301 | 1.62M | __m128i eight = _mm_set1_epi32(8); |
302 | 1.62M | d0 = _mm_add_epi32(d0, eight); |
303 | 1.62M | d1 = _mm_add_epi32(d1, eight); |
304 | 1.62M | d0 = _mm_srai_epi32(d0, 4); |
305 | 1.62M | d1 = _mm_srai_epi32(d1, 4); |
306 | 1.62M | d0 = _mm_packus_epi32(d0, d1); |
307 | 1.62M | __m128i max0 = _mm_set1_epi16((1 << bd) - 1); |
308 | 1.62M | d0 = _mm_min_epi16(d0, max0); |
309 | 1.62M | __m128i out0 = _mm_unpacklo_epi16(in1, d0); |
310 | 1.62M | __m128i out1 = _mm_unpackhi_epi16(in1, d0); |
311 | 1.62M | _mm_storeu_si128((__m128i *)&out[0], out0); |
312 | 1.62M | _mm_storeu_si128((__m128i *)&out[8], out1); |
313 | 1.62M | in0 = in8; |
314 | 1.62M | in8 = in16; |
315 | 1.62M | in16 = in24; |
316 | 1.62M | in24 = _mm_setzero_si128(); |
317 | 1.62M | out += 16; |
318 | 1.62M | n -= 8; |
319 | 1.62M | } |
320 | 847k | } |
321 | | |
322 | | #endif // CONFIG_AV1_HIGHBITDEPTH |