/src/aom/aom_dsp/x86/highbd_intrapred_sse2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2017, Alliance for Open Media. All rights reserved |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include <emmintrin.h> |
13 | | |
14 | | #include "config/aom_dsp_rtcd.h" |
15 | | |
16 | | // ----------------------------------------------------------------------------- |
17 | | // H_PRED |
18 | | |
void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // H_PRED 4x4: every pixel of output row i is left[i].
  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
  __m128i rows[4];
  int i;
  (void)above;
  (void)bd;
  // Broadcast each of the 4 left pixels across the low four 16-bit lanes.
  rows[0] = _mm_shufflelo_epi16(l, 0x0);
  rows[1] = _mm_shufflelo_epi16(l, 0x55);
  rows[2] = _mm_shufflelo_epi16(l, 0xaa);
  rows[3] = _mm_shufflelo_epi16(l, 0xff);
  for (i = 0; i < 4; ++i) {
    _mm_storel_epi64((__m128i *)dst, rows[i]);
    dst += stride;
  }
}
37 | | |
void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // H_PRED 4x8: two stacked 4x4 horizontal predictions.
  aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
  aom_highbd_h_predictor_4x4_sse2(dst + (stride << 2), stride, above, left + 4,
                                  bd);
}
46 | | |
void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // H_PRED 8x4: every pixel of output row i is left[i].
  // NOTE(review): uses an aligned 128-bit load, so `left` must be 16-byte
  // aligned and readable for 8 entries even though only 4 are used.
  const __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i rows[4];
  int i;
  (void)above;
  (void)bd;
  rows[0] = _mm_shufflelo_epi16(l, 0x0);
  rows[1] = _mm_shufflelo_epi16(l, 0x55);
  rows[2] = _mm_shufflelo_epi16(l, 0xaa);
  rows[3] = _mm_shufflelo_epi16(l, 0xff);
  for (i = 0; i < 4; ++i) {
    // Duplicate the low 64 bits so all 8 lanes hold left[i].
    _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(rows[i], rows[i]));
    dst += stride;
  }
}
65 | | |
void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // H_PRED 8x8: every pixel of output row i is left[i].
  // NOTE(review): aligned load — `left` must be 16-byte aligned.
  const __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i rows[8];
  int i;
  (void)above;
  (void)bd;
  // Rows 0-3 come from the low half of `l`, rows 4-7 from the high half.
  rows[0] = _mm_shufflelo_epi16(l, 0x0);
  rows[1] = _mm_shufflelo_epi16(l, 0x55);
  rows[2] = _mm_shufflelo_epi16(l, 0xaa);
  rows[3] = _mm_shufflelo_epi16(l, 0xff);
  rows[4] = _mm_shufflehi_epi16(l, 0x0);
  rows[5] = _mm_shufflehi_epi16(l, 0x55);
  rows[6] = _mm_shufflehi_epi16(l, 0xaa);
  rows[7] = _mm_shufflehi_epi16(l, 0xff);
  for (i = 0; i < 4; ++i) {
    _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(rows[i], rows[i]));
    dst += stride;
  }
  for (; i < 8; ++i) {
    _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(rows[i], rows[i]));
    dst += stride;
  }
}
96 | | |
void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  // H_PRED 8x16: two stacked 8x8 horizontal predictions.
  aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
  aom_highbd_h_predictor_8x8_sse2(dst + (stride << 3), stride, above, left + 8,
                                  bd);
}
105 | | |
106 | | static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride, |
107 | 1.86M | const __m128i *row) { |
108 | 1.86M | const __m128i val = _mm_unpacklo_epi64(*row, *row); |
109 | 1.86M | _mm_store_si128((__m128i *)*dst, val); |
110 | 1.86M | _mm_store_si128((__m128i *)(*dst + 8), val); |
111 | 1.86M | *dst += stride; |
112 | 1.86M | } |
113 | | |
114 | | static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride, |
115 | 1.86M | const __m128i *row) { |
116 | 1.86M | const __m128i val = _mm_unpackhi_epi64(*row, *row); |
117 | 1.86M | _mm_store_si128((__m128i *)(*dst), val); |
118 | 1.86M | _mm_store_si128((__m128i *)(*dst + 8), val); |
119 | 1.86M | *dst += stride; |
120 | 1.86M | } |
121 | | |
122 | | static INLINE void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride, |
123 | 467k | const uint16_t *left) { |
124 | 467k | const __m128i left_u16 = _mm_load_si128((const __m128i *)left); |
125 | 467k | const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); |
126 | 467k | const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); |
127 | 467k | const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); |
128 | 467k | const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); |
129 | 467k | const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); |
130 | 467k | const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); |
131 | 467k | const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); |
132 | 467k | const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); |
133 | 467k | h_store_16_unpacklo(&dst, stride, &row0); |
134 | 467k | h_store_16_unpacklo(&dst, stride, &row1); |
135 | 467k | h_store_16_unpacklo(&dst, stride, &row2); |
136 | 467k | h_store_16_unpacklo(&dst, stride, &row3); |
137 | 467k | h_store_16_unpackhi(&dst, stride, &row4); |
138 | 467k | h_store_16_unpackhi(&dst, stride, &row5); |
139 | 467k | h_store_16_unpackhi(&dst, stride, &row6); |
140 | 467k | h_store_16_unpackhi(&dst, stride, &row7); |
141 | 467k | } |
142 | | |
void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  // H_PRED 16x8: delegates to the shared 16-wide, 8-row helper.
  (void)above;
  (void)bd;
  h_predictor_16x8(dst, stride, left);
}
150 | | |
void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  // H_PRED 16x16: two stacked 16x8 passes over successive left pixels.
  (void)above;
  (void)bd;
  h_predictor_16x8(dst, stride, left);
  h_predictor_16x8(dst + (stride << 3), stride, left + 8);
}
163 | | |
void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  // H_PRED 16x32: four stacked 16x8 passes over successive left pixels.
  int rows8 = 4;
  (void)above;
  (void)bd;
  while (rows8--) {
    h_predictor_16x8(dst, stride, left);
    dst += stride << 3;
    left += 8;
  }
}
176 | | |
177 | | static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride, |
178 | 2.02M | const __m128i *row) { |
179 | 2.02M | const __m128i val = _mm_unpacklo_epi64(*row, *row); |
180 | 2.02M | _mm_store_si128((__m128i *)(*dst), val); |
181 | 2.02M | _mm_store_si128((__m128i *)(*dst + 8), val); |
182 | 2.02M | _mm_store_si128((__m128i *)(*dst + 16), val); |
183 | 2.02M | _mm_store_si128((__m128i *)(*dst + 24), val); |
184 | 2.02M | *dst += stride; |
185 | 2.02M | } |
186 | | |
187 | | static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride, |
188 | 2.02M | const __m128i *row) { |
189 | 2.02M | const __m128i val = _mm_unpackhi_epi64(*row, *row); |
190 | 2.02M | _mm_store_si128((__m128i *)(*dst), val); |
191 | 2.02M | _mm_store_si128((__m128i *)(*dst + 8), val); |
192 | 2.02M | _mm_store_si128((__m128i *)(*dst + 16), val); |
193 | 2.02M | _mm_store_si128((__m128i *)(*dst + 24), val); |
194 | 2.02M | *dst += stride; |
195 | 2.02M | } |
196 | | |
197 | | static INLINE void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride, |
198 | 505k | const uint16_t *left) { |
199 | 505k | const __m128i left_u16 = _mm_load_si128((const __m128i *)left); |
200 | 505k | const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); |
201 | 505k | const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); |
202 | 505k | const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); |
203 | 505k | const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); |
204 | 505k | const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); |
205 | 505k | const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); |
206 | 505k | const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); |
207 | 505k | const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); |
208 | 505k | h_store_32_unpacklo(&dst, stride, &row0); |
209 | 505k | h_store_32_unpacklo(&dst, stride, &row1); |
210 | 505k | h_store_32_unpacklo(&dst, stride, &row2); |
211 | 505k | h_store_32_unpacklo(&dst, stride, &row3); |
212 | 505k | h_store_32_unpackhi(&dst, stride, &row4); |
213 | 505k | h_store_32_unpackhi(&dst, stride, &row5); |
214 | 505k | h_store_32_unpackhi(&dst, stride, &row6); |
215 | 505k | h_store_32_unpackhi(&dst, stride, &row7); |
216 | 505k | } |
217 | | |
void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  // H_PRED 32x16: two stacked 32x8 passes over successive left pixels.
  (void)above;
  (void)bd;
  h_predictor_32x8(dst, stride, left);
  h_predictor_32x8(dst + (stride << 3), stride, left + 8);
}
230 | | |
void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  // H_PRED 32x32: four stacked 32x8 passes over successive left pixels.
  int rows8 = 4;
  (void)above;
  (void)bd;
  while (rows8--) {
    h_predictor_32x8(dst, stride, left);
    dst += stride << 3;
    left += 8;
  }
}
243 | | |
244 | | // ----------------------------------------------------------------------------- |
245 | | // DC_TOP, DC_LEFT, DC_128 |
246 | | |
247 | | // 4x4 |
248 | | |
249 | 2.11M | static INLINE __m128i dc_sum_4(const uint16_t *ref) { |
250 | 2.11M | const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref); |
251 | 2.11M | const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); |
252 | 2.11M | const __m128i a = _mm_add_epi16(_dcba, _xxdc); |
253 | 2.11M | return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); |
254 | 2.11M | } |
255 | | |
256 | | static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, |
257 | 1.41M | const __m128i *dc) { |
258 | 1.41M | const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); |
259 | 1.41M | int i; |
260 | 7.08M | for (i = 0; i < 4; ++i, dst += stride) { |
261 | 5.66M | _mm_storel_epi64((__m128i *)dst, dc_dup); |
262 | 5.66M | } |
263 | 1.41M | } |
264 | | |
265 | | void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, |
266 | | const uint16_t *above, |
267 | 94.1k | const uint16_t *left, int bd) { |
268 | 94.1k | const __m128i two = _mm_cvtsi32_si128(2); |
269 | 94.1k | const __m128i sum = dc_sum_4(left); |
270 | 94.1k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); |
271 | 94.1k | (void)above; |
272 | 94.1k | (void)bd; |
273 | 94.1k | dc_store_4x4(dst, stride, &dc); |
274 | 94.1k | } |
275 | | |
276 | | void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, |
277 | | const uint16_t *above, |
278 | 1.31M | const uint16_t *left, int bd) { |
279 | 1.31M | const __m128i two = _mm_cvtsi32_si128(2); |
280 | 1.31M | const __m128i sum = dc_sum_4(above); |
281 | 1.31M | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); |
282 | 1.31M | (void)left; |
283 | 1.31M | (void)bd; |
284 | 1.31M | dc_store_4x4(dst, stride, &dc); |
285 | 1.31M | } |
286 | | |
void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // DC_128 4x4: fill with the bit-depth midpoint (1 << (bd - 1)).
  (void)above;
  (void)left;
  const __m128i mid = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i fill = _mm_shufflelo_epi16(mid, 0x0);
  dc_store_4x4(dst, stride, &fill);
}
296 | | |
297 | | // ----------------------------------------------------------------------------- |
298 | | // 4x8 |
299 | | |
300 | | static INLINE void dc_store_4x8(uint16_t *dst, ptrdiff_t stride, |
301 | 17.1k | const __m128i *dc) { |
302 | 17.1k | const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); |
303 | 17.1k | int i; |
304 | 153k | for (i = 0; i < 8; ++i, dst += stride) { |
305 | 136k | _mm_storel_epi64((__m128i *)dst, dc_dup); |
306 | 136k | } |
307 | 17.1k | } |
308 | | |
309 | | // Shared with DC 8xh |
310 | 5.41M | static INLINE __m128i dc_sum_8(const uint16_t *ref) { |
311 | 5.41M | const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref); |
312 | 5.41M | const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8)); |
313 | 5.41M | const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); |
314 | 5.41M | const __m128i a = _mm_add_epi16(_dcba, _xxdc); |
315 | | |
316 | 5.41M | return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); |
317 | 5.41M | } |
318 | | |
319 | | void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, |
320 | | const uint16_t *above, |
321 | 2.56k | const uint16_t *left, int bd) { |
322 | 2.56k | const __m128i sum = dc_sum_8(left); |
323 | 2.56k | const __m128i four = _mm_cvtsi32_si128(4); |
324 | 2.56k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); |
325 | 2.56k | (void)above; |
326 | 2.56k | (void)bd; |
327 | 2.56k | dc_store_4x8(dst, stride, &dc); |
328 | 2.56k | } |
329 | | |
330 | | void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, |
331 | | const uint16_t *above, |
332 | 13.9k | const uint16_t *left, int bd) { |
333 | 13.9k | const __m128i two = _mm_cvtsi32_si128(2); |
334 | 13.9k | const __m128i sum = dc_sum_4(above); |
335 | 13.9k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); |
336 | 13.9k | (void)left; |
337 | 13.9k | (void)bd; |
338 | 13.9k | dc_store_4x8(dst, stride, &dc); |
339 | 13.9k | } |
340 | | |
void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // DC_128 4x8: fill with the bit-depth midpoint (1 << (bd - 1)).
  (void)above;
  (void)left;
  const __m128i mid = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i fill = _mm_shufflelo_epi16(mid, 0x0);
  dc_store_4x8(dst, stride, &fill);
}
350 | | |
351 | | // ----------------------------------------------------------------------------- |
352 | | // 8xh |
353 | | |
354 | | static INLINE void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height, |
355 | 130k | const __m128i *dc) { |
356 | 130k | const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); |
357 | 130k | const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); |
358 | 130k | int i; |
359 | 1.18M | for (i = 0; i < height; ++i, dst += stride) { |
360 | 1.05M | _mm_store_si128((__m128i *)dst, dc_dup); |
361 | 1.05M | } |
362 | 130k | } |
363 | | |
364 | | // ----------------------------------------------------------------------------- |
365 | | // DC_TOP |
366 | | |
367 | | static INLINE void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride, |
368 | 69.2k | int height, const uint16_t *above) { |
369 | 69.2k | const __m128i four = _mm_cvtsi32_si128(4); |
370 | 69.2k | const __m128i sum = dc_sum_8(above); |
371 | 69.2k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); |
372 | 69.2k | dc_store_8xh(dst, stride, height, &dc); |
373 | 69.2k | } |
374 | | |
void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // DC_TOP 8x4: delegate to the shared 8-wide helper.
  (void)left;
  (void)bd;
  dc_top_predictor_8xh(dst, stride, 4, above);
}
382 | | |
void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // DC_TOP 8x8: delegate to the shared 8-wide helper.
  (void)left;
  (void)bd;
  dc_top_predictor_8xh(dst, stride, 8, above);
}
390 | | |
void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  // DC_TOP 8x16: delegate to the shared 8-wide helper.
  (void)left;
  (void)bd;
  dc_top_predictor_8xh(dst, stride, 16, above);
}
398 | | |
399 | | // ----------------------------------------------------------------------------- |
400 | | // DC_LEFT |
401 | | |
402 | | void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, |
403 | | const uint16_t *above, |
404 | 3.84k | const uint16_t *left, int bd) { |
405 | 3.84k | const __m128i two = _mm_cvtsi32_si128(2); |
406 | 3.84k | const __m128i sum = dc_sum_4(left); |
407 | 3.84k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); |
408 | 3.84k | (void)above; |
409 | 3.84k | (void)bd; |
410 | 3.84k | dc_store_8xh(dst, stride, 4, &dc); |
411 | 3.84k | } |
412 | | |
413 | | void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, |
414 | | const uint16_t *above, |
415 | 46.6k | const uint16_t *left, int bd) { |
416 | 46.6k | const __m128i four = _mm_cvtsi32_si128(4); |
417 | 46.6k | const __m128i sum = dc_sum_8(left); |
418 | 46.6k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); |
419 | 46.6k | (void)above; |
420 | 46.6k | (void)bd; |
421 | 46.6k | dc_store_8xh(dst, stride, 8, &dc); |
422 | 46.6k | } |
423 | | |
424 | | // Shared with DC 16xh |
425 | 1.97M | static INLINE __m128i dc_sum_16(const uint16_t *ref) { |
426 | 1.97M | const __m128i sum_lo = dc_sum_8(ref); |
427 | 1.97M | const __m128i sum_hi = dc_sum_8(ref + 8); |
428 | 1.97M | return _mm_add_epi16(sum_lo, sum_hi); |
429 | 1.97M | } |
430 | | |
431 | | void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, |
432 | | const uint16_t *above, |
433 | 6.67k | const uint16_t *left, int bd) { |
434 | 6.67k | const __m128i eight = _mm_cvtsi32_si128(8); |
435 | 6.67k | const __m128i sum = dc_sum_16(left); |
436 | 6.67k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
437 | 6.67k | (void)above; |
438 | 6.67k | (void)bd; |
439 | 6.67k | dc_store_8xh(dst, stride, 16, &dc); |
440 | 6.67k | } |
441 | | |
442 | | // ----------------------------------------------------------------------------- |
443 | | // DC_128 |
444 | | |
445 | | static INLINE void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride, |
446 | 4.33k | int height, int bd) { |
447 | 4.33k | const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); |
448 | 4.33k | const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); |
449 | 4.33k | dc_store_8xh(dst, stride, height, &dc_dup); |
450 | 4.33k | } |
451 | | |
void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // DC_128 8x4: delegate to the shared 8-wide helper.
  (void)above;
  (void)left;
  dc_128_predictor_8xh(dst, stride, 4, bd);
}
459 | | |
void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // DC_128 8x8: delegate to the shared 8-wide helper.
  (void)above;
  (void)left;
  dc_128_predictor_8xh(dst, stride, 8, bd);
}
467 | | |
void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  // DC_128 8x16: delegate to the shared 8-wide helper.
  (void)above;
  (void)left;
  dc_128_predictor_8xh(dst, stride, 16, bd);
}
475 | | |
476 | | // ----------------------------------------------------------------------------- |
477 | | // 16xh |
478 | | |
479 | | static INLINE void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height, |
480 | 89.9k | const __m128i *dc) { |
481 | 89.9k | const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); |
482 | 89.9k | const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); |
483 | 89.9k | int i; |
484 | 1.62M | for (i = 0; i < height; ++i, dst += stride) { |
485 | 1.53M | _mm_store_si128((__m128i *)dst, dc_dup); |
486 | 1.53M | _mm_store_si128((__m128i *)(dst + 8), dc_dup); |
487 | 1.53M | } |
488 | 89.9k | } |
489 | | |
490 | | // ----------------------------------------------------------------------------- |
491 | | // DC_LEFT |
492 | | |
493 | | void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, |
494 | | const uint16_t *above, |
495 | 6.98k | const uint16_t *left, int bd) { |
496 | 6.98k | const __m128i four = _mm_cvtsi32_si128(4); |
497 | 6.98k | const __m128i sum = dc_sum_8(left); |
498 | 6.98k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); |
499 | 6.98k | (void)above; |
500 | 6.98k | (void)bd; |
501 | 6.98k | dc_store_16xh(dst, stride, 8, &dc); |
502 | 6.98k | } |
503 | | |
504 | | void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, |
505 | | const uint16_t *above, |
506 | 40.7k | const uint16_t *left, int bd) { |
507 | 40.7k | const __m128i eight = _mm_cvtsi32_si128(8); |
508 | 40.7k | const __m128i sum = dc_sum_16(left); |
509 | 40.7k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
510 | 40.7k | (void)above; |
511 | 40.7k | (void)bd; |
512 | 40.7k | dc_store_16xh(dst, stride, 16, &dc); |
513 | 40.7k | } |
514 | | |
515 | | // Shared with 32xh |
516 | 457k | static INLINE __m128i dc_sum_32(const uint16_t *ref) { |
517 | 457k | const __m128i zero = _mm_setzero_si128(); |
518 | 457k | const __m128i sum_a = dc_sum_16(ref); |
519 | 457k | const __m128i sum_b = dc_sum_16(ref + 16); |
520 | | // 12 bit bd will outrange, so expand to 32 bit before adding final total |
521 | 457k | return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero), |
522 | 457k | _mm_unpacklo_epi16(sum_b, zero)); |
523 | 457k | } |
524 | | |
525 | | void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, |
526 | | const uint16_t *above, |
527 | 4.14k | const uint16_t *left, int bd) { |
528 | 4.14k | const __m128i sixteen = _mm_cvtsi32_si128(16); |
529 | 4.14k | const __m128i sum = dc_sum_32(left); |
530 | 4.14k | const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); |
531 | 4.14k | (void)above; |
532 | 4.14k | (void)bd; |
533 | 4.14k | dc_store_16xh(dst, stride, 32, &dc); |
534 | 4.14k | } |
535 | | |
536 | | // ----------------------------------------------------------------------------- |
537 | | // DC_TOP |
538 | | |
539 | | void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, |
540 | | const uint16_t *above, |
541 | 9.92k | const uint16_t *left, int bd) { |
542 | 9.92k | const __m128i eight = _mm_cvtsi32_si128(8); |
543 | 9.92k | const __m128i sum = dc_sum_16(above); |
544 | 9.92k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
545 | 9.92k | (void)left; |
546 | 9.92k | (void)bd; |
547 | 9.92k | dc_store_16xh(dst, stride, 8, &dc); |
548 | 9.92k | } |
549 | | |
550 | | void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, |
551 | | const uint16_t *above, |
552 | 12.3k | const uint16_t *left, int bd) { |
553 | 12.3k | const __m128i eight = _mm_cvtsi32_si128(8); |
554 | 12.3k | const __m128i sum = dc_sum_16(above); |
555 | 12.3k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
556 | 12.3k | (void)left; |
557 | 12.3k | (void)bd; |
558 | 12.3k | dc_store_16xh(dst, stride, 16, &dc); |
559 | 12.3k | } |
560 | | |
561 | | void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, |
562 | | const uint16_t *above, |
563 | 8.53k | const uint16_t *left, int bd) { |
564 | 8.53k | const __m128i eight = _mm_cvtsi32_si128(8); |
565 | 8.53k | const __m128i sum = dc_sum_16(above); |
566 | 8.53k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
567 | 8.53k | (void)left; |
568 | 8.53k | (void)bd; |
569 | 8.53k | dc_store_16xh(dst, stride, 32, &dc); |
570 | 8.53k | } |
571 | | |
572 | | // ----------------------------------------------------------------------------- |
573 | | // DC_128 |
574 | | |
void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  // DC_128 16x8: fill with the bit-depth midpoint (1 << (bd - 1)).
  (void)above;
  (void)left;
  const __m128i mid = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i fill = _mm_shufflelo_epi16(mid, 0x0);
  dc_store_16xh(dst, stride, 8, &fill);
}
584 | | |
void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  // DC_128 16x16: fill with the bit-depth midpoint (1 << (bd - 1)).
  (void)above;
  (void)left;
  const __m128i mid = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i fill = _mm_shufflelo_epi16(mid, 0x0);
  dc_store_16xh(dst, stride, 16, &fill);
}
594 | | |
void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  // DC_128 16x32: fill with the bit-depth midpoint (1 << (bd - 1)).
  (void)above;
  (void)left;
  const __m128i mid = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i fill = _mm_shufflelo_epi16(mid, 0x0);
  dc_store_16xh(dst, stride, 32, &fill);
}
604 | | |
605 | | // ----------------------------------------------------------------------------- |
606 | | // 32xh |
607 | | |
608 | | static INLINE void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height, |
609 | 184k | const __m128i *dc) { |
610 | 184k | const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); |
611 | 184k | const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); |
612 | 184k | int i; |
613 | 5.55M | for (i = 0; i < height; ++i, dst += stride) { |
614 | 5.36M | _mm_store_si128((__m128i *)dst, dc_dup); |
615 | 5.36M | _mm_store_si128((__m128i *)(dst + 8), dc_dup); |
616 | 5.36M | _mm_store_si128((__m128i *)(dst + 16), dc_dup); |
617 | 5.36M | _mm_store_si128((__m128i *)(dst + 24), dc_dup); |
618 | 5.36M | } |
619 | 184k | } |
620 | | |
621 | | void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, |
622 | | const uint16_t *above, |
623 | 7.97k | const uint16_t *left, int bd) { |
624 | 7.97k | const __m128i eight = _mm_cvtsi32_si128(8); |
625 | 7.97k | const __m128i sum = dc_sum_16(left); |
626 | 7.97k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
627 | 7.97k | (void)above; |
628 | 7.97k | (void)bd; |
629 | 7.97k | dc_store_32xh(dst, stride, 16, &dc); |
630 | 7.97k | } |
631 | | |
632 | | void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, |
633 | | const uint16_t *above, |
634 | 76.9k | const uint16_t *left, int bd) { |
635 | 76.9k | const __m128i sixteen = _mm_cvtsi32_si128(16); |
636 | 76.9k | const __m128i sum = dc_sum_32(left); |
637 | 76.9k | const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); |
638 | 76.9k | (void)above; |
639 | 76.9k | (void)bd; |
640 | 76.9k | dc_store_32xh(dst, stride, 32, &dc); |
641 | 76.9k | } |
642 | | |
643 | | void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, |
644 | | const uint16_t *above, |
645 | 11.2k | const uint16_t *left, int bd) { |
646 | 11.2k | const __m128i sixteen = _mm_cvtsi32_si128(16); |
647 | 11.2k | const __m128i sum = dc_sum_32(above); |
648 | 11.2k | const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); |
649 | 11.2k | (void)left; |
650 | 11.2k | (void)bd; |
651 | 11.2k | dc_store_32xh(dst, stride, 16, &dc); |
652 | 11.2k | } |
653 | | |
void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  // DC_128 32x16: fill with the bit-depth midpoint (1 << (bd - 1)).
  (void)above;
  (void)left;
  const __m128i mid = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i fill = _mm_shufflelo_epi16(mid, 0x0);
  dc_store_32xh(dst, stride, 16, &fill);
}
663 | | |
664 | | void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, |
665 | | const uint16_t *above, |
666 | 56.1k | const uint16_t *left, int bd) { |
667 | 56.1k | const __m128i sixteen = _mm_cvtsi32_si128(16); |
668 | 56.1k | const __m128i sum = dc_sum_32(above); |
669 | 56.1k | const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); |
670 | 56.1k | (void)left; |
671 | 56.1k | (void)bd; |
672 | 56.1k | dc_store_32xh(dst, stride, 32, &dc); |
673 | 56.1k | } |
674 | | |
void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  // DC_128 32x32: fill with the bit-depth midpoint (1 << (bd - 1)).
  (void)above;
  (void)left;
  const __m128i mid = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i fill = _mm_shufflelo_epi16(mid, 0x0);
  dc_store_32xh(dst, stride, 32, &fill);
}
684 | | |
685 | | // ----------------------------------------------------------------------------- |
686 | | // V_PRED |
687 | | |
void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // V_PRED 4x8: replicate the 4-pixel above row into all 8 output rows.
  const __m128i row = _mm_loadl_epi64((const __m128i *)above);
  int r;
  (void)left;
  (void)bd;
  for (r = 0; r < 8; ++r) {
    _mm_storel_epi64((__m128i *)(dst + r * stride), row);
  }
}
703 | | |
704 | | void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, |
705 | | const uint16_t *above, |
706 | 49.6k | const uint16_t *left, int bd) { |
707 | 49.6k | (void)left; |
708 | 49.6k | (void)bd; |
709 | 49.6k | const __m128i above_u16 = _mm_load_si128((const __m128i *)above); |
710 | 49.6k | _mm_store_si128((__m128i *)dst, above_u16); |
711 | 49.6k | _mm_store_si128((__m128i *)(dst + stride), above_u16); |
712 | 49.6k | _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16); |
713 | 49.6k | _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16); |
714 | 49.6k | } |
715 | | |
716 | | void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, |
717 | | const uint16_t *above, |
718 | 15.3k | const uint16_t *left, int bd) { |
719 | 15.3k | (void)left; |
720 | 15.3k | (void)bd; |
721 | 15.3k | const __m128i above_u16 = _mm_load_si128((const __m128i *)above); |
722 | 15.3k | int i; |
723 | 76.6k | for (i = 0; i < 4; ++i) { |
724 | 61.2k | _mm_store_si128((__m128i *)dst, above_u16); |
725 | 61.2k | _mm_store_si128((__m128i *)(dst + stride), above_u16); |
726 | 61.2k | _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16); |
727 | 61.2k | _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16); |
728 | 61.2k | dst += stride << 2; |
729 | 61.2k | } |
730 | 15.3k | } |
731 | | |
732 | | void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, |
733 | | const uint16_t *above, |
734 | 44.2k | const uint16_t *left, int bd) { |
735 | 44.2k | (void)left; |
736 | 44.2k | (void)bd; |
737 | 44.2k | const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); |
738 | 44.2k | const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); |
739 | 44.2k | int i; |
740 | 132k | for (i = 0; i < 2; ++i) { |
741 | 88.5k | _mm_store_si128((__m128i *)dst, above0_u16); |
742 | 88.5k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
743 | 88.5k | dst += stride; |
744 | 88.5k | _mm_store_si128((__m128i *)dst, above0_u16); |
745 | 88.5k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
746 | 88.5k | dst += stride; |
747 | 88.5k | _mm_store_si128((__m128i *)dst, above0_u16); |
748 | 88.5k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
749 | 88.5k | dst += stride; |
750 | 88.5k | _mm_store_si128((__m128i *)dst, above0_u16); |
751 | 88.5k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
752 | 88.5k | dst += stride; |
753 | 88.5k | } |
754 | 44.2k | } |
755 | | |
756 | | void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, |
757 | | const uint16_t *above, |
758 | 9.16k | const uint16_t *left, int bd) { |
759 | 9.16k | (void)left; |
760 | 9.16k | (void)bd; |
761 | 9.16k | const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); |
762 | 9.16k | const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); |
763 | 9.16k | int i; |
764 | 82.4k | for (i = 0; i < 8; ++i) { |
765 | 73.2k | _mm_store_si128((__m128i *)dst, above0_u16); |
766 | 73.2k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
767 | 73.2k | dst += stride; |
768 | 73.2k | _mm_store_si128((__m128i *)dst, above0_u16); |
769 | 73.2k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
770 | 73.2k | dst += stride; |
771 | 73.2k | _mm_store_si128((__m128i *)dst, above0_u16); |
772 | 73.2k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
773 | 73.2k | dst += stride; |
774 | 73.2k | _mm_store_si128((__m128i *)dst, above0_u16); |
775 | 73.2k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
776 | 73.2k | dst += stride; |
777 | 73.2k | } |
778 | 9.16k | } |
779 | | |
780 | | void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, |
781 | | const uint16_t *above, |
782 | 4.74k | const uint16_t *left, int bd) { |
783 | 4.74k | (void)left; |
784 | 4.74k | (void)bd; |
785 | 4.74k | const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); |
786 | 4.74k | const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); |
787 | 4.74k | const __m128i above2_u16 = _mm_load_si128((const __m128i *)(above + 16)); |
788 | 4.74k | const __m128i above3_u16 = _mm_load_si128((const __m128i *)(above + 24)); |
789 | 4.74k | int i; |
790 | 23.7k | for (i = 0; i < 4; ++i) { |
791 | 18.9k | _mm_store_si128((__m128i *)dst, above0_u16); |
792 | 18.9k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
793 | 18.9k | _mm_store_si128((__m128i *)(dst + 16), above2_u16); |
794 | 18.9k | _mm_store_si128((__m128i *)(dst + 24), above3_u16); |
795 | 18.9k | dst += stride; |
796 | 18.9k | _mm_store_si128((__m128i *)dst, above0_u16); |
797 | 18.9k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
798 | 18.9k | _mm_store_si128((__m128i *)(dst + 16), above2_u16); |
799 | 18.9k | _mm_store_si128((__m128i *)(dst + 24), above3_u16); |
800 | 18.9k | dst += stride; |
801 | 18.9k | _mm_store_si128((__m128i *)dst, above0_u16); |
802 | 18.9k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
803 | 18.9k | _mm_store_si128((__m128i *)(dst + 16), above2_u16); |
804 | 18.9k | _mm_store_si128((__m128i *)(dst + 24), above3_u16); |
805 | 18.9k | dst += stride; |
806 | 18.9k | _mm_store_si128((__m128i *)dst, above0_u16); |
807 | 18.9k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
808 | 18.9k | _mm_store_si128((__m128i *)(dst + 16), above2_u16); |
809 | 18.9k | _mm_store_si128((__m128i *)(dst + 24), above3_u16); |
810 | 18.9k | dst += stride; |
811 | 18.9k | } |
812 | 4.74k | } |
813 | | |
814 | | // ----------------------------------------------------------------------------- |
815 | | // DC_PRED |
816 | | |
817 | | void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, |
818 | | const uint16_t *above, |
819 | 223k | const uint16_t *left, int bd) { |
820 | 223k | (void)bd; |
821 | 223k | const __m128i sum_above = dc_sum_4(above); |
822 | 223k | const __m128i sum_left = dc_sum_8(left); |
823 | 223k | const __m128i sum = _mm_add_epi16(sum_above, sum_left); |
824 | 223k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
825 | 223k | sum32 >>= 16; |
826 | 223k | sum32 += 6; |
827 | 223k | sum32 /= 12; |
828 | 223k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
829 | 223k | int i; |
830 | 1.11M | for (i = 0; i < 4; ++i) { |
831 | 892k | _mm_storel_epi64((__m128i *)dst, row); |
832 | 892k | dst += stride; |
833 | 892k | _mm_storel_epi64((__m128i *)dst, row); |
834 | 892k | dst += stride; |
835 | 892k | } |
836 | 223k | } |
837 | | |
838 | | void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, |
839 | | const uint16_t *above, |
840 | 462k | const uint16_t *left, int bd) { |
841 | 462k | (void)bd; |
842 | 462k | const __m128i sum_left = dc_sum_4(left); |
843 | 462k | const __m128i sum_above = dc_sum_8(above); |
844 | 462k | const __m128i sum = _mm_add_epi16(sum_above, sum_left); |
845 | 462k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
846 | 462k | sum32 >>= 16; |
847 | 462k | sum32 += 6; |
848 | 462k | sum32 /= 12; |
849 | 462k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
850 | | |
851 | 462k | _mm_store_si128((__m128i *)dst, row); |
852 | 462k | dst += stride; |
853 | 462k | _mm_store_si128((__m128i *)dst, row); |
854 | 462k | dst += stride; |
855 | 462k | _mm_store_si128((__m128i *)dst, row); |
856 | 462k | dst += stride; |
857 | 462k | _mm_store_si128((__m128i *)dst, row); |
858 | 462k | } |
859 | | |
860 | | void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, |
861 | | const uint16_t *above, |
862 | 245k | const uint16_t *left, int bd) { |
863 | 245k | (void)bd; |
864 | 245k | __m128i sum_left = dc_sum_16(left); |
865 | 245k | __m128i sum_above = dc_sum_8(above); |
866 | 245k | const __m128i zero = _mm_setzero_si128(); |
867 | 245k | sum_left = _mm_unpacklo_epi16(sum_left, zero); |
868 | 245k | sum_above = _mm_unpacklo_epi16(sum_above, zero); |
869 | 245k | const __m128i sum = _mm_add_epi32(sum_left, sum_above); |
870 | 245k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
871 | 245k | sum32 += 12; |
872 | 245k | sum32 /= 24; |
873 | 245k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
874 | 245k | int i; |
875 | 1.22M | for (i = 0; i < 4; ++i) { |
876 | 981k | _mm_store_si128((__m128i *)dst, row); |
877 | 981k | dst += stride; |
878 | 981k | _mm_store_si128((__m128i *)dst, row); |
879 | 981k | dst += stride; |
880 | 981k | _mm_store_si128((__m128i *)dst, row); |
881 | 981k | dst += stride; |
882 | 981k | _mm_store_si128((__m128i *)dst, row); |
883 | 981k | dst += stride; |
884 | 981k | } |
885 | 245k | } |
886 | | |
887 | | void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, |
888 | | const uint16_t *above, |
889 | 417k | const uint16_t *left, int bd) { |
890 | 417k | (void)bd; |
891 | 417k | __m128i sum_left = dc_sum_8(left); |
892 | 417k | __m128i sum_above = dc_sum_16(above); |
893 | 417k | const __m128i zero = _mm_setzero_si128(); |
894 | 417k | sum_left = _mm_unpacklo_epi16(sum_left, zero); |
895 | 417k | sum_above = _mm_unpacklo_epi16(sum_above, zero); |
896 | 417k | const __m128i sum = _mm_add_epi32(sum_left, sum_above); |
897 | 417k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
898 | 417k | sum32 += 12; |
899 | 417k | sum32 /= 24; |
900 | 417k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
901 | 417k | int i; |
902 | 1.25M | for (i = 0; i < 2; ++i) { |
903 | 835k | _mm_store_si128((__m128i *)dst, row); |
904 | 835k | _mm_store_si128((__m128i *)(dst + 8), row); |
905 | 835k | dst += stride; |
906 | 835k | _mm_store_si128((__m128i *)dst, row); |
907 | 835k | _mm_store_si128((__m128i *)(dst + 8), row); |
908 | 835k | dst += stride; |
909 | 835k | _mm_store_si128((__m128i *)dst, row); |
910 | 835k | _mm_store_si128((__m128i *)(dst + 8), row); |
911 | 835k | dst += stride; |
912 | 835k | _mm_store_si128((__m128i *)dst, row); |
913 | 835k | _mm_store_si128((__m128i *)(dst + 8), row); |
914 | 835k | dst += stride; |
915 | 835k | } |
916 | 417k | } |
917 | | |
918 | | void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, |
919 | | const uint16_t *above, |
920 | 179k | const uint16_t *left, int bd) { |
921 | 179k | (void)bd; |
922 | 179k | __m128i sum_left = dc_sum_32(left); |
923 | 179k | __m128i sum_above = dc_sum_16(above); |
924 | 179k | const __m128i zero = _mm_setzero_si128(); |
925 | 179k | sum_above = _mm_unpacklo_epi16(sum_above, zero); |
926 | 179k | const __m128i sum = _mm_add_epi32(sum_left, sum_above); |
927 | 179k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
928 | 179k | sum32 += 24; |
929 | 179k | sum32 /= 48; |
930 | 179k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
931 | 179k | int i; |
932 | 1.61M | for (i = 0; i < 8; ++i) { |
933 | 1.43M | _mm_store_si128((__m128i *)dst, row); |
934 | 1.43M | _mm_store_si128((__m128i *)(dst + 8), row); |
935 | 1.43M | dst += stride; |
936 | 1.43M | _mm_store_si128((__m128i *)dst, row); |
937 | 1.43M | _mm_store_si128((__m128i *)(dst + 8), row); |
938 | 1.43M | dst += stride; |
939 | 1.43M | _mm_store_si128((__m128i *)dst, row); |
940 | 1.43M | _mm_store_si128((__m128i *)(dst + 8), row); |
941 | 1.43M | dst += stride; |
942 | 1.43M | _mm_store_si128((__m128i *)dst, row); |
943 | 1.43M | _mm_store_si128((__m128i *)(dst + 8), row); |
944 | 1.43M | dst += stride; |
945 | 1.43M | } |
946 | 179k | } |
947 | | |
948 | | void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, |
949 | | const uint16_t *above, |
950 | 129k | const uint16_t *left, int bd) { |
951 | 129k | (void)bd; |
952 | 129k | __m128i sum_left = dc_sum_16(left); |
953 | 129k | __m128i sum_above = dc_sum_32(above); |
954 | 129k | const __m128i zero = _mm_setzero_si128(); |
955 | 129k | sum_left = _mm_unpacklo_epi16(sum_left, zero); |
956 | 129k | const __m128i sum = _mm_add_epi32(sum_left, sum_above); |
957 | 129k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
958 | 129k | sum32 += 24; |
959 | 129k | sum32 /= 48; |
960 | 129k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
961 | 129k | int i; |
962 | 646k | for (i = 0; i < 4; ++i) { |
963 | 516k | _mm_store_si128((__m128i *)dst, row); |
964 | 516k | _mm_store_si128((__m128i *)(dst + 8), row); |
965 | 516k | _mm_store_si128((__m128i *)(dst + 16), row); |
966 | 516k | _mm_store_si128((__m128i *)(dst + 24), row); |
967 | 516k | dst += stride; |
968 | 516k | _mm_store_si128((__m128i *)dst, row); |
969 | 516k | _mm_store_si128((__m128i *)(dst + 8), row); |
970 | 516k | _mm_store_si128((__m128i *)(dst + 16), row); |
971 | 516k | _mm_store_si128((__m128i *)(dst + 24), row); |
972 | 516k | dst += stride; |
973 | 516k | _mm_store_si128((__m128i *)dst, row); |
974 | 516k | _mm_store_si128((__m128i *)(dst + 8), row); |
975 | 516k | _mm_store_si128((__m128i *)(dst + 16), row); |
976 | 516k | _mm_store_si128((__m128i *)(dst + 24), row); |
977 | 516k | dst += stride; |
978 | 516k | _mm_store_si128((__m128i *)dst, row); |
979 | 516k | _mm_store_si128((__m128i *)(dst + 8), row); |
980 | 516k | _mm_store_si128((__m128i *)(dst + 16), row); |
981 | 516k | _mm_store_si128((__m128i *)(dst + 24), row); |
982 | 516k | dst += stride; |
983 | 516k | } |
984 | 129k | } |