/src/aom/aom_dsp/x86/highbd_intrapred_sse2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2017, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include <emmintrin.h> |
13 | | |
14 | | #include "config/aom_dsp_rtcd.h" |
15 | | |
16 | | // ----------------------------------------------------------------------------- |
17 | | // H_PRED |
18 | | |
// H_PRED 4x4: output row i is left[i] replicated across all 4 columns.
void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
  // Broadcast each of the 4 left pixels across the low four 16-bit lanes.
  const __m128i r0 = _mm_shufflelo_epi16(l, 0x00);
  const __m128i r1 = _mm_shufflelo_epi16(l, 0x55);
  const __m128i r2 = _mm_shufflelo_epi16(l, 0xaa);
  const __m128i r3 = _mm_shufflelo_epi16(l, 0xff);
  _mm_storel_epi64((__m128i *)(dst + 0 * stride), r0);
  _mm_storel_epi64((__m128i *)(dst + 1 * stride), r1);
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), r2);
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), r3);
}
37 | | |
// H_PRED 4x8: two stacked 4x4 H predictions, fed by left[0..3] and
// left[4..7] respectively.
void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
  aom_highbd_h_predictor_4x4_sse2(dst + 4 * stride, stride, above, left + 4,
                                  bd);
}
46 | | |
// H_PRED 8x4: each row is one left-column pixel broadcast across 8 lanes.
void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  const __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i d0 = _mm_shufflelo_epi16(l, 0x00);
  const __m128i d1 = _mm_shufflelo_epi16(l, 0x55);
  const __m128i d2 = _mm_shufflelo_epi16(l, 0xaa);
  const __m128i d3 = _mm_shufflelo_epi16(l, 0xff);
  // unpacklo duplicates the low 64 bits to fill the full 8-lane row.
  _mm_store_si128((__m128i *)(dst + 0 * stride), _mm_unpacklo_epi64(d0, d0));
  _mm_store_si128((__m128i *)(dst + 1 * stride), _mm_unpacklo_epi64(d1, d1));
  _mm_store_si128((__m128i *)(dst + 2 * stride), _mm_unpacklo_epi64(d2, d2));
  _mm_store_si128((__m128i *)(dst + 3 * stride), _mm_unpacklo_epi64(d3, d3));
}
65 | | |
// H_PRED 8x8: row i is left[i] broadcast across all 8 columns.  Rows 0-3
// come from the low half of the left vector (shufflelo + unpacklo), rows
// 4-7 from the high half (shufflehi + unpackhi).
void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  const __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i lo0 = _mm_shufflelo_epi16(l, 0x00);
  const __m128i lo1 = _mm_shufflelo_epi16(l, 0x55);
  const __m128i lo2 = _mm_shufflelo_epi16(l, 0xaa);
  const __m128i lo3 = _mm_shufflelo_epi16(l, 0xff);
  const __m128i hi4 = _mm_shufflehi_epi16(l, 0x00);
  const __m128i hi5 = _mm_shufflehi_epi16(l, 0x55);
  const __m128i hi6 = _mm_shufflehi_epi16(l, 0xaa);
  const __m128i hi7 = _mm_shufflehi_epi16(l, 0xff);
  _mm_store_si128((__m128i *)(dst + 0 * stride), _mm_unpacklo_epi64(lo0, lo0));
  _mm_store_si128((__m128i *)(dst + 1 * stride), _mm_unpacklo_epi64(lo1, lo1));
  _mm_store_si128((__m128i *)(dst + 2 * stride), _mm_unpacklo_epi64(lo2, lo2));
  _mm_store_si128((__m128i *)(dst + 3 * stride), _mm_unpacklo_epi64(lo3, lo3));
  _mm_store_si128((__m128i *)(dst + 4 * stride), _mm_unpackhi_epi64(hi4, hi4));
  _mm_store_si128((__m128i *)(dst + 5 * stride), _mm_unpackhi_epi64(hi5, hi5));
  _mm_store_si128((__m128i *)(dst + 6 * stride), _mm_unpackhi_epi64(hi6, hi6));
  _mm_store_si128((__m128i *)(dst + 7 * stride), _mm_unpackhi_epi64(hi7, hi7));
}
96 | | |
// H_PRED 8x16: two stacked 8x8 H predictions over left[0..7] / left[8..15].
void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
  aom_highbd_h_predictor_8x8_sse2(dst + 8 * stride, stride, above, left + 8,
                                  bd);
}
105 | | |
// Stores one 16-wide row built from the low 64 bits of *row (4 pixels
// duplicated into all 8 lanes), then advances *dst by one row.
static inline void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
                                       const __m128i *row) {
  const __m128i val = _mm_unpacklo_epi64(*row, *row);
  _mm_store_si128((__m128i *)*dst, val);
  _mm_store_si128((__m128i *)(*dst + 8), val);
  *dst += stride;
}
113 | | |
// Same as h_store_16_unpacklo but replicates the HIGH 64 bits of *row
// across the 16-pixel row before storing and advancing *dst.
static inline void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
                                       const __m128i *row) {
  const __m128i val = _mm_unpackhi_epi64(*row, *row);
  _mm_store_si128((__m128i *)(*dst), val);
  _mm_store_si128((__m128i *)(*dst + 8), val);
  *dst += stride;
}
121 | | |
// Core H predictor for a 16x8 tile: row i is left[i] broadcast across 16
// columns.  Rows 0-3 are produced from the low half of the left vector
// (shufflelo immediates 0x0/0x55/0xaa/0xff select lanes 0-3), rows 4-7
// from the high half via shufflehi; the store helpers pick the matching
// 64-bit half.
static inline void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *left) {
  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
  const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
  const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
  const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
  const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
  h_store_16_unpacklo(&dst, stride, &row0);
  h_store_16_unpacklo(&dst, stride, &row1);
  h_store_16_unpacklo(&dst, stride, &row2);
  h_store_16_unpacklo(&dst, stride, &row3);
  h_store_16_unpackhi(&dst, stride, &row4);
  h_store_16_unpackhi(&dst, stride, &row5);
  h_store_16_unpackhi(&dst, stride, &row6);
  h_store_16_unpackhi(&dst, stride, &row7);
}
142 | | |
// H_PRED 16x8: thin wrapper over the shared 16x8 core.
void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  (void)bd;
  (void)above;
  h_predictor_16x8(dst, stride, left);
}
150 | | |
// H_PRED 16x16: two stacked 16x8 tiles.
void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  h_predictor_16x8(dst, stride, left);
  h_predictor_16x8(dst + 8 * stride, stride, left + 8);
}
163 | | |
// H_PRED 16x32: four stacked 16x8 tiles walking down the left column.
void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  for (int j = 0; j < 32; j += 8) {
    h_predictor_16x8(dst, stride, left + j);
    dst += 8 * stride;
  }
}
176 | | |
// Stores one 32-wide row built from the low 64 bits of *row (4 pixels
// duplicated into all 8 lanes, written four times), then advances *dst.
static inline void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
                                       const __m128i *row) {
  const __m128i val = _mm_unpacklo_epi64(*row, *row);
  _mm_store_si128((__m128i *)(*dst), val);
  _mm_store_si128((__m128i *)(*dst + 8), val);
  _mm_store_si128((__m128i *)(*dst + 16), val);
  _mm_store_si128((__m128i *)(*dst + 24), val);
  *dst += stride;
}
186 | | |
// Same as h_store_32_unpacklo but replicates the HIGH 64 bits of *row
// across the 32-pixel row before storing and advancing *dst.
static inline void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
                                       const __m128i *row) {
  const __m128i val = _mm_unpackhi_epi64(*row, *row);
  _mm_store_si128((__m128i *)(*dst), val);
  _mm_store_si128((__m128i *)(*dst + 8), val);
  _mm_store_si128((__m128i *)(*dst + 16), val);
  _mm_store_si128((__m128i *)(*dst + 24), val);
  *dst += stride;
}
196 | | |
// Core H predictor for a 32x8 tile: row i is left[i] broadcast across 32
// columns.  Mirrors h_predictor_16x8 but uses the 32-wide store helpers.
static inline void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *left) {
  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
  const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
  const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
  const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
  const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
  h_store_32_unpacklo(&dst, stride, &row0);
  h_store_32_unpacklo(&dst, stride, &row1);
  h_store_32_unpacklo(&dst, stride, &row2);
  h_store_32_unpacklo(&dst, stride, &row3);
  h_store_32_unpackhi(&dst, stride, &row4);
  h_store_32_unpackhi(&dst, stride, &row5);
  h_store_32_unpackhi(&dst, stride, &row6);
  h_store_32_unpackhi(&dst, stride, &row7);
}
217 | | |
// H_PRED 32x16: two stacked 32x8 tiles.
void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  h_predictor_32x8(dst, stride, left);
  h_predictor_32x8(dst + 8 * stride, stride, left + 8);
}
230 | | |
// H_PRED 32x32: four stacked 32x8 tiles walking down the left column.
void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  for (int j = 0; j < 32; j += 8) {
    h_predictor_32x8(dst, stride, left + j);
    dst += 8 * stride;
  }
}
243 | | |
244 | | // ----------------------------------------------------------------------------- |
245 | | // DC_TOP, DC_LEFT, DC_128 |
246 | | |
247 | | // 4x4 |
248 | | |
// Sums 4 uint16 values from ref via shuffle/add reduction; the 16-bit
// total lands in lane 0 of the result.
static inline __m128i dc_sum_4(const uint16_t *ref) {
  const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref);
  const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);  // move lanes 2,3 down
  const __m128i a = _mm_add_epi16(_dcba, _xxdc);  // lane0=r0+r2, lane1=r1+r3
  return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));  // lane0 = full sum
}
255 | | |
// Broadcasts the DC value in lane 0 of *dc across 4 lanes and writes it to
// all 4 rows of a 4x4 block.
static inline void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
                                const __m128i *dc) {
  const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
  int i;
  for (i = 0; i < 4; ++i, dst += stride) {
    _mm_storel_epi64((__m128i *)dst, dc_dup);
  }
}
264 | | |
265 | | void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, |
266 | | const uint16_t *above, |
267 | 58.7k | const uint16_t *left, int bd) { |
268 | 58.7k | const __m128i two = _mm_cvtsi32_si128(2); |
269 | 58.7k | const __m128i sum = dc_sum_4(left); |
270 | 58.7k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); |
271 | 58.7k | (void)above; |
272 | 58.7k | (void)bd; |
273 | 58.7k | dc_store_4x4(dst, stride, &dc); |
274 | 58.7k | } |
275 | | |
276 | | void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, |
277 | | const uint16_t *above, |
278 | 871k | const uint16_t *left, int bd) { |
279 | 871k | const __m128i two = _mm_cvtsi32_si128(2); |
280 | 871k | const __m128i sum = dc_sum_4(above); |
281 | 871k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); |
282 | 871k | (void)left; |
283 | 871k | (void)bd; |
284 | 871k | dc_store_4x4(dst, stride, &dc); |
285 | 871k | } |
286 | | |
// DC_128 4x4: fills the block with the mid-range value 1 << (bd - 1).
void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  (void)above;
  (void)left;
  const __m128i half = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc = _mm_shufflelo_epi16(half, 0x0);
  dc_store_4x4(dst, stride, &dc);
}
296 | | |
297 | | // ----------------------------------------------------------------------------- |
298 | | // 4x8 |
299 | | |
// Broadcasts the DC value in lane 0 of *dc across 4 lanes and writes it to
// all 8 rows of a 4x8 block.
static inline void dc_store_4x8(uint16_t *dst, ptrdiff_t stride,
                                const __m128i *dc) {
  const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
  int i;
  for (i = 0; i < 8; ++i, dst += stride) {
    _mm_storel_epi64((__m128i *)dst, dc_dup);
  }
}
308 | | |
309 | | // Shared with DC 8xh |
// Sums 8 uint16 values from ref (16-byte aligned load); halves are folded
// together, then reduced as in dc_sum_4.  Total lands in lane 0.
static inline __m128i dc_sum_8(const uint16_t *ref) {
  const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref);
  const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8));
  const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
  const __m128i a = _mm_add_epi16(_dcba, _xxdc);

  return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
}
318 | | |
319 | | void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, |
320 | | const uint16_t *above, |
321 | 5.14k | const uint16_t *left, int bd) { |
322 | 5.14k | const __m128i sum = dc_sum_8(left); |
323 | 5.14k | const __m128i four = _mm_cvtsi32_si128(4); |
324 | 5.14k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); |
325 | 5.14k | (void)above; |
326 | 5.14k | (void)bd; |
327 | 5.14k | dc_store_4x8(dst, stride, &dc); |
328 | 5.14k | } |
329 | | |
330 | | void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, |
331 | | const uint16_t *above, |
332 | 3.81k | const uint16_t *left, int bd) { |
333 | 3.81k | const __m128i two = _mm_cvtsi32_si128(2); |
334 | 3.81k | const __m128i sum = dc_sum_4(above); |
335 | 3.81k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); |
336 | 3.81k | (void)left; |
337 | 3.81k | (void)bd; |
338 | 3.81k | dc_store_4x8(dst, stride, &dc); |
339 | 3.81k | } |
340 | | |
// DC_128 4x8: fills the block with the mid-range value 1 << (bd - 1).
void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  (void)above;
  (void)left;
  const __m128i half = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc = _mm_shufflelo_epi16(half, 0x0);
  dc_store_4x8(dst, stride, &dc);
}
350 | | |
351 | | // ----------------------------------------------------------------------------- |
352 | | // 8xh |
353 | | |
// Broadcasts the DC value in lane 0 of *dc across all 8 lanes and writes
// it to `height` rows of an 8-wide block.
static inline void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height,
                                const __m128i *dc) {
  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
  int i;
  for (i = 0; i < height; ++i, dst += stride) {
    _mm_store_si128((__m128i *)dst, dc_dup);
  }
}
363 | | |
364 | | // ----------------------------------------------------------------------------- |
365 | | // DC_TOP |
366 | | |
// Shared DC_TOP for 8-wide blocks: dc = (sum(above[0..7]) + 4) >> 3,
// written to `height` rows.
static inline void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
                                        int height, const uint16_t *above) {
  const __m128i four = _mm_cvtsi32_si128(4);
  const __m128i sum = dc_sum_8(above);
  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
  dc_store_8xh(dst, stride, height, &dc);
}
374 | | |
// DC_TOP 8x4: shared 8-wide top predictor with height 4.
void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  (void)bd;
  (void)left;
  dc_top_predictor_8xh(dst, stride, 4, above);
}
382 | | |
// DC_TOP 8x8: shared 8-wide top predictor with height 8.
void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  (void)bd;
  (void)left;
  dc_top_predictor_8xh(dst, stride, 8, above);
}
390 | | |
// DC_TOP 8x16: shared 8-wide top predictor with height 16.
void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  (void)bd;
  (void)left;
  dc_top_predictor_8xh(dst, stride, 16, above);
}
398 | | |
399 | | // ----------------------------------------------------------------------------- |
400 | | // DC_LEFT |
401 | | |
402 | | void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, |
403 | | const uint16_t *above, |
404 | 5.36k | const uint16_t *left, int bd) { |
405 | 5.36k | const __m128i two = _mm_cvtsi32_si128(2); |
406 | 5.36k | const __m128i sum = dc_sum_4(left); |
407 | 5.36k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); |
408 | 5.36k | (void)above; |
409 | 5.36k | (void)bd; |
410 | 5.36k | dc_store_8xh(dst, stride, 4, &dc); |
411 | 5.36k | } |
412 | | |
413 | | void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, |
414 | | const uint16_t *above, |
415 | 22.2k | const uint16_t *left, int bd) { |
416 | 22.2k | const __m128i four = _mm_cvtsi32_si128(4); |
417 | 22.2k | const __m128i sum = dc_sum_8(left); |
418 | 22.2k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); |
419 | 22.2k | (void)above; |
420 | 22.2k | (void)bd; |
421 | 22.2k | dc_store_8xh(dst, stride, 8, &dc); |
422 | 22.2k | } |
423 | | |
424 | | // Shared with DC 16xh |
// Sums 16 uint16 values as two 8-element partial sums.  Result stays in
// 16-bit lanes; callers must widen (see dc_sum_32) when overflow is
// possible.
static inline __m128i dc_sum_16(const uint16_t *ref) {
  const __m128i sum_lo = dc_sum_8(ref);
  const __m128i sum_hi = dc_sum_8(ref + 8);
  return _mm_add_epi16(sum_lo, sum_hi);
}
430 | | |
431 | | void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, |
432 | | const uint16_t *above, |
433 | 7.03k | const uint16_t *left, int bd) { |
434 | 7.03k | const __m128i eight = _mm_cvtsi32_si128(8); |
435 | 7.03k | const __m128i sum = dc_sum_16(left); |
436 | 7.03k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
437 | 7.03k | (void)above; |
438 | 7.03k | (void)bd; |
439 | 7.03k | dc_store_8xh(dst, stride, 16, &dc); |
440 | 7.03k | } |
441 | | |
442 | | // ----------------------------------------------------------------------------- |
443 | | // DC_128 |
444 | | |
// Shared DC_128 for 8-wide blocks: fills `height` rows with the mid-range
// value 1 << (bd - 1).
static inline void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
                                        int height, int bd) {
  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  dc_store_8xh(dst, stride, height, &dc_dup);
}
451 | | |
// DC_128 8x4: shared 8-wide mid-range predictor with height 4.
void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  (void)left;
  (void)above;
  dc_128_predictor_8xh(dst, stride, 4, bd);
}
459 | | |
// DC_128 8x8: shared 8-wide mid-range predictor with height 8.
void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  (void)left;
  (void)above;
  dc_128_predictor_8xh(dst, stride, 8, bd);
}
467 | | |
// DC_128 8x16: shared 8-wide mid-range predictor with height 16.
void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  (void)left;
  (void)above;
  dc_128_predictor_8xh(dst, stride, 16, bd);
}
475 | | |
476 | | // ----------------------------------------------------------------------------- |
477 | | // 16xh |
478 | | |
// Broadcasts the DC value in lane 0 of *dc across all 8 lanes and writes
// two 8-lane stores per row for `height` rows of a 16-wide block.
static inline void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height,
                                 const __m128i *dc) {
  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
  int i;
  for (i = 0; i < height; ++i, dst += stride) {
    _mm_store_si128((__m128i *)dst, dc_dup);
    _mm_store_si128((__m128i *)(dst + 8), dc_dup);
  }
}
489 | | |
490 | | // ----------------------------------------------------------------------------- |
491 | | // DC_LEFT |
492 | | |
493 | | void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, |
494 | | const uint16_t *above, |
495 | 9.33k | const uint16_t *left, int bd) { |
496 | 9.33k | const __m128i four = _mm_cvtsi32_si128(4); |
497 | 9.33k | const __m128i sum = dc_sum_8(left); |
498 | 9.33k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); |
499 | 9.33k | (void)above; |
500 | 9.33k | (void)bd; |
501 | 9.33k | dc_store_16xh(dst, stride, 8, &dc); |
502 | 9.33k | } |
503 | | |
504 | | void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, |
505 | | const uint16_t *above, |
506 | 20.0k | const uint16_t *left, int bd) { |
507 | 20.0k | const __m128i eight = _mm_cvtsi32_si128(8); |
508 | 20.0k | const __m128i sum = dc_sum_16(left); |
509 | 20.0k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
510 | 20.0k | (void)above; |
511 | 20.0k | (void)bd; |
512 | 20.0k | dc_store_16xh(dst, stride, 16, &dc); |
513 | 20.0k | } |
514 | | |
515 | | // Shared with 32xh |
// Sums 32 uint16 values.  Partial 16-element sums are widened to 32-bit
// lanes before the final add; per the original comment, a 12-bit bd can
// overflow 16 bits at this width.  Total lands in 32-bit lane 0.
static inline __m128i dc_sum_32(const uint16_t *ref) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i sum_a = dc_sum_16(ref);
  const __m128i sum_b = dc_sum_16(ref + 16);
  // 12 bit bd will outrange, so expand to 32 bit before adding final total
  return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero),
                       _mm_unpacklo_epi16(sum_b, zero));
}
524 | | |
525 | | void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, |
526 | | const uint16_t *above, |
527 | 3.46k | const uint16_t *left, int bd) { |
528 | 3.46k | const __m128i sixteen = _mm_cvtsi32_si128(16); |
529 | 3.46k | const __m128i sum = dc_sum_32(left); |
530 | 3.46k | const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); |
531 | 3.46k | (void)above; |
532 | 3.46k | (void)bd; |
533 | 3.46k | dc_store_16xh(dst, stride, 32, &dc); |
534 | 3.46k | } |
535 | | |
536 | | // ----------------------------------------------------------------------------- |
537 | | // DC_TOP |
538 | | |
539 | | void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, |
540 | | const uint16_t *above, |
541 | 4.38k | const uint16_t *left, int bd) { |
542 | 4.38k | const __m128i eight = _mm_cvtsi32_si128(8); |
543 | 4.38k | const __m128i sum = dc_sum_16(above); |
544 | 4.38k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
545 | 4.38k | (void)left; |
546 | 4.38k | (void)bd; |
547 | 4.38k | dc_store_16xh(dst, stride, 8, &dc); |
548 | 4.38k | } |
549 | | |
550 | | void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, |
551 | | const uint16_t *above, |
552 | 8.93k | const uint16_t *left, int bd) { |
553 | 8.93k | const __m128i eight = _mm_cvtsi32_si128(8); |
554 | 8.93k | const __m128i sum = dc_sum_16(above); |
555 | 8.93k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
556 | 8.93k | (void)left; |
557 | 8.93k | (void)bd; |
558 | 8.93k | dc_store_16xh(dst, stride, 16, &dc); |
559 | 8.93k | } |
560 | | |
561 | | void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, |
562 | | const uint16_t *above, |
563 | 6.33k | const uint16_t *left, int bd) { |
564 | 6.33k | const __m128i eight = _mm_cvtsi32_si128(8); |
565 | 6.33k | const __m128i sum = dc_sum_16(above); |
566 | 6.33k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
567 | 6.33k | (void)left; |
568 | 6.33k | (void)bd; |
569 | 6.33k | dc_store_16xh(dst, stride, 32, &dc); |
570 | 6.33k | } |
571 | | |
572 | | // ----------------------------------------------------------------------------- |
573 | | // DC_128 |
574 | | |
// DC_128 16x8: fills the block with the mid-range value 1 << (bd - 1).
void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  (void)above;
  (void)left;
  const __m128i half = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc = _mm_shufflelo_epi16(half, 0x0);
  dc_store_16xh(dst, stride, 8, &dc);
}
584 | | |
// DC_128 16x16: fills the block with the mid-range value 1 << (bd - 1).
void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  (void)above;
  (void)left;
  const __m128i half = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc = _mm_shufflelo_epi16(half, 0x0);
  dc_store_16xh(dst, stride, 16, &dc);
}
594 | | |
// DC_128 16x32: fills the block with the mid-range value 1 << (bd - 1).
void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  (void)above;
  (void)left;
  const __m128i half = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc = _mm_shufflelo_epi16(half, 0x0);
  dc_store_16xh(dst, stride, 32, &dc);
}
604 | | |
605 | | // ----------------------------------------------------------------------------- |
606 | | // 32xh |
607 | | |
608 | | static inline void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height, |
609 | 136k | const __m128i *dc) { |
610 | 136k | const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); |
611 | 136k | const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); |
612 | 136k | int i; |
613 | 4.24M | for (i = 0; i < height; ++i, dst += stride) { |
614 | 4.11M | _mm_store_si128((__m128i *)dst, dc_dup); |
615 | 4.11M | _mm_store_si128((__m128i *)(dst + 8), dc_dup); |
616 | 4.11M | _mm_store_si128((__m128i *)(dst + 16), dc_dup); |
617 | 4.11M | _mm_store_si128((__m128i *)(dst + 24), dc_dup); |
618 | 4.11M | } |
619 | 136k | } |
620 | | |
621 | | void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, |
622 | | const uint16_t *above, |
623 | 5.28k | const uint16_t *left, int bd) { |
624 | 5.28k | const __m128i eight = _mm_cvtsi32_si128(8); |
625 | 5.28k | const __m128i sum = dc_sum_16(left); |
626 | 5.28k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
627 | 5.28k | (void)above; |
628 | 5.28k | (void)bd; |
629 | 5.28k | dc_store_32xh(dst, stride, 16, &dc); |
630 | 5.28k | } |
631 | | |
632 | | void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, |
633 | | const uint16_t *above, |
634 | 64.9k | const uint16_t *left, int bd) { |
635 | 64.9k | const __m128i sixteen = _mm_cvtsi32_si128(16); |
636 | 64.9k | const __m128i sum = dc_sum_32(left); |
637 | 64.9k | const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); |
638 | 64.9k | (void)above; |
639 | 64.9k | (void)bd; |
640 | 64.9k | dc_store_32xh(dst, stride, 32, &dc); |
641 | 64.9k | } |
642 | | |
643 | | void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, |
644 | | const uint16_t *above, |
645 | 3.74k | const uint16_t *left, int bd) { |
646 | 3.74k | const __m128i sixteen = _mm_cvtsi32_si128(16); |
647 | 3.74k | const __m128i sum = dc_sum_32(above); |
648 | 3.74k | const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); |
649 | 3.74k | (void)left; |
650 | 3.74k | (void)bd; |
651 | 3.74k | dc_store_32xh(dst, stride, 16, &dc); |
652 | 3.74k | } |
653 | | |
// DC_128 32x16: fills the block with the mid-range value 1 << (bd - 1).
void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  (void)above;
  (void)left;
  const __m128i half = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc = _mm_shufflelo_epi16(half, 0x0);
  dc_store_32xh(dst, stride, 16, &dc);
}
663 | | |
664 | | void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, |
665 | | const uint16_t *above, |
666 | 45.6k | const uint16_t *left, int bd) { |
667 | 45.6k | const __m128i sixteen = _mm_cvtsi32_si128(16); |
668 | 45.6k | const __m128i sum = dc_sum_32(above); |
669 | 45.6k | const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); |
670 | 45.6k | (void)left; |
671 | 45.6k | (void)bd; |
672 | 45.6k | dc_store_32xh(dst, stride, 32, &dc); |
673 | 45.6k | } |
674 | | |
// DC_128 32x32: fills the block with the mid-range value 1 << (bd - 1).
void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  (void)above;
  (void)left;
  const __m128i half = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc = _mm_shufflelo_epi16(half, 0x0);
  dc_store_32xh(dst, stride, 32, &dc);
}
684 | | |
685 | | // ----------------------------------------------------------------------------- |
686 | | // V_PRED |
687 | | |
688 | | void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, |
689 | | const uint16_t *above, |
690 | 25.7k | const uint16_t *left, int bd) { |
691 | 25.7k | (void)left; |
692 | 25.7k | (void)bd; |
693 | 25.7k | const __m128i above_u16 = _mm_loadl_epi64((const __m128i *)above); |
694 | 25.7k | int i; |
695 | 77.3k | for (i = 0; i < 2; ++i) { |
696 | 51.5k | _mm_storel_epi64((__m128i *)dst, above_u16); |
697 | 51.5k | _mm_storel_epi64((__m128i *)(dst + stride), above_u16); |
698 | 51.5k | _mm_storel_epi64((__m128i *)(dst + 2 * stride), above_u16); |
699 | 51.5k | _mm_storel_epi64((__m128i *)(dst + 3 * stride), above_u16); |
700 | 51.5k | dst += stride << 2; |
701 | 51.5k | } |
702 | 25.7k | } |
703 | | |
// V_PRED 8x4: copies the 8 above pixels into every one of the 4 rows.
void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  (void)left;
  (void)bd;
  const __m128i row = _mm_load_si128((const __m128i *)above);
  for (int r = 0; r < 4; ++r) {
    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
  }
}
715 | | |
void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  // V_PRED for 8x16: copy the 8 above samples into each of the 16 rows.
  // `above` and `dst` must be 16-byte aligned (aligned load/store).
  (void)left;
  (void)bd;
  const __m128i row = _mm_load_si128((const __m128i *)above);
  for (int r = 0; r < 16; ++r) {
    _mm_store_si128((__m128i *)(dst + r * stride), row);
  }
}
731 | | |
void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  // V_PRED for 16x8: copy the 16 above samples (two 8-lane vectors) into
  // each of the 8 rows. Buffers must be 16-byte aligned.
  (void)left;
  (void)bd;
  const __m128i lo = _mm_load_si128((const __m128i *)above);
  const __m128i hi = _mm_load_si128((const __m128i *)(above + 8));
  for (int r = 0; r < 8; ++r) {
    _mm_store_si128((__m128i *)dst, lo);
    _mm_store_si128((__m128i *)(dst + 8), hi);
    dst += stride;
  }
}
755 | | |
void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  // V_PRED for 16x32: copy the 16 above samples (two 8-lane vectors) into
  // each of the 32 rows. Buffers must be 16-byte aligned.
  (void)left;
  (void)bd;
  const __m128i lo = _mm_load_si128((const __m128i *)above);
  const __m128i hi = _mm_load_si128((const __m128i *)(above + 8));
  for (int r = 0; r < 32; ++r) {
    _mm_store_si128((__m128i *)dst, lo);
    _mm_store_si128((__m128i *)(dst + 8), hi);
    dst += stride;
  }
}
779 | | |
void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  // V_PRED for 32x16: copy the 32 above samples (four 8-lane vectors) into
  // each of the 16 rows. Buffers must be 16-byte aligned.
  (void)left;
  (void)bd;
  const __m128i a0 = _mm_load_si128((const __m128i *)above);
  const __m128i a1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i a2 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i a3 = _mm_load_si128((const __m128i *)(above + 24));
  for (int r = 0; r < 16; ++r) {
    _mm_store_si128((__m128i *)dst, a0);
    _mm_store_si128((__m128i *)(dst + 8), a1);
    _mm_store_si128((__m128i *)(dst + 16), a2);
    _mm_store_si128((__m128i *)(dst + 24), a3);
    dst += stride;
  }
}
813 | | |
814 | | // ----------------------------------------------------------------------------- |
815 | | // DC_PRED |
816 | | |
817 | | void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, |
818 | | const uint16_t *above, |
819 | 234k | const uint16_t *left, int bd) { |
820 | 234k | (void)bd; |
821 | 234k | const __m128i sum_above = dc_sum_4(above); |
822 | 234k | const __m128i sum_left = dc_sum_8(left); |
823 | 234k | const __m128i sum = _mm_add_epi16(sum_above, sum_left); |
824 | 234k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
825 | 234k | sum32 >>= 16; |
826 | 234k | sum32 += 6; |
827 | 234k | sum32 /= 12; |
828 | 234k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
829 | 234k | int i; |
830 | 1.17M | for (i = 0; i < 4; ++i) { |
831 | 936k | _mm_storel_epi64((__m128i *)dst, row); |
832 | 936k | dst += stride; |
833 | 936k | _mm_storel_epi64((__m128i *)dst, row); |
834 | 936k | dst += stride; |
835 | 936k | } |
836 | 234k | } |
837 | | |
838 | | void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, |
839 | | const uint16_t *above, |
840 | 488k | const uint16_t *left, int bd) { |
841 | 488k | (void)bd; |
842 | 488k | const __m128i sum_left = dc_sum_4(left); |
843 | 488k | const __m128i sum_above = dc_sum_8(above); |
844 | 488k | const __m128i sum = _mm_add_epi16(sum_above, sum_left); |
845 | 488k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
846 | 488k | sum32 >>= 16; |
847 | 488k | sum32 += 6; |
848 | 488k | sum32 /= 12; |
849 | 488k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
850 | | |
851 | 488k | _mm_store_si128((__m128i *)dst, row); |
852 | 488k | dst += stride; |
853 | 488k | _mm_store_si128((__m128i *)dst, row); |
854 | 488k | dst += stride; |
855 | 488k | _mm_store_si128((__m128i *)dst, row); |
856 | 488k | dst += stride; |
857 | 488k | _mm_store_si128((__m128i *)dst, row); |
858 | 488k | } |
859 | | |
860 | | void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, |
861 | | const uint16_t *above, |
862 | 214k | const uint16_t *left, int bd) { |
863 | 214k | (void)bd; |
864 | 214k | __m128i sum_left = dc_sum_16(left); |
865 | 214k | __m128i sum_above = dc_sum_8(above); |
866 | 214k | const __m128i zero = _mm_setzero_si128(); |
867 | 214k | sum_left = _mm_unpacklo_epi16(sum_left, zero); |
868 | 214k | sum_above = _mm_unpacklo_epi16(sum_above, zero); |
869 | 214k | const __m128i sum = _mm_add_epi32(sum_left, sum_above); |
870 | 214k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
871 | 214k | sum32 += 12; |
872 | 214k | sum32 /= 24; |
873 | 214k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
874 | 214k | int i; |
875 | 1.07M | for (i = 0; i < 4; ++i) { |
876 | 859k | _mm_store_si128((__m128i *)dst, row); |
877 | 859k | dst += stride; |
878 | 859k | _mm_store_si128((__m128i *)dst, row); |
879 | 859k | dst += stride; |
880 | 859k | _mm_store_si128((__m128i *)dst, row); |
881 | 859k | dst += stride; |
882 | 859k | _mm_store_si128((__m128i *)dst, row); |
883 | 859k | dst += stride; |
884 | 859k | } |
885 | 214k | } |
886 | | |
887 | | void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, |
888 | | const uint16_t *above, |
889 | 386k | const uint16_t *left, int bd) { |
890 | 386k | (void)bd; |
891 | 386k | __m128i sum_left = dc_sum_8(left); |
892 | 386k | __m128i sum_above = dc_sum_16(above); |
893 | 386k | const __m128i zero = _mm_setzero_si128(); |
894 | 386k | sum_left = _mm_unpacklo_epi16(sum_left, zero); |
895 | 386k | sum_above = _mm_unpacklo_epi16(sum_above, zero); |
896 | 386k | const __m128i sum = _mm_add_epi32(sum_left, sum_above); |
897 | 386k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
898 | 386k | sum32 += 12; |
899 | 386k | sum32 /= 24; |
900 | 386k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
901 | 386k | int i; |
902 | 1.16M | for (i = 0; i < 2; ++i) { |
903 | 773k | _mm_store_si128((__m128i *)dst, row); |
904 | 773k | _mm_store_si128((__m128i *)(dst + 8), row); |
905 | 773k | dst += stride; |
906 | 773k | _mm_store_si128((__m128i *)dst, row); |
907 | 773k | _mm_store_si128((__m128i *)(dst + 8), row); |
908 | 773k | dst += stride; |
909 | 773k | _mm_store_si128((__m128i *)dst, row); |
910 | 773k | _mm_store_si128((__m128i *)(dst + 8), row); |
911 | 773k | dst += stride; |
912 | 773k | _mm_store_si128((__m128i *)dst, row); |
913 | 773k | _mm_store_si128((__m128i *)(dst + 8), row); |
914 | 773k | dst += stride; |
915 | 773k | } |
916 | 386k | } |
917 | | |
918 | | void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, |
919 | | const uint16_t *above, |
920 | 163k | const uint16_t *left, int bd) { |
921 | 163k | (void)bd; |
922 | 163k | __m128i sum_left = dc_sum_32(left); |
923 | 163k | __m128i sum_above = dc_sum_16(above); |
924 | 163k | const __m128i zero = _mm_setzero_si128(); |
925 | 163k | sum_above = _mm_unpacklo_epi16(sum_above, zero); |
926 | 163k | const __m128i sum = _mm_add_epi32(sum_left, sum_above); |
927 | 163k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
928 | 163k | sum32 += 24; |
929 | 163k | sum32 /= 48; |
930 | 163k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
931 | 163k | int i; |
932 | 1.47M | for (i = 0; i < 8; ++i) { |
933 | 1.31M | _mm_store_si128((__m128i *)dst, row); |
934 | 1.31M | _mm_store_si128((__m128i *)(dst + 8), row); |
935 | 1.31M | dst += stride; |
936 | 1.31M | _mm_store_si128((__m128i *)dst, row); |
937 | 1.31M | _mm_store_si128((__m128i *)(dst + 8), row); |
938 | 1.31M | dst += stride; |
939 | 1.31M | _mm_store_si128((__m128i *)dst, row); |
940 | 1.31M | _mm_store_si128((__m128i *)(dst + 8), row); |
941 | 1.31M | dst += stride; |
942 | 1.31M | _mm_store_si128((__m128i *)dst, row); |
943 | 1.31M | _mm_store_si128((__m128i *)(dst + 8), row); |
944 | 1.31M | dst += stride; |
945 | 1.31M | } |
946 | 163k | } |
947 | | |
948 | | void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, |
949 | | const uint16_t *above, |
950 | 141k | const uint16_t *left, int bd) { |
951 | 141k | (void)bd; |
952 | 141k | __m128i sum_left = dc_sum_16(left); |
953 | 141k | __m128i sum_above = dc_sum_32(above); |
954 | 141k | const __m128i zero = _mm_setzero_si128(); |
955 | 141k | sum_left = _mm_unpacklo_epi16(sum_left, zero); |
956 | 141k | const __m128i sum = _mm_add_epi32(sum_left, sum_above); |
957 | 141k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
958 | 141k | sum32 += 24; |
959 | 141k | sum32 /= 48; |
960 | 141k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
961 | 141k | int i; |
962 | 706k | for (i = 0; i < 4; ++i) { |
963 | 565k | _mm_store_si128((__m128i *)dst, row); |
964 | 565k | _mm_store_si128((__m128i *)(dst + 8), row); |
965 | 565k | _mm_store_si128((__m128i *)(dst + 16), row); |
966 | 565k | _mm_store_si128((__m128i *)(dst + 24), row); |
967 | 565k | dst += stride; |
968 | 565k | _mm_store_si128((__m128i *)dst, row); |
969 | 565k | _mm_store_si128((__m128i *)(dst + 8), row); |
970 | 565k | _mm_store_si128((__m128i *)(dst + 16), row); |
971 | 565k | _mm_store_si128((__m128i *)(dst + 24), row); |
972 | 565k | dst += stride; |
973 | 565k | _mm_store_si128((__m128i *)dst, row); |
974 | 565k | _mm_store_si128((__m128i *)(dst + 8), row); |
975 | 565k | _mm_store_si128((__m128i *)(dst + 16), row); |
976 | 565k | _mm_store_si128((__m128i *)(dst + 24), row); |
977 | 565k | dst += stride; |
978 | 565k | _mm_store_si128((__m128i *)dst, row); |
979 | 565k | _mm_store_si128((__m128i *)(dst + 8), row); |
980 | 565k | _mm_store_si128((__m128i *)(dst + 16), row); |
981 | 565k | _mm_store_si128((__m128i *)(dst + 24), row); |
982 | 565k | dst += stride; |
983 | 565k | } |
984 | 141k | } |