/src/aom/aom_dsp/x86/highbd_intrapred_sse2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2017, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include <emmintrin.h> |
13 | | |
14 | | #include "config/aom_dsp_rtcd.h" |
15 | | |
16 | | // ----------------------------------------------------------------------------- |
17 | | // H_PRED |
18 | | |
void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // H_PRED: each output row is the corresponding left-neighbor sample
  // broadcast across all 4 columns. The above row and bit depth are unused.
  const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
  __m128i rows[4];
  (void)above;
  (void)bd;
  rows[0] = _mm_shufflelo_epi16(left_u16, 0x0);
  rows[1] = _mm_shufflelo_epi16(left_u16, 0x55);
  rows[2] = _mm_shufflelo_epi16(left_u16, 0xaa);
  rows[3] = _mm_shufflelo_epi16(left_u16, 0xff);
  for (int r = 0; r < 4; ++r) {
    _mm_storel_epi64((__m128i *)dst, rows[r]);
    dst += stride;
  }
}
37 | | |
void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // Two stacked 4x4 horizontal predictions, each fed by its own 4 left
  // samples.
  aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
  aom_highbd_h_predictor_4x4_sse2(dst + 4 * stride, stride, above, left + 4,
                                  bd);
}
46 | | |
void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // H_PRED: broadcast left[r] across all 8 columns of row r.
  // NOTE(review): loads 8 left samples although only 4 are used; assumes the
  // caller's left buffer is padded/aligned, as elsewhere in this file.
  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
  __m128i rows[4];
  (void)above;
  (void)bd;
  rows[0] = _mm_shufflelo_epi16(left_u16, 0x0);
  rows[1] = _mm_shufflelo_epi16(left_u16, 0x55);
  rows[2] = _mm_shufflelo_epi16(left_u16, 0xaa);
  rows[3] = _mm_shufflelo_epi16(left_u16, 0xff);
  for (int r = 0; r < 4; ++r) {
    _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(rows[r], rows[r]));
    dst += stride;
  }
}
65 | | |
void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // H_PRED: row r gets left[r] replicated across 8 columns. Rows 0-3 come
  // from the low half of the left vector, rows 4-7 from the high half.
  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
  __m128i lo[4], hi[4];
  (void)above;
  (void)bd;
  lo[0] = _mm_shufflelo_epi16(left_u16, 0x0);
  lo[1] = _mm_shufflelo_epi16(left_u16, 0x55);
  lo[2] = _mm_shufflelo_epi16(left_u16, 0xaa);
  lo[3] = _mm_shufflelo_epi16(left_u16, 0xff);
  hi[0] = _mm_shufflehi_epi16(left_u16, 0x0);
  hi[1] = _mm_shufflehi_epi16(left_u16, 0x55);
  hi[2] = _mm_shufflehi_epi16(left_u16, 0xaa);
  hi[3] = _mm_shufflehi_epi16(left_u16, 0xff);
  for (int r = 0; r < 4; ++r) {
    _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(lo[r], lo[r]));
    dst += stride;
  }
  for (int r = 0; r < 4; ++r) {
    _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(hi[r], hi[r]));
    dst += stride;
  }
}
96 | | |
void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  // Two stacked 8x8 horizontal predictions, each fed by its own 8 left
  // samples.
  aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
  aom_highbd_h_predictor_8x8_sse2(dst + 8 * stride, stride, above, left + 8,
                                  bd);
}
105 | | |
static inline void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
                                       const __m128i *row) {
  // Duplicate the low 64 bits of *row across a full 16-pixel row, then
  // advance *dst to the next row.
  const __m128i val = _mm_unpacklo_epi64(*row, *row);
  uint16_t *out = *dst;
  _mm_store_si128((__m128i *)(out + 0), val);
  _mm_store_si128((__m128i *)(out + 8), val);
  *dst = out + stride;
}
113 | | |
static inline void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
                                       const __m128i *row) {
  // Duplicate the high 64 bits of *row across a full 16-pixel row, then
  // advance *dst to the next row.
  const __m128i val = _mm_unpackhi_epi64(*row, *row);
  uint16_t *out = *dst;
  _mm_store_si128((__m128i *)(out + 0), val);
  _mm_store_si128((__m128i *)(out + 8), val);
  *dst = out + stride;
}
121 | | |
static inline void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *left) {
  // Emit 8 rows of a 16-wide H prediction: row r is left[r] broadcast.
  // The first four rows use the low half of the left vector, the last four
  // the high half.
  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
  __m128i lo[4], hi[4];
  lo[0] = _mm_shufflelo_epi16(left_u16, 0x0);
  lo[1] = _mm_shufflelo_epi16(left_u16, 0x55);
  lo[2] = _mm_shufflelo_epi16(left_u16, 0xaa);
  lo[3] = _mm_shufflelo_epi16(left_u16, 0xff);
  hi[0] = _mm_shufflehi_epi16(left_u16, 0x0);
  hi[1] = _mm_shufflehi_epi16(left_u16, 0x55);
  hi[2] = _mm_shufflehi_epi16(left_u16, 0xaa);
  hi[3] = _mm_shufflehi_epi16(left_u16, 0xff);
  for (int r = 0; r < 4; ++r) h_store_16_unpacklo(&dst, stride, &lo[r]);
  for (int r = 0; r < 4; ++r) h_store_16_unpackhi(&dst, stride, &hi[r]);
}
142 | | |
void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  // H_PRED fills each row from its left neighbor; the above row and bit
  // depth are irrelevant for this mode.
  (void)above;
  (void)bd;
  h_predictor_16x8(dst, stride, left);
}
150 | | |
void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  // Two stacked 16x8 passes, each consuming the next 8 left samples.
  (void)above;
  (void)bd;
  h_predictor_16x8(dst, stride, left);
  h_predictor_16x8(dst + 8 * stride, stride, left + 8);
}
163 | | |
void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  // Four stacked 16x8 passes, each consuming the next 8 left samples.
  (void)above;
  (void)bd;
  for (int i = 0; i < 4; ++i) {
    h_predictor_16x8(dst + 8 * i * stride, stride, left + 8 * i);
  }
}
176 | | |
static inline void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
                                       const __m128i *row) {
  // Duplicate the low 64 bits of *row across a 32-pixel row, then advance
  // *dst to the next row.
  const __m128i val = _mm_unpacklo_epi64(*row, *row);
  uint16_t *out = *dst;
  _mm_store_si128((__m128i *)(out + 0), val);
  _mm_store_si128((__m128i *)(out + 8), val);
  _mm_store_si128((__m128i *)(out + 16), val);
  _mm_store_si128((__m128i *)(out + 24), val);
  *dst = out + stride;
}
186 | | |
static inline void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
                                       const __m128i *row) {
  // Duplicate the high 64 bits of *row across a 32-pixel row, then advance
  // *dst to the next row.
  const __m128i val = _mm_unpackhi_epi64(*row, *row);
  uint16_t *out = *dst;
  _mm_store_si128((__m128i *)(out + 0), val);
  _mm_store_si128((__m128i *)(out + 8), val);
  _mm_store_si128((__m128i *)(out + 16), val);
  _mm_store_si128((__m128i *)(out + 24), val);
  *dst = out + stride;
}
196 | | |
static inline void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *left) {
  // Emit 8 rows of a 32-wide H prediction: row r is left[r] broadcast.
  // The first four rows use the low half of the left vector, the last four
  // the high half.
  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
  __m128i lo[4], hi[4];
  lo[0] = _mm_shufflelo_epi16(left_u16, 0x0);
  lo[1] = _mm_shufflelo_epi16(left_u16, 0x55);
  lo[2] = _mm_shufflelo_epi16(left_u16, 0xaa);
  lo[3] = _mm_shufflelo_epi16(left_u16, 0xff);
  hi[0] = _mm_shufflehi_epi16(left_u16, 0x0);
  hi[1] = _mm_shufflehi_epi16(left_u16, 0x55);
  hi[2] = _mm_shufflehi_epi16(left_u16, 0xaa);
  hi[3] = _mm_shufflehi_epi16(left_u16, 0xff);
  for (int r = 0; r < 4; ++r) h_store_32_unpacklo(&dst, stride, &lo[r]);
  for (int r = 0; r < 4; ++r) h_store_32_unpackhi(&dst, stride, &hi[r]);
}
217 | | |
void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  // Two stacked 32x8 passes, each consuming the next 8 left samples.
  (void)above;
  (void)bd;
  h_predictor_32x8(dst, stride, left);
  h_predictor_32x8(dst + 8 * stride, stride, left + 8);
}
230 | | |
void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  // Four stacked 32x8 passes, each consuming the next 8 left samples.
  (void)above;
  (void)bd;
  for (int i = 0; i < 4; ++i) {
    h_predictor_32x8(dst + 8 * i * stride, stride, left + 8 * i);
  }
}
243 | | |
244 | | // ----------------------------------------------------------------------------- |
245 | | // DC_TOP, DC_LEFT, DC_128 |
246 | | |
247 | | // 4x4 |
248 | | |
static inline __m128i dc_sum_4(const uint16_t *ref) {
  // Horizontal sum of 4 16-bit samples; the total ends up in the low word
  // of the returned vector.
  const __m128i v = _mm_loadl_epi64((const __m128i *)ref);
  const __m128i pair = _mm_add_epi16(v, _mm_shufflelo_epi16(v, 0xe));
  return _mm_add_epi16(pair, _mm_shufflelo_epi16(pair, 0x1));
}
255 | | |
static inline void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
                                const __m128i *dc) {
  // Broadcast the DC value (low word of *dc) into 4 lanes and write four
  // 4-pixel rows.
  const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
  _mm_storel_epi64((__m128i *)dst, dc_dup);
  _mm_storel_epi64((__m128i *)(dst + stride), dc_dup);
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), dc_dup);
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), dc_dup);
}
264 | | |
265 | | void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, |
266 | | const uint16_t *above, |
267 | 94.5k | const uint16_t *left, int bd) { |
268 | 94.5k | const __m128i two = _mm_cvtsi32_si128(2); |
269 | 94.5k | const __m128i sum = dc_sum_4(left); |
270 | 94.5k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); |
271 | 94.5k | (void)above; |
272 | 94.5k | (void)bd; |
273 | 94.5k | dc_store_4x4(dst, stride, &dc); |
274 | 94.5k | } |
275 | | |
276 | | void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, |
277 | | const uint16_t *above, |
278 | 1.32M | const uint16_t *left, int bd) { |
279 | 1.32M | const __m128i two = _mm_cvtsi32_si128(2); |
280 | 1.32M | const __m128i sum = dc_sum_4(above); |
281 | 1.32M | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); |
282 | 1.32M | (void)left; |
283 | 1.32M | (void)bd; |
284 | 1.32M | dc_store_4x4(dst, stride, &dc); |
285 | 1.32M | } |
286 | | |
void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  (void)above;
  (void)left;
  // Fill with the bit-depth midpoint, 1 << (bd - 1); neighbors are unused.
  const __m128i dc_dup =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << (bd - 1)), 0x0);
  dc_store_4x4(dst, stride, &dc_dup);
}
296 | | |
297 | | // ----------------------------------------------------------------------------- |
298 | | // 4x8 |
299 | | |
static inline void dc_store_4x8(uint16_t *dst, ptrdiff_t stride,
                                const __m128i *dc) {
  // Broadcast the DC value (low word of *dc) into 4 lanes and write eight
  // 4-pixel rows.
  const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
  for (int r = 0; r < 8; ++r) {
    _mm_storel_epi64((__m128i *)(dst + r * stride), dc_dup);
  }
}
308 | | |
309 | | // Shared with DC 8xh |
static inline __m128i dc_sum_8(const uint16_t *ref) {
  // Horizontal sum of 8 16-bit samples (shared by DC 8xh paths); the total
  // ends up in the low word of the returned vector.
  const __m128i v = _mm_load_si128((const __m128i *)ref);
  const __m128i quad = _mm_add_epi16(v, _mm_srli_si128(v, 8));
  const __m128i pair = _mm_add_epi16(quad, _mm_shufflelo_epi16(quad, 0xe));
  return _mm_add_epi16(pair, _mm_shufflelo_epi16(pair, 0x1));
}
318 | | |
319 | | void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, |
320 | | const uint16_t *above, |
321 | 3.16k | const uint16_t *left, int bd) { |
322 | 3.16k | const __m128i sum = dc_sum_8(left); |
323 | 3.16k | const __m128i four = _mm_cvtsi32_si128(4); |
324 | 3.16k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); |
325 | 3.16k | (void)above; |
326 | 3.16k | (void)bd; |
327 | 3.16k | dc_store_4x8(dst, stride, &dc); |
328 | 3.16k | } |
329 | | |
330 | | void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, |
331 | | const uint16_t *above, |
332 | 16.6k | const uint16_t *left, int bd) { |
333 | 16.6k | const __m128i two = _mm_cvtsi32_si128(2); |
334 | 16.6k | const __m128i sum = dc_sum_4(above); |
335 | 16.6k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); |
336 | 16.6k | (void)left; |
337 | 16.6k | (void)bd; |
338 | 16.6k | dc_store_4x8(dst, stride, &dc); |
339 | 16.6k | } |
340 | | |
void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  (void)above;
  (void)left;
  // Fill with the bit-depth midpoint, 1 << (bd - 1); neighbors are unused.
  const __m128i dc_dup =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << (bd - 1)), 0x0);
  dc_store_4x8(dst, stride, &dc_dup);
}
350 | | |
351 | | // ----------------------------------------------------------------------------- |
352 | | // 8xh |
353 | | |
static inline void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height,
                                const __m128i *dc) {
  // Broadcast the DC value (low word of *dc) to all 8 lanes, then write
  // `height` rows of 8 pixels.
  const __m128i dc_lo = _mm_shufflelo_epi16(*dc, 0);
  const __m128i dc_dup = _mm_unpacklo_epi64(dc_lo, dc_lo);
  uint16_t *row = dst;
  for (int r = 0; r < height; ++r) {
    _mm_store_si128((__m128i *)row, dc_dup);
    row += stride;
  }
}
363 | | |
364 | | // ----------------------------------------------------------------------------- |
365 | | // DC_TOP |
366 | | |
367 | | static inline void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride, |
368 | 65.4k | int height, const uint16_t *above) { |
369 | 65.4k | const __m128i four = _mm_cvtsi32_si128(4); |
370 | 65.4k | const __m128i sum = dc_sum_8(above); |
371 | 65.4k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); |
372 | 65.4k | dc_store_8xh(dst, stride, height, &dc); |
373 | 65.4k | } |
374 | | |
void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // DC_TOP: predict from the above row only; left/bd are unused here.
  (void)left;
  (void)bd;
  dc_top_predictor_8xh(dst, stride, 4, above);
}
382 | | |
void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // DC_TOP: predict from the above row only; left/bd are unused here.
  (void)left;
  (void)bd;
  dc_top_predictor_8xh(dst, stride, 8, above);
}
390 | | |
void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  // DC_TOP: predict from the above row only; left/bd are unused here.
  (void)left;
  (void)bd;
  dc_top_predictor_8xh(dst, stride, 16, above);
}
398 | | |
399 | | // ----------------------------------------------------------------------------- |
400 | | // DC_LEFT |
401 | | |
402 | | void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, |
403 | | const uint16_t *above, |
404 | 6.39k | const uint16_t *left, int bd) { |
405 | 6.39k | const __m128i two = _mm_cvtsi32_si128(2); |
406 | 6.39k | const __m128i sum = dc_sum_4(left); |
407 | 6.39k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); |
408 | 6.39k | (void)above; |
409 | 6.39k | (void)bd; |
410 | 6.39k | dc_store_8xh(dst, stride, 4, &dc); |
411 | 6.39k | } |
412 | | |
413 | | void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, |
414 | | const uint16_t *above, |
415 | 43.5k | const uint16_t *left, int bd) { |
416 | 43.5k | const __m128i four = _mm_cvtsi32_si128(4); |
417 | 43.5k | const __m128i sum = dc_sum_8(left); |
418 | 43.5k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); |
419 | 43.5k | (void)above; |
420 | 43.5k | (void)bd; |
421 | 43.5k | dc_store_8xh(dst, stride, 8, &dc); |
422 | 43.5k | } |
423 | | |
424 | | // Shared with DC 16xh |
425 | 2.45M | static inline __m128i dc_sum_16(const uint16_t *ref) { |
426 | 2.45M | const __m128i sum_lo = dc_sum_8(ref); |
427 | 2.45M | const __m128i sum_hi = dc_sum_8(ref + 8); |
428 | 2.45M | return _mm_add_epi16(sum_lo, sum_hi); |
429 | 2.45M | } |
430 | | |
431 | | void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, |
432 | | const uint16_t *above, |
433 | 5.58k | const uint16_t *left, int bd) { |
434 | 5.58k | const __m128i eight = _mm_cvtsi32_si128(8); |
435 | 5.58k | const __m128i sum = dc_sum_16(left); |
436 | 5.58k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
437 | 5.58k | (void)above; |
438 | 5.58k | (void)bd; |
439 | 5.58k | dc_store_8xh(dst, stride, 16, &dc); |
440 | 5.58k | } |
441 | | |
442 | | // ----------------------------------------------------------------------------- |
static inline void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
                                        int height, int bd) {
  // Fill an 8-wide block of `height` rows with the bit-depth midpoint,
  // 1 << (bd - 1).
  const __m128i dc_dup =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << (bd - 1)), 0x0);
  dc_store_8xh(dst, stride, height, &dc_dup);
}
451 | | |
void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // DC_128: fill with the bit-depth midpoint; neighbor arrays are unused.
  (void)above;
  (void)left;
  dc_128_predictor_8xh(dst, stride, 4, bd);
}
459 | | |
void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // DC_128: fill with the bit-depth midpoint; neighbor arrays are unused.
  (void)above;
  (void)left;
  dc_128_predictor_8xh(dst, stride, 8, bd);
}
467 | | |
void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  // DC_128: fill with the bit-depth midpoint; neighbor arrays are unused.
  (void)above;
  (void)left;
  dc_128_predictor_8xh(dst, stride, 16, bd);
}
475 | | |
476 | | // ----------------------------------------------------------------------------- |
477 | | // 16xh |
478 | | |
static inline void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height,
                                 const __m128i *dc) {
  // Broadcast the DC value (low word of *dc) to all 8 lanes, then write
  // `height` rows of 16 pixels (two 8-lane stores per row).
  const __m128i dc_lo = _mm_shufflelo_epi16(*dc, 0);
  const __m128i dc_dup = _mm_unpacklo_epi64(dc_lo, dc_lo);
  uint16_t *row = dst;
  for (int r = 0; r < height; ++r) {
    _mm_store_si128((__m128i *)(row + 0), dc_dup);
    _mm_store_si128((__m128i *)(row + 8), dc_dup);
    row += stride;
  }
}
489 | | |
490 | | // ----------------------------------------------------------------------------- |
491 | | // DC_LEFT |
492 | | |
493 | | void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, |
494 | | const uint16_t *above, |
495 | 13.3k | const uint16_t *left, int bd) { |
496 | 13.3k | const __m128i four = _mm_cvtsi32_si128(4); |
497 | 13.3k | const __m128i sum = dc_sum_8(left); |
498 | 13.3k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); |
499 | 13.3k | (void)above; |
500 | 13.3k | (void)bd; |
501 | 13.3k | dc_store_16xh(dst, stride, 8, &dc); |
502 | 13.3k | } |
503 | | |
504 | | void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, |
505 | | const uint16_t *above, |
506 | 43.5k | const uint16_t *left, int bd) { |
507 | 43.5k | const __m128i eight = _mm_cvtsi32_si128(8); |
508 | 43.5k | const __m128i sum = dc_sum_16(left); |
509 | 43.5k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
510 | 43.5k | (void)above; |
511 | 43.5k | (void)bd; |
512 | 43.5k | dc_store_16xh(dst, stride, 16, &dc); |
513 | 43.5k | } |
514 | | |
515 | | // Shared with 32xh |
516 | 565k | static inline __m128i dc_sum_32(const uint16_t *ref) { |
517 | 565k | const __m128i zero = _mm_setzero_si128(); |
518 | 565k | const __m128i sum_a = dc_sum_16(ref); |
519 | 565k | const __m128i sum_b = dc_sum_16(ref + 16); |
520 | | // 12 bit bd will outrange, so expand to 32 bit before adding final total |
521 | 565k | return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero), |
522 | 565k | _mm_unpacklo_epi16(sum_b, zero)); |
523 | 565k | } |
524 | | |
525 | | void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, |
526 | | const uint16_t *above, |
527 | 8.01k | const uint16_t *left, int bd) { |
528 | 8.01k | const __m128i sixteen = _mm_cvtsi32_si128(16); |
529 | 8.01k | const __m128i sum = dc_sum_32(left); |
530 | 8.01k | const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); |
531 | 8.01k | (void)above; |
532 | 8.01k | (void)bd; |
533 | 8.01k | dc_store_16xh(dst, stride, 32, &dc); |
534 | 8.01k | } |
535 | | |
536 | | // ----------------------------------------------------------------------------- |
537 | | // DC_TOP |
538 | | |
539 | | void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, |
540 | | const uint16_t *above, |
541 | 8.25k | const uint16_t *left, int bd) { |
542 | 8.25k | const __m128i eight = _mm_cvtsi32_si128(8); |
543 | 8.25k | const __m128i sum = dc_sum_16(above); |
544 | 8.25k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
545 | 8.25k | (void)left; |
546 | 8.25k | (void)bd; |
547 | 8.25k | dc_store_16xh(dst, stride, 8, &dc); |
548 | 8.25k | } |
549 | | |
550 | | void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, |
551 | | const uint16_t *above, |
552 | 14.3k | const uint16_t *left, int bd) { |
553 | 14.3k | const __m128i eight = _mm_cvtsi32_si128(8); |
554 | 14.3k | const __m128i sum = dc_sum_16(above); |
555 | 14.3k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
556 | 14.3k | (void)left; |
557 | 14.3k | (void)bd; |
558 | 14.3k | dc_store_16xh(dst, stride, 16, &dc); |
559 | 14.3k | } |
560 | | |
561 | | void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, |
562 | | const uint16_t *above, |
563 | 12.9k | const uint16_t *left, int bd) { |
564 | 12.9k | const __m128i eight = _mm_cvtsi32_si128(8); |
565 | 12.9k | const __m128i sum = dc_sum_16(above); |
566 | 12.9k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
567 | 12.9k | (void)left; |
568 | 12.9k | (void)bd; |
569 | 12.9k | dc_store_16xh(dst, stride, 32, &dc); |
570 | 12.9k | } |
571 | | |
572 | | // ----------------------------------------------------------------------------- |
573 | | // DC_128 |
574 | | |
void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  (void)above;
  (void)left;
  // Fill with the bit-depth midpoint, 1 << (bd - 1); neighbors are unused.
  const __m128i dc_dup =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << (bd - 1)), 0x0);
  dc_store_16xh(dst, stride, 8, &dc_dup);
}
584 | | |
void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  (void)above;
  (void)left;
  // Fill with the bit-depth midpoint, 1 << (bd - 1); neighbors are unused.
  const __m128i dc_dup =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << (bd - 1)), 0x0);
  dc_store_16xh(dst, stride, 16, &dc_dup);
}
594 | | |
void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  (void)above;
  (void)left;
  // Fill with the bit-depth midpoint, 1 << (bd - 1); neighbors are unused.
  const __m128i dc_dup =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << (bd - 1)), 0x0);
  dc_store_16xh(dst, stride, 32, &dc_dup);
}
604 | | |
605 | | // ----------------------------------------------------------------------------- |
606 | | // 32xh |
607 | | |
static inline void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height,
                                 const __m128i *dc) {
  // Broadcast the DC value (low word of *dc) to all 8 lanes, then write
  // `height` rows of 32 pixels (four 8-lane stores per row).
  const __m128i dc_lo = _mm_shufflelo_epi16(*dc, 0);
  const __m128i dc_dup = _mm_unpacklo_epi64(dc_lo, dc_lo);
  uint16_t *row = dst;
  for (int r = 0; r < height; ++r) {
    _mm_store_si128((__m128i *)(row + 0), dc_dup);
    _mm_store_si128((__m128i *)(row + 8), dc_dup);
    _mm_store_si128((__m128i *)(row + 16), dc_dup);
    _mm_store_si128((__m128i *)(row + 24), dc_dup);
    row += stride;
  }
}
620 | | |
621 | | void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, |
622 | | const uint16_t *above, |
623 | 10.4k | const uint16_t *left, int bd) { |
624 | 10.4k | const __m128i eight = _mm_cvtsi32_si128(8); |
625 | 10.4k | const __m128i sum = dc_sum_16(left); |
626 | 10.4k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
627 | 10.4k | (void)above; |
628 | 10.4k | (void)bd; |
629 | 10.4k | dc_store_32xh(dst, stride, 16, &dc); |
630 | 10.4k | } |
631 | | |
632 | | void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, |
633 | | const uint16_t *above, |
634 | 113k | const uint16_t *left, int bd) { |
635 | 113k | const __m128i sixteen = _mm_cvtsi32_si128(16); |
636 | 113k | const __m128i sum = dc_sum_32(left); |
637 | 113k | const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); |
638 | 113k | (void)above; |
639 | 113k | (void)bd; |
640 | 113k | dc_store_32xh(dst, stride, 32, &dc); |
641 | 113k | } |
642 | | |
643 | | void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, |
644 | | const uint16_t *above, |
645 | 16.8k | const uint16_t *left, int bd) { |
646 | 16.8k | const __m128i sixteen = _mm_cvtsi32_si128(16); |
647 | 16.8k | const __m128i sum = dc_sum_32(above); |
648 | 16.8k | const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); |
649 | 16.8k | (void)left; |
650 | 16.8k | (void)bd; |
651 | 16.8k | dc_store_32xh(dst, stride, 16, &dc); |
652 | 16.8k | } |
653 | | |
void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  (void)above;
  (void)left;
  // Fill with the bit-depth midpoint, 1 << (bd - 1); neighbors are unused.
  const __m128i dc_dup =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << (bd - 1)), 0x0);
  dc_store_32xh(dst, stride, 16, &dc_dup);
}
663 | | |
664 | | void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, |
665 | | const uint16_t *above, |
666 | 67.7k | const uint16_t *left, int bd) { |
667 | 67.7k | const __m128i sixteen = _mm_cvtsi32_si128(16); |
668 | 67.7k | const __m128i sum = dc_sum_32(above); |
669 | 67.7k | const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); |
670 | 67.7k | (void)left; |
671 | 67.7k | (void)bd; |
672 | 67.7k | dc_store_32xh(dst, stride, 32, &dc); |
673 | 67.7k | } |
674 | | |
void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  (void)above;
  (void)left;
  // Fill with the bit-depth midpoint, 1 << (bd - 1); neighbors are unused.
  const __m128i dc_dup =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << (bd - 1)), 0x0);
  dc_store_32xh(dst, stride, 32, &dc_dup);
}
684 | | |
685 | | // ----------------------------------------------------------------------------- |
686 | | // V_PRED |
687 | | |
void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // V_PRED: copy the 4 above samples into every one of the 8 rows.
  (void)left;
  (void)bd;
  const __m128i above_u16 = _mm_loadl_epi64((const __m128i *)above);
  for (int r = 0; r < 8; ++r) {
    _mm_storel_epi64((__m128i *)(dst + r * stride), above_u16);
  }
}
703 | | |
void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // V_PRED: copy the 8 above samples into every one of the 4 rows.
  (void)left;
  (void)bd;
  const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
  for (int r = 0; r < 4; ++r) {
    _mm_store_si128((__m128i *)(dst + r * stride), above_u16);
  }
}
715 | | |
716 | | void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, |
717 | | const uint16_t *above, |
718 | 27.5k | const uint16_t *left, int bd) { |
719 | 27.5k | (void)left; |
720 | 27.5k | (void)bd; |
721 | 27.5k | const __m128i above_u16 = _mm_load_si128((const __m128i *)above); |
722 | 27.5k | int i; |
723 | 137k | for (i = 0; i < 4; ++i) { |
724 | 110k | _mm_store_si128((__m128i *)dst, above_u16); |
725 | 110k | _mm_store_si128((__m128i *)(dst + stride), above_u16); |
726 | 110k | _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16); |
727 | 110k | _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16); |
728 | 110k | dst += stride << 2; |
729 | 110k | } |
730 | 27.5k | } |
731 | | |
732 | | void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, |
733 | | const uint16_t *above, |
734 | 65.4k | const uint16_t *left, int bd) { |
735 | 65.4k | (void)left; |
736 | 65.4k | (void)bd; |
737 | 65.4k | const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); |
738 | 65.4k | const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); |
739 | 65.4k | int i; |
740 | 196k | for (i = 0; i < 2; ++i) { |
741 | 130k | _mm_store_si128((__m128i *)dst, above0_u16); |
742 | 130k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
743 | 130k | dst += stride; |
744 | 130k | _mm_store_si128((__m128i *)dst, above0_u16); |
745 | 130k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
746 | 130k | dst += stride; |
747 | 130k | _mm_store_si128((__m128i *)dst, above0_u16); |
748 | 130k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
749 | 130k | dst += stride; |
750 | 130k | _mm_store_si128((__m128i *)dst, above0_u16); |
751 | 130k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
752 | 130k | dst += stride; |
753 | 130k | } |
754 | 65.4k | } |
755 | | |
756 | | void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, |
757 | | const uint16_t *above, |
758 | 12.6k | const uint16_t *left, int bd) { |
759 | 12.6k | (void)left; |
760 | 12.6k | (void)bd; |
761 | 12.6k | const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); |
762 | 12.6k | const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); |
763 | 12.6k | int i; |
764 | 113k | for (i = 0; i < 8; ++i) { |
765 | 100k | _mm_store_si128((__m128i *)dst, above0_u16); |
766 | 100k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
767 | 100k | dst += stride; |
768 | 100k | _mm_store_si128((__m128i *)dst, above0_u16); |
769 | 100k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
770 | 100k | dst += stride; |
771 | 100k | _mm_store_si128((__m128i *)dst, above0_u16); |
772 | 100k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
773 | 100k | dst += stride; |
774 | 100k | _mm_store_si128((__m128i *)dst, above0_u16); |
775 | 100k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
776 | 100k | dst += stride; |
777 | 100k | } |
778 | 12.6k | } |
779 | | |
780 | | void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, |
781 | | const uint16_t *above, |
782 | 10.7k | const uint16_t *left, int bd) { |
783 | 10.7k | (void)left; |
784 | 10.7k | (void)bd; |
785 | 10.7k | const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); |
786 | 10.7k | const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); |
787 | 10.7k | const __m128i above2_u16 = _mm_load_si128((const __m128i *)(above + 16)); |
788 | 10.7k | const __m128i above3_u16 = _mm_load_si128((const __m128i *)(above + 24)); |
789 | 10.7k | int i; |
790 | 53.7k | for (i = 0; i < 4; ++i) { |
791 | 42.9k | _mm_store_si128((__m128i *)dst, above0_u16); |
792 | 42.9k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
793 | 42.9k | _mm_store_si128((__m128i *)(dst + 16), above2_u16); |
794 | 42.9k | _mm_store_si128((__m128i *)(dst + 24), above3_u16); |
795 | 42.9k | dst += stride; |
796 | 42.9k | _mm_store_si128((__m128i *)dst, above0_u16); |
797 | 42.9k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
798 | 42.9k | _mm_store_si128((__m128i *)(dst + 16), above2_u16); |
799 | 42.9k | _mm_store_si128((__m128i *)(dst + 24), above3_u16); |
800 | 42.9k | dst += stride; |
801 | 42.9k | _mm_store_si128((__m128i *)dst, above0_u16); |
802 | 42.9k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
803 | 42.9k | _mm_store_si128((__m128i *)(dst + 16), above2_u16); |
804 | 42.9k | _mm_store_si128((__m128i *)(dst + 24), above3_u16); |
805 | 42.9k | dst += stride; |
806 | 42.9k | _mm_store_si128((__m128i *)dst, above0_u16); |
807 | 42.9k | _mm_store_si128((__m128i *)(dst + 8), above1_u16); |
808 | 42.9k | _mm_store_si128((__m128i *)(dst + 16), above2_u16); |
809 | 42.9k | _mm_store_si128((__m128i *)(dst + 24), above3_u16); |
810 | 42.9k | dst += stride; |
811 | 42.9k | } |
812 | 10.7k | } |
813 | | |
814 | | // ----------------------------------------------------------------------------- |
815 | | // DC_PRED |
816 | | |
817 | | void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, |
818 | | const uint16_t *above, |
819 | 316k | const uint16_t *left, int bd) { |
820 | 316k | (void)bd; |
821 | 316k | const __m128i sum_above = dc_sum_4(above); |
822 | 316k | const __m128i sum_left = dc_sum_8(left); |
823 | 316k | const __m128i sum = _mm_add_epi16(sum_above, sum_left); |
824 | 316k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
825 | 316k | sum32 >>= 16; |
826 | 316k | sum32 += 6; |
827 | 316k | sum32 /= 12; |
828 | 316k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
829 | 316k | int i; |
830 | 1.58M | for (i = 0; i < 4; ++i) { |
831 | 1.26M | _mm_storel_epi64((__m128i *)dst, row); |
832 | 1.26M | dst += stride; |
833 | 1.26M | _mm_storel_epi64((__m128i *)dst, row); |
834 | 1.26M | dst += stride; |
835 | 1.26M | } |
836 | 316k | } |
837 | | |
838 | | void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, |
839 | | const uint16_t *above, |
840 | 598k | const uint16_t *left, int bd) { |
841 | 598k | (void)bd; |
842 | 598k | const __m128i sum_left = dc_sum_4(left); |
843 | 598k | const __m128i sum_above = dc_sum_8(above); |
844 | 598k | const __m128i sum = _mm_add_epi16(sum_above, sum_left); |
845 | 598k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
846 | 598k | sum32 >>= 16; |
847 | 598k | sum32 += 6; |
848 | 598k | sum32 /= 12; |
849 | 598k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
850 | | |
851 | 598k | _mm_store_si128((__m128i *)dst, row); |
852 | 598k | dst += stride; |
853 | 598k | _mm_store_si128((__m128i *)dst, row); |
854 | 598k | dst += stride; |
855 | 598k | _mm_store_si128((__m128i *)dst, row); |
856 | 598k | dst += stride; |
857 | 598k | _mm_store_si128((__m128i *)dst, row); |
858 | 598k | } |
859 | | |
860 | | void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, |
861 | | const uint16_t *above, |
862 | 324k | const uint16_t *left, int bd) { |
863 | 324k | (void)bd; |
864 | 324k | __m128i sum_left = dc_sum_16(left); |
865 | 324k | __m128i sum_above = dc_sum_8(above); |
866 | 324k | const __m128i zero = _mm_setzero_si128(); |
867 | 324k | sum_left = _mm_unpacklo_epi16(sum_left, zero); |
868 | 324k | sum_above = _mm_unpacklo_epi16(sum_above, zero); |
869 | 324k | const __m128i sum = _mm_add_epi32(sum_left, sum_above); |
870 | 324k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
871 | 324k | sum32 += 12; |
872 | 324k | sum32 /= 24; |
873 | 324k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
874 | 324k | int i; |
875 | 1.62M | for (i = 0; i < 4; ++i) { |
876 | 1.29M | _mm_store_si128((__m128i *)dst, row); |
877 | 1.29M | dst += stride; |
878 | 1.29M | _mm_store_si128((__m128i *)dst, row); |
879 | 1.29M | dst += stride; |
880 | 1.29M | _mm_store_si128((__m128i *)dst, row); |
881 | 1.29M | dst += stride; |
882 | 1.29M | _mm_store_si128((__m128i *)dst, row); |
883 | 1.29M | dst += stride; |
884 | 1.29M | } |
885 | 324k | } |
886 | | |
887 | | void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, |
888 | | const uint16_t *above, |
889 | 548k | const uint16_t *left, int bd) { |
890 | 548k | (void)bd; |
891 | 548k | __m128i sum_left = dc_sum_8(left); |
892 | 548k | __m128i sum_above = dc_sum_16(above); |
893 | 548k | const __m128i zero = _mm_setzero_si128(); |
894 | 548k | sum_left = _mm_unpacklo_epi16(sum_left, zero); |
895 | 548k | sum_above = _mm_unpacklo_epi16(sum_above, zero); |
896 | 548k | const __m128i sum = _mm_add_epi32(sum_left, sum_above); |
897 | 548k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
898 | 548k | sum32 += 12; |
899 | 548k | sum32 /= 24; |
900 | 548k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
901 | 548k | int i; |
902 | 1.64M | for (i = 0; i < 2; ++i) { |
903 | 1.09M | _mm_store_si128((__m128i *)dst, row); |
904 | 1.09M | _mm_store_si128((__m128i *)(dst + 8), row); |
905 | 1.09M | dst += stride; |
906 | 1.09M | _mm_store_si128((__m128i *)dst, row); |
907 | 1.09M | _mm_store_si128((__m128i *)(dst + 8), row); |
908 | 1.09M | dst += stride; |
909 | 1.09M | _mm_store_si128((__m128i *)dst, row); |
910 | 1.09M | _mm_store_si128((__m128i *)(dst + 8), row); |
911 | 1.09M | dst += stride; |
912 | 1.09M | _mm_store_si128((__m128i *)dst, row); |
913 | 1.09M | _mm_store_si128((__m128i *)(dst + 8), row); |
914 | 1.09M | dst += stride; |
915 | 1.09M | } |
916 | 548k | } |
917 | | |
918 | | void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, |
919 | | const uint16_t *above, |
920 | 176k | const uint16_t *left, int bd) { |
921 | 176k | (void)bd; |
922 | 176k | __m128i sum_left = dc_sum_32(left); |
923 | 176k | __m128i sum_above = dc_sum_16(above); |
924 | 176k | const __m128i zero = _mm_setzero_si128(); |
925 | 176k | sum_above = _mm_unpacklo_epi16(sum_above, zero); |
926 | 176k | const __m128i sum = _mm_add_epi32(sum_left, sum_above); |
927 | 176k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
928 | 176k | sum32 += 24; |
929 | 176k | sum32 /= 48; |
930 | 176k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
931 | 176k | int i; |
932 | 1.58M | for (i = 0; i < 8; ++i) { |
933 | 1.40M | _mm_store_si128((__m128i *)dst, row); |
934 | 1.40M | _mm_store_si128((__m128i *)(dst + 8), row); |
935 | 1.40M | dst += stride; |
936 | 1.40M | _mm_store_si128((__m128i *)dst, row); |
937 | 1.40M | _mm_store_si128((__m128i *)(dst + 8), row); |
938 | 1.40M | dst += stride; |
939 | 1.40M | _mm_store_si128((__m128i *)dst, row); |
940 | 1.40M | _mm_store_si128((__m128i *)(dst + 8), row); |
941 | 1.40M | dst += stride; |
942 | 1.40M | _mm_store_si128((__m128i *)dst, row); |
943 | 1.40M | _mm_store_si128((__m128i *)(dst + 8), row); |
944 | 1.40M | dst += stride; |
945 | 1.40M | } |
946 | 176k | } |
947 | | |
948 | | void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, |
949 | | const uint16_t *above, |
950 | 183k | const uint16_t *left, int bd) { |
951 | 183k | (void)bd; |
952 | 183k | __m128i sum_left = dc_sum_16(left); |
953 | 183k | __m128i sum_above = dc_sum_32(above); |
954 | 183k | const __m128i zero = _mm_setzero_si128(); |
955 | 183k | sum_left = _mm_unpacklo_epi16(sum_left, zero); |
956 | 183k | const __m128i sum = _mm_add_epi32(sum_left, sum_above); |
957 | 183k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
958 | 183k | sum32 += 24; |
959 | 183k | sum32 /= 48; |
960 | 183k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
961 | 183k | int i; |
962 | 915k | for (i = 0; i < 4; ++i) { |
963 | 732k | _mm_store_si128((__m128i *)dst, row); |
964 | 732k | _mm_store_si128((__m128i *)(dst + 8), row); |
965 | 732k | _mm_store_si128((__m128i *)(dst + 16), row); |
966 | 732k | _mm_store_si128((__m128i *)(dst + 24), row); |
967 | 732k | dst += stride; |
968 | 732k | _mm_store_si128((__m128i *)dst, row); |
969 | 732k | _mm_store_si128((__m128i *)(dst + 8), row); |
970 | 732k | _mm_store_si128((__m128i *)(dst + 16), row); |
971 | 732k | _mm_store_si128((__m128i *)(dst + 24), row); |
972 | 732k | dst += stride; |
973 | 732k | _mm_store_si128((__m128i *)dst, row); |
974 | 732k | _mm_store_si128((__m128i *)(dst + 8), row); |
975 | 732k | _mm_store_si128((__m128i *)(dst + 16), row); |
976 | 732k | _mm_store_si128((__m128i *)(dst + 24), row); |
977 | 732k | dst += stride; |
978 | 732k | _mm_store_si128((__m128i *)dst, row); |
979 | 732k | _mm_store_si128((__m128i *)(dst + 8), row); |
980 | 732k | _mm_store_si128((__m128i *)(dst + 16), row); |
981 | 732k | _mm_store_si128((__m128i *)(dst + 24), row); |
982 | 732k | dst += stride; |
983 | 732k | } |
984 | 183k | } |