/src/aom/aom_dsp/x86/highbd_intrapred_sse2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2017, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include <emmintrin.h> |
13 | | |
14 | | #include "config/aom_dsp_rtcd.h" |
15 | | |
16 | | // ----------------------------------------------------------------------------- |
17 | | // H_PRED |
18 | | |
void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // H prediction: output row r is left[r] replicated across all 4 columns.
  // The above row and bit depth are unused for this mode.
  (void)above;
  (void)bd;
  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
  _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(l, 0x00));
  _mm_storel_epi64((__m128i *)(dst + stride), _mm_shufflelo_epi16(l, 0x55));
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), _mm_shufflelo_epi16(l, 0xaa));
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_shufflelo_epi16(l, 0xff));
}
37 | | |
void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // Build the 4x8 output as two stacked 4x4 H predictions, the second one
  // consuming the next four left pixels.
  aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
  aom_highbd_h_predictor_4x4_sse2(dst + 4 * stride, stride, above, left + 4,
                                  bd);
}
46 | | |
void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  // 16-byte aligned load; only left[0..3] are consumed (the shuffles read
  // the low half). NOTE(review): assumes the left buffer is 16-byte aligned
  // and readable for 8 entries, as the rest of this file does.
  const __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i rows[4];
  rows[0] = _mm_shufflelo_epi16(l, 0x00);
  rows[1] = _mm_shufflelo_epi16(l, 0x55);
  rows[2] = _mm_shufflelo_epi16(l, 0xaa);
  rows[3] = _mm_shufflelo_epi16(l, 0xff);
  for (int r = 0; r < 4; ++r, dst += stride) {
    // Duplicate the broadcast low half across all 8 output pixels.
    _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(rows[r], rows[r]));
  }
}
65 | | |
void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  const __m128i l = _mm_load_si128((const __m128i *)left);
  // Rows 0-3 replicate left[0..3] (broadcast within the low half), rows 4-7
  // replicate left[4..7] (broadcast within the high half).
  const __m128i lo[4] = { _mm_shufflelo_epi16(l, 0x00),
                          _mm_shufflelo_epi16(l, 0x55),
                          _mm_shufflelo_epi16(l, 0xaa),
                          _mm_shufflelo_epi16(l, 0xff) };
  const __m128i hi[4] = { _mm_shufflehi_epi16(l, 0x00),
                          _mm_shufflehi_epi16(l, 0x55),
                          _mm_shufflehi_epi16(l, 0xaa),
                          _mm_shufflehi_epi16(l, 0xff) };
  for (int i = 0; i < 4; ++i, dst += stride) {
    _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(lo[i], lo[i]));
  }
  for (int i = 0; i < 4; ++i, dst += stride) {
    _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(hi[i], hi[i]));
  }
}
96 | | |
void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  // Two stacked 8x8 H predictions cover the 16 rows.
  aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
  aom_highbd_h_predictor_8x8_sse2(dst + 8 * stride, stride, above, left + 8,
                                  bd);
}
105 | | |
static inline void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
                                       const __m128i *row) {
  // Duplicate the low 64 bits of *row across the register, write one
  // 16-pixel row, then advance *dst to the next row.
  uint16_t *d = *dst;
  const __m128i fill = _mm_unpacklo_epi64(*row, *row);
  _mm_store_si128((__m128i *)(d + 0), fill);
  _mm_store_si128((__m128i *)(d + 8), fill);
  *dst = d + stride;
}
113 | | |
static inline void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
                                       const __m128i *row) {
  // Duplicate the high 64 bits of *row across the register, write one
  // 16-pixel row, then advance *dst to the next row.
  uint16_t *d = *dst;
  const __m128i fill = _mm_unpackhi_epi64(*row, *row);
  _mm_store_si128((__m128i *)(d + 0), fill);
  _mm_store_si128((__m128i *)(d + 8), fill);
  *dst = d + stride;
}
121 | | |
static inline void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *left) {
  // Broadcast each of 8 left pixels into a full 16-pixel row: the first four
  // rows come from the low half of the load, the last four from the high
  // half.
  const __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i lo[4] = { _mm_shufflelo_epi16(l, 0x00),
                          _mm_shufflelo_epi16(l, 0x55),
                          _mm_shufflelo_epi16(l, 0xaa),
                          _mm_shufflelo_epi16(l, 0xff) };
  const __m128i hi[4] = { _mm_shufflehi_epi16(l, 0x00),
                          _mm_shufflehi_epi16(l, 0x55),
                          _mm_shufflehi_epi16(l, 0xaa),
                          _mm_shufflehi_epi16(l, 0xff) };
  for (int i = 0; i < 4; ++i) h_store_16_unpacklo(&dst, stride, &lo[i]);
  for (int i = 0; i < 4; ++i) h_store_16_unpackhi(&dst, stride, &hi[i]);
}
142 | | |
void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  // H prediction ignores the above row and the bit depth.
  (void)bd;
  (void)above;
  h_predictor_16x8(dst, stride, left);
}
150 | | |
void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  // Two 16x8 passes, each consuming 8 left pixels.
  h_predictor_16x8(dst, stride, left);
  h_predictor_16x8(dst + 8 * stride, stride, left + 8);
}
163 | | |
void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  // Four 16x8 passes cover the 32 rows, 8 left pixels per pass.
  for (int j = 0; j < 4; ++j) {
    h_predictor_16x8(dst + 8 * j * stride, stride, left + 8 * j);
  }
}
176 | | |
static inline void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
                                       const __m128i *row) {
  // Duplicate the low 64 bits of *row, write one 32-pixel row, then advance
  // *dst to the next row.
  uint16_t *d = *dst;
  const __m128i fill = _mm_unpacklo_epi64(*row, *row);
  _mm_store_si128((__m128i *)(d + 0), fill);
  _mm_store_si128((__m128i *)(d + 8), fill);
  _mm_store_si128((__m128i *)(d + 16), fill);
  _mm_store_si128((__m128i *)(d + 24), fill);
  *dst = d + stride;
}
186 | | |
static inline void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
                                       const __m128i *row) {
  // Duplicate the high 64 bits of *row, write one 32-pixel row, then advance
  // *dst to the next row.
  uint16_t *d = *dst;
  const __m128i fill = _mm_unpackhi_epi64(*row, *row);
  _mm_store_si128((__m128i *)(d + 0), fill);
  _mm_store_si128((__m128i *)(d + 8), fill);
  _mm_store_si128((__m128i *)(d + 16), fill);
  _mm_store_si128((__m128i *)(d + 24), fill);
  *dst = d + stride;
}
196 | | |
static inline void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *left) {
  // Broadcast each of 8 left pixels into a full 32-pixel row: rows 0-3 from
  // the low half of the load, rows 4-7 from the high half.
  const __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i lo[4] = { _mm_shufflelo_epi16(l, 0x00),
                          _mm_shufflelo_epi16(l, 0x55),
                          _mm_shufflelo_epi16(l, 0xaa),
                          _mm_shufflelo_epi16(l, 0xff) };
  const __m128i hi[4] = { _mm_shufflehi_epi16(l, 0x00),
                          _mm_shufflehi_epi16(l, 0x55),
                          _mm_shufflehi_epi16(l, 0xaa),
                          _mm_shufflehi_epi16(l, 0xff) };
  for (int i = 0; i < 4; ++i) h_store_32_unpacklo(&dst, stride, &lo[i]);
  for (int i = 0; i < 4; ++i) h_store_32_unpackhi(&dst, stride, &hi[i]);
}
217 | | |
void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  // Two 32x8 passes, each consuming 8 left pixels.
  h_predictor_32x8(dst, stride, left);
  h_predictor_32x8(dst + 8 * stride, stride, left + 8);
}
230 | | |
void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  // Four 32x8 passes cover the 32 rows, 8 left pixels per pass.
  for (int j = 0; j < 4; ++j) {
    h_predictor_32x8(dst + 8 * j * stride, stride, left + 8 * j);
  }
}
243 | | |
244 | | // ----------------------------------------------------------------------------- |
245 | | // DC_TOP, DC_LEFT, DC_128 |
246 | | |
247 | | // 4x4 |
248 | | |
static inline __m128i dc_sum_4(const uint16_t *ref) {
  // Horizontal 16-bit sum of ref[0..3]; the total ends up in lane 0 (other
  // lanes hold don't-care partial sums).
  const __m128i v = _mm_loadl_epi64((const __m128i *)ref);
  // Pair-fold: lanes {0,1} += lanes {2,3}.
  const __m128i fold2 = _mm_add_epi16(v, _mm_shufflelo_epi16(v, 0x0e));
  // Final fold: lane 0 += lane 1.
  return _mm_add_epi16(fold2, _mm_shufflelo_epi16(fold2, 0x01));
}
255 | | |
static inline void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
                                const __m128i *dc) {
  // Broadcast lane 0 of *dc into the low four lanes and write four rows of
  // four pixels.
  const __m128i fill = _mm_shufflelo_epi16(*dc, 0x0);
  _mm_storel_epi64((__m128i *)dst, fill);
  _mm_storel_epi64((__m128i *)(dst + stride), fill);
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), fill);
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), fill);
}
264 | | |
265 | | void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, |
266 | | const uint16_t *above, |
267 | 82.4k | const uint16_t *left, int bd) { |
268 | 82.4k | const __m128i two = _mm_cvtsi32_si128(2); |
269 | 82.4k | const __m128i sum = dc_sum_4(left); |
270 | 82.4k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); |
271 | 82.4k | (void)above; |
272 | 82.4k | (void)bd; |
273 | 82.4k | dc_store_4x4(dst, stride, &dc); |
274 | 82.4k | } |
275 | | |
276 | | void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, |
277 | | const uint16_t *above, |
278 | 1.10M | const uint16_t *left, int bd) { |
279 | 1.10M | const __m128i two = _mm_cvtsi32_si128(2); |
280 | 1.10M | const __m128i sum = dc_sum_4(above); |
281 | 1.10M | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); |
282 | 1.10M | (void)left; |
283 | 1.10M | (void)bd; |
284 | 1.10M | dc_store_4x4(dst, stride, &dc); |
285 | 1.10M | } |
286 | | |
void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  (void)left;
  (void)above;
  // Mid-grey for the given bit depth (e.g. 128 for 8-bit), broadcast into
  // the low lanes.
  const __m128i dc_dup =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << (bd - 1)), 0x0);
  dc_store_4x4(dst, stride, &dc_dup);
}
296 | | |
297 | | // ----------------------------------------------------------------------------- |
298 | | // 4x8 |
299 | | |
static inline void dc_store_4x8(uint16_t *dst, ptrdiff_t stride,
                                const __m128i *dc) {
  // Broadcast lane 0 of *dc into the low four lanes and write eight rows of
  // four pixels.
  const __m128i fill = _mm_shufflelo_epi16(*dc, 0x0);
  for (int r = 0; r < 8; ++r) {
    _mm_storel_epi64((__m128i *)(dst + r * stride), fill);
  }
}
308 | | |
309 | | // Shared with DC 8xh |
// Shared with DC 8xh
static inline __m128i dc_sum_8(const uint16_t *ref) {
  // Horizontal 16-bit sum of ref[0..7]; the total ends up in lane 0.
  // ref must be 16-byte aligned (aligned load).
  const __m128i v = _mm_load_si128((const __m128i *)ref);
  // Fold high quad into low quad, then the remaining two pair folds.
  const __m128i fold4 = _mm_add_epi16(v, _mm_srli_si128(v, 8));
  const __m128i fold2 = _mm_add_epi16(fold4, _mm_shufflelo_epi16(fold4, 0x0e));
  return _mm_add_epi16(fold2, _mm_shufflelo_epi16(fold2, 0x01));
}
318 | | |
319 | | void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, |
320 | | const uint16_t *above, |
321 | 3.56k | const uint16_t *left, int bd) { |
322 | 3.56k | const __m128i sum = dc_sum_8(left); |
323 | 3.56k | const __m128i four = _mm_cvtsi32_si128(4); |
324 | 3.56k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); |
325 | 3.56k | (void)above; |
326 | 3.56k | (void)bd; |
327 | 3.56k | dc_store_4x8(dst, stride, &dc); |
328 | 3.56k | } |
329 | | |
330 | | void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, |
331 | | const uint16_t *above, |
332 | 13.8k | const uint16_t *left, int bd) { |
333 | 13.8k | const __m128i two = _mm_cvtsi32_si128(2); |
334 | 13.8k | const __m128i sum = dc_sum_4(above); |
335 | 13.8k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); |
336 | 13.8k | (void)left; |
337 | 13.8k | (void)bd; |
338 | 13.8k | dc_store_4x8(dst, stride, &dc); |
339 | 13.8k | } |
340 | | |
void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  (void)left;
  (void)above;
  // Mid-grey for the given bit depth, broadcast into the low lanes.
  const __m128i dc_dup =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << (bd - 1)), 0x0);
  dc_store_4x8(dst, stride, &dc_dup);
}
350 | | |
351 | | // ----------------------------------------------------------------------------- |
352 | | // 8xh |
353 | | |
static inline void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height,
                                const __m128i *dc) {
  // Broadcast lane 0 of *dc across all eight 16-bit lanes, then write
  // `height` rows of 8 pixels.
  const __m128i lo = _mm_shufflelo_epi16(*dc, 0);
  const __m128i fill = _mm_unpacklo_epi64(lo, lo);
  while (height-- > 0) {
    _mm_store_si128((__m128i *)dst, fill);
    dst += stride;
  }
}
363 | | |
364 | | // ----------------------------------------------------------------------------- |
365 | | // DC_TOP |
366 | | |
367 | | static inline void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride, |
368 | 60.5k | int height, const uint16_t *above) { |
369 | 60.5k | const __m128i four = _mm_cvtsi32_si128(4); |
370 | 60.5k | const __m128i sum = dc_sum_8(above); |
371 | 60.5k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); |
372 | 60.5k | dc_store_8xh(dst, stride, height, &dc); |
373 | 60.5k | } |
374 | | |
void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // DC-top ignores the left column and the bit depth.
  (void)bd;
  (void)left;
  dc_top_predictor_8xh(dst, stride, 4, above);
}
382 | | |
void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // DC-top ignores the left column and the bit depth.
  (void)bd;
  (void)left;
  dc_top_predictor_8xh(dst, stride, 8, above);
}
390 | | |
void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  // DC-top ignores the left column and the bit depth.
  (void)bd;
  (void)left;
  dc_top_predictor_8xh(dst, stride, 16, above);
}
398 | | |
399 | | // ----------------------------------------------------------------------------- |
400 | | // DC_LEFT |
401 | | |
402 | | void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, |
403 | | const uint16_t *above, |
404 | 6.13k | const uint16_t *left, int bd) { |
405 | 6.13k | const __m128i two = _mm_cvtsi32_si128(2); |
406 | 6.13k | const __m128i sum = dc_sum_4(left); |
407 | 6.13k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); |
408 | 6.13k | (void)above; |
409 | 6.13k | (void)bd; |
410 | 6.13k | dc_store_8xh(dst, stride, 4, &dc); |
411 | 6.13k | } |
412 | | |
413 | | void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, |
414 | | const uint16_t *above, |
415 | 42.4k | const uint16_t *left, int bd) { |
416 | 42.4k | const __m128i four = _mm_cvtsi32_si128(4); |
417 | 42.4k | const __m128i sum = dc_sum_8(left); |
418 | 42.4k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); |
419 | 42.4k | (void)above; |
420 | 42.4k | (void)bd; |
421 | 42.4k | dc_store_8xh(dst, stride, 8, &dc); |
422 | 42.4k | } |
423 | | |
424 | | // Shared with DC 16xh |
425 | 2.20M | static inline __m128i dc_sum_16(const uint16_t *ref) { |
426 | 2.20M | const __m128i sum_lo = dc_sum_8(ref); |
427 | 2.20M | const __m128i sum_hi = dc_sum_8(ref + 8); |
428 | 2.20M | return _mm_add_epi16(sum_lo, sum_hi); |
429 | 2.20M | } |
430 | | |
431 | | void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, |
432 | | const uint16_t *above, |
433 | 7.62k | const uint16_t *left, int bd) { |
434 | 7.62k | const __m128i eight = _mm_cvtsi32_si128(8); |
435 | 7.62k | const __m128i sum = dc_sum_16(left); |
436 | 7.62k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
437 | 7.62k | (void)above; |
438 | 7.62k | (void)bd; |
439 | 7.62k | dc_store_8xh(dst, stride, 16, &dc); |
440 | 7.62k | } |
441 | | |
442 | | // ----------------------------------------------------------------------------- |
443 | | // DC_128 |
444 | | |
static inline void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
                                        int height, int bd) {
  // Mid-grey for the given bit depth, broadcast into the low lanes before
  // the store helper widens it to a full row.
  const __m128i dc_dup =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << (bd - 1)), 0x0);
  dc_store_8xh(dst, stride, height, &dc_dup);
}
451 | | |
void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // DC-128 ignores both edges.
  (void)left;
  (void)above;
  dc_128_predictor_8xh(dst, stride, 4, bd);
}
459 | | |
void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // DC-128 ignores both edges.
  (void)left;
  (void)above;
  dc_128_predictor_8xh(dst, stride, 8, bd);
}
467 | | |
void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  // DC-128 ignores both edges.
  (void)left;
  (void)above;
  dc_128_predictor_8xh(dst, stride, 16, bd);
}
475 | | |
476 | | // ----------------------------------------------------------------------------- |
477 | | // 16xh |
478 | | |
static inline void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height,
                                 const __m128i *dc) {
  // Broadcast lane 0 of *dc across all eight 16-bit lanes, then write
  // `height` rows of 16 pixels (two aligned stores per row).
  const __m128i lo = _mm_shufflelo_epi16(*dc, 0);
  const __m128i fill = _mm_unpacklo_epi64(lo, lo);
  for (int r = 0; r < height; ++r, dst += stride) {
    _mm_store_si128((__m128i *)(dst + 0), fill);
    _mm_store_si128((__m128i *)(dst + 8), fill);
  }
}
489 | | |
490 | | // ----------------------------------------------------------------------------- |
491 | | // DC_LEFT |
492 | | |
493 | | void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, |
494 | | const uint16_t *above, |
495 | 13.0k | const uint16_t *left, int bd) { |
496 | 13.0k | const __m128i four = _mm_cvtsi32_si128(4); |
497 | 13.0k | const __m128i sum = dc_sum_8(left); |
498 | 13.0k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); |
499 | 13.0k | (void)above; |
500 | 13.0k | (void)bd; |
501 | 13.0k | dc_store_16xh(dst, stride, 8, &dc); |
502 | 13.0k | } |
503 | | |
504 | | void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, |
505 | | const uint16_t *above, |
506 | 37.2k | const uint16_t *left, int bd) { |
507 | 37.2k | const __m128i eight = _mm_cvtsi32_si128(8); |
508 | 37.2k | const __m128i sum = dc_sum_16(left); |
509 | 37.2k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
510 | 37.2k | (void)above; |
511 | 37.2k | (void)bd; |
512 | 37.2k | dc_store_16xh(dst, stride, 16, &dc); |
513 | 37.2k | } |
514 | | |
515 | | // Shared with 32xh |
516 | 512k | static inline __m128i dc_sum_32(const uint16_t *ref) { |
517 | 512k | const __m128i zero = _mm_setzero_si128(); |
518 | 512k | const __m128i sum_a = dc_sum_16(ref); |
519 | 512k | const __m128i sum_b = dc_sum_16(ref + 16); |
520 | | // 12 bit bd will outrange, so expand to 32 bit before adding final total |
521 | 512k | return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero), |
522 | 512k | _mm_unpacklo_epi16(sum_b, zero)); |
523 | 512k | } |
524 | | |
525 | | void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, |
526 | | const uint16_t *above, |
527 | 5.93k | const uint16_t *left, int bd) { |
528 | 5.93k | const __m128i sixteen = _mm_cvtsi32_si128(16); |
529 | 5.93k | const __m128i sum = dc_sum_32(left); |
530 | 5.93k | const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); |
531 | 5.93k | (void)above; |
532 | 5.93k | (void)bd; |
533 | 5.93k | dc_store_16xh(dst, stride, 32, &dc); |
534 | 5.93k | } |
535 | | |
536 | | // ----------------------------------------------------------------------------- |
537 | | // DC_TOP |
538 | | |
539 | | void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, |
540 | | const uint16_t *above, |
541 | 7.77k | const uint16_t *left, int bd) { |
542 | 7.77k | const __m128i eight = _mm_cvtsi32_si128(8); |
543 | 7.77k | const __m128i sum = dc_sum_16(above); |
544 | 7.77k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
545 | 7.77k | (void)left; |
546 | 7.77k | (void)bd; |
547 | 7.77k | dc_store_16xh(dst, stride, 8, &dc); |
548 | 7.77k | } |
549 | | |
550 | | void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, |
551 | | const uint16_t *above, |
552 | 13.1k | const uint16_t *left, int bd) { |
553 | 13.1k | const __m128i eight = _mm_cvtsi32_si128(8); |
554 | 13.1k | const __m128i sum = dc_sum_16(above); |
555 | 13.1k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
556 | 13.1k | (void)left; |
557 | 13.1k | (void)bd; |
558 | 13.1k | dc_store_16xh(dst, stride, 16, &dc); |
559 | 13.1k | } |
560 | | |
561 | | void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, |
562 | | const uint16_t *above, |
563 | 12.0k | const uint16_t *left, int bd) { |
564 | 12.0k | const __m128i eight = _mm_cvtsi32_si128(8); |
565 | 12.0k | const __m128i sum = dc_sum_16(above); |
566 | 12.0k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
567 | 12.0k | (void)left; |
568 | 12.0k | (void)bd; |
569 | 12.0k | dc_store_16xh(dst, stride, 32, &dc); |
570 | 12.0k | } |
571 | | |
572 | | // ----------------------------------------------------------------------------- |
573 | | // DC_128 |
574 | | |
void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  (void)left;
  (void)above;
  // Mid-grey for the given bit depth, broadcast into the low lanes.
  const __m128i dc_dup =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << (bd - 1)), 0x0);
  dc_store_16xh(dst, stride, 8, &dc_dup);
}
584 | | |
void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  (void)left;
  (void)above;
  // Mid-grey for the given bit depth, broadcast into the low lanes.
  const __m128i dc_dup =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << (bd - 1)), 0x0);
  dc_store_16xh(dst, stride, 16, &dc_dup);
}
594 | | |
void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  (void)left;
  (void)above;
  // Mid-grey for the given bit depth, broadcast into the low lanes.
  const __m128i dc_dup =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << (bd - 1)), 0x0);
  dc_store_16xh(dst, stride, 32, &dc_dup);
}
604 | | |
605 | | // ----------------------------------------------------------------------------- |
606 | | // 32xh |
607 | | |
static inline void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height,
                                 const __m128i *dc) {
  // Broadcast lane 0 of *dc across all eight 16-bit lanes, then write
  // `height` rows of 32 pixels (four aligned stores per row).
  const __m128i lo = _mm_shufflelo_epi16(*dc, 0);
  const __m128i fill = _mm_unpacklo_epi64(lo, lo);
  for (int r = 0; r < height; ++r, dst += stride) {
    _mm_store_si128((__m128i *)(dst + 0), fill);
    _mm_store_si128((__m128i *)(dst + 8), fill);
    _mm_store_si128((__m128i *)(dst + 16), fill);
    _mm_store_si128((__m128i *)(dst + 24), fill);
  }
}
620 | | |
621 | | void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, |
622 | | const uint16_t *above, |
623 | 6.89k | const uint16_t *left, int bd) { |
624 | 6.89k | const __m128i eight = _mm_cvtsi32_si128(8); |
625 | 6.89k | const __m128i sum = dc_sum_16(left); |
626 | 6.89k | const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); |
627 | 6.89k | (void)above; |
628 | 6.89k | (void)bd; |
629 | 6.89k | dc_store_32xh(dst, stride, 16, &dc); |
630 | 6.89k | } |
631 | | |
632 | | void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, |
633 | | const uint16_t *above, |
634 | 100k | const uint16_t *left, int bd) { |
635 | 100k | const __m128i sixteen = _mm_cvtsi32_si128(16); |
636 | 100k | const __m128i sum = dc_sum_32(left); |
637 | 100k | const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); |
638 | 100k | (void)above; |
639 | 100k | (void)bd; |
640 | 100k | dc_store_32xh(dst, stride, 32, &dc); |
641 | 100k | } |
642 | | |
643 | | void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, |
644 | | const uint16_t *above, |
645 | 7.68k | const uint16_t *left, int bd) { |
646 | 7.68k | const __m128i sixteen = _mm_cvtsi32_si128(16); |
647 | 7.68k | const __m128i sum = dc_sum_32(above); |
648 | 7.68k | const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); |
649 | 7.68k | (void)left; |
650 | 7.68k | (void)bd; |
651 | 7.68k | dc_store_32xh(dst, stride, 16, &dc); |
652 | 7.68k | } |
653 | | |
void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  (void)left;
  (void)above;
  // Mid-grey for the given bit depth, broadcast into the low lanes.
  const __m128i dc_dup =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << (bd - 1)), 0x0);
  dc_store_32xh(dst, stride, 16, &dc_dup);
}
663 | | |
664 | | void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, |
665 | | const uint16_t *above, |
666 | 67.7k | const uint16_t *left, int bd) { |
667 | 67.7k | const __m128i sixteen = _mm_cvtsi32_si128(16); |
668 | 67.7k | const __m128i sum = dc_sum_32(above); |
669 | 67.7k | const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); |
670 | 67.7k | (void)left; |
671 | 67.7k | (void)bd; |
672 | 67.7k | dc_store_32xh(dst, stride, 32, &dc); |
673 | 67.7k | } |
674 | | |
void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  (void)left;
  (void)above;
  // Mid-grey for the given bit depth, broadcast into the low lanes.
  const __m128i dc_dup =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << (bd - 1)), 0x0);
  dc_store_32xh(dst, stride, 32, &dc_dup);
}
684 | | |
685 | | // ----------------------------------------------------------------------------- |
686 | | // V_PRED |
687 | | |
void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // V prediction: every row is a copy of the 4 above pixels. The left
  // column and bit depth are unused.
  (void)left;
  (void)bd;
  const __m128i row = _mm_loadl_epi64((const __m128i *)above);
  for (int r = 0; r < 8; ++r, dst += stride) {
    _mm_storel_epi64((__m128i *)dst, row);
  }
}
703 | | |
// Vertical prediction for an 8x4 block: copy the 8-pixel above row into each
// of the 4 output rows. `above` and `dst` rows must be 16-byte aligned
// (aligned load/store intrinsics are used).
void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  (void)left;
  (void)bd;
  const __m128i top_row = _mm_load_si128((const __m128i *)above);
  for (int r = 0; r < 4; ++r) {
    _mm_store_si128((__m128i *)dst, top_row);
    dst += stride;
  }
}
715 | | |
// Vertical prediction for an 8x16 block: copy the 8-pixel above row into each
// of the 16 output rows. Rows are written with aligned 128-bit stores.
void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  (void)left;
  (void)bd;
  const __m128i top_row = _mm_load_si128((const __m128i *)above);
  for (int r = 0; r < 16; ++r) {
    _mm_store_si128((__m128i *)dst, top_row);
    dst += stride;
  }
}
731 | | |
// Vertical prediction for a 16x8 block: copy the 16-pixel above row (held in
// two 128-bit registers) into each of the 8 output rows.
void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  (void)left;
  (void)bd;
  const __m128i top_lo = _mm_load_si128((const __m128i *)above);
  const __m128i top_hi = _mm_load_si128((const __m128i *)(above + 8));
  for (int r = 0; r < 8; ++r) {
    _mm_store_si128((__m128i *)(dst + 0), top_lo);
    _mm_store_si128((__m128i *)(dst + 8), top_hi);
    dst += stride;
  }
}
755 | | |
// Vertical prediction for a 16x32 block: copy the 16-pixel above row (two
// 128-bit registers) into each of the 32 output rows.
void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  (void)left;
  (void)bd;
  const __m128i top_lo = _mm_load_si128((const __m128i *)above);
  const __m128i top_hi = _mm_load_si128((const __m128i *)(above + 8));
  for (int r = 0; r < 32; ++r) {
    _mm_store_si128((__m128i *)(dst + 0), top_lo);
    _mm_store_si128((__m128i *)(dst + 8), top_hi);
    dst += stride;
  }
}
779 | | |
// Vertical prediction for a 32x16 block: copy the 32-pixel above row (four
// 128-bit registers) into each of the 16 output rows.
void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  (void)left;
  (void)bd;
  const __m128i t0 = _mm_load_si128((const __m128i *)(above + 0));
  const __m128i t1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i t2 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i t3 = _mm_load_si128((const __m128i *)(above + 24));
  for (int r = 0; r < 16; ++r) {
    _mm_store_si128((__m128i *)(dst + 0), t0);
    _mm_store_si128((__m128i *)(dst + 8), t1);
    _mm_store_si128((__m128i *)(dst + 16), t2);
    _mm_store_si128((__m128i *)(dst + 24), t3);
    dst += stride;
  }
}
813 | | |
814 | | // ----------------------------------------------------------------------------- |
815 | | // DC_PRED |
816 | | |
817 | | void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, |
818 | | const uint16_t *above, |
819 | 278k | const uint16_t *left, int bd) { |
820 | 278k | (void)bd; |
821 | 278k | const __m128i sum_above = dc_sum_4(above); |
822 | 278k | const __m128i sum_left = dc_sum_8(left); |
823 | 278k | const __m128i sum = _mm_add_epi16(sum_above, sum_left); |
824 | 278k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
825 | 278k | sum32 >>= 16; |
826 | 278k | sum32 += 6; |
827 | 278k | sum32 /= 12; |
828 | 278k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
829 | 278k | int i; |
830 | 1.39M | for (i = 0; i < 4; ++i) { |
831 | 1.11M | _mm_storel_epi64((__m128i *)dst, row); |
832 | 1.11M | dst += stride; |
833 | 1.11M | _mm_storel_epi64((__m128i *)dst, row); |
834 | 1.11M | dst += stride; |
835 | 1.11M | } |
836 | 278k | } |
837 | | |
838 | | void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, |
839 | | const uint16_t *above, |
840 | 544k | const uint16_t *left, int bd) { |
841 | 544k | (void)bd; |
842 | 544k | const __m128i sum_left = dc_sum_4(left); |
843 | 544k | const __m128i sum_above = dc_sum_8(above); |
844 | 544k | const __m128i sum = _mm_add_epi16(sum_above, sum_left); |
845 | 544k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
846 | 544k | sum32 >>= 16; |
847 | 544k | sum32 += 6; |
848 | 544k | sum32 /= 12; |
849 | 544k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
850 | | |
851 | 544k | _mm_store_si128((__m128i *)dst, row); |
852 | 544k | dst += stride; |
853 | 544k | _mm_store_si128((__m128i *)dst, row); |
854 | 544k | dst += stride; |
855 | 544k | _mm_store_si128((__m128i *)dst, row); |
856 | 544k | dst += stride; |
857 | 544k | _mm_store_si128((__m128i *)dst, row); |
858 | 544k | } |
859 | | |
860 | | void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, |
861 | | const uint16_t *above, |
862 | 288k | const uint16_t *left, int bd) { |
863 | 288k | (void)bd; |
864 | 288k | __m128i sum_left = dc_sum_16(left); |
865 | 288k | __m128i sum_above = dc_sum_8(above); |
866 | 288k | const __m128i zero = _mm_setzero_si128(); |
867 | 288k | sum_left = _mm_unpacklo_epi16(sum_left, zero); |
868 | 288k | sum_above = _mm_unpacklo_epi16(sum_above, zero); |
869 | 288k | const __m128i sum = _mm_add_epi32(sum_left, sum_above); |
870 | 288k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
871 | 288k | sum32 += 12; |
872 | 288k | sum32 /= 24; |
873 | 288k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
874 | 288k | int i; |
875 | 1.44M | for (i = 0; i < 4; ++i) { |
876 | 1.15M | _mm_store_si128((__m128i *)dst, row); |
877 | 1.15M | dst += stride; |
878 | 1.15M | _mm_store_si128((__m128i *)dst, row); |
879 | 1.15M | dst += stride; |
880 | 1.15M | _mm_store_si128((__m128i *)dst, row); |
881 | 1.15M | dst += stride; |
882 | 1.15M | _mm_store_si128((__m128i *)dst, row); |
883 | 1.15M | dst += stride; |
884 | 1.15M | } |
885 | 288k | } |
886 | | |
887 | | void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, |
888 | | const uint16_t *above, |
889 | 480k | const uint16_t *left, int bd) { |
890 | 480k | (void)bd; |
891 | 480k | __m128i sum_left = dc_sum_8(left); |
892 | 480k | __m128i sum_above = dc_sum_16(above); |
893 | 480k | const __m128i zero = _mm_setzero_si128(); |
894 | 480k | sum_left = _mm_unpacklo_epi16(sum_left, zero); |
895 | 480k | sum_above = _mm_unpacklo_epi16(sum_above, zero); |
896 | 480k | const __m128i sum = _mm_add_epi32(sum_left, sum_above); |
897 | 480k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
898 | 480k | sum32 += 12; |
899 | 480k | sum32 /= 24; |
900 | 480k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
901 | 480k | int i; |
902 | 1.44M | for (i = 0; i < 2; ++i) { |
903 | 960k | _mm_store_si128((__m128i *)dst, row); |
904 | 960k | _mm_store_si128((__m128i *)(dst + 8), row); |
905 | 960k | dst += stride; |
906 | 960k | _mm_store_si128((__m128i *)dst, row); |
907 | 960k | _mm_store_si128((__m128i *)(dst + 8), row); |
908 | 960k | dst += stride; |
909 | 960k | _mm_store_si128((__m128i *)dst, row); |
910 | 960k | _mm_store_si128((__m128i *)(dst + 8), row); |
911 | 960k | dst += stride; |
912 | 960k | _mm_store_si128((__m128i *)dst, row); |
913 | 960k | _mm_store_si128((__m128i *)(dst + 8), row); |
914 | 960k | dst += stride; |
915 | 960k | } |
916 | 480k | } |
917 | | |
918 | | void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, |
919 | | const uint16_t *above, |
920 | 187k | const uint16_t *left, int bd) { |
921 | 187k | (void)bd; |
922 | 187k | __m128i sum_left = dc_sum_32(left); |
923 | 187k | __m128i sum_above = dc_sum_16(above); |
924 | 187k | const __m128i zero = _mm_setzero_si128(); |
925 | 187k | sum_above = _mm_unpacklo_epi16(sum_above, zero); |
926 | 187k | const __m128i sum = _mm_add_epi32(sum_left, sum_above); |
927 | 187k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
928 | 187k | sum32 += 24; |
929 | 187k | sum32 /= 48; |
930 | 187k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
931 | 187k | int i; |
932 | 1.68M | for (i = 0; i < 8; ++i) { |
933 | 1.49M | _mm_store_si128((__m128i *)dst, row); |
934 | 1.49M | _mm_store_si128((__m128i *)(dst + 8), row); |
935 | 1.49M | dst += stride; |
936 | 1.49M | _mm_store_si128((__m128i *)dst, row); |
937 | 1.49M | _mm_store_si128((__m128i *)(dst + 8), row); |
938 | 1.49M | dst += stride; |
939 | 1.49M | _mm_store_si128((__m128i *)dst, row); |
940 | 1.49M | _mm_store_si128((__m128i *)(dst + 8), row); |
941 | 1.49M | dst += stride; |
942 | 1.49M | _mm_store_si128((__m128i *)dst, row); |
943 | 1.49M | _mm_store_si128((__m128i *)(dst + 8), row); |
944 | 1.49M | dst += stride; |
945 | 1.49M | } |
946 | 187k | } |
947 | | |
948 | | void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, |
949 | | const uint16_t *above, |
950 | 143k | const uint16_t *left, int bd) { |
951 | 143k | (void)bd; |
952 | 143k | __m128i sum_left = dc_sum_16(left); |
953 | 143k | __m128i sum_above = dc_sum_32(above); |
954 | 143k | const __m128i zero = _mm_setzero_si128(); |
955 | 143k | sum_left = _mm_unpacklo_epi16(sum_left, zero); |
956 | 143k | const __m128i sum = _mm_add_epi32(sum_left, sum_above); |
957 | 143k | uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); |
958 | 143k | sum32 += 24; |
959 | 143k | sum32 /= 48; |
960 | 143k | const __m128i row = _mm_set1_epi16((int16_t)sum32); |
961 | 143k | int i; |
962 | 716k | for (i = 0; i < 4; ++i) { |
963 | 573k | _mm_store_si128((__m128i *)dst, row); |
964 | 573k | _mm_store_si128((__m128i *)(dst + 8), row); |
965 | 573k | _mm_store_si128((__m128i *)(dst + 16), row); |
966 | 573k | _mm_store_si128((__m128i *)(dst + 24), row); |
967 | 573k | dst += stride; |
968 | 573k | _mm_store_si128((__m128i *)dst, row); |
969 | 573k | _mm_store_si128((__m128i *)(dst + 8), row); |
970 | 573k | _mm_store_si128((__m128i *)(dst + 16), row); |
971 | 573k | _mm_store_si128((__m128i *)(dst + 24), row); |
972 | 573k | dst += stride; |
973 | 573k | _mm_store_si128((__m128i *)dst, row); |
974 | 573k | _mm_store_si128((__m128i *)(dst + 8), row); |
975 | 573k | _mm_store_si128((__m128i *)(dst + 16), row); |
976 | 573k | _mm_store_si128((__m128i *)(dst + 24), row); |
977 | 573k | dst += stride; |
978 | 573k | _mm_store_si128((__m128i *)dst, row); |
979 | 573k | _mm_store_si128((__m128i *)(dst + 8), row); |
980 | 573k | _mm_store_si128((__m128i *)(dst + 16), row); |
981 | 573k | _mm_store_si128((__m128i *)(dst + 24), row); |
982 | 573k | dst += stride; |
983 | 573k | } |
984 | 143k | } |