/src/aom/aom_dsp/x86/intrapred_sse2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2017, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
#include <emmintrin.h>
#include <string.h>

#include "aom_dsp/x86/intrapred_x86.h"
#include "config/aom_dsp_rtcd.h"
15 | | |
16 | | static inline void dc_store_4xh(uint32_t dc, int height, uint8_t *dst, |
17 | 427k | ptrdiff_t stride) { |
18 | 2.79M | for (int i = 0; i < height; i += 2) { |
19 | 2.36M | *(uint32_t *)dst = dc; |
20 | 2.36M | dst += stride; |
21 | 2.36M | *(uint32_t *)dst = dc; |
22 | 2.36M | dst += stride; |
23 | 2.36M | } |
24 | 427k | } |
25 | | |
26 | | static inline void dc_store_8xh(const __m128i *row, int height, uint8_t *dst, |
27 | 727k | ptrdiff_t stride) { |
28 | 727k | int i; |
29 | 9.57M | for (i = 0; i < height; ++i) { |
30 | 8.85M | _mm_storel_epi64((__m128i *)dst, *row); |
31 | 8.85M | dst += stride; |
32 | 8.85M | } |
33 | 727k | } |
34 | | |
35 | | static inline void dc_store_16xh(const __m128i *row, int height, uint8_t *dst, |
36 | 1.09M | ptrdiff_t stride) { |
37 | 1.09M | int i; |
38 | 12.2M | for (i = 0; i < height; ++i) { |
39 | 11.1M | _mm_store_si128((__m128i *)dst, *row); |
40 | 11.1M | dst += stride; |
41 | 11.1M | } |
42 | 1.09M | } |
43 | | |
44 | | static inline void dc_store_32xh(const __m128i *row, int height, uint8_t *dst, |
45 | 326k | ptrdiff_t stride) { |
46 | 326k | int i; |
47 | 2.94M | for (i = 0; i < height; ++i) { |
48 | 2.61M | _mm_store_si128((__m128i *)dst, *row); |
49 | 2.61M | _mm_store_si128((__m128i *)(dst + 16), *row); |
50 | 2.61M | dst += stride; |
51 | 2.61M | } |
52 | 326k | } |
53 | | |
54 | | static inline void dc_store_64xh(const __m128i *row, int height, uint8_t *dst, |
55 | 0 | ptrdiff_t stride) { |
56 | 0 | for (int i = 0; i < height; ++i) { |
57 | 0 | _mm_store_si128((__m128i *)dst, *row); |
58 | 0 | _mm_store_si128((__m128i *)(dst + 16), *row); |
59 | 0 | _mm_store_si128((__m128i *)(dst + 32), *row); |
60 | 0 | _mm_store_si128((__m128i *)(dst + 48), *row); |
61 | 0 | dst += stride; |
62 | 0 | } |
63 | 0 | } |
64 | | |
65 | 1.27M | static inline __m128i dc_sum_4(const uint8_t *ref) { |
66 | 1.27M | __m128i x = _mm_loadl_epi64((__m128i const *)ref); |
67 | 1.27M | const __m128i zero = _mm_setzero_si128(); |
68 | 1.27M | x = _mm_unpacklo_epi8(x, zero); |
69 | 1.27M | return _mm_sad_epu8(x, zero); |
70 | 1.27M | } |
71 | | |
72 | 1.51M | static inline __m128i dc_sum_8(const uint8_t *ref) { |
73 | 1.51M | __m128i x = _mm_loadl_epi64((__m128i const *)ref); |
74 | 1.51M | const __m128i zero = _mm_setzero_si128(); |
75 | 1.51M | return _mm_sad_epu8(x, zero); |
76 | 1.51M | } |
77 | | |
78 | 16.8k | static inline __m128i dc_sum_64(const uint8_t *ref) { |
79 | 16.8k | __m128i x0 = _mm_load_si128((__m128i const *)ref); |
80 | 16.8k | __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); |
81 | 16.8k | __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32)); |
82 | 16.8k | __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48)); |
83 | 16.8k | const __m128i zero = _mm_setzero_si128(); |
84 | 16.8k | x0 = _mm_sad_epu8(x0, zero); |
85 | 16.8k | x1 = _mm_sad_epu8(x1, zero); |
86 | 16.8k | x2 = _mm_sad_epu8(x2, zero); |
87 | 16.8k | x3 = _mm_sad_epu8(x3, zero); |
88 | 16.8k | x0 = _mm_add_epi16(x0, x1); |
89 | 16.8k | x2 = _mm_add_epi16(x2, x3); |
90 | 16.8k | x0 = _mm_add_epi16(x0, x2); |
91 | 16.8k | const __m128i high = _mm_unpackhi_epi64(x0, x0); |
92 | 16.8k | return _mm_add_epi16(x0, high); |
93 | 16.8k | } |
94 | | |
// Fixed-point reciprocal multipliers: round-up of 2^16/3 and 2^16/5,
// used to divide by 3 and 5 after a power-of-two shift.
#define DC_MULTIPLIER_1X2 0x5556
#define DC_MULTIPLIER_1X4 0x3334

#define DC_SHIFT2 16

// Divides `num` by (2^shift1 * d) where `multiplier` ~= 2^16/d, by
// shifting first and then applying the reciprocal multiply. Exact for
// the operand ranges produced by the DC predictors above.
static inline int divide_using_multiply_shift(int num, int shift1,
                                              int multiplier) {
  const int quotient = num >> shift1;
  return (quotient * multiplier) >> DC_SHIFT2;
}
105 | | |
106 | | // ----------------------------------------------------------------------------- |
107 | | // DC_PRED |
108 | | |
109 | | void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, |
110 | 227k | const uint8_t *above, const uint8_t *left) { |
111 | 227k | const __m128i sum_left = dc_sum_8(left); |
112 | 227k | __m128i sum_above = dc_sum_4(above); |
113 | 227k | sum_above = _mm_add_epi16(sum_left, sum_above); |
114 | | |
115 | 227k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
116 | 227k | sum += 6; |
117 | 227k | sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); |
118 | | |
119 | 227k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
120 | 227k | const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row); |
121 | 227k | dc_store_4xh(pred, 8, dst, stride); |
122 | 227k | } |
123 | | |
124 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
125 | | void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, |
126 | 149k | const uint8_t *above, const uint8_t *left) { |
127 | 149k | const __m128i sum_left = dc_sum_16_sse2(left); |
128 | 149k | __m128i sum_above = dc_sum_4(above); |
129 | 149k | sum_above = _mm_add_epi16(sum_left, sum_above); |
130 | | |
131 | 149k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
132 | 149k | sum += 10; |
133 | 149k | sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); |
134 | | |
135 | 149k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
136 | 149k | const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row); |
137 | 149k | dc_store_4xh(pred, 16, dst, stride); |
138 | 149k | } |
139 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
140 | | |
141 | | void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, |
142 | 344k | const uint8_t *above, const uint8_t *left) { |
143 | 344k | const __m128i sum_left = dc_sum_4(left); |
144 | 344k | __m128i sum_above = dc_sum_8(above); |
145 | 344k | sum_above = _mm_add_epi16(sum_above, sum_left); |
146 | | |
147 | 344k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
148 | 344k | sum += 6; |
149 | 344k | sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); |
150 | | |
151 | 344k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
152 | 344k | dc_store_8xh(&row, 4, dst, stride); |
153 | 344k | } |
154 | | |
155 | | void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, |
156 | 195k | const uint8_t *above, const uint8_t *left) { |
157 | 195k | const __m128i sum_left = dc_sum_16_sse2(left); |
158 | 195k | __m128i sum_above = dc_sum_8(above); |
159 | 195k | sum_above = _mm_add_epi16(sum_above, sum_left); |
160 | | |
161 | 195k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
162 | 195k | sum += 12; |
163 | 195k | sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); |
164 | 195k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
165 | 195k | dc_store_8xh(&row, 16, dst, stride); |
166 | 195k | } |
167 | | |
168 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
169 | | void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, |
170 | 97.5k | const uint8_t *above, const uint8_t *left) { |
171 | 97.5k | const __m128i sum_left = dc_sum_32_sse2(left); |
172 | 97.5k | __m128i sum_above = dc_sum_8(above); |
173 | 97.5k | sum_above = _mm_add_epi16(sum_above, sum_left); |
174 | | |
175 | 97.5k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
176 | 97.5k | sum += 20; |
177 | 97.5k | sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); |
178 | 97.5k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
179 | 97.5k | dc_store_8xh(&row, 32, dst, stride); |
180 | 97.5k | } |
181 | | |
182 | | void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, |
183 | 537k | const uint8_t *above, const uint8_t *left) { |
184 | 537k | const __m128i sum_left = dc_sum_4(left); |
185 | 537k | __m128i sum_above = dc_sum_16_sse2(above); |
186 | 537k | sum_above = _mm_add_epi16(sum_above, sum_left); |
187 | | |
188 | 537k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
189 | 537k | sum += 10; |
190 | 537k | sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); |
191 | 537k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
192 | 537k | dc_store_16xh(&row, 4, dst, stride); |
193 | 537k | } |
194 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
195 | | |
196 | | void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, |
197 | 309k | const uint8_t *above, const uint8_t *left) { |
198 | 309k | const __m128i sum_left = dc_sum_8(left); |
199 | 309k | __m128i sum_above = dc_sum_16_sse2(above); |
200 | 309k | sum_above = _mm_add_epi16(sum_above, sum_left); |
201 | | |
202 | 309k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
203 | 309k | sum += 12; |
204 | 309k | sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); |
205 | 309k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
206 | 309k | dc_store_16xh(&row, 8, dst, stride); |
207 | 309k | } |
208 | | |
209 | | void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, |
210 | 130k | const uint8_t *above, const uint8_t *left) { |
211 | 130k | const __m128i sum_left = dc_sum_32_sse2(left); |
212 | 130k | __m128i sum_above = dc_sum_16_sse2(above); |
213 | 130k | sum_above = _mm_add_epi16(sum_left, sum_above); |
214 | | |
215 | 130k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
216 | 130k | sum += 24; |
217 | 130k | sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); |
218 | 130k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
219 | 130k | dc_store_16xh(&row, 32, dst, stride); |
220 | 130k | } |
221 | | |
222 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
223 | | void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, |
224 | 16.4k | const uint8_t *above, const uint8_t *left) { |
225 | 16.4k | const __m128i sum_left = dc_sum_64(left); |
226 | 16.4k | __m128i sum_above = dc_sum_16_sse2(above); |
227 | 16.4k | sum_above = _mm_add_epi16(sum_left, sum_above); |
228 | | |
229 | 16.4k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
230 | 16.4k | sum += 40; |
231 | 16.4k | sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); |
232 | 16.4k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
233 | 16.4k | dc_store_16xh(&row, 64, dst, stride); |
234 | 16.4k | } |
235 | | |
236 | | void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, |
237 | 318k | const uint8_t *above, const uint8_t *left) { |
238 | 318k | __m128i sum_above = dc_sum_32_sse2(above); |
239 | 318k | const __m128i sum_left = dc_sum_8(left); |
240 | 318k | sum_above = _mm_add_epi16(sum_above, sum_left); |
241 | | |
242 | 318k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
243 | 318k | sum += 20; |
244 | 318k | sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); |
245 | 318k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
246 | 318k | dc_store_32xh(&row, 8, dst, stride); |
247 | 318k | } |
248 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
249 | | |
250 | | void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, |
251 | 0 | const uint8_t *above, const uint8_t *left) { |
252 | 0 | __m128i sum_above = dc_sum_32_sse2(above); |
253 | 0 | const __m128i sum_left = dc_sum_16_sse2(left); |
254 | 0 | sum_above = _mm_add_epi16(sum_above, sum_left); |
255 | |
|
256 | 0 | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
257 | 0 | sum += 24; |
258 | 0 | sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); |
259 | 0 | const __m128i row = _mm_set1_epi8((int8_t)sum); |
260 | 0 | dc_store_32xh(&row, 16, dst, stride); |
261 | 0 | } |
262 | | |
263 | | void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, |
264 | 0 | const uint8_t *above, const uint8_t *left) { |
265 | 0 | __m128i sum_above = dc_sum_32_sse2(above); |
266 | 0 | const __m128i sum_left = dc_sum_64(left); |
267 | 0 | sum_above = _mm_add_epi16(sum_above, sum_left); |
268 | |
|
269 | 0 | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
270 | 0 | sum += 48; |
271 | 0 | sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); |
272 | 0 | const __m128i row = _mm_set1_epi8((int8_t)sum); |
273 | 0 | dc_store_32xh(&row, 64, dst, stride); |
274 | 0 | } |
275 | | |
276 | | void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, |
277 | 0 | const uint8_t *above, const uint8_t *left) { |
278 | 0 | __m128i sum_above = dc_sum_64(above); |
279 | 0 | const __m128i sum_left = dc_sum_64(left); |
280 | 0 | sum_above = _mm_add_epi16(sum_above, sum_left); |
281 | |
|
282 | 0 | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
283 | 0 | sum += 64; |
284 | 0 | sum /= 128; |
285 | 0 | const __m128i row = _mm_set1_epi8((int8_t)sum); |
286 | 0 | dc_store_64xh(&row, 64, dst, stride); |
287 | 0 | } |
288 | | |
289 | | void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, |
290 | 0 | const uint8_t *above, const uint8_t *left) { |
291 | 0 | __m128i sum_above = dc_sum_64(above); |
292 | 0 | const __m128i sum_left = dc_sum_32_sse2(left); |
293 | 0 | sum_above = _mm_add_epi16(sum_above, sum_left); |
294 | |
|
295 | 0 | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
296 | 0 | sum += 48; |
297 | 0 | sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); |
298 | 0 | const __m128i row = _mm_set1_epi8((int8_t)sum); |
299 | 0 | dc_store_64xh(&row, 32, dst, stride); |
300 | 0 | } |
301 | | |
302 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
303 | | void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, |
304 | 0 | const uint8_t *above, const uint8_t *left) { |
305 | 0 | __m128i sum_above = dc_sum_64(above); |
306 | 0 | const __m128i sum_left = dc_sum_16_sse2(left); |
307 | 0 | sum_above = _mm_add_epi16(sum_above, sum_left); |
308 | |
|
309 | 0 | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
310 | 0 | sum += 40; |
311 | 0 | sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); |
312 | 0 | const __m128i row = _mm_set1_epi8((int8_t)sum); |
313 | 0 | dc_store_64xh(&row, 16, dst, stride); |
314 | 0 | } |
315 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
316 | | |
317 | | // ----------------------------------------------------------------------------- |
318 | | // DC_TOP |
319 | | |
320 | | void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, |
321 | 3.40k | const uint8_t *above, const uint8_t *left) { |
322 | 3.40k | (void)left; |
323 | 3.40k | __m128i sum_above = dc_sum_4(above); |
324 | 3.40k | const __m128i two = _mm_set1_epi16(2); |
325 | 3.40k | sum_above = _mm_add_epi16(sum_above, two); |
326 | 3.40k | sum_above = _mm_srai_epi16(sum_above, 2); |
327 | 3.40k | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
328 | 3.40k | sum_above = _mm_packus_epi16(sum_above, sum_above); |
329 | | |
330 | 3.40k | const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above); |
331 | 3.40k | dc_store_4xh(pred, 8, dst, stride); |
332 | 3.40k | } |
333 | | |
334 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
335 | | void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, |
336 | 3.30k | const uint8_t *above, const uint8_t *left) { |
337 | 3.30k | (void)left; |
338 | 3.30k | __m128i sum_above = dc_sum_4(above); |
339 | 3.30k | const __m128i two = _mm_set1_epi16(2); |
340 | 3.30k | sum_above = _mm_add_epi16(sum_above, two); |
341 | 3.30k | sum_above = _mm_srai_epi16(sum_above, 2); |
342 | 3.30k | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
343 | 3.30k | sum_above = _mm_packus_epi16(sum_above, sum_above); |
344 | | |
345 | 3.30k | const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above); |
346 | 3.30k | dc_store_4xh(pred, 16, dst, stride); |
347 | 3.30k | } |
348 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
349 | | |
350 | | void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, |
351 | 2.40k | const uint8_t *above, const uint8_t *left) { |
352 | 2.40k | (void)left; |
353 | 2.40k | __m128i sum_above = dc_sum_8(above); |
354 | 2.40k | const __m128i four = _mm_set1_epi16(4); |
355 | 2.40k | sum_above = _mm_add_epi16(sum_above, four); |
356 | 2.40k | sum_above = _mm_srai_epi16(sum_above, 3); |
357 | 2.40k | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
358 | 2.40k | const __m128i row = _mm_shufflelo_epi16(sum_above, 0); |
359 | 2.40k | dc_store_8xh(&row, 4, dst, stride); |
360 | 2.40k | } |
361 | | |
362 | | void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, |
363 | 3.76k | const uint8_t *above, const uint8_t *left) { |
364 | 3.76k | (void)left; |
365 | 3.76k | __m128i sum_above = dc_sum_8(above); |
366 | 3.76k | const __m128i four = _mm_set1_epi16(4); |
367 | 3.76k | sum_above = _mm_add_epi16(sum_above, four); |
368 | 3.76k | sum_above = _mm_srai_epi16(sum_above, 3); |
369 | 3.76k | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
370 | 3.76k | const __m128i row = _mm_shufflelo_epi16(sum_above, 0); |
371 | 3.76k | dc_store_8xh(&row, 16, dst, stride); |
372 | 3.76k | } |
373 | | |
374 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
375 | | void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, |
376 | 4.88k | const uint8_t *above, const uint8_t *left) { |
377 | 4.88k | (void)left; |
378 | 4.88k | __m128i sum_above = dc_sum_8(above); |
379 | 4.88k | const __m128i four = _mm_set1_epi16(4); |
380 | 4.88k | sum_above = _mm_add_epi16(sum_above, four); |
381 | 4.88k | sum_above = _mm_srai_epi16(sum_above, 3); |
382 | 4.88k | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
383 | 4.88k | const __m128i row = _mm_shufflelo_epi16(sum_above, 0); |
384 | 4.88k | dc_store_8xh(&row, 32, dst, stride); |
385 | 4.88k | } |
386 | | |
387 | | void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, |
388 | 6.43k | const uint8_t *above, const uint8_t *left) { |
389 | 6.43k | (void)left; |
390 | 6.43k | __m128i sum_above = dc_sum_16_sse2(above); |
391 | 6.43k | const __m128i eight = _mm_set1_epi16(8); |
392 | 6.43k | sum_above = _mm_add_epi16(sum_above, eight); |
393 | 6.43k | sum_above = _mm_srai_epi16(sum_above, 4); |
394 | 6.43k | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
395 | 6.43k | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
396 | 6.43k | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
397 | 6.43k | dc_store_16xh(&row, 4, dst, stride); |
398 | 6.43k | } |
399 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
400 | | |
401 | | void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, |
402 | 1.49k | const uint8_t *above, const uint8_t *left) { |
403 | 1.49k | (void)left; |
404 | 1.49k | __m128i sum_above = dc_sum_16_sse2(above); |
405 | 1.49k | const __m128i eight = _mm_set1_epi16(8); |
406 | 1.49k | sum_above = _mm_add_epi16(sum_above, eight); |
407 | 1.49k | sum_above = _mm_srai_epi16(sum_above, 4); |
408 | 1.49k | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
409 | 1.49k | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
410 | 1.49k | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
411 | 1.49k | dc_store_16xh(&row, 8, dst, stride); |
412 | 1.49k | } |
413 | | |
414 | | void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, |
415 | | const uint8_t *above, |
416 | 4.22k | const uint8_t *left) { |
417 | 4.22k | (void)left; |
418 | 4.22k | __m128i sum_above = dc_sum_16_sse2(above); |
419 | 4.22k | const __m128i eight = _mm_set1_epi16(8); |
420 | 4.22k | sum_above = _mm_add_epi16(sum_above, eight); |
421 | 4.22k | sum_above = _mm_srai_epi16(sum_above, 4); |
422 | 4.22k | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
423 | 4.22k | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
424 | 4.22k | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
425 | 4.22k | dc_store_16xh(&row, 32, dst, stride); |
426 | 4.22k | } |
427 | | |
428 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
429 | | void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, |
430 | | const uint8_t *above, |
431 | 185 | const uint8_t *left) { |
432 | 185 | (void)left; |
433 | 185 | __m128i sum_above = dc_sum_16_sse2(above); |
434 | 185 | const __m128i eight = _mm_set1_epi16(8); |
435 | 185 | sum_above = _mm_add_epi16(sum_above, eight); |
436 | 185 | sum_above = _mm_srai_epi16(sum_above, 4); |
437 | 185 | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
438 | 185 | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
439 | 185 | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
440 | 185 | dc_store_16xh(&row, 64, dst, stride); |
441 | 185 | } |
442 | | |
443 | | void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, |
444 | 6.72k | const uint8_t *above, const uint8_t *left) { |
445 | 6.72k | (void)left; |
446 | 6.72k | __m128i sum_above = dc_sum_32_sse2(above); |
447 | 6.72k | const __m128i sixteen = _mm_set1_epi16(16); |
448 | 6.72k | sum_above = _mm_add_epi16(sum_above, sixteen); |
449 | 6.72k | sum_above = _mm_srai_epi16(sum_above, 5); |
450 | 6.72k | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
451 | 6.72k | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
452 | 6.72k | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
453 | 6.72k | dc_store_32xh(&row, 8, dst, stride); |
454 | 6.72k | } |
455 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
456 | | |
457 | | void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, |
458 | | const uint8_t *above, |
459 | 0 | const uint8_t *left) { |
460 | 0 | (void)left; |
461 | 0 | __m128i sum_above = dc_sum_32_sse2(above); |
462 | 0 | const __m128i sixteen = _mm_set1_epi16(16); |
463 | 0 | sum_above = _mm_add_epi16(sum_above, sixteen); |
464 | 0 | sum_above = _mm_srai_epi16(sum_above, 5); |
465 | 0 | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
466 | 0 | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
467 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
468 | 0 | dc_store_32xh(&row, 16, dst, stride); |
469 | 0 | } |
470 | | |
471 | | void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, |
472 | | const uint8_t *above, |
473 | 0 | const uint8_t *left) { |
474 | 0 | (void)left; |
475 | 0 | __m128i sum_above = dc_sum_32_sse2(above); |
476 | 0 | const __m128i sixteen = _mm_set1_epi16(16); |
477 | 0 | sum_above = _mm_add_epi16(sum_above, sixteen); |
478 | 0 | sum_above = _mm_srai_epi16(sum_above, 5); |
479 | 0 | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
480 | 0 | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
481 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
482 | 0 | dc_store_32xh(&row, 64, dst, stride); |
483 | 0 | } |
484 | | |
485 | | void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, |
486 | | const uint8_t *above, |
487 | 0 | const uint8_t *left) { |
488 | 0 | (void)left; |
489 | 0 | __m128i sum_above = dc_sum_64(above); |
490 | 0 | const __m128i thirtytwo = _mm_set1_epi16(32); |
491 | 0 | sum_above = _mm_add_epi16(sum_above, thirtytwo); |
492 | 0 | sum_above = _mm_srai_epi16(sum_above, 6); |
493 | 0 | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
494 | 0 | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
495 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
496 | 0 | dc_store_64xh(&row, 64, dst, stride); |
497 | 0 | } |
498 | | |
499 | | void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, |
500 | | const uint8_t *above, |
501 | 0 | const uint8_t *left) { |
502 | 0 | (void)left; |
503 | 0 | __m128i sum_above = dc_sum_64(above); |
504 | 0 | const __m128i thirtytwo = _mm_set1_epi16(32); |
505 | 0 | sum_above = _mm_add_epi16(sum_above, thirtytwo); |
506 | 0 | sum_above = _mm_srai_epi16(sum_above, 6); |
507 | 0 | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
508 | 0 | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
509 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
510 | 0 | dc_store_64xh(&row, 32, dst, stride); |
511 | 0 | } |
512 | | |
513 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
514 | | void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, |
515 | | const uint8_t *above, |
516 | 0 | const uint8_t *left) { |
517 | 0 | (void)left; |
518 | 0 | __m128i sum_above = dc_sum_64(above); |
519 | 0 | const __m128i thirtytwo = _mm_set1_epi16(32); |
520 | 0 | sum_above = _mm_add_epi16(sum_above, thirtytwo); |
521 | 0 | sum_above = _mm_srai_epi16(sum_above, 6); |
522 | 0 | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
523 | 0 | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
524 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
525 | 0 | dc_store_64xh(&row, 16, dst, stride); |
526 | 0 | } |
527 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
528 | | |
529 | | // ----------------------------------------------------------------------------- |
530 | | // DC_LEFT |
531 | | |
532 | | void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, |
533 | 6.02k | const uint8_t *above, const uint8_t *left) { |
534 | 6.02k | (void)above; |
535 | 6.02k | __m128i sum_left = dc_sum_8(left); |
536 | 6.02k | const __m128i four = _mm_set1_epi16(4); |
537 | 6.02k | sum_left = _mm_add_epi16(sum_left, four); |
538 | 6.02k | sum_left = _mm_srai_epi16(sum_left, 3); |
539 | 6.02k | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
540 | 6.02k | sum_left = _mm_packus_epi16(sum_left, sum_left); |
541 | | |
542 | 6.02k | const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left); |
543 | 6.02k | dc_store_4xh(pred, 8, dst, stride); |
544 | 6.02k | } |
545 | | |
546 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
547 | | void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, |
548 | | const uint8_t *above, |
549 | 4.21k | const uint8_t *left) { |
550 | 4.21k | (void)above; |
551 | 4.21k | __m128i sum_left = dc_sum_16_sse2(left); |
552 | 4.21k | const __m128i eight = _mm_set1_epi16(8); |
553 | 4.21k | sum_left = _mm_add_epi16(sum_left, eight); |
554 | 4.21k | sum_left = _mm_srai_epi16(sum_left, 4); |
555 | 4.21k | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
556 | 4.21k | sum_left = _mm_packus_epi16(sum_left, sum_left); |
557 | | |
558 | 4.21k | const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left); |
559 | 4.21k | dc_store_4xh(pred, 16, dst, stride); |
560 | 4.21k | } |
561 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
562 | | |
563 | | void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, |
564 | 4.49k | const uint8_t *above, const uint8_t *left) { |
565 | 4.49k | (void)above; |
566 | 4.49k | __m128i sum_left = dc_sum_4(left); |
567 | 4.49k | const __m128i two = _mm_set1_epi16(2); |
568 | 4.49k | sum_left = _mm_add_epi16(sum_left, two); |
569 | 4.49k | sum_left = _mm_srai_epi16(sum_left, 2); |
570 | 4.49k | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
571 | 4.49k | const __m128i row = _mm_shufflelo_epi16(sum_left, 0); |
572 | 4.49k | dc_store_8xh(&row, 4, dst, stride); |
573 | 4.49k | } |
574 | | |
575 | | void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, |
576 | | const uint8_t *above, |
577 | 3.28k | const uint8_t *left) { |
578 | 3.28k | (void)above; |
579 | 3.28k | __m128i sum_left = dc_sum_16_sse2(left); |
580 | 3.28k | const __m128i eight = _mm_set1_epi16(8); |
581 | 3.28k | sum_left = _mm_add_epi16(sum_left, eight); |
582 | 3.28k | sum_left = _mm_srai_epi16(sum_left, 4); |
583 | 3.28k | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
584 | 3.28k | const __m128i row = _mm_shufflelo_epi16(sum_left, 0); |
585 | 3.28k | dc_store_8xh(&row, 16, dst, stride); |
586 | 3.28k | } |
587 | | |
588 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
589 | | void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, |
590 | | const uint8_t *above, |
591 | 9.79k | const uint8_t *left) { |
592 | 9.79k | (void)above; |
593 | 9.79k | __m128i sum_left = dc_sum_32_sse2(left); |
594 | 9.79k | const __m128i sixteen = _mm_set1_epi16(16); |
595 | 9.79k | sum_left = _mm_add_epi16(sum_left, sixteen); |
596 | 9.79k | sum_left = _mm_srai_epi16(sum_left, 5); |
597 | 9.79k | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
598 | 9.79k | const __m128i row = _mm_shufflelo_epi16(sum_left, 0); |
599 | 9.79k | dc_store_8xh(&row, 32, dst, stride); |
600 | 9.79k | } |
601 | | |
602 | | void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, |
603 | | const uint8_t *above, |
604 | 3.13k | const uint8_t *left) { |
605 | 3.13k | (void)above; |
606 | 3.13k | __m128i sum_left = dc_sum_4(left); |
607 | 3.13k | const __m128i two = _mm_set1_epi16(2); |
608 | 3.13k | sum_left = _mm_add_epi16(sum_left, two); |
609 | 3.13k | sum_left = _mm_srai_epi16(sum_left, 2); |
610 | 3.13k | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
611 | 3.13k | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
612 | 3.13k | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
613 | 3.13k | dc_store_16xh(&row, 4, dst, stride); |
614 | 3.13k | } |
615 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
616 | | |
617 | | void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, |
618 | | const uint8_t *above, |
619 | 7.36k | const uint8_t *left) { |
620 | 7.36k | (void)above; |
621 | 7.36k | __m128i sum_left = dc_sum_8(left); |
622 | 7.36k | const __m128i four = _mm_set1_epi16(4); |
623 | 7.36k | sum_left = _mm_add_epi16(sum_left, four); |
624 | 7.36k | sum_left = _mm_srai_epi16(sum_left, 3); |
625 | 7.36k | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
626 | 7.36k | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
627 | 7.36k | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
628 | 7.36k | dc_store_16xh(&row, 8, dst, stride); |
629 | 7.36k | } |
630 | | |
631 | | void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, |
632 | | const uint8_t *above, |
633 | 3.88k | const uint8_t *left) { |
634 | 3.88k | (void)above; |
635 | 3.88k | __m128i sum_left = dc_sum_32_sse2(left); |
636 | 3.88k | const __m128i sixteen = _mm_set1_epi16(16); |
637 | 3.88k | sum_left = _mm_add_epi16(sum_left, sixteen); |
638 | 3.88k | sum_left = _mm_srai_epi16(sum_left, 5); |
639 | 3.88k | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
640 | 3.88k | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
641 | 3.88k | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
642 | 3.88k | dc_store_16xh(&row, 32, dst, stride); |
643 | 3.88k | } |
644 | | |
645 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
646 | | void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, |
647 | | const uint8_t *above, |
648 | 443 | const uint8_t *left) { |
649 | 443 | (void)above; |
650 | 443 | __m128i sum_left = dc_sum_64(left); |
651 | 443 | const __m128i thirtytwo = _mm_set1_epi16(32); |
652 | 443 | sum_left = _mm_add_epi16(sum_left, thirtytwo); |
653 | 443 | sum_left = _mm_srai_epi16(sum_left, 6); |
654 | 443 | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
655 | 443 | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
656 | 443 | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
657 | 443 | dc_store_16xh(&row, 64, dst, stride); |
658 | 443 | } |
659 | | |
660 | | void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, |
661 | | const uint8_t *above, |
662 | 1.93k | const uint8_t *left) { |
663 | 1.93k | (void)above; |
664 | 1.93k | __m128i sum_left = dc_sum_8(left); |
665 | 1.93k | const __m128i four = _mm_set1_epi16(4); |
666 | 1.93k | sum_left = _mm_add_epi16(sum_left, four); |
667 | 1.93k | sum_left = _mm_srai_epi16(sum_left, 3); |
668 | 1.93k | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
669 | 1.93k | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
670 | 1.93k | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
671 | 1.93k | dc_store_32xh(&row, 8, dst, stride); |
672 | 1.93k | } |
673 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
674 | | |
675 | | void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, |
676 | | const uint8_t *above, |
677 | 0 | const uint8_t *left) { |
678 | 0 | (void)above; |
679 | 0 | __m128i sum_left = dc_sum_16_sse2(left); |
680 | 0 | const __m128i eight = _mm_set1_epi16(8); |
681 | 0 | sum_left = _mm_add_epi16(sum_left, eight); |
682 | 0 | sum_left = _mm_srai_epi16(sum_left, 4); |
683 | 0 | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
684 | 0 | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
685 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
686 | 0 | dc_store_32xh(&row, 16, dst, stride); |
687 | 0 | } |
688 | | |
689 | | void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, |
690 | | const uint8_t *above, |
691 | 0 | const uint8_t *left) { |
692 | 0 | (void)above; |
693 | 0 | __m128i sum_left = dc_sum_64(left); |
694 | 0 | const __m128i thirtytwo = _mm_set1_epi16(32); |
695 | 0 | sum_left = _mm_add_epi16(sum_left, thirtytwo); |
696 | 0 | sum_left = _mm_srai_epi16(sum_left, 6); |
697 | 0 | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
698 | 0 | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
699 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
700 | 0 | dc_store_32xh(&row, 64, dst, stride); |
701 | 0 | } |
702 | | |
703 | | void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, |
704 | | const uint8_t *above, |
705 | 0 | const uint8_t *left) { |
706 | 0 | (void)above; |
707 | 0 | __m128i sum_left = dc_sum_64(left); |
708 | 0 | const __m128i thirtytwo = _mm_set1_epi16(32); |
709 | 0 | sum_left = _mm_add_epi16(sum_left, thirtytwo); |
710 | 0 | sum_left = _mm_srai_epi16(sum_left, 6); |
711 | 0 | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
712 | 0 | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
713 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
714 | 0 | dc_store_64xh(&row, 64, dst, stride); |
715 | 0 | } |
716 | | |
717 | | void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, |
718 | | const uint8_t *above, |
719 | 0 | const uint8_t *left) { |
720 | 0 | (void)above; |
721 | 0 | __m128i sum_left = dc_sum_32_sse2(left); |
722 | 0 | const __m128i sixteen = _mm_set1_epi16(16); |
723 | 0 | sum_left = _mm_add_epi16(sum_left, sixteen); |
724 | 0 | sum_left = _mm_srai_epi16(sum_left, 5); |
725 | 0 | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
726 | 0 | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
727 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
728 | 0 | dc_store_64xh(&row, 32, dst, stride); |
729 | 0 | } |
730 | | |
731 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
732 | | void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, |
733 | | const uint8_t *above, |
734 | 0 | const uint8_t *left) { |
735 | 0 | (void)above; |
736 | 0 | __m128i sum_left = dc_sum_16_sse2(left); |
737 | 0 | const __m128i eight = _mm_set1_epi16(8); |
738 | 0 | sum_left = _mm_add_epi16(sum_left, eight); |
739 | 0 | sum_left = _mm_srai_epi16(sum_left, 4); |
740 | 0 | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
741 | 0 | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
742 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
743 | 0 | dc_store_64xh(&row, 16, dst, stride); |
744 | 0 | } |
745 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
746 | | |
747 | | // ----------------------------------------------------------------------------- |
748 | | // DC_128 |
749 | | |
// DC_128 4x8: no usable neighbors; fill the block with the mid value 128.
void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  // Four 0x80 bytes packed into a single 32-bit word.
  dc_store_4xh(0x80808080u, 8, dst, stride);
}
757 | | |
758 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
// DC_128 4x16: no usable neighbors; fill the block with the mid value 128.
void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  // Four 0x80 bytes packed into a single 32-bit word.
  dc_store_4xh(0x80808080u, 16, dst, stride);
}
766 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
767 | | |
// DC_128 8x4: no usable neighbors; fill the block with the mid value 128.
void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i gray = _mm_set1_epi8((int8_t)0x80);
  dc_store_8xh(&gray, 4, dst, stride);
}
775 | | |
// DC_128 8x16: no usable neighbors; fill the block with the mid value 128.
void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i gray = _mm_set1_epi8((int8_t)0x80);
  dc_store_8xh(&gray, 16, dst, stride);
}
783 | | |
784 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
// DC_128 8x32: no usable neighbors; fill the block with the mid value 128.
void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i gray = _mm_set1_epi8((int8_t)0x80);
  dc_store_8xh(&gray, 32, dst, stride);
}
792 | | |
// DC_128 16x4: no usable neighbors; fill the block with the mid value 128.
void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i gray = _mm_set1_epi8((int8_t)0x80);
  dc_store_16xh(&gray, 4, dst, stride);
}
800 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
801 | | |
// DC_128 16x8: no usable neighbors; fill the block with the mid value 128.
void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i gray = _mm_set1_epi8((int8_t)0x80);
  dc_store_16xh(&gray, 8, dst, stride);
}
809 | | |
// DC_128 16x32: no usable neighbors; fill the block with the mid value 128.
void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i gray = _mm_set1_epi8((int8_t)0x80);
  dc_store_16xh(&gray, 32, dst, stride);
}
818 | | |
819 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
// DC_128 16x64: no usable neighbors; fill the block with the mid value 128.
void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i gray = _mm_set1_epi8((int8_t)0x80);
  dc_store_16xh(&gray, 64, dst, stride);
}
828 | | |
// DC_128 32x8: no usable neighbors; fill the block with the mid value 128.
void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i gray = _mm_set1_epi8((int8_t)0x80);
  dc_store_32xh(&gray, 8, dst, stride);
}
836 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
837 | | |
// DC_128 32x16: no usable neighbors; fill the block with the mid value 128.
void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i gray = _mm_set1_epi8((int8_t)0x80);
  dc_store_32xh(&gray, 16, dst, stride);
}
846 | | |
// DC_128 32x64: no usable neighbors; fill the block with the mid value 128.
void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i gray = _mm_set1_epi8((int8_t)0x80);
  dc_store_32xh(&gray, 64, dst, stride);
}
855 | | |
// DC_128 64x64: no usable neighbors; fill the block with the mid value 128.
void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i gray = _mm_set1_epi8((int8_t)0x80);
  dc_store_64xh(&gray, 64, dst, stride);
}
864 | | |
// DC_128 64x32: no usable neighbors; fill the block with the mid value 128.
void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i gray = _mm_set1_epi8((int8_t)0x80);
  dc_store_64xh(&gray, 32, dst, stride);
}
873 | | |
874 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
// DC_128 64x16: no usable neighbors; fill the block with the mid value 128.
void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i gray = _mm_set1_epi8((int8_t)0x80);
  dc_store_64xh(&gray, 16, dst, stride);
}
883 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
884 | | |
885 | | // ----------------------------------------------------------------------------- |
886 | | // V_PRED |
887 | | |
// V_PRED 4x8: copy the 4 pixels directly above the block into all 8 rows.
void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  // NOTE(review): type-punning read of 4 bytes from a uint8_t buffer —
  // matches this file's existing convention, but is a strict-aliasing
  // gray area; TODO confirm project-wide policy (memcpy elsewhere?).
  const uint32_t pred = *(uint32_t *)above;
  (void)left;
  dc_store_4xh(pred, 8, dst, stride);
}
894 | | |
895 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
// V_PRED 4x16: copy the 4 pixels directly above the block into all 16 rows.
void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  // NOTE(review): type-punning read of 4 bytes from a uint8_t buffer —
  // matches this file's existing convention, but is a strict-aliasing
  // gray area; TODO confirm project-wide policy.
  const uint32_t pred = *(uint32_t *)above;
  (void)left;
  dc_store_4xh(pred, 16, dst, stride);
}
902 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
903 | | |
// V_PRED 8x4: replicate the 8 pixels above the block into each of 4 rows.
void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_loadl_epi64((__m128i const *)above);
  dc_store_8xh(&top, 4, dst, stride);
}
910 | | |
// V_PRED 8x16: replicate the 8 pixels above the block into each of 16 rows.
void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_loadl_epi64((__m128i const *)above);
  dc_store_8xh(&top, 16, dst, stride);
}
917 | | |
918 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
// V_PRED 8x32: replicate the 8 pixels above the block into each of 32 rows.
void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_loadl_epi64((__m128i const *)above);
  dc_store_8xh(&top, 32, dst, stride);
}
925 | | |
// V_PRED 16x4: replicate the 16 pixels above the block into each of 4 rows.
void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_load_si128((__m128i const *)above);
  dc_store_16xh(&top, 4, dst, stride);
}
932 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
933 | | |
// V_PRED 16x8: replicate the 16 pixels above the block into each of 8 rows.
void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_load_si128((__m128i const *)above);
  dc_store_16xh(&top, 8, dst, stride);
}
940 | | |
// V_PRED 16x32: replicate the 16 pixels above the block into each of 32 rows.
void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_load_si128((__m128i const *)above);
  dc_store_16xh(&top, 32, dst, stride);
}
947 | | |
948 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
// V_PRED 16x64: replicate the 16 pixels above the block into each of 64 rows.
void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_load_si128((__m128i const *)above);
  dc_store_16xh(&top, 64, dst, stride);
}
955 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
956 | | |
// Vertical prediction helper: copy the 32 pixels above the block into each
// of `height` destination rows. Both `above` and `dst` must be 16-aligned.
static inline void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, int height) {
  const __m128i top_lo = _mm_load_si128((__m128i const *)above);
  const __m128i top_hi = _mm_load_si128((__m128i const *)(above + 16));
  for (int r = height; r > 0; --r) {
    _mm_store_si128((__m128i *)dst, top_lo);
    _mm_store_si128((__m128i *)(dst + 16), top_hi);
    dst += stride;
  }
}
967 | | |
968 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
// V_PRED 32x8: thin wrapper around the width-32 vertical helper.
void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_32xh(dst, stride, above, 8);
}
974 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
975 | | |
// V_PRED 32x16: thin wrapper around the width-32 vertical helper.
void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_32xh(dst, stride, above, 16);
}
981 | | |
// V_PRED 32x64: thin wrapper around the width-32 vertical helper.
void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_32xh(dst, stride, above, 64);
}
987 | | |
// Vertical prediction helper: copy the 64 pixels above the block into each
// of `height` destination rows. Both `above` and `dst` must be 16-aligned.
static inline void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, int height) {
  __m128i top[4];
  for (int j = 0; j < 4; ++j) {
    top[j] = _mm_load_si128((__m128i const *)(above + 16 * j));
  }
  for (int r = 0; r < height; ++r) {
    for (int j = 0; j < 4; ++j) {
      _mm_store_si128((__m128i *)(dst + 16 * j), top[j]);
    }
    dst += stride;
  }
}
1002 | | |
// V_PRED 64x64: thin wrapper around the width-64 vertical helper.
void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_64xh(dst, stride, above, 64);
}
1008 | | |
// V_PRED 64x32: thin wrapper around the width-64 vertical helper.
void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_64xh(dst, stride, above, 32);
}
1014 | | |
1015 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
// V_PRED 64x16: thin wrapper around the width-64 vertical helper.
void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_64xh(dst, stride, above, 16);
}
1021 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
1022 | | |
1023 | | // ----------------------------------------------------------------------------- |
1024 | | // H_PRED |
1025 | | |
// H_PRED 4x8: row r is left[r] broadcast across the 4 columns.
void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;
  // Duplicate each left pixel into a 16-bit pair: l0l0 l1l1 ... l7l7.
  __m128i pix = _mm_loadl_epi64((__m128i const *)left);
  pix = _mm_unpacklo_epi8(pix, pix);
  for (int half = 0; half < 2; ++half) {
    // Broadcast each of the low 4 pairs across the low words, then store
    // the low 32 bits (4 identical pixels) of each row.
    *(int *)(dst + 0 * stride) =
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(pix, 0x00));
    *(int *)(dst + 1 * stride) =
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(pix, 0x55));
    *(int *)(dst + 2 * stride) =
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(pix, 0xaa));
    *(int *)(dst + 3 * stride) =
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(pix, 0xff));
    dst += 4 * stride;
    // Move pairs 4..7 into the low half for the second group of rows.
    pix = _mm_unpackhi_epi64(pix, pix);
  }
}
1056 | | |
1057 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
// H_PRED 4x16: row r is left[r] broadcast across the 4 columns.
void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left_col = _mm_load_si128((__m128i const *)left);
  // Duplicate each left pixel into a 16-bit pair; dup[0] holds pairs for
  // left[0..7], dup[1] for left[8..15].
  __m128i dup[2];
  dup[0] = _mm_unpacklo_epi8(left_col, left_col);
  dup[1] = _mm_unpackhi_epi8(left_col, left_col);
  for (int j = 0; j < 2; ++j) {
    __m128i pix = dup[j];
    for (int half = 0; half < 2; ++half) {
      // Broadcast pair k across the low words, store the low 32 bits.
      *(int *)(dst + 0 * stride) =
          _mm_cvtsi128_si32(_mm_shufflelo_epi16(pix, 0x00));
      *(int *)(dst + 1 * stride) =
          _mm_cvtsi128_si32(_mm_shufflelo_epi16(pix, 0x55));
      *(int *)(dst + 2 * stride) =
          _mm_cvtsi128_si32(_mm_shufflelo_epi16(pix, 0xaa));
      *(int *)(dst + 3 * stride) =
          _mm_cvtsi128_si32(_mm_shufflelo_epi16(pix, 0xff));
      dst += 4 * stride;
      // Bring pairs 4..7 into the low half for the next 4 rows.
      pix = _mm_unpackhi_epi64(pix, pix);
    }
  }
}
1118 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
1119 | | |
// H_PRED 8x4: row r is left[r] broadcast across the 8 columns.
void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;
  // Duplicate each left pixel into a 16-bit pair, then broadcast pair r
  // across the low 4 words and store the low 8 bytes as row r.
  __m128i pix = _mm_loadl_epi64((__m128i const *)left);
  pix = _mm_unpacklo_epi8(pix, pix);
  _mm_storel_epi64((__m128i *)(dst + 0 * stride),
                   _mm_shufflelo_epi16(pix, 0x00));
  _mm_storel_epi64((__m128i *)(dst + 1 * stride),
                   _mm_shufflelo_epi16(pix, 0x55));
  _mm_storel_epi64((__m128i *)(dst + 2 * stride),
                   _mm_shufflelo_epi16(pix, 0xaa));
  _mm_storel_epi64((__m128i *)(dst + 3 * stride),
                   _mm_shufflelo_epi16(pix, 0xff));
}
1137 | | |
// Horizontal prediction for an 8-wide block, 16 rows per iteration:
// each iteration consumes 16 left pixels (16-aligned load) and writes
// 16 rows, where row r is left[r] broadcast across 8 columns.
static inline void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, const uint8_t *left,
                                      int count) {
  (void)above;
  for (int i = 0; i < count; ++i) {
    const __m128i left_col = _mm_load_si128((__m128i const *)left);
    // dup[0] holds duplicated pairs for left[0..7], dup[1] for left[8..15].
    __m128i dup[2];
    dup[0] = _mm_unpacklo_epi8(left_col, left_col);
    dup[1] = _mm_unpackhi_epi8(left_col, left_col);
    for (int j = 0; j < 2; ++j) {
      __m128i pix = dup[j];
      for (int half = 0; half < 2; ++half) {
        _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(pix, 0x00));
        dst += stride;
        _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(pix, 0x55));
        dst += stride;
        _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(pix, 0xaa));
        dst += stride;
        _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(pix, 0xff));
        dst += stride;
        // Bring pairs 4..7 into the low half for the next 4 rows.
        pix = _mm_unpackhi_epi64(pix, pix);
      }
    }
    left += 16;
  }
}
1203 | | |
// H_PRED 8x16: one 16-row pass of the shared 8-wide helper.
void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  h_predictor_8x16xc(dst, stride, above, left, 1);
}
1208 | | |
1209 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
// H_PRED 8x32: two 16-row passes of the shared 8-wide helper.
void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  h_predictor_8x16xc(dst, stride, above, left, 2);
}
1214 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
1215 | | |
// Store h precomputed 16-byte rows to dst; dst must stay 16-aligned.
static inline void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
                                     ptrdiff_t stride) {
  for (int r = 0; r < h; ++r, dst += stride) {
    _mm_store_si128((__m128i *)dst, row[r]);
  }
}
1224 | | |
1225 | 541k | static inline void repeat_low_4pixels(const __m128i *x, __m128i *row) { |
1226 | 541k | const __m128i u0 = _mm_shufflelo_epi16(*x, 0); |
1227 | 541k | const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55); |
1228 | 541k | const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa); |
1229 | 541k | const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff); |
1230 | | |
1231 | 541k | row[0] = _mm_unpacklo_epi64(u0, u0); |
1232 | 541k | row[1] = _mm_unpacklo_epi64(u1, u1); |
1233 | 541k | row[2] = _mm_unpacklo_epi64(u2, u2); |
1234 | 541k | row[3] = _mm_unpacklo_epi64(u3, u3); |
1235 | 541k | } |
1236 | | |
1237 | 415k | static inline void repeat_high_4pixels(const __m128i *x, __m128i *row) { |
1238 | 415k | const __m128i u0 = _mm_shufflehi_epi16(*x, 0); |
1239 | 415k | const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55); |
1240 | 415k | const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa); |
1241 | 415k | const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff); |
1242 | | |
1243 | 415k | row[0] = _mm_unpackhi_epi64(u0, u0); |
1244 | 415k | row[1] = _mm_unpackhi_epi64(u1, u1); |
1245 | 415k | row[2] = _mm_unpackhi_epi64(u2, u2); |
1246 | 415k | row[3] = _mm_unpackhi_epi64(u3, u3); |
1247 | 415k | } |
1248 | | |
1249 | | // Process 16x8, first 4 rows |
1250 | | // Use first 8 bytes of left register: xxxxxxxx33221100 |
static inline void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  __m128i row[4];
  // Broadcast each of the 4 low duplicated-pixel words of *left into a row.
  repeat_low_4pixels(left, row);
  // Emit them as 4 consecutive 16-byte rows.
  h_pred_store_16xh(row, 4, dst, stride);
}
1257 | | |
1258 | | // Process 16x8, second 4 rows |
1259 | | // Use second 8 bytes of left register: 77665544xxxxxxxx |
static inline void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  __m128i row[4];
  // Broadcast each of the 4 high duplicated-pixel words of *left into a row.
  repeat_high_4pixels(left, row);
  // Emit them as 4 consecutive 16-byte rows.
  h_pred_store_16xh(row, 4, dst, stride);
}
1266 | | |
1267 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
// H_PRED 16x4: row r is left[r] broadcast across the 16 columns.
void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i l8 = _mm_loadl_epi64((const __m128i *)left);
  // Duplicate each left pixel into a 16-bit pair for the broadcast helper.
  const __m128i dup = _mm_unpacklo_epi8(l8, l8);
  h_prediction_16x8_1(&dup, dst, stride);
}
1275 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
1276 | | |
// H_PRED 16x8: row r is left[r] broadcast across the 16 columns.
void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i l8 = _mm_loadl_epi64((const __m128i *)left);
  const __m128i dup = _mm_unpacklo_epi8(l8, l8);
  // Rows 0-3 come from the low half of dup, rows 4-7 from the high half.
  h_prediction_16x8_1(&dup, dst, stride);
  h_prediction_16x8_2(&dup, dst + 4 * stride, stride);
}
1286 | | |
// Horizontal prediction for a 16-wide block: each iteration consumes 16
// left pixels (16-aligned load) and writes 16 rows in 4-row groups.
static inline void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int count) {
  do {
    const __m128i left_col = _mm_load_si128((const __m128i *)left);
    const __m128i dup_lo = _mm_unpacklo_epi8(left_col, left_col);
    const __m128i dup_hi = _mm_unpackhi_epi8(left_col, left_col);
    h_prediction_16x8_1(&dup_lo, dst, stride);
    dst += 4 * stride;
    h_prediction_16x8_2(&dup_lo, dst, stride);
    dst += 4 * stride;
    h_prediction_16x8_1(&dup_hi, dst, stride);
    dst += 4 * stride;
    h_prediction_16x8_2(&dup_hi, dst, stride);
    dst += 4 * stride;
    left += 16;
  } while (--count > 0);
}
1308 | | |
// H_PRED 16x32: two 16-row passes of the 16-wide helper.
void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  h_predictor_16xh(dst, stride, left, 2);
}
1314 | | |
1315 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
// H_PRED 16x64: four 16-row passes of the 16-wide helper.
void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  h_predictor_16xh(dst, stride, left, 4);
}
1321 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
1322 | | |
// Store h precomputed rows, each duplicated across two 16-byte halves of a
// 32-byte destination row; dst must stay 16-aligned.
static inline void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
                                     ptrdiff_t stride) {
  for (int r = 0; r < h; ++r, dst += stride) {
    _mm_store_si128((__m128i *)dst, row[r]);
    _mm_store_si128((__m128i *)(dst + 16), row[r]);
  }
}
1332 | | |
1333 | | // Process 32x8, first 4 rows |
1334 | | // Use first 8 bytes of left register: xxxxxxxx33221100 |
static inline void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  __m128i row[4];
  // Broadcast each of the 4 low duplicated-pixel words of *left into a row.
  repeat_low_4pixels(left, row);
  // Emit them as 4 consecutive 32-byte rows.
  h_pred_store_32xh(row, 4, dst, stride);
}
1341 | | |
1342 | | // Process 32x8, second 4 rows |
1343 | | // Use second 8 bytes of left register: 77665544xxxxxxxx |
static inline void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  __m128i row[4];
  // Broadcast each of the 4 high duplicated-pixel words of *left into a row.
  repeat_high_4pixels(left, row);
  // Emit them as 4 consecutive 32-byte rows.
  h_pred_store_32xh(row, 4, dst, stride);
}
1350 | | |
1351 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
// H_PRED 32x8: row r is left[r] broadcast across the 32 columns.
void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left_col = _mm_load_si128((const __m128i *)left);
  // Duplicate each left pixel into a 16-bit pair for the broadcast helpers.
  const __m128i dup = _mm_unpacklo_epi8(left_col, left_col);
  h_prediction_32x8_1(&dup, dst, stride);
  h_prediction_32x8_2(&dup, dst + 4 * stride, stride);
}
1364 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
1365 | | |
// H intra predictor for a 32x16 block: row r is left[r] replicated across all
// 32 columns.  `left` must be 16-byte aligned (aligned vector load).
void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;  // H prediction uses only the left column.

  const __m128i left16 = _mm_load_si128((const __m128i *)left);
  // Byte-double the low and high halves so the 32x8 helpers can fan each
  // group of 4 pixels out into full rows.
  const __m128i dup_lo = _mm_unpacklo_epi8(left16, left16);  // pixels 0..7
  const __m128i dup_hi = _mm_unpackhi_epi8(left16, left16);  // pixels 8..15
  h_prediction_32x8_1(&dup_lo, dst, stride);                 // rows 0..3
  h_prediction_32x8_2(&dup_lo, dst + 4 * stride, stride);    // rows 4..7
  h_prediction_32x8_1(&dup_hi, dst + 8 * stride, stride);    // rows 8..11
  h_prediction_32x8_2(&dup_hi, dst + 12 * stride, stride);   // rows 12..15
}
1384 | | |
// H intra prediction for a 32-wide block: broadcast each left-column pixel
// across a full 32-pixel row, processing four rows per iteration.
// `height` must be a positive multiple of 4; dst/stride must keep every
// 16-byte store aligned.
static inline void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int height) {
  int i = height >> 2;
  do {
    // Assemble the next four left pixels as a little-endian 32-bit word.
    // (Replaces the previous ((int *)left)[0] load, which violated strict
    // aliasing, assumed 4-byte alignment of `left`, and cast away const.)
    const uint32_t left_u32 = (uint32_t)left[0] | ((uint32_t)left[1] << 8) |
                              ((uint32_t)left[2] << 16) |
                              ((uint32_t)left[3] << 24);
    __m128i left4 = _mm_cvtsi32_si128((int)left_u32);
    // Duplicate each byte 4x: 33221100 -> 3333222211110000.
    left4 = _mm_unpacklo_epi8(left4, left4);
    left4 = _mm_unpacklo_epi8(left4, left4);
    // Broadcast each pixel's 4-byte group across a whole register.
    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r0);
    _mm_store_si128((__m128i *)(dst + stride), r1);
    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
    left += 4;
    dst += stride * 4;
  } while (--i);
}
1408 | | |
// H intra predictor for a 32x64 block: each of the 64 output rows is the
// matching left-column pixel replicated across all 32 columns.
void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;  // H prediction uses only the left column.
  h_predictor_32xh(dst, stride, left, 64);
}
1414 | | |
// H intra prediction for a 64-wide block: broadcast each left-column pixel
// across a full 64-pixel row (four 16-byte stores), processing four rows per
// iteration.  `height` must be a positive multiple of 4; dst/stride must keep
// every 16-byte store aligned.
static inline void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int height) {
  int i = height >> 2;
  do {
    // Assemble the next four left pixels as a little-endian 32-bit word.
    // (Replaces the previous ((int *)left)[0] load, which violated strict
    // aliasing, assumed 4-byte alignment of `left`, and cast away const.)
    const uint32_t left_u32 = (uint32_t)left[0] | ((uint32_t)left[1] << 8) |
                              ((uint32_t)left[2] << 16) |
                              ((uint32_t)left[3] << 24);
    __m128i left4 = _mm_cvtsi32_si128((int)left_u32);
    // Duplicate each byte 4x: 33221100 -> 3333222211110000.
    left4 = _mm_unpacklo_epi8(left4, left4);
    left4 = _mm_unpacklo_epi8(left4, left4);
    // Broadcast each pixel's 4-byte group across a whole register.
    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r0);
    _mm_store_si128((__m128i *)(dst + 32), r0);
    _mm_store_si128((__m128i *)(dst + 48), r0);
    _mm_store_si128((__m128i *)(dst + stride), r1);
    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
    _mm_store_si128((__m128i *)(dst + stride + 32), r1);
    _mm_store_si128((__m128i *)(dst + stride + 48), r1);
    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
    left += 4;
    dst += stride * 4;
  } while (--i);
}
1446 | | |
// H intra predictor for a 64x64 block: each of the 64 output rows is the
// matching left-column pixel replicated across all 64 columns.
void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;  // H prediction uses only the left column.
  h_predictor_64xh(dst, stride, left, 64);
}
1452 | | |
// H intra predictor for a 64x32 block: each of the 32 output rows is the
// matching left-column pixel replicated across all 64 columns.
void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;  // H prediction uses only the left column.
  h_predictor_64xh(dst, stride, left, 32);
}
1458 | | |
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// H intra predictor for a 64x16 block: each of the 16 output rows is the
// matching left-column pixel replicated across all 64 columns.
void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;  // H prediction uses only the left column.
  h_predictor_64xh(dst, stride, left, 16);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER