/src/aom/aom_dsp/x86/intrapred_sse2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2017, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include <emmintrin.h> |
13 | | #include "aom_dsp/x86/intrapred_x86.h" |
14 | | #include "config/aom_dsp_rtcd.h" |
15 | | |
// Replicate a 4-byte packed DC value over `height` rows of a 4-wide block.
// `dc` holds the four predicted pixels already packed into a uint32_t.
// Uses memcpy instead of the original *(uint32_t *)dst type-pun: dst is a
// uint8_t buffer with no alignment or effective-type guarantee, so the cast
// was a strict-aliasing/alignment violation; memcpy compiles to the same
// single 32-bit store on every mainstream compiler.
static inline void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
                                ptrdiff_t stride) {
  for (int i = 0; i < height; i += 2) {
    memcpy(dst, &dc, sizeof(dc));
    dst += stride;
    memcpy(dst, &dc, sizeof(dc));
    dst += stride;
  }
}
25 | | |
// Write the low 8 bytes of *row into each of `height` rows.
static inline void dc_store_8xh(const __m128i *row, int height, uint8_t *dst,
                                ptrdiff_t stride) {
  for (int r = 0; r < height; ++r, dst += stride) {
    _mm_storel_epi64((__m128i *)dst, *row);
  }
}
34 | | |
// Write all 16 bytes of *row into each of `height` rows (dst 16-aligned).
static inline void dc_store_16xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  for (int r = 0; r < height; ++r, dst += stride) {
    _mm_store_si128((__m128i *)dst, *row);
  }
}
43 | | |
// Write *row twice (32 bytes) into each of `height` rows (dst 16-aligned).
static inline void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  for (int r = 0; r < height; ++r, dst += stride) {
    _mm_store_si128((__m128i *)(dst + 0), *row);
    _mm_store_si128((__m128i *)(dst + 16), *row);
  }
}
53 | | |
// Write *row four times (64 bytes) into each of `height` rows (dst 16-aligned).
static inline void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  for (int r = 0; r < height; ++r, dst += stride) {
    for (int off = 0; off < 64; off += 16) {
      _mm_store_si128((__m128i *)(dst + off), *row);
    }
  }
}
64 | | |
// Sum ref[0..3] into the low 64-bit lane. The 8->16 unpack spreads the four
// bytes of interest (with zero fill) across the low qword, so a SAD against
// zero yields their sum there. Note the 64-bit load still touches ref[0..7].
static inline __m128i dc_sum_4(const uint8_t *ref) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i pixels =
      _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)ref), zero);
  return _mm_sad_epu8(pixels, zero);
}
71 | | |
// Sum ref[0..7] into the low 64-bit lane via SAD against zero.
static inline __m128i dc_sum_8(const uint8_t *ref) {
  const __m128i zero = _mm_setzero_si128();
  return _mm_sad_epu8(_mm_loadl_epi64((__m128i const *)ref), zero);
}
77 | | |
// Sum ref[0..63] into the low 64-bit lane (ref must be 16-byte aligned).
// Each SAD produces two partial qword sums; accumulate all four vectors,
// then fold the high qword into the low one.
static inline __m128i dc_sum_64(const uint8_t *ref) {
  const __m128i zero = _mm_setzero_si128();
  __m128i acc = _mm_setzero_si128();
  for (int off = 0; off < 64; off += 16) {
    const __m128i chunk = _mm_load_si128((__m128i const *)(ref + off));
    acc = _mm_add_epi16(acc, _mm_sad_epu8(chunk, zero));
  }
  return _mm_add_epi16(acc, _mm_unpackhi_epi64(acc, acc));
}
94 | | |
#define DC_MULTIPLIER_1X2 0x5556
#define DC_MULTIPLIER_1X4 0x3334

#define DC_SHIFT2 16

// Approximate division by a non-power-of-two via reciprocal multiply:
// (num >> shift1) * multiplier >> 16.  With DC_MULTIPLIER_1X2 (~1/3 in
// 16-bit fixed point) this divides by 3 << shift1; with DC_MULTIPLIER_1X4
// (~1/5) it divides by 5 << shift1.
static inline int divide_using_multiply_shift(int num, int shift1,
                                              int multiplier) {
  return ((num >> shift1) * multiplier) >> DC_SHIFT2;
}
105 | | |
106 | | // ----------------------------------------------------------------------------- |
107 | | // DC_PRED |
108 | | |
109 | | void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, |
110 | 277k | const uint8_t *above, const uint8_t *left) { |
111 | 277k | const __m128i sum_left = dc_sum_8(left); |
112 | 277k | __m128i sum_above = dc_sum_4(above); |
113 | 277k | sum_above = _mm_add_epi16(sum_left, sum_above); |
114 | | |
115 | 277k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
116 | 277k | sum += 6; |
117 | 277k | sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); |
118 | | |
119 | 277k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
120 | 277k | const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row); |
121 | 277k | dc_store_4xh(pred, 8, dst, stride); |
122 | 277k | } |
123 | | |
124 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
125 | | void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, |
126 | 182k | const uint8_t *above, const uint8_t *left) { |
127 | 182k | const __m128i sum_left = dc_sum_16_sse2(left); |
128 | 182k | __m128i sum_above = dc_sum_4(above); |
129 | 182k | sum_above = _mm_add_epi16(sum_left, sum_above); |
130 | | |
131 | 182k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
132 | 182k | sum += 10; |
133 | 182k | sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); |
134 | | |
135 | 182k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
136 | 182k | const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row); |
137 | 182k | dc_store_4xh(pred, 16, dst, stride); |
138 | 182k | } |
139 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
140 | | |
141 | | void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, |
142 | 422k | const uint8_t *above, const uint8_t *left) { |
143 | 422k | const __m128i sum_left = dc_sum_4(left); |
144 | 422k | __m128i sum_above = dc_sum_8(above); |
145 | 422k | sum_above = _mm_add_epi16(sum_above, sum_left); |
146 | | |
147 | 422k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
148 | 422k | sum += 6; |
149 | 422k | sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); |
150 | | |
151 | 422k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
152 | 422k | dc_store_8xh(&row, 4, dst, stride); |
153 | 422k | } |
154 | | |
155 | | void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, |
156 | 214k | const uint8_t *above, const uint8_t *left) { |
157 | 214k | const __m128i sum_left = dc_sum_16_sse2(left); |
158 | 214k | __m128i sum_above = dc_sum_8(above); |
159 | 214k | sum_above = _mm_add_epi16(sum_above, sum_left); |
160 | | |
161 | 214k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
162 | 214k | sum += 12; |
163 | 214k | sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); |
164 | 214k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
165 | 214k | dc_store_8xh(&row, 16, dst, stride); |
166 | 214k | } |
167 | | |
168 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
169 | | void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, |
170 | 102k | const uint8_t *above, const uint8_t *left) { |
171 | 102k | const __m128i sum_left = dc_sum_32_sse2(left); |
172 | 102k | __m128i sum_above = dc_sum_8(above); |
173 | 102k | sum_above = _mm_add_epi16(sum_above, sum_left); |
174 | | |
175 | 102k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
176 | 102k | sum += 20; |
177 | 102k | sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); |
178 | 102k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
179 | 102k | dc_store_8xh(&row, 32, dst, stride); |
180 | 102k | } |
181 | | |
182 | | void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, |
183 | 619k | const uint8_t *above, const uint8_t *left) { |
184 | 619k | const __m128i sum_left = dc_sum_4(left); |
185 | 619k | __m128i sum_above = dc_sum_16_sse2(above); |
186 | 619k | sum_above = _mm_add_epi16(sum_above, sum_left); |
187 | | |
188 | 619k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
189 | 619k | sum += 10; |
190 | 619k | sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); |
191 | 619k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
192 | 619k | dc_store_16xh(&row, 4, dst, stride); |
193 | 619k | } |
194 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
195 | | |
196 | | void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, |
197 | 349k | const uint8_t *above, const uint8_t *left) { |
198 | 349k | const __m128i sum_left = dc_sum_8(left); |
199 | 349k | __m128i sum_above = dc_sum_16_sse2(above); |
200 | 349k | sum_above = _mm_add_epi16(sum_above, sum_left); |
201 | | |
202 | 349k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
203 | 349k | sum += 12; |
204 | 349k | sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); |
205 | 349k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
206 | 349k | dc_store_16xh(&row, 8, dst, stride); |
207 | 349k | } |
208 | | |
209 | | void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, |
210 | 143k | const uint8_t *above, const uint8_t *left) { |
211 | 143k | const __m128i sum_left = dc_sum_32_sse2(left); |
212 | 143k | __m128i sum_above = dc_sum_16_sse2(above); |
213 | 143k | sum_above = _mm_add_epi16(sum_left, sum_above); |
214 | | |
215 | 143k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
216 | 143k | sum += 24; |
217 | 143k | sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); |
218 | 143k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
219 | 143k | dc_store_16xh(&row, 32, dst, stride); |
220 | 143k | } |
221 | | |
222 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
223 | | void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, |
224 | 18.5k | const uint8_t *above, const uint8_t *left) { |
225 | 18.5k | const __m128i sum_left = dc_sum_64(left); |
226 | 18.5k | __m128i sum_above = dc_sum_16_sse2(above); |
227 | 18.5k | sum_above = _mm_add_epi16(sum_left, sum_above); |
228 | | |
229 | 18.5k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
230 | 18.5k | sum += 40; |
231 | 18.5k | sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); |
232 | 18.5k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
233 | 18.5k | dc_store_16xh(&row, 64, dst, stride); |
234 | 18.5k | } |
235 | | |
236 | | void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, |
237 | 374k | const uint8_t *above, const uint8_t *left) { |
238 | 374k | __m128i sum_above = dc_sum_32_sse2(above); |
239 | 374k | const __m128i sum_left = dc_sum_8(left); |
240 | 374k | sum_above = _mm_add_epi16(sum_above, sum_left); |
241 | | |
242 | 374k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
243 | 374k | sum += 20; |
244 | 374k | sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); |
245 | 374k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
246 | 374k | dc_store_32xh(&row, 8, dst, stride); |
247 | 374k | } |
248 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
249 | | |
250 | | void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, |
251 | 0 | const uint8_t *above, const uint8_t *left) { |
252 | 0 | __m128i sum_above = dc_sum_32_sse2(above); |
253 | 0 | const __m128i sum_left = dc_sum_16_sse2(left); |
254 | 0 | sum_above = _mm_add_epi16(sum_above, sum_left); |
255 | |
|
256 | 0 | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
257 | 0 | sum += 24; |
258 | 0 | sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); |
259 | 0 | const __m128i row = _mm_set1_epi8((int8_t)sum); |
260 | 0 | dc_store_32xh(&row, 16, dst, stride); |
261 | 0 | } |
262 | | |
263 | | void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, |
264 | 0 | const uint8_t *above, const uint8_t *left) { |
265 | 0 | __m128i sum_above = dc_sum_32_sse2(above); |
266 | 0 | const __m128i sum_left = dc_sum_64(left); |
267 | 0 | sum_above = _mm_add_epi16(sum_above, sum_left); |
268 | |
|
269 | 0 | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
270 | 0 | sum += 48; |
271 | 0 | sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); |
272 | 0 | const __m128i row = _mm_set1_epi8((int8_t)sum); |
273 | 0 | dc_store_32xh(&row, 64, dst, stride); |
274 | 0 | } |
275 | | |
276 | | void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, |
277 | 0 | const uint8_t *above, const uint8_t *left) { |
278 | 0 | __m128i sum_above = dc_sum_64(above); |
279 | 0 | const __m128i sum_left = dc_sum_64(left); |
280 | 0 | sum_above = _mm_add_epi16(sum_above, sum_left); |
281 | |
|
282 | 0 | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
283 | 0 | sum += 64; |
284 | 0 | sum /= 128; |
285 | 0 | const __m128i row = _mm_set1_epi8((int8_t)sum); |
286 | 0 | dc_store_64xh(&row, 64, dst, stride); |
287 | 0 | } |
288 | | |
289 | | void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, |
290 | 0 | const uint8_t *above, const uint8_t *left) { |
291 | 0 | __m128i sum_above = dc_sum_64(above); |
292 | 0 | const __m128i sum_left = dc_sum_32_sse2(left); |
293 | 0 | sum_above = _mm_add_epi16(sum_above, sum_left); |
294 | |
|
295 | 0 | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
296 | 0 | sum += 48; |
297 | 0 | sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); |
298 | 0 | const __m128i row = _mm_set1_epi8((int8_t)sum); |
299 | 0 | dc_store_64xh(&row, 32, dst, stride); |
300 | 0 | } |
301 | | |
302 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
303 | | void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, |
304 | 0 | const uint8_t *above, const uint8_t *left) { |
305 | 0 | __m128i sum_above = dc_sum_64(above); |
306 | 0 | const __m128i sum_left = dc_sum_16_sse2(left); |
307 | 0 | sum_above = _mm_add_epi16(sum_above, sum_left); |
308 | |
|
309 | 0 | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
310 | 0 | sum += 40; |
311 | 0 | sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); |
312 | 0 | const __m128i row = _mm_set1_epi8((int8_t)sum); |
313 | 0 | dc_store_64xh(&row, 16, dst, stride); |
314 | 0 | } |
315 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
316 | | |
317 | | // ----------------------------------------------------------------------------- |
318 | | // DC_TOP |
319 | | |
320 | | void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, |
321 | 4.33k | const uint8_t *above, const uint8_t *left) { |
322 | 4.33k | (void)left; |
323 | 4.33k | __m128i sum_above = dc_sum_4(above); |
324 | 4.33k | const __m128i two = _mm_set1_epi16(2); |
325 | 4.33k | sum_above = _mm_add_epi16(sum_above, two); |
326 | 4.33k | sum_above = _mm_srai_epi16(sum_above, 2); |
327 | 4.33k | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
328 | 4.33k | sum_above = _mm_packus_epi16(sum_above, sum_above); |
329 | | |
330 | 4.33k | const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above); |
331 | 4.33k | dc_store_4xh(pred, 8, dst, stride); |
332 | 4.33k | } |
333 | | |
334 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
335 | | void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, |
336 | 2.15k | const uint8_t *above, const uint8_t *left) { |
337 | 2.15k | (void)left; |
338 | 2.15k | __m128i sum_above = dc_sum_4(above); |
339 | 2.15k | const __m128i two = _mm_set1_epi16(2); |
340 | 2.15k | sum_above = _mm_add_epi16(sum_above, two); |
341 | 2.15k | sum_above = _mm_srai_epi16(sum_above, 2); |
342 | 2.15k | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
343 | 2.15k | sum_above = _mm_packus_epi16(sum_above, sum_above); |
344 | | |
345 | 2.15k | const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above); |
346 | 2.15k | dc_store_4xh(pred, 16, dst, stride); |
347 | 2.15k | } |
348 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
349 | | |
350 | | void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, |
351 | 2.44k | const uint8_t *above, const uint8_t *left) { |
352 | 2.44k | (void)left; |
353 | 2.44k | __m128i sum_above = dc_sum_8(above); |
354 | 2.44k | const __m128i four = _mm_set1_epi16(4); |
355 | 2.44k | sum_above = _mm_add_epi16(sum_above, four); |
356 | 2.44k | sum_above = _mm_srai_epi16(sum_above, 3); |
357 | 2.44k | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
358 | 2.44k | const __m128i row = _mm_shufflelo_epi16(sum_above, 0); |
359 | 2.44k | dc_store_8xh(&row, 4, dst, stride); |
360 | 2.44k | } |
361 | | |
362 | | void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, |
363 | 3.72k | const uint8_t *above, const uint8_t *left) { |
364 | 3.72k | (void)left; |
365 | 3.72k | __m128i sum_above = dc_sum_8(above); |
366 | 3.72k | const __m128i four = _mm_set1_epi16(4); |
367 | 3.72k | sum_above = _mm_add_epi16(sum_above, four); |
368 | 3.72k | sum_above = _mm_srai_epi16(sum_above, 3); |
369 | 3.72k | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
370 | 3.72k | const __m128i row = _mm_shufflelo_epi16(sum_above, 0); |
371 | 3.72k | dc_store_8xh(&row, 16, dst, stride); |
372 | 3.72k | } |
373 | | |
374 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
375 | | void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, |
376 | 5.61k | const uint8_t *above, const uint8_t *left) { |
377 | 5.61k | (void)left; |
378 | 5.61k | __m128i sum_above = dc_sum_8(above); |
379 | 5.61k | const __m128i four = _mm_set1_epi16(4); |
380 | 5.61k | sum_above = _mm_add_epi16(sum_above, four); |
381 | 5.61k | sum_above = _mm_srai_epi16(sum_above, 3); |
382 | 5.61k | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
383 | 5.61k | const __m128i row = _mm_shufflelo_epi16(sum_above, 0); |
384 | 5.61k | dc_store_8xh(&row, 32, dst, stride); |
385 | 5.61k | } |
386 | | |
387 | | void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, |
388 | 5.28k | const uint8_t *above, const uint8_t *left) { |
389 | 5.28k | (void)left; |
390 | 5.28k | __m128i sum_above = dc_sum_16_sse2(above); |
391 | 5.28k | const __m128i eight = _mm_set1_epi16(8); |
392 | 5.28k | sum_above = _mm_add_epi16(sum_above, eight); |
393 | 5.28k | sum_above = _mm_srai_epi16(sum_above, 4); |
394 | 5.28k | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
395 | 5.28k | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
396 | 5.28k | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
397 | 5.28k | dc_store_16xh(&row, 4, dst, stride); |
398 | 5.28k | } |
399 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
400 | | |
401 | | void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, |
402 | 1.64k | const uint8_t *above, const uint8_t *left) { |
403 | 1.64k | (void)left; |
404 | 1.64k | __m128i sum_above = dc_sum_16_sse2(above); |
405 | 1.64k | const __m128i eight = _mm_set1_epi16(8); |
406 | 1.64k | sum_above = _mm_add_epi16(sum_above, eight); |
407 | 1.64k | sum_above = _mm_srai_epi16(sum_above, 4); |
408 | 1.64k | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
409 | 1.64k | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
410 | 1.64k | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
411 | 1.64k | dc_store_16xh(&row, 8, dst, stride); |
412 | 1.64k | } |
413 | | |
414 | | void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, |
415 | | const uint8_t *above, |
416 | 5.49k | const uint8_t *left) { |
417 | 5.49k | (void)left; |
418 | 5.49k | __m128i sum_above = dc_sum_16_sse2(above); |
419 | 5.49k | const __m128i eight = _mm_set1_epi16(8); |
420 | 5.49k | sum_above = _mm_add_epi16(sum_above, eight); |
421 | 5.49k | sum_above = _mm_srai_epi16(sum_above, 4); |
422 | 5.49k | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
423 | 5.49k | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
424 | 5.49k | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
425 | 5.49k | dc_store_16xh(&row, 32, dst, stride); |
426 | 5.49k | } |
427 | | |
428 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
429 | | void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, |
430 | | const uint8_t *above, |
431 | 238 | const uint8_t *left) { |
432 | 238 | (void)left; |
433 | 238 | __m128i sum_above = dc_sum_16_sse2(above); |
434 | 238 | const __m128i eight = _mm_set1_epi16(8); |
435 | 238 | sum_above = _mm_add_epi16(sum_above, eight); |
436 | 238 | sum_above = _mm_srai_epi16(sum_above, 4); |
437 | 238 | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
438 | 238 | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
439 | 238 | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
440 | 238 | dc_store_16xh(&row, 64, dst, stride); |
441 | 238 | } |
442 | | |
443 | | void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, |
444 | 8.78k | const uint8_t *above, const uint8_t *left) { |
445 | 8.78k | (void)left; |
446 | 8.78k | __m128i sum_above = dc_sum_32_sse2(above); |
447 | 8.78k | const __m128i sixteen = _mm_set1_epi16(16); |
448 | 8.78k | sum_above = _mm_add_epi16(sum_above, sixteen); |
449 | 8.78k | sum_above = _mm_srai_epi16(sum_above, 5); |
450 | 8.78k | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
451 | 8.78k | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
452 | 8.78k | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
453 | 8.78k | dc_store_32xh(&row, 8, dst, stride); |
454 | 8.78k | } |
455 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
456 | | |
457 | | void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, |
458 | | const uint8_t *above, |
459 | 0 | const uint8_t *left) { |
460 | 0 | (void)left; |
461 | 0 | __m128i sum_above = dc_sum_32_sse2(above); |
462 | 0 | const __m128i sixteen = _mm_set1_epi16(16); |
463 | 0 | sum_above = _mm_add_epi16(sum_above, sixteen); |
464 | 0 | sum_above = _mm_srai_epi16(sum_above, 5); |
465 | 0 | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
466 | 0 | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
467 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
468 | 0 | dc_store_32xh(&row, 16, dst, stride); |
469 | 0 | } |
470 | | |
471 | | void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, |
472 | | const uint8_t *above, |
473 | 0 | const uint8_t *left) { |
474 | 0 | (void)left; |
475 | 0 | __m128i sum_above = dc_sum_32_sse2(above); |
476 | 0 | const __m128i sixteen = _mm_set1_epi16(16); |
477 | 0 | sum_above = _mm_add_epi16(sum_above, sixteen); |
478 | 0 | sum_above = _mm_srai_epi16(sum_above, 5); |
479 | 0 | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
480 | 0 | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
481 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
482 | 0 | dc_store_32xh(&row, 64, dst, stride); |
483 | 0 | } |
484 | | |
485 | | void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, |
486 | | const uint8_t *above, |
487 | 0 | const uint8_t *left) { |
488 | 0 | (void)left; |
489 | 0 | __m128i sum_above = dc_sum_64(above); |
490 | 0 | const __m128i thirtytwo = _mm_set1_epi16(32); |
491 | 0 | sum_above = _mm_add_epi16(sum_above, thirtytwo); |
492 | 0 | sum_above = _mm_srai_epi16(sum_above, 6); |
493 | 0 | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
494 | 0 | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
495 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
496 | 0 | dc_store_64xh(&row, 64, dst, stride); |
497 | 0 | } |
498 | | |
499 | | void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, |
500 | | const uint8_t *above, |
501 | 0 | const uint8_t *left) { |
502 | 0 | (void)left; |
503 | 0 | __m128i sum_above = dc_sum_64(above); |
504 | 0 | const __m128i thirtytwo = _mm_set1_epi16(32); |
505 | 0 | sum_above = _mm_add_epi16(sum_above, thirtytwo); |
506 | 0 | sum_above = _mm_srai_epi16(sum_above, 6); |
507 | 0 | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
508 | 0 | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
509 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
510 | 0 | dc_store_64xh(&row, 32, dst, stride); |
511 | 0 | } |
512 | | |
513 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
514 | | void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, |
515 | | const uint8_t *above, |
516 | 0 | const uint8_t *left) { |
517 | 0 | (void)left; |
518 | 0 | __m128i sum_above = dc_sum_64(above); |
519 | 0 | const __m128i thirtytwo = _mm_set1_epi16(32); |
520 | 0 | sum_above = _mm_add_epi16(sum_above, thirtytwo); |
521 | 0 | sum_above = _mm_srai_epi16(sum_above, 6); |
522 | 0 | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
523 | 0 | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
524 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
525 | 0 | dc_store_64xh(&row, 16, dst, stride); |
526 | 0 | } |
527 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
528 | | |
529 | | // ----------------------------------------------------------------------------- |
530 | | // DC_LEFT |
531 | | |
532 | | void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, |
533 | 4.61k | const uint8_t *above, const uint8_t *left) { |
534 | 4.61k | (void)above; |
535 | 4.61k | __m128i sum_left = dc_sum_8(left); |
536 | 4.61k | const __m128i four = _mm_set1_epi16(4); |
537 | 4.61k | sum_left = _mm_add_epi16(sum_left, four); |
538 | 4.61k | sum_left = _mm_srai_epi16(sum_left, 3); |
539 | 4.61k | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
540 | 4.61k | sum_left = _mm_packus_epi16(sum_left, sum_left); |
541 | | |
542 | 4.61k | const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left); |
543 | 4.61k | dc_store_4xh(pred, 8, dst, stride); |
544 | 4.61k | } |
545 | | |
546 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
547 | | void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, |
548 | | const uint8_t *above, |
549 | 3.94k | const uint8_t *left) { |
550 | 3.94k | (void)above; |
551 | 3.94k | __m128i sum_left = dc_sum_16_sse2(left); |
552 | 3.94k | const __m128i eight = _mm_set1_epi16(8); |
553 | 3.94k | sum_left = _mm_add_epi16(sum_left, eight); |
554 | 3.94k | sum_left = _mm_srai_epi16(sum_left, 4); |
555 | 3.94k | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
556 | 3.94k | sum_left = _mm_packus_epi16(sum_left, sum_left); |
557 | | |
558 | 3.94k | const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left); |
559 | 3.94k | dc_store_4xh(pred, 16, dst, stride); |
560 | 3.94k | } |
561 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
562 | | |
563 | | void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, |
564 | 4.93k | const uint8_t *above, const uint8_t *left) { |
565 | 4.93k | (void)above; |
566 | 4.93k | __m128i sum_left = dc_sum_4(left); |
567 | 4.93k | const __m128i two = _mm_set1_epi16(2); |
568 | 4.93k | sum_left = _mm_add_epi16(sum_left, two); |
569 | 4.93k | sum_left = _mm_srai_epi16(sum_left, 2); |
570 | 4.93k | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
571 | 4.93k | const __m128i row = _mm_shufflelo_epi16(sum_left, 0); |
572 | 4.93k | dc_store_8xh(&row, 4, dst, stride); |
573 | 4.93k | } |
574 | | |
575 | | void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, |
576 | | const uint8_t *above, |
577 | 2.84k | const uint8_t *left) { |
578 | 2.84k | (void)above; |
579 | 2.84k | __m128i sum_left = dc_sum_16_sse2(left); |
580 | 2.84k | const __m128i eight = _mm_set1_epi16(8); |
581 | 2.84k | sum_left = _mm_add_epi16(sum_left, eight); |
582 | 2.84k | sum_left = _mm_srai_epi16(sum_left, 4); |
583 | 2.84k | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
584 | 2.84k | const __m128i row = _mm_shufflelo_epi16(sum_left, 0); |
585 | 2.84k | dc_store_8xh(&row, 16, dst, stride); |
586 | 2.84k | } |
587 | | |
588 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
589 | | void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, |
590 | | const uint8_t *above, |
591 | 6.21k | const uint8_t *left) { |
592 | 6.21k | (void)above; |
593 | 6.21k | __m128i sum_left = dc_sum_32_sse2(left); |
594 | 6.21k | const __m128i sixteen = _mm_set1_epi16(16); |
595 | 6.21k | sum_left = _mm_add_epi16(sum_left, sixteen); |
596 | 6.21k | sum_left = _mm_srai_epi16(sum_left, 5); |
597 | 6.21k | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
598 | 6.21k | const __m128i row = _mm_shufflelo_epi16(sum_left, 0); |
599 | 6.21k | dc_store_8xh(&row, 32, dst, stride); |
600 | 6.21k | } |
601 | | |
602 | | void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, |
603 | | const uint8_t *above, |
604 | 3.75k | const uint8_t *left) { |
605 | 3.75k | (void)above; |
606 | 3.75k | __m128i sum_left = dc_sum_4(left); |
607 | 3.75k | const __m128i two = _mm_set1_epi16(2); |
608 | 3.75k | sum_left = _mm_add_epi16(sum_left, two); |
609 | 3.75k | sum_left = _mm_srai_epi16(sum_left, 2); |
610 | 3.75k | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
611 | 3.75k | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
612 | 3.75k | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
613 | 3.75k | dc_store_16xh(&row, 4, dst, stride); |
614 | 3.75k | } |
615 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
616 | | |
617 | | void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, |
618 | | const uint8_t *above, |
619 | 7.97k | const uint8_t *left) { |
620 | 7.97k | (void)above; |
621 | 7.97k | __m128i sum_left = dc_sum_8(left); |
622 | 7.97k | const __m128i four = _mm_set1_epi16(4); |
623 | 7.97k | sum_left = _mm_add_epi16(sum_left, four); |
624 | 7.97k | sum_left = _mm_srai_epi16(sum_left, 3); |
625 | 7.97k | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
626 | 7.97k | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
627 | 7.97k | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
628 | 7.97k | dc_store_16xh(&row, 8, dst, stride); |
629 | 7.97k | } |
630 | | |
631 | | void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, |
632 | | const uint8_t *above, |
633 | 4.50k | const uint8_t *left) { |
634 | 4.50k | (void)above; |
635 | 4.50k | __m128i sum_left = dc_sum_32_sse2(left); |
636 | 4.50k | const __m128i sixteen = _mm_set1_epi16(16); |
637 | 4.50k | sum_left = _mm_add_epi16(sum_left, sixteen); |
638 | 4.50k | sum_left = _mm_srai_epi16(sum_left, 5); |
639 | 4.50k | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
640 | 4.50k | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
641 | 4.50k | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
642 | 4.50k | dc_store_16xh(&row, 32, dst, stride); |
643 | 4.50k | } |
644 | | |
645 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
646 | | void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, |
647 | | const uint8_t *above, |
648 | 458 | const uint8_t *left) { |
649 | 458 | (void)above; |
650 | 458 | __m128i sum_left = dc_sum_64(left); |
651 | 458 | const __m128i thirtytwo = _mm_set1_epi16(32); |
652 | 458 | sum_left = _mm_add_epi16(sum_left, thirtytwo); |
653 | 458 | sum_left = _mm_srai_epi16(sum_left, 6); |
654 | 458 | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
655 | 458 | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
656 | 458 | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
657 | 458 | dc_store_16xh(&row, 64, dst, stride); |
658 | 458 | } |
659 | | |
660 | | void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, |
661 | | const uint8_t *above, |
662 | 2.44k | const uint8_t *left) { |
663 | 2.44k | (void)above; |
664 | 2.44k | __m128i sum_left = dc_sum_8(left); |
665 | 2.44k | const __m128i four = _mm_set1_epi16(4); |
666 | 2.44k | sum_left = _mm_add_epi16(sum_left, four); |
667 | 2.44k | sum_left = _mm_srai_epi16(sum_left, 3); |
668 | 2.44k | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
669 | 2.44k | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
670 | 2.44k | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
671 | 2.44k | dc_store_32xh(&row, 8, dst, stride); |
672 | 2.44k | } |
673 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
674 | | |
675 | | void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, |
676 | | const uint8_t *above, |
677 | 0 | const uint8_t *left) { |
678 | 0 | (void)above; |
679 | 0 | __m128i sum_left = dc_sum_16_sse2(left); |
680 | 0 | const __m128i eight = _mm_set1_epi16(8); |
681 | 0 | sum_left = _mm_add_epi16(sum_left, eight); |
682 | 0 | sum_left = _mm_srai_epi16(sum_left, 4); |
683 | 0 | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
684 | 0 | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
685 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
686 | 0 | dc_store_32xh(&row, 16, dst, stride); |
687 | 0 | } |
688 | | |
689 | | void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, |
690 | | const uint8_t *above, |
691 | 0 | const uint8_t *left) { |
692 | 0 | (void)above; |
693 | 0 | __m128i sum_left = dc_sum_64(left); |
694 | 0 | const __m128i thirtytwo = _mm_set1_epi16(32); |
695 | 0 | sum_left = _mm_add_epi16(sum_left, thirtytwo); |
696 | 0 | sum_left = _mm_srai_epi16(sum_left, 6); |
697 | 0 | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
698 | 0 | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
699 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
700 | 0 | dc_store_32xh(&row, 64, dst, stride); |
701 | 0 | } |
702 | | |
703 | | void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, |
704 | | const uint8_t *above, |
705 | 0 | const uint8_t *left) { |
706 | 0 | (void)above; |
707 | 0 | __m128i sum_left = dc_sum_64(left); |
708 | 0 | const __m128i thirtytwo = _mm_set1_epi16(32); |
709 | 0 | sum_left = _mm_add_epi16(sum_left, thirtytwo); |
710 | 0 | sum_left = _mm_srai_epi16(sum_left, 6); |
711 | 0 | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
712 | 0 | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
713 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
714 | 0 | dc_store_64xh(&row, 64, dst, stride); |
715 | 0 | } |
716 | | |
717 | | void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, |
718 | | const uint8_t *above, |
719 | 0 | const uint8_t *left) { |
720 | 0 | (void)above; |
721 | 0 | __m128i sum_left = dc_sum_32_sse2(left); |
722 | 0 | const __m128i sixteen = _mm_set1_epi16(16); |
723 | 0 | sum_left = _mm_add_epi16(sum_left, sixteen); |
724 | 0 | sum_left = _mm_srai_epi16(sum_left, 5); |
725 | 0 | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
726 | 0 | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
727 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
728 | 0 | dc_store_64xh(&row, 32, dst, stride); |
729 | 0 | } |
730 | | |
731 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
732 | | void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, |
733 | | const uint8_t *above, |
734 | 0 | const uint8_t *left) { |
735 | 0 | (void)above; |
736 | 0 | __m128i sum_left = dc_sum_16_sse2(left); |
737 | 0 | const __m128i eight = _mm_set1_epi16(8); |
738 | 0 | sum_left = _mm_add_epi16(sum_left, eight); |
739 | 0 | sum_left = _mm_srai_epi16(sum_left, 4); |
740 | 0 | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
741 | 0 | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
742 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
743 | 0 | dc_store_64xh(&row, 16, dst, stride); |
744 | 0 | } |
745 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
746 | | |
747 | | // ----------------------------------------------------------------------------- |
748 | | // DC_128 |
749 | | |
void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  // No reference pixels available: fill the 4x8 block with mid-gray (0x80).
  dc_store_4xh(0x80808080u, 8, dst, stride);
}
757 | | |
758 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  // No reference pixels available: fill the 4x16 block with mid-gray (0x80).
  dc_store_4xh(0x80808080u, 16, dst, stride);
}
766 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
767 | | |
void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  // Mid-gray fill: 0x80 in every byte lane ((int8_t)-128 == 0x80).
  const __m128i gray = _mm_set1_epi8(-128);
  dc_store_8xh(&gray, 4, dst, stride);
}
775 | | |
void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  // Mid-gray fill: 0x80 in every byte lane ((int8_t)-128 == 0x80).
  const __m128i gray = _mm_set1_epi8(-128);
  dc_store_8xh(&gray, 16, dst, stride);
}
783 | | |
784 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  // Mid-gray fill: 0x80 in every byte lane ((int8_t)-128 == 0x80).
  const __m128i gray = _mm_set1_epi8(-128);
  dc_store_8xh(&gray, 32, dst, stride);
}
792 | | |
void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  // Mid-gray fill: 0x80 in every byte lane ((int8_t)-128 == 0x80).
  const __m128i gray = _mm_set1_epi8(-128);
  dc_store_16xh(&gray, 4, dst, stride);
}
800 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
801 | | |
void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  // Mid-gray fill: 0x80 in every byte lane ((int8_t)-128 == 0x80).
  const __m128i gray = _mm_set1_epi8(-128);
  dc_store_16xh(&gray, 8, dst, stride);
}
809 | | |
void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  // Mid-gray fill: 0x80 in every byte lane ((int8_t)-128 == 0x80).
  const __m128i gray = _mm_set1_epi8(-128);
  dc_store_16xh(&gray, 32, dst, stride);
}
818 | | |
819 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  // Mid-gray fill: 0x80 in every byte lane ((int8_t)-128 == 0x80).
  const __m128i gray = _mm_set1_epi8(-128);
  dc_store_16xh(&gray, 64, dst, stride);
}
828 | | |
void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  // Mid-gray fill: 0x80 in every byte lane ((int8_t)-128 == 0x80).
  const __m128i gray = _mm_set1_epi8(-128);
  dc_store_32xh(&gray, 8, dst, stride);
}
836 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
837 | | |
void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  // Mid-gray fill: 0x80 in every byte lane ((int8_t)-128 == 0x80).
  const __m128i gray = _mm_set1_epi8(-128);
  dc_store_32xh(&gray, 16, dst, stride);
}
846 | | |
void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  // Mid-gray fill: 0x80 in every byte lane ((int8_t)-128 == 0x80).
  const __m128i gray = _mm_set1_epi8(-128);
  dc_store_32xh(&gray, 64, dst, stride);
}
855 | | |
void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  // Mid-gray fill: 0x80 in every byte lane ((int8_t)-128 == 0x80).
  const __m128i gray = _mm_set1_epi8(-128);
  dc_store_64xh(&gray, 64, dst, stride);
}
864 | | |
void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  // Mid-gray fill: 0x80 in every byte lane ((int8_t)-128 == 0x80).
  const __m128i gray = _mm_set1_epi8(-128);
  dc_store_64xh(&gray, 32, dst, stride);
}
873 | | |
874 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  // Mid-gray fill: 0x80 in every byte lane ((int8_t)-128 == 0x80).
  const __m128i gray = _mm_set1_epi8(-128);
  dc_store_64xh(&gray, 16, dst, stride);
}
883 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
884 | | |
885 | | // ----------------------------------------------------------------------------- |
886 | | // V_PRED |
887 | | |
void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  // Replicate the 4 above pixels down all 8 rows.
  const uint32_t top = *(const uint32_t *)above;
  dc_store_4xh(top, 8, dst, stride);
}
894 | | |
895 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  // Replicate the 4 above pixels down all 16 rows.
  const uint32_t top = *(const uint32_t *)above;
  dc_store_4xh(top, 16, dst, stride);
}
902 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
903 | | |
void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  // Replicate the 8 above pixels down all 4 rows.
  const __m128i top = _mm_loadl_epi64((const __m128i *)above);
  dc_store_8xh(&top, 4, dst, stride);
}
910 | | |
void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  // Replicate the 8 above pixels down all 16 rows.
  const __m128i top = _mm_loadl_epi64((const __m128i *)above);
  dc_store_8xh(&top, 16, dst, stride);
}
917 | | |
918 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  // Replicate the 8 above pixels down all 32 rows.
  const __m128i top = _mm_loadl_epi64((const __m128i *)above);
  dc_store_8xh(&top, 32, dst, stride);
}
925 | | |
void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  // Replicate the 16 above pixels down all 4 rows.
  const __m128i top = _mm_load_si128((const __m128i *)above);
  dc_store_16xh(&top, 4, dst, stride);
}
932 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
933 | | |
void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  // Replicate the 16 above pixels down all 8 rows.
  const __m128i top = _mm_load_si128((const __m128i *)above);
  dc_store_16xh(&top, 8, dst, stride);
}
940 | | |
void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  // Replicate the 16 above pixels down all 32 rows.
  const __m128i top = _mm_load_si128((const __m128i *)above);
  dc_store_16xh(&top, 32, dst, stride);
}
947 | | |
948 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  // Replicate the 16 above pixels down all 64 rows.
  const __m128i top = _mm_load_si128((const __m128i *)above);
  dc_store_16xh(&top, 64, dst, stride);
}
955 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
956 | | |
// Copy the 32-byte above row into each of the `height` output rows.
static inline void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, int height) {
  const __m128i lo = _mm_load_si128((const __m128i *)above);
  const __m128i hi = _mm_load_si128((const __m128i *)(above + 16));
  for (int r = height; r > 0; --r) {
    _mm_store_si128((__m128i *)dst, lo);
    _mm_store_si128((__m128i *)(dst + 16), hi);
    dst += stride;
  }
}
967 | | |
968 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
// Vertical prediction, 32x8: repeat the above row down 8 rows.
void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_32xh(dst, stride, above, 8);
}
974 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
975 | | |
// Vertical prediction, 32x16: repeat the above row down 16 rows.
void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_32xh(dst, stride, above, 16);
}
981 | | |
// Vertical prediction, 32x64: repeat the above row down 64 rows.
void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_32xh(dst, stride, above, 64);
}
987 | | |
// Copy the 64-byte above row into each of the `height` output rows.
static inline void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, int height) {
  const __m128i q0 = _mm_load_si128((const __m128i *)above);
  const __m128i q1 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i q2 = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i q3 = _mm_load_si128((const __m128i *)(above + 48));
  for (int r = height; r > 0; --r) {
    _mm_store_si128((__m128i *)dst, q0);
    _mm_store_si128((__m128i *)(dst + 16), q1);
    _mm_store_si128((__m128i *)(dst + 32), q2);
    _mm_store_si128((__m128i *)(dst + 48), q3);
    dst += stride;
  }
}
1002 | | |
// Vertical prediction, 64x64: repeat the above row down 64 rows.
void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_64xh(dst, stride, above, 64);
}
1008 | | |
// Vertical prediction, 64x32: repeat the above row down 32 rows.
void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_64xh(dst, stride, above, 32);
}
1014 | | |
1015 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
// Vertical prediction, 64x16: repeat the above row down 16 rows.
void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_64xh(dst, stride, above, 16);
}
1021 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
1022 | | |
1023 | | // ----------------------------------------------------------------------------- |
1024 | | // H_PRED |
1025 | | |
// Horizontal prediction, 4x8: each output row r is left[r] replicated 4x.
void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;
  // Double each left pixel into a 16-bit lane: xx...x33221100.
  __m128i pixels = _mm_loadl_epi64((__m128i const *)left);
  pixels = _mm_unpacklo_epi8(pixels, pixels);
  for (int half = 0; half < 2; ++half) {
    // Broadcast low lane r and store its low 4 bytes as row r.
    *(int *)dst = _mm_cvtsi128_si32(_mm_shufflelo_epi16(pixels, 0x00));
    dst += stride;
    *(int *)dst = _mm_cvtsi128_si32(_mm_shufflelo_epi16(pixels, 0x55));
    dst += stride;
    *(int *)dst = _mm_cvtsi128_si32(_mm_shufflelo_epi16(pixels, 0xaa));
    dst += stride;
    *(int *)dst = _mm_cvtsi128_si32(_mm_shufflelo_epi16(pixels, 0xff));
    dst += stride;
    // Move pixels 4-7 into the low half for the second group of rows.
    pixels = _mm_unpackhi_epi64(pixels, pixels);
  }
}
1056 | | |
1057 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
1058 | | void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, |
1059 | 20.7k | const uint8_t *above, const uint8_t *left) { |
1060 | 20.7k | (void)above; |
1061 | 20.7k | const __m128i left_col = _mm_load_si128((__m128i const *)left); |
1062 | 20.7k | __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); |
1063 | 20.7k | __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); |
1064 | | |
1065 | 20.7k | __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); |
1066 | 20.7k | __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); |
1067 | 20.7k | __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); |
1068 | 20.7k | __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff); |
1069 | 20.7k | *(int *)dst = _mm_cvtsi128_si32(row0); |
1070 | 20.7k | dst += stride; |
1071 | 20.7k | *(int *)dst = _mm_cvtsi128_si32(row1); |
1072 | 20.7k | dst += stride; |
1073 | 20.7k | *(int *)dst = _mm_cvtsi128_si32(row2); |
1074 | 20.7k | dst += stride; |
1075 | 20.7k | *(int *)dst = _mm_cvtsi128_si32(row3); |
1076 | 20.7k | dst += stride; |
1077 | | |
1078 | 20.7k | left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); |
1079 | 20.7k | row0 = _mm_shufflelo_epi16(left_col_low, 0); |
1080 | 20.7k | row1 = _mm_shufflelo_epi16(left_col_low, 0x55); |
1081 | 20.7k | row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); |
1082 | 20.7k | row3 = _mm_shufflelo_epi16(left_col_low, 0xff); |
1083 | 20.7k | *(int *)dst = _mm_cvtsi128_si32(row0); |
1084 | 20.7k | dst += stride; |
1085 | 20.7k | *(int *)dst = _mm_cvtsi128_si32(row1); |
1086 | 20.7k | dst += stride; |
1087 | 20.7k | *(int *)dst = _mm_cvtsi128_si32(row2); |
1088 | 20.7k | dst += stride; |
1089 | 20.7k | *(int *)dst = _mm_cvtsi128_si32(row3); |
1090 | 20.7k | dst += stride; |
1091 | | |
1092 | 20.7k | row0 = _mm_shufflelo_epi16(left_col_high, 0); |
1093 | 20.7k | row1 = _mm_shufflelo_epi16(left_col_high, 0x55); |
1094 | 20.7k | row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); |
1095 | 20.7k | row3 = _mm_shufflelo_epi16(left_col_high, 0xff); |
1096 | 20.7k | *(int *)dst = _mm_cvtsi128_si32(row0); |
1097 | 20.7k | dst += stride; |
1098 | 20.7k | *(int *)dst = _mm_cvtsi128_si32(row1); |
1099 | 20.7k | dst += stride; |
1100 | 20.7k | *(int *)dst = _mm_cvtsi128_si32(row2); |
1101 | 20.7k | dst += stride; |
1102 | 20.7k | *(int *)dst = _mm_cvtsi128_si32(row3); |
1103 | 20.7k | dst += stride; |
1104 | | |
1105 | 20.7k | left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); |
1106 | 20.7k | row0 = _mm_shufflelo_epi16(left_col_high, 0); |
1107 | 20.7k | row1 = _mm_shufflelo_epi16(left_col_high, 0x55); |
1108 | 20.7k | row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); |
1109 | 20.7k | row3 = _mm_shufflelo_epi16(left_col_high, 0xff); |
1110 | 20.7k | *(int *)dst = _mm_cvtsi128_si32(row0); |
1111 | 20.7k | dst += stride; |
1112 | 20.7k | *(int *)dst = _mm_cvtsi128_si32(row1); |
1113 | 20.7k | dst += stride; |
1114 | 20.7k | *(int *)dst = _mm_cvtsi128_si32(row2); |
1115 | 20.7k | dst += stride; |
1116 | 20.7k | *(int *)dst = _mm_cvtsi128_si32(row3); |
1117 | 20.7k | } |
1118 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
1119 | | |
// Horizontal prediction, 8x4: each output row r is left[r] replicated 8x.
void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;
  // Double each left pixel into a 16-bit lane, then broadcast lane r
  // across the low 8 bytes for output row r.
  __m128i l = _mm_loadl_epi64((__m128i const *)left);
  l = _mm_unpacklo_epi8(l, l);
  _mm_storel_epi64((__m128i *)(dst + 0 * stride), _mm_shufflelo_epi16(l, 0x00));
  _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_shufflelo_epi16(l, 0x55));
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), _mm_shufflelo_epi16(l, 0xaa));
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_shufflelo_epi16(l, 0xff));
}
1137 | | |
// Horizontal prediction for 8-wide blocks, `count` groups of 16 rows.
// Each output row r is left[r] replicated across 8 bytes.
static inline void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, const uint8_t *left,
                                      int count) {
  (void)above;
  for (int i = 0; i < count; ++i) {
    const __m128i left_col = _mm_load_si128((__m128i const *)left);
    // Double each pixel into a 16-bit lane and split the 16 pixels into
    // four registers of 4 doubled pixels each (in their low 64 bits).
    const __m128i lo = _mm_unpacklo_epi8(left_col, left_col);
    const __m128i hi = _mm_unpackhi_epi8(left_col, left_col);
    __m128i quad[4];
    quad[0] = lo;
    quad[1] = _mm_unpackhi_epi64(lo, lo);
    quad[2] = hi;
    quad[3] = _mm_unpackhi_epi64(hi, hi);
    for (int q = 0; q < 4; ++q) {
      // Broadcast low lane r of this quad and store 8 bytes as one row.
      _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(quad[q], 0x00));
      dst += stride;
      _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(quad[q], 0x55));
      dst += stride;
      _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(quad[q], 0xaa));
      dst += stride;
      _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(quad[q], 0xff));
      dst += stride;
    }
    left += 16;
  }
}
1203 | | |
// Horizontal prediction, 8x16: one group of 16 left pixels.
void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  h_predictor_8x16xc(dst, stride, above, left, 1);
}
1208 | | |
1209 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
// Horizontal prediction, 8x32: two groups of 16 left pixels.
void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  h_predictor_8x16xc(dst, stride, above, left, 2);
}
1214 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
1215 | | |
// Store row[i] (16 bytes) to output line i, for h lines.
static inline void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
                                     ptrdiff_t stride) {
  for (int r = 0; r < h; ++r, dst += stride) {
    _mm_store_si128((__m128i *)dst, row[r]);
  }
}
1224 | | |
1225 | 659k | static inline void repeat_low_4pixels(const __m128i *x, __m128i *row) { |
1226 | 659k | const __m128i u0 = _mm_shufflelo_epi16(*x, 0); |
1227 | 659k | const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55); |
1228 | 659k | const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa); |
1229 | 659k | const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff); |
1230 | | |
1231 | 659k | row[0] = _mm_unpacklo_epi64(u0, u0); |
1232 | 659k | row[1] = _mm_unpacklo_epi64(u1, u1); |
1233 | 659k | row[2] = _mm_unpacklo_epi64(u2, u2); |
1234 | 659k | row[3] = _mm_unpacklo_epi64(u3, u3); |
1235 | 659k | } |
1236 | | |
1237 | 505k | static inline void repeat_high_4pixels(const __m128i *x, __m128i *row) { |
1238 | 505k | const __m128i u0 = _mm_shufflehi_epi16(*x, 0); |
1239 | 505k | const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55); |
1240 | 505k | const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa); |
1241 | 505k | const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff); |
1242 | | |
1243 | 505k | row[0] = _mm_unpackhi_epi64(u0, u0); |
1244 | 505k | row[1] = _mm_unpackhi_epi64(u1, u1); |
1245 | 505k | row[2] = _mm_unpackhi_epi64(u2, u2); |
1246 | 505k | row[3] = _mm_unpackhi_epi64(u3, u3); |
1247 | 505k | } |
1248 | | |
1249 | | // Process 16x8, first 4 rows |
1250 | | // Use first 8 bytes of left register: xxxxxxxx33221100 |
static inline void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  // Broadcast doubled left pixels 0-3 (low 16-bit lanes) into four
  // full-width registers and store them as 4 rows of 16 bytes.
  __m128i row[4];
  repeat_low_4pixels(left, row);
  h_pred_store_16xh(row, 4, dst, stride);
}
1257 | | |
1258 | | // Process 16x8, second 4 rows |
1259 | | // Use second 8 bytes of left register: 77665544xxxxxxxx |
static inline void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  // Broadcast doubled left pixels 4-7 (high 16-bit lanes) into four
  // full-width registers and store them as 4 rows of 16 bytes.
  __m128i row[4];
  repeat_high_4pixels(left, row);
  h_pred_store_16xh(row, 4, dst, stride);
}
1266 | | |
1267 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
// Horizontal prediction, 16x4: each output row r is left[r] replicated 16x.
void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  // Double each of the left pixels into a 16-bit lane.
  const __m128i l4 = _mm_loadl_epi64((const __m128i *)left);
  const __m128i doubled = _mm_unpacklo_epi8(l4, l4);
  h_prediction_16x8_1(&doubled, dst, stride);
}
1275 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
1276 | | |
// Horizontal prediction, 16x8: each output row r is left[r] replicated 16x.
void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  // Double each of the 8 left pixels into a 16-bit lane.
  const __m128i l8 = _mm_loadl_epi64((const __m128i *)left);
  const __m128i doubled = _mm_unpacklo_epi8(l8, l8);
  h_prediction_16x8_1(&doubled, dst, stride);                   // rows 0-3
  h_prediction_16x8_2(&doubled, dst + (stride << 2), stride);   // rows 4-7
}
1286 | | |
// Horizontal prediction for 16-wide blocks; each loop iteration consumes
// 16 left pixels and writes 16 rows. `count` must be >= 1.
static inline void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int count) {
  int i = 0;
  do {
    const __m128i left_col = _mm_load_si128((const __m128i *)left);
    // Low 8 pixels -> rows 0-7 of this group.
    const __m128i lo = _mm_unpacklo_epi8(left_col, left_col);
    h_prediction_16x8_1(&lo, dst, stride);
    dst += stride << 2;
    h_prediction_16x8_2(&lo, dst, stride);
    dst += stride << 2;
    // High 8 pixels -> rows 8-15 of this group.
    const __m128i hi = _mm_unpackhi_epi8(left_col, left_col);
    h_prediction_16x8_1(&hi, dst, stride);
    dst += stride << 2;
    h_prediction_16x8_2(&hi, dst, stride);
    dst += stride << 2;
    left += 16;
  } while (++i < count);
}
1308 | | |
// Horizontal prediction, 16x32: two groups of 16 left pixels.
void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  h_predictor_16xh(dst, stride, left, 2);
}
1314 | | |
1315 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
// Horizontal prediction, 16x64: four groups of 16 left pixels.
void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  h_predictor_16xh(dst, stride, left, 4);
}
1321 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
1322 | | |
// Store row[i] twice (32 bytes total) to output line i, for h lines.
static inline void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
                                     ptrdiff_t stride) {
  for (int r = 0; r < h; ++r, dst += stride) {
    _mm_store_si128((__m128i *)dst, row[r]);
    _mm_store_si128((__m128i *)(dst + 16), row[r]);
  }
}
1332 | | |
1333 | | // Process 32x8, first 4 rows |
1334 | | // Use first 8 bytes of left register: xxxxxxxx33221100 |
static inline void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  // Broadcast doubled left pixels 0-3 (low 16-bit lanes) into four
  // registers and store each twice per line to cover 32 columns.
  __m128i row[4];
  repeat_low_4pixels(left, row);
  h_pred_store_32xh(row, 4, dst, stride);
}
1341 | | |
1342 | | // Process 32x8, second 4 rows |
1343 | | // Use second 8 bytes of left register: 77665544xxxxxxxx |
static inline void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  // Broadcast doubled left pixels 4-7 (high 16-bit lanes) into four
  // registers and store each twice per line to cover 32 columns.
  __m128i row[4];
  repeat_high_4pixels(left, row);
  h_pred_store_32xh(row, 4, dst, stride);
}
1350 | | |
1351 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
// Horizontal prediction, 32x8: each output row r is left[r] replicated 32x.
void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i l16 = _mm_load_si128((const __m128i *)left);
  // Double each of the low 8 left pixels into a 16-bit lane.
  const __m128i doubled = _mm_unpacklo_epi8(l16, l16);
  h_prediction_32x8_1(&doubled, dst, stride);                   // rows 0-3
  h_prediction_32x8_2(&doubled, dst + (stride << 2), stride);   // rows 4-7
}
1364 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
1365 | | |
// H_PRED for a 32x16 block: rows 0-7 replicate the low 8 left pixels,
// rows 8-15 the high 8. `above` is unused by H_PRED.
void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i l = _mm_load_si128((const __m128i *)left);

  // Low half: bytes 0..7 duplicated pairwise -> 7766554433221100.
  __m128i dup = _mm_unpacklo_epi8(l, l);
  h_prediction_32x8_1(&dup, dst, stride);
  dst += stride << 2;
  h_prediction_32x8_2(&dup, dst, stride);
  dst += stride << 2;

  // High half: bytes 8..15 duplicated pairwise -> ffeeddccbbaa9988.
  dup = _mm_unpackhi_epi8(l, l);
  h_prediction_32x8_1(&dup, dst, stride);
  dst += stride << 2;
  h_prediction_32x8_2(&dup, dst, stride);
}
1384 | | |
// Horizontal predictor for 32-wide blocks: each output row is the
// corresponding left-column pixel replicated across all 32 columns.
// `height` must be a positive multiple of 4 (rows are emitted 4 at a
// time); `dst` must be 16-byte aligned since aligned stores are used.
static inline void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int height) {
  int i = height >> 2;
  do {
    // Assemble the next 4 left pixels byte-by-byte instead of casting
    // `left` to (int *): the cast discarded const and read the uint8_t
    // buffer through an incompatible type (strict-aliasing UB, and a
    // potentially misaligned load). Compilers fold this into a single
    // 32-bit load.
    const uint32_t left4_u32 = (uint32_t)left[0] | ((uint32_t)left[1] << 8) |
                               ((uint32_t)left[2] << 16) |
                               ((uint32_t)left[3] << 24);
    __m128i left4 = _mm_cvtsi32_si128((int)left4_u32);
    // Duplicate each byte 4x: 33221100 -> 3333222211110000.
    left4 = _mm_unpacklo_epi8(left4, left4);
    left4 = _mm_unpacklo_epi8(left4, left4);
    // Broadcast each 32-bit lane to a full register and store two
    // 16-byte halves per 32-wide row.
    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r0);
    _mm_store_si128((__m128i *)(dst + stride), r1);
    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
    left += 4;
    dst += stride * 4;
  } while (--i);
}
1408 | | |
// H_PRED for a 32x64 block; delegates to the generic 32-wide helper
// with height 64. `above` is unused by H_PRED.
void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  h_predictor_32xh(dst, stride, left, 64);
}
1414 | | |
// Horizontal predictor for 64-wide blocks: each output row is the
// corresponding left-column pixel replicated across all 64 columns.
// `height` must be a positive multiple of 4 (rows are emitted 4 at a
// time); `dst` must be 16-byte aligned since aligned stores are used.
static inline void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int height) {
  int i = height >> 2;
  do {
    // Assemble the next 4 left pixels byte-by-byte instead of casting
    // `left` to (int *): the cast discarded const and read the uint8_t
    // buffer through an incompatible type (strict-aliasing UB).
    // Compilers fold this into a single 32-bit load.
    const uint32_t left4_u32 = (uint32_t)left[0] | ((uint32_t)left[1] << 8) |
                               ((uint32_t)left[2] << 16) |
                               ((uint32_t)left[3] << 24);
    __m128i left4 = _mm_cvtsi32_si128((int)left4_u32);
    // Duplicate each byte 4x: 33221100 -> 3333222211110000.
    left4 = _mm_unpacklo_epi8(left4, left4);
    left4 = _mm_unpacklo_epi8(left4, left4);
    // Broadcast each 32-bit lane to a full register and store four
    // 16-byte chunks per 64-wide row.
    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r0);
    _mm_store_si128((__m128i *)(dst + 32), r0);
    _mm_store_si128((__m128i *)(dst + 48), r0);
    _mm_store_si128((__m128i *)(dst + stride), r1);
    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
    _mm_store_si128((__m128i *)(dst + stride + 32), r1);
    _mm_store_si128((__m128i *)(dst + stride + 48), r1);
    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
    left += 4;
    dst += stride * 4;
  } while (--i);
}
1446 | | |
// H_PRED for a 64x64 block; delegates to the generic 64-wide helper
// with height 64. `above` is unused by H_PRED.
void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  h_predictor_64xh(dst, stride, left, 64);
}
1452 | | |
// H_PRED for a 64x32 block; delegates to the generic 64-wide helper
// with height 32. `above` is unused by H_PRED.
void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  h_predictor_64xh(dst, stride, left, 32);
}
1458 | | |
1459 | | #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |
// H_PRED for a 64x16 block; delegates to the generic 64-wide helper
// with height 16. `above` is unused by H_PRED.
void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  h_predictor_64xh(dst, stride, left, 16);
}
1465 | | #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER |