/src/aom/aom_dsp/x86/intrapred_sse2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2017, Alliance for Open Media. All rights reserved |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
#include <emmintrin.h>
#include <string.h>

#include "aom_dsp/x86/intrapred_x86.h"
#include "config/aom_dsp_rtcd.h"
15 | | |
16 | | static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst, |
17 | 579k | ptrdiff_t stride) { |
18 | 3.84M | for (int i = 0; i < height; i += 2) { |
19 | 3.26M | *(uint32_t *)dst = dc; |
20 | 3.26M | dst += stride; |
21 | 3.26M | *(uint32_t *)dst = dc; |
22 | 3.26M | dst += stride; |
23 | 3.26M | } |
24 | 579k | } |
25 | | |
26 | | static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst, |
27 | 880k | ptrdiff_t stride) { |
28 | 880k | int i; |
29 | 10.5M | for (i = 0; i < height; ++i) { |
30 | 9.68M | _mm_storel_epi64((__m128i *)dst, *row); |
31 | 9.68M | dst += stride; |
32 | 9.68M | } |
33 | 880k | } |
34 | | |
35 | | static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst, |
36 | 1.44M | ptrdiff_t stride) { |
37 | 1.44M | int i; |
38 | 14.1M | for (i = 0; i < height; ++i) { |
39 | 12.6M | _mm_store_si128((__m128i *)dst, *row); |
40 | 12.6M | dst += stride; |
41 | 12.6M | } |
42 | 1.44M | } |
43 | | |
44 | | static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst, |
45 | 500k | ptrdiff_t stride) { |
46 | 500k | int i; |
47 | 4.50M | for (i = 0; i < height; ++i) { |
48 | 4.00M | _mm_store_si128((__m128i *)dst, *row); |
49 | 4.00M | _mm_store_si128((__m128i *)(dst + 16), *row); |
50 | 4.00M | dst += stride; |
51 | 4.00M | } |
52 | 500k | } |
53 | | |
54 | | static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst, |
55 | 0 | ptrdiff_t stride) { |
56 | 0 | for (int i = 0; i < height; ++i) { |
57 | 0 | _mm_store_si128((__m128i *)dst, *row); |
58 | 0 | _mm_store_si128((__m128i *)(dst + 16), *row); |
59 | 0 | _mm_store_si128((__m128i *)(dst + 32), *row); |
60 | 0 | _mm_store_si128((__m128i *)(dst + 48), *row); |
61 | 0 | dst += stride; |
62 | 0 | } |
63 | 0 | } |
64 | | |
// Returns sum(ref[0..3]) in the low 64-bit lane of the result.
// NOTE(review): the load reads 8 bytes at ref even though only the first 4
// are summed — assumes the edge buffer has at least 8 readable bytes; confirm
// against callers' padding.
static INLINE __m128i dc_sum_4(const uint8_t *ref) {
  __m128i x = _mm_loadl_epi64((__m128i const *)ref);
  const __m128i zero = _mm_setzero_si128();
  // Interleaving with zero places ref[0..3] in the low 8 bytes and ref[4..7]
  // in the high 8 bytes, so the SAD below keeps the two groups in separate
  // 64-bit lanes; callers read only the low lane.
  x = _mm_unpacklo_epi8(x, zero);
  // SAD against zero = horizontal byte sum per 64-bit lane.
  return _mm_sad_epu8(x, zero);
}
71 | | |
72 | 1.96M | static INLINE __m128i dc_sum_8(const uint8_t *ref) { |
73 | 1.96M | __m128i x = _mm_loadl_epi64((__m128i const *)ref); |
74 | 1.96M | const __m128i zero = _mm_setzero_si128(); |
75 | 1.96M | return _mm_sad_epu8(x, zero); |
76 | 1.96M | } |
77 | | |
78 | 16.9k | static INLINE __m128i dc_sum_64(const uint8_t *ref) { |
79 | 16.9k | __m128i x0 = _mm_load_si128((__m128i const *)ref); |
80 | 16.9k | __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); |
81 | 16.9k | __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32)); |
82 | 16.9k | __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48)); |
83 | 16.9k | const __m128i zero = _mm_setzero_si128(); |
84 | 16.9k | x0 = _mm_sad_epu8(x0, zero); |
85 | 16.9k | x1 = _mm_sad_epu8(x1, zero); |
86 | 16.9k | x2 = _mm_sad_epu8(x2, zero); |
87 | 16.9k | x3 = _mm_sad_epu8(x3, zero); |
88 | 16.9k | x0 = _mm_add_epi16(x0, x1); |
89 | 16.9k | x2 = _mm_add_epi16(x2, x3); |
90 | 16.9k | x0 = _mm_add_epi16(x0, x2); |
91 | 16.9k | const __m128i high = _mm_unpackhi_epi64(x0, x0); |
92 | 16.9k | return _mm_add_epi16(x0, high); |
93 | 16.9k | } |
94 | | |
// 16.16 fixed-point reciprocals used to divide the DC sum by (w + h) for
// rectangular blocks: after shifting out min(log2(w), log2(h)) bits the
// remaining divisor is 3 (0x5556 ~= 2^16 / 3) or 5 (0x3334 ~= 2^16 / 5).
#define DC_MULTIPLIER_1X2 0x5556
#define DC_MULTIPLIER_1X4 0x3334

#define DC_SHIFT2 16

// Computes (num >> shift1) / d via multiplication by `multiplier`, a
// 2^DC_SHIFT2 / d fixed-point reciprocal.  The constants above are chosen so
// the result is exact for the sum ranges 8-bit pixels can produce at the
// block sizes in this file — presumably verified upstream; confirm if reused
// with larger operands.
static INLINE int divide_using_multiply_shift(int num, int shift1,
                                              int multiplier) {
  const int interm = num >> shift1;
  return interm * multiplier >> DC_SHIFT2;
}
105 | | |
106 | | // ----------------------------------------------------------------------------- |
107 | | // DC_PRED |
108 | | |
109 | | void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, |
110 | 305k | const uint8_t *above, const uint8_t *left) { |
111 | 305k | const __m128i sum_left = dc_sum_8(left); |
112 | 305k | __m128i sum_above = dc_sum_4(above); |
113 | 305k | sum_above = _mm_add_epi16(sum_left, sum_above); |
114 | | |
115 | 305k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
116 | 305k | sum += 6; |
117 | 305k | sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); |
118 | | |
119 | 305k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
120 | 305k | const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row); |
121 | 305k | dc_store_4xh(pred, 8, dst, stride); |
122 | 305k | } |
123 | | |
124 | | void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, |
125 | 220k | const uint8_t *above, const uint8_t *left) { |
126 | 220k | const __m128i sum_left = dc_sum_16_sse2(left); |
127 | 220k | __m128i sum_above = dc_sum_4(above); |
128 | 220k | sum_above = _mm_add_epi16(sum_left, sum_above); |
129 | | |
130 | 220k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
131 | 220k | sum += 10; |
132 | 220k | sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); |
133 | | |
134 | 220k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
135 | 220k | const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row); |
136 | 220k | dc_store_4xh(pred, 16, dst, stride); |
137 | 220k | } |
138 | | |
139 | | void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, |
140 | 473k | const uint8_t *above, const uint8_t *left) { |
141 | 473k | const __m128i sum_left = dc_sum_4(left); |
142 | 473k | __m128i sum_above = dc_sum_8(above); |
143 | 473k | sum_above = _mm_add_epi16(sum_above, sum_left); |
144 | | |
145 | 473k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
146 | 473k | sum += 6; |
147 | 473k | sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); |
148 | | |
149 | 473k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
150 | 473k | dc_store_8xh(&row, 4, dst, stride); |
151 | 473k | } |
152 | | |
153 | | void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, |
154 | 201k | const uint8_t *above, const uint8_t *left) { |
155 | 201k | const __m128i sum_left = dc_sum_16_sse2(left); |
156 | 201k | __m128i sum_above = dc_sum_8(above); |
157 | 201k | sum_above = _mm_add_epi16(sum_above, sum_left); |
158 | | |
159 | 201k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
160 | 201k | sum += 12; |
161 | 201k | sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); |
162 | 201k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
163 | 201k | dc_store_8xh(&row, 16, dst, stride); |
164 | 201k | } |
165 | | |
166 | | void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, |
167 | 95.7k | const uint8_t *above, const uint8_t *left) { |
168 | 95.7k | const __m128i sum_left = dc_sum_32_sse2(left); |
169 | 95.7k | __m128i sum_above = dc_sum_8(above); |
170 | 95.7k | sum_above = _mm_add_epi16(sum_above, sum_left); |
171 | | |
172 | 95.7k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
173 | 95.7k | sum += 20; |
174 | 95.7k | sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); |
175 | 95.7k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
176 | 95.7k | dc_store_8xh(&row, 32, dst, stride); |
177 | 95.7k | } |
178 | | |
179 | | void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, |
180 | 798k | const uint8_t *above, const uint8_t *left) { |
181 | 798k | const __m128i sum_left = dc_sum_4(left); |
182 | 798k | __m128i sum_above = dc_sum_16_sse2(above); |
183 | 798k | sum_above = _mm_add_epi16(sum_above, sum_left); |
184 | | |
185 | 798k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
186 | 798k | sum += 10; |
187 | 798k | sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); |
188 | 798k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
189 | 798k | dc_store_16xh(&row, 4, dst, stride); |
190 | 798k | } |
191 | | |
192 | | void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, |
193 | 377k | const uint8_t *above, const uint8_t *left) { |
194 | 377k | const __m128i sum_left = dc_sum_8(left); |
195 | 377k | __m128i sum_above = dc_sum_16_sse2(above); |
196 | 377k | sum_above = _mm_add_epi16(sum_above, sum_left); |
197 | | |
198 | 377k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
199 | 377k | sum += 12; |
200 | 377k | sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); |
201 | 377k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
202 | 377k | dc_store_16xh(&row, 8, dst, stride); |
203 | 377k | } |
204 | | |
205 | | void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, |
206 | 121k | const uint8_t *above, const uint8_t *left) { |
207 | 121k | const __m128i sum_left = dc_sum_32_sse2(left); |
208 | 121k | __m128i sum_above = dc_sum_16_sse2(above); |
209 | 121k | sum_above = _mm_add_epi16(sum_left, sum_above); |
210 | | |
211 | 121k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
212 | 121k | sum += 24; |
213 | 121k | sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); |
214 | 121k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
215 | 121k | dc_store_16xh(&row, 32, dst, stride); |
216 | 121k | } |
217 | | |
218 | | void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, |
219 | 15.0k | const uint8_t *above, const uint8_t *left) { |
220 | 15.0k | const __m128i sum_left = dc_sum_64(left); |
221 | 15.0k | __m128i sum_above = dc_sum_16_sse2(above); |
222 | 15.0k | sum_above = _mm_add_epi16(sum_left, sum_above); |
223 | | |
224 | 15.0k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
225 | 15.0k | sum += 40; |
226 | 15.0k | sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); |
227 | 15.0k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
228 | 15.0k | dc_store_16xh(&row, 64, dst, stride); |
229 | 15.0k | } |
230 | | |
231 | | void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, |
232 | 484k | const uint8_t *above, const uint8_t *left) { |
233 | 484k | __m128i sum_above = dc_sum_32_sse2(above); |
234 | 484k | const __m128i sum_left = dc_sum_8(left); |
235 | 484k | sum_above = _mm_add_epi16(sum_above, sum_left); |
236 | | |
237 | 484k | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
238 | 484k | sum += 20; |
239 | 484k | sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); |
240 | 484k | const __m128i row = _mm_set1_epi8((int8_t)sum); |
241 | 484k | dc_store_32xh(&row, 8, dst, stride); |
242 | 484k | } |
243 | | |
244 | | void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, |
245 | 0 | const uint8_t *above, const uint8_t *left) { |
246 | 0 | __m128i sum_above = dc_sum_32_sse2(above); |
247 | 0 | const __m128i sum_left = dc_sum_16_sse2(left); |
248 | 0 | sum_above = _mm_add_epi16(sum_above, sum_left); |
249 | |
|
250 | 0 | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
251 | 0 | sum += 24; |
252 | 0 | sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); |
253 | 0 | const __m128i row = _mm_set1_epi8((int8_t)sum); |
254 | 0 | dc_store_32xh(&row, 16, dst, stride); |
255 | 0 | } |
256 | | |
257 | | void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, |
258 | 0 | const uint8_t *above, const uint8_t *left) { |
259 | 0 | __m128i sum_above = dc_sum_32_sse2(above); |
260 | 0 | const __m128i sum_left = dc_sum_64(left); |
261 | 0 | sum_above = _mm_add_epi16(sum_above, sum_left); |
262 | |
|
263 | 0 | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
264 | 0 | sum += 48; |
265 | 0 | sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); |
266 | 0 | const __m128i row = _mm_set1_epi8((int8_t)sum); |
267 | 0 | dc_store_32xh(&row, 64, dst, stride); |
268 | 0 | } |
269 | | |
270 | | void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, |
271 | 0 | const uint8_t *above, const uint8_t *left) { |
272 | 0 | __m128i sum_above = dc_sum_64(above); |
273 | 0 | const __m128i sum_left = dc_sum_64(left); |
274 | 0 | sum_above = _mm_add_epi16(sum_above, sum_left); |
275 | |
|
276 | 0 | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
277 | 0 | sum += 64; |
278 | 0 | sum /= 128; |
279 | 0 | const __m128i row = _mm_set1_epi8((int8_t)sum); |
280 | 0 | dc_store_64xh(&row, 64, dst, stride); |
281 | 0 | } |
282 | | |
283 | | void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, |
284 | 0 | const uint8_t *above, const uint8_t *left) { |
285 | 0 | __m128i sum_above = dc_sum_64(above); |
286 | 0 | const __m128i sum_left = dc_sum_32_sse2(left); |
287 | 0 | sum_above = _mm_add_epi16(sum_above, sum_left); |
288 | |
|
289 | 0 | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
290 | 0 | sum += 48; |
291 | 0 | sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); |
292 | 0 | const __m128i row = _mm_set1_epi8((int8_t)sum); |
293 | 0 | dc_store_64xh(&row, 32, dst, stride); |
294 | 0 | } |
295 | | |
296 | | void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, |
297 | 0 | const uint8_t *above, const uint8_t *left) { |
298 | 0 | __m128i sum_above = dc_sum_64(above); |
299 | 0 | const __m128i sum_left = dc_sum_16_sse2(left); |
300 | 0 | sum_above = _mm_add_epi16(sum_above, sum_left); |
301 | |
|
302 | 0 | uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); |
303 | 0 | sum += 40; |
304 | 0 | sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); |
305 | 0 | const __m128i row = _mm_set1_epi8((int8_t)sum); |
306 | 0 | dc_store_64xh(&row, 16, dst, stride); |
307 | 0 | } |
308 | | |
309 | | // ----------------------------------------------------------------------------- |
310 | | // DC_TOP |
311 | | |
312 | | void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, |
313 | 3.93k | const uint8_t *above, const uint8_t *left) { |
314 | 3.93k | (void)left; |
315 | 3.93k | __m128i sum_above = dc_sum_4(above); |
316 | 3.93k | const __m128i two = _mm_set1_epi16(2); |
317 | 3.93k | sum_above = _mm_add_epi16(sum_above, two); |
318 | 3.93k | sum_above = _mm_srai_epi16(sum_above, 2); |
319 | 3.93k | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
320 | 3.93k | sum_above = _mm_packus_epi16(sum_above, sum_above); |
321 | | |
322 | 3.93k | const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above); |
323 | 3.93k | dc_store_4xh(pred, 8, dst, stride); |
324 | 3.93k | } |
325 | | |
326 | | void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, |
327 | 2.55k | const uint8_t *above, const uint8_t *left) { |
328 | 2.55k | (void)left; |
329 | 2.55k | __m128i sum_above = dc_sum_4(above); |
330 | 2.55k | const __m128i two = _mm_set1_epi16(2); |
331 | 2.55k | sum_above = _mm_add_epi16(sum_above, two); |
332 | 2.55k | sum_above = _mm_srai_epi16(sum_above, 2); |
333 | 2.55k | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
334 | 2.55k | sum_above = _mm_packus_epi16(sum_above, sum_above); |
335 | | |
336 | 2.55k | const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above); |
337 | 2.55k | dc_store_4xh(pred, 16, dst, stride); |
338 | 2.55k | } |
339 | | |
340 | | void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, |
341 | 2.86k | const uint8_t *above, const uint8_t *left) { |
342 | 2.86k | (void)left; |
343 | 2.86k | __m128i sum_above = dc_sum_8(above); |
344 | 2.86k | const __m128i four = _mm_set1_epi16(4); |
345 | 2.86k | sum_above = _mm_add_epi16(sum_above, four); |
346 | 2.86k | sum_above = _mm_srai_epi16(sum_above, 3); |
347 | 2.86k | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
348 | 2.86k | const __m128i row = _mm_shufflelo_epi16(sum_above, 0); |
349 | 2.86k | dc_store_8xh(&row, 4, dst, stride); |
350 | 2.86k | } |
351 | | |
352 | | void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, |
353 | 2.86k | const uint8_t *above, const uint8_t *left) { |
354 | 2.86k | (void)left; |
355 | 2.86k | __m128i sum_above = dc_sum_8(above); |
356 | 2.86k | const __m128i four = _mm_set1_epi16(4); |
357 | 2.86k | sum_above = _mm_add_epi16(sum_above, four); |
358 | 2.86k | sum_above = _mm_srai_epi16(sum_above, 3); |
359 | 2.86k | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
360 | 2.86k | const __m128i row = _mm_shufflelo_epi16(sum_above, 0); |
361 | 2.86k | dc_store_8xh(&row, 16, dst, stride); |
362 | 2.86k | } |
363 | | |
364 | | void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, |
365 | 3.86k | const uint8_t *above, const uint8_t *left) { |
366 | 3.86k | (void)left; |
367 | 3.86k | __m128i sum_above = dc_sum_8(above); |
368 | 3.86k | const __m128i four = _mm_set1_epi16(4); |
369 | 3.86k | sum_above = _mm_add_epi16(sum_above, four); |
370 | 3.86k | sum_above = _mm_srai_epi16(sum_above, 3); |
371 | 3.86k | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
372 | 3.86k | const __m128i row = _mm_shufflelo_epi16(sum_above, 0); |
373 | 3.86k | dc_store_8xh(&row, 32, dst, stride); |
374 | 3.86k | } |
375 | | |
376 | | void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, |
377 | 8.28k | const uint8_t *above, const uint8_t *left) { |
378 | 8.28k | (void)left; |
379 | 8.28k | __m128i sum_above = dc_sum_16_sse2(above); |
380 | 8.28k | const __m128i eight = _mm_set1_epi16(8); |
381 | 8.28k | sum_above = _mm_add_epi16(sum_above, eight); |
382 | 8.28k | sum_above = _mm_srai_epi16(sum_above, 4); |
383 | 8.28k | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
384 | 8.28k | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
385 | 8.28k | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
386 | 8.28k | dc_store_16xh(&row, 4, dst, stride); |
387 | 8.28k | } |
388 | | |
389 | | void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, |
390 | 3.58k | const uint8_t *above, const uint8_t *left) { |
391 | 3.58k | (void)left; |
392 | 3.58k | __m128i sum_above = dc_sum_16_sse2(above); |
393 | 3.58k | const __m128i eight = _mm_set1_epi16(8); |
394 | 3.58k | sum_above = _mm_add_epi16(sum_above, eight); |
395 | 3.58k | sum_above = _mm_srai_epi16(sum_above, 4); |
396 | 3.58k | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
397 | 3.58k | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
398 | 3.58k | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
399 | 3.58k | dc_store_16xh(&row, 8, dst, stride); |
400 | 3.58k | } |
401 | | |
402 | | void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, |
403 | | const uint8_t *above, |
404 | 4.21k | const uint8_t *left) { |
405 | 4.21k | (void)left; |
406 | 4.21k | __m128i sum_above = dc_sum_16_sse2(above); |
407 | 4.21k | const __m128i eight = _mm_set1_epi16(8); |
408 | 4.21k | sum_above = _mm_add_epi16(sum_above, eight); |
409 | 4.21k | sum_above = _mm_srai_epi16(sum_above, 4); |
410 | 4.21k | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
411 | 4.21k | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
412 | 4.21k | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
413 | 4.21k | dc_store_16xh(&row, 32, dst, stride); |
414 | 4.21k | } |
415 | | |
416 | | void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, |
417 | | const uint8_t *above, |
418 | 204 | const uint8_t *left) { |
419 | 204 | (void)left; |
420 | 204 | __m128i sum_above = dc_sum_16_sse2(above); |
421 | 204 | const __m128i eight = _mm_set1_epi16(8); |
422 | 204 | sum_above = _mm_add_epi16(sum_above, eight); |
423 | 204 | sum_above = _mm_srai_epi16(sum_above, 4); |
424 | 204 | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
425 | 204 | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
426 | 204 | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
427 | 204 | dc_store_16xh(&row, 64, dst, stride); |
428 | 204 | } |
429 | | |
430 | | void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, |
431 | 12.3k | const uint8_t *above, const uint8_t *left) { |
432 | 12.3k | (void)left; |
433 | 12.3k | __m128i sum_above = dc_sum_32_sse2(above); |
434 | 12.3k | const __m128i sixteen = _mm_set1_epi16(16); |
435 | 12.3k | sum_above = _mm_add_epi16(sum_above, sixteen); |
436 | 12.3k | sum_above = _mm_srai_epi16(sum_above, 5); |
437 | 12.3k | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
438 | 12.3k | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
439 | 12.3k | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
440 | 12.3k | dc_store_32xh(&row, 8, dst, stride); |
441 | 12.3k | } |
442 | | |
443 | | void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, |
444 | | const uint8_t *above, |
445 | 0 | const uint8_t *left) { |
446 | 0 | (void)left; |
447 | 0 | __m128i sum_above = dc_sum_32_sse2(above); |
448 | 0 | const __m128i sixteen = _mm_set1_epi16(16); |
449 | 0 | sum_above = _mm_add_epi16(sum_above, sixteen); |
450 | 0 | sum_above = _mm_srai_epi16(sum_above, 5); |
451 | 0 | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
452 | 0 | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
453 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
454 | 0 | dc_store_32xh(&row, 16, dst, stride); |
455 | 0 | } |
456 | | |
457 | | void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, |
458 | | const uint8_t *above, |
459 | 0 | const uint8_t *left) { |
460 | 0 | (void)left; |
461 | 0 | __m128i sum_above = dc_sum_32_sse2(above); |
462 | 0 | const __m128i sixteen = _mm_set1_epi16(16); |
463 | 0 | sum_above = _mm_add_epi16(sum_above, sixteen); |
464 | 0 | sum_above = _mm_srai_epi16(sum_above, 5); |
465 | 0 | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
466 | 0 | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
467 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
468 | 0 | dc_store_32xh(&row, 64, dst, stride); |
469 | 0 | } |
470 | | |
471 | | void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, |
472 | | const uint8_t *above, |
473 | 0 | const uint8_t *left) { |
474 | 0 | (void)left; |
475 | 0 | __m128i sum_above = dc_sum_64(above); |
476 | 0 | const __m128i thirtytwo = _mm_set1_epi16(32); |
477 | 0 | sum_above = _mm_add_epi16(sum_above, thirtytwo); |
478 | 0 | sum_above = _mm_srai_epi16(sum_above, 6); |
479 | 0 | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
480 | 0 | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
481 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
482 | 0 | dc_store_64xh(&row, 64, dst, stride); |
483 | 0 | } |
484 | | |
485 | | void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, |
486 | | const uint8_t *above, |
487 | 0 | const uint8_t *left) { |
488 | 0 | (void)left; |
489 | 0 | __m128i sum_above = dc_sum_64(above); |
490 | 0 | const __m128i thirtytwo = _mm_set1_epi16(32); |
491 | 0 | sum_above = _mm_add_epi16(sum_above, thirtytwo); |
492 | 0 | sum_above = _mm_srai_epi16(sum_above, 6); |
493 | 0 | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
494 | 0 | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
495 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
496 | 0 | dc_store_64xh(&row, 32, dst, stride); |
497 | 0 | } |
498 | | |
499 | | void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, |
500 | | const uint8_t *above, |
501 | 0 | const uint8_t *left) { |
502 | 0 | (void)left; |
503 | 0 | __m128i sum_above = dc_sum_64(above); |
504 | 0 | const __m128i thirtytwo = _mm_set1_epi16(32); |
505 | 0 | sum_above = _mm_add_epi16(sum_above, thirtytwo); |
506 | 0 | sum_above = _mm_srai_epi16(sum_above, 6); |
507 | 0 | sum_above = _mm_unpacklo_epi8(sum_above, sum_above); |
508 | 0 | sum_above = _mm_shufflelo_epi16(sum_above, 0); |
509 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); |
510 | 0 | dc_store_64xh(&row, 16, dst, stride); |
511 | 0 | } |
512 | | |
513 | | // ----------------------------------------------------------------------------- |
514 | | // DC_LEFT |
515 | | |
516 | | void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, |
517 | 3.28k | const uint8_t *above, const uint8_t *left) { |
518 | 3.28k | (void)above; |
519 | 3.28k | __m128i sum_left = dc_sum_8(left); |
520 | 3.28k | const __m128i four = _mm_set1_epi16(4); |
521 | 3.28k | sum_left = _mm_add_epi16(sum_left, four); |
522 | 3.28k | sum_left = _mm_srai_epi16(sum_left, 3); |
523 | 3.28k | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
524 | 3.28k | sum_left = _mm_packus_epi16(sum_left, sum_left); |
525 | | |
526 | 3.28k | const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left); |
527 | 3.28k | dc_store_4xh(pred, 8, dst, stride); |
528 | 3.28k | } |
529 | | |
530 | | void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, |
531 | | const uint8_t *above, |
532 | 6.10k | const uint8_t *left) { |
533 | 6.10k | (void)above; |
534 | 6.10k | __m128i sum_left = dc_sum_16_sse2(left); |
535 | 6.10k | const __m128i eight = _mm_set1_epi16(8); |
536 | 6.10k | sum_left = _mm_add_epi16(sum_left, eight); |
537 | 6.10k | sum_left = _mm_srai_epi16(sum_left, 4); |
538 | 6.10k | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
539 | 6.10k | sum_left = _mm_packus_epi16(sum_left, sum_left); |
540 | | |
541 | 6.10k | const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left); |
542 | 6.10k | dc_store_4xh(pred, 16, dst, stride); |
543 | 6.10k | } |
544 | | |
545 | | void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, |
546 | 6.40k | const uint8_t *above, const uint8_t *left) { |
547 | 6.40k | (void)above; |
548 | 6.40k | __m128i sum_left = dc_sum_4(left); |
549 | 6.40k | const __m128i two = _mm_set1_epi16(2); |
550 | 6.40k | sum_left = _mm_add_epi16(sum_left, two); |
551 | 6.40k | sum_left = _mm_srai_epi16(sum_left, 2); |
552 | 6.40k | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
553 | 6.40k | const __m128i row = _mm_shufflelo_epi16(sum_left, 0); |
554 | 6.40k | dc_store_8xh(&row, 4, dst, stride); |
555 | 6.40k | } |
556 | | |
557 | | void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, |
558 | | const uint8_t *above, |
559 | 2.84k | const uint8_t *left) { |
560 | 2.84k | (void)above; |
561 | 2.84k | __m128i sum_left = dc_sum_16_sse2(left); |
562 | 2.84k | const __m128i eight = _mm_set1_epi16(8); |
563 | 2.84k | sum_left = _mm_add_epi16(sum_left, eight); |
564 | 2.84k | sum_left = _mm_srai_epi16(sum_left, 4); |
565 | 2.84k | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
566 | 2.84k | const __m128i row = _mm_shufflelo_epi16(sum_left, 0); |
567 | 2.84k | dc_store_8xh(&row, 16, dst, stride); |
568 | 2.84k | } |
569 | | |
570 | | void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, |
571 | | const uint8_t *above, |
572 | 16.6k | const uint8_t *left) { |
573 | 16.6k | (void)above; |
574 | 16.6k | __m128i sum_left = dc_sum_32_sse2(left); |
575 | 16.6k | const __m128i sixteen = _mm_set1_epi16(16); |
576 | 16.6k | sum_left = _mm_add_epi16(sum_left, sixteen); |
577 | 16.6k | sum_left = _mm_srai_epi16(sum_left, 5); |
578 | 16.6k | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
579 | 16.6k | const __m128i row = _mm_shufflelo_epi16(sum_left, 0); |
580 | 16.6k | dc_store_8xh(&row, 32, dst, stride); |
581 | 16.6k | } |
582 | | |
583 | | void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, |
584 | | const uint8_t *above, |
585 | 4.43k | const uint8_t *left) { |
586 | 4.43k | (void)above; |
587 | 4.43k | __m128i sum_left = dc_sum_4(left); |
588 | 4.43k | const __m128i two = _mm_set1_epi16(2); |
589 | 4.43k | sum_left = _mm_add_epi16(sum_left, two); |
590 | 4.43k | sum_left = _mm_srai_epi16(sum_left, 2); |
591 | 4.43k | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
592 | 4.43k | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
593 | 4.43k | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
594 | 4.43k | dc_store_16xh(&row, 4, dst, stride); |
595 | 4.43k | } |
596 | | |
597 | | void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, |
598 | | const uint8_t *above, |
599 | 9.46k | const uint8_t *left) { |
600 | 9.46k | (void)above; |
601 | 9.46k | __m128i sum_left = dc_sum_8(left); |
602 | 9.46k | const __m128i four = _mm_set1_epi16(4); |
603 | 9.46k | sum_left = _mm_add_epi16(sum_left, four); |
604 | 9.46k | sum_left = _mm_srai_epi16(sum_left, 3); |
605 | 9.46k | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
606 | 9.46k | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
607 | 9.46k | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
608 | 9.46k | dc_store_16xh(&row, 8, dst, stride); |
609 | 9.46k | } |
610 | | |
611 | | void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, |
612 | | const uint8_t *above, |
613 | 4.43k | const uint8_t *left) { |
614 | 4.43k | (void)above; |
615 | 4.43k | __m128i sum_left = dc_sum_32_sse2(left); |
616 | 4.43k | const __m128i sixteen = _mm_set1_epi16(16); |
617 | 4.43k | sum_left = _mm_add_epi16(sum_left, sixteen); |
618 | 4.43k | sum_left = _mm_srai_epi16(sum_left, 5); |
619 | 4.43k | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
620 | 4.43k | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
621 | 4.43k | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
622 | 4.43k | dc_store_16xh(&row, 32, dst, stride); |
623 | 4.43k | } |
624 | | |
625 | | void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, |
626 | | const uint8_t *above, |
627 | 1.94k | const uint8_t *left) { |
628 | 1.94k | (void)above; |
629 | 1.94k | __m128i sum_left = dc_sum_64(left); |
630 | 1.94k | const __m128i thirtytwo = _mm_set1_epi16(32); |
631 | 1.94k | sum_left = _mm_add_epi16(sum_left, thirtytwo); |
632 | 1.94k | sum_left = _mm_srai_epi16(sum_left, 6); |
633 | 1.94k | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
634 | 1.94k | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
635 | 1.94k | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
636 | 1.94k | dc_store_16xh(&row, 64, dst, stride); |
637 | 1.94k | } |
638 | | |
639 | | void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, |
640 | | const uint8_t *above, |
641 | 3.91k | const uint8_t *left) { |
642 | 3.91k | (void)above; |
643 | 3.91k | __m128i sum_left = dc_sum_8(left); |
644 | 3.91k | const __m128i four = _mm_set1_epi16(4); |
645 | 3.91k | sum_left = _mm_add_epi16(sum_left, four); |
646 | 3.91k | sum_left = _mm_srai_epi16(sum_left, 3); |
647 | 3.91k | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
648 | 3.91k | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
649 | 3.91k | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
650 | 3.91k | dc_store_32xh(&row, 8, dst, stride); |
651 | 3.91k | } |
652 | | |
653 | | void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, |
654 | | const uint8_t *above, |
655 | 0 | const uint8_t *left) { |
656 | 0 | (void)above; |
657 | 0 | __m128i sum_left = dc_sum_16_sse2(left); |
658 | 0 | const __m128i eight = _mm_set1_epi16(8); |
659 | 0 | sum_left = _mm_add_epi16(sum_left, eight); |
660 | 0 | sum_left = _mm_srai_epi16(sum_left, 4); |
661 | 0 | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
662 | 0 | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
663 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
664 | 0 | dc_store_32xh(&row, 16, dst, stride); |
665 | 0 | } |
666 | | |
667 | | void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, |
668 | | const uint8_t *above, |
669 | 0 | const uint8_t *left) { |
670 | 0 | (void)above; |
671 | 0 | __m128i sum_left = dc_sum_64(left); |
672 | 0 | const __m128i thirtytwo = _mm_set1_epi16(32); |
673 | 0 | sum_left = _mm_add_epi16(sum_left, thirtytwo); |
674 | 0 | sum_left = _mm_srai_epi16(sum_left, 6); |
675 | 0 | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
676 | 0 | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
677 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
678 | 0 | dc_store_32xh(&row, 64, dst, stride); |
679 | 0 | } |
680 | | |
681 | | void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, |
682 | | const uint8_t *above, |
683 | 0 | const uint8_t *left) { |
684 | 0 | (void)above; |
685 | 0 | __m128i sum_left = dc_sum_64(left); |
686 | 0 | const __m128i thirtytwo = _mm_set1_epi16(32); |
687 | 0 | sum_left = _mm_add_epi16(sum_left, thirtytwo); |
688 | 0 | sum_left = _mm_srai_epi16(sum_left, 6); |
689 | 0 | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
690 | 0 | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
691 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
692 | 0 | dc_store_64xh(&row, 64, dst, stride); |
693 | 0 | } |
694 | | |
695 | | void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, |
696 | | const uint8_t *above, |
697 | 0 | const uint8_t *left) { |
698 | 0 | (void)above; |
699 | 0 | __m128i sum_left = dc_sum_32_sse2(left); |
700 | 0 | const __m128i sixteen = _mm_set1_epi16(16); |
701 | 0 | sum_left = _mm_add_epi16(sum_left, sixteen); |
702 | 0 | sum_left = _mm_srai_epi16(sum_left, 5); |
703 | 0 | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
704 | 0 | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
705 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
706 | 0 | dc_store_64xh(&row, 32, dst, stride); |
707 | 0 | } |
708 | | |
709 | | void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, |
710 | | const uint8_t *above, |
711 | 0 | const uint8_t *left) { |
712 | 0 | (void)above; |
713 | 0 | __m128i sum_left = dc_sum_16_sse2(left); |
714 | 0 | const __m128i eight = _mm_set1_epi16(8); |
715 | 0 | sum_left = _mm_add_epi16(sum_left, eight); |
716 | 0 | sum_left = _mm_srai_epi16(sum_left, 4); |
717 | 0 | sum_left = _mm_unpacklo_epi8(sum_left, sum_left); |
718 | 0 | sum_left = _mm_shufflelo_epi16(sum_left, 0); |
719 | 0 | const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); |
720 | 0 | dc_store_64xh(&row, 16, dst, stride); |
721 | 0 | } |
722 | | |
723 | | // ----------------------------------------------------------------------------- |
724 | | // DC_128 |
725 | | |
726 | | void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, |
727 | 1.02k | const uint8_t *above, const uint8_t *left) { |
728 | 1.02k | (void)above; |
729 | 1.02k | (void)left; |
730 | 1.02k | const uint32_t pred = 0x80808080; |
731 | 1.02k | dc_store_4xh(pred, 8, dst, stride); |
732 | 1.02k | } |
733 | | |
734 | | void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, |
735 | 341 | const uint8_t *above, const uint8_t *left) { |
736 | 341 | (void)above; |
737 | 341 | (void)left; |
738 | 341 | const uint32_t pred = 0x80808080; |
739 | 341 | dc_store_4xh(pred, 16, dst, stride); |
740 | 341 | } |
741 | | |
742 | | void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, |
743 | 150 | const uint8_t *above, const uint8_t *left) { |
744 | 150 | (void)above; |
745 | 150 | (void)left; |
746 | 150 | const __m128i row = _mm_set1_epi8((int8_t)128); |
747 | 150 | dc_store_8xh(&row, 4, dst, stride); |
748 | 150 | } |
749 | | |
750 | | void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, |
751 | 2.99k | const uint8_t *above, const uint8_t *left) { |
752 | 2.99k | (void)above; |
753 | 2.99k | (void)left; |
754 | 2.99k | const __m128i row = _mm_set1_epi8((int8_t)128); |
755 | 2.99k | dc_store_8xh(&row, 16, dst, stride); |
756 | 2.99k | } |
757 | | |
758 | | void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, |
759 | 269 | const uint8_t *above, const uint8_t *left) { |
760 | 269 | (void)above; |
761 | 269 | (void)left; |
762 | 269 | const __m128i row = _mm_set1_epi8((int8_t)128); |
763 | 269 | dc_store_8xh(&row, 32, dst, stride); |
764 | 269 | } |
765 | | |
766 | | void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, |
767 | 174 | const uint8_t *above, const uint8_t *left) { |
768 | 174 | (void)above; |
769 | 174 | (void)left; |
770 | 174 | const __m128i row = _mm_set1_epi8((int8_t)128); |
771 | 174 | dc_store_16xh(&row, 4, dst, stride); |
772 | 174 | } |
773 | | |
774 | | void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, |
775 | 423 | const uint8_t *above, const uint8_t *left) { |
776 | 423 | (void)above; |
777 | 423 | (void)left; |
778 | 423 | const __m128i row = _mm_set1_epi8((int8_t)128); |
779 | 423 | dc_store_16xh(&row, 8, dst, stride); |
780 | 423 | } |
781 | | |
782 | | void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, |
783 | | const uint8_t *above, |
784 | 2.31k | const uint8_t *left) { |
785 | 2.31k | (void)above; |
786 | 2.31k | (void)left; |
787 | 2.31k | const __m128i row = _mm_set1_epi8((int8_t)128); |
788 | 2.31k | dc_store_16xh(&row, 32, dst, stride); |
789 | 2.31k | } |
790 | | |
791 | | void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, |
792 | | const uint8_t *above, |
793 | 1.51k | const uint8_t *left) { |
794 | 1.51k | (void)above; |
795 | 1.51k | (void)left; |
796 | 1.51k | const __m128i row = _mm_set1_epi8((int8_t)128); |
797 | 1.51k | dc_store_16xh(&row, 64, dst, stride); |
798 | 1.51k | } |
799 | | |
800 | | void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, |
801 | 191 | const uint8_t *above, const uint8_t *left) { |
802 | 191 | (void)above; |
803 | 191 | (void)left; |
804 | 191 | const __m128i row = _mm_set1_epi8((int8_t)128); |
805 | 191 | dc_store_32xh(&row, 8, dst, stride); |
806 | 191 | } |
807 | | |
808 | | void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, |
809 | | const uint8_t *above, |
810 | 0 | const uint8_t *left) { |
811 | 0 | (void)above; |
812 | 0 | (void)left; |
813 | 0 | const __m128i row = _mm_set1_epi8((int8_t)128); |
814 | 0 | dc_store_32xh(&row, 16, dst, stride); |
815 | 0 | } |
816 | | |
817 | | void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, |
818 | | const uint8_t *above, |
819 | 0 | const uint8_t *left) { |
820 | 0 | (void)above; |
821 | 0 | (void)left; |
822 | 0 | const __m128i row = _mm_set1_epi8((int8_t)128); |
823 | 0 | dc_store_32xh(&row, 64, dst, stride); |
824 | 0 | } |
825 | | |
826 | | void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, |
827 | | const uint8_t *above, |
828 | 0 | const uint8_t *left) { |
829 | 0 | (void)above; |
830 | 0 | (void)left; |
831 | 0 | const __m128i row = _mm_set1_epi8((int8_t)128); |
832 | 0 | dc_store_64xh(&row, 64, dst, stride); |
833 | 0 | } |
834 | | |
835 | | void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, |
836 | | const uint8_t *above, |
837 | 0 | const uint8_t *left) { |
838 | 0 | (void)above; |
839 | 0 | (void)left; |
840 | 0 | const __m128i row = _mm_set1_epi8((int8_t)128); |
841 | 0 | dc_store_64xh(&row, 32, dst, stride); |
842 | 0 | } |
843 | | |
844 | | void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, |
845 | | const uint8_t *above, |
846 | 0 | const uint8_t *left) { |
847 | 0 | (void)above; |
848 | 0 | (void)left; |
849 | 0 | const __m128i row = _mm_set1_epi8((int8_t)128); |
850 | 0 | dc_store_64xh(&row, 16, dst, stride); |
851 | 0 | } |
852 | | |
853 | | // ----------------------------------------------------------------------------- |
854 | | // V_PRED |
855 | | |
856 | | void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, |
857 | 29.7k | const uint8_t *above, const uint8_t *left) { |
858 | 29.7k | const uint32_t pred = *(uint32_t *)above; |
859 | 29.7k | (void)left; |
860 | 29.7k | dc_store_4xh(pred, 8, dst, stride); |
861 | 29.7k | } |
862 | | |
863 | | void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, |
864 | 7.38k | const uint8_t *above, const uint8_t *left) { |
865 | 7.38k | const uint32_t pred = *(uint32_t *)above; |
866 | 7.38k | (void)left; |
867 | 7.38k | dc_store_4xh(pred, 16, dst, stride); |
868 | 7.38k | } |
869 | | |
870 | | void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, |
871 | 44.2k | const uint8_t *above, const uint8_t *left) { |
872 | 44.2k | const __m128i row = _mm_loadl_epi64((__m128i const *)above); |
873 | 44.2k | (void)left; |
874 | 44.2k | dc_store_8xh(&row, 4, dst, stride); |
875 | 44.2k | } |
876 | | |
877 | | void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, |
878 | 22.1k | const uint8_t *above, const uint8_t *left) { |
879 | 22.1k | const __m128i row = _mm_loadl_epi64((__m128i const *)above); |
880 | 22.1k | (void)left; |
881 | 22.1k | dc_store_8xh(&row, 16, dst, stride); |
882 | 22.1k | } |
883 | | |
884 | | void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, |
885 | 4.09k | const uint8_t *above, const uint8_t *left) { |
886 | 4.09k | const __m128i row = _mm_loadl_epi64((__m128i const *)above); |
887 | 4.09k | (void)left; |
888 | 4.09k | dc_store_8xh(&row, 32, dst, stride); |
889 | 4.09k | } |
890 | | |
891 | | void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, |
892 | 41.2k | const uint8_t *above, const uint8_t *left) { |
893 | 41.2k | const __m128i row = _mm_load_si128((__m128i const *)above); |
894 | 41.2k | (void)left; |
895 | 41.2k | dc_store_16xh(&row, 4, dst, stride); |
896 | 41.2k | } |
897 | | |
898 | | void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, |
899 | 44.0k | const uint8_t *above, const uint8_t *left) { |
900 | 44.0k | const __m128i row = _mm_load_si128((__m128i const *)above); |
901 | 44.0k | (void)left; |
902 | 44.0k | dc_store_16xh(&row, 8, dst, stride); |
903 | 44.0k | } |
904 | | |
905 | | void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, |
906 | 8.85k | const uint8_t *above, const uint8_t *left) { |
907 | 8.85k | const __m128i row = _mm_load_si128((__m128i const *)above); |
908 | 8.85k | (void)left; |
909 | 8.85k | dc_store_16xh(&row, 32, dst, stride); |
910 | 8.85k | } |
911 | | |
912 | | void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, |
913 | 1.49k | const uint8_t *above, const uint8_t *left) { |
914 | 1.49k | const __m128i row = _mm_load_si128((__m128i const *)above); |
915 | 1.49k | (void)left; |
916 | 1.49k | dc_store_16xh(&row, 64, dst, stride); |
917 | 1.49k | } |
918 | | |
919 | | static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride, |
920 | 16.8k | const uint8_t *above, int height) { |
921 | 16.8k | const __m128i row0 = _mm_load_si128((__m128i const *)above); |
922 | 16.8k | const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); |
923 | 151k | for (int i = 0; i < height; ++i) { |
924 | 134k | _mm_store_si128((__m128i *)dst, row0); |
925 | 134k | _mm_store_si128((__m128i *)(dst + 16), row1); |
926 | 134k | dst += stride; |
927 | 134k | } |
928 | 16.8k | } |
929 | | |
930 | | void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, |
931 | 16.8k | const uint8_t *above, const uint8_t *left) { |
932 | 16.8k | (void)left; |
933 | 16.8k | v_predictor_32xh(dst, stride, above, 8); |
934 | 16.8k | } |
935 | | |
936 | | void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, |
937 | 0 | const uint8_t *above, const uint8_t *left) { |
938 | 0 | (void)left; |
939 | 0 | v_predictor_32xh(dst, stride, above, 16); |
940 | 0 | } |
941 | | |
942 | | void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, |
943 | 0 | const uint8_t *above, const uint8_t *left) { |
944 | 0 | (void)left; |
945 | 0 | v_predictor_32xh(dst, stride, above, 64); |
946 | 0 | } |
947 | | |
948 | | static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride, |
949 | 0 | const uint8_t *above, int height) { |
950 | 0 | const __m128i row0 = _mm_load_si128((__m128i const *)above); |
951 | 0 | const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); |
952 | 0 | const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32)); |
953 | 0 | const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48)); |
954 | 0 | for (int i = 0; i < height; ++i) { |
955 | 0 | _mm_store_si128((__m128i *)dst, row0); |
956 | 0 | _mm_store_si128((__m128i *)(dst + 16), row1); |
957 | 0 | _mm_store_si128((__m128i *)(dst + 32), row2); |
958 | 0 | _mm_store_si128((__m128i *)(dst + 48), row3); |
959 | 0 | dst += stride; |
960 | 0 | } |
961 | 0 | } |
962 | | |
963 | | void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, |
964 | 0 | const uint8_t *above, const uint8_t *left) { |
965 | 0 | (void)left; |
966 | 0 | v_predictor_64xh(dst, stride, above, 64); |
967 | 0 | } |
968 | | |
969 | | void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, |
970 | 0 | const uint8_t *above, const uint8_t *left) { |
971 | 0 | (void)left; |
972 | 0 | v_predictor_64xh(dst, stride, above, 32); |
973 | 0 | } |
974 | | |
975 | | void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, |
976 | 0 | const uint8_t *above, const uint8_t *left) { |
977 | 0 | (void)left; |
978 | 0 | v_predictor_64xh(dst, stride, above, 16); |
979 | 0 | } |
980 | | |
981 | | // ----------------------------------------------------------------------------- |
982 | | // H_PRED |
983 | | |
984 | | void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, |
985 | 51.0k | const uint8_t *above, const uint8_t *left) { |
986 | 51.0k | (void)above; |
987 | 51.0k | __m128i left_col = _mm_loadl_epi64((__m128i const *)left); |
988 | 51.0k | left_col = _mm_unpacklo_epi8(left_col, left_col); |
989 | 51.0k | __m128i row0 = _mm_shufflelo_epi16(left_col, 0); |
990 | 51.0k | __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); |
991 | 51.0k | __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); |
992 | 51.0k | __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); |
993 | 51.0k | *(int *)dst = _mm_cvtsi128_si32(row0); |
994 | 51.0k | dst += stride; |
995 | 51.0k | *(int *)dst = _mm_cvtsi128_si32(row1); |
996 | 51.0k | dst += stride; |
997 | 51.0k | *(int *)dst = _mm_cvtsi128_si32(row2); |
998 | 51.0k | dst += stride; |
999 | 51.0k | *(int *)dst = _mm_cvtsi128_si32(row3); |
1000 | 51.0k | dst += stride; |
1001 | 51.0k | left_col = _mm_unpackhi_epi64(left_col, left_col); |
1002 | 51.0k | row0 = _mm_shufflelo_epi16(left_col, 0); |
1003 | 51.0k | row1 = _mm_shufflelo_epi16(left_col, 0x55); |
1004 | 51.0k | row2 = _mm_shufflelo_epi16(left_col, 0xaa); |
1005 | 51.0k | row3 = _mm_shufflelo_epi16(left_col, 0xff); |
1006 | 51.0k | *(int *)dst = _mm_cvtsi128_si32(row0); |
1007 | 51.0k | dst += stride; |
1008 | 51.0k | *(int *)dst = _mm_cvtsi128_si32(row1); |
1009 | 51.0k | dst += stride; |
1010 | 51.0k | *(int *)dst = _mm_cvtsi128_si32(row2); |
1011 | 51.0k | dst += stride; |
1012 | 51.0k | *(int *)dst = _mm_cvtsi128_si32(row3); |
1013 | 51.0k | } |
1014 | | |
1015 | | void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, |
1016 | 17.1k | const uint8_t *above, const uint8_t *left) { |
1017 | 17.1k | (void)above; |
1018 | 17.1k | const __m128i left_col = _mm_load_si128((__m128i const *)left); |
1019 | 17.1k | __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); |
1020 | 17.1k | __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); |
1021 | | |
1022 | 17.1k | __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); |
1023 | 17.1k | __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); |
1024 | 17.1k | __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); |
1025 | 17.1k | __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff); |
1026 | 17.1k | *(int *)dst = _mm_cvtsi128_si32(row0); |
1027 | 17.1k | dst += stride; |
1028 | 17.1k | *(int *)dst = _mm_cvtsi128_si32(row1); |
1029 | 17.1k | dst += stride; |
1030 | 17.1k | *(int *)dst = _mm_cvtsi128_si32(row2); |
1031 | 17.1k | dst += stride; |
1032 | 17.1k | *(int *)dst = _mm_cvtsi128_si32(row3); |
1033 | 17.1k | dst += stride; |
1034 | | |
1035 | 17.1k | left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); |
1036 | 17.1k | row0 = _mm_shufflelo_epi16(left_col_low, 0); |
1037 | 17.1k | row1 = _mm_shufflelo_epi16(left_col_low, 0x55); |
1038 | 17.1k | row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); |
1039 | 17.1k | row3 = _mm_shufflelo_epi16(left_col_low, 0xff); |
1040 | 17.1k | *(int *)dst = _mm_cvtsi128_si32(row0); |
1041 | 17.1k | dst += stride; |
1042 | 17.1k | *(int *)dst = _mm_cvtsi128_si32(row1); |
1043 | 17.1k | dst += stride; |
1044 | 17.1k | *(int *)dst = _mm_cvtsi128_si32(row2); |
1045 | 17.1k | dst += stride; |
1046 | 17.1k | *(int *)dst = _mm_cvtsi128_si32(row3); |
1047 | 17.1k | dst += stride; |
1048 | | |
1049 | 17.1k | row0 = _mm_shufflelo_epi16(left_col_high, 0); |
1050 | 17.1k | row1 = _mm_shufflelo_epi16(left_col_high, 0x55); |
1051 | 17.1k | row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); |
1052 | 17.1k | row3 = _mm_shufflelo_epi16(left_col_high, 0xff); |
1053 | 17.1k | *(int *)dst = _mm_cvtsi128_si32(row0); |
1054 | 17.1k | dst += stride; |
1055 | 17.1k | *(int *)dst = _mm_cvtsi128_si32(row1); |
1056 | 17.1k | dst += stride; |
1057 | 17.1k | *(int *)dst = _mm_cvtsi128_si32(row2); |
1058 | 17.1k | dst += stride; |
1059 | 17.1k | *(int *)dst = _mm_cvtsi128_si32(row3); |
1060 | 17.1k | dst += stride; |
1061 | | |
1062 | 17.1k | left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); |
1063 | 17.1k | row0 = _mm_shufflelo_epi16(left_col_high, 0); |
1064 | 17.1k | row1 = _mm_shufflelo_epi16(left_col_high, 0x55); |
1065 | 17.1k | row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); |
1066 | 17.1k | row3 = _mm_shufflelo_epi16(left_col_high, 0xff); |
1067 | 17.1k | *(int *)dst = _mm_cvtsi128_si32(row0); |
1068 | 17.1k | dst += stride; |
1069 | 17.1k | *(int *)dst = _mm_cvtsi128_si32(row1); |
1070 | 17.1k | dst += stride; |
1071 | 17.1k | *(int *)dst = _mm_cvtsi128_si32(row2); |
1072 | 17.1k | dst += stride; |
1073 | 17.1k | *(int *)dst = _mm_cvtsi128_si32(row3); |
1074 | 17.1k | } |
1075 | | |
1076 | | void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, |
1077 | 96.0k | const uint8_t *above, const uint8_t *left) { |
1078 | 96.0k | (void)above; |
1079 | 96.0k | __m128i left_col = _mm_loadl_epi64((__m128i const *)left); |
1080 | 96.0k | left_col = _mm_unpacklo_epi8(left_col, left_col); |
1081 | 96.0k | __m128i row0 = _mm_shufflelo_epi16(left_col, 0); |
1082 | 96.0k | __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); |
1083 | 96.0k | __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); |
1084 | 96.0k | __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); |
1085 | 96.0k | _mm_storel_epi64((__m128i *)dst, row0); |
1086 | 96.0k | dst += stride; |
1087 | 96.0k | _mm_storel_epi64((__m128i *)dst, row1); |
1088 | 96.0k | dst += stride; |
1089 | 96.0k | _mm_storel_epi64((__m128i *)dst, row2); |
1090 | 96.0k | dst += stride; |
1091 | 96.0k | _mm_storel_epi64((__m128i *)dst, row3); |
1092 | 96.0k | } |
1093 | | |
1094 | | static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride, |
1095 | | const uint8_t *above, const uint8_t *left, |
1096 | 67.3k | int count) { |
1097 | 67.3k | (void)above; |
1098 | 147k | for (int i = 0; i < count; ++i) { |
1099 | 80.0k | const __m128i left_col = _mm_load_si128((__m128i const *)left); |
1100 | 80.0k | __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); |
1101 | 80.0k | __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); |
1102 | | |
1103 | 80.0k | __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); |
1104 | 80.0k | __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); |
1105 | 80.0k | __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); |
1106 | 80.0k | __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff); |
1107 | 80.0k | _mm_storel_epi64((__m128i *)dst, row0); |
1108 | 80.0k | dst += stride; |
1109 | 80.0k | _mm_storel_epi64((__m128i *)dst, row1); |
1110 | 80.0k | dst += stride; |
1111 | 80.0k | _mm_storel_epi64((__m128i *)dst, row2); |
1112 | 80.0k | dst += stride; |
1113 | 80.0k | _mm_storel_epi64((__m128i *)dst, row3); |
1114 | 80.0k | dst += stride; |
1115 | | |
1116 | 80.0k | left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); |
1117 | 80.0k | row0 = _mm_shufflelo_epi16(left_col_low, 0); |
1118 | 80.0k | row1 = _mm_shufflelo_epi16(left_col_low, 0x55); |
1119 | 80.0k | row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); |
1120 | 80.0k | row3 = _mm_shufflelo_epi16(left_col_low, 0xff); |
1121 | 80.0k | _mm_storel_epi64((__m128i *)dst, row0); |
1122 | 80.0k | dst += stride; |
1123 | 80.0k | _mm_storel_epi64((__m128i *)dst, row1); |
1124 | 80.0k | dst += stride; |
1125 | 80.0k | _mm_storel_epi64((__m128i *)dst, row2); |
1126 | 80.0k | dst += stride; |
1127 | 80.0k | _mm_storel_epi64((__m128i *)dst, row3); |
1128 | 80.0k | dst += stride; |
1129 | | |
1130 | 80.0k | row0 = _mm_shufflelo_epi16(left_col_high, 0); |
1131 | 80.0k | row1 = _mm_shufflelo_epi16(left_col_high, 0x55); |
1132 | 80.0k | row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); |
1133 | 80.0k | row3 = _mm_shufflelo_epi16(left_col_high, 0xff); |
1134 | 80.0k | _mm_storel_epi64((__m128i *)dst, row0); |
1135 | 80.0k | dst += stride; |
1136 | 80.0k | _mm_storel_epi64((__m128i *)dst, row1); |
1137 | 80.0k | dst += stride; |
1138 | 80.0k | _mm_storel_epi64((__m128i *)dst, row2); |
1139 | 80.0k | dst += stride; |
1140 | 80.0k | _mm_storel_epi64((__m128i *)dst, row3); |
1141 | 80.0k | dst += stride; |
1142 | | |
1143 | 80.0k | left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); |
1144 | 80.0k | row0 = _mm_shufflelo_epi16(left_col_high, 0); |
1145 | 80.0k | row1 = _mm_shufflelo_epi16(left_col_high, 0x55); |
1146 | 80.0k | row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); |
1147 | 80.0k | row3 = _mm_shufflelo_epi16(left_col_high, 0xff); |
1148 | 80.0k | _mm_storel_epi64((__m128i *)dst, row0); |
1149 | 80.0k | dst += stride; |
1150 | 80.0k | _mm_storel_epi64((__m128i *)dst, row1); |
1151 | 80.0k | dst += stride; |
1152 | 80.0k | _mm_storel_epi64((__m128i *)dst, row2); |
1153 | 80.0k | dst += stride; |
1154 | 80.0k | _mm_storel_epi64((__m128i *)dst, row3); |
1155 | 80.0k | dst += stride; |
1156 | 80.0k | left += 16; |
1157 | 80.0k | } |
1158 | 67.3k | } |
1159 | | |
1160 | | void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, |
1161 | 54.6k | const uint8_t *above, const uint8_t *left) { |
1162 | 54.6k | h_predictor_8x16xc(dst, stride, above, left, 1); |
1163 | 54.6k | } |
1164 | | |
1165 | | void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, |
1166 | 12.6k | const uint8_t *above, const uint8_t *left) { |
1167 | 12.6k | h_predictor_8x16xc(dst, stride, above, left, 2); |
1168 | 12.6k | } |
1169 | | |
1170 | | static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst, |
1171 | 742k | ptrdiff_t stride) { |
1172 | 742k | int i; |
1173 | 3.71M | for (i = 0; i < h; ++i) { |
1174 | 2.96M | _mm_store_si128((__m128i *)dst, row[i]); |
1175 | 2.96M | dst += stride; |
1176 | 2.96M | } |
1177 | 742k | } |
1178 | | |
1179 | 923k | static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) { |
1180 | 923k | const __m128i u0 = _mm_shufflelo_epi16(*x, 0); |
1181 | 923k | const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55); |
1182 | 923k | const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa); |
1183 | 923k | const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff); |
1184 | | |
1185 | 923k | row[0] = _mm_unpacklo_epi64(u0, u0); |
1186 | 923k | row[1] = _mm_unpacklo_epi64(u1, u1); |
1187 | 923k | row[2] = _mm_unpacklo_epi64(u2, u2); |
1188 | 923k | row[3] = _mm_unpacklo_epi64(u3, u3); |
1189 | 923k | } |
1190 | | |
1191 | 694k | static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) { |
1192 | 694k | const __m128i u0 = _mm_shufflehi_epi16(*x, 0); |
1193 | 694k | const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55); |
1194 | 694k | const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa); |
1195 | 694k | const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff); |
1196 | | |
1197 | 694k | row[0] = _mm_unpackhi_epi64(u0, u0); |
1198 | 694k | row[1] = _mm_unpackhi_epi64(u1, u1); |
1199 | 694k | row[2] = _mm_unpackhi_epi64(u2, u2); |
1200 | 694k | row[3] = _mm_unpackhi_epi64(u3, u3); |
1201 | 694k | } |
1202 | | |
1203 | | // Process 16x8, first 4 rows |
1204 | | // Use first 8 bytes of left register: xxxxxxxx33221100 |
1205 | | static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst, |
1206 | 485k | ptrdiff_t stride) { |
1207 | 485k | __m128i row[4]; |
1208 | 485k | repeat_low_4pixels(left, row); |
1209 | 485k | h_pred_store_16xh(row, 4, dst, stride); |
1210 | 485k | } |
1211 | | |
1212 | | // Process 16x8, second 4 rows |
1213 | | // Use second 8 bytes of left register: 77665544xxxxxxxx |
1214 | | static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst, |
1215 | 256k | ptrdiff_t stride) { |
1216 | 256k | __m128i row[4]; |
1217 | 256k | repeat_high_4pixels(left, row); |
1218 | 256k | h_pred_store_16xh(row, 4, dst, stride); |
1219 | 256k | } |
1220 | | |
1221 | | void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, |
1222 | 229k | const uint8_t *above, const uint8_t *left) { |
1223 | 229k | (void)above; |
1224 | 229k | const __m128i left_col = _mm_loadl_epi64((const __m128i *)left); |
1225 | 229k | const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col); |
1226 | 229k | h_prediction_16x8_1(&left_col_8p, dst, stride); |
1227 | 229k | } |
1228 | | |
1229 | | void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, |
1230 | 130k | const uint8_t *above, const uint8_t *left) { |
1231 | 130k | (void)above; |
1232 | 130k | const __m128i left_col = _mm_loadl_epi64((const __m128i *)left); |
1233 | 130k | const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col); |
1234 | 130k | h_prediction_16x8_1(&left_col_8p, dst, stride); |
1235 | 130k | dst += stride << 2; |
1236 | 130k | h_prediction_16x8_2(&left_col_8p, dst, stride); |
1237 | 130k | } |
1238 | | |
1239 | | static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride, |
1240 | 29.4k | const uint8_t *left, int count) { |
1241 | 29.4k | int i = 0; |
1242 | 63.1k | do { |
1243 | 63.1k | const __m128i left_col = _mm_load_si128((const __m128i *)left); |
1244 | 63.1k | const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col); |
1245 | 63.1k | h_prediction_16x8_1(&left_col_8p_lo, dst, stride); |
1246 | 63.1k | dst += stride << 2; |
1247 | 63.1k | h_prediction_16x8_2(&left_col_8p_lo, dst, stride); |
1248 | 63.1k | dst += stride << 2; |
1249 | | |
1250 | 63.1k | const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col); |
1251 | 63.1k | h_prediction_16x8_1(&left_col_8p_hi, dst, stride); |
1252 | 63.1k | dst += stride << 2; |
1253 | 63.1k | h_prediction_16x8_2(&left_col_8p_hi, dst, stride); |
1254 | 63.1k | dst += stride << 2; |
1255 | | |
1256 | 63.1k | left += 16; |
1257 | 63.1k | i++; |
1258 | 63.1k | } while (i < count); |
1259 | 29.4k | } |
1260 | | |
1261 | | void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, |
1262 | 27.2k | const uint8_t *above, const uint8_t *left) { |
1263 | 27.2k | (void)above; |
1264 | 27.2k | h_predictor_16xh(dst, stride, left, 2); |
1265 | 27.2k | } |
1266 | | |
1267 | | void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, |
1268 | 2.16k | const uint8_t *above, const uint8_t *left) { |
1269 | 2.16k | (void)above; |
1270 | 2.16k | h_predictor_16xh(dst, stride, left, 4); |
1271 | 2.16k | } |
1272 | | |
1273 | | static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst, |
1274 | 876k | ptrdiff_t stride) { |
1275 | 876k | int i; |
1276 | 4.38M | for (i = 0; i < h; ++i) { |
1277 | 3.50M | _mm_store_si128((__m128i *)dst, row[i]); |
1278 | 3.50M | _mm_store_si128((__m128i *)(dst + 16), row[i]); |
1279 | 3.50M | dst += stride; |
1280 | 3.50M | } |
1281 | 876k | } |
1282 | | |
1283 | | // Process 32x8, first 4 rows |
1284 | | // Use first 8 bytes of left register: xxxxxxxx33221100 |
1285 | | static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst, |
1286 | 438k | ptrdiff_t stride) { |
1287 | 438k | __m128i row[4]; |
1288 | 438k | repeat_low_4pixels(left, row); |
1289 | 438k | h_pred_store_32xh(row, 4, dst, stride); |
1290 | 438k | } |
1291 | | |
1292 | | // Process 32x8, second 4 rows |
1293 | | // Use second 8 bytes of left register: 77665544xxxxxxxx |
1294 | | static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst, |
1295 | 438k | ptrdiff_t stride) { |
1296 | 438k | __m128i row[4]; |
1297 | 438k | repeat_high_4pixels(left, row); |
1298 | 438k | h_pred_store_32xh(row, 4, dst, stride); |
1299 | 438k | } |
1300 | | |
1301 | | void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, |
1302 | 274k | const uint8_t *above, const uint8_t *left) { |
1303 | 274k | __m128i left_col, left_col_8p; |
1304 | 274k | (void)above; |
1305 | | |
1306 | 274k | left_col = _mm_load_si128((const __m128i *)left); |
1307 | | |
1308 | 274k | left_col_8p = _mm_unpacklo_epi8(left_col, left_col); |
1309 | 274k | h_prediction_32x8_1(&left_col_8p, dst, stride); |
1310 | 274k | dst += stride << 2; |
1311 | 274k | h_prediction_32x8_2(&left_col_8p, dst, stride); |
1312 | 274k | } |
1313 | | |
1314 | | void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, |
1315 | 81.6k | const uint8_t *above, const uint8_t *left) { |
1316 | 81.6k | __m128i left_col, left_col_8p; |
1317 | 81.6k | (void)above; |
1318 | | |
1319 | 81.6k | left_col = _mm_load_si128((const __m128i *)left); |
1320 | | |
1321 | 81.6k | left_col_8p = _mm_unpacklo_epi8(left_col, left_col); |
1322 | 81.6k | h_prediction_32x8_1(&left_col_8p, dst, stride); |
1323 | 81.6k | dst += stride << 2; |
1324 | 81.6k | h_prediction_32x8_2(&left_col_8p, dst, stride); |
1325 | 81.6k | dst += stride << 2; |
1326 | | |
1327 | 81.6k | left_col_8p = _mm_unpackhi_epi8(left_col, left_col); |
1328 | 81.6k | h_prediction_32x8_1(&left_col_8p, dst, stride); |
1329 | 81.6k | dst += stride << 2; |
1330 | 81.6k | h_prediction_32x8_2(&left_col_8p, dst, stride); |
1331 | 81.6k | } |
1332 | | |
1333 | | static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride, |
1334 | 7.29k | const uint8_t *left, int height) { |
1335 | 7.29k | int i = height >> 2; |
1336 | 116k | do { |
1337 | 116k | __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]); |
1338 | 116k | left4 = _mm_unpacklo_epi8(left4, left4); |
1339 | 116k | left4 = _mm_unpacklo_epi8(left4, left4); |
1340 | 116k | const __m128i r0 = _mm_shuffle_epi32(left4, 0x0); |
1341 | 116k | const __m128i r1 = _mm_shuffle_epi32(left4, 0x55); |
1342 | 116k | _mm_store_si128((__m128i *)dst, r0); |
1343 | 116k | _mm_store_si128((__m128i *)(dst + 16), r0); |
1344 | 116k | _mm_store_si128((__m128i *)(dst + stride), r1); |
1345 | 116k | _mm_store_si128((__m128i *)(dst + stride + 16), r1); |
1346 | 116k | const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa); |
1347 | 116k | const __m128i r3 = _mm_shuffle_epi32(left4, 0xff); |
1348 | 116k | _mm_store_si128((__m128i *)(dst + stride * 2), r2); |
1349 | 116k | _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2); |
1350 | 116k | _mm_store_si128((__m128i *)(dst + stride * 3), r3); |
1351 | 116k | _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3); |
1352 | 116k | left += 4; |
1353 | 116k | dst += stride * 4; |
1354 | 116k | } while (--i); |
1355 | 7.29k | } |
1356 | | |
1357 | | void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, |
1358 | 7.29k | const uint8_t *above, const uint8_t *left) { |
1359 | 7.29k | (void)above; |
1360 | 7.29k | h_predictor_32xh(dst, stride, left, 64); |
1361 | 7.29k | } |
1362 | | |
1363 | | static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride, |
1364 | 136k | const uint8_t *left, int height) { |
1365 | 136k | int i = height >> 2; |
1366 | 930k | do { |
1367 | 930k | __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]); |
1368 | 930k | left4 = _mm_unpacklo_epi8(left4, left4); |
1369 | 930k | left4 = _mm_unpacklo_epi8(left4, left4); |
1370 | 930k | const __m128i r0 = _mm_shuffle_epi32(left4, 0x0); |
1371 | 930k | const __m128i r1 = _mm_shuffle_epi32(left4, 0x55); |
1372 | 930k | _mm_store_si128((__m128i *)dst, r0); |
1373 | 930k | _mm_store_si128((__m128i *)(dst + 16), r0); |
1374 | 930k | _mm_store_si128((__m128i *)(dst + 32), r0); |
1375 | 930k | _mm_store_si128((__m128i *)(dst + 48), r0); |
1376 | 930k | _mm_store_si128((__m128i *)(dst + stride), r1); |
1377 | 930k | _mm_store_si128((__m128i *)(dst + stride + 16), r1); |
1378 | 930k | _mm_store_si128((__m128i *)(dst + stride + 32), r1); |
1379 | 930k | _mm_store_si128((__m128i *)(dst + stride + 48), r1); |
1380 | 930k | const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa); |
1381 | 930k | const __m128i r3 = _mm_shuffle_epi32(left4, 0xff); |
1382 | 930k | _mm_store_si128((__m128i *)(dst + stride * 2), r2); |
1383 | 930k | _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2); |
1384 | 930k | _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2); |
1385 | 930k | _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2); |
1386 | 930k | _mm_store_si128((__m128i *)(dst + stride * 3), r3); |
1387 | 930k | _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3); |
1388 | 930k | _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3); |
1389 | 930k | _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3); |
1390 | 930k | left += 4; |
1391 | 930k | dst += stride * 4; |
1392 | 930k | } while (--i); |
1393 | 136k | } |
1394 | | |
// H (horizontal) intra predictor for a 64x64 block: row i of the output is
// filled with left[i].  The above row plays no part in the H predictor.
void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;  // unused by design for horizontal prediction
  h_predictor_64xh(dst, stride, left, /*height=*/64);
}
1400 | | |
// H (horizontal) intra predictor for a 64x32 block: row i of the output is
// filled with left[i].  The above row plays no part in the H predictor.
void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;  // unused by design for horizontal prediction
  h_predictor_64xh(dst, stride, left, /*height=*/32);
}
1406 | | |
// H (horizontal) intra predictor for a 64x16 block: row i of the output is
// filled with left[i].  The above row plays no part in the H predictor.
void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;  // unused by design for horizontal prediction
  h_predictor_64xh(dst, stride, left, /*height=*/16);
}