Coverage Report

Created: 2023-06-07 06:31

/src/aom/aom_dsp/x86/intrapred_sse2.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <emmintrin.h>
#include <string.h>

#include "aom_dsp/x86/intrapred_x86.h"
#include "config/aom_dsp_rtcd.h"
15
16
// Fills a 4-byte-wide column of rows with the four bytes packed in |dc|.
//   dc:     four prediction bytes, written in the machine's native byte order.
//   height: number of rows to fill. Rows are written two per iteration, so an
//           odd value writes one extra row; the callers visible in this file
//           pass 8 or 16.
//   dst:    first byte of the first row.
//   stride: byte distance between consecutive rows.
// memcpy is used instead of a uint32_t store through a cast pointer so the
// write is well defined for any alignment of |dst| and does not violate the
// effective-type (strict aliasing) rule. The bytes written are unchanged.
static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
                                ptrdiff_t stride) {
  for (int i = 0; i < height; i += 2) {
    memcpy(dst, &dc, sizeof(dc));
    dst += stride;
    memcpy(dst, &dc, sizeof(dc));
    dst += stride;
  }
}
25
26
// Writes the low 8 bytes of |*row| to each of |height| rows, starting at |dst|
// and advancing |stride| bytes per row.
static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst,
                                ptrdiff_t stride) {
  for (int remaining = height; remaining > 0; --remaining) {
    _mm_storel_epi64((__m128i *)dst, *row);
    dst += stride;
  }
}
34
35
// Writes all 16 bytes of |*row| to each of |height| rows, starting at |dst| and
// advancing |stride| bytes per row. The store is the aligned variant, so every
// row address must be 16-byte aligned.
static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  for (int remaining = height; remaining > 0; --remaining) {
    _mm_store_si128((__m128i *)dst, *row);
    dst += stride;
  }
}
43
44
// Writes |*row| twice (32 bytes) to each of |height| rows, starting at |dst|
// and advancing |stride| bytes per row. Aligned stores are used, so every row
// address must be 16-byte aligned.
static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  for (int remaining = height; remaining > 0; --remaining) {
    __m128i *const out = (__m128i *)dst;
    _mm_store_si128(out, *row);
    _mm_store_si128(out + 1, *row);
    dst += stride;
  }
}
53
54
// Writes |*row| four times (64 bytes) to each of |height| rows, starting at
// |dst| and advancing |stride| bytes per row. Aligned stores are used, so every
// row address must be 16-byte aligned.
static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  for (int remaining = height; remaining > 0; --remaining) {
    __m128i *const out = (__m128i *)dst;
    _mm_store_si128(out, *row);
    _mm_store_si128(out + 1, *row);
    _mm_store_si128(out + 2, *row);
    _mm_store_si128(out + 3, *row);
    dst += stride;
  }
}
64
65
1.81M
// Returns a vector whose low 64-bit lane holds ref[0] + ref[1] + ref[2] +
// ref[3]. Note that 8 bytes are loaded from |ref|; after zero-interleaving,
// bytes 4..7 land in the upper lane, which callers here do not read.
static INLINE __m128i dc_sum_4(const uint8_t *ref) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i bytes = _mm_loadl_epi64((__m128i const *)ref);
  // Each source byte becomes a 16-bit word, so the low lane covers bytes 0..3.
  const __m128i widened = _mm_unpacklo_epi8(bytes, zero);
  return _mm_sad_epu8(widened, zero);
}
71
72
1.96M
// Returns a vector whose low 64-bit lane holds the sum of ref[0..7].
static INLINE __m128i dc_sum_8(const uint8_t *ref) {
  const __m128i bytes = _mm_loadl_epi64((__m128i const *)ref);
  // SAD against zero is simply the sum of the unsigned bytes.
  return _mm_sad_epu8(bytes, _mm_setzero_si128());
}
77
78
16.9k
// Returns a vector whose low 16-bit word holds the sum of ref[0..63]. Aligned
// loads are used, so |ref| must be 16-byte aligned.
static INLINE __m128i dc_sum_64(const uint8_t *ref) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i *const src = (const __m128i *)ref;
  // SAD against zero gives, per 64-bit half, the sum of that half's 8 bytes.
  const __m128i s0 = _mm_sad_epu8(_mm_load_si128(src), zero);
  const __m128i s1 = _mm_sad_epu8(_mm_load_si128(src + 1), zero);
  const __m128i s2 = _mm_sad_epu8(_mm_load_si128(src + 2), zero);
  const __m128i s3 = _mm_sad_epu8(_mm_load_si128(src + 3), zero);
  const __m128i halves =
      _mm_add_epi16(_mm_add_epi16(s0, s1), _mm_add_epi16(s2, s3));
  // Fold the upper half onto the lower one to get the full total.
  return _mm_add_epi16(halves, _mm_unpackhi_epi64(halves, halves));
}
94
95
1.47M
#define DC_MULTIPLIER_1X2 0x5556
96
1.61M
#define DC_MULTIPLIER_1X4 0x3334
97
98
3.09M
#define DC_SHIFT2 16
99
100
static INLINE int divide_using_multiply_shift(int num, int shift1,
101
3.09M
                                              int multiplier) {
102
3.09M
  const int interm = num >> shift1;
103
3.09M
  return interm * multiplier >> DC_SHIFT2;
104
3.09M
}
105
106
// -----------------------------------------------------------------------------
107
// DC_PRED
108
109
void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
110
305k
                               const uint8_t *above, const uint8_t *left) {
111
305k
  const __m128i sum_left = dc_sum_8(left);
112
305k
  __m128i sum_above = dc_sum_4(above);
113
305k
  sum_above = _mm_add_epi16(sum_left, sum_above);
114
115
305k
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
116
305k
  sum += 6;
117
305k
  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
118
119
305k
  const __m128i row = _mm_set1_epi8((int8_t)sum);
120
305k
  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
121
305k
  dc_store_4xh(pred, 8, dst, stride);
122
305k
}
123
124
void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
125
220k
                                const uint8_t *above, const uint8_t *left) {
126
220k
  const __m128i sum_left = dc_sum_16_sse2(left);
127
220k
  __m128i sum_above = dc_sum_4(above);
128
220k
  sum_above = _mm_add_epi16(sum_left, sum_above);
129
130
220k
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
131
220k
  sum += 10;
132
220k
  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
133
134
220k
  const __m128i row = _mm_set1_epi8((int8_t)sum);
135
220k
  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
136
220k
  dc_store_4xh(pred, 16, dst, stride);
137
220k
}
138
139
void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
140
473k
                               const uint8_t *above, const uint8_t *left) {
141
473k
  const __m128i sum_left = dc_sum_4(left);
142
473k
  __m128i sum_above = dc_sum_8(above);
143
473k
  sum_above = _mm_add_epi16(sum_above, sum_left);
144
145
473k
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
146
473k
  sum += 6;
147
473k
  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
148
149
473k
  const __m128i row = _mm_set1_epi8((int8_t)sum);
150
473k
  dc_store_8xh(&row, 4, dst, stride);
151
473k
}
152
153
void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
154
201k
                                const uint8_t *above, const uint8_t *left) {
155
201k
  const __m128i sum_left = dc_sum_16_sse2(left);
156
201k
  __m128i sum_above = dc_sum_8(above);
157
201k
  sum_above = _mm_add_epi16(sum_above, sum_left);
158
159
201k
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
160
201k
  sum += 12;
161
201k
  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
162
201k
  const __m128i row = _mm_set1_epi8((int8_t)sum);
163
201k
  dc_store_8xh(&row, 16, dst, stride);
164
201k
}
165
166
void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
167
95.7k
                                const uint8_t *above, const uint8_t *left) {
168
95.7k
  const __m128i sum_left = dc_sum_32_sse2(left);
169
95.7k
  __m128i sum_above = dc_sum_8(above);
170
95.7k
  sum_above = _mm_add_epi16(sum_above, sum_left);
171
172
95.7k
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
173
95.7k
  sum += 20;
174
95.7k
  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
175
95.7k
  const __m128i row = _mm_set1_epi8((int8_t)sum);
176
95.7k
  dc_store_8xh(&row, 32, dst, stride);
177
95.7k
}
178
179
void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
180
798k
                                const uint8_t *above, const uint8_t *left) {
181
798k
  const __m128i sum_left = dc_sum_4(left);
182
798k
  __m128i sum_above = dc_sum_16_sse2(above);
183
798k
  sum_above = _mm_add_epi16(sum_above, sum_left);
184
185
798k
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
186
798k
  sum += 10;
187
798k
  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
188
798k
  const __m128i row = _mm_set1_epi8((int8_t)sum);
189
798k
  dc_store_16xh(&row, 4, dst, stride);
190
798k
}
191
192
void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
193
377k
                                const uint8_t *above, const uint8_t *left) {
194
377k
  const __m128i sum_left = dc_sum_8(left);
195
377k
  __m128i sum_above = dc_sum_16_sse2(above);
196
377k
  sum_above = _mm_add_epi16(sum_above, sum_left);
197
198
377k
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
199
377k
  sum += 12;
200
377k
  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
201
377k
  const __m128i row = _mm_set1_epi8((int8_t)sum);
202
377k
  dc_store_16xh(&row, 8, dst, stride);
203
377k
}
204
205
void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
206
121k
                                 const uint8_t *above, const uint8_t *left) {
207
121k
  const __m128i sum_left = dc_sum_32_sse2(left);
208
121k
  __m128i sum_above = dc_sum_16_sse2(above);
209
121k
  sum_above = _mm_add_epi16(sum_left, sum_above);
210
211
121k
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
212
121k
  sum += 24;
213
121k
  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
214
121k
  const __m128i row = _mm_set1_epi8((int8_t)sum);
215
121k
  dc_store_16xh(&row, 32, dst, stride);
216
121k
}
217
218
void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
219
15.0k
                                 const uint8_t *above, const uint8_t *left) {
220
15.0k
  const __m128i sum_left = dc_sum_64(left);
221
15.0k
  __m128i sum_above = dc_sum_16_sse2(above);
222
15.0k
  sum_above = _mm_add_epi16(sum_left, sum_above);
223
224
15.0k
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
225
15.0k
  sum += 40;
226
15.0k
  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
227
15.0k
  const __m128i row = _mm_set1_epi8((int8_t)sum);
228
15.0k
  dc_store_16xh(&row, 64, dst, stride);
229
15.0k
}
230
231
void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
232
484k
                                const uint8_t *above, const uint8_t *left) {
233
484k
  __m128i sum_above = dc_sum_32_sse2(above);
234
484k
  const __m128i sum_left = dc_sum_8(left);
235
484k
  sum_above = _mm_add_epi16(sum_above, sum_left);
236
237
484k
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
238
484k
  sum += 20;
239
484k
  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
240
484k
  const __m128i row = _mm_set1_epi8((int8_t)sum);
241
484k
  dc_store_32xh(&row, 8, dst, stride);
242
484k
}
243
244
void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
245
0
                                 const uint8_t *above, const uint8_t *left) {
246
0
  __m128i sum_above = dc_sum_32_sse2(above);
247
0
  const __m128i sum_left = dc_sum_16_sse2(left);
248
0
  sum_above = _mm_add_epi16(sum_above, sum_left);
249
250
0
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
251
0
  sum += 24;
252
0
  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
253
0
  const __m128i row = _mm_set1_epi8((int8_t)sum);
254
0
  dc_store_32xh(&row, 16, dst, stride);
255
0
}
256
257
void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
258
0
                                 const uint8_t *above, const uint8_t *left) {
259
0
  __m128i sum_above = dc_sum_32_sse2(above);
260
0
  const __m128i sum_left = dc_sum_64(left);
261
0
  sum_above = _mm_add_epi16(sum_above, sum_left);
262
263
0
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
264
0
  sum += 48;
265
0
  sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
266
0
  const __m128i row = _mm_set1_epi8((int8_t)sum);
267
0
  dc_store_32xh(&row, 64, dst, stride);
268
0
}
269
270
void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
271
0
                                 const uint8_t *above, const uint8_t *left) {
272
0
  __m128i sum_above = dc_sum_64(above);
273
0
  const __m128i sum_left = dc_sum_64(left);
274
0
  sum_above = _mm_add_epi16(sum_above, sum_left);
275
276
0
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
277
0
  sum += 64;
278
0
  sum /= 128;
279
0
  const __m128i row = _mm_set1_epi8((int8_t)sum);
280
0
  dc_store_64xh(&row, 64, dst, stride);
281
0
}
282
283
void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
284
0
                                 const uint8_t *above, const uint8_t *left) {
285
0
  __m128i sum_above = dc_sum_64(above);
286
0
  const __m128i sum_left = dc_sum_32_sse2(left);
287
0
  sum_above = _mm_add_epi16(sum_above, sum_left);
288
289
0
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
290
0
  sum += 48;
291
0
  sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
292
0
  const __m128i row = _mm_set1_epi8((int8_t)sum);
293
0
  dc_store_64xh(&row, 32, dst, stride);
294
0
}
295
296
void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
297
0
                                 const uint8_t *above, const uint8_t *left) {
298
0
  __m128i sum_above = dc_sum_64(above);
299
0
  const __m128i sum_left = dc_sum_16_sse2(left);
300
0
  sum_above = _mm_add_epi16(sum_above, sum_left);
301
302
0
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
303
0
  sum += 40;
304
0
  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
305
0
  const __m128i row = _mm_set1_epi8((int8_t)sum);
306
0
  dc_store_64xh(&row, 16, dst, stride);
307
0
}
308
309
// -----------------------------------------------------------------------------
310
// DC_TOP
311
312
void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
313
3.93k
                                   const uint8_t *above, const uint8_t *left) {
314
3.93k
  (void)left;
315
3.93k
  __m128i sum_above = dc_sum_4(above);
316
3.93k
  const __m128i two = _mm_set1_epi16(2);
317
3.93k
  sum_above = _mm_add_epi16(sum_above, two);
318
3.93k
  sum_above = _mm_srai_epi16(sum_above, 2);
319
3.93k
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
320
3.93k
  sum_above = _mm_packus_epi16(sum_above, sum_above);
321
322
3.93k
  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
323
3.93k
  dc_store_4xh(pred, 8, dst, stride);
324
3.93k
}
325
326
void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
327
2.55k
                                    const uint8_t *above, const uint8_t *left) {
328
2.55k
  (void)left;
329
2.55k
  __m128i sum_above = dc_sum_4(above);
330
2.55k
  const __m128i two = _mm_set1_epi16(2);
331
2.55k
  sum_above = _mm_add_epi16(sum_above, two);
332
2.55k
  sum_above = _mm_srai_epi16(sum_above, 2);
333
2.55k
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
334
2.55k
  sum_above = _mm_packus_epi16(sum_above, sum_above);
335
336
2.55k
  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
337
2.55k
  dc_store_4xh(pred, 16, dst, stride);
338
2.55k
}
339
340
void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
341
2.86k
                                   const uint8_t *above, const uint8_t *left) {
342
2.86k
  (void)left;
343
2.86k
  __m128i sum_above = dc_sum_8(above);
344
2.86k
  const __m128i four = _mm_set1_epi16(4);
345
2.86k
  sum_above = _mm_add_epi16(sum_above, four);
346
2.86k
  sum_above = _mm_srai_epi16(sum_above, 3);
347
2.86k
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
348
2.86k
  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
349
2.86k
  dc_store_8xh(&row, 4, dst, stride);
350
2.86k
}
351
352
void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
353
2.86k
                                    const uint8_t *above, const uint8_t *left) {
354
2.86k
  (void)left;
355
2.86k
  __m128i sum_above = dc_sum_8(above);
356
2.86k
  const __m128i four = _mm_set1_epi16(4);
357
2.86k
  sum_above = _mm_add_epi16(sum_above, four);
358
2.86k
  sum_above = _mm_srai_epi16(sum_above, 3);
359
2.86k
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
360
2.86k
  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
361
2.86k
  dc_store_8xh(&row, 16, dst, stride);
362
2.86k
}
363
364
void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
365
3.86k
                                    const uint8_t *above, const uint8_t *left) {
366
3.86k
  (void)left;
367
3.86k
  __m128i sum_above = dc_sum_8(above);
368
3.86k
  const __m128i four = _mm_set1_epi16(4);
369
3.86k
  sum_above = _mm_add_epi16(sum_above, four);
370
3.86k
  sum_above = _mm_srai_epi16(sum_above, 3);
371
3.86k
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
372
3.86k
  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
373
3.86k
  dc_store_8xh(&row, 32, dst, stride);
374
3.86k
}
375
376
void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
377
8.28k
                                    const uint8_t *above, const uint8_t *left) {
378
8.28k
  (void)left;
379
8.28k
  __m128i sum_above = dc_sum_16_sse2(above);
380
8.28k
  const __m128i eight = _mm_set1_epi16(8);
381
8.28k
  sum_above = _mm_add_epi16(sum_above, eight);
382
8.28k
  sum_above = _mm_srai_epi16(sum_above, 4);
383
8.28k
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
384
8.28k
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
385
8.28k
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
386
8.28k
  dc_store_16xh(&row, 4, dst, stride);
387
8.28k
}
388
389
void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
390
3.58k
                                    const uint8_t *above, const uint8_t *left) {
391
3.58k
  (void)left;
392
3.58k
  __m128i sum_above = dc_sum_16_sse2(above);
393
3.58k
  const __m128i eight = _mm_set1_epi16(8);
394
3.58k
  sum_above = _mm_add_epi16(sum_above, eight);
395
3.58k
  sum_above = _mm_srai_epi16(sum_above, 4);
396
3.58k
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
397
3.58k
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
398
3.58k
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
399
3.58k
  dc_store_16xh(&row, 8, dst, stride);
400
3.58k
}
401
402
void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
403
                                     const uint8_t *above,
404
4.21k
                                     const uint8_t *left) {
405
4.21k
  (void)left;
406
4.21k
  __m128i sum_above = dc_sum_16_sse2(above);
407
4.21k
  const __m128i eight = _mm_set1_epi16(8);
408
4.21k
  sum_above = _mm_add_epi16(sum_above, eight);
409
4.21k
  sum_above = _mm_srai_epi16(sum_above, 4);
410
4.21k
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
411
4.21k
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
412
4.21k
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
413
4.21k
  dc_store_16xh(&row, 32, dst, stride);
414
4.21k
}
415
416
void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
417
                                     const uint8_t *above,
418
204
                                     const uint8_t *left) {
419
204
  (void)left;
420
204
  __m128i sum_above = dc_sum_16_sse2(above);
421
204
  const __m128i eight = _mm_set1_epi16(8);
422
204
  sum_above = _mm_add_epi16(sum_above, eight);
423
204
  sum_above = _mm_srai_epi16(sum_above, 4);
424
204
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
425
204
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
426
204
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
427
204
  dc_store_16xh(&row, 64, dst, stride);
428
204
}
429
430
void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
431
12.3k
                                    const uint8_t *above, const uint8_t *left) {
432
12.3k
  (void)left;
433
12.3k
  __m128i sum_above = dc_sum_32_sse2(above);
434
12.3k
  const __m128i sixteen = _mm_set1_epi16(16);
435
12.3k
  sum_above = _mm_add_epi16(sum_above, sixteen);
436
12.3k
  sum_above = _mm_srai_epi16(sum_above, 5);
437
12.3k
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
438
12.3k
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
439
12.3k
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
440
12.3k
  dc_store_32xh(&row, 8, dst, stride);
441
12.3k
}
442
443
void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
444
                                     const uint8_t *above,
445
0
                                     const uint8_t *left) {
446
0
  (void)left;
447
0
  __m128i sum_above = dc_sum_32_sse2(above);
448
0
  const __m128i sixteen = _mm_set1_epi16(16);
449
0
  sum_above = _mm_add_epi16(sum_above, sixteen);
450
0
  sum_above = _mm_srai_epi16(sum_above, 5);
451
0
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
452
0
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
453
0
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
454
0
  dc_store_32xh(&row, 16, dst, stride);
455
0
}
456
457
void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
458
                                     const uint8_t *above,
459
0
                                     const uint8_t *left) {
460
0
  (void)left;
461
0
  __m128i sum_above = dc_sum_32_sse2(above);
462
0
  const __m128i sixteen = _mm_set1_epi16(16);
463
0
  sum_above = _mm_add_epi16(sum_above, sixteen);
464
0
  sum_above = _mm_srai_epi16(sum_above, 5);
465
0
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
466
0
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
467
0
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
468
0
  dc_store_32xh(&row, 64, dst, stride);
469
0
}
470
471
void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
472
                                     const uint8_t *above,
473
0
                                     const uint8_t *left) {
474
0
  (void)left;
475
0
  __m128i sum_above = dc_sum_64(above);
476
0
  const __m128i thirtytwo = _mm_set1_epi16(32);
477
0
  sum_above = _mm_add_epi16(sum_above, thirtytwo);
478
0
  sum_above = _mm_srai_epi16(sum_above, 6);
479
0
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
480
0
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
481
0
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
482
0
  dc_store_64xh(&row, 64, dst, stride);
483
0
}
484
485
void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
486
                                     const uint8_t *above,
487
0
                                     const uint8_t *left) {
488
0
  (void)left;
489
0
  __m128i sum_above = dc_sum_64(above);
490
0
  const __m128i thirtytwo = _mm_set1_epi16(32);
491
0
  sum_above = _mm_add_epi16(sum_above, thirtytwo);
492
0
  sum_above = _mm_srai_epi16(sum_above, 6);
493
0
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
494
0
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
495
0
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
496
0
  dc_store_64xh(&row, 32, dst, stride);
497
0
}
498
499
void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
500
                                     const uint8_t *above,
501
0
                                     const uint8_t *left) {
502
0
  (void)left;
503
0
  __m128i sum_above = dc_sum_64(above);
504
0
  const __m128i thirtytwo = _mm_set1_epi16(32);
505
0
  sum_above = _mm_add_epi16(sum_above, thirtytwo);
506
0
  sum_above = _mm_srai_epi16(sum_above, 6);
507
0
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
508
0
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
509
0
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
510
0
  dc_store_64xh(&row, 16, dst, stride);
511
0
}
512
513
// -----------------------------------------------------------------------------
514
// DC_LEFT
515
516
void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
517
3.28k
                                    const uint8_t *above, const uint8_t *left) {
518
3.28k
  (void)above;
519
3.28k
  __m128i sum_left = dc_sum_8(left);
520
3.28k
  const __m128i four = _mm_set1_epi16(4);
521
3.28k
  sum_left = _mm_add_epi16(sum_left, four);
522
3.28k
  sum_left = _mm_srai_epi16(sum_left, 3);
523
3.28k
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
524
3.28k
  sum_left = _mm_packus_epi16(sum_left, sum_left);
525
526
3.28k
  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
527
3.28k
  dc_store_4xh(pred, 8, dst, stride);
528
3.28k
}
529
530
void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
531
                                     const uint8_t *above,
532
6.10k
                                     const uint8_t *left) {
533
6.10k
  (void)above;
534
6.10k
  __m128i sum_left = dc_sum_16_sse2(left);
535
6.10k
  const __m128i eight = _mm_set1_epi16(8);
536
6.10k
  sum_left = _mm_add_epi16(sum_left, eight);
537
6.10k
  sum_left = _mm_srai_epi16(sum_left, 4);
538
6.10k
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
539
6.10k
  sum_left = _mm_packus_epi16(sum_left, sum_left);
540
541
6.10k
  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
542
6.10k
  dc_store_4xh(pred, 16, dst, stride);
543
6.10k
}
544
545
void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
546
6.40k
                                    const uint8_t *above, const uint8_t *left) {
547
6.40k
  (void)above;
548
6.40k
  __m128i sum_left = dc_sum_4(left);
549
6.40k
  const __m128i two = _mm_set1_epi16(2);
550
6.40k
  sum_left = _mm_add_epi16(sum_left, two);
551
6.40k
  sum_left = _mm_srai_epi16(sum_left, 2);
552
6.40k
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
553
6.40k
  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
554
6.40k
  dc_store_8xh(&row, 4, dst, stride);
555
6.40k
}
556
557
void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
558
                                     const uint8_t *above,
559
2.84k
                                     const uint8_t *left) {
560
2.84k
  (void)above;
561
2.84k
  __m128i sum_left = dc_sum_16_sse2(left);
562
2.84k
  const __m128i eight = _mm_set1_epi16(8);
563
2.84k
  sum_left = _mm_add_epi16(sum_left, eight);
564
2.84k
  sum_left = _mm_srai_epi16(sum_left, 4);
565
2.84k
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
566
2.84k
  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
567
2.84k
  dc_store_8xh(&row, 16, dst, stride);
568
2.84k
}
569
570
void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
571
                                     const uint8_t *above,
572
16.6k
                                     const uint8_t *left) {
573
16.6k
  (void)above;
574
16.6k
  __m128i sum_left = dc_sum_32_sse2(left);
575
16.6k
  const __m128i sixteen = _mm_set1_epi16(16);
576
16.6k
  sum_left = _mm_add_epi16(sum_left, sixteen);
577
16.6k
  sum_left = _mm_srai_epi16(sum_left, 5);
578
16.6k
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
579
16.6k
  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
580
16.6k
  dc_store_8xh(&row, 32, dst, stride);
581
16.6k
}
582
583
void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
584
                                     const uint8_t *above,
585
4.43k
                                     const uint8_t *left) {
586
4.43k
  (void)above;
587
4.43k
  __m128i sum_left = dc_sum_4(left);
588
4.43k
  const __m128i two = _mm_set1_epi16(2);
589
4.43k
  sum_left = _mm_add_epi16(sum_left, two);
590
4.43k
  sum_left = _mm_srai_epi16(sum_left, 2);
591
4.43k
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
592
4.43k
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
593
4.43k
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
594
4.43k
  dc_store_16xh(&row, 4, dst, stride);
595
4.43k
}
596
597
void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
598
                                     const uint8_t *above,
599
9.46k
                                     const uint8_t *left) {
600
9.46k
  (void)above;
601
9.46k
  __m128i sum_left = dc_sum_8(left);
602
9.46k
  const __m128i four = _mm_set1_epi16(4);
603
9.46k
  sum_left = _mm_add_epi16(sum_left, four);
604
9.46k
  sum_left = _mm_srai_epi16(sum_left, 3);
605
9.46k
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
606
9.46k
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
607
9.46k
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
608
9.46k
  dc_store_16xh(&row, 8, dst, stride);
609
9.46k
}
610
611
void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
612
                                      const uint8_t *above,
613
4.43k
                                      const uint8_t *left) {
614
4.43k
  (void)above;
615
4.43k
  __m128i sum_left = dc_sum_32_sse2(left);
616
4.43k
  const __m128i sixteen = _mm_set1_epi16(16);
617
4.43k
  sum_left = _mm_add_epi16(sum_left, sixteen);
618
4.43k
  sum_left = _mm_srai_epi16(sum_left, 5);
619
4.43k
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
620
4.43k
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
621
4.43k
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
622
4.43k
  dc_store_16xh(&row, 32, dst, stride);
623
4.43k
}
624
625
void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
626
                                      const uint8_t *above,
627
1.94k
                                      const uint8_t *left) {
628
1.94k
  (void)above;
629
1.94k
  __m128i sum_left = dc_sum_64(left);
630
1.94k
  const __m128i thirtytwo = _mm_set1_epi16(32);
631
1.94k
  sum_left = _mm_add_epi16(sum_left, thirtytwo);
632
1.94k
  sum_left = _mm_srai_epi16(sum_left, 6);
633
1.94k
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
634
1.94k
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
635
1.94k
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
636
1.94k
  dc_store_16xh(&row, 64, dst, stride);
637
1.94k
}
638
639
void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
640
                                     const uint8_t *above,
641
3.91k
                                     const uint8_t *left) {
642
3.91k
  (void)above;
643
3.91k
  __m128i sum_left = dc_sum_8(left);
644
3.91k
  const __m128i four = _mm_set1_epi16(4);
645
3.91k
  sum_left = _mm_add_epi16(sum_left, four);
646
3.91k
  sum_left = _mm_srai_epi16(sum_left, 3);
647
3.91k
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
648
3.91k
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
649
3.91k
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
650
3.91k
  dc_store_32xh(&row, 8, dst, stride);
651
3.91k
}
652
653
void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
654
                                      const uint8_t *above,
655
0
                                      const uint8_t *left) {
656
0
  (void)above;
657
0
  __m128i sum_left = dc_sum_16_sse2(left);
658
0
  const __m128i eight = _mm_set1_epi16(8);
659
0
  sum_left = _mm_add_epi16(sum_left, eight);
660
0
  sum_left = _mm_srai_epi16(sum_left, 4);
661
0
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
662
0
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
663
0
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
664
0
  dc_store_32xh(&row, 16, dst, stride);
665
0
}
666
667
void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
668
                                      const uint8_t *above,
669
0
                                      const uint8_t *left) {
670
0
  (void)above;
671
0
  __m128i sum_left = dc_sum_64(left);
672
0
  const __m128i thirtytwo = _mm_set1_epi16(32);
673
0
  sum_left = _mm_add_epi16(sum_left, thirtytwo);
674
0
  sum_left = _mm_srai_epi16(sum_left, 6);
675
0
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
676
0
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
677
0
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
678
0
  dc_store_32xh(&row, 64, dst, stride);
679
0
}
680
681
void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
682
                                      const uint8_t *above,
683
0
                                      const uint8_t *left) {
684
0
  (void)above;
685
0
  __m128i sum_left = dc_sum_64(left);
686
0
  const __m128i thirtytwo = _mm_set1_epi16(32);
687
0
  sum_left = _mm_add_epi16(sum_left, thirtytwo);
688
0
  sum_left = _mm_srai_epi16(sum_left, 6);
689
0
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
690
0
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
691
0
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
692
0
  dc_store_64xh(&row, 64, dst, stride);
693
0
}
694
695
void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
696
                                      const uint8_t *above,
697
0
                                      const uint8_t *left) {
698
0
  (void)above;
699
0
  __m128i sum_left = dc_sum_32_sse2(left);
700
0
  const __m128i sixteen = _mm_set1_epi16(16);
701
0
  sum_left = _mm_add_epi16(sum_left, sixteen);
702
0
  sum_left = _mm_srai_epi16(sum_left, 5);
703
0
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
704
0
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
705
0
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
706
0
  dc_store_64xh(&row, 32, dst, stride);
707
0
}
708
709
void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
710
                                      const uint8_t *above,
711
0
                                      const uint8_t *left) {
712
0
  (void)above;
713
0
  __m128i sum_left = dc_sum_16_sse2(left);
714
0
  const __m128i eight = _mm_set1_epi16(8);
715
0
  sum_left = _mm_add_epi16(sum_left, eight);
716
0
  sum_left = _mm_srai_epi16(sum_left, 4);
717
0
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
718
0
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
719
0
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
720
0
  dc_store_64xh(&row, 16, dst, stride);
721
0
}
722
723
// -----------------------------------------------------------------------------
724
// DC_128
725
726
// DC_128, 4x8: hands the four-byte pattern 0x80 0x80 0x80 0x80 to
// dc_store_4xh for 8 rows. Neither neighbour array is read.
void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint32_t gray4 = 0x80808080;
  (void)left;
  (void)above;
  dc_store_4xh(gray4, 8, dst, stride);
}
733
734
// DC_128, 4x16: hands the four-byte pattern 0x80 0x80 0x80 0x80 to
// dc_store_4xh for 16 rows. Neither neighbour array is read.
void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint32_t gray4 = 0x80808080;
  (void)left;
  (void)above;
  dc_store_4xh(gray4, 16, dst, stride);
}
741
742
// DC_128, 8x4: passes a register with every byte set to 128 to dc_store_8xh
// for 4 rows. Neither neighbour array is read.
void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_8xh(&gray, 4, dst, stride);
}
749
750
// DC_128, 8x16: passes a register with every byte set to 128 to dc_store_8xh
// for 16 rows. Neither neighbour array is read.
void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_8xh(&gray, 16, dst, stride);
}
757
758
// DC_128, 8x32: passes a register with every byte set to 128 to dc_store_8xh
// for 32 rows. Neither neighbour array is read.
void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_8xh(&gray, 32, dst, stride);
}
765
766
// DC_128, 16x4: passes a register with every byte set to 128 to
// dc_store_16xh for 4 rows. Neither neighbour array is read.
void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_16xh(&gray, 4, dst, stride);
}
773
774
// DC_128, 16x8: passes a register with every byte set to 128 to
// dc_store_16xh for 8 rows. Neither neighbour array is read.
void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_16xh(&gray, 8, dst, stride);
}
781
782
// DC_128, 16x32: passes a register with every byte set to 128 to
// dc_store_16xh for 32 rows. Neither neighbour array is read.
void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_16xh(&gray, 32, dst, stride);
}
790
791
// DC_128, 16x64: passes a register with every byte set to 128 to
// dc_store_16xh for 64 rows. Neither neighbour array is read.
void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_16xh(&gray, 64, dst, stride);
}
799
800
// DC_128, 32x8: passes a register with every byte set to 128 to
// dc_store_32xh for 8 rows. Neither neighbour array is read.
void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_32xh(&gray, 8, dst, stride);
}
807
808
// DC_128, 32x16: passes a register with every byte set to 128 to
// dc_store_32xh for 16 rows. Neither neighbour array is read.
void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_32xh(&gray, 16, dst, stride);
}
816
817
// DC_128, 32x64: passes a register with every byte set to 128 to
// dc_store_32xh for 64 rows. Neither neighbour array is read.
void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_32xh(&gray, 64, dst, stride);
}
825
826
// DC_128, 64x64: passes a register with every byte set to 128 to
// dc_store_64xh for 64 rows. Neither neighbour array is read.
void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_64xh(&gray, 64, dst, stride);
}
834
835
// DC_128, 64x32: passes a register with every byte set to 128 to
// dc_store_64xh for 32 rows. Neither neighbour array is read.
void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_64xh(&gray, 32, dst, stride);
}
843
844
// DC_128, 64x16: passes a register with every byte set to 128 to
// dc_store_64xh for 16 rows. Neither neighbour array is read.
void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_64xh(&gray, 16, dst, stride);
}
852
853
// -----------------------------------------------------------------------------
854
// V_PRED
855
856
// V_PRED, 4x8: reads the first four bytes of `above` as one 32-bit word and
// hands it to dc_store_4xh for 8 rows. `left` is unused.
void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  // Read through a const-qualified pointer so the cast does not silently drop
  // the qualifier from `above`.
  // NOTE(review): this still type-puns uint8_t data as uint32_t and assumes
  // `above` is suitably aligned for a 32-bit load - confirm against callers.
  const uint32_t top4 = *(const uint32_t *)above;
  (void)left;
  dc_store_4xh(top4, 8, dst, stride);
}
862
863
// V_PRED, 4x16: reads the first four bytes of `above` as one 32-bit word and
// hands it to dc_store_4xh for 16 rows. `left` is unused.
void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  // Read through a const-qualified pointer so the cast does not silently drop
  // the qualifier from `above`.
  // NOTE(review): this still type-puns uint8_t data as uint32_t and assumes
  // `above` is suitably aligned for a 32-bit load - confirm against callers.
  const uint32_t top4 = *(const uint32_t *)above;
  (void)left;
  dc_store_4xh(top4, 16, dst, stride);
}
869
870
// V_PRED, 8x4: loads the low 8 bytes of `above` and passes them to
// dc_store_8xh for 4 rows. `left` is unused.
void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top_row = _mm_loadl_epi64((const __m128i *)above);
  dc_store_8xh(&top_row, 4, dst, stride);
}
876
877
// V_PRED, 8x16: loads the low 8 bytes of `above` and passes them to
// dc_store_8xh for 16 rows. `left` is unused.
void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top_row = _mm_loadl_epi64((const __m128i *)above);
  dc_store_8xh(&top_row, 16, dst, stride);
}
883
884
// V_PRED, 8x32: loads the low 8 bytes of `above` and passes them to
// dc_store_8xh for 32 rows. `left` is unused.
void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top_row = _mm_loadl_epi64((const __m128i *)above);
  dc_store_8xh(&top_row, 32, dst, stride);
}
890
891
// V_PRED, 16x4: loads 16 bytes of `above` with an aligned load and passes
// them to dc_store_16xh for 4 rows. `left` is unused.
void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top_row = _mm_load_si128((const __m128i *)above);
  dc_store_16xh(&top_row, 4, dst, stride);
}
897
898
// V_PRED, 16x8: loads 16 bytes of `above` with an aligned load and passes
// them to dc_store_16xh for 8 rows. `left` is unused.
void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top_row = _mm_load_si128((const __m128i *)above);
  dc_store_16xh(&top_row, 8, dst, stride);
}
904
905
// V_PRED, 16x32: loads 16 bytes of `above` with an aligned load and passes
// them to dc_store_16xh for 32 rows. `left` is unused.
void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top_row = _mm_load_si128((const __m128i *)above);
  dc_store_16xh(&top_row, 32, dst, stride);
}
911
912
// V_PRED, 16x64: loads 16 bytes of `above` with an aligned load and passes
// them to dc_store_16xh for 64 rows. `left` is unused.
void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top_row = _mm_load_si128((const __m128i *)above);
  dc_store_16xh(&top_row, 64, dst, stride);
}
918
919
// Copies the 32 bytes at `above` into each of `height` rows of `dst`, rows
// being `stride` bytes apart. Uses aligned loads and stores throughout.
static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, int height) {
  const __m128i *const src = (const __m128i *)above;
  const __m128i top_lo = _mm_load_si128(src);
  const __m128i top_hi = _mm_load_si128(src + 1);
  for (int y = 0; y < height; ++y, dst += stride) {
    __m128i *const out = (__m128i *)dst;
    _mm_store_si128(out, top_lo);
    _mm_store_si128(out + 1, top_hi);
  }
}
929
930
// V_PRED, 32x8: forwards to v_predictor_32xh with a height of 8.
// `left` is unused.
void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  v_predictor_32xh(dst, stride, above, 8);
  (void)left;
}
935
936
// V_PRED, 32x16: forwards to v_predictor_32xh with a height of 16.
// `left` is unused.
void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  v_predictor_32xh(dst, stride, above, 16);
  (void)left;
}
941
942
// V_PRED, 32x64: forwards to v_predictor_32xh with a height of 64.
// `left` is unused.
void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  v_predictor_32xh(dst, stride, above, 64);
  (void)left;
}
947
948
// Copies the 64 bytes at `above` into each of `height` rows of `dst`, rows
// being `stride` bytes apart. Uses aligned loads and stores throughout.
static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, int height) {
  const __m128i *const src = (const __m128i *)above;
  const __m128i top0 = _mm_load_si128(src);
  const __m128i top1 = _mm_load_si128(src + 1);
  const __m128i top2 = _mm_load_si128(src + 2);
  const __m128i top3 = _mm_load_si128(src + 3);
  for (int y = 0; y < height; ++y, dst += stride) {
    __m128i *const out = (__m128i *)dst;
    _mm_store_si128(out, top0);
    _mm_store_si128(out + 1, top1);
    _mm_store_si128(out + 2, top2);
    _mm_store_si128(out + 3, top3);
  }
}
962
963
// V_PRED, 64x64: forwards to v_predictor_64xh with a height of 64.
// `left` is unused.
void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  v_predictor_64xh(dst, stride, above, 64);
  (void)left;
}
968
969
// V_PRED, 64x32: forwards to v_predictor_64xh with a height of 32.
// `left` is unused.
void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  v_predictor_64xh(dst, stride, above, 32);
  (void)left;
}
974
975
// V_PRED, 64x16: forwards to v_predictor_64xh with a height of 16.
// `left` is unused.
void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  v_predictor_64xh(dst, stride, above, 16);
  (void)left;
}
980
981
// -----------------------------------------------------------------------------
982
// H_PRED
983
984
// H_PRED, 4x8: row r of the output is four copies of left[r], r = 0..7.
// Loads 8 bytes from `left`; `above` is unused. Each row is written with a
// single 32-bit store through an int pointer.
void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const __m128i l8 = _mm_loadl_epi64((const __m128i *)left);
  // Bytes become l0 l0 l1 l1 ... l7 l7, so 16-bit word k holds left[k] twice.
  const __m128i dup = _mm_unpacklo_epi8(l8, l8);
  // group[0] exposes words 0..3 in its low half, group[1] words 4..7.
  const __m128i group[2] = { dup, _mm_unpackhi_epi64(dup, dup) };
  (void)above;

  for (int g = 0; g < 2; ++g) {
    uint8_t *const out = dst + 4 * g * stride;
    *(int *)out = _mm_cvtsi128_si32(_mm_shufflelo_epi16(group[g], 0));
    *(int *)(out + stride) =
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(group[g], 0x55));
    *(int *)(out + 2 * stride) =
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(group[g], 0xaa));
    *(int *)(out + 3 * stride) =
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(group[g], 0xff));
  }
}
1014
1015
// H_PRED, 4x16: row r of the output is four copies of left[r], r = 0..15.
// `left` is read with an aligned 16-byte load; `above` is unused. Each row is
// written with a single 32-bit store through an int pointer.
void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const __m128i l16 = _mm_load_si128((const __m128i *)left);
  // After byte duplication, 16-bit word k of dup_lo holds left[k] twice and
  // word k of dup_hi holds left[8 + k] twice.
  const __m128i dup_lo = _mm_unpacklo_epi8(l16, l16);
  const __m128i dup_hi = _mm_unpackhi_epi8(l16, l16);
  // Each entry exposes the four words for one group of four rows in its low
  // 64 bits: rows 0-3, 4-7, 8-11, 12-15.
  const __m128i group[4] = { dup_lo, _mm_unpackhi_epi64(dup_lo, dup_lo),
                             dup_hi, _mm_unpackhi_epi64(dup_hi, dup_hi) };
  (void)above;

  for (int g = 0; g < 4; ++g) {
    uint8_t *const out = dst + 4 * g * stride;
    *(int *)out = _mm_cvtsi128_si32(_mm_shufflelo_epi16(group[g], 0));
    *(int *)(out + stride) =
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(group[g], 0x55));
    *(int *)(out + 2 * stride) =
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(group[g], 0xaa));
    *(int *)(out + 3 * stride) =
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(group[g], 0xff));
  }
}
1075
1076
// H_PRED, 8x4: row r of the output is eight copies of left[r], r = 0..3.
// Loads 8 bytes from `left` although only the first four are used; `above`
// is unused.
void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const __m128i l8 = _mm_loadl_epi64((const __m128i *)left);
  // 16-bit word k now holds left[k] twice; broadcasting word k across the
  // low 64 bits yields eight copies of left[k].
  const __m128i dup = _mm_unpacklo_epi8(l8, l8);
  (void)above;

  _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(dup, 0));
  _mm_storel_epi64((__m128i *)(dst + stride), _mm_shufflelo_epi16(dup, 0x55));
  _mm_storel_epi64((__m128i *)(dst + 2 * stride),
                   _mm_shufflelo_epi16(dup, 0xaa));
  _mm_storel_epi64((__m128i *)(dst + 3 * stride),
                   _mm_shufflelo_epi16(dup, 0xff));
}
1093
1094
// H_PRED helper for 8-wide blocks: processes `count` chunks of 16 rows. For
// each chunk it loads 16 bytes from `left` (aligned load) and writes 16 rows
// of eight identical bytes, row r being eight copies of left[r]. `above` is
// accepted but unused.
static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, const uint8_t *left,
                                      int count) {
  (void)above;
  for (int chunk = 0; chunk < count; ++chunk) {
    const __m128i l16 = _mm_load_si128((const __m128i *)left);
    // Word k of dup_lo holds left[k] twice; word k of dup_hi holds
    // left[8 + k] twice.
    const __m128i dup_lo = _mm_unpacklo_epi8(l16, l16);
    const __m128i dup_hi = _mm_unpackhi_epi8(l16, l16);
    // One entry per group of four rows, with that group's words in the low
    // 64 bits.
    const __m128i group[4] = { dup_lo, _mm_unpackhi_epi64(dup_lo, dup_lo),
                               dup_hi, _mm_unpackhi_epi64(dup_hi, dup_hi) };

    for (int g = 0; g < 4; ++g) {
      _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(group[g], 0));
      _mm_storel_epi64((__m128i *)(dst + stride),
                       _mm_shufflelo_epi16(group[g], 0x55));
      _mm_storel_epi64((__m128i *)(dst + 2 * stride),
                       _mm_shufflelo_epi16(group[g], 0xaa));
      _mm_storel_epi64((__m128i *)(dst + 3 * stride),
                       _mm_shufflelo_epi16(group[g], 0xff));
      dst += 4 * stride;
    }
    left += 16;
  }
}
1159
1160
// H_PRED, 8x16: one 16-row chunk handled by h_predictor_8x16xc.
void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const int chunks = 1;
  h_predictor_8x16xc(dst, stride, above, left, chunks);
}
1164
1165
// H_PRED, 8x32: two 16-row chunks handled by h_predictor_8x16xc.
void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const int chunks = 2;
  h_predictor_8x16xc(dst, stride, above, left, chunks);
}
1169
1170
// Writes row[0..h-1] to `dst`, one 16-byte aligned store per row, with
// consecutive rows `stride` bytes apart.
static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
                                     ptrdiff_t stride) {
  for (int r = 0; r < h; ++r, dst += stride) {
    _mm_store_si128((__m128i *)dst, row[r]);
  }
}
1178
1179
923k
// For k = 0..3, fills row[k] with 16-bit word k of *x replicated into all
// eight word positions.
static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) {
  const __m128i src = *x;
  // Broadcast each of the four low words across the low 64 bits.
  const __m128i w0 = _mm_shufflelo_epi16(src, 0);
  const __m128i w1 = _mm_shufflelo_epi16(src, 0x55);
  const __m128i w2 = _mm_shufflelo_epi16(src, 0xaa);
  const __m128i w3 = _mm_shufflelo_epi16(src, 0xff);

  // Mirror the low 64 bits into the high 64 bits.
  row[0] = _mm_unpacklo_epi64(w0, w0);
  row[1] = _mm_unpacklo_epi64(w1, w1);
  row[2] = _mm_unpacklo_epi64(w2, w2);
  row[3] = _mm_unpacklo_epi64(w3, w3);
}
1190
1191
694k
// For k = 0..3, fills row[k] with 16-bit word (4 + k) of *x replicated into
// all eight word positions.
static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) {
  const __m128i src = *x;
  // Broadcast each of the four high words across the high 64 bits.
  const __m128i w4 = _mm_shufflehi_epi16(src, 0);
  const __m128i w5 = _mm_shufflehi_epi16(src, 0x55);
  const __m128i w6 = _mm_shufflehi_epi16(src, 0xaa);
  const __m128i w7 = _mm_shufflehi_epi16(src, 0xff);

  // Mirror the high 64 bits into the low 64 bits.
  row[0] = _mm_unpackhi_epi64(w4, w4);
  row[1] = _mm_unpackhi_epi64(w5, w5);
  row[2] = _mm_unpackhi_epi64(w6, w6);
  row[3] = _mm_unpackhi_epi64(w7, w7);
}
1202
1203
// Process 16x8, first 4 rows
1204
// Use first 8 bytes of left register: xxxxxxxx33221100
1205
static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
1206
485k
                                       ptrdiff_t stride) {
1207
485k
  __m128i row[4];
1208
485k
  repeat_low_4pixels(left, row);
1209
485k
  h_pred_store_16xh(row, 4, dst, stride);
1210
485k
}
1211
1212
// Process 16x8, second 4 rows
1213
// Use second 8 bytes of left register: 77665544xxxxxxxx
1214
static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
1215
256k
                                       ptrdiff_t stride) {
1216
256k
  __m128i row[4];
1217
256k
  repeat_high_4pixels(left, row);
1218
256k
  h_pred_store_16xh(row, 4, dst, stride);
1219
256k
}
1220
1221
// H_PRED, 16x4: loads 8 bytes from `left`, duplicates each byte, and emits
// four rows from the low four words via h_prediction_16x8_1. `above` is
// unused.
void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const __m128i l8 = _mm_loadl_epi64((const __m128i *)left);
  const __m128i dup = _mm_unpacklo_epi8(l8, l8);
  (void)above;
  h_prediction_16x8_1(&dup, dst, stride);
}
1228
1229
// H_PRED, 16x8: loads 8 bytes from `left`, duplicates each byte, then emits
// rows 0-3 from the low words and rows 4-7 from the high words. `above` is
// unused.
void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const __m128i l8 = _mm_loadl_epi64((const __m128i *)left);
  const __m128i dup = _mm_unpacklo_epi8(l8, l8);
  uint8_t *const lower_half = dst + 4 * stride;
  (void)above;
  h_prediction_16x8_1(&dup, dst, stride);
  h_prediction_16x8_2(&dup, lower_half, stride);
}
1238
1239
// H_PRED helper for 16-wide blocks: handles `count` chunks of 16 rows, reading
// 16 bytes of `left` (aligned load) per chunk. The body always runs at least
// once, even if `count` is not positive.
static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int count) {
  int chunks_done = 0;
  do {
    const __m128i l16 = _mm_load_si128((const __m128i *)left);
    // dup_lo covers left[0..7], dup_hi covers left[8..15], each byte doubled.
    const __m128i dup_lo = _mm_unpacklo_epi8(l16, l16);
    const __m128i dup_hi = _mm_unpackhi_epi8(l16, l16);

    h_prediction_16x8_1(&dup_lo, dst, stride);
    h_prediction_16x8_2(&dup_lo, dst + 4 * stride, stride);
    h_prediction_16x8_1(&dup_hi, dst + 8 * stride, stride);
    h_prediction_16x8_2(&dup_hi, dst + 12 * stride, stride);

    dst += 16 * stride;
    left += 16;
  } while (++chunks_done < count);
}
1260
1261
// H_PRED, 16x32: two 16-row chunks via h_predictor_16xh. `above` is unused.
void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  h_predictor_16xh(dst, stride, left, 2);
  (void)above;
}
1266
1267
// H_PRED, 16x64: four 16-row chunks via h_predictor_16xh. `above` is unused.
void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  h_predictor_16xh(dst, stride, left, 4);
  (void)above;
}
1272
1273
// Writes row[0..h-1] to `dst`, each row stored twice back to back (bytes
// 0-15 and 16-31) with aligned stores; consecutive rows are `stride` bytes
// apart.
static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
                                     ptrdiff_t stride) {
  for (int r = 0; r < h; ++r, dst += stride) {
    __m128i *const out = (__m128i *)dst;
    _mm_store_si128(out, row[r]);
    _mm_store_si128(out + 1, row[r]);
  }
}
1282
1283
// Process 32x8, first 4 rows
1284
// Use first 8 bytes of left register: xxxxxxxx33221100
1285
static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
1286
438k
                                       ptrdiff_t stride) {
1287
438k
  __m128i row[4];
1288
438k
  repeat_low_4pixels(left, row);
1289
438k
  h_pred_store_32xh(row, 4, dst, stride);
1290
438k
}
1291
1292
// Process 32x8, second 4 rows
1293
// Use second 8 bytes of left register: 77665544xxxxxxxx
1294
static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
1295
438k
                                       ptrdiff_t stride) {
1296
438k
  __m128i row[4];
1297
438k
  repeat_high_4pixels(left, row);
1298
438k
  h_pred_store_32xh(row, 4, dst, stride);
1299
438k
}
1300
1301
// H_PRED, 32x8: performs an aligned 16-byte load from `left`, uses its low
// 8 bytes (each duplicated), and emits rows 0-3 then rows 4-7. `above` is
// unused.
void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const __m128i l16 = _mm_load_si128((const __m128i *)left);
  const __m128i dup = _mm_unpacklo_epi8(l16, l16);
  (void)above;

  h_prediction_32x8_1(&dup, dst, stride);
  h_prediction_32x8_2(&dup, dst + 4 * stride, stride);
}
1313
1314
// H_PRED, 32x16: performs an aligned 16-byte load from `left` and emits four
// groups of four rows, the first two groups from left[0..7] and the last two
// from left[8..15]. `above` is unused.
void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m128i l16 = _mm_load_si128((const __m128i *)left);
  const __m128i dup_lo = _mm_unpacklo_epi8(l16, l16);
  const __m128i dup_hi = _mm_unpackhi_epi8(l16, l16);
  (void)above;

  h_prediction_32x8_1(&dup_lo, dst, stride);
  h_prediction_32x8_2(&dup_lo, dst + 4 * stride, stride);
  h_prediction_32x8_1(&dup_hi, dst + 8 * stride, stride);
  h_prediction_32x8_2(&dup_hi, dst + 12 * stride, stride);
}
1332
1333
// H_PRED helper for 32-wide blocks. Consumes four bytes of `left` per
// iteration and writes four rows of 32 identical bytes each (aligned stores),
// for height / 4 iterations.
//
// The loop is a do/while on a pre-decremented counter, so `height` must be a
// positive multiple of 4; a height below 4 would not terminate as intended.
static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int height) {
  int groups = height >> 2;
  do {
    // Read four left pixels as one 32-bit word. The cast keeps the const
    // qualifier of `left` instead of discarding it.
    // NOTE(review): this type-puns uint8_t data as int and assumes `left` is
    // 4-byte aligned - confirm against callers.
    __m128i left4 = _mm_cvtsi32_si128(*(const int *)left);
    // Two rounds of byte duplication turn each source byte into four
    // consecutive copies, so 32-bit lane k holds left[k] four times.
    left4 = _mm_unpacklo_epi8(left4, left4);
    left4 = _mm_unpacklo_epi8(left4, left4);
    // Broadcast each 32-bit lane to a full register: one register per row.
    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r0);
    _mm_store_si128((__m128i *)(dst + stride), r1);
    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
    left += 4;
    dst += stride * 4;
  } while (--groups);
}
1356
1357
// H_PRED, 32x64: forwards to h_predictor_32xh with a height of 64.
// `above` is unused.
void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  h_predictor_32xh(dst, stride, left, 64);
  (void)above;
}
1362
1363
// H_PRED helper for 64-wide blocks. Consumes four bytes of `left` per
// iteration and writes four rows of 64 identical bytes each (aligned stores),
// for height / 4 iterations.
//
// The loop is a do/while on a pre-decremented counter, so `height` must be a
// positive multiple of 4; a height below 4 would not terminate as intended.
static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int height) {
  int groups = height >> 2;
  do {
    // Read four left pixels as one 32-bit word. The cast keeps the const
    // qualifier of `left` instead of discarding it.
    // NOTE(review): this type-puns uint8_t data as int and assumes `left` is
    // 4-byte aligned - confirm against callers.
    __m128i left4 = _mm_cvtsi32_si128(*(const int *)left);
    // Two rounds of byte duplication turn each source byte into four
    // consecutive copies, so 32-bit lane k holds left[k] four times.
    left4 = _mm_unpacklo_epi8(left4, left4);
    left4 = _mm_unpacklo_epi8(left4, left4);
    // Broadcast each 32-bit lane to a full register: one register per row.
    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r0);
    _mm_store_si128((__m128i *)(dst + 32), r0);
    _mm_store_si128((__m128i *)(dst + 48), r0);
    _mm_store_si128((__m128i *)(dst + stride), r1);
    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
    _mm_store_si128((__m128i *)(dst + stride + 32), r1);
    _mm_store_si128((__m128i *)(dst + stride + 48), r1);
    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
    left += 4;
    dst += stride * 4;
  } while (--groups);
}
1394
1395
// H_PRED, 64x64: forwards to h_predictor_64xh with a height of 64.
// `above` is unused.
void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  h_predictor_64xh(dst, stride, left, 64);
  (void)above;
}
1400
1401
// H_PRED, 64x32: forwards to h_predictor_64xh with a height of 32.
// `above` is unused.
void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  h_predictor_64xh(dst, stride, left, 32);
  (void)above;
}
1406
1407
// H_PRED, 64x16: forwards to h_predictor_64xh with a height of 16.
// `above` is unused.
void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  h_predictor_64xh(dst, stride, left, 16);
  (void)above;
}