Coverage Report

Created: 2026-03-08 06:51

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/aom_dsp/x86/intrapred_sse2.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <emmintrin.h>
13
#include "aom_dsp/x86/intrapred_x86.h"
14
#include "config/aom_dsp_rtcd.h"
15
16
static inline void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
17
880k
                                ptrdiff_t stride) {
18
6.87M
  for (int i = 0; i < height; i += 2) {
19
5.99M
    *(uint32_t *)dst = dc;
20
5.99M
    dst += stride;
21
5.99M
    *(uint32_t *)dst = dc;
22
5.99M
    dst += stride;
23
5.99M
  }
24
880k
}
25
26
static inline void dc_store_8xh(const __m128i *row, int height, uint8_t *dst,
27
736k
                                ptrdiff_t stride) {
28
736k
  int i;
29
9.64M
  for (i = 0; i < height; ++i) {
30
8.90M
    _mm_storel_epi64((__m128i *)dst, *row);
31
8.90M
    dst += stride;
32
8.90M
  }
33
736k
}
34
35
static inline void dc_store_16xh(const __m128i *row, int height, uint8_t *dst,
36
1.29M
                                 ptrdiff_t stride) {
37
1.29M
  int i;
38
15.0M
  for (i = 0; i < height; ++i) {
39
13.7M
    _mm_store_si128((__m128i *)dst, *row);
40
13.7M
    dst += stride;
41
13.7M
  }
42
1.29M
}
43
44
static inline void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
45
345k
                                 ptrdiff_t stride) {
46
345k
  int i;
47
3.10M
  for (i = 0; i < height; ++i) {
48
2.75M
    _mm_store_si128((__m128i *)dst, *row);
49
2.75M
    _mm_store_si128((__m128i *)(dst + 16), *row);
50
2.75M
    dst += stride;
51
2.75M
  }
52
345k
}
53
54
static inline void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
55
0
                                 ptrdiff_t stride) {
56
0
  for (int i = 0; i < height; ++i) {
57
0
    _mm_store_si128((__m128i *)dst, *row);
58
0
    _mm_store_si128((__m128i *)(dst + 16), *row);
59
0
    _mm_store_si128((__m128i *)(dst + 32), *row);
60
0
    _mm_store_si128((__m128i *)(dst + 48), *row);
61
0
    dst += stride;
62
0
  }
63
0
}
64
65
1.81M
static inline __m128i dc_sum_4(const uint8_t *ref) {
66
1.81M
  __m128i x = _mm_loadl_epi64((__m128i const *)ref);
67
1.81M
  const __m128i zero = _mm_setzero_si128();
68
1.81M
  x = _mm_unpacklo_epi8(x, zero);
69
1.81M
  return _mm_sad_epu8(x, zero);
70
1.81M
}
71
72
1.56M
static inline __m128i dc_sum_8(const uint8_t *ref) {
73
1.56M
  __m128i x = _mm_loadl_epi64((__m128i const *)ref);
74
1.56M
  const __m128i zero = _mm_setzero_si128();
75
1.56M
  return _mm_sad_epu8(x, zero);
76
1.56M
}
77
78
20.1k
static inline __m128i dc_sum_64(const uint8_t *ref) {
79
20.1k
  __m128i x0 = _mm_load_si128((__m128i const *)ref);
80
20.1k
  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
81
20.1k
  __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32));
82
20.1k
  __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48));
83
20.1k
  const __m128i zero = _mm_setzero_si128();
84
20.1k
  x0 = _mm_sad_epu8(x0, zero);
85
20.1k
  x1 = _mm_sad_epu8(x1, zero);
86
20.1k
  x2 = _mm_sad_epu8(x2, zero);
87
20.1k
  x3 = _mm_sad_epu8(x3, zero);
88
20.1k
  x0 = _mm_add_epi16(x0, x1);
89
20.1k
  x2 = _mm_add_epi16(x2, x3);
90
20.1k
  x0 = _mm_add_epi16(x0, x2);
91
20.1k
  const __m128i high = _mm_unpackhi_epi64(x0, x0);
92
20.1k
  return _mm_add_epi16(x0, high);
93
20.1k
}
94
95
1.29M
#define DC_MULTIPLIER_1X2 0x5556
96
1.68M
#define DC_MULTIPLIER_1X4 0x3334
97
98
2.97M
#define DC_SHIFT2 16
99
100
static inline int divide_using_multiply_shift(int num, int shift1,
101
2.97M
                                              int multiplier) {
102
2.97M
  const int interm = num >> shift1;
103
2.97M
  return interm * multiplier >> DC_SHIFT2;
104
2.97M
}
105
106
// -----------------------------------------------------------------------------
107
// DC_PRED
108
109
void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
110
223k
                               const uint8_t *above, const uint8_t *left) {
111
223k
  const __m128i sum_left = dc_sum_8(left);
112
223k
  __m128i sum_above = dc_sum_4(above);
113
223k
  sum_above = _mm_add_epi16(sum_left, sum_above);
114
115
223k
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
116
223k
  sum += 6;
117
223k
  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
118
119
223k
  const __m128i row = _mm_set1_epi8((int8_t)sum);
120
223k
  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
121
223k
  dc_store_4xh(pred, 8, dst, stride);
122
223k
}
123
124
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
125
void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
126
605k
                                const uint8_t *above, const uint8_t *left) {
127
605k
  const __m128i sum_left = dc_sum_16_sse2(left);
128
605k
  __m128i sum_above = dc_sum_4(above);
129
605k
  sum_above = _mm_add_epi16(sum_left, sum_above);
130
131
605k
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
132
605k
  sum += 10;
133
605k
  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
134
135
605k
  const __m128i row = _mm_set1_epi8((int8_t)sum);
136
605k
  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
137
605k
  dc_store_4xh(pred, 16, dst, stride);
138
605k
}
139
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
140
141
void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
142
342k
                               const uint8_t *above, const uint8_t *left) {
143
342k
  const __m128i sum_left = dc_sum_4(left);
144
342k
  __m128i sum_above = dc_sum_8(above);
145
342k
  sum_above = _mm_add_epi16(sum_above, sum_left);
146
147
342k
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
148
342k
  sum += 6;
149
342k
  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
150
151
342k
  const __m128i row = _mm_set1_epi8((int8_t)sum);
152
342k
  dc_store_8xh(&row, 4, dst, stride);
153
342k
}
154
155
void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
156
209k
                                const uint8_t *above, const uint8_t *left) {
157
209k
  const __m128i sum_left = dc_sum_16_sse2(left);
158
209k
  __m128i sum_above = dc_sum_8(above);
159
209k
  sum_above = _mm_add_epi16(sum_above, sum_left);
160
161
209k
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
162
209k
  sum += 12;
163
209k
  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
164
209k
  const __m128i row = _mm_set1_epi8((int8_t)sum);
165
209k
  dc_store_8xh(&row, 16, dst, stride);
166
209k
}
167
168
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
169
void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
170
92.4k
                                const uint8_t *above, const uint8_t *left) {
171
92.4k
  const __m128i sum_left = dc_sum_32_sse2(left);
172
92.4k
  __m128i sum_above = dc_sum_8(above);
173
92.4k
  sum_above = _mm_add_epi16(sum_above, sum_left);
174
175
92.4k
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
176
92.4k
  sum += 20;
177
92.4k
  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
178
92.4k
  const __m128i row = _mm_set1_epi8((int8_t)sum);
179
92.4k
  dc_store_8xh(&row, 32, dst, stride);
180
92.4k
}
181
182
void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
183
632k
                                const uint8_t *above, const uint8_t *left) {
184
632k
  const __m128i sum_left = dc_sum_4(left);
185
632k
  __m128i sum_above = dc_sum_16_sse2(above);
186
632k
  sum_above = _mm_add_epi16(sum_above, sum_left);
187
188
632k
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
189
632k
  sum += 10;
190
632k
  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
191
632k
  const __m128i row = _mm_set1_epi8((int8_t)sum);
192
632k
  dc_store_16xh(&row, 4, dst, stride);
193
632k
}
194
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
195
196
void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
197
343k
                                const uint8_t *above, const uint8_t *left) {
198
343k
  const __m128i sum_left = dc_sum_8(left);
199
343k
  __m128i sum_above = dc_sum_16_sse2(above);
200
343k
  sum_above = _mm_add_epi16(sum_above, sum_left);
201
202
343k
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
203
343k
  sum += 12;
204
343k
  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
205
343k
  const __m128i row = _mm_set1_epi8((int8_t)sum);
206
343k
  dc_store_16xh(&row, 8, dst, stride);
207
343k
}
208
209
void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
210
174k
                                 const uint8_t *above, const uint8_t *left) {
211
174k
  const __m128i sum_left = dc_sum_32_sse2(left);
212
174k
  __m128i sum_above = dc_sum_16_sse2(above);
213
174k
  sum_above = _mm_add_epi16(sum_left, sum_above);
214
215
174k
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
216
174k
  sum += 24;
217
174k
  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
218
174k
  const __m128i row = _mm_set1_epi8((int8_t)sum);
219
174k
  dc_store_16xh(&row, 32, dst, stride);
220
174k
}
221
222
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
223
void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
224
19.8k
                                 const uint8_t *above, const uint8_t *left) {
225
19.8k
  const __m128i sum_left = dc_sum_64(left);
226
19.8k
  __m128i sum_above = dc_sum_16_sse2(above);
227
19.8k
  sum_above = _mm_add_epi16(sum_left, sum_above);
228
229
19.8k
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
230
19.8k
  sum += 40;
231
19.8k
  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
232
19.8k
  const __m128i row = _mm_set1_epi8((int8_t)sum);
233
19.8k
  dc_store_16xh(&row, 64, dst, stride);
234
19.8k
}
235
236
void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
237
333k
                                const uint8_t *above, const uint8_t *left) {
238
333k
  __m128i sum_above = dc_sum_32_sse2(above);
239
333k
  const __m128i sum_left = dc_sum_8(left);
240
333k
  sum_above = _mm_add_epi16(sum_above, sum_left);
241
242
333k
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
243
333k
  sum += 20;
244
333k
  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
245
333k
  const __m128i row = _mm_set1_epi8((int8_t)sum);
246
333k
  dc_store_32xh(&row, 8, dst, stride);
247
333k
}
248
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
249
250
void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
251
0
                                 const uint8_t *above, const uint8_t *left) {
252
0
  __m128i sum_above = dc_sum_32_sse2(above);
253
0
  const __m128i sum_left = dc_sum_16_sse2(left);
254
0
  sum_above = _mm_add_epi16(sum_above, sum_left);
255
256
0
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
257
0
  sum += 24;
258
0
  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
259
0
  const __m128i row = _mm_set1_epi8((int8_t)sum);
260
0
  dc_store_32xh(&row, 16, dst, stride);
261
0
}
262
263
void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
264
0
                                 const uint8_t *above, const uint8_t *left) {
265
0
  __m128i sum_above = dc_sum_32_sse2(above);
266
0
  const __m128i sum_left = dc_sum_64(left);
267
0
  sum_above = _mm_add_epi16(sum_above, sum_left);
268
269
0
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
270
0
  sum += 48;
271
0
  sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
272
0
  const __m128i row = _mm_set1_epi8((int8_t)sum);
273
0
  dc_store_32xh(&row, 64, dst, stride);
274
0
}
275
276
void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
277
0
                                 const uint8_t *above, const uint8_t *left) {
278
0
  __m128i sum_above = dc_sum_64(above);
279
0
  const __m128i sum_left = dc_sum_64(left);
280
0
  sum_above = _mm_add_epi16(sum_above, sum_left);
281
282
0
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
283
0
  sum += 64;
284
0
  sum /= 128;
285
0
  const __m128i row = _mm_set1_epi8((int8_t)sum);
286
0
  dc_store_64xh(&row, 64, dst, stride);
287
0
}
288
289
void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
290
0
                                 const uint8_t *above, const uint8_t *left) {
291
0
  __m128i sum_above = dc_sum_64(above);
292
0
  const __m128i sum_left = dc_sum_32_sse2(left);
293
0
  sum_above = _mm_add_epi16(sum_above, sum_left);
294
295
0
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
296
0
  sum += 48;
297
0
  sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
298
0
  const __m128i row = _mm_set1_epi8((int8_t)sum);
299
0
  dc_store_64xh(&row, 32, dst, stride);
300
0
}
301
302
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
303
void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
304
0
                                 const uint8_t *above, const uint8_t *left) {
305
0
  __m128i sum_above = dc_sum_64(above);
306
0
  const __m128i sum_left = dc_sum_16_sse2(left);
307
0
  sum_above = _mm_add_epi16(sum_above, sum_left);
308
309
0
  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
310
0
  sum += 40;
311
0
  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
312
0
  const __m128i row = _mm_set1_epi8((int8_t)sum);
313
0
  dc_store_64xh(&row, 16, dst, stride);
314
0
}
315
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
316
317
// -----------------------------------------------------------------------------
318
// DC_TOP
319
320
void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
321
3.61k
                                   const uint8_t *above, const uint8_t *left) {
322
3.61k
  (void)left;
323
3.61k
  __m128i sum_above = dc_sum_4(above);
324
3.61k
  const __m128i two = _mm_set1_epi16(2);
325
3.61k
  sum_above = _mm_add_epi16(sum_above, two);
326
3.61k
  sum_above = _mm_srai_epi16(sum_above, 2);
327
3.61k
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
328
3.61k
  sum_above = _mm_packus_epi16(sum_above, sum_above);
329
330
3.61k
  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
331
3.61k
  dc_store_4xh(pred, 8, dst, stride);
332
3.61k
}
333
334
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
335
void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
336
3.26k
                                    const uint8_t *above, const uint8_t *left) {
337
3.26k
  (void)left;
338
3.26k
  __m128i sum_above = dc_sum_4(above);
339
3.26k
  const __m128i two = _mm_set1_epi16(2);
340
3.26k
  sum_above = _mm_add_epi16(sum_above, two);
341
3.26k
  sum_above = _mm_srai_epi16(sum_above, 2);
342
3.26k
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
343
3.26k
  sum_above = _mm_packus_epi16(sum_above, sum_above);
344
345
3.26k
  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
346
3.26k
  dc_store_4xh(pred, 16, dst, stride);
347
3.26k
}
348
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
349
350
void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
351
2.60k
                                   const uint8_t *above, const uint8_t *left) {
352
2.60k
  (void)left;
353
2.60k
  __m128i sum_above = dc_sum_8(above);
354
2.60k
  const __m128i four = _mm_set1_epi16(4);
355
2.60k
  sum_above = _mm_add_epi16(sum_above, four);
356
2.60k
  sum_above = _mm_srai_epi16(sum_above, 3);
357
2.60k
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
358
2.60k
  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
359
2.60k
  dc_store_8xh(&row, 4, dst, stride);
360
2.60k
}
361
362
void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
363
2.37k
                                    const uint8_t *above, const uint8_t *left) {
364
2.37k
  (void)left;
365
2.37k
  __m128i sum_above = dc_sum_8(above);
366
2.37k
  const __m128i four = _mm_set1_epi16(4);
367
2.37k
  sum_above = _mm_add_epi16(sum_above, four);
368
2.37k
  sum_above = _mm_srai_epi16(sum_above, 3);
369
2.37k
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
370
2.37k
  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
371
2.37k
  dc_store_8xh(&row, 16, dst, stride);
372
2.37k
}
373
374
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
375
void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
376
3.25k
                                    const uint8_t *above, const uint8_t *left) {
377
3.25k
  (void)left;
378
3.25k
  __m128i sum_above = dc_sum_8(above);
379
3.25k
  const __m128i four = _mm_set1_epi16(4);
380
3.25k
  sum_above = _mm_add_epi16(sum_above, four);
381
3.25k
  sum_above = _mm_srai_epi16(sum_above, 3);
382
3.25k
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
383
3.25k
  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
384
3.25k
  dc_store_8xh(&row, 32, dst, stride);
385
3.25k
}
386
387
void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
388
12.7k
                                    const uint8_t *above, const uint8_t *left) {
389
12.7k
  (void)left;
390
12.7k
  __m128i sum_above = dc_sum_16_sse2(above);
391
12.7k
  const __m128i eight = _mm_set1_epi16(8);
392
12.7k
  sum_above = _mm_add_epi16(sum_above, eight);
393
12.7k
  sum_above = _mm_srai_epi16(sum_above, 4);
394
12.7k
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
395
12.7k
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
396
12.7k
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
397
12.7k
  dc_store_16xh(&row, 4, dst, stride);
398
12.7k
}
399
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
400
401
void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
402
1.80k
                                    const uint8_t *above, const uint8_t *left) {
403
1.80k
  (void)left;
404
1.80k
  __m128i sum_above = dc_sum_16_sse2(above);
405
1.80k
  const __m128i eight = _mm_set1_epi16(8);
406
1.80k
  sum_above = _mm_add_epi16(sum_above, eight);
407
1.80k
  sum_above = _mm_srai_epi16(sum_above, 4);
408
1.80k
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
409
1.80k
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
410
1.80k
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
411
1.80k
  dc_store_16xh(&row, 8, dst, stride);
412
1.80k
}
413
414
void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
415
                                     const uint8_t *above,
416
12.5k
                                     const uint8_t *left) {
417
12.5k
  (void)left;
418
12.5k
  __m128i sum_above = dc_sum_16_sse2(above);
419
12.5k
  const __m128i eight = _mm_set1_epi16(8);
420
12.5k
  sum_above = _mm_add_epi16(sum_above, eight);
421
12.5k
  sum_above = _mm_srai_epi16(sum_above, 4);
422
12.5k
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
423
12.5k
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
424
12.5k
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
425
12.5k
  dc_store_16xh(&row, 32, dst, stride);
426
12.5k
}
427
428
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
429
void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
430
                                     const uint8_t *above,
431
991
                                     const uint8_t *left) {
432
991
  (void)left;
433
991
  __m128i sum_above = dc_sum_16_sse2(above);
434
991
  const __m128i eight = _mm_set1_epi16(8);
435
991
  sum_above = _mm_add_epi16(sum_above, eight);
436
991
  sum_above = _mm_srai_epi16(sum_above, 4);
437
991
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
438
991
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
439
991
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
440
991
  dc_store_16xh(&row, 64, dst, stride);
441
991
}
442
443
void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
444
9.22k
                                    const uint8_t *above, const uint8_t *left) {
445
9.22k
  (void)left;
446
9.22k
  __m128i sum_above = dc_sum_32_sse2(above);
447
9.22k
  const __m128i sixteen = _mm_set1_epi16(16);
448
9.22k
  sum_above = _mm_add_epi16(sum_above, sixteen);
449
9.22k
  sum_above = _mm_srai_epi16(sum_above, 5);
450
9.22k
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
451
9.22k
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
452
9.22k
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
453
9.22k
  dc_store_32xh(&row, 8, dst, stride);
454
9.22k
}
455
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
456
457
void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
458
                                     const uint8_t *above,
459
0
                                     const uint8_t *left) {
460
0
  (void)left;
461
0
  __m128i sum_above = dc_sum_32_sse2(above);
462
0
  const __m128i sixteen = _mm_set1_epi16(16);
463
0
  sum_above = _mm_add_epi16(sum_above, sixteen);
464
0
  sum_above = _mm_srai_epi16(sum_above, 5);
465
0
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
466
0
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
467
0
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
468
0
  dc_store_32xh(&row, 16, dst, stride);
469
0
}
470
471
void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
472
                                     const uint8_t *above,
473
0
                                     const uint8_t *left) {
474
0
  (void)left;
475
0
  __m128i sum_above = dc_sum_32_sse2(above);
476
0
  const __m128i sixteen = _mm_set1_epi16(16);
477
0
  sum_above = _mm_add_epi16(sum_above, sixteen);
478
0
  sum_above = _mm_srai_epi16(sum_above, 5);
479
0
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
480
0
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
481
0
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
482
0
  dc_store_32xh(&row, 64, dst, stride);
483
0
}
484
485
void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
486
                                     const uint8_t *above,
487
0
                                     const uint8_t *left) {
488
0
  (void)left;
489
0
  __m128i sum_above = dc_sum_64(above);
490
0
  const __m128i thirtytwo = _mm_set1_epi16(32);
491
0
  sum_above = _mm_add_epi16(sum_above, thirtytwo);
492
0
  sum_above = _mm_srai_epi16(sum_above, 6);
493
0
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
494
0
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
495
0
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
496
0
  dc_store_64xh(&row, 64, dst, stride);
497
0
}
498
499
void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
500
                                     const uint8_t *above,
501
0
                                     const uint8_t *left) {
502
0
  (void)left;
503
0
  __m128i sum_above = dc_sum_64(above);
504
0
  const __m128i thirtytwo = _mm_set1_epi16(32);
505
0
  sum_above = _mm_add_epi16(sum_above, thirtytwo);
506
0
  sum_above = _mm_srai_epi16(sum_above, 6);
507
0
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
508
0
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
509
0
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
510
0
  dc_store_64xh(&row, 32, dst, stride);
511
0
}
512
513
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
514
void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
515
                                     const uint8_t *above,
516
0
                                     const uint8_t *left) {
517
0
  (void)left;
518
0
  __m128i sum_above = dc_sum_64(above);
519
0
  const __m128i thirtytwo = _mm_set1_epi16(32);
520
0
  sum_above = _mm_add_epi16(sum_above, thirtytwo);
521
0
  sum_above = _mm_srai_epi16(sum_above, 6);
522
0
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
523
0
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
524
0
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
525
0
  dc_store_64xh(&row, 16, dst, stride);
526
0
}
527
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
528
529
// -----------------------------------------------------------------------------
530
// DC_LEFT
531
532
void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
533
4.75k
                                    const uint8_t *above, const uint8_t *left) {
534
4.75k
  (void)above;
535
4.75k
  __m128i sum_left = dc_sum_8(left);
536
4.75k
  const __m128i four = _mm_set1_epi16(4);
537
4.75k
  sum_left = _mm_add_epi16(sum_left, four);
538
4.75k
  sum_left = _mm_srai_epi16(sum_left, 3);
539
4.75k
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
540
4.75k
  sum_left = _mm_packus_epi16(sum_left, sum_left);
541
542
4.75k
  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
543
4.75k
  dc_store_4xh(pred, 8, dst, stride);
544
4.75k
}
545
546
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
547
void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
548
                                     const uint8_t *above,
549
4.58k
                                     const uint8_t *left) {
550
4.58k
  (void)above;
551
4.58k
  __m128i sum_left = dc_sum_16_sse2(left);
552
4.58k
  const __m128i eight = _mm_set1_epi16(8);
553
4.58k
  sum_left = _mm_add_epi16(sum_left, eight);
554
4.58k
  sum_left = _mm_srai_epi16(sum_left, 4);
555
4.58k
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
556
4.58k
  sum_left = _mm_packus_epi16(sum_left, sum_left);
557
558
4.58k
  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
559
4.58k
  dc_store_4xh(pred, 16, dst, stride);
560
4.58k
}
561
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
562
563
void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
564
4.42k
                                    const uint8_t *above, const uint8_t *left) {
565
4.42k
  (void)above;
566
4.42k
  __m128i sum_left = dc_sum_4(left);
567
4.42k
  const __m128i two = _mm_set1_epi16(2);
568
4.42k
  sum_left = _mm_add_epi16(sum_left, two);
569
4.42k
  sum_left = _mm_srai_epi16(sum_left, 2);
570
4.42k
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
571
4.42k
  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
572
4.42k
  dc_store_8xh(&row, 4, dst, stride);
573
4.42k
}
574
575
void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
576
                                     const uint8_t *above,
577
2.41k
                                     const uint8_t *left) {
578
2.41k
  (void)above;
579
2.41k
  __m128i sum_left = dc_sum_16_sse2(left);
580
2.41k
  const __m128i eight = _mm_set1_epi16(8);
581
2.41k
  sum_left = _mm_add_epi16(sum_left, eight);
582
2.41k
  sum_left = _mm_srai_epi16(sum_left, 4);
583
2.41k
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
584
2.41k
  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
585
2.41k
  dc_store_8xh(&row, 16, dst, stride);
586
2.41k
}
587
588
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
589
void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
590
                                     const uint8_t *above,
591
12.7k
                                     const uint8_t *left) {
592
12.7k
  (void)above;
593
12.7k
  __m128i sum_left = dc_sum_32_sse2(left);
594
12.7k
  const __m128i sixteen = _mm_set1_epi16(16);
595
12.7k
  sum_left = _mm_add_epi16(sum_left, sixteen);
596
12.7k
  sum_left = _mm_srai_epi16(sum_left, 5);
597
12.7k
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
598
12.7k
  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
599
12.7k
  dc_store_8xh(&row, 32, dst, stride);
600
12.7k
}
601
602
void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
603
                                     const uint8_t *above,
604
3.75k
                                     const uint8_t *left) {
605
3.75k
  (void)above;
606
3.75k
  __m128i sum_left = dc_sum_4(left);
607
3.75k
  const __m128i two = _mm_set1_epi16(2);
608
3.75k
  sum_left = _mm_add_epi16(sum_left, two);
609
3.75k
  sum_left = _mm_srai_epi16(sum_left, 2);
610
3.75k
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
611
3.75k
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
612
3.75k
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
613
3.75k
  dc_store_16xh(&row, 4, dst, stride);
614
3.75k
}
615
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
616
617
void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
618
                                     const uint8_t *above,
619
8.16k
                                     const uint8_t *left) {
620
8.16k
  (void)above;
621
8.16k
  __m128i sum_left = dc_sum_8(left);
622
8.16k
  const __m128i four = _mm_set1_epi16(4);
623
8.16k
  sum_left = _mm_add_epi16(sum_left, four);
624
8.16k
  sum_left = _mm_srai_epi16(sum_left, 3);
625
8.16k
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
626
8.16k
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
627
8.16k
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
628
8.16k
  dc_store_16xh(&row, 8, dst, stride);
629
8.16k
}
630
631
void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
632
                                      const uint8_t *above,
633
4.98k
                                      const uint8_t *left) {
634
4.98k
  (void)above;
635
4.98k
  __m128i sum_left = dc_sum_32_sse2(left);
636
4.98k
  const __m128i sixteen = _mm_set1_epi16(16);
637
4.98k
  sum_left = _mm_add_epi16(sum_left, sixteen);
638
4.98k
  sum_left = _mm_srai_epi16(sum_left, 5);
639
4.98k
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
640
4.98k
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
641
4.98k
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
642
4.98k
  dc_store_16xh(&row, 32, dst, stride);
643
4.98k
}
644
645
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
646
void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
647
                                      const uint8_t *above,
648
350
                                      const uint8_t *left) {
649
350
  (void)above;
650
350
  __m128i sum_left = dc_sum_64(left);
651
350
  const __m128i thirtytwo = _mm_set1_epi16(32);
652
350
  sum_left = _mm_add_epi16(sum_left, thirtytwo);
653
350
  sum_left = _mm_srai_epi16(sum_left, 6);
654
350
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
655
350
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
656
350
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
657
350
  dc_store_16xh(&row, 64, dst, stride);
658
350
}
659
660
void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
661
                                     const uint8_t *above,
662
2.57k
                                     const uint8_t *left) {
663
2.57k
  (void)above;
664
2.57k
  __m128i sum_left = dc_sum_8(left);
665
2.57k
  const __m128i four = _mm_set1_epi16(4);
666
2.57k
  sum_left = _mm_add_epi16(sum_left, four);
667
2.57k
  sum_left = _mm_srai_epi16(sum_left, 3);
668
2.57k
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
669
2.57k
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
670
2.57k
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
671
2.57k
  dc_store_32xh(&row, 8, dst, stride);
672
2.57k
}
673
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
674
675
void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
676
                                      const uint8_t *above,
677
0
                                      const uint8_t *left) {
678
0
  (void)above;
679
0
  __m128i sum_left = dc_sum_16_sse2(left);
680
0
  const __m128i eight = _mm_set1_epi16(8);
681
0
  sum_left = _mm_add_epi16(sum_left, eight);
682
0
  sum_left = _mm_srai_epi16(sum_left, 4);
683
0
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
684
0
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
685
0
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
686
0
  dc_store_32xh(&row, 16, dst, stride);
687
0
}
688
689
void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
690
                                      const uint8_t *above,
691
0
                                      const uint8_t *left) {
692
0
  (void)above;
693
0
  __m128i sum_left = dc_sum_64(left);
694
0
  const __m128i thirtytwo = _mm_set1_epi16(32);
695
0
  sum_left = _mm_add_epi16(sum_left, thirtytwo);
696
0
  sum_left = _mm_srai_epi16(sum_left, 6);
697
0
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
698
0
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
699
0
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
700
0
  dc_store_32xh(&row, 64, dst, stride);
701
0
}
702
703
void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
704
                                      const uint8_t *above,
705
0
                                      const uint8_t *left) {
706
0
  (void)above;
707
0
  __m128i sum_left = dc_sum_64(left);
708
0
  const __m128i thirtytwo = _mm_set1_epi16(32);
709
0
  sum_left = _mm_add_epi16(sum_left, thirtytwo);
710
0
  sum_left = _mm_srai_epi16(sum_left, 6);
711
0
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
712
0
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
713
0
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
714
0
  dc_store_64xh(&row, 64, dst, stride);
715
0
}
716
717
void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
718
                                      const uint8_t *above,
719
0
                                      const uint8_t *left) {
720
0
  (void)above;
721
0
  __m128i sum_left = dc_sum_32_sse2(left);
722
0
  const __m128i sixteen = _mm_set1_epi16(16);
723
0
  sum_left = _mm_add_epi16(sum_left, sixteen);
724
0
  sum_left = _mm_srai_epi16(sum_left, 5);
725
0
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
726
0
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
727
0
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
728
0
  dc_store_64xh(&row, 32, dst, stride);
729
0
}
730
731
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
732
void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
733
                                      const uint8_t *above,
734
0
                                      const uint8_t *left) {
735
0
  (void)above;
736
0
  __m128i sum_left = dc_sum_16_sse2(left);
737
0
  const __m128i eight = _mm_set1_epi16(8);
738
0
  sum_left = _mm_add_epi16(sum_left, eight);
739
0
  sum_left = _mm_srai_epi16(sum_left, 4);
740
0
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
741
0
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
742
0
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
743
0
  dc_store_64xh(&row, 16, dst, stride);
744
0
}
745
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
746
747
// -----------------------------------------------------------------------------
748
// DC_128
749
750
void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
751
2.49k
                                   const uint8_t *above, const uint8_t *left) {
752
2.49k
  (void)above;
753
2.49k
  (void)left;
754
2.49k
  const uint32_t pred = 0x80808080;
755
2.49k
  dc_store_4xh(pred, 8, dst, stride);
756
2.49k
}
757
758
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
759
void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
760
261
                                    const uint8_t *above, const uint8_t *left) {
761
261
  (void)above;
762
261
  (void)left;
763
261
  const uint32_t pred = 0x80808080;
764
261
  dc_store_4xh(pred, 16, dst, stride);
765
261
}
766
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
767
768
void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
769
356
                                   const uint8_t *above, const uint8_t *left) {
770
356
  (void)above;
771
356
  (void)left;
772
356
  const __m128i row = _mm_set1_epi8((int8_t)128);
773
356
  dc_store_8xh(&row, 4, dst, stride);
774
356
}
775
776
void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
777
1.19k
                                    const uint8_t *above, const uint8_t *left) {
778
1.19k
  (void)above;
779
1.19k
  (void)left;
780
1.19k
  const __m128i row = _mm_set1_epi8((int8_t)128);
781
1.19k
  dc_store_8xh(&row, 16, dst, stride);
782
1.19k
}
783
784
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
785
void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
786
275
                                    const uint8_t *above, const uint8_t *left) {
787
275
  (void)above;
788
275
  (void)left;
789
275
  const __m128i row = _mm_set1_epi8((int8_t)128);
790
275
  dc_store_8xh(&row, 32, dst, stride);
791
275
}
792
793
void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
794
99
                                    const uint8_t *above, const uint8_t *left) {
795
99
  (void)above;
796
99
  (void)left;
797
99
  const __m128i row = _mm_set1_epi8((int8_t)128);
798
99
  dc_store_16xh(&row, 4, dst, stride);
799
99
}
800
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
801
802
void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
803
211
                                    const uint8_t *above, const uint8_t *left) {
804
211
  (void)above;
805
211
  (void)left;
806
211
  const __m128i row = _mm_set1_epi8((int8_t)128);
807
211
  dc_store_16xh(&row, 8, dst, stride);
808
211
}
809
810
void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
811
                                     const uint8_t *above,
812
1.23k
                                     const uint8_t *left) {
813
1.23k
  (void)above;
814
1.23k
  (void)left;
815
1.23k
  const __m128i row = _mm_set1_epi8((int8_t)128);
816
1.23k
  dc_store_16xh(&row, 32, dst, stride);
817
1.23k
}
818
819
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
820
void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
821
                                     const uint8_t *above,
822
76
                                     const uint8_t *left) {
823
76
  (void)above;
824
76
  (void)left;
825
76
  const __m128i row = _mm_set1_epi8((int8_t)128);
826
76
  dc_store_16xh(&row, 64, dst, stride);
827
76
}
828
829
void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
830
66
                                    const uint8_t *above, const uint8_t *left) {
831
66
  (void)above;
832
66
  (void)left;
833
66
  const __m128i row = _mm_set1_epi8((int8_t)128);
834
66
  dc_store_32xh(&row, 8, dst, stride);
835
66
}
836
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
837
838
void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
839
                                     const uint8_t *above,
840
0
                                     const uint8_t *left) {
841
0
  (void)above;
842
0
  (void)left;
843
0
  const __m128i row = _mm_set1_epi8((int8_t)128);
844
0
  dc_store_32xh(&row, 16, dst, stride);
845
0
}
846
847
void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
848
                                     const uint8_t *above,
849
0
                                     const uint8_t *left) {
850
0
  (void)above;
851
0
  (void)left;
852
0
  const __m128i row = _mm_set1_epi8((int8_t)128);
853
0
  dc_store_32xh(&row, 64, dst, stride);
854
0
}
855
856
void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
857
                                     const uint8_t *above,
858
0
                                     const uint8_t *left) {
859
0
  (void)above;
860
0
  (void)left;
861
0
  const __m128i row = _mm_set1_epi8((int8_t)128);
862
0
  dc_store_64xh(&row, 64, dst, stride);
863
0
}
864
865
void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
866
                                     const uint8_t *above,
867
0
                                     const uint8_t *left) {
868
0
  (void)above;
869
0
  (void)left;
870
0
  const __m128i row = _mm_set1_epi8((int8_t)128);
871
0
  dc_store_64xh(&row, 32, dst, stride);
872
0
}
873
874
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
875
void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
876
                                     const uint8_t *above,
877
0
                                     const uint8_t *left) {
878
0
  (void)above;
879
0
  (void)left;
880
0
  const __m128i row = _mm_set1_epi8((int8_t)128);
881
0
  dc_store_64xh(&row, 16, dst, stride);
882
0
}
883
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
884
885
// -----------------------------------------------------------------------------
886
// V_PRED
887
888
void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
889
26.2k
                              const uint8_t *above, const uint8_t *left) {
890
26.2k
  const uint32_t pred = *(uint32_t *)above;
891
26.2k
  (void)left;
892
26.2k
  dc_store_4xh(pred, 8, dst, stride);
893
26.2k
}
894
895
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
896
void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
897
6.20k
                               const uint8_t *above, const uint8_t *left) {
898
6.20k
  const uint32_t pred = *(uint32_t *)above;
899
6.20k
  (void)left;
900
6.20k
  dc_store_4xh(pred, 16, dst, stride);
901
6.20k
}
902
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
903
904
void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
905
39.5k
                              const uint8_t *above, const uint8_t *left) {
906
39.5k
  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
907
39.5k
  (void)left;
908
39.5k
  dc_store_8xh(&row, 4, dst, stride);
909
39.5k
}
910
911
void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
912
19.0k
                               const uint8_t *above, const uint8_t *left) {
913
19.0k
  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
914
19.0k
  (void)left;
915
19.0k
  dc_store_8xh(&row, 16, dst, stride);
916
19.0k
}
917
918
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
919
void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
920
3.70k
                               const uint8_t *above, const uint8_t *left) {
921
3.70k
  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
922
3.70k
  (void)left;
923
3.70k
  dc_store_8xh(&row, 32, dst, stride);
924
3.70k
}
925
926
void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
927
34.3k
                               const uint8_t *above, const uint8_t *left) {
928
34.3k
  const __m128i row = _mm_load_si128((__m128i const *)above);
929
34.3k
  (void)left;
930
34.3k
  dc_store_16xh(&row, 4, dst, stride);
931
34.3k
}
932
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
933
934
void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
935
37.6k
                               const uint8_t *above, const uint8_t *left) {
936
37.6k
  const __m128i row = _mm_load_si128((__m128i const *)above);
937
37.6k
  (void)left;
938
37.6k
  dc_store_16xh(&row, 8, dst, stride);
939
37.6k
}
940
941
void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
942
9.00k
                                const uint8_t *above, const uint8_t *left) {
943
9.00k
  const __m128i row = _mm_load_si128((__m128i const *)above);
944
9.00k
  (void)left;
945
9.00k
  dc_store_16xh(&row, 32, dst, stride);
946
9.00k
}
947
948
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
949
void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
950
1.38k
                                const uint8_t *above, const uint8_t *left) {
951
1.38k
  const __m128i row = _mm_load_si128((__m128i const *)above);
952
1.38k
  (void)left;
953
1.38k
  dc_store_16xh(&row, 64, dst, stride);
954
1.38k
}
955
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
956
957
static inline void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
958
12.0k
                                    const uint8_t *above, int height) {
959
12.0k
  const __m128i row0 = _mm_load_si128((__m128i const *)above);
960
12.0k
  const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
961
108k
  for (int i = 0; i < height; ++i) {
962
96.7k
    _mm_store_si128((__m128i *)dst, row0);
963
96.7k
    _mm_store_si128((__m128i *)(dst + 16), row1);
964
96.7k
    dst += stride;
965
96.7k
  }
966
12.0k
}
967
968
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
969
void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
970
12.0k
                               const uint8_t *above, const uint8_t *left) {
971
12.0k
  (void)left;
972
12.0k
  v_predictor_32xh(dst, stride, above, 8);
973
12.0k
}
974
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
975
976
void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
977
0
                                const uint8_t *above, const uint8_t *left) {
978
0
  (void)left;
979
0
  v_predictor_32xh(dst, stride, above, 16);
980
0
}
981
982
void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
983
0
                                const uint8_t *above, const uint8_t *left) {
984
0
  (void)left;
985
0
  v_predictor_32xh(dst, stride, above, 64);
986
0
}
987
988
static inline void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
989
0
                                    const uint8_t *above, int height) {
990
0
  const __m128i row0 = _mm_load_si128((__m128i const *)above);
991
0
  const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
992
0
  const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32));
993
0
  const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48));
994
0
  for (int i = 0; i < height; ++i) {
995
0
    _mm_store_si128((__m128i *)dst, row0);
996
0
    _mm_store_si128((__m128i *)(dst + 16), row1);
997
0
    _mm_store_si128((__m128i *)(dst + 32), row2);
998
0
    _mm_store_si128((__m128i *)(dst + 48), row3);
999
0
    dst += stride;
1000
0
  }
1001
0
}
1002
1003
void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
1004
0
                                const uint8_t *above, const uint8_t *left) {
1005
0
  (void)left;
1006
0
  v_predictor_64xh(dst, stride, above, 64);
1007
0
}
1008
1009
void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
1010
0
                                const uint8_t *above, const uint8_t *left) {
1011
0
  (void)left;
1012
0
  v_predictor_64xh(dst, stride, above, 32);
1013
0
}
1014
1015
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1016
void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
1017
0
                                const uint8_t *above, const uint8_t *left) {
1018
0
  (void)left;
1019
0
  v_predictor_64xh(dst, stride, above, 16);
1020
0
}
1021
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1022
1023
// -----------------------------------------------------------------------------
1024
// H_PRED
1025
1026
void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
1027
39.0k
                              const uint8_t *above, const uint8_t *left) {
1028
39.0k
  (void)above;
1029
39.0k
  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
1030
39.0k
  left_col = _mm_unpacklo_epi8(left_col, left_col);
1031
39.0k
  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
1032
39.0k
  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
1033
39.0k
  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
1034
39.0k
  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
1035
39.0k
  *(int *)dst = _mm_cvtsi128_si32(row0);
1036
39.0k
  dst += stride;
1037
39.0k
  *(int *)dst = _mm_cvtsi128_si32(row1);
1038
39.0k
  dst += stride;
1039
39.0k
  *(int *)dst = _mm_cvtsi128_si32(row2);
1040
39.0k
  dst += stride;
1041
39.0k
  *(int *)dst = _mm_cvtsi128_si32(row3);
1042
39.0k
  dst += stride;
1043
39.0k
  left_col = _mm_unpackhi_epi64(left_col, left_col);
1044
39.0k
  row0 = _mm_shufflelo_epi16(left_col, 0);
1045
39.0k
  row1 = _mm_shufflelo_epi16(left_col, 0x55);
1046
39.0k
  row2 = _mm_shufflelo_epi16(left_col, 0xaa);
1047
39.0k
  row3 = _mm_shufflelo_epi16(left_col, 0xff);
1048
39.0k
  *(int *)dst = _mm_cvtsi128_si32(row0);
1049
39.0k
  dst += stride;
1050
39.0k
  *(int *)dst = _mm_cvtsi128_si32(row1);
1051
39.0k
  dst += stride;
1052
39.0k
  *(int *)dst = _mm_cvtsi128_si32(row2);
1053
39.0k
  dst += stride;
1054
39.0k
  *(int *)dst = _mm_cvtsi128_si32(row3);
1055
39.0k
}
1056
1057
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1058
void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
1059
20.1k
                               const uint8_t *above, const uint8_t *left) {
1060
20.1k
  (void)above;
1061
20.1k
  const __m128i left_col = _mm_load_si128((__m128i const *)left);
1062
20.1k
  __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
1063
20.1k
  __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
1064
1065
20.1k
  __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
1066
20.1k
  __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
1067
20.1k
  __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
1068
20.1k
  __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
1069
20.1k
  *(int *)dst = _mm_cvtsi128_si32(row0);
1070
20.1k
  dst += stride;
1071
20.1k
  *(int *)dst = _mm_cvtsi128_si32(row1);
1072
20.1k
  dst += stride;
1073
20.1k
  *(int *)dst = _mm_cvtsi128_si32(row2);
1074
20.1k
  dst += stride;
1075
20.1k
  *(int *)dst = _mm_cvtsi128_si32(row3);
1076
20.1k
  dst += stride;
1077
1078
20.1k
  left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
1079
20.1k
  row0 = _mm_shufflelo_epi16(left_col_low, 0);
1080
20.1k
  row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
1081
20.1k
  row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
1082
20.1k
  row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
1083
20.1k
  *(int *)dst = _mm_cvtsi128_si32(row0);
1084
20.1k
  dst += stride;
1085
20.1k
  *(int *)dst = _mm_cvtsi128_si32(row1);
1086
20.1k
  dst += stride;
1087
20.1k
  *(int *)dst = _mm_cvtsi128_si32(row2);
1088
20.1k
  dst += stride;
1089
20.1k
  *(int *)dst = _mm_cvtsi128_si32(row3);
1090
20.1k
  dst += stride;
1091
1092
20.1k
  row0 = _mm_shufflelo_epi16(left_col_high, 0);
1093
20.1k
  row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
1094
20.1k
  row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
1095
20.1k
  row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
1096
20.1k
  *(int *)dst = _mm_cvtsi128_si32(row0);
1097
20.1k
  dst += stride;
1098
20.1k
  *(int *)dst = _mm_cvtsi128_si32(row1);
1099
20.1k
  dst += stride;
1100
20.1k
  *(int *)dst = _mm_cvtsi128_si32(row2);
1101
20.1k
  dst += stride;
1102
20.1k
  *(int *)dst = _mm_cvtsi128_si32(row3);
1103
20.1k
  dst += stride;
1104
1105
20.1k
  left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
1106
20.1k
  row0 = _mm_shufflelo_epi16(left_col_high, 0);
1107
20.1k
  row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
1108
20.1k
  row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
1109
20.1k
  row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
1110
20.1k
  *(int *)dst = _mm_cvtsi128_si32(row0);
1111
20.1k
  dst += stride;
1112
20.1k
  *(int *)dst = _mm_cvtsi128_si32(row1);
1113
20.1k
  dst += stride;
1114
20.1k
  *(int *)dst = _mm_cvtsi128_si32(row2);
1115
20.1k
  dst += stride;
1116
20.1k
  *(int *)dst = _mm_cvtsi128_si32(row3);
1117
20.1k
}
1118
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1119
1120
void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
1121
71.9k
                              const uint8_t *above, const uint8_t *left) {
1122
71.9k
  (void)above;
1123
71.9k
  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
1124
71.9k
  left_col = _mm_unpacklo_epi8(left_col, left_col);
1125
71.9k
  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
1126
71.9k
  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
1127
71.9k
  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
1128
71.9k
  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
1129
71.9k
  _mm_storel_epi64((__m128i *)dst, row0);
1130
71.9k
  dst += stride;
1131
71.9k
  _mm_storel_epi64((__m128i *)dst, row1);
1132
71.9k
  dst += stride;
1133
71.9k
  _mm_storel_epi64((__m128i *)dst, row2);
1134
71.9k
  dst += stride;
1135
71.9k
  _mm_storel_epi64((__m128i *)dst, row3);
1136
71.9k
}
1137
1138
static inline void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
1139
                                      const uint8_t *above, const uint8_t *left,
1140
52.7k
                                      int count) {
1141
52.7k
  (void)above;
1142
117k
  for (int i = 0; i < count; ++i) {
1143
64.8k
    const __m128i left_col = _mm_load_si128((__m128i const *)left);
1144
64.8k
    __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
1145
64.8k
    __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
1146
1147
64.8k
    __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
1148
64.8k
    __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
1149
64.8k
    __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
1150
64.8k
    __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
1151
64.8k
    _mm_storel_epi64((__m128i *)dst, row0);
1152
64.8k
    dst += stride;
1153
64.8k
    _mm_storel_epi64((__m128i *)dst, row1);
1154
64.8k
    dst += stride;
1155
64.8k
    _mm_storel_epi64((__m128i *)dst, row2);
1156
64.8k
    dst += stride;
1157
64.8k
    _mm_storel_epi64((__m128i *)dst, row3);
1158
64.8k
    dst += stride;
1159
1160
64.8k
    left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
1161
64.8k
    row0 = _mm_shufflelo_epi16(left_col_low, 0);
1162
64.8k
    row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
1163
64.8k
    row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
1164
64.8k
    row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
1165
64.8k
    _mm_storel_epi64((__m128i *)dst, row0);
1166
64.8k
    dst += stride;
1167
64.8k
    _mm_storel_epi64((__m128i *)dst, row1);
1168
64.8k
    dst += stride;
1169
64.8k
    _mm_storel_epi64((__m128i *)dst, row2);
1170
64.8k
    dst += stride;
1171
64.8k
    _mm_storel_epi64((__m128i *)dst, row3);
1172
64.8k
    dst += stride;
1173
1174
64.8k
    row0 = _mm_shufflelo_epi16(left_col_high, 0);
1175
64.8k
    row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
1176
64.8k
    row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
1177
64.8k
    row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
1178
64.8k
    _mm_storel_epi64((__m128i *)dst, row0);
1179
64.8k
    dst += stride;
1180
64.8k
    _mm_storel_epi64((__m128i *)dst, row1);
1181
64.8k
    dst += stride;
1182
64.8k
    _mm_storel_epi64((__m128i *)dst, row2);
1183
64.8k
    dst += stride;
1184
64.8k
    _mm_storel_epi64((__m128i *)dst, row3);
1185
64.8k
    dst += stride;
1186
1187
64.8k
    left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
1188
64.8k
    row0 = _mm_shufflelo_epi16(left_col_high, 0);
1189
64.8k
    row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
1190
64.8k
    row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
1191
64.8k
    row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
1192
64.8k
    _mm_storel_epi64((__m128i *)dst, row0);
1193
64.8k
    dst += stride;
1194
64.8k
    _mm_storel_epi64((__m128i *)dst, row1);
1195
64.8k
    dst += stride;
1196
64.8k
    _mm_storel_epi64((__m128i *)dst, row2);
1197
64.8k
    dst += stride;
1198
64.8k
    _mm_storel_epi64((__m128i *)dst, row3);
1199
64.8k
    dst += stride;
1200
64.8k
    left += 16;
1201
64.8k
  }
1202
52.7k
}
1203
1204
void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
1205
40.6k
                               const uint8_t *above, const uint8_t *left) {
1206
40.6k
  h_predictor_8x16xc(dst, stride, above, left, 1);
1207
40.6k
}
1208
1209
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1210
void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
1211
12.0k
                               const uint8_t *above, const uint8_t *left) {
1212
12.0k
  h_predictor_8x16xc(dst, stride, above, left, 2);
1213
12.0k
}
1214
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1215
1216
static inline void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
1217
539k
                                     ptrdiff_t stride) {
1218
539k
  int i;
1219
2.69M
  for (i = 0; i < h; ++i) {
1220
2.15M
    _mm_store_si128((__m128i *)dst, row[i]);
1221
2.15M
    dst += stride;
1222
2.15M
  }
1223
539k
}
1224
1225
609k
static inline void repeat_low_4pixels(const __m128i *x, __m128i *row) {
1226
609k
  const __m128i u0 = _mm_shufflelo_epi16(*x, 0);
1227
609k
  const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55);
1228
609k
  const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa);
1229
609k
  const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff);
1230
1231
609k
  row[0] = _mm_unpacklo_epi64(u0, u0);
1232
609k
  row[1] = _mm_unpacklo_epi64(u1, u1);
1233
609k
  row[2] = _mm_unpacklo_epi64(u2, u2);
1234
609k
  row[3] = _mm_unpacklo_epi64(u3, u3);
1235
609k
}
1236
1237
473k
static inline void repeat_high_4pixels(const __m128i *x, __m128i *row) {
1238
473k
  const __m128i u0 = _mm_shufflehi_epi16(*x, 0);
1239
473k
  const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55);
1240
473k
  const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa);
1241
473k
  const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff);
1242
1243
473k
  row[0] = _mm_unpackhi_epi64(u0, u0);
1244
473k
  row[1] = _mm_unpackhi_epi64(u1, u1);
1245
473k
  row[2] = _mm_unpackhi_epi64(u2, u2);
1246
473k
  row[3] = _mm_unpackhi_epi64(u3, u3);
1247
473k
}
1248
1249
// Process 16x8, first 4 rows
1250
// Use first 8 bytes of left register: xxxxxxxx33221100
1251
static inline void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
1252
337k
                                       ptrdiff_t stride) {
1253
337k
  __m128i row[4];
1254
337k
  repeat_low_4pixels(left, row);
1255
337k
  h_pred_store_16xh(row, 4, dst, stride);
1256
337k
}
1257
1258
// Process 16x8, second 4 rows
1259
// Use second 8 bytes of left register: 77665544xxxxxxxx
1260
static inline void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
1261
201k
                                       ptrdiff_t stride) {
1262
201k
  __m128i row[4];
1263
201k
  repeat_high_4pixels(left, row);
1264
201k
  h_pred_store_16xh(row, 4, dst, stride);
1265
201k
}
1266
1267
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1268
void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
1269
136k
                               const uint8_t *above, const uint8_t *left) {
1270
136k
  (void)above;
1271
136k
  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
1272
136k
  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
1273
136k
  h_prediction_16x8_1(&left_col_8p, dst, stride);
1274
136k
}
1275
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1276
1277
void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
1278
88.5k
                               const uint8_t *above, const uint8_t *left) {
1279
88.5k
  (void)above;
1280
88.5k
  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
1281
88.5k
  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
1282
88.5k
  h_prediction_16x8_1(&left_col_8p, dst, stride);
1283
88.5k
  dst += stride << 2;
1284
88.5k
  h_prediction_16x8_2(&left_col_8p, dst, stride);
1285
88.5k
}
1286
1287
static inline void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
1288
25.8k
                                    const uint8_t *left, int count) {
1289
25.8k
  int i = 0;
1290
56.4k
  do {
1291
56.4k
    const __m128i left_col = _mm_load_si128((const __m128i *)left);
1292
56.4k
    const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col);
1293
56.4k
    h_prediction_16x8_1(&left_col_8p_lo, dst, stride);
1294
56.4k
    dst += stride << 2;
1295
56.4k
    h_prediction_16x8_2(&left_col_8p_lo, dst, stride);
1296
56.4k
    dst += stride << 2;
1297
1298
56.4k
    const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col);
1299
56.4k
    h_prediction_16x8_1(&left_col_8p_hi, dst, stride);
1300
56.4k
    dst += stride << 2;
1301
56.4k
    h_prediction_16x8_2(&left_col_8p_hi, dst, stride);
1302
56.4k
    dst += stride << 2;
1303
1304
56.4k
    left += 16;
1305
56.4k
    i++;
1306
56.4k
  } while (i < count);
1307
25.8k
}
1308
1309
void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
1310
23.5k
                                const uint8_t *above, const uint8_t *left) {
1311
23.5k
  (void)above;
1312
23.5k
  h_predictor_16xh(dst, stride, left, 2);
1313
23.5k
}
1314
1315
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1316
void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
1317
2.34k
                                const uint8_t *above, const uint8_t *left) {
1318
2.34k
  (void)above;
1319
2.34k
  h_predictor_16xh(dst, stride, left, 4);
1320
2.34k
}
1321
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1322
1323
static inline void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
1324
544k
                                     ptrdiff_t stride) {
1325
544k
  int i;
1326
2.72M
  for (i = 0; i < h; ++i) {
1327
2.17M
    _mm_store_si128((__m128i *)dst, row[i]);
1328
2.17M
    _mm_store_si128((__m128i *)(dst + 16), row[i]);
1329
2.17M
    dst += stride;
1330
2.17M
  }
1331
544k
}
1332
1333
// Process 32x8, first 4 rows
1334
// Use first 8 bytes of left register: xxxxxxxx33221100
1335
static inline void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
1336
272k
                                       ptrdiff_t stride) {
1337
272k
  __m128i row[4];
1338
272k
  repeat_low_4pixels(left, row);
1339
272k
  h_pred_store_32xh(row, 4, dst, stride);
1340
272k
}
1341
1342
// Process 32x8, second 4 rows
1343
// Use second 8 bytes of left register: 77665544xxxxxxxx
1344
static inline void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
1345
272k
                                       ptrdiff_t stride) {
1346
272k
  __m128i row[4];
1347
272k
  repeat_high_4pixels(left, row);
1348
272k
  h_pred_store_32xh(row, 4, dst, stride);
1349
272k
}
1350
1351
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1352
void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
1353
165k
                               const uint8_t *above, const uint8_t *left) {
1354
165k
  __m128i left_col, left_col_8p;
1355
165k
  (void)above;
1356
1357
165k
  left_col = _mm_load_si128((const __m128i *)left);
1358
1359
165k
  left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
1360
165k
  h_prediction_32x8_1(&left_col_8p, dst, stride);
1361
165k
  dst += stride << 2;
1362
165k
  h_prediction_32x8_2(&left_col_8p, dst, stride);
1363
165k
}
1364
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1365
1366
void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
1367
53.4k
                                const uint8_t *above, const uint8_t *left) {
1368
53.4k
  __m128i left_col, left_col_8p;
1369
53.4k
  (void)above;
1370
1371
53.4k
  left_col = _mm_load_si128((const __m128i *)left);
1372
1373
53.4k
  left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
1374
53.4k
  h_prediction_32x8_1(&left_col_8p, dst, stride);
1375
53.4k
  dst += stride << 2;
1376
53.4k
  h_prediction_32x8_2(&left_col_8p, dst, stride);
1377
53.4k
  dst += stride << 2;
1378
1379
53.4k
  left_col_8p = _mm_unpackhi_epi8(left_col, left_col);
1380
53.4k
  h_prediction_32x8_1(&left_col_8p, dst, stride);
1381
53.4k
  dst += stride << 2;
1382
53.4k
  h_prediction_32x8_2(&left_col_8p, dst, stride);
1383
53.4k
}
1384
1385
static inline void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
1386
2.77k
                                    const uint8_t *left, int height) {
1387
2.77k
  int i = height >> 2;
1388
44.3k
  do {
1389
44.3k
    __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]);
1390
44.3k
    left4 = _mm_unpacklo_epi8(left4, left4);
1391
44.3k
    left4 = _mm_unpacklo_epi8(left4, left4);
1392
44.3k
    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
1393
44.3k
    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
1394
44.3k
    _mm_store_si128((__m128i *)dst, r0);
1395
44.3k
    _mm_store_si128((__m128i *)(dst + 16), r0);
1396
44.3k
    _mm_store_si128((__m128i *)(dst + stride), r1);
1397
44.3k
    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
1398
44.3k
    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
1399
44.3k
    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
1400
44.3k
    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
1401
44.3k
    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
1402
44.3k
    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
1403
44.3k
    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
1404
44.3k
    left += 4;
1405
44.3k
    dst += stride * 4;
1406
44.3k
  } while (--i);
1407
2.77k
}
1408
1409
void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
1410
2.77k
                                const uint8_t *above, const uint8_t *left) {
1411
2.77k
  (void)above;
1412
2.77k
  h_predictor_32xh(dst, stride, left, 64);
1413
2.77k
}
1414
1415
static inline void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
1416
82.8k
                                    const uint8_t *left, int height) {
1417
82.8k
  int i = height >> 2;
1418
587k
  do {
1419
587k
    __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]);
1420
587k
    left4 = _mm_unpacklo_epi8(left4, left4);
1421
587k
    left4 = _mm_unpacklo_epi8(left4, left4);
1422
587k
    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
1423
587k
    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
1424
587k
    _mm_store_si128((__m128i *)dst, r0);
1425
587k
    _mm_store_si128((__m128i *)(dst + 16), r0);
1426
587k
    _mm_store_si128((__m128i *)(dst + 32), r0);
1427
587k
    _mm_store_si128((__m128i *)(dst + 48), r0);
1428
587k
    _mm_store_si128((__m128i *)(dst + stride), r1);
1429
587k
    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
1430
587k
    _mm_store_si128((__m128i *)(dst + stride + 32), r1);
1431
587k
    _mm_store_si128((__m128i *)(dst + stride + 48), r1);
1432
587k
    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
1433
587k
    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
1434
587k
    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
1435
587k
    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
1436
587k
    _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
1437
587k
    _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
1438
587k
    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
1439
587k
    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
1440
587k
    _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
1441
587k
    _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
1442
587k
    left += 4;
1443
587k
    dst += stride * 4;
1444
587k
  } while (--i);
1445
82.8k
}
1446
1447
void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
1448
17.6k
                                const uint8_t *above, const uint8_t *left) {
1449
17.6k
  (void)above;
1450
17.6k
  h_predictor_64xh(dst, stride, left, 64);
1451
17.6k
}
1452
1453
void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
1454
11.1k
                                const uint8_t *above, const uint8_t *left) {
1455
11.1k
  (void)above;
1456
11.1k
  h_predictor_64xh(dst, stride, left, 32);
1457
11.1k
}
1458
1459
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1460
void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
1461
54.0k
                                const uint8_t *above, const uint8_t *left) {
1462
54.0k
  (void)above;
1463
54.0k
  h_predictor_64xh(dst, stride, left, 16);
1464
54.0k
}
1465
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER