Coverage Report

Created: 2023-06-07 06:31

/src/aom/aom_dsp/x86/intrapred_ssse3.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <tmmintrin.h>
13
14
#include "config/aom_dsp_rtcd.h"
15
16
#include "aom_dsp/intrapred_common.h"
17
18
// -----------------------------------------------------------------------------
19
// PAETH_PRED
20
21
// Return 8 16-bit pixels in one row
22
static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
23
8.12M
                                     const __m128i *topleft) {
24
8.12M
  const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);
25
26
8.12M
  __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
27
8.12M
  __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
28
8.12M
  __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));
29
30
8.12M
  __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
31
8.12M
  mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
32
8.12M
  __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);
33
34
8.12M
  pl = _mm_andnot_si128(mask1, *left);
35
36
8.12M
  ptl = _mm_and_si128(mask2, *topleft);
37
8.12M
  pt = _mm_andnot_si128(mask2, *top);
38
8.12M
  pt = _mm_or_si128(pt, ptl);
39
8.12M
  pt = _mm_and_si128(mask1, pt);
40
41
8.12M
  return _mm_or_si128(pl, pt);
42
8.12M
}
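
For reference, a scalar sketch of the selection that paeth_8x1_pred vectorizes (a hypothetical helper, not part of the instrumented source; assumes <stdlib.h> for abs):

static INLINE uint8_t paeth_scalar(uint8_t left, uint8_t top, uint8_t topleft) {
  const int base = top + left - topleft;
  const int pl = abs(base - left);
  const int pt = abs(base - top);
  const int ptl = abs(base - topleft);
  // Pick whichever neighbor is closest to base; ties prefer left, then top,
  // matching the mask logic above (mask1 deselects left, mask2 picks topleft
  // over top).
  if (pl <= pt && pl <= ptl) return left;
  return (pt <= ptl) ? top : topleft;
}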
43
44
void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
45
211k
                                   const uint8_t *above, const uint8_t *left) {
46
211k
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
47
211k
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
48
211k
  const __m128i zero = _mm_setzero_si128();
49
211k
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
50
211k
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
51
211k
  __m128i rep = _mm_set1_epi16((short)0x8000);
52
211k
  const __m128i one = _mm_set1_epi16(1);
53
54
211k
  int i;
55
1.05M
  for (i = 0; i < 4; ++i) {
56
844k
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
57
844k
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
58
59
844k
    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
60
844k
    dst += stride;
61
844k
    rep = _mm_add_epi16(rep, one);
62
844k
  }
63
211k
}
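
A note on the rep constant used by all the Paeth loops: as a _mm_shuffle_epi8 control, each 16-bit lane of rep holds the byte pair {i, 0x80}. Index i selects left[i], and 0x80 has its high bit set so pshufb writes zero to that byte, yielding left[i] zero-extended to 16 bits in every lane; adding one per row advances the index. A minimal sketch (hypothetical helper, not part of the instrumented source):

static INLINE __m128i broadcast_left_u16(__m128i l, int i) {
  // Control bytes per 16-bit lane: { i, 0x80 }; 0x80 zeroes the high byte.
  const __m128i ctrl = _mm_set1_epi16((short)(0x8000 | i));
  return _mm_shuffle_epi8(l, ctrl);
}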
64
65
void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
66
54.6k
                                   const uint8_t *above, const uint8_t *left) {
67
54.6k
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
68
54.6k
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
69
54.6k
  const __m128i zero = _mm_setzero_si128();
70
54.6k
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
71
54.6k
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
72
54.6k
  __m128i rep = _mm_set1_epi16((short)0x8000);
73
54.6k
  const __m128i one = _mm_set1_epi16(1);
74
75
54.6k
  int i;
76
491k
  for (i = 0; i < 8; ++i) {
77
436k
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
78
436k
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
79
80
436k
    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
81
436k
    dst += stride;
82
436k
    rep = _mm_add_epi16(rep, one);
83
436k
  }
84
54.6k
}
85
86
void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
87
92.0k
                                    const uint8_t *above, const uint8_t *left) {
88
92.0k
  __m128i l = _mm_load_si128((const __m128i *)left);
89
92.0k
  const __m128i t = _mm_cvtsi32_si128(((const int *)above)[0]);
90
92.0k
  const __m128i zero = _mm_setzero_si128();
91
92.0k
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
92
92.0k
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
93
92.0k
  __m128i rep = _mm_set1_epi16((short)0x8000);
94
92.0k
  const __m128i one = _mm_set1_epi16(1);
95
96
1.56M
  for (int i = 0; i < 16; ++i) {
97
1.47M
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
98
1.47M
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
99
100
1.47M
    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
101
1.47M
    dst += stride;
102
1.47M
    rep = _mm_add_epi16(rep, one);
103
1.47M
  }
104
92.0k
}
105
106
void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
107
79.9k
                                   const uint8_t *above, const uint8_t *left) {
108
79.9k
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
109
79.9k
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
110
79.9k
  const __m128i zero = _mm_setzero_si128();
111
79.9k
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
112
79.9k
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
113
79.9k
  __m128i rep = _mm_set1_epi16((short)0x8000);
114
79.9k
  const __m128i one = _mm_set1_epi16(1);
115
116
79.9k
  int i;
117
399k
  for (i = 0; i < 4; ++i) {
118
319k
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
119
319k
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
120
121
319k
    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
122
319k
    dst += stride;
123
319k
    rep = _mm_add_epi16(rep, one);
124
319k
  }
125
79.9k
}
126
127
void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
128
178k
                                   const uint8_t *above, const uint8_t *left) {
129
178k
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
130
178k
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
131
178k
  const __m128i zero = _mm_setzero_si128();
132
178k
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
133
178k
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
134
178k
  __m128i rep = _mm_set1_epi16((short)0x8000);
135
178k
  const __m128i one = _mm_set1_epi16(1);
136
137
178k
  int i;
138
1.61M
  for (i = 0; i < 8; ++i) {
139
1.43M
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
140
1.43M
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
141
142
1.43M
    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
143
1.43M
    dst += stride;
144
1.43M
    rep = _mm_add_epi16(rep, one);
145
1.43M
  }
146
178k
}
147
148
void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
149
46.5k
                                    const uint8_t *above, const uint8_t *left) {
150
46.5k
  __m128i l = _mm_load_si128((const __m128i *)left);
151
46.5k
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
152
46.5k
  const __m128i zero = _mm_setzero_si128();
153
46.5k
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
154
46.5k
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
155
46.5k
  __m128i rep = _mm_set1_epi16((short)0x8000);
156
46.5k
  const __m128i one = _mm_set1_epi16(1);
157
158
46.5k
  int i;
159
791k
  for (i = 0; i < 16; ++i) {
160
744k
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
161
744k
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
162
163
744k
    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
164
744k
    dst += stride;
165
744k
    rep = _mm_add_epi16(rep, one);
166
744k
  }
167
46.5k
}
168
169
void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
170
43.0k
                                    const uint8_t *above, const uint8_t *left) {
171
43.0k
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
172
43.0k
  const __m128i zero = _mm_setzero_si128();
173
43.0k
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
174
43.0k
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
175
43.0k
  const __m128i one = _mm_set1_epi16(1);
176
177
129k
  for (int j = 0; j < 2; ++j) {
178
86.0k
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
179
86.0k
    __m128i rep = _mm_set1_epi16((short)0x8000);
180
1.46M
    for (int i = 0; i < 16; ++i) {
181
1.37M
      const __m128i l16 = _mm_shuffle_epi8(l, rep);
182
1.37M
      const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
183
184
1.37M
      _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
185
1.37M
      dst += stride;
186
1.37M
      rep = _mm_add_epi16(rep, one);
187
1.37M
    }
188
86.0k
  }
189
43.0k
}
190
191
// Return 16 8-bit pixels in one row
192
static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
193
                                      const __m128i *top1,
194
747k
                                      const __m128i *topleft) {
195
747k
  const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
196
747k
  const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
197
747k
  return _mm_packus_epi16(p0, p1);
198
747k
}
199
200
void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
201
71.9k
                                    const uint8_t *above, const uint8_t *left) {
202
71.9k
  __m128i l = _mm_cvtsi32_si128(((const int *)left)[0]);
203
71.9k
  const __m128i t = _mm_load_si128((const __m128i *)above);
204
71.9k
  const __m128i zero = _mm_setzero_si128();
205
71.9k
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
206
71.9k
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
207
71.9k
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
208
71.9k
  __m128i rep = _mm_set1_epi16((short)0x8000);
209
71.9k
  const __m128i one = _mm_set1_epi16(1);
210
211
359k
  for (int i = 0; i < 4; ++i) {
212
287k
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
213
287k
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
214
215
287k
    _mm_store_si128((__m128i *)dst, row);
216
287k
    dst += stride;
217
287k
    rep = _mm_add_epi16(rep, one);
218
287k
  }
219
71.9k
}
220
221
void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
222
0
                                    const uint8_t *above, const uint8_t *left) {
223
0
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
224
0
  const __m128i t = _mm_load_si128((const __m128i *)above);
225
0
  const __m128i zero = _mm_setzero_si128();
226
0
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
227
0
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
228
0
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
229
0
  __m128i rep = _mm_set1_epi16((short)0x8000);
230
0
  const __m128i one = _mm_set1_epi16(1);
231
232
0
  int i;
233
0
  for (i = 0; i < 8; ++i) {
234
0
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
235
0
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
236
237
0
    _mm_store_si128((__m128i *)dst, row);
238
0
    dst += stride;
239
0
    rep = _mm_add_epi16(rep, one);
240
0
  }
241
0
}
242
243
void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
244
                                     const uint8_t *above,
245
0
                                     const uint8_t *left) {
246
0
  __m128i l = _mm_load_si128((const __m128i *)left);
247
0
  const __m128i t = _mm_load_si128((const __m128i *)above);
248
0
  const __m128i zero = _mm_setzero_si128();
249
0
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
250
0
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
251
0
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
252
0
  __m128i rep = _mm_set1_epi16((short)0x8000);
253
0
  const __m128i one = _mm_set1_epi16(1);
254
255
0
  int i;
256
0
  for (i = 0; i < 16; ++i) {
257
0
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
258
0
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
259
260
0
    _mm_store_si128((__m128i *)dst, row);
261
0
    dst += stride;
262
0
    rep = _mm_add_epi16(rep, one);
263
0
  }
264
0
}
265
266
void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
267
                                     const uint8_t *above,
268
0
                                     const uint8_t *left) {
269
0
  __m128i l = _mm_load_si128((const __m128i *)left);
270
0
  const __m128i t = _mm_load_si128((const __m128i *)above);
271
0
  const __m128i zero = _mm_setzero_si128();
272
0
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
273
0
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
274
0
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
275
0
  __m128i rep = _mm_set1_epi16((short)0x8000);
276
0
  const __m128i one = _mm_set1_epi16(1);
277
0
  __m128i l16;
278
279
0
  int i;
280
0
  for (i = 0; i < 16; ++i) {
281
0
    l16 = _mm_shuffle_epi8(l, rep);
282
0
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
283
284
0
    _mm_store_si128((__m128i *)dst, row);
285
0
    dst += stride;
286
0
    rep = _mm_add_epi16(rep, one);
287
0
  }
288
289
0
  l = _mm_load_si128((const __m128i *)(left + 16));
290
0
  rep = _mm_set1_epi16((short)0x8000);
291
0
  for (i = 0; i < 16; ++i) {
292
0
    l16 = _mm_shuffle_epi8(l, rep);
293
0
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
294
295
0
    _mm_store_si128((__m128i *)dst, row);
296
0
    dst += stride;
297
0
    rep = _mm_add_epi16(rep, one);
298
0
  }
299
0
}
300
301
void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
302
                                     const uint8_t *above,
303
0
                                     const uint8_t *left) {
304
0
  const __m128i t = _mm_load_si128((const __m128i *)above);
305
0
  const __m128i zero = _mm_setzero_si128();
306
0
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
307
0
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
308
0
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
309
0
  const __m128i one = _mm_set1_epi16(1);
310
311
0
  for (int j = 0; j < 4; ++j) {
312
0
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
313
0
    __m128i rep = _mm_set1_epi16((short)0x8000);
314
0
    for (int i = 0; i < 16; ++i) {
315
0
      const __m128i l16 = _mm_shuffle_epi8(l, rep);
316
0
      const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
317
0
      _mm_store_si128((__m128i *)dst, row);
318
0
      dst += stride;
319
0
      rep = _mm_add_epi16(rep, one);
320
0
    }
321
0
  }
322
0
}
323
324
void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
325
28.7k
                                    const uint8_t *above, const uint8_t *left) {
326
28.7k
  const __m128i a = _mm_load_si128((const __m128i *)above);
327
28.7k
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
328
28.7k
  const __m128i zero = _mm_setzero_si128();
329
28.7k
  const __m128i al = _mm_unpacklo_epi8(a, zero);
330
28.7k
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
331
28.7k
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
332
28.7k
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
333
334
28.7k
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
335
28.7k
  __m128i rep = _mm_set1_epi16((short)0x8000);
336
28.7k
  const __m128i one = _mm_set1_epi16(1);
337
28.7k
  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
338
28.7k
  __m128i l16;
339
340
258k
  for (int i = 0; i < 8; ++i) {
341
229k
    l16 = _mm_shuffle_epi8(l, rep);
342
229k
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
343
229k
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
344
345
229k
    _mm_store_si128((__m128i *)dst, r32l);
346
229k
    _mm_store_si128((__m128i *)(dst + 16), r32h);
347
229k
    dst += stride;
348
229k
    rep = _mm_add_epi16(rep, one);
349
229k
  }
350
28.7k
}
351
352
void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
353
                                     const uint8_t *above,
354
0
                                     const uint8_t *left) {
355
0
  const __m128i a = _mm_load_si128((const __m128i *)above);
356
0
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
357
0
  const __m128i zero = _mm_setzero_si128();
358
0
  const __m128i al = _mm_unpacklo_epi8(a, zero);
359
0
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
360
0
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
361
0
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
362
363
0
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
364
0
  __m128i rep = _mm_set1_epi16((short)0x8000);
365
0
  const __m128i one = _mm_set1_epi16(1);
366
0
  __m128i l = _mm_load_si128((const __m128i *)left);
367
0
  __m128i l16;
368
369
0
  int i;
370
0
  for (i = 0; i < 16; ++i) {
371
0
    l16 = _mm_shuffle_epi8(l, rep);
372
0
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
373
0
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
374
375
0
    _mm_store_si128((__m128i *)dst, r32l);
376
0
    _mm_store_si128((__m128i *)(dst + 16), r32h);
377
0
    dst += stride;
378
0
    rep = _mm_add_epi16(rep, one);
379
0
  }
380
0
}
381
382
void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
383
                                     const uint8_t *above,
384
0
                                     const uint8_t *left) {
385
0
  const __m128i a = _mm_load_si128((const __m128i *)above);
386
0
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
387
0
  const __m128i zero = _mm_setzero_si128();
388
0
  const __m128i al = _mm_unpacklo_epi8(a, zero);
389
0
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
390
0
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
391
0
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
392
393
0
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
394
0
  __m128i rep = _mm_set1_epi16((short)0x8000);
395
0
  const __m128i one = _mm_set1_epi16(1);
396
0
  __m128i l = _mm_load_si128((const __m128i *)left);
397
0
  __m128i l16;
398
399
0
  int i;
400
0
  for (i = 0; i < 16; ++i) {
401
0
    l16 = _mm_shuffle_epi8(l, rep);
402
0
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
403
0
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
404
405
0
    _mm_store_si128((__m128i *)dst, r32l);
406
0
    _mm_store_si128((__m128i *)(dst + 16), r32h);
407
0
    dst += stride;
408
0
    rep = _mm_add_epi16(rep, one);
409
0
  }
410
411
0
  rep = _mm_set1_epi16((short)0x8000);
412
0
  l = _mm_load_si128((const __m128i *)(left + 16));
413
0
  for (i = 0; i < 16; ++i) {
414
0
    l16 = _mm_shuffle_epi8(l, rep);
415
0
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
416
0
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
417
418
0
    _mm_store_si128((__m128i *)dst, r32l);
419
0
    _mm_store_si128((__m128i *)(dst + 16), r32h);
420
0
    dst += stride;
421
0
    rep = _mm_add_epi16(rep, one);
422
0
  }
423
0
}
424
425
void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
426
                                     const uint8_t *above,
427
0
                                     const uint8_t *left) {
428
0
  const __m128i a = _mm_load_si128((const __m128i *)above);
429
0
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
430
0
  const __m128i zero = _mm_setzero_si128();
431
0
  const __m128i al = _mm_unpacklo_epi8(a, zero);
432
0
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
433
0
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
434
0
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
435
436
0
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
437
0
  const __m128i one = _mm_set1_epi16(1);
438
0
  __m128i l16;
439
440
0
  int i, j;
441
0
  for (j = 0; j < 4; ++j) {
442
0
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
443
0
    __m128i rep = _mm_set1_epi16((short)0x8000);
444
0
    for (i = 0; i < 16; ++i) {
445
0
      l16 = _mm_shuffle_epi8(l, rep);
446
0
      const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
447
0
      const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
448
449
0
      _mm_store_si128((__m128i *)dst, r32l);
450
0
      _mm_store_si128((__m128i *)(dst + 16), r32h);
451
0
      dst += stride;
452
0
      rep = _mm_add_epi16(rep, one);
453
0
    }
454
0
  }
455
0
}
456
457
void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
458
                                     const uint8_t *above,
459
0
                                     const uint8_t *left) {
460
0
  const __m128i a = _mm_load_si128((const __m128i *)above);
461
0
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
462
0
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
463
0
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
464
0
  const __m128i zero = _mm_setzero_si128();
465
0
  const __m128i al = _mm_unpacklo_epi8(a, zero);
466
0
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
467
0
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
468
0
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
469
0
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
470
0
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
471
0
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
472
0
  const __m128i dh = _mm_unpackhi_epi8(d, zero);
473
474
0
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
475
0
  const __m128i one = _mm_set1_epi16(1);
476
0
  __m128i l16;
477
478
0
  int i, j;
479
0
  for (j = 0; j < 2; ++j) {
480
0
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
481
0
    __m128i rep = _mm_set1_epi16((short)0x8000);
482
0
    for (i = 0; i < 16; ++i) {
483
0
      l16 = _mm_shuffle_epi8(l, rep);
484
0
      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
485
0
      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
486
0
      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
487
0
      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
488
489
0
      _mm_store_si128((__m128i *)dst, r0);
490
0
      _mm_store_si128((__m128i *)(dst + 16), r1);
491
0
      _mm_store_si128((__m128i *)(dst + 32), r2);
492
0
      _mm_store_si128((__m128i *)(dst + 48), r3);
493
0
      dst += stride;
494
0
      rep = _mm_add_epi16(rep, one);
495
0
    }
496
0
  }
497
0
}
498
499
void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
500
                                     const uint8_t *above,
501
0
                                     const uint8_t *left) {
502
0
  const __m128i a = _mm_load_si128((const __m128i *)above);
503
0
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
504
0
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
505
0
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
506
0
  const __m128i zero = _mm_setzero_si128();
507
0
  const __m128i al = _mm_unpacklo_epi8(a, zero);
508
0
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
509
0
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
510
0
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
511
0
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
512
0
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
513
0
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
514
0
  const __m128i dh = _mm_unpackhi_epi8(d, zero);
515
516
0
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
517
0
  const __m128i one = _mm_set1_epi16(1);
518
0
  __m128i l16;
519
520
0
  int i, j;
521
0
  for (j = 0; j < 4; ++j) {
522
0
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
523
0
    __m128i rep = _mm_set1_epi16((short)0x8000);
524
0
    for (i = 0; i < 16; ++i) {
525
0
      l16 = _mm_shuffle_epi8(l, rep);
526
0
      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
527
0
      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
528
0
      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
529
0
      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
530
531
0
      _mm_store_si128((__m128i *)dst, r0);
532
0
      _mm_store_si128((__m128i *)(dst + 16), r1);
533
0
      _mm_store_si128((__m128i *)(dst + 32), r2);
534
0
      _mm_store_si128((__m128i *)(dst + 48), r3);
535
0
      dst += stride;
536
0
      rep = _mm_add_epi16(rep, one);
537
0
    }
538
0
  }
539
0
}
540
541
void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
542
                                     const uint8_t *above,
543
0
                                     const uint8_t *left) {
544
0
  const __m128i a = _mm_load_si128((const __m128i *)above);
545
0
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
546
0
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
547
0
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
548
0
  const __m128i zero = _mm_setzero_si128();
549
0
  const __m128i al = _mm_unpacklo_epi8(a, zero);
550
0
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
551
0
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
552
0
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
553
0
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
554
0
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
555
0
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
556
0
  const __m128i dh = _mm_unpackhi_epi8(d, zero);
557
558
0
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
559
0
  const __m128i one = _mm_set1_epi16(1);
560
0
  __m128i l16;
561
562
0
  int i;
563
0
  const __m128i l = _mm_load_si128((const __m128i *)left);
564
0
  __m128i rep = _mm_set1_epi16((short)0x8000);
565
0
  for (i = 0; i < 16; ++i) {
566
0
    l16 = _mm_shuffle_epi8(l, rep);
567
0
    const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
568
0
    const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
569
0
    const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
570
0
    const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
571
572
0
    _mm_store_si128((__m128i *)dst, r0);
573
0
    _mm_store_si128((__m128i *)(dst + 16), r1);
574
0
    _mm_store_si128((__m128i *)(dst + 32), r2);
575
0
    _mm_store_si128((__m128i *)(dst + 48), r3);
576
0
    dst += stride;
577
0
    rep = _mm_add_epi16(rep, one);
578
0
  }
579
0
}
580
581
// -----------------------------------------------------------------------------
582
// SMOOTH_PRED
583
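
Every smooth predictor below computes the same two-directional blend; a scalar model of it follows (a sketch, not instrumented source; w_h and w_w point at the smooth_weights entries for the block height and width, and SMOOTH_WEIGHT_LOG2_SCALE is 8):

static INLINE uint8_t smooth_scalar(const uint8_t *above, const uint8_t *left,
                                    const uint8_t *w_h, const uint8_t *w_w,
                                    int width, int height, int x, int y) {
  const int scale = 1 << SMOOTH_WEIGHT_LOG2_SCALE;  // 256
  const int tr = above[width - 1];   // top-right pixel ("right_pred")
  const int bl = left[height - 1];   // bottom-left pixel ("below_pred")
  const int sum = w_h[y] * above[x] + (scale - w_h[y]) * bl +
                  w_w[x] * left[y] + (scale - w_w[x]) * tr;
  // The vertical and horizontal blends are averaged: divide by 2 * scale with
  // rounding, as in the (sum + round) >> (1 + LOG2_SCALE) steps below.
  return (uint8_t)((sum + scale) >> (1 + SMOOTH_WEIGHT_LOG2_SCALE));
}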
584
// pixels[0]: above and below_pred interleave vector
585
// pixels[1]: left vector
586
// pixels[2]: right_pred vector
587
static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
588
312k
                                 int height, __m128i *pixels) {
589
312k
  __m128i d = _mm_cvtsi32_si128(((const int *)above)[0]);
590
312k
  if (height == 4)
591
206k
    pixels[1] = _mm_cvtsi32_si128(((const int *)left)[0]);
592
105k
  else if (height == 8)
593
70.4k
    pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
594
35.4k
  else
595
35.4k
    pixels[1] = _mm_loadu_si128(((const __m128i *)left));
596
597
312k
  pixels[2] = _mm_set1_epi16((int16_t)above[3]);
598
599
312k
  const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
600
312k
  const __m128i zero = _mm_setzero_si128();
601
312k
  d = _mm_unpacklo_epi8(d, zero);
602
312k
  pixels[0] = _mm_unpacklo_epi16(d, bp);
603
312k
}
604
605
// weight_h[0]: weight_h vector
606
// weight_h[1]: scale - weight_h vector
607
// weight_h[2]: same as [0], second half for height = 16 only
608
// weight_h[3]: same as [1], second half for height = 16 only
609
// weight_w[0]: weights_w and scale - weights_w interleave vector
610
static INLINE void load_weight_w4(int height, __m128i *weight_h,
611
312k
                                  __m128i *weight_w) {
612
312k
  const __m128i zero = _mm_setzero_si128();
613
312k
  const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
614
312k
  const __m128i t = _mm_cvtsi32_si128(((const int *)smooth_weights)[0]);
615
312k
  weight_h[0] = _mm_unpacklo_epi8(t, zero);
616
312k
  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
617
312k
  weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
618
619
312k
  if (height == 8) {
620
70.4k
    const __m128i weight = _mm_loadl_epi64((const __m128i *)&smooth_weights[4]);
621
70.4k
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
622
70.4k
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
623
242k
  } else if (height == 16) {
624
35.4k
    const __m128i weight =
625
35.4k
        _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
626
35.4k
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
627
35.4k
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
628
35.4k
    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
629
35.4k
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
630
35.4k
  }
631
312k
}
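
The literal offsets loaded above (0, 4 and 12) follow from the layout of the smooth_weights table: weights for dimension n start at index n - 4, the same rule smooth_predictor_wxh applies in its general form further below. A hypothetical accessor making the rule explicit:

static INLINE const uint8_t *smooth_weights_for(int n) {
  return smooth_weights + n - 4;  // n in {4, 8, 16, 32, 64} -> 0, 4, 12, 28, 60
}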
632
633
static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
634
                                   const __m128i *ww, int h, uint8_t *dst,
635
348k
                                   ptrdiff_t stride, int second_half) {
636
348k
  const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
637
348k
  const __m128i one = _mm_set1_epi16(1);
638
348k
  const __m128i inc = _mm_set1_epi16(0x202);
639
348k
  const __m128i gat = _mm_set1_epi32(0xc080400);
640
348k
  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
641
348k
                            : _mm_set1_epi16((short)0x8000);
642
348k
  __m128i d = _mm_set1_epi16(0x100);
643
644
2.30M
  for (int i = 0; i < h; ++i) {
645
1.95M
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
646
1.95M
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
647
1.95M
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
648
1.95M
    __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
649
650
1.95M
    __m128i b = _mm_shuffle_epi8(pixel[1], rep);
651
1.95M
    b = _mm_unpacklo_epi16(b, pixel[2]);
652
1.95M
    __m128i sum = _mm_madd_epi16(b, ww[0]);
653
654
1.95M
    sum = _mm_add_epi32(s, sum);
655
1.95M
    sum = _mm_add_epi32(sum, round);
656
1.95M
    sum = _mm_srai_epi32(sum, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
657
658
1.95M
    sum = _mm_shuffle_epi8(sum, gat);
659
1.95M
    *(int *)dst = _mm_cvtsi128_si32(sum);
660
1.95M
    dst += stride;
661
662
1.95M
    rep = _mm_add_epi16(rep, one);
663
1.95M
    d = _mm_add_epi16(d, inc);
664
1.95M
  }
665
348k
}
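
Three shuffle constants in smooth_pred_4xh are worth decoding (an annotation, not part of the instrumented source):
  - d starts as byte pair {0x00, 0x01} in every lane and steps by {0x02, 0x02},
    so _mm_shuffle_epi8(wh[k], d) broadcasts 16-bit weight lane i on row i.
  - rep broadcasts left pixels exactly as in the Paeth code; the 0x8008 start
    used when second_half is set selects byte 8, i.e. rows 8..15 of a 16-tall
    block.
  - gat (0x0C080400) gathers the low byte of each 32-bit sum, packing the four
    dword results into four bytes with a single pshufb; smooth_pred_8xh below
    uses the analogous even-byte gather _mm_set_epi32(0, 0, 0xe0c0a08,
    0x6040200) after its packus.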
666
667
void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
668
206k
                                    const uint8_t *above, const uint8_t *left) {
669
206k
  __m128i pixels[3];
670
206k
  load_pixel_w4(above, left, 4, pixels);
671
672
206k
  __m128i wh[4], ww[2];
673
206k
  load_weight_w4(4, wh, ww);
674
675
206k
  smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
676
206k
}
677
678
void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
679
70.4k
                                    const uint8_t *above, const uint8_t *left) {
680
70.4k
  __m128i pixels[3];
681
70.4k
  load_pixel_w4(above, left, 8, pixels);
682
683
70.4k
  __m128i wh[4], ww[2];
684
70.4k
  load_weight_w4(8, wh, ww);
685
686
70.4k
  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
687
70.4k
}
688
689
void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
690
                                     const uint8_t *above,
691
35.4k
                                     const uint8_t *left) {
692
35.4k
  __m128i pixels[3];
693
35.4k
  load_pixel_w4(above, left, 16, pixels);
694
695
35.4k
  __m128i wh[4], ww[2];
696
35.4k
  load_weight_w4(16, wh, ww);
697
698
35.4k
  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
699
35.4k
  dst += stride << 3;
700
35.4k
  smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
701
35.4k
}
702
703
// pixels[0]: above and below_pred interleave vector, first half
704
// pixels[1]: above and below_pred interleave vector, second half
705
// pixels[2]: left vector
706
// pixels[3]: right_pred vector
707
// pixels[4]: above and below_pred interleave vector, first half
708
// pixels[5]: above and below_pred interleave vector, second half
709
// pixels[6]: left vector + 16
710
// pixels[7]: right_pred vector
711
static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
712
367k
                                 int height, __m128i *pixels) {
713
367k
  const __m128i zero = _mm_setzero_si128();
714
367k
  const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
715
367k
  __m128i d = _mm_loadl_epi64((const __m128i *)above);
716
367k
  d = _mm_unpacklo_epi8(d, zero);
717
367k
  pixels[0] = _mm_unpacklo_epi16(d, bp);
718
367k
  pixels[1] = _mm_unpackhi_epi16(d, bp);
719
720
367k
  pixels[3] = _mm_set1_epi16((int16_t)above[7]);
721
722
367k
  if (height == 4) {
723
99.1k
    pixels[2] = _mm_cvtsi32_si128(((const int *)left)[0]);
724
268k
  } else if (height == 8) {
725
202k
    pixels[2] = _mm_loadl_epi64((const __m128i *)left);
726
202k
  } else if (height == 16) {
727
47.4k
    pixels[2] = _mm_load_si128((const __m128i *)left);
728
47.4k
  } else {
729
18.8k
    pixels[2] = _mm_load_si128((const __m128i *)left);
730
18.8k
    pixels[4] = pixels[0];
731
18.8k
    pixels[5] = pixels[1];
732
18.8k
    pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
733
18.8k
    pixels[7] = pixels[3];
734
18.8k
  }
735
367k
}
736
737
// weight_h[0]: weight_h vector
738
// weight_h[1]: scale - weight_h vector
739
// weight_h[2]: same as [0], offset 8
740
// weight_h[3]: same as [1], offset 8
741
// weight_h[4]: same as [0], offset 16
742
// weight_h[5]: same as [1], offset 16
743
// weight_h[6]: same as [0], offset 24
744
// weight_h[7]: same as [1], offset 24
745
// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
746
// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
747
static INLINE void load_weight_w8(int height, __m128i *weight_h,
748
367k
                                  __m128i *weight_w) {
749
367k
  const __m128i zero = _mm_setzero_si128();
750
367k
  const int we_offset = height < 8 ? 0 : 4;
751
367k
  __m128i we = _mm_loadu_si128((const __m128i *)&smooth_weights[we_offset]);
752
367k
  weight_h[0] = _mm_unpacklo_epi8(we, zero);
753
367k
  const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
754
367k
  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
755
756
367k
  if (height == 4) {
757
99.1k
    we = _mm_srli_si128(we, 4);
758
99.1k
    __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
759
99.1k
    __m128i tmp2 = _mm_sub_epi16(d, tmp1);
760
99.1k
    weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
761
99.1k
    weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
762
268k
  } else {
763
268k
    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
764
268k
    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
765
268k
  }
766
767
367k
  if (height == 16) {
768
47.4k
    we = _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
769
47.4k
    weight_h[0] = _mm_unpacklo_epi8(we, zero);
770
47.4k
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
771
47.4k
    weight_h[2] = _mm_unpackhi_epi8(we, zero);
772
47.4k
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
773
320k
  } else if (height == 32) {
774
18.8k
    const __m128i weight_lo =
775
18.8k
        _mm_loadu_si128((const __m128i *)&smooth_weights[28]);
776
18.8k
    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
777
18.8k
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
778
18.8k
    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
779
18.8k
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
780
18.8k
    const __m128i weight_hi =
781
18.8k
        _mm_loadu_si128((const __m128i *)&smooth_weights[28 + 16]);
782
18.8k
    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
783
18.8k
    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
784
18.8k
    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
785
18.8k
    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
786
18.8k
  }
787
367k
}
788
789
static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
790
                                   const __m128i *ww, int h, uint8_t *dst,
791
471k
                                   ptrdiff_t stride, int second_half) {
792
471k
  const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
793
471k
  const __m128i one = _mm_set1_epi16(1);
794
471k
  const __m128i inc = _mm_set1_epi16(0x202);
795
471k
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
796
797
471k
  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
798
471k
                            : _mm_set1_epi16((short)0x8000);
799
471k
  __m128i d = _mm_set1_epi16(0x100);
800
801
471k
  int i;
802
3.85M
  for (i = 0; i < h; ++i) {
803
3.37M
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
804
3.37M
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
805
3.37M
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
806
3.37M
    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
807
3.37M
    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
808
809
3.37M
    __m128i b = _mm_shuffle_epi8(pixels[2], rep);
810
3.37M
    b = _mm_unpacklo_epi16(b, pixels[3]);
811
3.37M
    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
812
3.37M
    __m128i sum1 = _mm_madd_epi16(b, ww[1]);
813
814
3.37M
    s0 = _mm_add_epi32(s0, sum0);
815
3.37M
    s0 = _mm_add_epi32(s0, round);
816
3.37M
    s0 = _mm_srai_epi32(s0, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
817
818
3.37M
    s1 = _mm_add_epi32(s1, sum1);
819
3.37M
    s1 = _mm_add_epi32(s1, round);
820
3.37M
    s1 = _mm_srai_epi32(s1, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
821
822
3.37M
    sum0 = _mm_packus_epi16(s0, s1);
823
3.37M
    sum0 = _mm_shuffle_epi8(sum0, gat);
824
3.37M
    _mm_storel_epi64((__m128i *)dst, sum0);
825
3.37M
    dst += stride;
826
827
3.37M
    rep = _mm_add_epi16(rep, one);
828
3.37M
    d = _mm_add_epi16(d, inc);
829
3.37M
  }
830
471k
}
831
832
void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
833
99.1k
                                    const uint8_t *above, const uint8_t *left) {
834
99.1k
  __m128i pixels[4];
835
99.1k
  load_pixel_w8(above, left, 4, pixels);
836
837
99.1k
  __m128i wh[4], ww[2];
838
99.1k
  load_weight_w8(4, wh, ww);
839
840
99.1k
  smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
841
99.1k
}
842
843
void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
844
202k
                                    const uint8_t *above, const uint8_t *left) {
845
202k
  __m128i pixels[4];
846
202k
  load_pixel_w8(above, left, 8, pixels);
847
848
202k
  __m128i wh[4], ww[2];
849
202k
  load_weight_w8(8, wh, ww);
850
851
202k
  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
852
202k
}
853
854
void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
855
                                     const uint8_t *above,
856
47.4k
                                     const uint8_t *left) {
857
47.4k
  __m128i pixels[4];
858
47.4k
  load_pixel_w8(above, left, 16, pixels);
859
860
47.4k
  __m128i wh[4], ww[2];
861
47.4k
  load_weight_w8(16, wh, ww);
862
863
47.4k
  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
864
47.4k
  dst += stride << 3;
865
47.4k
  smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
866
47.4k
}
867
868
void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
869
                                     const uint8_t *above,
870
18.8k
                                     const uint8_t *left) {
871
18.8k
  __m128i pixels[8];
872
18.8k
  load_pixel_w8(above, left, 32, pixels);
873
874
18.8k
  __m128i wh[8], ww[2];
875
18.8k
  load_weight_w8(32, wh, ww);
876
877
18.8k
  smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
878
18.8k
  dst += stride << 3;
879
18.8k
  smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
880
18.8k
  dst += stride << 3;
881
18.8k
  smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
882
18.8k
  dst += stride << 3;
883
18.8k
  smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
884
18.8k
}
885
886
// TODO(slavarnway): Visual Studio only supports restrict when /std:c11
887
// (available in 2019+) or greater is specified; __restrict can be used in that
888
// case. This should be moved to rtcd and used consistently between the
889
// function declarations and definitions to avoid warnings in Visual Studio
890
// when defining LIBAOM_RESTRICT to restrict or __restrict.
891
#if defined(_MSC_VER)
892
#define LIBAOM_RESTRICT
893
#else
894
#define LIBAOM_RESTRICT restrict
895
#endif
896
897
493k
static AOM_FORCE_INLINE __m128i Load4(const void *src) {
898
  // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
899
  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
900
  // movss instruction.
901
  //
902
  // Until compiler support of _mm_loadu_si32 is widespread, use of
903
  // _mm_loadu_si32 is banned.
904
493k
  int val;
905
493k
  memcpy(&val, src, sizeof(val));
906
493k
  return _mm_cvtsi32_si128(val);
907
493k
}
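
Besides matching the movss codegen noted above, the memcpy keeps the four-byte load well-defined for unaligned addresses and sidesteps the strict-aliasing hazard of dereferencing the pixel buffer through a const int *.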
908
909
87.7M
static AOM_FORCE_INLINE __m128i LoadLo8(const void *a) {
910
87.7M
  return _mm_loadl_epi64((const __m128i *)(a));
911
87.7M
}
912
913
916k
static AOM_FORCE_INLINE __m128i LoadUnaligned16(const void *a) {
914
916k
  return _mm_loadu_si128((const __m128i *)(a));
915
916k
}
916
917
1.12M
static AOM_FORCE_INLINE void Store4(void *dst, const __m128i x) {
918
1.12M
  const int val = _mm_cvtsi128_si32(x);
919
1.12M
  memcpy(dst, &val, sizeof(val));
920
1.12M
}
921
922
45.0M
static AOM_FORCE_INLINE void StoreLo8(void *a, const __m128i v) {
923
45.0M
  _mm_storel_epi64((__m128i *)(a), v);
924
45.0M
}
925
926
15.5M
static AOM_FORCE_INLINE void StoreUnaligned16(void *a, const __m128i v) {
927
15.5M
  _mm_storeu_si128((__m128i *)(a), v);
928
15.5M
}
929
930
89.1M
static AOM_FORCE_INLINE __m128i cvtepu8_epi16(__m128i x) {
931
89.1M
  return _mm_unpacklo_epi8((x), _mm_setzero_si128());
932
89.1M
}
933
934
292k
static AOM_FORCE_INLINE __m128i cvtepu8_epi32(__m128i x) {
935
292k
  const __m128i tmp = _mm_unpacklo_epi8((x), _mm_setzero_si128());
936
292k
  return _mm_unpacklo_epi16(tmp, _mm_setzero_si128());
937
292k
}
938
939
43.4M
static AOM_FORCE_INLINE __m128i cvtepu16_epi32(__m128i x) {
940
43.4M
  return _mm_unpacklo_epi16((x), _mm_setzero_si128());
941
43.4M
}
942
943
void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
944
                          const uint8_t *LIBAOM_RESTRICT top_row,
945
                          const uint8_t *LIBAOM_RESTRICT left_column, int width,
946
536k
                          int height) {
947
536k
  const uint8_t *const sm_weights_h = smooth_weights + height - 4;
948
536k
  const uint8_t *const sm_weights_w = smooth_weights + width - 4;
949
536k
  const __m128i zero = _mm_setzero_si128();
950
536k
  const __m128i scale_value = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
951
536k
  const __m128i bottom_left = _mm_cvtsi32_si128(left_column[height - 1]);
952
536k
  const __m128i top_right = _mm_set1_epi16(top_row[width - 1]);
953
536k
  const __m128i round = _mm_set1_epi32(1 << SMOOTH_WEIGHT_LOG2_SCALE);
954
11.1M
  for (int y = 0; y < height; ++y) {
955
10.5M
    const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
956
10.5M
    const __m128i left_y = _mm_cvtsi32_si128(left_column[y]);
957
10.5M
    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
958
10.5M
    __m128i scaled_bottom_left =
959
10.5M
        _mm_mullo_epi16(scale_m_weights_y, bottom_left);
960
10.5M
    const __m128i weight_left_y =
961
10.5M
        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
962
10.5M
    scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
963
10.5M
    scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
964
54.0M
    for (int x = 0; x < width; x += 8) {
965
43.4M
      const __m128i top_x = LoadLo8(top_row + x);
966
43.4M
      const __m128i weights_x = LoadLo8(sm_weights_w + x);
967
43.4M
      const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
968
43.4M
      const __m128i top_weights_x_lo = cvtepu8_epi16(top_weights_x);
969
43.4M
      const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);
970
971
      // Here opposite weights and pixels are multiplied, where the order of
972
      // interleaving is indicated in the names.
973
43.4M
      __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
974
43.4M
      __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);
975
976
      // |scaled_bottom_left| is always scaled by the same weight each row, so
977
      // we only derive |scaled_top_right| values here.
978
43.4M
      const __m128i inverted_weights_x =
979
43.4M
          _mm_sub_epi16(scale_value, cvtepu8_epi16(weights_x));
980
43.4M
      const __m128i scaled_top_right =
981
43.4M
          _mm_mullo_epi16(inverted_weights_x, top_right);
982
43.4M
      const __m128i scaled_top_right_lo = cvtepu16_epi32(scaled_top_right);
983
43.4M
      const __m128i scaled_top_right_hi =
984
43.4M
          _mm_unpackhi_epi16(scaled_top_right, zero);
985
43.4M
      pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
986
43.4M
      pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
987
43.4M
      pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
988
43.4M
      pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);
989
990
      // The round value for RightShiftWithRounding was added with
991
      // |scaled_bottom_left|.
992
43.4M
      pred_lo = _mm_srli_epi32(pred_lo, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
993
43.4M
      pred_hi = _mm_srli_epi32(pred_hi, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
994
43.4M
      const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
995
43.4M
      StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
996
43.4M
    }
997
10.5M
    dst += stride;
998
10.5M
  }
999
536k
}
1000
1001
void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
1002
                                     const uint8_t *above,
1003
63.4k
                                     const uint8_t *left) {
1004
63.4k
  smooth_predictor_wxh(dst, stride, above, left, 16, 4);
1005
63.4k
}
1006
1007
void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
1008
                                     const uint8_t *above,
1009
69.4k
                                     const uint8_t *left) {
1010
69.4k
  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
1011
69.4k
}
1012
1013
void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1014
                                      const uint8_t *above,
1015
127k
                                      const uint8_t *left) {
1016
127k
  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
1017
127k
}
1018
1019
void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
1020
                                      const uint8_t *above,
1021
30.0k
                                      const uint8_t *left) {
1022
30.0k
  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
1023
30.0k
}
1024
1025
void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
1026
                                      const uint8_t *above,
1027
6.31k
                                      const uint8_t *left) {
1028
6.31k
  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
1029
6.31k
}
1030
1031
void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
1032
                                     const uint8_t *above,
1033
51.9k
                                     const uint8_t *left) {
1034
51.9k
  smooth_predictor_wxh(dst, stride, above, left, 32, 8);
1035
51.9k
}
1036
1037
void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1038
                                      const uint8_t *above,
1039
27.8k
                                      const uint8_t *left) {
1040
27.8k
  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
1041
27.8k
}
1042
1043
void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
1044
                                      const uint8_t *above,
1045
93.3k
                                      const uint8_t *left) {
1046
93.3k
  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
1047
93.3k
}
1048
1049
void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
1050
                                      const uint8_t *above,
1051
2.36k
                                      const uint8_t *left) {
1052
2.36k
  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
1053
2.36k
}
1054
1055
void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1056
                                      const uint8_t *above,
1057
33.1k
                                      const uint8_t *left) {
1058
33.1k
  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
1059
33.1k
}
1060
1061
void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
1062
                                      const uint8_t *above,
1063
5.46k
                                      const uint8_t *left) {
1064
5.46k
  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
1065
5.46k
}
1066
1067
void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
1068
                                      const uint8_t *above,
1069
26.1k
                                      const uint8_t *left) {
1070
26.1k
  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
1071
26.1k
}
1072
1073
// -----------------------------------------------------------------------------
1074
// Smooth horizontal/vertical helper functions.
1075
1076
// For Horizontal, pixels1 and pixels2 are the same repeated value. For
1077
// Vertical, weights1 and weights2 are the same, and scaled_corner1 and
1078
// scaled_corner2 are the same.
1079
static AOM_FORCE_INLINE void write_smooth_directional_sum16(
1080
    uint8_t *LIBAOM_RESTRICT dst, const __m128i pixels1, const __m128i pixels2,
1081
    const __m128i weights1, const __m128i weights2,
1082
    const __m128i scaled_corner1, const __m128i scaled_corner2,
1083
15.5M
    const __m128i round) {
1084
15.5M
  const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
1085
15.5M
  const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
1086
15.5M
  const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
1087
15.5M
  const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2);
1088
  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
1089
15.5M
  const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8);
1090
15.5M
  const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8);
1091
15.5M
  StoreUnaligned16(dst, _mm_packus_epi16(pred1, pred2));
1092
15.5M
}
1093
1094
static AOM_FORCE_INLINE __m128i smooth_directional_sum8(
1095
1.62M
    const __m128i pixels, const __m128i weights, const __m128i scaled_corner) {
1096
1.62M
  const __m128i weighted_px = _mm_mullo_epi16(pixels, weights);
1097
1.62M
  return _mm_add_epi16(scaled_corner, weighted_px);
1098
1.62M
}
1099
1100
static AOM_FORCE_INLINE void write_smooth_directional_sum8(
1101
    uint8_t *LIBAOM_RESTRICT dst, const __m128i *pixels, const __m128i *weights,
1102
1.62M
    const __m128i *scaled_corner, const __m128i *round) {
1103
1.62M
  const __m128i pred_sum =
1104
1.62M
      smooth_directional_sum8(*pixels, *weights, *scaled_corner);
1105
  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
1106
1.62M
  const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, *round), 8);
1107
1.62M
  StoreLo8(dst, _mm_packus_epi16(pred, pred));
1108
1.62M
}
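
Both helpers above reduce to one directional blend per pixel; a scalar model (a sketch, not instrumented source; round = 128 supplies RightShiftWithRounding(x, 8)):

static INLINE uint8_t smooth_directional_scalar(int pixel, int weight,
                                                int scaled_corner) {
  // scaled_corner == (256 - weight) * corner, precomputed by the caller; for
  // SMOOTH_V the corner is bottom_left, for SMOOTH_H it is top_right.
  return (uint8_t)((scaled_corner + weight * pixel + 128) >> 8);
}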
1109
1110
// -----------------------------------------------------------------------------
1111
// SMOOTH_V_PRED
1112
1113
static AOM_FORCE_INLINE void load_smooth_vertical_pixels4(
1114
    const uint8_t *LIBAOM_RESTRICT above, const uint8_t *LIBAOM_RESTRICT left,
1115
65.5k
    const int height, __m128i *pixels) {
1116
65.5k
  __m128i top = Load4(above);
1117
65.5k
  const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
1118
65.5k
  top = cvtepu8_epi16(top);
1119
65.5k
  pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
1120
65.5k
}
1121
1122
// |weight_array| alternates weight vectors from the table with their inverted
1123
// (256-w) counterparts. This is precomputed by the compiler when the weights
1124
// table is visible to this module. Removing this visibility can cut speed by up
1125
// to half in both 4xH and 8xH transforms.
1126
static AOM_FORCE_INLINE void load_smooth_vertical_weights4(
1127
    const uint8_t *LIBAOM_RESTRICT weight_array, const int height,
1128
65.5k
    __m128i *weights) {
1129
65.5k
  const __m128i inverter = _mm_set1_epi16(256);
1130
1131
65.5k
  if (height == 4) {
1132
42.2k
    const __m128i weight = Load4(weight_array);
1133
42.2k
    weights[0] = cvtepu8_epi16(weight);
1134
42.2k
    weights[1] = _mm_sub_epi16(inverter, weights[0]);
1135
42.2k
  } else if (height == 8) {
1136
14.5k
    const __m128i weight = LoadLo8(weight_array + 4);
1137
14.5k
    weights[0] = cvtepu8_epi16(weight);
1138
14.5k
    weights[1] = _mm_sub_epi16(inverter, weights[0]);
1139
14.5k
  } else {
1140
8.71k
    const __m128i weight = LoadUnaligned16(weight_array + 12);
1141
8.71k
    const __m128i zero = _mm_setzero_si128();
1142
8.71k
    weights[0] = cvtepu8_epi16(weight);
1143
8.71k
    weights[1] = _mm_sub_epi16(inverter, weights[0]);
1144
8.71k
    weights[2] = _mm_unpackhi_epi8(weight, zero);
1145
8.71k
    weights[3] = _mm_sub_epi16(inverter, weights[2]);
1146
8.71k
  }
1147
65.5k
}
1148
1149
static AOM_FORCE_INLINE void write_smooth_vertical4xh(
1150
    const __m128i *pixel, const __m128i *weight, const int height,
1151
74.2k
    uint8_t *LIBAOM_RESTRICT dst, const ptrdiff_t stride) {
1152
74.2k
  const __m128i pred_round = _mm_set1_epi32(128);
1153
74.2k
  const __m128i mask_increment = _mm_set1_epi16(0x0202);
1154
74.2k
  const __m128i cvtepu8_epi32 = _mm_set1_epi32(0xC080400);
1155
74.2k
  __m128i y_select = _mm_set1_epi16(0x0100);
1156
1157
499k
  for (int y = 0; y < height; ++y) {
1158
424k
    const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
1159
424k
    const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
1160
424k
    const __m128i alternate_weights =
1161
424k
        _mm_unpacklo_epi16(weight_y, inverted_weight_y);
1162
    // Here the pixel vector is top_row[0], corner, top_row[1], corner, ...
1163
    // The madd instruction yields four results of the form:
1164
    // (top_row[x] * weight[y] + corner * inverted_weight[y])
1165
424k
    __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
1166
424k
    sum = _mm_add_epi32(sum, pred_round);
1167
424k
    sum = _mm_srai_epi32(sum, 8);
1168
424k
    sum = _mm_shuffle_epi8(sum, cvtepu8_epi32);
1169
424k
    Store4(dst, sum);
1170
424k
    dst += stride;
1171
424k
    y_select = _mm_add_epi16(y_select, mask_increment);
1172
424k
  }
1173
74.2k
}
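
Note that the local named cvtepu8_epi32 above is, despite the name, the same dword-to-byte gather as gat in smooth_pred_4xh (byte indices 0x00, 0x04, 0x08, 0x0C): it narrows the four epi32 sums to bytes rather than widening.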
1174
1175
void aom_smooth_v_predictor_4x4_ssse3(
1176
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1177
    const uint8_t *LIBAOM_RESTRICT top_row,
1178
42.2k
    const uint8_t *LIBAOM_RESTRICT left_column) {
1179
42.2k
  __m128i pixels;
1180
42.2k
  load_smooth_vertical_pixels4(top_row, left_column, 4, &pixels);
1181
1182
42.2k
  __m128i weights[2];
1183
42.2k
  load_smooth_vertical_weights4(smooth_weights, 4, weights);
1184
1185
42.2k
  write_smooth_vertical4xh(&pixels, weights, 4, dst, stride);
1186
42.2k
}
1187
1188
void aom_smooth_v_predictor_4x8_ssse3(
1189
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1190
    const uint8_t *LIBAOM_RESTRICT top_row,
1191
14.5k
    const uint8_t *LIBAOM_RESTRICT left_column) {
1192
14.5k
  __m128i pixels;
1193
14.5k
  load_smooth_vertical_pixels4(top_row, left_column, 8, &pixels);
1194
1195
14.5k
  __m128i weights[2];
1196
14.5k
  load_smooth_vertical_weights4(smooth_weights, 8, weights);
1197
1198
14.5k
  write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
1199
14.5k
}
1200
1201
void aom_smooth_v_predictor_4x16_ssse3(
1202
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1203
    const uint8_t *LIBAOM_RESTRICT top_row,
1204
8.71k
    const uint8_t *LIBAOM_RESTRICT left_column) {
1205
8.71k
  __m128i pixels;
1206
8.71k
  load_smooth_vertical_pixels4(top_row, left_column, 16, &pixels);
1207
1208
8.71k
  __m128i weights[4];
1209
8.71k
  load_smooth_vertical_weights4(smooth_weights, 16, weights);
1210
1211
8.71k
  write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
1212
8.71k
  dst += stride << 3;
1213
8.71k
  write_smooth_vertical4xh(&pixels, &weights[2], 8, dst, stride);
1214
8.71k
}
1215
1216
void aom_smooth_v_predictor_8x4_ssse3(
1217
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1218
    const uint8_t *LIBAOM_RESTRICT top_row,
1219
20.1k
    const uint8_t *LIBAOM_RESTRICT left_column) {
1220
20.1k
  const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
1221
20.1k
  const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
1222
20.1k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1223
20.1k
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1224
20.1k
  const __m128i scaled_bottom_left =
1225
20.1k
      _mm_mullo_epi16(inverted_weights, bottom_left);
1226
20.1k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1227
20.1k
  __m128i y_select = _mm_set1_epi32(0x01000100);
1228
20.1k
  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
1229
20.1k
  __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1230
20.1k
  __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1231
20.1k
  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1232
20.1k
                                &round);
1233
20.1k
  dst += stride;
1234
20.1k
  y_select = _mm_set1_epi32(0x03020302);
1235
20.1k
  weights_y = _mm_shuffle_epi8(weights, y_select);
1236
20.1k
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1237
20.1k
  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1238
20.1k
                                &round);
1239
20.1k
  dst += stride;
1240
20.1k
  y_select = _mm_set1_epi32(0x05040504);
1241
20.1k
  weights_y = _mm_shuffle_epi8(weights, y_select);
1242
20.1k
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1243
20.1k
  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1244
20.1k
                                &round);
1245
20.1k
  dst += stride;
1246
20.1k
  y_select = _mm_set1_epi32(0x07060706);
1247
20.1k
  weights_y = _mm_shuffle_epi8(weights, y_select);
1248
20.1k
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1249
20.1k
  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1250
20.1k
                                &round);
1251
20.1k
}
1252
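// The y_select constants above (0x01000100, 0x03020302, ...) drive pshufb
// to replicate one 16-bit weight across all eight lanes: the repeated byte
// pair {2y, 2y+1} selects epi16 element y. A small equivalence sketch; the
// helper name and the runtime `lane` parameter are illustrative only.
static __m128i broadcast_epi16_sketch(const __m128i v, const int lane) {
  const int lo = 2 * lane;      // byte index of the low half of the element
  const int hi = 2 * lane + 1;  // byte index of the high half
  const int pattern = lo | (hi << 8) | (lo << 16) | (hi << 24);
  // For lane == 0 this reproduces _mm_set1_epi32(0x01000100), for lane == 1
  // the 0x03020302 constant, and so on.
  return _mm_shuffle_epi8(v, _mm_set1_epi32(pattern));
}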
1253
void aom_smooth_v_predictor_8x8_ssse3(
1254
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1255
    const uint8_t *LIBAOM_RESTRICT top_row,
1256
41.2k
    const uint8_t *LIBAOM_RESTRICT left_column) {
1257
41.2k
  const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
1258
41.2k
  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
1259
41.2k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1260
41.2k
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1261
41.2k
  const __m128i scaled_bottom_left =
1262
41.2k
      _mm_mullo_epi16(inverted_weights, bottom_left);
1263
41.2k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1264
41.2k
  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
1265
371k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1266
330k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1267
330k
    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1268
330k
    const __m128i scaled_bottom_left_y =
1269
330k
        _mm_shuffle_epi8(scaled_bottom_left, y_select);
1270
330k
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1271
330k
                                  &round);
1272
330k
    dst += stride;
1273
330k
  }
1274
41.2k
}
1275
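// The loop bounds used throughout this file encode "one pshufb mask per
// row": starting at 0x01000100 and stepping by 0x02020202 advances both
// byte indices by two each row, and the bound 0x0F0E0F0F stops after the
// row-7 mask 0x0F0E0F0E. A sketch verifying the sequence, assuming
// <assert.h> is available:
static void y_mask_sequence_sketch(void) {
  int row = 0;
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    assert((y_mask & 0xFF) == 2 * row);             // selects byte 2*row
    assert(((y_mask >> 8) & 0xFF) == 2 * row + 1);  // and byte 2*row + 1
    ++row;
  }
  assert(row == 8);  // exactly eight rows per pass
}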
1276
void aom_smooth_v_predictor_8x16_ssse3(
1277
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1278
    const uint8_t *LIBAOM_RESTRICT top_row,
1279
10.2k
    const uint8_t *LIBAOM_RESTRICT left_column) {
1280
10.2k
  const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
1281
10.2k
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
1282
1283
10.2k
  const __m128i weights1 = cvtepu8_epi16(weights);
1284
10.2k
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
1285
10.2k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1286
10.2k
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1287
10.2k
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1288
10.2k
  const __m128i scaled_bottom_left1 =
1289
10.2k
      _mm_mullo_epi16(inverted_weights1, bottom_left);
1290
10.2k
  const __m128i scaled_bottom_left2 =
1291
10.2k
      _mm_mullo_epi16(inverted_weights2, bottom_left);
1292
10.2k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1293
10.2k
  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
1294
91.8k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1295
81.6k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1296
81.6k
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1297
81.6k
    const __m128i scaled_bottom_left_y =
1298
81.6k
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1299
81.6k
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1300
81.6k
                                  &round);
1301
81.6k
    dst += stride;
1302
81.6k
  }
1303
91.8k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1304
81.6k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1305
81.6k
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1306
81.6k
    const __m128i scaled_bottom_left_y =
1307
81.6k
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1308
81.6k
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1309
81.6k
                                  &round);
1310
81.6k
    dst += stride;
1311
81.6k
  }
1312
10.2k
}
1313
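// The loads above index one flat table: smooth_weights concatenates the
// weight vectors for block sizes 4, 8, 16, 32 and 64, which is why the
// offsets used in this file are 0, 4, 12, 28 and 60. Because each size
// doubles the previous one, the running total telescopes:
// 4 + 8 + ... + n/2 == n - 4. Hypothetical helper making that explicit:
static const uint8_t *smooth_weights_for_size_sketch(const int n) {
  return smooth_weights + (n - 4);  // e.g. n == 16 -> smooth_weights + 12
}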
1314
void aom_smooth_v_predictor_8x32_ssse3(
1315
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1316
    const uint8_t *LIBAOM_RESTRICT top_row,
1317
3.72k
    const uint8_t *LIBAOM_RESTRICT left_column) {
1318
3.72k
  const __m128i zero = _mm_setzero_si128();
1319
3.72k
  const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
1320
3.72k
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
1321
3.72k
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
1322
3.72k
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
1323
3.72k
  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
1324
3.72k
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
1325
3.72k
  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
1326
3.72k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1327
3.72k
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1328
3.72k
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1329
3.72k
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1330
3.72k
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1331
3.72k
  const __m128i scaled_bottom_left1 =
1332
3.72k
      _mm_mullo_epi16(inverted_weights1, bottom_left);
1333
3.72k
  const __m128i scaled_bottom_left2 =
1334
3.72k
      _mm_mullo_epi16(inverted_weights2, bottom_left);
1335
3.72k
  const __m128i scaled_bottom_left3 =
1336
3.72k
      _mm_mullo_epi16(inverted_weights3, bottom_left);
1337
3.72k
  const __m128i scaled_bottom_left4 =
1338
3.72k
      _mm_mullo_epi16(inverted_weights4, bottom_left);
1339
3.72k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1340
3.72k
  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
1341
33.5k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1342
29.8k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1343
29.8k
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1344
29.8k
    const __m128i scaled_bottom_left_y =
1345
29.8k
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1346
29.8k
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1347
29.8k
                                  &round);
1348
29.8k
    dst += stride;
1349
29.8k
  }
1350
33.5k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1351
29.8k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1352
29.8k
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1353
29.8k
    const __m128i scaled_bottom_left_y =
1354
29.8k
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1355
29.8k
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1356
29.8k
                                  &round);
1357
29.8k
    dst += stride;
1358
29.8k
  }
1359
33.5k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1360
29.8k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1361
29.8k
    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
1362
29.8k
    const __m128i scaled_bottom_left_y =
1363
29.8k
        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
1364
29.8k
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1365
29.8k
                                  &round);
1366
29.8k
    dst += stride;
1367
29.8k
  }
1368
33.5k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1369
29.8k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1370
29.8k
    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
1371
29.8k
    const __m128i scaled_bottom_left_y =
1372
29.8k
        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
1373
29.8k
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1374
29.8k
                                  &round);
1375
29.8k
    dst += stride;
1376
29.8k
  }
1377
3.72k
}
1378
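// The 8x16 and 8x32 kernels above unroll the same 8-row pass once per
// 8-entry weights chunk. A scalar driver expressing that structure; a
// sketch only, since the real code keeps every chunk in registers.
static void smooth_v_8wide_sketch(uint8_t *dst, const ptrdiff_t stride,
                                  const int height,
                                  const uint8_t *top_row,
                                  const uint8_t *weights,
                                  const int bottom_left) {
  for (int chunk = 0; chunk < height / 8; ++chunk) {
    for (int y = 0; y < 8; ++y) {
      const int w = weights[8 * chunk + y];
      for (int x = 0; x < 8; ++x) {
        dst[x] =
            (uint8_t)((w * top_row[x] + (256 - w) * bottom_left + 128) >> 8);
      }
      dst += stride;
    }
  }
}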
1379
void aom_smooth_v_predictor_16x4_ssse3(
1380
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1381
    const uint8_t *LIBAOM_RESTRICT top_row,
1382
14.5k
    const uint8_t *LIBAOM_RESTRICT left_column) {
1383
14.5k
  const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
1384
14.5k
  const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
1385
14.5k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1386
14.5k
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1387
14.5k
  const __m128i scaled_bottom_left =
1388
14.5k
      _mm_mullo_epi16(inverted_weights, bottom_left);
1389
14.5k
  const __m128i round = _mm_set1_epi16(128);
1390
14.5k
  const __m128i top = LoadUnaligned16(top_row);
1391
14.5k
  const __m128i top_lo = cvtepu8_epi16(top);
1392
14.5k
  const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
1393
1394
14.5k
  __m128i y_select = _mm_set1_epi32(0x01000100);
1395
14.5k
  __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1396
14.5k
  __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1397
14.5k
  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1398
14.5k
                                 scaled_bottom_left_y, scaled_bottom_left_y,
1399
14.5k
                                 round);
1400
14.5k
  dst += stride;
1401
14.5k
  y_select = _mm_set1_epi32(0x03020302);
1402
14.5k
  weights_y = _mm_shuffle_epi8(weights, y_select);
1403
14.5k
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1404
14.5k
  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1405
14.5k
                                 scaled_bottom_left_y, scaled_bottom_left_y,
1406
14.5k
                                 round);
1407
14.5k
  dst += stride;
1408
14.5k
  y_select = _mm_set1_epi32(0x05040504);
1409
14.5k
  weights_y = _mm_shuffle_epi8(weights, y_select);
1410
14.5k
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1411
14.5k
  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1412
14.5k
                                 scaled_bottom_left_y, scaled_bottom_left_y,
1413
14.5k
                                 round);
1414
14.5k
  dst += stride;
1415
14.5k
  y_select = _mm_set1_epi32(0x07060706);
1416
14.5k
  weights_y = _mm_shuffle_epi8(weights, y_select);
1417
14.5k
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1418
14.5k
  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1419
14.5k
                                 scaled_bottom_left_y, scaled_bottom_left_y,
1420
14.5k
                                 round);
1421
14.5k
}
1422
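// The literal 128 used for `round` in the kernel above is the same bias
// spelled 1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1) in the sibling kernels; a
// compile-time check of that equivalence, assuming a C11 toolchain:
_Static_assert(128 == (1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)),
               "round bias must match the smooth-weight scale");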
1423
void aom_smooth_v_predictor_16x8_ssse3(
1424
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1425
    const uint8_t *LIBAOM_RESTRICT top_row,
1426
11.5k
    const uint8_t *LIBAOM_RESTRICT left_column) {
1427
11.5k
  const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
1428
11.5k
  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
1429
11.5k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1430
11.5k
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1431
11.5k
  const __m128i scaled_bottom_left =
1432
11.5k
      _mm_mullo_epi16(inverted_weights, bottom_left);
1433
11.5k
  const __m128i round = _mm_set1_epi16(128);
1434
11.5k
  const __m128i top = LoadUnaligned16(top_row);
1435
11.5k
  const __m128i top_lo = cvtepu8_epi16(top);
1436
11.5k
  const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
1437
104k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1438
92.5k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1439
92.5k
    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1440
92.5k
    const __m128i scaled_bottom_left_y =
1441
92.5k
        _mm_shuffle_epi8(scaled_bottom_left, y_select);
1442
92.5k
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1443
92.5k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1444
92.5k
                                   round);
1445
92.5k
    dst += stride;
1446
92.5k
  }
1447
11.5k
}
1448
1449
void aom_smooth_v_predictor_16x16_ssse3(
1450
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1451
    const uint8_t *LIBAOM_RESTRICT top_row,
1452
36.5k
    const uint8_t *LIBAOM_RESTRICT left_column) {
1453
36.5k
  const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
1454
36.5k
  const __m128i zero = _mm_setzero_si128();
1455
36.5k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1456
36.5k
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
1457
36.5k
  const __m128i weights_lo = cvtepu8_epi16(weights);
1458
36.5k
  const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
1459
36.5k
  const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
1460
36.5k
  const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
1461
36.5k
  const __m128i scaled_bottom_left_lo =
1462
36.5k
      _mm_mullo_epi16(inverted_weights_lo, bottom_left);
1463
36.5k
  const __m128i scaled_bottom_left_hi =
1464
36.5k
      _mm_mullo_epi16(inverted_weights_hi, bottom_left);
1465
36.5k
  const __m128i round = _mm_set1_epi16(128);
1466
1467
36.5k
  const __m128i top = LoadUnaligned16(top_row);
1468
36.5k
  const __m128i top_lo = cvtepu8_epi16(top);
1469
36.5k
  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
1470
328k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1471
292k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1472
292k
    const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
1473
292k
    const __m128i scaled_bottom_left_y =
1474
292k
        _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
1475
292k
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1476
292k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1477
292k
                                   round);
1478
292k
    dst += stride;
1479
292k
  }
1480
328k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1481
292k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1482
292k
    const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
1483
292k
    const __m128i scaled_bottom_left_y =
1484
292k
        _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
1485
292k
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1486
292k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1487
292k
                                   round);
1488
292k
    dst += stride;
1489
292k
  }
1490
36.5k
}
1491
1492
void aom_smooth_v_predictor_16x32_ssse3(
1493
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1494
    const uint8_t *LIBAOM_RESTRICT top_row,
1495
7.99k
    const uint8_t *LIBAOM_RESTRICT left_column) {
1496
7.99k
  const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
1497
7.99k
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
1498
7.99k
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
1499
7.99k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1500
7.99k
  const __m128i zero = _mm_setzero_si128();
1501
7.99k
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
1502
7.99k
  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
1503
7.99k
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
1504
7.99k
  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
1505
7.99k
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1506
7.99k
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1507
7.99k
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1508
7.99k
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1509
7.99k
  const __m128i scaled_bottom_left1 =
1510
7.99k
      _mm_mullo_epi16(inverted_weights1, bottom_left);
1511
7.99k
  const __m128i scaled_bottom_left2 =
1512
7.99k
      _mm_mullo_epi16(inverted_weights2, bottom_left);
1513
7.99k
  const __m128i scaled_bottom_left3 =
1514
7.99k
      _mm_mullo_epi16(inverted_weights3, bottom_left);
1515
7.99k
  const __m128i scaled_bottom_left4 =
1516
7.99k
      _mm_mullo_epi16(inverted_weights4, bottom_left);
1517
7.99k
  const __m128i round = _mm_set1_epi16(128);
1518
1519
7.99k
  const __m128i top = LoadUnaligned16(top_row);
1520
7.99k
  const __m128i top_lo = cvtepu8_epi16(top);
1521
7.99k
  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
1522
71.9k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1523
63.9k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1524
63.9k
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1525
63.9k
    const __m128i scaled_bottom_left_y =
1526
63.9k
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1527
63.9k
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1528
63.9k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1529
63.9k
                                   round);
1530
63.9k
    dst += stride;
1531
63.9k
  }
1532
71.9k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1533
63.9k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1534
63.9k
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1535
63.9k
    const __m128i scaled_bottom_left_y =
1536
63.9k
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1537
63.9k
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1538
63.9k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1539
63.9k
                                   round);
1540
63.9k
    dst += stride;
1541
63.9k
  }
1542
71.9k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1543
63.9k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1544
63.9k
    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
1545
63.9k
    const __m128i scaled_bottom_left_y =
1546
63.9k
        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
1547
63.9k
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1548
63.9k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1549
63.9k
                                   round);
1550
63.9k
    dst += stride;
1551
63.9k
  }
1552
71.9k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1553
63.9k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1554
63.9k
    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
1555
63.9k
    const __m128i scaled_bottom_left_y =
1556
63.9k
        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
1557
63.9k
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1558
63.9k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1559
63.9k
                                   round);
1560
63.9k
    dst += stride;
1561
63.9k
  }
1562
7.99k
}
1563
1564
void aom_smooth_v_predictor_16x64_ssse3(
1565
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1566
    const uint8_t *LIBAOM_RESTRICT top_row,
1567
1.77k
    const uint8_t *LIBAOM_RESTRICT left_column) {
1568
1.77k
  const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
1569
1.77k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1570
1.77k
  const __m128i round = _mm_set1_epi16(128);
1571
1.77k
  const __m128i zero = _mm_setzero_si128();
1572
1.77k
  const __m128i top = LoadUnaligned16(top_row);
1573
1.77k
  const __m128i top_lo = cvtepu8_epi16(top);
1574
1.77k
  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
1575
1.77k
  const uint8_t *weights_base_ptr = smooth_weights + 60;
1576
8.86k
  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
1577
7.09k
    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
1578
7.09k
    const __m128i weights_lo = cvtepu8_epi16(weights);
1579
7.09k
    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
1580
7.09k
    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
1581
7.09k
    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
1582
7.09k
    const __m128i scaled_bottom_left_lo =
1583
7.09k
        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
1584
7.09k
    const __m128i scaled_bottom_left_hi =
1585
7.09k
        _mm_mullo_epi16(inverted_weights_hi, bottom_left);
1586
1587
63.8k
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1588
56.7k
      const __m128i y_select = _mm_set1_epi32(y_mask);
1589
56.7k
      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
1590
56.7k
      const __m128i scaled_bottom_left_y =
1591
56.7k
          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
1592
56.7k
      write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1593
56.7k
                                     scaled_bottom_left_y, scaled_bottom_left_y,
1594
56.7k
                                     round);
1595
56.7k
      dst += stride;
1596
56.7k
    }
1597
63.8k
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1598
56.7k
      const __m128i y_select = _mm_set1_epi32(y_mask);
1599
56.7k
      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
1600
56.7k
      const __m128i scaled_bottom_left_y =
1601
56.7k
          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
1602
56.7k
      write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1603
56.7k
                                     scaled_bottom_left_y, scaled_bottom_left_y,
1604
56.7k
                                     round);
1605
56.7k
      dst += stride;
1606
56.7k
    }
1607
7.09k
  }
1608
1.77k
}
1609
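// For 64-row blocks the weight table is consumed 16 entries per iteration
// of the left_offset loop above, and each 16-entry chunk feeds two 8-row
// pshufb passes (weights_lo, then weights_hi). A sketch of how a row index
// maps onto that structure; the helper is illustrative only.
static void row_decomposition_sketch(const int y, int *chunk, int *half,
                                     int *lane) {
  *chunk = y >> 4;       // which LoadUnaligned16 from weights_base_ptr
  *half = (y >> 3) & 1;  // 0: weights_lo pass, 1: weights_hi pass
  *lane = y & 7;         // epi16 lane picked by the y_mask shuffle
}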
1610
void aom_smooth_v_predictor_32x8_ssse3(
1611
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1612
    const uint8_t *LIBAOM_RESTRICT top_row,
1613
21.4k
    const uint8_t *LIBAOM_RESTRICT left_column) {
1614
21.4k
  const __m128i zero = _mm_setzero_si128();
1615
21.4k
  const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
1616
21.4k
  const __m128i top_lo = LoadUnaligned16(top_row);
1617
21.4k
  const __m128i top_hi = LoadUnaligned16(top_row + 16);
1618
21.4k
  const __m128i top1 = cvtepu8_epi16(top_lo);
1619
21.4k
  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
1620
21.4k
  const __m128i top3 = cvtepu8_epi16(top_hi);
1621
21.4k
  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
1622
21.4k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1623
21.4k
  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
1624
21.4k
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1625
21.4k
  const __m128i scaled_bottom_left =
1626
21.4k
      _mm_mullo_epi16(inverted_weights, bottom_left);
1627
21.4k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1628
192k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1629
171k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1630
171k
    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1631
171k
    const __m128i scaled_bottom_left_y =
1632
171k
        _mm_shuffle_epi8(scaled_bottom_left, y_select);
1633
171k
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1634
171k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1635
171k
                                   round);
1636
171k
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1637
171k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1638
171k
                                   round);
1639
171k
    dst += stride;
1640
171k
  }
1641
21.4k
}
1642
1643
void aom_smooth_v_predictor_32x16_ssse3(
1644
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1645
    const uint8_t *LIBAOM_RESTRICT top_row,
1646
9.24k
    const uint8_t *LIBAOM_RESTRICT left_column) {
1647
9.24k
  const __m128i zero = _mm_setzero_si128();
1648
9.24k
  const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
1649
9.24k
  const __m128i top_lo = LoadUnaligned16(top_row);
1650
9.24k
  const __m128i top_hi = LoadUnaligned16(top_row + 16);
1651
9.24k
  const __m128i top1 = cvtepu8_epi16(top_lo);
1652
9.24k
  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
1653
9.24k
  const __m128i top3 = cvtepu8_epi16(top_hi);
1654
9.24k
  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
1655
9.24k
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
1656
9.24k
  const __m128i weights1 = cvtepu8_epi16(weights);
1657
9.24k
  const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
1658
9.24k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1659
9.24k
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1660
9.24k
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1661
9.24k
  const __m128i scaled_bottom_left1 =
1662
9.24k
      _mm_mullo_epi16(inverted_weights1, bottom_left);
1663
9.24k
  const __m128i scaled_bottom_left2 =
1664
9.24k
      _mm_mullo_epi16(inverted_weights2, bottom_left);
1665
9.24k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1666
83.2k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1667
73.9k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1668
73.9k
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1669
73.9k
    const __m128i scaled_bottom_left_y =
1670
73.9k
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1671
73.9k
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1672
73.9k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1673
73.9k
                                   round);
1674
73.9k
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1675
73.9k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1676
73.9k
                                   round);
1677
73.9k
    dst += stride;
1678
73.9k
  }
1679
83.2k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1680
73.9k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1681
73.9k
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1682
73.9k
    const __m128i scaled_bottom_left_y =
1683
73.9k
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1684
73.9k
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1685
73.9k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1686
73.9k
                                   round);
1687
73.9k
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1688
73.9k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1689
73.9k
                                   round);
1690
73.9k
    dst += stride;
1691
73.9k
  }
1692
9.24k
}
1693
1694
void aom_smooth_v_predictor_32x32_ssse3(
1695
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1696
    const uint8_t *LIBAOM_RESTRICT top_row,
1697
49.5k
    const uint8_t *LIBAOM_RESTRICT left_column) {
1698
49.5k
  const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
1699
49.5k
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
1700
49.5k
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
1701
49.5k
  const __m128i zero = _mm_setzero_si128();
1702
49.5k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1703
49.5k
  const __m128i top_lo = LoadUnaligned16(top_row);
1704
49.5k
  const __m128i top_hi = LoadUnaligned16(top_row + 16);
1705
49.5k
  const __m128i top1 = cvtepu8_epi16(top_lo);
1706
49.5k
  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
1707
49.5k
  const __m128i top3 = cvtepu8_epi16(top_hi);
1708
49.5k
  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
1709
49.5k
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
1710
49.5k
  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
1711
49.5k
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
1712
49.5k
  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
1713
49.5k
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1714
49.5k
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1715
49.5k
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1716
49.5k
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1717
49.5k
  const __m128i scaled_bottom_left1 =
1718
49.5k
      _mm_mullo_epi16(inverted_weights1, bottom_left);
1719
49.5k
  const __m128i scaled_bottom_left2 =
1720
49.5k
      _mm_mullo_epi16(inverted_weights2, bottom_left);
1721
49.5k
  const __m128i scaled_bottom_left3 =
1722
49.5k
      _mm_mullo_epi16(inverted_weights3, bottom_left);
1723
49.5k
  const __m128i scaled_bottom_left4 =
1724
49.5k
      _mm_mullo_epi16(inverted_weights4, bottom_left);
1725
49.5k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1726
446k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1727
396k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1728
396k
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1729
396k
    const __m128i scaled_bottom_left_y =
1730
396k
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1731
396k
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1732
396k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1733
396k
                                   round);
1734
396k
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1735
396k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1736
396k
                                   round);
1737
396k
    dst += stride;
1738
396k
  }
1739
446k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1740
396k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1741
396k
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1742
396k
    const __m128i scaled_bottom_left_y =
1743
396k
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1744
396k
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1745
396k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1746
396k
                                   round);
1747
396k
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1748
396k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1749
396k
                                   round);
1750
396k
    dst += stride;
1751
396k
  }
1752
446k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1753
396k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1754
396k
    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
1755
396k
    const __m128i scaled_bottom_left_y =
1756
396k
        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
1757
396k
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1758
396k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1759
396k
                                   round);
1760
396k
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1761
396k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1762
396k
                                   round);
1763
396k
    dst += stride;
1764
396k
  }
1765
446k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1766
396k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1767
396k
    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
1768
396k
    const __m128i scaled_bottom_left_y =
1769
396k
        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
1770
396k
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1771
396k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1772
396k
                                   round);
1773
396k
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1774
396k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1775
396k
                                   round);
1776
396k
    dst += stride;
1777
396k
  }
1778
49.5k
}
1779
1780
void aom_smooth_v_predictor_32x64_ssse3(
1781
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1782
    const uint8_t *LIBAOM_RESTRICT top_row,
1783
1.49k
    const uint8_t *LIBAOM_RESTRICT left_column) {
1784
1.49k
  const __m128i zero = _mm_setzero_si128();
1785
1.49k
  const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
1786
1.49k
  const __m128i top_lo = LoadUnaligned16(top_row);
1787
1.49k
  const __m128i top_hi = LoadUnaligned16(top_row + 16);
1788
1.49k
  const __m128i top1 = cvtepu8_epi16(top_lo);
1789
1.49k
  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
1790
1.49k
  const __m128i top3 = cvtepu8_epi16(top_hi);
1791
1.49k
  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
1792
1.49k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1793
1.49k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1794
1.49k
  const uint8_t *weights_base_ptr = smooth_weights + 60;
1795
7.45k
  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
1796
5.96k
    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
1797
5.96k
    const __m128i weights_lo = cvtepu8_epi16(weights);
1798
5.96k
    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
1799
5.96k
    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
1800
5.96k
    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
1801
5.96k
    const __m128i scaled_bottom_left_lo =
1802
5.96k
        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
1803
5.96k
    const __m128i scaled_bottom_left_hi =
1804
5.96k
        _mm_mullo_epi16(inverted_weights_hi, bottom_left);
1805
1806
53.6k
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1807
47.7k
      const __m128i y_select = _mm_set1_epi32(y_mask);
1808
47.7k
      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
1809
47.7k
      const __m128i scaled_bottom_left_y =
1810
47.7k
          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
1811
47.7k
      write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1812
47.7k
                                     scaled_bottom_left_y, scaled_bottom_left_y,
1813
47.7k
                                     round);
1814
47.7k
      write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1815
47.7k
                                     scaled_bottom_left_y, scaled_bottom_left_y,
1816
47.7k
                                     round);
1817
47.7k
      dst += stride;
1818
47.7k
    }
1819
53.6k
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1820
47.7k
      const __m128i y_select = _mm_set1_epi32(y_mask);
1821
47.7k
      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
1822
47.7k
      const __m128i scaled_bottom_left_y =
1823
47.7k
          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
1824
47.7k
      write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1825
47.7k
                                     scaled_bottom_left_y, scaled_bottom_left_y,
1826
47.7k
                                     round);
1827
47.7k
      write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1828
47.7k
                                     scaled_bottom_left_y, scaled_bottom_left_y,
1829
47.7k
                                     round);
1830
47.7k
      dst += stride;
1831
47.7k
    }
1832
5.96k
  }
1833
1.49k
}
1834
1835
void aom_smooth_v_predictor_64x16_ssse3(
1836
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1837
    const uint8_t *LIBAOM_RESTRICT top_row,
1838
20.2k
    const uint8_t *LIBAOM_RESTRICT left_column) {
1839
20.2k
  const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
1840
20.2k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1841
20.2k
  const __m128i zero = _mm_setzero_si128();
1842
20.2k
  const __m128i top_lolo = LoadUnaligned16(top_row);
1843
20.2k
  const __m128i top_lohi = LoadUnaligned16(top_row + 16);
1844
20.2k
  const __m128i top1 = cvtepu8_epi16(top_lolo);
1845
20.2k
  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
1846
20.2k
  const __m128i top3 = cvtepu8_epi16(top_lohi);
1847
20.2k
  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
1848
1849
20.2k
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
1850
20.2k
  const __m128i weights1 = cvtepu8_epi16(weights);
1851
20.2k
  const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
1852
20.2k
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1853
20.2k
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1854
20.2k
  const __m128i top_hilo = LoadUnaligned16(top_row + 32);
1855
20.2k
  const __m128i top_hihi = LoadUnaligned16(top_row + 48);
1856
20.2k
  const __m128i top5 = cvtepu8_epi16(top_hilo);
1857
20.2k
  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
1858
20.2k
  const __m128i top7 = cvtepu8_epi16(top_hihi);
1859
20.2k
  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
1860
20.2k
  const __m128i scaled_bottom_left1 =
1861
20.2k
      _mm_mullo_epi16(inverted_weights1, bottom_left);
1862
20.2k
  const __m128i scaled_bottom_left2 =
1863
20.2k
      _mm_mullo_epi16(inverted_weights2, bottom_left);
1864
20.2k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1865
182k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1866
162k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1867
162k
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1868
162k
    const __m128i scaled_bottom_left_y =
1869
162k
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1870
162k
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1871
162k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1872
162k
                                   round);
1873
162k
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1874
162k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1875
162k
                                   round);
1876
162k
    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
1877
162k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1878
162k
                                   round);
1879
162k
    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
1880
162k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1881
162k
                                   round);
1882
162k
    dst += stride;
1883
162k
  }
1884
182k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1885
162k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1886
162k
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1887
162k
    const __m128i scaled_bottom_left_y =
1888
162k
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1889
162k
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1890
162k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1891
162k
                                   round);
1892
162k
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1893
162k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1894
162k
                                   round);
1895
162k
    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
1896
162k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1897
162k
                                   round);
1898
162k
    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
1899
162k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1900
162k
                                   round);
1901
162k
    dst += stride;
1902
162k
  }
1903
20.2k
}
1904
1905
void aom_smooth_v_predictor_64x32_ssse3(
1906
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1907
    const uint8_t *LIBAOM_RESTRICT top_row,
1908
3.33k
    const uint8_t *LIBAOM_RESTRICT left_column) {
1909
3.33k
  const __m128i zero = _mm_setzero_si128();
1910
3.33k
  const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
1911
3.33k
  const __m128i top_lolo = LoadUnaligned16(top_row);
1912
3.33k
  const __m128i top_lohi = LoadUnaligned16(top_row + 16);
1913
3.33k
  const __m128i top1 = cvtepu8_epi16(top_lolo);
1914
3.33k
  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
1915
3.33k
  const __m128i top3 = cvtepu8_epi16(top_lohi);
1916
3.33k
  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
1917
3.33k
  const __m128i top_hilo = LoadUnaligned16(top_row + 32);
1918
3.33k
  const __m128i top_hihi = LoadUnaligned16(top_row + 48);
1919
3.33k
  const __m128i top5 = cvtepu8_epi16(top_hilo);
1920
3.33k
  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
1921
3.33k
  const __m128i top7 = cvtepu8_epi16(top_hihi);
1922
3.33k
  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
1923
3.33k
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
1924
3.33k
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
1925
3.33k
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
1926
3.33k
  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
1927
3.33k
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
1928
3.33k
  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
1929
3.33k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1930
3.33k
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1931
3.33k
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1932
3.33k
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1933
3.33k
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1934
3.33k
  const __m128i scaled_bottom_left1 =
1935
3.33k
      _mm_mullo_epi16(inverted_weights1, bottom_left);
1936
3.33k
  const __m128i scaled_bottom_left2 =
1937
3.33k
      _mm_mullo_epi16(inverted_weights2, bottom_left);
1938
3.33k
  const __m128i scaled_bottom_left3 =
1939
3.33k
      _mm_mullo_epi16(inverted_weights3, bottom_left);
1940
3.33k
  const __m128i scaled_bottom_left4 =
1941
3.33k
      _mm_mullo_epi16(inverted_weights4, bottom_left);
1942
3.33k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1943
1944
30.0k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1945
26.6k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1946
26.6k
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1947
26.6k
    const __m128i scaled_bottom_left_y =
1948
26.6k
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1949
26.6k
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1950
26.6k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1951
26.6k
                                   round);
1952
26.6k
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1953
26.6k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1954
26.6k
                                   round);
1955
26.6k
    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
1956
26.6k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1957
26.6k
                                   round);
1958
26.6k
    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
1959
26.6k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1960
26.6k
                                   round);
1961
26.6k
    dst += stride;
1962
26.6k
  }
1963
30.0k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1964
26.6k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1965
26.6k
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1966
26.6k
    const __m128i scaled_bottom_left_y =
1967
26.6k
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1968
26.6k
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1969
26.6k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1970
26.6k
                                   round);
1971
26.6k
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1972
26.6k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1973
26.6k
                                   round);
1974
26.6k
    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
1975
26.6k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1976
26.6k
                                   round);
1977
26.6k
    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
1978
26.6k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1979
26.6k
                                   round);
1980
26.6k
    dst += stride;
1981
26.6k
  }
1982
30.0k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1983
26.6k
    const __m128i y_select = _mm_set1_epi32(y_mask);
1984
26.6k
    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
1985
26.6k
    const __m128i scaled_bottom_left_y =
1986
26.6k
        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
1987
26.6k
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1988
26.6k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1989
26.6k
                                   round);
1990
26.6k
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1991
26.6k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1992
26.6k
                                   round);
1993
26.6k
    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
1994
26.6k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1995
26.6k
                                   round);
1996
26.6k
    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
1997
26.6k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
1998
26.6k
                                   round);
1999
26.6k
    dst += stride;
2000
26.6k
  }
2001
30.0k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2002
26.6k
    const __m128i y_select = _mm_set1_epi32(y_mask);
2003
26.6k
    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
2004
26.6k
    const __m128i scaled_bottom_left_y =
2005
26.6k
        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
2006
26.6k
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
2007
26.6k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
2008
26.6k
                                   round);
2009
26.6k
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
2010
26.6k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
2011
26.6k
                                   round);
2012
26.6k
    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
2013
26.6k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
2014
26.6k
                                   round);
2015
26.6k
    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
2016
26.6k
                                   scaled_bottom_left_y, scaled_bottom_left_y,
2017
26.6k
                                   round);
2018
26.6k
    dst += stride;
2019
26.6k
  }
2020
3.33k
}
2021
2022
void aom_smooth_v_predictor_64x64_ssse3(
2023
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2024
    const uint8_t *LIBAOM_RESTRICT top_row,
2025
7.35k
    const uint8_t *LIBAOM_RESTRICT left_column) {
2026
7.35k
  const __m128i zero = _mm_setzero_si128();
2027
7.35k
  const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
2028
7.35k
  const __m128i top_lolo = LoadUnaligned16(top_row);
2029
7.35k
  const __m128i top_lohi = LoadUnaligned16(top_row + 16);
2030
7.35k
  const __m128i top1 = cvtepu8_epi16(top_lolo);
2031
7.35k
  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
2032
7.35k
  const __m128i top3 = cvtepu8_epi16(top_lohi);
2033
7.35k
  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
2034
7.35k
  const __m128i top_hilo = LoadUnaligned16(top_row + 32);
2035
7.35k
  const __m128i top_hihi = LoadUnaligned16(top_row + 48);
2036
7.35k
  const __m128i top5 = cvtepu8_epi16(top_hilo);
2037
7.35k
  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
2038
7.35k
  const __m128i top7 = cvtepu8_epi16(top_hihi);
2039
7.35k
  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
2040
7.35k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2041
7.35k
  const __m128i round = _mm_set1_epi16(128);
2042
7.35k
  const uint8_t *weights_base_ptr = smooth_weights + 60;
2043
36.7k
  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
2044
29.4k
    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
2045
29.4k
    const __m128i weights_lo = cvtepu8_epi16(weights);
2046
29.4k
    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
2047
29.4k
    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
2048
29.4k
    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
2049
29.4k
    const __m128i scaled_bottom_left_lo =
2050
29.4k
        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
2051
29.4k
    const __m128i scaled_bottom_left_hi =
2052
29.4k
        _mm_mullo_epi16(inverted_weights_hi, bottom_left);
2053
264k
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2054
235k
      const __m128i y_select = _mm_set1_epi32(y_mask);
2055
235k
      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
2056
235k
      const __m128i scaled_bottom_left_y =
2057
235k
          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
2058
235k
      write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
2059
235k
                                     scaled_bottom_left_y, scaled_bottom_left_y,
2060
235k
                                     round);
2061
235k
      write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
2062
235k
                                     scaled_bottom_left_y, scaled_bottom_left_y,
2063
235k
                                     round);
2064
235k
      write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
2065
235k
                                     scaled_bottom_left_y, scaled_bottom_left_y,
2066
235k
                                     round);
2067
235k
      write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
2068
235k
                                     scaled_bottom_left_y, scaled_bottom_left_y,
2069
235k
                                     round);
2070
235k
      dst += stride;
2071
235k
    }
2072
264k
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2073
235k
      const __m128i y_select = _mm_set1_epi32(y_mask);
2074
235k
      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
2075
235k
      const __m128i scaled_bottom_left_y =
2076
235k
          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
2077
235k
      write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
2078
235k
                                     scaled_bottom_left_y, scaled_bottom_left_y,
2079
235k
                                     round);
2080
235k
      write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
2081
235k
                                     scaled_bottom_left_y, scaled_bottom_left_y,
2082
235k
                                     round);
2083
235k
      write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
2084
235k
                                     scaled_bottom_left_y, scaled_bottom_left_y,
2085
235k
                                     round);
2086
235k
      write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
2087
235k
                                     scaled_bottom_left_y, scaled_bottom_left_y,
2088
235k
                                     round);
2089
235k
      dst += stride;
2090
235k
    }
2091
29.4k
  }
2092
7.35k
}
2093
2094
// -----------------------------------------------------------------------------
2095
// SMOOTH_H_PRED
2096
static AOM_FORCE_INLINE void write_smooth_horizontal_sum4(
2097
    uint8_t *LIBAOM_RESTRICT dst, const __m128i *left_y, const __m128i *weights,
2098
702k
    const __m128i *scaled_top_right, const __m128i *round) {
2099
702k
  const __m128i weighted_left_y = _mm_mullo_epi16(*left_y, *weights);
2100
702k
  const __m128i pred_sum = _mm_add_epi32(*scaled_top_right, weighted_left_y);
2101
  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
2102
702k
  const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, *round), 8);
2103
702k
  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
2104
702k
  Store4(dst, _mm_shuffle_epi8(pred, cvtepi32_epi8));
2105
702k
}
2106
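// The horizontal predictor mirrors the vertical one with the axes swapped:
// each column blends the left pixel with the top-right pixel, and the
// weights index x rather than y. A scalar sketch under the same
// SMOOTH_WEIGHT_LOG2_SCALE == 8 assumption; the helper name is
// illustrative.
static void smooth_h_scalar_sketch(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *top_row,
                                   const uint8_t *left_column,
                                   const uint8_t *weights, int width,
                                   int height) {
  const int top_right = top_row[width - 1];
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      // scaled_top_right in the SIMD code hoists the second product.
      dst[x] = (uint8_t)((weights[x] * left_column[y] +
                          (256 - weights[x]) * top_right + 128) >>
                         8);
    }
    dst += stride;
  }
}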
2107
void aom_smooth_h_predictor_4x4_ssse3(
2108
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2109
    const uint8_t *LIBAOM_RESTRICT top_row,
2110
83.6k
    const uint8_t *LIBAOM_RESTRICT left_column) {
2111
83.6k
  const __m128i top_right = _mm_set1_epi32(top_row[3]);
2112
83.6k
  const __m128i left = cvtepu8_epi32(Load4(left_column));
2113
83.6k
  const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
2114
83.6k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2115
83.6k
  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
2116
83.6k
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2117
83.6k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2118
83.6k
  __m128i left_y = _mm_shuffle_epi32(left, 0);
2119
83.6k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2120
83.6k
                               &round);
2121
83.6k
  dst += stride;
2122
83.6k
  left_y = _mm_shuffle_epi32(left, 0x55);
2123
83.6k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2124
83.6k
                               &round);
2125
83.6k
  dst += stride;
2126
83.6k
  left_y = _mm_shuffle_epi32(left, 0xaa);
2127
83.6k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2128
83.6k
                               &round);
2129
83.6k
  dst += stride;
2130
83.6k
  left_y = _mm_shuffle_epi32(left, 0xff);
2131
83.6k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2132
83.6k
                               &round);
2133
83.6k
}
2134
2135
void aom_smooth_h_predictor_4x8_ssse3(
2136
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2137
    const uint8_t *LIBAOM_RESTRICT top_row,
2138
20.9k
    const uint8_t *LIBAOM_RESTRICT left_column) {
2139
20.9k
  const __m128i top_right = _mm_set1_epi32(top_row[3]);
2140
20.9k
  const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
2141
20.9k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2142
20.9k
  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
2143
20.9k
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2144
20.9k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2145
20.9k
  __m128i left = cvtepu8_epi32(Load4(left_column));
2146
20.9k
  __m128i left_y = _mm_shuffle_epi32(left, 0);
2147
20.9k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2148
20.9k
                               &round);
2149
20.9k
  dst += stride;
2150
20.9k
  left_y = _mm_shuffle_epi32(left, 0x55);
2151
20.9k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2152
20.9k
                               &round);
2153
20.9k
  dst += stride;
2154
20.9k
  left_y = _mm_shuffle_epi32(left, 0xaa);
2155
20.9k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2156
20.9k
                               &round);
2157
20.9k
  dst += stride;
2158
20.9k
  left_y = _mm_shuffle_epi32(left, 0xff);
2159
20.9k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2160
20.9k
                               &round);
2161
20.9k
  dst += stride;
2162
2163
20.9k
  left = cvtepu8_epi32(Load4(left_column + 4));
2164
20.9k
  left_y = _mm_shuffle_epi32(left, 0);
2165
20.9k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2166
20.9k
                               &round);
2167
20.9k
  dst += stride;
2168
20.9k
  left_y = _mm_shuffle_epi32(left, 0x55);
2169
20.9k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2170
20.9k
                               &round);
2171
20.9k
  dst += stride;
2172
20.9k
  left_y = _mm_shuffle_epi32(left, 0xaa);
2173
20.9k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2174
20.9k
                               &round);
2175
20.9k
  dst += stride;
2176
20.9k
  left_y = _mm_shuffle_epi32(left, 0xff);
2177
20.9k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2178
20.9k
                               &round);
2179
20.9k
}
2180
2181
void aom_smooth_h_predictor_4x16_ssse3(
2182
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2183
    const uint8_t *LIBAOM_RESTRICT top_row,
2184
12.5k
    const uint8_t *LIBAOM_RESTRICT left_column) {
2185
12.5k
  const __m128i top_right = _mm_set1_epi32(top_row[3]);
2186
12.5k
  const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
2187
12.5k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2188
12.5k
  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
2189
12.5k
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2190
12.5k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2191
12.5k
  __m128i left = cvtepu8_epi32(Load4(left_column));
2192
12.5k
  __m128i left_y = _mm_shuffle_epi32(left, 0);
2193
12.5k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2194
12.5k
                               &round);
2195
12.5k
  dst += stride;
2196
12.5k
  left_y = _mm_shuffle_epi32(left, 0x55);
2197
12.5k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2198
12.5k
                               &round);
2199
12.5k
  dst += stride;
2200
12.5k
  left_y = _mm_shuffle_epi32(left, 0xaa);
2201
12.5k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2202
12.5k
                               &round);
2203
12.5k
  dst += stride;
2204
12.5k
  left_y = _mm_shuffle_epi32(left, 0xff);
2205
12.5k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2206
12.5k
                               &round);
2207
12.5k
  dst += stride;
2208
2209
12.5k
  left = cvtepu8_epi32(Load4(left_column + 4));
2210
12.5k
  left_y = _mm_shuffle_epi32(left, 0);
2211
12.5k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2212
12.5k
                               &round);
2213
12.5k
  dst += stride;
2214
12.5k
  left_y = _mm_shuffle_epi32(left, 0x55);
2215
12.5k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2216
12.5k
                               &round);
2217
12.5k
  dst += stride;
2218
12.5k
  left_y = _mm_shuffle_epi32(left, 0xaa);
2219
12.5k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2220
12.5k
                               &round);
2221
12.5k
  dst += stride;
2222
12.5k
  left_y = _mm_shuffle_epi32(left, 0xff);
2223
12.5k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2224
12.5k
                               &round);
2225
12.5k
  dst += stride;
2226
2227
12.5k
  left = cvtepu8_epi32(Load4(left_column + 8));
2228
12.5k
  left_y = _mm_shuffle_epi32(left, 0);
2229
12.5k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2230
12.5k
                               &round);
2231
12.5k
  dst += stride;
2232
12.5k
  left_y = _mm_shuffle_epi32(left, 0x55);
2233
12.5k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2234
12.5k
                               &round);
2235
12.5k
  dst += stride;
2236
12.5k
  left_y = _mm_shuffle_epi32(left, 0xaa);
2237
12.5k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2238
12.5k
                               &round);
2239
12.5k
  dst += stride;
2240
12.5k
  left_y = _mm_shuffle_epi32(left, 0xff);
2241
12.5k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2242
12.5k
                               &round);
2243
12.5k
  dst += stride;
2244
2245
12.5k
  left = cvtepu8_epi32(Load4(left_column + 12));
2246
12.5k
  left_y = _mm_shuffle_epi32(left, 0);
2247
12.5k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2248
12.5k
                               &round);
2249
12.5k
  dst += stride;
2250
12.5k
  left_y = _mm_shuffle_epi32(left, 0x55);
2251
12.5k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2252
12.5k
                               &round);
2253
12.5k
  dst += stride;
2254
12.5k
  left_y = _mm_shuffle_epi32(left, 0xaa);
2255
12.5k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2256
12.5k
                               &round);
2257
12.5k
  dst += stride;
2258
12.5k
  left_y = _mm_shuffle_epi32(left, 0xff);
2259
12.5k
  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2260
12.5k
                               &round);
2261
12.5k
}
2262
2263
// For SMOOTH_H, the |pixels| arguments of the write_smooth_directional_sum
2264
// helpers carry the repeated left value for the row. For SMOOTH_V, |pixels| is
2265
// a segment of the top row (or the whole top row) and |weights| is repeated.
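The "repeated left value" is produced with pshufb: a control vector such as _mm_set1_epi32(0x03020302) repeats byte indices 2,3 in every 16-bit position, which replicates 16-bit lane 1 of |left| across the register. The y_mask loops below start at 0x01000100 (lane 0), step by 0x02020202, and stop below 0x0F0E0F0F, so they emit exactly the eight masks for lanes 0..7, one row each. A minimal sketch of the idiom, assuming SSSE3:

#include <tmmintrin.h>

// Broadcast 16-bit lane |i| (0..7) of |v| to all eight lanes via pshufb.
// i = 0 yields the mask 0x01000100, i = 1 yields 0x03020302, and so on,
// matching the y_select values used throughout these predictors.
static __m128i broadcast_epi16(__m128i v, int i) {
  const short ctrl = (short)(((2 * i + 1) << 8) | (2 * i));
  return _mm_shuffle_epi8(v, _mm_set1_epi16(ctrl));
}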
2266
void aom_smooth_h_predictor_8x4_ssse3(
2267
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2268
    const uint8_t *LIBAOM_RESTRICT top_row,
2269
35.0k
    const uint8_t *LIBAOM_RESTRICT left_column) {
2270
35.0k
  const __m128i top_right = _mm_set1_epi16(top_row[7]);
2271
35.0k
  const __m128i left = cvtepu8_epi16(Load4(left_column));
2272
35.0k
  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
2273
35.0k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2274
35.0k
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
2275
35.0k
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2276
35.0k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2277
35.0k
  __m128i y_select = _mm_set1_epi32(0x01000100);
2278
35.0k
  __m128i left_y = _mm_shuffle_epi8(left, y_select);
2279
35.0k
  write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2280
35.0k
                                &round);
2281
35.0k
  dst += stride;
2282
35.0k
  y_select = _mm_set1_epi32(0x03020302);
2283
35.0k
  left_y = _mm_shuffle_epi8(left, y_select);
2284
35.0k
  write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2285
35.0k
                                &round);
2286
35.0k
  dst += stride;
2287
35.0k
  y_select = _mm_set1_epi32(0x05040504);
2288
35.0k
  left_y = _mm_shuffle_epi8(left, y_select);
2289
35.0k
  write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2290
35.0k
                                &round);
2291
35.0k
  dst += stride;
2292
35.0k
  y_select = _mm_set1_epi32(0x07060706);
2293
35.0k
  left_y = _mm_shuffle_epi8(left, y_select);
2294
35.0k
  write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2295
35.0k
                                &round);
2296
35.0k
}
2297
2298
void aom_smooth_h_predictor_8x8_ssse3(
2299
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2300
    const uint8_t *LIBAOM_RESTRICT top_row,
2301
50.1k
    const uint8_t *LIBAOM_RESTRICT left_column) {
2302
50.1k
  const __m128i top_right = _mm_set1_epi16(top_row[7]);
2303
50.1k
  const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2304
50.1k
  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
2305
50.1k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2306
50.1k
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
2307
50.1k
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2308
50.1k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2309
451k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2310
401k
    const __m128i y_select = _mm_set1_epi32(y_mask);
2311
401k
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2312
401k
    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2313
401k
                                  &round);
2314
401k
    dst += stride;
2315
401k
  }
2316
50.1k
}
2317
2318
void aom_smooth_h_predictor_8x16_ssse3(
2319
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2320
    const uint8_t *LIBAOM_RESTRICT top_row,
2321
12.1k
    const uint8_t *LIBAOM_RESTRICT left_column) {
2322
12.1k
  const __m128i top_right = _mm_set1_epi16(top_row[7]);
2323
12.1k
  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
2324
12.1k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2325
12.1k
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
2326
12.1k
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2327
12.1k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2328
12.1k
  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2329
109k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2330
97.3k
    const __m128i y_select = _mm_set1_epi32(y_mask);
2331
97.3k
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2332
97.3k
    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2333
97.3k
                                  &round);
2334
97.3k
    dst += stride;
2335
97.3k
  }
2336
12.1k
  left = cvtepu8_epi16(LoadLo8(left_column + 8));
2337
109k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2338
97.3k
    const __m128i y_select = _mm_set1_epi32(y_mask);
2339
97.3k
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2340
97.3k
    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2341
97.3k
                                  &round);
2342
97.3k
    dst += stride;
2343
97.3k
  }
2344
12.1k
}
2345
2346
void aom_smooth_h_predictor_8x32_ssse3(
2347
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2348
    const uint8_t *LIBAOM_RESTRICT top_row,
2349
6.03k
    const uint8_t *LIBAOM_RESTRICT left_column) {
2350
6.03k
  const __m128i top_right = _mm_set1_epi16(top_row[7]);
2351
6.03k
  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
2352
6.03k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2353
6.03k
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
2354
6.03k
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2355
6.03k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2356
6.03k
  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2357
54.2k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2358
48.2k
    const __m128i y_select = _mm_set1_epi32(y_mask);
2359
48.2k
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2360
48.2k
    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2361
48.2k
                                  &round);
2362
48.2k
    dst += stride;
2363
48.2k
  }
2364
6.03k
  left = cvtepu8_epi16(LoadLo8(left_column + 8));
2365
54.2k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2366
48.2k
    const __m128i y_select = _mm_set1_epi32(y_mask);
2367
48.2k
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2368
48.2k
    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2369
48.2k
                                  &round);
2370
48.2k
    dst += stride;
2371
48.2k
  }
2372
6.03k
  left = cvtepu8_epi16(LoadLo8(left_column + 16));
2373
54.2k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2374
48.2k
    const __m128i y_select = _mm_set1_epi32(y_mask);
2375
48.2k
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2376
48.2k
    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2377
48.2k
                                  &round);
2378
48.2k
    dst += stride;
2379
48.2k
  }
2380
6.03k
  left = cvtepu8_epi16(LoadLo8(left_column + 24));
2381
54.2k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2382
48.2k
    const __m128i y_select = _mm_set1_epi32(y_mask);
2383
48.2k
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2384
48.2k
    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2385
48.2k
                                  &round);
2386
48.2k
    dst += stride;
2387
48.2k
  }
2388
6.03k
}
2389
2390
void aom_smooth_h_predictor_16x4_ssse3(
2391
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2392
    const uint8_t *LIBAOM_RESTRICT top_row,
2393
23.2k
    const uint8_t *LIBAOM_RESTRICT left_column) {
2394
23.2k
  const __m128i top_right = _mm_set1_epi16(top_row[15]);
2395
23.2k
  const __m128i left = cvtepu8_epi16(Load4(left_column));
2396
23.2k
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2397
23.2k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2398
23.2k
  const __m128i weights1 = cvtepu8_epi16(weights);
2399
23.2k
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2400
23.2k
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2401
23.2k
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2402
23.2k
  const __m128i scaled_top_right1 =
2403
23.2k
      _mm_mullo_epi16(inverted_weights1, top_right);
2404
23.2k
  const __m128i scaled_top_right2 =
2405
23.2k
      _mm_mullo_epi16(inverted_weights2, top_right);
2406
23.2k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2407
23.2k
  __m128i y_mask = _mm_set1_epi32(0x01000100);
2408
23.2k
  __m128i left_y = _mm_shuffle_epi8(left, y_mask);
2409
23.2k
  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2410
23.2k
                                 scaled_top_right1, scaled_top_right2, round);
2411
23.2k
  dst += stride;
2412
23.2k
  y_mask = _mm_set1_epi32(0x03020302);
2413
23.2k
  left_y = _mm_shuffle_epi8(left, y_mask);
2414
23.2k
  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2415
23.2k
                                 scaled_top_right1, scaled_top_right2, round);
2416
23.2k
  dst += stride;
2417
23.2k
  y_mask = _mm_set1_epi32(0x05040504);
2418
23.2k
  left_y = _mm_shuffle_epi8(left, y_mask);
2419
23.2k
  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2420
23.2k
                                 scaled_top_right1, scaled_top_right2, round);
2421
23.2k
  dst += stride;
2422
23.2k
  y_mask = _mm_set1_epi32(0x07060706);
2423
23.2k
  left_y = _mm_shuffle_epi8(left, y_mask);
2424
23.2k
  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2425
23.2k
                                 scaled_top_right1, scaled_top_right2, round);
2426
23.2k
}
2427
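For 16-wide rows the byte weights are widened in two halves (weights1 for columns 0..7, weights2 for columns 8..15) and (256 - w) * top_right is precomputed per half, so each row reduces to a multiply, two adds, a shift, and a pack inside write_smooth_directional_sum16. cvtepu8_epi16 here is the file's own helper; on SSSE3 it is presumably an unpack against zero (the _mm_cvtepu8_epi16 intrinsic is SSE4.1), as in this sketch:

#include <emmintrin.h>

// Presumed shape of the cvtepu8_epi16 helper (an assumption; the real
// definition lives earlier in the file or a shared header): zero-extend
// the low eight bytes of |x| to eight 16-bit lanes.
static __m128i cvtepu8_epi16_sketch(__m128i x) {
  return _mm_unpacklo_epi8(x, _mm_setzero_si128());
}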
2428
void aom_smooth_h_predictor_16x8_ssse3(
2429
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2430
    const uint8_t *LIBAOM_RESTRICT top_row,
2431
20.1k
    const uint8_t *LIBAOM_RESTRICT left_column) {
2432
20.1k
  const __m128i top_right = _mm_set1_epi16(top_row[15]);
2433
20.1k
  const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2434
20.1k
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2435
20.1k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2436
20.1k
  const __m128i weights1 = cvtepu8_epi16(weights);
2437
20.1k
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2438
20.1k
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2439
20.1k
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2440
20.1k
  const __m128i scaled_top_right1 =
2441
20.1k
      _mm_mullo_epi16(inverted_weights1, top_right);
2442
20.1k
  const __m128i scaled_top_right2 =
2443
20.1k
      _mm_mullo_epi16(inverted_weights2, top_right);
2444
20.1k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2445
181k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2446
160k
    const __m128i y_select = _mm_set1_epi32(y_mask);
2447
160k
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2448
160k
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2449
160k
                                   scaled_top_right1, scaled_top_right2, round);
2450
160k
    dst += stride;
2451
160k
  }
2452
20.1k
}
2453
2454
void aom_smooth_h_predictor_16x16_ssse3(
2455
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2456
    const uint8_t *LIBAOM_RESTRICT top_row,
2457
44.0k
    const uint8_t *LIBAOM_RESTRICT left_column) {
2458
44.0k
  const __m128i top_right = _mm_set1_epi16(top_row[15]);
2459
44.0k
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2460
44.0k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2461
44.0k
  const __m128i weights1 = cvtepu8_epi16(weights);
2462
44.0k
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2463
44.0k
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2464
44.0k
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2465
44.0k
  const __m128i scaled_top_right1 =
2466
44.0k
      _mm_mullo_epi16(inverted_weights1, top_right);
2467
44.0k
  const __m128i scaled_top_right2 =
2468
44.0k
      _mm_mullo_epi16(inverted_weights2, top_right);
2469
44.0k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2470
44.0k
  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2471
396k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2472
352k
    const __m128i y_select = _mm_set1_epi32(y_mask);
2473
352k
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2474
352k
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2475
352k
                                   scaled_top_right1, scaled_top_right2, round);
2476
352k
    dst += stride;
2477
352k
  }
2478
44.0k
  left = cvtepu8_epi16(LoadLo8(left_column + 8));
2479
396k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2480
352k
    const __m128i y_select = _mm_set1_epi32(y_mask);
2481
352k
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2482
352k
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2483
352k
                                   scaled_top_right1, scaled_top_right2, round);
2484
352k
    dst += stride;
2485
352k
  }
2486
44.0k
}
2487
2488
void aom_smooth_h_predictor_16x32_ssse3(
2489
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2490
    const uint8_t *LIBAOM_RESTRICT top_row,
2491
11.0k
    const uint8_t *LIBAOM_RESTRICT left_column) {
2492
11.0k
  const __m128i top_right = _mm_set1_epi16(top_row[15]);
2493
11.0k
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2494
11.0k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2495
11.0k
  const __m128i weights1 = cvtepu8_epi16(weights);
2496
11.0k
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2497
11.0k
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2498
11.0k
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2499
11.0k
  const __m128i scaled_top_right1 =
2500
11.0k
      _mm_mullo_epi16(inverted_weights1, top_right);
2501
11.0k
  const __m128i scaled_top_right2 =
2502
11.0k
      _mm_mullo_epi16(inverted_weights2, top_right);
2503
11.0k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2504
11.0k
  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2505
99.0k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2506
88.0k
    const __m128i y_select = _mm_set1_epi32(y_mask);
2507
88.0k
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2508
88.0k
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2509
88.0k
                                   scaled_top_right1, scaled_top_right2, round);
2510
88.0k
    dst += stride;
2511
88.0k
  }
2512
11.0k
  left = cvtepu8_epi16(LoadLo8(left_column + 8));
2513
99.0k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2514
88.0k
    const __m128i y_select = _mm_set1_epi32(y_mask);
2515
88.0k
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2516
88.0k
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2517
88.0k
                                   scaled_top_right1, scaled_top_right2, round);
2518
88.0k
    dst += stride;
2519
88.0k
  }
2520
11.0k
  left = cvtepu8_epi16(LoadLo8(left_column + 16));
2521
99.0k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2522
88.0k
    const __m128i y_select = _mm_set1_epi32(y_mask);
2523
88.0k
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2524
88.0k
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2525
88.0k
                                   scaled_top_right1, scaled_top_right2, round);
2526
88.0k
    dst += stride;
2527
88.0k
  }
2528
11.0k
  left = cvtepu8_epi16(LoadLo8(left_column + 24));
2529
99.0k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2530
88.0k
    const __m128i y_select = _mm_set1_epi32(y_mask);
2531
88.0k
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2532
88.0k
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2533
88.0k
                                   scaled_top_right1, scaled_top_right2, round);
2534
88.0k
    dst += stride;
2535
88.0k
  }
2536
11.0k
}
2537
2538
void aom_smooth_h_predictor_16x64_ssse3(
2539
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2540
    const uint8_t *LIBAOM_RESTRICT top_row,
2541
2.00k
    const uint8_t *LIBAOM_RESTRICT left_column) {
2542
2.00k
  const __m128i top_right = _mm_set1_epi16(top_row[15]);
2543
2.00k
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2544
2.00k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2545
2.00k
  const __m128i weights1 = cvtepu8_epi16(weights);
2546
2.00k
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2547
2.00k
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2548
2.00k
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2549
2.00k
  const __m128i scaled_top_right1 =
2550
2.00k
      _mm_mullo_epi16(inverted_weights1, top_right);
2551
2.00k
  const __m128i scaled_top_right2 =
2552
2.00k
      _mm_mullo_epi16(inverted_weights2, top_right);
2553
2.00k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2554
18.0k
  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
2555
16.0k
    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
2556
144k
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2557
128k
      const __m128i y_select = _mm_set1_epi32(y_mask);
2558
128k
      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2559
128k
      write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2560
128k
                                     scaled_top_right1, scaled_top_right2,
2561
128k
                                     round);
2562
128k
      dst += stride;
2563
128k
    }
2564
16.0k
  }
2565
2.00k
}
2566
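pshufb selects bytes only from within one 16-byte register, and the left values are widened to eight 16-bit lanes, so each load of the left column feeds exactly eight rows. The taller predictors therefore either unroll the reload (16x32 above) or fold it into a left_offset loop (16x64). The shared shape, as a sketch with write_row() standing in for the write_smooth_directional_sum* calls (write_row is hypothetical; the real row writers appear throughout this section):

// Hypothetical outline of the tall SMOOTH_H predictors: reload eight left
// values, emit eight rows, repeat until |height| rows are written.
static void smooth_h_tall_outline(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *left_column, int height) {
  for (int left_offset = 0; left_offset < height; left_offset += 8) {
    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i left_y = _mm_shuffle_epi8(left, _mm_set1_epi32(y_mask));
      write_row(dst, left_y);  // stand-in for write_smooth_directional_sum*
      dst += stride;
    }
  }
}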
2567
void aom_smooth_h_predictor_32x8_ssse3(
2568
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2569
    const uint8_t *LIBAOM_RESTRICT top_row,
2570
21.8k
    const uint8_t *LIBAOM_RESTRICT left_column) {
2571
21.8k
  const __m128i top_right = _mm_set1_epi16(top_row[31]);
2572
21.8k
  const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2573
21.8k
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
2574
21.8k
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
2575
21.8k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2576
21.8k
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
2577
21.8k
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
2578
21.8k
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
2579
21.8k
  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
2580
21.8k
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2581
21.8k
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2582
21.8k
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2583
21.8k
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2584
21.8k
  const __m128i scaled_top_right1 =
2585
21.8k
      _mm_mullo_epi16(inverted_weights1, top_right);
2586
21.8k
  const __m128i scaled_top_right2 =
2587
21.8k
      _mm_mullo_epi16(inverted_weights2, top_right);
2588
21.8k
  const __m128i scaled_top_right3 =
2589
21.8k
      _mm_mullo_epi16(inverted_weights3, top_right);
2590
21.8k
  const __m128i scaled_top_right4 =
2591
21.8k
      _mm_mullo_epi16(inverted_weights4, top_right);
2592
21.8k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2593
197k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2594
175k
    __m128i y_select = _mm_set1_epi32(y_mask);
2595
175k
    __m128i left_y = _mm_shuffle_epi8(left, y_select);
2596
175k
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2597
175k
                                   scaled_top_right1, scaled_top_right2, round);
2598
175k
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2599
175k
                                   scaled_top_right3, scaled_top_right4, round);
2600
175k
    dst += stride;
2601
175k
  }
2602
21.8k
}
2603
2604
void aom_smooth_h_predictor_32x16_ssse3(
2605
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2606
    const uint8_t *LIBAOM_RESTRICT top_row,
2607
9.11k
    const uint8_t *LIBAOM_RESTRICT left_column) {
2608
9.11k
  const __m128i top_right = _mm_set1_epi16(top_row[31]);
2609
9.11k
  const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
2610
9.11k
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
2611
9.11k
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
2612
9.11k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2613
9.11k
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
2614
9.11k
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
2615
9.11k
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
2616
9.11k
  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
2617
9.11k
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2618
9.11k
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2619
9.11k
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2620
9.11k
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2621
9.11k
  const __m128i scaled_top_right1 =
2622
9.11k
      _mm_mullo_epi16(inverted_weights1, top_right);
2623
9.11k
  const __m128i scaled_top_right2 =
2624
9.11k
      _mm_mullo_epi16(inverted_weights2, top_right);
2625
9.11k
  const __m128i scaled_top_right3 =
2626
9.11k
      _mm_mullo_epi16(inverted_weights3, top_right);
2627
9.11k
  const __m128i scaled_top_right4 =
2628
9.11k
      _mm_mullo_epi16(inverted_weights4, top_right);
2629
9.11k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2630
82.0k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2631
72.9k
    __m128i y_select = _mm_set1_epi32(y_mask);
2632
72.9k
    __m128i left_y = _mm_shuffle_epi8(left1, y_select);
2633
72.9k
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2634
72.9k
                                   scaled_top_right1, scaled_top_right2, round);
2635
72.9k
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2636
72.9k
                                   scaled_top_right3, scaled_top_right4, round);
2637
72.9k
    dst += stride;
2638
72.9k
  }
2639
9.11k
  const __m128i left2 =
2640
9.11k
      cvtepu8_epi16(LoadLo8(left_column + 8));
2641
82.0k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2642
72.9k
    __m128i y_select = _mm_set1_epi32(y_mask);
2643
72.9k
    __m128i left_y = _mm_shuffle_epi8(left2, y_select);
2644
72.9k
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2645
72.9k
                                   scaled_top_right1, scaled_top_right2, round);
2646
72.9k
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2647
72.9k
                                   scaled_top_right3, scaled_top_right4, round);
2648
72.9k
    dst += stride;
2649
72.9k
  }
2650
9.11k
}
2651
2652
void aom_smooth_h_predictor_32x32_ssse3(
2653
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2654
    const uint8_t *LIBAOM_RESTRICT top_row,
2655
36.1k
    const uint8_t *LIBAOM_RESTRICT left_column) {
2656
36.1k
  const __m128i top_right = _mm_set1_epi16(top_row[31]);
2657
36.1k
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
2658
36.1k
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
2659
36.1k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2660
36.1k
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
2661
36.1k
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
2662
36.1k
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
2663
36.1k
  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
2664
36.1k
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2665
36.1k
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2666
36.1k
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2667
36.1k
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2668
36.1k
  const __m128i scaled_top_right1 =
2669
36.1k
      _mm_mullo_epi16(inverted_weights1, top_right);
2670
36.1k
  const __m128i scaled_top_right2 =
2671
36.1k
      _mm_mullo_epi16(inverted_weights2, top_right);
2672
36.1k
  const __m128i scaled_top_right3 =
2673
36.1k
      _mm_mullo_epi16(inverted_weights3, top_right);
2674
36.1k
  const __m128i scaled_top_right4 =
2675
36.1k
      _mm_mullo_epi16(inverted_weights4, top_right);
2676
36.1k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2677
36.1k
  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2678
325k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2679
289k
    __m128i y_select = _mm_set1_epi32(y_mask);
2680
289k
    __m128i left_y = _mm_shuffle_epi8(left, y_select);
2681
289k
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2682
289k
                                   scaled_top_right1, scaled_top_right2, round);
2683
289k
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2684
289k
                                   scaled_top_right3, scaled_top_right4, round);
2685
289k
    dst += stride;
2686
289k
  }
2687
36.1k
  left = cvtepu8_epi16(LoadLo8(left_column + 8));
2688
325k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2689
289k
    __m128i y_select = _mm_set1_epi32(y_mask);
2690
289k
    __m128i left_y = _mm_shuffle_epi8(left, y_select);
2691
289k
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2692
289k
                                   scaled_top_right1, scaled_top_right2, round);
2693
289k
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2694
289k
                                   scaled_top_right3, scaled_top_right4, round);
2695
289k
    dst += stride;
2696
289k
  }
2697
36.1k
  left = cvtepu8_epi16(LoadLo8(left_column + 16));
2698
325k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2699
289k
    __m128i y_select = _mm_set1_epi32(y_mask);
2700
289k
    __m128i left_y = _mm_shuffle_epi8(left, y_select);
2701
289k
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2702
289k
                                   scaled_top_right1, scaled_top_right2, round);
2703
289k
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2704
289k
                                   scaled_top_right3, scaled_top_right4, round);
2705
289k
    dst += stride;
2706
289k
  }
2707
36.1k
  left = cvtepu8_epi16(LoadLo8(left_column + 24));
2708
325k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2709
289k
    __m128i y_select = _mm_set1_epi32(y_mask);
2710
289k
    __m128i left_y = _mm_shuffle_epi8(left, y_select);
2711
289k
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2712
289k
                                   scaled_top_right1, scaled_top_right2, round);
2713
289k
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2714
289k
                                   scaled_top_right3, scaled_top_right4, round);
2715
289k
    dst += stride;
2716
289k
  }
2717
36.1k
}
2718
2719
void aom_smooth_h_predictor_32x64_ssse3(
2720
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2721
    const uint8_t *LIBAOM_RESTRICT top_row,
2722
749
    const uint8_t *LIBAOM_RESTRICT left_column) {
2723
749
  const __m128i top_right = _mm_set1_epi16(top_row[31]);
2724
749
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
2725
749
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
2726
749
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2727
749
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
2728
749
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
2729
749
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
2730
749
  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
2731
749
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2732
749
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2733
749
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2734
749
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2735
749
  const __m128i scaled_top_right1 =
2736
749
      _mm_mullo_epi16(inverted_weights1, top_right);
2737
749
  const __m128i scaled_top_right2 =
2738
749
      _mm_mullo_epi16(inverted_weights2, top_right);
2739
749
  const __m128i scaled_top_right3 =
2740
749
      _mm_mullo_epi16(inverted_weights3, top_right);
2741
749
  const __m128i scaled_top_right4 =
2742
749
      _mm_mullo_epi16(inverted_weights4, top_right);
2743
749
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2744
6.74k
  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
2745
5.99k
    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
2746
53.9k
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2747
47.9k
      const __m128i y_select = _mm_set1_epi32(y_mask);
2748
47.9k
      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2749
47.9k
      write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2750
47.9k
                                     scaled_top_right1, scaled_top_right2,
2751
47.9k
                                     round);
2752
47.9k
      write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
2753
47.9k
                                     weights4, scaled_top_right3,
2754
47.9k
                                     scaled_top_right4, round);
2755
47.9k
      dst += stride;
2756
47.9k
    }
2757
5.99k
  }
2758
749
}
2759
2760
void aom_smooth_h_predictor_64x16_ssse3(
2761
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2762
    const uint8_t *LIBAOM_RESTRICT top_row,
2763
7.95k
    const uint8_t *LIBAOM_RESTRICT left_column) {
2764
7.95k
  const __m128i top_right = _mm_set1_epi16(top_row[63]);
2765
7.95k
  const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
2766
7.95k
  const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
2767
7.95k
  const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
2768
7.95k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2769
7.95k
  const __m128i weights1 = cvtepu8_epi16(weights_lolo);
2770
7.95k
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
2771
7.95k
  const __m128i weights3 = cvtepu8_epi16(weights_lohi);
2772
7.95k
  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
2773
7.95k
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2774
7.95k
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2775
7.95k
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2776
7.95k
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2777
7.95k
  const __m128i scaled_top_right1 =
2778
7.95k
      _mm_mullo_epi16(inverted_weights1, top_right);
2779
7.95k
  const __m128i scaled_top_right2 =
2780
7.95k
      _mm_mullo_epi16(inverted_weights2, top_right);
2781
7.95k
  const __m128i scaled_top_right3 =
2782
7.95k
      _mm_mullo_epi16(inverted_weights3, top_right);
2783
7.95k
  const __m128i scaled_top_right4 =
2784
7.95k
      _mm_mullo_epi16(inverted_weights4, top_right);
2785
7.95k
  const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
2786
7.95k
  const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
2787
7.95k
  const __m128i weights5 = cvtepu8_epi16(weights_hilo);
2788
7.95k
  const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
2789
7.95k
  const __m128i weights7 = cvtepu8_epi16(weights_hihi);
2790
7.95k
  const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
2791
7.95k
  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
2792
7.95k
  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
2793
7.95k
  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
2794
7.95k
  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
2795
7.95k
  const __m128i scaled_top_right5 =
2796
7.95k
      _mm_mullo_epi16(inverted_weights5, top_right);
2797
7.95k
  const __m128i scaled_top_right6 =
2798
7.95k
      _mm_mullo_epi16(inverted_weights6, top_right);
2799
7.95k
  const __m128i scaled_top_right7 =
2800
7.95k
      _mm_mullo_epi16(inverted_weights7, top_right);
2801
7.95k
  const __m128i scaled_top_right8 =
2802
7.95k
      _mm_mullo_epi16(inverted_weights8, top_right);
2803
7.95k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2804
71.5k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2805
63.6k
    __m128i y_select = _mm_set1_epi32(y_mask);
2806
63.6k
    __m128i left_y = _mm_shuffle_epi8(left1, y_select);
2807
63.6k
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2808
63.6k
                                   scaled_top_right1, scaled_top_right2, round);
2809
63.6k
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2810
63.6k
                                   scaled_top_right3, scaled_top_right4, round);
2811
63.6k
    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2812
63.6k
                                   scaled_top_right5, scaled_top_right6, round);
2813
63.6k
    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2814
63.6k
                                   scaled_top_right7, scaled_top_right8, round);
2815
63.6k
    dst += stride;
2816
63.6k
  }
2817
7.95k
  const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
2818
71.5k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2819
63.6k
    __m128i y_select = _mm_set1_epi32(y_mask);
2820
63.6k
    __m128i left_y = _mm_shuffle_epi8(left2, y_select);
2821
63.6k
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2822
63.6k
                                   scaled_top_right1, scaled_top_right2, round);
2823
63.6k
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2824
63.6k
                                   scaled_top_right3, scaled_top_right4, round);
2825
63.6k
    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2826
63.6k
                                   scaled_top_right5, scaled_top_right6, round);
2827
63.6k
    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2828
63.6k
                                   scaled_top_right7, scaled_top_right8, round);
2829
63.6k
    dst += stride;
2830
63.6k
  }
2831
7.95k
}
2832
2833
void aom_smooth_h_predictor_64x32_ssse3(
2834
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2835
    const uint8_t *LIBAOM_RESTRICT top_row,
2836
1.40k
    const uint8_t *LIBAOM_RESTRICT left_column) {
2837
1.40k
  const __m128i top_right = _mm_set1_epi16(top_row[63]);
2838
1.40k
  const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
2839
1.40k
  const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
2840
1.40k
  const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
2841
1.40k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2842
1.40k
  const __m128i weights1 = cvtepu8_epi16(weights_lolo);
2843
1.40k
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
2844
1.40k
  const __m128i weights3 = cvtepu8_epi16(weights_lohi);
2845
1.40k
  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
2846
1.40k
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2847
1.40k
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2848
1.40k
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2849
1.40k
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2850
1.40k
  const __m128i scaled_top_right1 =
2851
1.40k
      _mm_mullo_epi16(inverted_weights1, top_right);
2852
1.40k
  const __m128i scaled_top_right2 =
2853
1.40k
      _mm_mullo_epi16(inverted_weights2, top_right);
2854
1.40k
  const __m128i scaled_top_right3 =
2855
1.40k
      _mm_mullo_epi16(inverted_weights3, top_right);
2856
1.40k
  const __m128i scaled_top_right4 =
2857
1.40k
      _mm_mullo_epi16(inverted_weights4, top_right);
2858
1.40k
  const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
2859
1.40k
  const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
2860
1.40k
  const __m128i weights5 = cvtepu8_epi16(weights_hilo);
2861
1.40k
  const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
2862
1.40k
  const __m128i weights7 = cvtepu8_epi16(weights_hihi);
2863
1.40k
  const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
2864
1.40k
  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
2865
1.40k
  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
2866
1.40k
  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
2867
1.40k
  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
2868
1.40k
  const __m128i scaled_top_right5 =
2869
1.40k
      _mm_mullo_epi16(inverted_weights5, top_right);
2870
1.40k
  const __m128i scaled_top_right6 =
2871
1.40k
      _mm_mullo_epi16(inverted_weights6, top_right);
2872
1.40k
  const __m128i scaled_top_right7 =
2873
1.40k
      _mm_mullo_epi16(inverted_weights7, top_right);
2874
1.40k
  const __m128i scaled_top_right8 =
2875
1.40k
      _mm_mullo_epi16(inverted_weights8, top_right);
2876
1.40k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2877
12.6k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2878
11.2k
    const __m128i y_select = _mm_set1_epi32(y_mask);
2879
11.2k
    const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
2880
11.2k
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2881
11.2k
                                   scaled_top_right1, scaled_top_right2, round);
2882
11.2k
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2883
11.2k
                                   scaled_top_right3, scaled_top_right4, round);
2884
11.2k
    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2885
11.2k
                                   scaled_top_right5, scaled_top_right6, round);
2886
11.2k
    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2887
11.2k
                                   scaled_top_right7, scaled_top_right8, round);
2888
11.2k
    dst += stride;
2889
11.2k
  }
2890
1.40k
  const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
2891
12.6k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2892
11.2k
    const __m128i y_select = _mm_set1_epi32(y_mask);
2893
11.2k
    const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
2894
11.2k
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2895
11.2k
                                   scaled_top_right1, scaled_top_right2, round);
2896
11.2k
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2897
11.2k
                                   scaled_top_right3, scaled_top_right4, round);
2898
11.2k
    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2899
11.2k
                                   scaled_top_right5, scaled_top_right6, round);
2900
11.2k
    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2901
11.2k
                                   scaled_top_right7, scaled_top_right8, round);
2902
11.2k
    dst += stride;
2903
11.2k
  }
2904
1.40k
  const __m128i left3 = cvtepu8_epi16(LoadLo8(left_column + 16));
2905
12.6k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2906
11.2k
    const __m128i y_select = _mm_set1_epi32(y_mask);
2907
11.2k
    const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
2908
11.2k
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2909
11.2k
                                   scaled_top_right1, scaled_top_right2, round);
2910
11.2k
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2911
11.2k
                                   scaled_top_right3, scaled_top_right4, round);
2912
11.2k
    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2913
11.2k
                                   scaled_top_right5, scaled_top_right6, round);
2914
11.2k
    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2915
11.2k
                                   scaled_top_right7, scaled_top_right8, round);
2916
11.2k
    dst += stride;
2917
11.2k
  }
2918
1.40k
  const __m128i left4 = cvtepu8_epi16(LoadLo8(left_column + 24));
2919
12.6k
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2920
11.2k
    const __m128i y_select = _mm_set1_epi32(y_mask);
2921
11.2k
    const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
2922
11.2k
    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2923
11.2k
                                   scaled_top_right1, scaled_top_right2, round);
2924
11.2k
    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2925
11.2k
                                   scaled_top_right3, scaled_top_right4, round);
2926
11.2k
    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2927
11.2k
                                   scaled_top_right5, scaled_top_right6, round);
2928
11.2k
    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2929
11.2k
                                   scaled_top_right7, scaled_top_right8, round);
2930
11.2k
    dst += stride;
2931
11.2k
  }
2932
1.40k
}
2933
2934
void aom_smooth_h_predictor_64x64_ssse3(
2935
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2936
    const uint8_t *LIBAOM_RESTRICT top_row,
2937
6.61k
    const uint8_t *LIBAOM_RESTRICT left_column) {
2938
6.61k
  const __m128i top_right = _mm_set1_epi16(top_row[63]);
2939
6.61k
  const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
2940
6.61k
  const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
2941
6.61k
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2942
6.61k
  const __m128i weights1 = cvtepu8_epi16(weights_lolo);
2943
6.61k
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
2944
6.61k
  const __m128i weights3 = cvtepu8_epi16(weights_lohi);
2945
6.61k
  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
2946
6.61k
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2947
6.61k
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2948
6.61k
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2949
6.61k
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2950
6.61k
  const __m128i scaled_top_right1 =
2951
6.61k
      _mm_mullo_epi16(inverted_weights1, top_right);
2952
6.61k
  const __m128i scaled_top_right2 =
2953
6.61k
      _mm_mullo_epi16(inverted_weights2, top_right);
2954
6.61k
  const __m128i scaled_top_right3 =
2955
6.61k
      _mm_mullo_epi16(inverted_weights3, top_right);
2956
6.61k
  const __m128i scaled_top_right4 =
2957
6.61k
      _mm_mullo_epi16(inverted_weights4, top_right);
2958
6.61k
  const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
2959
6.61k
  const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
2960
6.61k
  const __m128i weights5 = cvtepu8_epi16(weights_hilo);
2961
6.61k
  const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
2962
6.61k
  const __m128i weights7 = cvtepu8_epi16(weights_hihi);
2963
6.61k
  const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
2964
6.61k
  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
2965
6.61k
  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
2966
6.61k
  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
2967
6.61k
  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
2968
6.61k
  const __m128i scaled_top_right5 =
2969
6.61k
      _mm_mullo_epi16(inverted_weights5, top_right);
2970
6.61k
  const __m128i scaled_top_right6 =
2971
6.61k
      _mm_mullo_epi16(inverted_weights6, top_right);
2972
6.61k
  const __m128i scaled_top_right7 =
2973
6.61k
      _mm_mullo_epi16(inverted_weights7, top_right);
2974
6.61k
  const __m128i scaled_top_right8 =
2975
6.61k
      _mm_mullo_epi16(inverted_weights8, top_right);
2976
6.61k
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2977
59.5k
  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
2978
52.9k
    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
2979
476k
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2980
423k
      const __m128i y_select = _mm_set1_epi32(y_mask);
2981
423k
      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2982
423k
      write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2983
423k
                                     scaled_top_right1, scaled_top_right2,
2984
423k
                                     round);
2985
423k
      write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
2986
423k
                                     weights4, scaled_top_right3,
2987
423k
                                     scaled_top_right4, round);
2988
423k
      write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5,
2989
423k
                                     weights6, scaled_top_right5,
2990
423k
                                     scaled_top_right6, round);
2991
423k
      write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7,
2992
423k
                                     weights8, scaled_top_right7,
2993
423k
                                     scaled_top_right8, round);
2994
423k
      dst += stride;
2995
423k
    }
2996
52.9k
  }
2997
6.61k
}
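
As a cross-check for any of the predictors above, a plain-C reference (smooth_h_predictor_ref is a hypothetical test helper, not a libaom API). It relies on two facts visible in this section: the weights for width W start at smooth_weights + W - 4 (offsets 0, 4, 12, 28, 60 for W = 4, 8, 16, 32, 64), and every pixel is (w[x] * left[y] + (256 - w[x]) * top_right + 128) >> 8, assuming SMOOTH_WEIGHT_LOG2_SCALE == 8:

#include <stddef.h>
#include <stdint.h>

// Hypothetical scalar reference for the SMOOTH_H family. |weights| should
// point at smooth_weights + width - 4, matching the loads in the SIMD code.
static void smooth_h_predictor_ref(uint8_t *dst, ptrdiff_t stride, int width,
                                   int height, const uint8_t *weights,
                                   const uint8_t *top_row,
                                   const uint8_t *left_column) {
  const uint32_t top_right = top_row[width - 1];
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      const uint32_t sum = (uint32_t)weights[x] * left_column[y] +
                           (256 - (uint32_t)weights[x]) * top_right;
      dst[x] = (uint8_t)((sum + 128) >> 8);
    }
    dst += stride;
  }
}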