Coverage Report

Created: 2023-06-07 06:31

/src/aom/aom_dsp/x86/intrapred_avx2.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <immintrin.h>
13
14
#include "config/aom_dsp_rtcd.h"
15
#include "aom_dsp/x86/intrapred_x86.h"
16
#include "aom_dsp/x86/intrapred_utils.h"
17
#include "aom_dsp/x86/lpf_common_sse2.h"
18
19
353k
static INLINE __m256i dc_sum_64(const uint8_t *ref) {
20
353k
  const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
21
353k
  const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
22
353k
  const __m256i zero = _mm256_setzero_si256();
23
353k
  __m256i y0 = _mm256_sad_epu8(x0, zero);
24
353k
  __m256i y1 = _mm256_sad_epu8(x1, zero);
25
353k
  y0 = _mm256_add_epi64(y0, y1);
26
353k
  __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
27
353k
  y0 = _mm256_add_epi64(u0, y0);
28
353k
  u0 = _mm256_unpackhi_epi64(y0, y0);
29
353k
  return _mm256_add_epi16(y0, u0);
30
353k
}
31
32
2.06M
static INLINE __m256i dc_sum_32(const uint8_t *ref) {
33
2.06M
  const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
34
2.06M
  const __m256i zero = _mm256_setzero_si256();
35
2.06M
  __m256i y = _mm256_sad_epu8(x, zero);
36
2.06M
  __m256i u = _mm256_permute2x128_si256(y, y, 1);
37
2.06M
  y = _mm256_add_epi64(u, y);
38
2.06M
  u = _mm256_unpackhi_epi64(y, y);
39
2.06M
  return _mm256_add_epi16(y, u);
40
2.06M
}
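Both dc_sum helpers rely on _mm256_sad_epu8 against zero, which sums each group of eight bytes into a 64-bit lane; the remaining permute/unpack/add steps fold those lanes into a single total in the low lane of the returned vector. A scalar sketch of the value dc_sum_32 produces (illustrative only; dc_sum_32_scalar is a hypothetical name, not part of this file):

static INLINE uint16_t dc_sum_32_scalar(const uint8_t *ref) {
  uint32_t sum = 0;
  for (int i = 0; i < 32; ++i) sum += ref[i];  // at most 32 * 255 = 8160
  return (uint16_t)sum;  // the AVX2 version leaves this total in its low lane
}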
41
42
static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
43
1.32M
                                  ptrdiff_t stride) {
44
41.5M
  for (int i = 0; i < height; ++i) {
45
40.2M
    _mm256_storeu_si256((__m256i *)dst, *r);
46
40.2M
    dst += stride;
47
40.2M
  }
48
1.32M
}
49
50
static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
51
                                    int height, uint8_t *dst,
52
3.69k
                                    ptrdiff_t stride) {
53
187k
  for (int i = 0; i < height; ++i) {
54
183k
    _mm256_storeu_si256((__m256i *)dst, *r0);
55
183k
    _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
56
183k
    dst += stride;
57
183k
  }
58
3.69k
}
59
60
static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
61
251k
                                  ptrdiff_t stride) {
62
11.7M
  for (int i = 0; i < height; ++i) {
63
11.4M
    _mm256_storeu_si256((__m256i *)dst, *r);
64
11.4M
    _mm256_storeu_si256((__m256i *)(dst + 32), *r);
65
11.4M
    dst += stride;
66
11.4M
  }
67
251k
}
68
69
static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = {
70
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
71
  { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
72
  { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
73
  { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
74
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 },
75
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 },
76
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 },
77
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
78
};
79
80
static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx4[4][16]) = {
81
  { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 },
82
  { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 },
83
  { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 },
84
  { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 }
85
};
86
87
static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx[8][32]) = {
88
  { 0, 1, 4, 5, 8,  9,  12, 13, 16, 17, 20, 21, 24, 25, 28, 29,
89
    2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
90
  { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27,
91
    0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
92
  { 0, 1, 0, 1, 4, 5, 8,  9,  12, 13, 16, 17, 20, 21, 24, 25,
93
    0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27 },
94
  { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23,
95
    0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25 },
96
  { 0, 1, 0, 1, 0, 1, 0, 1, 8,  9,  12, 13, 16, 17, 20, 21,
97
    0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 22, 23 },
98
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19,
99
    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 20, 21 },
100
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17,
101
    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 18, 19 },
102
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15,
103
    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 17 }
104
};
105
106
static DECLARE_ALIGNED(32, uint16_t, HighbdBaseMask[17][16]) = {
107
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
108
  { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
109
  { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
110
  { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
111
  { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
112
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
113
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
114
    0 },
115
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0,
116
    0, 0 },
117
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0,
118
    0, 0, 0, 0 },
119
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0,
120
    0, 0, 0, 0, 0, 0 },
121
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
122
    0xffff, 0, 0, 0, 0, 0, 0 },
123
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
124
    0xffff, 0xffff, 0, 0, 0, 0, 0 },
125
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
126
    0xffff, 0xffff, 0xffff, 0, 0, 0, 0 },
127
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
128
    0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 },
129
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
130
    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 },
131
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
132
    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 },
133
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
134
    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff }
135
};
136
137
53.2k
static INLINE void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) {
138
53.2k
  __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
139
140
53.2k
  r0 = _mm_unpacklo_epi16(x[0], x[1]);
141
53.2k
  r1 = _mm_unpacklo_epi16(x[2], x[3]);
142
53.2k
  r2 = _mm_unpacklo_epi16(x[4], x[5]);
143
53.2k
  r3 = _mm_unpacklo_epi16(x[6], x[7]);
144
145
53.2k
  r4 = _mm_unpacklo_epi16(x[8], x[9]);
146
53.2k
  r5 = _mm_unpacklo_epi16(x[10], x[11]);
147
53.2k
  r6 = _mm_unpacklo_epi16(x[12], x[13]);
148
53.2k
  r7 = _mm_unpacklo_epi16(x[14], x[15]);
149
150
53.2k
  r8 = _mm_unpacklo_epi32(r0, r1);
151
53.2k
  r9 = _mm_unpackhi_epi32(r0, r1);
152
53.2k
  r10 = _mm_unpacklo_epi32(r2, r3);
153
53.2k
  r11 = _mm_unpackhi_epi32(r2, r3);
154
155
53.2k
  r12 = _mm_unpacklo_epi32(r4, r5);
156
53.2k
  r13 = _mm_unpackhi_epi32(r4, r5);
157
53.2k
  r14 = _mm_unpacklo_epi32(r6, r7);
158
53.2k
  r15 = _mm_unpackhi_epi32(r6, r7);
159
160
53.2k
  r0 = _mm_unpacklo_epi64(r8, r9);
161
53.2k
  r1 = _mm_unpackhi_epi64(r8, r9);
162
53.2k
  r2 = _mm_unpacklo_epi64(r10, r11);
163
53.2k
  r3 = _mm_unpackhi_epi64(r10, r11);
164
165
53.2k
  r4 = _mm_unpacklo_epi64(r12, r13);
166
53.2k
  r5 = _mm_unpackhi_epi64(r12, r13);
167
53.2k
  r6 = _mm_unpacklo_epi64(r14, r15);
168
53.2k
  r7 = _mm_unpackhi_epi64(r14, r15);
169
170
53.2k
  d[0] = _mm_unpacklo_epi64(r0, r2);
171
53.2k
  d[1] = _mm_unpacklo_epi64(r4, r6);
172
53.2k
  d[2] = _mm_unpacklo_epi64(r1, r3);
173
53.2k
  d[3] = _mm_unpacklo_epi64(r5, r7);
174
175
53.2k
  d[4] = _mm_unpackhi_epi64(r0, r2);
176
53.2k
  d[5] = _mm_unpackhi_epi64(r4, r6);
177
53.2k
  d[6] = _mm_unpackhi_epi64(r1, r3);
178
53.2k
  d[7] = _mm_unpackhi_epi64(r5, r7);
179
53.2k
}
180
181
17.8k
static INLINE void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) {
182
17.8k
  __m256i w0, w1, w2, w3, ww0, ww1;
183
184
17.8k
  w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13
185
17.8k
  w1 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
186
17.8k
  w2 = _mm256_unpackhi_epi16(x[0], x[1]);  // 40 50 41 51 42 52 43 53
187
17.8k
  w3 = _mm256_unpackhi_epi16(x[2], x[3]);  // 60 70 61 71 62 72 63 73
188
189
17.8k
  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
190
17.8k
  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
191
192
17.8k
  d[0] = _mm256_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
193
17.8k
  d[1] = _mm256_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
194
195
17.8k
  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
196
17.8k
  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
197
198
17.8k
  d[2] = _mm256_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
199
17.8k
  d[3] = _mm256_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
200
17.8k
}
201
202
140k
static INLINE void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) {
203
140k
  __m256i w0, w1, w2, w3, ww0, ww1;
204
205
140k
  w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13
206
140k
  w1 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
207
140k
  w2 = _mm256_unpacklo_epi16(x[4], x[5]);  // 40 50 41 51 42 52 43 53
208
140k
  w3 = _mm256_unpacklo_epi16(x[6], x[7]);  // 60 70 61 71 62 72 63 73
209
210
140k
  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
211
140k
  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
212
213
140k
  d[0] = _mm256_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
214
140k
  d[1] = _mm256_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
215
216
140k
  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
217
140k
  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
218
219
140k
  d[2] = _mm256_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
220
140k
  d[3] = _mm256_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
221
222
140k
  w0 = _mm256_unpackhi_epi16(x[0], x[1]);  // 04 14 05 15 06 16 07 17
223
140k
  w1 = _mm256_unpackhi_epi16(x[2], x[3]);  // 24 34 25 35 26 36 27 37
224
140k
  w2 = _mm256_unpackhi_epi16(x[4], x[5]);  // 44 54 45 55 46 56 47 57
225
140k
  w3 = _mm256_unpackhi_epi16(x[6], x[7]);  // 64 74 65 75 66 76 67 77
226
227
140k
  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
228
140k
  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75
229
230
140k
  d[4] = _mm256_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
231
140k
  d[5] = _mm256_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75
232
233
140k
  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
234
140k
  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77
235
236
140k
  d[6] = _mm256_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
237
140k
  d[7] = _mm256_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
238
140k
}
239
240
813k
static INLINE void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) {
241
813k
  __m256i w0, w1, w2, w3, ww0, ww1;
242
813k
  __m256i dd[16];
243
813k
  w0 = _mm256_unpacklo_epi16(x[0], x[1]);
244
813k
  w1 = _mm256_unpacklo_epi16(x[2], x[3]);
245
813k
  w2 = _mm256_unpacklo_epi16(x[4], x[5]);
246
813k
  w3 = _mm256_unpacklo_epi16(x[6], x[7]);
247
248
813k
  ww0 = _mm256_unpacklo_epi32(w0, w1);  //
249
813k
  ww1 = _mm256_unpacklo_epi32(w2, w3);  //
250
251
813k
  dd[0] = _mm256_unpacklo_epi64(ww0, ww1);
252
813k
  dd[1] = _mm256_unpackhi_epi64(ww0, ww1);
253
254
813k
  ww0 = _mm256_unpackhi_epi32(w0, w1);  //
255
813k
  ww1 = _mm256_unpackhi_epi32(w2, w3);  //
256
257
813k
  dd[2] = _mm256_unpacklo_epi64(ww0, ww1);
258
813k
  dd[3] = _mm256_unpackhi_epi64(ww0, ww1);
259
260
813k
  w0 = _mm256_unpackhi_epi16(x[0], x[1]);
261
813k
  w1 = _mm256_unpackhi_epi16(x[2], x[3]);
262
813k
  w2 = _mm256_unpackhi_epi16(x[4], x[5]);
263
813k
  w3 = _mm256_unpackhi_epi16(x[6], x[7]);
264
265
813k
  ww0 = _mm256_unpacklo_epi32(w0, w1);  //
266
813k
  ww1 = _mm256_unpacklo_epi32(w2, w3);  //
267
268
813k
  dd[4] = _mm256_unpacklo_epi64(ww0, ww1);
269
813k
  dd[5] = _mm256_unpackhi_epi64(ww0, ww1);
270
271
813k
  ww0 = _mm256_unpackhi_epi32(w0, w1);  //
272
813k
  ww1 = _mm256_unpackhi_epi32(w2, w3);  //
273
274
813k
  dd[6] = _mm256_unpacklo_epi64(ww0, ww1);
275
813k
  dd[7] = _mm256_unpackhi_epi64(ww0, ww1);
276
277
813k
  w0 = _mm256_unpacklo_epi16(x[8], x[9]);
278
813k
  w1 = _mm256_unpacklo_epi16(x[10], x[11]);
279
813k
  w2 = _mm256_unpacklo_epi16(x[12], x[13]);
280
813k
  w3 = _mm256_unpacklo_epi16(x[14], x[15]);
281
282
813k
  ww0 = _mm256_unpacklo_epi32(w0, w1);
283
813k
  ww1 = _mm256_unpacklo_epi32(w2, w3);
284
285
813k
  dd[8] = _mm256_unpacklo_epi64(ww0, ww1);
286
813k
  dd[9] = _mm256_unpackhi_epi64(ww0, ww1);
287
288
813k
  ww0 = _mm256_unpackhi_epi32(w0, w1);
289
813k
  ww1 = _mm256_unpackhi_epi32(w2, w3);
290
291
813k
  dd[10] = _mm256_unpacklo_epi64(ww0, ww1);
292
813k
  dd[11] = _mm256_unpackhi_epi64(ww0, ww1);
293
294
813k
  w0 = _mm256_unpackhi_epi16(x[8], x[9]);
295
813k
  w1 = _mm256_unpackhi_epi16(x[10], x[11]);
296
813k
  w2 = _mm256_unpackhi_epi16(x[12], x[13]);
297
813k
  w3 = _mm256_unpackhi_epi16(x[14], x[15]);
298
299
813k
  ww0 = _mm256_unpacklo_epi32(w0, w1);
300
813k
  ww1 = _mm256_unpacklo_epi32(w2, w3);
301
302
813k
  dd[12] = _mm256_unpacklo_epi64(ww0, ww1);
303
813k
  dd[13] = _mm256_unpackhi_epi64(ww0, ww1);
304
305
813k
  ww0 = _mm256_unpackhi_epi32(w0, w1);
306
813k
  ww1 = _mm256_unpackhi_epi32(w2, w3);
307
308
813k
  dd[14] = _mm256_unpacklo_epi64(ww0, ww1);
309
813k
  dd[15] = _mm256_unpackhi_epi64(ww0, ww1);
310
311
7.32M
  for (int i = 0; i < 8; i++) {
312
6.51M
    d[i] = _mm256_insertf128_si256(dd[i], _mm256_castsi256_si128(dd[i + 8]), 1);
313
6.51M
    d[i + 8] = _mm256_insertf128_si256(dd[i + 8],
314
6.51M
                                       _mm256_extracti128_si256(dd[i], 1), 0);
315
6.51M
  }
316
813k
}
317
318
void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
319
923k
                                 const uint8_t *above, const uint8_t *left) {
320
923k
  const __m256i sum_above = dc_sum_32(above);
321
923k
  __m256i sum_left = dc_sum_32(left);
322
923k
  sum_left = _mm256_add_epi16(sum_left, sum_above);
323
923k
  const __m256i thirtytwo = _mm256_set1_epi16(32);
324
923k
  sum_left = _mm256_add_epi16(sum_left, thirtytwo);
325
923k
  sum_left = _mm256_srai_epi16(sum_left, 6);
326
923k
  const __m256i zero = _mm256_setzero_si256();
327
923k
  __m256i row = _mm256_shuffle_epi8(sum_left, zero);
328
923k
  row_store_32xh(&row, 32, dst, stride);
329
923k
}
330
331
void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
332
                                     const uint8_t *above,
333
67.9k
                                     const uint8_t *left) {
334
67.9k
  __m256i sum = dc_sum_32(above);
335
67.9k
  (void)left;
336
337
67.9k
  const __m256i sixteen = _mm256_set1_epi16(16);
338
67.9k
  sum = _mm256_add_epi16(sum, sixteen);
339
67.9k
  sum = _mm256_srai_epi16(sum, 5);
340
67.9k
  const __m256i zero = _mm256_setzero_si256();
341
67.9k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
342
67.9k
  row_store_32xh(&row, 32, dst, stride);
343
67.9k
}
344
345
void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
346
                                      const uint8_t *above,
347
116k
                                      const uint8_t *left) {
348
116k
  __m256i sum = dc_sum_32(left);
349
116k
  (void)above;
350
351
116k
  const __m256i sixteen = _mm256_set1_epi16(16);
352
116k
  sum = _mm256_add_epi16(sum, sixteen);
353
116k
  sum = _mm256_srai_epi16(sum, 5);
354
116k
  const __m256i zero = _mm256_setzero_si256();
355
116k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
356
116k
  row_store_32xh(&row, 32, dst, stride);
357
116k
}
358
359
void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
360
                                     const uint8_t *above,
361
27.1k
                                     const uint8_t *left) {
362
27.1k
  (void)above;
363
27.1k
  (void)left;
364
27.1k
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
365
27.1k
  row_store_32xh(&row, 32, dst, stride);
366
27.1k
}
367
368
void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
369
24.4k
                                const uint8_t *above, const uint8_t *left) {
370
24.4k
  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
371
24.4k
  (void)left;
372
24.4k
  row_store_32xh(&row, 32, dst, stride);
373
24.4k
}
374
375
// There are 32 rows in total. This function handles rows
377
// 0,1,2,3 and 16,17,18,19. The next call handles
378
// 4,5,6,7 and 20,21,22,23, so four calls in all
379
// cover the 32 rows.
379
static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst,
380
872k
                                        ptrdiff_t stride) {
381
872k
  __m256i t[4];
382
872k
  __m256i m = _mm256_setzero_si256();
383
872k
  const __m256i inc = _mm256_set1_epi8(4);
384
872k
  int i;
385
386
4.36M
  for (i = 0; i < 4; i++) {
387
3.49M
    t[i] = _mm256_shuffle_epi8(*row, m);
388
3.49M
    __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0);
389
3.49M
    __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11);
390
3.49M
    _mm256_storeu_si256((__m256i *)dst, r0);
391
3.49M
    _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1);
392
3.49M
    dst += stride;
393
3.49M
    m = _mm256_add_epi8(m, inc);
394
3.49M
  }
395
872k
}
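A worked row map may make the interleaving concrete (an illustrative sketch, not taken from the source; it assumes dst is advanced by 4 * stride between the four calls, as aom_h_predictor_32x32_avx2 below does):

// Call k (k = 0..3) writes
//   rows  4*k + 0 ..  4*k + 3   via the first store, and
//   rows 16 + 4*k .. 19 + 4*k   via the second store at dst + (stride << 4),
// so four calls cover rows 0..31 of the 32x32 block.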
396
397
void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
398
218k
                                const uint8_t *above, const uint8_t *left) {
399
218k
  (void)above;
400
218k
  const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);
401
402
218k
  __m256i u = _mm256_unpacklo_epi8(left_col, left_col);
403
404
218k
  __m256i v = _mm256_unpacklo_epi8(u, u);
405
218k
  h_predictor_32x8line(&v, dst, stride);
406
218k
  dst += stride << 2;
407
408
218k
  v = _mm256_unpackhi_epi8(u, u);
409
218k
  h_predictor_32x8line(&v, dst, stride);
410
218k
  dst += stride << 2;
411
412
218k
  u = _mm256_unpackhi_epi8(left_col, left_col);
413
414
218k
  v = _mm256_unpacklo_epi8(u, u);
415
218k
  h_predictor_32x8line(&v, dst, stride);
416
218k
  dst += stride << 2;
417
418
218k
  v = _mm256_unpackhi_epi8(u, u);
419
218k
  h_predictor_32x8line(&v, dst, stride);
420
218k
}
421
422
// -----------------------------------------------------------------------------
423
// Rectangle
424
void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
425
140k
                                 const uint8_t *above, const uint8_t *left) {
426
140k
  const __m128i top_sum = dc_sum_32_sse2(above);
427
140k
  __m128i left_sum = dc_sum_16_sse2(left);
428
140k
  left_sum = _mm_add_epi16(top_sum, left_sum);
429
140k
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(left_sum);
430
140k
  sum += 24;
431
140k
  sum /= 48;
432
140k
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
433
140k
  row_store_32xh(&row, 16, dst, stride);
434
140k
}
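For rectangular blocks the divisor width + height is not a power of two, so the average is finished in scalar code: here 48 samples, rounded with +24 before dividing by 48. A plain-C sketch of the same computation (illustrative, not part of the file):

  uint32_t sum = 0;
  for (int i = 0; i < 32; ++i) sum += above[i];
  for (int i = 0; i < 16; ++i) sum += left[i];
  const uint8_t dc = (uint8_t)((sum + 24) / 48);  // round-to-nearest divide
  // e.g. a flat neighborhood of 100s gives sum = 4800 and dc = 100, which is
  // then broadcast to all 32x16 output pixels by row_store_32xh.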
435
436
void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
437
7.36k
                                 const uint8_t *above, const uint8_t *left) {
438
7.36k
  const __m256i sum_above = dc_sum_32(above);
439
7.36k
  __m256i sum_left = dc_sum_64(left);
440
7.36k
  sum_left = _mm256_add_epi16(sum_left, sum_above);
441
7.36k
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
442
7.36k
  sum += 48;
443
7.36k
  sum /= 96;
444
7.36k
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
445
7.36k
  row_store_32xh(&row, 64, dst, stride);
446
7.36k
}
447
448
void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
449
106k
                                 const uint8_t *above, const uint8_t *left) {
450
106k
  const __m256i sum_above = dc_sum_64(above);
451
106k
  __m256i sum_left = dc_sum_64(left);
452
106k
  sum_left = _mm256_add_epi16(sum_left, sum_above);
453
106k
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
454
106k
  sum += 64;
455
106k
  sum /= 128;
456
106k
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
457
106k
  row_store_64xh(&row, 64, dst, stride);
458
106k
}
459
460
void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
461
17.9k
                                 const uint8_t *above, const uint8_t *left) {
462
17.9k
  const __m256i sum_above = dc_sum_64(above);
463
17.9k
  __m256i sum_left = dc_sum_32(left);
464
17.9k
  sum_left = _mm256_add_epi16(sum_left, sum_above);
465
17.9k
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
466
17.9k
  sum += 48;
467
17.9k
  sum /= 96;
468
17.9k
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
469
17.9k
  row_store_64xh(&row, 32, dst, stride);
470
17.9k
}
471
472
void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
473
76.3k
                                 const uint8_t *above, const uint8_t *left) {
474
76.3k
  const __m256i sum_above = dc_sum_64(above);
475
76.3k
  __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
476
76.3k
  sum_left = _mm256_add_epi16(sum_left, sum_above);
477
76.3k
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
478
76.3k
  sum += 40;
479
76.3k
  sum /= 80;
480
76.3k
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
481
76.3k
  row_store_64xh(&row, 16, dst, stride);
482
76.3k
}
483
484
void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
485
                                     const uint8_t *above,
486
7.04k
                                     const uint8_t *left) {
487
7.04k
  __m256i sum = dc_sum_32(above);
488
7.04k
  (void)left;
489
490
7.04k
  const __m256i sixteen = _mm256_set1_epi16(16);
491
7.04k
  sum = _mm256_add_epi16(sum, sixteen);
492
7.04k
  sum = _mm256_srai_epi16(sum, 5);
493
7.04k
  const __m256i zero = _mm256_setzero_si256();
494
7.04k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
495
7.04k
  row_store_32xh(&row, 16, dst, stride);
496
7.04k
}
497
498
void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
499
                                     const uint8_t *above,
500
1.07k
                                     const uint8_t *left) {
501
1.07k
  __m256i sum = dc_sum_32(above);
502
1.07k
  (void)left;
503
504
1.07k
  const __m256i sixteen = _mm256_set1_epi16(16);
505
1.07k
  sum = _mm256_add_epi16(sum, sixteen);
506
1.07k
  sum = _mm256_srai_epi16(sum, 5);
507
1.07k
  const __m256i zero = _mm256_setzero_si256();
508
1.07k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
509
1.07k
  row_store_32xh(&row, 64, dst, stride);
510
1.07k
}
511
512
void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
513
                                     const uint8_t *above,
514
12.8k
                                     const uint8_t *left) {
515
12.8k
  __m256i sum = dc_sum_64(above);
516
12.8k
  (void)left;
517
518
12.8k
  const __m256i thirtytwo = _mm256_set1_epi16(32);
519
12.8k
  sum = _mm256_add_epi16(sum, thirtytwo);
520
12.8k
  sum = _mm256_srai_epi16(sum, 6);
521
12.8k
  const __m256i zero = _mm256_setzero_si256();
522
12.8k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
523
12.8k
  row_store_64xh(&row, 64, dst, stride);
524
12.8k
}
525
526
void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
527
                                     const uint8_t *above,
528
617
                                     const uint8_t *left) {
529
617
  __m256i sum = dc_sum_64(above);
530
617
  (void)left;
531
532
617
  const __m256i thirtytwo = _mm256_set1_epi16(32);
533
617
  sum = _mm256_add_epi16(sum, thirtytwo);
534
617
  sum = _mm256_srai_epi16(sum, 6);
535
617
  const __m256i zero = _mm256_setzero_si256();
536
617
  __m256i row = _mm256_shuffle_epi8(sum, zero);
537
617
  row_store_64xh(&row, 32, dst, stride);
538
617
}
539
540
void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
541
                                     const uint8_t *above,
542
4.29k
                                     const uint8_t *left) {
543
4.29k
  __m256i sum = dc_sum_64(above);
544
4.29k
  (void)left;
545
546
4.29k
  const __m256i thirtytwo = _mm256_set1_epi16(32);
547
4.29k
  sum = _mm256_add_epi16(sum, thirtytwo);
548
4.29k
  sum = _mm256_srai_epi16(sum, 6);
549
4.29k
  const __m256i zero = _mm256_setzero_si256();
550
4.29k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
551
4.29k
  row_store_64xh(&row, 16, dst, stride);
552
4.29k
}
553
554
void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
555
                                      const uint8_t *above,
556
5.49k
                                      const uint8_t *left) {
557
5.49k
  __m128i sum = dc_sum_16_sse2(left);
558
5.49k
  (void)above;
559
560
5.49k
  const __m128i eight = _mm_set1_epi16(8);
561
5.49k
  sum = _mm_add_epi16(sum, eight);
562
5.49k
  sum = _mm_srai_epi16(sum, 4);
563
5.49k
  const __m128i zero = _mm_setzero_si128();
564
5.49k
  const __m128i r = _mm_shuffle_epi8(sum, zero);
565
5.49k
  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
566
5.49k
  row_store_32xh(&row, 16, dst, stride);
567
5.49k
}
568
569
void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
570
                                      const uint8_t *above,
571
591
                                      const uint8_t *left) {
572
591
  __m256i sum = dc_sum_64(left);
573
591
  (void)above;
574
575
591
  const __m256i thirtytwo = _mm256_set1_epi16(32);
576
591
  sum = _mm256_add_epi16(sum, thirtytwo);
577
591
  sum = _mm256_srai_epi16(sum, 6);
578
591
  const __m256i zero = _mm256_setzero_si256();
579
591
  __m256i row = _mm256_shuffle_epi8(sum, zero);
580
591
  row_store_32xh(&row, 64, dst, stride);
581
591
}
582
583
void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
584
                                      const uint8_t *above,
585
21.1k
                                      const uint8_t *left) {
586
21.1k
  __m256i sum = dc_sum_64(left);
587
21.1k
  (void)above;
588
589
21.1k
  const __m256i thirtytwo = _mm256_set1_epi16(32);
590
21.1k
  sum = _mm256_add_epi16(sum, thirtytwo);
591
21.1k
  sum = _mm256_srai_epi16(sum, 6);
592
21.1k
  const __m256i zero = _mm256_setzero_si256();
593
21.1k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
594
21.1k
  row_store_64xh(&row, 64, dst, stride);
595
21.1k
}
596
597
void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
598
                                      const uint8_t *above,
599
1.11k
                                      const uint8_t *left) {
600
1.11k
  __m256i sum = dc_sum_32(left);
601
1.11k
  (void)above;
602
603
1.11k
  const __m256i sixteen = _mm256_set1_epi16(16);
604
1.11k
  sum = _mm256_add_epi16(sum, sixteen);
605
1.11k
  sum = _mm256_srai_epi16(sum, 5);
606
1.11k
  const __m256i zero = _mm256_setzero_si256();
607
1.11k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
608
1.11k
  row_store_64xh(&row, 32, dst, stride);
609
1.11k
}
610
611
void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
612
                                      const uint8_t *above,
613
344
                                      const uint8_t *left) {
614
344
  __m128i sum = dc_sum_16_sse2(left);
615
344
  (void)above;
616
617
344
  const __m128i eight = _mm_set1_epi16(8);
618
344
  sum = _mm_add_epi16(sum, eight);
619
344
  sum = _mm_srai_epi16(sum, 4);
620
344
  const __m128i zero = _mm_setzero_si128();
621
344
  const __m128i r = _mm_shuffle_epi8(sum, zero);
622
344
  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
623
344
  row_store_64xh(&row, 16, dst, stride);
624
344
}
625
626
void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
627
                                     const uint8_t *above,
628
4.15k
                                     const uint8_t *left) {
629
4.15k
  (void)above;
630
4.15k
  (void)left;
631
4.15k
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
632
4.15k
  row_store_32xh(&row, 16, dst, stride);
633
4.15k
}
634
635
void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
636
                                     const uint8_t *above,
637
1.03k
                                     const uint8_t *left) {
638
1.03k
  (void)above;
639
1.03k
  (void)left;
640
1.03k
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
641
1.03k
  row_store_32xh(&row, 64, dst, stride);
642
1.03k
}
643
644
void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
645
                                     const uint8_t *above,
646
8.68k
                                     const uint8_t *left) {
647
8.68k
  (void)above;
648
8.68k
  (void)left;
649
8.68k
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
650
8.68k
  row_store_64xh(&row, 64, dst, stride);
651
8.68k
}
652
653
void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
654
                                     const uint8_t *above,
655
1.44k
                                     const uint8_t *left) {
656
1.44k
  (void)above;
657
1.44k
  (void)left;
658
1.44k
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
659
1.44k
  row_store_64xh(&row, 32, dst, stride);
660
1.44k
}
661
662
void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
663
                                     const uint8_t *above,
664
811
                                     const uint8_t *left) {
665
811
  (void)above;
666
811
  (void)left;
667
811
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
668
811
  row_store_64xh(&row, 16, dst, stride);
669
811
}
670
671
void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
672
8.94k
                                const uint8_t *above, const uint8_t *left) {
673
8.94k
  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
674
8.94k
  (void)left;
675
8.94k
  row_store_32xh(&row, 16, dst, stride);
676
8.94k
}
677
678
void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
679
567
                                const uint8_t *above, const uint8_t *left) {
680
567
  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
681
567
  (void)left;
682
567
  row_store_32xh(&row, 64, dst, stride);
683
567
}
684
685
void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
686
2.42k
                                const uint8_t *above, const uint8_t *left) {
687
2.42k
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
688
2.42k
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
689
2.42k
  (void)left;
690
2.42k
  row_store_32x2xh(&row0, &row1, 64, dst, stride);
691
2.42k
}
692
693
void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
694
530
                                const uint8_t *above, const uint8_t *left) {
695
530
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
696
530
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
697
530
  (void)left;
698
530
  row_store_32x2xh(&row0, &row1, 32, dst, stride);
699
530
}
700
701
void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
702
742
                                const uint8_t *above, const uint8_t *left) {
703
742
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
704
742
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
705
742
  (void)left;
706
742
  row_store_32x2xh(&row0, &row1, 16, dst, stride);
707
742
}
708
709
// -----------------------------------------------------------------------------
710
// PAETH_PRED
711
712
// Return 16 16-bit pixels in one row (__m256i)
713
static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top,
714
79.1M
                                 const __m256i *topleft) {
715
79.1M
  const __m256i base =
716
79.1M
      _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft);
717
718
79.1M
  __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left));
719
79.1M
  __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top));
720
79.1M
  __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft));
721
722
79.1M
  __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
723
79.1M
  mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));
724
79.1M
  __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);
725
726
79.1M
  pl = _mm256_andnot_si256(mask1, *left);
727
728
79.1M
  ptl = _mm256_and_si256(mask2, *topleft);
729
79.1M
  pt = _mm256_andnot_si256(mask2, *top);
730
79.1M
  pt = _mm256_or_si256(pt, ptl);
731
79.1M
  pt = _mm256_and_si256(mask1, pt);
732
733
79.1M
  return _mm256_or_si256(pt, pl);
734
79.1M
}
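paeth_pred realizes the Paeth selection with compare/and/andnot masks; a scalar reference of the same decision, mirroring the mask logic above (a sketch; paeth_scalar is a hypothetical name and abs() needs <stdlib.h>):

  static uint8_t paeth_scalar(uint8_t left, uint8_t top, uint8_t topleft) {
    const int base = left + top - topleft;
    const int pl = abs(base - left);      // == |top - topleft|
    const int pt = abs(base - top);       // == |left - topleft|
    const int ptl = abs(base - topleft);  // == |left + top - 2 * topleft|
    if (pl <= pt && pl <= ptl) return left;  // mask1 all-zero in the AVX2 code
    if (pt <= ptl) return top;               // mask2 zero
    return topleft;                          // mask2 set
  }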
735
736
// Return 16 8-bit pixels in one row (__m128i)
737
static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
738
78.3M
                                      const __m256i *topleft) {
739
78.3M
  const __m256i p0 = paeth_pred(left, top, topleft);
740
78.3M
  const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
741
78.3M
  const __m256i p = _mm256_packus_epi16(p0, p1);
742
78.3M
  return _mm256_castsi256_si128(p);
743
78.3M
}
744
745
2.23M
static INLINE __m256i get_top_vector(const uint8_t *above) {
746
2.23M
  const __m128i x = _mm_load_si128((const __m128i *)above);
747
2.23M
  const __m128i zero = _mm_setzero_si128();
748
2.23M
  const __m128i t0 = _mm_unpacklo_epi8(x, zero);
749
2.23M
  const __m128i t1 = _mm_unpackhi_epi8(x, zero);
750
2.23M
  return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1);
751
2.23M
}
752
753
void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
754
77.8k
                                   const uint8_t *above, const uint8_t *left) {
755
77.8k
  __m128i x = _mm_loadl_epi64((const __m128i *)left);
756
77.8k
  const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
757
77.8k
  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
758
77.8k
  __m256i rep = _mm256_set1_epi16((short)0x8000);
759
77.8k
  const __m256i one = _mm256_set1_epi16(1);
760
77.8k
  const __m256i top = get_top_vector(above);
761
762
77.8k
  int i;
763
700k
  for (i = 0; i < 8; ++i) {
764
622k
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
765
622k
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
766
767
622k
    _mm_store_si128((__m128i *)dst, row);
768
622k
    dst += stride;
769
622k
    rep = _mm256_add_epi16(rep, one);
770
622k
  }
771
77.8k
}
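The rep constant 0x8000 turns _mm256_shuffle_epi8 into a per-row broadcast: in each 16-bit lane the low control byte selects byte i of the left vector, while the high control byte has its sign bit set, so pshufb writes zero there; adding one per row moves to the next left pixel. A sketch of what l16 equals on iteration i (illustrative):

  // rep lane = 0x8000 + i  =>  control bytes {i, 0x80}
  // low byte  <- byte i of l (the i-th left pixel, same in both 128-bit halves)
  // high byte <- 0           (a 0x80 control byte zeroes the output byte)
  const __m256i l16_equiv = _mm256_set1_epi16(left[i]);  // same value as l16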
772
773
3.94M
static INLINE __m256i get_left_vector(const uint8_t *left) {
774
3.94M
  const __m128i x = _mm_load_si128((const __m128i *)left);
775
3.94M
  return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
776
3.94M
}
777
778
void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
779
137k
                                    const uint8_t *above, const uint8_t *left) {
780
137k
  const __m256i l = get_left_vector(left);
781
137k
  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
782
137k
  __m256i rep = _mm256_set1_epi16((short)0x8000);
783
137k
  const __m256i one = _mm256_set1_epi16(1);
784
137k
  const __m256i top = get_top_vector(above);
785
786
137k
  int i;
787
2.34M
  for (i = 0; i < 16; ++i) {
788
2.20M
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
789
2.20M
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
790
791
2.20M
    _mm_store_si128((__m128i *)dst, row);
792
2.20M
    dst += stride;
793
2.20M
    rep = _mm256_add_epi16(rep, one);
794
2.20M
  }
795
137k
}
796
797
void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
798
1.06M
                                    const uint8_t *above, const uint8_t *left) {
799
1.06M
  __m256i l = get_left_vector(left);
800
1.06M
  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
801
1.06M
  __m256i rep = _mm256_set1_epi16((short)0x8000);
802
1.06M
  const __m256i one = _mm256_set1_epi16(1);
803
1.06M
  const __m256i top = get_top_vector(above);
804
805
1.06M
  int i;
806
18.0M
  for (i = 0; i < 16; ++i) {
807
17.0M
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
808
17.0M
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
809
810
17.0M
    _mm_store_si128((__m128i *)dst, row);
811
17.0M
    dst += stride;
812
17.0M
    rep = _mm256_add_epi16(rep, one);
813
17.0M
  }
814
815
1.06M
  l = get_left_vector(left + 16);
816
1.06M
  rep = _mm256_set1_epi16((short)0x8000);
817
18.0M
  for (i = 0; i < 16; ++i) {
818
17.0M
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
819
17.0M
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
820
821
17.0M
    _mm_store_si128((__m128i *)dst, row);
822
17.0M
    dst += stride;
823
17.0M
    rep = _mm256_add_epi16(rep, one);
824
17.0M
  }
825
1.06M
}
826
827
void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
828
262k
                                    const uint8_t *above, const uint8_t *left) {
829
262k
  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
830
262k
  const __m256i one = _mm256_set1_epi16(1);
831
262k
  const __m256i top = get_top_vector(above);
832
833
1.31M
  for (int j = 0; j < 4; ++j) {
834
1.05M
    const __m256i l = get_left_vector(left + j * 16);
835
1.05M
    __m256i rep = _mm256_set1_epi16((short)0x8000);
836
17.8M
    for (int i = 0; i < 16; ++i) {
837
16.8M
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
838
16.8M
      const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
839
840
16.8M
      _mm_store_si128((__m128i *)dst, row);
841
16.8M
      dst += stride;
842
16.8M
      rep = _mm256_add_epi16(rep, one);
843
16.8M
    }
844
1.05M
  }
845
262k
}
846
847
// Return 32 8-bit pixels in one row (__m256i)
848
static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
849
                                      const __m256i *top1,
850
443k
                                      const __m256i *topleft) {
851
443k
  __m256i p0 = paeth_pred(left, top0, topleft);
852
443k
  __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
853
443k
  const __m256i x0 = _mm256_packus_epi16(p0, p1);
854
855
443k
  p0 = paeth_pred(left, top1, topleft);
856
443k
  p1 = _mm256_permute4x64_epi64(p0, 0xe);
857
443k
  const __m256i x1 = _mm256_packus_epi16(p0, p1);
858
859
443k
  return _mm256_permute2x128_si256(x0, x1, 0x20);
860
443k
}
861
862
void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
863
27.7k
                                    const uint8_t *above, const uint8_t *left) {
864
27.7k
  const __m256i l = get_left_vector(left);
865
27.7k
  const __m256i t0 = get_top_vector(above);
866
27.7k
  const __m256i t1 = get_top_vector(above + 16);
867
27.7k
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
868
27.7k
  __m256i rep = _mm256_set1_epi16((short)0x8000);
869
27.7k
  const __m256i one = _mm256_set1_epi16(1);
870
871
27.7k
  int i;
872
471k
  for (i = 0; i < 16; ++i) {
873
443k
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
874
875
443k
    const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);
876
877
443k
    _mm256_storeu_si256((__m256i *)dst, r);
878
879
443k
    dst += stride;
880
443k
    rep = _mm256_add_epi16(rep, one);
881
443k
  }
882
27.7k
}
883
884
void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
885
210k
                                    const uint8_t *above, const uint8_t *left) {
886
210k
  __m256i l = get_left_vector(left);
887
210k
  const __m256i t0 = get_top_vector(above);
888
210k
  const __m256i t1 = get_top_vector(above + 16);
889
210k
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
890
210k
  __m256i rep = _mm256_set1_epi16((short)0x8000);
891
210k
  const __m256i one = _mm256_set1_epi16(1);
892
893
210k
  int i;
894
3.57M
  for (i = 0; i < 16; ++i) {
895
3.36M
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
896
897
3.36M
    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
898
3.36M
    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
899
900
3.36M
    _mm_store_si128((__m128i *)dst, r0);
901
3.36M
    _mm_store_si128((__m128i *)(dst + 16), r1);
902
903
3.36M
    dst += stride;
904
3.36M
    rep = _mm256_add_epi16(rep, one);
905
3.36M
  }
906
907
210k
  l = get_left_vector(left + 16);
908
210k
  rep = _mm256_set1_epi16((short)0x8000);
909
3.57M
  for (i = 0; i < 16; ++i) {
910
3.36M
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
911
912
3.36M
    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
913
3.36M
    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
914
915
3.36M
    _mm_store_si128((__m128i *)dst, r0);
916
3.36M
    _mm_store_si128((__m128i *)(dst + 16), r1);
917
918
3.36M
    dst += stride;
919
3.36M
    rep = _mm256_add_epi16(rep, one);
920
3.36M
  }
921
210k
}
922
923
void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
924
4.49k
                                    const uint8_t *above, const uint8_t *left) {
925
4.49k
  const __m256i t0 = get_top_vector(above);
926
4.49k
  const __m256i t1 = get_top_vector(above + 16);
927
4.49k
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
928
4.49k
  const __m256i one = _mm256_set1_epi16(1);
929
930
4.49k
  int i, j;
931
22.4k
  for (j = 0; j < 4; ++j) {
932
17.9k
    const __m256i l = get_left_vector(left + j * 16);
933
17.9k
    __m256i rep = _mm256_set1_epi16((short)0x8000);
934
305k
    for (i = 0; i < 16; ++i) {
935
287k
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
936
937
287k
      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
938
287k
      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
939
940
287k
      _mm_store_si128((__m128i *)dst, r0);
941
287k
      _mm_store_si128((__m128i *)(dst + 16), r1);
942
943
287k
      dst += stride;
944
287k
      rep = _mm256_add_epi16(rep, one);
945
287k
    }
946
17.9k
  }
947
4.49k
}
948
949
void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
950
4.83k
                                    const uint8_t *above, const uint8_t *left) {
951
4.83k
  const __m256i t0 = get_top_vector(above);
952
4.83k
  const __m256i t1 = get_top_vector(above + 16);
953
4.83k
  const __m256i t2 = get_top_vector(above + 32);
954
4.83k
  const __m256i t3 = get_top_vector(above + 48);
955
4.83k
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
956
4.83k
  const __m256i one = _mm256_set1_epi16(1);
957
958
4.83k
  int i, j;
959
14.5k
  for (j = 0; j < 2; ++j) {
960
9.67k
    const __m256i l = get_left_vector(left + j * 16);
961
9.67k
    __m256i rep = _mm256_set1_epi16((short)0x8000);
962
164k
    for (i = 0; i < 16; ++i) {
963
154k
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
964
965
154k
      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
966
154k
      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
967
154k
      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
968
154k
      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
969
970
154k
      _mm_store_si128((__m128i *)dst, r0);
971
154k
      _mm_store_si128((__m128i *)(dst + 16), r1);
972
154k
      _mm_store_si128((__m128i *)(dst + 32), r2);
973
154k
      _mm_store_si128((__m128i *)(dst + 48), r3);
974
975
154k
      dst += stride;
976
154k
      rep = _mm256_add_epi16(rep, one);
977
154k
    }
978
9.67k
  }
979
4.83k
}
980
981
void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
982
36.4k
                                    const uint8_t *above, const uint8_t *left) {
983
36.4k
  const __m256i t0 = get_top_vector(above);
984
36.4k
  const __m256i t1 = get_top_vector(above + 16);
985
36.4k
  const __m256i t2 = get_top_vector(above + 32);
986
36.4k
  const __m256i t3 = get_top_vector(above + 48);
987
36.4k
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
988
36.4k
  const __m256i one = _mm256_set1_epi16(1);
989
990
36.4k
  int i, j;
991
182k
  for (j = 0; j < 4; ++j) {
992
145k
    const __m256i l = get_left_vector(left + j * 16);
993
145k
    __m256i rep = _mm256_set1_epi16((short)0x8000);
994
2.47M
    for (i = 0; i < 16; ++i) {
995
2.33M
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
996
997
2.33M
      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
998
2.33M
      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
999
2.33M
      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
1000
2.33M
      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
1001
1002
2.33M
      _mm_store_si128((__m128i *)dst, r0);
1003
2.33M
      _mm_store_si128((__m128i *)(dst + 16), r1);
1004
2.33M
      _mm_store_si128((__m128i *)(dst + 32), r2);
1005
2.33M
      _mm_store_si128((__m128i *)(dst + 48), r3);
1006
1007
2.33M
      dst += stride;
1008
2.33M
      rep = _mm256_add_epi16(rep, one);
1009
2.33M
    }
1010
145k
  }
1011
36.4k
}
1012
1013
void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
1014
10.7k
                                    const uint8_t *above, const uint8_t *left) {
1015
10.7k
  const __m256i t0 = get_top_vector(above);
1016
10.7k
  const __m256i t1 = get_top_vector(above + 16);
1017
10.7k
  const __m256i t2 = get_top_vector(above + 32);
1018
10.7k
  const __m256i t3 = get_top_vector(above + 48);
1019
10.7k
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
1020
10.7k
  const __m256i one = _mm256_set1_epi16(1);
1021
1022
10.7k
  int i;
1023
10.7k
  const __m256i l = get_left_vector(left);
1024
10.7k
  __m256i rep = _mm256_set1_epi16((short)0x8000);
1025
181k
  for (i = 0; i < 16; ++i) {
1026
171k
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
1027
1028
171k
    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
1029
171k
    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
1030
171k
    const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
1031
171k
    const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
1032
1033
171k
    _mm_store_si128((__m128i *)dst, r0);
1034
171k
    _mm_store_si128((__m128i *)(dst + 16), r1);
1035
171k
    _mm_store_si128((__m128i *)(dst + 32), r2);
1036
171k
    _mm_store_si128((__m128i *)(dst + 48), r3);
1037
1038
171k
    dst += stride;
1039
171k
    rep = _mm256_add_epi16(rep, one);
1040
171k
  }
1041
10.7k
}
1042
1043
#define PERM4x64(c0, c1, c2, c3) c0 + (c1 << 2) + (c2 << 4) + (c3 << 6)
1044
#define PERM2x128(c0, c1) c0 + (c1 << 4)
1045
1046
static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2(
1047
228k
    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1048
228k
  const int frac_bits = 6 - upsample_above;
1049
228k
  const int max_base_x = ((N + 4) - 1) << upsample_above;
1050
1051
228k
  assert(dx > 0);
1052
  // pre-filter above pixels
1053
  // store in temp buffers:
1054
  //   above[x] * 32 + 16
1055
  //   above[x+1] - above[x]
1056
  // final pixels will be calculated as:
1057
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1058
0
  __m256i a0, a1, a32, a16;
1059
228k
  __m256i diff, c3f;
1060
228k
  __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
1061
228k
  __m128i a0_128, a1_128;
1062
228k
  a16 = _mm256_set1_epi16(16);
1063
228k
  a_mbase_x = _mm_set1_epi16(above[max_base_x]);
1064
228k
  max_base_x128 = _mm_set1_epi16(max_base_x);
1065
228k
  c3f = _mm256_set1_epi16(0x3f);
1066
1067
228k
  int x = dx;
1068
1.83M
  for (int r = 0; r < N; r++) {
1069
1.60M
    __m256i b, res, shift;
1070
1.60M
    __m128i res1;
1071
1072
1.60M
    int base = x >> frac_bits;
1073
1.60M
    if (base >= max_base_x) {
1074
5.87k
      for (int i = r; i < N; ++i) {
1075
3.30k
        dst[i] = a_mbase_x;  // save 4 values
1076
3.30k
      }
1077
2.56k
      return;
1078
2.56k
    }
1079
1080
1.60M
    a0_128 = _mm_loadu_si128((__m128i *)(above + base));
1081
1.60M
    a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
1082
1083
1.60M
    if (upsample_above) {
1084
627k
      a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)HighbdEvenOddMaskx4[0]);
1085
627k
      a1_128 = _mm_srli_si128(a0_128, 8);
1086
1087
627k
      base_inc128 = _mm_setr_epi16(base, base + 2, base + 4, base + 6, base + 8,
1088
627k
                                   base + 10, base + 12, base + 14);
1089
627k
      shift = _mm256_srli_epi16(
1090
627k
          _mm256_and_si256(
1091
627k
              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above),
1092
627k
              _mm256_set1_epi16(0x3f)),
1093
627k
          1);
1094
977k
    } else {
1095
977k
      base_inc128 = _mm_setr_epi16(base, base + 1, base + 2, base + 3, base + 4,
1096
977k
                                   base + 5, base + 6, base + 7);
1097
977k
      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1098
977k
    }
1099
1.60M
    a0 = _mm256_castsi128_si256(a0_128);
1100
1.60M
    a1 = _mm256_castsi128_si256(a1_128);
1101
1.60M
    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1102
1.60M
    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1103
1.60M
    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1104
1105
1.60M
    b = _mm256_mullo_epi16(diff, shift);
1106
1.60M
    res = _mm256_add_epi16(a32, b);
1107
1.60M
    res = _mm256_srli_epi16(res, 5);
1108
1.60M
    res1 = _mm256_castsi256_si128(res);
1109
1110
1.60M
    mask128 = _mm_cmpgt_epi16(max_base_x128, base_inc128);
1111
1.60M
    dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
1112
1.60M
    x += dx;
1113
1.60M
  }
1114
228k
}
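The per-row arithmetic follows the pre-filter comment at the top of the function, with x = (r + 1) * dx on row r. A scalar sketch of one non-upsampled row (illustrative; z1_row_scalar is a hypothetical name):

  static void z1_row_scalar(uint16_t *dst, int w, const uint16_t *above,
                            int max_base_x, int x) {
    const int shift = (x & 0x3f) >> 1;  // same 5-bit weight for the whole row
    for (int c = 0; c < w; ++c) {
      const int base = (x >> 6) + c;    // frac_bits = 6 when upsample_above = 0
      dst[c] = (base < max_base_x)
                   ? (uint16_t)((above[base] * 32 + 16 +
                                 (above[base + 1] - above[base]) * shift) >> 5)
                   : above[max_base_x];  // the a_mbase_x / blendv clamp
    }
  }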
1115
1116
static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_4xN_internal_avx2(
1117
184k
    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1118
184k
  const int frac_bits = 6 - upsample_above;
1119
184k
  const int max_base_x = ((N + 4) - 1) << upsample_above;
1120
1121
184k
  assert(dx > 0);
1122
  // pre-filter above pixels
1123
  // store in temp buffers:
1124
  //   above[x] * 32 + 16
1125
  //   above[x+1] - above[x]
1126
  // final pixels will be calculated as:
1127
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1128
0
  __m256i a0, a1, a32, a16;
1129
184k
  __m256i diff;
1130
184k
  __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
1131
1132
184k
  a16 = _mm256_set1_epi32(16);
1133
184k
  a_mbase_x = _mm_set1_epi16(above[max_base_x]);
1134
184k
  max_base_x128 = _mm_set1_epi32(max_base_x);
1135
1136
184k
  int x = dx;
1137
1.40M
  for (int r = 0; r < N; r++) {
1138
1.21M
    __m256i b, res, shift;
1139
1.21M
    __m128i res1;
1140
1141
1.21M
    int base = x >> frac_bits;
1142
1.21M
    if (base >= max_base_x) {
1143
684
      for (int i = r; i < N; ++i) {
1144
431
        dst[i] = a_mbase_x;  // save 4 values
1145
431
      }
1146
253
      return;
1147
253
    }
1148
1149
1.21M
    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1150
1.21M
    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1151
1152
1.21M
    if (upsample_above) {
1153
276k
      a0 = _mm256_permutevar8x32_epi32(
1154
276k
          a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1155
276k
      a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
1156
276k
      base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6);
1157
276k
      shift = _mm256_srli_epi32(
1158
276k
          _mm256_and_si256(
1159
276k
              _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
1160
276k
              _mm256_set1_epi32(0x3f)),
1161
276k
          1);
1162
942k
    } else {
1163
942k
      base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3);
1164
942k
      shift = _mm256_srli_epi32(
1165
942k
          _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1166
942k
    }
1167
1168
1.21M
    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1169
1.21M
    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1170
1.21M
    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1171
1172
1.21M
    b = _mm256_mullo_epi32(diff, shift);
1173
1.21M
    res = _mm256_add_epi32(a32, b);
1174
1.21M
    res = _mm256_srli_epi32(res, 5);
1175
1176
1.21M
    res1 = _mm256_castsi256_si128(res);
1177
1.21M
    res1 = _mm_packus_epi32(res1, res1);
1178
1179
1.21M
    mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128);
1180
1.21M
    mask128 = _mm_packs_epi32(mask128, mask128);  // narrow to 16-bit lanes
1181
1.21M
    dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
1182
1.21M
    x += dx;
1183
1.21M
  }
1184
184k
}
1185
1186
static void highbd_dr_prediction_z1_4xN_avx2(int N, uint16_t *dst,
1187
                                             ptrdiff_t stride,
1188
                                             const uint16_t *above,
1189
                                             int upsample_above, int dx,
1190
153k
                                             int bd) {
1191
153k
  __m128i dstvec[16];
1192
153k
  if (bd < 12) {
1193
71.1k
    highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above,
1194
71.1k
                                              dx);
1195
82.4k
  } else {
1196
82.4k
    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(N, dstvec, above,
1197
82.4k
                                                    upsample_above, dx);
1198
82.4k
  }
1199
1.10M
  for (int i = 0; i < N; i++) {
1200
954k
    _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
1201
954k
  }
1202
153k
}
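The bd < 12 split presumably exists because the 16-bit internal path would overflow on 12-bit samples; a worked bound (illustrative arithmetic, not from the source):

  // 12-bit input: above[x] <= 4095, so above[x] * 32 + 16 can reach
  //   4095 * 32 + 16 = 131,056 > 32,767 (does not fit a signed 16-bit lane),
  // whereas 10-bit input gives 1023 * 32 + 16 = 32,752 <= 32,767, so the
  // epi16 variant is safe for bd < 12 and the epi32 variant handles bd == 12.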
1203
1204
static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_8xN_internal_avx2(
1205
269k
    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1206
269k
  const int frac_bits = 6 - upsample_above;
1207
269k
  const int max_base_x = ((8 + N) - 1) << upsample_above;
1208
1209
269k
  assert(dx > 0);
1210
  // pre-filter above pixels
1211
  // store in temp buffers:
1212
  //   above[x] * 32 + 16
1213
  //   above[x+1] - above[x]
1214
  // final pixels will be calculated as:
1215
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1216
0
  __m256i a0, a1, a0_1, a1_1, a32, a16;
1217
269k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1218
1219
269k
  a16 = _mm256_set1_epi32(16);
1220
269k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1221
269k
  max_base_x256 = _mm256_set1_epi32(max_base_x);
1222
1223
269k
  int x = dx;
1224
2.84M
  for (int r = 0; r < N; r++) {
1225
2.57M
    __m256i b, res, res1, shift;
1226
1227
2.57M
    int base = x >> frac_bits;
1228
2.57M
    if (base >= max_base_x) {
1229
1.90k
      for (int i = r; i < N; ++i) {
1230
1.17k
        dst[i] = _mm256_castsi256_si128(a_mbase_x);  // save 8 values
1231
1.17k
      }
1232
735
      return;
1233
735
    }
1234
1235
2.57M
    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1236
2.57M
    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1237
1238
2.57M
    if (upsample_above) {
1239
246k
      a0 = _mm256_permutevar8x32_epi32(
1240
246k
          a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1241
246k
      a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
1242
1243
246k
      a0_1 =
1244
246k
          _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
1245
246k
      a0_1 = _mm256_permutevar8x32_epi32(
1246
246k
          a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1247
246k
      a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));
1248
1249
246k
      a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
1250
246k
      a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);
1251
246k
      base_inc256 =
1252
246k
          _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8,
1253
246k
                            base + 10, base + 12, base + 14);
1254
246k
      shift = _mm256_srli_epi32(
1255
246k
          _mm256_and_si256(
1256
246k
              _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
1257
246k
              _mm256_set1_epi32(0x3f)),
1258
246k
          1);
1259
2.32M
    } else {
1260
2.32M
      base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3,
1261
2.32M
                                      base + 4, base + 5, base + 6, base + 7);
1262
2.32M
      shift = _mm256_srli_epi32(
1263
2.32M
          _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1264
2.32M
    }
1265
1266
2.57M
    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1267
2.57M
    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1268
2.57M
    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1269
1270
2.57M
    b = _mm256_mullo_epi32(diff, shift);
1271
2.57M
    res = _mm256_add_epi32(a32, b);
1272
2.57M
    res = _mm256_srli_epi32(res, 5);
1273
1274
2.57M
    res1 = _mm256_packus_epi32(
1275
2.57M
        res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
1276
1277
2.57M
    mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256);
1278
2.57M
    mask256 = _mm256_packs_epi32(
1279
2.57M
        mask256, _mm256_castsi128_si256(
1280
2.57M
                     _mm256_extracti128_si256(mask256, 1)));  // narrow to 16 bit
1281
2.57M
    res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1282
2.57M
    dst[r] = _mm256_castsi256_si128(res1);
1283
2.57M
    x += dx;
1284
2.57M
  }
1285
269k
}
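The pre-filter comment repeated in these kernels describes one fixed-point linear interpolation per output sample, clamped against the last usable above pixel; a minimal scalar sketch of a single z1 row (illustrative names, frac_bits = 6, no upsampling assumed):

static void z1_row_sketch(uint16_t *row, int width, const uint16_t *above,
                          int max_base_x, int x /* = (r + 1) * dx */) {
  const int base = x >> 6;            // frac_bits = 6
  const int shift = (x & 0x3f) >> 1;  // 5-bit fractional position
  for (int c = 0; c < width; ++c) {
    if (base + c < max_base_x) {
      const int a0 = above[base + c], a1 = above[base + c + 1];
      // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
      row[c] = (uint16_t)((a0 * 32 + 16 + (a1 - a0) * shift) >> 5);
    } else {
      row[c] = above[max_base_x];  // same clamp as the blendv against a_mbase_x
    }
  }
}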
1286
1287
static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2(
1288
233k
    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1289
233k
  const int frac_bits = 6 - upsample_above;
1290
233k
  const int max_base_x = ((8 + N) - 1) << upsample_above;
1291
1292
233k
  assert(dx > 0);
1293
  // pre-filter above pixels
1294
  // store in temp buffers:
1295
  //   above[x] * 32 + 16
1296
  //   above[x+1] - above[x]
1297
  // final pixels will be calculated as:
1298
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1299
0
  __m256i a0, a1, a32, a16, c3f;
1300
233k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1301
233k
  __m128i a0_x128, a1_x128;
1302
1303
233k
  a16 = _mm256_set1_epi16(16);
1304
233k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1305
233k
  max_base_x256 = _mm256_set1_epi16(max_base_x);
1306
233k
  c3f = _mm256_set1_epi16(0x3f);
1307
1308
233k
  int x = dx;
1309
3.35M
  for (int r = 0; r < N; r++) {
1310
3.12M
    __m256i b, res, res1, shift;
1311
1312
3.12M
    int base = x >> frac_bits;
1313
3.12M
    if (base >= max_base_x) {
1314
4.51k
      for (int i = r; i < N; ++i) {
1315
3.12k
        dst[i] = _mm256_castsi256_si128(a_mbase_x);  // save 8 values
1316
3.12k
      }
1317
1.38k
      return;
1318
1.38k
    }
1319
1320
3.12M
    a0_x128 = _mm_loadu_si128((__m128i *)(above + base));
1321
3.12M
    if (upsample_above) {
1322
605k
      __m128i mask, atmp0, atmp1, atmp2, atmp3;
1323
605k
      a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 8));
1324
605k
      atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
1325
605k
      atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
1326
605k
      atmp2 =
1327
605k
          _mm_shuffle_epi8(a0_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
1328
605k
      atmp3 =
1329
605k
          _mm_shuffle_epi8(a1_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
1330
605k
      mask =
1331
605k
          _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[0], _mm_set1_epi8(15));
1332
605k
      a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
1333
605k
      mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[0] + 16),
1334
605k
                            _mm_set1_epi8(15));
1335
605k
      a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
1336
1337
605k
      base_inc256 = _mm256_setr_epi16(base, base + 2, base + 4, base + 6,
1338
605k
                                      base + 8, base + 10, base + 12, base + 14,
1339
605k
                                      0, 0, 0, 0, 0, 0, 0, 0);
1340
605k
      shift = _mm256_srli_epi16(
1341
605k
          _mm256_and_si256(
1342
605k
              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
1343
605k
          1);
1344
2.51M
    } else {
1345
2.51M
      a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 1));
1346
2.51M
      base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1347
2.51M
                                      base + 4, base + 5, base + 6, base + 7, 0,
1348
2.51M
                                      0, 0, 0, 0, 0, 0, 0);
1349
2.51M
      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1350
2.51M
    }
1351
3.12M
    a0 = _mm256_castsi128_si256(a0_x128);
1352
3.12M
    a1 = _mm256_castsi128_si256(a1_x128);
1353
1354
3.12M
    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1355
3.12M
    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1356
3.12M
    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1357
1358
3.12M
    b = _mm256_mullo_epi16(diff, shift);
1359
3.12M
    res = _mm256_add_epi16(a32, b);
1360
3.12M
    res = _mm256_srli_epi16(res, 5);
1361
1362
3.12M
    mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1363
3.12M
    res1 = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1364
3.12M
    dst[r] = _mm256_castsi256_si128(res1);
1365
3.12M
    x += dx;
1366
3.12M
  }
1367
233k
}
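For the upsample_above == 1 branch just above, the even/odd shuffles appear to implement the mapping a0[c] = above[base + 2c], a1[c] = above[base + 2c + 1] with shift = ((x << 1) & 0x3f) >> 1; the upsampled edge interleaves filtered half-sample values with the original samples, so indices advance by two per output pixel (my reading of the shuffle masks and the base_inc256 stride, not stated in the source).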
1368
1369
static void highbd_dr_prediction_z1_8xN_avx2(int N, uint16_t *dst,
1370
                                             ptrdiff_t stride,
1371
                                             const uint16_t *above,
1372
                                             int upsample_above, int dx,
1373
205k
                                             int bd) {
1374
205k
  __m128i dstvec[32];
1375
205k
  if (bd < 12) {
1376
87.6k
    highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above,
1377
87.6k
                                              dx);
1378
118k
  } else {
1379
118k
    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(N, dstvec, above,
1380
118k
                                                    upsample_above, dx);
1381
118k
  }
1382
2.13M
  for (int i = 0; i < N; i++) {
1383
1.92M
    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
1384
1.92M
  }
1385
205k
}
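The bd < 12 split above looks like a lane-width question (my inference from the arithmetic, not stated in the source): before the final >> 5 the interpolation equals (32 - shift) * a[x] + shift * a[x + 1] + 16, which is at most 32 * 1023 + 16 = 32752 for 10-bit input and fits a 16-bit lane, but up to 32 * 4095 + 16 = 131056 for 12-bit input, hence the *_32bit_* variants that widen to 32-bit lanes.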
1386
1387
static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_16xN_internal_avx2(
1388
125k
    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1389
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1390
125k
  (void)upsample_above;
1391
125k
  const int frac_bits = 6;
1392
125k
  const int max_base_x = ((16 + N) - 1);
1393
1394
  // pre-filter above pixels
1395
  // store in temp buffers:
1396
  //   above[x] * 32 + 16
1397
  //   above[x+1] - above[x]
1398
  // final pixels will be calculated as:
1399
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1400
125k
  __m256i a0, a0_1, a1, a1_1, a32, a16;
1401
125k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1402
1403
125k
  a16 = _mm256_set1_epi32(16);
1404
125k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1405
125k
  max_base_x256 = _mm256_set1_epi16(max_base_x);
1406
1407
125k
  int x = dx;
1408
1.41M
  for (int r = 0; r < N; r++) {
1409
1.29M
    __m256i b, res[2], res1;
1410
1411
1.29M
    int base = x >> frac_bits;
1412
1.29M
    if (base >= max_base_x) {
1413
987
      for (int i = r; i < N; ++i) {
1414
812
        dstvec[i] = a_mbase_x;  // save 16 values
1415
812
      }
1416
175
      return;
1417
175
    }
1418
1.29M
    __m256i shift = _mm256_srli_epi32(
1419
1.29M
        _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1420
1421
1.29M
    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1422
1.29M
    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1423
1424
1.29M
    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1425
1.29M
    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1426
1.29M
    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1427
1.29M
    b = _mm256_mullo_epi32(diff, shift);
1428
1429
1.29M
    res[0] = _mm256_add_epi32(a32, b);
1430
1.29M
    res[0] = _mm256_srli_epi32(res[0], 5);
1431
1.29M
    res[0] = _mm256_packus_epi32(
1432
1.29M
        res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1433
1434
1.29M
    int mdif = max_base_x - base;
1435
1.29M
    if (mdif > 8) {
1436
1.29M
      a0_1 =
1437
1.29M
          _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
1438
1.29M
      a1_1 =
1439
1.29M
          _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));
1440
1441
1.29M
      diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
1442
1.29M
      a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
1443
1.29M
      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
1444
1.29M
      b = _mm256_mullo_epi32(diff, shift);
1445
1446
1.29M
      res[1] = _mm256_add_epi32(a32, b);
1447
1.29M
      res[1] = _mm256_srli_epi32(res[1], 5);
1448
1.29M
      res[1] = _mm256_packus_epi32(
1449
1.29M
          res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1450
1.29M
    } else {
1451
2.73k
      res[1] = a_mbase_x;
1452
2.73k
    }
1453
1.29M
    res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1454
1.29M
                                   1);  // 16 16bit values
1455
1456
1.29M
    base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1457
1.29M
                                    base + 4, base + 5, base + 6, base + 7,
1458
1.29M
                                    base + 8, base + 9, base + 10, base + 11,
1459
1.29M
                                    base + 12, base + 13, base + 14, base + 15);
1460
1.29M
    mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1461
1.29M
    dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1462
1.29M
    x += dx;
1463
1.29M
  }
1464
125k
}
1465
1466
static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2(
1467
223k
    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1468
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1469
223k
  (void)upsample_above;
1470
223k
  const int frac_bits = 6;
1471
223k
  const int max_base_x = ((16 + N) - 1);
1472
1473
  // pre-filter above pixels
1474
  // store in temp buffers:
1475
  //   above[x] * 32 + 16
1476
  //   above[x+1] - above[x]
1477
  // final pixels will be calculated as:
1478
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1479
223k
  __m256i a0, a1, a32, a16, c3f;
1480
223k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1481
1482
223k
  a16 = _mm256_set1_epi16(16);
1483
223k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1484
223k
  max_base_x256 = _mm256_set1_epi16(max_base_x);
1485
223k
  c3f = _mm256_set1_epi16(0x3f);
1486
1487
223k
  int x = dx;
1488
4.16M
  for (int r = 0; r < N; r++) {
1489
3.94M
    __m256i b, res;
1490
1491
3.94M
    int base = x >> frac_bits;
1492
3.94M
    if (base >= max_base_x) {
1493
2.51k
      for (int i = r; i < N; ++i) {
1494
2.00k
        dstvec[i] = a_mbase_x;  // save 16 values
1495
2.00k
      }
1496
507
      return;
1497
507
    }
1498
3.94M
    __m256i shift =
1499
3.94M
        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1500
1501
3.94M
    a0 = _mm256_loadu_si256((__m256i *)(above + base));
1502
3.94M
    a1 = _mm256_loadu_si256((__m256i *)(above + base + 1));
1503
1504
3.94M
    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1505
3.94M
    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1506
3.94M
    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1507
3.94M
    b = _mm256_mullo_epi16(diff, shift);
1508
1509
3.94M
    res = _mm256_add_epi16(a32, b);
1510
3.94M
    res = _mm256_srli_epi16(res, 5);  // 16 16bit values
1511
1512
3.94M
    base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1513
3.94M
                                    base + 4, base + 5, base + 6, base + 7,
1514
3.94M
                                    base + 8, base + 9, base + 10, base + 11,
1515
3.94M
                                    base + 12, base + 13, base + 14, base + 15);
1516
3.94M
    mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1517
3.94M
    dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1518
3.94M
    x += dx;
1519
3.94M
  }
1520
223k
}
1521
1522
static void highbd_dr_prediction_z1_16xN_avx2(int N, uint16_t *dst,
1523
                                              ptrdiff_t stride,
1524
                                              const uint16_t *above,
1525
                                              int upsample_above, int dx,
1526
191k
                                              int bd) {
1527
191k
  __m256i dstvec[64];
1528
191k
  if (bd < 12) {
1529
95.6k
    highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above,
1530
95.6k
                                               dx);
1531
96.0k
  } else {
1532
96.0k
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(N, dstvec, above,
1533
96.0k
                                                     upsample_above, dx);
1534
96.0k
  }
1535
2.57M
  for (int i = 0; i < N; i++) {
1536
2.38M
    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
1537
2.38M
  }
1538
191k
}
1539
1540
static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_32xN_internal_avx2(
1541
14.8k
    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1542
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1543
14.8k
  (void)upsample_above;
1544
14.8k
  const int frac_bits = 6;
1545
14.8k
  const int max_base_x = ((32 + N) - 1);
1546
1547
  // pre-filter above pixels
1548
  // store in temp buffers:
1549
  //   above[x] * 32 + 16
1550
  //   above[x+1] - above[x]
1551
  // final pixels will be calculated as:
1552
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1553
14.8k
  __m256i a0, a0_1, a1, a1_1, a32, a16, c3f;
1554
14.8k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1555
1556
14.8k
  a16 = _mm256_set1_epi32(16);
1557
14.8k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1558
14.8k
  max_base_x256 = _mm256_set1_epi16(max_base_x);
1559
14.8k
  c3f = _mm256_set1_epi16(0x3f);
1560
1561
14.8k
  int x = dx;
1562
359k
  for (int r = 0; r < N; r++) {
1563
344k
    __m256i b, res[2], res1;
1564
1565
344k
    int base = x >> frac_bits;
1566
344k
    if (base >= max_base_x) {
1567
0
      for (int i = r; i < N; ++i) {
1568
0
        dstvec[i] = a_mbase_x;  // save 32 values
1569
0
        dstvec[i + N] = a_mbase_x;
1570
0
      }
1571
0
      return;
1572
0
    }
1573
1574
344k
    __m256i shift =
1575
344k
        _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
1576
1577
1.03M
    for (int j = 0; j < 32; j += 16) {
1578
688k
      int mdif = max_base_x - (base + j);
1579
688k
      if (mdif <= 0) {
1580
416
        res1 = a_mbase_x;
1581
688k
      } else {
1582
688k
        a0 = _mm256_cvtepu16_epi32(
1583
688k
            _mm_loadu_si128((__m128i *)(above + base + j)));
1584
688k
        a1 = _mm256_cvtepu16_epi32(
1585
688k
            _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
1586
1587
688k
        diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1588
688k
        a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1589
688k
        a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1590
688k
        b = _mm256_mullo_epi32(diff, shift);
1591
1592
688k
        res[0] = _mm256_add_epi32(a32, b);
1593
688k
        res[0] = _mm256_srli_epi32(res[0], 5);
1594
688k
        res[0] = _mm256_packus_epi32(
1595
688k
            res[0],
1596
688k
            _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1597
688k
        if (mdif > 8) {
1598
686k
          a0_1 = _mm256_cvtepu16_epi32(
1599
686k
              _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
1600
686k
          a1_1 = _mm256_cvtepu16_epi32(
1601
686k
              _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
1602
1603
686k
          diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
1604
686k
          a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
1605
686k
          a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
1606
686k
          b = _mm256_mullo_epi32(diff, shift);
1607
1608
686k
          res[1] = _mm256_add_epi32(a32, b);
1609
686k
          res[1] = _mm256_srli_epi32(res[1], 5);
1610
686k
          res[1] = _mm256_packus_epi32(
1611
686k
              res[1],
1612
686k
              _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1613
686k
        } else {
1614
1.84k
          res[1] = a_mbase_x;
1615
1.84k
        }
1616
688k
        res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1617
688k
                                       1);  // 16 16bit values
1618
688k
        base_inc256 = _mm256_setr_epi16(
1619
688k
            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1620
688k
            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1621
688k
            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1622
688k
            base + j + 13, base + j + 14, base + j + 15);
1623
1624
688k
        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1625
688k
        res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1626
688k
      }
1627
688k
      if (!j) {
1628
344k
        dstvec[r] = res1;
1629
344k
      } else {
1630
344k
        dstvec[r + N] = res1;
1631
344k
      }
1632
688k
    }
1633
344k
    x += dx;
1634
344k
  }
1635
14.8k
}
1636
1637
static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2(
1638
165k
    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1639
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1640
165k
  (void)upsample_above;
1641
165k
  const int frac_bits = 6;
1642
165k
  const int max_base_x = ((32 + N) - 1);
1643
1644
  // pre-filter above pixels
1645
  // store in temp buffers:
1646
  //   above[x] * 32 + 16
1647
  //   above[x+1] - above[x]
1648
  // final pixels will be calculated as:
1649
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1650
165k
  __m256i a0, a1, a32, a16, c3f;
1651
165k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1652
1653
165k
  a16 = _mm256_set1_epi16(16);
1654
165k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1655
165k
  max_base_x256 = _mm256_set1_epi16(max_base_x);
1656
165k
  c3f = _mm256_set1_epi16(0x3f);
1657
1658
165k
  int x = dx;
1659
4.57M
  for (int r = 0; r < N; r++) {
1660
4.40M
    __m256i b, res;
1661
1662
4.40M
    int base = x >> frac_bits;
1663
4.40M
    if (base >= max_base_x) {
1664
0
      for (int i = r; i < N; ++i) {
1665
0
        dstvec[i] = a_mbase_x;  // save 32 values
1666
0
        dstvec[i + N] = a_mbase_x;
1667
0
      }
1668
0
      return;
1669
0
    }
1670
1671
4.40M
    __m256i shift =
1672
4.40M
        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1673
1674
13.2M
    for (int j = 0; j < 32; j += 16) {
1675
8.81M
      int mdif = max_base_x - (base + j);
1676
8.81M
      if (mdif <= 0) {
1677
334
        res = a_mbase_x;
1678
8.81M
      } else {
1679
8.81M
        a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
1680
8.81M
        a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
1681
1682
8.81M
        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1683
8.81M
        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1684
8.81M
        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1685
8.81M
        b = _mm256_mullo_epi16(diff, shift);
1686
1687
8.81M
        res = _mm256_add_epi16(a32, b);
1688
8.81M
        res = _mm256_srli_epi16(res, 5);
1689
1690
8.81M
        base_inc256 = _mm256_setr_epi16(
1691
8.81M
            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1692
8.81M
            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1693
8.81M
            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1694
8.81M
            base + j + 13, base + j + 14, base + j + 15);
1695
1696
8.81M
        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1697
8.81M
        res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1698
8.81M
      }
1699
8.81M
      if (!j) {
1700
4.40M
        dstvec[r] = res;
1701
4.40M
      } else {
1702
4.40M
        dstvec[r + N] = res;
1703
4.40M
      }
1704
8.81M
    }
1705
4.40M
    x += dx;
1706
4.40M
  }
1707
165k
}
1708
1709
static void highbd_dr_prediction_z1_32xN_avx2(int N, uint16_t *dst,
1710
                                              ptrdiff_t stride,
1711
                                              const uint16_t *above,
1712
                                              int upsample_above, int dx,
1713
63.8k
                                              int bd) {
1714
63.8k
  __m256i dstvec[128];
1715
63.8k
  if (bd < 12) {
1716
56.2k
    highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above,
1717
56.2k
                                               dx);
1718
56.2k
  } else {
1719
7.57k
    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(N, dstvec, above,
1720
7.57k
                                                     upsample_above, dx);
1721
7.57k
  }
1722
1.70M
  for (int i = 0; i < N; i++) {
1723
1.64M
    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
1724
1.64M
    _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]);
1725
1.64M
  }
1726
63.8k
}
1727
1728
static void highbd_dr_prediction_32bit_z1_64xN_avx2(int N, uint16_t *dst,
1729
                                                    ptrdiff_t stride,
1730
                                                    const uint16_t *above,
1731
                                                    int upsample_above,
1732
7.97k
                                                    int dx) {
1733
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1734
7.97k
  (void)upsample_above;
1735
7.97k
  const int frac_bits = 6;
1736
7.97k
  const int max_base_x = ((64 + N) - 1);
1737
1738
  // pre-filter above pixels
1739
  // store in temp buffers:
1740
  //   above[x] * 32 + 16
1741
  //   above[x+1] - above[x]
1742
  // final pixels will be calculated as:
1743
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1744
7.97k
  __m256i a0, a0_1, a1, a1_1, a32, a16;
1745
7.97k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1746
1747
7.97k
  a16 = _mm256_set1_epi32(16);
1748
7.97k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1749
7.97k
  max_base_x256 = _mm256_set1_epi16(max_base_x);
1750
1751
7.97k
  int x = dx;
1752
470k
  for (int r = 0; r < N; r++, dst += stride) {
1753
462k
    __m256i b, res[2], res1;
1754
1755
462k
    int base = x >> frac_bits;
1756
462k
    if (base >= max_base_x) {
1757
0
      for (int i = r; i < N; ++i) {
1758
0
        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 64 values
1759
0
        _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
1760
0
        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
1761
0
        _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
1762
0
        dst += stride;
1763
0
      }
1764
0
      return;
1765
0
    }
1766
1767
462k
    __m256i shift = _mm256_srli_epi32(
1768
462k
        _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1769
1770
462k
    __m128i a0_128, a0_1_128, a1_128, a1_1_128;
1771
2.30M
    for (int j = 0; j < 64; j += 16) {
1772
1.84M
      int mdif = max_base_x - (base + j);
1773
1.84M
      if (mdif <= 0) {
1774
1.89k
        _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
1775
1.84M
      } else {
1776
1.84M
        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
1777
1.84M
        a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
1778
1.84M
        a0 = _mm256_cvtepu16_epi32(a0_128);
1779
1.84M
        a1 = _mm256_cvtepu16_epi32(a1_128);
1780
1781
1.84M
        diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1782
1.84M
        a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1783
1.84M
        a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1784
1.84M
        b = _mm256_mullo_epi32(diff, shift);
1785
1786
1.84M
        res[0] = _mm256_add_epi32(a32, b);
1787
1.84M
        res[0] = _mm256_srli_epi32(res[0], 5);
1788
1.84M
        res[0] = _mm256_packus_epi32(
1789
1.84M
            res[0],
1790
1.84M
            _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1791
1.84M
        if (mdif > 8) {
1792
1.84M
          a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
1793
1.84M
          a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
1794
1.84M
          a0_1 = _mm256_cvtepu16_epi32(a0_1_128);
1795
1.84M
          a1_1 = _mm256_cvtepu16_epi32(a1_1_128);
1796
1797
1.84M
          diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
1798
1.84M
          a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
1799
1.84M
          a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
1800
1.84M
          b = _mm256_mullo_epi32(diff, shift);
1801
1802
1.84M
          res[1] = _mm256_add_epi32(a32, b);
1803
1.84M
          res[1] = _mm256_srli_epi32(res[1], 5);
1804
1.84M
          res[1] = _mm256_packus_epi32(
1805
1.84M
              res[1],
1806
1.84M
              _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1807
1.84M
        } else {
1808
2.98k
          res[1] = a_mbase_x;
1809
2.98k
        }
1810
1.84M
        res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1811
1.84M
                                       1);  // 16 16bit values
1812
1.84M
        base_inc256 = _mm256_setr_epi16(
1813
1.84M
            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1814
1.84M
            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1815
1.84M
            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1816
1.84M
            base + j + 13, base + j + 14, base + j + 15);
1817
1818
1.84M
        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1819
1.84M
        res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1820
1.84M
        _mm256_storeu_si256((__m256i *)(dst + j), res1);
1821
1.84M
      }
1822
1.84M
    }
1823
462k
    x += dx;
1824
462k
  }
1825
7.97k
}
1826
1827
static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst,
1828
                                              ptrdiff_t stride,
1829
                                              const uint16_t *above,
1830
31.2k
                                              int upsample_above, int dx) {
1831
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1832
31.2k
  (void)upsample_above;
1833
31.2k
  const int frac_bits = 6;
1834
31.2k
  const int max_base_x = ((64 + N) - 1);
1835
1836
  // pre-filter above pixels
1837
  // store in temp buffers:
1838
  //   above[x] * 32 + 16
1839
  //   above[x+1] - above[x]
1840
  // final pixels will be calculated as:
1841
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1842
31.2k
  __m256i a0, a1, a32, a16, c3f;
1843
31.2k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1844
1845
31.2k
  a16 = _mm256_set1_epi16(16);
1846
31.2k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1847
31.2k
  max_base_x256 = _mm256_set1_epi16(max_base_x);
1848
31.2k
  c3f = _mm256_set1_epi16(0x3f);
1849
1850
31.2k
  int x = dx;
1851
1.57M
  for (int r = 0; r < N; r++, dst += stride) {
1852
1.54M
    __m256i b, res;
1853
1854
1.54M
    int base = x >> frac_bits;
1855
1.54M
    if (base >= max_base_x) {
1856
0
      for (int i = r; i < N; ++i) {
1857
0
        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 64 values
1858
0
        _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
1859
0
        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
1860
0
        _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
1861
0
        dst += stride;
1862
0
      }
1863
0
      return;
1864
0
    }
1865
1866
1.54M
    __m256i shift =
1867
1.54M
        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1868
1869
7.71M
    for (int j = 0; j < 64; j += 16) {
1870
6.17M
      int mdif = max_base_x - (base + j);
1871
6.17M
      if (mdif <= 0) {
1872
2.50k
        _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
1873
6.17M
      } else {
1874
6.17M
        a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
1875
6.17M
        a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
1876
1877
6.17M
        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1878
6.17M
        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1879
6.17M
        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1880
6.17M
        b = _mm256_mullo_epi16(diff, shift);
1881
1882
6.17M
        res = _mm256_add_epi16(a32, b);
1883
6.17M
        res = _mm256_srli_epi16(res, 5);
1884
1885
6.17M
        base_inc256 = _mm256_setr_epi16(
1886
6.17M
            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1887
6.17M
            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1888
6.17M
            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1889
6.17M
            base + j + 13, base + j + 14, base + j + 15);
1890
1891
6.17M
        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1892
6.17M
        res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1893
6.17M
        _mm256_storeu_si256((__m256i *)(dst + j), res);  // 16 16bit values
1894
6.17M
      }
1895
6.17M
    }
1896
1.54M
    x += dx;
1897
1.54M
  }
1898
31.2k
}
1899
1900
// Directional prediction, zone 1: 0 < angle < 90
1901
void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
1902
                                      int bh, const uint16_t *above,
1903
                                      const uint16_t *left, int upsample_above,
1904
628k
                                      int dx, int dy, int bd) {
1905
628k
  (void)left;
1906
628k
  (void)dy;
1907
1908
628k
  switch (bw) {
1909
153k
    case 4:
1910
153k
      highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above,
1911
153k
                                       dx, bd);
1912
153k
      break;
1913
205k
    case 8:
1914
205k
      highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above,
1915
205k
                                       dx, bd);
1916
205k
      break;
1917
191k
    case 16:
1918
191k
      highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above,
1919
191k
                                        dx, bd);
1920
191k
      break;
1921
62.2k
    case 32:
1922
62.2k
      highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above,
1923
62.2k
                                        dx, bd);
1924
62.2k
      break;
1925
15.1k
    case 64:
1926
15.1k
      if (bd < 12) {
1927
9.47k
        highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above,
1928
9.47k
                                          upsample_above, dx);
1929
9.47k
      } else {
1930
5.62k
        highbd_dr_prediction_32bit_z1_64xN_avx2(bh, dst, stride, above,
1931
5.62k
                                                upsample_above, dx);
1932
5.62k
      }
1933
15.1k
      break;
1934
0
    default: break;
1935
628k
  }
1936
628k
  return;
1937
628k
}
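A hedged usage sketch of the dispatcher above (not taken from the library's tests; the buffer sizes and dx value are illustrative and chosen generously):

// Predict an 8x8 10-bit block; assumes the av1_highbd_dr_prediction_z1_avx2
// prototype from config/aom_dsp_rtcd.h is visible, as it is in this file.
static void z1_usage_sketch(void) {
  uint16_t above[96], left[96], dst[8 * 8];
  for (int i = 0; i < 96; ++i) above[i] = left[i] = (uint16_t)(i & 1023);
  // bw = bh = 8, no edge upsampling; dx = 32 stands in for an angle-derived
  // step (dy is unused in zone 1).
  av1_highbd_dr_prediction_z1_avx2(dst, /*stride=*/8, /*bw=*/8, /*bh=*/8,
                                   above, left, /*upsample_above=*/0,
                                   /*dx=*/32, /*dy=*/0, /*bd=*/10);
  (void)dst;
}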
1938
1939
static void highbd_transpose_TX_16X16(const uint16_t *src, ptrdiff_t pitchSrc,
1940
321k
                                      uint16_t *dst, ptrdiff_t pitchDst) {
1941
321k
  __m256i r[16];
1942
321k
  __m256i d[16];
1943
5.46M
  for (int j = 0; j < 16; j++) {
1944
5.14M
    r[j] = _mm256_loadu_si256((__m256i *)(src + j * pitchSrc));
1945
5.14M
  }
1946
321k
  highbd_transpose16x16_avx2(r, d);
1947
5.46M
  for (int j = 0; j < 16; j++) {
1948
5.14M
    _mm256_storeu_si256((__m256i *)(dst + j * pitchDst), d[j]);
1949
5.14M
  }
1950
321k
}
1951
1952
static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc,
1953
                             uint16_t *dst, ptrdiff_t pitchDst, int width,
1954
25.7k
                             int height) {
1955
125k
  for (int j = 0; j < height; j += 16)
1956
421k
    for (int i = 0; i < width; i += 16)
1957
321k
      highbd_transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
1958
321k
                                dst + j * pitchDst + i, pitchDst);
1959
25.7k
}
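Stated element-wise, the tiled routine above computes a plain transpose; a scalar sketch of the same contract (illustration only, not a library function):

static void highbd_transpose_scalar_sketch(const uint16_t *src,
                                           ptrdiff_t pitchSrc, uint16_t *dst,
                                           ptrdiff_t pitchDst, int width,
                                           int height) {
  // Equivalent to the 16x16-tiled AVX2 version: dst[y][x] = src[x][y].
  for (int y = 0; y < height; ++y)
    for (int x = 0; x < width; ++x)
      dst[y * pitchDst + x] = src[x * pitchSrc + y];
}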
1960
1961
static void highbd_dr_prediction_32bit_z2_Nx4_avx2(
1962
    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
1963
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
1964
311k
    int dy) {
1965
311k
  const int min_base_x = -(1 << upsample_above);
1966
311k
  const int min_base_y = -(1 << upsample_left);
1967
311k
  const int frac_bits_x = 6 - upsample_above;
1968
311k
  const int frac_bits_y = 6 - upsample_left;
1969
1970
311k
  assert(dx > 0);
1971
  // pre-filter above pixels
1972
  // store in temp buffers:
1973
  //   above[x] * 32 + 16
1974
  //   above[x+1] - above[x]
1975
  // final pixels will be calculated as:
1976
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1977
0
  __m256i a0_x, a1_x, a32, a16;
1978
311k
  __m256i diff;
1979
311k
  __m128i c3f, min_base_y128;
1980
1981
311k
  a16 = _mm256_set1_epi32(16);
1982
311k
  c3f = _mm_set1_epi32(0x3f);
1983
311k
  min_base_y128 = _mm_set1_epi32(min_base_y);
1984
1985
1.81M
  for (int r = 0; r < N; r++) {
1986
1.50M
    __m256i b, res, shift;
1987
1.50M
    __m128i resx, resy, resxy;
1988
1.50M
    __m128i a0_x128, a1_x128;
1989
1.50M
    int y = r + 1;
1990
1.50M
    int base_x = (-y * dx) >> frac_bits_x;
1991
1.50M
    int base_shift = 0;
1992
1.50M
    if (base_x < (min_base_x - 1)) {
1993
1.25M
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
1994
1.25M
    }
1995
1.50M
    int base_min_diff =
1996
1.50M
        (min_base_x - base_x + upsample_above) >> upsample_above;
1997
1.50M
    if (base_min_diff > 4) {
1998
1.04M
      base_min_diff = 4;
1999
1.04M
    } else {
2000
457k
      if (base_min_diff < 0) base_min_diff = 0;
2001
457k
    }
2002
2003
1.50M
    if (base_shift > 3) {
2004
1.04M
      a0_x = _mm256_setzero_si256();
2005
1.04M
      a1_x = _mm256_setzero_si256();
2006
1.04M
      shift = _mm256_setzero_si256();
2007
1.04M
    } else {
2008
457k
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2009
457k
      if (upsample_above) {
2010
108k
        a0_x128 = _mm_shuffle_epi8(a0_x128,
2011
108k
                                   *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
2012
108k
        a1_x128 = _mm_srli_si128(a0_x128, 8);
2013
2014
108k
        shift = _mm256_castsi128_si256(_mm_srli_epi32(
2015
108k
            _mm_and_si128(
2016
108k
                _mm_slli_epi32(
2017
108k
                    _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
2018
108k
                                   (2 << 6) - y * dx, (3 << 6) - y * dx),
2019
108k
                    upsample_above),
2020
108k
                c3f),
2021
108k
            1));
2022
348k
      } else {
2023
348k
        a0_x128 =
2024
348k
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2025
348k
        a1_x128 = _mm_srli_si128(a0_x128, 2);
2026
2027
348k
        shift = _mm256_castsi128_si256(_mm_srli_epi32(
2028
348k
            _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
2029
348k
                                         (2 << 6) - y * dx, (3 << 6) - y * dx),
2030
348k
                          c3f),
2031
348k
            1));
2032
348k
      }
2033
457k
      a0_x = _mm256_cvtepu16_epi32(a0_x128);
2034
457k
      a1_x = _mm256_cvtepu16_epi32(a1_x128);
2035
457k
    }
2036
    // y calc
2037
1.50M
    __m128i a0_y, a1_y, shifty;
2038
1.50M
    if (base_x < min_base_x) {
2039
1.35M
      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2040
1.35M
      DECLARE_ALIGNED(32, int, base_y_c[4]);
2041
1.35M
      r6 = _mm_set1_epi32(r << 6);
2042
1.35M
      dy128 = _mm_set1_epi32(dy);
2043
1.35M
      c1234 = _mm_setr_epi32(1, 2, 3, 4);
2044
1.35M
      y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
2045
1.35M
      base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
2046
1.35M
      mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
2047
1.35M
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2048
1.35M
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2049
2050
1.35M
      a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
2051
1.35M
                            left[base_y_c[2]], left[base_y_c[3]]);
2052
1.35M
      a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2053
1.35M
                            left[base_y_c[2] + 1], left[base_y_c[3] + 1]);
2054
2055
1.35M
      if (upsample_left) {
2056
292k
        shifty = _mm_srli_epi32(
2057
292k
            _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
2058
1.05M
      } else {
2059
1.05M
        shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
2060
1.05M
      }
2061
1.35M
      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2062
1.35M
      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2063
1.35M
      shift = _mm256_inserti128_si256(shift, shifty, 1);
2064
1.35M
    }
2065
2066
1.50M
    diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
2067
1.50M
    a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
2068
1.50M
    a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2069
2070
1.50M
    b = _mm256_mullo_epi32(diff, shift);
2071
1.50M
    res = _mm256_add_epi32(a32, b);
2072
1.50M
    res = _mm256_srli_epi32(res, 5);
2073
2074
1.50M
    resx = _mm256_castsi256_si128(res);
2075
1.50M
    resx = _mm_packus_epi32(resx, resx);
2076
2077
1.50M
    resy = _mm256_extracti128_si256(res, 1);
2078
1.50M
    resy = _mm_packus_epi32(resy, resy);
2079
2080
1.50M
    resxy =
2081
1.50M
        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2082
1.50M
    _mm_storel_epi64((__m128i *)(dst), resxy);
2083
1.50M
    dst += stride;
2084
1.50M
  }
2085
311k
}
2086
2087
static void highbd_dr_prediction_z2_Nx4_avx2(
2088
    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2089
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
2090
137k
    int dy) {
2091
137k
  const int min_base_x = -(1 << upsample_above);
2092
137k
  const int min_base_y = -(1 << upsample_left);
2093
137k
  const int frac_bits_x = 6 - upsample_above;
2094
137k
  const int frac_bits_y = 6 - upsample_left;
2095
2096
137k
  assert(dx > 0);
2097
  // pre-filter above pixels
2098
  // store in temp buffers:
2099
  //   above[x] * 32 + 16
2100
  //   above[x+1] - above[x]
2101
  // final pixels will be calculated as:
2102
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2103
0
  __m256i a0_x, a1_x, a32, a16;
2104
137k
  __m256i diff;
2105
137k
  __m128i c3f, min_base_y128;
2106
2107
137k
  a16 = _mm256_set1_epi16(16);
2108
137k
  c3f = _mm_set1_epi16(0x3f);
2109
137k
  min_base_y128 = _mm_set1_epi16(min_base_y);
2110
2111
982k
  for (int r = 0; r < N; r++) {
2112
844k
    __m256i b, res, shift;
2113
844k
    __m128i resx, resy, resxy;
2114
844k
    __m128i a0_x128, a1_x128;
2115
844k
    int y = r + 1;
2116
844k
    int base_x = (-y * dx) >> frac_bits_x;
2117
844k
    int base_shift = 0;
2118
844k
    if (base_x < (min_base_x - 1)) {
2119
604k
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
2120
604k
    }
2121
844k
    int base_min_diff =
2122
844k
        (min_base_x - base_x + upsample_above) >> upsample_above;
2123
844k
    if (base_min_diff > 4) {
2124
372k
      base_min_diff = 4;
2125
472k
    } else {
2126
472k
      if (base_min_diff < 0) base_min_diff = 0;
2127
472k
    }
2128
2129
844k
    if (base_shift > 3) {
2130
372k
      a0_x = _mm256_setzero_si256();
2131
372k
      a1_x = _mm256_setzero_si256();
2132
372k
      shift = _mm256_setzero_si256();
2133
472k
    } else {
2134
472k
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2135
472k
      if (upsample_above) {
2136
152k
        a0_x128 = _mm_shuffle_epi8(a0_x128,
2137
152k
                                   *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
2138
152k
        a1_x128 = _mm_srli_si128(a0_x128, 8);
2139
2140
152k
        shift = _mm256_castsi128_si256(_mm_srli_epi16(
2141
152k
            _mm_and_si128(
2142
152k
                _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2143
152k
                                              (2 << 6) - y * dx,
2144
152k
                                              (3 << 6) - y * dx, 0, 0, 0, 0),
2145
152k
                               upsample_above),
2146
152k
                c3f),
2147
152k
            1));
2148
320k
      } else {
2149
320k
        a0_x128 =
2150
320k
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2151
320k
        a1_x128 = _mm_srli_si128(a0_x128, 2);
2152
2153
320k
        shift = _mm256_castsi128_si256(_mm_srli_epi16(
2154
320k
            _mm_and_si128(
2155
320k
                _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
2156
320k
                               (3 << 6) - y * dx, 0, 0, 0, 0),
2157
320k
                c3f),
2158
320k
            1));
2159
320k
      }
2160
472k
      a0_x = _mm256_castsi128_si256(a0_x128);
2161
472k
      a1_x = _mm256_castsi128_si256(a1_x128);
2162
472k
    }
2163
    // y calc
2164
844k
    __m128i a0_y, a1_y, shifty;
2165
844k
    if (base_x < min_base_x) {
2166
690k
      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2167
690k
      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
2168
690k
      r6 = _mm_set1_epi16(r << 6);
2169
690k
      dy128 = _mm_set1_epi16(dy);
2170
690k
      c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
2171
690k
      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
2172
690k
      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
2173
690k
      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
2174
690k
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2175
690k
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2176
2177
690k
      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
2178
690k
                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
2179
690k
      a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2180
690k
                            left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0,
2181
690k
                            0, 0);
2182
2183
690k
      if (upsample_left) {
2184
216k
        shifty = _mm_srli_epi16(
2185
216k
            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
2186
474k
      } else {
2187
474k
        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
2188
474k
      }
2189
690k
      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2190
690k
      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2191
690k
      shift = _mm256_inserti128_si256(shift, shifty, 1);
2192
690k
    }
2193
2194
844k
    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
2195
844k
    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
2196
844k
    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2197
2198
844k
    b = _mm256_mullo_epi16(diff, shift);
2199
844k
    res = _mm256_add_epi16(a32, b);
2200
844k
    res = _mm256_srli_epi16(res, 5);
2201
2202
844k
    resx = _mm256_castsi256_si128(res);
2203
844k
    resy = _mm256_extracti128_si256(res, 1);
2204
844k
    resxy =
2205
844k
        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2206
844k
    _mm_storel_epi64((__m128i *)(dst), resxy);
2207
844k
    dst += stride;
2208
844k
  }
2209
137k
}
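Both Nx4 zone-2 kernels above pick, per pixel, between an interpolation along the above row and one along the left column; a per-pixel scalar sketch of that selection (no upsampling, frac_bits = 6; names are illustrative, derived from the vector code above):

static uint16_t z2_pixel_sketch(const uint16_t *above, const uint16_t *left,
                                int r, int c, int dx, int dy) {
  const int sx = (c << 6) - (r + 1) * dx;  // position along the above row
  const int base_x = sx >> 6;
  if (base_x >= -1) {  // min_base_x == -1 without upsampling
    const int shift = (sx & 0x3f) >> 1;
    return (uint16_t)((above[base_x] * 32 + 16 +
                       (above[base_x + 1] - above[base_x]) * shift) >> 5);
  }
  // Otherwise fall back to the left edge.
  const int sy = (r << 6) - (c + 1) * dy;
  int base_y = sy >> 6;
  if (base_y < -1) base_y = 0;  // mirrors the _mm_andnot_si128 clamp above
  const int shift = (sy & 0x3f) >> 1;
  return (uint16_t)((left[base_y] * 32 + 16 +
                     (left[base_y + 1] - left[base_y]) * shift) >> 5);
}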
2210
2211
static void highbd_dr_prediction_32bit_z2_Nx8_avx2(
2212
    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2213
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
2214
289k
    int dy) {
2215
289k
  const int min_base_x = -(1 << upsample_above);
2216
289k
  const int min_base_y = -(1 << upsample_left);
2217
289k
  const int frac_bits_x = 6 - upsample_above;
2218
289k
  const int frac_bits_y = 6 - upsample_left;
2219
2220
  // pre-filter above pixels
2221
  // store in temp buffers:
2222
  //   above[x] * 32 + 16
2223
  //   above[x+1] - above[x]
2224
  // final pixels will be calculated as:
2225
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2226
289k
  __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256;
2227
289k
  __m256i diff;
2228
289k
  __m128i a0_x128, a1_x128;
2229
2230
289k
  a16 = _mm256_set1_epi32(16);
2231
289k
  c3f = _mm256_set1_epi32(0x3f);
2232
289k
  min_base_y256 = _mm256_set1_epi32(min_base_y);
2233
2234
3.04M
  for (int r = 0; r < N; r++) {
2235
2.75M
    __m256i b, res, shift;
2236
2.75M
    __m128i resx, resy, resxy;
2237
2.75M
    int y = r + 1;
2238
2.75M
    int base_x = (-y * dx) >> frac_bits_x;
2239
2.75M
    int base_shift = 0;
2240
2.75M
    if (base_x < (min_base_x - 1)) {
2241
2.09M
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
2242
2.09M
    }
2243
2.75M
    int base_min_diff =
2244
2.75M
        (min_base_x - base_x + upsample_above) >> upsample_above;
2245
2.75M
    if (base_min_diff > 8) {
2246
1.29M
      base_min_diff = 8;
2247
1.46M
    } else {
2248
1.46M
      if (base_min_diff < 0) base_min_diff = 0;
2249
1.46M
    }
2250
2251
2.75M
    if (base_shift > 7) {
2252
1.29M
      resx = _mm_setzero_si128();
2253
1.46M
    } else {
2254
1.46M
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2255
1.46M
      if (upsample_above) {
2256
104k
        __m128i mask, atmp0, atmp1, atmp2, atmp3;
2257
104k
        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
2258
104k
        atmp0 = _mm_shuffle_epi8(a0_x128,
2259
104k
                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2260
104k
        atmp1 = _mm_shuffle_epi8(a1_x128,
2261
104k
                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2262
104k
        atmp2 = _mm_shuffle_epi8(
2263
104k
            a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2264
104k
        atmp3 = _mm_shuffle_epi8(
2265
104k
            a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2266
104k
        mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
2267
104k
                              _mm_set1_epi8(15));
2268
104k
        a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
2269
104k
        mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
2270
104k
                              _mm_set1_epi8(15));
2271
104k
        a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
2272
104k
        shift = _mm256_srli_epi32(
2273
104k
            _mm256_and_si256(
2274
104k
                _mm256_slli_epi32(
2275
104k
                    _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx,
2276
104k
                                      (2 << 6) - y * dx, (3 << 6) - y * dx,
2277
104k
                                      (4 << 6) - y * dx, (5 << 6) - y * dx,
2278
104k
                                      (6 << 6) - y * dx, (7 << 6) - y * dx),
2279
104k
                    upsample_above),
2280
104k
                c3f),
2281
104k
            1);
2282
1.35M
      } else {
2283
1.35M
        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
2284
1.35M
        a0_x128 =
2285
1.35M
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2286
1.35M
        a1_x128 =
2287
1.35M
            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2288
2289
1.35M
        shift = _mm256_srli_epi32(
2290
1.35M
            _mm256_and_si256(
2291
1.35M
                _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
2292
1.35M
                                  (3 << 6) - y * dx, (4 << 6) - y * dx,
2293
1.35M
                                  (5 << 6) - y * dx, (6 << 6) - y * dx,
2294
1.35M
                                  (7 << 6) - y * dx),
2295
1.35M
                c3f),
2296
1.35M
            1);
2297
1.35M
      }
2298
1.46M
      a0_x = _mm256_cvtepu16_epi32(a0_x128);
2299
1.46M
      a1_x = _mm256_cvtepu16_epi32(a1_x128);
2300
2301
1.46M
      diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
2302
1.46M
      a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
2303
1.46M
      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2304
2305
1.46M
      b = _mm256_mullo_epi32(diff, shift);
2306
1.46M
      res = _mm256_add_epi32(a32, b);
2307
1.46M
      res = _mm256_srli_epi32(res, 5);
2308
2309
1.46M
      resx = _mm256_castsi256_si128(_mm256_packus_epi32(
2310
1.46M
          res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
2311
1.46M
    }
2312
    // y calc
2313
2.75M
    if (base_x < min_base_x) {
2314
2.33M
      DECLARE_ALIGNED(32, int, base_y_c[8]);
2315
2.33M
      __m256i r6, c256, dy256, y_c256, base_y_c256, mask256;
2316
2.33M
      r6 = _mm256_set1_epi32(r << 6);
2317
2.33M
      dy256 = _mm256_set1_epi32(dy);
2318
2.33M
      c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
2319
2.33M
      y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2320
2.33M
      base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
2321
2.33M
      mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2322
2.33M
      base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2323
2.33M
      _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2324
2325
2.33M
      a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2326
2.33M
          left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2327
2.33M
          left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2328
2.33M
          left[base_y_c[6]], left[base_y_c[7]]));
2329
2.33M
      a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2330
2.33M
          left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
2331
2.33M
          left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2332
2.33M
          left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
2333
2334
2.33M
      if (upsample_left) {
2335
81.7k
        shift = _mm256_srli_epi32(
2336
81.7k
            _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f),
2337
81.7k
            1);
2338
2.25M
      } else {
2339
2.25M
        shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
2340
2.25M
      }
2341
2.33M
      diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
2342
2.33M
      a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
2343
2.33M
      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2344
2345
2.33M
      b = _mm256_mullo_epi32(diff, shift);
2346
2.33M
      res = _mm256_add_epi32(a32, b);
2347
2.33M
      res = _mm256_srli_epi32(res, 5);
2348
2349
2.33M
      resy = _mm256_castsi256_si128(_mm256_packus_epi32(
2350
2.33M
          res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
2351
2.33M
    } else {
2352
421k
      resy = resx;
2353
421k
    }
2354
2.75M
    resxy =
2355
2.75M
        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2356
2.75M
    _mm_storeu_si128((__m128i *)(dst), resxy);
2357
2.75M
    dst += stride;
2358
2.75M
  }
2359
289k
}
2360
2361
static void highbd_dr_prediction_z2_Nx8_avx2(
2362
    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2363
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
2364
132k
    int dy) {
2365
132k
  const int min_base_x = -(1 << upsample_above);
2366
132k
  const int min_base_y = -(1 << upsample_left);
2367
132k
  const int frac_bits_x = 6 - upsample_above;
2368
132k
  const int frac_bits_y = 6 - upsample_left;
2369
2370
  // pre-filter above pixels
2371
  // store in temp buffers:
2372
  //   above[x] * 32 + 16
2373
  //   above[x+1] - above[x]
2374
  // final pixels will be calculated as:
2375
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2376
132k
  __m128i c3f, min_base_y128;
2377
132k
  __m256i a0_x, a1_x, diff, a32, a16;
2378
132k
  __m128i a0_x128, a1_x128;
2379
2380
132k
  a16 = _mm256_set1_epi16(16);
2381
132k
  c3f = _mm_set1_epi16(0x3f);
2382
132k
  min_base_y128 = _mm_set1_epi16(min_base_y);
2383
2384
1.56M
  for (int r = 0; r < N; r++) {
2385
1.43M
    __m256i b, res, shift;
2386
1.43M
    __m128i resx, resy, resxy;
2387
1.43M
    int y = r + 1;
2388
1.43M
    int base_x = (-y * dx) >> frac_bits_x;
2389
1.43M
    int base_shift = 0;
2390
1.43M
    if (base_x < (min_base_x - 1)) {
2391
1.08M
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
2392
1.08M
    }
2393
1.43M
    int base_min_diff =
2394
1.43M
        (min_base_x - base_x + upsample_above) >> upsample_above;
2395
1.43M
    if (base_min_diff > 8) {
2396
725k
      base_min_diff = 8;
2397
725k
    } else {
2398
705k
      if (base_min_diff < 0) base_min_diff = 0;
2399
705k
    }
2400
2401
1.43M
    if (base_shift > 7) {
2402
725k
      a0_x = _mm256_setzero_si256();
2403
725k
      a1_x = _mm256_setzero_si256();
2404
725k
      shift = _mm256_setzero_si256();
2405
725k
    } else {
2406
705k
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2407
705k
      if (upsample_above) {
2408
189k
        __m128i mask, atmp0, atmp1, atmp2, atmp3;
2409
189k
        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
2410
189k
        atmp0 = _mm_shuffle_epi8(a0_x128,
2411
189k
                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2412
189k
        atmp1 = _mm_shuffle_epi8(a1_x128,
2413
189k
                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2414
189k
        atmp2 = _mm_shuffle_epi8(
2415
189k
            a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2416
189k
        atmp3 = _mm_shuffle_epi8(
2417
189k
            a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2418
189k
        mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
2419
189k
                              _mm_set1_epi8(15));
2420
189k
        a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
2421
189k
        mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
2422
189k
                              _mm_set1_epi8(15));
2423
189k
        a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
2424
2425
189k
        shift = _mm256_castsi128_si256(_mm_srli_epi16(
2426
189k
            _mm_and_si128(
2427
189k
                _mm_slli_epi16(
2428
189k
                    _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2429
189k
                                   (2 << 6) - y * dx, (3 << 6) - y * dx,
2430
189k
                                   (4 << 6) - y * dx, (5 << 6) - y * dx,
2431
189k
                                   (6 << 6) - y * dx, (7 << 6) - y * dx),
2432
189k
                    upsample_above),
2433
189k
                c3f),
2434
189k
            1));
2435
515k
      } else {
2436
515k
        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
2437
515k
        a0_x128 =
2438
515k
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2439
515k
        a1_x128 =
2440
515k
            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2441
2442
515k
        shift = _mm256_castsi128_si256(_mm_srli_epi16(
2443
515k
            _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2444
515k
                                         (2 << 6) - y * dx, (3 << 6) - y * dx,
2445
515k
                                         (4 << 6) - y * dx, (5 << 6) - y * dx,
2446
515k
                                         (6 << 6) - y * dx, (7 << 6) - y * dx),
2447
515k
                          c3f),
2448
515k
            1));
2449
515k
      }
2450
705k
      a0_x = _mm256_castsi128_si256(a0_x128);
2451
705k
      a1_x = _mm256_castsi128_si256(a1_x128);
2452
705k
    }
2453
2454
    // y calc
2455
1.43M
    __m128i a0_y, a1_y, shifty;
2456
1.43M
    if (base_x < min_base_x) {
2457
1.19M
      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
2458
1.19M
      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2459
1.19M
      r6 = _mm_set1_epi16(r << 6);
2460
1.19M
      dy128 = _mm_set1_epi16(dy);
2461
1.19M
      c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
2462
1.19M
      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
2463
1.19M
      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
2464
1.19M
      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
2465
1.19M
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2466
1.19M
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2467
2468
1.19M
      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
2469
1.19M
                            left[base_y_c[2]], left[base_y_c[3]],
2470
1.19M
                            left[base_y_c[4]], left[base_y_c[5]],
2471
1.19M
                            left[base_y_c[6]], left[base_y_c[7]]);
2472
1.19M
      a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2473
1.19M
                            left[base_y_c[2] + 1], left[base_y_c[3] + 1],
2474
1.19M
                            left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2475
1.19M
                            left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
2476
2477
1.19M
      if (upsample_left) {
2478
281k
        shifty = _mm_srli_epi16(
2479
281k
            _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1);
2480
916k
      } else {
2481
916k
        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
2482
916k
      }
2483
1.19M
      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2484
1.19M
      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2485
1.19M
      shift = _mm256_inserti128_si256(shift, shifty, 1);
2486
1.19M
    }
2487
2488
1.43M
    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
2489
1.43M
    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
2490
1.43M
    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2491
2492
1.43M
    b = _mm256_mullo_epi16(diff, shift);
2493
1.43M
    res = _mm256_add_epi16(a32, b);
2494
1.43M
    res = _mm256_srli_epi16(res, 5);
2495
2496
1.43M
    resx = _mm256_castsi256_si128(res);
2497
1.43M
    resy = _mm256_extracti128_si256(res, 1);
2498
2499
1.43M
    resxy =
2500
1.43M
        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2501
1.43M
    _mm_storeu_si128((__m128i *)(dst), resxy);
2502
1.43M
    dst += stride;
2503
1.43M
  }
2504
132k
}
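A layout sketch for the zone-2 Nx8 kernel ending here, inferred from the loop body above (HighbdBaseMask is not shown in this part of the listing; its role is read off from how it is used):

// Per-row register layout in the loop above (sketch):
//   low 128-bit lane:  a0_x / a1_x / shift   -> resx, interpolated from `above`
//   high 128-bit lane: a0_y / a1_y / shifty  -> resy, interpolated from `left`
// One 256-bit multiply/add evaluates both candidates at once, and the final
// _mm_blendv_epi8 with HighbdBaseMask[base_min_diff] takes the left-edge
// result for the first base_min_diff columns of the row and the above-edge
// result for the rest.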
2505
2506
static void highbd_dr_prediction_32bit_z2_HxW_avx2(
2507
    int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2508
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
2509
169k
    int dy) {
2510
  // here upsample_above and upsample_left are 0 by design of
2511
  // av1_use_intra_edge_upsample
2512
169k
  const int min_base_x = -1;
2513
169k
  const int min_base_y = -1;
2514
169k
  (void)upsample_above;
2515
169k
  (void)upsample_left;
2516
169k
  const int frac_bits_x = 6;
2517
169k
  const int frac_bits_y = 6;
2518
2519
  // pre-filter above pixels
2520
  // store in temp buffers:
2521
  //   above[x] * 32 + 16
2522
  //   above[x+1] - above[x]
2523
  // final pixels will be calculated as:
2524
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2525
169k
  __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16, c1;
2526
169k
  __m256i diff, min_base_y256, c3f, dy256, c1234, c0123, c8;
2527
169k
  __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
2528
169k
  DECLARE_ALIGNED(32, int, base_y_c[16]);
2529
2530
169k
  a16 = _mm256_set1_epi32(16);
2531
169k
  c1 = _mm256_srli_epi32(a16, 4);
2532
169k
  c8 = _mm256_srli_epi32(a16, 1);
2533
169k
  min_base_y256 = _mm256_set1_epi32(min_base_y);
2534
169k
  c3f = _mm256_set1_epi32(0x3f);
2535
169k
  dy256 = _mm256_set1_epi32(dy);
2536
169k
  c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2537
169k
  c1234 = _mm256_add_epi32(c0123, c1);
2538
2539
2.21M
  for (int r = 0; r < H; r++) {
2540
2.04M
    __m256i b, res, shift, ydx;
2541
2.04M
    __m256i resx[2], resy[2];
2542
2.04M
    __m256i resxy, j256, r6;
2543
5.10M
    for (int j = 0; j < W; j += 16) {
2544
3.05M
      j256 = _mm256_set1_epi32(j);
2545
3.05M
      int y = r + 1;
2546
3.05M
      ydx = _mm256_set1_epi32(y * dx);
2547
2548
3.05M
      int base_x = ((j << 6) - y * dx) >> frac_bits_x;
2549
3.05M
      int base_shift = 0;
2550
3.05M
      if ((base_x) < (min_base_x - 1)) {
2551
1.86M
        base_shift = (min_base_x - base_x - 1);
2552
1.86M
      }
2553
3.05M
      int base_min_diff = (min_base_x - base_x);
2554
3.05M
      if (base_min_diff > 16) {
2555
1.09M
        base_min_diff = 16;
2556
1.96M
      } else {
2557
1.96M
        if (base_min_diff < 0) base_min_diff = 0;
2558
1.96M
      }
2559
2560
3.05M
      if (base_shift > 7) {
2561
1.36M
        resx[0] = _mm256_setzero_si256();
2562
1.68M
      } else {
2563
1.68M
        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2564
1.68M
        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
2565
1.68M
        a0_x128 =
2566
1.68M
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2567
1.68M
        a1_x128 =
2568
1.68M
            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2569
2570
1.68M
        a0_x = _mm256_cvtepu16_epi32(a0_x128);
2571
1.68M
        a1_x = _mm256_cvtepu16_epi32(a1_x128);
2572
2573
1.68M
        r6 = _mm256_slli_epi32(_mm256_add_epi32(c0123, j256), 6);
2574
1.68M
        shift = _mm256_srli_epi32(
2575
1.68M
            _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
2576
2577
1.68M
        diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
2578
1.68M
        a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
2579
1.68M
        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2580
2581
1.68M
        b = _mm256_mullo_epi32(diff, shift);
2582
1.68M
        res = _mm256_add_epi32(a32, b);
2583
1.68M
        res = _mm256_srli_epi32(res, 5);
2584
2585
1.68M
        resx[0] = _mm256_packus_epi32(
2586
1.68M
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2587
1.68M
      }
2588
3.05M
      int base_shift8 = 0;
2589
3.05M
      if ((base_x + 8) < (min_base_x - 1)) {
2590
1.33M
        base_shift8 = (min_base_x - (base_x + 8) - 1);
2591
1.33M
      }
2592
3.05M
      if (base_shift8 > 7) {
2593
1.09M
        resx[1] = _mm256_setzero_si256();
2594
1.96M
      } else {
2595
1.96M
        a0_1_x128 =
2596
1.96M
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8));
2597
1.96M
        a1_1_x128 =
2598
1.96M
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9));
2599
1.96M
        a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
2600
1.96M
                                     *(__m128i *)HighbdLoadMaskx[base_shift8]);
2601
1.96M
        a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
2602
1.96M
                                     *(__m128i *)HighbdLoadMaskx[base_shift8]);
2603
2604
1.96M
        a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128);
2605
1.96M
        a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128);
2606
2607
1.96M
        r6 = _mm256_slli_epi32(
2608
1.96M
            _mm256_add_epi32(c0123, _mm256_add_epi32(j256, c8)), 6);
2609
1.96M
        shift = _mm256_srli_epi32(
2610
1.96M
            _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
2611
2612
1.96M
        diff = _mm256_sub_epi32(a1_1_x, a0_1_x);  // a[x+1] - a[x]
2613
1.96M
        a32 = _mm256_slli_epi32(a0_1_x, 5);       // a[x] * 32
2614
1.96M
        a32 = _mm256_add_epi32(a32, a16);         // a[x] * 32 + 16
2615
1.96M
        b = _mm256_mullo_epi32(diff, shift);
2616
2617
1.96M
        resx[1] = _mm256_add_epi32(a32, b);
2618
1.96M
        resx[1] = _mm256_srli_epi32(resx[1], 5);
2619
1.96M
        resx[1] = _mm256_packus_epi32(
2620
1.96M
            resx[1],
2621
1.96M
            _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1)));
2622
1.96M
      }
2623
3.05M
      resx[0] =
2624
3.05M
          _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]),
2625
3.05M
                                  1);  // 16 16bit values
2626
2627
      // y calc
2628
3.05M
      resy[0] = _mm256_setzero_si256();
2629
3.05M
      if ((base_x < min_base_x)) {
2630
2.02M
        __m256i c256, y_c256, y_c_1_256, base_y_c256, mask256;
2631
2.02M
        r6 = _mm256_set1_epi32(r << 6);
2632
2.02M
        c256 = _mm256_add_epi32(j256, c1234);
2633
2.02M
        y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2634
2.02M
        base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
2635
2.02M
        mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2636
2.02M
        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2637
2.02M
        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2638
2.02M
        c256 = _mm256_add_epi32(c256, c8);
2639
2.02M
        y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2640
2.02M
        base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y);
2641
2.02M
        mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2642
2.02M
        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2643
2.02M
        _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256);
2644
2645
2.02M
        a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2646
2.02M
            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2647
2.02M
            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2648
2.02M
            left[base_y_c[6]], left[base_y_c[7]]));
2649
2.02M
        a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2650
2.02M
            left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
2651
2.02M
            left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2652
2.02M
            left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
2653
2654
2.02M
        shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
2655
2656
2.02M
        diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
2657
2.02M
        a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
2658
2.02M
        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2659
2660
2.02M
        b = _mm256_mullo_epi32(diff, shift);
2661
2.02M
        res = _mm256_add_epi32(a32, b);
2662
2.02M
        res = _mm256_srli_epi32(res, 5);
2663
2664
2.02M
        resy[0] = _mm256_packus_epi32(
2665
2.02M
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2666
2667
2.02M
        a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2668
2.02M
            left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]],
2669
2.02M
            left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]],
2670
2.02M
            left[base_y_c[14]], left[base_y_c[15]]));
2671
2.02M
        a1_y = _mm256_cvtepu16_epi32(
2672
2.02M
            _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1],
2673
2.02M
                           left[base_y_c[10] + 1], left[base_y_c[11] + 1],
2674
2.02M
                           left[base_y_c[12] + 1], left[base_y_c[13] + 1],
2675
2.02M
                           left[base_y_c[14] + 1], left[base_y_c[15] + 1]));
2676
2.02M
        shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1);
2677
2678
2.02M
        diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
2679
2.02M
        a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
2680
2.02M
        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2681
2682
2.02M
        b = _mm256_mullo_epi32(diff, shift);
2683
2.02M
        res = _mm256_add_epi32(a32, b);
2684
2.02M
        res = _mm256_srli_epi32(res, 5);
2685
2686
2.02M
        resy[1] = _mm256_packus_epi32(
2687
2.02M
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2688
2689
2.02M
        resy[0] =
2690
2.02M
            _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]),
2691
2.02M
                                    1);  // 16 16bit values
2692
2.02M
      }
2693
2694
3.05M
      resxy = _mm256_blendv_epi8(resx[0], resy[0],
2695
3.05M
                                 *(__m256i *)HighbdBaseMask[base_min_diff]);
2696
3.05M
      _mm256_storeu_si256((__m256i *)(dst + j), resxy);
2697
3.05M
    }  // for j
2698
2.04M
    dst += stride;
2699
2.04M
  }
2700
169k
}
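For reference, a scalar sketch of the per-pixel rule that this HxW kernel (and the 16-bit variant below) vectorizes. Upsampling is 0 by design here, the clamp of base_y at min_base_y is omitted, and dr_z2_pixel_sketch is an illustrative helper, not part of the library:

static uint16_t dr_z2_pixel_sketch(const uint16_t *above, const uint16_t *left,
                                   int r, int c, int dx, int dy) {
  const int min_base_x = -1;
  const int x = (c << 6) - (r + 1) * dx;  // 1/64-pel position on the top edge
  const int base_x = x >> 6;
  if (base_x >= min_base_x) {             // column still covered by `above`
    const int shift = (x & 0x3f) >> 1;
    return (uint16_t)((above[base_x] * 32 + 16 +
                       (above[base_x + 1] - above[base_x]) * shift) >> 5);
  }
  const int y = (r << 6) - (c + 1) * dy;  // otherwise project onto the left edge
  const int base_y = y >> 6;
  const int shift = (y & 0x3f) >> 1;
  return (uint16_t)((left[base_y] * 32 + 16 +
                     (left[base_y + 1] - left[base_y]) * shift) >> 5);
}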
2701
2702
static void highbd_dr_prediction_z2_HxW_avx2(
2703
    int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2704
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
2705
314k
    int dy) {
2706
  // here upsample_above and upsample_left are 0 by design of
2707
  // av1_use_intra_edge_upsample
2708
314k
  const int min_base_x = -1;
2709
314k
  const int min_base_y = -1;
2710
314k
  (void)upsample_above;
2711
314k
  (void)upsample_left;
2712
314k
  const int frac_bits_x = 6;
2713
314k
  const int frac_bits_y = 6;
2714
2715
  // pre-filter above pixels
2716
  // store in temp buffers:
2717
  //   above[x] * 32 + 16
2718
  //   above[x+1] - above[x]
2719
  // final pixels will be calculated as:
2720
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2721
314k
  __m256i a0_x, a1_x, a32, a16, c3f, c1;
2722
314k
  __m256i diff, min_base_y256, dy256, c1234, c0123;
2723
314k
  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
2724
2725
314k
  a16 = _mm256_set1_epi16(16);
2726
314k
  c1 = _mm256_srli_epi16(a16, 4);
2727
314k
  min_base_y256 = _mm256_set1_epi16(min_base_y);
2728
314k
  c3f = _mm256_set1_epi16(0x3f);
2729
314k
  dy256 = _mm256_set1_epi16(dy);
2730
314k
  c0123 =
2731
314k
      _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
2732
314k
  c1234 = _mm256_add_epi16(c0123, c1);
2733
2734
6.70M
  for (int r = 0; r < H; r++) {
2735
6.39M
    __m256i b, res, shift;
2736
6.39M
    __m256i resx, resy, ydx;
2737
6.39M
    __m256i resxy, j256, r6;
2738
6.39M
    __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
2739
6.39M
    int y = r + 1;
2740
6.39M
    ydx = _mm256_set1_epi16((short)(y * dx));
2741
2742
17.8M
    for (int j = 0; j < W; j += 16) {
2743
11.4M
      j256 = _mm256_set1_epi16(j);
2744
11.4M
      int base_x = ((j << 6) - y * dx) >> frac_bits_x;
2745
11.4M
      int base_shift = 0;
2746
11.4M
      if ((base_x) < (min_base_x - 1)) {
2747
8.36M
        base_shift = (min_base_x - (base_x)-1);
2748
8.36M
      }
2749
11.4M
      int base_min_diff = (min_base_x - base_x);
2750
11.4M
      if (base_min_diff > 16) {
2751
6.09M
        base_min_diff = 16;
2752
6.09M
      } else {
2753
5.35M
        if (base_min_diff < 0) base_min_diff = 0;
2754
5.35M
      }
2755
2756
11.4M
      if (base_shift < 8) {
2757
4.44M
        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2758
4.44M
        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
2759
4.44M
        a0_x128 =
2760
4.44M
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2761
4.44M
        a1_x128 =
2762
4.44M
            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2763
2764
4.44M
        a0_x = _mm256_castsi128_si256(a0_x128);
2765
4.44M
        a1_x = _mm256_castsi128_si256(a1_x128);
2766
7.00M
      } else {
2767
7.00M
        a0_x = _mm256_setzero_si256();
2768
7.00M
        a1_x = _mm256_setzero_si256();
2769
7.00M
      }
2770
2771
11.4M
      int base_shift1 = 0;
2772
11.4M
      if (base_shift > 8) {
2773
6.86M
        base_shift1 = base_shift - 8;
2774
6.86M
      }
2775
11.4M
      if (base_shift1 < 8) {
2776
5.35M
        a0_1_x128 =
2777
5.35M
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8));
2778
5.35M
        a1_1_x128 =
2779
5.35M
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9));
2780
5.35M
        a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
2781
5.35M
                                     *(__m128i *)HighbdLoadMaskx[base_shift1]);
2782
5.35M
        a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
2783
5.35M
                                     *(__m128i *)HighbdLoadMaskx[base_shift1]);
2784
2785
5.35M
        a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1);
2786
5.35M
        a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1);
2787
5.35M
      }
2788
11.4M
      r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
2789
11.4M
      shift = _mm256_srli_epi16(
2790
11.4M
          _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
2791
2792
11.4M
      diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
2793
11.4M
      a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
2794
11.4M
      a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2795
2796
11.4M
      b = _mm256_mullo_epi16(diff, shift);
2797
11.4M
      res = _mm256_add_epi16(a32, b);
2798
11.4M
      resx = _mm256_srli_epi16(res, 5);  // 16 16-bit values
2799
2800
      // y calc
2801
11.4M
      resy = _mm256_setzero_si256();
2802
11.4M
      __m256i a0_y, a1_y, shifty;
2803
11.4M
      if ((base_x < min_base_x)) {
2804
8.80M
        __m256i c256, y_c256, base_y_c256, mask256, mul16;
2805
8.80M
        r6 = _mm256_set1_epi16(r << 6);
2806
8.80M
        c256 = _mm256_add_epi16(j256, c1234);
2807
8.80M
        mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
2808
8.80M
                                 _mm256_srli_epi16(min_base_y256, 1));
2809
8.80M
        y_c256 = _mm256_sub_epi16(r6, mul16);
2810
8.80M
        base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
2811
8.80M
        mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
2812
8.80M
        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2813
8.80M
        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2814
2815
8.80M
        a0_y = _mm256_setr_epi16(
2816
8.80M
            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2817
8.80M
            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2818
8.80M
            left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
2819
8.80M
            left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
2820
8.80M
            left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
2821
8.80M
            left[base_y_c[15]]);
2822
8.80M
        base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
2823
8.80M
        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2824
2825
8.80M
        a1_y = _mm256_setr_epi16(
2826
8.80M
            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2827
8.80M
            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2828
8.80M
            left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
2829
8.80M
            left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
2830
8.80M
            left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
2831
8.80M
            left[base_y_c[15]]);
2832
2833
8.80M
        shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
2834
2835
8.80M
        diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
2836
8.80M
        a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
2837
8.80M
        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2838
2839
8.80M
        b = _mm256_mullo_epi16(diff, shifty);
2840
8.80M
        res = _mm256_add_epi16(a32, b);
2841
8.80M
        resy = _mm256_srli_epi16(res, 5);
2842
8.80M
      }
2843
2844
11.4M
      resxy = _mm256_blendv_epi8(resx, resy,
2845
11.4M
                                 *(__m256i *)HighbdBaseMask[base_min_diff]);
2846
11.4M
      _mm256_storeu_si256((__m256i *)(dst + j), resxy);
2847
11.4M
    }  // for j
2848
6.39M
    dst += stride;
2849
6.39M
  }
2850
314k
}
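The 16-bit kernel above and the 32-bit kernel before it presumably split on bd < 12 because of intermediate precision: for 10-bit input the full expression a[x] * 32 + 16 + (a[x+1] - a[x]) * shift is bounded by 1023 * 32 + 16 = 32,752, so the signed 16-bit lanes used by _mm256_mullo_epi16 and _mm256_add_epi16 cannot overflow, while for 12-bit input the same bound is 4095 * 32 + 16 = 131,056 and the _epi32 arithmetic of the 32-bit variant is required.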
2851
2852
// Directional prediction, zone 2: 90 < angle < 180
2853
void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
2854
                                      int bh, const uint16_t *above,
2855
                                      const uint16_t *left, int upsample_above,
2856
                                      int upsample_left, int dx, int dy,
2857
1.35M
                                      int bd) {
2858
1.35M
  (void)bd;
2859
1.35M
  assert(dx > 0);
2860
0
  assert(dy > 0);
2861
0
  switch (bw) {
2862
448k
    case 4:
2863
448k
      if (bd < 12) {
2864
137k
        highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left,
2865
137k
                                         upsample_above, upsample_left, dx, dy);
2866
311k
      } else {
2867
311k
        highbd_dr_prediction_32bit_z2_Nx4_avx2(bh, dst, stride, above, left,
2868
311k
                                               upsample_above, upsample_left,
2869
311k
                                               dx, dy);
2870
311k
      }
2871
448k
      break;
2872
421k
    case 8:
2873
421k
      if (bd < 12) {
2874
132k
        highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left,
2875
132k
                                         upsample_above, upsample_left, dx, dy);
2876
289k
      } else {
2877
289k
        highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left,
2878
289k
                                               upsample_above, upsample_left,
2879
289k
                                               dx, dy);
2880
289k
      }
2881
421k
      break;
2882
484k
    default:
2883
484k
      if (bd < 12) {
2884
314k
        highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
2885
314k
                                         upsample_above, upsample_left, dx, dy);
2886
314k
      } else {
2887
169k
        highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left,
2888
169k
                                               upsample_above, upsample_left,
2889
169k
                                               dx, dy);
2890
169k
      }
2891
484k
      break;
2892
1.35M
  }
2893
1.35M
}
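As a usage illustration (dst, stride, above, left, dx and dy are assumed to be set up by the caller as elsewhere in this file), a 16x8 block at 10-bit depth falls through to the default case and takes the 16-bit HxW path:

// bw = 16 -> default case; bd = 10 < 12, so this call is routed to
// highbd_dr_prediction_z2_HxW_avx2(8, 16, ...).
av1_highbd_dr_prediction_z2_avx2(dst, stride, /*bw=*/16, /*bh=*/8, above, left,
                                 /*upsample_above=*/0, /*upsample_left=*/0,
                                 dx, dy, /*bd=*/10);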
2894
2895
//  Directional prediction, zone 3 functions
2896
static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride,
2897
                                             const uint16_t *left,
2898
                                             int upsample_left, int dy,
2899
155k
                                             int bd) {
2900
155k
  __m128i dstvec[4], d[4];
2901
155k
  if (bd < 12) {
2902
101k
    highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left,
2903
101k
                                              dy);
2904
101k
  } else {
2905
54.6k
    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(4, dstvec, left,
2906
54.6k
                                                    upsample_left, dy);
2907
54.6k
  }
2908
155k
  highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2],
2909
155k
                                   &dstvec[3], &d[0], &d[1], &d[2], &d[3]);
2910
155k
  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
2911
155k
  _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
2912
155k
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
2913
155k
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
2914
155k
  return;
2915
155k
}
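The zone-3 kernels in this block all follow the same idea: run the zone-1 interpolation over `left` instead of `above`, then transpose the result into dst. A scalar sketch of that relationship (upsampling and pixel clipping omitted; dr_z3_sketch is illustrative only):

static void dr_z3_sketch(uint16_t *dst, int stride, int bw, int bh,
                         const uint16_t *left, int dy) {
  const int max_base_y = bw + bh - 1;
  for (int c = 0; c < bw; ++c) {
    const int y = (c + 1) * dy;         // 1/64-pel position on the left edge
    const int shift = (y & 0x3f) >> 1;  // fractional weight, 0..31
    int base = y >> 6;
    for (int r = 0; r < bh; ++r, ++base) {
      if (base < max_base_y)
        dst[r * stride + c] =
            (uint16_t)((left[base] * 32 + 16 +
                        (left[base + 1] - left[base]) * shift) >> 5);
      else
        dst[r * stride + c] = left[max_base_y];  // past the edge: replicate
    }
  }
}

Walking down one column of the zone-3 output is the same access pattern as walking along one zone-1 row read from `left`, which is why these kernels compute rows with the z1 helpers and then transpose, either in registers (small blocks) or through a temporary buffer and highbd_transpose (large blocks).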
2916
2917
static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
2918
                                             const uint16_t *left,
2919
                                             int upsample_left, int dy,
2920
165k
                                             int bd) {
2921
165k
  __m128i dstvec[8], d[8];
2922
165k
  if (bd < 12) {
2923
65.3k
    highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left,
2924
65.3k
                                              dy);
2925
100k
  } else {
2926
100k
    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(8, dstvec, left,
2927
100k
                                                    upsample_left, dy);
2928
100k
  }
2929
165k
  highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2930
165k
                           &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
2931
165k
                           &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
2932
165k
                           &d[7]);
2933
1.48M
  for (int i = 0; i < 8; i++) {
2934
1.32M
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
2935
1.32M
  }
2936
165k
}
2937
2938
static void highbd_dr_prediction_z3_4x8_avx2(uint16_t *dst, ptrdiff_t stride,
2939
                                             const uint16_t *left,
2940
                                             int upsample_left, int dy,
2941
24.3k
                                             int bd) {
2942
24.3k
  __m128i dstvec[4], d[8];
2943
24.3k
  if (bd < 12) {
2944
11.5k
    highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left,
2945
11.5k
                                              dy);
2946
12.8k
  } else {
2947
12.8k
    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(4, dstvec, left,
2948
12.8k
                                                    upsample_left, dy);
2949
12.8k
  }
2950
2951
24.3k
  highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2952
24.3k
                               &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
2953
24.3k
                               &d[7]);
2954
219k
  for (int i = 0; i < 8; i++) {
2955
195k
    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
2956
195k
  }
2957
24.3k
}
2958
2959
static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
2960
                                             const uint16_t *left,
2961
                                             int upsample_left, int dy,
2962
49.8k
                                             int bd) {
2963
49.8k
  __m128i dstvec[8], d[4];
2964
49.8k
  if (bd < 12) {
2965
19.8k
    highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left,
2966
19.8k
                                              dy);
2967
30.0k
  } else {
2968
30.0k
    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(8, dstvec, left,
2969
30.0k
                                                    upsample_left, dy);
2970
30.0k
  }
2971
2972
49.8k
  highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2973
49.8k
                               &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
2974
49.8k
                               &d[0], &d[1], &d[2], &d[3]);
2975
49.8k
  _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
2976
49.8k
  _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]);
2977
49.8k
  _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]);
2978
49.8k
  _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]);
2979
49.8k
}
2980
2981
static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
2982
                                              const uint16_t *left,
2983
                                              int upsample_left, int dy,
2984
30.3k
                                              int bd) {
2985
30.3k
  __m256i dstvec[8], d[8];
2986
30.3k
  if (bd < 12) {
2987
20.9k
    highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left,
2988
20.9k
                                               dy);
2989
20.9k
  } else {
2990
9.43k
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(8, dstvec, left,
2991
9.43k
                                                     upsample_left, dy);
2992
9.43k
  }
2993
30.3k
  highbd_transpose8x16_16x8_avx2(dstvec, d);
2994
273k
  for (int i = 0; i < 8; i++) {
2995
242k
    _mm_storeu_si128((__m128i *)(dst + i * stride),
2996
242k
                     _mm256_castsi256_si128(d[i]));
2997
242k
  }
2998
273k
  for (int i = 8; i < 16; i++) {
2999
242k
    _mm_storeu_si128((__m128i *)(dst + i * stride),
3000
242k
                     _mm256_extracti128_si256(d[i - 8], 1));
3001
242k
  }
3002
30.3k
}
3003
3004
static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
3005
                                              const uint16_t *left,
3006
                                              int upsample_left, int dy,
3007
66.7k
                                              int bd) {
3008
66.7k
  __m128i dstvec[16], d[16];
3009
66.7k
  if (bd < 12) {
3010
35.1k
    highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left,
3011
35.1k
                                              dy);
3012
35.1k
  } else {
3013
31.6k
    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(16, dstvec, left,
3014
31.6k
                                                    upsample_left, dy);
3015
31.6k
  }
3016
200k
  for (int i = 0; i < 16; i += 8) {
3017
133k
    highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
3018
133k
                             &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
3019
133k
                             &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
3020
133k
                             &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
3021
133k
                             &d[5 + i], &d[6 + i], &d[7 + i]);
3022
133k
  }
3023
600k
  for (int i = 0; i < 8; i++) {
3024
533k
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
3025
533k
    _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
3026
533k
  }
3027
66.7k
}
3028
3029
static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride,
3030
                                              const uint16_t *left,
3031
                                              int upsample_left, int dy,
3032
17.8k
                                              int bd) {
3033
17.8k
  __m256i dstvec[4], d[4], d1;
3034
17.8k
  if (bd < 12) {
3035
12.8k
    highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left,
3036
12.8k
                                               dy);
3037
12.8k
  } else {
3038
4.97k
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(4, dstvec, left,
3039
4.97k
                                                     upsample_left, dy);
3040
4.97k
  }
3041
17.8k
  highbd_transpose4x16_avx2(dstvec, d);
3042
89.1k
  for (int i = 0; i < 4; i++) {
3043
71.3k
    _mm_storel_epi64((__m128i *)(dst + i * stride),
3044
71.3k
                     _mm256_castsi256_si128(d[i]));
3045
71.3k
    d1 = _mm256_bsrli_epi128(d[i], 8);
3046
71.3k
    _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride),
3047
71.3k
                     _mm256_castsi256_si128(d1));
3048
71.3k
    _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
3049
71.3k
                     _mm256_extracti128_si256(d[i], 1));
3050
71.3k
    _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride),
3051
71.3k
                     _mm256_extracti128_si256(d1, 1));
3052
71.3k
  }
3053
17.8k
}
3054
3055
static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
3056
                                              const uint16_t *left,
3057
                                              int upsample_left, int dy,
3058
53.2k
                                              int bd) {
3059
53.2k
  __m128i dstvec[16], d[8];
3060
53.2k
  if (bd < 12) {
3061
36.2k
    highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left,
3062
36.2k
                                              dy);
3063
36.2k
  } else {
3064
16.9k
    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(16, dstvec, left,
3065
16.9k
                                                    upsample_left, dy);
3066
16.9k
  }
3067
53.2k
  highbd_transpose16x4_8x8_sse2(dstvec, d);
3068
3069
53.2k
  _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
3070
53.2k
  _mm_storeu_si128((__m128i *)(dst + 0 * stride + 8), d[1]);
3071
53.2k
  _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]);
3072
53.2k
  _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]);
3073
53.2k
  _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]);
3074
53.2k
  _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]);
3075
53.2k
  _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]);
3076
53.2k
  _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]);
3077
53.2k
}
3078
3079
static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
3080
                                              const uint16_t *left,
3081
                                              int upsample_left, int dy,
3082
10.5k
                                              int bd) {
3083
10.5k
  __m256i dstvec[16], d[16];
3084
10.5k
  if (bd < 12) {
3085
9.25k
    highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left,
3086
9.25k
                                               dy);
3087
9.25k
  } else {
3088
1.25k
    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(8, dstvec, left,
3089
1.25k
                                                     upsample_left, dy);
3090
1.25k
  }
3091
3092
31.5k
  for (int i = 0; i < 16; i += 8) {
3093
21.0k
    highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
3094
21.0k
  }
3095
3096
94.5k
  for (int i = 0; i < 8; i++) {
3097
84.0k
    _mm_storeu_si128((__m128i *)(dst + i * stride),
3098
84.0k
                     _mm256_castsi256_si128(d[i]));
3099
84.0k
  }
3100
94.5k
  for (int i = 0; i < 8; i++) {
3101
84.0k
    _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
3102
84.0k
                     _mm256_extracti128_si256(d[i], 1));
3103
84.0k
  }
3104
94.5k
  for (int i = 8; i < 16; i++) {
3105
84.0k
    _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
3106
84.0k
                     _mm256_castsi256_si128(d[i]));
3107
84.0k
  }
3108
94.5k
  for (int i = 8; i < 16; i++) {
3109
84.0k
    _mm_storeu_si128((__m128i *)(dst + (i + 16) * stride),
3110
84.0k
                     _mm256_extracti128_si256(d[i], 1));
3111
84.0k
  }
3112
10.5k
}
3113
3114
static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
3115
                                              const uint16_t *left,
3116
                                              int upsample_left, int dy,
3117
40.1k
                                              int bd) {
3118
40.1k
  __m128i dstvec[32], d[32];
3119
40.1k
  if (bd < 12) {
3120
33.8k
    highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left,
3121
33.8k
                                              dy);
3122
33.8k
  } else {
3123
6.30k
    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(32, dstvec, left,
3124
6.30k
                                                    upsample_left, dy);
3125
6.30k
  }
3126
3127
200k
  for (int i = 0; i < 32; i += 8) {
3128
160k
    highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
3129
160k
                             &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
3130
160k
                             &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
3131
160k
                             &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
3132
160k
                             &d[5 + i], &d[6 + i], &d[7 + i]);
3133
160k
  }
3134
361k
  for (int i = 0; i < 8; i++) {
3135
321k
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
3136
321k
    _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
3137
321k
    _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]);
3138
321k
    _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]);
3139
321k
  }
3140
40.1k
}
3141
3142
static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
3143
                                               const uint16_t *left,
3144
                                               int upsample_left, int dy,
3145
78.3k
                                               int bd) {
3146
78.3k
  __m256i dstvec[16], d[16];
3147
78.3k
  if (bd < 12) {
3148
65.6k
    highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left,
3149
65.6k
                                               dy);
3150
65.6k
  } else {
3151
12.6k
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(16, dstvec, left,
3152
12.6k
                                                     upsample_left, dy);
3153
12.6k
  }
3154
3155
78.3k
  highbd_transpose16x16_avx2(dstvec, d);
3156
3157
1.33M
  for (int i = 0; i < 16; i++) {
3158
1.25M
    _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]);
3159
1.25M
  }
3160
78.3k
}
3161
3162
static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
3163
                                               const uint16_t *left,
3164
                                               int upsample_left, int dy,
3165
83.4k
                                               int bd) {
3166
83.4k
  __m256i dstvec[64], d[16];
3167
83.4k
  if (bd < 12) {
3168
78.1k
    highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left,
3169
78.1k
                                               dy);
3170
78.1k
  } else {
3171
5.27k
    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(32, dstvec, left,
3172
5.27k
                                                     upsample_left, dy);
3173
5.27k
  }
3174
83.4k
  highbd_transpose16x16_avx2(dstvec, d);
3175
1.41M
  for (int j = 0; j < 16; j++) {
3176
1.33M
    _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]);
3177
1.33M
  }
3178
83.4k
  highbd_transpose16x16_avx2(dstvec + 16, d);
3179
1.41M
  for (int j = 0; j < 16; j++) {
3180
1.33M
    _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]);
3181
1.33M
  }
3182
83.4k
  highbd_transpose16x16_avx2(dstvec + 32, d);
3183
1.41M
  for (int j = 0; j < 16; j++) {
3184
1.33M
    _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]);
3185
1.33M
  }
3186
83.4k
  highbd_transpose16x16_avx2(dstvec + 48, d);
3187
1.41M
  for (int j = 0; j < 16; j++) {
3188
1.33M
    _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]);
3189
1.33M
  }
3190
83.4k
}
3191
3192
static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
3193
                                               const uint16_t *left,
3194
                                               int upsample_left, int dy,
3195
17.1k
                                               int bd) {
3196
17.1k
  DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]);
3197
17.1k
  if (bd < 12) {
3198
15.6k
    highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
3199
15.6k
  } else {
3200
1.51k
    highbd_dr_prediction_32bit_z1_64xN_avx2(64, dstT, 64, left, upsample_left,
3201
1.51k
                                            dy);
3202
1.51k
  }
3203
17.1k
  highbd_transpose(dstT, 64, dst, stride, 64, 64);
3204
17.1k
}
3205
3206
static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
3207
                                               const uint16_t *left,
3208
                                               int upsample_left, int dy,
3209
22.3k
                                               int bd) {
3210
22.3k
  __m256i dstvec[32], d[32];
3211
22.3k
  if (bd < 12) {
3212
21.5k
    highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left,
3213
21.5k
                                               dy);
3214
21.5k
  } else {
3215
790
    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(16, dstvec, left,
3216
790
                                                     upsample_left, dy);
3217
790
  }
3218
111k
  for (int i = 0; i < 32; i += 8) {
3219
89.2k
    highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
3220
89.2k
  }
3221
  // store
3222
66.9k
  for (int j = 0; j < 32; j += 16) {
3223
401k
    for (int i = 0; i < 8; i++) {
3224
356k
      _mm_storeu_si128((__m128i *)(dst + (i + j) * stride),
3225
356k
                       _mm256_castsi256_si128(d[(i + j)]));
3226
356k
    }
3227
401k
    for (int i = 0; i < 8; i++) {
3228
356k
      _mm_storeu_si128((__m128i *)(dst + (i + j) * stride + 8),
3229
356k
                       _mm256_castsi256_si128(d[(i + j) + 8]));
3230
356k
    }
3231
401k
    for (int i = 8; i < 16; i++) {
3232
356k
      _mm256_storeu_si256(
3233
356k
          (__m256i *)(dst + (i + j) * stride),
3234
356k
          _mm256_inserti128_si256(
3235
356k
              d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0));
3236
356k
    }
3237
44.6k
  }
3238
22.3k
}
3239
3240
static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
3241
                                               const uint16_t *left,
3242
                                               int upsample_left, int dy,
3243
20.5k
                                               int bd) {
3244
20.5k
  __m256i dstvec[32], d[16];
3245
20.5k
  if (bd < 12) {
3246
18.4k
    highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left,
3247
18.4k
                                               dy);
3248
18.4k
  } else {
3249
2.17k
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(32, dstvec, left,
3250
2.17k
                                                     upsample_left, dy);
3251
2.17k
  }
3252
61.7k
  for (int i = 0; i < 32; i += 16) {
3253
41.1k
    highbd_transpose16x16_avx2((dstvec + i), d);
3254
700k
    for (int j = 0; j < 16; j++) {
3255
658k
      _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
3256
658k
    }
3257
41.1k
  }
3258
20.5k
}
3259
3260
static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
3261
                                               const uint16_t *left,
3262
                                               int upsample_left, int dy,
3263
1.68k
                                               int bd) {
3264
1.68k
  uint16_t dstT[64 * 32];
3265
1.68k
  if (bd < 12) {
3266
1.56k
    highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
3267
1.56k
  } else {
3268
112
    highbd_dr_prediction_32bit_z1_64xN_avx2(32, dstT, 64, left, upsample_left,
3269
112
                                            dy);
3270
112
  }
3271
1.68k
  highbd_transpose(dstT, 64, dst, stride, 32, 64);
3272
1.68k
}
3273
3274
static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
3275
                                               const uint16_t *left,
3276
                                               int upsample_left, int dy,
3277
1.56k
                                               int bd) {
3278
1.56k
  DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]);
3279
1.56k
  highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy, bd);
3280
1.56k
  highbd_transpose(dstT, 32, dst, stride, 64, 32);
3281
1.56k
  return;
3282
1.56k
}
3283
3284
static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
3285
                                               const uint16_t *left,
3286
                                               int upsample_left, int dy,
3287
5.35k
                                               int bd) {
3288
5.35k
  DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]);
3289
5.35k
  if (bd < 12) {
3290
4.62k
    highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
3291
4.62k
  } else {
3292
730
    highbd_dr_prediction_32bit_z1_64xN_avx2(16, dstT, 64, left, upsample_left,
3293
730
                                            dy);
3294
730
  }
3295
5.35k
  highbd_transpose(dstT, 64, dst, stride, 16, 64);
3296
5.35k
}
3297
3298
static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
3299
                                               const uint16_t *left,
3300
                                               int upsample_left, int dy,
3301
9.89k
                                               int bd) {
3302
9.89k
  __m256i dstvec[64], d[16];
3303
9.89k
  if (bd < 12) {
3304
9.81k
    highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left,
3305
9.81k
                                               dy);
3306
9.81k
  } else {
3307
82
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(64, dstvec, left,
3308
82
                                                     upsample_left, dy);
3309
82
  }
3310
49.4k
  for (int i = 0; i < 64; i += 16) {
3311
39.5k
    highbd_transpose16x16_avx2((dstvec + i), d);
3312
672k
    for (int j = 0; j < 16; j++) {
3313
632k
      _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
3314
632k
    }
3315
39.5k
  }
3316
9.89k
}
3317
3318
void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
3319
                                      int bh, const uint16_t *above,
3320
                                      const uint16_t *left, int upsample_left,
3321
854k
                                      int dx, int dy, int bd) {
3322
854k
  (void)above;
3323
854k
  (void)dx;
3324
3325
854k
  assert(dx == 1);
3326
0
  assert(dy > 0);
3327
854k
  if (bw == bh) {
3328
500k
    switch (bw) {
3329
155k
      case 4:
3330
155k
        highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy,
3331
155k
                                         bd);
3332
155k
        break;
3333
165k
      case 8:
3334
165k
        highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy,
3335
165k
                                         bd);
3336
165k
        break;
3337
78.3k
      case 16:
3338
78.3k
        highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy,
3339
78.3k
                                           bd);
3340
78.3k
        break;
3341
83.4k
      case 32:
3342
83.4k
        highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy,
3343
83.4k
                                           bd);
3344
83.4k
        break;
3345
17.1k
      case 64:
3346
17.1k
        highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy,
3347
17.1k
                                           bd);
3348
17.1k
        break;
3349
500k
    }
3350
500k
  } else {
3351
354k
    if (bw < bh) {
3352
112k
      if (bw + bw == bh) {
3353
78.7k
        switch (bw) {
3354
24.3k
          case 4:
3355
24.3k
            highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left,
3356
24.3k
                                             dy, bd);
3357
24.3k
            break;
3358
30.3k
          case 8:
3359
30.3k
            highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left,
3360
30.3k
                                              dy, bd);
3361
30.3k
            break;
3362
22.3k
          case 16:
3363
22.3k
            highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left,
3364
22.3k
                                               dy, bd);
3365
22.3k
            break;
3366
1.68k
          case 32:
3367
1.68k
            highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left,
3368
1.68k
                                               dy, bd);
3369
1.68k
            break;
3370
78.7k
        }
3371
78.7k
      } else {
3372
33.6k
        switch (bw) {
3373
17.8k
          case 4:
3374
17.8k
            highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left,
3375
17.8k
                                              dy, bd);
3376
17.8k
            break;
3377
10.5k
          case 8:
3378
10.5k
            highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left,
3379
10.5k
                                              dy, bd);
3380
10.5k
            break;
3381
5.35k
          case 16:
3382
5.35k
            highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left,
3383
5.35k
                                               dy, bd);
3384
5.35k
            break;
3385
33.6k
        }
3386
33.6k
      }
3387
241k
    } else {
3388
241k
      if (bh + bh == bw) {
3389
138k
        switch (bh) {
3390
49.8k
          case 4:
3391
49.8k
            highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left,
3392
49.8k
                                             dy, bd);
3393
49.8k
            break;
3394
66.7k
          case 8:
3395
66.7k
            highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left,
3396
66.7k
                                              dy, bd);
3397
66.7k
            break;
3398
20.5k
          case 16:
3399
20.5k
            highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left,
3400
20.5k
                                               dy, bd);
3401
20.5k
            break;
3402
1.56k
          case 32:
3403
1.56k
            highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left,
3404
1.56k
                                               dy, bd);
3405
1.56k
            break;
3406
138k
        }
3407
138k
      } else {
3408
103k
        switch (bh) {
3409
53.2k
          case 4:
3410
53.2k
            highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left,
3411
53.2k
                                              dy, bd);
3412
53.2k
            break;
3413
40.1k
          case 8:
3414
40.1k
            highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left,
3415
40.1k
                                              dy, bd);
3416
40.1k
            break;
3417
9.89k
          case 16:
3418
9.89k
            highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left,
3419
9.89k
                                               dy, bd);
3420
9.89k
            break;
3421
103k
        }
3422
103k
      }
3423
241k
    }
3424
354k
  }
3425
854k
  return;
3426
854k
}
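The dispatch above keys purely on block shape: square blocks go straight to the NxN kernels, and rectangular blocks are split into 2:1 (bw + bw == bh, or bh + bh == bw) and 4:1 aspect ratios, switching on the smaller dimension. For example, an 8x32 block has bw < bh and 8 + 8 != 32, so it reaches the second switch and calls highbd_dr_prediction_z3_8x32_avx2.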
3427
3428
// Low bit depth functions
3429
static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
3430
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3431
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3432
  { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3433
    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3434
  { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3435
    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3436
  { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3437
    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3438
  { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3439
    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3440
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3441
    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3442
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3443
    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3444
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3445
    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0 },
3446
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
3447
    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0 },
3448
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
3449
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0 },
3450
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
3451
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
3452
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3453
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3454
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
3455
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3456
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3457
    0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
3458
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3459
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3460
    0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,
3461
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3462
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3463
    0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,
3464
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3465
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3466
    0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,
3467
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3468
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3469
    0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,
3470
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3471
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3472
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,
3473
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3474
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3475
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,
3476
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3477
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3478
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,
3479
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3480
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3481
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,
3482
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3483
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3484
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
3485
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3486
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3487
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3488
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3489
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3490
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3491
    0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0 },
3492
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3493
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3494
    0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0 },
3495
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3496
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3497
    0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0 },
3498
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3499
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3500
    0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0 },
3501
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3502
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3503
    0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0 },
3504
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3505
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3506
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0 },
3507
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3508
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3509
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0 },
3510
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3511
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3512
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0 },
3513
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3514
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3515
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
3516
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3517
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3518
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
3519
};
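Each BaseMask[n] entry has its first n bytes set to 0xff, so the _mm_blendv_epi8 in the kernel below keeps the first n interpolated pixels of a row and pads the remainder with above[max_base_x]. A scalar equivalent of that blend (illustrative only):

// row[c] keeps pred[c] for the first n columns and the edge pixel afterwards.
static void base_mask_blend_sketch(uint8_t *row, const uint8_t *pred, int n,
                                   int width, uint8_t mbase_pixel) {
  for (int c = 0; c < width; ++c)
    row[c] = (c < n) ? pred[c] : mbase_pixel;
}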
3520
3521
/* clang-format on */
3522
static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_avx2(
3523
    int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
3524
1.10M
    int dx) {
3525
1.10M
  const int frac_bits = 6 - upsample_above;
3526
1.10M
  const int max_base_x = ((W + H) - 1) << upsample_above;
3527
3528
1.10M
  assert(dx > 0);
3529
  // pre-filter above pixels
3530
  // store in temp buffers:
3531
  //   above[x] * 32 + 16
3532
  //   above[x+1] - above[x]
3533
  // final pixels will be calculated as:
3534
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3535
0
  __m256i a0, a1, a32, a16;
3536
1.10M
  __m256i diff, c3f;
3537
1.10M
  __m128i a_mbase_x;
3538
3539
1.10M
  a16 = _mm256_set1_epi16(16);
3540
1.10M
  a_mbase_x = _mm_set1_epi8((int8_t)above[max_base_x]);
3541
1.10M
  c3f = _mm256_set1_epi16(0x3f);
3542
3543
1.10M
  int x = dx;
3544
15.0M
  for (int r = 0; r < W; r++) {
3545
13.9M
    __m256i b, res, shift;
3546
13.9M
    __m128i res1, a0_128, a1_128;
3547
3548
13.9M
    int base = x >> frac_bits;
3549
13.9M
    int base_max_diff = (max_base_x - base) >> upsample_above;
3550
13.9M
    if (base_max_diff <= 0) {
3551
13.8k
      for (int i = r; i < W; ++i) {
3552
9.14k
        dst[i] = a_mbase_x;  // save 4 values
3553
9.14k
      }
3554
4.67k
      return;
3555
4.67k
    }
3556
13.9M
    if (base_max_diff > H) base_max_diff = H;
3557
13.9M
    a0_128 = _mm_loadu_si128((__m128i *)(above + base));
3558
13.9M
    a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
3559
3560
13.9M
    if (upsample_above) {
3561
2.41M
      a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]);
3562
2.41M
      a1_128 = _mm_srli_si128(a0_128, 8);
3563
3564
2.41M
      shift = _mm256_srli_epi16(
3565
2.41M
          _mm256_and_si256(
3566
2.41M
              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
3567
2.41M
          1);
3568
11.5M
    } else {
3569
11.5M
      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3570
11.5M
    }
3571
13.9M
    a0 = _mm256_cvtepu8_epi16(a0_128);
3572
13.9M
    a1 = _mm256_cvtepu8_epi16(a1_128);
3573
3574
13.9M
    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
3575
13.9M
    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
3576
13.9M
    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
3577
3578
13.9M
    b = _mm256_mullo_epi16(diff, shift);
3579
13.9M
    res = _mm256_add_epi16(a32, b);
3580
13.9M
    res = _mm256_srli_epi16(res, 5);
3581
3582
13.9M
    res = _mm256_packus_epi16(
3583
13.9M
        res, _mm256_castsi128_si256(
3584
13.9M
                 _mm256_extracti128_si256(res, 1)));  // convert to 8 bit
3585
13.9M
    res1 = _mm256_castsi256_si128(res);               // 16 8bit values
3586
3587
13.9M
    dst[r] =
3588
13.9M
        _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]);
3589
13.9M
    x += dx;
3590
13.9M
  }
3591
1.10M
}
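// For reference, the two-tap blend that the kernel above vectorizes can be
// written as the scalar loop below. This is a minimal illustrative sketch,
// not part of the library: the helper name and parameters are assumptions,
// the upsample_above shuffle is omitted, and `shift` is the (x & 0x3f) >> 1
// weight computed above.
static void z1_row_sketch(uint8_t *row, int width, const uint8_t *above,
                          int base, int shift, int max_base_x) {
  for (int c = 0; c < width; ++c) {
    const int p = base + c;
    if (p >= max_base_x) {
      row[c] = above[max_base_x];  // matches the a_mbase_x blend above
    } else {
      const int v = above[p] * 32 + 16 + (above[p + 1] - above[p]) * shift;
      row[c] = (uint8_t)(v >> 5);  // same rounding as the 16-bit vector path
    }
  }
}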
3592
3593
static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3594
                                      const uint8_t *above, int upsample_above,
3595
138k
                                      int dx) {
3596
138k
  __m128i dstvec[16];
3597
3598
138k
  dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, dx);
3599
939k
  for (int i = 0; i < N; i++) {
3600
801k
    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
3601
801k
  }
3602
138k
}
3603
3604
static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3605
                                      const uint8_t *above, int upsample_above,
3606
138k
                                      int dx) {
3607
138k
  __m128i dstvec[32];
3608
3609
138k
  dr_prediction_z1_HxW_internal_avx2(8, N, dstvec, above, upsample_above, dx);
3610
1.47M
  for (int i = 0; i < N; i++) {
3611
1.33M
    _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
3612
1.33M
  }
3613
138k
}
3614
3615
static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3616
                                       const uint8_t *above, int upsample_above,
3617
110k
                                       int dx) {
3618
110k
  __m128i dstvec[64];
3619
3620
110k
  dr_prediction_z1_HxW_internal_avx2(16, N, dstvec, above, upsample_above, dx);
3621
1.61M
  for (int i = 0; i < N; i++) {
3622
1.50M
    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
3623
1.50M
  }
3624
110k
}
3625
3626
static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2(
3627
180k
    int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) {
3628
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
3629
180k
  (void)upsample_above;
3630
180k
  const int frac_bits = 6;
3631
180k
  const int max_base_x = ((32 + N) - 1);
3632
3633
  // pre-filter above pixels
3634
  // store in temp buffers:
3635
  //   above[x] * 32 + 16
3636
  //   above[x+1] - above[x]
3637
  // final pixels will be calculated as:
3638
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3639
180k
  __m256i a0, a1, a32, a16;
3640
180k
  __m256i a_mbase_x, diff, c3f;
3641
3642
180k
  a16 = _mm256_set1_epi16(16);
3643
180k
  a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
3644
180k
  c3f = _mm256_set1_epi16(0x3f);
3645
3646
180k
  int x = dx;
3647
5.03M
  for (int r = 0; r < N; r++) {
3648
4.85M
    __m256i b, res, res16[2];
3649
4.85M
    __m128i a0_128, a1_128;
3650
3651
4.85M
    int base = x >> frac_bits;
3652
4.85M
    int base_max_diff = (max_base_x - base);
3653
4.85M
    if (base_max_diff <= 0) {
3654
0
      for (int i = r; i < N; ++i) {
3655
0
        dstvec[i] = a_mbase_x;  // save 32 values
3656
0
      }
3657
0
      return;
3658
0
    }
3659
4.85M
    if (base_max_diff > 32) base_max_diff = 32;
3660
4.85M
    __m256i shift =
3661
4.85M
        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3662
3663
14.5M
    for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
3664
9.71M
      int mdiff = base_max_diff - j;
3665
9.71M
      if (mdiff <= 0) {
3666
774
        res16[jj] = a_mbase_x;
3667
9.71M
      } else {
3668
9.71M
        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
3669
9.71M
        a1_128 = _mm_loadu_si128((__m128i *)(above + base + j + 1));
3670
9.71M
        a0 = _mm256_cvtepu8_epi16(a0_128);
3671
9.71M
        a1 = _mm256_cvtepu8_epi16(a1_128);
3672
3673
9.71M
        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
3674
9.71M
        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
3675
9.71M
        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
3676
9.71M
        b = _mm256_mullo_epi16(diff, shift);
3677
3678
9.71M
        res = _mm256_add_epi16(a32, b);
3679
9.71M
        res = _mm256_srli_epi16(res, 5);
3680
9.71M
        res16[jj] = _mm256_packus_epi16(
3681
9.71M
            res, _mm256_castsi128_si256(
3682
9.71M
                     _mm256_extracti128_si256(res, 1)));  // 16 8bit values
3683
9.71M
      }
3684
9.71M
    }
3685
4.85M
    res16[1] =
3686
4.85M
        _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]),
3687
4.85M
                                1);  // 32 8bit values
3688
3689
4.85M
    dstvec[r] = _mm256_blendv_epi8(
3690
4.85M
        a_mbase_x, res16[1],
3691
4.85M
        *(__m256i *)BaseMask[base_max_diff]);  // 32 8bit values
3692
4.85M
    x += dx;
3693
4.85M
  }
3694
180k
}
3695
3696
static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3697
                                       const uint8_t *above, int upsample_above,
3698
60.9k
                                       int dx) {
3699
60.9k
  __m256i dstvec[64];
3700
60.9k
  dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx);
3701
1.74M
  for (int i = 0; i < N; i++) {
3702
1.68M
    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
3703
1.68M
  }
3704
60.9k
}
3705
3706
static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3707
                                       const uint8_t *above, int upsample_above,
3708
39.5k
                                       int dx) {
3709
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
3710
39.5k
  (void)upsample_above;
3711
39.5k
  const int frac_bits = 6;
3712
39.5k
  const int max_base_x = ((64 + N) - 1);
3713
3714
  // pre-filter above pixels
3715
  // store in temp buffers:
3716
  //   above[x] * 32 + 16
3717
  //   above[x+1] - above[x]
3718
  // final pixels will be calculated as:
3719
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3720
39.5k
  __m256i a0, a1, a32, a16;
3721
39.5k
  __m256i a_mbase_x, diff, c3f;
3722
39.5k
  __m128i max_base_x128, base_inc128, mask128;
3723
3724
39.5k
  a16 = _mm256_set1_epi16(16);
3725
39.5k
  a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
3726
39.5k
  max_base_x128 = _mm_set1_epi8(max_base_x);
3727
39.5k
  c3f = _mm256_set1_epi16(0x3f);
3728
3729
39.5k
  int x = dx;
3730
2.11M
  for (int r = 0; r < N; r++, dst += stride) {
3731
2.07M
    __m256i b, res;
3732
2.07M
    int base = x >> frac_bits;
3733
2.07M
    if (base >= max_base_x) {
3734
0
      for (int i = r; i < N; ++i) {
3735
0
        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 32 values
3736
0
        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
3737
0
        dst += stride;
3738
0
      }
3739
0
      return;
3740
0
    }
3741
3742
2.07M
    __m256i shift =
3743
2.07M
        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3744
3745
2.07M
    __m128i a0_128, a1_128, res128;
3746
10.3M
    for (int j = 0; j < 64; j += 16) {
3747
8.31M
      int mdif = max_base_x - (base + j);
3748
8.31M
      if (mdif <= 0) {
3749
2.28k
        _mm_storeu_si128((__m128i *)(dst + j),
3750
2.28k
                         _mm256_castsi256_si128(a_mbase_x));
3751
8.31M
      } else {
3752
8.31M
        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
3753
8.31M
        a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
3754
8.31M
        a0 = _mm256_cvtepu8_epi16(a0_128);
3755
8.31M
        a1 = _mm256_cvtepu8_epi16(a1_128);
3756
3757
8.31M
        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
3758
8.31M
        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
3759
8.31M
        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
3760
8.31M
        b = _mm256_mullo_epi16(diff, shift);
3761
3762
8.31M
        res = _mm256_add_epi16(a32, b);
3763
8.31M
        res = _mm256_srli_epi16(res, 5);
3764
8.31M
        res = _mm256_packus_epi16(
3765
8.31M
            res, _mm256_castsi128_si256(
3766
8.31M
                     _mm256_extracti128_si256(res, 1)));  // 16 8bit values
3767
3768
8.31M
        base_inc128 =
3769
8.31M
            _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
3770
8.31M
                          (int8_t)(base + j + 2), (int8_t)(base + j + 3),
3771
8.31M
                          (int8_t)(base + j + 4), (int8_t)(base + j + 5),
3772
8.31M
                          (int8_t)(base + j + 6), (int8_t)(base + j + 7),
3773
8.31M
                          (int8_t)(base + j + 8), (int8_t)(base + j + 9),
3774
8.31M
                          (int8_t)(base + j + 10), (int8_t)(base + j + 11),
3775
8.31M
                          (int8_t)(base + j + 12), (int8_t)(base + j + 13),
3776
8.31M
                          (int8_t)(base + j + 14), (int8_t)(base + j + 15));
3777
3778
8.31M
        mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
3779
8.31M
                                 _mm_setzero_si128());
3780
8.31M
        res128 = _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x),
3781
8.31M
                                 _mm256_castsi256_si128(res), mask128);
3782
8.31M
        _mm_storeu_si128((__m128i *)(dst + j), res128);
3783
8.31M
      }
3784
8.31M
    }
3785
2.07M
    x += dx;
3786
2.07M
  }
3787
39.5k
}
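// The subs_epu8/cmpgt/blendv tail handling in the 64-wide loop above is the
// vector form of a per-lane select; as a scalar sketch (res8 and k are
// illustrative names for the packed 8-bit results and the lane index):
//
//   for (int k = 0; k < 16; ++k)
//     dst[j + k] = (base + j + k < max_base_x) ? res8[k] : above[max_base_x];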
3788
3789
// Directional prediction, zone 1: 0 < angle < 90
3790
void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
3791
                               const uint8_t *above, const uint8_t *left,
3792
458k
                               int upsample_above, int dx, int dy) {
3793
458k
  (void)left;
3794
458k
  (void)dy;
3795
458k
  switch (bw) {
3796
138k
    case 4:
3797
138k
      dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx);
3798
138k
      break;
3799
138k
    case 8:
3800
138k
      dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx);
3801
138k
      break;
3802
110k
    case 16:
3803
110k
      dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx);
3804
110k
      break;
3805
57.1k
    case 32:
3806
57.1k
      dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx);
3807
57.1k
      break;
3808
14.4k
    case 64:
3809
14.4k
      dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx);
3810
14.4k
      break;
3811
0
    default: break;
3812
458k
  }
3813
458k
  return;
3814
458k
}
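// A minimal, hypothetical call into the zone-1 dispatcher above for an 8x8
// block; the edge buffer contents and the dx step are illustrative values
// only (in a real decode they come from the block's intra angle):
//
//   uint8_t dst[8 * 8];
//   uint8_t above[32] = { /* top edge pixels, padded to the right */ };
//   av1_dr_prediction_z1_avx2(dst, /*stride=*/8, /*bw=*/8, /*bh=*/8, above,
//                             /*left=*/NULL, /*upsample_above=*/0,
//                             /*dx=*/32, /*dy=*/0);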
3815
3816
static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3817
                                      const uint8_t *above, const uint8_t *left,
3818
                                      int upsample_above, int upsample_left,
3819
264k
                                      int dx, int dy) {
3820
264k
  const int min_base_x = -(1 << upsample_above);
3821
264k
  const int min_base_y = -(1 << upsample_left);
3822
264k
  const int frac_bits_x = 6 - upsample_above;
3823
264k
  const int frac_bits_y = 6 - upsample_left;
3824
3825
264k
  assert(dx > 0);
3826
  // pre-filter above pixels
3827
  // store in temp buffers:
3828
  //   above[x] * 32 + 16
3829
  //   above[x+1] - above[x]
3830
  // final pixels will be calculated as:
3831
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3832
0
  __m128i a0_x, a1_x, a32, a16, diff;
3833
264k
  __m128i c3f, min_base_y128, c1234, dy128;
3834
3835
264k
  a16 = _mm_set1_epi16(16);
3836
264k
  c3f = _mm_set1_epi16(0x3f);
3837
264k
  min_base_y128 = _mm_set1_epi16(min_base_y);
3838
264k
  c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
3839
264k
  dy128 = _mm_set1_epi16(dy);
3840
3841
1.84M
  for (int r = 0; r < N; r++) {
3842
1.58M
    __m128i b, res, shift, r6, ydx;
3843
1.58M
    __m128i resx, resy, resxy;
3844
1.58M
    __m128i a0_x128, a1_x128;
3845
1.58M
    int y = r + 1;
3846
1.58M
    int base_x = (-y * dx) >> frac_bits_x;
3847
1.58M
    int base_shift = 0;
3848
1.58M
    if (base_x < (min_base_x - 1)) {
3849
1.10M
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
3850
1.10M
    }
3851
1.58M
    int base_min_diff =
3852
1.58M
        (min_base_x - base_x + upsample_above) >> upsample_above;
3853
1.58M
    if (base_min_diff > 4) {
3854
711k
      base_min_diff = 4;
3855
873k
    } else {
3856
873k
      if (base_min_diff < 0) base_min_diff = 0;
3857
873k
    }
3858
3859
1.58M
    if (base_shift > 3) {
3860
711k
      a0_x = _mm_setzero_si128();
3861
711k
      a1_x = _mm_setzero_si128();
3862
711k
      shift = _mm_setzero_si128();
3863
873k
    } else {
3864
873k
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
3865
873k
      ydx = _mm_set1_epi16(y * dx);
3866
873k
      r6 = _mm_slli_epi16(c1234, 6);
3867
3868
873k
      if (upsample_above) {
3869
324k
        a0_x128 =
3870
324k
            _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
3871
324k
        a1_x128 = _mm_srli_si128(a0_x128, 8);
3872
3873
324k
        shift = _mm_srli_epi16(
3874
324k
            _mm_and_si128(
3875
324k
                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
3876
324k
            1);
3877
549k
      } else {
3878
549k
        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
3879
549k
        a1_x128 = _mm_srli_si128(a0_x128, 1);
3880
3881
549k
        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
3882
549k
      }
3883
873k
      a0_x = _mm_cvtepu8_epi16(a0_x128);
3884
873k
      a1_x = _mm_cvtepu8_epi16(a1_x128);
3885
873k
    }
3886
    // y calc
3887
1.58M
    __m128i a0_y, a1_y, shifty;
3888
1.58M
    if (base_x < min_base_x) {
3889
1.26M
      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
3890
1.26M
      __m128i y_c128, base_y_c128, mask128, c1234_;
3891
1.26M
      c1234_ = _mm_srli_si128(c1234, 2);
3892
1.26M
      r6 = _mm_set1_epi16(r << 6);
3893
1.26M
      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128));
3894
1.26M
      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
3895
1.26M
      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
3896
1.26M
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
3897
1.26M
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
3898
3899
1.26M
      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
3900
1.26M
                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
3901
1.26M
      base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4));
3902
1.26M
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
3903
1.26M
      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
3904
1.26M
                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
3905
3906
1.26M
      if (upsample_left) {
3907
521k
        shifty = _mm_srli_epi16(
3908
521k
            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
3909
748k
      } else {
3910
748k
        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
3911
748k
      }
3912
1.26M
      a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
3913
1.26M
      a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
3914
1.26M
      shift = _mm_unpacklo_epi64(shift, shifty);
3915
1.26M
    }
3916
3917
1.58M
    diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
3918
1.58M
    a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
3919
1.58M
    a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16
3920
3921
1.58M
    b = _mm_mullo_epi16(diff, shift);
3922
1.58M
    res = _mm_add_epi16(a32, b);
3923
1.58M
    res = _mm_srli_epi16(res, 5);
3924
3925
1.58M
    resx = _mm_packus_epi16(res, res);
3926
1.58M
    resy = _mm_srli_si128(resx, 4);
3927
3928
1.58M
    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
3929
1.58M
    *(int *)(dst) = _mm_cvtsi128_si32(resxy);
3930
1.58M
    dst += stride;
3931
1.58M
  }
3932
264k
}
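// Zone 2 mixes the top row and the left column per pixel. Below is a scalar
// sketch of the selection that this and the following zone-2 kernels
// vectorize; the helper name is an assumption, the upsampling paths are
// omitted, and it relies on the AV1 convention that above[-1] / left[-1]
// hold the top-left corner sample.
static void z2_pixel_sketch(uint8_t *dst, int stride, int bw, int bh,
                            const uint8_t *above, const uint8_t *left,
                            int dx, int dy) {
  for (int r = 0; r < bh; ++r) {
    for (int c = 0; c < bw; ++c) {
      const int x = (c << 6) - (r + 1) * dx;  // position along the top edge
      int v;
      if ((x >> 6) >= -1) {  // still covered by `above`
        const int base = x >> 6, shift = (x & 0x3f) >> 1;
        v = above[base] * 32 + 16 + (above[base + 1] - above[base]) * shift;
      } else {  // fall back to the left column
        const int y = (r << 6) - (c + 1) * dy;
        int base = y >> 6;
        if (base < -1) base = -1;  // mirrors the min_base_y clamp above
        const int shift = (y & 0x3f) >> 1;
        v = left[base] * 32 + 16 + (left[base + 1] - left[base]) * shift;
      }
      dst[r * stride + c] = (uint8_t)(v >> 5);
    }
  }
}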
3933
3934
static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3935
                                      const uint8_t *above, const uint8_t *left,
3936
                                      int upsample_above, int upsample_left,
3937
276k
                                      int dx, int dy) {
3938
276k
  const int min_base_x = -(1 << upsample_above);
3939
276k
  const int min_base_y = -(1 << upsample_left);
3940
276k
  const int frac_bits_x = 6 - upsample_above;
3941
276k
  const int frac_bits_y = 6 - upsample_left;
3942
3943
  // pre-filter above pixels
3944
  // store in temp buffers:
3945
  //   above[x] * 32 + 16
3946
  //   above[x+1] - above[x]
3947
  // final pixels will be calculated as:
3948
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3949
276k
  __m256i diff, a32, a16;
3950
276k
  __m256i a0_x, a1_x;
3951
276k
  __m128i a0_x128, a1_x128, min_base_y128, c3f;
3952
276k
  __m128i c1234, dy128;
3953
3954
276k
  a16 = _mm256_set1_epi16(16);
3955
276k
  c3f = _mm_set1_epi16(0x3f);
3956
276k
  min_base_y128 = _mm_set1_epi16(min_base_y);
3957
276k
  dy128 = _mm_set1_epi16(dy);
3958
276k
  c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
3959
3960
2.84M
  for (int r = 0; r < N; r++) {
3961
2.56M
    __m256i b, res, shift;
3962
2.56M
    __m128i resx, resy, resxy, r6, ydx;
3963
3964
2.56M
    int y = r + 1;
3965
2.56M
    int base_x = (-y * dx) >> frac_bits_x;
3966
2.56M
    int base_shift = 0;
3967
2.56M
    if (base_x < (min_base_x - 1)) {
3968
2.01M
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
3969
2.01M
    }
3970
2.56M
    int base_min_diff =
3971
2.56M
        (min_base_x - base_x + upsample_above) >> upsample_above;
3972
2.56M
    if (base_min_diff > 8) {
3973
1.21M
      base_min_diff = 8;
3974
1.35M
    } else {
3975
1.35M
      if (base_min_diff < 0) base_min_diff = 0;
3976
1.35M
    }
3977
3978
2.56M
    if (base_shift > 7) {
3979
1.21M
      a0_x = _mm256_setzero_si256();
3980
1.21M
      a1_x = _mm256_setzero_si256();
3981
1.21M
      shift = _mm256_setzero_si256();
3982
1.35M
    } else {
3983
1.35M
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
3984
1.35M
      ydx = _mm_set1_epi16(y * dx);
3985
1.35M
      r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
3986
1.35M
      if (upsample_above) {
3987
470k
        a0_x128 =
3988
470k
            _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
3989
470k
        a1_x128 = _mm_srli_si128(a0_x128, 8);
3990
3991
470k
        shift = _mm256_castsi128_si256(_mm_srli_epi16(
3992
470k
            _mm_and_si128(
3993
470k
                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
3994
470k
            1));
3995
882k
      } else {
3996
882k
        a1_x128 = _mm_srli_si128(a0_x128, 1);
3997
882k
        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
3998
882k
        a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
3999
4000
882k
        shift = _mm256_castsi128_si256(
4001
882k
            _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1));
4002
882k
      }
4003
1.35M
      a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128));
4004
1.35M
      a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128));
4005
1.35M
    }
4006
4007
    // y calc
4008
2.56M
    __m128i a0_y, a1_y, shifty;
4009
2.56M
    if (base_x < min_base_x) {
4010
2.21M
      DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
4011
2.21M
      __m128i y_c128, base_y_c128, mask128;
4012
2.21M
      r6 = _mm_set1_epi16(r << 6);
4013
2.21M
      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
4014
2.21M
      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
4015
2.21M
      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
4016
2.21M
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
4017
2.21M
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
4018
4019
2.21M
      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
4020
2.21M
                            left[base_y_c[2]], left[base_y_c[3]],
4021
2.21M
                            left[base_y_c[4]], left[base_y_c[5]],
4022
2.21M
                            left[base_y_c[6]], left[base_y_c[7]]);
4023
2.21M
      base_y_c128 = _mm_add_epi16(
4024
2.21M
          base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4));
4025
2.21M
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
4026
4027
2.21M
      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
4028
2.21M
                            left[base_y_c[2]], left[base_y_c[3]],
4029
2.21M
                            left[base_y_c[4]], left[base_y_c[5]],
4030
2.21M
                            left[base_y_c[6]], left[base_y_c[7]]);
4031
4032
2.21M
      if (upsample_left) {
4033
689k
        shifty = _mm_srli_epi16(
4034
689k
            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
4035
1.52M
      } else {
4036
1.52M
        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
4037
1.52M
      }
4038
4039
2.21M
      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
4040
2.21M
      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
4041
2.21M
      shift = _mm256_inserti128_si256(shift, shifty, 1);
4042
2.21M
    }
4043
4044
2.56M
    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
4045
2.56M
    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
4046
2.56M
    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
4047
4048
2.56M
    b = _mm256_mullo_epi16(diff, shift);
4049
2.56M
    res = _mm256_add_epi16(a32, b);
4050
2.56M
    res = _mm256_srli_epi16(res, 5);
4051
4052
2.56M
    resx = _mm_packus_epi16(_mm256_castsi256_si128(res),
4053
2.56M
                            _mm256_castsi256_si128(res));
4054
2.56M
    resy = _mm256_extracti128_si256(res, 1);
4055
2.56M
    resy = _mm_packus_epi16(resy, resy);
4056
4057
2.56M
    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
4058
2.56M
    _mm_storel_epi64((__m128i *)(dst), resxy);
4059
2.56M
    dst += stride;
4060
2.56M
  }
4061
276k
}
4062
4063
static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst,
4064
                                      ptrdiff_t stride, const uint8_t *above,
4065
                                      const uint8_t *left, int upsample_above,
4066
461k
                                      int upsample_left, int dx, int dy) {
4067
  // here upsample_above and upsample_left are 0 by design of
4068
  // av1_use_intra_edge_upsample
4069
461k
  const int min_base_x = -1;
4070
461k
  const int min_base_y = -1;
4071
461k
  (void)upsample_above;
4072
461k
  (void)upsample_left;
4073
461k
  const int frac_bits_x = 6;
4074
461k
  const int frac_bits_y = 6;
4075
4076
461k
  __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123;
4077
461k
  __m256i diff, min_base_y256, c3f, shifty, dy256, c1;
4078
461k
  __m128i a0_x128, a1_x128;
4079
4080
461k
  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
4081
461k
  a16 = _mm256_set1_epi16(16);
4082
461k
  c1 = _mm256_srli_epi16(a16, 4);
4083
461k
  min_base_y256 = _mm256_set1_epi16(min_base_y);
4084
461k
  c3f = _mm256_set1_epi16(0x3f);
4085
461k
  dy256 = _mm256_set1_epi16(dy);
4086
461k
  c0123 =
4087
461k
      _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
4088
461k
  c1234 = _mm256_add_epi16(c0123, c1);
4089
4090
8.59M
  for (int r = 0; r < H; r++) {
4091
8.13M
    __m256i b, res, shift, j256, r6, ydx;
4092
8.13M
    __m128i resx, resy;
4093
8.13M
    __m128i resxy;
4094
8.13M
    int y = r + 1;
4095
8.13M
    ydx = _mm256_set1_epi16((int16_t)(y * dx));
4096
4097
8.13M
    int base_x = (-y * dx) >> frac_bits_x;
4098
23.0M
    for (int j = 0; j < W; j += 16) {
4099
14.9M
      j256 = _mm256_set1_epi16(j);
4100
14.9M
      int base_shift = 0;
4101
14.9M
      if ((base_x + j) < (min_base_x - 1)) {
4102
11.4M
        base_shift = (min_base_x - (base_x + j) - 1);
4103
11.4M
      }
4104
14.9M
      int base_min_diff = (min_base_x - base_x - j);
4105
14.9M
      if (base_min_diff > 16) {
4106
8.74M
        base_min_diff = 16;
4107
8.74M
      } else {
4108
6.21M
        if (base_min_diff < 0) base_min_diff = 0;
4109
6.21M
      }
4110
4111
14.9M
      if (base_shift < 16) {
4112
6.21M
        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
4113
6.21M
        a1_x128 =
4114
6.21M
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
4115
6.21M
        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
4116
6.21M
        a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
4117
4118
6.21M
        a0_x = _mm256_cvtepu8_epi16(a0_x128);
4119
6.21M
        a1_x = _mm256_cvtepu8_epi16(a1_x128);
4120
4121
6.21M
        r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
4122
6.21M
        shift = _mm256_srli_epi16(
4123
6.21M
            _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
4124
4125
6.21M
        diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
4126
6.21M
        a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
4127
6.21M
        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
4128
4129
6.21M
        b = _mm256_mullo_epi16(diff, shift);
4130
6.21M
        res = _mm256_add_epi16(a32, b);
4131
6.21M
        res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
4132
6.21M
        resx = _mm256_castsi256_si128(_mm256_packus_epi16(
4133
6.21M
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
4134
8.74M
      } else {
4135
8.74M
        resx = _mm_setzero_si128();
4136
8.74M
      }
4137
4138
      // y calc
4139
14.9M
      if (base_x < min_base_x) {
4140
14.0M
        __m256i c256, y_c256, base_y_c256, mask256, mul16;
4141
14.0M
        r6 = _mm256_set1_epi16(r << 6);
4142
14.0M
        c256 = _mm256_add_epi16(j256, c1234);
4143
14.0M
        mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
4144
14.0M
                                 _mm256_srli_epi16(min_base_y256, 1));
4145
14.0M
        y_c256 = _mm256_sub_epi16(r6, mul16);
4146
4147
14.0M
        base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
4148
14.0M
        mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
4149
4150
14.0M
        base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256);
4151
14.0M
        int16_t min_y = (int16_t)_mm_extract_epi16(
4152
14.0M
            _mm256_extracti128_si256(base_y_c256, 1), 7);
4153
14.0M
        int16_t max_y =
4154
14.0M
            (int16_t)_mm_extract_epi16(_mm256_castsi256_si128(base_y_c256), 0);
4155
14.0M
        int16_t offset_diff = max_y - min_y;
4156
4157
14.0M
        if (offset_diff < 16) {
4158
13.4M
          __m256i min_y256 = _mm256_set1_epi16(min_y);
4159
4160
13.4M
          __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256);
4161
13.4M
          __m128i base_y_offset128 =
4162
13.4M
              _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0),
4163
13.4M
                              _mm256_extracti128_si256(base_y_offset, 1));
4164
4165
13.4M
          __m128i a0_y128 = _mm_maskload_epi32(
4166
13.4M
              (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]);
4167
13.4M
          __m128i a1_y128 =
4168
13.4M
              _mm_maskload_epi32((int *)(left + min_y + 1),
4169
13.4M
                                 *(__m128i *)LoadMaskz2[offset_diff / 4]);
4170
13.4M
          a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128);
4171
13.4M
          a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128);
4172
13.4M
          a0_y = _mm256_cvtepu8_epi16(a0_y128);
4173
13.4M
          a1_y = _mm256_cvtepu8_epi16(a1_y128);
4174
13.4M
        } else {
4175
647k
          base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
4176
647k
          _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
4177
4178
647k
          a0_y = _mm256_setr_epi16(
4179
647k
              left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
4180
647k
              left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
4181
647k
              left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
4182
647k
              left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
4183
647k
              left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
4184
647k
              left[base_y_c[15]]);
4185
647k
          base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
4186
647k
          _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
4187
4188
647k
          a1_y = _mm256_setr_epi16(
4189
647k
              left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
4190
647k
              left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
4191
647k
              left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
4192
647k
              left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
4193
647k
              left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
4194
647k
              left[base_y_c[15]]);
4195
647k
        }
4196
14.0M
        shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
4197
4198
14.0M
        diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
4199
14.0M
        a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
4200
14.0M
        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
4201
4202
14.0M
        b = _mm256_mullo_epi16(diff, shifty);
4203
14.0M
        res = _mm256_add_epi16(a32, b);
4204
14.0M
        res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
4205
14.0M
        resy = _mm256_castsi256_si128(_mm256_packus_epi16(
4206
14.0M
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
4207
14.0M
      } else {
4208
882k
        resy = _mm_setzero_si128();
4209
882k
      }
4210
14.9M
      resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
4211
14.9M
      _mm_storeu_si128((__m128i *)(dst + j), resxy);
4212
14.9M
    }  // for j
4213
8.13M
    dst += stride;
4214
8.13M
  }
4215
461k
}
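// The offset_diff < 16 branch above avoids 16 scalar gathers from `left`:
// when the 16 base_y indices of a lane group span fewer than 16 bytes, two
// masked 128-bit loads starting at left[min_y] plus a byte shuffle by
// (base_y - min_y) reproduce the same gather. A scalar sketch of what that
// shuffle computes (lane index k is an illustrative name):
//
//   for (int k = 0; k < 16; ++k) {
//     a0_y[k] = left[min_y + base_y_offset[k]];      // == left[base_y_c[k]]
//     a1_y[k] = left[min_y + base_y_offset[k] + 1];
//   }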
4216
4217
// Directional prediction, zone 2: 90 < angle < 180
4218
void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
4219
                               const uint8_t *above, const uint8_t *left,
4220
                               int upsample_above, int upsample_left, int dx,
4221
1.00M
                               int dy) {
4222
1.00M
  assert(dx > 0);
4223
0
  assert(dy > 0);
4224
0
  switch (bw) {
4225
264k
    case 4:
4226
264k
      dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above,
4227
264k
                                upsample_left, dx, dy);
4228
264k
      break;
4229
276k
    case 8:
4230
276k
      dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above,
4231
276k
                                upsample_left, dx, dy);
4232
276k
      break;
4233
461k
    default:
4234
461k
      dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
4235
461k
                                upsample_above, upsample_left, dx, dy);
4236
461k
      break;
4237
1.00M
  }
4238
1.00M
  return;
4239
1.00M
}
4240
4241
// z3 functions
4242
204k
static INLINE void transpose16x32_avx2(__m256i *x, __m256i *d) {
4243
204k
  __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
4244
204k
  __m256i w10, w11, w12, w13, w14, w15;
4245
4246
204k
  w0 = _mm256_unpacklo_epi8(x[0], x[1]);
4247
204k
  w1 = _mm256_unpacklo_epi8(x[2], x[3]);
4248
204k
  w2 = _mm256_unpacklo_epi8(x[4], x[5]);
4249
204k
  w3 = _mm256_unpacklo_epi8(x[6], x[7]);
4250
4251
204k
  w8 = _mm256_unpacklo_epi8(x[8], x[9]);
4252
204k
  w9 = _mm256_unpacklo_epi8(x[10], x[11]);
4253
204k
  w10 = _mm256_unpacklo_epi8(x[12], x[13]);
4254
204k
  w11 = _mm256_unpacklo_epi8(x[14], x[15]);
4255
4256
204k
  w4 = _mm256_unpacklo_epi16(w0, w1);
4257
204k
  w5 = _mm256_unpacklo_epi16(w2, w3);
4258
204k
  w12 = _mm256_unpacklo_epi16(w8, w9);
4259
204k
  w13 = _mm256_unpacklo_epi16(w10, w11);
4260
4261
204k
  w6 = _mm256_unpacklo_epi32(w4, w5);
4262
204k
  w7 = _mm256_unpackhi_epi32(w4, w5);
4263
204k
  w14 = _mm256_unpacklo_epi32(w12, w13);
4264
204k
  w15 = _mm256_unpackhi_epi32(w12, w13);
4265
4266
  // Store first 4-line result
4267
204k
  d[0] = _mm256_unpacklo_epi64(w6, w14);
4268
204k
  d[1] = _mm256_unpackhi_epi64(w6, w14);
4269
204k
  d[2] = _mm256_unpacklo_epi64(w7, w15);
4270
204k
  d[3] = _mm256_unpackhi_epi64(w7, w15);
4271
4272
204k
  w4 = _mm256_unpackhi_epi16(w0, w1);
4273
204k
  w5 = _mm256_unpackhi_epi16(w2, w3);
4274
204k
  w12 = _mm256_unpackhi_epi16(w8, w9);
4275
204k
  w13 = _mm256_unpackhi_epi16(w10, w11);
4276
4277
204k
  w6 = _mm256_unpacklo_epi32(w4, w5);
4278
204k
  w7 = _mm256_unpackhi_epi32(w4, w5);
4279
204k
  w14 = _mm256_unpacklo_epi32(w12, w13);
4280
204k
  w15 = _mm256_unpackhi_epi32(w12, w13);
4281
4282
  // Store second 4-line result
4283
204k
  d[4] = _mm256_unpacklo_epi64(w6, w14);
4284
204k
  d[5] = _mm256_unpackhi_epi64(w6, w14);
4285
204k
  d[6] = _mm256_unpacklo_epi64(w7, w15);
4286
204k
  d[7] = _mm256_unpackhi_epi64(w7, w15);
4287
4288
  // upper half
4289
204k
  w0 = _mm256_unpackhi_epi8(x[0], x[1]);
4290
204k
  w1 = _mm256_unpackhi_epi8(x[2], x[3]);
4291
204k
  w2 = _mm256_unpackhi_epi8(x[4], x[5]);
4292
204k
  w3 = _mm256_unpackhi_epi8(x[6], x[7]);
4293
4294
204k
  w8 = _mm256_unpackhi_epi8(x[8], x[9]);
4295
204k
  w9 = _mm256_unpackhi_epi8(x[10], x[11]);
4296
204k
  w10 = _mm256_unpackhi_epi8(x[12], x[13]);
4297
204k
  w11 = _mm256_unpackhi_epi8(x[14], x[15]);
4298
4299
204k
  w4 = _mm256_unpacklo_epi16(w0, w1);
4300
204k
  w5 = _mm256_unpacklo_epi16(w2, w3);
4301
204k
  w12 = _mm256_unpacklo_epi16(w8, w9);
4302
204k
  w13 = _mm256_unpacklo_epi16(w10, w11);
4303
4304
204k
  w6 = _mm256_unpacklo_epi32(w4, w5);
4305
204k
  w7 = _mm256_unpackhi_epi32(w4, w5);
4306
204k
  w14 = _mm256_unpacklo_epi32(w12, w13);
4307
204k
  w15 = _mm256_unpackhi_epi32(w12, w13);
4308
4309
  // Store first 4-line result
4310
204k
  d[8] = _mm256_unpacklo_epi64(w6, w14);
4311
204k
  d[9] = _mm256_unpackhi_epi64(w6, w14);
4312
204k
  d[10] = _mm256_unpacklo_epi64(w7, w15);
4313
204k
  d[11] = _mm256_unpackhi_epi64(w7, w15);
4314
4315
204k
  w4 = _mm256_unpackhi_epi16(w0, w1);
4316
204k
  w5 = _mm256_unpackhi_epi16(w2, w3);
4317
204k
  w12 = _mm256_unpackhi_epi16(w8, w9);
4318
204k
  w13 = _mm256_unpackhi_epi16(w10, w11);
4319
4320
204k
  w6 = _mm256_unpacklo_epi32(w4, w5);
4321
204k
  w7 = _mm256_unpackhi_epi32(w4, w5);
4322
204k
  w14 = _mm256_unpacklo_epi32(w12, w13);
4323
204k
  w15 = _mm256_unpackhi_epi32(w12, w13);
4324
4325
  // Store second 4-line result
4326
204k
  d[12] = _mm256_unpacklo_epi64(w6, w14);
4327
204k
  d[13] = _mm256_unpackhi_epi64(w6, w14);
4328
204k
  d[14] = _mm256_unpacklo_epi64(w7, w15);
4329
204k
  d[15] = _mm256_unpackhi_epi64(w7, w15);
4330
204k
}
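// Layout note for the 16x32 transpose above: each d[j] packs two transposed
// rows, row j in its low 128-bit lane and row j + 16 in its high lane, which
// is how the zone-3 store loops below split it with _mm256_castsi256_si128()
// and _mm256_extracti128_si256(..., 1).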
4331
4332
static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride,
4333
                                      const uint8_t *left, int upsample_left,
4334
115k
                                      int dy) {
4335
115k
  __m128i dstvec[4], d[4];
4336
4337
115k
  dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy);
4338
115k
  transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
4339
115k
                            &d[0], &d[1], &d[2], &d[3]);
4340
4341
115k
  *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
4342
115k
  *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
4343
115k
  *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
4344
115k
  *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
4345
115k
  return;
4346
115k
}
4347
4348
static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride,
4349
                                      const uint8_t *left, int upsample_left,
4350
121k
                                      int dy) {
4351
121k
  __m128i dstvec[8], d[8];
4352
4353
121k
  dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy);
4354
121k
  transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
4355
121k
                    &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
4356
121k
                    &d[3]);
4357
4358
121k
  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
4359
121k
  _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
4360
121k
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
4361
121k
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
4362
121k
  _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
4363
121k
  _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
4364
121k
  _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
4365
121k
  _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
4366
121k
}
4367
4368
static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride,
4369
                                      const uint8_t *left, int upsample_left,
4370
31.0k
                                      int dy) {
4371
31.0k
  __m128i dstvec[4], d[8];
4372
4373
31.0k
  dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy);
4374
31.0k
  transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
4375
31.0k
                        &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
4376
279k
  for (int i = 0; i < 8; i++) {
4377
248k
    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
4378
248k
  }
4379
31.0k
}
4380
4381
static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride,
4382
                                      const uint8_t *left, int upsample_left,
4383
56.9k
                                      int dy) {
4384
56.9k
  __m128i dstvec[8], d[4];
4385
4386
56.9k
  dr_prediction_z1_HxW_internal_avx2(4, 8, dstvec, left, upsample_left, dy);
4387
56.9k
  transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
4388
56.9k
                        &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
4389
56.9k
                        &d[1], &d[2], &d[3]);
4390
56.9k
  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
4391
56.9k
  _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
4392
56.9k
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
4393
56.9k
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
4394
56.9k
}
4395
4396
static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride,
4397
                                       const uint8_t *left, int upsample_left,
4398
27.1k
                                       int dy) {
4399
27.1k
  __m128i dstvec[8], d[8];
4400
4401
27.1k
  dr_prediction_z1_HxW_internal_avx2(16, 8, dstvec, left, upsample_left, dy);
4402
27.1k
  transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
4403
27.1k
                          dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
4404
27.1k
                          d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
4405
244k
  for (int i = 0; i < 8; i++) {
4406
217k
    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
4407
217k
    _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
4408
217k
                     _mm_srli_si128(d[i], 8));
4409
217k
  }
4410
27.1k
}
4411
4412
static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
4413
                                       const uint8_t *left, int upsample_left,
4414
62.8k
                                       int dy) {
4415
62.8k
  __m128i dstvec[16], d[16];
4416
4417
62.8k
  dr_prediction_z1_HxW_internal_avx2(8, 16, dstvec, left, upsample_left, dy);
4418
62.8k
  transpose16x8_8x16_sse2(
4419
62.8k
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
4420
62.8k
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
4421
62.8k
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
4422
62.8k
      &d[3], &d[4], &d[5], &d[6], &d[7]);
4423
4424
565k
  for (int i = 0; i < 8; i++) {
4425
502k
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
4426
502k
  }
4427
62.8k
}
4428
4429
static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride,
4430
                                       const uint8_t *left, int upsample_left,
4431
18.6k
                                       int dy) {
4432
18.6k
  __m128i dstvec[4], d[16];
4433
4434
18.6k
  dr_prediction_z1_HxW_internal_avx2(16, 4, dstvec, left, upsample_left, dy);
4435
18.6k
  transpose4x16_sse2(dstvec, d);
4436
317k
  for (int i = 0; i < 16; i++) {
4437
298k
    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
4438
298k
  }
4439
18.6k
}
4440
4441
static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride,
4442
                                       const uint8_t *left, int upsample_left,
4443
71.8k
                                       int dy) {
4444
71.8k
  __m128i dstvec[16], d[8];
4445
4446
71.8k
  dr_prediction_z1_HxW_internal_avx2(4, 16, dstvec, left, upsample_left, dy);
4447
359k
  for (int i = 4; i < 8; i++) {
4448
287k
    d[i] = _mm_setzero_si128();
4449
287k
  }
4450
71.8k
  transpose16x8_8x16_sse2(
4451
71.8k
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
4452
71.8k
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
4453
71.8k
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
4454
71.8k
      &d[3], &d[4], &d[5], &d[6], &d[7]);
4455
4456
359k
  for (int i = 0; i < 4; i++) {
4457
287k
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
4458
287k
  }
4459
71.8k
}
4460
4461
static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride,
4462
                                       const uint8_t *left, int upsample_left,
4463
12.0k
                                       int dy) {
4464
12.0k
  __m256i dstvec[16], d[16];
4465
4466
12.0k
  dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy);
4467
108k
  for (int i = 8; i < 16; i++) {
4468
96.3k
    dstvec[i] = _mm256_setzero_si256();
4469
96.3k
  }
4470
12.0k
  transpose16x32_avx2(dstvec, d);
4471
4472
204k
  for (int i = 0; i < 16; i++) {
4473
192k
    _mm_storel_epi64((__m128i *)(dst + i * stride),
4474
192k
                     _mm256_castsi256_si128(d[i]));
4475
192k
  }
4476
204k
  for (int i = 0; i < 16; i++) {
4477
192k
    _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride),
4478
192k
                     _mm256_extracti128_si256(d[i], 1));
4479
192k
  }
4480
12.0k
}
4481
4482
static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride,
4483
                                       const uint8_t *left, int upsample_left,
4484
59.0k
                                       int dy) {
4485
59.0k
  __m128i dstvec[32], d[16];
4486
4487
59.0k
  dr_prediction_z1_HxW_internal_avx2(8, 32, dstvec, left, upsample_left, dy);
4488
4489
59.0k
  transpose16x8_8x16_sse2(
4490
59.0k
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
4491
59.0k
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
4492
59.0k
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
4493
59.0k
      &d[3], &d[4], &d[5], &d[6], &d[7]);
4494
59.0k
  transpose16x8_8x16_sse2(
4495
59.0k
      &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
4496
59.0k
      &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
4497
59.0k
      &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
4498
59.0k
      &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
4499
59.0k
      &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
4500
59.0k
      &d[6 + 8], &d[7 + 8]);
4501
4502
531k
  for (int i = 0; i < 8; i++) {
4503
472k
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
4504
472k
    _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
4505
472k
  }
4506
59.0k
}
4507
4508
static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
4509
                                        const uint8_t *left, int upsample_left,
4510
112k
                                        int dy) {
4511
112k
  __m128i dstvec[16], d[16];
4512
4513
112k
  dr_prediction_z1_HxW_internal_avx2(16, 16, dstvec, left, upsample_left, dy);
4514
112k
  transpose16x16_sse2(dstvec, d);
4515
4516
1.91M
  for (int i = 0; i < 16; i++) {
4517
1.80M
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
4518
1.80M
  }
4519
112k
}
4520
4521
static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
4522
                                        const uint8_t *left, int upsample_left,
4523
85.0k
                                        int dy) {
4524
85.0k
  __m256i dstvec[32], d[32];
4525
4526
85.0k
  dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy);
4527
85.0k
  transpose16x32_avx2(dstvec, d);
4528
85.0k
  transpose16x32_avx2(dstvec + 16, d + 16);
4529
1.44M
  for (int j = 0; j < 16; j++) {
4530
1.36M
    _mm_storeu_si128((__m128i *)(dst + j * stride),
4531
1.36M
                     _mm256_castsi256_si128(d[j]));
4532
1.36M
    _mm_storeu_si128((__m128i *)(dst + j * stride + 16),
4533
1.36M
                     _mm256_castsi256_si128(d[j + 16]));
4534
1.36M
  }
4535
1.44M
  for (int j = 0; j < 16; j++) {
4536
1.36M
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
4537
1.36M
                     _mm256_extracti128_si256(d[j], 1));
4538
1.36M
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16),
4539
1.36M
                     _mm256_extracti128_si256(d[j + 16], 1));
4540
1.36M
  }
4541
85.0k
}
4542
4543
static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
4544
                                        const uint8_t *left, int upsample_left,
4545
20.3k
                                        int dy) {
4546
20.3k
  DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
4547
20.3k
  dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
4548
20.3k
  transpose(dstT, 64, dst, stride, 64, 64);
4549
20.3k
}
4550
4551
static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
4552
                                        const uint8_t *left, int upsample_left,
4553
22.0k
                                        int dy) {
4554
22.0k
  __m256i dstvec[16], d[16];
4555
4556
22.0k
  dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy);
4557
22.0k
  transpose16x32_avx2(dstvec, d);
4558
  // store
4559
375k
  for (int j = 0; j < 16; j++) {
4560
353k
    _mm_storeu_si128((__m128i *)(dst + j * stride),
4561
353k
                     _mm256_castsi256_si128(d[j]));
4562
353k
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
4563
353k
                     _mm256_extracti128_si256(d[j], 1));
4564
353k
  }
4565
22.0k
}
4566
4567
static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
4568
                                        const uint8_t *left, int upsample_left,
4569
22.4k
                                        int dy) {
4570
22.4k
  __m128i dstvec[32], d[16];
4571
4572
22.4k
  dr_prediction_z1_HxW_internal_avx2(16, 32, dstvec, left, upsample_left, dy);
4573
67.3k
  for (int i = 0; i < 32; i += 16) {
4574
44.9k
    transpose16x16_sse2((dstvec + i), d);
4575
763k
    for (int j = 0; j < 16; j++) {
4576
718k
      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
4577
718k
    }
4578
44.9k
  }
4579
22.4k
}
4580
4581
static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
4582
                                        const uint8_t *left, int upsample_left,
4583
1.69k
                                        int dy) {
4584
1.69k
  uint8_t dstT[64 * 32];
4585
1.69k
  dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
4586
1.69k
  transpose(dstT, 64, dst, stride, 32, 64);
4587
1.69k
}
4588
4589
static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
4590
                                        const uint8_t *left, int upsample_left,
4591
3.77k
                                        int dy) {
4592
3.77k
  uint8_t dstT[32 * 64];
4593
3.77k
  dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
4594
3.77k
  transpose(dstT, 32, dst, stride, 64, 32);
4595
3.77k
  return;
4596
3.77k
}
4597
4598
static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
4599
                                        const uint8_t *left, int upsample_left,
4600
3.07k
                                        int dy) {
4601
3.07k
  uint8_t dstT[64 * 16];
4602
3.07k
  dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
4603
3.07k
  transpose(dstT, 64, dst, stride, 16, 64);
4604
3.07k
}
4605
4606
static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
4607
                                        const uint8_t *left, int upsample_left,
4608
23.2k
                                        int dy) {
4609
23.2k
  __m128i dstvec[64], d[16];
4610
4611
23.2k
  dr_prediction_z1_HxW_internal_avx2(16, 64, dstvec, left, upsample_left, dy);
4612
116k
  for (int i = 0; i < 64; i += 16) {
4613
92.8k
    transpose16x16_sse2((dstvec + i), d);
4614
1.57M
    for (int j = 0; j < 16; j++) {
4615
1.48M
      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
4616
1.48M
    }
4617
92.8k
  }
4618
23.2k
}
4619
4620
void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
4621
                               const uint8_t *above, const uint8_t *left,
4622
869k
                               int upsample_left, int dx, int dy) {
4623
869k
  (void)above;
4624
869k
  (void)dx;
4625
869k
  assert(dx == 1);
4626
0
  assert(dy > 0);
4627
4628
869k
  if (bw == bh) {
4629
454k
    switch (bw) {
4630
115k
      case 4:
4631
115k
        dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
4632
115k
        break;
4633
121k
      case 8:
4634
121k
        dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
4635
121k
        break;
4636
112k
      case 16:
4637
112k
        dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy);
4638
112k
        break;
4639
85.0k
      case 32:
4640
85.0k
        dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy);
4641
85.0k
        break;
4642
20.3k
      case 64:
4643
20.3k
        dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy);
4644
20.3k
        break;
4645
454k
    }
4646
454k
  } else {
4647
415k
    if (bw < bh) {
4648
115k
      if (bw + bw == bh) {
4649
81.8k
        switch (bw) {
4650
31.0k
          case 4:
4651
31.0k
            dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy);
4652
31.0k
            break;
4653
27.1k
          case 8:
4654
27.1k
            dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy);
4655
27.1k
            break;
4656
22.0k
          case 16:
4657
22.0k
            dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy);
4658
22.0k
            break;
4659
1.69k
          case 32:
4660
1.69k
            dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy);
4661
1.69k
            break;
4662
81.8k
        }
4663
81.8k
      } else {
4664
33.7k
        switch (bw) {
4665
18.6k
          case 4:
4666
18.6k
            dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy);
4667
18.6k
            break;
4668
12.0k
          case 8:
4669
12.0k
            dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy);
4670
12.0k
            break;
4671
3.07k
          case 16:
4672
3.07k
            dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy);
4673
3.07k
            break;
4674
33.7k
        }
4675
33.7k
      }
4676
300k
    } else {
4677
300k
      if (bh + bh == bw) {
4678
146k
        switch (bh) {
4679
56.9k
          case 4:
4680
56.9k
            dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy);
4681
56.9k
            break;
4682
62.8k
          case 8:
4683
62.8k
            dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy);
4684
62.8k
            break;
4685
22.4k
          case 16:
4686
22.4k
            dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy);
4687
22.4k
            break;
4688
3.77k
          case 32:
4689
3.77k
            dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy);
4690
3.77k
            break;
4691
146k
        }
4692
153k
      } else {
4693
153k
        switch (bh) {
4694
71.8k
          case 4:
4695
71.8k
            dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy);
4696
71.8k
            break;
4697
59.0k
          case 8:
4698
59.0k
            dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy);
4699
59.0k
            break;
4700
23.2k
          case 16:
4701
23.2k
            dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy);
4702
23.2k
            break;
4703
153k
        }
4704
153k
      }
4705
300k
    }
4706
415k
  }
4707
869k
}
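// Zone 3 (180 < angle < 270) reuses the zone-1 kernels throughout: each case
// above runs a zone-1 prediction over `left` with the block's width and
// height swapped, then transposes the result into dst, either with the
// register transposes or, for the tall shapes, through a stack buffer and
// transpose(). A scalar sketch of that equivalence (dr_z1_sketch and tmp are
// illustrative names, not functions in this file):
//
//   uint8_t tmp[64 * 64];
//   dr_z1_sketch(tmp, /*stride=*/bh, /*bw=*/bh, /*bh=*/bw, left, dy);
//   for (int r = 0; r < bh; ++r)
//     for (int c = 0; c < bw; ++c) dst[r * stride + c] = tmp[c * bh + r];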