Coverage Report

Created: 2025-06-13 07:07

/src/aom/aom_dsp/x86/intrapred_avx2.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <immintrin.h>
13
14
#include "config/av1_rtcd.h"
15
#include "aom_dsp/x86/intrapred_x86.h"
16
#include "aom_dsp/x86/intrapred_utils.h"
17
#include "aom_dsp/x86/lpf_common_sse2.h"
18
19
317k
static inline __m256i dc_sum_64(const uint8_t *ref) {
20
317k
  const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
21
317k
  const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
22
317k
  const __m256i zero = _mm256_setzero_si256();
23
317k
  __m256i y0 = _mm256_sad_epu8(x0, zero);
24
317k
  __m256i y1 = _mm256_sad_epu8(x1, zero);
25
317k
  y0 = _mm256_add_epi64(y0, y1);
26
317k
  __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
27
317k
  y0 = _mm256_add_epi64(u0, y0);
28
317k
  u0 = _mm256_unpackhi_epi64(y0, y0);
29
317k
  return _mm256_add_epi16(y0, u0);
30
317k
}
31
32
2.02M
static inline __m256i dc_sum_32(const uint8_t *ref) {
33
2.02M
  const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
34
2.02M
  const __m256i zero = _mm256_setzero_si256();
35
2.02M
  __m256i y = _mm256_sad_epu8(x, zero);
36
2.02M
  __m256i u = _mm256_permute2x128_si256(y, y, 1);
37
2.02M
  y = _mm256_add_epi64(u, y);
38
2.02M
  u = _mm256_unpackhi_epi64(y, y);
39
2.02M
  return _mm256_add_epi16(y, u);
40
2.02M
}
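The two helpers above reduce 64 and 32 reference bytes to a single sum: _mm256_sad_epu8 against zero yields per-64-bit-lane byte sums, and the permute/unpack steps fold the lanes together. A scalar sketch of the same reduction (illustration only, not part of this file):

#include <stdint.h>

static uint32_t dc_sum_32_scalar(const uint8_t *ref) {
  uint32_t sum = 0;
  for (int i = 0; i < 32; ++i) sum += ref[i];  // what the SAD + lane folds compute
  return sum;
}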
41
42
static inline void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
43
1.31M
                                  ptrdiff_t stride) {
44
41.0M
  for (int i = 0; i < height; ++i) {
45
39.7M
    _mm256_storeu_si256((__m256i *)dst, *r);
46
39.7M
    dst += stride;
47
39.7M
  }
48
1.31M
}
49
50
static inline void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
51
                                    int height, uint8_t *dst,
52
4.45k
                                    ptrdiff_t stride) {
53
207k
  for (int i = 0; i < height; ++i) {
54
203k
    _mm256_storeu_si256((__m256i *)dst, *r0);
55
203k
    _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
56
203k
    dst += stride;
57
203k
  }
58
4.45k
}
59
60
static inline void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
61
219k
                                  ptrdiff_t stride) {
62
10.8M
  for (int i = 0; i < height; ++i) {
63
10.6M
    _mm256_storeu_si256((__m256i *)dst, *r);
64
10.6M
    _mm256_storeu_si256((__m256i *)(dst + 32), *r);
65
10.6M
    dst += stride;
66
10.6M
  }
67
219k
}
68
69
#if CONFIG_AV1_HIGHBITDEPTH
70
static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = {
71
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
72
  { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
73
  { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
74
  { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
75
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 },
76
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 },
77
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 },
78
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
79
};
80
81
static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx4[4][16]) = {
82
  { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 },
83
  { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 },
84
  { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 },
85
  { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 }
86
};
87
88
static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx[8][32]) = {
89
  { 0, 1, 4, 5, 8,  9,  12, 13, 16, 17, 20, 21, 24, 25, 28, 29,
90
    2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
91
  { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27,
92
    0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
93
  { 0, 1, 0, 1, 4, 5, 8,  9,  12, 13, 16, 17, 20, 21, 24, 25,
94
    0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27 },
95
  { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23,
96
    0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25 },
97
  { 0, 1, 0, 1, 0, 1, 0, 1, 8,  9,  12, 13, 16, 17, 20, 21,
98
    0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 22, 23 },
99
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19,
100
    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 20, 21 },
101
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17,
102
    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 18, 19 },
103
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15,
104
    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 17 }
105
};
106
107
static DECLARE_ALIGNED(32, uint16_t, HighbdBaseMask[17][16]) = {
108
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
109
  { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
110
  { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
111
  { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
112
  { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
113
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
114
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
115
    0 },
116
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0,
117
    0, 0 },
118
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0,
119
    0, 0, 0, 0 },
120
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0,
121
    0, 0, 0, 0, 0, 0 },
122
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
123
    0xffff, 0, 0, 0, 0, 0, 0 },
124
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
125
    0xffff, 0xffff, 0, 0, 0, 0, 0 },
126
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
127
    0xffff, 0xffff, 0xffff, 0, 0, 0, 0 },
128
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
129
    0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 },
130
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
131
    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 },
132
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
133
    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 },
134
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
135
    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff }
136
};
137
138
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
139
63.8k
static inline void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) {
140
63.8k
  __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
141
142
63.8k
  r0 = _mm_unpacklo_epi16(x[0], x[1]);
143
63.8k
  r1 = _mm_unpacklo_epi16(x[2], x[3]);
144
63.8k
  r2 = _mm_unpacklo_epi16(x[4], x[5]);
145
63.8k
  r3 = _mm_unpacklo_epi16(x[6], x[7]);
146
147
63.8k
  r4 = _mm_unpacklo_epi16(x[8], x[9]);
148
63.8k
  r5 = _mm_unpacklo_epi16(x[10], x[11]);
149
63.8k
  r6 = _mm_unpacklo_epi16(x[12], x[13]);
150
63.8k
  r7 = _mm_unpacklo_epi16(x[14], x[15]);
151
152
63.8k
  r8 = _mm_unpacklo_epi32(r0, r1);
153
63.8k
  r9 = _mm_unpackhi_epi32(r0, r1);
154
63.8k
  r10 = _mm_unpacklo_epi32(r2, r3);
155
63.8k
  r11 = _mm_unpackhi_epi32(r2, r3);
156
157
63.8k
  r12 = _mm_unpacklo_epi32(r4, r5);
158
63.8k
  r13 = _mm_unpackhi_epi32(r4, r5);
159
63.8k
  r14 = _mm_unpacklo_epi32(r6, r7);
160
63.8k
  r15 = _mm_unpackhi_epi32(r6, r7);
161
162
63.8k
  r0 = _mm_unpacklo_epi64(r8, r9);
163
63.8k
  r1 = _mm_unpackhi_epi64(r8, r9);
164
63.8k
  r2 = _mm_unpacklo_epi64(r10, r11);
165
63.8k
  r3 = _mm_unpackhi_epi64(r10, r11);
166
167
63.8k
  r4 = _mm_unpacklo_epi64(r12, r13);
168
63.8k
  r5 = _mm_unpackhi_epi64(r12, r13);
169
63.8k
  r6 = _mm_unpacklo_epi64(r14, r15);
170
63.8k
  r7 = _mm_unpackhi_epi64(r14, r15);
171
172
63.8k
  d[0] = _mm_unpacklo_epi64(r0, r2);
173
63.8k
  d[1] = _mm_unpacklo_epi64(r4, r6);
174
63.8k
  d[2] = _mm_unpacklo_epi64(r1, r3);
175
63.8k
  d[3] = _mm_unpacklo_epi64(r5, r7);
176
177
63.8k
  d[4] = _mm_unpackhi_epi64(r0, r2);
178
63.8k
  d[5] = _mm_unpackhi_epi64(r4, r6);
179
63.8k
  d[6] = _mm_unpackhi_epi64(r1, r3);
180
63.8k
  d[7] = _mm_unpackhi_epi64(r5, r7);
181
63.8k
}
182
183
24.1k
static inline void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) {
184
24.1k
  __m256i w0, w1, w2, w3, ww0, ww1;
185
186
24.1k
  w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13
187
24.1k
  w1 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
188
24.1k
  w2 = _mm256_unpackhi_epi16(x[0], x[1]);  // 40 50 41 51 42 52 43 53
189
24.1k
  w3 = _mm256_unpackhi_epi16(x[2], x[3]);  // 60 70 61 71 62 72 63 73
190
191
24.1k
  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
192
24.1k
  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
193
194
24.1k
  d[0] = _mm256_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
195
24.1k
  d[1] = _mm256_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
196
197
24.1k
  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
198
24.1k
  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
199
200
24.1k
  d[2] = _mm256_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
201
24.1k
  d[3] = _mm256_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
202
24.1k
}
203
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
204
205
172k
static inline void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) {
206
172k
  __m256i w0, w1, w2, w3, ww0, ww1;
207
208
172k
  w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13
209
172k
  w1 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
210
172k
  w2 = _mm256_unpacklo_epi16(x[4], x[5]);  // 40 50 41 51 42 52 43 53
211
172k
  w3 = _mm256_unpacklo_epi16(x[6], x[7]);  // 60 70 61 71 62 72 63 73
212
213
172k
  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
214
172k
  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
215
216
172k
  d[0] = _mm256_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
217
172k
  d[1] = _mm256_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
218
219
172k
  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
220
172k
  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
221
222
172k
  d[2] = _mm256_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
223
172k
  d[3] = _mm256_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
224
225
172k
  w0 = _mm256_unpackhi_epi16(x[0], x[1]);  // 04 14 05 15 06 16 07 17
226
172k
  w1 = _mm256_unpackhi_epi16(x[2], x[3]);  // 24 34 25 35 26 36 27 37
227
172k
  w2 = _mm256_unpackhi_epi16(x[4], x[5]);  // 44 54 45 55 46 56 47 57
228
172k
  w3 = _mm256_unpackhi_epi16(x[6], x[7]);  // 64 74 65 75 66 76 67 77
229
230
172k
  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
231
172k
  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75
232
233
172k
  d[4] = _mm256_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
234
172k
  d[5] = _mm256_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75
235
236
172k
  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
237
172k
  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77
238
239
172k
  d[6] = _mm256_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
240
172k
  d[7] = _mm256_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
241
172k
}
242
243
1.17M
static inline void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) {
244
1.17M
  __m256i w0, w1, w2, w3, ww0, ww1;
245
1.17M
  __m256i dd[16];
246
1.17M
  w0 = _mm256_unpacklo_epi16(x[0], x[1]);
247
1.17M
  w1 = _mm256_unpacklo_epi16(x[2], x[3]);
248
1.17M
  w2 = _mm256_unpacklo_epi16(x[4], x[5]);
249
1.17M
  w3 = _mm256_unpacklo_epi16(x[6], x[7]);
250
251
1.17M
  ww0 = _mm256_unpacklo_epi32(w0, w1);  //
252
1.17M
  ww1 = _mm256_unpacklo_epi32(w2, w3);  //
253
254
1.17M
  dd[0] = _mm256_unpacklo_epi64(ww0, ww1);
255
1.17M
  dd[1] = _mm256_unpackhi_epi64(ww0, ww1);
256
257
1.17M
  ww0 = _mm256_unpackhi_epi32(w0, w1);  //
258
1.17M
  ww1 = _mm256_unpackhi_epi32(w2, w3);  //
259
260
1.17M
  dd[2] = _mm256_unpacklo_epi64(ww0, ww1);
261
1.17M
  dd[3] = _mm256_unpackhi_epi64(ww0, ww1);
262
263
1.17M
  w0 = _mm256_unpackhi_epi16(x[0], x[1]);
264
1.17M
  w1 = _mm256_unpackhi_epi16(x[2], x[3]);
265
1.17M
  w2 = _mm256_unpackhi_epi16(x[4], x[5]);
266
1.17M
  w3 = _mm256_unpackhi_epi16(x[6], x[7]);
267
268
1.17M
  ww0 = _mm256_unpacklo_epi32(w0, w1);  //
269
1.17M
  ww1 = _mm256_unpacklo_epi32(w2, w3);  //
270
271
1.17M
  dd[4] = _mm256_unpacklo_epi64(ww0, ww1);
272
1.17M
  dd[5] = _mm256_unpackhi_epi64(ww0, ww1);
273
274
1.17M
  ww0 = _mm256_unpackhi_epi32(w0, w1);  //
275
1.17M
  ww1 = _mm256_unpackhi_epi32(w2, w3);  //
276
277
1.17M
  dd[6] = _mm256_unpacklo_epi64(ww0, ww1);
278
1.17M
  dd[7] = _mm256_unpackhi_epi64(ww0, ww1);
279
280
1.17M
  w0 = _mm256_unpacklo_epi16(x[8], x[9]);
281
1.17M
  w1 = _mm256_unpacklo_epi16(x[10], x[11]);
282
1.17M
  w2 = _mm256_unpacklo_epi16(x[12], x[13]);
283
1.17M
  w3 = _mm256_unpacklo_epi16(x[14], x[15]);
284
285
1.17M
  ww0 = _mm256_unpacklo_epi32(w0, w1);
286
1.17M
  ww1 = _mm256_unpacklo_epi32(w2, w3);
287
288
1.17M
  dd[8] = _mm256_unpacklo_epi64(ww0, ww1);
289
1.17M
  dd[9] = _mm256_unpackhi_epi64(ww0, ww1);
290
291
1.17M
  ww0 = _mm256_unpackhi_epi32(w0, w1);
292
1.17M
  ww1 = _mm256_unpackhi_epi32(w2, w3);
293
294
1.17M
  dd[10] = _mm256_unpacklo_epi64(ww0, ww1);
295
1.17M
  dd[11] = _mm256_unpackhi_epi64(ww0, ww1);
296
297
1.17M
  w0 = _mm256_unpackhi_epi16(x[8], x[9]);
298
1.17M
  w1 = _mm256_unpackhi_epi16(x[10], x[11]);
299
1.17M
  w2 = _mm256_unpackhi_epi16(x[12], x[13]);
300
1.17M
  w3 = _mm256_unpackhi_epi16(x[14], x[15]);
301
302
1.17M
  ww0 = _mm256_unpacklo_epi32(w0, w1);
303
1.17M
  ww1 = _mm256_unpacklo_epi32(w2, w3);
304
305
1.17M
  dd[12] = _mm256_unpacklo_epi64(ww0, ww1);
306
1.17M
  dd[13] = _mm256_unpackhi_epi64(ww0, ww1);
307
308
1.17M
  ww0 = _mm256_unpackhi_epi32(w0, w1);
309
1.17M
  ww1 = _mm256_unpackhi_epi32(w2, w3);
310
311
1.17M
  dd[14] = _mm256_unpacklo_epi64(ww0, ww1);
312
1.17M
  dd[15] = _mm256_unpackhi_epi64(ww0, ww1);
313
314
10.5M
  for (int i = 0; i < 8; i++) {
315
9.41M
    d[i] = _mm256_insertf128_si256(dd[i], _mm256_castsi256_si128(dd[i + 8]), 1);
316
9.41M
    d[i + 8] = _mm256_insertf128_si256(dd[i + 8],
317
9.41M
                                       _mm256_extracti128_si256(dd[i], 1), 0);
318
9.41M
  }
319
1.17M
}
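The element-position comments above track how the unpack tree reorders 16-bit values; the final loop swaps 128-bit halves with insertf128/extracti128 because AVX2 unpack instructions only shuffle within each 128-bit lane. In scalar terms the routine is a plain 16x16 transpose (reference sketch on arrays rather than __m256i registers):

#include <stdint.h>

static void transpose_16x16_scalar(const uint16_t x[16][16],
                                   uint16_t d[16][16]) {
  for (int r = 0; r < 16; ++r)
    for (int c = 0; c < 16; ++c) d[c][r] = x[r][c];  // row r becomes column r
}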
320
#endif  // CONFIG_AV1_HIGHBITDEPTH
321
322
void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
323
912k
                                 const uint8_t *above, const uint8_t *left) {
324
912k
  const __m256i sum_above = dc_sum_32(above);
325
912k
  __m256i sum_left = dc_sum_32(left);
326
912k
  sum_left = _mm256_add_epi16(sum_left, sum_above);
327
912k
  const __m256i thirtytwo = _mm256_set1_epi16(32);
328
912k
  sum_left = _mm256_add_epi16(sum_left, thirtytwo);
329
912k
  sum_left = _mm256_srai_epi16(sum_left, 6);
330
912k
  const __m256i zero = _mm256_setzero_si256();
331
912k
  __m256i row = _mm256_shuffle_epi8(sum_left, zero);
332
912k
  row_store_32xh(&row, 32, dst, stride);
333
912k
}
334
335
void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
336
                                     const uint8_t *above,
337
63.3k
                                     const uint8_t *left) {
338
63.3k
  __m256i sum = dc_sum_32(above);
339
63.3k
  (void)left;
340
341
63.3k
  const __m256i sixteen = _mm256_set1_epi16(16);
342
63.3k
  sum = _mm256_add_epi16(sum, sixteen);
343
63.3k
  sum = _mm256_srai_epi16(sum, 5);
344
63.3k
  const __m256i zero = _mm256_setzero_si256();
345
63.3k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
346
63.3k
  row_store_32xh(&row, 32, dst, stride);
347
63.3k
}
348
349
void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
350
                                      const uint8_t *above,
351
109k
                                      const uint8_t *left) {
352
109k
  __m256i sum = dc_sum_32(left);
353
109k
  (void)above;
354
355
109k
  const __m256i sixteen = _mm256_set1_epi16(16);
356
109k
  sum = _mm256_add_epi16(sum, sixteen);
357
109k
  sum = _mm256_srai_epi16(sum, 5);
358
109k
  const __m256i zero = _mm256_setzero_si256();
359
109k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
360
109k
  row_store_32xh(&row, 32, dst, stride);
361
109k
}
362
363
void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
364
                                     const uint8_t *above,
365
20.6k
                                     const uint8_t *left) {
366
20.6k
  (void)above;
367
20.6k
  (void)left;
368
20.6k
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
369
20.6k
  row_store_32xh(&row, 32, dst, stride);
370
20.6k
}
371
372
void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
373
28.3k
                                const uint8_t *above, const uint8_t *left) {
374
28.3k
  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
375
28.3k
  (void)left;
376
28.3k
  row_store_32xh(&row, 32, dst, stride);
377
28.3k
}
378
379
// There are 32 rows together. This function does lines:
380
// 0,1,2,3, and 16,17,18,19. The next call would do
381
// 4,5,6,7, and 20,21,22,23. So four calls in total
382
// cover all 32 rows.
383
static inline void h_predictor_32x8line(const __m256i *row, uint8_t *dst,
384
613k
                                        ptrdiff_t stride) {
385
613k
  __m256i t[4];
386
613k
  __m256i m = _mm256_setzero_si256();
387
613k
  const __m256i inc = _mm256_set1_epi8(4);
388
613k
  int i;
389
390
3.06M
  for (i = 0; i < 4; i++) {
391
2.45M
    t[i] = _mm256_shuffle_epi8(*row, m);
392
2.45M
    __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0);
393
2.45M
    __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11);
394
2.45M
    _mm256_storeu_si256((__m256i *)dst, r0);
395
2.45M
    _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1);
396
2.45M
    dst += stride;
397
2.45M
    m = _mm256_add_epi8(m, inc);
398
2.45M
  }
399
613k
}
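Each output row of the H predictor is one left-column pixel replicated across the full row; the helper above broadcasts eight left pixels per call via a byte shuffle whose index vector grows by 4 each iteration, and stores rows r and r + 16 together. A scalar sketch of the full 32x32 H predictor that the four calls implement (illustration only):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void h_predictor_32x32_scalar(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *left) {
  for (int r = 0; r < 32; ++r)
    memset(dst + r * stride, left[r], 32);  // replicate left[r] across the row
}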
400
401
void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
402
153k
                                const uint8_t *above, const uint8_t *left) {
403
153k
  (void)above;
404
153k
  const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);
405
406
153k
  __m256i u = _mm256_unpacklo_epi8(left_col, left_col);
407
408
153k
  __m256i v = _mm256_unpacklo_epi8(u, u);
409
153k
  h_predictor_32x8line(&v, dst, stride);
410
153k
  dst += stride << 2;
411
412
153k
  v = _mm256_unpackhi_epi8(u, u);
413
153k
  h_predictor_32x8line(&v, dst, stride);
414
153k
  dst += stride << 2;
415
416
153k
  u = _mm256_unpackhi_epi8(left_col, left_col);
417
418
153k
  v = _mm256_unpacklo_epi8(u, u);
419
153k
  h_predictor_32x8line(&v, dst, stride);
420
153k
  dst += stride << 2;
421
422
153k
  v = _mm256_unpackhi_epi8(u, u);
423
153k
  h_predictor_32x8line(&v, dst, stride);
424
153k
}
425
426
// -----------------------------------------------------------------------------
427
// Rectangle
428
void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
429
147k
                                 const uint8_t *above, const uint8_t *left) {
430
147k
  const __m128i top_sum = dc_sum_32_sse2(above);
431
147k
  __m128i left_sum = dc_sum_16_sse2(left);
432
147k
  left_sum = _mm_add_epi16(top_sum, left_sum);
433
147k
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(left_sum);
434
147k
  sum += 24;
435
147k
  sum /= 48;
436
147k
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
437
147k
  row_store_32xh(&row, 16, dst, stride);
438
147k
}
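For rectangular blocks the DC value averages width + height reference samples, so the power-of-two shift used for square blocks no longer applies; sum += 24; sum /= 48 is a round-to-nearest division by 48 (= 32 + 16). A scalar sketch of the same arithmetic (illustration only):

#include <stdint.h>

static uint8_t dc_value_32x16_scalar(const uint8_t *above, const uint8_t *left) {
  uint32_t sum = 0;
  for (int i = 0; i < 32; ++i) sum += above[i];
  for (int i = 0; i < 16; ++i) sum += left[i];
  return (uint8_t)((sum + 24) / 48);  // +24 rounds to nearest
}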
439
440
void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
441
7.18k
                                 const uint8_t *above, const uint8_t *left) {
442
7.18k
  const __m256i sum_above = dc_sum_32(above);
443
7.18k
  __m256i sum_left = dc_sum_64(left);
444
7.18k
  sum_left = _mm256_add_epi16(sum_left, sum_above);
445
7.18k
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
446
7.18k
  sum += 48;
447
7.18k
  sum /= 96;
448
7.18k
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
449
7.18k
  row_store_32xh(&row, 64, dst, stride);
450
7.18k
}
451
452
void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
453
100k
                                 const uint8_t *above, const uint8_t *left) {
454
100k
  const __m256i sum_above = dc_sum_64(above);
455
100k
  __m256i sum_left = dc_sum_64(left);
456
100k
  sum_left = _mm256_add_epi16(sum_left, sum_above);
457
100k
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
458
100k
  sum += 64;
459
100k
  sum /= 128;
460
100k
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
461
100k
  row_store_64xh(&row, 64, dst, stride);
462
100k
}
463
464
void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
465
15.0k
                                 const uint8_t *above, const uint8_t *left) {
466
15.0k
  const __m256i sum_above = dc_sum_64(above);
467
15.0k
  __m256i sum_left = dc_sum_32(left);
468
15.0k
  sum_left = _mm256_add_epi16(sum_left, sum_above);
469
15.0k
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
470
15.0k
  sum += 48;
471
15.0k
  sum /= 96;
472
15.0k
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
473
15.0k
  row_store_64xh(&row, 32, dst, stride);
474
15.0k
}
475
476
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
477
void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
478
56.6k
                                 const uint8_t *above, const uint8_t *left) {
479
56.6k
  const __m256i sum_above = dc_sum_64(above);
480
56.6k
  __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
481
56.6k
  sum_left = _mm256_add_epi16(sum_left, sum_above);
482
56.6k
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
483
56.6k
  sum += 40;
484
56.6k
  sum /= 80;
485
56.6k
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
486
56.6k
  row_store_64xh(&row, 16, dst, stride);
487
56.6k
}
488
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
489
490
void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
491
                                     const uint8_t *above,
492
4.40k
                                     const uint8_t *left) {
493
4.40k
  __m256i sum = dc_sum_32(above);
494
4.40k
  (void)left;
495
496
4.40k
  const __m256i sixteen = _mm256_set1_epi16(16);
497
4.40k
  sum = _mm256_add_epi16(sum, sixteen);
498
4.40k
  sum = _mm256_srai_epi16(sum, 5);
499
4.40k
  const __m256i zero = _mm256_setzero_si256();
500
4.40k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
501
4.40k
  row_store_32xh(&row, 16, dst, stride);
502
4.40k
}
503
504
void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
505
                                     const uint8_t *above,
506
1.07k
                                     const uint8_t *left) {
507
1.07k
  __m256i sum = dc_sum_32(above);
508
1.07k
  (void)left;
509
510
1.07k
  const __m256i sixteen = _mm256_set1_epi16(16);
511
1.07k
  sum = _mm256_add_epi16(sum, sixteen);
512
1.07k
  sum = _mm256_srai_epi16(sum, 5);
513
1.07k
  const __m256i zero = _mm256_setzero_si256();
514
1.07k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
515
1.07k
  row_store_32xh(&row, 64, dst, stride);
516
1.07k
}
517
518
void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
519
                                     const uint8_t *above,
520
14.2k
                                     const uint8_t *left) {
521
14.2k
  __m256i sum = dc_sum_64(above);
522
14.2k
  (void)left;
523
524
14.2k
  const __m256i thirtytwo = _mm256_set1_epi16(32);
525
14.2k
  sum = _mm256_add_epi16(sum, thirtytwo);
526
14.2k
  sum = _mm256_srai_epi16(sum, 6);
527
14.2k
  const __m256i zero = _mm256_setzero_si256();
528
14.2k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
529
14.2k
  row_store_64xh(&row, 64, dst, stride);
530
14.2k
}
531
532
void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
533
                                     const uint8_t *above,
534
421
                                     const uint8_t *left) {
535
421
  __m256i sum = dc_sum_64(above);
536
421
  (void)left;
537
538
421
  const __m256i thirtytwo = _mm256_set1_epi16(32);
539
421
  sum = _mm256_add_epi16(sum, thirtytwo);
540
421
  sum = _mm256_srai_epi16(sum, 6);
541
421
  const __m256i zero = _mm256_setzero_si256();
542
421
  __m256i row = _mm256_shuffle_epi8(sum, zero);
543
421
  row_store_64xh(&row, 32, dst, stride);
544
421
}
545
546
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
547
void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
548
                                     const uint8_t *above,
549
2.06k
                                     const uint8_t *left) {
550
2.06k
  __m256i sum = dc_sum_64(above);
551
2.06k
  (void)left;
552
553
2.06k
  const __m256i thirtytwo = _mm256_set1_epi16(32);
554
2.06k
  sum = _mm256_add_epi16(sum, thirtytwo);
555
2.06k
  sum = _mm256_srai_epi16(sum, 6);
556
2.06k
  const __m256i zero = _mm256_setzero_si256();
557
2.06k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
558
2.06k
  row_store_64xh(&row, 16, dst, stride);
559
2.06k
}
560
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
561
562
void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
563
                                      const uint8_t *above,
564
3.91k
                                      const uint8_t *left) {
565
3.91k
  __m128i sum = dc_sum_16_sse2(left);
566
3.91k
  (void)above;
567
568
3.91k
  const __m128i eight = _mm_set1_epi16(8);
569
3.91k
  sum = _mm_add_epi16(sum, eight);
570
3.91k
  sum = _mm_srai_epi16(sum, 4);
571
3.91k
  const __m128i zero = _mm_setzero_si128();
572
3.91k
  const __m128i r = _mm_shuffle_epi8(sum, zero);
573
3.91k
  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
574
3.91k
  row_store_32xh(&row, 16, dst, stride);
575
3.91k
}
576
577
void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
578
                                      const uint8_t *above,
579
1.40k
                                      const uint8_t *left) {
580
1.40k
  __m256i sum = dc_sum_64(left);
581
1.40k
  (void)above;
582
583
1.40k
  const __m256i thirtytwo = _mm256_set1_epi16(32);
584
1.40k
  sum = _mm256_add_epi16(sum, thirtytwo);
585
1.40k
  sum = _mm256_srai_epi16(sum, 6);
586
1.40k
  const __m256i zero = _mm256_setzero_si256();
587
1.40k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
588
1.40k
  row_store_32xh(&row, 64, dst, stride);
589
1.40k
}
590
591
void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
592
                                      const uint8_t *above,
593
19.2k
                                      const uint8_t *left) {
594
19.2k
  __m256i sum = dc_sum_64(left);
595
19.2k
  (void)above;
596
597
19.2k
  const __m256i thirtytwo = _mm256_set1_epi16(32);
598
19.2k
  sum = _mm256_add_epi16(sum, thirtytwo);
599
19.2k
  sum = _mm256_srai_epi16(sum, 6);
600
19.2k
  const __m256i zero = _mm256_setzero_si256();
601
19.2k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
602
19.2k
  row_store_64xh(&row, 64, dst, stride);
603
19.2k
}
604
605
void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
606
                                      const uint8_t *above,
607
838
                                      const uint8_t *left) {
608
838
  __m256i sum = dc_sum_32(left);
609
838
  (void)above;
610
611
838
  const __m256i sixteen = _mm256_set1_epi16(16);
612
838
  sum = _mm256_add_epi16(sum, sixteen);
613
838
  sum = _mm256_srai_epi16(sum, 5);
614
838
  const __m256i zero = _mm256_setzero_si256();
615
838
  __m256i row = _mm256_shuffle_epi8(sum, zero);
616
838
  row_store_64xh(&row, 32, dst, stride);
617
838
}
618
619
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
620
void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
621
                                      const uint8_t *above,
622
271
                                      const uint8_t *left) {
623
271
  __m128i sum = dc_sum_16_sse2(left);
624
271
  (void)above;
625
626
271
  const __m128i eight = _mm_set1_epi16(8);
627
271
  sum = _mm_add_epi16(sum, eight);
628
271
  sum = _mm_srai_epi16(sum, 4);
629
271
  const __m128i zero = _mm_setzero_si128();
630
271
  const __m128i r = _mm_shuffle_epi8(sum, zero);
631
271
  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
632
271
  row_store_64xh(&row, 16, dst, stride);
633
271
}
634
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
635
636
void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
637
                                     const uint8_t *above,
638
4.55k
                                     const uint8_t *left) {
639
4.55k
  (void)above;
640
4.55k
  (void)left;
641
4.55k
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
642
4.55k
  row_store_32xh(&row, 16, dst, stride);
643
4.55k
}
644
645
void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
646
                                     const uint8_t *above,
647
917
                                     const uint8_t *left) {
648
917
  (void)above;
649
917
  (void)left;
650
917
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
651
917
  row_store_32xh(&row, 64, dst, stride);
652
917
}
653
654
void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
655
                                     const uint8_t *above,
656
7.84k
                                     const uint8_t *left) {
657
7.84k
  (void)above;
658
7.84k
  (void)left;
659
7.84k
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
660
7.84k
  row_store_64xh(&row, 64, dst, stride);
661
7.84k
}
662
663
void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
664
                                     const uint8_t *above,
665
2.18k
                                     const uint8_t *left) {
666
2.18k
  (void)above;
667
2.18k
  (void)left;
668
2.18k
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
669
2.18k
  row_store_64xh(&row, 32, dst, stride);
670
2.18k
}
671
672
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
673
void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
674
                                     const uint8_t *above,
675
451
                                     const uint8_t *left) {
676
451
  (void)above;
677
451
  (void)left;
678
451
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
679
451
  row_store_64xh(&row, 16, dst, stride);
680
451
}
681
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
682
683
void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
684
9.83k
                                const uint8_t *above, const uint8_t *left) {
685
9.83k
  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
686
9.83k
  (void)left;
687
9.83k
  row_store_32xh(&row, 16, dst, stride);
688
9.83k
}
689
690
void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
691
576
                                const uint8_t *above, const uint8_t *left) {
692
576
  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
693
576
  (void)left;
694
576
  row_store_32xh(&row, 64, dst, stride);
695
576
}
696
697
void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
698
2.53k
                                const uint8_t *above, const uint8_t *left) {
699
2.53k
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
700
2.53k
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
701
2.53k
  (void)left;
702
2.53k
  row_store_32x2xh(&row0, &row1, 64, dst, stride);
703
2.53k
}
704
705
void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
706
636
                                const uint8_t *above, const uint8_t *left) {
707
636
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
708
636
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
709
636
  (void)left;
710
636
  row_store_32x2xh(&row0, &row1, 32, dst, stride);
711
636
}
712
713
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
714
void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
715
1.28k
                                const uint8_t *above, const uint8_t *left) {
716
1.28k
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
717
1.28k
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
718
1.28k
  (void)left;
719
1.28k
  row_store_32x2xh(&row0, &row1, 16, dst, stride);
720
1.28k
}
721
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
722
723
// -----------------------------------------------------------------------------
724
// PAETH_PRED
725
726
// Return 16 16-bit pixels in one row (__m256i)
727
static inline __m256i paeth_pred(const __m256i *left, const __m256i *top,
728
79.7M
                                 const __m256i *topleft) {
729
79.7M
  const __m256i base =
730
79.7M
      _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft);
731
732
79.7M
  __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left));
733
79.7M
  __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top));
734
79.7M
  __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft));
735
736
79.7M
  __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
737
79.7M
  mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));
738
79.7M
  __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);
739
740
79.7M
  pl = _mm256_andnot_si256(mask1, *left);
741
742
79.7M
  ptl = _mm256_and_si256(mask2, *topleft);
743
79.7M
  pt = _mm256_andnot_si256(mask2, *top);
744
79.7M
  pt = _mm256_or_si256(pt, ptl);
745
79.7M
  pt = _mm256_and_si256(mask1, pt);
746
747
79.7M
  return _mm256_or_si256(pt, pl);
748
79.7M
}
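paeth_pred() selects, for each pixel, whichever of left, top, and top-left is closest to base = left + top - topleft, using compares and masks instead of branches. A scalar reference of the same selection (sketch only):

#include <stdlib.h>

static int paeth_scalar(int left, int top, int topleft) {
  const int base = left + top - topleft;
  const int pl = abs(base - left);
  const int pt = abs(base - top);
  const int ptl = abs(base - topleft);
  if (pl <= pt && pl <= ptl) return left;  // left wins ties, as in the masks above
  return (pt <= ptl) ? top : topleft;
}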
749
750
// Return 16 8-bit pixels in one row (__m128i)
751
static inline __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
752
78.8M
                                      const __m256i *topleft) {
753
78.8M
  const __m256i p0 = paeth_pred(left, top, topleft);
754
78.8M
  const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
755
78.8M
  const __m256i p = _mm256_packus_epi16(p0, p1);
756
78.8M
  return _mm256_castsi256_si128(p);
757
78.8M
}
758
759
2.23M
static inline __m256i get_top_vector(const uint8_t *above) {
760
2.23M
  const __m128i x = _mm_load_si128((const __m128i *)above);
761
2.23M
  const __m128i zero = _mm_setzero_si128();
762
2.23M
  const __m128i t0 = _mm_unpacklo_epi8(x, zero);
763
2.23M
  const __m128i t1 = _mm_unpackhi_epi8(x, zero);
764
2.23M
  return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1);
765
2.23M
}
766
767
void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
768
76.4k
                                   const uint8_t *above, const uint8_t *left) {
769
76.4k
  __m128i x = _mm_loadl_epi64((const __m128i *)left);
770
76.4k
  const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
771
76.4k
  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
772
76.4k
  __m256i rep = _mm256_set1_epi16((short)0x8000);
773
76.4k
  const __m256i one = _mm256_set1_epi16(1);
774
76.4k
  const __m256i top = get_top_vector(above);
775
776
76.4k
  int i;
777
687k
  for (i = 0; i < 8; ++i) {
778
611k
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
779
611k
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
780
781
611k
    _mm_store_si128((__m128i *)dst, row);
782
611k
    dst += stride;
783
611k
    rep = _mm256_add_epi16(rep, one);
784
611k
  }
785
76.4k
}
786
787
3.82M
static inline __m256i get_left_vector(const uint8_t *left) {
788
3.82M
  const __m128i x = _mm_load_si128((const __m128i *)left);
789
3.82M
  return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
790
3.82M
}
791
792
void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
793
96.3k
                                    const uint8_t *above, const uint8_t *left) {
794
96.3k
  const __m256i l = get_left_vector(left);
795
96.3k
  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
796
96.3k
  __m256i rep = _mm256_set1_epi16((short)0x8000);
797
96.3k
  const __m256i one = _mm256_set1_epi16(1);
798
96.3k
  const __m256i top = get_top_vector(above);
799
800
96.3k
  int i;
801
1.63M
  for (i = 0; i < 16; ++i) {
802
1.54M
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
803
1.54M
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
804
805
1.54M
    _mm_store_si128((__m128i *)dst, row);
806
1.54M
    dst += stride;
807
1.54M
    rep = _mm256_add_epi16(rep, one);
808
1.54M
  }
809
96.3k
}
810
811
void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
812
990k
                                    const uint8_t *above, const uint8_t *left) {
813
990k
  __m256i l = get_left_vector(left);
814
990k
  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
815
990k
  __m256i rep = _mm256_set1_epi16((short)0x8000);
816
990k
  const __m256i one = _mm256_set1_epi16(1);
817
990k
  const __m256i top = get_top_vector(above);
818
819
990k
  int i;
820
16.8M
  for (i = 0; i < 16; ++i) {
821
15.8M
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
822
15.8M
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
823
824
15.8M
    _mm_store_si128((__m128i *)dst, row);
825
15.8M
    dst += stride;
826
15.8M
    rep = _mm256_add_epi16(rep, one);
827
15.8M
  }
828
829
990k
  l = get_left_vector(left + 16);
830
990k
  rep = _mm256_set1_epi16((short)0x8000);
831
16.8M
  for (i = 0; i < 16; ++i) {
832
15.8M
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
833
15.8M
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
834
835
15.8M
    _mm_store_si128((__m128i *)dst, row);
836
15.8M
    dst += stride;
837
15.8M
    rep = _mm256_add_epi16(rep, one);
838
15.8M
  }
839
990k
}
840
841
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
842
void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
843
246k
                                    const uint8_t *above, const uint8_t *left) {
844
246k
  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
845
246k
  const __m256i one = _mm256_set1_epi16(1);
846
246k
  const __m256i top = get_top_vector(above);
847
848
1.23M
  for (int j = 0; j < 4; ++j) {
849
984k
    const __m256i l = get_left_vector(left + j * 16);
850
984k
    __m256i rep = _mm256_set1_epi16((short)0x8000);
851
16.7M
    for (int i = 0; i < 16; ++i) {
852
15.7M
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
853
15.7M
      const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
854
855
15.7M
      _mm_store_si128((__m128i *)dst, row);
856
15.7M
      dst += stride;
857
15.7M
      rep = _mm256_add_epi16(rep, one);
858
15.7M
    }
859
984k
  }
860
246k
}
861
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
862
863
// Return 32 8-bit pixels in one row (__m256i)
864
static inline __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
865
                                      const __m256i *top1,
866
464k
                                      const __m256i *topleft) {
867
464k
  __m256i p0 = paeth_pred(left, top0, topleft);
868
464k
  __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
869
464k
  const __m256i x0 = _mm256_packus_epi16(p0, p1);
870
871
464k
  p0 = paeth_pred(left, top1, topleft);
872
464k
  p1 = _mm256_permute4x64_epi64(p0, 0xe);
873
464k
  const __m256i x1 = _mm256_packus_epi16(p0, p1);
874
875
464k
  return _mm256_permute2x128_si256(x0, x1, 0x20);
876
464k
}
877
878
void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
879
29.0k
                                    const uint8_t *above, const uint8_t *left) {
880
29.0k
  const __m256i l = get_left_vector(left);
881
29.0k
  const __m256i t0 = get_top_vector(above);
882
29.0k
  const __m256i t1 = get_top_vector(above + 16);
883
29.0k
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
884
29.0k
  __m256i rep = _mm256_set1_epi16((short)0x8000);
885
29.0k
  const __m256i one = _mm256_set1_epi16(1);
886
887
29.0k
  int i;
888
493k
  for (i = 0; i < 16; ++i) {
889
464k
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
890
891
464k
    const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);
892
893
464k
    _mm256_storeu_si256((__m256i *)dst, r);
894
895
464k
    dst += stride;
896
464k
    rep = _mm256_add_epi16(rep, one);
897
464k
  }
898
29.0k
}
899
900
void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
901
268k
                                    const uint8_t *above, const uint8_t *left) {
902
268k
  __m256i l = get_left_vector(left);
903
268k
  const __m256i t0 = get_top_vector(above);
904
268k
  const __m256i t1 = get_top_vector(above + 16);
905
268k
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
906
268k
  __m256i rep = _mm256_set1_epi16((short)0x8000);
907
268k
  const __m256i one = _mm256_set1_epi16(1);
908
909
268k
  int i;
910
4.56M
  for (i = 0; i < 16; ++i) {
911
4.30M
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
912
913
4.30M
    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
914
4.30M
    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
915
916
4.30M
    _mm_store_si128((__m128i *)dst, r0);
917
4.30M
    _mm_store_si128((__m128i *)(dst + 16), r1);
918
919
4.30M
    dst += stride;
920
4.30M
    rep = _mm256_add_epi16(rep, one);
921
4.30M
  }
922
923
268k
  l = get_left_vector(left + 16);
924
268k
  rep = _mm256_set1_epi16((short)0x8000);
925
4.56M
  for (i = 0; i < 16; ++i) {
926
4.30M
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
927
928
4.30M
    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
929
4.30M
    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
930
931
4.30M
    _mm_store_si128((__m128i *)dst, r0);
932
4.30M
    _mm_store_si128((__m128i *)(dst + 16), r1);
933
934
4.30M
    dst += stride;
935
4.30M
    rep = _mm256_add_epi16(rep, one);
936
4.30M
  }
937
268k
}
938
939
void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
940
5.11k
                                    const uint8_t *above, const uint8_t *left) {
941
5.11k
  const __m256i t0 = get_top_vector(above);
942
5.11k
  const __m256i t1 = get_top_vector(above + 16);
943
5.11k
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
944
5.11k
  const __m256i one = _mm256_set1_epi16(1);
945
946
5.11k
  int i, j;
947
25.5k
  for (j = 0; j < 4; ++j) {
948
20.4k
    const __m256i l = get_left_vector(left + j * 16);
949
20.4k
    __m256i rep = _mm256_set1_epi16((short)0x8000);
950
347k
    for (i = 0; i < 16; ++i) {
951
327k
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
952
953
327k
      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
954
327k
      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
955
956
327k
      _mm_store_si128((__m128i *)dst, r0);
957
327k
      _mm_store_si128((__m128i *)(dst + 16), r1);
958
959
327k
      dst += stride;
960
327k
      rep = _mm256_add_epi16(rep, one);
961
327k
    }
962
20.4k
  }
963
5.11k
}
964
965
void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
966
5.00k
                                    const uint8_t *above, const uint8_t *left) {
967
5.00k
  const __m256i t0 = get_top_vector(above);
968
5.00k
  const __m256i t1 = get_top_vector(above + 16);
969
5.00k
  const __m256i t2 = get_top_vector(above + 32);
970
5.00k
  const __m256i t3 = get_top_vector(above + 48);
971
5.00k
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
972
5.00k
  const __m256i one = _mm256_set1_epi16(1);
973
974
5.00k
  int i, j;
975
15.0k
  for (j = 0; j < 2; ++j) {
976
10.0k
    const __m256i l = get_left_vector(left + j * 16);
977
10.0k
    __m256i rep = _mm256_set1_epi16((short)0x8000);
978
170k
    for (i = 0; i < 16; ++i) {
979
160k
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
980
981
160k
      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
982
160k
      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
983
160k
      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
984
160k
      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
985
986
160k
      _mm_store_si128((__m128i *)dst, r0);
987
160k
      _mm_store_si128((__m128i *)(dst + 16), r1);
988
160k
      _mm_store_si128((__m128i *)(dst + 32), r2);
989
160k
      _mm_store_si128((__m128i *)(dst + 48), r3);
990
991
160k
      dst += stride;
992
160k
      rep = _mm256_add_epi16(rep, one);
993
160k
    }
994
10.0k
  }
995
5.00k
}
996
997
void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
998
39.5k
                                    const uint8_t *above, const uint8_t *left) {
999
39.5k
  const __m256i t0 = get_top_vector(above);
1000
39.5k
  const __m256i t1 = get_top_vector(above + 16);
1001
39.5k
  const __m256i t2 = get_top_vector(above + 32);
1002
39.5k
  const __m256i t3 = get_top_vector(above + 48);
1003
39.5k
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
1004
39.5k
  const __m256i one = _mm256_set1_epi16(1);
1005
1006
39.5k
  int i, j;
1007
197k
  for (j = 0; j < 4; ++j) {
1008
158k
    const __m256i l = get_left_vector(left + j * 16);
1009
158k
    __m256i rep = _mm256_set1_epi16((short)0x8000);
1010
2.69M
    for (i = 0; i < 16; ++i) {
1011
2.53M
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
1012
1013
2.53M
      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
1014
2.53M
      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
1015
2.53M
      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
1016
2.53M
      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
1017
1018
2.53M
      _mm_store_si128((__m128i *)dst, r0);
1019
2.53M
      _mm_store_si128((__m128i *)(dst + 16), r1);
1020
2.53M
      _mm_store_si128((__m128i *)(dst + 32), r2);
1021
2.53M
      _mm_store_si128((__m128i *)(dst + 48), r3);
1022
1023
2.53M
      dst += stride;
1024
2.53M
      rep = _mm256_add_epi16(rep, one);
1025
2.53M
    }
1026
158k
  }
1027
39.5k
}
1028
1029
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1030
void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
1031
9.66k
                                    const uint8_t *above, const uint8_t *left) {
1032
9.66k
  const __m256i t0 = get_top_vector(above);
1033
9.66k
  const __m256i t1 = get_top_vector(above + 16);
1034
9.66k
  const __m256i t2 = get_top_vector(above + 32);
1035
9.66k
  const __m256i t3 = get_top_vector(above + 48);
1036
9.66k
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
1037
9.66k
  const __m256i one = _mm256_set1_epi16(1);
1038
1039
9.66k
  int i;
1040
9.66k
  const __m256i l = get_left_vector(left);
1041
9.66k
  __m256i rep = _mm256_set1_epi16((short)0x8000);
1042
164k
  for (i = 0; i < 16; ++i) {
1043
154k
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
1044
1045
154k
    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
1046
154k
    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
1047
154k
    const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
1048
154k
    const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
1049
1050
154k
    _mm_store_si128((__m128i *)dst, r0);
1051
154k
    _mm_store_si128((__m128i *)(dst + 16), r1);
1052
154k
    _mm_store_si128((__m128i *)(dst + 32), r2);
1053
154k
    _mm_store_si128((__m128i *)(dst + 48), r3);
1054
1055
154k
    dst += stride;
1056
154k
    rep = _mm256_add_epi16(rep, one);
1057
154k
  }
1058
9.66k
}
1059
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1060
1061
#if CONFIG_AV1_HIGHBITDEPTH
1062
1063
static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2(
1064
393k
    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1065
393k
  const int frac_bits = 6 - upsample_above;
1066
393k
  const int max_base_x = ((N + 4) - 1) << upsample_above;
1067
1068
393k
  assert(dx > 0);
1069
  // pre-filter above pixels
1070
  // store in temp buffers:
1071
  //   above[x] * 32 + 16
1072
  //   above[x+1] - above[x]
1073
  // final pixels will be calculated as:
1074
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1075
393k
  __m256i a0, a1, a32, a16;
1076
393k
  __m256i diff, c3f;
1077
393k
  __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
1078
393k
  __m128i a0_128, a1_128;
1079
393k
  a16 = _mm256_set1_epi16(16);
1080
393k
  a_mbase_x = _mm_set1_epi16(above[max_base_x]);
1081
393k
  max_base_x128 = _mm_set1_epi16(max_base_x);
1082
393k
  c3f = _mm256_set1_epi16(0x3f);
1083
1084
393k
  int x = dx;
1085
3.00M
  for (int r = 0; r < N; r++) {
1086
2.61M
    __m256i b, res, shift;
1087
2.61M
    __m128i res1;
1088
1089
2.61M
    int base = x >> frac_bits;
1090
2.61M
    if (base >= max_base_x) {
1091
8.51k
      for (int i = r; i < N; ++i) {
1092
4.97k
        dst[i] = a_mbase_x;  // save 4 values
1093
4.97k
      }
1094
3.53k
      return;
1095
3.53k
    }
1096
1097
2.61M
    a0_128 = _mm_loadu_si128((__m128i *)(above + base));
1098
2.61M
    a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
1099
1100
2.61M
    if (upsample_above) {
1101
1.07M
      a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)HighbdEvenOddMaskx4[0]);
1102
1.07M
      a1_128 = _mm_srli_si128(a0_128, 8);
1103
1104
1.07M
      base_inc128 = _mm_setr_epi16(base, base + 2, base + 4, base + 6, base + 8,
1105
1.07M
                                   base + 10, base + 12, base + 14);
1106
1.07M
      shift = _mm256_srli_epi16(
1107
1.07M
          _mm256_and_si256(
1108
1.07M
              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above),
1109
1.07M
              _mm256_set1_epi16(0x3f)),
1110
1.07M
          1);
1111
1.54M
    } else {
1112
1.54M
      base_inc128 = _mm_setr_epi16(base, base + 1, base + 2, base + 3, base + 4,
1113
1.54M
                                   base + 5, base + 6, base + 7);
1114
1.54M
      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1115
1.54M
    }
1116
2.61M
    a0 = _mm256_castsi128_si256(a0_128);
1117
2.61M
    a1 = _mm256_castsi128_si256(a1_128);
1118
2.61M
    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1119
2.61M
    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1120
2.61M
    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1121
1122
2.61M
    b = _mm256_mullo_epi16(diff, shift);
1123
2.61M
    res = _mm256_add_epi16(a32, b);
1124
2.61M
    res = _mm256_srli_epi16(res, 5);
1125
2.61M
    res1 = _mm256_castsi256_si128(res);
1126
1127
2.61M
    mask128 = _mm_cmpgt_epi16(max_base_x128, base_inc128);
1128
2.61M
    dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
1129
2.61M
    x += dx;
1130
2.61M
  }
1131
393k
}
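The comment block inside the function spells out the per-pixel interpolation: (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5, where the integer part of the ray position selects the base sample, the fractional part (halved to 5 bits) blends it with its right neighbour, and columns past max_base_x are clamped to above[max_base_x]. A scalar sketch of one output pixel, ignoring the upsampling path (names are illustrative, not from the file):

#include <stdint.h>

static uint16_t z1_pixel_scalar(const uint16_t *above, int x, int c,
                                int max_base_x) {
  const int base = (x >> 6) + c;      // integer part of the ray position + column
  if (base >= max_base_x) return above[max_base_x];
  const int shift = (x & 0x3f) >> 1;  // fractional part, 0..31
  return (uint16_t)((above[base] * 32 + 16 +
                     (above[base + 1] - above[base]) * shift) >> 5);
}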
1132
1133
static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_4xN_internal_avx2(
1134
145k
    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1135
145k
  const int frac_bits = 6 - upsample_above;
1136
145k
  const int max_base_x = ((N + 4) - 1) << upsample_above;
1137
1138
145k
  assert(dx > 0);
1139
  // pre-filter above pixels
1140
  // store in temp buffers:
1141
  //   above[x] * 32 + 16
1142
  //   above[x+1] - above[x]
1143
  // final pixels will be calculated as:
1144
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1145
145k
  __m256i a0, a1, a32, a16;
1146
145k
  __m256i diff;
1147
145k
  __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
1148
1149
145k
  a16 = _mm256_set1_epi32(16);
1150
145k
  a_mbase_x = _mm_set1_epi16(above[max_base_x]);
1151
145k
  max_base_x128 = _mm_set1_epi32(max_base_x);
1152
1153
145k
  int x = dx;
1154
1.20M
  for (int r = 0; r < N; r++) {
1155
1.05M
    __m256i b, res, shift;
1156
1.05M
    __m128i res1;
1157
1158
1.05M
    int base = x >> frac_bits;
1159
1.05M
    if (base >= max_base_x) {
1160
2.78k
      for (int i = r; i < N; ++i) {
1161
1.96k
        dst[i] = a_mbase_x;  // save 4 values
1162
1.96k
      }
1163
828
      return;
1164
828
    }
1165
1166
1.05M
    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1167
1.05M
    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1168
1169
1.05M
    if (upsample_above) {
1170
125k
      a0 = _mm256_permutevar8x32_epi32(
1171
125k
          a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1172
125k
      a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
1173
125k
      base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6);
1174
125k
      shift = _mm256_srli_epi32(
1175
125k
          _mm256_and_si256(
1176
125k
              _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
1177
125k
              _mm256_set1_epi32(0x3f)),
1178
125k
          1);
1179
928k
    } else {
1180
928k
      base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3);
1181
928k
      shift = _mm256_srli_epi32(
1182
928k
          _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1183
928k
    }
1184
1185
1.05M
    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1186
1.05M
    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1187
1.05M
    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1188
1189
1.05M
    b = _mm256_mullo_epi32(diff, shift);
1190
1.05M
    res = _mm256_add_epi32(a32, b);
1191
1.05M
    res = _mm256_srli_epi32(res, 5);
1192
1193
1.05M
    res1 = _mm256_castsi256_si128(res);
1194
1.05M
    res1 = _mm_packus_epi32(res1, res1);
1195
1196
1.05M
    mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128);
1197
1.05M
    mask128 = _mm_packs_epi32(mask128, mask128);  // narrow to 16 bit
1198
1.05M
    dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
1199
1.05M
    x += dx;
1200
1.05M
  }
1201
145k
}
1202
1203
static void highbd_dr_prediction_z1_4xN_avx2(int N, uint16_t *dst,
1204
                                             ptrdiff_t stride,
1205
                                             const uint16_t *above,
1206
                                             int upsample_above, int dx,
1207
183k
                                             int bd) {
1208
183k
  __m128i dstvec[16];
1209
183k
  if (bd < 12) {
1210
116k
    highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above,
1211
116k
                                              dx);
1212
116k
  } else {
1213
67.2k
    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(N, dstvec, above,
1214
67.2k
                                                    upsample_above, dx);
1215
67.2k
  }
1216
1.40M
  for (int i = 0; i < N; i++) {
1217
1.22M
    _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
1218
1.22M
  }
1219
183k
}
1220
1221
static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_8xN_internal_avx2(
1222
282k
    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1223
282k
  const int frac_bits = 6 - upsample_above;
1224
282k
  const int max_base_x = ((8 + N) - 1) << upsample_above;
1225
1226
282k
  assert(dx > 0);
1227
  // pre-filter above pixels
1228
  // store in temp buffers:
1229
  //   above[x] * 32 + 16
1230
  //   above[x+1] - above[x]
1231
  // final pixels will be calculated as:
1232
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1233
282k
  __m256i a0, a1, a0_1, a1_1, a32, a16;
1234
282k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1235
1236
282k
  a16 = _mm256_set1_epi32(16);
1237
282k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1238
282k
  max_base_x256 = _mm256_set1_epi32(max_base_x);
1239
1240
282k
  int x = dx;
1241
3.07M
  for (int r = 0; r < N; r++) {
1242
2.79M
    __m256i b, res, res1, shift;
1243
1244
2.79M
    int base = x >> frac_bits;
1245
2.79M
    if (base >= max_base_x) {
1246
2.34k
      for (int i = r; i < N; ++i) {
1247
1.52k
        dst[i] = _mm256_castsi256_si128(a_mbase_x);  // save 8 values
1248
1.52k
      }
1249
817
      return;
1250
817
    }
1251
1252
2.79M
    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1253
2.79M
    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1254
1255
2.79M
    if (upsample_above) {
1256
300k
      a0 = _mm256_permutevar8x32_epi32(
1257
300k
          a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1258
300k
      a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
1259
1260
300k
      a0_1 =
1261
300k
          _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
1262
300k
      a0_1 = _mm256_permutevar8x32_epi32(
1263
300k
          a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1264
300k
      a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));
1265
1266
300k
      a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
1267
300k
      a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);
1268
300k
      base_inc256 =
1269
300k
          _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8,
1270
300k
                            base + 10, base + 12, base + 14);
1271
300k
      shift = _mm256_srli_epi32(
1272
300k
          _mm256_and_si256(
1273
300k
              _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
1274
300k
              _mm256_set1_epi32(0x3f)),
1275
300k
          1);
1276
2.49M
    } else {
1277
2.49M
      base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3,
1278
2.49M
                                      base + 4, base + 5, base + 6, base + 7);
1279
2.49M
      shift = _mm256_srli_epi32(
1280
2.49M
          _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1281
2.49M
    }
1282
1283
2.79M
    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1284
2.79M
    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1285
2.79M
    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1286
1287
2.79M
    b = _mm256_mullo_epi32(diff, shift);
1288
2.79M
    res = _mm256_add_epi32(a32, b);
1289
2.79M
    res = _mm256_srli_epi32(res, 5);
1290
1291
2.79M
    res1 = _mm256_packus_epi32(
1292
2.79M
        res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
1293
1294
2.79M
    mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256);
1295
2.79M
    mask256 = _mm256_packs_epi32(
1296
2.79M
        mask256, _mm256_castsi128_si256(
1297
2.79M
                     _mm256_extracti128_si256(mask256, 1)));  // pack down to 16 bit
1298
2.79M
    res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1299
2.79M
    dst[r] = _mm256_castsi256_si128(res1);
1300
2.79M
    x += dx;
1301
2.79M
  }
1302
282k
}
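The comment block repeated at the top of these z1 kernels is the whole algorithm: each output pixel is a 5-bit linear interpolation between two neighbouring reference samples, clamped to the last valid sample once the base index reaches max_base_x. A minimal scalar sketch of one row, assuming no edge upsampling (the helper name and signature are illustrative, not part of libaom):

static void z1_row_scalar(uint16_t *row, int width, const uint16_t *above,
                          int max_base_x, int x) {
  const int base = x >> 6;            // integer reference position
  const int shift = (x & 0x3f) >> 1;  // 5-bit interpolation weight
  for (int c = 0; c < width; ++c) {
    if (base + c >= max_base_x) {
      row[c] = above[max_base_x];     // replicate the last reference sample
    } else {
      const int a = above[base + c];
      const int b = above[base + c + 1];
      row[c] = (uint16_t)((a * 32 + 16 + (b - a) * shift) >> 5);
    }
  }
}

The kernels above evaluate exactly this, 8 or 16 lanes at a time, advancing x by dx per row; when upsample_above is set they additionally de-interleave the doubled reference row with the HighbdEvenOddMaskx shuffles.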
1303
1304
static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2(
1305
376k
    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1306
376k
  const int frac_bits = 6 - upsample_above;
1307
376k
  const int max_base_x = ((8 + N) - 1) << upsample_above;
1308
1309
376k
  assert(dx > 0);
1310
  // pre-filter above pixels
1311
  // store in temp buffers:
1312
  //   above[x] * 32 + 16
1313
  //   above[x+1] - above[x]
1314
  // final pixels will be calculated as:
1315
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1316
376k
  __m256i a0, a1, a32, a16, c3f;
1317
376k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1318
376k
  __m128i a0_x128, a1_x128;
1319
1320
376k
  a16 = _mm256_set1_epi16(16);
1321
376k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1322
376k
  max_base_x256 = _mm256_set1_epi16(max_base_x);
1323
376k
  c3f = _mm256_set1_epi16(0x3f);
1324
1325
376k
  int x = dx;
1326
5.12M
  for (int r = 0; r < N; r++) {
1327
4.75M
    __m256i b, res, res1, shift;
1328
1329
4.75M
    int base = x >> frac_bits;
1330
4.75M
    if (base >= max_base_x) {
1331
5.12k
      for (int i = r; i < N; ++i) {
1332
3.67k
        dst[i] = _mm256_castsi256_si128(a_mbase_x);  // save 8 values
1333
3.67k
      }
1334
1.44k
      return;
1335
1.44k
    }
1336
1337
4.75M
    a0_x128 = _mm_loadu_si128((__m128i *)(above + base));
1338
4.75M
    if (upsample_above) {
1339
1.12M
      __m128i mask, atmp0, atmp1, atmp2, atmp3;
1340
1.12M
      a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 8));
1341
1.12M
      atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
1342
1.12M
      atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
1343
1.12M
      atmp2 =
1344
1.12M
          _mm_shuffle_epi8(a0_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
1345
1.12M
      atmp3 =
1346
1.12M
          _mm_shuffle_epi8(a1_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
1347
1.12M
      mask =
1348
1.12M
          _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[0], _mm_set1_epi8(15));
1349
1.12M
      a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
1350
1.12M
      mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[0] + 16),
1351
1.12M
                            _mm_set1_epi8(15));
1352
1.12M
      a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
1353
1354
1.12M
      base_inc256 = _mm256_setr_epi16(base, base + 2, base + 4, base + 6,
1355
1.12M
                                      base + 8, base + 10, base + 12, base + 14,
1356
1.12M
                                      0, 0, 0, 0, 0, 0, 0, 0);
1357
1.12M
      shift = _mm256_srli_epi16(
1358
1.12M
          _mm256_and_si256(
1359
1.12M
              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
1360
1.12M
          1);
1361
3.62M
    } else {
1362
3.62M
      a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 1));
1363
3.62M
      base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1364
3.62M
                                      base + 4, base + 5, base + 6, base + 7, 0,
1365
3.62M
                                      0, 0, 0, 0, 0, 0, 0);
1366
3.62M
      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1367
3.62M
    }
1368
4.75M
    a0 = _mm256_castsi128_si256(a0_x128);
1369
4.75M
    a1 = _mm256_castsi128_si256(a1_x128);
1370
1371
4.75M
    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1372
4.75M
    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1373
4.75M
    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1374
1375
4.75M
    b = _mm256_mullo_epi16(diff, shift);
1376
4.75M
    res = _mm256_add_epi16(a32, b);
1377
4.75M
    res = _mm256_srli_epi16(res, 5);
1378
1379
4.75M
    mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1380
4.75M
    res1 = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1381
4.75M
    dst[r] = _mm256_castsi256_si128(res1);
1382
4.75M
    x += dx;
1383
4.75M
  }
1384
376k
}
1385
1386
static void highbd_dr_prediction_z1_8xN_avx2(int N, uint16_t *dst,
1387
                                             ptrdiff_t stride,
1388
                                             const uint16_t *above,
1389
                                             int upsample_above, int dx,
1390
269k
                                             int bd) {
1391
269k
  __m128i dstvec[32];
1392
269k
  if (bd < 12) {
1393
146k
    highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above,
1394
146k
                                              dx);
1395
146k
  } else {
1396
123k
    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(N, dstvec, above,
1397
123k
                                                    upsample_above, dx);
1398
123k
  }
1399
2.78M
  for (int i = 0; i < N; i++) {
1400
2.52M
    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
1401
2.52M
  }
1402
269k
}
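Each z1 wrapper selects between the epi16 and epi32 internal kernels on bd < 12. The reason is range: a*32 + 16 + (b - a)*shift equals a*(32 - shift) + b*shift + 16, which never exceeds max_sample*32 + 16, so it fits a 16-bit lane for 8- and 10-bit samples but overflows it at 12 bits. A small illustrative bound check (not code from this file):

// Largest value the z1/z2 interpolation can reach at bit depth bd.
static unsigned dr_max_intermediate(int bd) {
  const unsigned max_sample = (1u << bd) - 1;
  return max_sample * 32 + 16;
}
// dr_max_intermediate(10) == 32752   -> fits 16-bit lanes, epi16 kernels
// dr_max_intermediate(12) == 131056  -> overflows them, epi32 kernels instead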
1403
1404
static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_16xN_internal_avx2(
1405
131k
    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1406
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1407
131k
  (void)upsample_above;
1408
131k
  const int frac_bits = 6;
1409
131k
  const int max_base_x = ((16 + N) - 1);
1410
1411
  // pre-filter above pixels
1412
  // store in temp buffers:
1413
  //   above[x] * 32 + 16
1414
  //   above[x+1] - above[x]
1415
  // final pixels will be calculated as:
1416
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1417
131k
  __m256i a0, a0_1, a1, a1_1, a32, a16;
1418
131k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1419
1420
131k
  a16 = _mm256_set1_epi32(16);
1421
131k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1422
131k
  max_base_x256 = _mm256_set1_epi16(max_base_x);
1423
1424
131k
  int x = dx;
1425
1.64M
  for (int r = 0; r < N; r++) {
1426
1.51M
    __m256i b, res[2], res1;
1427
1428
1.51M
    int base = x >> frac_bits;
1429
1.51M
    if (base >= max_base_x) {
1430
534
      for (int i = r; i < N; ++i) {
1431
421
        dstvec[i] = a_mbase_x;  // save 16 values
1432
421
      }
1433
113
      return;
1434
113
    }
1435
1.51M
    __m256i shift = _mm256_srli_epi32(
1436
1.51M
        _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1437
1438
1.51M
    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1439
1.51M
    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1440
1441
1.51M
    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1442
1.51M
    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1443
1.51M
    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1444
1.51M
    b = _mm256_mullo_epi32(diff, shift);
1445
1446
1.51M
    res[0] = _mm256_add_epi32(a32, b);
1447
1.51M
    res[0] = _mm256_srli_epi32(res[0], 5);
1448
1.51M
    res[0] = _mm256_packus_epi32(
1449
1.51M
        res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1450
1451
1.51M
    int mdif = max_base_x - base;
1452
1.51M
    if (mdif > 8) {
1453
1.51M
      a0_1 =
1454
1.51M
          _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
1455
1.51M
      a1_1 =
1456
1.51M
          _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));
1457
1458
1.51M
      diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
1459
1.51M
      a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
1460
1.51M
      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
1461
1.51M
      b = _mm256_mullo_epi32(diff, shift);
1462
1463
1.51M
      res[1] = _mm256_add_epi32(a32, b);
1464
1.51M
      res[1] = _mm256_srli_epi32(res[1], 5);
1465
1.51M
      res[1] = _mm256_packus_epi32(
1466
1.51M
          res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1467
1.51M
    } else {
1468
2.71k
      res[1] = a_mbase_x;
1469
2.71k
    }
1470
1.51M
    res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1471
1.51M
                                   1);  // 16 16bit values
1472
1473
1.51M
    base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1474
1.51M
                                    base + 4, base + 5, base + 6, base + 7,
1475
1.51M
                                    base + 8, base + 9, base + 10, base + 11,
1476
1.51M
                                    base + 12, base + 13, base + 14, base + 15);
1477
1.51M
    mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1478
1.51M
    dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1479
1.51M
    x += dx;
1480
1.51M
  }
1481
131k
}
1482
1483
static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2(
1484
304k
    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1485
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1486
304k
  (void)upsample_above;
1487
304k
  const int frac_bits = 6;
1488
304k
  const int max_base_x = ((16 + N) - 1);
1489
1490
  // pre-filter above pixels
1491
  // store in temp buffers:
1492
  //   above[x] * 32 + 16
1493
  //   above[x+1] - above[x]
1494
  // final pixels will be calculated as:
1495
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1496
304k
  __m256i a0, a1, a32, a16, c3f;
1497
304k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1498
1499
304k
  a16 = _mm256_set1_epi16(16);
1500
304k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1501
304k
  max_base_x256 = _mm256_set1_epi16(max_base_x);
1502
304k
  c3f = _mm256_set1_epi16(0x3f);
1503
1504
304k
  int x = dx;
1505
5.80M
  for (int r = 0; r < N; r++) {
1506
5.50M
    __m256i b, res;
1507
1508
5.50M
    int base = x >> frac_bits;
1509
5.50M
    if (base >= max_base_x) {
1510
2.26k
      for (int i = r; i < N; ++i) {
1511
1.81k
        dstvec[i] = a_mbase_x;  // save 16 values
1512
1.81k
      }
1513
452
      return;
1514
452
    }
1515
5.50M
    __m256i shift =
1516
5.50M
        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1517
1518
5.50M
    a0 = _mm256_loadu_si256((__m256i *)(above + base));
1519
5.50M
    a1 = _mm256_loadu_si256((__m256i *)(above + base + 1));
1520
1521
5.50M
    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1522
5.50M
    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1523
5.50M
    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1524
5.50M
    b = _mm256_mullo_epi16(diff, shift);
1525
1526
5.50M
    res = _mm256_add_epi16(a32, b);
1527
5.50M
    res = _mm256_srli_epi16(res, 5);  // 16 16bit values
1528
1529
5.50M
    base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1530
5.50M
                                    base + 4, base + 5, base + 6, base + 7,
1531
5.50M
                                    base + 8, base + 9, base + 10, base + 11,
1532
5.50M
                                    base + 12, base + 13, base + 14, base + 15);
1533
5.50M
    mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1534
5.50M
    dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1535
5.50M
    x += dx;
1536
5.50M
  }
1537
304k
}
1538
1539
static void highbd_dr_prediction_z1_16xN_avx2(int N, uint16_t *dst,
1540
                                              ptrdiff_t stride,
1541
                                              const uint16_t *above,
1542
                                              int upsample_above, int dx,
1543
209k
                                              int bd) {
1544
209k
  __m256i dstvec[64];
1545
209k
  if (bd < 12) {
1546
121k
    highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above,
1547
121k
                                               dx);
1548
121k
  } else {
1549
88.0k
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(N, dstvec, above,
1550
88.0k
                                                     upsample_above, dx);
1551
88.0k
  }
1552
2.91M
  for (int i = 0; i < N; i++) {
1553
2.70M
    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
1554
2.70M
  }
1555
209k
}
1556
1557
static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_32xN_internal_avx2(
1558
21.7k
    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1559
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1560
21.7k
  (void)upsample_above;
1561
21.7k
  const int frac_bits = 6;
1562
21.7k
  const int max_base_x = ((32 + N) - 1);
1563
1564
  // pre-filter above pixels
1565
  // store in temp buffers:
1566
  //   above[x] * 32 + 16
1567
  //   above[x+1] - above[x]
1568
  // final pixels will be calculated as:
1569
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1570
21.7k
  __m256i a0, a0_1, a1, a1_1, a32, a16, c3f;
1571
21.7k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1572
1573
21.7k
  a16 = _mm256_set1_epi32(16);
1574
21.7k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1575
21.7k
  max_base_x256 = _mm256_set1_epi16(max_base_x);
1576
21.7k
  c3f = _mm256_set1_epi16(0x3f);
1577
1578
21.7k
  int x = dx;
1579
492k
  for (int r = 0; r < N; r++) {
1580
470k
    __m256i b, res[2], res1;
1581
1582
470k
    int base = x >> frac_bits;
1583
470k
    if (base >= max_base_x) {
1584
0
      for (int i = r; i < N; ++i) {
1585
0
        dstvec[i] = a_mbase_x;  // save 32 values
1586
0
        dstvec[i + N] = a_mbase_x;
1587
0
      }
1588
0
      return;
1589
0
    }
1590
1591
470k
    __m256i shift =
1592
470k
        _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
1593
1594
1.41M
    for (int j = 0; j < 32; j += 16) {
1595
940k
      int mdif = max_base_x - (base + j);
1596
940k
      if (mdif <= 0) {
1597
681
        res1 = a_mbase_x;
1598
940k
      } else {
1599
940k
        a0 = _mm256_cvtepu16_epi32(
1600
940k
            _mm_loadu_si128((__m128i *)(above + base + j)));
1601
940k
        a1 = _mm256_cvtepu16_epi32(
1602
940k
            _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
1603
1604
940k
        diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1605
940k
        a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1606
940k
        a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1607
940k
        b = _mm256_mullo_epi32(diff, shift);
1608
1609
940k
        res[0] = _mm256_add_epi32(a32, b);
1610
940k
        res[0] = _mm256_srli_epi32(res[0], 5);
1611
940k
        res[0] = _mm256_packus_epi32(
1612
940k
            res[0],
1613
940k
            _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1614
940k
        if (mdif > 8) {
1615
936k
          a0_1 = _mm256_cvtepu16_epi32(
1616
936k
              _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
1617
936k
          a1_1 = _mm256_cvtepu16_epi32(
1618
936k
              _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
1619
1620
936k
          diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
1621
936k
          a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
1622
936k
          a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
1623
936k
          b = _mm256_mullo_epi32(diff, shift);
1624
1625
936k
          res[1] = _mm256_add_epi32(a32, b);
1626
936k
          res[1] = _mm256_srli_epi32(res[1], 5);
1627
936k
          res[1] = _mm256_packus_epi32(
1628
936k
              res[1],
1629
936k
              _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1630
936k
        } else {
1631
3.42k
          res[1] = a_mbase_x;
1632
3.42k
        }
1633
940k
        res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1634
940k
                                       1);  // 16 16bit values
1635
940k
        base_inc256 = _mm256_setr_epi16(
1636
940k
            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1637
940k
            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1638
940k
            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1639
940k
            base + j + 13, base + j + 14, base + j + 15);
1640
1641
940k
        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1642
940k
        res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1643
940k
      }
1644
940k
      if (!j) {
1645
470k
        dstvec[r] = res1;
1646
470k
      } else {
1647
470k
        dstvec[r + N] = res1;
1648
470k
      }
1649
940k
    }
1650
470k
    x += dx;
1651
470k
  }
1652
21.7k
}
1653
1654
static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2(
1655
193k
    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1656
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1657
193k
  (void)upsample_above;
1658
193k
  const int frac_bits = 6;
1659
193k
  const int max_base_x = ((32 + N) - 1);
1660
1661
  // pre-filter above pixels
1662
  // store in temp buffers:
1663
  //   above[x] * 32 + 16
1664
  //   above[x+1] - above[x]
1665
  // final pixels will be calculated as:
1666
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1667
193k
  __m256i a0, a1, a32, a16, c3f;
1668
193k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1669
1670
193k
  a16 = _mm256_set1_epi16(16);
1671
193k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1672
193k
  max_base_x256 = _mm256_set1_epi16(max_base_x);
1673
193k
  c3f = _mm256_set1_epi16(0x3f);
1674
1675
193k
  int x = dx;
1676
5.35M
  for (int r = 0; r < N; r++) {
1677
5.16M
    __m256i b, res;
1678
1679
5.16M
    int base = x >> frac_bits;
1680
5.16M
    if (base >= max_base_x) {
1681
0
      for (int i = r; i < N; ++i) {
1682
0
        dstvec[i] = a_mbase_x;  // save 32 values
1683
0
        dstvec[i + N] = a_mbase_x;
1684
0
      }
1685
0
      return;
1686
0
    }
1687
1688
5.16M
    __m256i shift =
1689
5.16M
        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1690
1691
15.4M
    for (int j = 0; j < 32; j += 16) {
1692
10.3M
      int mdif = max_base_x - (base + j);
1693
10.3M
      if (mdif <= 0) {
1694
1.13k
        res = a_mbase_x;
1695
10.3M
      } else {
1696
10.3M
        a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
1697
10.3M
        a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
1698
1699
10.3M
        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1700
10.3M
        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1701
10.3M
        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1702
10.3M
        b = _mm256_mullo_epi16(diff, shift);
1703
1704
10.3M
        res = _mm256_add_epi16(a32, b);
1705
10.3M
        res = _mm256_srli_epi16(res, 5);
1706
1707
10.3M
        base_inc256 = _mm256_setr_epi16(
1708
10.3M
            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1709
10.3M
            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1710
10.3M
            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1711
10.3M
            base + j + 13, base + j + 14, base + j + 15);
1712
1713
10.3M
        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1714
10.3M
        res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1715
10.3M
      }
1716
10.3M
      if (!j) {
1717
5.16M
        dstvec[r] = res;
1718
5.16M
      } else {
1719
5.16M
        dstvec[r + N] = res;
1720
5.16M
      }
1721
10.3M
    }
1722
5.16M
    x += dx;
1723
5.16M
  }
1724
193k
}
1725
1726
static void highbd_dr_prediction_z1_32xN_avx2(int N, uint16_t *dst,
1727
                                              ptrdiff_t stride,
1728
                                              const uint16_t *above,
1729
                                              int upsample_above, int dx,
1730
80.9k
                                              int bd) {
1731
80.9k
  __m256i dstvec[128];
1732
80.9k
  if (bd < 12) {
1733
69.8k
    highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above,
1734
69.8k
                                               dx);
1735
69.8k
  } else {
1736
11.0k
    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(N, dstvec, above,
1737
11.0k
                                                     upsample_above, dx);
1738
11.0k
  }
1739
2.16M
  for (int i = 0; i < N; i++) {
1740
2.08M
    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
1741
2.08M
    _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]);
1742
2.08M
  }
1743
80.9k
}
1744
1745
static void highbd_dr_prediction_32bit_z1_64xN_avx2(int N, uint16_t *dst,
1746
                                                    ptrdiff_t stride,
1747
                                                    const uint16_t *above,
1748
                                                    int upsample_above,
1749
18.4k
                                                    int dx) {
1750
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1751
18.4k
  (void)upsample_above;
1752
18.4k
  const int frac_bits = 6;
1753
18.4k
  const int max_base_x = ((64 + N) - 1);
1754
1755
  // pre-filter above pixels
1756
  // store in temp buffers:
1757
  //   above[x] * 32 + 16
1758
  //   above[x+1] - above[x]
1759
  // final pixels will be calculated as:
1760
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1761
18.4k
  __m256i a0, a0_1, a1, a1_1, a32, a16;
1762
18.4k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1763
1764
18.4k
  a16 = _mm256_set1_epi32(16);
1765
18.4k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1766
18.4k
  max_base_x256 = _mm256_set1_epi16(max_base_x);
1767
1768
18.4k
  int x = dx;
1769
1.12M
  for (int r = 0; r < N; r++, dst += stride) {
1770
1.11M
    __m256i b, res[2], res1;
1771
1772
1.11M
    int base = x >> frac_bits;
1773
1.11M
    if (base >= max_base_x) {
1774
0
      for (int i = r; i < N; ++i) {
1775
0
        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 64 values
1776
0
        _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
1777
0
        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
1778
0
        _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
1779
0
        dst += stride;
1780
0
      }
1781
0
      return;
1782
0
    }
1783
1784
1.11M
    __m256i shift = _mm256_srli_epi32(
1785
1.11M
        _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1786
1787
1.11M
    __m128i a0_128, a0_1_128, a1_128, a1_1_128;
1788
5.55M
    for (int j = 0; j < 64; j += 16) {
1789
4.44M
      int mdif = max_base_x - (base + j);
1790
4.44M
      if (mdif <= 0) {
1791
3.50k
        _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
1792
4.44M
      } else {
1793
4.44M
        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
1794
4.44M
        a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
1795
4.44M
        a0 = _mm256_cvtepu16_epi32(a0_128);
1796
4.44M
        a1 = _mm256_cvtepu16_epi32(a1_128);
1797
1798
4.44M
        diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1799
4.44M
        a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1800
4.44M
        a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1801
4.44M
        b = _mm256_mullo_epi32(diff, shift);
1802
1803
4.44M
        res[0] = _mm256_add_epi32(a32, b);
1804
4.44M
        res[0] = _mm256_srli_epi32(res[0], 5);
1805
4.44M
        res[0] = _mm256_packus_epi32(
1806
4.44M
            res[0],
1807
4.44M
            _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1808
4.44M
        if (mdif > 8) {
1809
4.43M
          a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
1810
4.43M
          a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
1811
4.43M
          a0_1 = _mm256_cvtepu16_epi32(a0_1_128);
1812
4.43M
          a1_1 = _mm256_cvtepu16_epi32(a1_1_128);
1813
1814
4.43M
          diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
1815
4.43M
          a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
1816
4.43M
          a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
1817
4.43M
          b = _mm256_mullo_epi32(diff, shift);
1818
1819
4.43M
          res[1] = _mm256_add_epi32(a32, b);
1820
4.43M
          res[1] = _mm256_srli_epi32(res[1], 5);
1821
4.43M
          res[1] = _mm256_packus_epi32(
1822
4.43M
              res[1],
1823
4.43M
              _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1824
4.43M
        } else {
1825
5.50k
          res[1] = a_mbase_x;
1826
5.50k
        }
1827
4.44M
        res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1828
4.44M
                                       1);  // 16 16bit values
1829
4.44M
        base_inc256 = _mm256_setr_epi16(
1830
4.44M
            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1831
4.44M
            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1832
4.44M
            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1833
4.44M
            base + j + 13, base + j + 14, base + j + 15);
1834
1835
4.44M
        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1836
4.44M
        res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1837
4.44M
        _mm256_storeu_si256((__m256i *)(dst + j), res1);
1838
4.44M
      }
1839
4.44M
    }
1840
1.11M
    x += dx;
1841
1.11M
  }
1842
18.4k
}
1843
1844
static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst,
1845
                                              ptrdiff_t stride,
1846
                                              const uint16_t *above,
1847
44.8k
                                              int upsample_above, int dx) {
1848
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1849
44.8k
  (void)upsample_above;
1850
44.8k
  const int frac_bits = 6;
1851
44.8k
  const int max_base_x = ((64 + N) - 1);
1852
1853
  // pre-filter above pixels
1854
  // store in temp buffers:
1855
  //   above[x] * 32 + 16
1856
  //   above[x+1] - above[x]
1857
  // final pixels will be calculated as:
1858
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1859
44.8k
  __m256i a0, a1, a32, a16, c3f;
1860
44.8k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1861
1862
44.8k
  a16 = _mm256_set1_epi16(16);
1863
44.8k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1864
44.8k
  max_base_x256 = _mm256_set1_epi16(max_base_x);
1865
44.8k
  c3f = _mm256_set1_epi16(0x3f);
1866
1867
44.8k
  int x = dx;
1868
2.36M
  for (int r = 0; r < N; r++, dst += stride) {
1869
2.32M
    __m256i b, res;
1870
1871
2.32M
    int base = x >> frac_bits;
1872
2.32M
    if (base >= max_base_x) {
1873
0
      for (int i = r; i < N; ++i) {
1874
0
        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 64 values
1875
0
        _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
1876
0
        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
1877
0
        _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
1878
0
        dst += stride;
1879
0
      }
1880
0
      return;
1881
0
    }
1882
1883
2.32M
    __m256i shift =
1884
2.32M
        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1885
1886
11.6M
    for (int j = 0; j < 64; j += 16) {
1887
9.28M
      int mdif = max_base_x - (base + j);
1888
9.28M
      if (mdif <= 0) {
1889
2.42k
        _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
1890
9.28M
      } else {
1891
9.28M
        a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
1892
9.28M
        a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
1893
1894
9.28M
        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1895
9.28M
        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1896
9.28M
        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1897
9.28M
        b = _mm256_mullo_epi16(diff, shift);
1898
1899
9.28M
        res = _mm256_add_epi16(a32, b);
1900
9.28M
        res = _mm256_srli_epi16(res, 5);
1901
1902
9.28M
        base_inc256 = _mm256_setr_epi16(
1903
9.28M
            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1904
9.28M
            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1905
9.28M
            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1906
9.28M
            base + j + 13, base + j + 14, base + j + 15);
1907
1908
9.28M
        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1909
9.28M
        res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1910
9.28M
        _mm256_storeu_si256((__m256i *)(dst + j), res);  // 16 16bit values
1911
9.28M
      }
1912
9.28M
    }
1913
2.32M
    x += dx;
1914
2.32M
  }
1915
44.8k
}
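The 32xN and 64xN kernels walk each row in 16-lane tiles. A tile whose entire base range lies at or past max_base_x (mdif <= 0) is filled with the replicated last sample without loading from the reference row at all; otherwise the usual interpolation runs and the per-lane mask handles any partial overlap. In scalar form the tiling looks roughly like this (illustrative, reusing the row formula from the earlier sketch):

static void z1_wide_row_scalar(uint16_t *dst, int width, const uint16_t *above,
                               int max_base_x, int base, int shift) {
  for (int j = 0; j < width; j += 16) {
    if (max_base_x - (base + j) <= 0) {
      // Whole tile is past the reference row: fill with the last sample.
      for (int i = 0; i < 16; ++i) dst[j + i] = above[max_base_x];
      continue;
    }
    for (int i = 0; i < 16; ++i) {
      const int p = base + j + i;
      dst[j + i] = (p < max_base_x)
                       ? (uint16_t)((above[p] * 32 + 16 +
                                     (above[p + 1] - above[p]) * shift) >> 5)
                       : above[max_base_x];
    }
  }
}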
1916
1917
// Directional prediction, zone 1: 0 < angle < 90
1918
void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
1919
                                      int bh, const uint16_t *above,
1920
                                      const uint16_t *left, int upsample_above,
1921
765k
                                      int dx, int dy, int bd) {
1922
765k
  (void)left;
1923
765k
  (void)dy;
1924
1925
765k
  switch (bw) {
1926
183k
    case 4:
1927
183k
      highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above,
1928
183k
                                       dx, bd);
1929
183k
      break;
1930
269k
    case 8:
1931
269k
      highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above,
1932
269k
                                       dx, bd);
1933
269k
      break;
1934
209k
    case 16:
1935
209k
      highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above,
1936
209k
                                        dx, bd);
1937
209k
      break;
1938
78.0k
    case 32:
1939
78.0k
      highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above,
1940
78.0k
                                        dx, bd);
1941
78.0k
      break;
1942
25.2k
    case 64:
1943
25.2k
      if (bd < 12) {
1944
11.9k
        highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above,
1945
11.9k
                                          upsample_above, dx);
1946
13.3k
      } else {
1947
13.3k
        highbd_dr_prediction_32bit_z1_64xN_avx2(bh, dst, stride, above,
1948
13.3k
                                                upsample_above, dx);
1949
13.3k
      }
1950
25.2k
      break;
1951
0
    default: break;
1952
765k
  }
1953
765k
  return;
1954
765k
}
1955
1956
static void highbd_transpose_TX_16X16(const uint16_t *src, ptrdiff_t pitchSrc,
1957
555k
                                      uint16_t *dst, ptrdiff_t pitchDst) {
1958
555k
  __m256i r[16];
1959
555k
  __m256i d[16];
1960
9.43M
  for (int j = 0; j < 16; j++) {
1961
8.88M
    r[j] = _mm256_loadu_si256((__m256i *)(src + j * pitchSrc));
1962
8.88M
  }
1963
555k
  highbd_transpose16x16_avx2(r, d);
1964
9.43M
  for (int j = 0; j < 16; j++) {
1965
8.88M
    _mm256_storeu_si256((__m256i *)(dst + j * pitchDst), d[j]);
1966
8.88M
  }
1967
555k
}
1968
1969
static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc,
1970
                             uint16_t *dst, ptrdiff_t pitchDst, int width,
1971
40.9k
                             int height) {
1972
198k
  for (int j = 0; j < height; j += 16)
1973
713k
    for (int i = 0; i < width; i += 16)
1974
555k
      highbd_transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
1975
555k
                                dst + j * pitchDst + i, pitchDst);
1976
40.9k
}
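highbd_transpose visits the block in 16x16 tiles and hands each one to highbd_transpose16x16_avx2; the net effect is dst[r * pitchDst + c] = src[c * pitchSrc + r], with width and height assumed to be multiples of 16. A scalar equivalent of the tiled traversal (illustrative only):

static void highbd_transpose_scalar(const uint16_t *src, ptrdiff_t pitchSrc,
                                    uint16_t *dst, ptrdiff_t pitchDst,
                                    int width, int height) {
  for (int j = 0; j < height; j += 16)
    for (int i = 0; i < width; i += 16)
      for (int y = 0; y < 16; ++y)
        for (int x = 0; x < 16; ++x)
          dst[(j + y) * pitchDst + (i + x)] = src[(i + x) * pitchSrc + (j + y)];
}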
1977
1978
static void highbd_dr_prediction_32bit_z2_Nx4_avx2(
1979
    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
1980
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
1981
290k
    int dy) {
1982
290k
  const int min_base_x = -(1 << upsample_above);
1983
290k
  const int min_base_y = -(1 << upsample_left);
1984
290k
  const int frac_bits_x = 6 - upsample_above;
1985
290k
  const int frac_bits_y = 6 - upsample_left;
1986
1987
290k
  assert(dx > 0);
1988
  // pre-filter above pixels
1989
  // store in temp buffers:
1990
  //   above[x] * 32 + 16
1991
  //   above[x+1] - above[x]
1992
  // final pixels will be calculated as:
1993
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1994
290k
  __m256i a0_x, a1_x, a32, a16;
1995
290k
  __m256i diff;
1996
290k
  __m128i c3f, min_base_y128;
1997
1998
290k
  a16 = _mm256_set1_epi32(16);
1999
290k
  c3f = _mm_set1_epi32(0x3f);
2000
290k
  min_base_y128 = _mm_set1_epi32(min_base_y);
2001
2002
1.73M
  for (int r = 0; r < N; r++) {
2003
1.44M
    __m256i b, res, shift;
2004
1.44M
    __m128i resx, resy, resxy;
2005
1.44M
    __m128i a0_x128, a1_x128;
2006
1.44M
    int y = r + 1;
2007
1.44M
    int base_x = (-y * dx) >> frac_bits_x;
2008
1.44M
    int base_shift = 0;
2009
1.44M
    if (base_x < (min_base_x - 1)) {
2010
1.21M
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
2011
1.21M
    }
2012
1.44M
    int base_min_diff =
2013
1.44M
        (min_base_x - base_x + upsample_above) >> upsample_above;
2014
1.44M
    if (base_min_diff > 4) {
2015
1.00M
      base_min_diff = 4;
2016
1.00M
    } else {
2017
443k
      if (base_min_diff < 0) base_min_diff = 0;
2018
443k
    }
2019
2020
1.44M
    if (base_shift > 3) {
2021
1.00M
      a0_x = _mm256_setzero_si256();
2022
1.00M
      a1_x = _mm256_setzero_si256();
2023
1.00M
      shift = _mm256_setzero_si256();
2024
1.00M
    } else {
2025
443k
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2026
443k
      if (upsample_above) {
2027
76.6k
        a0_x128 = _mm_shuffle_epi8(a0_x128,
2028
76.6k
                                   *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
2029
76.6k
        a1_x128 = _mm_srli_si128(a0_x128, 8);
2030
2031
76.6k
        shift = _mm256_castsi128_si256(_mm_srli_epi32(
2032
76.6k
            _mm_and_si128(
2033
76.6k
                _mm_slli_epi32(
2034
76.6k
                    _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
2035
76.6k
                                   (2 << 6) - y * dx, (3 << 6) - y * dx),
2036
76.6k
                    upsample_above),
2037
76.6k
                c3f),
2038
76.6k
            1));
2039
366k
      } else {
2040
366k
        a0_x128 =
2041
366k
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2042
366k
        a1_x128 = _mm_srli_si128(a0_x128, 2);
2043
2044
366k
        shift = _mm256_castsi128_si256(_mm_srli_epi32(
2045
366k
            _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
2046
366k
                                         (2 << 6) - y * dx, (3 << 6) - y * dx),
2047
366k
                          c3f),
2048
366k
            1));
2049
366k
      }
2050
443k
      a0_x = _mm256_cvtepu16_epi32(a0_x128);
2051
443k
      a1_x = _mm256_cvtepu16_epi32(a1_x128);
2052
443k
    }
2053
    // y calc
2054
1.44M
    __m128i a0_y, a1_y, shifty;
2055
1.44M
    if (base_x < min_base_x) {
2056
1.29M
      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2057
1.29M
      DECLARE_ALIGNED(32, int, base_y_c[4]);
2058
1.29M
      r6 = _mm_set1_epi32(r << 6);
2059
1.29M
      dy128 = _mm_set1_epi32(dy);
2060
1.29M
      c1234 = _mm_setr_epi32(1, 2, 3, 4);
2061
1.29M
      y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
2062
1.29M
      base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
2063
1.29M
      mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
2064
1.29M
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2065
1.29M
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2066
2067
1.29M
      a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
2068
1.29M
                            left[base_y_c[2]], left[base_y_c[3]]);
2069
1.29M
      a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2070
1.29M
                            left[base_y_c[2] + 1], left[base_y_c[3] + 1]);
2071
2072
1.29M
      if (upsample_left) {
2073
229k
        shifty = _mm_srli_epi32(
2074
229k
            _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
2075
1.06M
      } else {
2076
1.06M
        shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
2077
1.06M
      }
2078
1.29M
      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2079
1.29M
      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2080
1.29M
      shift = _mm256_inserti128_si256(shift, shifty, 1);
2081
1.29M
    }
2082
2083
1.44M
    diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
2084
1.44M
    a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
2085
1.44M
    a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2086
2087
1.44M
    b = _mm256_mullo_epi32(diff, shift);
2088
1.44M
    res = _mm256_add_epi32(a32, b);
2089
1.44M
    res = _mm256_srli_epi32(res, 5);
2090
2091
1.44M
    resx = _mm256_castsi256_si128(res);
2092
1.44M
    resx = _mm_packus_epi32(resx, resx);
2093
2094
1.44M
    resy = _mm256_extracti128_si256(res, 1);
2095
1.44M
    resy = _mm_packus_epi32(resy, resy);
2096
2097
1.44M
    resxy =
2098
1.44M
        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2099
1.44M
    _mm_storel_epi64((__m128i *)(dst), resxy);
2100
1.44M
    dst += stride;
2101
1.44M
  }
2102
290k
}
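In zone 2 every output pixel is taken from either the above row or the left column: the kernels compute both candidates for a row and blend them with HighbdBaseMask[base_min_diff]. Per pixel, ignoring edge upsampling, the selection reduces to the sketch below (an illustrative helper, not libaom code; index -1 addresses the top-left corner sample, as in the AV1 reference buffers):

static uint16_t z2_pixel_scalar(int r, int c, const uint16_t *above,
                                const uint16_t *left, int dx, int dy) {
  const int x = (c << 6) - (r + 1) * dx;  // position along the above row
  if ((x >> 6) >= -1) {
    const int base = x >> 6, shift = (x & 0x3f) >> 1;
    return (uint16_t)((above[base] * 32 + 16 +
                       (above[base + 1] - above[base]) * shift) >> 5);
  }
  // Otherwise interpolate along the left column.
  const int y = (r << 6) - (c + 1) * dy;
  const int base = y >> 6, shift = (y & 0x3f) >> 1;
  return (uint16_t)((left[base] * 32 + 16 +
                     (left[base + 1] - left[base]) * shift) >> 5);
}

The SIMD versions do the same work a whole row at a time: resx holds the above-row candidates, resy the left-column candidates, and base_min_diff counts how many leading columns must take the left branch.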
2103
2104
static void highbd_dr_prediction_z2_Nx4_avx2(
2105
    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2106
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
2107
213k
    int dy) {
2108
213k
  const int min_base_x = -(1 << upsample_above);
2109
213k
  const int min_base_y = -(1 << upsample_left);
2110
213k
  const int frac_bits_x = 6 - upsample_above;
2111
213k
  const int frac_bits_y = 6 - upsample_left;
2112
2113
213k
  assert(dx > 0);
2114
  // pre-filter above pixels
2115
  // store in temp buffers:
2116
  //   above[x] * 32 + 16
2117
  //   above[x+1] - above[x]
2118
  // final pixels will be calculated as:
2119
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2120
213k
  __m256i a0_x, a1_x, a32, a16;
2121
213k
  __m256i diff;
2122
213k
  __m128i c3f, min_base_y128;
2123
2124
213k
  a16 = _mm256_set1_epi16(16);
2125
213k
  c3f = _mm_set1_epi16(0x3f);
2126
213k
  min_base_y128 = _mm_set1_epi16(min_base_y);
2127
2128
1.59M
  for (int r = 0; r < N; r++) {
2129
1.38M
    __m256i b, res, shift;
2130
1.38M
    __m128i resx, resy, resxy;
2131
1.38M
    __m128i a0_x128, a1_x128;
2132
1.38M
    int y = r + 1;
2133
1.38M
    int base_x = (-y * dx) >> frac_bits_x;
2134
1.38M
    int base_shift = 0;
2135
1.38M
    if (base_x < (min_base_x - 1)) {
2136
976k
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
2137
976k
    }
2138
1.38M
    int base_min_diff =
2139
1.38M
        (min_base_x - base_x + upsample_above) >> upsample_above;
2140
1.38M
    if (base_min_diff > 4) {
2141
604k
      base_min_diff = 4;
2142
776k
    } else {
2143
776k
      if (base_min_diff < 0) base_min_diff = 0;
2144
776k
    }
2145
2146
1.38M
    if (base_shift > 3) {
2147
604k
      a0_x = _mm256_setzero_si256();
2148
604k
      a1_x = _mm256_setzero_si256();
2149
604k
      shift = _mm256_setzero_si256();
2150
776k
    } else {
2151
776k
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2152
776k
      if (upsample_above) {
2153
325k
        a0_x128 = _mm_shuffle_epi8(a0_x128,
2154
325k
                                   *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
2155
325k
        a1_x128 = _mm_srli_si128(a0_x128, 8);
2156
2157
325k
        shift = _mm256_castsi128_si256(_mm_srli_epi16(
2158
325k
            _mm_and_si128(
2159
325k
                _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2160
325k
                                              (2 << 6) - y * dx,
2161
325k
                                              (3 << 6) - y * dx, 0, 0, 0, 0),
2162
325k
                               upsample_above),
2163
325k
                c3f),
2164
325k
            1));
2165
450k
      } else {
2166
450k
        a0_x128 =
2167
450k
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2168
450k
        a1_x128 = _mm_srli_si128(a0_x128, 2);
2169
2170
450k
        shift = _mm256_castsi128_si256(_mm_srli_epi16(
2171
450k
            _mm_and_si128(
2172
450k
                _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
2173
450k
                               (3 << 6) - y * dx, 0, 0, 0, 0),
2174
450k
                c3f),
2175
450k
            1));
2176
450k
      }
2177
776k
      a0_x = _mm256_castsi128_si256(a0_x128);
2178
776k
      a1_x = _mm256_castsi128_si256(a1_x128);
2179
776k
    }
2180
    // y calc
2181
1.38M
    __m128i a0_y, a1_y, shifty;
2182
1.38M
    if (base_x < min_base_x) {
2183
1.11M
      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2184
1.11M
      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
2185
1.11M
      r6 = _mm_set1_epi16(r << 6);
2186
1.11M
      dy128 = _mm_set1_epi16(dy);
2187
1.11M
      c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
2188
1.11M
      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
2189
1.11M
      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
2190
1.11M
      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
2191
1.11M
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2192
1.11M
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2193
2194
1.11M
      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
2195
1.11M
                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
2196
1.11M
      a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2197
1.11M
                            left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0,
2198
1.11M
                            0, 0);
2199
2200
1.11M
      if (upsample_left) {
2201
346k
        shifty = _mm_srli_epi16(
2202
346k
            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
2203
773k
      } else {
2204
773k
        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
2205
773k
      }
2206
1.11M
      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2207
1.11M
      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2208
1.11M
      shift = _mm256_inserti128_si256(shift, shifty, 1);
2209
1.11M
    }
2210
2211
1.38M
    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
2212
1.38M
    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
2213
1.38M
    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2214
2215
1.38M
    b = _mm256_mullo_epi16(diff, shift);
2216
1.38M
    res = _mm256_add_epi16(a32, b);
2217
1.38M
    res = _mm256_srli_epi16(res, 5);
2218
2219
1.38M
    resx = _mm256_castsi256_si128(res);
2220
1.38M
    resy = _mm256_extracti128_si256(res, 1);
2221
1.38M
    resxy =
2222
1.38M
        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2223
1.38M
    _mm_storel_epi64((__m128i *)(dst), resxy);
2224
1.38M
    dst += stride;
2225
1.38M
  }
2226
213k
}
2227
2228
static void highbd_dr_prediction_32bit_z2_Nx8_avx2(
2229
    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2230
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
2231
282k
    int dy) {
2232
282k
  const int min_base_x = -(1 << upsample_above);
2233
282k
  const int min_base_y = -(1 << upsample_left);
2234
282k
  const int frac_bits_x = 6 - upsample_above;
2235
282k
  const int frac_bits_y = 6 - upsample_left;
2236
2237
  // pre-filter above pixels
2238
  // store in temp buffers:
2239
  //   above[x] * 32 + 16
2240
  //   above[x+1] - above[x]
2241
  // final pixels will be calculated as:
2242
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2243
282k
  __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256;
2244
282k
  __m256i diff;
2245
282k
  __m128i a0_x128, a1_x128;
2246
2247
282k
  a16 = _mm256_set1_epi32(16);
2248
282k
  c3f = _mm256_set1_epi32(0x3f);
2249
282k
  min_base_y256 = _mm256_set1_epi32(min_base_y);
2250
2251
2.99M
  for (int r = 0; r < N; r++) {
2252
2.71M
    __m256i b, res, shift;
2253
2.71M
    __m128i resx, resy, resxy;
2254
2.71M
    int y = r + 1;
2255
2.71M
    int base_x = (-y * dx) >> frac_bits_x;
2256
2.71M
    int base_shift = 0;
2257
2.71M
    if (base_x < (min_base_x - 1)) {
2258
2.13M
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
2259
2.13M
    }
2260
2.71M
    int base_min_diff =
2261
2.71M
        (min_base_x - base_x + upsample_above) >> upsample_above;
2262
2.71M
    if (base_min_diff > 8) {
2263
1.37M
      base_min_diff = 8;
2264
1.37M
    } else {
2265
1.33M
      if (base_min_diff < 0) base_min_diff = 0;
2266
1.33M
    }
2267
2268
2.71M
    if (base_shift > 7) {
2269
1.37M
      resx = _mm_setzero_si128();
2270
1.37M
    } else {
2271
1.33M
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2272
1.33M
      if (upsample_above) {
2273
69.4k
        __m128i mask, atmp0, atmp1, atmp2, atmp3;
2274
69.4k
        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
2275
69.4k
        atmp0 = _mm_shuffle_epi8(a0_x128,
2276
69.4k
                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2277
69.4k
        atmp1 = _mm_shuffle_epi8(a1_x128,
2278
69.4k
                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2279
69.4k
        atmp2 = _mm_shuffle_epi8(
2280
69.4k
            a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2281
69.4k
        atmp3 = _mm_shuffle_epi8(
2282
69.4k
            a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2283
69.4k
        mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
2284
69.4k
                              _mm_set1_epi8(15));
2285
69.4k
        a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
2286
69.4k
        mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
2287
69.4k
                              _mm_set1_epi8(15));
2288
69.4k
        a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
2289
69.4k
        shift = _mm256_srli_epi32(
2290
69.4k
            _mm256_and_si256(
2291
69.4k
                _mm256_slli_epi32(
2292
69.4k
                    _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx,
2293
69.4k
                                      (2 << 6) - y * dx, (3 << 6) - y * dx,
2294
69.4k
                                      (4 << 6) - y * dx, (5 << 6) - y * dx,
2295
69.4k
                                      (6 << 6) - y * dx, (7 << 6) - y * dx),
2296
69.4k
                    upsample_above),
2297
69.4k
                c3f),
2298
69.4k
            1);
2299
1.26M
      } else {
2300
1.26M
        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
2301
1.26M
        a0_x128 =
2302
1.26M
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2303
1.26M
        a1_x128 =
2304
1.26M
            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2305
2306
1.26M
        shift = _mm256_srli_epi32(
2307
1.26M
            _mm256_and_si256(
2308
1.26M
                _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
2309
1.26M
                                  (3 << 6) - y * dx, (4 << 6) - y * dx,
2310
1.26M
                                  (5 << 6) - y * dx, (6 << 6) - y * dx,
2311
1.26M
                                  (7 << 6) - y * dx),
2312
1.26M
                c3f),
2313
1.26M
            1);
2314
1.26M
      }
2315
1.33M
      a0_x = _mm256_cvtepu16_epi32(a0_x128);
2316
1.33M
      a1_x = _mm256_cvtepu16_epi32(a1_x128);
2317
2318
1.33M
      diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
2319
1.33M
      a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
2320
1.33M
      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2321
2322
1.33M
      b = _mm256_mullo_epi32(diff, shift);
2323
1.33M
      res = _mm256_add_epi32(a32, b);
2324
1.33M
      res = _mm256_srli_epi32(res, 5);
2325
2326
1.33M
      resx = _mm256_castsi256_si128(_mm256_packus_epi32(
2327
1.33M
          res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
2328
1.33M
    }
2329
    // y calc
2330
2.71M
    if (base_x < min_base_x) {
2331
2.34M
      DECLARE_ALIGNED(32, int, base_y_c[8]);
2332
2.34M
      __m256i r6, c256, dy256, y_c256, base_y_c256, mask256;
2333
2.34M
      r6 = _mm256_set1_epi32(r << 6);
2334
2.34M
      dy256 = _mm256_set1_epi32(dy);
2335
2.34M
      c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
2336
2.34M
      y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2337
2.34M
      base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
2338
2.34M
      mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2339
2.34M
      base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2340
2.34M
      _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2341
2342
2.34M
      a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2343
2.34M
          left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2344
2.34M
          left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2345
2.34M
          left[base_y_c[6]], left[base_y_c[7]]));
2346
2.34M
      a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2347
2.34M
          left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
2348
2.34M
          left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2349
2.34M
          left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
2350
2351
2.34M
      if (upsample_left) {
2352
120k
        shift = _mm256_srli_epi32(
2353
120k
            _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f),
2354
120k
            1);
2355
2.21M
      } else {
2356
2.21M
        shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
2357
2.21M
      }
2358
2.34M
      diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
2359
2.34M
      a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
2360
2.34M
      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2361
2362
2.34M
      b = _mm256_mullo_epi32(diff, shift);
2363
2.34M
      res = _mm256_add_epi32(a32, b);
2364
2.34M
      res = _mm256_srli_epi32(res, 5);
2365
2366
2.34M
      resy = _mm256_castsi256_si128(_mm256_packus_epi32(
2367
2.34M
          res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
2368
2.34M
    } else {
2369
371k
      resy = resx;
2370
371k
    }
2371
2.71M
    resxy =
2372
2.71M
        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2373
2.71M
    _mm_storeu_si128((__m128i *)(dst), resxy);
2374
2.71M
    dst += stride;
2375
2.71M
  }
2376
282k
}
2377
2378
static void highbd_dr_prediction_z2_Nx8_avx2(
2379
    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2380
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
2381
252k
    int dy) {
2382
252k
  const int min_base_x = -(1 << upsample_above);
2383
252k
  const int min_base_y = -(1 << upsample_left);
2384
252k
  const int frac_bits_x = 6 - upsample_above;
2385
252k
  const int frac_bits_y = 6 - upsample_left;
2386
2387
  // pre-filter above pixels
2388
  // store in temp buffers:
2389
  //   above[x] * 32 + 16
2390
  //   above[x+1] - above[x]
2391
  // final pixels will be calculated as:
2392
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2393
252k
  __m128i c3f, min_base_y128;
2394
252k
  __m256i a0_x, a1_x, diff, a32, a16;
2395
252k
  __m128i a0_x128, a1_x128;
2396
2397
252k
  a16 = _mm256_set1_epi16(16);
2398
252k
  c3f = _mm_set1_epi16(0x3f);
2399
252k
  min_base_y128 = _mm_set1_epi16(min_base_y);
2400
2401
2.68M
  for (int r = 0; r < N; r++) {
2402
2.43M
    __m256i b, res, shift;
2403
2.43M
    __m128i resx, resy, resxy;
2404
2.43M
    int y = r + 1;
2405
2.43M
    int base_x = (-y * dx) >> frac_bits_x;
2406
2.43M
    int base_shift = 0;
2407
2.43M
    if (base_x < (min_base_x - 1)) {
2408
1.78M
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
2409
1.78M
    }
2410
2.43M
    int base_min_diff =
2411
2.43M
        (min_base_x - base_x + upsample_above) >> upsample_above;
2412
2.43M
    if (base_min_diff > 8) {
2413
996k
      base_min_diff = 8;
2414
1.43M
    } else {
2415
1.43M
      if (base_min_diff < 0) base_min_diff = 0;
2416
1.43M
    }
2417
2418
2.43M
    if (base_shift > 7) {
2419
996k
      a0_x = _mm256_setzero_si256();
2420
996k
      a1_x = _mm256_setzero_si256();
2421
996k
      shift = _mm256_setzero_si256();
2422
1.43M
    } else {
2423
1.43M
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2424
1.43M
      if (upsample_above) {
2425
448k
        __m128i mask, atmp0, atmp1, atmp2, atmp3;
2426
448k
        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
2427
448k
        atmp0 = _mm_shuffle_epi8(a0_x128,
2428
448k
                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2429
448k
        atmp1 = _mm_shuffle_epi8(a1_x128,
2430
448k
                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2431
448k
        atmp2 = _mm_shuffle_epi8(
2432
448k
            a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2433
448k
        atmp3 = _mm_shuffle_epi8(
2434
448k
            a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2435
448k
        mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
2436
448k
                              _mm_set1_epi8(15));
2437
448k
        a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
2438
448k
        mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
2439
448k
                              _mm_set1_epi8(15));
2440
448k
        a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
2441
2442
448k
        shift = _mm256_castsi128_si256(_mm_srli_epi16(
2443
448k
            _mm_and_si128(
2444
448k
                _mm_slli_epi16(
2445
448k
                    _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2446
448k
                                   (2 << 6) - y * dx, (3 << 6) - y * dx,
2447
448k
                                   (4 << 6) - y * dx, (5 << 6) - y * dx,
2448
448k
                                   (6 << 6) - y * dx, (7 << 6) - y * dx),
2449
448k
                    upsample_above),
2450
448k
                c3f),
2451
448k
            1));
2452
987k
      } else {
2453
987k
        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
2454
987k
        a0_x128 =
2455
987k
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2456
987k
        a1_x128 =
2457
987k
            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2458
2459
987k
        shift = _mm256_castsi128_si256(_mm_srli_epi16(
2460
987k
            _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2461
987k
                                         (2 << 6) - y * dx, (3 << 6) - y * dx,
2462
987k
                                         (4 << 6) - y * dx, (5 << 6) - y * dx,
2463
987k
                                         (6 << 6) - y * dx, (7 << 6) - y * dx),
2464
987k
                          c3f),
2465
987k
            1));
2466
987k
      }
2467
1.43M
      a0_x = _mm256_castsi128_si256(a0_x128);
2468
1.43M
      a1_x = _mm256_castsi128_si256(a1_x128);
2469
1.43M
    }
2470
2471
    // y calc
2472
2.43M
    __m128i a0_y, a1_y, shifty;
2473
2.43M
    if (base_x < min_base_x) {
2474
2.00M
      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
2475
2.00M
      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2476
2.00M
      r6 = _mm_set1_epi16(r << 6);
2477
2.00M
      dy128 = _mm_set1_epi16(dy);
2478
2.00M
      c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
2479
2.00M
      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
2480
2.00M
      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
2481
2.00M
      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
2482
2.00M
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2483
2.00M
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2484
2485
2.00M
      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
2486
2.00M
                            left[base_y_c[2]], left[base_y_c[3]],
2487
2.00M
                            left[base_y_c[4]], left[base_y_c[5]],
2488
2.00M
                            left[base_y_c[6]], left[base_y_c[7]]);
2489
2.00M
      a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2490
2.00M
                            left[base_y_c[2] + 1], left[base_y_c[3] + 1],
2491
2.00M
                            left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2492
2.00M
                            left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
2493
2494
2.00M
      if (upsample_left) {
2495
548k
        shifty = _mm_srli_epi16(
2496
548k
            _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1);
2497
1.45M
      } else {
2498
1.45M
        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
2499
1.45M
      }
2500
2.00M
      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2501
2.00M
      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2502
2.00M
      shift = _mm256_inserti128_si256(shift, shifty, 1);
2503
2.00M
    }
2504
2505
2.43M
    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
2506
2.43M
    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
2507
2.43M
    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2508
2509
2.43M
    b = _mm256_mullo_epi16(diff, shift);
2510
2.43M
    res = _mm256_add_epi16(a32, b);
2511
2.43M
    res = _mm256_srli_epi16(res, 5);
2512
2513
2.43M
    resx = _mm256_castsi256_si128(res);
2514
2.43M
    resy = _mm256_extracti128_si256(res, 1);
2515
2516
2.43M
    resxy =
2517
2.43M
        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2518
2.43M
    _mm_storeu_si128((__m128i *)(dst), resxy);
2519
2.43M
    dst += stride;
2520
2.43M
  }
2521
252k
}
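The tail above shows the complete zone-2 recipe for an 8-wide row: lanes whose projection still lands on the above row are interpolated from a0_x/a1_x, lanes that have run past min_base_x are recomputed from left as a0_y/a1_y, and HighbdBaseMask[base_min_diff] blends the two results. As a point of reference, the per-pixel rule being vectorised can be sketched in scalar C as follows, ignoring the upsampling paths and edge clamping (dr_z2_scalar_sketch and its parameters are illustrative names, not part of the library; an index of -1 is legal here because AV1 keeps the top-left corner sample at above[-1] and left[-1]):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of the zone-2 interpolation (no upsampling, no edge clamps). */
static void dr_z2_scalar_sketch(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
                                const uint16_t *above, const uint16_t *left,
                                int dx, int dy) {
  for (int r = 0; r < bh; ++r, dst += stride) {
    for (int c = 0; c < bw; ++c) {
      int x = (c << 6) - (r + 1) * dx;   /* 6-bit fixed-point column in 'above' */
      if ((x >> 6) >= -1) {              /* projection still lands on the above row */
        int base = x >> 6, shift = (x & 0x3f) >> 1;
        dst[c] = (uint16_t)((above[base] * 32 + 16 +
                             (above[base + 1] - above[base]) * shift) >> 5);
      } else {                           /* fell off the left end: use 'left' */
        int y = (r << 6) - (c + 1) * dy; /* 6-bit fixed-point row in 'left' */
        int base = y >> 6, shift = (y & 0x3f) >> 1;
        dst[c] = (uint16_t)((left[base] * 32 + 16 +
                             (left[base + 1] - left[base]) * shift) >> 5);
      }
    }
  }
}

The two branches are the same two-tap filter spelled out in the "pre-filter above pixels" comments of the HxW variants below; only the source array and the fixed-point coordinate differ.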
2522
2523
static void highbd_dr_prediction_32bit_z2_HxW_avx2(
2524
    int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2525
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
2526
177k
    int dy) {
2527
  // here upsample_above and upsample_left are 0 by design of
2528
  // av1_use_intra_edge_upsample
2529
177k
  const int min_base_x = -1;
2530
177k
  const int min_base_y = -1;
2531
177k
  (void)upsample_above;
2532
177k
  (void)upsample_left;
2533
177k
  const int frac_bits_x = 6;
2534
177k
  const int frac_bits_y = 6;
2535
2536
  // pre-filter above pixels
2537
  // store in temp buffers:
2538
  //   above[x] * 32 + 16
2539
  //   above[x+1] - above[x]
2540
  // final pixels will be calculated as:
2541
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2542
177k
  __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16, c1;
2543
177k
  __m256i diff, min_base_y256, c3f, dy256, c1234, c0123, c8;
2544
177k
  __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
2545
177k
  DECLARE_ALIGNED(32, int, base_y_c[16]);
2546
2547
177k
  a16 = _mm256_set1_epi32(16);
2548
177k
  c1 = _mm256_srli_epi32(a16, 4);
2549
177k
  c8 = _mm256_srli_epi32(a16, 1);
2550
177k
  min_base_y256 = _mm256_set1_epi32(min_base_y);
2551
177k
  c3f = _mm256_set1_epi32(0x3f);
2552
177k
  dy256 = _mm256_set1_epi32(dy);
2553
177k
  c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2554
177k
  c1234 = _mm256_add_epi32(c0123, c1);
2555
2556
2.68M
  for (int r = 0; r < H; r++) {
2557
2.50M
    __m256i b, res, shift, ydx;
2558
2.50M
    __m256i resx[2], resy[2];
2559
2.50M
    __m256i resxy, j256, r6;
2560
7.67M
    for (int j = 0; j < W; j += 16) {
2561
5.16M
      j256 = _mm256_set1_epi32(j);
2562
5.16M
      int y = r + 1;
2563
5.16M
      ydx = _mm256_set1_epi32(y * dx);
2564
2565
5.16M
      int base_x = ((j << 6) - y * dx) >> frac_bits_x;
2566
5.16M
      int base_shift = 0;
2567
5.16M
      if ((base_x) < (min_base_x - 1)) {
2568
3.24M
        base_shift = (min_base_x - base_x - 1);
2569
3.24M
      }
2570
5.16M
      int base_min_diff = (min_base_x - base_x);
2571
5.16M
      if (base_min_diff > 16) {
2572
2.10M
        base_min_diff = 16;
2573
3.05M
      } else {
2574
3.05M
        if (base_min_diff < 0) base_min_diff = 0;
2575
3.05M
      }
2576
2577
5.16M
      if (base_shift > 7) {
2578
2.55M
        resx[0] = _mm256_setzero_si256();
2579
2.60M
      } else {
2580
2.60M
        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2581
2.60M
        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
2582
2.60M
        a0_x128 =
2583
2.60M
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2584
2.60M
        a1_x128 =
2585
2.60M
            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2586
2587
2.60M
        a0_x = _mm256_cvtepu16_epi32(a0_x128);
2588
2.60M
        a1_x = _mm256_cvtepu16_epi32(a1_x128);
2589
2590
2.60M
        r6 = _mm256_slli_epi32(_mm256_add_epi32(c0123, j256), 6);
2591
2.60M
        shift = _mm256_srli_epi32(
2592
2.60M
            _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
2593
2594
2.60M
        diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
2595
2.60M
        a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
2596
2.60M
        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2597
2598
2.60M
        b = _mm256_mullo_epi32(diff, shift);
2599
2.60M
        res = _mm256_add_epi32(a32, b);
2600
2.60M
        res = _mm256_srli_epi32(res, 5);
2601
2602
2.60M
        resx[0] = _mm256_packus_epi32(
2603
2.60M
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2604
2.60M
      }
2605
5.16M
      int base_shift8 = 0;
2606
5.16M
      if ((base_x + 8) < (min_base_x - 1)) {
2607
2.50M
        base_shift8 = (min_base_x - (base_x + 8) - 1);
2608
2.50M
      }
2609
5.16M
      if (base_shift8 > 7) {
2610
2.10M
        resx[1] = _mm256_setzero_si256();
2611
3.05M
      } else {
2612
3.05M
        a0_1_x128 =
2613
3.05M
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8));
2614
3.05M
        a1_1_x128 =
2615
3.05M
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9));
2616
3.05M
        a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
2617
3.05M
                                     *(__m128i *)HighbdLoadMaskx[base_shift8]);
2618
3.05M
        a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
2619
3.05M
                                     *(__m128i *)HighbdLoadMaskx[base_shift8]);
2620
2621
3.05M
        a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128);
2622
3.05M
        a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128);
2623
2624
3.05M
        r6 = _mm256_slli_epi32(
2625
3.05M
            _mm256_add_epi32(c0123, _mm256_add_epi32(j256, c8)), 6);
2626
3.05M
        shift = _mm256_srli_epi32(
2627
3.05M
            _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
2628
2629
3.05M
        diff = _mm256_sub_epi32(a1_1_x, a0_1_x);  // a[x+1] - a[x]
2630
3.05M
        a32 = _mm256_slli_epi32(a0_1_x, 5);       // a[x] * 32
2631
3.05M
        a32 = _mm256_add_epi32(a32, a16);         // a[x] * 32 + 16
2632
3.05M
        b = _mm256_mullo_epi32(diff, shift);
2633
2634
3.05M
        resx[1] = _mm256_add_epi32(a32, b);
2635
3.05M
        resx[1] = _mm256_srli_epi32(resx[1], 5);
2636
3.05M
        resx[1] = _mm256_packus_epi32(
2637
3.05M
            resx[1],
2638
3.05M
            _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1)));
2639
3.05M
      }
2640
5.16M
      resx[0] =
2641
5.16M
          _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]),
2642
5.16M
                                  1);  // 16 16bit values
2643
2644
      // y calc
2645
5.16M
      resy[0] = _mm256_setzero_si256();
2646
5.16M
      if ((base_x < min_base_x)) {
2647
3.42M
        __m256i c256, y_c256, y_c_1_256, base_y_c256, mask256;
2648
3.42M
        r6 = _mm256_set1_epi32(r << 6);
2649
3.42M
        c256 = _mm256_add_epi32(j256, c1234);
2650
3.42M
        y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2651
3.42M
        base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
2652
3.42M
        mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2653
3.42M
        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2654
3.42M
        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2655
3.42M
        c256 = _mm256_add_epi32(c256, c8);
2656
3.42M
        y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2657
3.42M
        base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y);
2658
3.42M
        mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2659
3.42M
        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2660
3.42M
        _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256);
2661
2662
3.42M
        a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2663
3.42M
            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2664
3.42M
            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2665
3.42M
            left[base_y_c[6]], left[base_y_c[7]]));
2666
3.42M
        a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2667
3.42M
            left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
2668
3.42M
            left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2669
3.42M
            left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
2670
2671
3.42M
        shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
2672
2673
3.42M
        diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
2674
3.42M
        a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
2675
3.42M
        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2676
2677
3.42M
        b = _mm256_mullo_epi32(diff, shift);
2678
3.42M
        res = _mm256_add_epi32(a32, b);
2679
3.42M
        res = _mm256_srli_epi32(res, 5);
2680
2681
3.42M
        resy[0] = _mm256_packus_epi32(
2682
3.42M
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2683
2684
3.42M
        a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2685
3.42M
            left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]],
2686
3.42M
            left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]],
2687
3.42M
            left[base_y_c[14]], left[base_y_c[15]]));
2688
3.42M
        a1_y = _mm256_cvtepu16_epi32(
2689
3.42M
            _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1],
2690
3.42M
                           left[base_y_c[10] + 1], left[base_y_c[11] + 1],
2691
3.42M
                           left[base_y_c[12] + 1], left[base_y_c[13] + 1],
2692
3.42M
                           left[base_y_c[14] + 1], left[base_y_c[15] + 1]));
2693
3.42M
        shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1);
2694
2695
3.42M
        diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
2696
3.42M
        a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
2697
3.42M
        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2698
2699
3.42M
        b = _mm256_mullo_epi32(diff, shift);
2700
3.42M
        res = _mm256_add_epi32(a32, b);
2701
3.42M
        res = _mm256_srli_epi32(res, 5);
2702
2703
3.42M
        resy[1] = _mm256_packus_epi32(
2704
3.42M
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2705
2706
3.42M
        resy[0] =
2707
3.42M
            _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]),
2708
3.42M
                                    1);  // 16 16bit values
2709
3.42M
      }
2710
2711
5.16M
      resxy = _mm256_blendv_epi8(resx[0], resy[0],
2712
5.16M
                                 *(__m256i *)HighbdBaseMask[base_min_diff]);
2713
5.16M
      _mm256_storeu_si256((__m256i *)(dst + j), resxy);
2714
5.16M
    }  // for j
2715
2.50M
    dst += stride;
2716
2.50M
  }
2717
177k
}
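This 32-bit-lane variant coexists with the 16-bit highbd_dr_prediction_z2_HxW_avx2 below, and the dispatcher at the end of the zone-2 block selects it only when bd == 12. The split is presumably about headroom: with 12-bit samples the intermediate above[x] * 32 + 16 can reach 4095 * 32 + 16 = 131,056, which no longer fits in a 16-bit lane, while for bd < 12 the same term peaks at 1023 * 32 + 16 = 32,752 and the cheaper epi16 arithmetic suffices.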
2718
2719
static void highbd_dr_prediction_z2_HxW_avx2(
2720
    int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2721
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
2722
420k
    int dy) {
2723
  // here upsample_above and upsample_left are 0 by design of
2724
  // av1_use_intra_edge_upsample
2725
420k
  const int min_base_x = -1;
2726
420k
  const int min_base_y = -1;
2727
420k
  (void)upsample_above;
2728
420k
  (void)upsample_left;
2729
420k
  const int frac_bits_x = 6;
2730
420k
  const int frac_bits_y = 6;
2731
2732
  // pre-filter above pixels
2733
  // store in temp buffers:
2734
  //   above[x] * 32 + 16
2735
  //   above[x+1] - above[x]
2736
  // final pixels will be calculated as:
2737
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2738
420k
  __m256i a0_x, a1_x, a32, a16, c3f, c1;
2739
420k
  __m256i diff, min_base_y256, dy256, c1234, c0123;
2740
420k
  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
2741
2742
420k
  a16 = _mm256_set1_epi16(16);
2743
420k
  c1 = _mm256_srli_epi16(a16, 4);
2744
420k
  min_base_y256 = _mm256_set1_epi16(min_base_y);
2745
420k
  c3f = _mm256_set1_epi16(0x3f);
2746
420k
  dy256 = _mm256_set1_epi16(dy);
2747
420k
  c0123 =
2748
420k
      _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
2749
420k
  c1234 = _mm256_add_epi16(c0123, c1);
2750
2751
8.12M
  for (int r = 0; r < H; r++) {
2752
7.70M
    __m256i b, res, shift;
2753
7.70M
    __m256i resx, resy, ydx;
2754
7.70M
    __m256i resxy, j256, r6;
2755
7.70M
    __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
2756
7.70M
    int y = r + 1;
2757
7.70M
    ydx = _mm256_set1_epi16((short)(y * dx));
2758
2759
21.0M
    for (int j = 0; j < W; j += 16) {
2760
13.3M
      j256 = _mm256_set1_epi16(j);
2761
13.3M
      int base_x = ((j << 6) - y * dx) >> frac_bits_x;
2762
13.3M
      int base_shift = 0;
2763
13.3M
      if ((base_x) < (min_base_x - 1)) {
2764
9.93M
        base_shift = (min_base_x - (base_x)-1);
2765
9.93M
      }
2766
13.3M
      int base_min_diff = (min_base_x - base_x);
2767
13.3M
      if (base_min_diff > 16) {
2768
7.23M
        base_min_diff = 16;
2769
7.23M
      } else {
2770
6.15M
        if (base_min_diff < 0) base_min_diff = 0;
2771
6.15M
      }
2772
2773
13.3M
      if (base_shift < 8) {
2774
5.12M
        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2775
5.12M
        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
2776
5.12M
        a0_x128 =
2777
5.12M
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2778
5.12M
        a1_x128 =
2779
5.12M
            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2780
2781
5.12M
        a0_x = _mm256_castsi128_si256(a0_x128);
2782
5.12M
        a1_x = _mm256_castsi128_si256(a1_x128);
2783
8.26M
      } else {
2784
8.26M
        a0_x = _mm256_setzero_si256();
2785
8.26M
        a1_x = _mm256_setzero_si256();
2786
8.26M
      }
2787
2788
13.3M
      int base_shift1 = 0;
2789
13.3M
      if (base_shift > 8) {
2790
8.10M
        base_shift1 = base_shift - 8;
2791
8.10M
      }
2792
13.3M
      if (base_shift1 < 8) {
2793
6.15M
        a0_1_x128 =
2794
6.15M
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8));
2795
6.15M
        a1_1_x128 =
2796
6.15M
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9));
2797
6.15M
        a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
2798
6.15M
                                     *(__m128i *)HighbdLoadMaskx[base_shift1]);
2799
6.15M
        a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
2800
6.15M
                                     *(__m128i *)HighbdLoadMaskx[base_shift1]);
2801
2802
6.15M
        a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1);
2803
6.15M
        a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1);
2804
6.15M
      }
2805
13.3M
      r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
2806
13.3M
      shift = _mm256_srli_epi16(
2807
13.3M
          _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
2808
2809
13.3M
      diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
2810
13.3M
      a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
2811
13.3M
      a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2812
2813
13.3M
      b = _mm256_mullo_epi16(diff, shift);
2814
13.3M
      res = _mm256_add_epi16(a32, b);
2815
13.3M
      resx = _mm256_srli_epi16(res, 5);  // 16 16-bit values
2816
2817
      // y calc
2818
13.3M
      resy = _mm256_setzero_si256();
2819
13.3M
      __m256i a0_y, a1_y, shifty;
2820
13.3M
      if ((base_x < min_base_x)) {
2821
10.4M
        __m256i c256, y_c256, base_y_c256, mask256, mul16;
2822
10.4M
        r6 = _mm256_set1_epi16(r << 6);
2823
10.4M
        c256 = _mm256_add_epi16(j256, c1234);
2824
10.4M
        mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
2825
10.4M
                                 _mm256_srli_epi16(min_base_y256, 1));
2826
10.4M
        y_c256 = _mm256_sub_epi16(r6, mul16);
2827
10.4M
        base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
2828
10.4M
        mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
2829
10.4M
        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2830
10.4M
        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2831
2832
10.4M
        a0_y = _mm256_setr_epi16(
2833
10.4M
            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2834
10.4M
            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2835
10.4M
            left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
2836
10.4M
            left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
2837
10.4M
            left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
2838
10.4M
            left[base_y_c[15]]);
2839
10.4M
        base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
2840
10.4M
        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2841
2842
10.4M
        a1_y = _mm256_setr_epi16(
2843
10.4M
            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2844
10.4M
            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2845
10.4M
            left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
2846
10.4M
            left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
2847
10.4M
            left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
2848
10.4M
            left[base_y_c[15]]);
2849
2850
10.4M
        shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
2851
2852
10.4M
        diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
2853
10.4M
        a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
2854
10.4M
        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2855
2856
10.4M
        b = _mm256_mullo_epi16(diff, shifty);
2857
10.4M
        res = _mm256_add_epi16(a32, b);
2858
10.4M
        resy = _mm256_srli_epi16(res, 5);
2859
10.4M
      }
2860
2861
13.3M
      resxy = _mm256_blendv_epi8(resx, resy,
2862
13.3M
                                 *(__m256i *)HighbdBaseMask[base_min_diff]);
2863
13.3M
      _mm256_storeu_si256((__m256i *)(dst + j), resxy);
2864
13.3M
    }  // for j
2865
7.70M
    dst += stride;
2866
7.70M
  }
2867
420k
}
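One easily missed detail in the 16-bit path above is the _mm256_min_epu16 clamp on the c * dy product: min_base_y256 is _mm256_set1_epi16(-1), i.e. all lanes 0xffff, so _mm256_srli_epi16(min_base_y256, 1) is the constant 0x7fff and the clamp saturates the product at 32767. Lanes whose product would exceed that are discarded by the final blend anyway; the clamp appears to exist so that their intermediate base_y index stays a harmless small value rather than becoming a large bogus offset into left[]. Per lane the step amounts to this (hypothetical scalar view, not library code):

uint16_t mul = (uint16_t)(c * dy);        /* what _mm256_mullo_epi16 leaves in the lane */
if (mul > 0x7fff) mul = 0x7fff;           /* _mm256_min_epu16 against 0x7fff */
int16_t y_c = (int16_t)((r << 6) - mul);  /* subtraction no longer sees a wrapped 'negative' product */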
2868
2869
// Directional prediction, zone 2: 90 < angle < 180
2870
void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
2871
                                      int bh, const uint16_t *above,
2872
                                      const uint16_t *left, int upsample_above,
2873
                                      int upsample_left, int dx, int dy,
2874
1.63M
                                      int bd) {
2875
1.63M
  (void)bd;
2876
1.63M
  assert(dx > 0);
2877
1.63M
  assert(dy > 0);
2878
1.63M
  switch (bw) {
2879
504k
    case 4:
2880
504k
      if (bd < 12) {
2881
213k
        highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left,
2882
213k
                                         upsample_above, upsample_left, dx, dy);
2883
290k
      } else {
2884
290k
        highbd_dr_prediction_32bit_z2_Nx4_avx2(bh, dst, stride, above, left,
2885
290k
                                               upsample_above, upsample_left,
2886
290k
                                               dx, dy);
2887
290k
      }
2888
504k
      break;
2889
534k
    case 8:
2890
534k
      if (bd < 12) {
2891
252k
        highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left,
2892
252k
                                         upsample_above, upsample_left, dx, dy);
2893
282k
      } else {
2894
282k
        highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left,
2895
282k
                                               upsample_above, upsample_left,
2896
282k
                                               dx, dy);
2897
282k
      }
2898
534k
      break;
2899
598k
    default:
2900
598k
      if (bd < 12) {
2901
420k
        highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
2902
420k
                                         upsample_above, upsample_left, dx, dy);
2903
420k
      } else {
2904
177k
        highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left,
2905
177k
                                               upsample_above, upsample_left,
2906
177k
                                               dx, dy);
2907
177k
      }
2908
598k
      break;
2909
1.63M
  }
2910
1.63M
}
2911
2912
//  Directional prediction, zone 3 functions
2913
static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride,
2914
                                             const uint16_t *left,
2915
                                             int upsample_left, int dy,
2916
225k
                                             int bd) {
2917
225k
  __m128i dstvec[4], d[4];
2918
225k
  if (bd < 12) {
2919
189k
    highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left,
2920
189k
                                              dy);
2921
189k
  } else {
2922
36.4k
    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(4, dstvec, left,
2923
36.4k
                                                    upsample_left, dy);
2924
36.4k
  }
2925
225k
  highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2],
2926
225k
                                   &dstvec[3], &d[0], &d[1], &d[2], &d[3]);
2927
225k
  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
2928
225k
  _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
2929
225k
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
2930
225k
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
2931
225k
  return;
2932
225k
}
2933
2934
static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
2935
                                             const uint16_t *left,
2936
                                             int upsample_left, int dy,
2937
210k
                                             int bd) {
2938
210k
  __m128i dstvec[8], d[8];
2939
210k
  if (bd < 12) {
2940
110k
    highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left,
2941
110k
                                              dy);
2942
110k
  } else {
2943
100k
    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(8, dstvec, left,
2944
100k
                                                    upsample_left, dy);
2945
100k
  }
2946
210k
  highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2947
210k
                           &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
2948
210k
                           &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
2949
210k
                           &d[7]);
2950
1.89M
  for (int i = 0; i < 8; i++) {
2951
1.68M
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
2952
1.68M
  }
2953
210k
}
2954
2955
static void highbd_dr_prediction_z3_4x8_avx2(uint16_t *dst, ptrdiff_t stride,
2956
                                             const uint16_t *left,
2957
                                             int upsample_left, int dy,
2958
35.7k
                                             int bd) {
2959
35.7k
  __m128i dstvec[4], d[8];
2960
35.7k
  if (bd < 12) {
2961
19.7k
    highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left,
2962
19.7k
                                              dy);
2963
19.7k
  } else {
2964
15.9k
    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(4, dstvec, left,
2965
15.9k
                                                    upsample_left, dy);
2966
15.9k
  }
2967
2968
35.7k
  highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2969
35.7k
                               &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
2970
35.7k
                               &d[7]);
2971
321k
  for (int i = 0; i < 8; i++) {
2972
286k
    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
2973
286k
  }
2974
35.7k
}
2975
2976
static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
2977
                                             const uint16_t *left,
2978
                                             int upsample_left, int dy,
2979
66.0k
                                             int bd) {
2980
66.0k
  __m128i dstvec[8], d[4];
2981
66.0k
  if (bd < 12) {
2982
37.7k
    highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left,
2983
37.7k
                                              dy);
2984
37.7k
  } else {
2985
28.3k
    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(8, dstvec, left,
2986
28.3k
                                                    upsample_left, dy);
2987
28.3k
  }
2988
2989
66.0k
  highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2990
66.0k
                               &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
2991
66.0k
                               &d[0], &d[1], &d[2], &d[3]);
2992
66.0k
  _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
2993
66.0k
  _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]);
2994
66.0k
  _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]);
2995
66.0k
  _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]);
2996
66.0k
}
2997
2998
static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
2999
                                              const uint16_t *left,
3000
                                              int upsample_left, int dy,
3001
42.5k
                                              int bd) {
3002
42.5k
  __m256i dstvec[8], d[8];
3003
42.5k
  if (bd < 12) {
3004
28.4k
    highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left,
3005
28.4k
                                               dy);
3006
28.4k
  } else {
3007
14.1k
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(8, dstvec, left,
3008
14.1k
                                                     upsample_left, dy);
3009
14.1k
  }
3010
42.5k
  highbd_transpose8x16_16x8_avx2(dstvec, d);
3011
382k
  for (int i = 0; i < 8; i++) {
3012
340k
    _mm_storeu_si128((__m128i *)(dst + i * stride),
3013
340k
                     _mm256_castsi256_si128(d[i]));
3014
340k
  }
3015
382k
  for (int i = 8; i < 16; i++) {
3016
340k
    _mm_storeu_si128((__m128i *)(dst + i * stride),
3017
340k
                     _mm256_extracti128_si256(d[i - 8], 1));
3018
340k
  }
3019
42.5k
}
3020
3021
static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
3022
                                              const uint16_t *left,
3023
                                              int upsample_left, int dy,
3024
85.7k
                                              int bd) {
3025
85.7k
  __m128i dstvec[16], d[16];
3026
85.7k
  if (bd < 12) {
3027
51.0k
    highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left,
3028
51.0k
                                              dy);
3029
51.0k
  } else {
3030
34.7k
    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(16, dstvec, left,
3031
34.7k
                                                    upsample_left, dy);
3032
34.7k
  }
3033
257k
  for (int i = 0; i < 16; i += 8) {
3034
171k
    highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
3035
171k
                             &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
3036
171k
                             &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
3037
171k
                             &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
3038
171k
                             &d[5 + i], &d[6 + i], &d[7 + i]);
3039
171k
  }
3040
771k
  for (int i = 0; i < 8; i++) {
3041
685k
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
3042
685k
    _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
3043
685k
  }
3044
85.7k
}
3045
3046
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3047
static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride,
3048
                                              const uint16_t *left,
3049
                                              int upsample_left, int dy,
3050
24.1k
                                              int bd) {
3051
24.1k
  __m256i dstvec[4], d[4], d1;
3052
24.1k
  if (bd < 12) {
3053
15.9k
    highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left,
3054
15.9k
                                               dy);
3055
15.9k
  } else {
3056
8.21k
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(4, dstvec, left,
3057
8.21k
                                                     upsample_left, dy);
3058
8.21k
  }
3059
24.1k
  highbd_transpose4x16_avx2(dstvec, d);
3060
120k
  for (int i = 0; i < 4; i++) {
3061
96.5k
    _mm_storel_epi64((__m128i *)(dst + i * stride),
3062
96.5k
                     _mm256_castsi256_si128(d[i]));
3063
96.5k
    d1 = _mm256_bsrli_epi128(d[i], 8);
3064
96.5k
    _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride),
3065
96.5k
                     _mm256_castsi256_si128(d1));
3066
96.5k
    _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
3067
96.5k
                     _mm256_extracti128_si256(d[i], 1));
3068
96.5k
    _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride),
3069
96.5k
                     _mm256_extracti128_si256(d1, 1));
3070
96.5k
  }
3071
24.1k
}
3072
3073
static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
3074
                                              const uint16_t *left,
3075
                                              int upsample_left, int dy,
3076
63.8k
                                              int bd) {
3077
63.8k
  __m128i dstvec[16], d[8];
3078
63.8k
  if (bd < 12) {
3079
49.9k
    highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left,
3080
49.9k
                                              dy);
3081
49.9k
  } else {
3082
13.8k
    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(16, dstvec, left,
3083
13.8k
                                                    upsample_left, dy);
3084
13.8k
  }
3085
63.8k
  highbd_transpose16x4_8x8_sse2(dstvec, d);
3086
3087
63.8k
  _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
3088
63.8k
  _mm_storeu_si128((__m128i *)(dst + 0 * stride + 8), d[1]);
3089
63.8k
  _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]);
3090
63.8k
  _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]);
3091
63.8k
  _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]);
3092
63.8k
  _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]);
3093
63.8k
  _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]);
3094
63.8k
  _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]);
3095
63.8k
}
3096
3097
static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
3098
                                              const uint16_t *left,
3099
                                              int upsample_left, int dy,
3100
13.1k
                                              int bd) {
3101
13.1k
  __m256i dstvec[16], d[16];
3102
13.1k
  if (bd < 12) {
3103
10.9k
    highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left,
3104
10.9k
                                               dy);
3105
10.9k
  } else {
3106
2.21k
    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(8, dstvec, left,
3107
2.21k
                                                     upsample_left, dy);
3108
2.21k
  }
3109
3110
39.4k
  for (int i = 0; i < 16; i += 8) {
3111
26.2k
    highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
3112
26.2k
  }
3113
3114
118k
  for (int i = 0; i < 8; i++) {
3115
105k
    _mm_storeu_si128((__m128i *)(dst + i * stride),
3116
105k
                     _mm256_castsi256_si128(d[i]));
3117
105k
  }
3118
118k
  for (int i = 0; i < 8; i++) {
3119
105k
    _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
3120
105k
                     _mm256_extracti128_si256(d[i], 1));
3121
105k
  }
3122
118k
  for (int i = 8; i < 16; i++) {
3123
105k
    _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
3124
105k
                     _mm256_castsi256_si128(d[i]));
3125
105k
  }
3126
118k
  for (int i = 8; i < 16; i++) {
3127
105k
    _mm_storeu_si128((__m128i *)(dst + (i + 16) * stride),
3128
105k
                     _mm256_extracti128_si256(d[i], 1));
3129
105k
  }
3130
13.1k
}
3131
3132
static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
3133
                                              const uint16_t *left,
3134
                                              int upsample_left, int dy,
3135
57.0k
                                              int bd) {
3136
57.0k
  __m128i dstvec[32], d[32];
3137
57.0k
  if (bd < 12) {
3138
48.8k
    highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left,
3139
48.8k
                                              dy);
3140
48.8k
  } else {
3141
8.23k
    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(32, dstvec, left,
3142
8.23k
                                                    upsample_left, dy);
3143
8.23k
  }
3144
3145
285k
  for (int i = 0; i < 32; i += 8) {
3146
228k
    highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
3147
228k
                             &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
3148
228k
                             &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
3149
228k
                             &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
3150
228k
                             &d[5 + i], &d[6 + i], &d[7 + i]);
3151
228k
  }
3152
513k
  for (int i = 0; i < 8; i++) {
3153
456k
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
3154
456k
    _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
3155
456k
    _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]);
3156
456k
    _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]);
3157
456k
  }
3158
57.0k
}
3159
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3160
3161
static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
3162
                                               const uint16_t *left,
3163
                                               int upsample_left, int dy,
3164
112k
                                               int bd) {
3165
112k
  __m256i dstvec[16], d[16];
3166
112k
  if (bd < 12) {
3167
94.1k
    highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left,
3168
94.1k
                                               dy);
3169
94.1k
  } else {
3170
18.2k
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(16, dstvec, left,
3171
18.2k
                                                     upsample_left, dy);
3172
18.2k
  }
3173
3174
112k
  highbd_transpose16x16_avx2(dstvec, d);
3175
3176
1.90M
  for (int i = 0; i < 16; i++) {
3177
1.79M
    _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]);
3178
1.79M
  }
3179
112k
}
3180
3181
static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
3182
                                               const uint16_t *left,
3183
                                               int upsample_left, int dy,
3184
94.8k
                                               int bd) {
3185
94.8k
  __m256i dstvec[64], d[16];
3186
94.8k
  if (bd < 12) {
3187
89.8k
    highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left,
3188
89.8k
                                               dy);
3189
89.8k
  } else {
3190
5.02k
    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(32, dstvec, left,
3191
5.02k
                                                     upsample_left, dy);
3192
5.02k
  }
3193
94.8k
  highbd_transpose16x16_avx2(dstvec, d);
3194
1.61M
  for (int j = 0; j < 16; j++) {
3195
1.51M
    _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]);
3196
1.51M
  }
3197
94.8k
  highbd_transpose16x16_avx2(dstvec + 16, d);
3198
1.61M
  for (int j = 0; j < 16; j++) {
3199
1.51M
    _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]);
3200
1.51M
  }
3201
94.8k
  highbd_transpose16x16_avx2(dstvec + 32, d);
3202
1.61M
  for (int j = 0; j < 16; j++) {
3203
1.51M
    _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]);
3204
1.51M
  }
3205
94.8k
  highbd_transpose16x16_avx2(dstvec + 48, d);
3206
1.61M
  for (int j = 0; j < 16; j++) {
3207
1.51M
    _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]);
3208
1.51M
  }
3209
94.8k
}
3210
3211
static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
3212
                                               const uint16_t *left,
3213
                                               int upsample_left, int dy,
3214
30.9k
                                               int bd) {
3215
30.9k
  DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]);
3216
30.9k
  if (bd < 12) {
3217
26.9k
    highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
3218
26.9k
  } else {
3219
4.01k
    highbd_dr_prediction_32bit_z1_64xN_avx2(64, dstT, 64, left, upsample_left,
3220
4.01k
                                            dy);
3221
4.01k
  }
3222
30.9k
  highbd_transpose(dstT, 64, dst, stride, 64, 64);
3223
30.9k
}
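The 64x64 case above shows the zone-3 pattern in its plainest form: zone 3 (angle > 180) reads only the left edge, so every kernel in this block runs the zone-1 predictor along left, with width and height swapped, and then transposes the result into dst, through a stack buffer here and in registers for the smaller shapes. Conceptually the final step is just (illustrative scalar view; tmp holds the swapped zone-1 block with row pitch tmp_stride):

for (int r = 0; r < bh; ++r)
  for (int c = 0; c < bw; ++c)
    dst[r * stride + c] = tmp[c * tmp_stride + r];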
3224
3225
static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
3226
                                               const uint16_t *left,
3227
                                               int upsample_left, int dy,
3228
25.9k
                                               int bd) {
3229
25.9k
  __m256i dstvec[32], d[32];
3230
25.9k
  if (bd < 12) {
3231
22.4k
    highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left,
3232
22.4k
                                               dy);
3233
22.4k
  } else {
3234
3.45k
    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(16, dstvec, left,
3235
3.45k
                                                     upsample_left, dy);
3236
3.45k
  }
3237
129k
  for (int i = 0; i < 32; i += 8) {
3238
103k
    highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
3239
103k
  }
3240
  // store
3241
77.7k
  for (int j = 0; j < 32; j += 16) {
3242
466k
    for (int i = 0; i < 8; i++) {
3243
414k
      _mm_storeu_si128((__m128i *)(dst + (i + j) * stride),
3244
414k
                       _mm256_castsi256_si128(d[(i + j)]));
3245
414k
    }
3246
466k
    for (int i = 0; i < 8; i++) {
3247
414k
      _mm_storeu_si128((__m128i *)(dst + (i + j) * stride + 8),
3248
414k
                       _mm256_castsi256_si128(d[(i + j) + 8]));
3249
414k
    }
3250
466k
    for (int i = 8; i < 16; i++) {
3251
414k
      _mm256_storeu_si256(
3252
414k
          (__m256i *)(dst + (i + j) * stride),
3253
414k
          _mm256_inserti128_si256(
3254
414k
              d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0));
3255
414k
    }
3256
51.8k
  }
3257
25.9k
}
3258
3259
static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
3260
                                               const uint16_t *left,
3261
                                               int upsample_left, int dy,
3262
30.4k
                                               int bd) {
3263
30.4k
  __m256i dstvec[32], d[16];
3264
30.4k
  if (bd < 12) {
3265
27.6k
    highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left,
3266
27.6k
                                               dy);
3267
27.6k
  } else {
3268
2.74k
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(32, dstvec, left,
3269
2.74k
                                                     upsample_left, dy);
3270
2.74k
  }
3271
91.2k
  for (int i = 0; i < 32; i += 16) {
3272
60.8k
    highbd_transpose16x16_avx2((dstvec + i), d);
3273
1.03M
    for (int j = 0; j < 16; j++) {
3274
973k
      _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
3275
973k
    }
3276
60.8k
  }
3277
30.4k
}
3278
3279
static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
3280
                                               const uint16_t *left,
3281
                                               int upsample_left, int dy,
3282
2.18k
                                               int bd) {
3283
2.18k
  uint16_t dstT[64 * 32];
3284
2.18k
  if (bd < 12) {
3285
1.74k
    highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
3286
1.74k
  } else {
3287
446
    highbd_dr_prediction_32bit_z1_64xN_avx2(32, dstT, 64, left, upsample_left,
3288
446
                                            dy);
3289
446
  }
3290
2.18k
  highbd_transpose(dstT, 64, dst, stride, 32, 64);
3291
2.18k
}
3292
3293
static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
3294
                                               const uint16_t *left,
3295
                                               int upsample_left, int dy,
3296
2.94k
                                               int bd) {
3297
2.94k
  DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]);
3298
2.94k
  highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy, bd);
3299
2.94k
  highbd_transpose(dstT, 32, dst, stride, 64, 32);
3300
2.94k
  return;
3301
2.94k
}
3302
3303
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3304
static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
3305
                                               const uint16_t *left,
3306
                                               int upsample_left, int dy,
3307
4.87k
                                               int bd) {
3308
4.87k
  DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]);
3309
4.87k
  if (bd < 12) {
3310
4.23k
    highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
3311
4.23k
  } else {
3312
631
    highbd_dr_prediction_32bit_z1_64xN_avx2(16, dstT, 64, left, upsample_left,
3313
631
                                            dy);
3314
631
  }
3315
4.87k
  highbd_transpose(dstT, 64, dst, stride, 16, 64);
3316
4.87k
}
3317
3318
static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
3319
                                               const uint16_t *left,
3320
                                               int upsample_left, int dy,
3321
17.3k
                                               int bd) {
3322
17.3k
  __m256i dstvec[64], d[16];
3323
17.3k
  if (bd < 12) {
3324
16.7k
    highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left,
3325
16.7k
                                               dy);
3326
16.7k
  } else {
3327
605
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(64, dstvec, left,
3328
605
                                                     upsample_left, dy);
3329
605
  }
3330
86.7k
  for (int i = 0; i < 64; i += 16) {
3331
69.3k
    highbd_transpose16x16_avx2((dstvec + i), d);
3332
1.17M
    for (int j = 0; j < 16; j++) {
3333
1.10M
      _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
3334
1.10M
    }
3335
69.3k
  }
3336
17.3k
}
3337
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3338
3339
void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
3340
                                      int bh, const uint16_t *above,
3341
                                      const uint16_t *left, int upsample_left,
3342
1.14M
                                      int dx, int dy, int bd) {
3343
1.14M
  (void)above;
3344
1.14M
  (void)dx;
3345
3346
1.14M
  assert(dx == 1);
3347
1.14M
  assert(dy > 0);
3348
1.14M
  if (bw == bh) {
3349
674k
    switch (bw) {
3350
225k
      case 4:
3351
225k
        highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy,
3352
225k
                                         bd);
3353
225k
        break;
3354
210k
      case 8:
3355
210k
        highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy,
3356
210k
                                         bd);
3357
210k
        break;
3358
112k
      case 16:
3359
112k
        highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy,
3360
112k
                                           bd);
3361
112k
        break;
3362
94.8k
      case 32:
3363
94.8k
        highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy,
3364
94.8k
                                           bd);
3365
94.8k
        break;
3366
30.9k
      case 64:
3367
30.9k
        highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy,
3368
30.9k
                                           bd);
3369
30.9k
        break;
3370
674k
    }
3371
674k
  } else {
3372
471k
    if (bw < bh) {
3373
148k
      if (bw + bw == bh) {
3374
106k
        switch (bw) {
3375
35.7k
          case 4:
3376
35.7k
            highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left,
3377
35.7k
                                             dy, bd);
3378
35.7k
            break;
3379
42.5k
          case 8:
3380
42.5k
            highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left,
3381
42.5k
                                              dy, bd);
3382
42.5k
            break;
3383
25.9k
          case 16:
3384
25.9k
            highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left,
3385
25.9k
                                               dy, bd);
3386
25.9k
            break;
3387
2.18k
          case 32:
3388
2.18k
            highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left,
3389
2.18k
                                               dy, bd);
3390
2.18k
            break;
3391
106k
        }
3392
106k
      } else {
3393
42.1k
        switch (bw) {
3394
0
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3395
24.1k
          case 4:
3396
24.1k
            highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left,
3397
24.1k
                                              dy, bd);
3398
24.1k
            break;
3399
13.1k
          case 8:
3400
13.1k
            highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left,
3401
13.1k
                                              dy, bd);
3402
13.1k
            break;
3403
4.87k
          case 16:
3404
4.87k
            highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left,
3405
4.87k
                                               dy, bd);
3406
4.87k
            break;
3407
42.1k
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3408
42.1k
        }
3409
42.1k
      }
3410
323k
    } else {
3411
323k
      if (bh + bh == bw) {
3412
185k
        switch (bh) {
3413
66.0k
          case 4:
3414
66.0k
            highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left,
3415
66.0k
                                             dy, bd);
3416
66.0k
            break;
3417
85.7k
          case 8:
3418
85.7k
            highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left,
3419
85.7k
                                              dy, bd);
3420
85.7k
            break;
3421
30.4k
          case 16:
3422
30.4k
            highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left,
3423
30.4k
                                               dy, bd);
3424
30.4k
            break;
3425
2.94k
          case 32:
3426
2.94k
            highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left,
3427
2.94k
                                               dy, bd);
3428
2.94k
            break;
3429
185k
        }
3430
185k
      } else {
3431
138k
        switch (bh) {
3432
0
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3433
63.8k
          case 4:
3434
63.8k
            highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left,
3435
63.8k
                                              dy, bd);
3436
63.8k
            break;
3437
57.0k
          case 8:
3438
57.0k
            highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left,
3439
57.0k
                                              dy, bd);
3440
57.0k
            break;
3441
17.3k
          case 16:
3442
17.3k
            highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left,
3443
17.3k
                                               dy, bd);
3444
17.3k
            break;
3445
138k
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3446
138k
        }
3447
138k
      }
3448
323k
    }
3449
471k
  }
3450
1.14M
  return;
3451
1.14M
}
3452
#endif  // CONFIG_AV1_HIGHBITDEPTH
3453
3454
// Low bit depth functions
3455
static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
3456
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3457
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3458
  { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3459
    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3460
  { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3461
    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3462
  { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3463
    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3464
  { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3465
    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3466
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3467
    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3468
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3469
    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3470
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3471
    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0 },
3472
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
3473
    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0 },
3474
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
3475
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0 },
3476
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
3477
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
3478
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3479
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3480
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
3481
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3482
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3483
    0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
3484
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3485
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3486
    0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,
3487
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3488
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3489
    0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,
3490
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3491
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3492
    0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,
3493
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3494
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3495
    0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,
3496
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3497
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3498
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,
3499
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3500
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3501
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,
3502
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3503
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3504
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,
3505
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3506
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3507
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,
3508
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3509
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3510
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
3511
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3512
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3513
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3514
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3515
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3516
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3517
    0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0 },
3518
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3519
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3520
    0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0 },
3521
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3522
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3523
    0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0 },
3524
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3525
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3526
    0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0 },
3527
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3528
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3529
    0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0 },
3530
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3531
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3532
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0 },
3533
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3534
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3535
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0 },
3536
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3537
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3538
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0 },
3539
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3540
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3541
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
3542
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3543
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3544
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
3545
};
3546
3547
/* clang-format on */
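BaseMask[k] is simply k leading 0xff bytes followed by zeros. Passed as the mask to _mm_blendv_epi8 or _mm256_blendv_epi8, it takes the first k bytes from the second operand and the remaining bytes from the first: in the zone-1 rows below that means "computed pixels, then the replicated above[max_base_x] sample", and the analogous HighbdBaseMask in the zone-2 kernels above plays the same role, selecting left-derived pixels first and above-derived pixels after. Per byte the blend is simply (scalar picture for one 32-byte mask, names illustrative):

for (int i = 0; i < 32; ++i)
  out[i] = (i < k) ? from_second_operand[i] : from_first_operand[i];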
3548
static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_avx2(
3549
    int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
3550
978k
    int dx) {
3551
978k
  const int frac_bits = 6 - upsample_above;
3552
978k
  const int max_base_x = ((W + H) - 1) << upsample_above;
3553
3554
978k
  assert(dx > 0);
3555
  // pre-filter above pixels
3556
  // store in temp buffers:
3557
  //   above[x] * 32 + 16
3558
  //   above[x+1] - above[x]
3559
  // final pixels will be calculated as:
3560
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3561
978k
  __m256i a0, a1, a32, a16;
3562
978k
  __m256i diff, c3f;
3563
978k
  __m128i a_mbase_x;
3564
3565
978k
  a16 = _mm256_set1_epi16(16);
3566
978k
  a_mbase_x = _mm_set1_epi8((int8_t)above[max_base_x]);
3567
978k
  c3f = _mm256_set1_epi16(0x3f);
3568
3569
978k
  int x = dx;
3570
12.8M
  for (int r = 0; r < W; r++) {
3571
11.8M
    __m256i b, res, shift;
3572
11.8M
    __m128i res1, a0_128, a1_128;
3573
3574
11.8M
    int base = x >> frac_bits;
3575
11.8M
    int base_max_diff = (max_base_x - base) >> upsample_above;
3576
11.8M
    if (base_max_diff <= 0) {
3577
15.6k
      for (int i = r; i < W; ++i) {
3578
10.6k
        dst[i] = a_mbase_x;  // fill the remaining rows with above[max_base_x]
3579
10.6k
      }
3580
4.96k
      return;
3581
4.96k
    }
3582
11.8M
    if (base_max_diff > H) base_max_diff = H;
3583
11.8M
    a0_128 = _mm_loadu_si128((__m128i *)(above + base));
3584
11.8M
    a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
3585
3586
11.8M
    if (upsample_above) {
3587
1.94M
      a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]);
3588
1.94M
      a1_128 = _mm_srli_si128(a0_128, 8);
3589
3590
1.94M
      shift = _mm256_srli_epi16(
3591
1.94M
          _mm256_and_si256(
3592
1.94M
              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
3593
1.94M
          1);
3594
9.88M
    } else {
3595
9.88M
      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3596
9.88M
    }
3597
11.8M
    a0 = _mm256_cvtepu8_epi16(a0_128);
3598
11.8M
    a1 = _mm256_cvtepu8_epi16(a1_128);
3599
3600
11.8M
    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
3601
11.8M
    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
3602
11.8M
    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
3603
3604
11.8M
    b = _mm256_mullo_epi16(diff, shift);
3605
11.8M
    res = _mm256_add_epi16(a32, b);
3606
11.8M
    res = _mm256_srli_epi16(res, 5);
3607
3608
11.8M
    res = _mm256_packus_epi16(
3609
11.8M
        res, _mm256_castsi128_si256(
3610
11.8M
                 _mm256_extracti128_si256(res, 1)));  // convert to 8 bit
3611
11.8M
    res1 = _mm256_castsi256_si128(res);               // 16 8bit values
3612
3613
11.8M
    dst[r] =
3614
11.8M
        _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]);
3615
11.8M
    x += dx;
3616
11.8M
  }
3617
978k
}
3618
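
The pre-filter comment in dr_prediction_z1_HxW_internal_avx2 above describes a two-tap interpolation with 5-bit weights: each output pixel is (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5, a rounded blend of the two nearest edge samples. In the helper, H is the block width and W the number of rows, so each __m128i written to dst holds one output row. The scalar sketch below spells out the per-pixel math the AVX2 lanes vectorize; it is an illustration only (dr_prediction_z1_sketch is a hypothetical name, not a libaom symbol).

#include <stddef.h>
#include <stdint.h>

// Scalar sketch of zone-1 directional prediction. Row r samples the `above`
// edge at position (r + 1) * dx in 1/64-pel units; the two nearest samples
// are blended with 5-bit weights, and positions past the edge replicate
// above[max_base_x].
void dr_prediction_z1_sketch(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                             const uint8_t *above, int upsample_above,
                             int dx) {
  const int frac_bits = 6 - upsample_above;
  const int base_inc = 1 << upsample_above;
  const int max_base_x = ((bw + bh) - 1) << upsample_above;
  int x = dx;
  for (int r = 0; r < bh; ++r, dst += stride, x += dx) {
    int base = x >> frac_bits;
    const int shift = ((x << upsample_above) & 0x3f) >> 1;
    for (int c = 0; c < bw; ++c, base += base_inc) {
      if (base < max_base_x) {
        const int v =
            above[base] * 32 + 16 + (above[base + 1] - above[base]) * shift;
        dst[c] = (uint8_t)(v >> 5);
      } else {
        dst[c] = above[max_base_x];  // past the edge: hold the last sample
      }
    }
  }
}

The BaseMask blend at the end of each AVX2 row plays the role of the `base < max_base_x` test here: lanes whose sample position runs past max_base_x take the replicated a_mbase_x value instead.
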
3619
static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3620
                                      const uint8_t *above, int upsample_above,
3621
138k
                                      int dx) {
3622
138k
  __m128i dstvec[16];
3623
3624
138k
  dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, dx);
3625
965k
  for (int i = 0; i < N; i++) {
3626
826k
    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
3627
826k
  }
3628
138k
}
3629
3630
static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3631
                                      const uint8_t *above, int upsample_above,
3632
126k
                                      int dx) {
3633
126k
  __m128i dstvec[32];
3634
3635
126k
  dr_prediction_z1_HxW_internal_avx2(8, N, dstvec, above, upsample_above, dx);
3636
1.36M
  for (int i = 0; i < N; i++) {
3637
1.23M
    _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
3638
1.23M
  }
3639
126k
}
3640
3641
static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3642
                                       const uint8_t *above, int upsample_above,
3643
115k
                                       int dx) {
3644
115k
  __m128i dstvec[64];
3645
3646
115k
  dr_prediction_z1_HxW_internal_avx2(16, N, dstvec, above, upsample_above, dx);
3647
1.71M
  for (int i = 0; i < N; i++) {
3648
1.59M
    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
3649
1.59M
  }
3650
115k
}
3651
3652
static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2(
3653
188k
    int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) {
3654
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
3655
188k
  (void)upsample_above;
3656
188k
  const int frac_bits = 6;
3657
188k
  const int max_base_x = ((32 + N) - 1);
3658
3659
  // pre-filter above pixels
3660
  // store in temp buffers:
3661
  //   above[x] * 32 + 16
3662
  //   above[x+1] - above[x]
3663
  // final pixels will be calculated as:
3664
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3665
188k
  __m256i a0, a1, a32, a16;
3666
188k
  __m256i a_mbase_x, diff, c3f;
3667
3668
188k
  a16 = _mm256_set1_epi16(16);
3669
188k
  a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
3670
188k
  c3f = _mm256_set1_epi16(0x3f);
3671
3672
188k
  int x = dx;
3673
5.28M
  for (int r = 0; r < N; r++) {
3674
5.09M
    __m256i b, res, res16[2];
3675
5.09M
    __m128i a0_128, a1_128;
3676
3677
5.09M
    int base = x >> frac_bits;
3678
5.09M
    int base_max_diff = (max_base_x - base);
3679
5.09M
    if (base_max_diff <= 0) {
3680
0
      for (int i = r; i < N; ++i) {
3681
0
        dstvec[i] = a_mbase_x;  // save 32 values
3682
0
      }
3683
0
      return;
3684
0
    }
3685
5.09M
    if (base_max_diff > 32) base_max_diff = 32;
3686
5.09M
    __m256i shift =
3687
5.09M
        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3688
3689
15.2M
    for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
3690
10.1M
      int mdiff = base_max_diff - j;
3691
10.1M
      if (mdiff <= 0) {
3692
592
        res16[jj] = a_mbase_x;
3693
10.1M
      } else {
3694
10.1M
        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
3695
10.1M
        a1_128 = _mm_loadu_si128((__m128i *)(above + base + j + 1));
3696
10.1M
        a0 = _mm256_cvtepu8_epi16(a0_128);
3697
10.1M
        a1 = _mm256_cvtepu8_epi16(a1_128);
3698
3699
10.1M
        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
3700
10.1M
        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
3701
10.1M
        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
3702
10.1M
        b = _mm256_mullo_epi16(diff, shift);
3703
3704
10.1M
        res = _mm256_add_epi16(a32, b);
3705
10.1M
        res = _mm256_srli_epi16(res, 5);
3706
10.1M
        res16[jj] = _mm256_packus_epi16(
3707
10.1M
            res, _mm256_castsi128_si256(
3708
10.1M
                     _mm256_extracti128_si256(res, 1)));  // 16 8bit values
3709
10.1M
      }
3710
10.1M
    }
3711
5.09M
    res16[1] =
3712
5.09M
        _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]),
3713
5.09M
                                1);  // 32 8bit values
3714
3715
5.09M
    dstvec[r] = _mm256_blendv_epi8(
3716
5.09M
        a_mbase_x, res16[1],
3717
5.09M
        *(__m256i *)BaseMask[base_max_diff]);  // 32 8bit values
3718
5.09M
    x += dx;
3719
5.09M
  }
3720
188k
}
3721
3722
static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3723
                                       const uint8_t *above, int upsample_above,
3724
76.0k
                                       int dx) {
3725
76.0k
  __m256i dstvec[64];
3726
76.0k
  dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx);
3727
2.15M
  for (int i = 0; i < N; i++) {
3728
2.07M
    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
3729
2.07M
  }
3730
76.0k
}
3731
3732
static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3733
                                       const uint8_t *above, int upsample_above,
3734
43.2k
                                       int dx) {
3735
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
3736
43.2k
  (void)upsample_above;
3737
43.2k
  const int frac_bits = 6;
3738
43.2k
  const int max_base_x = ((64 + N) - 1);
3739
3740
  // pre-filter above pixels
3741
  // store in temp buffers:
3742
  //   above[x] * 32 + 16
3743
  //   above[x+1] - above[x]
3744
  // final pixels will be calculated as:
3745
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3746
43.2k
  __m256i a0, a1, a32, a16;
3747
43.2k
  __m256i a_mbase_x, diff, c3f;
3748
43.2k
  __m128i max_base_x128, base_inc128, mask128;
3749
3750
43.2k
  a16 = _mm256_set1_epi16(16);
3751
43.2k
  a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
3752
43.2k
  max_base_x128 = _mm_set1_epi8(max_base_x);
3753
43.2k
  c3f = _mm256_set1_epi16(0x3f);
3754
3755
43.2k
  int x = dx;
3756
2.35M
  for (int r = 0; r < N; r++, dst += stride) {
3757
2.31M
    __m256i b, res;
3758
2.31M
    int base = x >> frac_bits;
3759
2.31M
    if (base >= max_base_x) {
3760
0
      for (int i = r; i < N; ++i) {
3761
0
        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 32 values
3762
0
        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
3763
0
        dst += stride;
3764
0
      }
3765
0
      return;
3766
0
    }
3767
3768
2.31M
    __m256i shift =
3769
2.31M
        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3770
3771
2.31M
    __m128i a0_128, a1_128, res128;
3772
11.5M
    for (int j = 0; j < 64; j += 16) {
3773
9.25M
      int mdif = max_base_x - (base + j);
3774
9.25M
      if (mdif <= 0) {
3775
3.83k
        _mm_storeu_si128((__m128i *)(dst + j),
3776
3.83k
                         _mm256_castsi256_si128(a_mbase_x));
3777
9.25M
      } else {
3778
9.25M
        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
3779
9.25M
        a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
3780
9.25M
        a0 = _mm256_cvtepu8_epi16(a0_128);
3781
9.25M
        a1 = _mm256_cvtepu8_epi16(a1_128);
3782
3783
9.25M
        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
3784
9.25M
        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
3785
9.25M
        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
3786
9.25M
        b = _mm256_mullo_epi16(diff, shift);
3787
3788
9.25M
        res = _mm256_add_epi16(a32, b);
3789
9.25M
        res = _mm256_srli_epi16(res, 5);
3790
9.25M
        res = _mm256_packus_epi16(
3791
9.25M
            res, _mm256_castsi128_si256(
3792
9.25M
                     _mm256_extracti128_si256(res, 1)));  // 16 8bit values
3793
3794
9.25M
        base_inc128 =
3795
9.25M
            _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
3796
9.25M
                          (int8_t)(base + j + 2), (int8_t)(base + j + 3),
3797
9.25M
                          (int8_t)(base + j + 4), (int8_t)(base + j + 5),
3798
9.25M
                          (int8_t)(base + j + 6), (int8_t)(base + j + 7),
3799
9.25M
                          (int8_t)(base + j + 8), (int8_t)(base + j + 9),
3800
9.25M
                          (int8_t)(base + j + 10), (int8_t)(base + j + 11),
3801
9.25M
                          (int8_t)(base + j + 12), (int8_t)(base + j + 13),
3802
9.25M
                          (int8_t)(base + j + 14), (int8_t)(base + j + 15));
3803
3804
9.25M
        mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
3805
9.25M
                                 _mm_setzero_si128());
3806
9.25M
        res128 = _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x),
3807
9.25M
                                 _mm256_castsi256_si128(res), mask128);
3808
9.25M
        _mm_storeu_si128((__m128i *)(dst + j), res128);
3809
9.25M
      }
3810
9.25M
    }
3811
2.31M
    x += dx;
3812
2.31M
  }
3813
43.2k
}
3814
3815
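
The 64-wide kernel above cannot cover a whole row with one BaseMask entry, so it derives the right-edge mask arithmetically from each lane's base index. A scalar restatement of that saturating-subtract trick (illustration only, not library code):

#include <stdint.h>

// Per lane the kernel computes _mm_subs_epu8(max_base_x, base + i) and tests
// the result with a signed byte compare against zero. The saturating subtract
// is nonzero exactly when base + i < max_base_x, and because max_base_x is at
// most 64 + 64 - 1 = 127 the result never looks negative to the signed
// compare.
static inline int lane_has_valid_above(int base, int i, int max_base_x) {
  const int d = max_base_x - (base + i);
  const uint8_t sat = (uint8_t)(d > 0 ? d : 0);  // _mm_subs_epu8 analogue
  return (int8_t)sat > 0;                        // _mm_cmpgt_epi8 analogue
}
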
// Directional prediction, zone 1: 0 < angle < 90
3816
void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
3817
                               const uint8_t *above, const uint8_t *left,
3818
469k
                               int upsample_above, int dx, int dy) {
3819
469k
  (void)left;
3820
469k
  (void)dy;
3821
469k
  switch (bw) {
3822
138k
    case 4:
3823
138k
      dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx);
3824
138k
      break;
3825
126k
    case 8:
3826
126k
      dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx);
3827
126k
      break;
3828
115k
    case 16:
3829
115k
      dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx);
3830
115k
      break;
3831
73.1k
    case 32:
3832
73.1k
      dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx);
3833
73.1k
      break;
3834
14.5k
    case 64:
3835
14.5k
      dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx);
3836
14.5k
      break;
3837
0
    default: break;
3838
469k
  }
3839
469k
  return;
3840
469k
}
3841
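
A minimal calling sketch for the zone-1 dispatcher above (an illustrative driver, not libaom test code; the prototype is the one generated into config/av1_rtcd.h). dx is in 1/64-pel units per row, so dx = 64 advances the sample position by exactly one pixel per row, i.e. a 45-degree diagonal.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                               const uint8_t *above, const uint8_t *left,
                               int upsample_above, int dx, int dy);

// Predict a 16x16 block from a 32-sample top edge along a 45-degree diagonal.
void predict_16x16_diagonal_example(uint8_t dst[16 * 16],
                                    const uint8_t top_row[32]) {
  // Over-allocate and pad the edge so the 16-byte SIMD loads that read ahead
  // of max_base_x stay in bounds; libaom keeps its own padded edge buffers.
  uint8_t above[64];
  memset(above, top_row[31], sizeof(above));
  memcpy(above, top_row, 32);
  av1_dr_prediction_z1_avx2(dst, /*stride=*/16, /*bw=*/16, /*bh=*/16, above,
                            /*left=*/NULL, /*upsample_above=*/0, /*dx=*/64,
                            /*dy=*/1);  // left and dy are unused in zone 1
}
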
3842
static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3843
                                      const uint8_t *above, const uint8_t *left,
3844
                                      int upsample_above, int upsample_left,
3845
576k
                                      int dx, int dy) {
3846
576k
  const int min_base_x = -(1 << upsample_above);
3847
576k
  const int min_base_y = -(1 << upsample_left);
3848
576k
  const int frac_bits_x = 6 - upsample_above;
3849
576k
  const int frac_bits_y = 6 - upsample_left;
3850
3851
576k
  assert(dx > 0);
3852
  // pre-filter above pixels
3853
  // store in temp buffers:
3854
  //   above[x] * 32 + 16
3855
  //   above[x+1] - above[x]
3856
  // final pixels will be calculated as:
3857
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3858
576k
  __m128i a0_x, a1_x, a32, a16, diff;
3859
576k
  __m128i c3f, min_base_y128, c1234, dy128;
3860
3861
576k
  a16 = _mm_set1_epi16(16);
3862
576k
  c3f = _mm_set1_epi16(0x3f);
3863
576k
  min_base_y128 = _mm_set1_epi16(min_base_y);
3864
576k
  c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
3865
576k
  dy128 = _mm_set1_epi16(dy);
3866
3867
3.44M
  for (int r = 0; r < N; r++) {
3868
2.87M
    __m128i b, res, shift, r6, ydx;
3869
2.87M
    __m128i resx, resy, resxy;
3870
2.87M
    __m128i a0_x128, a1_x128;
3871
2.87M
    int y = r + 1;
3872
2.87M
    int base_x = (-y * dx) >> frac_bits_x;
3873
2.87M
    int base_shift = 0;
3874
2.87M
    if (base_x < (min_base_x - 1)) {
3875
2.33M
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
3876
2.33M
    }
3877
2.87M
    int base_min_diff =
3878
2.87M
        (min_base_x - base_x + upsample_above) >> upsample_above;
3879
2.87M
    if (base_min_diff > 4) {
3880
1.65M
      base_min_diff = 4;
3881
1.65M
    } else {
3882
1.22M
      if (base_min_diff < 0) base_min_diff = 0;
3883
1.22M
    }
3884
3885
2.87M
    if (base_shift > 3) {
3886
1.65M
      a0_x = _mm_setzero_si128();
3887
1.65M
      a1_x = _mm_setzero_si128();
3888
1.65M
      shift = _mm_setzero_si128();
3889
1.65M
    } else {
3890
1.22M
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
3891
1.22M
      ydx = _mm_set1_epi16(y * dx);
3892
1.22M
      r6 = _mm_slli_epi16(c1234, 6);
3893
3894
1.22M
      if (upsample_above) {
3895
272k
        a0_x128 =
3896
272k
            _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
3897
272k
        a1_x128 = _mm_srli_si128(a0_x128, 8);
3898
3899
272k
        shift = _mm_srli_epi16(
3900
272k
            _mm_and_si128(
3901
272k
                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
3902
272k
            1);
3903
950k
      } else {
3904
950k
        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
3905
950k
        a1_x128 = _mm_srli_si128(a0_x128, 1);
3906
3907
950k
        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
3908
950k
      }
3909
1.22M
      a0_x = _mm_cvtepu8_epi16(a0_x128);
3910
1.22M
      a1_x = _mm_cvtepu8_epi16(a1_x128);
3911
1.22M
    }
3912
    // y calc
3913
2.87M
    __m128i a0_y, a1_y, shifty;
3914
2.87M
    if (base_x < min_base_x) {
3915
2.57M
      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
3916
2.57M
      __m128i y_c128, base_y_c128, mask128, c1234_;
3917
2.57M
      c1234_ = _mm_srli_si128(c1234, 2);
3918
2.57M
      r6 = _mm_set1_epi16(r << 6);
3919
2.57M
      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128));
3920
2.57M
      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
3921
2.57M
      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
3922
2.57M
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
3923
2.57M
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
3924
3925
2.57M
      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
3926
2.57M
                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
3927
2.57M
      base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4));
3928
2.57M
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
3929
2.57M
      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
3930
2.57M
                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
3931
3932
2.57M
      if (upsample_left) {
3933
1.79M
        shifty = _mm_srli_epi16(
3934
1.79M
            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
3935
1.79M
      } else {
3936
783k
        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
3937
783k
      }
3938
2.57M
      a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
3939
2.57M
      a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
3940
2.57M
      shift = _mm_unpacklo_epi64(shift, shifty);
3941
2.57M
    }
3942
3943
2.87M
    diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
3944
2.87M
    a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
3945
2.87M
    a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16
3946
3947
2.87M
    b = _mm_mullo_epi16(diff, shift);
3948
2.87M
    res = _mm_add_epi16(a32, b);
3949
2.87M
    res = _mm_srli_epi16(res, 5);
3950
3951
2.87M
    resx = _mm_packus_epi16(res, res);
3952
2.87M
    resy = _mm_srli_si128(resx, 4);
3953
3954
2.87M
    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
3955
2.87M
    *(int *)(dst) = _mm_cvtsi128_si32(resxy);
3956
2.87M
    dst += stride;
3957
2.87M
  }
3958
576k
}
3959
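
dr_prediction_z2_Nx4_avx2 evaluates both candidates for every output row, an interpolation along `above` driven by dx and one along `left` driven by dy, and the BaseMask blend then picks, per column, whichever side the projection actually lands on. The scalar sketch below shows that per-pixel decision under the same 1/64-pel conventions; it is an illustration (hypothetical name, not the library's C reference) and assumes the padded edge buffers libaom's callers set up, where index -1 of both `above` and `left` holds the top-left corner pixel.

#include <stddef.h>
#include <stdint.h>

// Scalar sketch of zone-2 prediction: pixel (r, c) projects back along the
// prediction direction; if the projection stays on or right of the block it
// samples `above`, otherwise it samples `left`.
void dr_prediction_z2_sketch(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                             const uint8_t *above, const uint8_t *left,
                             int upsample_above, int upsample_left, int dx,
                             int dy) {
  const int min_base_x = -(1 << upsample_above);
  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;
  for (int r = 0; r < bh; ++r, dst += stride) {
    for (int c = 0; c < bw; ++c) {
      const int x = (c << 6) - (r + 1) * dx;  // 1/64-pel offset along `above`
      const int base_x = x >> frac_bits_x;
      int val;
      if (base_x >= min_base_x) {
        const int shift = ((x * (1 << upsample_above)) & 0x3f) >> 1;
        val = above[base_x] * 32 + 16 +
              (above[base_x + 1] - above[base_x]) * shift;
      } else {
        const int y = (r << 6) - (c + 1) * dy;  // 1/64-pel offset along `left`
        const int base_y = y >> frac_bits_y;
        const int shift = ((y * (1 << upsample_left)) & 0x3f) >> 1;
        val = left[base_y] * 32 + 16 +
              (left[base_y + 1] - left[base_y]) * shift;
      }
      dst[c] = (uint8_t)(val >> 5);
    }
  }
}
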
3960
static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3961
                                      const uint8_t *above, const uint8_t *left,
3962
                                      int upsample_above, int upsample_left,
3963
264k
                                      int dx, int dy) {
3964
264k
  const int min_base_x = -(1 << upsample_above);
3965
264k
  const int min_base_y = -(1 << upsample_left);
3966
264k
  const int frac_bits_x = 6 - upsample_above;
3967
264k
  const int frac_bits_y = 6 - upsample_left;
3968
3969
  // pre-filter above pixels
3970
  // store in temp buffers:
3971
  //   above[x] * 32 + 16
3972
  //   above[x+1] - above[x]
3973
  // final pixels will be calculated as:
3974
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3975
264k
  __m256i diff, a32, a16;
3976
264k
  __m256i a0_x, a1_x;
3977
264k
  __m128i a0_x128, a1_x128, min_base_y128, c3f;
3978
264k
  __m128i c1234, dy128;
3979
3980
264k
  a16 = _mm256_set1_epi16(16);
3981
264k
  c3f = _mm_set1_epi16(0x3f);
3982
264k
  min_base_y128 = _mm_set1_epi16(min_base_y);
3983
264k
  dy128 = _mm_set1_epi16(dy);
3984
264k
  c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
3985
3986
2.71M
  for (int r = 0; r < N; r++) {
3987
2.44M
    __m256i b, res, shift;
3988
2.44M
    __m128i resx, resy, resxy, r6, ydx;
3989
3990
2.44M
    int y = r + 1;
3991
2.44M
    int base_x = (-y * dx) >> frac_bits_x;
3992
2.44M
    int base_shift = 0;
3993
2.44M
    if (base_x < (min_base_x - 1)) {
3994
1.87M
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
3995
1.87M
    }
3996
2.44M
    int base_min_diff =
3997
2.44M
        (min_base_x - base_x + upsample_above) >> upsample_above;
3998
2.44M
    if (base_min_diff > 8) {
3999
1.10M
      base_min_diff = 8;
4000
1.33M
    } else {
4001
1.33M
      if (base_min_diff < 0) base_min_diff = 0;
4002
1.33M
    }
4003
4004
2.44M
    if (base_shift > 7) {
4005
1.10M
      a0_x = _mm256_setzero_si256();
4006
1.10M
      a1_x = _mm256_setzero_si256();
4007
1.10M
      shift = _mm256_setzero_si256();
4008
1.33M
    } else {
4009
1.33M
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
4010
1.33M
      ydx = _mm_set1_epi16(y * dx);
4011
1.33M
      r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
4012
1.33M
      if (upsample_above) {
4013
408k
        a0_x128 =
4014
408k
            _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
4015
408k
        a1_x128 = _mm_srli_si128(a0_x128, 8);
4016
4017
408k
        shift = _mm256_castsi128_si256(_mm_srli_epi16(
4018
408k
            _mm_and_si128(
4019
408k
                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
4020
408k
            1));
4021
930k
      } else {
4022
930k
        a1_x128 = _mm_srli_si128(a0_x128, 1);
4023
930k
        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
4024
930k
        a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
4025
4026
930k
        shift = _mm256_castsi128_si256(
4027
930k
            _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1));
4028
930k
      }
4029
1.33M
      a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128));
4030
1.33M
      a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128));
4031
1.33M
    }
4032
4033
    // y calc
4034
2.44M
    __m128i a0_y, a1_y, shifty;
4035
2.44M
    if (base_x < min_base_x) {
4036
2.07M
      DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
4037
2.07M
      __m128i y_c128, base_y_c128, mask128;
4038
2.07M
      r6 = _mm_set1_epi16(r << 6);
4039
2.07M
      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
4040
2.07M
      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
4041
2.07M
      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
4042
2.07M
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
4043
2.07M
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
4044
4045
2.07M
      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
4046
2.07M
                            left[base_y_c[2]], left[base_y_c[3]],
4047
2.07M
                            left[base_y_c[4]], left[base_y_c[5]],
4048
2.07M
                            left[base_y_c[6]], left[base_y_c[7]]);
4049
2.07M
      base_y_c128 = _mm_add_epi16(
4050
2.07M
          base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4));
4051
2.07M
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
4052
4053
2.07M
      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
4054
2.07M
                            left[base_y_c[2]], left[base_y_c[3]],
4055
2.07M
                            left[base_y_c[4]], left[base_y_c[5]],
4056
2.07M
                            left[base_y_c[6]], left[base_y_c[7]]);
4057
4058
2.07M
      if (upsample_left) {
4059
614k
        shifty = _mm_srli_epi16(
4060
614k
            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
4061
1.45M
      } else {
4062
1.45M
        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
4063
1.45M
      }
4064
4065
2.07M
      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
4066
2.07M
      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
4067
2.07M
      shift = _mm256_inserti128_si256(shift, shifty, 1);
4068
2.07M
    }
4069
4070
2.44M
    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
4071
2.44M
    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
4072
2.44M
    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
4073
4074
2.44M
    b = _mm256_mullo_epi16(diff, shift);
4075
2.44M
    res = _mm256_add_epi16(a32, b);
4076
2.44M
    res = _mm256_srli_epi16(res, 5);
4077
4078
2.44M
    resx = _mm_packus_epi16(_mm256_castsi256_si128(res),
4079
2.44M
                            _mm256_castsi256_si128(res));
4080
2.44M
    resy = _mm256_extracti128_si256(res, 1);
4081
2.44M
    resy = _mm_packus_epi16(resy, resy);
4082
4083
2.44M
    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
4084
2.44M
    _mm_storel_epi64((__m128i *)(dst), resxy);
4085
2.44M
    dst += stride;
4086
2.44M
  }
4087
264k
}
4088
4089
static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst,
4090
                                      ptrdiff_t stride, const uint8_t *above,
4091
                                      const uint8_t *left, int upsample_above,
4092
446k
                                      int upsample_left, int dx, int dy) {
4093
  // here upsample_above and upsample_left are 0 by design of
4094
  // av1_use_intra_edge_upsample
4095
446k
  const int min_base_x = -1;
4096
446k
  const int min_base_y = -1;
4097
446k
  (void)upsample_above;
4098
446k
  (void)upsample_left;
4099
446k
  const int frac_bits_x = 6;
4100
446k
  const int frac_bits_y = 6;
4101
4102
446k
  __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123;
4103
446k
  __m256i diff, min_base_y256, c3f, shifty, dy256, c1;
4104
446k
  __m128i a0_x128, a1_x128;
4105
4106
446k
  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
4107
446k
  a16 = _mm256_set1_epi16(16);
4108
446k
  c1 = _mm256_srli_epi16(a16, 4);
4109
446k
  min_base_y256 = _mm256_set1_epi16(min_base_y);
4110
446k
  c3f = _mm256_set1_epi16(0x3f);
4111
446k
  dy256 = _mm256_set1_epi16(dy);
4112
446k
  c0123 =
4113
446k
      _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
4114
446k
  c1234 = _mm256_add_epi16(c0123, c1);
4115
4116
8.84M
  for (int r = 0; r < H; r++) {
4117
8.40M
    __m256i b, res, shift, j256, r6, ydx;
4118
8.40M
    __m128i resx, resy;
4119
8.40M
    __m128i resxy;
4120
8.40M
    int y = r + 1;
4121
8.40M
    ydx = _mm256_set1_epi16((int16_t)(y * dx));
4122
4123
8.40M
    int base_x = (-y * dx) >> frac_bits_x;
4124
23.7M
    for (int j = 0; j < W; j += 16) {
4125
15.3M
      j256 = _mm256_set1_epi16(j);
4126
15.3M
      int base_shift = 0;
4127
15.3M
      if ((base_x + j) < (min_base_x - 1)) {
4128
11.0M
        base_shift = (min_base_x - (base_x + j) - 1);
4129
11.0M
      }
4130
15.3M
      int base_min_diff = (min_base_x - base_x - j);
4131
15.3M
      if (base_min_diff > 16) {
4132
7.80M
        base_min_diff = 16;
4133
7.80M
      } else {
4134
7.57M
        if (base_min_diff < 0) base_min_diff = 0;
4135
7.57M
      }
4136
4137
15.3M
      if (base_shift < 16) {
4138
7.58M
        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
4139
7.58M
        a1_x128 =
4140
7.58M
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
4141
7.58M
        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
4142
7.58M
        a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
4143
4144
7.58M
        a0_x = _mm256_cvtepu8_epi16(a0_x128);
4145
7.58M
        a1_x = _mm256_cvtepu8_epi16(a1_x128);
4146
4147
7.58M
        r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
4148
7.58M
        shift = _mm256_srli_epi16(
4149
7.58M
            _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
4150
4151
7.58M
        diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
4152
7.58M
        a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
4153
7.58M
        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
4154
4155
7.58M
        b = _mm256_mullo_epi16(diff, shift);
4156
7.58M
        res = _mm256_add_epi16(a32, b);
4157
7.58M
        res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
4158
7.58M
        resx = _mm256_castsi256_si128(_mm256_packus_epi16(
4159
7.58M
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
4160
7.79M
      } else {
4161
7.79M
        resx = _mm_setzero_si128();
4162
7.79M
      }
4163
4164
      // y calc
4165
15.3M
      if (base_x < min_base_x) {
4166
14.2M
        __m256i c256, y_c256, base_y_c256, mask256, mul16;
4167
14.2M
        r6 = _mm256_set1_epi16(r << 6);
4168
14.2M
        c256 = _mm256_add_epi16(j256, c1234);
4169
14.2M
        mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
4170
14.2M
                                 _mm256_srli_epi16(min_base_y256, 1));
4171
14.2M
        y_c256 = _mm256_sub_epi16(r6, mul16);
4172
4173
14.2M
        base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
4174
14.2M
        mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
4175
4176
14.2M
        base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256);
4177
14.2M
        int16_t min_y = (int16_t)_mm_extract_epi16(
4178
14.2M
            _mm256_extracti128_si256(base_y_c256, 1), 7);
4179
14.2M
        int16_t max_y =
4180
14.2M
            (int16_t)_mm_extract_epi16(_mm256_castsi256_si128(base_y_c256), 0);
4181
14.2M
        int16_t offset_diff = max_y - min_y;
4182
4183
14.2M
        if (offset_diff < 16) {
4184
13.3M
          __m256i min_y256 = _mm256_set1_epi16(min_y);
4185
4186
13.3M
          __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256);
4187
13.3M
          __m128i base_y_offset128 =
4188
13.3M
              _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0),
4189
13.3M
                              _mm256_extracti128_si256(base_y_offset, 1));
4190
4191
13.3M
          __m128i a0_y128 = _mm_maskload_epi32(
4192
13.3M
              (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]);
4193
13.3M
          __m128i a1_y128 =
4194
13.3M
              _mm_maskload_epi32((int *)(left + min_y + 1),
4195
13.3M
                                 *(__m128i *)LoadMaskz2[offset_diff / 4]);
4196
13.3M
          a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128);
4197
13.3M
          a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128);
4198
13.3M
          a0_y = _mm256_cvtepu8_epi16(a0_y128);
4199
13.3M
          a1_y = _mm256_cvtepu8_epi16(a1_y128);
4200
13.3M
        } else {
4201
871k
          base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
4202
871k
          _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
4203
4204
871k
          a0_y = _mm256_setr_epi16(
4205
871k
              left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
4206
871k
              left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
4207
871k
              left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
4208
871k
              left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
4209
871k
              left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
4210
871k
              left[base_y_c[15]]);
4211
871k
          base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
4212
871k
          _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
4213
4214
871k
          a1_y = _mm256_setr_epi16(
4215
871k
              left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
4216
871k
              left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
4217
871k
              left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
4218
871k
              left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
4219
871k
              left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
4220
871k
              left[base_y_c[15]]);
4221
871k
        }
4222
14.2M
        shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
4223
4224
14.2M
        diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
4225
14.2M
        a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
4226
14.2M
        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
4227
4228
14.2M
        b = _mm256_mullo_epi16(diff, shifty);
4229
14.2M
        res = _mm256_add_epi16(a32, b);
4230
14.2M
        res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
4231
14.2M
        resy = _mm256_castsi256_si128(_mm256_packus_epi16(
4232
14.2M
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
4233
14.2M
      } else {
4234
1.15M
        resy = _mm_setzero_si128();
4235
1.15M
      }
4236
15.3M
      resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
4237
15.3M
      _mm_storeu_si128((__m128i *)(dst + j), resxy);
4238
15.3M
    }  // for j
4239
8.40M
    dst += stride;
4240
8.40M
  }
4241
446k
}
4242
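
In dr_prediction_z2_HxW_avx2 the left-edge samples for a 16-column group are gathered with a byte shuffle whenever the group's base_y indices span fewer than 16 entries (offset_diff < 16): one masked 16-byte window load starting at min_y replaces sixteen scalar lookups, and the per-lane byte offsets select from that window. A minimal sketch of the idea (illustration only; the real kernel uses _mm_maskload_epi32 so it never reads past the valid part of `left`):

#include <immintrin.h>
#include <stdint.h>

// Gather left[min_y + offsets[i]] for sixteen byte offsets in 0..15 using one
// unaligned load plus PSHUFB instead of sixteen scalar loads.
static inline __m128i gather_small_range(const uint8_t *left, int min_y,
                                         __m128i byte_offsets) {
  const __m128i window = _mm_loadu_si128((const __m128i *)(left + min_y));
  return _mm_shuffle_epi8(window, byte_offsets);
}

When the spread is 16 or more, the kernel falls back to the scalar _mm256_setr_epi16 gather seen in its else branch.
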
4243
// Directional prediction, zone 2: 90 < angle < 180
4244
void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
4245
                               const uint8_t *above, const uint8_t *left,
4246
                               int upsample_above, int upsample_left, int dx,
4247
1.28M
                               int dy) {
4248
1.28M
  assert(dx > 0);
4249
1.28M
  assert(dy > 0);
4250
1.28M
  switch (bw) {
4251
576k
    case 4:
4252
576k
      dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above,
4253
576k
                                upsample_left, dx, dy);
4254
576k
      break;
4255
264k
    case 8:
4256
264k
      dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above,
4257
264k
                                upsample_left, dx, dy);
4258
264k
      break;
4259
446k
    default:
4260
446k
      dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
4261
446k
                                upsample_above, upsample_left, dx, dy);
4262
446k
      break;
4263
1.28M
  }
4264
1.28M
  return;
4265
1.28M
}
4266
4267
// z3 functions
4268
193k
static inline void transpose16x32_avx2(__m256i *x, __m256i *d) {
4269
193k
  __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
4270
193k
  __m256i w10, w11, w12, w13, w14, w15;
4271
4272
193k
  w0 = _mm256_unpacklo_epi8(x[0], x[1]);
4273
193k
  w1 = _mm256_unpacklo_epi8(x[2], x[3]);
4274
193k
  w2 = _mm256_unpacklo_epi8(x[4], x[5]);
4275
193k
  w3 = _mm256_unpacklo_epi8(x[6], x[7]);
4276
4277
193k
  w8 = _mm256_unpacklo_epi8(x[8], x[9]);
4278
193k
  w9 = _mm256_unpacklo_epi8(x[10], x[11]);
4279
193k
  w10 = _mm256_unpacklo_epi8(x[12], x[13]);
4280
193k
  w11 = _mm256_unpacklo_epi8(x[14], x[15]);
4281
4282
193k
  w4 = _mm256_unpacklo_epi16(w0, w1);
4283
193k
  w5 = _mm256_unpacklo_epi16(w2, w3);
4284
193k
  w12 = _mm256_unpacklo_epi16(w8, w9);
4285
193k
  w13 = _mm256_unpacklo_epi16(w10, w11);
4286
4287
193k
  w6 = _mm256_unpacklo_epi32(w4, w5);
4288
193k
  w7 = _mm256_unpackhi_epi32(w4, w5);
4289
193k
  w14 = _mm256_unpacklo_epi32(w12, w13);
4290
193k
  w15 = _mm256_unpackhi_epi32(w12, w13);
4291
4292
  // Store first 4-line result
4293
193k
  d[0] = _mm256_unpacklo_epi64(w6, w14);
4294
193k
  d[1] = _mm256_unpackhi_epi64(w6, w14);
4295
193k
  d[2] = _mm256_unpacklo_epi64(w7, w15);
4296
193k
  d[3] = _mm256_unpackhi_epi64(w7, w15);
4297
4298
193k
  w4 = _mm256_unpackhi_epi16(w0, w1);
4299
193k
  w5 = _mm256_unpackhi_epi16(w2, w3);
4300
193k
  w12 = _mm256_unpackhi_epi16(w8, w9);
4301
193k
  w13 = _mm256_unpackhi_epi16(w10, w11);
4302
4303
193k
  w6 = _mm256_unpacklo_epi32(w4, w5);
4304
193k
  w7 = _mm256_unpackhi_epi32(w4, w5);
4305
193k
  w14 = _mm256_unpacklo_epi32(w12, w13);
4306
193k
  w15 = _mm256_unpackhi_epi32(w12, w13);
4307
4308
  // Store second 4-line result
4309
193k
  d[4] = _mm256_unpacklo_epi64(w6, w14);
4310
193k
  d[5] = _mm256_unpackhi_epi64(w6, w14);
4311
193k
  d[6] = _mm256_unpacklo_epi64(w7, w15);
4312
193k
  d[7] = _mm256_unpackhi_epi64(w7, w15);
4313
4314
  // upper half
4315
193k
  w0 = _mm256_unpackhi_epi8(x[0], x[1]);
4316
193k
  w1 = _mm256_unpackhi_epi8(x[2], x[3]);
4317
193k
  w2 = _mm256_unpackhi_epi8(x[4], x[5]);
4318
193k
  w3 = _mm256_unpackhi_epi8(x[6], x[7]);
4319
4320
193k
  w8 = _mm256_unpackhi_epi8(x[8], x[9]);
4321
193k
  w9 = _mm256_unpackhi_epi8(x[10], x[11]);
4322
193k
  w10 = _mm256_unpackhi_epi8(x[12], x[13]);
4323
193k
  w11 = _mm256_unpackhi_epi8(x[14], x[15]);
4324
4325
193k
  w4 = _mm256_unpacklo_epi16(w0, w1);
4326
193k
  w5 = _mm256_unpacklo_epi16(w2, w3);
4327
193k
  w12 = _mm256_unpacklo_epi16(w8, w9);
4328
193k
  w13 = _mm256_unpacklo_epi16(w10, w11);
4329
4330
193k
  w6 = _mm256_unpacklo_epi32(w4, w5);
4331
193k
  w7 = _mm256_unpackhi_epi32(w4, w5);
4332
193k
  w14 = _mm256_unpacklo_epi32(w12, w13);
4333
193k
  w15 = _mm256_unpackhi_epi32(w12, w13);
4334
4335
  // Store first 4-line result
4336
193k
  d[8] = _mm256_unpacklo_epi64(w6, w14);
4337
193k
  d[9] = _mm256_unpackhi_epi64(w6, w14);
4338
193k
  d[10] = _mm256_unpacklo_epi64(w7, w15);
4339
193k
  d[11] = _mm256_unpackhi_epi64(w7, w15);
4340
4341
193k
  w4 = _mm256_unpackhi_epi16(w0, w1);
4342
193k
  w5 = _mm256_unpackhi_epi16(w2, w3);
4343
193k
  w12 = _mm256_unpackhi_epi16(w8, w9);
4344
193k
  w13 = _mm256_unpackhi_epi16(w10, w11);
4345
4346
193k
  w6 = _mm256_unpacklo_epi32(w4, w5);
4347
193k
  w7 = _mm256_unpackhi_epi32(w4, w5);
4348
193k
  w14 = _mm256_unpacklo_epi32(w12, w13);
4349
193k
  w15 = _mm256_unpackhi_epi32(w12, w13);
4350
4351
  // Store second 4-line result
4352
193k
  d[12] = _mm256_unpacklo_epi64(w6, w14);
4353
193k
  d[13] = _mm256_unpackhi_epi64(w6, w14);
4354
193k
  d[14] = _mm256_unpacklo_epi64(w7, w15);
4355
193k
  d[15] = _mm256_unpackhi_epi64(w7, w15);
4356
193k
}
4357
4358
static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride,
4359
                                      const uint8_t *left, int upsample_left,
4360
115k
                                      int dy) {
4361
115k
  __m128i dstvec[4], d[4];
4362
4363
115k
  dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy);
4364
115k
  transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
4365
115k
                            &d[0], &d[1], &d[2], &d[3]);
4366
4367
115k
  *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
4368
115k
  *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
4369
115k
  *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
4370
115k
  *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
4371
115k
  return;
4372
115k
}
4373
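
dr_prediction_z3_4x4_avx2 shows the pattern all of the z3 kernels follow: zone 3 (180 to 270 degrees) predicts from the left edge, so each kernel runs the zone-1 math over `left` with step dy and then transposes the result into the destination. A conceptual sketch of that relationship, reusing the hypothetical dr_prediction_z1_sketch helper from the zone-1 sketch above (illustration only, not library code):

#include <stddef.h>
#include <stdint.h>

// Scalar zone-1 sketch defined after dr_prediction_z1_HxW_internal_avx2 above.
void dr_prediction_z1_sketch(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                             const uint8_t *above, int upsample_above, int dx);

// A bw x bh zone-3 block is the transpose of a bh x bw zone-1 block computed
// over the left edge.
void dr_prediction_z3_sketch(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                             const uint8_t *left, int upsample_left, int dy) {
  uint8_t tmp[64 * 64];  // zone-1 output, row-major with pitch 64
  dr_prediction_z1_sketch(tmp, 64, /*bw=*/bh, /*bh=*/bw, left, upsample_left,
                          dy);
  for (int r = 0; r < bh; ++r)
    for (int c = 0; c < bw; ++c) dst[r * stride + c] = tmp[c * 64 + r];
}

The dedicated kernels below avoid the scratch buffer for most sizes by keeping the zone-1 rows in vector registers and transposing with the SSE2/AVX2 transpose helpers; only the larger rectangles (64x64, 32x64, 64x32, 16x64) fall back to a dstT temporary plus the generic transpose().
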
4374
static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride,
4375
                                      const uint8_t *left, int upsample_left,
4376
98.0k
                                      int dy) {
4377
98.0k
  __m128i dstvec[8], d[8];
4378
4379
98.0k
  dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy);
4380
98.0k
  transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
4381
98.0k
                    &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
4382
98.0k
                    &d[3]);
4383
4384
98.0k
  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
4385
98.0k
  _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
4386
98.0k
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
4387
98.0k
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
4388
98.0k
  _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
4389
98.0k
  _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
4390
98.0k
  _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
4391
98.0k
  _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
4392
98.0k
}
4393
4394
static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride,
4395
                                      const uint8_t *left, int upsample_left,
4396
23.9k
                                      int dy) {
4397
23.9k
  __m128i dstvec[4], d[8];
4398
4399
23.9k
  dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy);
4400
23.9k
  transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
4401
23.9k
                        &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
4402
215k
  for (int i = 0; i < 8; i++) {
4403
191k
    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
4404
191k
  }
4405
23.9k
}
4406
4407
static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride,
4408
                                      const uint8_t *left, int upsample_left,
4409
40.7k
                                      int dy) {
4410
40.7k
  __m128i dstvec[8], d[4];
4411
4412
40.7k
  dr_prediction_z1_HxW_internal_avx2(4, 8, dstvec, left, upsample_left, dy);
4413
40.7k
  transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
4414
40.7k
                        &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
4415
40.7k
                        &d[1], &d[2], &d[3]);
4416
40.7k
  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
4417
40.7k
  _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
4418
40.7k
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
4419
40.7k
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
4420
40.7k
}
4421
4422
static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride,
4423
                                       const uint8_t *left, int upsample_left,
4424
26.2k
                                       int dy) {
4425
26.2k
  __m128i dstvec[8], d[8];
4426
4427
26.2k
  dr_prediction_z1_HxW_internal_avx2(16, 8, dstvec, left, upsample_left, dy);
4428
26.2k
  transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
4429
26.2k
                          dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
4430
26.2k
                          d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
4431
236k
  for (int i = 0; i < 8; i++) {
4432
210k
    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
4433
210k
    _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
4434
210k
                     _mm_srli_si128(d[i], 8));
4435
210k
  }
4436
26.2k
}
4437
4438
static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
4439
                                       const uint8_t *left, int upsample_left,
4440
50.2k
                                       int dy) {
4441
50.2k
  __m128i dstvec[16], d[16];
4442
4443
50.2k
  dr_prediction_z1_HxW_internal_avx2(8, 16, dstvec, left, upsample_left, dy);
4444
50.2k
  transpose16x8_8x16_sse2(
4445
50.2k
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
4446
50.2k
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
4447
50.2k
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
4448
50.2k
      &d[3], &d[4], &d[5], &d[6], &d[7]);
4449
4450
452k
  for (int i = 0; i < 8; i++) {
4451
402k
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
4452
402k
  }
4453
50.2k
}
4454
4455
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
4456
static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride,
4457
                                       const uint8_t *left, int upsample_left,
4458
17.3k
                                       int dy) {
4459
17.3k
  __m128i dstvec[4], d[16];
4460
4461
17.3k
  dr_prediction_z1_HxW_internal_avx2(16, 4, dstvec, left, upsample_left, dy);
4462
17.3k
  transpose4x16_sse2(dstvec, d);
4463
295k
  for (int i = 0; i < 16; i++) {
4464
278k
    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
4465
278k
  }
4466
17.3k
}
4467
4468
static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride,
4469
                                       const uint8_t *left, int upsample_left,
4470
55.1k
                                       int dy) {
4471
55.1k
  __m128i dstvec[16], d[8];
4472
4473
55.1k
  dr_prediction_z1_HxW_internal_avx2(4, 16, dstvec, left, upsample_left, dy);
4474
275k
  for (int i = 4; i < 8; i++) {
4475
220k
    d[i] = _mm_setzero_si128();
4476
220k
  }
4477
55.1k
  transpose16x8_8x16_sse2(
4478
55.1k
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
4479
55.1k
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
4480
55.1k
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
4481
55.1k
      &d[3], &d[4], &d[5], &d[6], &d[7]);
4482
4483
275k
  for (int i = 0; i < 4; i++) {
4484
220k
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
4485
220k
  }
4486
55.1k
}
4487
4488
static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride,
4489
                                       const uint8_t *left, int upsample_left,
4490
10.5k
                                       int dy) {
4491
10.5k
  __m256i dstvec[16], d[16];
4492
4493
10.5k
  dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy);
4494
94.7k
  for (int i = 8; i < 16; i++) {
4495
84.2k
    dstvec[i] = _mm256_setzero_si256();
4496
84.2k
  }
4497
10.5k
  transpose16x32_avx2(dstvec, d);
4498
4499
178k
  for (int i = 0; i < 16; i++) {
4500
168k
    _mm_storel_epi64((__m128i *)(dst + i * stride),
4501
168k
                     _mm256_castsi256_si128(d[i]));
4502
168k
  }
4503
178k
  for (int i = 0; i < 16; i++) {
4504
168k
    _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride),
4505
168k
                     _mm256_extracti128_si256(d[i], 1));
4506
168k
  }
4507
10.5k
}
4508
4509
static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride,
4510
                                       const uint8_t *left, int upsample_left,
4511
45.2k
                                       int dy) {
4512
45.2k
  __m128i dstvec[32], d[16];
4513
4514
45.2k
  dr_prediction_z1_HxW_internal_avx2(8, 32, dstvec, left, upsample_left, dy);
4515
4516
45.2k
  transpose16x8_8x16_sse2(
4517
45.2k
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
4518
45.2k
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
4519
45.2k
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
4520
45.2k
      &d[3], &d[4], &d[5], &d[6], &d[7]);
4521
45.2k
  transpose16x8_8x16_sse2(
4522
45.2k
      &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
4523
45.2k
      &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
4524
45.2k
      &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
4525
45.2k
      &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
4526
45.2k
      &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
4527
45.2k
      &d[6 + 8], &d[7 + 8]);
4528
4529
406k
  for (int i = 0; i < 8; i++) {
4530
361k
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
4531
361k
    _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
4532
361k
  }
4533
45.2k
}
4534
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
4535
4536
static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
4537
                                        const uint8_t *left, int upsample_left,
4538
87.1k
                                        int dy) {
4539
87.1k
  __m128i dstvec[16], d[16];
4540
4541
87.1k
  dr_prediction_z1_HxW_internal_avx2(16, 16, dstvec, left, upsample_left, dy);
4542
87.1k
  transpose16x16_sse2(dstvec, d);
4543
4544
1.48M
  for (int i = 0; i < 16; i++) {
4545
1.39M
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
4546
1.39M
  }
4547
87.1k
}
4548
4549
static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
4550
                                        const uint8_t *left, int upsample_left,
4551
81.0k
                                        int dy) {
4552
81.0k
  __m256i dstvec[32], d[32];
4553
4554
81.0k
  dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy);
4555
81.0k
  transpose16x32_avx2(dstvec, d);
4556
81.0k
  transpose16x32_avx2(dstvec + 16, d + 16);
4557
1.37M
  for (int j = 0; j < 16; j++) {
4558
1.29M
    _mm_storeu_si128((__m128i *)(dst + j * stride),
4559
1.29M
                     _mm256_castsi256_si128(d[j]));
4560
1.29M
    _mm_storeu_si128((__m128i *)(dst + j * stride + 16),
4561
1.29M
                     _mm256_castsi256_si128(d[j + 16]));
4562
1.29M
  }
4563
1.37M
  for (int j = 0; j < 16; j++) {
4564
1.29M
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
4565
1.29M
                     _mm256_extracti128_si256(d[j], 1));
4566
1.29M
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16),
4567
1.29M
                     _mm256_extracti128_si256(d[j + 16], 1));
4568
1.29M
  }
4569
81.0k
}
4570
4571
static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
4572
                                        const uint8_t *left, int upsample_left,
4573
23.0k
                                        int dy) {
4574
23.0k
  DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
4575
23.0k
  dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
4576
23.0k
  transpose(dstT, 64, dst, stride, 64, 64);
4577
23.0k
}
4578
4579
static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
4580
                                        const uint8_t *left, int upsample_left,
4581
21.2k
                                        int dy) {
4582
21.2k
  __m256i dstvec[16], d[16];
4583
4584
21.2k
  dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy);
4585
21.2k
  transpose16x32_avx2(dstvec, d);
4586
  // store
4587
361k
  for (int j = 0; j < 16; j++) {
4588
340k
    _mm_storeu_si128((__m128i *)(dst + j * stride),
4589
340k
                     _mm256_castsi256_si128(d[j]));
4590
340k
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
4591
340k
                     _mm256_extracti128_si256(d[j], 1));
4592
340k
  }
4593
21.2k
}
4594
4595
static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
4596
                                        const uint8_t *left, int upsample_left,
4597
21.0k
                                        int dy) {
4598
21.0k
  __m128i dstvec[32], d[16];
4599
4600
21.0k
  dr_prediction_z1_HxW_internal_avx2(16, 32, dstvec, left, upsample_left, dy);
4601
63.2k
  for (int i = 0; i < 32; i += 16) {
4602
42.1k
    transpose16x16_sse2((dstvec + i), d);
4603
717k
    for (int j = 0; j < 16; j++) {
4604
674k
      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
4605
674k
    }
4606
42.1k
  }
4607
21.0k
}
4608
4609
static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
4610
                                        const uint8_t *left, int upsample_left,
4611
1.55k
                                        int dy) {
4612
1.55k
  uint8_t dstT[64 * 32];
4613
1.55k
  dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
4614
1.55k
  transpose(dstT, 64, dst, stride, 32, 64);
4615
1.55k
}
4616
4617
static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
4618
                                        const uint8_t *left, int upsample_left,
4619
2.90k
                                        int dy) {
4620
2.90k
  uint8_t dstT[32 * 64];
4621
2.90k
  dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
4622
2.90k
  transpose(dstT, 32, dst, stride, 64, 32);
4623
2.90k
  return;
4624
2.90k
}
4625
4626
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
4627
static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
4628
                                        const uint8_t *left, int upsample_left,
4629
4.15k
                                        int dy) {
4630
4.15k
  uint8_t dstT[64 * 16];
4631
4.15k
  dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
4632
4.15k
  transpose(dstT, 64, dst, stride, 16, 64);
4633
4.15k
}
4634
4635
static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
4636
                                        const uint8_t *left, int upsample_left,
4637
16.3k
                                        int dy) {
4638
16.3k
  __m128i dstvec[64], d[16];
4639
4640
16.3k
  dr_prediction_z1_HxW_internal_avx2(16, 64, dstvec, left, upsample_left, dy);
4641
81.5k
  for (int i = 0; i < 64; i += 16) {
4642
65.2k
    transpose16x16_sse2((dstvec + i), d);
4643
1.10M
    for (int j = 0; j < 16; j++) {
4644
1.04M
      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
4645
1.04M
    }
4646
65.2k
  }
4647
16.3k
}
4648
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
4649
4650
void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
4651
                               const uint8_t *above, const uint8_t *left,
4652
741k
                               int upsample_left, int dx, int dy) {
4653
741k
  (void)above;
4654
741k
  (void)dx;
4655
741k
  assert(dx == 1);
4656
741k
  assert(dy > 0);
4657
4658
741k
  if (bw == bh) {
4659
404k
    switch (bw) {
4660
115k
      case 4:
4661
115k
        dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
4662
115k
        break;
4663
98.0k
      case 8:
4664
98.0k
        dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
4665
98.0k
        break;
4666
87.1k
      case 16:
4667
87.1k
        dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy);
4668
87.1k
        break;
4669
81.0k
      case 32:
4670
81.0k
        dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy);
4671
81.0k
        break;
4672
23.0k
      case 64:
4673
23.0k
        dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy);
4674
23.0k
        break;
4675
404k
    }
4676
404k
  } else {
4677
336k
    if (bw < bh) {
4678
105k
      if (bw + bw == bh) {
4679
73.0k
        switch (bw) {
4680
23.9k
          case 4:
4681
23.9k
            dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy);
4682
23.9k
            break;
4683
26.2k
          case 8:
4684
26.2k
            dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy);
4685
26.2k
            break;
4686
21.2k
          case 16:
4687
21.2k
            dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy);
4688
21.2k
            break;
4689
1.55k
          case 32:
4690
1.55k
            dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy);
4691
1.55k
            break;
4692
73.0k
        }
4693
73.0k
      } else {
4694
32.0k
        switch (bw) {
4695
0
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
4696
17.3k
          case 4:
4697
17.3k
            dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy);
4698
17.3k
            break;
4699
10.5k
          case 8:
4700
10.5k
            dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy);
4701
10.5k
            break;
4702
4.15k
          case 16:
4703
4.15k
            dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy);
4704
4.15k
            break;
4705
32.0k
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
4706
32.0k
        }
4707
32.0k
      }
4708
231k
    } else {
4709
231k
      if (bh + bh == bw) {
4710
114k
        switch (bh) {
4711
40.7k
          case 4:
4712
40.7k
            dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy);
4713
40.7k
            break;
4714
50.2k
          case 8:
4715
50.2k
            dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy);
4716
50.2k
            break;
4717
21.0k
          case 16:
4718
21.0k
            dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy);
4719
21.0k
            break;
4720
2.90k
          case 32:
4721
2.90k
            dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy);
4722
2.90k
            break;
4723
114k
        }
4724
116k
      } else {
4725
116k
        switch (bh) {
4726
0
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
4727
55.1k
          case 4:
4728
55.1k
            dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy);
4729
55.1k
            break;
4730
45.2k
          case 8:
4731
45.2k
            dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy);
4732
45.2k
            break;
4733
16.3k
          case 16:
4734
16.3k
            dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy);
4735
16.3k
            break;
4736
116k
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
4737
116k
        }
4738
116k
      }
4739
231k
    }
4740
336k
  }
4741
741k
}