Coverage Report

Created: 2024-06-18 06:48

/src/aom/aom_dsp/x86/intrapred_avx2.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <immintrin.h>
13
14
#include "config/av1_rtcd.h"
15
#include "aom_dsp/x86/intrapred_x86.h"
16
#include "aom_dsp/x86/intrapred_utils.h"
17
#include "aom_dsp/x86/lpf_common_sse2.h"
18
19
307k
static INLINE __m256i dc_sum_64(const uint8_t *ref) {
20
307k
  const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
21
307k
  const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
22
307k
  const __m256i zero = _mm256_setzero_si256();
23
307k
  __m256i y0 = _mm256_sad_epu8(x0, zero);
24
307k
  __m256i y1 = _mm256_sad_epu8(x1, zero);
25
307k
  y0 = _mm256_add_epi64(y0, y1);
26
307k
  __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
27
307k
  y0 = _mm256_add_epi64(u0, y0);
28
307k
  u0 = _mm256_unpackhi_epi64(y0, y0);
29
307k
  return _mm256_add_epi16(y0, u0);
30
307k
}
31
32
2.24M
static INLINE __m256i dc_sum_32(const uint8_t *ref) {
33
2.24M
  const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
34
2.24M
  const __m256i zero = _mm256_setzero_si256();
35
2.24M
  __m256i y = _mm256_sad_epu8(x, zero);
36
2.24M
  __m256i u = _mm256_permute2x128_si256(y, y, 1);
37
2.24M
  y = _mm256_add_epi64(u, y);
38
2.24M
  u = _mm256_unpackhi_epi64(y, y);
39
2.24M
  return _mm256_add_epi16(y, u);
40
2.24M
}
41
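
dc_sum_64() and dc_sum_32() above reduce 64 or 32 reference bytes to a single horizontal sum: _mm256_sad_epu8 against zero yields 64-bit partial sums, and the permute/unpack steps fold them into the low word of the result. A minimal scalar sketch of what dc_sum_32() computes (dc_sum_32_scalar is an illustrative name, not part of this file):

#include <stdint.h>

// Scalar reference for dc_sum_32(): the sum of 32 consecutive 8-bit
// reference pixels. The AVX2 version returns this value in the low
// 16-bit word of a __m256i.
static uint16_t dc_sum_32_scalar(const uint8_t *ref) {
  uint16_t sum = 0;  // 32 * 255 = 8160 fits comfortably in 16 bits
  for (int i = 0; i < 32; ++i) sum += ref[i];
  return sum;
}
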
42
static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
43
1.43M
                                  ptrdiff_t stride) {
44
45.6M
  for (int i = 0; i < height; ++i) {
45
44.2M
    _mm256_storeu_si256((__m256i *)dst, *r);
46
44.2M
    dst += stride;
47
44.2M
  }
48
1.43M
}
49
50
static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
51
                                    int height, uint8_t *dst,
52
3.26k
                                    ptrdiff_t stride) {
53
156k
  for (int i = 0; i < height; ++i) {
54
153k
    _mm256_storeu_si256((__m256i *)dst, *r0);
55
153k
    _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
56
153k
    dst += stride;
57
153k
  }
58
3.26k
}
59
60
static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
61
218k
                                  ptrdiff_t stride) {
62
10.9M
  for (int i = 0; i < height; ++i) {
63
10.7M
    _mm256_storeu_si256((__m256i *)dst, *r);
64
10.7M
    _mm256_storeu_si256((__m256i *)(dst + 32), *r);
65
10.7M
    dst += stride;
66
10.7M
  }
67
218k
}
68
69
static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = {
70
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
71
  { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
72
  { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
73
  { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
74
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 },
75
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 },
76
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 },
77
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
78
};
79
80
static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx4[4][16]) = {
81
  { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 },
82
  { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 },
83
  { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 },
84
  { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 }
85
};
86
87
static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx[8][32]) = {
88
  { 0, 1, 4, 5, 8,  9,  12, 13, 16, 17, 20, 21, 24, 25, 28, 29,
89
    2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
90
  { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27,
91
    0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
92
  { 0, 1, 0, 1, 4, 5, 8,  9,  12, 13, 16, 17, 20, 21, 24, 25,
93
    0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27 },
94
  { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23,
95
    0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25 },
96
  { 0, 1, 0, 1, 0, 1, 0, 1, 8,  9,  12, 13, 16, 17, 20, 21,
97
    0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 22, 23 },
98
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19,
99
    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 20, 21 },
100
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17,
101
    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 18, 19 },
102
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15,
103
    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 17 }
104
};
105
106
static DECLARE_ALIGNED(32, uint16_t, HighbdBaseMask[17][16]) = {
107
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
108
  { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
109
  { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
110
  { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
111
  { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
112
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
113
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
114
    0 },
115
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0,
116
    0, 0 },
117
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0,
118
    0, 0, 0, 0 },
119
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0,
120
    0, 0, 0, 0, 0, 0 },
121
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
122
    0xffff, 0, 0, 0, 0, 0, 0 },
123
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
124
    0xffff, 0xffff, 0, 0, 0, 0, 0 },
125
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
126
    0xffff, 0xffff, 0xffff, 0, 0, 0, 0 },
127
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
128
    0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 },
129
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
130
    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 },
131
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
132
    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 },
133
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
134
    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff }
135
};
136
137
69.1k
static INLINE void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) {
138
69.1k
  __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
139
140
69.1k
  r0 = _mm_unpacklo_epi16(x[0], x[1]);
141
69.1k
  r1 = _mm_unpacklo_epi16(x[2], x[3]);
142
69.1k
  r2 = _mm_unpacklo_epi16(x[4], x[5]);
143
69.1k
  r3 = _mm_unpacklo_epi16(x[6], x[7]);
144
145
69.1k
  r4 = _mm_unpacklo_epi16(x[8], x[9]);
146
69.1k
  r5 = _mm_unpacklo_epi16(x[10], x[11]);
147
69.1k
  r6 = _mm_unpacklo_epi16(x[12], x[13]);
148
69.1k
  r7 = _mm_unpacklo_epi16(x[14], x[15]);
149
150
69.1k
  r8 = _mm_unpacklo_epi32(r0, r1);
151
69.1k
  r9 = _mm_unpackhi_epi32(r0, r1);
152
69.1k
  r10 = _mm_unpacklo_epi32(r2, r3);
153
69.1k
  r11 = _mm_unpackhi_epi32(r2, r3);
154
155
69.1k
  r12 = _mm_unpacklo_epi32(r4, r5);
156
69.1k
  r13 = _mm_unpackhi_epi32(r4, r5);
157
69.1k
  r14 = _mm_unpacklo_epi32(r6, r7);
158
69.1k
  r15 = _mm_unpackhi_epi32(r6, r7);
159
160
69.1k
  r0 = _mm_unpacklo_epi64(r8, r9);
161
69.1k
  r1 = _mm_unpackhi_epi64(r8, r9);
162
69.1k
  r2 = _mm_unpacklo_epi64(r10, r11);
163
69.1k
  r3 = _mm_unpackhi_epi64(r10, r11);
164
165
69.1k
  r4 = _mm_unpacklo_epi64(r12, r13);
166
69.1k
  r5 = _mm_unpackhi_epi64(r12, r13);
167
69.1k
  r6 = _mm_unpacklo_epi64(r14, r15);
168
69.1k
  r7 = _mm_unpackhi_epi64(r14, r15);
169
170
69.1k
  d[0] = _mm_unpacklo_epi64(r0, r2);
171
69.1k
  d[1] = _mm_unpacklo_epi64(r4, r6);
172
69.1k
  d[2] = _mm_unpacklo_epi64(r1, r3);
173
69.1k
  d[3] = _mm_unpacklo_epi64(r5, r7);
174
175
69.1k
  d[4] = _mm_unpackhi_epi64(r0, r2);
176
69.1k
  d[5] = _mm_unpackhi_epi64(r4, r6);
177
69.1k
  d[6] = _mm_unpackhi_epi64(r1, r3);
178
69.1k
  d[7] = _mm_unpackhi_epi64(r5, r7);
179
69.1k
}
180
181
25.3k
static INLINE void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) {
182
25.3k
  __m256i w0, w1, w2, w3, ww0, ww1;
183
184
25.3k
  w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13
185
25.3k
  w1 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
186
25.3k
  w2 = _mm256_unpackhi_epi16(x[0], x[1]);  // 40 50 41 51 42 52 43 53
187
25.3k
  w3 = _mm256_unpackhi_epi16(x[2], x[3]);  // 60 70 61 71 62 72 63 73
188
189
25.3k
  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
190
25.3k
  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
191
192
25.3k
  d[0] = _mm256_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
193
25.3k
  d[1] = _mm256_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
194
195
25.3k
  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
196
25.3k
  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
197
198
25.3k
  d[2] = _mm256_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
199
25.3k
  d[3] = _mm256_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
200
25.3k
}
201
202
164k
static INLINE void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) {
203
164k
  __m256i w0, w1, w2, w3, ww0, ww1;
204
205
164k
  w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13
206
164k
  w1 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
207
164k
  w2 = _mm256_unpacklo_epi16(x[4], x[5]);  // 40 50 41 51 42 52 43 53
208
164k
  w3 = _mm256_unpacklo_epi16(x[6], x[7]);  // 60 70 61 71 62 72 63 73
209
210
164k
  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
211
164k
  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
212
213
164k
  d[0] = _mm256_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
214
164k
  d[1] = _mm256_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
215
216
164k
  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
217
164k
  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
218
219
164k
  d[2] = _mm256_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
220
164k
  d[3] = _mm256_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
221
222
164k
  w0 = _mm256_unpackhi_epi16(x[0], x[1]);  // 04 14 05 15 06 16 07 17
223
164k
  w1 = _mm256_unpackhi_epi16(x[2], x[3]);  // 24 34 25 35 26 36 27 37
224
164k
  w2 = _mm256_unpackhi_epi16(x[4], x[5]);  // 44 54 45 55 46 56 47 57
225
164k
  w3 = _mm256_unpackhi_epi16(x[6], x[7]);  // 64 74 65 75 66 76 67 77
226
227
164k
  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
228
164k
  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75
229
230
164k
  d[4] = _mm256_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
231
164k
  d[5] = _mm256_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75
232
233
164k
  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
234
164k
  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77
235
236
164k
  d[6] = _mm256_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
237
164k
  d[7] = _mm256_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
238
164k
}
239
240
1.19M
static INLINE void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) {
241
1.19M
  __m256i w0, w1, w2, w3, ww0, ww1;
242
1.19M
  __m256i dd[16];
243
1.19M
  w0 = _mm256_unpacklo_epi16(x[0], x[1]);
244
1.19M
  w1 = _mm256_unpacklo_epi16(x[2], x[3]);
245
1.19M
  w2 = _mm256_unpacklo_epi16(x[4], x[5]);
246
1.19M
  w3 = _mm256_unpacklo_epi16(x[6], x[7]);
247
248
1.19M
  ww0 = _mm256_unpacklo_epi32(w0, w1);  //
249
1.19M
  ww1 = _mm256_unpacklo_epi32(w2, w3);  //
250
251
1.19M
  dd[0] = _mm256_unpacklo_epi64(ww0, ww1);
252
1.19M
  dd[1] = _mm256_unpackhi_epi64(ww0, ww1);
253
254
1.19M
  ww0 = _mm256_unpackhi_epi32(w0, w1);  //
255
1.19M
  ww1 = _mm256_unpackhi_epi32(w2, w3);  //
256
257
1.19M
  dd[2] = _mm256_unpacklo_epi64(ww0, ww1);
258
1.19M
  dd[3] = _mm256_unpackhi_epi64(ww0, ww1);
259
260
1.19M
  w0 = _mm256_unpackhi_epi16(x[0], x[1]);
261
1.19M
  w1 = _mm256_unpackhi_epi16(x[2], x[3]);
262
1.19M
  w2 = _mm256_unpackhi_epi16(x[4], x[5]);
263
1.19M
  w3 = _mm256_unpackhi_epi16(x[6], x[7]);
264
265
1.19M
  ww0 = _mm256_unpacklo_epi32(w0, w1);  //
266
1.19M
  ww1 = _mm256_unpacklo_epi32(w2, w3);  //
267
268
1.19M
  dd[4] = _mm256_unpacklo_epi64(ww0, ww1);
269
1.19M
  dd[5] = _mm256_unpackhi_epi64(ww0, ww1);
270
271
1.19M
  ww0 = _mm256_unpackhi_epi32(w0, w1);  //
272
1.19M
  ww1 = _mm256_unpackhi_epi32(w2, w3);  //
273
274
1.19M
  dd[6] = _mm256_unpacklo_epi64(ww0, ww1);
275
1.19M
  dd[7] = _mm256_unpackhi_epi64(ww0, ww1);
276
277
1.19M
  w0 = _mm256_unpacklo_epi16(x[8], x[9]);
278
1.19M
  w1 = _mm256_unpacklo_epi16(x[10], x[11]);
279
1.19M
  w2 = _mm256_unpacklo_epi16(x[12], x[13]);
280
1.19M
  w3 = _mm256_unpacklo_epi16(x[14], x[15]);
281
282
1.19M
  ww0 = _mm256_unpacklo_epi32(w0, w1);
283
1.19M
  ww1 = _mm256_unpacklo_epi32(w2, w3);
284
285
1.19M
  dd[8] = _mm256_unpacklo_epi64(ww0, ww1);
286
1.19M
  dd[9] = _mm256_unpackhi_epi64(ww0, ww1);
287
288
1.19M
  ww0 = _mm256_unpackhi_epi32(w0, w1);
289
1.19M
  ww1 = _mm256_unpackhi_epi32(w2, w3);
290
291
1.19M
  dd[10] = _mm256_unpacklo_epi64(ww0, ww1);
292
1.19M
  dd[11] = _mm256_unpackhi_epi64(ww0, ww1);
293
294
1.19M
  w0 = _mm256_unpackhi_epi16(x[8], x[9]);
295
1.19M
  w1 = _mm256_unpackhi_epi16(x[10], x[11]);
296
1.19M
  w2 = _mm256_unpackhi_epi16(x[12], x[13]);
297
1.19M
  w3 = _mm256_unpackhi_epi16(x[14], x[15]);
298
299
1.19M
  ww0 = _mm256_unpacklo_epi32(w0, w1);
300
1.19M
  ww1 = _mm256_unpacklo_epi32(w2, w3);
301
302
1.19M
  dd[12] = _mm256_unpacklo_epi64(ww0, ww1);
303
1.19M
  dd[13] = _mm256_unpackhi_epi64(ww0, ww1);
304
305
1.19M
  ww0 = _mm256_unpackhi_epi32(w0, w1);
306
1.19M
  ww1 = _mm256_unpackhi_epi32(w2, w3);
307
308
1.19M
  dd[14] = _mm256_unpacklo_epi64(ww0, ww1);
309
1.19M
  dd[15] = _mm256_unpackhi_epi64(ww0, ww1);
310
311
10.7M
  for (int i = 0; i < 8; i++) {
312
9.56M
    d[i] = _mm256_insertf128_si256(dd[i], _mm256_castsi256_si128(dd[i + 8]), 1);
313
9.56M
    d[i + 8] = _mm256_insertf128_si256(dd[i + 8],
314
9.56M
                                       _mm256_extracti128_si256(dd[i], 1), 0);
315
9.56M
  }
316
1.19M
}
317
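
The transpose helpers above shuffle 16-bit pixels with unpack and lane-swap steps; the end result of highbd_transpose16x16_avx2() is an ordinary 16x16 transpose. A scalar reference, assuming x and d are viewed as flat row-major 16x16 arrays (illustrative only, not part of this file):

#include <stdint.h>

// Scalar equivalent of highbd_transpose16x16_avx2(): d becomes the
// transpose of x, both treated as row-major 16x16 arrays of uint16_t.
static void transpose_16x16_scalar(const uint16_t *x, uint16_t *d) {
  for (int r = 0; r < 16; ++r) {
    for (int c = 0; c < 16; ++c) d[c * 16 + r] = x[r * 16 + c];
  }
}
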
318
void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
319
978k
                                 const uint8_t *above, const uint8_t *left) {
320
978k
  const __m256i sum_above = dc_sum_32(above);
321
978k
  __m256i sum_left = dc_sum_32(left);
322
978k
  sum_left = _mm256_add_epi16(sum_left, sum_above);
323
978k
  const __m256i thirtytwo = _mm256_set1_epi16(32);
324
978k
  sum_left = _mm256_add_epi16(sum_left, thirtytwo);
325
978k
  sum_left = _mm256_srai_epi16(sum_left, 6);
326
978k
  const __m256i zero = _mm256_setzero_si256();
327
978k
  __m256i row = _mm256_shuffle_epi8(sum_left, zero);
328
978k
  row_store_32xh(&row, 32, dst, stride);
329
978k
}
330
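
aom_dc_predictor_32x32_avx2() above fills the block with the rounded mean of the 64 border pixels: the +32 and arithmetic shift by 6 implement round(sum / 64), and the value is broadcast with _mm256_shuffle_epi8. A scalar sketch under those assumptions (dc_predictor_32x32_scalar is an illustrative name):

#include <stddef.h>
#include <stdint.h>

// Scalar sketch of the 32x32 DC predictor: every output pixel is the
// rounded average of the 32 above and 32 left reference pixels.
static void dc_predictor_32x32_scalar(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  int sum = 0;
  for (int i = 0; i < 32; ++i) sum += above[i] + left[i];
  const uint8_t dc = (uint8_t)((sum + 32) >> 6);  // round(sum / 64)
  for (int r = 0; r < 32; ++r) {
    for (int c = 0; c < 32; ++c) dst[c] = dc;
    dst += stride;
  }
}
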
331
void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
332
                                     const uint8_t *above,
333
99.1k
                                     const uint8_t *left) {
334
99.1k
  __m256i sum = dc_sum_32(above);
335
99.1k
  (void)left;
336
337
99.1k
  const __m256i sixteen = _mm256_set1_epi16(16);
338
99.1k
  sum = _mm256_add_epi16(sum, sixteen);
339
99.1k
  sum = _mm256_srai_epi16(sum, 5);
340
99.1k
  const __m256i zero = _mm256_setzero_si256();
341
99.1k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
342
99.1k
  row_store_32xh(&row, 32, dst, stride);
343
99.1k
}
344
345
void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
346
                                      const uint8_t *above,
347
154k
                                      const uint8_t *left) {
348
154k
  __m256i sum = dc_sum_32(left);
349
154k
  (void)above;
350
351
154k
  const __m256i sixteen = _mm256_set1_epi16(16);
352
154k
  sum = _mm256_add_epi16(sum, sixteen);
353
154k
  sum = _mm256_srai_epi16(sum, 5);
354
154k
  const __m256i zero = _mm256_setzero_si256();
355
154k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
356
154k
  row_store_32xh(&row, 32, dst, stride);
357
154k
}
358
359
void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
360
                                     const uint8_t *above,
361
36.9k
                                     const uint8_t *left) {
362
36.9k
  (void)above;
363
36.9k
  (void)left;
364
36.9k
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
365
36.9k
  row_store_32xh(&row, 32, dst, stride);
366
36.9k
}
367
368
void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
369
23.2k
                                const uint8_t *above, const uint8_t *left) {
370
23.2k
  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
371
23.2k
  (void)left;
372
23.2k
  row_store_32xh(&row, 32, dst, stride);
373
23.2k
}
374
375
// There are 32 rows together. This function handles lines
376
// 0,1,2,3 and 16,17,18,19. The next call handles lines
377
// 4,5,6,7 and 20,21,22,23, so four calls cover
378
// all 32 rows.
379
static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst,
380
604k
                                        ptrdiff_t stride) {
381
604k
  __m256i t[4];
382
604k
  __m256i m = _mm256_setzero_si256();
383
604k
  const __m256i inc = _mm256_set1_epi8(4);
384
604k
  int i;
385
386
3.02M
  for (i = 0; i < 4; i++) {
387
2.41M
    t[i] = _mm256_shuffle_epi8(*row, m);
388
2.41M
    __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0);
389
2.41M
    __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11);
390
2.41M
    _mm256_storeu_si256((__m256i *)dst, r0);
391
2.41M
    _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1);
392
2.41M
    dst += stride;
393
2.41M
    m = _mm256_add_epi8(m, inc);
394
2.41M
  }
395
604k
}
396
397
void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
398
151k
                                const uint8_t *above, const uint8_t *left) {
399
151k
  (void)above;
400
151k
  const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);
401
402
151k
  __m256i u = _mm256_unpacklo_epi8(left_col, left_col);
403
404
151k
  __m256i v = _mm256_unpacklo_epi8(u, u);
405
151k
  h_predictor_32x8line(&v, dst, stride);
406
151k
  dst += stride << 2;
407
408
151k
  v = _mm256_unpackhi_epi8(u, u);
409
151k
  h_predictor_32x8line(&v, dst, stride);
410
151k
  dst += stride << 2;
411
412
151k
  u = _mm256_unpackhi_epi8(left_col, left_col);
413
414
151k
  v = _mm256_unpacklo_epi8(u, u);
415
151k
  h_predictor_32x8line(&v, dst, stride);
416
151k
  dst += stride << 2;
417
418
151k
  v = _mm256_unpackhi_epi8(u, u);
419
151k
  h_predictor_32x8line(&v, dst, stride);
420
151k
}
421
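
The horizontal predictor above broadcasts one left pixel across each output row; h_predictor_32x8line() writes rows r and r + 16 together, which is why it stores at dst and at dst + (stride << 4). A scalar sketch of the whole 32x32 case (illustrative only, not part of this file):

#include <stddef.h>
#include <stdint.h>

// Scalar sketch of aom_h_predictor_32x32: row r of the block is filled
// with left[r]; the AVX2 path produces the same rows via byte shuffles.
static void h_predictor_32x32_scalar(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *left) {
  for (int r = 0; r < 32; ++r) {
    for (int c = 0; c < 32; ++c) dst[c] = left[r];
    dst += stride;
  }
}
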
422
// -----------------------------------------------------------------------------
423
// Rectangle
424
void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
425
103k
                                 const uint8_t *above, const uint8_t *left) {
426
103k
  const __m128i top_sum = dc_sum_32_sse2(above);
427
103k
  __m128i left_sum = dc_sum_16_sse2(left);
428
103k
  left_sum = _mm_add_epi16(top_sum, left_sum);
429
103k
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(left_sum);
430
103k
  sum += 24;
431
103k
  sum /= 48;
432
103k
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
433
103k
  row_store_32xh(&row, 16, dst, stride);
434
103k
}
435
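
For rectangular blocks the DC value is the rounded mean over width + height border pixels, which is why the 32x16 case above adds 24 before dividing by 48. A scalar sketch of the value computation (dc_value_32x16_scalar is an illustrative name):

#include <stdint.h>

// Scalar sketch of the DC value used by aom_dc_predictor_32x16_avx2():
// the rounded mean of 32 above pixels and 16 left pixels.
static uint8_t dc_value_32x16_scalar(const uint8_t *above,
                                     const uint8_t *left) {
  unsigned sum = 0;
  for (int i = 0; i < 32; ++i) sum += above[i];
  for (int i = 0; i < 16; ++i) sum += left[i];
  return (uint8_t)((sum + 24) / 48);  // round(sum / (32 + 16))
}
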
436
void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
437
7.76k
                                 const uint8_t *above, const uint8_t *left) {
438
7.76k
  const __m256i sum_above = dc_sum_32(above);
439
7.76k
  __m256i sum_left = dc_sum_64(left);
440
7.76k
  sum_left = _mm256_add_epi16(sum_left, sum_above);
441
7.76k
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
442
7.76k
  sum += 48;
443
7.76k
  sum /= 96;
444
7.76k
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
445
7.76k
  row_store_32xh(&row, 64, dst, stride);
446
7.76k
}
447
448
void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
449
94.8k
                                 const uint8_t *above, const uint8_t *left) {
450
94.8k
  const __m256i sum_above = dc_sum_64(above);
451
94.8k
  __m256i sum_left = dc_sum_64(left);
452
94.8k
  sum_left = _mm256_add_epi16(sum_left, sum_above);
453
94.8k
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
454
94.8k
  sum += 64;
455
94.8k
  sum /= 128;
456
94.8k
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
457
94.8k
  row_store_64xh(&row, 64, dst, stride);
458
94.8k
}
459
460
void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
461
13.8k
                                 const uint8_t *above, const uint8_t *left) {
462
13.8k
  const __m256i sum_above = dc_sum_64(above);
463
13.8k
  __m256i sum_left = dc_sum_32(left);
464
13.8k
  sum_left = _mm256_add_epi16(sum_left, sum_above);
465
13.8k
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
466
13.8k
  sum += 48;
467
13.8k
  sum /= 96;
468
13.8k
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
469
13.8k
  row_store_64xh(&row, 32, dst, stride);
470
13.8k
}
471
472
void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
473
54.5k
                                 const uint8_t *above, const uint8_t *left) {
474
54.5k
  const __m256i sum_above = dc_sum_64(above);
475
54.5k
  __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
476
54.5k
  sum_left = _mm256_add_epi16(sum_left, sum_above);
477
54.5k
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
478
54.5k
  sum += 40;
479
54.5k
  sum /= 80;
480
54.5k
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
481
54.5k
  row_store_64xh(&row, 16, dst, stride);
482
54.5k
}
483
484
void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
485
                                     const uint8_t *above,
486
4.81k
                                     const uint8_t *left) {
487
4.81k
  __m256i sum = dc_sum_32(above);
488
4.81k
  (void)left;
489
490
4.81k
  const __m256i sixteen = _mm256_set1_epi16(16);
491
4.81k
  sum = _mm256_add_epi16(sum, sixteen);
492
4.81k
  sum = _mm256_srai_epi16(sum, 5);
493
4.81k
  const __m256i zero = _mm256_setzero_si256();
494
4.81k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
495
4.81k
  row_store_32xh(&row, 16, dst, stride);
496
4.81k
}
497
498
void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
499
                                     const uint8_t *above,
500
1.57k
                                     const uint8_t *left) {
501
1.57k
  __m256i sum = dc_sum_32(above);
502
1.57k
  (void)left;
503
504
1.57k
  const __m256i sixteen = _mm256_set1_epi16(16);
505
1.57k
  sum = _mm256_add_epi16(sum, sixteen);
506
1.57k
  sum = _mm256_srai_epi16(sum, 5);
507
1.57k
  const __m256i zero = _mm256_setzero_si256();
508
1.57k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
509
1.57k
  row_store_32xh(&row, 64, dst, stride);
510
1.57k
}
511
512
void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
513
                                     const uint8_t *above,
514
15.9k
                                     const uint8_t *left) {
515
15.9k
  __m256i sum = dc_sum_64(above);
516
15.9k
  (void)left;
517
518
15.9k
  const __m256i thirtytwo = _mm256_set1_epi16(32);
519
15.9k
  sum = _mm256_add_epi16(sum, thirtytwo);
520
15.9k
  sum = _mm256_srai_epi16(sum, 6);
521
15.9k
  const __m256i zero = _mm256_setzero_si256();
522
15.9k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
523
15.9k
  row_store_64xh(&row, 64, dst, stride);
524
15.9k
}
525
526
void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
527
                                     const uint8_t *above,
528
533
                                     const uint8_t *left) {
529
533
  __m256i sum = dc_sum_64(above);
530
533
  (void)left;
531
532
533
  const __m256i thirtytwo = _mm256_set1_epi16(32);
533
533
  sum = _mm256_add_epi16(sum, thirtytwo);
534
533
  sum = _mm256_srai_epi16(sum, 6);
535
533
  const __m256i zero = _mm256_setzero_si256();
536
533
  __m256i row = _mm256_shuffle_epi8(sum, zero);
537
533
  row_store_64xh(&row, 32, dst, stride);
538
533
}
539
540
void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
541
                                     const uint8_t *above,
542
1.94k
                                     const uint8_t *left) {
543
1.94k
  __m256i sum = dc_sum_64(above);
544
1.94k
  (void)left;
545
546
1.94k
  const __m256i thirtytwo = _mm256_set1_epi16(32);
547
1.94k
  sum = _mm256_add_epi16(sum, thirtytwo);
548
1.94k
  sum = _mm256_srai_epi16(sum, 6);
549
1.94k
  const __m256i zero = _mm256_setzero_si256();
550
1.94k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
551
1.94k
  row_store_64xh(&row, 16, dst, stride);
552
1.94k
}
553
554
void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
555
                                      const uint8_t *above,
556
4.61k
                                      const uint8_t *left) {
557
4.61k
  __m128i sum = dc_sum_16_sse2(left);
558
4.61k
  (void)above;
559
560
4.61k
  const __m128i eight = _mm_set1_epi16(8);
561
4.61k
  sum = _mm_add_epi16(sum, eight);
562
4.61k
  sum = _mm_srai_epi16(sum, 4);
563
4.61k
  const __m128i zero = _mm_setzero_si128();
564
4.61k
  const __m128i r = _mm_shuffle_epi8(sum, zero);
565
4.61k
  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
566
4.61k
  row_store_32xh(&row, 16, dst, stride);
567
4.61k
}
568
569
void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
570
                                      const uint8_t *above,
571
1.59k
                                      const uint8_t *left) {
572
1.59k
  __m256i sum = dc_sum_64(left);
573
1.59k
  (void)above;
574
575
1.59k
  const __m256i thirtytwo = _mm256_set1_epi16(32);
576
1.59k
  sum = _mm256_add_epi16(sum, thirtytwo);
577
1.59k
  sum = _mm256_srai_epi16(sum, 6);
578
1.59k
  const __m256i zero = _mm256_setzero_si256();
579
1.59k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
580
1.59k
  row_store_32xh(&row, 64, dst, stride);
581
1.59k
}
582
583
void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
584
                                      const uint8_t *above,
585
21.5k
                                      const uint8_t *left) {
586
21.5k
  __m256i sum = dc_sum_64(left);
587
21.5k
  (void)above;
588
589
21.5k
  const __m256i thirtytwo = _mm256_set1_epi16(32);
590
21.5k
  sum = _mm256_add_epi16(sum, thirtytwo);
591
21.5k
  sum = _mm256_srai_epi16(sum, 6);
592
21.5k
  const __m256i zero = _mm256_setzero_si256();
593
21.5k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
594
21.5k
  row_store_64xh(&row, 64, dst, stride);
595
21.5k
}
596
597
void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
598
                                      const uint8_t *above,
599
1.52k
                                      const uint8_t *left) {
600
1.52k
  __m256i sum = dc_sum_32(left);
601
1.52k
  (void)above;
602
603
1.52k
  const __m256i sixteen = _mm256_set1_epi16(16);
604
1.52k
  sum = _mm256_add_epi16(sum, sixteen);
605
1.52k
  sum = _mm256_srai_epi16(sum, 5);
606
1.52k
  const __m256i zero = _mm256_setzero_si256();
607
1.52k
  __m256i row = _mm256_shuffle_epi8(sum, zero);
608
1.52k
  row_store_64xh(&row, 32, dst, stride);
609
1.52k
}
610
611
void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
612
                                      const uint8_t *above,
613
303
                                      const uint8_t *left) {
614
303
  __m128i sum = dc_sum_16_sse2(left);
615
303
  (void)above;
616
617
303
  const __m128i eight = _mm_set1_epi16(8);
618
303
  sum = _mm_add_epi16(sum, eight);
619
303
  sum = _mm_srai_epi16(sum, 4);
620
303
  const __m128i zero = _mm_setzero_si128();
621
303
  const __m128i r = _mm_shuffle_epi8(sum, zero);
622
303
  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
623
303
  row_store_64xh(&row, 16, dst, stride);
624
303
}
625
626
void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
627
                                     const uint8_t *above,
628
3.99k
                                     const uint8_t *left) {
629
3.99k
  (void)above;
630
3.99k
  (void)left;
631
3.99k
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
632
3.99k
  row_store_32xh(&row, 16, dst, stride);
633
3.99k
}
634
635
void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
636
                                     const uint8_t *above,
637
1.76k
                                     const uint8_t *left) {
638
1.76k
  (void)above;
639
1.76k
  (void)left;
640
1.76k
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
641
1.76k
  row_store_32xh(&row, 64, dst, stride);
642
1.76k
}
643
644
void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
645
                                     const uint8_t *above,
646
11.7k
                                     const uint8_t *left) {
647
11.7k
  (void)above;
648
11.7k
  (void)left;
649
11.7k
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
650
11.7k
  row_store_64xh(&row, 64, dst, stride);
651
11.7k
}
652
653
void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
654
                                     const uint8_t *above,
655
1.41k
                                     const uint8_t *left) {
656
1.41k
  (void)above;
657
1.41k
  (void)left;
658
1.41k
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
659
1.41k
  row_store_64xh(&row, 32, dst, stride);
660
1.41k
}
661
662
void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
663
                                     const uint8_t *above,
664
565
                                     const uint8_t *left) {
665
565
  (void)above;
666
565
  (void)left;
667
565
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
668
565
  row_store_64xh(&row, 16, dst, stride);
669
565
}
670
671
void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
672
8.22k
                                const uint8_t *above, const uint8_t *left) {
673
8.22k
  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
674
8.22k
  (void)left;
675
8.22k
  row_store_32xh(&row, 16, dst, stride);
676
8.22k
}
677
678
void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
679
476
                                const uint8_t *above, const uint8_t *left) {
680
476
  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
681
476
  (void)left;
682
476
  row_store_32xh(&row, 64, dst, stride);
683
476
}
684
685
void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
686
1.94k
                                const uint8_t *above, const uint8_t *left) {
687
1.94k
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
688
1.94k
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
689
1.94k
  (void)left;
690
1.94k
  row_store_32x2xh(&row0, &row1, 64, dst, stride);
691
1.94k
}
692
693
void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
694
495
                                const uint8_t *above, const uint8_t *left) {
695
495
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
696
495
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
697
495
  (void)left;
698
495
  row_store_32x2xh(&row0, &row1, 32, dst, stride);
699
495
}
700
701
void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
702
832
                                const uint8_t *above, const uint8_t *left) {
703
832
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
704
832
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
705
832
  (void)left;
706
832
  row_store_32x2xh(&row0, &row1, 16, dst, stride);
707
832
}
708
709
// -----------------------------------------------------------------------------
710
// PAETH_PRED
711
712
// Return 16 16-bit pixels in one row (__m256i)
713
static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top,
714
68.1M
                                 const __m256i *topleft) {
715
68.1M
  const __m256i base =
716
68.1M
      _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft);
717
718
68.1M
  __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left));
719
68.1M
  __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top));
720
68.1M
  __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft));
721
722
68.1M
  __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
723
68.1M
  mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));
724
68.1M
  __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);
725
726
68.1M
  pl = _mm256_andnot_si256(mask1, *left);
727
728
68.1M
  ptl = _mm256_and_si256(mask2, *topleft);
729
68.1M
  pt = _mm256_andnot_si256(mask2, *top);
730
68.1M
  pt = _mm256_or_si256(pt, ptl);
731
68.1M
  pt = _mm256_and_si256(mask1, pt);
732
733
68.1M
  return _mm256_or_si256(pt, pl);
734
68.1M
}
735
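
paeth_pred() above is a branch-free form of the Paeth selection rule: compute base = left + top - topleft and return whichever of the three neighbours is closest to it, with ties resolved in the order left, top, topleft; the two compare masks reproduce exactly that ordering. A scalar sketch (illustrative only, not part of this file):

#include <stdint.h>
#include <stdlib.h>

// Scalar form of the Paeth predictor that paeth_pred() vectorizes.
static uint8_t paeth_scalar(uint8_t left, uint8_t top, uint8_t topleft) {
  const int base = left + top - topleft;
  const int pl = abs(base - left);      // distance to left
  const int pt = abs(base - top);       // distance to top
  const int ptl = abs(base - topleft);  // distance to top-left
  if (pl <= pt && pl <= ptl) return left;
  if (pt <= ptl) return top;
  return topleft;
}
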
736
// Return 16 8-bit pixels in one row (__m128i)
737
static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
738
67.5M
                                      const __m256i *topleft) {
739
67.5M
  const __m256i p0 = paeth_pred(left, top, topleft);
740
67.5M
  const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
741
67.5M
  const __m256i p = _mm256_packus_epi16(p0, p1);
742
67.5M
  return _mm256_castsi256_si128(p);
743
67.5M
}
744
745
1.88M
static INLINE __m256i get_top_vector(const uint8_t *above) {
746
1.88M
  const __m128i x = _mm_load_si128((const __m128i *)above);
747
1.88M
  const __m128i zero = _mm_setzero_si128();
748
1.88M
  const __m128i t0 = _mm_unpacklo_epi8(x, zero);
749
1.88M
  const __m128i t1 = _mm_unpackhi_epi8(x, zero);
750
1.88M
  return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1);
751
1.88M
}
752
753
void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
754
58.1k
                                   const uint8_t *above, const uint8_t *left) {
755
58.1k
  __m128i x = _mm_loadl_epi64((const __m128i *)left);
756
58.1k
  const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
757
58.1k
  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
758
58.1k
  __m256i rep = _mm256_set1_epi16((short)0x8000);
759
58.1k
  const __m256i one = _mm256_set1_epi16(1);
760
58.1k
  const __m256i top = get_top_vector(above);
761
762
58.1k
  int i;
763
523k
  for (i = 0; i < 8; ++i) {
764
465k
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
765
465k
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
766
767
465k
    _mm_store_si128((__m128i *)dst, row);
768
465k
    dst += stride;
769
465k
    rep = _mm256_add_epi16(rep, one);
770
465k
  }
771
58.1k
}
772
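
The per-row broadcast in the Paeth loops relies on _mm256_shuffle_epi8 with a control word of 0x8000 + i: the low control byte selects byte i of the left vector and the 0x80 high byte zeroes the upper half of each 16-bit lane, so every lane of l16 holds left[i] widened to 16 bits. A scalar view of that step (broadcast_left_scalar is an illustrative name):

#include <stdint.h>

// Scalar view of the rep/_mm256_shuffle_epi8 broadcast: on row i every
// 16-bit lane ends up holding left[i], zero-extended.
static void broadcast_left_scalar(uint16_t l16[16], const uint8_t *left,
                                  int i) {
  for (int lane = 0; lane < 16; ++lane) l16[lane] = left[i];
}
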
773
3.33M
static INLINE __m256i get_left_vector(const uint8_t *left) {
774
3.33M
  const __m128i x = _mm_load_si128((const __m128i *)left);
775
3.33M
  return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
776
3.33M
}
777
778
void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
779
82.1k
                                    const uint8_t *above, const uint8_t *left) {
780
82.1k
  const __m256i l = get_left_vector(left);
781
82.1k
  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
782
82.1k
  __m256i rep = _mm256_set1_epi16((short)0x8000);
783
82.1k
  const __m256i one = _mm256_set1_epi16(1);
784
82.1k
  const __m256i top = get_top_vector(above);
785
786
82.1k
  int i;
787
1.39M
  for (i = 0; i < 16; ++i) {
788
1.31M
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
789
1.31M
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
790
791
1.31M
    _mm_store_si128((__m128i *)dst, row);
792
1.31M
    dst += stride;
793
1.31M
    rep = _mm256_add_epi16(rep, one);
794
1.31M
  }
795
82.1k
}
796
797
void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
798
893k
                                    const uint8_t *above, const uint8_t *left) {
799
893k
  __m256i l = get_left_vector(left);
800
893k
  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
801
893k
  __m256i rep = _mm256_set1_epi16((short)0x8000);
802
893k
  const __m256i one = _mm256_set1_epi16(1);
803
893k
  const __m256i top = get_top_vector(above);
804
805
893k
  int i;
806
15.1M
  for (i = 0; i < 16; ++i) {
807
14.2M
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
808
14.2M
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
809
810
14.2M
    _mm_store_si128((__m128i *)dst, row);
811
14.2M
    dst += stride;
812
14.2M
    rep = _mm256_add_epi16(rep, one);
813
14.2M
  }
814
815
893k
  l = get_left_vector(left + 16);
816
893k
  rep = _mm256_set1_epi16((short)0x8000);
817
15.1M
  for (i = 0; i < 16; ++i) {
818
14.2M
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
819
14.2M
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
820
821
14.2M
    _mm_store_si128((__m128i *)dst, row);
822
14.2M
    dst += stride;
823
14.2M
    rep = _mm256_add_epi16(rep, one);
824
14.2M
  }
825
893k
}
826
827
void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
828
222k
                                    const uint8_t *above, const uint8_t *left) {
829
222k
  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
830
222k
  const __m256i one = _mm256_set1_epi16(1);
831
222k
  const __m256i top = get_top_vector(above);
832
833
1.11M
  for (int j = 0; j < 4; ++j) {
834
888k
    const __m256i l = get_left_vector(left + j * 16);
835
888k
    __m256i rep = _mm256_set1_epi16((short)0x8000);
836
15.1M
    for (int i = 0; i < 16; ++i) {
837
14.2M
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
838
14.2M
      const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
839
840
14.2M
      _mm_store_si128((__m128i *)dst, row);
841
14.2M
      dst += stride;
842
14.2M
      rep = _mm256_add_epi16(rep, one);
843
14.2M
    }
844
888k
  }
845
222k
}
846
847
// Return 32 8-bit pixels in one row (__m256i)
848
static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
849
                                      const __m256i *top1,
850
317k
                                      const __m256i *topleft) {
851
317k
  __m256i p0 = paeth_pred(left, top0, topleft);
852
317k
  __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
853
317k
  const __m256i x0 = _mm256_packus_epi16(p0, p1);
854
855
317k
  p0 = paeth_pred(left, top1, topleft);
856
317k
  p1 = _mm256_permute4x64_epi64(p0, 0xe);
857
317k
  const __m256i x1 = _mm256_packus_epi16(p0, p1);
858
859
317k
  return _mm256_permute2x128_si256(x0, x1, 0x20);
860
317k
}
861
862
void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
863
19.8k
                                    const uint8_t *above, const uint8_t *left) {
864
19.8k
  const __m256i l = get_left_vector(left);
865
19.8k
  const __m256i t0 = get_top_vector(above);
866
19.8k
  const __m256i t1 = get_top_vector(above + 16);
867
19.8k
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
868
19.8k
  __m256i rep = _mm256_set1_epi16((short)0x8000);
869
19.8k
  const __m256i one = _mm256_set1_epi16(1);
870
871
19.8k
  int i;
872
337k
  for (i = 0; i < 16; ++i) {
873
317k
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
874
875
317k
    const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);
876
877
317k
    _mm256_storeu_si256((__m256i *)dst, r);
878
879
317k
    dst += stride;
880
317k
    rep = _mm256_add_epi16(rep, one);
881
317k
  }
882
19.8k
}
883
884
void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
885
193k
                                    const uint8_t *above, const uint8_t *left) {
886
193k
  __m256i l = get_left_vector(left);
887
193k
  const __m256i t0 = get_top_vector(above);
888
193k
  const __m256i t1 = get_top_vector(above + 16);
889
193k
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
890
193k
  __m256i rep = _mm256_set1_epi16((short)0x8000);
891
193k
  const __m256i one = _mm256_set1_epi16(1);
892
893
193k
  int i;
894
3.29M
  for (i = 0; i < 16; ++i) {
895
3.10M
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
896
897
3.10M
    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
898
3.10M
    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
899
900
3.10M
    _mm_store_si128((__m128i *)dst, r0);
901
3.10M
    _mm_store_si128((__m128i *)(dst + 16), r1);
902
903
3.10M
    dst += stride;
904
3.10M
    rep = _mm256_add_epi16(rep, one);
905
3.10M
  }
906
907
193k
  l = get_left_vector(left + 16);
908
193k
  rep = _mm256_set1_epi16((short)0x8000);
909
3.29M
  for (i = 0; i < 16; ++i) {
910
3.10M
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
911
912
3.10M
    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
913
3.10M
    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
914
915
3.10M
    _mm_store_si128((__m128i *)dst, r0);
916
3.10M
    _mm_store_si128((__m128i *)(dst + 16), r1);
917
918
3.10M
    dst += stride;
919
3.10M
    rep = _mm256_add_epi16(rep, one);
920
3.10M
  }
921
193k
}
922
923
void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
924
4.65k
                                    const uint8_t *above, const uint8_t *left) {
925
4.65k
  const __m256i t0 = get_top_vector(above);
926
4.65k
  const __m256i t1 = get_top_vector(above + 16);
927
4.65k
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
928
4.65k
  const __m256i one = _mm256_set1_epi16(1);
929
930
4.65k
  int i, j;
931
23.2k
  for (j = 0; j < 4; ++j) {
932
18.6k
    const __m256i l = get_left_vector(left + j * 16);
933
18.6k
    __m256i rep = _mm256_set1_epi16((short)0x8000);
934
316k
    for (i = 0; i < 16; ++i) {
935
297k
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
936
937
297k
      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
938
297k
      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
939
940
297k
      _mm_store_si128((__m128i *)dst, r0);
941
297k
      _mm_store_si128((__m128i *)(dst + 16), r1);
942
943
297k
      dst += stride;
944
297k
      rep = _mm256_add_epi16(rep, one);
945
297k
    }
946
18.6k
  }
947
4.65k
}
948
949
void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
950
4.78k
                                    const uint8_t *above, const uint8_t *left) {
951
4.78k
  const __m256i t0 = get_top_vector(above);
952
4.78k
  const __m256i t1 = get_top_vector(above + 16);
953
4.78k
  const __m256i t2 = get_top_vector(above + 32);
954
4.78k
  const __m256i t3 = get_top_vector(above + 48);
955
4.78k
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
956
4.78k
  const __m256i one = _mm256_set1_epi16(1);
957
958
4.78k
  int i, j;
959
14.3k
  for (j = 0; j < 2; ++j) {
960
9.57k
    const __m256i l = get_left_vector(left + j * 16);
961
9.57k
    __m256i rep = _mm256_set1_epi16((short)0x8000);
962
162k
    for (i = 0; i < 16; ++i) {
963
153k
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
964
965
153k
      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
966
153k
      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
967
153k
      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
968
153k
      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
969
970
153k
      _mm_store_si128((__m128i *)dst, r0);
971
153k
      _mm_store_si128((__m128i *)(dst + 16), r1);
972
153k
      _mm_store_si128((__m128i *)(dst + 32), r2);
973
153k
      _mm_store_si128((__m128i *)(dst + 48), r3);
974
975
153k
      dst += stride;
976
153k
      rep = _mm256_add_epi16(rep, one);
977
153k
    }
978
9.57k
  }
979
4.78k
}
980
981
void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
982
34.2k
                                    const uint8_t *above, const uint8_t *left) {
983
34.2k
  const __m256i t0 = get_top_vector(above);
984
34.2k
  const __m256i t1 = get_top_vector(above + 16);
985
34.2k
  const __m256i t2 = get_top_vector(above + 32);
986
34.2k
  const __m256i t3 = get_top_vector(above + 48);
987
34.2k
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
988
34.2k
  const __m256i one = _mm256_set1_epi16(1);
989
990
34.2k
  int i, j;
991
171k
  for (j = 0; j < 4; ++j) {
992
137k
    const __m256i l = get_left_vector(left + j * 16);
993
137k
    __m256i rep = _mm256_set1_epi16((short)0x8000);
994
2.33M
    for (i = 0; i < 16; ++i) {
995
2.19M
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
996
997
2.19M
      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
998
2.19M
      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
999
2.19M
      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
1000
2.19M
      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
1001
1002
2.19M
      _mm_store_si128((__m128i *)dst, r0);
1003
2.19M
      _mm_store_si128((__m128i *)(dst + 16), r1);
1004
2.19M
      _mm_store_si128((__m128i *)(dst + 32), r2);
1005
2.19M
      _mm_store_si128((__m128i *)(dst + 48), r3);
1006
1007
2.19M
      dst += stride;
1008
2.19M
      rep = _mm256_add_epi16(rep, one);
1009
2.19M
    }
1010
137k
  }
1011
34.2k
}
1012
1013
void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
1014
8.81k
                                    const uint8_t *above, const uint8_t *left) {
1015
8.81k
  const __m256i t0 = get_top_vector(above);
1016
8.81k
  const __m256i t1 = get_top_vector(above + 16);
1017
8.81k
  const __m256i t2 = get_top_vector(above + 32);
1018
8.81k
  const __m256i t3 = get_top_vector(above + 48);
1019
8.81k
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
1020
8.81k
  const __m256i one = _mm256_set1_epi16(1);
1021
1022
8.81k
  int i;
1023
8.81k
  const __m256i l = get_left_vector(left);
1024
8.81k
  __m256i rep = _mm256_set1_epi16((short)0x8000);
1025
149k
  for (i = 0; i < 16; ++i) {
1026
141k
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
1027
1028
141k
    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
1029
141k
    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
1030
141k
    const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
1031
141k
    const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
1032
1033
141k
    _mm_store_si128((__m128i *)dst, r0);
1034
141k
    _mm_store_si128((__m128i *)(dst + 16), r1);
1035
141k
    _mm_store_si128((__m128i *)(dst + 32), r2);
1036
141k
    _mm_store_si128((__m128i *)(dst + 48), r3);
1037
1038
141k
    dst += stride;
1039
141k
    rep = _mm256_add_epi16(rep, one);
1040
141k
  }
1041
8.81k
}
1042
1043
#define PERM4x64(c0, c1, c2, c3) c0 + (c1 << 2) + (c2 << 4) + (c3 << 6)
1044
#define PERM2x128(c0, c1) c0 + (c1 << 4)
1045
1046
static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2(
1047
357k
    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1048
357k
  const int frac_bits = 6 - upsample_above;
1049
357k
  const int max_base_x = ((N + 4) - 1) << upsample_above;
1050
1051
357k
  assert(dx > 0);
1052
  // pre-filter above pixels
1053
  // store in temp buffers:
1054
  //   above[x] * 32 + 16
1055
  //   above[x+1] - above[x]
1056
  // final pixels will be calculated as:
1057
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1058
357k
  __m256i a0, a1, a32, a16;
1059
357k
  __m256i diff, c3f;
1060
357k
  __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
1061
357k
  __m128i a0_128, a1_128;
1062
357k
  a16 = _mm256_set1_epi16(16);
1063
357k
  a_mbase_x = _mm_set1_epi16(above[max_base_x]);
1064
357k
  max_base_x128 = _mm_set1_epi16(max_base_x);
1065
357k
  c3f = _mm256_set1_epi16(0x3f);
1066
1067
357k
  int x = dx;
1068
2.88M
  for (int r = 0; r < N; r++) {
1069
2.53M
    __m256i b, res, shift;
1070
2.53M
    __m128i res1;
1071
1072
2.53M
    int base = x >> frac_bits;
1073
2.53M
    if (base >= max_base_x) {
1074
8.23k
      for (int i = r; i < N; ++i) {
1075
4.91k
        dst[i] = a_mbase_x;  // save 4 values
1076
4.91k
      }
1077
3.31k
      return;
1078
3.31k
    }
1079
1080
2.52M
    a0_128 = _mm_loadu_si128((__m128i *)(above + base));
1081
2.52M
    a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
1082
1083
2.52M
    if (upsample_above) {
1084
996k
      a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)HighbdEvenOddMaskx4[0]);
1085
996k
      a1_128 = _mm_srli_si128(a0_128, 8);
1086
1087
996k
      base_inc128 = _mm_setr_epi16(base, base + 2, base + 4, base + 6, base + 8,
1088
996k
                                   base + 10, base + 12, base + 14);
1089
996k
      shift = _mm256_srli_epi16(
1090
996k
          _mm256_and_si256(
1091
996k
              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above),
1092
996k
              _mm256_set1_epi16(0x3f)),
1093
996k
          1);
1094
1.53M
    } else {
1095
1.53M
      base_inc128 = _mm_setr_epi16(base, base + 1, base + 2, base + 3, base + 4,
1096
1.53M
                                   base + 5, base + 6, base + 7);
1097
1.53M
      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1098
1.53M
    }
1099
2.52M
    a0 = _mm256_castsi128_si256(a0_128);
1100
2.52M
    a1 = _mm256_castsi128_si256(a1_128);
1101
2.52M
    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1102
2.52M
    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1103
2.52M
    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1104
1105
2.52M
    b = _mm256_mullo_epi16(diff, shift);
1106
2.52M
    res = _mm256_add_epi16(a32, b);
1107
2.52M
    res = _mm256_srli_epi16(res, 5);
1108
2.52M
    res1 = _mm256_castsi256_si128(res);
1109
1110
2.52M
    mask128 = _mm_cmpgt_epi16(max_base_x128, base_inc128);
1111
2.52M
    dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
1112
2.52M
    x += dx;
1113
2.52M
  }
1114
357k
}
1115
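
The directional z1 path above interpolates between adjacent reference pixels exactly as the in-code comment states: (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5. A scalar sketch of one output pixel in the non-upsampled case, ignoring the max_base_x clamp handled by the vector code (z1_pixel_scalar is an illustrative name):

#include <stdint.h>

// Scalar sketch of one z1 output pixel for row r, where x = (r + 1) * dx
// and upsample_above == 0 (so frac_bits == 6).
static uint16_t z1_pixel_scalar(const uint16_t *above, int x) {
  const int base = x >> 6;            // integer part of the position
  const int shift = (x & 0x3f) >> 1;  // 5-bit interpolation weight
  return (uint16_t)((above[base] * 32 + 16 +
                     (above[base + 1] - above[base]) * shift) >> 5);
}
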
1116
static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_4xN_internal_avx2(
1117
111k
    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1118
111k
  const int frac_bits = 6 - upsample_above;
1119
111k
  const int max_base_x = ((N + 4) - 1) << upsample_above;
1120
1121
111k
  assert(dx > 0);
1122
  // pre-filter above pixels
1123
  // store in temp buffers:
1124
  //   above[x] * 32 + 16
1125
  //   above[x+1] - above[x]
1126
  // final pixels will be calculated as:
1127
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1128
111k
  __m256i a0, a1, a32, a16;
1129
111k
  __m256i diff;
1130
111k
  __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
1131
1132
111k
  a16 = _mm256_set1_epi32(16);
1133
111k
  a_mbase_x = _mm_set1_epi16(above[max_base_x]);
1134
111k
  max_base_x128 = _mm_set1_epi32(max_base_x);
1135
1136
111k
  int x = dx;
1137
972k
  for (int r = 0; r < N; r++) {
1138
861k
    __m256i b, res, shift;
1139
861k
    __m128i res1;
1140
1141
861k
    int base = x >> frac_bits;
1142
861k
    if (base >= max_base_x) {
1143
764
      for (int i = r; i < N; ++i) {
1144
501
        dst[i] = a_mbase_x;  // save 4 values
1145
501
      }
1146
263
      return;
1147
263
    }
1148
1149
861k
    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1150
861k
    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1151
1152
861k
    if (upsample_above) {
1153
98.0k
      a0 = _mm256_permutevar8x32_epi32(
1154
98.0k
          a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1155
98.0k
      a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
1156
98.0k
      base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6);
1157
98.0k
      shift = _mm256_srli_epi32(
1158
98.0k
          _mm256_and_si256(
1159
98.0k
              _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
1160
98.0k
              _mm256_set1_epi32(0x3f)),
1161
98.0k
          1);
1162
763k
    } else {
1163
763k
      base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3);
1164
763k
      shift = _mm256_srli_epi32(
1165
763k
          _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1166
763k
    }
1167
1168
861k
    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1169
861k
    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1170
861k
    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1171
1172
861k
    b = _mm256_mullo_epi32(diff, shift);
1173
861k
    res = _mm256_add_epi32(a32, b);
1174
861k
    res = _mm256_srli_epi32(res, 5);
1175
1176
861k
    res1 = _mm256_castsi256_si128(res);
1177
861k
    res1 = _mm_packus_epi32(res1, res1);
1178
1179
861k
    mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128);
1180
861k
    mask128 = _mm_packs_epi32(mask128, mask128);  // narrow to 16 bit
1181
861k
    dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
1182
861k
    x += dx;
1183
861k
  }
1184
111k
}
1185
1186
static void highbd_dr_prediction_z1_4xN_avx2(int N, uint16_t *dst,
1187
                                             ptrdiff_t stride,
1188
                                             const uint16_t *above,
1189
                                             int upsample_above, int dx,
1190
161k
                                             int bd) {
1191
161k
  __m128i dstvec[16];
1192
161k
  if (bd < 12) {
1193
105k
    highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above,
1194
105k
                                              dx);
1195
105k
  } else {
1196
55.5k
    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(N, dstvec, above,
1197
55.5k
                                                    upsample_above, dx);
1198
55.5k
  }
1199
1.26M
  for (int i = 0; i < N; i++) {
1200
1.10M
    _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
1201
1.10M
  }
1202
161k
}
1203
1204
static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_8xN_internal_avx2(
1205
253k
    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1206
253k
  const int frac_bits = 6 - upsample_above;
1207
253k
  const int max_base_x = ((8 + N) - 1) << upsample_above;
1208
1209
253k
  assert(dx > 0);
1210
  // pre-filter above pixels
1211
  // store in temp buffers:
1212
  //   above[x] * 32 + 16
1213
  //   above[x+1] - above[x]
1214
  // final pixels will be calculated as:
1215
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1216
253k
  __m256i a0, a1, a0_1, a1_1, a32, a16;
1217
253k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1218
1219
253k
  a16 = _mm256_set1_epi32(16);
1220
253k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1221
253k
  max_base_x256 = _mm256_set1_epi32(max_base_x);
1222
1223
253k
  int x = dx;
1224
2.77M
  for (int r = 0; r < N; r++) {
1225
2.52M
    __m256i b, res, res1, shift;
1226
1227
2.52M
    int base = x >> frac_bits;
1228
2.52M
    if (base >= max_base_x) {
1229
1.37k
      for (int i = r; i < N; ++i) {
1230
846
        dst[i] = _mm256_castsi256_si128(a_mbase_x);  // save 8 values
1231
846
      }
1232
530
      return;
1233
530
    }
1234
1235
2.51M
    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1236
2.51M
    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1237
1238
2.51M
    if (upsample_above) {
1239
381k
      a0 = _mm256_permutevar8x32_epi32(
1240
381k
          a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1241
381k
      a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
1242
1243
381k
      a0_1 =
1244
381k
          _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
1245
381k
      a0_1 = _mm256_permutevar8x32_epi32(
1246
381k
          a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1247
381k
      a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));
1248
1249
381k
      a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
1250
381k
      a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);
1251
381k
      base_inc256 =
1252
381k
          _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8,
1253
381k
                            base + 10, base + 12, base + 14);
1254
381k
      shift = _mm256_srli_epi32(
1255
381k
          _mm256_and_si256(
1256
381k
              _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
1257
381k
              _mm256_set1_epi32(0x3f)),
1258
381k
          1);
1259
2.13M
    } else {
1260
2.13M
      base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3,
1261
2.13M
                                      base + 4, base + 5, base + 6, base + 7);
1262
2.13M
      shift = _mm256_srli_epi32(
1263
2.13M
          _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1264
2.13M
    }
1265
1266
2.51M
    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1267
2.51M
    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1268
2.51M
    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1269
1270
2.51M
    b = _mm256_mullo_epi32(diff, shift);
1271
2.51M
    res = _mm256_add_epi32(a32, b);
1272
2.51M
    res = _mm256_srli_epi32(res, 5);
1273
1274
2.51M
    res1 = _mm256_packus_epi32(
1275
2.51M
        res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
1276
1277
2.51M
    mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256);
1278
2.51M
    mask256 = _mm256_packs_epi32(
1279
2.51M
        mask256, _mm256_castsi128_si256(
1280
2.51M
                     _mm256_extracti128_si256(mask256, 1)));  // convert to 16 bit
1281
2.51M
    res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1282
2.51M
    dst[r] = _mm256_castsi256_si128(res1);
1283
2.51M
    x += dx;
1284
2.51M
  }
1285
253k
}
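For reference, the comment block in the helper above describes a two-tap interpolation along the top edge; a minimal scalar sketch of the same arithmetic (the function name and parameters are illustrative, not from the library):

static inline uint16_t z1_interp_sketch(const uint16_t *above, int x,
                                        int upsample_above) {
  // x is the sub-pixel position in 1/64th units; the integer part picks the
  // reference sample, the fraction (halved to 1/32nd steps) is the weight.
  const int base = x >> (6 - upsample_above);
  const int shift = ((x << upsample_above) & 0x3f) >> 1;
  return (uint16_t)((above[base] * 32 + 16 +
                     (above[base + 1] - above[base]) * shift) >> 5);
}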
1286
1287
static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2(
1288
410k
    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1289
410k
  const int frac_bits = 6 - upsample_above;
1290
410k
  const int max_base_x = ((8 + N) - 1) << upsample_above;
1291
1292
410k
  assert(dx > 0);
1293
  // pre-filter above pixels
1294
  // store in temp buffers:
1295
  //   above[x] * 32 + 16
1296
  //   above[x+1] - above[x]
1297
  // final pixels will be calculated as:
1298
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1299
410k
  __m256i a0, a1, a32, a16, c3f;
1300
410k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1301
410k
  __m128i a0_x128, a1_x128;
1302
1303
410k
  a16 = _mm256_set1_epi16(16);
1304
410k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1305
410k
  max_base_x256 = _mm256_set1_epi16(max_base_x);
1306
410k
  c3f = _mm256_set1_epi16(0x3f);
1307
1308
410k
  int x = dx;
1309
5.60M
  for (int r = 0; r < N; r++) {
1310
5.19M
    __m256i b, res, res1, shift;
1311
1312
5.19M
    int base = x >> frac_bits;
1313
5.19M
    if (base >= max_base_x) {
1314
7.56k
      for (int i = r; i < N; ++i) {
1315
5.61k
        dst[i] = _mm256_castsi256_si128(a_mbase_x);  // save 8 values
1316
5.61k
      }
1317
1.95k
      return;
1318
1.95k
    }
1319
1320
5.19M
    a0_x128 = _mm_loadu_si128((__m128i *)(above + base));
1321
5.19M
    if (upsample_above) {
1322
1.22M
      __m128i mask, atmp0, atmp1, atmp2, atmp3;
1323
1.22M
      a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 8));
1324
1.22M
      atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
1325
1.22M
      atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
1326
1.22M
      atmp2 =
1327
1.22M
          _mm_shuffle_epi8(a0_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
1328
1.22M
      atmp3 =
1329
1.22M
          _mm_shuffle_epi8(a1_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
1330
1.22M
      mask =
1331
1.22M
          _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[0], _mm_set1_epi8(15));
1332
1.22M
      a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
1333
1.22M
      mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[0] + 16),
1334
1.22M
                            _mm_set1_epi8(15));
1335
1.22M
      a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
1336
1337
1.22M
      base_inc256 = _mm256_setr_epi16(base, base + 2, base + 4, base + 6,
1338
1.22M
                                      base + 8, base + 10, base + 12, base + 14,
1339
1.22M
                                      0, 0, 0, 0, 0, 0, 0, 0);
1340
1.22M
      shift = _mm256_srli_epi16(
1341
1.22M
          _mm256_and_si256(
1342
1.22M
              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
1343
1.22M
          1);
1344
3.96M
    } else {
1345
3.96M
      a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 1));
1346
3.96M
      base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1347
3.96M
                                      base + 4, base + 5, base + 6, base + 7, 0,
1348
3.96M
                                      0, 0, 0, 0, 0, 0, 0);
1349
3.96M
      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1350
3.96M
    }
1351
5.19M
    a0 = _mm256_castsi128_si256(a0_x128);
1352
5.19M
    a1 = _mm256_castsi128_si256(a1_x128);
1353
1354
5.19M
    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1355
5.19M
    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1356
5.19M
    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1357
1358
5.19M
    b = _mm256_mullo_epi16(diff, shift);
1359
5.19M
    res = _mm256_add_epi16(a32, b);
1360
5.19M
    res = _mm256_srli_epi16(res, 5);
1361
1362
5.19M
    mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1363
5.19M
    res1 = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1364
5.19M
    dst[r] = _mm256_castsi256_si128(res1);
1365
5.19M
    x += dx;
1366
5.19M
  }
1367
410k
}
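A note on the upsample_above branch in the helper above (an editorial description inferred from the shuffles, not a comment from the source):

/* When the top edge has been upsampled, `above` holds twice as many samples
 * and each predicted pixel steps two entries at a time.  The
 * HighbdEvenOddMaskx shuffle/blend pair de-interleaves a 16-sample load so
 * that a0_x128 receives the even-indexed samples and a1_x128 the odd-indexed
 * neighbours, i.e. the two taps of the interpolation. */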
1368
1369
static void highbd_dr_prediction_z1_8xN_avx2(int N, uint16_t *dst,
1370
                                             ptrdiff_t stride,
1371
                                             const uint16_t *above,
1372
                                             int upsample_above, int dx,
1373
262k
                                             int bd) {
1374
262k
  __m128i dstvec[32];
1375
262k
  if (bd < 12) {
1376
160k
    highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above,
1377
160k
                                              dx);
1378
160k
  } else {
1379
101k
    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(N, dstvec, above,
1380
101k
                                                    upsample_above, dx);
1381
101k
  }
1382
2.81M
  for (int i = 0; i < N; i++) {
1383
2.55M
    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
1384
2.55M
  }
1385
262k
}
1386
1387
static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_16xN_internal_avx2(
1388
108k
    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1389
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1390
108k
  (void)upsample_above;
1391
108k
  const int frac_bits = 6;
1392
108k
  const int max_base_x = ((16 + N) - 1);
1393
1394
  // pre-filter above pixels
1395
  // store in temp buffers:
1396
  //   above[x] * 32 + 16
1397
  //   above[x+1] - above[x]
1398
  // final pixels will be calculated as:
1399
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1400
108k
  __m256i a0, a0_1, a1, a1_1, a32, a16;
1401
108k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1402
1403
108k
  a16 = _mm256_set1_epi32(16);
1404
108k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1405
108k
  max_base_x256 = _mm256_set1_epi16(max_base_x);
1406
1407
108k
  int x = dx;
1408
1.36M
  for (int r = 0; r < N; r++) {
1409
1.25M
    __m256i b, res[2], res1;
1410
1411
1.25M
    int base = x >> frac_bits;
1412
1.25M
    if (base >= max_base_x) {
1413
347
      for (int i = r; i < N; ++i) {
1414
261
        dstvec[i] = a_mbase_x;  // save 16 values
1415
261
      }
1416
86
      return;
1417
86
    }
1418
1.25M
    __m256i shift = _mm256_srli_epi32(
1419
1.25M
        _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1420
1421
1.25M
    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1422
1.25M
    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1423
1424
1.25M
    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1425
1.25M
    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1426
1.25M
    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1427
1.25M
    b = _mm256_mullo_epi32(diff, shift);
1428
1429
1.25M
    res[0] = _mm256_add_epi32(a32, b);
1430
1.25M
    res[0] = _mm256_srli_epi32(res[0], 5);
1431
1.25M
    res[0] = _mm256_packus_epi32(
1432
1.25M
        res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1433
1434
1.25M
    int mdif = max_base_x - base;
1435
1.25M
    if (mdif > 8) {
1436
1.25M
      a0_1 =
1437
1.25M
          _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
1438
1.25M
      a1_1 =
1439
1.25M
          _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));
1440
1441
1.25M
      diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
1442
1.25M
      a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
1443
1.25M
      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
1444
1.25M
      b = _mm256_mullo_epi32(diff, shift);
1445
1446
1.25M
      res[1] = _mm256_add_epi32(a32, b);
1447
1.25M
      res[1] = _mm256_srli_epi32(res[1], 5);
1448
1.25M
      res[1] = _mm256_packus_epi32(
1449
1.25M
          res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1450
1.25M
    } else {
1451
1.73k
      res[1] = a_mbase_x;
1452
1.73k
    }
1453
1.25M
    res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1454
1.25M
                                   1);  // 16 16-bit values
1455
1456
1.25M
    base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1457
1.25M
                                    base + 4, base + 5, base + 6, base + 7,
1458
1.25M
                                    base + 8, base + 9, base + 10, base + 11,
1459
1.25M
                                    base + 12, base + 13, base + 14, base + 15);
1460
1.25M
    mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1461
1.25M
    dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1462
1.25M
    x += dx;
1463
1.25M
  }
1464
108k
}
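The pack/extract pattern in the 32-bit helper above deserves a remark (a description of the intrinsic behaviour, not text from the source):

/* _mm256_packus_epi32 packs to 16 bits within each 128-bit half
 * independently, so the upper half of `res` is first extracted and fed back
 * as the second operand; after the pack all eight results sit contiguously
 * in the low 128 bits, and the two 8-sample groups are then recombined with
 * _mm256_inserti128_si256 into one 16x16-bit row. */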
1465
1466
static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2(
1467
336k
    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1468
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1469
336k
  (void)upsample_above;
1470
336k
  const int frac_bits = 6;
1471
336k
  const int max_base_x = ((16 + N) - 1);
1472
1473
  // pre-filter above pixels
1474
  // store in temp buffers:
1475
  //   above[x] * 32 + 16
1476
  //   above[x+1] - above[x]
1477
  // final pixels will be calculated as:
1478
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1479
336k
  __m256i a0, a1, a32, a16, c3f;
1480
336k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1481
1482
336k
  a16 = _mm256_set1_epi16(16);
1483
336k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1484
336k
  max_base_x256 = _mm256_set1_epi16(max_base_x);
1485
336k
  c3f = _mm256_set1_epi16(0x3f);
1486
1487
336k
  int x = dx;
1488
6.47M
  for (int r = 0; r < N; r++) {
1489
6.13M
    __m256i b, res;
1490
1491
6.13M
    int base = x >> frac_bits;
1492
6.13M
    if (base >= max_base_x) {
1493
5.88k
      for (int i = r; i < N; ++i) {
1494
4.92k
        dstvec[i] = a_mbase_x;  // save 16 values
1495
4.92k
      }
1496
957
      return;
1497
957
    }
1498
6.13M
    __m256i shift =
1499
6.13M
        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1500
1501
6.13M
    a0 = _mm256_loadu_si256((__m256i *)(above + base));
1502
6.13M
    a1 = _mm256_loadu_si256((__m256i *)(above + base + 1));
1503
1504
6.13M
    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1505
6.13M
    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1506
6.13M
    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1507
6.13M
    b = _mm256_mullo_epi16(diff, shift);
1508
1509
6.13M
    res = _mm256_add_epi16(a32, b);
1510
6.13M
    res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
1511
1512
6.13M
    base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1513
6.13M
                                    base + 4, base + 5, base + 6, base + 7,
1514
6.13M
                                    base + 8, base + 9, base + 10, base + 11,
1515
6.13M
                                    base + 12, base + 13, base + 14, base + 15);
1516
6.13M
    mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1517
6.13M
    dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1518
6.13M
    x += dx;
1519
6.13M
  }
1520
336k
}
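The cmpgt/blendv pair used above clamps positions past the end of the reference row; a scalar equivalent of that step (names are illustrative):

for (int c = 0; c < 16; ++c) {
  // Columns whose source position has run past max_base_x simply
  // replicate the last available top sample.
  row[c] = (base + c < max_base_x) ? interp[c] : above[max_base_x];
}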
1521
1522
static void highbd_dr_prediction_z1_16xN_avx2(int N, uint16_t *dst,
1523
                                              ptrdiff_t stride,
1524
                                              const uint16_t *above,
1525
                                              int upsample_above, int dx,
1526
201k
                                              int bd) {
1527
201k
  __m256i dstvec[64];
1528
201k
  if (bd < 12) {
1529
137k
    highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above,
1530
137k
                                               dx);
1531
137k
  } else {
1532
64.4k
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(N, dstvec, above,
1533
64.4k
                                                     upsample_above, dx);
1534
64.4k
  }
1535
2.94M
  for (int i = 0; i < N; i++) {
1536
2.74M
    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
1537
2.74M
  }
1538
201k
}
1539
1540
static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_32xN_internal_avx2(
1541
18.2k
    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1542
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1543
18.2k
  (void)upsample_above;
1544
18.2k
  const int frac_bits = 6;
1545
18.2k
  const int max_base_x = ((32 + N) - 1);
1546
1547
  // pre-filter above pixels
1548
  // store in temp buffers:
1549
  //   above[x] * 32 + 16
1550
  //   above[x+1] - above[x]
1551
  // final pixels will be calculated as:
1552
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1553
18.2k
  __m256i a0, a0_1, a1, a1_1, a32, a16, c3f;
1554
18.2k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1555
1556
18.2k
  a16 = _mm256_set1_epi32(16);
1557
18.2k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1558
18.2k
  max_base_x256 = _mm256_set1_epi16(max_base_x);
1559
18.2k
  c3f = _mm256_set1_epi16(0x3f);
1560
1561
18.2k
  int x = dx;
1562
466k
  for (int r = 0; r < N; r++) {
1563
448k
    __m256i b, res[2], res1;
1564
1565
448k
    int base = x >> frac_bits;
1566
448k
    if (base >= max_base_x) {
1567
0
      for (int i = r; i < N; ++i) {
1568
0
        dstvec[i] = a_mbase_x;  // save 32 values
1569
0
        dstvec[i + N] = a_mbase_x;
1570
0
      }
1571
0
      return;
1572
0
    }
1573
1574
448k
    __m256i shift =
1575
448k
        _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
1576
1577
1.34M
    for (int j = 0; j < 32; j += 16) {
1578
896k
      int mdif = max_base_x - (base + j);
1579
896k
      if (mdif <= 0) {
1580
285
        res1 = a_mbase_x;
1581
896k
      } else {
1582
896k
        a0 = _mm256_cvtepu16_epi32(
1583
896k
            _mm_loadu_si128((__m128i *)(above + base + j)));
1584
896k
        a1 = _mm256_cvtepu16_epi32(
1585
896k
            _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
1586
1587
896k
        diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1588
896k
        a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1589
896k
        a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1590
896k
        b = _mm256_mullo_epi32(diff, shift);
1591
1592
896k
        res[0] = _mm256_add_epi32(a32, b);
1593
896k
        res[0] = _mm256_srli_epi32(res[0], 5);
1594
896k
        res[0] = _mm256_packus_epi32(
1595
896k
            res[0],
1596
896k
            _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1597
896k
        if (mdif > 8) {
1598
894k
          a0_1 = _mm256_cvtepu16_epi32(
1599
894k
              _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
1600
894k
          a1_1 = _mm256_cvtepu16_epi32(
1601
894k
              _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
1602
1603
894k
          diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
1604
894k
          a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
1605
894k
          a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
1606
894k
          b = _mm256_mullo_epi32(diff, shift);
1607
1608
894k
          res[1] = _mm256_add_epi32(a32, b);
1609
894k
          res[1] = _mm256_srli_epi32(res[1], 5);
1610
894k
          res[1] = _mm256_packus_epi32(
1611
894k
              res[1],
1612
894k
              _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1613
894k
        } else {
1614
1.67k
          res[1] = a_mbase_x;
1615
1.67k
        }
1616
896k
        res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1617
896k
                                       1);  // 16 16-bit values
1618
896k
        base_inc256 = _mm256_setr_epi16(
1619
896k
            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1620
896k
            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1621
896k
            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1622
896k
            base + j + 13, base + j + 14, base + j + 15);
1623
1624
896k
        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1625
896k
        res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1626
896k
      }
1627
896k
      if (!j) {
1628
448k
        dstvec[r] = res1;
1629
448k
      } else {
1630
448k
        dstvec[r + N] = res1;
1631
448k
      }
1632
896k
    }
1633
448k
    x += dx;
1634
448k
  }
1635
18.2k
}
1636
1637
static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2(
1638
234k
    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1639
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1640
234k
  (void)upsample_above;
1641
234k
  const int frac_bits = 6;
1642
234k
  const int max_base_x = ((32 + N) - 1);
1643
1644
  // pre-filter above pixels
1645
  // store in temp buffers:
1646
  //   above[x] * 32 + 16
1647
  //   above[x+1] - above[x]
1648
  // final pixels will be calculated as:
1649
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1650
234k
  __m256i a0, a1, a32, a16, c3f;
1651
234k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1652
1653
234k
  a16 = _mm256_set1_epi16(16);
1654
234k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1655
234k
  max_base_x256 = _mm256_set1_epi16(max_base_x);
1656
234k
  c3f = _mm256_set1_epi16(0x3f);
1657
1658
234k
  int x = dx;
1659
6.67M
  for (int r = 0; r < N; r++) {
1660
6.43M
    __m256i b, res;
1661
1662
6.43M
    int base = x >> frac_bits;
1663
6.43M
    if (base >= max_base_x) {
1664
0
      for (int i = r; i < N; ++i) {
1665
0
        dstvec[i] = a_mbase_x;  // save 32 values
1666
0
        dstvec[i + N] = a_mbase_x;
1667
0
      }
1668
0
      return;
1669
0
    }
1670
1671
6.43M
    __m256i shift =
1672
6.43M
        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1673
1674
19.3M
    for (int j = 0; j < 32; j += 16) {
1675
12.8M
      int mdif = max_base_x - (base + j);
1676
12.8M
      if (mdif <= 0) {
1677
900
        res = a_mbase_x;
1678
12.8M
      } else {
1679
12.8M
        a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
1680
12.8M
        a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
1681
1682
12.8M
        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1683
12.8M
        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1684
12.8M
        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1685
12.8M
        b = _mm256_mullo_epi16(diff, shift);
1686
1687
12.8M
        res = _mm256_add_epi16(a32, b);
1688
12.8M
        res = _mm256_srli_epi16(res, 5);
1689
1690
12.8M
        base_inc256 = _mm256_setr_epi16(
1691
12.8M
            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1692
12.8M
            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1693
12.8M
            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1694
12.8M
            base + j + 13, base + j + 14, base + j + 15);
1695
1696
12.8M
        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1697
12.8M
        res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1698
12.8M
      }
1699
12.8M
      if (!j) {
1700
6.43M
        dstvec[r] = res;
1701
6.43M
      } else {
1702
6.43M
        dstvec[r + N] = res;
1703
6.43M
      }
1704
12.8M
    }
1705
6.43M
    x += dx;
1706
6.43M
  }
1707
234k
}
1708
1709
static void highbd_dr_prediction_z1_32xN_avx2(int N, uint16_t *dst,
1710
                                              ptrdiff_t stride,
1711
                                              const uint16_t *above,
1712
                                              int upsample_above, int dx,
1713
101k
                                              int bd) {
1714
101k
  __m256i dstvec[128];
1715
101k
  if (bd < 12) {
1716
89.2k
    highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above,
1717
89.2k
                                               dx);
1718
89.2k
  } else {
1719
12.6k
    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(N, dstvec, above,
1720
12.6k
                                                     upsample_above, dx);
1721
12.6k
  }
1722
2.83M
  for (int i = 0; i < N; i++) {
1723
2.72M
    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
1724
2.72M
    _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]);
1725
2.72M
  }
1726
101k
}
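A note on the buffer layout used by the 32-wide path above (descriptive only):

/* Each predicted row is split across two registers: dstvec[r] carries
 * columns 0..15 and dstvec[r + N] columns 16..31, which is why the store
 * loop writes two 256-bit halves per output row. */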
1727
1728
static void highbd_dr_prediction_32bit_z1_64xN_avx2(int N, uint16_t *dst,
1729
                                                    ptrdiff_t stride,
1730
                                                    const uint16_t *above,
1731
                                                    int upsample_above,
1732
11.5k
                                                    int dx) {
1733
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1734
11.5k
  (void)upsample_above;
1735
11.5k
  const int frac_bits = 6;
1736
11.5k
  const int max_base_x = ((64 + N) - 1);
1737
1738
  // pre-filter above pixels
1739
  // store in temp buffers:
1740
  //   above[x] * 32 + 16
1741
  //   above[x+1] - above[x]
1742
  // final pixels will be calculated as:
1743
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1744
11.5k
  __m256i a0, a0_1, a1, a1_1, a32, a16;
1745
11.5k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1746
1747
11.5k
  a16 = _mm256_set1_epi32(16);
1748
11.5k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1749
11.5k
  max_base_x256 = _mm256_set1_epi16(max_base_x);
1750
1751
11.5k
  int x = dx;
1752
729k
  for (int r = 0; r < N; r++, dst += stride) {
1753
718k
    __m256i b, res[2], res1;
1754
1755
718k
    int base = x >> frac_bits;
1756
718k
    if (base >= max_base_x) {
1757
0
      for (int i = r; i < N; ++i) {
1758
0
        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 32 values
1759
0
        _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
1760
0
        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
1761
0
        _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
1762
0
        dst += stride;
1763
0
      }
1764
0
      return;
1765
0
    }
1766
1767
718k
    __m256i shift = _mm256_srli_epi32(
1768
718k
        _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1769
1770
718k
    __m128i a0_128, a0_1_128, a1_128, a1_1_128;
1771
3.59M
    for (int j = 0; j < 64; j += 16) {
1772
2.87M
      int mdif = max_base_x - (base + j);
1773
2.87M
      if (mdif <= 0) {
1774
856
        _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
1775
2.87M
      } else {
1776
2.87M
        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
1777
2.87M
        a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
1778
2.87M
        a0 = _mm256_cvtepu16_epi32(a0_128);
1779
2.87M
        a1 = _mm256_cvtepu16_epi32(a1_128);
1780
1781
2.87M
        diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1782
2.87M
        a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1783
2.87M
        a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1784
2.87M
        b = _mm256_mullo_epi32(diff, shift);
1785
1786
2.87M
        res[0] = _mm256_add_epi32(a32, b);
1787
2.87M
        res[0] = _mm256_srli_epi32(res[0], 5);
1788
2.87M
        res[0] = _mm256_packus_epi32(
1789
2.87M
            res[0],
1790
2.87M
            _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1791
2.87M
        if (mdif > 8) {
1792
2.86M
          a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
1793
2.86M
          a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
1794
2.86M
          a0_1 = _mm256_cvtepu16_epi32(a0_1_128);
1795
2.86M
          a1_1 = _mm256_cvtepu16_epi32(a1_1_128);
1796
1797
2.86M
          diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
1798
2.86M
          a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
1799
2.86M
          a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
1800
2.86M
          b = _mm256_mullo_epi32(diff, shift);
1801
1802
2.86M
          res[1] = _mm256_add_epi32(a32, b);
1803
2.86M
          res[1] = _mm256_srli_epi32(res[1], 5);
1804
2.86M
          res[1] = _mm256_packus_epi32(
1805
2.86M
              res[1],
1806
2.86M
              _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1807
2.86M
        } else {
1808
2.11k
          res[1] = a_mbase_x;
1809
2.11k
        }
1810
2.87M
        res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1811
2.87M
                                       1);  // 16 16-bit values
1812
2.87M
        base_inc256 = _mm256_setr_epi16(
1813
2.87M
            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1814
2.87M
            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1815
2.87M
            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1816
2.87M
            base + j + 13, base + j + 14, base + j + 15);
1817
1818
2.87M
        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1819
2.87M
        res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1820
2.87M
        _mm256_storeu_si256((__m256i *)(dst + j), res1);
1821
2.87M
      }
1822
2.87M
    }
1823
718k
    x += dx;
1824
718k
  }
1825
11.5k
}
1826
1827
static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst,
1828
                                              ptrdiff_t stride,
1829
                                              const uint16_t *above,
1830
41.8k
                                              int upsample_above, int dx) {
1831
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1832
41.8k
  (void)upsample_above;
1833
41.8k
  const int frac_bits = 6;
1834
41.8k
  const int max_base_x = ((64 + N) - 1);
1835
1836
  // pre-filter above pixels
1837
  // store in temp buffers:
1838
  //   above[x] * 32 + 16
1839
  //   above[x+1] - above[x]
1840
  // final pixels will be calculated as:
1841
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1842
41.8k
  __m256i a0, a1, a32, a16, c3f;
1843
41.8k
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1844
1845
41.8k
  a16 = _mm256_set1_epi16(16);
1846
41.8k
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1847
41.8k
  max_base_x256 = _mm256_set1_epi16(max_base_x);
1848
41.8k
  c3f = _mm256_set1_epi16(0x3f);
1849
1850
41.8k
  int x = dx;
1851
2.14M
  for (int r = 0; r < N; r++, dst += stride) {
1852
2.10M
    __m256i b, res;
1853
1854
2.10M
    int base = x >> frac_bits;
1855
2.10M
    if (base >= max_base_x) {
1856
0
      for (int i = r; i < N; ++i) {
1857
0
        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 32 values
1858
0
        _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
1859
0
        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
1860
0
        _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
1861
0
        dst += stride;
1862
0
      }
1863
0
      return;
1864
0
    }
1865
1866
2.10M
    __m256i shift =
1867
2.10M
        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1868
1869
10.5M
    for (int j = 0; j < 64; j += 16) {
1870
8.40M
      int mdif = max_base_x - (base + j);
1871
8.40M
      if (mdif <= 0) {
1872
5.83k
        _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
1873
8.40M
      } else {
1874
8.40M
        a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
1875
8.40M
        a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
1876
1877
8.40M
        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1878
8.40M
        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1879
8.40M
        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1880
8.40M
        b = _mm256_mullo_epi16(diff, shift);
1881
1882
8.40M
        res = _mm256_add_epi16(a32, b);
1883
8.40M
        res = _mm256_srli_epi16(res, 5);
1884
1885
8.40M
        base_inc256 = _mm256_setr_epi16(
1886
8.40M
            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1887
8.40M
            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1888
8.40M
            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1889
8.40M
            base + j + 13, base + j + 14, base + j + 15);
1890
1891
8.40M
        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1892
8.40M
        res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1893
8.40M
        _mm256_storeu_si256((__m256i *)(dst + j), res);  // 16 16-bit values
1894
8.40M
      }
1895
8.40M
    }
1896
2.10M
    x += dx;
1897
2.10M
  }
1898
41.8k
}
1899
1900
// Directional prediction, zone 1: 0 < angle < 90
1901
void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
1902
                                      int bh, const uint16_t *above,
1903
                                      const uint16_t *left, int upsample_above,
1904
743k
                                      int dx, int dy, int bd) {
1905
743k
  (void)left;
1906
743k
  (void)dy;
1907
1908
743k
  switch (bw) {
1909
161k
    case 4:
1910
161k
      highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above,
1911
161k
                                       dx, bd);
1912
161k
      break;
1913
262k
    case 8:
1914
262k
      highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above,
1915
262k
                                       dx, bd);
1916
262k
      break;
1917
201k
    case 16:
1918
201k
      highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above,
1919
201k
                                        dx, bd);
1920
201k
      break;
1921
97.1k
    case 32:
1922
97.1k
      highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above,
1923
97.1k
                                        dx, bd);
1924
97.1k
      break;
1925
21.2k
    case 64:
1926
21.2k
      if (bd < 12) {
1927
13.6k
        highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above,
1928
13.6k
                                          upsample_above, dx);
1929
13.6k
      } else {
1930
7.59k
        highbd_dr_prediction_32bit_z1_64xN_avx2(bh, dst, stride, above,
1931
7.59k
                                                upsample_above, dx);
1932
7.59k
      }
1933
21.2k
      break;
1934
0
    default: break;
1935
743k
  }
1936
743k
  return;
1937
743k
}
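A minimal, hypothetical call into the zone-1 dispatcher above; the block size, dx step and bit depth are illustrative values chosen here (not taken from the source), and the reference buffers are oversized so the unaligned SIMD loads stay in bounds:

uint16_t dst[8 * 8];
uint16_t above[64] = { 0 };  // top reference row, padded well past max_base_x
uint16_t left[64] = { 0 };   // not read by zone 1, but part of the signature
// ... fill above[] (and left[]) with reconstructed neighbour samples ...
av1_highbd_dr_prediction_z1_avx2(dst, /*stride=*/8, /*bw=*/8, /*bh=*/8, above,
                                 left, /*upsample_above=*/0, /*dx=*/64,
                                 /*dy=*/0, /*bd=*/10);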
1938
1939
static void highbd_transpose_TX_16X16(const uint16_t *src, ptrdiff_t pitchSrc,
1940
472k
                                      uint16_t *dst, ptrdiff_t pitchDst) {
1941
472k
  __m256i r[16];
1942
472k
  __m256i d[16];
1943
8.03M
  for (int j = 0; j < 16; j++) {
1944
7.56M
    r[j] = _mm256_loadu_si256((__m256i *)(src + j * pitchSrc));
1945
7.56M
  }
1946
472k
  highbd_transpose16x16_avx2(r, d);
1947
8.03M
  for (int j = 0; j < 16; j++) {
1948
7.56M
    _mm256_storeu_si256((__m256i *)(dst + j * pitchDst), d[j]);
1949
7.56M
  }
1950
472k
}
1951
1952
static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc,
1953
                             uint16_t *dst, ptrdiff_t pitchDst, int width,
1954
36.7k
                             int height) {
1955
174k
  for (int j = 0; j < height; j += 16)
1956
610k
    for (int i = 0; i < width; i += 16)
1957
472k
      highbd_transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
1958
472k
                                dst + j * pitchDst + i, pitchDst);
1959
36.7k
}
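A scalar reference for the tiled transpose above (a sketch; unlike the AVX2 routine it does not require width and height to be multiples of 16, and the name is illustrative):

static void highbd_transpose_ref(const uint16_t *src, ptrdiff_t pitchSrc,
                                 uint16_t *dst, ptrdiff_t pitchDst, int width,
                                 int height) {
  // Mirrors the mapping of the 16x16-tiled routine: dst[y][x] = src[x][y].
  for (int y = 0; y < height; ++y)
    for (int x = 0; x < width; ++x)
      dst[y * pitchDst + x] = src[x * pitchSrc + y];
}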
1960
1961
static void highbd_dr_prediction_32bit_z2_Nx4_avx2(
1962
    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
1963
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
1964
199k
    int dy) {
1965
199k
  const int min_base_x = -(1 << upsample_above);
1966
199k
  const int min_base_y = -(1 << upsample_left);
1967
199k
  const int frac_bits_x = 6 - upsample_above;
1968
199k
  const int frac_bits_y = 6 - upsample_left;
1969
1970
199k
  assert(dx > 0);
1971
  // pre-filter above pixels
1972
  // store in temp buffers:
1973
  //   above[x] * 32 + 16
1974
  //   above[x+1] - above[x]
1975
  // final pixels will be calculated as:
1976
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1977
199k
  __m256i a0_x, a1_x, a32, a16;
1978
199k
  __m256i diff;
1979
199k
  __m128i c3f, min_base_y128;
1980
1981
199k
  a16 = _mm256_set1_epi32(16);
1982
199k
  c3f = _mm_set1_epi32(0x3f);
1983
199k
  min_base_y128 = _mm_set1_epi32(min_base_y);
1984
1985
1.22M
  for (int r = 0; r < N; r++) {
1986
1.02M
    __m256i b, res, shift;
1987
1.02M
    __m128i resx, resy, resxy;
1988
1.02M
    __m128i a0_x128, a1_x128;
1989
1.02M
    int y = r + 1;
1990
1.02M
    int base_x = (-y * dx) >> frac_bits_x;
1991
1.02M
    int base_shift = 0;
1992
1.02M
    if (base_x < (min_base_x - 1)) {
1993
873k
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
1994
873k
    }
1995
1.02M
    int base_min_diff =
1996
1.02M
        (min_base_x - base_x + upsample_above) >> upsample_above;
1997
1.02M
    if (base_min_diff > 4) {
1998
706k
      base_min_diff = 4;
1999
706k
    } else {
2000
317k
      if (base_min_diff < 0) base_min_diff = 0;
2001
317k
    }
2002
2003
1.02M
    if (base_shift > 3) {
2004
706k
      a0_x = _mm256_setzero_si256();
2005
706k
      a1_x = _mm256_setzero_si256();
2006
706k
      shift = _mm256_setzero_si256();
2007
706k
    } else {
2008
317k
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2009
317k
      if (upsample_above) {
2010
51.1k
        a0_x128 = _mm_shuffle_epi8(a0_x128,
2011
51.1k
                                   *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
2012
51.1k
        a1_x128 = _mm_srli_si128(a0_x128, 8);
2013
2014
51.1k
        shift = _mm256_castsi128_si256(_mm_srli_epi32(
2015
51.1k
            _mm_and_si128(
2016
51.1k
                _mm_slli_epi32(
2017
51.1k
                    _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
2018
51.1k
                                   (2 << 6) - y * dx, (3 << 6) - y * dx),
2019
51.1k
                    upsample_above),
2020
51.1k
                c3f),
2021
51.1k
            1));
2022
266k
      } else {
2023
266k
        a0_x128 =
2024
266k
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2025
266k
        a1_x128 = _mm_srli_si128(a0_x128, 2);
2026
2027
266k
        shift = _mm256_castsi128_si256(_mm_srli_epi32(
2028
266k
            _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
2029
266k
                                         (2 << 6) - y * dx, (3 << 6) - y * dx),
2030
266k
                          c3f),
2031
266k
            1));
2032
266k
      }
2033
317k
      a0_x = _mm256_cvtepu16_epi32(a0_x128);
2034
317k
      a1_x = _mm256_cvtepu16_epi32(a1_x128);
2035
317k
    }
2036
    // y calc
2037
1.02M
    __m128i a0_y, a1_y, shifty;
2038
1.02M
    if (base_x < min_base_x) {
2039
935k
      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2040
935k
      DECLARE_ALIGNED(32, int, base_y_c[4]);
2041
935k
      r6 = _mm_set1_epi32(r << 6);
2042
935k
      dy128 = _mm_set1_epi32(dy);
2043
935k
      c1234 = _mm_setr_epi32(1, 2, 3, 4);
2044
935k
      y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
2045
935k
      base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
2046
935k
      mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
2047
935k
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2048
935k
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2049
2050
935k
      a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
2051
935k
                            left[base_y_c[2]], left[base_y_c[3]]);
2052
935k
      a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2053
935k
                            left[base_y_c[2] + 1], left[base_y_c[3] + 1]);
2054
2055
935k
      if (upsample_left) {
2056
188k
        shifty = _mm_srli_epi32(
2057
188k
            _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
2058
746k
      } else {
2059
746k
        shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
2060
746k
      }
2061
935k
      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2062
935k
      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2063
935k
      shift = _mm256_inserti128_si256(shift, shifty, 1);
2064
935k
    }
2065
2066
1.02M
    diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
2067
1.02M
    a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
2068
1.02M
    a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2069
2070
1.02M
    b = _mm256_mullo_epi32(diff, shift);
2071
1.02M
    res = _mm256_add_epi32(a32, b);
2072
1.02M
    res = _mm256_srli_epi32(res, 5);
2073
2074
1.02M
    resx = _mm256_castsi256_si128(res);
2075
1.02M
    resx = _mm_packus_epi32(resx, resx);
2076
2077
1.02M
    resy = _mm256_extracti128_si256(res, 1);
2078
1.02M
    resy = _mm_packus_epi32(resy, resy);
2079
2080
1.02M
    resxy =
2081
1.02M
        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2082
1.02M
    _mm_storel_epi64((__m128i *)(dst), resxy);
2083
1.02M
    dst += stride;
2084
1.02M
  }
2085
199k
}
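Both Nx4 zone-2 variants make the same per-pixel choice between the top and the left edge; roughly, in scalar form (a sketch distilled from the vector code above, with illustrative structure rather than the library's own C reference):

for (int r = 0; r < N; ++r, dst += stride) {
  const int y = r + 1;
  for (int c = 0; c < 4; ++c) {
    const int x_frac = (c << 6) - y * dx;  // position along the top edge
    const int base_x = x_frac >> frac_bits_x;
    int v;
    if (base_x >= min_base_x) {            // pixel projects onto above[]
      const int shift = ((x_frac << upsample_above) & 0x3f) >> 1;
      v = (above[base_x] * 32 + 16 +
           (above[base_x + 1] - above[base_x]) * shift) >> 5;
    } else {                               // otherwise it projects onto left[]
      const int y_frac = (r << 6) - (c + 1) * dy;
      const int base_y = y_frac >> frac_bits_y;
      const int shift = ((y_frac << upsample_left) & 0x3f) >> 1;
      v = (left[base_y] * 32 + 16 +
           (left[base_y + 1] - left[base_y]) * shift) >> 5;
    }
    dst[c] = (uint16_t)v;
  }
}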
2086
2087
static void highbd_dr_prediction_z2_Nx4_avx2(
2088
    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2089
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
2090
226k
    int dy) {
2091
226k
  const int min_base_x = -(1 << upsample_above);
2092
226k
  const int min_base_y = -(1 << upsample_left);
2093
226k
  const int frac_bits_x = 6 - upsample_above;
2094
226k
  const int frac_bits_y = 6 - upsample_left;
2095
2096
226k
  assert(dx > 0);
2097
  // pre-filter above pixels
2098
  // store in temp buffers:
2099
  //   above[x] * 32 + 16
2100
  //   above[x+1] - above[x]
2101
  // final pixels will be calculated as:
2102
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2103
226k
  __m256i a0_x, a1_x, a32, a16;
2104
226k
  __m256i diff;
2105
226k
  __m128i c3f, min_base_y128;
2106
2107
226k
  a16 = _mm256_set1_epi16(16);
2108
226k
  c3f = _mm_set1_epi16(0x3f);
2109
226k
  min_base_y128 = _mm_set1_epi16(min_base_y);
2110
2111
1.72M
  for (int r = 0; r < N; r++) {
2112
1.49M
    __m256i b, res, shift;
2113
1.49M
    __m128i resx, resy, resxy;
2114
1.49M
    __m128i a0_x128, a1_x128;
2115
1.49M
    int y = r + 1;
2116
1.49M
    int base_x = (-y * dx) >> frac_bits_x;
2117
1.49M
    int base_shift = 0;
2118
1.49M
    if (base_x < (min_base_x - 1)) {
2119
1.12M
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
2120
1.12M
    }
2121
1.49M
    int base_min_diff =
2122
1.49M
        (min_base_x - base_x + upsample_above) >> upsample_above;
2123
1.49M
    if (base_min_diff > 4) {
2124
714k
      base_min_diff = 4;
2125
784k
    } else {
2126
784k
      if (base_min_diff < 0) base_min_diff = 0;
2127
784k
    }
2128
2129
1.49M
    if (base_shift > 3) {
2130
714k
      a0_x = _mm256_setzero_si256();
2131
714k
      a1_x = _mm256_setzero_si256();
2132
714k
      shift = _mm256_setzero_si256();
2133
784k
    } else {
2134
784k
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2135
784k
      if (upsample_above) {
2136
262k
        a0_x128 = _mm_shuffle_epi8(a0_x128,
2137
262k
                                   *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
2138
262k
        a1_x128 = _mm_srli_si128(a0_x128, 8);
2139
2140
262k
        shift = _mm256_castsi128_si256(_mm_srli_epi16(
2141
262k
            _mm_and_si128(
2142
262k
                _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2143
262k
                                              (2 << 6) - y * dx,
2144
262k
                                              (3 << 6) - y * dx, 0, 0, 0, 0),
2145
262k
                               upsample_above),
2146
262k
                c3f),
2147
262k
            1));
2148
522k
      } else {
2149
522k
        a0_x128 =
2150
522k
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2151
522k
        a1_x128 = _mm_srli_si128(a0_x128, 2);
2152
2153
522k
        shift = _mm256_castsi128_si256(_mm_srli_epi16(
2154
522k
            _mm_and_si128(
2155
522k
                _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
2156
522k
                               (3 << 6) - y * dx, 0, 0, 0, 0),
2157
522k
                c3f),
2158
522k
            1));
2159
522k
      }
2160
784k
      a0_x = _mm256_castsi128_si256(a0_x128);
2161
784k
      a1_x = _mm256_castsi128_si256(a1_x128);
2162
784k
    }
2163
    // y calc
2164
1.49M
    __m128i a0_y, a1_y, shifty;
2165
1.49M
    if (base_x < min_base_x) {
2166
1.27M
      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2167
1.27M
      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
2168
1.27M
      r6 = _mm_set1_epi16(r << 6);
2169
1.27M
      dy128 = _mm_set1_epi16(dy);
2170
1.27M
      c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
2171
1.27M
      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
2172
1.27M
      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
2173
1.27M
      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
2174
1.27M
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2175
1.27M
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2176
2177
1.27M
      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
2178
1.27M
                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
2179
1.27M
      a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2180
1.27M
                            left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0,
2181
1.27M
                            0, 0);
2182
2183
1.27M
      if (upsample_left) {
2184
460k
        shifty = _mm_srli_epi16(
2185
460k
            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
2186
816k
      } else {
2187
816k
        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
2188
816k
      }
2189
1.27M
      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2190
1.27M
      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2191
1.27M
      shift = _mm256_inserti128_si256(shift, shifty, 1);
2192
1.27M
    }
2193
2194
1.49M
    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
2195
1.49M
    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
2196
1.49M
    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2197
2198
1.49M
    b = _mm256_mullo_epi16(diff, shift);
2199
1.49M
    res = _mm256_add_epi16(a32, b);
2200
1.49M
    res = _mm256_srli_epi16(res, 5);
2201
2202
1.49M
    resx = _mm256_castsi256_si128(res);
2203
1.49M
    resy = _mm256_extracti128_si256(res, 1);
2204
1.49M
    resxy =
2205
1.49M
        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2206
1.49M
    _mm_storel_epi64((__m128i *)(dst), resxy);
2207
1.49M
    dst += stride;
2208
1.49M
  }
2209
226k
}
2210
2211
static void highbd_dr_prediction_32bit_z2_Nx8_avx2(
2212
    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2213
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
2214
207k
    int dy) {
2215
207k
  const int min_base_x = -(1 << upsample_above);
2216
207k
  const int min_base_y = -(1 << upsample_left);
2217
207k
  const int frac_bits_x = 6 - upsample_above;
2218
207k
  const int frac_bits_y = 6 - upsample_left;
2219
2220
  // pre-filter above pixels
2221
  // store in temp buffers:
2222
  //   above[x] * 32 + 16
2223
  //   above[x+1] - above[x]
2224
  // final pixels will be calculated as:
2225
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2226
207k
  __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256;
2227
207k
  __m256i diff;
2228
207k
  __m128i a0_x128, a1_x128;
2229
2230
207k
  a16 = _mm256_set1_epi32(16);
2231
207k
  c3f = _mm256_set1_epi32(0x3f);
2232
207k
  min_base_y256 = _mm256_set1_epi32(min_base_y);
2233
2234
2.22M
  for (int r = 0; r < N; r++) {
2235
2.01M
    __m256i b, res, shift;
2236
2.01M
    __m128i resx, resy, resxy;
2237
2.01M
    int y = r + 1;
2238
2.01M
    int base_x = (-y * dx) >> frac_bits_x;
2239
2.01M
    int base_shift = 0;
2240
2.01M
    if (base_x < (min_base_x - 1)) {
2241
1.59M
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
2242
1.59M
    }
2243
2.01M
    int base_min_diff =
2244
2.01M
        (min_base_x - base_x + upsample_above) >> upsample_above;
2245
2.01M
    if (base_min_diff > 8) {
2246
1.02M
      base_min_diff = 8;
2247
1.02M
    } else {
2248
991k
      if (base_min_diff < 0) base_min_diff = 0;
2249
991k
    }
2250
2251
2.01M
    if (base_shift > 7) {
2252
1.02M
      resx = _mm_setzero_si128();
2253
1.02M
    } else {
2254
991k
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2255
991k
      if (upsample_above) {
2256
49.5k
        __m128i mask, atmp0, atmp1, atmp2, atmp3;
2257
49.5k
        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
2258
49.5k
        atmp0 = _mm_shuffle_epi8(a0_x128,
2259
49.5k
                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2260
49.5k
        atmp1 = _mm_shuffle_epi8(a1_x128,
2261
49.5k
                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2262
49.5k
        atmp2 = _mm_shuffle_epi8(
2263
49.5k
            a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2264
49.5k
        atmp3 = _mm_shuffle_epi8(
2265
49.5k
            a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2266
49.5k
        mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
2267
49.5k
                              _mm_set1_epi8(15));
2268
49.5k
        a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
2269
49.5k
        mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
2270
49.5k
                              _mm_set1_epi8(15));
2271
49.5k
        a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
2272
49.5k
        shift = _mm256_srli_epi32(
2273
49.5k
            _mm256_and_si256(
2274
49.5k
                _mm256_slli_epi32(
2275
49.5k
                    _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx,
2276
49.5k
                                      (2 << 6) - y * dx, (3 << 6) - y * dx,
2277
49.5k
                                      (4 << 6) - y * dx, (5 << 6) - y * dx,
2278
49.5k
                                      (6 << 6) - y * dx, (7 << 6) - y * dx),
2279
49.5k
                    upsample_above),
2280
49.5k
                c3f),
2281
49.5k
            1);
2282
942k
      } else {
2283
942k
        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
2284
942k
        a0_x128 =
2285
942k
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2286
942k
        a1_x128 =
2287
942k
            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2288
2289
942k
        shift = _mm256_srli_epi32(
2290
942k
            _mm256_and_si256(
2291
942k
                _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
2292
942k
                                  (3 << 6) - y * dx, (4 << 6) - y * dx,
2293
942k
                                  (5 << 6) - y * dx, (6 << 6) - y * dx,
2294
942k
                                  (7 << 6) - y * dx),
2295
942k
                c3f),
2296
942k
            1);
2297
942k
      }
2298
991k
      a0_x = _mm256_cvtepu16_epi32(a0_x128);
2299
991k
      a1_x = _mm256_cvtepu16_epi32(a1_x128);
2300
2301
991k
      diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
2302
991k
      a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
2303
991k
      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2304
2305
991k
      b = _mm256_mullo_epi32(diff, shift);
2306
991k
      res = _mm256_add_epi32(a32, b);
2307
991k
      res = _mm256_srli_epi32(res, 5);
2308
2309
991k
      resx = _mm256_castsi256_si128(_mm256_packus_epi32(
2310
991k
          res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
2311
991k
    }
2312
    // y calc
2313
2.01M
    if (base_x < min_base_x) {
2314
1.75M
      DECLARE_ALIGNED(32, int, base_y_c[8]);
2315
1.75M
      __m256i r6, c256, dy256, y_c256, base_y_c256, mask256;
2316
1.75M
      r6 = _mm256_set1_epi32(r << 6);
2317
1.75M
      dy256 = _mm256_set1_epi32(dy);
2318
1.75M
      c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
2319
1.75M
      y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2320
1.75M
      base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
2321
1.75M
      mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2322
1.75M
      base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2323
1.75M
      _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2324
2325
1.75M
      a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2326
1.75M
          left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2327
1.75M
          left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2328
1.75M
          left[base_y_c[6]], left[base_y_c[7]]));
2329
1.75M
      a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2330
1.75M
          left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
2331
1.75M
          left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2332
1.75M
          left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
2333
2334
1.75M
      if (upsample_left) {
2335
99.0k
        shift = _mm256_srli_epi32(
2336
99.0k
            _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f),
2337
99.0k
            1);
2338
1.65M
      } else {
2339
1.65M
        shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
2340
1.65M
      }
2341
1.75M
      diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
2342
1.75M
      a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
2343
1.75M
      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2344
2345
1.75M
      b = _mm256_mullo_epi32(diff, shift);
2346
1.75M
      res = _mm256_add_epi32(a32, b);
2347
1.75M
      res = _mm256_srli_epi32(res, 5);
2348
2349
1.75M
      resy = _mm256_castsi256_si128(_mm256_packus_epi32(
2350
1.75M
          res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
2351
1.75M
    } else {
2352
260k
      resy = resx;
2353
260k
    }
2354
2.01M
    resxy =
2355
2.01M
        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2356
2.01M
    _mm_storeu_si128((__m128i *)(dst), resxy);
2357
2.01M
    dst += stride;
2358
2.01M
  }
2359
207k
}
2360
2361
static void highbd_dr_prediction_z2_Nx8_avx2(
2362
    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2363
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
2364
275k
    int dy) {
2365
275k
  const int min_base_x = -(1 << upsample_above);
2366
275k
  const int min_base_y = -(1 << upsample_left);
2367
275k
  const int frac_bits_x = 6 - upsample_above;
2368
275k
  const int frac_bits_y = 6 - upsample_left;
2369
2370
  // pre-filter above pixels
2371
  // store in temp buffers:
2372
  //   above[x] * 32 + 16
2373
  //   above[x+1] - above[x]
2374
  // final pixels will be calculated as:
2375
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2376
275k
  __m128i c3f, min_base_y128;
2377
275k
  __m256i a0_x, a1_x, diff, a32, a16;
2378
275k
  __m128i a0_x128, a1_x128;
2379
2380
275k
  a16 = _mm256_set1_epi16(16);
2381
275k
  c3f = _mm_set1_epi16(0x3f);
2382
275k
  min_base_y128 = _mm_set1_epi16(min_base_y);
2383
2384
2.85M
  for (int r = 0; r < N; r++) {
2385
2.57M
    __m256i b, res, shift;
2386
2.57M
    __m128i resx, resy, resxy;
2387
2.57M
    int y = r + 1;
2388
2.57M
    int base_x = (-y * dx) >> frac_bits_x;
2389
2.57M
    int base_shift = 0;
2390
2.57M
    if (base_x < (min_base_x - 1)) {
2391
1.96M
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
2392
1.96M
    }
2393
2.57M
    int base_min_diff =
2394
2.57M
        (min_base_x - base_x + upsample_above) >> upsample_above;
2395
2.57M
    if (base_min_diff > 8) {
2396
1.15M
      base_min_diff = 8;
2397
1.42M
    } else {
2398
1.42M
      if (base_min_diff < 0) base_min_diff = 0;
2399
1.42M
    }
2400
2401
2.57M
    if (base_shift > 7) {
2402
1.15M
      a0_x = _mm256_setzero_si256();
2403
1.15M
      a1_x = _mm256_setzero_si256();
2404
1.15M
      shift = _mm256_setzero_si256();
2405
1.42M
    } else {
2406
1.42M
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2407
1.42M
      if (upsample_above) {
2408
408k
        __m128i mask, atmp0, atmp1, atmp2, atmp3;
2409
408k
        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
2410
408k
        atmp0 = _mm_shuffle_epi8(a0_x128,
2411
408k
                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2412
408k
        atmp1 = _mm_shuffle_epi8(a1_x128,
2413
408k
                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2414
408k
        atmp2 = _mm_shuffle_epi8(
2415
408k
            a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2416
408k
        atmp3 = _mm_shuffle_epi8(
2417
408k
            a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2418
408k
        mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
2419
408k
                              _mm_set1_epi8(15));
2420
408k
        a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
2421
408k
        mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
2422
408k
                              _mm_set1_epi8(15));
2423
408k
        a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
2424
2425
408k
        shift = _mm256_castsi128_si256(_mm_srli_epi16(
2426
408k
            _mm_and_si128(
2427
408k
                _mm_slli_epi16(
2428
408k
                    _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2429
408k
                                   (2 << 6) - y * dx, (3 << 6) - y * dx,
2430
408k
                                   (4 << 6) - y * dx, (5 << 6) - y * dx,
2431
408k
                                   (6 << 6) - y * dx, (7 << 6) - y * dx),
2432
408k
                    upsample_above),
2433
408k
                c3f),
2434
408k
            1));
2435
1.01M
      } else {
2436
1.01M
        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
2437
1.01M
        a0_x128 =
2438
1.01M
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2439
1.01M
        a1_x128 =
2440
1.01M
            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2441
2442
1.01M
        shift = _mm256_castsi128_si256(_mm_srli_epi16(
2443
1.01M
            _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2444
1.01M
                                         (2 << 6) - y * dx, (3 << 6) - y * dx,
2445
1.01M
                                         (4 << 6) - y * dx, (5 << 6) - y * dx,
2446
1.01M
                                         (6 << 6) - y * dx, (7 << 6) - y * dx),
2447
1.01M
                          c3f),
2448
1.01M
            1));
2449
1.01M
      }
2450
1.42M
      a0_x = _mm256_castsi128_si256(a0_x128);
2451
1.42M
      a1_x = _mm256_castsi128_si256(a1_x128);
2452
1.42M
    }
2453
2454
    // y calc
2455
2.57M
    __m128i a0_y, a1_y, shifty;
2456
2.57M
    if (base_x < min_base_x) {
2457
2.18M
      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
2458
2.18M
      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2459
2.18M
      r6 = _mm_set1_epi16(r << 6);
2460
2.18M
      dy128 = _mm_set1_epi16(dy);
2461
2.18M
      c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
2462
2.18M
      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
2463
2.18M
      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
2464
2.18M
      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
2465
2.18M
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2466
2.18M
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2467
2468
2.18M
      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
2469
2.18M
                            left[base_y_c[2]], left[base_y_c[3]],
2470
2.18M
                            left[base_y_c[4]], left[base_y_c[5]],
2471
2.18M
                            left[base_y_c[6]], left[base_y_c[7]]);
2472
2.18M
      a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2473
2.18M
                            left[base_y_c[2] + 1], left[base_y_c[3] + 1],
2474
2.18M
                            left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2475
2.18M
                            left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
2476
2477
2.18M
      if (upsample_left) {
2478
639k
        shifty = _mm_srli_epi16(
2479
639k
            _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1);
2480
1.54M
      } else {
2481
1.54M
        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
2482
1.54M
      }
2483
2.18M
      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2484
2.18M
      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2485
2.18M
      shift = _mm256_inserti128_si256(shift, shifty, 1);
2486
2.18M
    }
2487
2488
2.57M
    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
2489
2.57M
    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
2490
2.57M
    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2491
2492
2.57M
    b = _mm256_mullo_epi16(diff, shift);
2493
2.57M
    res = _mm256_add_epi16(a32, b);
2494
2.57M
    res = _mm256_srli_epi16(res, 5);
2495
2496
2.57M
    resx = _mm256_castsi256_si128(res);
2497
2.57M
    resy = _mm256_extracti128_si256(res, 1);
2498
2499
2.57M
    resxy =
2500
2.57M
        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2501
2.57M
    _mm_storeu_si128((__m128i *)(dst), resxy);
2502
2.57M
    dst += stride;
2503
2.57M
  }
2504
275k
}
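
A note on the upsample_above branch handled at the top of this 8-wide zone-2 kernel: when the top edge has been 2x-upsampled it holds integer- and half-sample-position pixels interleaved, and the HighbdEvenOddMaskx shuffles de-interleave them so the usual two-tap interpolation still applies on the finer grid. In scalar terms (illustrative indexing; above_up denotes the upsampled edge):

  // After the even/odd de-interleave, for lane i of the 8-wide row:
  //   a0_x[i] = above_up[base_x + 2 * i]      // integer-position samples
  //   a1_x[i] = above_up[base_x + 2 * i + 1]  // half-sample-position samples
  //   shift[i] = ((((i << 6) - y * dx) << upsample_above) & 0x3f) >> 1
  // which matches the _mm_slli_epi16(..., upsample_above) term in the shift
  // computation above.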
2505
2506
static void highbd_dr_prediction_32bit_z2_HxW_avx2(
2507
    int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2508
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
2509
132k
    int dy) {
2510
  // upsample_above and upsample_left are always 0 here, by design of
2511
  // av1_use_intra_edge_upsample
2512
132k
  const int min_base_x = -1;
2513
132k
  const int min_base_y = -1;
2514
132k
  (void)upsample_above;
2515
132k
  (void)upsample_left;
2516
132k
  const int frac_bits_x = 6;
2517
132k
  const int frac_bits_y = 6;
2518
2519
  // pre-filter above pixels
2520
  // store in temp buffers:
2521
  //   above[x] * 32 + 16
2522
  //   above[x+1] - above[x]
2523
  // final pixels will be calculated as:
2524
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2525
132k
  __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16, c1;
2526
132k
  __m256i diff, min_base_y256, c3f, dy256, c1234, c0123, c8;
2527
132k
  __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
2528
132k
  DECLARE_ALIGNED(32, int, base_y_c[16]);
2529
2530
132k
  a16 = _mm256_set1_epi32(16);
2531
132k
  c1 = _mm256_srli_epi32(a16, 4);
2532
132k
  c8 = _mm256_srli_epi32(a16, 1);
2533
132k
  min_base_y256 = _mm256_set1_epi32(min_base_y);
2534
132k
  c3f = _mm256_set1_epi32(0x3f);
2535
132k
  dy256 = _mm256_set1_epi32(dy);
2536
132k
  c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2537
132k
  c1234 = _mm256_add_epi32(c0123, c1);
2538
2539
1.92M
  for (int r = 0; r < H; r++) {
2540
1.78M
    __m256i b, res, shift, ydx;
2541
1.78M
    __m256i resx[2], resy[2];
2542
1.78M
    __m256i resxy, j256, r6;
2543
5.53M
    for (int j = 0; j < W; j += 16) {
2544
3.74M
      j256 = _mm256_set1_epi32(j);
2545
3.74M
      int y = r + 1;
2546
3.74M
      ydx = _mm256_set1_epi32(y * dx);
2547
2548
3.74M
      int base_x = ((j << 6) - y * dx) >> frac_bits_x;
2549
3.74M
      int base_shift = 0;
2550
3.74M
      if ((base_x) < (min_base_x - 1)) {
2551
2.33M
        base_shift = (min_base_x - base_x - 1);
2552
2.33M
      }
2553
3.74M
      int base_min_diff = (min_base_x - base_x);
2554
3.74M
      if (base_min_diff > 16) {
2555
1.51M
        base_min_diff = 16;
2556
2.22M
      } else {
2557
2.22M
        if (base_min_diff < 0) base_min_diff = 0;
2558
2.22M
      }
2559
2560
3.74M
      if (base_shift > 7) {
2561
1.84M
        resx[0] = _mm256_setzero_si256();
2562
1.90M
      } else {
2563
1.90M
        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2564
1.90M
        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
2565
1.90M
        a0_x128 =
2566
1.90M
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2567
1.90M
        a1_x128 =
2568
1.90M
            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2569
2570
1.90M
        a0_x = _mm256_cvtepu16_epi32(a0_x128);
2571
1.90M
        a1_x = _mm256_cvtepu16_epi32(a1_x128);
2572
2573
1.90M
        r6 = _mm256_slli_epi32(_mm256_add_epi32(c0123, j256), 6);
2574
1.90M
        shift = _mm256_srli_epi32(
2575
1.90M
            _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
2576
2577
1.90M
        diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
2578
1.90M
        a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
2579
1.90M
        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2580
2581
1.90M
        b = _mm256_mullo_epi32(diff, shift);
2582
1.90M
        res = _mm256_add_epi32(a32, b);
2583
1.90M
        res = _mm256_srli_epi32(res, 5);
2584
2585
1.90M
        resx[0] = _mm256_packus_epi32(
2586
1.90M
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2587
1.90M
      }
2588
3.74M
      int base_shift8 = 0;
2589
3.74M
      if ((base_x + 8) < (min_base_x - 1)) {
2590
1.79M
        base_shift8 = (min_base_x - (base_x + 8) - 1);
2591
1.79M
      }
2592
3.74M
      if (base_shift8 > 7) {
2593
1.51M
        resx[1] = _mm256_setzero_si256();
2594
2.22M
      } else {
2595
2.22M
        a0_1_x128 =
2596
2.22M
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8));
2597
2.22M
        a1_1_x128 =
2598
2.22M
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9));
2599
2.22M
        a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
2600
2.22M
                                     *(__m128i *)HighbdLoadMaskx[base_shift8]);
2601
2.22M
        a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
2602
2.22M
                                     *(__m128i *)HighbdLoadMaskx[base_shift8]);
2603
2604
2.22M
        a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128);
2605
2.22M
        a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128);
2606
2607
2.22M
        r6 = _mm256_slli_epi32(
2608
2.22M
            _mm256_add_epi32(c0123, _mm256_add_epi32(j256, c8)), 6);
2609
2.22M
        shift = _mm256_srli_epi32(
2610
2.22M
            _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
2611
2612
2.22M
        diff = _mm256_sub_epi32(a1_1_x, a0_1_x);  // a[x+1] - a[x]
2613
2.22M
        a32 = _mm256_slli_epi32(a0_1_x, 5);       // a[x] * 32
2614
2.22M
        a32 = _mm256_add_epi32(a32, a16);         // a[x] * 32 + 16
2615
2.22M
        b = _mm256_mullo_epi32(diff, shift);
2616
2617
2.22M
        resx[1] = _mm256_add_epi32(a32, b);
2618
2.22M
        resx[1] = _mm256_srli_epi32(resx[1], 5);
2619
2.22M
        resx[1] = _mm256_packus_epi32(
2620
2.22M
            resx[1],
2621
2.22M
            _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1)));
2622
2.22M
      }
2623
3.74M
      resx[0] =
2624
3.74M
          _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]),
2625
3.74M
                                  1);  // 16 16-bit values
2626
2627
      // y calc
2628
3.74M
      resy[0] = _mm256_setzero_si256();
2629
3.74M
      if ((base_x < min_base_x)) {
2630
2.46M
        __m256i c256, y_c256, y_c_1_256, base_y_c256, mask256;
2631
2.46M
        r6 = _mm256_set1_epi32(r << 6);
2632
2.46M
        c256 = _mm256_add_epi32(j256, c1234);
2633
2.46M
        y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2634
2.46M
        base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
2635
2.46M
        mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2636
2.46M
        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2637
2.46M
        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2638
2.46M
        c256 = _mm256_add_epi32(c256, c8);
2639
2.46M
        y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2640
2.46M
        base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y);
2641
2.46M
        mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2642
2.46M
        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2643
2.46M
        _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256);
2644
2645
2.46M
        a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2646
2.46M
            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2647
2.46M
            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2648
2.46M
            left[base_y_c[6]], left[base_y_c[7]]));
2649
2.46M
        a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2650
2.46M
            left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
2651
2.46M
            left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2652
2.46M
            left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
2653
2654
2.46M
        shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
2655
2656
2.46M
        diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
2657
2.46M
        a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
2658
2.46M
        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2659
2660
2.46M
        b = _mm256_mullo_epi32(diff, shift);
2661
2.46M
        res = _mm256_add_epi32(a32, b);
2662
2.46M
        res = _mm256_srli_epi32(res, 5);
2663
2664
2.46M
        resy[0] = _mm256_packus_epi32(
2665
2.46M
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2666
2667
2.46M
        a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2668
2.46M
            left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]],
2669
2.46M
            left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]],
2670
2.46M
            left[base_y_c[14]], left[base_y_c[15]]));
2671
2.46M
        a1_y = _mm256_cvtepu16_epi32(
2672
2.46M
            _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1],
2673
2.46M
                           left[base_y_c[10] + 1], left[base_y_c[11] + 1],
2674
2.46M
                           left[base_y_c[12] + 1], left[base_y_c[13] + 1],
2675
2.46M
                           left[base_y_c[14] + 1], left[base_y_c[15] + 1]));
2676
2.46M
        shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1);
2677
2678
2.46M
        diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
2679
2.46M
        a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
2680
2.46M
        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2681
2682
2.46M
        b = _mm256_mullo_epi32(diff, shift);
2683
2.46M
        res = _mm256_add_epi32(a32, b);
2684
2.46M
        res = _mm256_srli_epi32(res, 5);
2685
2686
2.46M
        resy[1] = _mm256_packus_epi32(
2687
2.46M
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2688
2689
2.46M
        resy[0] =
2690
2.46M
            _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]),
2691
2.46M
                                    1);  // 16 16-bit values
2692
2.46M
      }
2693
2694
3.74M
      resxy = _mm256_blendv_epi8(resx[0], resy[0],
2695
3.74M
                                 *(__m256i *)HighbdBaseMask[base_min_diff]);
2696
3.74M
      _mm256_storeu_si256((__m256i *)(dst + j), resxy);
2697
3.74M
    }  // for j
2698
1.78M
    dst += stride;
2699
1.78M
  }
2700
132k
}
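
The split between these 32-bit kernels and the 16-bit ones below is a range question: the interpolation keeps above[x] * 32 + 16 in a single lane, which fits a signed 16-bit lane for 8- and 10-bit samples but not for 12-bit ones, so av1_highbd_dr_prediction_z2_avx2 further down dispatches on bd < 12:

  //  10-bit: 1023 * 32 + 16 =  32752   (<= 32767, fits an int16 lane)
  //  12-bit: 4095 * 32 + 16 = 131056   (>  65535, needs the epi32 kernels)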
2701
2702
static void highbd_dr_prediction_z2_HxW_avx2(
2703
    int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2704
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
2705
490k
    int dy) {
2706
  // upsample_above and upsample_left are always 0 here, by design of
2707
  // av1_use_intra_edge_upsample
2708
490k
  const int min_base_x = -1;
2709
490k
  const int min_base_y = -1;
2710
490k
  (void)upsample_above;
2711
490k
  (void)upsample_left;
2712
490k
  const int frac_bits_x = 6;
2713
490k
  const int frac_bits_y = 6;
2714
2715
  // pre-filter above pixels
2716
  // store in temp buffers:
2717
  //   above[x] * 32 + 16
2718
  //   above[x+1] - above[x]
2719
  // final pixels will be calculated as:
2720
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2721
490k
  __m256i a0_x, a1_x, a32, a16, c3f, c1;
2722
490k
  __m256i diff, min_base_y256, dy256, c1234, c0123;
2723
490k
  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
2724
2725
490k
  a16 = _mm256_set1_epi16(16);
2726
490k
  c1 = _mm256_srli_epi16(a16, 4);
2727
490k
  min_base_y256 = _mm256_set1_epi16(min_base_y);
2728
490k
  c3f = _mm256_set1_epi16(0x3f);
2729
490k
  dy256 = _mm256_set1_epi16(dy);
2730
490k
  c0123 =
2731
490k
      _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
2732
490k
  c1234 = _mm256_add_epi16(c0123, c1);
2733
2734
10.2M
  for (int r = 0; r < H; r++) {
2735
9.73M
    __m256i b, res, shift;
2736
9.73M
    __m256i resx, resy, ydx;
2737
9.73M
    __m256i resxy, j256, r6;
2738
9.73M
    __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
2739
9.73M
    int y = r + 1;
2740
9.73M
    ydx = _mm256_set1_epi16((short)(y * dx));
2741
2742
27.8M
    for (int j = 0; j < W; j += 16) {
2743
18.1M
      j256 = _mm256_set1_epi16(j);
2744
18.1M
      int base_x = ((j << 6) - y * dx) >> frac_bits_x;
2745
18.1M
      int base_shift = 0;
2746
18.1M
      if ((base_x) < (min_base_x - 1)) {
2747
13.3M
        base_shift = (min_base_x - base_x - 1);
2748
13.3M
      }
2749
18.1M
      int base_min_diff = (min_base_x - base_x);
2750
18.1M
      if (base_min_diff > 16) {
2751
9.90M
        base_min_diff = 16;
2752
9.90M
      } else {
2753
8.23M
        if (base_min_diff < 0) base_min_diff = 0;
2754
8.23M
      }
2755
2756
18.1M
      if (base_shift < 8) {
2757
6.82M
        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2758
6.82M
        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
2759
6.82M
        a0_x128 =
2760
6.82M
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2761
6.82M
        a1_x128 =
2762
6.82M
            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2763
2764
6.82M
        a0_x = _mm256_castsi128_si256(a0_x128);
2765
6.82M
        a1_x = _mm256_castsi128_si256(a1_x128);
2766
11.3M
      } else {
2767
11.3M
        a0_x = _mm256_setzero_si256();
2768
11.3M
        a1_x = _mm256_setzero_si256();
2769
11.3M
      }
2770
2771
18.1M
      int base_shift1 = 0;
2772
18.1M
      if (base_shift > 8) {
2773
11.0M
        base_shift1 = base_shift - 8;
2774
11.0M
      }
2775
18.1M
      if (base_shift1 < 8) {
2776
8.23M
        a0_1_x128 =
2777
8.23M
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8));
2778
8.23M
        a1_1_x128 =
2779
8.23M
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9));
2780
8.23M
        a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
2781
8.23M
                                     *(__m128i *)HighbdLoadMaskx[base_shift1]);
2782
8.23M
        a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
2783
8.23M
                                     *(__m128i *)HighbdLoadMaskx[base_shift1]);
2784
2785
8.23M
        a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1);
2786
8.23M
        a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1);
2787
8.23M
      }
2788
18.1M
      r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
2789
18.1M
      shift = _mm256_srli_epi16(
2790
18.1M
          _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
2791
2792
18.1M
      diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
2793
18.1M
      a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
2794
18.1M
      a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2795
2796
18.1M
      b = _mm256_mullo_epi16(diff, shift);
2797
18.1M
      res = _mm256_add_epi16(a32, b);
2798
18.1M
      resx = _mm256_srli_epi16(res, 5);  // 16 16-bit values
2799
2800
      // y calc
2801
18.1M
      resy = _mm256_setzero_si256();
2802
18.1M
      __m256i a0_y, a1_y, shifty;
2803
18.1M
      if ((base_x < min_base_x)) {
2804
14.0M
        __m256i c256, y_c256, base_y_c256, mask256, mul16;
2805
14.0M
        r6 = _mm256_set1_epi16(r << 6);
2806
14.0M
        c256 = _mm256_add_epi16(j256, c1234);
2807
14.0M
        mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
2808
14.0M
                                 _mm256_srli_epi16(min_base_y256, 1));
2809
14.0M
        y_c256 = _mm256_sub_epi16(r6, mul16);
2810
14.0M
        base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
2811
14.0M
        mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
2812
14.0M
        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2813
14.0M
        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2814
2815
14.0M
        a0_y = _mm256_setr_epi16(
2816
14.0M
            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2817
14.0M
            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2818
14.0M
            left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
2819
14.0M
            left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
2820
14.0M
            left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
2821
14.0M
            left[base_y_c[15]]);
2822
14.0M
        base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
2823
14.0M
        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2824
2825
14.0M
        a1_y = _mm256_setr_epi16(
2826
14.0M
            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2827
14.0M
            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2828
14.0M
            left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
2829
14.0M
            left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
2830
14.0M
            left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
2831
14.0M
            left[base_y_c[15]]);
2832
2833
14.0M
        shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
2834
2835
14.0M
        diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
2836
14.0M
        a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
2837
14.0M
        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2838
2839
14.0M
        b = _mm256_mullo_epi16(diff, shifty);
2840
14.0M
        res = _mm256_add_epi16(a32, b);
2841
14.0M
        resy = _mm256_srli_epi16(res, 5);
2842
14.0M
      }
2843
2844
18.1M
      resxy = _mm256_blendv_epi8(resx, resy,
2845
18.1M
                                 *(__m256i *)HighbdBaseMask[base_min_diff]);
2846
18.1M
      _mm256_storeu_si256((__m256i *)(dst + j), resxy);
2847
18.1M
    }  // for j
2848
9.73M
    dst += stride;
2849
9.73M
  }
2850
490k
}
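
One detail in the 16-bit kernel above: min_base_y256 is all ones, so _mm256_srli_epi16(min_base_y256, 1) equals 0x7fff, and the _mm256_min_epu16 clamps the c * dy product to 32767 before the subtraction. This appears to keep the 16-bit product from wrapping for large column/dy combinations (those lanes are discarded by the final blend anyway). A scalar reading, with illustrative names:

  int32_t prod = c * dy;                 // can exceed 16 bits for large c, dy
  if (prod > 0x7fff) prod = 0x7fff;      // _mm256_min_epu16(..., 0x7fff)
  int y_c = (int16_t)((r << 6) - prod);  // stays a well-defined value
  int base_y = y_c >> 6;                 // then clamped against min_base_y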
2851
2852
// Directional prediction, zone 2: 90 < angle < 180
2853
void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
2854
                                      int bh, const uint16_t *above,
2855
                                      const uint16_t *left, int upsample_above,
2856
                                      int upsample_left, int dx, int dy,
2857
1.53M
                                      int bd) {
2858
1.53M
  (void)bd;
2859
1.53M
  assert(dx > 0);
2860
1.53M
  assert(dy > 0);
2861
1.53M
  switch (bw) {
2862
425k
    case 4:
2863
425k
      if (bd < 12) {
2864
226k
        highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left,
2865
226k
                                         upsample_above, upsample_left, dx, dy);
2866
226k
      } else {
2867
199k
        highbd_dr_prediction_32bit_z2_Nx4_avx2(bh, dst, stride, above, left,
2868
199k
                                               upsample_above, upsample_left,
2869
199k
                                               dx, dy);
2870
199k
      }
2871
425k
      break;
2872
483k
    case 8:
2873
483k
      if (bd < 12) {
2874
275k
        highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left,
2875
275k
                                         upsample_above, upsample_left, dx, dy);
2876
275k
      } else {
2877
207k
        highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left,
2878
207k
                                               upsample_above, upsample_left,
2879
207k
                                               dx, dy);
2880
207k
      }
2881
483k
      break;
2882
622k
    default:
2883
622k
      if (bd < 12) {
2884
490k
        highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
2885
490k
                                         upsample_above, upsample_left, dx, dy);
2886
490k
      } else {
2887
132k
        highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left,
2888
132k
                                               upsample_above, upsample_left,
2889
132k
                                               dx, dy);
2890
132k
      }
2891
622k
      break;
2892
1.53M
  }
2893
1.53M
}
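
All of the zone-2 kernels dispatched above compute the same per-pixel blend of the two edges. The following scalar model is a rough sketch of that computation (upsampling omitted, function name illustrative, and rounding follows the 5-bit interpolation fraction used by the SIMD code); above[-1] and left[-1] refer to the top-left corner sample.

static void z2_scalar_model(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
                            const uint16_t *above, const uint16_t *left,
                            int dx, int dy) {
  for (int r = 0; r < bh; ++r) {
    for (int c = 0; c < bw; ++c) {
      int x = (c << 6) - (r + 1) * dx;  // sub-pel position along the top edge
      int base_x = x >> 6;
      int val;
      if (base_x >= -1) {               // pixel projects onto the above row
        int shift = (x & 0x3f) >> 1;
        val = (above[base_x] * 32 + 16 +
               (above[base_x + 1] - above[base_x]) * shift) >> 5;
      } else {                          // pixel projects onto the left column
        int y = (r << 6) - (c + 1) * dy;
        int base_y = y >> 6;            // >= -1 for valid zone-2 dx/dy
        int shift = (y & 0x3f) >> 1;
        val = (left[base_y] * 32 + 16 +
               (left[base_y + 1] - left[base_y]) * shift) >> 5;
      }
      dst[r * stride + c] = (uint16_t)val;
    }
  }
}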
2894
2895
//  Directional prediction, zone 3 functions
2896
static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride,
2897
                                             const uint16_t *left,
2898
                                             int upsample_left, int dy,
2899
179k
                                             int bd) {
2900
179k
  __m128i dstvec[4], d[4];
2901
179k
  if (bd < 12) {
2902
157k
    highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left,
2903
157k
                                              dy);
2904
157k
  } else {
2905
22.0k
    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(4, dstvec, left,
2906
22.0k
                                                    upsample_left, dy);
2907
22.0k
  }
2908
179k
  highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2],
2909
179k
                                   &dstvec[3], &d[0], &d[1], &d[2], &d[3]);
2910
179k
  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
2911
179k
  _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
2912
179k
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
2913
179k
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
2914
179k
  return;
2915
179k
}
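
Every zone-3 kernel in this block follows the same pattern: run the zone-1 interpolation along the left edge (into registers, or into a temporary dstT buffer for the larger sizes) and then transpose the result into the destination. A compact scalar sketch of that idea, with upsampling and the max_base clamp omitted and an illustrative helper name:

static void z3_via_z1_sketch(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
                             const uint16_t *left, int dy) {
  uint16_t tmp[64 * 64];                 // bw rows of bh predicted samples
  for (int r = 0; r < bw; ++r) {         // zone-1 pass with left[] as the edge
    int y = (r + 1) * dy;
    int base = y >> 6;
    int shift = (y & 0x3f) >> 1;
    for (int c = 0; c < bh; ++c)
      tmp[r * bh + c] = (left[base + c] * 32 + 16 +
                         (left[base + c + 1] - left[base + c]) * shift) >> 5;
  }
  for (int r = 0; r < bh; ++r)           // transpose into the destination
    for (int c = 0; c < bw; ++c) dst[r * stride + c] = tmp[c * bh + r];
}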
2916
2917
static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
2918
                                             const uint16_t *left,
2919
                                             int upsample_left, int dy,
2920
203k
                                             int bd) {
2921
203k
  __m128i dstvec[8], d[8];
2922
203k
  if (bd < 12) {
2923
121k
    highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left,
2924
121k
                                              dy);
2925
121k
  } else {
2926
82.4k
    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(8, dstvec, left,
2927
82.4k
                                                    upsample_left, dy);
2928
82.4k
  }
2929
203k
  highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2930
203k
                           &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
2931
203k
                           &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
2932
203k
                           &d[7]);
2933
1.83M
  for (int i = 0; i < 8; i++) {
2934
1.63M
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
2935
1.63M
  }
2936
203k
}
2937
2938
static void highbd_dr_prediction_z3_4x8_avx2(uint16_t *dst, ptrdiff_t stride,
2939
                                             const uint16_t *left,
2940
                                             int upsample_left, int dy,
2941
52.2k
                                             int bd) {
2942
52.2k
  __m128i dstvec[4], d[8];
2943
52.2k
  if (bd < 12) {
2944
24.5k
    highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left,
2945
24.5k
                                              dy);
2946
27.7k
  } else {
2947
27.7k
    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(4, dstvec, left,
2948
27.7k
                                                    upsample_left, dy);
2949
27.7k
  }
2950
2951
52.2k
  highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2952
52.2k
                               &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
2953
52.2k
                               &d[7]);
2954
470k
  for (int i = 0; i < 8; i++) {
2955
418k
    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
2956
418k
  }
2957
52.2k
}
2958
2959
static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
2960
                                             const uint16_t *left,
2961
                                             int upsample_left, int dy,
2962
58.7k
                                             int bd) {
2963
58.7k
  __m128i dstvec[8], d[4];
2964
58.7k
  if (bd < 12) {
2965
39.8k
    highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left,
2966
39.8k
                                              dy);
2967
39.8k
  } else {
2968
18.8k
    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(8, dstvec, left,
2969
18.8k
                                                    upsample_left, dy);
2970
18.8k
  }
2971
2972
58.7k
  highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2973
58.7k
                               &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
2974
58.7k
                               &d[0], &d[1], &d[2], &d[3]);
2975
58.7k
  _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
2976
58.7k
  _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]);
2977
58.7k
  _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]);
2978
58.7k
  _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]);
2979
58.7k
}
2980
2981
static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
2982
                                              const uint16_t *left,
2983
                                              int upsample_left, int dy,
2984
44.1k
                                              int bd) {
2985
44.1k
  __m256i dstvec[8], d[8];
2986
44.1k
  if (bd < 12) {
2987
33.4k
    highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left,
2988
33.4k
                                               dy);
2989
33.4k
  } else {
2990
10.7k
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(8, dstvec, left,
2991
10.7k
                                                     upsample_left, dy);
2992
10.7k
  }
2993
44.1k
  highbd_transpose8x16_16x8_avx2(dstvec, d);
2994
397k
  for (int i = 0; i < 8; i++) {
2995
353k
    _mm_storeu_si128((__m128i *)(dst + i * stride),
2996
353k
                     _mm256_castsi256_si128(d[i]));
2997
353k
  }
2998
397k
  for (int i = 8; i < 16; i++) {
2999
353k
    _mm_storeu_si128((__m128i *)(dst + i * stride),
3000
353k
                     _mm256_extracti128_si256(d[i - 8], 1));
3001
353k
  }
3002
44.1k
}
3003
3004
static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
3005
                                              const uint16_t *left,
3006
                                              int upsample_left, int dy,
3007
84.2k
                                              int bd) {
3008
84.2k
  __m128i dstvec[16], d[16];
3009
84.2k
  if (bd < 12) {
3010
49.9k
    highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left,
3011
49.9k
                                              dy);
3012
49.9k
  } else {
3013
34.2k
    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(16, dstvec, left,
3014
34.2k
                                                    upsample_left, dy);
3015
34.2k
  }
3016
252k
  for (int i = 0; i < 16; i += 8) {
3017
168k
    highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
3018
168k
                             &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
3019
168k
                             &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
3020
168k
                             &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
3021
168k
                             &d[5 + i], &d[6 + i], &d[7 + i]);
3022
168k
  }
3023
758k
  for (int i = 0; i < 8; i++) {
3024
673k
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
3025
673k
    _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
3026
673k
  }
3027
84.2k
}
3028
3029
static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride,
3030
                                              const uint16_t *left,
3031
                                              int upsample_left, int dy,
3032
25.3k
                                              int bd) {
3033
25.3k
  __m256i dstvec[4], d[4], d1;
3034
25.3k
  if (bd < 12) {
3035
20.2k
    highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left,
3036
20.2k
                                               dy);
3037
20.2k
  } else {
3038
5.18k
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(4, dstvec, left,
3039
5.18k
                                                     upsample_left, dy);
3040
5.18k
  }
3041
25.3k
  highbd_transpose4x16_avx2(dstvec, d);
3042
126k
  for (int i = 0; i < 4; i++) {
3043
101k
    _mm_storel_epi64((__m128i *)(dst + i * stride),
3044
101k
                     _mm256_castsi256_si128(d[i]));
3045
101k
    d1 = _mm256_bsrli_epi128(d[i], 8);
3046
101k
    _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride),
3047
101k
                     _mm256_castsi256_si128(d1));
3048
101k
    _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
3049
101k
                     _mm256_extracti128_si256(d[i], 1));
3050
101k
    _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride),
3051
101k
                     _mm256_extracti128_si256(d1, 1));
3052
101k
  }
3053
25.3k
}
3054
3055
static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
3056
                                              const uint16_t *left,
3057
                                              int upsample_left, int dy,
3058
69.1k
                                              int bd) {
3059
69.1k
  __m128i dstvec[16], d[8];
3060
69.1k
  if (bd < 12) {
3061
54.2k
    highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left,
3062
54.2k
                                              dy);
3063
54.2k
  } else {
3064
14.8k
    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(16, dstvec, left,
3065
14.8k
                                                    upsample_left, dy);
3066
14.8k
  }
3067
69.1k
  highbd_transpose16x4_8x8_sse2(dstvec, d);
3068
3069
69.1k
  _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
3070
69.1k
  _mm_storeu_si128((__m128i *)(dst + 0 * stride + 8), d[1]);
3071
69.1k
  _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]);
3072
69.1k
  _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]);
3073
69.1k
  _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]);
3074
69.1k
  _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]);
3075
69.1k
  _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]);
3076
69.1k
  _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]);
3077
69.1k
}
3078
3079
static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
3080
                                              const uint16_t *left,
3081
                                              int upsample_left, int dy,
3082
11.3k
                                              int bd) {
3083
11.3k
  __m256i dstvec[16], d[16];
3084
11.3k
  if (bd < 12) {
3085
10.4k
    highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left,
3086
10.4k
                                               dy);
3087
10.4k
  } else {
3088
866
    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(8, dstvec, left,
3089
866
                                                     upsample_left, dy);
3090
866
  }
3091
3092
34.0k
  for (int i = 0; i < 16; i += 8) {
3093
22.6k
    highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
3094
22.6k
  }
3095
3096
102k
  for (int i = 0; i < 8; i++) {
3097
90.7k
    _mm_storeu_si128((__m128i *)(dst + i * stride),
3098
90.7k
                     _mm256_castsi256_si128(d[i]));
3099
90.7k
  }
3100
102k
  for (int i = 0; i < 8; i++) {
3101
90.7k
    _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
3102
90.7k
                     _mm256_extracti128_si256(d[i], 1));
3103
90.7k
  }
3104
102k
  for (int i = 8; i < 16; i++) {
3105
90.7k
    _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
3106
90.7k
                     _mm256_castsi256_si128(d[i]));
3107
90.7k
  }
3108
102k
  for (int i = 8; i < 16; i++) {
3109
90.7k
    _mm_storeu_si128((__m128i *)(dst + (i + 16) * stride),
3110
90.7k
                     _mm256_extracti128_si256(d[i], 1));
3111
90.7k
  }
3112
11.3k
}
3113
3114
static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
3115
                                              const uint16_t *left,
3116
                                              int upsample_left, int dy,
3117
61.7k
                                              int bd) {
3118
61.7k
  __m128i dstvec[32], d[32];
3119
61.7k
  if (bd < 12) {
3120
54.5k
    highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left,
3121
54.5k
                                              dy);
3122
54.5k
  } else {
3123
7.17k
    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(32, dstvec, left,
3124
7.17k
                                                    upsample_left, dy);
3125
7.17k
  }
3126
3127
308k
  for (int i = 0; i < 32; i += 8) {
3128
247k
    highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
3129
247k
                             &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
3130
247k
                             &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
3131
247k
                             &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
3132
247k
                             &d[5 + i], &d[6 + i], &d[7 + i]);
3133
247k
  }
3134
555k
  for (int i = 0; i < 8; i++) {
3135
494k
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
3136
494k
    _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
3137
494k
    _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]);
3138
494k
    _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]);
3139
494k
  }
3140
61.7k
}
3141
3142
static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
3143
                                               const uint16_t *left,
3144
                                               int upsample_left, int dy,
3145
123k
                                               int bd) {
3146
123k
  __m256i dstvec[16], d[16];
3147
123k
  if (bd < 12) {
3148
100k
    highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left,
3149
100k
                                               dy);
3150
100k
  } else {
3151
22.8k
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(16, dstvec, left,
3152
22.8k
                                                     upsample_left, dy);
3153
22.8k
  }
3154
3155
123k
  highbd_transpose16x16_avx2(dstvec, d);
3156
3157
2.10M
  for (int i = 0; i < 16; i++) {
3158
1.97M
    _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]);
3159
1.97M
  }
3160
123k
}
3161
3162
static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
3163
                                               const uint16_t *left,
3164
                                               int upsample_left, int dy,
3165
114k
                                               int bd) {
3166
114k
  __m256i dstvec[64], d[16];
3167
114k
  if (bd < 12) {
3168
110k
    highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left,
3169
110k
                                               dy);
3170
110k
  } else {
3171
4.02k
    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(32, dstvec, left,
3172
4.02k
                                                     upsample_left, dy);
3173
4.02k
  }
3174
114k
  highbd_transpose16x16_avx2(dstvec, d);
3175
1.95M
  for (int j = 0; j < 16; j++) {
3176
1.83M
    _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]);
3177
1.83M
  }
3178
114k
  highbd_transpose16x16_avx2(dstvec + 16, d);
3179
1.95M
  for (int j = 0; j < 16; j++) {
3180
1.83M
    _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]);
3181
1.83M
  }
3182
114k
  highbd_transpose16x16_avx2(dstvec + 32, d);
3183
1.95M
  for (int j = 0; j < 16; j++) {
3184
1.83M
    _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]);
3185
1.83M
  }
3186
114k
  highbd_transpose16x16_avx2(dstvec + 48, d);
3187
1.95M
  for (int j = 0; j < 16; j++) {
3188
1.83M
    _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]);
3189
1.83M
  }
3190
114k
}
3191
3192
static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
3193
                                               const uint16_t *left,
3194
                                               int upsample_left, int dy,
3195
24.9k
                                               int bd) {
3196
24.9k
  DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]);
3197
24.9k
  if (bd < 12) {
3198
21.3k
    highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
3199
21.3k
  } else {
3200
3.59k
    highbd_dr_prediction_32bit_z1_64xN_avx2(64, dstT, 64, left, upsample_left,
3201
3.59k
                                            dy);
3202
3.59k
  }
3203
24.9k
  highbd_transpose(dstT, 64, dst, stride, 64, 64);
3204
24.9k
}
3205
3206
static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
3207
                                               const uint16_t *left,
3208
                                               int upsample_left, int dy,
3209
24.4k
                                               int bd) {
3210
24.4k
  __m256i dstvec[32], d[32];
3211
24.4k
  if (bd < 12) {
3212
23.6k
    highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left,
3213
23.6k
                                               dy);
3214
23.6k
  } else {
3215
791
    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(16, dstvec, left,
3216
791
                                                     upsample_left, dy);
3217
791
  }
3218
122k
  for (int i = 0; i < 32; i += 8) {
3219
97.7k
    highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
3220
97.7k
  }
3221
  // store
3222
73.2k
  for (int j = 0; j < 32; j += 16) {
3223
439k
    for (int i = 0; i < 8; i++) {
3224
390k
      _mm_storeu_si128((__m128i *)(dst + (i + j) * stride),
3225
390k
                       _mm256_castsi256_si128(d[(i + j)]));
3226
390k
    }
3227
439k
    for (int i = 0; i < 8; i++) {
3228
390k
      _mm_storeu_si128((__m128i *)(dst + (i + j) * stride + 8),
3229
390k
                       _mm256_castsi256_si128(d[(i + j) + 8]));
3230
390k
    }
3231
439k
    for (int i = 8; i < 16; i++) {
3232
390k
      _mm256_storeu_si256(
3233
390k
          (__m256i *)(dst + (i + j) * stride),
3234
390k
          _mm256_inserti128_si256(
3235
390k
              d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0));
3236
390k
    }
3237
48.8k
  }
3238
24.4k
}
3239
3240
static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
3241
                                               const uint16_t *left,
3242
                                               int upsample_left, int dy,
3243
31.0k
                                               int bd) {
3244
31.0k
  __m256i dstvec[32], d[16];
3245
31.0k
  if (bd < 12) {
3246
26.2k
    highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left,
3247
26.2k
                                               dy);
3248
26.2k
  } else {
3249
4.72k
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(32, dstvec, left,
3250
4.72k
                                                     upsample_left, dy);
3251
4.72k
  }
3252
93.0k
  for (int i = 0; i < 32; i += 16) {
3253
62.0k
    highbd_transpose16x16_avx2((dstvec + i), d);
3254
1.05M
    for (int j = 0; j < 16; j++) {
3255
992k
      _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
3256
992k
    }
3257
62.0k
  }
3258
31.0k
}
3259
3260
static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
3261
                                               const uint16_t *left,
3262
                                               int upsample_left, int dy,
3263
2.04k
                                               int bd) {
3264
2.04k
  uint16_t dstT[64 * 32];
3265
2.04k
  if (bd < 12) {
3266
1.85k
    highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
3267
1.85k
  } else {
3268
191
    highbd_dr_prediction_32bit_z1_64xN_avx2(32, dstT, 64, left, upsample_left,
3269
191
                                            dy);
3270
191
  }
3271
2.04k
  highbd_transpose(dstT, 64, dst, stride, 32, 64);
3272
2.04k
}
3273
3274
static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
3275
                                               const uint16_t *left,
3276
                                               int upsample_left, int dy,
3277
4.63k
                                               int bd) {
3278
4.63k
  DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]);
3279
4.63k
  highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy, bd);
3280
4.63k
  highbd_transpose(dstT, 32, dst, stride, 64, 32);
3281
4.63k
  return;
3282
4.63k
}
3283
3284
static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
3285
                                               const uint16_t *left,
3286
                                               int upsample_left, int dy,
3287
5.20k
                                               int bd) {
3288
5.20k
  DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]);
3289
5.20k
  if (bd < 12) {
3290
5.02k
    highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
3291
5.02k
  } else {
3292
174
    highbd_dr_prediction_32bit_z1_64xN_avx2(16, dstT, 64, left, upsample_left,
3293
174
                                            dy);
3294
174
  }
3295
5.20k
  highbd_transpose(dstT, 64, dst, stride, 16, 64);
3296
5.20k
}
3297
3298
static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
3299
                                               const uint16_t *left,
3300
                                               int upsample_left, int dy,
3301
19.3k
                                               int bd) {
3302
19.3k
  __m256i dstvec[64], d[16];
3303
19.3k
  if (bd < 12) {
3304
19.2k
    highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left,
3305
19.2k
                                               dy);
3306
19.2k
  } else {
3307
90
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(64, dstvec, left,
3308
90
                                                     upsample_left, dy);
3309
90
  }
3310
96.7k
  for (int i = 0; i < 64; i += 16) {
3311
77.4k
    highbd_transpose16x16_avx2((dstvec + i), d);
3312
1.31M
    for (int j = 0; j < 16; j++) {
3313
1.23M
      _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
3314
1.23M
    }
3315
77.4k
  }
3316
19.3k
}
3317
3318
void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
3319
                                      int bh, const uint16_t *above,
3320
                                      const uint16_t *left, int upsample_left,
3321
1.14M
                                      int dx, int dy, int bd) {
3322
1.14M
  (void)above;
3323
1.14M
  (void)dx;
3324
3325
1.14M
  assert(dx == 1);
3326
1.14M
  assert(dy > 0);
3327
1.14M
  if (bw == bh) {
3328
646k
    switch (bw) {
3329
179k
      case 4:
3330
179k
        highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy,
3331
179k
                                         bd);
3332
179k
        break;
3333
203k
      case 8:
3334
203k
        highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy,
3335
203k
                                         bd);
3336
203k
        break;
3337
123k
      case 16:
3338
123k
        highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy,
3339
123k
                                           bd);
3340
123k
        break;
3341
114k
      case 32:
3342
114k
        highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy,
3343
114k
                                           bd);
3344
114k
        break;
3345
24.9k
      case 64:
3346
24.9k
        highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy,
3347
24.9k
                                           bd);
3348
24.9k
        break;
3349
646k
    }
3350
646k
  } else {
3351
493k
    if (bw < bh) {
3352
164k
      if (bw + bw == bh) {
3353
122k
        switch (bw) {
3354
52.2k
          case 4:
3355
52.2k
            highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left,
3356
52.2k
                                             dy, bd);
3357
52.2k
            break;
3358
44.1k
          case 8:
3359
44.1k
            highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left,
3360
44.1k
                                              dy, bd);
3361
44.1k
            break;
3362
24.4k
          case 16:
3363
24.4k
            highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left,
3364
24.4k
                                               dy, bd);
3365
24.4k
            break;
3366
2.04k
          case 32:
3367
2.04k
            highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left,
3368
2.04k
                                               dy, bd);
3369
2.04k
            break;
3370
122k
        }
3371
122k
      } else {
3372
41.9k
        switch (bw) {
3373
25.3k
          case 4:
3374
25.3k
            highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left,
3375
25.3k
                                              dy, bd);
3376
25.3k
            break;
3377
11.3k
          case 8:
3378
11.3k
            highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left,
3379
11.3k
                                              dy, bd);
3380
11.3k
            break;
3381
5.20k
          case 16:
3382
5.20k
            highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left,
3383
5.20k
                                               dy, bd);
3384
5.20k
            break;
3385
41.9k
        }
3386
41.9k
      }
3387
328k
    } else {
3388
328k
      if (bh + bh == bw) {
3389
178k
        switch (bh) {
3390
58.7k
          case 4:
3391
58.7k
            highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left,
3392
58.7k
                                             dy, bd);
3393
58.7k
            break;
3394
84.2k
          case 8:
3395
84.2k
            highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left,
3396
84.2k
                                              dy, bd);
3397
84.2k
            break;
3398
31.0k
          case 16:
3399
31.0k
            highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left,
3400
31.0k
                                               dy, bd);
3401
31.0k
            break;
3402
4.63k
          case 32:
3403
4.63k
            highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left,
3404
4.63k
                                               dy, bd);
3405
4.63k
            break;
3406
178k
        }
3407
178k
      } else {
3408
150k
        switch (bh) {
3409
69.1k
          case 4:
3410
69.1k
            highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left,
3411
69.1k
                                              dy, bd);
3412
69.1k
            break;
3413
61.7k
          case 8:
3414
61.7k
            highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left,
3415
61.7k
                                              dy, bd);
3416
61.7k
            break;
3417
19.3k
          case 16:
3418
19.3k
            highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left,
3419
19.3k
                                               dy, bd);
3420
19.3k
            break;
3421
150k
        }
3422
150k
      }
3423
328k
    }
3424
493k
  }
3425
1.14M
  return;
3426
1.14M
}
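
For example, an 8x16 block (bw < bh with bw + bw == bh) is handled by highbd_dr_prediction_z3_8x16_avx2, while the 4:1 shape 8x32 falls through to the second switch and uses highbd_dr_prediction_z3_8x32_avx2; the wide shapes mirror the same logic on bh.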
3427
3428
// Low bit depth functions
3429
static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
3430
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3431
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3432
  { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3433
    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3434
  { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3435
    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3436
  { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3437
    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3438
  { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3439
    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3440
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3441
    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3442
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3443
    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3444
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3445
    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0 },
3446
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
3447
    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0 },
3448
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
3449
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0 },
3450
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
3451
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
3452
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3453
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3454
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
3455
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3456
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3457
    0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
3458
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3459
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3460
    0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,
3461
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3462
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3463
    0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,
3464
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3465
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3466
    0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,
3467
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3468
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3469
    0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,
3470
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3471
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3472
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,
3473
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3474
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3475
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,
3476
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3477
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3478
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,
3479
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3480
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3481
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,
3482
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3483
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3484
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
3485
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3486
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3487
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3488
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3489
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3490
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3491
    0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0 },
3492
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3493
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3494
    0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0 },
3495
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3496
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3497
    0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0 },
3498
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3499
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3500
    0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0 },
3501
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3502
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3503
    0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0 },
3504
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3505
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3506
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0 },
3507
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3508
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3509
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0 },
3510
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3511
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3512
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0 },
3513
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3514
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3515
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
3516
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3517
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3518
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
3519
};
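
BaseMask[k] is a prefix mask with k leading 0xff bytes. The low-bit-depth zone-1 code below uses it with _mm_blendv_epi8 to keep the first k interpolated pixels of a row and pad the remainder with the replicated above[max_base_x] sample; an illustrative helper (not libaom API) showing the idiom:

static INLINE __m128i keep_first_k(__m128i fill, __m128i computed, int k) {
  // Bytes where the mask is 0xff come from `computed`, the rest from `fill`.
  return _mm_blendv_epi8(fill, computed, *(__m128i *)BaseMask[k]);
}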
3520
3521
/* clang-format on */
3522
static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_avx2(
3523
    int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
3524
1.04M
    int dx) {
3525
1.04M
  const int frac_bits = 6 - upsample_above;
3526
1.04M
  const int max_base_x = ((W + H) - 1) << upsample_above;
3527
3528
1.04M
  assert(dx > 0);
3529
  // pre-filter above pixels
3530
  // store in temp buffers:
3531
  //   above[x] * 32 + 16
3532
  //   above[x+1] - above[x]
3533
  // final pixels will be calculated as:
3534
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3535
1.04M
  __m256i a0, a1, a32, a16;
3536
1.04M
  __m256i diff, c3f;
3537
1.04M
  __m128i a_mbase_x;
3538
3539
1.04M
  a16 = _mm256_set1_epi16(16);
3540
1.04M
  a_mbase_x = _mm_set1_epi8((int8_t)above[max_base_x]);
3541
1.04M
  c3f = _mm256_set1_epi16(0x3f);
3542
3543
1.04M
  int x = dx;
3544
13.0M
  for (int r = 0; r < W; r++) {
3545
12.0M
    __m256i b, res, shift;
3546
12.0M
    __m128i res1, a0_128, a1_128;
3547
3548
12.0M
    int base = x >> frac_bits;
3549
12.0M
    int base_max_diff = (max_base_x - base) >> upsample_above;
3550
12.0M
    if (base_max_diff <= 0) {
3551
19.0k
      for (int i = r; i < W; ++i) {
3552
12.9k
        dst[i] = a_mbase_x;  // fill the remaining rows with above[max_base_x]
3553
12.9k
      }
3554
6.15k
      return;
3555
6.15k
    }
3556
11.9M
    if (base_max_diff > H) base_max_diff = H;
3557
11.9M
    a0_128 = _mm_loadu_si128((__m128i *)(above + base));
3558
11.9M
    a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
3559
3560
11.9M
    if (upsample_above) {
3561
2.27M
      a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]);
3562
2.27M
      a1_128 = _mm_srli_si128(a0_128, 8);
3563
3564
2.27M
      shift = _mm256_srli_epi16(
3565
2.27M
          _mm256_and_si256(
3566
2.27M
              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
3567
2.27M
          1);
3568
9.72M
    } else {
3569
9.72M
      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3570
9.72M
    }
3571
11.9M
    a0 = _mm256_cvtepu8_epi16(a0_128);
3572
11.9M
    a1 = _mm256_cvtepu8_epi16(a1_128);
3573
3574
11.9M
    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
3575
11.9M
    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
3576
11.9M
    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
3577
3578
11.9M
    b = _mm256_mullo_epi16(diff, shift);
3579
11.9M
    res = _mm256_add_epi16(a32, b);
3580
11.9M
    res = _mm256_srli_epi16(res, 5);
3581
3582
11.9M
    res = _mm256_packus_epi16(
3583
11.9M
        res, _mm256_castsi128_si256(
3584
11.9M
                 _mm256_extracti128_si256(res, 1)));  // narrow to 8 bit
3585
11.9M
    res1 = _mm256_castsi256_si128(res);               // 16 8bit values
3586
3587
11.9M
    dst[r] =
3588
11.9M
        _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]);
3589
11.9M
    x += dx;
3590
11.9M
  }
3591
1.04M
}
3592
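// Scalar reference for the interpolation implemented above (an illustrative
// sketch only, not part of the library; it assumes upsample_above == 0, so
// frac_bits == 6, and writes a plain row-major block instead of the __m128i
// rows produced by the vector routine). Each output pixel is the 5-bit blend
//   (above[b] * 32 + 16 + (above[b + 1] - above[b]) * shift) >> 5
// from the comment block above, with positions at or past max_base_x
// replicating the last reference sample.
static INLINE void dr_z1_scalar_sketch(uint8_t *dst, ptrdiff_t stride, int bw,
                                       int bh, const uint8_t *above, int dx) {
  const int max_base_x = bw + bh - 1;
  int x = dx;
  for (int r = 0; r < bh; ++r, x += dx) {
    const int base = x >> 6;            // integer sample position
    const int shift = (x & 0x3f) >> 1;  // 5-bit fractional weight
    for (int c = 0; c < bw; ++c) {
      const int b = base + c;
      dst[r * stride + c] =
          (b < max_base_x)
              ? (uint8_t)((above[b] * 32 + 16 +
                           (above[b + 1] - above[b]) * shift) >>
                          5)
              : above[max_base_x];
    }
  }
}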
3593
static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3594
                                      const uint8_t *above, int upsample_above,
3595
168k
                                      int dx) {
3596
168k
  __m128i dstvec[16];
3597
3598
168k
  dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, dx);
3599
1.11M
  for (int i = 0; i < N; i++) {
3600
942k
    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
3601
942k
  }
3602
168k
}
3603
3604
static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3605
                                      const uint8_t *above, int upsample_above,
3606
142k
                                      int dx) {
3607
142k
  __m128i dstvec[32];
3608
3609
142k
  dr_prediction_z1_HxW_internal_avx2(8, N, dstvec, above, upsample_above, dx);
3610
1.48M
  for (int i = 0; i < N; i++) {
3611
1.34M
    _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
3612
1.34M
  }
3613
142k
}
3614
3615
static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3616
                                       const uint8_t *above, int upsample_above,
3617
118k
                                       int dx) {
3618
118k
  __m128i dstvec[64];
3619
3620
118k
  dr_prediction_z1_HxW_internal_avx2(16, N, dstvec, above, upsample_above, dx);
3621
1.74M
  for (int i = 0; i < N; i++) {
3622
1.62M
    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
3623
1.62M
  }
3624
118k
}
3625
3626
static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2(
3627
154k
    int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) {
3628
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
3629
154k
  (void)upsample_above;
3630
154k
  const int frac_bits = 6;
3631
154k
  const int max_base_x = ((32 + N) - 1);
3632
3633
  // pre-filter above pixels
3634
  // store in temp buffers:
3635
  //   above[x] * 32 + 16
3636
  //   above[x+1] - above[x]
3637
  // final pixels will be calculated as:
3638
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3639
154k
  __m256i a0, a1, a32, a16;
3640
154k
  __m256i a_mbase_x, diff, c3f;
3641
3642
154k
  a16 = _mm256_set1_epi16(16);
3643
154k
  a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
3644
154k
  c3f = _mm256_set1_epi16(0x3f);
3645
3646
154k
  int x = dx;
3647
4.28M
  for (int r = 0; r < N; r++) {
3648
4.13M
    __m256i b, res, res16[2];
3649
4.13M
    __m128i a0_128, a1_128;
3650
3651
4.13M
    int base = x >> frac_bits;
3652
4.13M
    int base_max_diff = (max_base_x - base);
3653
4.13M
    if (base_max_diff <= 0) {
3654
0
      for (int i = r; i < N; ++i) {
3655
0
        dstvec[i] = a_mbase_x;  // save 32 values
3656
0
      }
3657
0
      return;
3658
0
    }
3659
4.13M
    if (base_max_diff > 32) base_max_diff = 32;
3660
4.13M
    __m256i shift =
3661
4.13M
        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3662
3663
12.3M
    for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
3664
8.26M
      int mdiff = base_max_diff - j;
3665
8.26M
      if (mdiff <= 0) {
3666
818
        res16[jj] = a_mbase_x;
3667
8.25M
      } else {
3668
8.25M
        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
3669
8.25M
        a1_128 = _mm_loadu_si128((__m128i *)(above + base + j + 1));
3670
8.25M
        a0 = _mm256_cvtepu8_epi16(a0_128);
3671
8.25M
        a1 = _mm256_cvtepu8_epi16(a1_128);
3672
3673
8.25M
        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
3674
8.25M
        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
3675
8.25M
        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
3676
8.25M
        b = _mm256_mullo_epi16(diff, shift);
3677
3678
8.25M
        res = _mm256_add_epi16(a32, b);
3679
8.25M
        res = _mm256_srli_epi16(res, 5);
3680
8.25M
        res16[jj] = _mm256_packus_epi16(
3681
8.25M
            res, _mm256_castsi128_si256(
3682
8.25M
                     _mm256_extracti128_si256(res, 1)));  // 16 8bit values
3683
8.25M
      }
3684
8.26M
    }
3685
4.13M
    res16[1] =
3686
4.13M
        _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]),
3687
4.13M
                                1);  // 32 8bit values
3688
3689
4.13M
    dstvec[r] = _mm256_blendv_epi8(
3690
4.13M
        a_mbase_x, res16[1],
3691
4.13M
        *(__m256i *)BaseMask[base_max_diff]);  // 32 8bit values
3692
4.13M
    x += dx;
3693
4.13M
  }
3694
154k
}
3695
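// Note on the 32-wide path above (descriptive only): each output row is
// produced as two 16-lane halves (the j loop), re-assembled with
// _mm256_inserti128_si256, and then blended against a_mbase_x through
// BaseMask[base_max_diff]. BaseMask[k] (declared earlier in this file) has
// 0xff in its first k bytes and zero elsewhere, so the blend keeps the first
// k interpolated pixels of the row and replicates above[max_base_x] for the
// remainder.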
3696
static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3697
                                       const uint8_t *above, int upsample_above,
3698
59.9k
                                       int dx) {
3699
59.9k
  __m256i dstvec[64];
3700
59.9k
  dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx);
3701
1.69M
  for (int i = 0; i < N; i++) {
3702
1.63M
    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
3703
1.63M
  }
3704
59.9k
}
3705
3706
static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3707
                                       const uint8_t *above, int upsample_above,
3708
35.6k
                                       int dx) {
3709
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
3710
35.6k
  (void)upsample_above;
3711
35.6k
  const int frac_bits = 6;
3712
35.6k
  const int max_base_x = ((64 + N) - 1);
3713
3714
  // pre-filter above pixels
3715
  // store in temp buffers:
3716
  //   above[x] * 32 + 16
3717
  //   above[x+1] - above[x]
3718
  // final pixels will be calculated as:
3719
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3720
35.6k
  __m256i a0, a1, a32, a16;
3721
35.6k
  __m256i a_mbase_x, diff, c3f;
3722
35.6k
  __m128i max_base_x128, base_inc128, mask128;
3723
3724
35.6k
  a16 = _mm256_set1_epi16(16);
3725
35.6k
  a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
3726
35.6k
  max_base_x128 = _mm_set1_epi8(max_base_x);
3727
35.6k
  c3f = _mm256_set1_epi16(0x3f);
3728
3729
35.6k
  int x = dx;
3730
1.86M
  for (int r = 0; r < N; r++, dst += stride) {
3731
1.83M
    __m256i b, res;
3732
1.83M
    int base = x >> frac_bits;
3733
1.83M
    if (base >= max_base_x) {
3734
0
      for (int i = r; i < N; ++i) {
3735
0
        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 32 values
3736
0
        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
3737
0
        dst += stride;
3738
0
      }
3739
0
      return;
3740
0
    }
3741
3742
1.83M
    __m256i shift =
3743
1.83M
        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3744
3745
1.83M
    __m128i a0_128, a1_128, res128;
3746
9.15M
    for (int j = 0; j < 64; j += 16) {
3747
7.32M
      int mdif = max_base_x - (base + j);
3748
7.32M
      if (mdif <= 0) {
3749
3.20k
        _mm_storeu_si128((__m128i *)(dst + j),
3750
3.20k
                         _mm256_castsi256_si128(a_mbase_x));
3751
7.32M
      } else {
3752
7.32M
        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
3753
7.32M
        a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
3754
7.32M
        a0 = _mm256_cvtepu8_epi16(a0_128);
3755
7.32M
        a1 = _mm256_cvtepu8_epi16(a1_128);
3756
3757
7.32M
        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
3758
7.32M
        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
3759
7.32M
        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
3760
7.32M
        b = _mm256_mullo_epi16(diff, shift);
3761
3762
7.32M
        res = _mm256_add_epi16(a32, b);
3763
7.32M
        res = _mm256_srli_epi16(res, 5);
3764
7.32M
        res = _mm256_packus_epi16(
3765
7.32M
            res, _mm256_castsi128_si256(
3766
7.32M
                     _mm256_extracti128_si256(res, 1)));  // 16 8bit values
3767
3768
7.32M
        base_inc128 =
3769
7.32M
            _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
3770
7.32M
                          (int8_t)(base + j + 2), (int8_t)(base + j + 3),
3771
7.32M
                          (int8_t)(base + j + 4), (int8_t)(base + j + 5),
3772
7.32M
                          (int8_t)(base + j + 6), (int8_t)(base + j + 7),
3773
7.32M
                          (int8_t)(base + j + 8), (int8_t)(base + j + 9),
3774
7.32M
                          (int8_t)(base + j + 10), (int8_t)(base + j + 11),
3775
7.32M
                          (int8_t)(base + j + 12), (int8_t)(base + j + 13),
3776
7.32M
                          (int8_t)(base + j + 14), (int8_t)(base + j + 15));
3777
3778
7.32M
        mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
3779
7.32M
                                 _mm_setzero_si128());
3780
7.32M
        res128 = _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x),
3781
7.32M
                                 _mm256_castsi256_si128(res), mask128);
3782
7.32M
        _mm_storeu_si128((__m128i *)(dst + j), res128);
3783
7.32M
      }
3784
7.32M
    }
3785
1.83M
    x += dx;
3786
1.83M
  }
3787
35.6k
}
3788
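// Note on the 64-wide path above (descriptive only): instead of indexing the
// BaseMask table, the right-edge mask for each 16-pixel chunk is computed
// arithmetically - base_inc128 holds the absolute base positions, and
// _mm_subs_epu8 / _mm_cmpgt_epi8 turn "position < max_base_x" into a byte
// mask - and the blended result is stored straight to dst rather than being
// staged in a dstvec[] array.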
3789
// Directional prediction, zone 1: 0 < angle < 90
3790
void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
3791
                               const uint8_t *above, const uint8_t *left,
3792
500k
                               int upsample_above, int dx, int dy) {
3793
500k
  (void)left;
3794
500k
  (void)dy;
3795
500k
  switch (bw) {
3796
168k
    case 4:
3797
168k
      dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx);
3798
168k
      break;
3799
142k
    case 8:
3800
142k
      dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx);
3801
142k
      break;
3802
118k
    case 16:
3803
118k
      dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx);
3804
118k
      break;
3805
56.6k
    case 32:
3806
56.6k
      dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx);
3807
56.6k
      break;
3808
13.0k
    case 64:
3809
13.0k
      dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx);
3810
13.0k
      break;
3811
0
    default: break;
3812
500k
  }
3813
500k
  return;
3814
500k
}
3815
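// Minimal illustration of driving the zone-1 dispatcher above (a sketch with
// hypothetical values, not how libaom itself invokes it: the real caller sits
// in av1's reconstruction code and derives dx from the prediction angle).
// dx = 64 advances the reference position by exactly one pixel per row, i.e.
// a 45 degree diagonal; left and dy are unused in zone 1.
static INLINE void dr_z1_usage_sketch(uint8_t *dst, ptrdiff_t stride) {
  uint8_t above[64];
  for (int i = 0; i < 64; ++i) above[i] = 128;  // flat reference row
  av1_dr_prediction_z1_avx2(dst, stride, /*bw=*/16, /*bh=*/16, above,
                            /*left=*/NULL, /*upsample_above=*/0, /*dx=*/64,
                            /*dy=*/0);
}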
3816
static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3817
                                      const uint8_t *above, const uint8_t *left,
3818
                                      int upsample_above, int upsample_left,
3819
1.02M
                                      int dx, int dy) {
3820
1.02M
  const int min_base_x = -(1 << upsample_above);
3821
1.02M
  const int min_base_y = -(1 << upsample_left);
3822
1.02M
  const int frac_bits_x = 6 - upsample_above;
3823
1.02M
  const int frac_bits_y = 6 - upsample_left;
3824
3825
1.02M
  assert(dx > 0);
3826
  // pre-filter above pixels
3827
  // store in temp buffers:
3828
  //   above[x] * 32 + 16
3829
  //   above[x+1] - above[x]
3830
  // final pixels will be calculated as:
3831
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3832
1.02M
  __m128i a0_x, a1_x, a32, a16, diff;
3833
1.02M
  __m128i c3f, min_base_y128, c1234, dy128;
3834
3835
1.02M
  a16 = _mm_set1_epi16(16);
3836
1.02M
  c3f = _mm_set1_epi16(0x3f);
3837
1.02M
  min_base_y128 = _mm_set1_epi16(min_base_y);
3838
1.02M
  c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
3839
1.02M
  dy128 = _mm_set1_epi16(dy);
3840
3841
5.62M
  for (int r = 0; r < N; r++) {
3842
4.59M
    __m128i b, res, shift, r6, ydx;
3843
4.59M
    __m128i resx, resy, resxy;
3844
4.59M
    __m128i a0_x128, a1_x128;
3845
4.59M
    int y = r + 1;
3846
4.59M
    int base_x = (-y * dx) >> frac_bits_x;
3847
4.59M
    int base_shift = 0;
3848
4.59M
    if (base_x < (min_base_x - 1)) {
3849
3.93M
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
3850
3.93M
    }
3851
4.59M
    int base_min_diff =
3852
4.59M
        (min_base_x - base_x + upsample_above) >> upsample_above;
3853
4.59M
    if (base_min_diff > 4) {
3854
2.77M
      base_min_diff = 4;
3855
2.77M
    } else {
3856
1.82M
      if (base_min_diff < 0) base_min_diff = 0;
3857
1.82M
    }
3858
3859
4.59M
    if (base_shift > 3) {
3860
2.77M
      a0_x = _mm_setzero_si128();
3861
2.77M
      a1_x = _mm_setzero_si128();
3862
2.77M
      shift = _mm_setzero_si128();
3863
2.77M
    } else {
3864
1.82M
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
3865
1.82M
      ydx = _mm_set1_epi16(y * dx);
3866
1.82M
      r6 = _mm_slli_epi16(c1234, 6);
3867
3868
1.82M
      if (upsample_above) {
3869
340k
        a0_x128 =
3870
340k
            _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
3871
340k
        a1_x128 = _mm_srli_si128(a0_x128, 8);
3872
3873
340k
        shift = _mm_srli_epi16(
3874
340k
            _mm_and_si128(
3875
340k
                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
3876
340k
            1);
3877
1.48M
      } else {
3878
1.48M
        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
3879
1.48M
        a1_x128 = _mm_srli_si128(a0_x128, 1);
3880
3881
1.48M
        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
3882
1.48M
      }
3883
1.82M
      a0_x = _mm_cvtepu8_epi16(a0_x128);
3884
1.82M
      a1_x = _mm_cvtepu8_epi16(a1_x128);
3885
1.82M
    }
3886
    // y calc
3887
4.59M
    __m128i a0_y, a1_y, shifty;
3888
4.59M
    if (base_x < min_base_x) {
3889
4.33M
      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
3890
4.33M
      __m128i y_c128, base_y_c128, mask128, c1234_;
3891
4.33M
      c1234_ = _mm_srli_si128(c1234, 2);
3892
4.33M
      r6 = _mm_set1_epi16(r << 6);
3893
4.33M
      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128));
3894
4.33M
      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
3895
4.33M
      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
3896
4.33M
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
3897
4.33M
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
3898
3899
4.33M
      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
3900
4.33M
                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
3901
4.33M
      base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4));
3902
4.33M
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
3903
4.33M
      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
3904
4.33M
                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
3905
3906
4.33M
      if (upsample_left) {
3907
3.63M
        shifty = _mm_srli_epi16(
3908
3.63M
            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
3909
3.63M
      } else {
3910
705k
        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
3911
705k
      }
3912
4.33M
      a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
3913
4.33M
      a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
3914
4.33M
      shift = _mm_unpacklo_epi64(shift, shifty);
3915
4.33M
    }
3916
3917
4.59M
    diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
3918
4.59M
    a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
3919
4.59M
    a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16
3920
3921
4.59M
    b = _mm_mullo_epi16(diff, shift);
3922
4.59M
    res = _mm_add_epi16(a32, b);
3923
4.59M
    res = _mm_srli_epi16(res, 5);
3924
3925
4.59M
    resx = _mm_packus_epi16(res, res);
3926
4.59M
    resy = _mm_srli_si128(resx, 4);
3927
3928
4.59M
    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
3929
4.59M
    *(int *)(dst) = _mm_cvtsi128_si32(resxy);
3930
4.59M
    dst += stride;
3931
4.59M
  }
3932
1.02M
}
3933
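// Scalar view of one zone-2 output pixel (an illustrative sketch, not
// library code; it assumes upsample_above == upsample_left == 0 and that
// above[-1] / left[-1], the top-left reference sample, are addressable).
// Column c of row r projects onto the top row when the projected position is
// still in range, and onto the left column otherwise, which is what the
// BaseMask blend of resx and resy selects per lane in the routine above.
static INLINE uint8_t dr_z2_pixel_sketch(const uint8_t *above,
                                         const uint8_t *left, int r, int c,
                                         int dx, int dy) {
  const int x = (c << 6) - (r + 1) * dx;  // position on the top row, 1/64 pel
  const int base_x = x >> 6;
  if (base_x >= -1) {
    const int shift = (x & 0x3f) >> 1;
    return (uint8_t)((above[base_x] * 32 + 16 +
                      (above[base_x + 1] - above[base_x]) * shift) >>
                     5);
  }
  const int y = (r << 6) - (c + 1) * dy;  // position on the left column
  const int base_y = y >> 6;
  const int shift = (y & 0x3f) >> 1;
  return (uint8_t)((left[base_y] * 32 + 16 +
                    (left[base_y + 1] - left[base_y]) * shift) >>
                   5);
}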
3934
static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3935
                                      const uint8_t *above, const uint8_t *left,
3936
                                      int upsample_above, int upsample_left,
3937
265k
                                      int dx, int dy) {
3938
265k
  const int min_base_x = -(1 << upsample_above);
3939
265k
  const int min_base_y = -(1 << upsample_left);
3940
265k
  const int frac_bits_x = 6 - upsample_above;
3941
265k
  const int frac_bits_y = 6 - upsample_left;
3942
3943
  // pre-filter above pixels
3944
  // store in temp buffers:
3945
  //   above[x] * 32 + 16
3946
  //   above[x+1] - above[x]
3947
  // final pixels will be calculated as:
3948
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3949
265k
  __m256i diff, a32, a16;
3950
265k
  __m256i a0_x, a1_x;
3951
265k
  __m128i a0_x128, a1_x128, min_base_y128, c3f;
3952
265k
  __m128i c1234, dy128;
3953
3954
265k
  a16 = _mm256_set1_epi16(16);
3955
265k
  c3f = _mm_set1_epi16(0x3f);
3956
265k
  min_base_y128 = _mm_set1_epi16(min_base_y);
3957
265k
  dy128 = _mm_set1_epi16(dy);
3958
265k
  c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
3959
3960
2.67M
  for (int r = 0; r < N; r++) {
3961
2.40M
    __m256i b, res, shift;
3962
2.40M
    __m128i resx, resy, resxy, r6, ydx;
3963
3964
2.40M
    int y = r + 1;
3965
2.40M
    int base_x = (-y * dx) >> frac_bits_x;
3966
2.40M
    int base_shift = 0;
3967
2.40M
    if (base_x < (min_base_x - 1)) {
3968
1.82M
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
3969
1.82M
    }
3970
2.40M
    int base_min_diff =
3971
2.40M
        (min_base_x - base_x + upsample_above) >> upsample_above;
3972
2.40M
    if (base_min_diff > 8) {
3973
1.06M
      base_min_diff = 8;
3974
1.34M
    } else {
3975
1.34M
      if (base_min_diff < 0) base_min_diff = 0;
3976
1.34M
    }
3977
3978
2.40M
    if (base_shift > 7) {
3979
1.06M
      a0_x = _mm256_setzero_si256();
3980
1.06M
      a1_x = _mm256_setzero_si256();
3981
1.06M
      shift = _mm256_setzero_si256();
3982
1.34M
    } else {
3983
1.34M
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
3984
1.34M
      ydx = _mm_set1_epi16(y * dx);
3985
1.34M
      r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
3986
1.34M
      if (upsample_above) {
3987
454k
        a0_x128 =
3988
454k
            _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
3989
454k
        a1_x128 = _mm_srli_si128(a0_x128, 8);
3990
3991
454k
        shift = _mm256_castsi128_si256(_mm_srli_epi16(
3992
454k
            _mm_and_si128(
3993
454k
                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
3994
454k
            1));
3995
889k
      } else {
3996
889k
        a1_x128 = _mm_srli_si128(a0_x128, 1);
3997
889k
        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
3998
889k
        a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
3999
4000
889k
        shift = _mm256_castsi128_si256(
4001
889k
            _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1));
4002
889k
      }
4003
1.34M
      a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128));
4004
1.34M
      a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128));
4005
1.34M
    }
4006
4007
    // y calc
4008
2.40M
    __m128i a0_y, a1_y, shifty;
4009
2.40M
    if (base_x < min_base_x) {
4010
2.02M
      DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
4011
2.02M
      __m128i y_c128, base_y_c128, mask128;
4012
2.02M
      r6 = _mm_set1_epi16(r << 6);
4013
2.02M
      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
4014
2.02M
      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
4015
2.02M
      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
4016
2.02M
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
4017
2.02M
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
4018
4019
2.02M
      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
4020
2.02M
                            left[base_y_c[2]], left[base_y_c[3]],
4021
2.02M
                            left[base_y_c[4]], left[base_y_c[5]],
4022
2.02M
                            left[base_y_c[6]], left[base_y_c[7]]);
4023
2.02M
      base_y_c128 = _mm_add_epi16(
4024
2.02M
          base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4));
4025
2.02M
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
4026
4027
2.02M
      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
4028
2.02M
                            left[base_y_c[2]], left[base_y_c[3]],
4029
2.02M
                            left[base_y_c[4]], left[base_y_c[5]],
4030
2.02M
                            left[base_y_c[6]], left[base_y_c[7]]);
4031
4032
2.02M
      if (upsample_left) {
4033
597k
        shifty = _mm_srli_epi16(
4034
597k
            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
4035
1.43M
      } else {
4036
1.43M
        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
4037
1.43M
      }
4038
4039
2.02M
      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
4040
2.02M
      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
4041
2.02M
      shift = _mm256_inserti128_si256(shift, shifty, 1);
4042
2.02M
    }
4043
4044
2.40M
    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
4045
2.40M
    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
4046
2.40M
    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
4047
4048
2.40M
    b = _mm256_mullo_epi16(diff, shift);
4049
2.40M
    res = _mm256_add_epi16(a32, b);
4050
2.40M
    res = _mm256_srli_epi16(res, 5);
4051
4052
2.40M
    resx = _mm_packus_epi16(_mm256_castsi256_si128(res),
4053
2.40M
                            _mm256_castsi256_si128(res));
4054
2.40M
    resy = _mm256_extracti128_si256(res, 1);
4055
2.40M
    resy = _mm_packus_epi16(resy, resy);
4056
4057
2.40M
    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
4058
2.40M
    _mm_storel_epi64((__m128i *)(dst), resxy);
4059
2.40M
    dst += stride;
4060
2.40M
  }
4061
265k
}
4062
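// Note on the Nx8 variant above (descriptive only): the above-row samples
// occupy the low 128-bit lane of a0_x / a1_x / shift and the left-column
// samples are inserted into the high lane, so a single 256-bit
// mullo/add/srli sequence interpolates both candidate results at once; the
// final BaseMask blend then picks, per pixel, whichever side is valid.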
4063
static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst,
4064
                                      ptrdiff_t stride, const uint8_t *above,
4065
                                      const uint8_t *left, int upsample_above,
4066
434k
                                      int upsample_left, int dx, int dy) {
4067
  // here upsample_above and upsample_left are 0 by design of
4068
  // av1_use_intra_edge_upsample
4069
434k
  const int min_base_x = -1;
4070
434k
  const int min_base_y = -1;
4071
434k
  (void)upsample_above;
4072
434k
  (void)upsample_left;
4073
434k
  const int frac_bits_x = 6;
4074
434k
  const int frac_bits_y = 6;
4075
4076
434k
  __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123;
4077
434k
  __m256i diff, min_base_y256, c3f, shifty, dy256, c1;
4078
434k
  __m128i a0_x128, a1_x128;
4079
4080
434k
  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
4081
434k
  a16 = _mm256_set1_epi16(16);
4082
434k
  c1 = _mm256_srli_epi16(a16, 4);
4083
434k
  min_base_y256 = _mm256_set1_epi16(min_base_y);
4084
434k
  c3f = _mm256_set1_epi16(0x3f);
4085
434k
  dy256 = _mm256_set1_epi16(dy);
4086
434k
  c0123 =
4087
434k
      _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
4088
434k
  c1234 = _mm256_add_epi16(c0123, c1);
4089
4090
8.79M
  for (int r = 0; r < H; r++) {
4091
8.35M
    __m256i b, res, shift, j256, r6, ydx;
4092
8.35M
    __m128i resx, resy;
4093
8.35M
    __m128i resxy;
4094
8.35M
    int y = r + 1;
4095
8.35M
    ydx = _mm256_set1_epi16((int16_t)(y * dx));
4096
4097
8.35M
    int base_x = (-y * dx) >> frac_bits_x;
4098
24.0M
    for (int j = 0; j < W; j += 16) {
4099
15.6M
      j256 = _mm256_set1_epi16(j);
4100
15.6M
      int base_shift = 0;
4101
15.6M
      if ((base_x + j) < (min_base_x - 1)) {
4102
11.9M
        base_shift = (min_base_x - (base_x + j) - 1);
4103
11.9M
      }
4104
15.6M
      int base_min_diff = (min_base_x - base_x - j);
4105
15.6M
      if (base_min_diff > 16) {
4106
9.17M
        base_min_diff = 16;
4107
9.17M
      } else {
4108
6.50M
        if (base_min_diff < 0) base_min_diff = 0;
4109
6.50M
      }
4110
4111
15.6M
      if (base_shift < 16) {
4112
6.50M
        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
4113
6.50M
        a1_x128 =
4114
6.50M
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
4115
6.50M
        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
4116
6.50M
        a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
4117
4118
6.50M
        a0_x = _mm256_cvtepu8_epi16(a0_x128);
4119
6.50M
        a1_x = _mm256_cvtepu8_epi16(a1_x128);
4120
4121
6.50M
        r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
4122
6.50M
        shift = _mm256_srli_epi16(
4123
6.50M
            _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
4124
4125
6.50M
        diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
4126
6.50M
        a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
4127
6.50M
        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
4128
4129
6.50M
        b = _mm256_mullo_epi16(diff, shift);
4130
6.50M
        res = _mm256_add_epi16(a32, b);
4131
6.50M
        res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
4132
6.50M
        resx = _mm256_castsi256_si128(_mm256_packus_epi16(
4133
6.50M
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
4134
9.17M
      } else {
4135
9.17M
        resx = _mm_setzero_si128();
4136
9.17M
      }
4137
4138
      // y calc
4139
15.6M
      if (base_x < min_base_x) {
4140
14.7M
        __m256i c256, y_c256, base_y_c256, mask256, mul16;
4141
14.7M
        r6 = _mm256_set1_epi16(r << 6);
4142
14.7M
        c256 = _mm256_add_epi16(j256, c1234);
4143
14.7M
        mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
4144
14.7M
                                 _mm256_srli_epi16(min_base_y256, 1));
4145
14.7M
        y_c256 = _mm256_sub_epi16(r6, mul16);
4146
4147
14.7M
        base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
4148
14.7M
        mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
4149
4150
14.7M
        base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256);
4151
14.7M
        int16_t min_y = (int16_t)_mm_extract_epi16(
4152
14.7M
            _mm256_extracti128_si256(base_y_c256, 1), 7);
4153
14.7M
        int16_t max_y =
4154
14.7M
            (int16_t)_mm_extract_epi16(_mm256_castsi256_si128(base_y_c256), 0);
4155
14.7M
        int16_t offset_diff = max_y - min_y;
4156
4157
14.7M
        if (offset_diff < 16) {
4158
14.0M
          __m256i min_y256 = _mm256_set1_epi16(min_y);
4159
4160
14.0M
          __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256);
4161
14.0M
          __m128i base_y_offset128 =
4162
14.0M
              _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0),
4163
14.0M
                              _mm256_extracti128_si256(base_y_offset, 1));
4164
4165
14.0M
          __m128i a0_y128 = _mm_maskload_epi32(
4166
14.0M
              (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]);
4167
14.0M
          __m128i a1_y128 =
4168
14.0M
              _mm_maskload_epi32((int *)(left + min_y + 1),
4169
14.0M
                                 *(__m128i *)LoadMaskz2[offset_diff / 4]);
4170
14.0M
          a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128);
4171
14.0M
          a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128);
4172
14.0M
          a0_y = _mm256_cvtepu8_epi16(a0_y128);
4173
14.0M
          a1_y = _mm256_cvtepu8_epi16(a1_y128);
4174
14.0M
        } else {
4175
707k
          base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
4176
707k
          _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
4177
4178
707k
          a0_y = _mm256_setr_epi16(
4179
707k
              left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
4180
707k
              left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
4181
707k
              left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
4182
707k
              left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
4183
707k
              left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
4184
707k
              left[base_y_c[15]]);
4185
707k
          base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
4186
707k
          _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
4187
4188
707k
          a1_y = _mm256_setr_epi16(
4189
707k
              left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
4190
707k
              left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
4191
707k
              left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
4192
707k
              left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
4193
707k
              left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
4194
707k
              left[base_y_c[15]]);
4195
707k
        }
4196
14.7M
        shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
4197
4198
14.7M
        diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
4199
14.7M
        a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
4200
14.7M
        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
4201
4202
14.7M
        b = _mm256_mullo_epi16(diff, shifty);
4203
14.7M
        res = _mm256_add_epi16(a32, b);
4204
14.7M
        res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
4205
14.7M
        resy = _mm256_castsi256_si128(_mm256_packus_epi16(
4206
14.7M
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
4207
14.7M
      } else {
4208
966k
        resy = _mm_setzero_si128();
4209
966k
      }
4210
15.6M
      resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
4211
15.6M
      _mm_storeu_si128((__m128i *)(dst + j), resxy);
4212
15.6M
    }  // for j
4213
8.35M
    dst += stride;
4214
8.35M
  }
4215
434k
}
4216
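// Note on the left-pixel gather above (descriptive only): when the 16 lane
// indices into left[] for a chunk span fewer than 16 distinct positions
// (offset_diff < 16), a contiguous run of left[] starting at min_y is fetched
// once with a masked load and _mm_shuffle_epi8 redistributes it so that lane
// i receives left[base_y_c[i]] and left[base_y_c[i] + 1]; only when the span
// is wider does the code fall back to the 16 scalar left[] lookups built with
// _mm256_setr_epi16.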
4217
// Directional prediction, zone 2: 90 < angle < 180
4218
void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
4219
                               const uint8_t *above, const uint8_t *left,
4220
                               int upsample_above, int upsample_left, int dx,
4221
1.72M
                               int dy) {
4222
1.72M
  assert(dx > 0);
4223
1.72M
  assert(dy > 0);
4224
1.72M
  switch (bw) {
4225
1.02M
    case 4:
4226
1.02M
      dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above,
4227
1.02M
                                upsample_left, dx, dy);
4228
1.02M
      break;
4229
265k
    case 8:
4230
265k
      dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above,
4231
265k
                                upsample_left, dx, dy);
4232
265k
      break;
4233
434k
    default:
4234
434k
      dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
4235
434k
                                upsample_above, upsample_left, dx, dy);
4236
434k
      break;
4237
1.72M
  }
4238
1.72M
  return;
4239
1.72M
}
4240
4241
// z3 functions (zone 3: 180 < angle < 270, predicted from the left column)
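// transpose16x32_avx2 (below) transposes a 16-row by 32-column byte tile: on
// return, the low 128-bit lane of d[j] holds output row j and the high lane
// holds output row j + 16, which is how the zone-3 callers scatter the
// columns of a zone-1 result back into raster order.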
4242
159k
static INLINE void transpose16x32_avx2(__m256i *x, __m256i *d) {
4243
159k
  __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
4244
159k
  __m256i w10, w11, w12, w13, w14, w15;
4245
4246
159k
  w0 = _mm256_unpacklo_epi8(x[0], x[1]);
4247
159k
  w1 = _mm256_unpacklo_epi8(x[2], x[3]);
4248
159k
  w2 = _mm256_unpacklo_epi8(x[4], x[5]);
4249
159k
  w3 = _mm256_unpacklo_epi8(x[6], x[7]);
4250
4251
159k
  w8 = _mm256_unpacklo_epi8(x[8], x[9]);
4252
159k
  w9 = _mm256_unpacklo_epi8(x[10], x[11]);
4253
159k
  w10 = _mm256_unpacklo_epi8(x[12], x[13]);
4254
159k
  w11 = _mm256_unpacklo_epi8(x[14], x[15]);
4255
4256
159k
  w4 = _mm256_unpacklo_epi16(w0, w1);
4257
159k
  w5 = _mm256_unpacklo_epi16(w2, w3);
4258
159k
  w12 = _mm256_unpacklo_epi16(w8, w9);
4259
159k
  w13 = _mm256_unpacklo_epi16(w10, w11);
4260
4261
159k
  w6 = _mm256_unpacklo_epi32(w4, w5);
4262
159k
  w7 = _mm256_unpackhi_epi32(w4, w5);
4263
159k
  w14 = _mm256_unpacklo_epi32(w12, w13);
4264
159k
  w15 = _mm256_unpackhi_epi32(w12, w13);
4265
4266
  // Store first 4-line result
4267
159k
  d[0] = _mm256_unpacklo_epi64(w6, w14);
4268
159k
  d[1] = _mm256_unpackhi_epi64(w6, w14);
4269
159k
  d[2] = _mm256_unpacklo_epi64(w7, w15);
4270
159k
  d[3] = _mm256_unpackhi_epi64(w7, w15);
4271
4272
159k
  w4 = _mm256_unpackhi_epi16(w0, w1);
4273
159k
  w5 = _mm256_unpackhi_epi16(w2, w3);
4274
159k
  w12 = _mm256_unpackhi_epi16(w8, w9);
4275
159k
  w13 = _mm256_unpackhi_epi16(w10, w11);
4276
4277
159k
  w6 = _mm256_unpacklo_epi32(w4, w5);
4278
159k
  w7 = _mm256_unpackhi_epi32(w4, w5);
4279
159k
  w14 = _mm256_unpacklo_epi32(w12, w13);
4280
159k
  w15 = _mm256_unpackhi_epi32(w12, w13);
4281
4282
  // Store second 4-line result
4283
159k
  d[4] = _mm256_unpacklo_epi64(w6, w14);
4284
159k
  d[5] = _mm256_unpackhi_epi64(w6, w14);
4285
159k
  d[6] = _mm256_unpacklo_epi64(w7, w15);
4286
159k
  d[7] = _mm256_unpackhi_epi64(w7, w15);
4287
4288
  // upper half
4289
159k
  w0 = _mm256_unpackhi_epi8(x[0], x[1]);
4290
159k
  w1 = _mm256_unpackhi_epi8(x[2], x[3]);
4291
159k
  w2 = _mm256_unpackhi_epi8(x[4], x[5]);
4292
159k
  w3 = _mm256_unpackhi_epi8(x[6], x[7]);
4293
4294
159k
  w8 = _mm256_unpackhi_epi8(x[8], x[9]);
4295
159k
  w9 = _mm256_unpackhi_epi8(x[10], x[11]);
4296
159k
  w10 = _mm256_unpackhi_epi8(x[12], x[13]);
4297
159k
  w11 = _mm256_unpackhi_epi8(x[14], x[15]);
4298
4299
159k
  w4 = _mm256_unpacklo_epi16(w0, w1);
4300
159k
  w5 = _mm256_unpacklo_epi16(w2, w3);
4301
159k
  w12 = _mm256_unpacklo_epi16(w8, w9);
4302
159k
  w13 = _mm256_unpacklo_epi16(w10, w11);
4303
4304
159k
  w6 = _mm256_unpacklo_epi32(w4, w5);
4305
159k
  w7 = _mm256_unpackhi_epi32(w4, w5);
4306
159k
  w14 = _mm256_unpacklo_epi32(w12, w13);
4307
159k
  w15 = _mm256_unpackhi_epi32(w12, w13);
4308
4309
  // Store first 4-line result
4310
159k
  d[8] = _mm256_unpacklo_epi64(w6, w14);
4311
159k
  d[9] = _mm256_unpackhi_epi64(w6, w14);
4312
159k
  d[10] = _mm256_unpacklo_epi64(w7, w15);
4313
159k
  d[11] = _mm256_unpackhi_epi64(w7, w15);
4314
4315
159k
  w4 = _mm256_unpackhi_epi16(w0, w1);
4316
159k
  w5 = _mm256_unpackhi_epi16(w2, w3);
4317
159k
  w12 = _mm256_unpackhi_epi16(w8, w9);
4318
159k
  w13 = _mm256_unpackhi_epi16(w10, w11);
4319
4320
159k
  w6 = _mm256_unpacklo_epi32(w4, w5);
4321
159k
  w7 = _mm256_unpackhi_epi32(w4, w5);
4322
159k
  w14 = _mm256_unpacklo_epi32(w12, w13);
4323
159k
  w15 = _mm256_unpackhi_epi32(w12, w13);
4324
4325
  // Store second 4-line result
4326
159k
  d[12] = _mm256_unpacklo_epi64(w6, w14);
4327
159k
  d[13] = _mm256_unpackhi_epi64(w6, w14);
4328
159k
  d[14] = _mm256_unpacklo_epi64(w7, w15);
4329
159k
  d[15] = _mm256_unpackhi_epi64(w7, w15);
4330
159k
}
4331
4332
static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride,
4333
                                      const uint8_t *left, int upsample_left,
4334
121k
                                      int dy) {
4335
121k
  __m128i dstvec[4], d[4];
4336
4337
121k
  dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy);
4338
121k
  transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
4339
121k
                            &d[0], &d[1], &d[2], &d[3]);
4340
4341
121k
  *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
4342
121k
  *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
4343
121k
  *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
4344
121k
  *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
4345
121k
  return;
4346
121k
}
4347
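// Zone-3 strategy (descriptive note): these prediction routines reuse the
// zone-1 kernels with `left` as the reference and dy as the step, producing
// the block column-by-column in registers, and then transpose the result
// into dst - either with the dedicated transpose4x8/8x8/16x16/16x32 helpers,
// as in dr_prediction_z3_4x4_avx2 above, or through a temporary buffer plus
// the generic transpose() for the 64-pixel-tall shapes further below.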
4348
static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride,
4349
                                      const uint8_t *left, int upsample_left,
4350
102k
                                      int dy) {
4351
102k
  __m128i dstvec[8], d[8];
4352
4353
102k
  dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy);
4354
102k
  transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
4355
102k
                    &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
4356
102k
                    &d[3]);
4357
4358
102k
  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
4359
102k
  _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
4360
102k
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
4361
102k
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
4362
102k
  _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
4363
102k
  _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
4364
102k
  _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
4365
102k
  _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
4366
102k
}
4367
4368
static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride,
4369
                                      const uint8_t *left, int upsample_left,
4370
29.4k
                                      int dy) {
4371
29.4k
  __m128i dstvec[4], d[8];
4372
4373
29.4k
  dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy);
4374
29.4k
  transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
4375
29.4k
                        &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
4376
264k
  for (int i = 0; i < 8; i++) {
4377
235k
    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
4378
235k
  }
4379
29.4k
}
4380
4381
static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride,
4382
                                      const uint8_t *left, int upsample_left,
4383
51.6k
                                      int dy) {
4384
51.6k
  __m128i dstvec[8], d[4];
4385
4386
51.6k
  dr_prediction_z1_HxW_internal_avx2(4, 8, dstvec, left, upsample_left, dy);
4387
51.6k
  transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
4388
51.6k
                        &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
4389
51.6k
                        &d[1], &d[2], &d[3]);
4390
51.6k
  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
4391
51.6k
  _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
4392
51.6k
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
4393
51.6k
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
4394
51.6k
}
4395
4396
static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride,
4397
                                       const uint8_t *left, int upsample_left,
4398
24.2k
                                       int dy) {
4399
24.2k
  __m128i dstvec[8], d[8];
4400
4401
24.2k
  dr_prediction_z1_HxW_internal_avx2(16, 8, dstvec, left, upsample_left, dy);
4402
24.2k
  transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
4403
24.2k
                          dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
4404
24.2k
                          d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
4405
218k
  for (int i = 0; i < 8; i++) {
4406
194k
    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
4407
194k
    _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
4408
194k
                     _mm_srli_si128(d[i], 8));
4409
194k
  }
4410
24.2k
}
4411
4412
static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
4413
                                       const uint8_t *left, int upsample_left,
4414
47.9k
                                       int dy) {
4415
47.9k
  __m128i dstvec[16], d[16];
4416
4417
47.9k
  dr_prediction_z1_HxW_internal_avx2(8, 16, dstvec, left, upsample_left, dy);
4418
47.9k
  transpose16x8_8x16_sse2(
4419
47.9k
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
4420
47.9k
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
4421
47.9k
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
4422
47.9k
      &d[3], &d[4], &d[5], &d[6], &d[7]);
4423
4424
431k
  for (int i = 0; i < 8; i++) {
4425
383k
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
4426
383k
  }
4427
47.9k
}
4428
4429
static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride,
4430
                                       const uint8_t *left, int upsample_left,
4431
16.1k
                                       int dy) {
4432
16.1k
  __m128i dstvec[4], d[16];
4433
4434
16.1k
  dr_prediction_z1_HxW_internal_avx2(16, 4, dstvec, left, upsample_left, dy);
4435
16.1k
  transpose4x16_sse2(dstvec, d);
4436
274k
  for (int i = 0; i < 16; i++) {
4437
258k
    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
4438
258k
  }
4439
16.1k
}
4440
4441
static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride,
4442
                                       const uint8_t *left, int upsample_left,
4443
61.2k
                                       int dy) {
4444
61.2k
  __m128i dstvec[16], d[8];
4445
4446
61.2k
  dr_prediction_z1_HxW_internal_avx2(4, 16, dstvec, left, upsample_left, dy);
4447
306k
  for (int i = 4; i < 8; i++) {
4448
244k
    d[i] = _mm_setzero_si128();
4449
244k
  }
4450
61.2k
  transpose16x8_8x16_sse2(
4451
61.2k
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
4452
61.2k
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
4453
61.2k
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
4454
61.2k
      &d[3], &d[4], &d[5], &d[6], &d[7]);
4455
4456
306k
  for (int i = 0; i < 4; i++) {
4457
244k
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
4458
244k
  }
4459
61.2k
}
4460
4461
static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride,
4462
                                       const uint8_t *left, int upsample_left,
4463
8.40k
                                       int dy) {
4464
8.40k
  __m256i dstvec[16], d[16];
4465
4466
8.40k
  dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy);
4467
75.6k
  for (int i = 8; i < 16; i++) {
4468
67.2k
    dstvec[i] = _mm256_setzero_si256();
4469
67.2k
  }
4470
8.40k
  transpose16x32_avx2(dstvec, d);
4471
4472
142k
  for (int i = 0; i < 16; i++) {
4473
134k
    _mm_storel_epi64((__m128i *)(dst + i * stride),
4474
134k
                     _mm256_castsi256_si128(d[i]));
4475
134k
  }
4476
142k
  for (int i = 0; i < 16; i++) {
4477
134k
    _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride),
4478
134k
                     _mm256_extracti128_si256(d[i], 1));
4479
134k
  }
4480
8.40k
}
4481
4482
static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride,
4483
                                       const uint8_t *left, int upsample_left,
4484
43.2k
                                       int dy) {
4485
43.2k
  __m128i dstvec[32], d[16];
4486
4487
43.2k
  dr_prediction_z1_HxW_internal_avx2(8, 32, dstvec, left, upsample_left, dy);
4488
4489
43.2k
  transpose16x8_8x16_sse2(
4490
43.2k
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
4491
43.2k
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
4492
43.2k
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
4493
43.2k
      &d[3], &d[4], &d[5], &d[6], &d[7]);
4494
43.2k
  transpose16x8_8x16_sse2(
4495
43.2k
      &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
4496
43.2k
      &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
4497
43.2k
      &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
4498
43.2k
      &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
4499
43.2k
      &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
4500
43.2k
      &d[6 + 8], &d[7 + 8]);
4501
4502
389k
  for (int i = 0; i < 8; i++) {
4503
345k
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
4504
345k
    _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
4505
345k
  }
4506
43.2k
}
4507
4508
static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
4509
                                        const uint8_t *left, int upsample_left,
4510
85.7k
                                        int dy) {
4511
85.7k
  __m128i dstvec[16], d[16];
4512
4513
85.7k
  dr_prediction_z1_HxW_internal_avx2(16, 16, dstvec, left, upsample_left, dy);
4514
85.7k
  transpose16x16_sse2(dstvec, d);
4515
4516
1.45M
  for (int i = 0; i < 16; i++) {
4517
1.37M
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
4518
1.37M
  }
4519
85.7k
}
4520
4521
static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
4522
                                        const uint8_t *left, int upsample_left,
4523
65.5k
                                        int dy) {
4524
65.5k
  __m256i dstvec[32], d[32];
4525
4526
65.5k
  dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy);
4527
65.5k
  transpose16x32_avx2(dstvec, d);
4528
65.5k
  transpose16x32_avx2(dstvec + 16, d + 16);
4529
1.11M
  for (int j = 0; j < 16; j++) {
4530
1.04M
    _mm_storeu_si128((__m128i *)(dst + j * stride),
4531
1.04M
                     _mm256_castsi256_si128(d[j]));
4532
1.04M
    _mm_storeu_si128((__m128i *)(dst + j * stride + 16),
4533
1.04M
                     _mm256_castsi256_si128(d[j + 16]));
4534
1.04M
  }
4535
1.11M
  for (int j = 0; j < 16; j++) {
4536
1.04M
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
4537
1.04M
                     _mm256_extracti128_si256(d[j], 1));
4538
1.04M
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16),
4539
1.04M
                     _mm256_extracti128_si256(d[j + 16], 1));
4540
1.04M
  }
4541
65.5k
}
4542
4543
static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
4544
                                        const uint8_t *left, int upsample_left,
4545
17.0k
                                        int dy) {
4546
17.0k
  DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
4547
17.0k
  dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
4548
17.0k
  transpose(dstT, 64, dst, stride, 64, 64);
4549
17.0k
}
4550
4551
static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
4552
                                        const uint8_t *left, int upsample_left,
4553
20.4k
                                        int dy) {
4554
20.4k
  __m256i dstvec[16], d[16];
4555
4556
20.4k
  dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy);
4557
20.4k
  transpose16x32_avx2(dstvec, d);
4558
  // store
4559
347k
  for (int j = 0; j < 16; j++) {
4560
327k
    _mm_storeu_si128((__m128i *)(dst + j * stride),
4561
327k
                     _mm256_castsi256_si128(d[j]));
4562
327k
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
4563
327k
                     _mm256_extracti128_si256(d[j], 1));
4564
327k
  }
4565
20.4k
}
4566
4567
static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
4568
                                        const uint8_t *left, int upsample_left,
4569
15.3k
                                        int dy) {
4570
15.3k
  __m128i dstvec[32], d[16];
4571
4572
15.3k
  dr_prediction_z1_HxW_internal_avx2(16, 32, dstvec, left, upsample_left, dy);
4573
45.9k
  for (int i = 0; i < 32; i += 16) {
4574
30.6k
    transpose16x16_sse2((dstvec + i), d);
4575
521k
    for (int j = 0; j < 16; j++) {
4576
490k
      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
4577
490k
    }
4578
30.6k
  }
4579
15.3k
}
4580
4581
static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
4582
                                        const uint8_t *left, int upsample_left,
4583
1.53k
                                        int dy) {
4584
1.53k
  uint8_t dstT[64 * 32];
4585
1.53k
  dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
4586
1.53k
  transpose(dstT, 64, dst, stride, 32, 64);
4587
1.53k
}
4588
4589
static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
4590
                                        const uint8_t *left, int upsample_left,
4591
3.28k
                                        int dy) {
4592
3.28k
  uint8_t dstT[32 * 64];
4593
3.28k
  dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
4594
3.28k
  transpose(dstT, 32, dst, stride, 64, 32);
4595
3.28k
  return;
4596
3.28k
}
4597
4598
static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
4599
                                        const uint8_t *left, int upsample_left,
4600
3.98k
                                        int dy) {
4601
3.98k
  uint8_t dstT[64 * 16];
4602
3.98k
  dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
4603
3.98k
  transpose(dstT, 64, dst, stride, 16, 64);
4604
3.98k
}
4605
4606
static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
4607
                                        const uint8_t *left, int upsample_left,
4608
15.9k
                                        int dy) {
4609
15.9k
  __m128i dstvec[64], d[16];
4610
4611
15.9k
  dr_prediction_z1_HxW_internal_avx2(16, 64, dstvec, left, upsample_left, dy);
4612
79.9k
  for (int i = 0; i < 64; i += 16) {
4613
63.9k
    transpose16x16_sse2((dstvec + i), d);
4614
1.08M
    for (int j = 0; j < 16; j++) {
4615
1.02M
      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
4616
1.02M
    }
4617
63.9k
  }
4618
15.9k
}
4619
4620
void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
4621
                               const uint8_t *above, const uint8_t *left,
4622
734k
                               int upsample_left, int dx, int dy) {
4623
734k
  (void)above;
4624
734k
  (void)dx;
4625
734k
  assert(dx == 1);
4626
734k
  assert(dy > 0);
4627
4628
734k
  if (bw == bh) {
4629
391k
    switch (bw) {
4630
121k
      case 4:
4631
121k
        dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
4632
121k
        break;
4633
102k
      case 8:
4634
102k
        dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
4635
102k
        break;
4636
85.7k
      case 16:
4637
85.7k
        dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy);
4638
85.7k
        break;
4639
65.5k
      case 32:
4640
65.5k
        dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy);
4641
65.5k
        break;
4642
17.0k
      case 64:
4643
17.0k
        dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy);
4644
17.0k
        break;
4645
391k
    }
4646
391k
  } else {
4647
342k
    if (bw < bh) {
4648
104k
      if (bw + bw == bh) {
4649
75.6k
        switch (bw) {
4650
29.4k
          case 4:
4651
29.4k
            dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy);
4652
29.4k
            break;
4653
24.2k
          case 8:
4654
24.2k
            dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy);
4655
24.2k
            break;
4656
20.4k
          case 16:
4657
20.4k
            dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy);
4658
20.4k
            break;
4659
1.53k
          case 32:
4660
1.53k
            dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy);
4661
1.53k
            break;
4662
75.6k
        }
4663
75.6k
      } else {
4664
28.5k
        switch (bw) {
4665
16.1k
          case 4:
4666
16.1k
            dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy);
4667
16.1k
            break;
4668
8.40k
          case 8:
4669
8.40k
            dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy);
4670
8.40k
            break;
4671
3.98k
          case 16:
4672
3.98k
            dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy);
4673
3.98k
            break;
4674
28.5k
        }
4675
28.5k
      }
4676
238k
    } else {
4677
238k
      if (bh + bh == bw) {
4678
118k
        switch (bh) {
4679
51.6k
          case 4:
4680
51.6k
            dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy);
4681
51.6k
            break;
4682
47.9k
          case 8:
4683
47.9k
            dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy);
4684
47.9k
            break;
4685
15.3k
          case 16:
4686
15.3k
            dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy);
4687
15.3k
            break;
4688
3.28k
          case 32:
4689
3.28k
            dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy);
4690
3.28k
            break;
4691
118k
        }
4692
120k
      } else {
4693
120k
        switch (bh) {
4694
61.2k
          case 4:
4695
61.2k
            dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy);
4696
61.2k
            break;
4697
43.2k
          case 8:
4698
43.2k
            dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy);
4699
43.2k
            break;
4700
15.9k
          case 16:
4701
15.9k
            dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy);
4702
15.9k
            break;
4703
120k
        }
4704
120k
      }
4705
238k
    }
4706
342k
  }
4707
734k
}
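// Scalar reference for zone 3 (an illustrative sketch, not library code;
// upsample_left == 0 assumed). It is the zone-1 arithmetic with rows and
// columns swapped and left[] as the reference, which is why every AVX2 path
// above predicts with the zone-1 kernels and then transposes.
static INLINE void dr_z3_scalar_sketch(uint8_t *dst, ptrdiff_t stride, int bw,
                                       int bh, const uint8_t *left, int dy) {
  const int max_base_y = bw + bh - 1;
  for (int c = 0; c < bw; ++c) {
    const int y = (c + 1) * dy;
    const int base = y >> 6;            // integer position down left[]
    const int shift = (y & 0x3f) >> 1;  // 5-bit fractional weight
    for (int r = 0; r < bh; ++r) {
      const int b = base + r;
      dst[r * stride + c] =
          (b < max_base_y)
              ? (uint8_t)((left[b] * 32 + 16 +
                           (left[b + 1] - left[b]) * shift) >>
                          5)
              : left[max_base_y];
    }
  }
}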