Coverage Report

Created: 2023-06-07 06:31

/src/aom/av1/common/x86/warp_plane_avx2.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2019, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <immintrin.h>
13
#include "config/av1_rtcd.h"
14
#include "av1/common/warped_motion.h"
15
#include "aom_dsp/x86/synonyms.h"
16
17
DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask01_avx2[32]) = {
18
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
19
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
20
};
21
22
DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask23_avx2[32]) = {
23
  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
24
  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3
25
};
26
27
DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask45_avx2[32]) = {
28
  4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
29
  4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5
30
};
31
32
DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask67_avx2[32]) = {
33
  6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
34
  6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7
35
};
36
37
DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask0_avx2[32]) = {
38
  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
39
  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
40
};
41
42
DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask1_avx2[32]) = {
43
  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
44
  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
45
};
46
47
DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask2_avx2[32]) = {
48
  8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11,
49
  8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
50
};
51
52
DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask3_avx2[32]) = {
53
  12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15,
54
  12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
55
};
56
57
DECLARE_ALIGNED(32, static const uint8_t,
58
                shuffle_src0[32]) = { 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3,
59
                                      5, 5, 7, 7, 9, 0, 2, 2, 4, 4, 6,
60
                                      6, 8, 1, 3, 3, 5, 5, 7, 7, 9 };
61
62
DECLARE_ALIGNED(32, static const uint8_t,
63
                shuffle_src1[32]) = { 4,  6,  6,  8,  8,  10, 10, 12, 5,  7, 7,
64
                                      9,  9,  11, 11, 13, 4,  6,  6,  8,  8, 10,
65
                                      10, 12, 5,  7,  7,  9,  9,  11, 11, 13 };
66
67
DECLARE_ALIGNED(32, static const uint8_t,
68
                shuffle_src2[32]) = { 1, 3, 3, 5, 5,  7, 7, 9, 2, 4, 4,
69
                                      6, 6, 8, 8, 10, 1, 3, 3, 5, 5, 7,
70
                                      7, 9, 2, 4, 4,  6, 6, 8, 8, 10 };
71
72
DECLARE_ALIGNED(32, static const uint8_t,
73
                shuffle_src3[32]) = { 5,  7,  7,  9,  9,  11, 11, 13, 6,  8, 8,
74
                                      10, 10, 12, 12, 14, 5,  7,  7,  9,  9, 11,
75
                                      11, 13, 6,  8,  8,  10, 10, 12, 12, 14 };
76
77
static INLINE void filter_src_pixels_avx2(const __m256i src, __m256i *horz_out,
78
                                          __m256i *coeff,
79
                                          const __m256i *shuffle_src,
80
                                          const __m256i *round_const,
81
22.8M
                                          const __m128i *shift, int row) {
82
22.8M
  const __m256i src_0 = _mm256_shuffle_epi8(src, shuffle_src[0]);
83
22.8M
  const __m256i src_1 = _mm256_shuffle_epi8(src, shuffle_src[1]);
84
22.8M
  const __m256i src_2 = _mm256_shuffle_epi8(src, shuffle_src[2]);
85
22.8M
  const __m256i src_3 = _mm256_shuffle_epi8(src, shuffle_src[3]);
86
87
22.8M
  const __m256i res_02 = _mm256_maddubs_epi16(src_0, coeff[0]);
88
22.8M
  const __m256i res_46 = _mm256_maddubs_epi16(src_1, coeff[1]);
89
22.8M
  const __m256i res_13 = _mm256_maddubs_epi16(src_2, coeff[2]);
90
22.8M
  const __m256i res_57 = _mm256_maddubs_epi16(src_3, coeff[3]);
91
92
22.8M
  const __m256i res_even = _mm256_add_epi16(res_02, res_46);
93
22.8M
  const __m256i res_odd = _mm256_add_epi16(res_13, res_57);
94
22.8M
  const __m256i res =
95
22.8M
      _mm256_add_epi16(_mm256_add_epi16(res_even, res_odd), *round_const);
96
22.8M
  horz_out[row] = _mm256_srl_epi16(res, *shift);
97
22.8M
}
98
99
static INLINE void prepare_horizontal_filter_coeff_avx2(int alpha, int beta,
100
                                                        int sx,
101
8.16M
                                                        __m256i *coeff) {
102
8.16M
  __m128i tmp_0 = _mm_loadl_epi64(
103
8.16M
      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 0 * alpha)) >>
104
8.16M
                                  WARPEDDIFF_PREC_BITS]);
105
8.16M
  __m128i tmp_1 = _mm_loadl_epi64(
106
8.16M
      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 1 * alpha)) >>
107
8.16M
                                  WARPEDDIFF_PREC_BITS]);
108
8.16M
  __m128i tmp_2 = _mm_loadl_epi64(
109
8.16M
      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 2 * alpha)) >>
110
8.16M
                                  WARPEDDIFF_PREC_BITS]);
111
8.16M
  __m128i tmp_3 = _mm_loadl_epi64(
112
8.16M
      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 3 * alpha)) >>
113
8.16M
                                  WARPEDDIFF_PREC_BITS]);
114
115
8.16M
  __m128i tmp_4 = _mm_loadl_epi64(
116
8.16M
      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 4 * alpha)) >>
117
8.16M
                                  WARPEDDIFF_PREC_BITS]);
118
8.16M
  __m128i tmp_5 = _mm_loadl_epi64(
119
8.16M
      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 5 * alpha)) >>
120
8.16M
                                  WARPEDDIFF_PREC_BITS]);
121
8.16M
  __m128i tmp_6 = _mm_loadl_epi64(
122
8.16M
      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 6 * alpha)) >>
123
8.16M
                                  WARPEDDIFF_PREC_BITS]);
124
8.16M
  __m128i tmp_7 = _mm_loadl_epi64(
125
8.16M
      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 7 * alpha)) >>
126
8.16M
                                  WARPEDDIFF_PREC_BITS]);
127
128
8.16M
  __m256i tmp0_256 = _mm256_castsi128_si256(tmp_0);
129
8.16M
  __m256i tmp2_256 = _mm256_castsi128_si256(tmp_2);
130
8.16M
  __m256i tmp1_256 = _mm256_castsi128_si256(tmp_1);
131
8.16M
  __m256i tmp3_256 = _mm256_castsi128_si256(tmp_3);
132
133
8.16M
  __m256i tmp4_256 = _mm256_castsi128_si256(tmp_4);
134
8.16M
  __m256i tmp6_256 = _mm256_castsi128_si256(tmp_6);
135
8.16M
  __m256i tmp5_256 = _mm256_castsi128_si256(tmp_5);
136
8.16M
  __m256i tmp7_256 = _mm256_castsi128_si256(tmp_7);
137
138
8.16M
  __m128i tmp_8 = _mm_loadl_epi64(
139
8.16M
      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 0 * alpha) >>
140
8.16M
                                  WARPEDDIFF_PREC_BITS]);
141
8.16M
  tmp0_256 = _mm256_inserti128_si256(tmp0_256, tmp_8, 1);
142
143
8.16M
  __m128i tmp_9 = _mm_loadl_epi64(
144
8.16M
      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 1 * alpha) >>
145
8.16M
                                  WARPEDDIFF_PREC_BITS]);
146
8.16M
  tmp1_256 = _mm256_inserti128_si256(tmp1_256, tmp_9, 1);
147
148
8.16M
  __m128i tmp_10 = _mm_loadl_epi64(
149
8.16M
      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 2 * alpha) >>
150
8.16M
                                  WARPEDDIFF_PREC_BITS]);
151
8.16M
  tmp2_256 = _mm256_inserti128_si256(tmp2_256, tmp_10, 1);
152
153
8.16M
  __m128i tmp_11 = _mm_loadl_epi64(
154
8.16M
      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 3 * alpha) >>
155
8.16M
                                  WARPEDDIFF_PREC_BITS]);
156
8.16M
  tmp3_256 = _mm256_inserti128_si256(tmp3_256, tmp_11, 1);
157
158
8.16M
  tmp_2 = _mm_loadl_epi64(
159
8.16M
      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 4 * alpha) >>
160
8.16M
                                  WARPEDDIFF_PREC_BITS]);
161
8.16M
  tmp4_256 = _mm256_inserti128_si256(tmp4_256, tmp_2, 1);
162
163
8.16M
  tmp_3 = _mm_loadl_epi64(
164
8.16M
      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 5 * alpha) >>
165
8.16M
                                  WARPEDDIFF_PREC_BITS]);
166
8.16M
  tmp5_256 = _mm256_inserti128_si256(tmp5_256, tmp_3, 1);
167
168
8.16M
  tmp_6 = _mm_loadl_epi64(
169
8.16M
      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 6 * alpha) >>
170
8.16M
                                  WARPEDDIFF_PREC_BITS]);
171
8.16M
  tmp6_256 = _mm256_inserti128_si256(tmp6_256, tmp_6, 1);
172
173
8.16M
  tmp_7 = _mm_loadl_epi64(
174
8.16M
      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 7 * alpha) >>
175
8.16M
                                  WARPEDDIFF_PREC_BITS]);
176
8.16M
  tmp7_256 = _mm256_inserti128_si256(tmp7_256, tmp_7, 1);
177
178
8.16M
  const __m256i tmp_12 = _mm256_unpacklo_epi16(tmp0_256, tmp2_256);
179
8.16M
  const __m256i tmp_13 = _mm256_unpacklo_epi16(tmp1_256, tmp3_256);
180
8.16M
  const __m256i tmp_14 = _mm256_unpacklo_epi16(tmp4_256, tmp6_256);
181
8.16M
  const __m256i tmp_15 = _mm256_unpacklo_epi16(tmp5_256, tmp7_256);
182
183
8.16M
  const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
184
8.16M
  const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);
185
8.16M
  const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15);
186
8.16M
  const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15);
187
188
8.16M
  coeff[0] = _mm256_unpacklo_epi64(res_0, res_2);
189
8.16M
  coeff[1] = _mm256_unpackhi_epi64(res_0, res_2);
190
8.16M
  coeff[2] = _mm256_unpacklo_epi64(res_1, res_3);
191
8.16M
  coeff[3] = _mm256_unpackhi_epi64(res_1, res_3);
192
8.16M
}
193
194
static INLINE void prepare_horizontal_filter_coeff_beta0_avx2(int alpha, int sx,
195
646k
                                                              __m256i *coeff) {
196
646k
  __m128i tmp_0 = _mm_loadl_epi64(
197
646k
      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
198
646k
  __m128i tmp_1 = _mm_loadl_epi64(
199
646k
      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
200
646k
  __m128i tmp_2 = _mm_loadl_epi64(
201
646k
      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
202
646k
  __m128i tmp_3 = _mm_loadl_epi64(
203
646k
      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
204
646k
  __m128i tmp_4 = _mm_loadl_epi64(
205
646k
      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
206
646k
  __m128i tmp_5 = _mm_loadl_epi64(
207
646k
      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
208
646k
  __m128i tmp_6 = _mm_loadl_epi64(
209
646k
      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
210
646k
  __m128i tmp_7 = _mm_loadl_epi64(
211
646k
      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
212
213
646k
  tmp_0 = _mm_unpacklo_epi16(tmp_0, tmp_2);
214
646k
  tmp_1 = _mm_unpacklo_epi16(tmp_1, tmp_3);
215
646k
  tmp_4 = _mm_unpacklo_epi16(tmp_4, tmp_6);
216
646k
  tmp_5 = _mm_unpacklo_epi16(tmp_5, tmp_7);
217
218
646k
  const __m256i tmp_12 = _mm256_broadcastsi128_si256(tmp_0);
219
646k
  const __m256i tmp_13 = _mm256_broadcastsi128_si256(tmp_1);
220
646k
  const __m256i tmp_14 = _mm256_broadcastsi128_si256(tmp_4);
221
646k
  const __m256i tmp_15 = _mm256_broadcastsi128_si256(tmp_5);
222
223
646k
  const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
224
646k
  const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);
225
646k
  const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15);
226
646k
  const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15);
227
228
646k
  coeff[0] = _mm256_unpacklo_epi64(res_0, res_2);
229
646k
  coeff[1] = _mm256_unpackhi_epi64(res_0, res_2);
230
646k
  coeff[2] = _mm256_unpacklo_epi64(res_1, res_3);
231
646k
  coeff[3] = _mm256_unpackhi_epi64(res_1, res_3);
232
646k
}
233
234
static INLINE void prepare_horizontal_filter_coeff_alpha0_avx2(int beta, int sx,
235
2.83M
                                                               __m256i *coeff) {
236
2.83M
  const __m128i tmp_0 =
237
2.83M
      _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
238
2.83M
  const __m128i tmp_1 = _mm_loadl_epi64(
239
2.83M
      (__m128i *)&av1_filter_8bit[(sx + beta) >> WARPEDDIFF_PREC_BITS]);
240
241
2.83M
  const __m256i res_0 =
242
2.83M
      _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_1, 0x1);
243
244
2.83M
  coeff[0] = _mm256_shuffle_epi8(
245
2.83M
      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask01_avx2));
246
2.83M
  coeff[1] = _mm256_shuffle_epi8(
247
2.83M
      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask23_avx2));
248
2.83M
  coeff[2] = _mm256_shuffle_epi8(
249
2.83M
      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask45_avx2));
250
2.83M
  coeff[3] = _mm256_shuffle_epi8(
251
2.83M
      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask67_avx2));
252
2.83M
}
253
254
static INLINE void horizontal_filter_avx2(const __m256i src, __m256i *horz_out,
255
                                          int sx, int alpha, int beta, int row,
256
                                          const __m256i *shuffle_src,
257
                                          const __m256i *round_const,
258
8.16M
                                          const __m128i *shift) {
259
8.16M
  __m256i coeff[4];
260
8.16M
  prepare_horizontal_filter_coeff_avx2(alpha, beta, sx, coeff);
261
8.16M
  filter_src_pixels_avx2(src, horz_out, coeff, shuffle_src, round_const, shift,
262
8.16M
                         row);
263
8.16M
}
264
static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
265
1.17M
                                                   __m256i *coeff) {
266
1.17M
  const __m128i tmp_0 = _mm_loadl_epi64(
267
1.17M
      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
268
1.17M
  const __m128i tmp_1 = _mm_loadl_epi64(
269
1.17M
      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
270
1.17M
  const __m128i tmp_2 = _mm_loadl_epi64(
271
1.17M
      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
272
1.17M
  const __m128i tmp_3 = _mm_loadl_epi64(
273
1.17M
      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
274
1.17M
  const __m128i tmp_4 = _mm_loadl_epi64(
275
1.17M
      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
276
1.17M
  const __m128i tmp_5 = _mm_loadl_epi64(
277
1.17M
      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
278
1.17M
  const __m128i tmp_6 = _mm_loadl_epi64(
279
1.17M
      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
280
1.17M
  const __m128i tmp_7 = _mm_loadl_epi64(
281
1.17M
      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
282
283
1.17M
  const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
284
1.17M
  const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
285
1.17M
  const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
286
1.17M
  const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);
287
288
1.17M
  const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
289
1.17M
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
290
1.17M
  const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
291
1.17M
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
292
293
1.17M
  coeff[0] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_12, tmp_14));
294
1.17M
  coeff[1] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_12, tmp_14));
295
1.17M
  coeff[2] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_13, tmp_15));
296
1.17M
  coeff[3] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_13, tmp_15));
297
1.17M
}
298
299
static INLINE void warp_horizontal_filter_avx2(
300
    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
301
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
302
    const __m256i *round_const, const __m128i *shift,
303
1.08M
    const __m256i *shuffle_src) {
304
1.08M
  int k, iy, sx, row = 0;
305
1.08M
  __m256i coeff[4];
306
8.62M
  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
307
7.54M
    iy = iy4 + k;
308
7.54M
    iy = clamp(iy, 0, height - 1);
309
7.54M
    const __m128i src_0 =
310
7.54M
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
311
7.54M
    iy = iy4 + k + 1;
312
7.54M
    iy = clamp(iy, 0, height - 1);
313
7.54M
    const __m128i src_1 =
314
7.54M
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
315
7.54M
    const __m256i src_01 =
316
7.54M
        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
317
7.54M
    sx = sx4 + beta * (k + 4);
318
7.54M
    horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, shuffle_src,
319
7.54M
                           round_const, shift);
320
7.54M
    row += 1;
321
7.54M
  }
322
1.08M
  iy = iy4 + k;
323
1.08M
  iy = clamp(iy, 0, height - 1);
324
1.08M
  const __m256i src_01 = _mm256_castsi128_si256(
325
1.08M
      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
326
1.08M
  sx = sx4 + beta * (k + 4);
327
1.08M
  prepare_horizontal_filter_coeff(alpha, sx, coeff);
328
1.08M
  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
329
1.08M
                         shift, row);
330
1.08M
}
331
332
static INLINE void warp_horizontal_filter_alpha0_avx2(
333
    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
334
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
335
    const __m256i *round_const, const __m128i *shift,
336
253k
    const __m256i *shuffle_src) {
337
253k
  (void)alpha;
338
253k
  int k, iy, sx, row = 0;
339
253k
  __m256i coeff[4];
340
2.03M
  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
341
1.77M
    iy = iy4 + k;
342
1.77M
    iy = clamp(iy, 0, height - 1);
343
1.77M
    const __m128i src_0 =
344
1.77M
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
345
1.77M
    iy = iy4 + k + 1;
346
1.77M
    iy = clamp(iy, 0, height - 1);
347
1.77M
    const __m128i src_1 =
348
1.77M
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
349
1.77M
    const __m256i src_01 =
350
1.77M
        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
351
1.77M
    sx = sx4 + beta * (k + 4);
352
1.77M
    prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
353
1.77M
    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
354
1.77M
                           shift, row);
355
1.77M
    row += 1;
356
1.77M
  }
357
253k
  iy = iy4 + k;
358
253k
  iy = clamp(iy, 0, height - 1);
359
253k
  const __m256i src_01 = _mm256_castsi128_si256(
360
253k
      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
361
253k
  sx = sx4 + beta * (k + 4);
362
253k
  prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
363
253k
  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
364
253k
                         shift, row);
365
253k
}
366
367
static INLINE void warp_horizontal_filter_beta0_avx2(
368
    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
369
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
370
    const __m256i *round_const, const __m128i *shift,
371
646k
    const __m256i *shuffle_src) {
372
646k
  (void)beta;
373
646k
  int k, iy, row = 0;
374
646k
  __m256i coeff[4];
375
646k
  prepare_horizontal_filter_coeff_beta0_avx2(alpha, sx4, coeff);
376
5.16M
  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
377
4.52M
    iy = iy4 + k;
378
4.52M
    iy = clamp(iy, 0, height - 1);
379
4.52M
    const __m128i src_0 =
380
4.52M
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
381
4.52M
    iy = iy4 + k + 1;
382
4.52M
    iy = clamp(iy, 0, height - 1);
383
4.52M
    const __m128i src_1 =
384
4.52M
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
385
4.52M
    const __m256i src_01 =
386
4.52M
        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
387
4.52M
    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
388
4.52M
                           shift, row);
389
4.52M
    row += 1;
390
4.52M
  }
391
646k
  iy = iy4 + k;
392
646k
  iy = clamp(iy, 0, height - 1);
393
646k
  const __m256i src_01 = _mm256_castsi128_si256(
394
646k
      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
395
646k
  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
396
646k
                         shift, row);
397
646k
}
398
399
static INLINE void warp_horizontal_filter_alpha0_beta0_avx2(
400
    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
401
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
402
    const __m256i *round_const, const __m128i *shift,
403
805k
    const __m256i *shuffle_src) {
404
805k
  (void)alpha;
405
805k
  int k, iy, row = 0;
406
805k
  __m256i coeff[4];
407
805k
  prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx4, coeff);
408
6.43M
  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
409
5.62M
    iy = iy4 + k;
410
5.62M
    iy = clamp(iy, 0, height - 1);
411
5.62M
    const __m128i src0 =
412
5.62M
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
413
5.62M
    iy = iy4 + k + 1;
414
5.62M
    iy = clamp(iy, 0, height - 1);
415
5.62M
    const __m128i src1 =
416
5.62M
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
417
5.62M
    const __m256i src_01 =
418
5.62M
        _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
419
5.62M
    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
420
5.62M
                           shift, row);
421
5.62M
    row += 1;
422
5.62M
  }
423
805k
  iy = iy4 + k;
424
805k
  iy = clamp(iy, 0, height - 1);
425
805k
  const __m256i src_01 = _mm256_castsi128_si256(
426
805k
      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
427
805k
  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
428
805k
                         shift, row);
429
805k
}
430
431
static INLINE void unpack_weights_and_set_round_const_avx2(
432
    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
433
576k
    __m256i *res_sub_const, __m256i *round_bits_const, __m256i *wt) {
434
576k
  *res_sub_const =
435
576k
      _mm256_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
436
576k
                        (1 << (offset_bits - conv_params->round_1 - 1)));
437
576k
  *round_bits_const = _mm256_set1_epi16(((1 << round_bits) >> 1));
438
439
576k
  const int w0 = conv_params->fwd_offset;
440
576k
  const int w1 = conv_params->bck_offset;
441
576k
  const __m256i wt0 = _mm256_set1_epi16((short)w0);
442
576k
  const __m256i wt1 = _mm256_set1_epi16((short)w1);
443
576k
  *wt = _mm256_unpacklo_epi16(wt0, wt1);
444
576k
}
445
446
static INLINE void prepare_vertical_filter_coeffs_avx2(int gamma, int delta,
447
                                                       int sy,
448
5.11M
                                                       __m256i *coeffs) {
449
5.11M
  __m128i filt_00 =
450
5.11M
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
451
5.11M
                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
452
5.11M
  __m128i filt_01 =
453
5.11M
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
454
5.11M
                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
455
5.11M
  __m128i filt_02 =
456
5.11M
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
457
5.11M
                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
458
5.11M
  __m128i filt_03 =
459
5.11M
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
460
5.11M
                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
461
462
5.11M
  __m128i filt_10 = _mm_loadu_si128(
463
5.11M
      (__m128i *)(av1_warped_filter +
464
5.11M
                  (((sy + delta) + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
465
5.11M
  __m128i filt_11 = _mm_loadu_si128(
466
5.11M
      (__m128i *)(av1_warped_filter +
467
5.11M
                  (((sy + delta) + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
468
5.11M
  __m128i filt_12 = _mm_loadu_si128(
469
5.11M
      (__m128i *)(av1_warped_filter +
470
5.11M
                  (((sy + delta) + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
471
5.11M
  __m128i filt_13 = _mm_loadu_si128(
472
5.11M
      (__m128i *)(av1_warped_filter +
473
5.11M
                  (((sy + delta) + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
474
475
5.11M
  __m256i filt_0 =
476
5.11M
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
477
5.11M
  __m256i filt_1 =
478
5.11M
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
479
5.11M
  __m256i filt_2 =
480
5.11M
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
481
5.11M
  __m256i filt_3 =
482
5.11M
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);
483
484
5.11M
  __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
485
5.11M
  __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
486
5.11M
  __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
487
5.11M
  __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
488
489
5.11M
  coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
490
5.11M
  coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
491
5.11M
  coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
492
5.11M
  coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);
493
494
5.11M
  filt_00 =
495
5.11M
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
496
5.11M
                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
497
5.11M
  filt_01 =
498
5.11M
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
499
5.11M
                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
500
5.11M
  filt_02 =
501
5.11M
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
502
5.11M
                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
503
5.11M
  filt_03 =
504
5.11M
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
505
5.11M
                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
506
507
5.11M
  filt_10 = _mm_loadu_si128(
508
5.11M
      (__m128i *)(av1_warped_filter +
509
5.11M
                  (((sy + delta) + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
510
5.11M
  filt_11 = _mm_loadu_si128(
511
5.11M
      (__m128i *)(av1_warped_filter +
512
5.11M
                  (((sy + delta) + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
513
5.11M
  filt_12 = _mm_loadu_si128(
514
5.11M
      (__m128i *)(av1_warped_filter +
515
5.11M
                  (((sy + delta) + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
516
5.11M
  filt_13 = _mm_loadu_si128(
517
5.11M
      (__m128i *)(av1_warped_filter +
518
5.11M
                  (((sy + delta) + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
519
520
5.11M
  filt_0 =
521
5.11M
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
522
5.11M
  filt_1 =
523
5.11M
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
524
5.11M
  filt_2 =
525
5.11M
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
526
5.11M
  filt_3 =
527
5.11M
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);
528
529
5.11M
  res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
530
5.11M
  res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
531
5.11M
  res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
532
5.11M
  res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
533
534
5.11M
  coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
535
5.11M
  coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
536
5.11M
  coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
537
5.11M
  coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
538
5.11M
}
539
540
static INLINE void prepare_vertical_filter_coeffs_delta0_avx2(int gamma, int sy,
541
325k
                                                              __m256i *coeffs) {
542
325k
  __m128i filt_00 =
543
325k
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
544
325k
                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
545
325k
  __m128i filt_01 =
546
325k
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
547
325k
                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
548
325k
  __m128i filt_02 =
549
325k
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
550
325k
                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
551
325k
  __m128i filt_03 =
552
325k
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
553
325k
                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
554
555
325k
  __m256i filt_0 = _mm256_broadcastsi128_si256(filt_00);
556
325k
  __m256i filt_1 = _mm256_broadcastsi128_si256(filt_01);
557
325k
  __m256i filt_2 = _mm256_broadcastsi128_si256(filt_02);
558
325k
  __m256i filt_3 = _mm256_broadcastsi128_si256(filt_03);
559
560
325k
  __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
561
325k
  __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
562
325k
  __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
563
325k
  __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
564
565
325k
  coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
566
325k
  coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
567
325k
  coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
568
325k
  coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);
569
570
325k
  filt_00 =
571
325k
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
572
325k
                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
573
325k
  filt_01 =
574
325k
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
575
325k
                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
576
325k
  filt_02 =
577
325k
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
578
325k
                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
579
325k
  filt_03 =
580
325k
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
581
325k
                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
582
583
325k
  filt_0 = _mm256_broadcastsi128_si256(filt_00);
584
325k
  filt_1 = _mm256_broadcastsi128_si256(filt_01);
585
325k
  filt_2 = _mm256_broadcastsi128_si256(filt_02);
586
325k
  filt_3 = _mm256_broadcastsi128_si256(filt_03);
587
588
325k
  res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
589
325k
  res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
590
325k
  res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
591
325k
  res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
592
593
325k
  coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
594
325k
  coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
595
325k
  coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
596
325k
  coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
597
325k
}
598
599
static INLINE void prepare_vertical_filter_coeffs_gamma0_avx2(int delta, int sy,
600
3.31M
                                                              __m256i *coeffs) {
601
3.31M
  const __m128i filt_0 = _mm_loadu_si128(
602
3.31M
      (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
603
3.31M
  const __m128i filt_1 = _mm_loadu_si128(
604
3.31M
      (__m128i *)(av1_warped_filter + ((sy + delta) >> WARPEDDIFF_PREC_BITS)));
605
606
3.31M
  __m256i res_0 =
607
3.31M
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_0), filt_1, 0x1);
608
609
3.31M
  coeffs[0] = _mm256_shuffle_epi8(
610
3.31M
      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0_avx2));
611
3.31M
  coeffs[1] = _mm256_shuffle_epi8(
612
3.31M
      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1_avx2));
613
3.31M
  coeffs[2] = _mm256_shuffle_epi8(
614
3.31M
      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask2_avx2));
615
3.31M
  coeffs[3] = _mm256_shuffle_epi8(
616
3.31M
      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask3_avx2));
617
618
3.31M
  coeffs[4] = coeffs[0];
619
3.31M
  coeffs[5] = coeffs[1];
620
3.31M
  coeffs[6] = coeffs[2];
621
3.31M
  coeffs[7] = coeffs[3];
622
3.31M
}
623
624
static INLINE void filter_src_pixels_vertical_avx2(__m256i *horz_out,
625
                                                   __m256i *src,
626
                                                   __m256i *coeffs,
627
                                                   __m256i *res_lo,
628
12.1M
                                                   __m256i *res_hi, int row) {
629
12.1M
  const __m256i src_6 = horz_out[row + 3];
630
12.1M
  const __m256i src_7 =
631
12.1M
      _mm256_permute2x128_si256(horz_out[row + 3], horz_out[row + 4], 0x21);
632
633
12.1M
  src[6] = _mm256_unpacklo_epi16(src_6, src_7);
634
635
12.1M
  const __m256i res_0 = _mm256_madd_epi16(src[0], coeffs[0]);
636
12.1M
  const __m256i res_2 = _mm256_madd_epi16(src[2], coeffs[1]);
637
12.1M
  const __m256i res_4 = _mm256_madd_epi16(src[4], coeffs[2]);
638
12.1M
  const __m256i res_6 = _mm256_madd_epi16(src[6], coeffs[3]);
639
640
12.1M
  const __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0, res_2),
641
12.1M
                                            _mm256_add_epi32(res_4, res_6));
642
643
12.1M
  src[7] = _mm256_unpackhi_epi16(src_6, src_7);
644
645
12.1M
  const __m256i res_1 = _mm256_madd_epi16(src[1], coeffs[4]);
646
12.1M
  const __m256i res_3 = _mm256_madd_epi16(src[3], coeffs[5]);
647
12.1M
  const __m256i res_5 = _mm256_madd_epi16(src[5], coeffs[6]);
648
12.1M
  const __m256i res_7 = _mm256_madd_epi16(src[7], coeffs[7]);
649
650
12.1M
  const __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_3),
651
12.1M
                                           _mm256_add_epi32(res_5, res_7));
652
653
  // Rearrange pixels back into the order 0 ... 7
654
12.1M
  *res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
655
12.1M
  *res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
656
12.1M
}
657
658
static INLINE void store_vertical_filter_output_avx2(
659
    const __m256i *res_lo, const __m256i *res_hi, const __m256i *res_add_const,
660
    const __m256i *wt, const __m256i *res_sub_const,
661
    const __m256i *round_bits_const, uint8_t *pred, ConvolveParams *conv_params,
662
    int i, int j, int k, const int reduce_bits_vert, int p_stride, int p_width,
663
12.1M
    const int round_bits) {
664
12.1M
  __m256i res_lo_1 = *res_lo;
665
12.1M
  __m256i res_hi_1 = *res_hi;
666
667
12.1M
  if (conv_params->is_compound) {
668
1.06M
    __m128i *const p_0 =
669
1.06M
        (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
670
1.06M
    __m128i *const p_1 =
671
1.06M
        (__m128i *)&conv_params
672
1.06M
            ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j];
673
674
1.06M
    res_lo_1 = _mm256_srai_epi32(_mm256_add_epi32(res_lo_1, *res_add_const),
675
1.06M
                                 reduce_bits_vert);
676
677
1.06M
    const __m256i temp_lo_16 = _mm256_packus_epi32(res_lo_1, res_lo_1);
678
1.06M
    __m256i res_lo_16;
679
1.06M
    if (conv_params->do_average) {
680
228k
      __m128i *const dst8_0 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
681
228k
      __m128i *const dst8_1 =
682
228k
          (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
683
228k
      const __m128i p_16_0 = _mm_loadl_epi64(p_0);
684
228k
      const __m128i p_16_1 = _mm_loadl_epi64(p_1);
685
228k
      const __m256i p_16 =
686
228k
          _mm256_inserti128_si256(_mm256_castsi128_si256(p_16_0), p_16_1, 1);
687
228k
      if (conv_params->use_dist_wtd_comp_avg) {
688
62.7k
        const __m256i p_16_lo = _mm256_unpacklo_epi16(p_16, temp_lo_16);
689
62.7k
        const __m256i wt_res_lo = _mm256_madd_epi16(p_16_lo, *wt);
690
62.7k
        const __m256i shifted_32 =
691
62.7k
            _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
692
62.7k
        res_lo_16 = _mm256_packus_epi32(shifted_32, shifted_32);
693
165k
      } else {
694
165k
        res_lo_16 = _mm256_srai_epi16(_mm256_add_epi16(p_16, temp_lo_16), 1);
695
165k
      }
696
228k
      res_lo_16 = _mm256_add_epi16(res_lo_16, *res_sub_const);
697
228k
      res_lo_16 = _mm256_srai_epi16(
698
228k
          _mm256_add_epi16(res_lo_16, *round_bits_const), round_bits);
699
228k
      const __m256i res_8_lo = _mm256_packus_epi16(res_lo_16, res_lo_16);
700
228k
      const __m128i res_8_lo_0 = _mm256_castsi256_si128(res_8_lo);
701
228k
      const __m128i res_8_lo_1 = _mm256_extracti128_si256(res_8_lo, 1);
702
228k
      *(int *)dst8_0 = _mm_cvtsi128_si32(res_8_lo_0);
703
228k
      *(int *)dst8_1 = _mm_cvtsi128_si32(res_8_lo_1);
704
835k
    } else {
705
835k
      const __m128i temp_lo_16_0 = _mm256_castsi256_si128(temp_lo_16);
706
835k
      const __m128i temp_lo_16_1 = _mm256_extracti128_si256(temp_lo_16, 1);
707
835k
      _mm_storel_epi64(p_0, temp_lo_16_0);
708
835k
      _mm_storel_epi64(p_1, temp_lo_16_1);
709
835k
    }
710
1.06M
    if (p_width > 4) {
711
1.06M
      __m128i *const p4_0 =
712
1.06M
          (__m128i *)&conv_params
713
1.06M
              ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
714
1.06M
      __m128i *const p4_1 =
715
1.06M
          (__m128i *)&conv_params
716
1.06M
              ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j + 4];
717
1.06M
      res_hi_1 = _mm256_srai_epi32(_mm256_add_epi32(res_hi_1, *res_add_const),
718
1.06M
                                   reduce_bits_vert);
719
1.06M
      const __m256i temp_hi_16 = _mm256_packus_epi32(res_hi_1, res_hi_1);
720
1.06M
      __m256i res_hi_16;
721
1.06M
      if (conv_params->do_average) {
722
228k
        __m128i *const dst8_4_0 =
723
228k
            (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
724
228k
        __m128i *const dst8_4_1 =
725
228k
            (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j + 4];
726
228k
        const __m128i p4_16_0 = _mm_loadl_epi64(p4_0);
727
228k
        const __m128i p4_16_1 = _mm_loadl_epi64(p4_1);
728
228k
        const __m256i p4_16 = _mm256_inserti128_si256(
729
228k
            _mm256_castsi128_si256(p4_16_0), p4_16_1, 1);
730
228k
        if (conv_params->use_dist_wtd_comp_avg) {
731
62.7k
          const __m256i p_16_hi = _mm256_unpacklo_epi16(p4_16, temp_hi_16);
732
62.7k
          const __m256i wt_res_hi = _mm256_madd_epi16(p_16_hi, *wt);
733
62.7k
          const __m256i shifted_32 =
734
62.7k
              _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
735
62.7k
          res_hi_16 = _mm256_packus_epi32(shifted_32, shifted_32);
736
165k
        } else {
737
165k
          res_hi_16 = _mm256_srai_epi16(_mm256_add_epi16(p4_16, temp_hi_16), 1);
738
165k
        }
739
228k
        res_hi_16 = _mm256_add_epi16(res_hi_16, *res_sub_const);
740
228k
        res_hi_16 = _mm256_srai_epi16(
741
228k
            _mm256_add_epi16(res_hi_16, *round_bits_const), round_bits);
742
228k
        __m256i res_8_hi = _mm256_packus_epi16(res_hi_16, res_hi_16);
743
228k
        const __m128i res_8_hi_0 = _mm256_castsi256_si128(res_8_hi);
744
228k
        const __m128i res_8_hi_1 = _mm256_extracti128_si256(res_8_hi, 1);
745
228k
        *(int *)dst8_4_0 = _mm_cvtsi128_si32(res_8_hi_0);
746
228k
        *(int *)dst8_4_1 = _mm_cvtsi128_si32(res_8_hi_1);
747
835k
      } else {
748
835k
        const __m128i temp_hi_16_0 = _mm256_castsi256_si128(temp_hi_16);
749
835k
        const __m128i temp_hi_16_1 = _mm256_extracti128_si256(temp_hi_16, 1);
750
835k
        _mm_storel_epi64(p4_0, temp_hi_16_0);
751
835k
        _mm_storel_epi64(p4_1, temp_hi_16_1);
752
835k
      }
753
1.06M
    }
754
11.0M
  } else {
755
11.0M
    const __m256i res_lo_round = _mm256_srai_epi32(
756
11.0M
        _mm256_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
757
11.0M
    const __m256i res_hi_round = _mm256_srai_epi32(
758
11.0M
        _mm256_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);
759
760
11.0M
    const __m256i res_16bit = _mm256_packs_epi32(res_lo_round, res_hi_round);
761
11.0M
    const __m256i res_8bit = _mm256_packus_epi16(res_16bit, res_16bit);
762
11.0M
    const __m128i res_8bit0 = _mm256_castsi256_si128(res_8bit);
763
11.0M
    const __m128i res_8bit1 = _mm256_extracti128_si256(res_8bit, 1);
764
765
    // Store, blending with 'pred' if needed
766
11.0M
    __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
767
11.0M
    __m128i *const p1 = (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
768
769
11.0M
    if (p_width == 4) {
770
0
      *(int *)p = _mm_cvtsi128_si32(res_8bit0);
771
0
      *(int *)p1 = _mm_cvtsi128_si32(res_8bit1);
772
11.0M
    } else {
773
11.0M
      _mm_storel_epi64(p, res_8bit0);
774
11.0M
      _mm_storel_epi64(p1, res_8bit1);
775
11.0M
    }
776
11.0M
  }
777
12.1M
}
778
779
static INLINE void warp_vertical_filter_avx2(
780
    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
781
    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
782
    int i, int j, int sy4, const int reduce_bits_vert,
783
    const __m256i *res_add_const, const int round_bits,
784
    const __m256i *res_sub_const, const __m256i *round_bits_const,
785
1.28M
    const __m256i *wt) {
786
1.28M
  int k, row = 0;
787
1.28M
  __m256i src[8];
788
1.28M
  const __m256i src_0 = horz_out[0];
789
1.28M
  const __m256i src_1 =
790
1.28M
      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
791
1.28M
  const __m256i src_2 = horz_out[1];
792
1.28M
  const __m256i src_3 =
793
1.28M
      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
794
1.28M
  const __m256i src_4 = horz_out[2];
795
1.28M
  const __m256i src_5 =
796
1.28M
      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
797
798
1.28M
  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
799
1.28M
  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
800
1.28M
  src[4] = _mm256_unpacklo_epi16(src_4, src_5);
801
802
1.28M
  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
803
1.28M
  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
804
1.28M
  src[5] = _mm256_unpackhi_epi16(src_4, src_5);
805
806
6.39M
  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
807
5.11M
    int sy = sy4 + delta * (k + 4);
808
5.11M
    __m256i coeffs[8];
809
5.11M
    prepare_vertical_filter_coeffs_avx2(gamma, delta, sy, coeffs);
810
5.11M
    __m256i res_lo, res_hi;
811
5.11M
    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
812
5.11M
                                    row);
813
5.11M
    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
814
5.11M
                                      res_sub_const, round_bits_const, pred,
815
5.11M
                                      conv_params, i, j, k, reduce_bits_vert,
816
5.11M
                                      p_stride, p_width, round_bits);
817
5.11M
    src[0] = src[2];
818
5.11M
    src[2] = src[4];
819
5.11M
    src[4] = src[6];
820
5.11M
    src[1] = src[3];
821
5.11M
    src[3] = src[5];
822
5.11M
    src[5] = src[7];
823
824
5.11M
    row += 1;
825
5.11M
  }
826
1.28M
}
827
828
static INLINE void warp_vertical_filter_gamma0_avx2(
829
    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
830
    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
831
    int i, int j, int sy4, const int reduce_bits_vert,
832
    const __m256i *res_add_const, const int round_bits,
833
    const __m256i *res_sub_const, const __m256i *round_bits_const,
834
628k
    const __m256i *wt) {
835
628k
  (void)gamma;
836
628k
  int k, row = 0;
837
628k
  __m256i src[8];
838
628k
  const __m256i src_0 = horz_out[0];
839
628k
  const __m256i src_1 =
840
628k
      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
841
628k
  const __m256i src_2 = horz_out[1];
842
628k
  const __m256i src_3 =
843
628k
      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
844
628k
  const __m256i src_4 = horz_out[2];
845
628k
  const __m256i src_5 =
846
628k
      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
847
848
628k
  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
849
628k
  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
850
628k
  src[4] = _mm256_unpacklo_epi16(src_4, src_5);
851
852
628k
  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
853
628k
  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
854
628k
  src[5] = _mm256_unpackhi_epi16(src_4, src_5);
855
856
3.14M
  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
857
2.51M
    int sy = sy4 + delta * (k + 4);
858
2.51M
    __m256i coeffs[8];
859
2.51M
    prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy, coeffs);
860
2.51M
    __m256i res_lo, res_hi;
861
2.51M
    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
862
2.51M
                                    row);
863
2.51M
    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
864
2.51M
                                      res_sub_const, round_bits_const, pred,
865
2.51M
                                      conv_params, i, j, k, reduce_bits_vert,
866
2.51M
                                      p_stride, p_width, round_bits);
867
2.51M
    src[0] = src[2];
868
2.51M
    src[2] = src[4];
869
2.51M
    src[4] = src[6];
870
2.51M
    src[1] = src[3];
871
2.51M
    src[3] = src[5];
872
2.51M
    src[5] = src[7];
873
2.51M
    row += 1;
874
2.51M
  }
875
628k
}
876
877
static INLINE void warp_vertical_filter_delta0_avx2(
878
    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
879
    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
880
    int i, int j, int sy4, const int reduce_bits_vert,
881
    const __m256i *res_add_const, const int round_bits,
882
    const __m256i *res_sub_const, const __m256i *round_bits_const,
883
325k
    const __m256i *wt) {
884
325k
  (void)delta;
885
325k
  int k, row = 0;
886
325k
  __m256i src[8], coeffs[8];
887
325k
  const __m256i src_0 = horz_out[0];
888
325k
  const __m256i src_1 =
889
325k
      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
890
325k
  const __m256i src_2 = horz_out[1];
891
325k
  const __m256i src_3 =
892
325k
      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
893
325k
  const __m256i src_4 = horz_out[2];
894
325k
  const __m256i src_5 =
895
325k
      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
896
897
325k
  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
898
325k
  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
899
325k
  src[4] = _mm256_unpacklo_epi16(src_4, src_5);
900
901
325k
  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
902
325k
  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
903
325k
  src[5] = _mm256_unpackhi_epi16(src_4, src_5);
904
905
325k
  prepare_vertical_filter_coeffs_delta0_avx2(gamma, sy4, coeffs);
906
907
1.62M
  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
908
1.30M
    __m256i res_lo, res_hi;
909
1.30M
    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
910
1.30M
                                    row);
911
1.30M
    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
912
1.30M
                                      res_sub_const, round_bits_const, pred,
913
1.30M
                                      conv_params, i, j, k, reduce_bits_vert,
914
1.30M
                                      p_stride, p_width, round_bits);
915
1.30M
    src[0] = src[2];
916
1.30M
    src[2] = src[4];
917
1.30M
    src[4] = src[6];
918
1.30M
    src[1] = src[3];
919
1.30M
    src[3] = src[5];
920
1.30M
    src[5] = src[7];
921
1.30M
    row += 1;
922
1.30M
  }
923
325k
}
924
925
static INLINE void warp_vertical_filter_gamma0_delta0_avx2(
926
    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
927
    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
928
    int i, int j, int sy4, const int reduce_bits_vert,
929
    const __m256i *res_add_const, const int round_bits,
930
    const __m256i *res_sub_const, const __m256i *round_bits_const,
931
802k
    const __m256i *wt) {
932
802k
  (void)gamma;
933
802k
  int k, row = 0;
934
802k
  __m256i src[8], coeffs[8];
935
802k
  const __m256i src_0 = horz_out[0];
936
802k
  const __m256i src_1 =
937
802k
      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
938
802k
  const __m256i src_2 = horz_out[1];
939
802k
  const __m256i src_3 =
940
802k
      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
941
802k
  const __m256i src_4 = horz_out[2];
942
802k
  const __m256i src_5 =
943
802k
      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
944
945
802k
  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
946
802k
  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
947
802k
  src[4] = _mm256_unpacklo_epi16(src_4, src_5);
948
949
802k
  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
950
802k
  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
951
802k
  src[5] = _mm256_unpackhi_epi16(src_4, src_5);
952
953
802k
  prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy4, coeffs);
954
955
4.00M
  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
956
3.20M
    __m256i res_lo, res_hi;
957
3.20M
    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
958
3.20M
                                    row);
959
3.20M
    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
960
3.20M
                                      res_sub_const, round_bits_const, pred,
961
3.20M
                                      conv_params, i, j, k, reduce_bits_vert,
962
3.20M
                                      p_stride, p_width, round_bits);
963
3.20M
    src[0] = src[2];
964
3.20M
    src[2] = src[4];
965
3.20M
    src[4] = src[6];
966
3.20M
    src[1] = src[3];
967
3.20M
    src[3] = src[5];
968
3.20M
    src[5] = src[7];
969
3.20M
    row += 1;
970
3.20M
  }
971
802k
}
972
973
static INLINE void prepare_warp_vertical_filter_avx2(
974
    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
975
    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
976
    int i, int j, int sy4, const int reduce_bits_vert,
977
    const __m256i *res_add_const, const int round_bits,
978
    const __m256i *res_sub_const, const __m256i *round_bits_const,
979
3.03M
    const __m256i *wt) {
980
3.03M
  if (gamma == 0 && delta == 0)
981
802k
    warp_vertical_filter_gamma0_delta0_avx2(
982
802k
        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
983
802k
        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
984
802k
        round_bits_const, wt);
985
2.23M
  else if (gamma == 0 && delta != 0)
986
628k
    warp_vertical_filter_gamma0_avx2(
987
628k
        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
988
628k
        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
989
628k
        round_bits_const, wt);
990
1.60M
  else if (gamma != 0 && delta == 0)
991
325k
    warp_vertical_filter_delta0_avx2(
992
325k
        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
993
325k
        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
994
325k
        round_bits_const, wt);
995
1.28M
  else
996
1.28M
    warp_vertical_filter_avx2(pred, horz_out, conv_params, gamma, delta,
997
1.28M
                              p_height, p_stride, p_width, i, j, sy4,
998
1.28M
                              reduce_bits_vert, res_add_const, round_bits,
999
1.28M
                              res_sub_const, round_bits_const, wt);
1000
3.03M
}
1001
1002
static INLINE void prepare_warp_horizontal_filter_avx2(
1003
    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
1004
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
1005
    const __m256i *round_const, const __m128i *shift,
1006
2.78M
    const __m256i *shuffle_src) {
1007
2.78M
  if (alpha == 0 && beta == 0)
1008
805k
    warp_horizontal_filter_alpha0_beta0_avx2(
1009
805k
        ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
1010
805k
        round_const, shift, shuffle_src);
1011
1.97M
  else if (alpha == 0 && beta != 0)
1012
253k
    warp_horizontal_filter_alpha0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
1013
253k
                                       alpha, beta, p_height, height, i,
1014
253k
                                       round_const, shift, shuffle_src);
1015
1.72M
  else if (alpha != 0 && beta == 0)
1016
646k
    warp_horizontal_filter_beta0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
1017
1017 |  646k |                                       alpha, beta, p_height, height, i,
1018 |  646k |                                       round_const, shift, shuffle_src);
1019 | 1.07M |   else
1020 | 1.07M |     warp_horizontal_filter_avx2(ref, horz_out, stride, ix4, iy4, sx4, alpha,
1021 | 1.07M |                                 beta, p_height, height, i, round_const, shift,
1022 | 1.07M |                                 shuffle_src);
1023 | 2.78M | }
1024 |       |
1025 |       | int64_t av1_calc_frame_error_avx2(const uint8_t *const ref, int ref_stride,
1026 |       |                                   const uint8_t *const dst, int p_width,
1027 |     0 |                                   int p_height, int dst_stride) {
1028 |     0 |   int64_t sum_error = 0;
1029 |     0 |   int i, j;
1030 |     0 |   __m256i row_error, col_error;
1031 |     0 |   __m256i zero = _mm256_setzero_si256();
1032 |     0 |   __m256i dup_255 = _mm256_set1_epi16(255);
1033 |     0 |   col_error = zero;
1034 |       |
1035 |     0 |   for (i = 0; i < (p_height / 4); i++) {
1036 |     0 |     row_error = _mm256_setzero_si256();
1037 |     0 |     for (j = 0; j < (p_width / 16); j++) {
1038 |     0 |       __m256i ref_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
1039 |     0 |           (__m128i *)(ref + (j * 16) + (((i * 4) + 0) * ref_stride))));
1040 |     0 |       __m256i dst_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
1041 |     0 |           (__m128i *)(dst + (j * 16) + (((i * 4) + 0) * dst_stride))));
1042 |     0 |       __m256i ref_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
1043 |     0 |           (__m128i *)(ref + (j * 16) + (((i * 4) + 1) * ref_stride))));
1044 |     0 |       __m256i dst_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
1045 |     0 |           (__m128i *)(dst + (j * 16) + (((i * 4) + 1) * dst_stride))));
1046 |     0 |       __m256i ref_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
1047 |     0 |           (__m128i *)(ref + (j * 16) + (((i * 4) + 2) * ref_stride))));
1048 |     0 |       __m256i dst_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
1049 |     0 |           (__m128i *)(dst + (j * 16) + (((i * 4) + 2) * dst_stride))));
1050 |     0 |       __m256i ref_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
1051 |     0 |           (__m128i *)(ref + (j * 16) + (((i * 4) + 3) * ref_stride))));
1052 |     0 |       __m256i dst_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
1053 |     0 |           (__m128i *)(dst + (j * 16) + (((i * 4) + 3) * dst_stride))));
1054 |       |
1055 |     0 |       __m256i diff_1 =
1056 |     0 |           _mm256_add_epi16(_mm256_sub_epi16(dst_1_16, ref_1_16), dup_255);
1057 |     0 |       __m256i diff_2 =
1058 |     0 |           _mm256_add_epi16(_mm256_sub_epi16(dst_2_16, ref_2_16), dup_255);
1059 |     0 |       __m256i diff_3 =
1060 |     0 |           _mm256_add_epi16(_mm256_sub_epi16(dst_3_16, ref_3_16), dup_255);
1061 |     0 |       __m256i diff_4 =
1062 |     0 |           _mm256_add_epi16(_mm256_sub_epi16(dst_4_16, ref_4_16), dup_255);
1063 |       |
1064 |     0 |       __m256i diff_1_lo = _mm256_unpacklo_epi16(diff_1, zero);
1065 |     0 |       __m256i diff_1_hi = _mm256_unpackhi_epi16(diff_1, zero);
1066 |     0 |       __m256i diff_2_lo = _mm256_unpacklo_epi16(diff_2, zero);
1067 |     0 |       __m256i diff_2_hi = _mm256_unpackhi_epi16(diff_2, zero);
1068 |     0 |       __m256i diff_3_lo = _mm256_unpacklo_epi16(diff_3, zero);
1069 |     0 |       __m256i diff_3_hi = _mm256_unpackhi_epi16(diff_3, zero);
1070 |     0 |       __m256i diff_4_lo = _mm256_unpacklo_epi16(diff_4, zero);
1071 |     0 |       __m256i diff_4_hi = _mm256_unpackhi_epi16(diff_4, zero);
1072 |       |
1073 |     0 |       __m256i error_1_lo =
1074 |     0 |           _mm256_i32gather_epi32(error_measure_lut, diff_1_lo, 4);
1075 |     0 |       __m256i error_1_hi =
1076 |     0 |           _mm256_i32gather_epi32(error_measure_lut, diff_1_hi, 4);
1077 |     0 |       __m256i error_2_lo =
1078 |     0 |           _mm256_i32gather_epi32(error_measure_lut, diff_2_lo, 4);
1079 |     0 |       __m256i error_2_hi =
1080 |     0 |           _mm256_i32gather_epi32(error_measure_lut, diff_2_hi, 4);
1081 |     0 |       __m256i error_3_lo =
1082 |     0 |           _mm256_i32gather_epi32(error_measure_lut, diff_3_lo, 4);
1083 |     0 |       __m256i error_3_hi =
1084 |     0 |           _mm256_i32gather_epi32(error_measure_lut, diff_3_hi, 4);
1085 |     0 |       __m256i error_4_lo =
1086 |     0 |           _mm256_i32gather_epi32(error_measure_lut, diff_4_lo, 4);
1087 |     0 |       __m256i error_4_hi =
1088 |     0 |           _mm256_i32gather_epi32(error_measure_lut, diff_4_hi, 4);
1089 |       |
1090 |     0 |       __m256i error_1 = _mm256_add_epi32(error_1_lo, error_1_hi);
1091 |     0 |       __m256i error_2 = _mm256_add_epi32(error_2_lo, error_2_hi);
1092 |     0 |       __m256i error_3 = _mm256_add_epi32(error_3_lo, error_3_hi);
1093 |     0 |       __m256i error_4 = _mm256_add_epi32(error_4_lo, error_4_hi);
1094 |       |
1095 |     0 |       __m256i error_1_2 = _mm256_add_epi32(error_1, error_2);
1096 |     0 |       __m256i error_3_4 = _mm256_add_epi32(error_3, error_4);
1097 |       |
1098 |     0 |       __m256i error_1_2_3_4 = _mm256_add_epi32(error_1_2, error_3_4);
1099 |     0 |       row_error = _mm256_add_epi32(row_error, error_1_2_3_4);
1100 |     0 |     }
1101 |     0 |     __m256i col_error_lo = _mm256_unpacklo_epi32(row_error, zero);
1102 |     0 |     __m256i col_error_hi = _mm256_unpackhi_epi32(row_error, zero);
1103 |     0 |     __m256i col_error_temp = _mm256_add_epi64(col_error_lo, col_error_hi);
1104 |     0 |     col_error = _mm256_add_epi64(col_error, col_error_temp);
1105 |       |     // Error summation for remaining width, which is not a multiple of 16
1106 |     0 |     if (p_width & 0xf) {
1107 |     0 |       for (int k = 0; k < 4; ++k) {
1108 |     0 |         for (int l = j * 16; l < p_width; ++l) {
1109 |     0 |           sum_error +=
1110 |     0 |               (int64_t)error_measure(dst[l + ((i * 4) + k) * dst_stride] -
1111 |     0 |                                      ref[l + ((i * 4) + k) * ref_stride]);
1112 |     0 |         }
1113 |     0 |       }
1114 |     0 |     }
1115 |     0 |   }
1116 |     0 |   __m128i sum_error_q_0 = _mm256_castsi256_si128(col_error);
1117 |     0 |   __m128i sum_error_q_1 = _mm256_extracti128_si256(col_error, 1);
1118 |     0 |   sum_error_q_0 = _mm_add_epi64(sum_error_q_0, sum_error_q_1);
1119 |     0 |   int64_t sum_error_d_0, sum_error_d_1;
1120 |     0 |   xx_storel_64(&sum_error_d_0, sum_error_q_0);
1121 |     0 |   xx_storel_64(&sum_error_d_1, _mm_srli_si128(sum_error_q_0, 8));
1122 |     0 |   sum_error = (sum_error + sum_error_d_0 + sum_error_d_1);
1123 |       |   // Error summation for remaining height, which is not a multiple of 4
1124 |     0 |   if (p_height & 0x3) {
1125 |     0 |     for (int k = i * 4; k < p_height; ++k) {
1126 |     0 |       for (int l = 0; l < p_width; ++l) {
1127 |     0 |         sum_error += (int64_t)error_measure(dst[l + k * dst_stride] -
1128 |     0 |                                             ref[l + k * ref_stride]);
1129 |     0 |       }
1130 |     0 |     }
1131 |     0 |   }
1132 |     0 |   return sum_error;
1133 |     0 | }
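Note: av1_calc_frame_error_avx2() above (lines 1025-1133, all counts zero in this run) computes a LUT-based frame error: for each pixel it takes dst - ref, biases the 16-bit difference by 255, gathers the matching entries of error_measure_lut, and accumulates them, widening to 64 bits once per group of four rows. The scalar sketch below is a reading aid only, not part of the report or the library; it assumes just the error_measure() helper that the remainder loops above already call, which indexes error_measure_lut at 255 + diff (the reason the vector path adds dup_255 before the gathers).

static int64_t calc_frame_error_scalar_sketch(const uint8_t *ref, int ref_stride,
                                              const uint8_t *dst, int p_width,
                                              int p_height, int dst_stride) {
  int64_t sum_error = 0;
  for (int r = 0; r < p_height; ++r) {
    for (int c = 0; c < p_width; ++c) {
      // dst and ref are 8-bit, so diff lies in [-255, 255] and the lookup at
      // (255 + diff) inside error_measure() is always in range.
      const int diff = dst[c + r * dst_stride] - ref[c + r * ref_stride];
      sum_error += (int64_t)error_measure(diff);
    }
  }
  return sum_error;
}

The AVX2 kernel performs the same arithmetic 16 pixels and 4 rows at a time, and it falls back to exactly this scalar form for any width or height remainder (lines 1106-1114 and 1124-1131).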
1134 |       |
1135 |       | void av1_warp_affine_avx2(const int32_t *mat, const uint8_t *ref, int width,
1136 |       |                           int height, int stride, uint8_t *pred, int p_col,
1137 |       |                           int p_row, int p_width, int p_height, int p_stride,
1138 |       |                           int subsampling_x, int subsampling_y,
1139 |       |                           ConvolveParams *conv_params, int16_t alpha,
1140 |  576k |                           int16_t beta, int16_t gamma, int16_t delta) {
1141 |  576k |   __m256i horz_out[8];
1142 |  576k |   int i, j, k;
1143 |  576k |   const int bd = 8;
1144 |  576k |   const int reduce_bits_horiz = conv_params->round_0;
1145 |  576k |   const int reduce_bits_vert = conv_params->is_compound
1146 |  576k |                                    ? conv_params->round_1
1147 |  576k |                                    : 2 * FILTER_BITS - reduce_bits_horiz;
1148 |  576k |   const int offset_bits_horiz = bd + FILTER_BITS - 1;
1149 |  576k |   assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
1150 |       |
1151 |  576k |   const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
1152 |  576k |   const __m256i reduce_bits_vert_const =
1153 |  576k |       _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1));
1154 |  576k |   const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert);
1155 |  576k |   const int round_bits =
1156 |  576k |       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
1157 |  576k |   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
1158 |  576k |   assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
1159 |       |
1160 |     0 |   const __m256i round_const = _mm256_set1_epi16(
1161 |  576k |       (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1));
1162 |  576k |   const __m128i shift = _mm_cvtsi32_si128(reduce_bits_horiz);
1163 |       |
1164 |  576k |   __m256i res_sub_const, round_bits_const, wt;
1165 |  576k |   unpack_weights_and_set_round_const_avx2(conv_params, round_bits, offset_bits,
1166 |  576k |                                           &res_sub_const, &round_bits_const,
1167 |  576k |                                           &wt);
1168 |       |
1169 |  576k |   __m256i res_add_const_1;
1170 |  576k |   if (conv_params->is_compound == 1) {
1171 | 50.0k |     res_add_const_1 = _mm256_add_epi32(reduce_bits_vert_const, res_add_const);
1172 |  526k |   } else {
1173 |  526k |     res_add_const_1 = _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
1174 |  526k |                                         ((1 << reduce_bits_vert) >> 1));
1175 |  526k |   }
1176 |  576k |   const int32_t const1 = alpha * (-4) + beta * (-4) +
1177 |  576k |                          (1 << (WARPEDDIFF_PREC_BITS - 1)) +
1178 |  576k |                          (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
1179 |  576k |   const int32_t const2 = gamma * (-4) + delta * (-4) +
1180 |  576k |                          (1 << (WARPEDDIFF_PREC_BITS - 1)) +
1181 |  576k |                          (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
1182 |  576k |   const int32_t const3 = ((1 << WARP_PARAM_REDUCE_BITS) - 1);
1183 |  576k |   const int16_t const4 = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1));
1184 |  576k |   const int16_t const5 = (1 << (FILTER_BITS - reduce_bits_horiz));
1185 |       |
1186 |  576k |   __m256i shuffle_src[4];
1187 |  576k |   shuffle_src[0] = _mm256_load_si256((__m256i *)shuffle_src0);
1188 |  576k |   shuffle_src[1] = _mm256_load_si256((__m256i *)shuffle_src1);
1189 |  576k |   shuffle_src[2] = _mm256_load_si256((__m256i *)shuffle_src2);
1190 |  576k |   shuffle_src[3] = _mm256_load_si256((__m256i *)shuffle_src3);
1191 |       |
1192 | 1.68M |   for (i = 0; i < p_height; i += 8) {
1193 | 4.14M |     for (j = 0; j < p_width; j += 8) {
1194 | 3.03M |       const int32_t src_x = (p_col + j + 4) << subsampling_x;
1195 | 3.03M |       const int32_t src_y = (p_row + i + 4) << subsampling_y;
1196 | 3.03M |       const int64_t dst_x =
1197 | 3.03M |           (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
1198 | 3.03M |       const int64_t dst_y =
1199 | 3.03M |           (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
1200 | 3.03M |       const int64_t x4 = dst_x >> subsampling_x;
1201 | 3.03M |       const int64_t y4 = dst_y >> subsampling_y;
1202 |       |
1203 | 3.03M |       int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
1204 | 3.03M |       int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
1205 | 3.03M |       int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
1206 | 3.03M |       int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
1207 |       |
1208 |       |       // Add in all the constant terms, including rounding and offset
1209 | 3.03M |       sx4 += const1;
1210 | 3.03M |       sy4 += const2;
1211 |       |
1212 | 3.03M |       sx4 &= ~const3;
1213 | 3.03M |       sy4 &= ~const3;
1214 |       |
1215 |       |       // Horizontal filter
1216 |       |       // If the block is aligned such that, after clamping, every sample
1217 |       |       // would be taken from the leftmost/rightmost column, then we can
1218 |       |       // skip the expensive horizontal filter.
1219 |       |
1220 | 3.03M |       if (ix4 <= -7) {
1221 | 35.5k |         int iy, row = 0;
1222 |  284k |         for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
1223 |  248k |           iy = iy4 + k;
1224 |  248k |           iy = clamp(iy, 0, height - 1);
1225 |  248k |           const __m256i temp_0 =
1226 |  248k |               _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
1227 |  248k |           iy = iy4 + k + 1;
1228 |  248k |           iy = clamp(iy, 0, height - 1);
1229 |  248k |           const __m256i temp_1 =
1230 |  248k |               _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
1231 |  248k |           horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0);
1232 |  248k |           row += 1;
1233 |  248k |         }
1234 | 35.5k |         iy = iy4 + k;
1235 | 35.5k |         iy = clamp(iy, 0, height - 1);
1236 | 35.5k |         horz_out[row] = _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
1237 | 3.00M |       } else if (ix4 >= width + 6) {
1238 |  127k |         int iy, row = 0;
1239 | 1.01M |         for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
1240 |  892k |           iy = iy4 + k;
1241 |  892k |           iy = clamp(iy, 0, height - 1);
1242 |  892k |           const __m256i temp_0 = _mm256_set1_epi16(
1243 |  892k |               const4 + ref[iy * stride + (width - 1)] * const5);
1244 |  892k |           iy = iy4 + k + 1;
1245 |  892k |           iy = clamp(iy, 0, height - 1);
1246 |  892k |           const __m256i temp_1 = _mm256_set1_epi16(
1247 |  892k |               const4 + ref[iy * stride + (width - 1)] * const5);
1248 |  892k |           horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0);
1249 |  892k |           row += 1;
1250 |  892k |         }
1251 |  127k |         iy = iy4 + k;
1252 |  127k |         iy = clamp(iy, 0, height - 1);
1253 |  127k |         horz_out[row] =
1254 |  127k |             _mm256_set1_epi16(const4 + ref[iy * stride + (width - 1)] * const5);
1255 | 2.87M |       } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
1256 | 92.7k |         const int out_of_boundary_left = -(ix4 - 6);
1257 | 92.7k |         const int out_of_boundary_right = (ix4 + 8) - width;
1258 | 92.7k |         int iy, sx, row = 0;
1259 |  741k |         for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
1260 |  649k |           iy = iy4 + k;
1261 |  649k |           iy = clamp(iy, 0, height - 1);
1262 |  649k |           __m128i src0 =
1263 |  649k |               _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
1264 |  649k |           iy = iy4 + k + 1;
1265 |  649k |           iy = clamp(iy, 0, height - 1);
1266 |  649k |           __m128i src1 =
1267 |  649k |               _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
1268 |       |
1269 |  649k |           if (out_of_boundary_left >= 0) {
1270 |  340k |             const __m128i shuffle_reg_left =
1271 |  340k |                 _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
1272 |  340k |             src0 = _mm_shuffle_epi8(src0, shuffle_reg_left);
1273 |  340k |             src1 = _mm_shuffle_epi8(src1, shuffle_reg_left);
1274 |  340k |           }
1275 |  649k |           if (out_of_boundary_right >= 0) {
1276 |  430k |             const __m128i shuffle_reg_right = _mm_loadu_si128(
1277 |  430k |                 (__m128i *)warp_pad_right[out_of_boundary_right]);
1278 |  430k |             src0 = _mm_shuffle_epi8(src0, shuffle_reg_right);
1279 |  430k |             src1 = _mm_shuffle_epi8(src1, shuffle_reg_right);
1280 |  430k |           }
1281 |  649k |           sx = sx4 + beta * (k + 4);
1282 |  649k |           const __m256i src_01 =
1283 |  649k |               _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
1284 |  649k |           horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row,
1285 |  649k |                                  shuffle_src, &round_const, &shift);
1286 |  649k |           row += 1;
1287 |  649k |         }
1288 | 92.7k |         iy = iy4 + k;
1289 | 92.7k |         iy = clamp(iy, 0, height - 1);
1290 | 92.7k |         __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
1291 | 92.7k |         if (out_of_boundary_left >= 0) {
1292 | 48.6k |           const __m128i shuffle_reg_left =
1293 | 48.6k |               _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
1294 | 48.6k |           src = _mm_shuffle_epi8(src, shuffle_reg_left);
1295 | 48.6k |         }
1296 | 92.7k |         if (out_of_boundary_right >= 0) {
1297 | 61.5k |           const __m128i shuffle_reg_right =
1298 | 61.5k |               _mm_loadu_si128((__m128i *)warp_pad_right[out_of_boundary_right]);
1299 | 61.5k |           src = _mm_shuffle_epi8(src, shuffle_reg_right);
1300 | 61.5k |         }
1301 | 92.7k |         sx = sx4 + beta * (k + 4);
1302 | 92.7k |         const __m256i src_01 = _mm256_castsi128_si256(src);
1303 | 92.7k |         __m256i coeff[4];
1304 | 92.7k |         prepare_horizontal_filter_coeff(alpha, sx, coeff);
1305 | 92.7k |         filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src,
1306 | 92.7k |                                &round_const, &shift, row);
1307 | 2.78M |       } else {
1308 | 2.78M |         prepare_warp_horizontal_filter_avx2(
1309 | 2.78M |             ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height,
1310 | 2.78M |             i, &round_const, &shift, shuffle_src);
1311 | 2.78M |       }
1312 |       |
1313 |       |       // Vertical filter
1314 | 3.03M |       prepare_warp_vertical_filter_avx2(
1315 | 3.03M |           pred, horz_out, conv_params, gamma, delta, p_height, p_stride,
1316 | 3.03M |           p_width, i, j, sy4, reduce_bits_vert, &res_add_const_1, round_bits,
1317 | 3.03M |           &res_sub_const, &round_bits_const, &wt);
1318 | 3.03M |     }
1319 | 1.11M |   }
1320 |  576k | }
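Note: the nested loops above walk the prediction area in 8x8 blocks. For each block, the centre (p_col + j + 4, p_row + i + 4) is pushed through the affine model mat[] in WARPEDMODEL_PREC_BITS fixed point (16 fractional bits in libaom) and split into an integer reference position (ix4, iy4) plus a fractional phase (sx4, sy4) that drives the 8-tap filters. The standalone sketch below replays that arithmetic for one block; it is illustrative only, the matrix values and block coordinates are invented, and subsampling_x/subsampling_y are taken as 0 so the subsampling shifts drop out.

#include <stdint.h>
#include <stdio.h>

/* Local copy of the fixed-point precision for this self-contained sketch;
 * in libaom this is WARPEDMODEL_PREC_BITS (16). */
#define PREC_BITS 16

int main(void) {
  /* Invented affine model: mat[2..5] is the 2x2 part, mat[0..1] the offset. */
  const int32_t mat[6] = { 0, 0, (1 << PREC_BITS) + 1024, 512,
                           -512, (1 << PREC_BITS) };
  const int p_col = 64, p_row = 32; /* top-left of the prediction area */
  const int i = 8, j = 16;          /* 8x8 block being processed */

  /* Centre of the 8x8 block (subsampling assumed 0). */
  const int32_t src_x = p_col + j + 4;
  const int32_t src_y = p_row + i + 4;

  /* Affine transform in fixed point, as on report lines 1196-1199. */
  const int64_t dst_x =
      (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + mat[0];
  const int64_t dst_y =
      (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + mat[1];

  /* Integer position and fractional phase, as on report lines 1203-1206. */
  const int ix4 = (int)(dst_x >> PREC_BITS);
  const int sx4 = (int)(dst_x & ((1 << PREC_BITS) - 1));
  const int iy4 = (int)(dst_y >> PREC_BITS);
  const int sy4 = (int)(dst_y & ((1 << PREC_BITS) - 1));

  printf("ix4=%d sx4=%d iy4=%d sy4=%d\n", ix4, sx4, iy4, sy4);
  return 0;
}

In the kernel itself, sx4 and sy4 are then biased by const1/const2 (which fold in the -4 * alpha - 4 * beta and -4 * gamma - 4 * delta terms, a rounding half, and the WARPEDPIXEL_PREC_SHIFTS table offset) and masked with ~const3, so the per-sample filter phases land on (1 << WARP_PARAM_REDUCE_BITS) boundaries before the horizontal and vertical filter helpers run; the three special branches at lines 1220, 1237 and 1255 only handle blocks whose source window falls partly or wholly outside the reference frame.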