Coverage Report

Created: 2025-11-16 07:09

/src/aom/av1/common/x86/warp_plane_sse4.c
Line | Count | Source
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <emmintrin.h>
13
#include <smmintrin.h>
14
15
#include "config/av1_rtcd.h"
16
17
#include "av1/common/warped_motion.h"
18
19
/* This is a modified version of 'av1_warped_filter' from warped_motion.c:
20
   * Each coefficient is stored in 8 bits instead of 16 bits
21
   * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
22
23
     This is done in order to avoid overflow: Since the tap with the largest
24
     coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
25
     order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
26
     convolve functions.
27
28
     Instead, we use the summation order
29
     ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
30
     The rearrangement of coefficients in this table is so that we can get the
31
     coefficients into the correct order more quickly.
32
*/
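/* Editorial illustration (not part of the original warp_plane_sse4.c): a
   minimal scalar restatement of the storage order and summation grouping
   described above. 'pix' and 'f' are hypothetical names; 'f' points at one
   row of av1_filter_8bit, whose columns hold taps 0, 2, 4, 6, 1, 3, 5, 7.

   static int warp_hfilter_scalar_sketch(const uint8_t *pix, const int8_t *f) {
     const int even = (pix[0] * f[0] + pix[2] * f[1]) +   // taps 0 + 2
                      (pix[4] * f[2] + pix[6] * f[3]);    // taps 4 + 6
     const int odd  = (pix[1] * f[4] + pix[3] * f[5]) +   // taps 1 + 3
                      (pix[5] * f[6] + pix[7] * f[7]);    // taps 5 + 7
     return even + odd;  // horizontal offset and rounding omitted for brevity
   }
*/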
33
/* clang-format off */
34
DECLARE_ALIGNED(8, const int8_t,
35
                av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
36
  // [-1, 0)
37
  { 0, 127,   0, 0,   0,   1, 0, 0}, { 0, 127,   0, 0,  -1,   2, 0, 0},
38
  { 1, 127,  -1, 0,  -3,   4, 0, 0}, { 1, 126,  -2, 0,  -4,   6, 1, 0},
39
  { 1, 126,  -3, 0,  -5,   8, 1, 0}, { 1, 125,  -4, 0,  -6,  11, 1, 0},
40
  { 1, 124,  -4, 0,  -7,  13, 1, 0}, { 2, 123,  -5, 0,  -8,  15, 1, 0},
41
  { 2, 122,  -6, 0,  -9,  18, 1, 0}, { 2, 121,  -6, 0, -10,  20, 1, 0},
42
  { 2, 120,  -7, 0, -11,  22, 2, 0}, { 2, 119,  -8, 0, -12,  25, 2, 0},
43
  { 3, 117,  -8, 0, -13,  27, 2, 0}, { 3, 116,  -9, 0, -13,  29, 2, 0},
44
  { 3, 114, -10, 0, -14,  32, 3, 0}, { 3, 113, -10, 0, -15,  35, 2, 0},
45
  { 3, 111, -11, 0, -15,  37, 3, 0}, { 3, 109, -11, 0, -16,  40, 3, 0},
46
  { 3, 108, -12, 0, -16,  42, 3, 0}, { 4, 106, -13, 0, -17,  45, 3, 0},
47
  { 4, 104, -13, 0, -17,  47, 3, 0}, { 4, 102, -14, 0, -17,  50, 3, 0},
48
  { 4, 100, -14, 0, -17,  52, 3, 0}, { 4,  98, -15, 0, -18,  55, 4, 0},
49
  { 4,  96, -15, 0, -18,  58, 3, 0}, { 4,  94, -16, 0, -18,  60, 4, 0},
50
  { 4,  91, -16, 0, -18,  63, 4, 0}, { 4,  89, -16, 0, -18,  65, 4, 0},
51
  { 4,  87, -17, 0, -18,  68, 4, 0}, { 4,  85, -17, 0, -18,  70, 4, 0},
52
  { 4,  82, -17, 0, -18,  73, 4, 0}, { 4,  80, -17, 0, -18,  75, 4, 0},
53
  { 4,  78, -18, 0, -18,  78, 4, 0}, { 4,  75, -18, 0, -17,  80, 4, 0},
54
  { 4,  73, -18, 0, -17,  82, 4, 0}, { 4,  70, -18, 0, -17,  85, 4, 0},
55
  { 4,  68, -18, 0, -17,  87, 4, 0}, { 4,  65, -18, 0, -16,  89, 4, 0},
56
  { 4,  63, -18, 0, -16,  91, 4, 0}, { 4,  60, -18, 0, -16,  94, 4, 0},
57
  { 3,  58, -18, 0, -15,  96, 4, 0}, { 4,  55, -18, 0, -15,  98, 4, 0},
58
  { 3,  52, -17, 0, -14, 100, 4, 0}, { 3,  50, -17, 0, -14, 102, 4, 0},
59
  { 3,  47, -17, 0, -13, 104, 4, 0}, { 3,  45, -17, 0, -13, 106, 4, 0},
60
  { 3,  42, -16, 0, -12, 108, 3, 0}, { 3,  40, -16, 0, -11, 109, 3, 0},
61
  { 3,  37, -15, 0, -11, 111, 3, 0}, { 2,  35, -15, 0, -10, 113, 3, 0},
62
  { 3,  32, -14, 0, -10, 114, 3, 0}, { 2,  29, -13, 0,  -9, 116, 3, 0},
63
  { 2,  27, -13, 0,  -8, 117, 3, 0}, { 2,  25, -12, 0,  -8, 119, 2, 0},
64
  { 2,  22, -11, 0,  -7, 120, 2, 0}, { 1,  20, -10, 0,  -6, 121, 2, 0},
65
  { 1,  18,  -9, 0,  -6, 122, 2, 0}, { 1,  15,  -8, 0,  -5, 123, 2, 0},
66
  { 1,  13,  -7, 0,  -4, 124, 1, 0}, { 1,  11,  -6, 0,  -4, 125, 1, 0},
67
  { 1,   8,  -5, 0,  -3, 126, 1, 0}, { 1,   6,  -4, 0,  -2, 126, 1, 0},
68
  { 0,   4,  -3, 0,  -1, 127, 1, 0}, { 0,   2,  -1, 0,   0, 127, 0, 0},
69
  // [0, 1)
70
  { 0,   0,   1, 0, 0, 127,   0,  0}, { 0,  -1,   2, 0, 0, 127,   0,  0},
71
  { 0,  -3,   4, 1, 1, 127,  -2,  0}, { 0,  -5,   6, 1, 1, 127,  -2,  0},
72
  { 0,  -6,   8, 1, 2, 126,  -3,  0}, {-1,  -7,  11, 2, 2, 126,  -4, -1},
73
  {-1,  -8,  13, 2, 3, 125,  -5, -1}, {-1, -10,  16, 3, 3, 124,  -6, -1},
74
  {-1, -11,  18, 3, 4, 123,  -7, -1}, {-1, -12,  20, 3, 4, 122,  -7, -1},
75
  {-1, -13,  23, 3, 4, 121,  -8, -1}, {-2, -14,  25, 4, 5, 120,  -9, -1},
76
  {-1, -15,  27, 4, 5, 119, -10, -1}, {-1, -16,  30, 4, 5, 118, -11, -1},
77
  {-2, -17,  33, 5, 6, 116, -12, -1}, {-2, -17,  35, 5, 6, 114, -12, -1},
78
  {-2, -18,  38, 5, 6, 113, -13, -1}, {-2, -19,  41, 6, 7, 111, -14, -2},
79
  {-2, -19,  43, 6, 7, 110, -15, -2}, {-2, -20,  46, 6, 7, 108, -15, -2},
80
  {-2, -20,  49, 6, 7, 106, -16, -2}, {-2, -21,  51, 7, 7, 104, -16, -2},
81
  {-2, -21,  54, 7, 7, 102, -17, -2}, {-2, -21,  56, 7, 8, 100, -18, -2},
82
  {-2, -22,  59, 7, 8,  98, -18, -2}, {-2, -22,  62, 7, 8,  96, -19, -2},
83
  {-2, -22,  64, 7, 8,  94, -19, -2}, {-2, -22,  67, 8, 8,  91, -20, -2},
84
  {-2, -22,  69, 8, 8,  89, -20, -2}, {-2, -22,  72, 8, 8,  87, -21, -2},
85
  {-2, -21,  74, 8, 8,  84, -21, -2}, {-2, -22,  77, 8, 8,  82, -21, -2},
86
  {-2, -21,  79, 8, 8,  79, -21, -2}, {-2, -21,  82, 8, 8,  77, -22, -2},
87
  {-2, -21,  84, 8, 8,  74, -21, -2}, {-2, -21,  87, 8, 8,  72, -22, -2},
88
  {-2, -20,  89, 8, 8,  69, -22, -2}, {-2, -20,  91, 8, 8,  67, -22, -2},
89
  {-2, -19,  94, 8, 7,  64, -22, -2}, {-2, -19,  96, 8, 7,  62, -22, -2},
90
  {-2, -18,  98, 8, 7,  59, -22, -2}, {-2, -18, 100, 8, 7,  56, -21, -2},
91
  {-2, -17, 102, 7, 7,  54, -21, -2}, {-2, -16, 104, 7, 7,  51, -21, -2},
92
  {-2, -16, 106, 7, 6,  49, -20, -2}, {-2, -15, 108, 7, 6,  46, -20, -2},
93
  {-2, -15, 110, 7, 6,  43, -19, -2}, {-2, -14, 111, 7, 6,  41, -19, -2},
94
  {-1, -13, 113, 6, 5,  38, -18, -2}, {-1, -12, 114, 6, 5,  35, -17, -2},
95
  {-1, -12, 116, 6, 5,  33, -17, -2}, {-1, -11, 118, 5, 4,  30, -16, -1},
96
  {-1, -10, 119, 5, 4,  27, -15, -1}, {-1,  -9, 120, 5, 4,  25, -14, -2},
97
  {-1,  -8, 121, 4, 3,  23, -13, -1}, {-1,  -7, 122, 4, 3,  20, -12, -1},
98
  {-1,  -7, 123, 4, 3,  18, -11, -1}, {-1,  -6, 124, 3, 3,  16, -10, -1},
99
  {-1,  -5, 125, 3, 2,  13,  -8, -1}, {-1,  -4, 126, 2, 2,  11,  -7, -1},
100
  { 0,  -3, 126, 2, 1,   8,  -6,  0}, { 0,  -2, 127, 1, 1,   6,  -5,  0},
101
  { 0,  -2, 127, 1, 1,   4,  -3,  0}, { 0,   0, 127, 0, 0,   2,  -1,  0},
102
  // [1, 2)
103
  { 0, 0, 127,   0, 0,   1,   0, 0}, { 0, 0, 127,   0, 0,  -1,   2, 0},
104
  { 0, 1, 127,  -1, 0,  -3,   4, 0}, { 0, 1, 126,  -2, 0,  -4,   6, 1},
105
  { 0, 1, 126,  -3, 0,  -5,   8, 1}, { 0, 1, 125,  -4, 0,  -6,  11, 1},
106
  { 0, 1, 124,  -4, 0,  -7,  13, 1}, { 0, 2, 123,  -5, 0,  -8,  15, 1},
107
  { 0, 2, 122,  -6, 0,  -9,  18, 1}, { 0, 2, 121,  -6, 0, -10,  20, 1},
108
  { 0, 2, 120,  -7, 0, -11,  22, 2}, { 0, 2, 119,  -8, 0, -12,  25, 2},
109
  { 0, 3, 117,  -8, 0, -13,  27, 2}, { 0, 3, 116,  -9, 0, -13,  29, 2},
110
  { 0, 3, 114, -10, 0, -14,  32, 3}, { 0, 3, 113, -10, 0, -15,  35, 2},
111
  { 0, 3, 111, -11, 0, -15,  37, 3}, { 0, 3, 109, -11, 0, -16,  40, 3},
112
  { 0, 3, 108, -12, 0, -16,  42, 3}, { 0, 4, 106, -13, 0, -17,  45, 3},
113
  { 0, 4, 104, -13, 0, -17,  47, 3}, { 0, 4, 102, -14, 0, -17,  50, 3},
114
  { 0, 4, 100, -14, 0, -17,  52, 3}, { 0, 4,  98, -15, 0, -18,  55, 4},
115
  { 0, 4,  96, -15, 0, -18,  58, 3}, { 0, 4,  94, -16, 0, -18,  60, 4},
116
  { 0, 4,  91, -16, 0, -18,  63, 4}, { 0, 4,  89, -16, 0, -18,  65, 4},
117
  { 0, 4,  87, -17, 0, -18,  68, 4}, { 0, 4,  85, -17, 0, -18,  70, 4},
118
  { 0, 4,  82, -17, 0, -18,  73, 4}, { 0, 4,  80, -17, 0, -18,  75, 4},
119
  { 0, 4,  78, -18, 0, -18,  78, 4}, { 0, 4,  75, -18, 0, -17,  80, 4},
120
  { 0, 4,  73, -18, 0, -17,  82, 4}, { 0, 4,  70, -18, 0, -17,  85, 4},
121
  { 0, 4,  68, -18, 0, -17,  87, 4}, { 0, 4,  65, -18, 0, -16,  89, 4},
122
  { 0, 4,  63, -18, 0, -16,  91, 4}, { 0, 4,  60, -18, 0, -16,  94, 4},
123
  { 0, 3,  58, -18, 0, -15,  96, 4}, { 0, 4,  55, -18, 0, -15,  98, 4},
124
  { 0, 3,  52, -17, 0, -14, 100, 4}, { 0, 3,  50, -17, 0, -14, 102, 4},
125
  { 0, 3,  47, -17, 0, -13, 104, 4}, { 0, 3,  45, -17, 0, -13, 106, 4},
126
  { 0, 3,  42, -16, 0, -12, 108, 3}, { 0, 3,  40, -16, 0, -11, 109, 3},
127
  { 0, 3,  37, -15, 0, -11, 111, 3}, { 0, 2,  35, -15, 0, -10, 113, 3},
128
  { 0, 3,  32, -14, 0, -10, 114, 3}, { 0, 2,  29, -13, 0,  -9, 116, 3},
129
  { 0, 2,  27, -13, 0,  -8, 117, 3}, { 0, 2,  25, -12, 0,  -8, 119, 2},
130
  { 0, 2,  22, -11, 0,  -7, 120, 2}, { 0, 1,  20, -10, 0,  -6, 121, 2},
131
  { 0, 1,  18,  -9, 0,  -6, 122, 2}, { 0, 1,  15,  -8, 0,  -5, 123, 2},
132
  { 0, 1,  13,  -7, 0,  -4, 124, 1}, { 0, 1,  11,  -6, 0,  -4, 125, 1},
133
  { 0, 1,   8,  -5, 0,  -3, 126, 1}, { 0, 1,   6,  -4, 0,  -2, 126, 1},
134
  { 0, 0,   4,  -3, 0,  -1, 127, 1}, { 0, 0,   2,  -1, 0,   0, 127, 0},
135
  // dummy (replicate row index 191)
136
  { 0, 0,   2,  -1, 0,   0, 127, 0},
137
};
138
/* clang-format on */
139
140
#if !CONFIG_HIGHWAY
141
142
// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15
143
// in an SSE register into two sequences:
144
// 0, 2, 2, 4, ..., 12, 12, 14, <don't care>
145
// 1, 3, 3, 5, ..., 13, 13, 15, <don't care>
146
DECLARE_ALIGNED(16, static const uint8_t,
147
                even_mask[16]) = { 0, 2,  2,  4,  4,  6,  6,  8,
148
                                   8, 10, 10, 12, 12, 14, 14, 0 };
149
150
DECLARE_ALIGNED(16, static const uint8_t,
151
                odd_mask[16]) = { 1, 3,  3,  5,  5,  7,  7,  9,
152
                                  9, 11, 11, 13, 13, 15, 15, 0 };
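/* Editorial illustration (not part of the original file): for an input
   register whose byte i equals i, the two masks above produce exactly the
   sequences described in the comment. */
// __m128i in = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
//                            8, 9, 10, 11, 12, 13, 14, 15);
// _mm_shuffle_epi8(in, even_mask) -> 0 2 2 4 4 6 6 8 8 10 10 12 12 14 14 (x)
// _mm_shuffle_epi8(in, odd_mask)  -> 1 3 3 5 5 7 7 9 9 11 11 13 13 15 15 (x)
// where (x) is the final don't-care lane (it selects byte 0).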
153
154
DECLARE_ALIGNED(16, static const uint8_t,
155
                shuffle_alpha0_mask01[16]) = { 0, 1, 0, 1, 0, 1, 0, 1,
156
                                               0, 1, 0, 1, 0, 1, 0, 1 };
157
158
DECLARE_ALIGNED(16, static const uint8_t,
159
                shuffle_alpha0_mask23[16]) = { 2, 3, 2, 3, 2, 3, 2, 3,
160
                                               2, 3, 2, 3, 2, 3, 2, 3 };
161
162
DECLARE_ALIGNED(16, static const uint8_t,
163
                shuffle_alpha0_mask45[16]) = { 4, 5, 4, 5, 4, 5, 4, 5,
164
                                               4, 5, 4, 5, 4, 5, 4, 5 };
165
166
DECLARE_ALIGNED(16, static const uint8_t,
167
                shuffle_alpha0_mask67[16]) = { 6, 7, 6, 7, 6, 7, 6, 7,
168
                                               6, 7, 6, 7, 6, 7, 6, 7 };
169
170
DECLARE_ALIGNED(16, static const uint8_t,
171
                shuffle_gamma0_mask0[16]) = { 0, 1, 2, 3, 0, 1, 2, 3,
172
                                              0, 1, 2, 3, 0, 1, 2, 3 };
173
174
DECLARE_ALIGNED(16, static const uint8_t,
175
                shuffle_gamma0_mask1[16]) = { 4, 5, 6, 7, 4, 5, 6, 7,
176
                                              4, 5, 6, 7, 4, 5, 6, 7 };
177
178
DECLARE_ALIGNED(16, static const uint8_t,
179
                shuffle_gamma0_mask2[16]) = { 8, 9, 10, 11, 8, 9, 10, 11,
180
                                              8, 9, 10, 11, 8, 9, 10, 11 };
181
182
DECLARE_ALIGNED(16, static const uint8_t,
183
                shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15,
184
                                              12, 13, 14, 15, 12, 13, 14, 15 };
185
186
static inline void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
187
                                     const int offset_bits_horiz,
188
0
                                     const int reduce_bits_horiz, int k) {
189
0
  const __m128i src_even =
190
0
      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)even_mask));
191
0
  const __m128i src_odd =
192
0
      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)odd_mask));
193
  // The pixel order we need for 'src' is:
194
  // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
195
0
  const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
196
0
  const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]);
197
  // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
198
0
  const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
199
0
                                            _mm_srli_si128(src_odd, 4));
200
0
  const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]);
201
  // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
202
0
  const __m128i src_13 =
203
0
      _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
204
0
  const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]);
205
  // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
206
0
  const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
207
0
                                            _mm_srli_si128(src_even, 6));
208
0
  const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]);
209
210
0
  const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
211
0
                                             ((1 << reduce_bits_horiz) >> 1));
212
213
  // Note: The values res_02 + res_46 and res_13 + res_57 both
214
  // fit into int16s at this point, but their sum may be too wide to fit
215
  // into an int16. However, once we also add round_const, the sum of
216
  // all of these fits into a uint16.
217
  //
218
  // The wrapping behaviour of _mm_add_* is used here to make sure we
219
  // get the correct result despite converting between different
220
  // (implicit) types.
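  //
  // Editorial note (not part of the original file), making the bound concrete
  // for this 8-bit path: offset_bits_horiz = bd + FILTER_BITS - 1 = 14, so
  // round_const is roughly 1 << 14 = 16384 (plus the small rounding term).
  // Adding it shifts the possibly-negative signed sum into [0, 65535], i.e.
  // into uint16 range, which is why the logical shift _mm_srl_epi16 below
  // yields the correct result even though the int16 additions may wrap.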
221
0
  const __m128i res_even = _mm_add_epi16(res_02, res_46);
222
0
  const __m128i res_odd = _mm_add_epi16(res_13, res_57);
223
0
  const __m128i res =
224
0
      _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
225
0
  tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
226
0
}
227
228
static inline void prepare_horizontal_filter_coeff(int alpha, int sx,
229
0
                                                   __m128i *coeff) {
230
  // Filter even-index pixels
231
0
  const __m128i tmp_0 = _mm_loadl_epi64(
232
0
      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
233
0
  const __m128i tmp_1 = _mm_loadl_epi64(
234
0
      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
235
0
  const __m128i tmp_2 = _mm_loadl_epi64(
236
0
      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
237
0
  const __m128i tmp_3 = _mm_loadl_epi64(
238
0
      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
239
0
  const __m128i tmp_4 = _mm_loadl_epi64(
240
0
      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
241
0
  const __m128i tmp_5 = _mm_loadl_epi64(
242
0
      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
243
0
  const __m128i tmp_6 = _mm_loadl_epi64(
244
0
      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
245
0
  const __m128i tmp_7 = _mm_loadl_epi64(
246
0
      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
247
248
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
249
0
  const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
250
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3
251
0
  const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
252
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6
253
0
  const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
254
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7
255
0
  const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);
256
257
  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6
258
0
  const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
259
  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6
260
0
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
261
  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7
262
0
  const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
263
  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7
264
0
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
265
266
  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
267
0
  coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14);
268
  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
269
0
  coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14);
270
  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
271
0
  coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15);
272
  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
273
0
  coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15);
274
0
}
275
276
static inline void prepare_horizontal_filter_coeff_alpha0(int sx,
277
0
                                                          __m128i *coeff) {
278
  // Filter even-index pixels
279
0
  const __m128i tmp_0 =
280
0
      _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
281
282
  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
283
0
  coeff[0] =
284
0
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask01));
285
  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
286
0
  coeff[1] =
287
0
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask23));
288
  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
289
0
  coeff[2] =
290
0
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask45));
291
  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
292
0
  coeff[3] =
293
0
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask67));
294
0
}
295
296
static inline void horizontal_filter(__m128i src, __m128i *tmp, int sx,
297
                                     int alpha, int k,
298
                                     const int offset_bits_horiz,
299
0
                                     const int reduce_bits_horiz) {
300
0
  __m128i coeff[4];
301
0
  prepare_horizontal_filter_coeff(alpha, sx, coeff);
302
0
  filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
303
0
}
304
305
static inline void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp,
306
                                          int stride, int32_t ix4, int32_t iy4,
307
                                          int32_t sx4, int alpha, int beta,
308
                                          int p_height, int height, int i,
309
                                          const int offset_bits_horiz,
310
0
                                          const int reduce_bits_horiz) {
311
0
  int k;
312
0
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
313
0
    int iy = iy4 + k;
314
0
    if (iy < 0)
315
0
      iy = 0;
316
0
    else if (iy > height - 1)
317
0
      iy = height - 1;
318
0
    int sx = sx4 + beta * (k + 4);
319
320
    // Load source pixels
321
0
    const __m128i src =
322
0
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
323
0
    horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
324
0
                      reduce_bits_horiz);
325
0
  }
326
0
}
327
328
static inline void warp_horizontal_filter_alpha0(
329
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
330
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
331
0
    const int offset_bits_horiz, const int reduce_bits_horiz) {
332
0
  (void)alpha;
333
0
  int k;
334
0
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
335
0
    int iy = iy4 + k;
336
0
    if (iy < 0)
337
0
      iy = 0;
338
0
    else if (iy > height - 1)
339
0
      iy = height - 1;
340
0
    int sx = sx4 + beta * (k + 4);
341
342
    // Load source pixels
343
0
    const __m128i src =
344
0
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
345
346
0
    __m128i coeff[4];
347
0
    prepare_horizontal_filter_coeff_alpha0(sx, coeff);
348
0
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
349
0
  }
350
0
}
351
352
static inline void warp_horizontal_filter_beta0(
353
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
354
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
355
0
    const int offset_bits_horiz, const int reduce_bits_horiz) {
356
0
  (void)beta;
357
0
  int k;
358
0
  __m128i coeff[4];
359
0
  prepare_horizontal_filter_coeff(alpha, sx4, coeff);
360
361
0
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
362
0
    int iy = iy4 + k;
363
0
    if (iy < 0)
364
0
      iy = 0;
365
0
    else if (iy > height - 1)
366
0
      iy = height - 1;
367
368
    // Load source pixels
369
0
    const __m128i src =
370
0
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
371
0
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
372
0
  }
373
0
}
374
375
static inline void warp_horizontal_filter_alpha0_beta0(
376
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
377
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
378
0
    const int offset_bits_horiz, const int reduce_bits_horiz) {
379
0
  (void)beta;
380
0
  (void)alpha;
381
0
  int k;
382
383
0
  __m128i coeff[4];
384
0
  prepare_horizontal_filter_coeff_alpha0(sx4, coeff);
385
386
0
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
387
0
    int iy = iy4 + k;
388
0
    if (iy < 0)
389
0
      iy = 0;
390
0
    else if (iy > height - 1)
391
0
      iy = height - 1;
392
393
    // Load source pixels
394
0
    const __m128i src =
395
0
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
396
0
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
397
0
  }
398
0
}
399
400
static inline void unpack_weights_and_set_round_const(
401
    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
402
0
    __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) {
403
0
  *res_sub_const =
404
0
      _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
405
0
                     (1 << (offset_bits - conv_params->round_1 - 1)));
406
0
  *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));
407
408
0
  const int w0 = conv_params->fwd_offset;
409
0
  const int w1 = conv_params->bck_offset;
410
0
  const __m128i wt0 = _mm_set1_epi16((int16_t)w0);
411
0
  const __m128i wt1 = _mm_set1_epi16((int16_t)w1);
412
0
  *wt = _mm_unpacklo_epi16(wt0, wt1);
413
0
}
414
415
static inline void prepare_vertical_filter_coeffs(int gamma, int sy,
416
0
                                                  __m128i *coeffs) {
417
0
  const __m128i tmp_0 =
418
0
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
419
0
                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
420
0
  const __m128i tmp_2 =
421
0
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
422
0
                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
423
0
  const __m128i tmp_4 =
424
0
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
425
0
                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
426
0
  const __m128i tmp_6 =
427
0
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
428
0
                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
429
430
0
  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
431
0
  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
432
0
  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
433
0
  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
434
435
  // even coeffs
436
0
  coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
437
0
  coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10);
438
0
  coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14);
439
0
  coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14);
440
441
0
  const __m128i tmp_1 =
442
0
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
443
0
                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
444
0
  const __m128i tmp_3 =
445
0
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
446
0
                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
447
0
  const __m128i tmp_5 =
448
0
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
449
0
                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
450
0
  const __m128i tmp_7 =
451
0
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
452
0
                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
453
454
0
  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
455
0
  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
456
0
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
457
0
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
458
459
  // odd coeffs
460
0
  coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11);
461
0
  coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11);
462
0
  coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15);
463
0
  coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
464
0
}
465
466
static inline void prepare_vertical_filter_coeffs_gamma0(int sy,
467
0
                                                         __m128i *coeffs) {
468
0
  const __m128i tmp_0 = _mm_loadu_si128(
469
0
      (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
470
471
  // even coeffs
472
0
  coeffs[0] =
473
0
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask0));
474
0
  coeffs[1] =
475
0
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask1));
476
0
  coeffs[2] =
477
0
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask2));
478
0
  coeffs[3] =
479
0
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask3));
480
481
  // odd coeffs
482
0
  coeffs[4] = coeffs[0];
483
0
  coeffs[5] = coeffs[1];
484
0
  coeffs[6] = coeffs[2];
485
0
  coeffs[7] = coeffs[3];
486
0
}
487
488
static inline void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs,
489
                                              __m128i *res_lo, __m128i *res_hi,
490
0
                                              int k) {
491
  // Load from tmp and rearrange pairs of consecutive rows into the
492
  // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
493
0
  const __m128i *src = tmp + (k + 4);
494
0
  const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
495
0
  const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
496
0
  const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
497
0
  const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
498
499
0
  const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
500
0
  const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
501
0
  const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
502
0
  const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);
503
504
0
  const __m128i res_even =
505
0
      _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6));
506
507
  // Filter odd-index pixels
508
0
  const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
509
0
  const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
510
0
  const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
511
0
  const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
512
513
0
  const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]);
514
0
  const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]);
515
0
  const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]);
516
0
  const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]);
517
518
0
  const __m128i res_odd =
519
0
      _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7));
520
521
  // Rearrange pixels back into the order 0 ... 7
522
0
  *res_lo = _mm_unpacklo_epi32(res_even, res_odd);
523
0
  *res_hi = _mm_unpackhi_epi32(res_even, res_odd);
524
0
}
525
526
static inline void store_vertical_filter_output(
527
    __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const,
528
    const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const,
529
    uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k,
530
    const int reduce_bits_vert, int p_stride, int p_width,
531
0
    const int round_bits) {
532
0
  __m128i res_lo_1 = *res_lo;
533
0
  __m128i res_hi_1 = *res_hi;
534
535
0
  if (conv_params->is_compound) {
536
0
    __m128i *const p =
537
0
        (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
538
0
    res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const),
539
0
                              reduce_bits_vert);
540
0
    const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1);
541
0
    __m128i res_lo_16;
542
0
    if (conv_params->do_average) {
543
0
      __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
544
0
      const __m128i p_16 = _mm_loadl_epi64(p);
545
546
0
      if (conv_params->use_dist_wtd_comp_avg) {
547
0
        const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
548
0
        const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt);
549
0
        const __m128i shifted_32 =
550
0
            _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
551
0
        res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
552
0
      } else {
553
0
        res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
554
0
      }
555
556
0
      res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const);
557
558
0
      res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const),
559
0
                                 round_bits);
560
0
      __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
561
0
      *(int *)dst8 = _mm_cvtsi128_si32(res_8_lo);
562
0
    } else {
563
0
      _mm_storel_epi64(p, temp_lo_16);
564
0
    }
565
0
    if (p_width > 4) {
566
0
      __m128i *const p4 =
567
0
          (__m128i *)&conv_params
568
0
              ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
569
0
      res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const),
570
0
                                reduce_bits_vert);
571
0
      const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1);
572
0
      __m128i res_hi_16;
573
574
0
      if (conv_params->do_average) {
575
0
        __m128i *const dst8_4 =
576
0
            (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
577
0
        const __m128i p4_16 = _mm_loadl_epi64(p4);
578
579
0
        if (conv_params->use_dist_wtd_comp_avg) {
580
0
          const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
581
0
          const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt);
582
0
          const __m128i shifted_32 =
583
0
              _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
584
0
          res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
585
0
        } else {
586
0
          res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
587
0
        }
588
0
        res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const);
589
590
0
        res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const),
591
0
                                   round_bits);
592
0
        __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
593
0
        *(int *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);
594
595
0
      } else {
596
0
        _mm_storel_epi64(p4, temp_hi_16);
597
0
      }
598
0
    }
599
0
  } else {
600
0
    const __m128i res_lo_round = _mm_srai_epi32(
601
0
        _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
602
0
    const __m128i res_hi_round = _mm_srai_epi32(
603
0
        _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);
604
605
0
    const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
606
0
    __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
607
608
    // Store, blending with 'pred' if needed
609
0
    __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
610
611
    // Note: If we're outputting a 4x4 block, we need to be very careful
612
    // to only output 4 pixels at this point, to avoid encode/decode
613
    // mismatches when encoding with multiple threads.
614
0
    if (p_width == 4) {
615
0
      *(int *)p = _mm_cvtsi128_si32(res_8bit);
616
0
    } else {
617
0
      _mm_storel_epi64(p, res_8bit);
618
0
    }
619
0
  }
620
0
}
621
622
static inline void warp_vertical_filter(
623
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
624
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
625
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
626
0
    const int round_bits, const int offset_bits) {
627
0
  int k;
628
0
  __m128i res_sub_const, round_bits_const, wt;
629
0
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
630
0
                                     &res_sub_const, &round_bits_const, &wt);
631
  // Vertical filter
632
0
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
633
0
    int sy = sy4 + delta * (k + 4);
634
635
0
    __m128i coeffs[8];
636
0
    prepare_vertical_filter_coeffs(gamma, sy, coeffs);
637
638
0
    __m128i res_lo;
639
0
    __m128i res_hi;
640
0
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
641
642
0
    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
643
0
                                 &res_sub_const, &round_bits_const, pred,
644
0
                                 conv_params, i, j, k, reduce_bits_vert,
645
0
                                 p_stride, p_width, round_bits);
646
0
  }
647
0
}
648
649
static inline void warp_vertical_filter_gamma0(
650
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
651
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
652
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
653
0
    const int round_bits, const int offset_bits) {
654
0
  int k;
655
0
  (void)gamma;
656
0
  __m128i res_sub_const, round_bits_const, wt;
657
0
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
658
0
                                     &res_sub_const, &round_bits_const, &wt);
659
  // Vertical filter
660
0
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
661
0
    int sy = sy4 + delta * (k + 4);
662
663
0
    __m128i coeffs[8];
664
0
    prepare_vertical_filter_coeffs_gamma0(sy, coeffs);
665
666
0
    __m128i res_lo;
667
0
    __m128i res_hi;
668
0
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
669
670
0
    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
671
0
                                 &res_sub_const, &round_bits_const, pred,
672
0
                                 conv_params, i, j, k, reduce_bits_vert,
673
0
                                 p_stride, p_width, round_bits);
674
0
  }
675
0
}
676
677
static inline void warp_vertical_filter_delta0(
678
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
679
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
680
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
681
0
    const int round_bits, const int offset_bits) {
682
0
  (void)delta;
683
0
  int k;
684
0
  __m128i res_sub_const, round_bits_const, wt;
685
0
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
686
0
                                     &res_sub_const, &round_bits_const, &wt);
687
688
0
  __m128i coeffs[8];
689
0
  prepare_vertical_filter_coeffs(gamma, sy4, coeffs);
690
  // Vertical filter
691
0
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
692
0
    __m128i res_lo;
693
0
    __m128i res_hi;
694
0
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
695
696
0
    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
697
0
                                 &res_sub_const, &round_bits_const, pred,
698
0
                                 conv_params, i, j, k, reduce_bits_vert,
699
0
                                 p_stride, p_width, round_bits);
700
0
  }
701
0
}
702
703
static inline void warp_vertical_filter_gamma0_delta0(
704
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
705
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
706
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
707
0
    const int round_bits, const int offset_bits) {
708
0
  (void)delta;
709
0
  (void)gamma;
710
0
  int k;
711
0
  __m128i res_sub_const, round_bits_const, wt;
712
0
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
713
0
                                     &res_sub_const, &round_bits_const, &wt);
714
715
0
  __m128i coeffs[8];
716
0
  prepare_vertical_filter_coeffs_gamma0(sy4, coeffs);
717
  // Vertical filter
718
0
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
719
0
    __m128i res_lo;
720
0
    __m128i res_hi;
721
0
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
722
723
0
    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
724
0
                                 &res_sub_const, &round_bits_const, pred,
725
0
                                 conv_params, i, j, k, reduce_bits_vert,
726
0
                                 p_stride, p_width, round_bits);
727
0
  }
728
0
}
729
730
static inline void prepare_warp_vertical_filter(
731
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
732
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
733
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
734
0
    const int round_bits, const int offset_bits) {
735
0
  if (gamma == 0 && delta == 0)
736
0
    warp_vertical_filter_gamma0_delta0(
737
0
        pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j,
738
0
        sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits);
739
0
  else if (gamma == 0 && delta != 0)
740
0
    warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height,
741
0
                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
742
0
                                res_add_const, round_bits, offset_bits);
743
0
  else if (gamma != 0 && delta == 0)
744
0
    warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height,
745
0
                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
746
0
                                res_add_const, round_bits, offset_bits);
747
0
  else
748
0
    warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height,
749
0
                         p_stride, p_width, i, j, sy4, reduce_bits_vert,
750
0
                         res_add_const, round_bits, offset_bits);
751
0
}
752
753
static inline void prepare_warp_horizontal_filter(
754
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
755
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
756
0
    const int offset_bits_horiz, const int reduce_bits_horiz) {
757
0
  if (alpha == 0 && beta == 0)
758
0
    warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
759
0
                                        beta, p_height, height, i,
760
0
                                        offset_bits_horiz, reduce_bits_horiz);
761
0
  else if (alpha == 0 && beta != 0)
762
0
    warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
763
0
                                  p_height, height, i, offset_bits_horiz,
764
0
                                  reduce_bits_horiz);
765
0
  else if (alpha != 0 && beta == 0)
766
0
    warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
767
0
                                 p_height, height, i, offset_bits_horiz,
768
0
                                 reduce_bits_horiz);
769
0
  else
770
0
    warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
771
0
                           p_height, height, i, offset_bits_horiz,
772
0
                           reduce_bits_horiz);
773
0
}
774
775
void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
776
                            int height, int stride, uint8_t *pred, int p_col,
777
                            int p_row, int p_width, int p_height, int p_stride,
778
                            int subsampling_x, int subsampling_y,
779
                            ConvolveParams *conv_params, int16_t alpha,
780
0
                            int16_t beta, int16_t gamma, int16_t delta) {
781
0
  __m128i tmp[15];
782
0
  int i, j, k;
783
0
  const int bd = 8;
784
0
  const int reduce_bits_horiz = conv_params->round_0;
785
0
  const int reduce_bits_vert = conv_params->is_compound
786
0
                                   ? conv_params->round_1
787
0
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
788
0
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
789
0
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
790
791
0
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
792
0
  const __m128i reduce_bits_vert_const =
793
0
      _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
794
0
  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
795
0
  const int round_bits =
796
0
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
797
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
798
0
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
799
800
  /* Note: For this code to work, the left/right frame borders need to be
801
  extended by at least 13 pixels each. By the time we get here, other
802
  code will have set up this border, but we allow an explicit check
803
  for debugging purposes.
804
  */
805
  /*for (i = 0; i < height; ++i) {
806
  for (j = 0; j < 13; ++j) {
807
  assert(ref[i * stride - 13 + j] == ref[i * stride]);
808
  assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
809
  }
810
  }*/
811
0
  __m128i res_add_const_1;
812
0
  if (conv_params->is_compound == 1) {
813
0
    res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const);
814
0
  } else {
815
0
    res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
816
0
                                     ((1 << reduce_bits_vert) >> 1));
817
0
  }
818
819
0
  for (i = 0; i < p_height; i += 8) {
820
0
    for (j = 0; j < p_width; j += 8) {
821
0
      const int32_t src_x = (p_col + j + 4) << subsampling_x;
822
0
      const int32_t src_y = (p_row + i + 4) << subsampling_y;
823
0
      const int64_t dst_x =
824
0
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
825
0
      const int64_t dst_y =
826
0
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
827
0
      const int64_t x4 = dst_x >> subsampling_x;
828
0
      const int64_t y4 = dst_y >> subsampling_y;
829
830
0
      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
831
0
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
832
0
      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
833
0
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
834
835
      // Add in all the constant terms, including rounding and offset
836
0
      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
837
0
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
838
0
      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
839
0
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
840
841
0
      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
842
0
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
843
844
      // Horizontal filter
845
      // If the block is aligned such that, after clamping, every sample
846
      // would be taken from the leftmost/rightmost column, then we can
847
      // skip the expensive horizontal filter.
848
0
      if (ix4 <= -7) {
849
0
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
850
0
          int iy = iy4 + k;
851
0
          if (iy < 0)
852
0
            iy = 0;
853
0
          else if (iy > height - 1)
854
0
            iy = height - 1;
855
0
          tmp[k + 7] = _mm_set1_epi16(
856
0
              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
857
0
              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
858
0
        }
859
0
      } else if (ix4 >= width + 6) {
860
0
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
861
0
          int iy = iy4 + k;
862
0
          if (iy < 0)
863
0
            iy = 0;
864
0
          else if (iy > height - 1)
865
0
            iy = height - 1;
866
0
          tmp[k + 7] =
867
0
              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
868
0
                             ref[iy * stride + (width - 1)] *
869
0
                                 (1 << (FILTER_BITS - reduce_bits_horiz)));
870
0
        }
871
0
      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
872
0
        const int out_of_boundary_left = -(ix4 - 6);
873
0
        const int out_of_boundary_right = (ix4 + 8) - width;
874
0
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
875
0
          int iy = iy4 + k;
876
0
          if (iy < 0)
877
0
            iy = 0;
878
0
          else if (iy > height - 1)
879
0
            iy = height - 1;
880
0
          int sx = sx4 + beta * (k + 4);
881
882
          // Load source pixels
883
0
          __m128i src =
884
0
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
885
0
          if (out_of_boundary_left >= 0) {
886
0
            const __m128i shuffle_reg_left =
887
0
                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
888
0
            src = _mm_shuffle_epi8(src, shuffle_reg_left);
889
0
          }
890
0
          if (out_of_boundary_right >= 0) {
891
0
            const __m128i shuffle_reg_right = _mm_loadu_si128(
892
0
                (__m128i *)warp_pad_right[out_of_boundary_right]);
893
0
            src = _mm_shuffle_epi8(src, shuffle_reg_right);
894
0
          }
895
0
          horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
896
0
                            reduce_bits_horiz);
897
0
        }
898
0
      } else {
899
0
        prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha,
900
0
                                       beta, p_height, height, i,
901
0
                                       offset_bits_horiz, reduce_bits_horiz);
902
0
      }
903
904
      // Vertical filter
905
0
      prepare_warp_vertical_filter(
906
0
          pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i,
907
0
          j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits);
908
0
    }
909
0
  }
910
0
}
911
912
#endif  // !CONFIG_HIGHWAY