Coverage Report

Created: 2025-06-13 07:07

/src/aom/av1/common/x86/warp_plane_sse4.c
All instrumented lines in this file have an execution count of 0.
/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>
#include <smmintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"

/* This is a modified version of 'av1_warped_filter' from warped_motion.c:
   * Each coefficient is stored in 8 bits instead of 16 bits
   * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7

     This is done in order to avoid overflow: Since the tap with the largest
     coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
     order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
     convolve functions.

     Instead, we use the summation order
     ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
     The rearrangement of coefficients in this table is so that we can get the
     coefficients into the correct order more quickly.
*/
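
/* Illustrative sketch (not part of the original file): a hypothetical scalar
   equivalent of the summation order described above, assuming 'p' points at
   the pixel under the first tap and 'f' holds the 8 filter taps in natural
   (un-rearranged) order. The SIMD code below produces the same sums, two
   taps at a time. */
static inline int32_t warp_hfilter_sum_sketch(const uint8_t *p,
                                              const int8_t f[8]) {
  const int32_t even =
      (p[0] * f[0] + p[2] * f[2]) + (p[4] * f[4] + p[6] * f[6]);
  const int32_t odd =
      (p[1] * f[1] + p[3] * f[3]) + (p[5] * f[5] + p[7] * f[7]);
  // ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7))
  return even + odd;
}
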
/* clang-format off */
DECLARE_ALIGNED(8, const int8_t,
                av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
  // [-1, 0)
  { 0, 127,   0, 0,   0,   1, 0, 0}, { 0, 127,   0, 0,  -1,   2, 0, 0},
  { 1, 127,  -1, 0,  -3,   4, 0, 0}, { 1, 126,  -2, 0,  -4,   6, 1, 0},
  { 1, 126,  -3, 0,  -5,   8, 1, 0}, { 1, 125,  -4, 0,  -6,  11, 1, 0},
  { 1, 124,  -4, 0,  -7,  13, 1, 0}, { 2, 123,  -5, 0,  -8,  15, 1, 0},
  { 2, 122,  -6, 0,  -9,  18, 1, 0}, { 2, 121,  -6, 0, -10,  20, 1, 0},
  { 2, 120,  -7, 0, -11,  22, 2, 0}, { 2, 119,  -8, 0, -12,  25, 2, 0},
  { 3, 117,  -8, 0, -13,  27, 2, 0}, { 3, 116,  -9, 0, -13,  29, 2, 0},
  { 3, 114, -10, 0, -14,  32, 3, 0}, { 3, 113, -10, 0, -15,  35, 2, 0},
  { 3, 111, -11, 0, -15,  37, 3, 0}, { 3, 109, -11, 0, -16,  40, 3, 0},
  { 3, 108, -12, 0, -16,  42, 3, 0}, { 4, 106, -13, 0, -17,  45, 3, 0},
  { 4, 104, -13, 0, -17,  47, 3, 0}, { 4, 102, -14, 0, -17,  50, 3, 0},
  { 4, 100, -14, 0, -17,  52, 3, 0}, { 4,  98, -15, 0, -18,  55, 4, 0},
  { 4,  96, -15, 0, -18,  58, 3, 0}, { 4,  94, -16, 0, -18,  60, 4, 0},
  { 4,  91, -16, 0, -18,  63, 4, 0}, { 4,  89, -16, 0, -18,  65, 4, 0},
  { 4,  87, -17, 0, -18,  68, 4, 0}, { 4,  85, -17, 0, -18,  70, 4, 0},
  { 4,  82, -17, 0, -18,  73, 4, 0}, { 4,  80, -17, 0, -18,  75, 4, 0},
  { 4,  78, -18, 0, -18,  78, 4, 0}, { 4,  75, -18, 0, -17,  80, 4, 0},
  { 4,  73, -18, 0, -17,  82, 4, 0}, { 4,  70, -18, 0, -17,  85, 4, 0},
  { 4,  68, -18, 0, -17,  87, 4, 0}, { 4,  65, -18, 0, -16,  89, 4, 0},
  { 4,  63, -18, 0, -16,  91, 4, 0}, { 4,  60, -18, 0, -16,  94, 4, 0},
  { 3,  58, -18, 0, -15,  96, 4, 0}, { 4,  55, -18, 0, -15,  98, 4, 0},
  { 3,  52, -17, 0, -14, 100, 4, 0}, { 3,  50, -17, 0, -14, 102, 4, 0},
  { 3,  47, -17, 0, -13, 104, 4, 0}, { 3,  45, -17, 0, -13, 106, 4, 0},
  { 3,  42, -16, 0, -12, 108, 3, 0}, { 3,  40, -16, 0, -11, 109, 3, 0},
  { 3,  37, -15, 0, -11, 111, 3, 0}, { 2,  35, -15, 0, -10, 113, 3, 0},
  { 3,  32, -14, 0, -10, 114, 3, 0}, { 2,  29, -13, 0,  -9, 116, 3, 0},
  { 2,  27, -13, 0,  -8, 117, 3, 0}, { 2,  25, -12, 0,  -8, 119, 2, 0},
  { 2,  22, -11, 0,  -7, 120, 2, 0}, { 1,  20, -10, 0,  -6, 121, 2, 0},
  { 1,  18,  -9, 0,  -6, 122, 2, 0}, { 1,  15,  -8, 0,  -5, 123, 2, 0},
  { 1,  13,  -7, 0,  -4, 124, 1, 0}, { 1,  11,  -6, 0,  -4, 125, 1, 0},
  { 1,   8,  -5, 0,  -3, 126, 1, 0}, { 1,   6,  -4, 0,  -2, 126, 1, 0},
  { 0,   4,  -3, 0,  -1, 127, 1, 0}, { 0,   2,  -1, 0,   0, 127, 0, 0},
  // [0, 1)
  { 0,   0,   1, 0, 0, 127,   0,  0}, { 0,  -1,   2, 0, 0, 127,   0,  0},
  { 0,  -3,   4, 1, 1, 127,  -2,  0}, { 0,  -5,   6, 1, 1, 127,  -2,  0},
  { 0,  -6,   8, 1, 2, 126,  -3,  0}, {-1,  -7,  11, 2, 2, 126,  -4, -1},
  {-1,  -8,  13, 2, 3, 125,  -5, -1}, {-1, -10,  16, 3, 3, 124,  -6, -1},
  {-1, -11,  18, 3, 4, 123,  -7, -1}, {-1, -12,  20, 3, 4, 122,  -7, -1},
  {-1, -13,  23, 3, 4, 121,  -8, -1}, {-2, -14,  25, 4, 5, 120,  -9, -1},
  {-1, -15,  27, 4, 5, 119, -10, -1}, {-1, -16,  30, 4, 5, 118, -11, -1},
  {-2, -17,  33, 5, 6, 116, -12, -1}, {-2, -17,  35, 5, 6, 114, -12, -1},
  {-2, -18,  38, 5, 6, 113, -13, -1}, {-2, -19,  41, 6, 7, 111, -14, -2},
  {-2, -19,  43, 6, 7, 110, -15, -2}, {-2, -20,  46, 6, 7, 108, -15, -2},
  {-2, -20,  49, 6, 7, 106, -16, -2}, {-2, -21,  51, 7, 7, 104, -16, -2},
  {-2, -21,  54, 7, 7, 102, -17, -2}, {-2, -21,  56, 7, 8, 100, -18, -2},
  {-2, -22,  59, 7, 8,  98, -18, -2}, {-2, -22,  62, 7, 8,  96, -19, -2},
  {-2, -22,  64, 7, 8,  94, -19, -2}, {-2, -22,  67, 8, 8,  91, -20, -2},
  {-2, -22,  69, 8, 8,  89, -20, -2}, {-2, -22,  72, 8, 8,  87, -21, -2},
  {-2, -21,  74, 8, 8,  84, -21, -2}, {-2, -22,  77, 8, 8,  82, -21, -2},
  {-2, -21,  79, 8, 8,  79, -21, -2}, {-2, -21,  82, 8, 8,  77, -22, -2},
  {-2, -21,  84, 8, 8,  74, -21, -2}, {-2, -21,  87, 8, 8,  72, -22, -2},
  {-2, -20,  89, 8, 8,  69, -22, -2}, {-2, -20,  91, 8, 8,  67, -22, -2},
  {-2, -19,  94, 8, 7,  64, -22, -2}, {-2, -19,  96, 8, 7,  62, -22, -2},
  {-2, -18,  98, 8, 7,  59, -22, -2}, {-2, -18, 100, 8, 7,  56, -21, -2},
  {-2, -17, 102, 7, 7,  54, -21, -2}, {-2, -16, 104, 7, 7,  51, -21, -2},
  {-2, -16, 106, 7, 6,  49, -20, -2}, {-2, -15, 108, 7, 6,  46, -20, -2},
  {-2, -15, 110, 7, 6,  43, -19, -2}, {-2, -14, 111, 7, 6,  41, -19, -2},
  {-1, -13, 113, 6, 5,  38, -18, -2}, {-1, -12, 114, 6, 5,  35, -17, -2},
  {-1, -12, 116, 6, 5,  33, -17, -2}, {-1, -11, 118, 5, 4,  30, -16, -1},
  {-1, -10, 119, 5, 4,  27, -15, -1}, {-1,  -9, 120, 5, 4,  25, -14, -2},
  {-1,  -8, 121, 4, 3,  23, -13, -1}, {-1,  -7, 122, 4, 3,  20, -12, -1},
  {-1,  -7, 123, 4, 3,  18, -11, -1}, {-1,  -6, 124, 3, 3,  16, -10, -1},
  {-1,  -5, 125, 3, 2,  13,  -8, -1}, {-1,  -4, 126, 2, 2,  11,  -7, -1},
  { 0,  -3, 126, 2, 1,   8,  -6,  0}, { 0,  -2, 127, 1, 1,   6,  -5,  0},
  { 0,  -2, 127, 1, 1,   4,  -3,  0}, { 0,   0, 127, 0, 0,   2,  -1,  0},
  // [1, 2)
  { 0, 0, 127,   0, 0,   1,   0, 0}, { 0, 0, 127,   0, 0,  -1,   2, 0},
  { 0, 1, 127,  -1, 0,  -3,   4, 0}, { 0, 1, 126,  -2, 0,  -4,   6, 1},
  { 0, 1, 126,  -3, 0,  -5,   8, 1}, { 0, 1, 125,  -4, 0,  -6,  11, 1},
  { 0, 1, 124,  -4, 0,  -7,  13, 1}, { 0, 2, 123,  -5, 0,  -8,  15, 1},
  { 0, 2, 122,  -6, 0,  -9,  18, 1}, { 0, 2, 121,  -6, 0, -10,  20, 1},
  { 0, 2, 120,  -7, 0, -11,  22, 2}, { 0, 2, 119,  -8, 0, -12,  25, 2},
  { 0, 3, 117,  -8, 0, -13,  27, 2}, { 0, 3, 116,  -9, 0, -13,  29, 2},
  { 0, 3, 114, -10, 0, -14,  32, 3}, { 0, 3, 113, -10, 0, -15,  35, 2},
  { 0, 3, 111, -11, 0, -15,  37, 3}, { 0, 3, 109, -11, 0, -16,  40, 3},
  { 0, 3, 108, -12, 0, -16,  42, 3}, { 0, 4, 106, -13, 0, -17,  45, 3},
  { 0, 4, 104, -13, 0, -17,  47, 3}, { 0, 4, 102, -14, 0, -17,  50, 3},
  { 0, 4, 100, -14, 0, -17,  52, 3}, { 0, 4,  98, -15, 0, -18,  55, 4},
  { 0, 4,  96, -15, 0, -18,  58, 3}, { 0, 4,  94, -16, 0, -18,  60, 4},
  { 0, 4,  91, -16, 0, -18,  63, 4}, { 0, 4,  89, -16, 0, -18,  65, 4},
  { 0, 4,  87, -17, 0, -18,  68, 4}, { 0, 4,  85, -17, 0, -18,  70, 4},
  { 0, 4,  82, -17, 0, -18,  73, 4}, { 0, 4,  80, -17, 0, -18,  75, 4},
  { 0, 4,  78, -18, 0, -18,  78, 4}, { 0, 4,  75, -18, 0, -17,  80, 4},
  { 0, 4,  73, -18, 0, -17,  82, 4}, { 0, 4,  70, -18, 0, -17,  85, 4},
  { 0, 4,  68, -18, 0, -17,  87, 4}, { 0, 4,  65, -18, 0, -16,  89, 4},
  { 0, 4,  63, -18, 0, -16,  91, 4}, { 0, 4,  60, -18, 0, -16,  94, 4},
  { 0, 3,  58, -18, 0, -15,  96, 4}, { 0, 4,  55, -18, 0, -15,  98, 4},
  { 0, 3,  52, -17, 0, -14, 100, 4}, { 0, 3,  50, -17, 0, -14, 102, 4},
  { 0, 3,  47, -17, 0, -13, 104, 4}, { 0, 3,  45, -17, 0, -13, 106, 4},
  { 0, 3,  42, -16, 0, -12, 108, 3}, { 0, 3,  40, -16, 0, -11, 109, 3},
  { 0, 3,  37, -15, 0, -11, 111, 3}, { 0, 2,  35, -15, 0, -10, 113, 3},
  { 0, 3,  32, -14, 0, -10, 114, 3}, { 0, 2,  29, -13, 0,  -9, 116, 3},
  { 0, 2,  27, -13, 0,  -8, 117, 3}, { 0, 2,  25, -12, 0,  -8, 119, 2},
  { 0, 2,  22, -11, 0,  -7, 120, 2}, { 0, 1,  20, -10, 0,  -6, 121, 2},
  { 0, 1,  18,  -9, 0,  -6, 122, 2}, { 0, 1,  15,  -8, 0,  -5, 123, 2},
  { 0, 1,  13,  -7, 0,  -4, 124, 1}, { 0, 1,  11,  -6, 0,  -4, 125, 1},
  { 0, 1,   8,  -5, 0,  -3, 126, 1}, { 0, 1,   6,  -4, 0,  -2, 126, 1},
  { 0, 0,   4,  -3, 0,  -1, 127, 1}, { 0, 0,   2,  -1, 0,   0, 127, 0},
  // dummy (replicate row index 191)
  { 0, 0,   2,  -1, 0,   0, 127, 0},
};
/* clang-format on */

// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15
// in an SSE register into two sequences:
// 0, 2, 2, 4, ..., 12, 12, 14, <don't care>
// 1, 3, 3, 5, ..., 13, 13, 15, <don't care>
DECLARE_ALIGNED(16, static const uint8_t,
                even_mask[16]) = { 0, 2,  2,  4,  4,  6,  6,  8,
                                   8, 10, 10, 12, 12, 14, 14, 0 };

DECLARE_ALIGNED(16, static const uint8_t,
                odd_mask[16]) = { 1, 3,  3,  5,  5,  7,  7,  9,
                                  9, 11, 11, 13, 13, 15, 15, 0 };
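
/* Illustrative sketch (not part of the original file): the scalar effect of
   _mm_shuffle_epi8 with the two masks above. Byte i of each output is taken
   from input byte even_mask[i] (resp. odd_mask[i]); the final byte is a
   "don't care" for the filter below. Helper name is hypothetical. */
static inline void even_odd_shuffle_sketch(const uint8_t in[16],
                                           uint8_t out_even[16],
                                           uint8_t out_odd[16]) {
  for (int i = 0; i < 16; ++i) {
    out_even[i] = in[even_mask[i]];  // 0, 2, 2, 4, ..., 12, 14, 14, <x>
    out_odd[i] = in[odd_mask[i]];    // 1, 3, 3, 5, ..., 13, 15, 15, <x>
  }
}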

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask01[16]) = { 0, 1, 0, 1, 0, 1, 0, 1,
                                               0, 1, 0, 1, 0, 1, 0, 1 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask23[16]) = { 2, 3, 2, 3, 2, 3, 2, 3,
                                               2, 3, 2, 3, 2, 3, 2, 3 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask45[16]) = { 4, 5, 4, 5, 4, 5, 4, 5,
                                               4, 5, 4, 5, 4, 5, 4, 5 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask67[16]) = { 6, 7, 6, 7, 6, 7, 6, 7,
                                               6, 7, 6, 7, 6, 7, 6, 7 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask0[16]) = { 0, 1, 2, 3, 0, 1, 2, 3,
                                              0, 1, 2, 3, 0, 1, 2, 3 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask1[16]) = { 4, 5, 6, 7, 4, 5, 6, 7,
                                              4, 5, 6, 7, 4, 5, 6, 7 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask2[16]) = { 8, 9, 10, 11, 8, 9, 10, 11,
                                              8, 9, 10, 11, 8, 9, 10, 11 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15,
                                              12, 13, 14, 15, 12, 13, 14, 15 };

static inline void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
                                     const int offset_bits_horiz,
                                     const int reduce_bits_horiz, int k) {
  const __m128i src_even =
      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)even_mask));
  const __m128i src_odd =
      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)odd_mask));
  // The pixel order we need for 'src' is:
  // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
  const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
  const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]);
  // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
  const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
                                            _mm_srli_si128(src_odd, 4));
  const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]);
  // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
  const __m128i src_13 =
      _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
  const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]);
  // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
  const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
                                            _mm_srli_si128(src_even, 6));
  const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]);

  const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
                                             ((1 << reduce_bits_horiz) >> 1));

  // Note: The values res_02 + res_46 and res_13 + res_57 both
  // fit into int16s at this point, but their sum may be too wide to fit
  // into an int16. However, once we also add round_const, the sum of
  // all of these fits into a uint16.
  //
  // The wrapping behaviour of _mm_add_* is used here to make sure we
  // get the correct result despite converting between different
  // (implicit) types.
  const __m128i res_even = _mm_add_epi16(res_02, res_46);
  const __m128i res_odd = _mm_add_epi16(res_13, res_57);
  const __m128i res =
      _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
  tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
}
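
/* Illustrative sketch (not part of the original file): the scalar value that
   filter_src_pixels() stores into tmp[k + 7] for one pixel, given the raw
   8-tap sum computed as in warp_hfilter_sum_sketch() earlier in this file.
   Helper name is hypothetical. */
static inline uint16_t warp_hfilter_round_sketch(int32_t sum,
                                                 int offset_bits_horiz,
                                                 int reduce_bits_horiz) {
  const int32_t round_const =
      (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1);
  return (uint16_t)((sum + round_const) >> reduce_bits_horiz);
}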

static inline void prepare_horizontal_filter_coeff(int alpha, int sx,
                                                   __m128i *coeff) {
  // Filter even-index pixels
  const __m128i tmp_0 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_1 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_2 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_3 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_4 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_5 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_6 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_7 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);

  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
  const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3
  const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6
  const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7
  const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);

  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6
  const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7
  const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);

  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
  coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
  coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14);
  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
  coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
  coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}

static inline void prepare_horizontal_filter_coeff_alpha0(int sx,
                                                          __m128i *coeff) {
  // Filter even-index pixels
  const __m128i tmp_0 =
      _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);

  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
  coeff[0] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask01));
  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
  coeff[1] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask23));
  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
  coeff[2] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask45));
  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
  coeff[3] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask67));
}

static inline void horizontal_filter(__m128i src, __m128i *tmp, int sx,
                                     int alpha, int k,
                                     const int offset_bits_horiz,
                                     const int reduce_bits_horiz) {
  __m128i coeff[4];
  prepare_horizontal_filter_coeff(alpha, sx, coeff);
  filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
}

static inline void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp,
                                          int stride, int32_t ix4, int32_t iy4,
                                          int32_t sx4, int alpha, int beta,
                                          int p_height, int height, int i,
                                          const int offset_bits_horiz,
                                          const int reduce_bits_horiz) {
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
                      reduce_bits_horiz);
  }
}

static inline void warp_horizontal_filter_alpha0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)alpha;
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));

    __m128i coeff[4];
    prepare_horizontal_filter_coeff_alpha0(sx, coeff);
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

static inline void warp_horizontal_filter_beta0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  int k;
  __m128i coeff[4];
  prepare_horizontal_filter_coeff(alpha, sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

static inline void warp_horizontal_filter_alpha0_beta0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  (void)alpha;
  int k;

  __m128i coeff[4];
  prepare_horizontal_filter_coeff_alpha0(sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

static inline void unpack_weights_and_set_round_const(
    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
    __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) {
  *res_sub_const =
      _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
                     (1 << (offset_bits - conv_params->round_1 - 1)));
  *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi16((int16_t)w0);
  const __m128i wt1 = _mm_set1_epi16((int16_t)w1);
  *wt = _mm_unpacklo_epi16(wt0, wt1);
}
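
/* Illustrative sketch (not part of the original file): scalar form of the
   distance-weighted compound average that store_vertical_filter_output()
   applies with the interleaved weights in 'wt', assuming 'first' is the value
   already in conv_params->dst (weighted by w0 = fwd_offset) and 'second' the
   current warp result (weighted by w1 = bck_offset). Helper name is
   hypothetical. */
static inline int32_t dist_wtd_avg_sketch(int32_t first, int32_t second,
                                          int w0, int w1) {
  return (first * w0 + second * w1) >> DIST_PRECISION_BITS;
}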

static inline void prepare_vertical_filter_coeffs(int gamma, int sy,
                                                  __m128i *coeffs) {
  const __m128i tmp_0 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_2 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_4 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_6 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

  // even coeffs
  coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
  coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10);
  coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14);

  const __m128i tmp_1 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_3 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_5 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_7 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

  // odd coeffs
  coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11);
  coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11);
  coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}

static inline void prepare_vertical_filter_coeffs_gamma0(int sy,
                                                         __m128i *coeffs) {
  const __m128i tmp_0 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));

  // even coeffs
  coeffs[0] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask0));
  coeffs[1] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask1));
  coeffs[2] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask2));
  coeffs[3] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask3));

  // odd coeffs
  coeffs[4] = coeffs[0];
  coeffs[5] = coeffs[1];
  coeffs[6] = coeffs[2];
  coeffs[7] = coeffs[3];
}

static inline void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs,
                                              __m128i *res_lo, __m128i *res_hi,
                                              int k) {
  // Load from tmp and rearrange pairs of consecutive rows into the
  // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
  const __m128i *src = tmp + (k + 4);
  const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
  const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
  const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
  const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);

  const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
  const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
  const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
  const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);

  const __m128i res_even =
      _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6));

  // Filter odd-index pixels
  const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
  const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
  const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
  const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);

  const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]);
  const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]);
  const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]);
  const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]);

  const __m128i res_odd =
      _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7));

  // Rearrange pixels back into the order 0 ... 7
  *res_lo = _mm_unpacklo_epi32(res_even, res_odd);
  *res_hi = _mm_unpackhi_epi32(res_even, res_odd);
}
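
/* Illustrative sketch (not part of the original file): scalar equivalent of
   one 32-bit vertical output produced above, assuming col[r] holds the
   horizontal filter result for row r of this column and 'g' holds the 8
   vertical taps from av1_warped_filter. Helper name is hypothetical. */
static inline int32_t warp_vfilter_sum_sketch(const int16_t col[8],
                                              const int16_t g[8]) {
  int32_t sum = 0;
  for (int r = 0; r < 8; ++r) sum += col[r] * g[r];
  return sum;
}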

static inline void store_vertical_filter_output(
    __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const,
    const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const,
    uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k,
    const int reduce_bits_vert, int p_stride, int p_width,
    const int round_bits) {
  __m128i res_lo_1 = *res_lo;
  __m128i res_hi_1 = *res_hi;

  if (conv_params->is_compound) {
    __m128i *const p =
        (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
    res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const),
                              reduce_bits_vert);
    const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1);
    __m128i res_lo_16;
    if (conv_params->do_average) {
      __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
      const __m128i p_16 = _mm_loadl_epi64(p);

      if (conv_params->use_dist_wtd_comp_avg) {
        const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
        const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt);
        const __m128i shifted_32 =
            _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
        res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
      } else {
        res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
      }

      res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const);

      res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const),
                                 round_bits);
      __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
      *(int *)dst8 = _mm_cvtsi128_si32(res_8_lo);
    } else {
      _mm_storel_epi64(p, temp_lo_16);
    }
    if (p_width > 4) {
      __m128i *const p4 =
          (__m128i *)&conv_params
              ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
      res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const),
                                reduce_bits_vert);
      const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1);
      __m128i res_hi_16;

      if (conv_params->do_average) {
        __m128i *const dst8_4 =
            (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
        const __m128i p4_16 = _mm_loadl_epi64(p4);

        if (conv_params->use_dist_wtd_comp_avg) {
          const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
          const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt);
          const __m128i shifted_32 =
              _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
          res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
        } else {
          res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
        }
        res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const);

        res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const),
                                   round_bits);
        __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
        *(int *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);

      } else {
        _mm_storel_epi64(p4, temp_hi_16);
      }
    }
  } else {
    const __m128i res_lo_round = _mm_srai_epi32(
        _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
    const __m128i res_hi_round = _mm_srai_epi32(
        _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);

    const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
    __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);

    // Store, blending with 'pred' if needed
    __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

    // Note: If we're outputting a 4x4 block, we need to be very careful
    // to only output 4 pixels at this point, to avoid encode/decode
    // mismatches when encoding with multiple threads.
    if (p_width == 4) {
      *(int *)p = _mm_cvtsi128_si32(res_8bit);
    } else {
      _mm_storel_epi64(p, res_8bit);
    }
  }
}
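
/* Illustrative sketch (not part of the original file): scalar form of the
   non-compound output path above, assuming 'res' is one 32-bit vertical
   result and 'add_const' the rounding/offset constant held in res_add_const.
   The packs/packus pair amounts to clamping the rounded value to [0, 255].
   Helper name is hypothetical. */
static inline uint8_t warp_store_scalar_sketch(int32_t res, int32_t add_const,
                                               int reduce_bits_vert) {
  int32_t v = (res + add_const) >> reduce_bits_vert;
  if (v < 0) v = 0;
  if (v > 255) v = 255;
  return (uint8_t)v;
}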

static inline void warp_vertical_filter(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    int sy = sy4 + delta * (k + 4);

    __m128i coeffs[8];
    prepare_vertical_filter_coeffs(gamma, sy, coeffs);

    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static inline void warp_vertical_filter_gamma0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  int k;
  (void)gamma;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    int sy = sy4 + delta * (k + 4);

    __m128i coeffs[8];
    prepare_vertical_filter_coeffs_gamma0(sy, coeffs);

    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static inline void warp_vertical_filter_delta0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  (void)delta;
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);

  __m128i coeffs[8];
  prepare_vertical_filter_coeffs(gamma, sy4, coeffs);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static inline void warp_vertical_filter_gamma0_delta0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  (void)delta;
  (void)gamma;
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);

  __m128i coeffs[8];
  prepare_vertical_filter_coeffs_gamma0(sy4, coeffs);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static inline void prepare_warp_vertical_filter(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  if (gamma == 0 && delta == 0)
    warp_vertical_filter_gamma0_delta0(
        pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j,
        sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits);
  else if (gamma == 0 && delta != 0)
    warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height,
                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
                                res_add_const, round_bits, offset_bits);
  else if (gamma != 0 && delta == 0)
    warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height,
                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
                                res_add_const, round_bits, offset_bits);
  else
    warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height,
                         p_stride, p_width, i, j, sy4, reduce_bits_vert,
                         res_add_const, round_bits, offset_bits);
}

static inline void prepare_warp_horizontal_filter(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  if (alpha == 0 && beta == 0)
    warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                        beta, p_height, height, i,
                                        offset_bits_horiz, reduce_bits_horiz);
  else if (alpha == 0 && beta != 0)
    warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                  p_height, height, i, offset_bits_horiz,
                                  reduce_bits_horiz);
  else if (alpha != 0 && beta == 0)
    warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                 p_height, height, i, offset_bits_horiz,
                                 reduce_bits_horiz);
  else
    warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                           p_height, height, i, offset_bits_horiz,
                           reduce_bits_horiz);
}

void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
                            int height, int stride, uint8_t *pred, int p_col,
                            int p_row, int p_width, int p_height, int p_stride,
                            int subsampling_x, int subsampling_y,
                            ConvolveParams *conv_params, int16_t alpha,
                            int16_t beta, int16_t gamma, int16_t delta) {
  __m128i tmp[15];
  int i, j, k;
  const int bd = 8;
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));

  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const __m128i reduce_bits_vert_const =
      _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  /* Note: For this code to work, the left/right frame borders need to be
  extended by at least 13 pixels each. By the time we get here, other
  code will have set up this border, but we allow an explicit check
  for debugging purposes.
  */
  /*for (i = 0; i < height; ++i) {
  for (j = 0; j < 13; ++j) {
  assert(ref[i * stride - 13 + j] == ref[i * stride]);
  assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
  }
  }*/
  __m128i res_add_const_1;
  if (conv_params->is_compound == 1) {
    res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const);
  } else {
    res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
                                     ((1 << reduce_bits_vert) >> 1));
  }

  for (i = 0; i < p_height; i += 8) {
    for (j = 0; j < p_width; j += 8) {
      const int32_t src_x = (p_col + j + 4) << subsampling_x;
      const int32_t src_y = (p_row + i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Add in all the constant terms, including rounding and offset
      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      // If the block is aligned such that, after clamping, every sample
      // would be taken from the leftmost/rightmost column, then we can
      // skip the expensive horizontal filter.
      if (ix4 <= -7) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] = _mm_set1_epi16(
              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (ix4 >= width + 6) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] =
              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
                             ref[iy * stride + (width - 1)] *
                                 (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
        const int out_of_boundary_left = -(ix4 - 6);
        const int out_of_boundary_right = (ix4 + 8) - width;
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          int sx = sx4 + beta * (k + 4);

          // Load source pixels
          __m128i src =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
          if (out_of_boundary_left >= 0) {
            const __m128i shuffle_reg_left =
                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
            src = _mm_shuffle_epi8(src, shuffle_reg_left);
          }
          if (out_of_boundary_right >= 0) {
            const __m128i shuffle_reg_right = _mm_loadu_si128(
                (__m128i *)warp_pad_right[out_of_boundary_right]);
            src = _mm_shuffle_epi8(src, shuffle_reg_right);
          }
          horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
                            reduce_bits_horiz);
        }
      } else {
        prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                       beta, p_height, height, i,
                                       offset_bits_horiz, reduce_bits_horiz);
      }

      // Vertical filter
      prepare_warp_vertical_filter(
          pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i,
          j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits);
    }
  }
}