Coverage Report

Created: 2026-02-14 07:09

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/av1/common/warped_motion.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <stdio.h>
13
#include <stdlib.h>
14
#include <memory.h>
15
#include <math.h>
16
#include <assert.h>
17
18
#include "config/av1_rtcd.h"
19
20
#include "av1/common/av1_common_int.h"
21
#include "av1/common/warped_motion.h"
22
#include "av1/common/scale.h"
23
24
// For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
25
// at a time. The zoom/rotation/shear in the model are applied to the
26
// "fractional" position of each pixel, which therefore varies within
27
// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
28
// We need an extra 2 taps to fit this in, for a total of 8 taps.
29
/* clang-format off */
30
const WarpedFilterCoeff av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1]
31
                                         [8] = {
32
  // [-1, 0)
33
  { 0,   0, 127,   1,   0, 0, 0, 0 }, { 0, - 1, 127,   2,   0, 0, 0, 0 },
34
  { 1, - 3, 127,   4, - 1, 0, 0, 0 }, { 1, - 4, 126,   6, - 2, 1, 0, 0 },
35
  { 1, - 5, 126,   8, - 3, 1, 0, 0 }, { 1, - 6, 125,  11, - 4, 1, 0, 0 },
36
  { 1, - 7, 124,  13, - 4, 1, 0, 0 }, { 2, - 8, 123,  15, - 5, 1, 0, 0 },
37
  { 2, - 9, 122,  18, - 6, 1, 0, 0 }, { 2, -10, 121,  20, - 6, 1, 0, 0 },
38
  { 2, -11, 120,  22, - 7, 2, 0, 0 }, { 2, -12, 119,  25, - 8, 2, 0, 0 },
39
  { 3, -13, 117,  27, - 8, 2, 0, 0 }, { 3, -13, 116,  29, - 9, 2, 0, 0 },
40
  { 3, -14, 114,  32, -10, 3, 0, 0 }, { 3, -15, 113,  35, -10, 2, 0, 0 },
41
  { 3, -15, 111,  37, -11, 3, 0, 0 }, { 3, -16, 109,  40, -11, 3, 0, 0 },
42
  { 3, -16, 108,  42, -12, 3, 0, 0 }, { 4, -17, 106,  45, -13, 3, 0, 0 },
43
  { 4, -17, 104,  47, -13, 3, 0, 0 }, { 4, -17, 102,  50, -14, 3, 0, 0 },
44
  { 4, -17, 100,  52, -14, 3, 0, 0 }, { 4, -18,  98,  55, -15, 4, 0, 0 },
45
  { 4, -18,  96,  58, -15, 3, 0, 0 }, { 4, -18,  94,  60, -16, 4, 0, 0 },
46
  { 4, -18,  91,  63, -16, 4, 0, 0 }, { 4, -18,  89,  65, -16, 4, 0, 0 },
47
  { 4, -18,  87,  68, -17, 4, 0, 0 }, { 4, -18,  85,  70, -17, 4, 0, 0 },
48
  { 4, -18,  82,  73, -17, 4, 0, 0 }, { 4, -18,  80,  75, -17, 4, 0, 0 },
49
  { 4, -18,  78,  78, -18, 4, 0, 0 }, { 4, -17,  75,  80, -18, 4, 0, 0 },
50
  { 4, -17,  73,  82, -18, 4, 0, 0 }, { 4, -17,  70,  85, -18, 4, 0, 0 },
51
  { 4, -17,  68,  87, -18, 4, 0, 0 }, { 4, -16,  65,  89, -18, 4, 0, 0 },
52
  { 4, -16,  63,  91, -18, 4, 0, 0 }, { 4, -16,  60,  94, -18, 4, 0, 0 },
53
  { 3, -15,  58,  96, -18, 4, 0, 0 }, { 4, -15,  55,  98, -18, 4, 0, 0 },
54
  { 3, -14,  52, 100, -17, 4, 0, 0 }, { 3, -14,  50, 102, -17, 4, 0, 0 },
55
  { 3, -13,  47, 104, -17, 4, 0, 0 }, { 3, -13,  45, 106, -17, 4, 0, 0 },
56
  { 3, -12,  42, 108, -16, 3, 0, 0 }, { 3, -11,  40, 109, -16, 3, 0, 0 },
57
  { 3, -11,  37, 111, -15, 3, 0, 0 }, { 2, -10,  35, 113, -15, 3, 0, 0 },
58
  { 3, -10,  32, 114, -14, 3, 0, 0 }, { 2, - 9,  29, 116, -13, 3, 0, 0 },
59
  { 2, - 8,  27, 117, -13, 3, 0, 0 }, { 2, - 8,  25, 119, -12, 2, 0, 0 },
60
  { 2, - 7,  22, 120, -11, 2, 0, 0 }, { 1, - 6,  20, 121, -10, 2, 0, 0 },
61
  { 1, - 6,  18, 122, - 9, 2, 0, 0 }, { 1, - 5,  15, 123, - 8, 2, 0, 0 },
62
  { 1, - 4,  13, 124, - 7, 1, 0, 0 }, { 1, - 4,  11, 125, - 6, 1, 0, 0 },
63
  { 1, - 3,   8, 126, - 5, 1, 0, 0 }, { 1, - 2,   6, 126, - 4, 1, 0, 0 },
64
  { 0, - 1,   4, 127, - 3, 1, 0, 0 }, { 0,   0,   2, 127, - 1, 0, 0, 0 },
65
66
  // [0, 1)
67
  { 0,  0,   0, 127,   1,   0,  0,  0}, { 0,  0,  -1, 127,   2,   0,  0,  0},
68
  { 0,  1,  -3, 127,   4,  -2,  1,  0}, { 0,  1,  -5, 127,   6,  -2,  1,  0},
69
  { 0,  2,  -6, 126,   8,  -3,  1,  0}, {-1,  2,  -7, 126,  11,  -4,  2, -1},
70
  {-1,  3,  -8, 125,  13,  -5,  2, -1}, {-1,  3, -10, 124,  16,  -6,  3, -1},
71
  {-1,  4, -11, 123,  18,  -7,  3, -1}, {-1,  4, -12, 122,  20,  -7,  3, -1},
72
  {-1,  4, -13, 121,  23,  -8,  3, -1}, {-2,  5, -14, 120,  25,  -9,  4, -1},
73
  {-1,  5, -15, 119,  27, -10,  4, -1}, {-1,  5, -16, 118,  30, -11,  4, -1},
74
  {-2,  6, -17, 116,  33, -12,  5, -1}, {-2,  6, -17, 114,  35, -12,  5, -1},
75
  {-2,  6, -18, 113,  38, -13,  5, -1}, {-2,  7, -19, 111,  41, -14,  6, -2},
76
  {-2,  7, -19, 110,  43, -15,  6, -2}, {-2,  7, -20, 108,  46, -15,  6, -2},
77
  {-2,  7, -20, 106,  49, -16,  6, -2}, {-2,  7, -21, 104,  51, -16,  7, -2},
78
  {-2,  7, -21, 102,  54, -17,  7, -2}, {-2,  8, -21, 100,  56, -18,  7, -2},
79
  {-2,  8, -22,  98,  59, -18,  7, -2}, {-2,  8, -22,  96,  62, -19,  7, -2},
80
  {-2,  8, -22,  94,  64, -19,  7, -2}, {-2,  8, -22,  91,  67, -20,  8, -2},
81
  {-2,  8, -22,  89,  69, -20,  8, -2}, {-2,  8, -22,  87,  72, -21,  8, -2},
82
  {-2,  8, -21,  84,  74, -21,  8, -2}, {-2,  8, -22,  82,  77, -21,  8, -2},
83
  {-2,  8, -21,  79,  79, -21,  8, -2}, {-2,  8, -21,  77,  82, -22,  8, -2},
84
  {-2,  8, -21,  74,  84, -21,  8, -2}, {-2,  8, -21,  72,  87, -22,  8, -2},
85
  {-2,  8, -20,  69,  89, -22,  8, -2}, {-2,  8, -20,  67,  91, -22,  8, -2},
86
  {-2,  7, -19,  64,  94, -22,  8, -2}, {-2,  7, -19,  62,  96, -22,  8, -2},
87
  {-2,  7, -18,  59,  98, -22,  8, -2}, {-2,  7, -18,  56, 100, -21,  8, -2},
88
  {-2,  7, -17,  54, 102, -21,  7, -2}, {-2,  7, -16,  51, 104, -21,  7, -2},
89
  {-2,  6, -16,  49, 106, -20,  7, -2}, {-2,  6, -15,  46, 108, -20,  7, -2},
90
  {-2,  6, -15,  43, 110, -19,  7, -2}, {-2,  6, -14,  41, 111, -19,  7, -2},
91
  {-1,  5, -13,  38, 113, -18,  6, -2}, {-1,  5, -12,  35, 114, -17,  6, -2},
92
  {-1,  5, -12,  33, 116, -17,  6, -2}, {-1,  4, -11,  30, 118, -16,  5, -1},
93
  {-1,  4, -10,  27, 119, -15,  5, -1}, {-1,  4,  -9,  25, 120, -14,  5, -2},
94
  {-1,  3,  -8,  23, 121, -13,  4, -1}, {-1,  3,  -7,  20, 122, -12,  4, -1},
95
  {-1,  3,  -7,  18, 123, -11,  4, -1}, {-1,  3,  -6,  16, 124, -10,  3, -1},
96
  {-1,  2,  -5,  13, 125,  -8,  3, -1}, {-1,  2,  -4,  11, 126,  -7,  2, -1},
97
  { 0,  1,  -3,   8, 126,  -6,  2,  0}, { 0,  1,  -2,   6, 127,  -5,  1,  0},
98
  { 0,  1,  -2,   4, 127,  -3,  1,  0}, { 0,  0,   0,   2, 127,  -1,  0,  0},
99
100
  // [1, 2)
101
  { 0, 0, 0,   1, 127,   0,   0, 0 }, { 0, 0, 0, - 1, 127,   2,   0, 0 },
102
  { 0, 0, 1, - 3, 127,   4, - 1, 0 }, { 0, 0, 1, - 4, 126,   6, - 2, 1 },
103
  { 0, 0, 1, - 5, 126,   8, - 3, 1 }, { 0, 0, 1, - 6, 125,  11, - 4, 1 },
104
  { 0, 0, 1, - 7, 124,  13, - 4, 1 }, { 0, 0, 2, - 8, 123,  15, - 5, 1 },
105
  { 0, 0, 2, - 9, 122,  18, - 6, 1 }, { 0, 0, 2, -10, 121,  20, - 6, 1 },
106
  { 0, 0, 2, -11, 120,  22, - 7, 2 }, { 0, 0, 2, -12, 119,  25, - 8, 2 },
107
  { 0, 0, 3, -13, 117,  27, - 8, 2 }, { 0, 0, 3, -13, 116,  29, - 9, 2 },
108
  { 0, 0, 3, -14, 114,  32, -10, 3 }, { 0, 0, 3, -15, 113,  35, -10, 2 },
109
  { 0, 0, 3, -15, 111,  37, -11, 3 }, { 0, 0, 3, -16, 109,  40, -11, 3 },
110
  { 0, 0, 3, -16, 108,  42, -12, 3 }, { 0, 0, 4, -17, 106,  45, -13, 3 },
111
  { 0, 0, 4, -17, 104,  47, -13, 3 }, { 0, 0, 4, -17, 102,  50, -14, 3 },
112
  { 0, 0, 4, -17, 100,  52, -14, 3 }, { 0, 0, 4, -18,  98,  55, -15, 4 },
113
  { 0, 0, 4, -18,  96,  58, -15, 3 }, { 0, 0, 4, -18,  94,  60, -16, 4 },
114
  { 0, 0, 4, -18,  91,  63, -16, 4 }, { 0, 0, 4, -18,  89,  65, -16, 4 },
115
  { 0, 0, 4, -18,  87,  68, -17, 4 }, { 0, 0, 4, -18,  85,  70, -17, 4 },
116
  { 0, 0, 4, -18,  82,  73, -17, 4 }, { 0, 0, 4, -18,  80,  75, -17, 4 },
117
  { 0, 0, 4, -18,  78,  78, -18, 4 }, { 0, 0, 4, -17,  75,  80, -18, 4 },
118
  { 0, 0, 4, -17,  73,  82, -18, 4 }, { 0, 0, 4, -17,  70,  85, -18, 4 },
119
  { 0, 0, 4, -17,  68,  87, -18, 4 }, { 0, 0, 4, -16,  65,  89, -18, 4 },
120
  { 0, 0, 4, -16,  63,  91, -18, 4 }, { 0, 0, 4, -16,  60,  94, -18, 4 },
121
  { 0, 0, 3, -15,  58,  96, -18, 4 }, { 0, 0, 4, -15,  55,  98, -18, 4 },
122
  { 0, 0, 3, -14,  52, 100, -17, 4 }, { 0, 0, 3, -14,  50, 102, -17, 4 },
123
  { 0, 0, 3, -13,  47, 104, -17, 4 }, { 0, 0, 3, -13,  45, 106, -17, 4 },
124
  { 0, 0, 3, -12,  42, 108, -16, 3 }, { 0, 0, 3, -11,  40, 109, -16, 3 },
125
  { 0, 0, 3, -11,  37, 111, -15, 3 }, { 0, 0, 2, -10,  35, 113, -15, 3 },
126
  { 0, 0, 3, -10,  32, 114, -14, 3 }, { 0, 0, 2, - 9,  29, 116, -13, 3 },
127
  { 0, 0, 2, - 8,  27, 117, -13, 3 }, { 0, 0, 2, - 8,  25, 119, -12, 2 },
128
  { 0, 0, 2, - 7,  22, 120, -11, 2 }, { 0, 0, 1, - 6,  20, 121, -10, 2 },
129
  { 0, 0, 1, - 6,  18, 122, - 9, 2 }, { 0, 0, 1, - 5,  15, 123, - 8, 2 },
130
  { 0, 0, 1, - 4,  13, 124, - 7, 1 }, { 0, 0, 1, - 4,  11, 125, - 6, 1 },
131
  { 0, 0, 1, - 3,   8, 126, - 5, 1 }, { 0, 0, 1, - 2,   6, 126, - 4, 1 },
132
  { 0, 0, 0, - 1,   4, 127, - 3, 1 }, { 0, 0, 0,   0,   2, 127, - 1, 0 },
133
  // dummy (replicate row index 191)
134
  { 0, 0, 0,   0,   2, 127, - 1, 0 },
135
};
136
137
/* clang-format on */
138
139
92.5k
#define DIV_LUT_PREC_BITS 14
140
92.6k
#define DIV_LUT_BITS 8
141
#define DIV_LUT_NUM (1 << DIV_LUT_BITS)
142
143
static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
144
  16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
145
  15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
146
  15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
147
  14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
148
  13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
149
  13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
150
  13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
151
  12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
152
  12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
153
  11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
154
  11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
155
  11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
156
  10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
157
  10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
158
  10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
159
  9963,  9939,  9916,  9892,  9869,  9846,  9823,  9800,  9777,  9754,  9732,
160
  9709,  9687,  9664,  9642,  9620,  9598,  9576,  9554,  9533,  9511,  9489,
161
  9468,  9447,  9425,  9404,  9383,  9362,  9341,  9321,  9300,  9279,  9259,
162
  9239,  9218,  9198,  9178,  9158,  9138,  9118,  9098,  9079,  9059,  9039,
163
  9020,  9001,  8981,  8962,  8943,  8924,  8905,  8886,  8867,  8849,  8830,
164
  8812,  8793,  8775,  8756,  8738,  8720,  8702,  8684,  8666,  8648,  8630,
165
  8613,  8595,  8577,  8560,  8542,  8525,  8508,  8490,  8473,  8456,  8439,
166
  8422,  8405,  8389,  8372,  8355,  8339,  8322,  8306,  8289,  8273,  8257,
167
  8240,  8224,  8208,  8192,
168
};
169
170
// Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
171
// at precision of DIV_LUT_PREC_BITS along with the shift.
172
2.19k
static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
173
2.19k
  int64_t f;
174
2.19k
  *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
175
2.19k
                               : get_msb((unsigned int)D));
176
  // e is obtained from D after resetting the most significant 1 bit.
177
2.19k
  const int64_t e = D - ((uint64_t)1 << *shift);
178
  // Get the most significant DIV_LUT_BITS (8) bits of e into f
179
2.19k
  if (*shift > DIV_LUT_BITS)
180
2.19k
    f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
181
0
  else
182
0
    f = e << (DIV_LUT_BITS - *shift);
183
2.19k
  assert(f <= DIV_LUT_NUM);
184
2.19k
  *shift += DIV_LUT_PREC_BITS;
185
  // Use f as lookup into the precomputed table of multipliers
186
2.19k
  return div_lut[f];
187
2.19k
}
188
189
90.3k
static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
190
90.3k
  int32_t f;
191
90.3k
  *shift = get_msb(D);
192
  // e is obtained from D after resetting the most significant 1 bit.
193
90.3k
  const int32_t e = D - ((uint32_t)1 << *shift);
194
  // Get the most significant DIV_LUT_BITS (8) bits of e into f
195
90.3k
  if (*shift > DIV_LUT_BITS)
196
90.3k
    f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
197
36
  else
198
36
    f = e << (DIV_LUT_BITS - *shift);
199
90.3k
  assert(f <= DIV_LUT_NUM);
200
90.3k
  *shift += DIV_LUT_PREC_BITS;
201
  // Use f as lookup into the precomputed table of multipliers
202
90.3k
  return div_lut[f];
203
90.3k
}
204
205
90.4k
static int is_affine_valid(const WarpedMotionParams *const wm) {
206
90.4k
  const int32_t *mat = wm->wmmat;
207
90.4k
  return (mat[2] > 0);
208
90.4k
}
209
210
static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
211
90.3k
                                   int16_t delta) {
212
90.3k
  if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
213
89.7k
      (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS)))
214
940
    return 0;
215
89.4k
  else
216
89.4k
    return 1;
217
90.3k
}
218
219
#ifndef NDEBUG
220
// Check that the given warp model satisfies the relevant constraints for
221
// its stated model type
222
static void check_model_consistency(WarpedMotionParams *wm) {
223
  switch (wm->wmtype) {
224
    case IDENTITY:
225
      assert(wm->wmmat[0] == 0);
226
      assert(wm->wmmat[1] == 0);
227
      AOM_FALLTHROUGH_INTENDED;
228
    case TRANSLATION:
229
      assert(wm->wmmat[2] == 1 << WARPEDMODEL_PREC_BITS);
230
      assert(wm->wmmat[3] == 0);
231
      AOM_FALLTHROUGH_INTENDED;
232
    case ROTZOOM:
233
      assert(wm->wmmat[4] == -wm->wmmat[3]);
234
      assert(wm->wmmat[5] == wm->wmmat[2]);
235
      AOM_FALLTHROUGH_INTENDED;
236
    case AFFINE: break;
237
    default: assert(0 && "Bad wmtype");
238
  }
239
}
240
#endif  // NDEBUG
241
242
// Returns 1 on success or 0 on an invalid affine set
243
90.4k
int av1_get_shear_params(WarpedMotionParams *wm) {
244
#ifndef NDEBUG
245
  // Check that models have been constructed sensibly
246
  // This is a good place to check, because this function does not need to
247
  // be called until after model construction is complete, but must be called
248
  // before the model can be used for prediction.
249
  check_model_consistency(wm);
250
#endif  // NDEBUG
251
252
90.4k
  const int32_t *mat = wm->wmmat;
253
90.4k
  if (!is_affine_valid(wm)) return 0;
254
255
90.3k
  wm->alpha =
256
90.3k
      clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
257
90.3k
  wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
258
90.3k
  int16_t shift;
259
90.3k
  int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
260
90.3k
  int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
261
90.3k
  wm->gamma =
262
90.3k
      clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX);
263
90.3k
  v = ((int64_t)mat[3] * mat[4]) * y;
264
90.3k
  wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) -
265
90.3k
                        (1 << WARPEDMODEL_PREC_BITS),
266
90.3k
                    INT16_MIN, INT16_MAX);
267
268
90.3k
  wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) *
269
90.3k
              (1 << WARP_PARAM_REDUCE_BITS);
270
90.3k
  wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) *
271
90.3k
             (1 << WARP_PARAM_REDUCE_BITS);
272
90.3k
  wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) *
273
90.3k
              (1 << WARP_PARAM_REDUCE_BITS);
274
90.3k
  wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) *
275
90.3k
              (1 << WARP_PARAM_REDUCE_BITS);
276
277
90.3k
  if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
278
940
    return 0;
279
280
89.4k
  return 1;
281
90.3k
}
282
283
#if CONFIG_AV1_HIGHBITDEPTH
284
/* Note: For an explanation of the warp algorithm, and some notes on bit widths
285
    for hardware implementations, see the comments above av1_warp_affine_c
286
*/
287
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
288
                              int width, int height, int stride, uint16_t *pred,
289
                              int p_col, int p_row, int p_width, int p_height,
290
                              int p_stride, int subsampling_x,
291
                              int subsampling_y, int bd,
292
                              ConvolveParams *conv_params, int16_t alpha,
293
7.84k
                              int16_t beta, int16_t gamma, int16_t delta) {
294
7.84k
  int32_t tmp[15 * 8];
295
7.84k
  const int reduce_bits_horiz = conv_params->round_0;
296
7.84k
  const int reduce_bits_vert = conv_params->is_compound
297
7.84k
                                   ? conv_params->round_1
298
7.84k
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
299
7.84k
  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
300
7.84k
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
301
7.84k
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
302
7.84k
  const int round_bits =
303
7.84k
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
304
7.84k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
305
7.84k
  (void)max_bits_horiz;
306
7.84k
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
307
308
  // Check that, even with 12-bit input, the intermediate values will fit
309
  // into an unsigned 16-bit intermediate array.
310
7.84k
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
311
312
16.8k
  for (int i = p_row; i < p_row + p_height; i += 8) {
313
25.2k
    for (int j = p_col; j < p_col + p_width; j += 8) {
314
      // Calculate the center of this 8x8 block,
315
      // project to luma coordinates (if in a subsampled chroma plane),
316
      // apply the affine transformation,
317
      // then convert back to the original coordinates (if necessary)
318
16.2k
      const int32_t src_x = (j + 4) << subsampling_x;
319
16.2k
      const int32_t src_y = (i + 4) << subsampling_y;
320
16.2k
      const int64_t dst_x =
321
16.2k
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
322
16.2k
      const int64_t dst_y =
323
16.2k
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
324
16.2k
      const int64_t x4 = dst_x >> subsampling_x;
325
16.2k
      const int64_t y4 = dst_y >> subsampling_y;
326
327
16.2k
      const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
328
16.2k
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
329
16.2k
      const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
330
16.2k
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
331
332
16.2k
      sx4 += alpha * (-4) + beta * (-4);
333
16.2k
      sy4 += gamma * (-4) + delta * (-4);
334
335
16.2k
      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
336
16.2k
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
337
338
      // Horizontal filter
339
260k
      for (int k = -7; k < 8; ++k) {
340
243k
        const int iy = clamp(iy4 + k, 0, height - 1);
341
342
243k
        int sx = sx4 + beta * (k + 4);
343
2.19M
        for (int l = -4; l < 4; ++l) {
344
1.95M
          int ix = ix4 + l - 3;
345
1.95M
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
346
1.95M
                           WARPEDPIXEL_PREC_SHIFTS;
347
1.95M
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
348
1.95M
          const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];
349
350
1.95M
          int32_t sum = 1 << offset_bits_horiz;
351
17.5M
          for (int m = 0; m < 8; ++m) {
352
15.6M
            const int sample_x = clamp(ix + m, 0, width - 1);
353
15.6M
            sum += ref[iy * stride + sample_x] * coeffs[m];
354
15.6M
          }
355
1.95M
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
356
1.95M
          assert(0 <= sum && sum < (1 << max_bits_horiz));
357
1.95M
          tmp[(k + 7) * 8 + (l + 4)] = sum;
358
1.95M
          sx += alpha;
359
1.95M
        }
360
243k
      }
361
362
      // Vertical filter
363
146k
      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
364
130k
        int sy = sy4 + delta * (k + 4);
365
1.17M
        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
366
1.04M
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
367
1.04M
                           WARPEDPIXEL_PREC_SHIFTS;
368
1.04M
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
369
1.04M
          const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];
370
371
1.04M
          int32_t sum = 1 << offset_bits_vert;
372
9.36M
          for (int m = 0; m < 8; ++m) {
373
8.32M
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
374
8.32M
          }
375
376
1.04M
          if (conv_params->is_compound) {
377
223k
            CONV_BUF_TYPE *p =
378
223k
                &conv_params
379
223k
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
380
223k
                           (j - p_col + l + 4)];
381
223k
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
382
223k
            if (conv_params->do_average) {
383
122k
              uint16_t *dst16 =
384
122k
                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
385
122k
              int32_t tmp32 = *p;
386
122k
              if (conv_params->use_dist_wtd_comp_avg) {
387
73.6k
                tmp32 = tmp32 * conv_params->fwd_offset +
388
73.6k
                        sum * conv_params->bck_offset;
389
73.6k
                tmp32 = tmp32 >> DIST_PRECISION_BITS;
390
73.6k
              } else {
391
49.2k
                tmp32 += sum;
392
49.2k
                tmp32 = tmp32 >> 1;
393
49.2k
              }
394
122k
              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
395
122k
                      (1 << (offset_bits - conv_params->round_1 - 1));
396
122k
              *dst16 =
397
122k
                  clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd);
398
122k
            } else {
399
100k
              *p = sum;
400
100k
            }
401
816k
          } else {
402
816k
            uint16_t *p =
403
816k
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
404
816k
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
405
816k
            assert(0 <= sum && sum < (1 << (bd + 2)));
406
816k
            *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
407
816k
          }
408
1.04M
          sy += gamma;
409
1.04M
        }
410
130k
      }
411
16.2k
    }
412
8.96k
  }
413
7.84k
}
414
415
void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref,
416
                       int width, int height, int stride, uint16_t *const pred,
417
                       int p_col, int p_row, int p_width, int p_height,
418
                       int p_stride, int subsampling_x, int subsampling_y,
419
7.84k
                       int bd, ConvolveParams *conv_params) {
420
7.84k
  const int32_t *const mat = wm->wmmat;
421
7.84k
  const int16_t alpha = wm->alpha;
422
7.84k
  const int16_t beta = wm->beta;
423
7.84k
  const int16_t gamma = wm->gamma;
424
7.84k
  const int16_t delta = wm->delta;
425
426
7.84k
  av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
427
7.84k
                         p_width, p_height, p_stride, subsampling_x,
428
7.84k
                         subsampling_y, bd, conv_params, alpha, beta, gamma,
429
7.84k
                         delta);
430
7.84k
}
431
#endif  // CONFIG_AV1_HIGHBITDEPTH
432
433
/* The warp filter for ROTZOOM and AFFINE models works as follows:
434
   * Split the input into 8x8 blocks
435
   * For each block, project the point (4, 4) within the block, to get the
436
     overall block position. Split into integer and fractional coordinates,
437
     maintaining full WARPEDMODEL precision
438
   * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
439
     variable horizontal offset. This means that, while the rows of the
440
     intermediate buffer align with the rows of the *reference* image, the
441
     columns align with the columns of the *destination* image.
442
   * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
443
     destination is too small we crop the output at this stage). Each pixel has
444
     a variable vertical offset, so that the resulting rows are aligned with
445
     the rows of the destination image.
446
447
   To accomplish these alignments, we factor the warp matrix as a
448
   product of two shear / asymmetric zoom matrices:
449
   / a b \  = /   1       0    \ * / 1+alpha  beta \
450
   \ c d /    \ gamma  1+delta /   \    0      1   /
451
   where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
452
   The horizontal shear (with alpha and beta) is applied first,
453
   then the vertical shear (with gamma and delta) is applied second.
454
455
   The only limitation is that, to fit this in a fixed 8-tap filter size,
456
   the fractional pixel offsets must be at most +-1. Since the horizontal filter
457
   generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
458
   within the block, the parameters must satisfy
459
   4 * |alpha| + 7 * |beta| <= 1   and   4 * |gamma| + 4 * |delta| <= 1
460
   for this filter to be applicable.
461
462
   Note: This function assumes that the caller has done all of the relevant
463
   checks, ie. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
464
   are set appropriately (if using a ROTZOOM model), and that alpha, beta,
465
   gamma, delta are all in range.
466
467
   TODO(rachelbarker): Maybe support scaled references?
468
*/
469
/* A note on hardware implementation:
470
    The warp filter is intended to be implementable using the same hardware as
471
    the high-precision convolve filters from the loop-restoration and
472
    convolve-round experiments.
473
474
    For a single filter stage, considering all of the coefficient sets for the
475
    warp filter and the regular convolution filter, an input in the range
476
    [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
477
    before rounding.
478
479
    Allowing for some changes to the filter coefficient sets, call the range
480
    [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
481
    we can replace this by the range [0, 256 * 2^k], which can be stored in an
482
    unsigned value with 8 + k bits.
483
484
    This allows the derivation of the appropriate bit widths and offsets for
485
    the various intermediate values: If
486
487
    F := FILTER_BITS = 7 (or else the above ranges need adjusting)
488
         So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
489
         intermediate value.
490
    H := ROUND0_BITS
491
    V := VERSHEAR_REDUCE_PREC_BITS
492
    (and note that we must have H + V = 2*F for the output to have the same
493
     scale as the input)
494
495
    then we end up with the following offsets and ranges:
496
    Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
497
                       uint{bd + F + 1}
498
    After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
499
    Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
500
                     uint{bd + 2*F + 2 - H}
501
    After rounding: The final value, before undoing the offset, fits into a
502
                    uint{bd + 2}.
503
504
    Then we need to undo the offsets before clamping to a pixel. Note that,
505
    if we do this at the end, the amount to subtract is actually independent
506
    of H and V:
507
508
    offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
509
                         (1 << ((bd + 2*F - H) - V))
510
                      == (1 << (bd - 1)) + (1 << bd)
511
512
    This allows us to entirely avoid clamping in both the warp filter and
513
    the convolve-round experiment. As of the time of writing, the Wiener filter
514
    from loop-restoration can encode a central coefficient up to 216, which
515
    leads to a maximum value of about 282 * 2^k after applying the offset.
516
    So in that case we still need to clamp.
517
*/
518
void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
519
                       int height, int stride, uint8_t *pred, int p_col,
520
                       int p_row, int p_width, int p_height, int p_stride,
521
                       int subsampling_x, int subsampling_y,
522
                       ConvolveParams *conv_params, int16_t alpha, int16_t beta,
523
7.67k
                       int16_t gamma, int16_t delta) {
524
7.67k
  int32_t tmp[15 * 8];
525
7.67k
  const int bd = 8;
526
7.67k
  const int reduce_bits_horiz = conv_params->round_0;
527
7.67k
  const int reduce_bits_vert = conv_params->is_compound
528
7.67k
                                   ? conv_params->round_1
529
7.67k
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
530
7.67k
  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
531
7.67k
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
532
7.67k
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
533
7.67k
  const int round_bits =
534
7.67k
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
535
7.67k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
536
7.67k
  (void)max_bits_horiz;
537
7.67k
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
538
7.67k
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
539
540
16.9k
  for (int i = p_row; i < p_row + p_height; i += 8) {
541
25.2k
    for (int j = p_col; j < p_col + p_width; j += 8) {
542
      // Calculate the center of this 8x8 block,
543
      // project to luma coordinates (if in a subsampled chroma plane),
544
      // apply the affine transformation,
545
      // then convert back to the original coordinates (if necessary)
546
15.9k
      const int32_t src_x = (j + 4) << subsampling_x;
547
15.9k
      const int32_t src_y = (i + 4) << subsampling_y;
548
15.9k
      const int64_t dst_x =
549
15.9k
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
550
15.9k
      const int64_t dst_y =
551
15.9k
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
552
15.9k
      const int64_t x4 = dst_x >> subsampling_x;
553
15.9k
      const int64_t y4 = dst_y >> subsampling_y;
554
555
15.9k
      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
556
15.9k
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
557
15.9k
      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
558
15.9k
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
559
560
15.9k
      sx4 += alpha * (-4) + beta * (-4);
561
15.9k
      sy4 += gamma * (-4) + delta * (-4);
562
563
15.9k
      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
564
15.9k
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
565
566
      // Horizontal filter
567
255k
      for (int k = -7; k < 8; ++k) {
568
        // Clamp to top/bottom edge of the frame
569
239k
        const int iy = clamp(iy4 + k, 0, height - 1);
570
571
239k
        int sx = sx4 + beta * (k + 4);
572
573
2.15M
        for (int l = -4; l < 4; ++l) {
574
1.91M
          int ix = ix4 + l - 3;
575
          // At this point, sx = sx4 + alpha * l + beta * k
576
1.91M
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
577
1.91M
                           WARPEDPIXEL_PREC_SHIFTS;
578
1.91M
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
579
1.91M
          const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];
580
581
1.91M
          int32_t sum = 1 << offset_bits_horiz;
582
17.2M
          for (int m = 0; m < 8; ++m) {
583
            // Clamp to left/right edge of the frame
584
15.3M
            const int sample_x = clamp(ix + m, 0, width - 1);
585
586
15.3M
            sum += ref[iy * stride + sample_x] * coeffs[m];
587
15.3M
          }
588
1.91M
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
589
1.91M
          assert(0 <= sum && sum < (1 << max_bits_horiz));
590
1.91M
          tmp[(k + 7) * 8 + (l + 4)] = sum;
591
1.91M
          sx += alpha;
592
1.91M
        }
593
239k
      }
594
595
      // Vertical filter
596
143k
      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
597
127k
        int sy = sy4 + delta * (k + 4);
598
1.14M
        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
599
          // At this point, sy = sy4 + gamma * l + delta * k
600
1.02M
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
601
1.02M
                           WARPEDPIXEL_PREC_SHIFTS;
602
1.02M
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
603
1.02M
          const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];
604
605
1.02M
          int32_t sum = 1 << offset_bits_vert;
606
9.19M
          for (int m = 0; m < 8; ++m) {
607
8.17M
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
608
8.17M
          }
609
610
1.02M
          if (conv_params->is_compound) {
611
285k
            CONV_BUF_TYPE *p =
612
285k
                &conv_params
613
285k
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
614
285k
                           (j - p_col + l + 4)];
615
285k
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
616
285k
            if (conv_params->do_average) {
617
107k
              uint8_t *dst8 =
618
107k
                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
619
107k
              int32_t tmp32 = *p;
620
107k
              if (conv_params->use_dist_wtd_comp_avg) {
621
39.8k
                tmp32 = tmp32 * conv_params->fwd_offset +
622
39.8k
                        sum * conv_params->bck_offset;
623
39.8k
                tmp32 = tmp32 >> DIST_PRECISION_BITS;
624
67.9k
              } else {
625
67.9k
                tmp32 += sum;
626
67.9k
                tmp32 = tmp32 >> 1;
627
67.9k
              }
628
107k
              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
629
107k
                      (1 << (offset_bits - conv_params->round_1 - 1));
630
107k
              *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits));
631
177k
            } else {
632
177k
              *p = sum;
633
177k
            }
634
736k
          } else {
635
736k
            uint8_t *p =
636
736k
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
637
736k
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
638
736k
            assert(0 <= sum && sum < (1 << (bd + 2)));
639
736k
            *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
640
736k
          }
641
1.02M
          sy += gamma;
642
1.02M
        }
643
127k
      }
644
15.9k
    }
645
9.24k
  }
646
7.67k
}
647
648
void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width,
649
                int height, int stride, uint8_t *pred, int p_col, int p_row,
650
                int p_width, int p_height, int p_stride, int subsampling_x,
651
7.67k
                int subsampling_y, ConvolveParams *conv_params) {
652
7.67k
  const int32_t *const mat = wm->wmmat;
653
7.67k
  const int16_t alpha = wm->alpha;
654
7.67k
  const int16_t beta = wm->beta;
655
7.67k
  const int16_t gamma = wm->gamma;
656
7.67k
  const int16_t delta = wm->delta;
657
7.67k
  av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width,
658
7.67k
                  p_height, p_stride, subsampling_x, subsampling_y, conv_params,
659
7.67k
                  alpha, beta, gamma, delta);
660
7.67k
}
661
662
void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
663
                    const uint8_t *ref, int width, int height, int stride,
664
                    uint8_t *pred, int p_col, int p_row, int p_width,
665
                    int p_height, int p_stride, int subsampling_x,
666
15.5k
                    int subsampling_y, ConvolveParams *conv_params) {
667
15.5k
#if CONFIG_AV1_HIGHBITDEPTH
668
15.5k
  if (use_hbd)
669
7.84k
    highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride,
670
7.84k
                      CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width,
671
7.84k
                      p_height, p_stride, subsampling_x, subsampling_y, bd,
672
7.84k
                      conv_params);
673
7.67k
  else
674
7.67k
    warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
675
7.67k
               p_height, p_stride, subsampling_x, subsampling_y, conv_params);
676
#else
677
  (void)use_hbd;
678
  (void)bd;
679
  warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
680
             p_height, p_stride, subsampling_x, subsampling_y, conv_params);
681
#endif
682
15.5k
}
683
684
7.12k
#define LS_MV_MAX 256  // max mv in 1/8-pel
685
// Use LS_STEP = 8 so that 2 less bits needed for A, Bx, By.
686
49.3k
#define LS_STEP 8
687
688
// Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
689
// the precision needed is:
690
//   (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
691
//   (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
692
//   1 [for sign] +
693
//   LEAST_SQUARES_SAMPLES_MAX_BITS
694
//        [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
695
// The value is 23
696
#define LS_MAT_RANGE_BITS \
697
  ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)
698
699
// Bit-depth reduction from the full-range
700
16.4k
#define LS_MAT_DOWN_BITS 2
701
702
// bits range of A, Bx and By after downshifting
703
#define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
704
#define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
705
#define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)
706
707
// By setting LS_STEP = 8, the least 2 bits of every elements in A, Bx, By are
708
// 0. So, we can reduce LS_MAT_RANGE_BITS(2) bits here.
709
#define LS_SQUARE(a)                                          \
710
4.69k
  (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
711
4.69k
   (2 + LS_MAT_DOWN_BITS))
712
#define LS_PRODUCT1(a, b)                                           \
713
7.04k
  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
714
7.04k
   (2 + LS_MAT_DOWN_BITS))
715
#define LS_PRODUCT2(a, b)                                               \
716
4.69k
  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
717
4.69k
   (2 + LS_MAT_DOWN_BITS))
718
719
#define USE_LIMITED_PREC_MULT 0
720
721
#if USE_LIMITED_PREC_MULT
722
723
#define MUL_PREC_BITS 16
724
static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
725
  int msb = 0;
726
  uint16_t mult = 0;
727
  *shift = 0;
728
  if (D != 0) {
729
    msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
730
                              : get_msb((unsigned int)D));
731
    if (msb >= MUL_PREC_BITS) {
732
      mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS);
733
      *shift = msb + 1 - MUL_PREC_BITS;
734
    } else {
735
      mult = (uint16_t)D;
736
      *shift = 0;
737
    }
738
  }
739
  return mult;
740
}
741
742
static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
743
  int32_t ret;
744
  int16_t mshift;
745
  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
746
  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
747
  shift -= mshift;
748
  if (shift > 0) {
749
    return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
750
                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
751
                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
752
  } else {
753
    return (int32_t)clamp(v * (1 << (-shift)),
754
                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
755
                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
756
  }
757
  return ret;
758
}
759
760
static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
761
  int16_t mshift;
762
  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
763
  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
764
  shift -= mshift;
765
  if (shift > 0) {
766
    return (int32_t)clamp(
767
        ROUND_POWER_OF_TWO_SIGNED(v, shift),
768
        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
769
        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
770
  } else {
771
    return (int32_t)clamp(
772
        v * (1 << (-shift)),
773
        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
774
        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
775
  }
776
}
777
778
#else
779
780
4.38k
static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
781
4.38k
  int64_t v = Px * (int64_t)iDet;
782
4.38k
  return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
783
4.38k
                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
784
4.38k
                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
785
4.38k
}
786
787
4.38k
static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
788
4.38k
  int64_t v = Px * (int64_t)iDet;
789
4.38k
  return (int32_t)clamp64(
790
4.38k
      ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
791
4.38k
      (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
792
4.38k
      (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
793
4.38k
}
794
#endif  // USE_LIMITED_PREC_MULT
795
796
static int find_affine_int(int np, const int *pts1, const int *pts2,
797
                           BLOCK_SIZE bsize, int mvy, int mvx,
798
2.22k
                           WarpedMotionParams *wm, int mi_row, int mi_col) {
799
2.22k
  int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
800
2.22k
  int32_t Bx[2] = { 0, 0 };
801
2.22k
  int32_t By[2] = { 0, 0 };
802
803
2.22k
  const int bw = block_size_wide[bsize];
804
2.22k
  const int bh = block_size_high[bsize];
805
2.22k
  const int rsuy = bh / 2 - 1;
806
2.22k
  const int rsux = bw / 2 - 1;
807
2.22k
  const int suy = rsuy * 8;
808
2.22k
  const int sux = rsux * 8;
809
2.22k
  const int duy = suy + mvy;
810
2.22k
  const int dux = sux + mvx;
811
812
  // Assume the center pixel of the block has exactly the same motion vector
813
  // as transmitted for the block. First shift the origin of the source
814
  // points to the block center, and the origin of the destination points to
815
  // the block center added to the motion vector transmitted.
816
  // Let (xi, yi) denote the source points and (xi', yi') denote destination
817
  // points after origin shfifting, for i = 0, 1, 2, .... n-1.
818
  // Then if  P = [x0, y0,
819
  //               x1, y1
820
  //               x2, y1,
821
  //                ....
822
  //              ]
823
  //          q = [x0', x1', x2', ... ]'
824
  //          r = [y0', y1', y2', ... ]'
825
  // the least squares problems that need to be solved are:
826
  //          [h1, h2]' = inv(P'P)P'q and
827
  //          [h3, h4]' = inv(P'P)P'r
828
  // where the affine transformation is given by:
829
  //          x' = h1.x + h2.y
830
  //          y' = h3.x + h4.y
831
  //
832
  // The loop below computes: A = P'P, Bx = P'q, By = P'r
833
  // We need to just compute inv(A).Bx and inv(A).By for the solutions.
834
  // Contribution from neighbor block
835
4.60k
  for (int i = 0; i < np; i++) {
836
2.37k
    const int dx = pts2[i * 2] - dux;
837
2.37k
    const int dy = pts2[i * 2 + 1] - duy;
838
2.37k
    const int sx = pts1[i * 2] - sux;
839
2.37k
    const int sy = pts1[i * 2 + 1] - suy;
840
    // (TODO)yunqing: This comparison wouldn't be necessary if the sample
841
    // selection is done in find_samples(). Also, global offset can be removed
842
    // while collecting samples.
843
2.37k
    if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
844
2.34k
      A[0][0] += LS_SQUARE(sx);
845
2.34k
      A[0][1] += LS_PRODUCT1(sx, sy);
846
2.34k
      A[1][1] += LS_SQUARE(sy);
847
2.34k
      Bx[0] += LS_PRODUCT2(sx, dx);
848
2.34k
      Bx[1] += LS_PRODUCT1(sy, dx);
849
2.34k
      By[0] += LS_PRODUCT1(sx, dy);
850
2.34k
      By[1] += LS_PRODUCT2(sy, dy);
851
2.34k
    }
852
2.37k
  }
853
854
  // Just for debugging, and can be removed later.
855
2.22k
  assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
856
2.22k
  assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
857
2.22k
  assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
858
2.22k
  assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
859
2.22k
  assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
860
2.22k
  assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
861
2.22k
  assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);
862
863
  // Compute Determinant of A
864
2.22k
  const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
865
2.22k
  if (Det == 0) return 1;
866
867
2.19k
  int16_t shift;
868
2.19k
  int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
869
2.19k
  shift -= WARPEDMODEL_PREC_BITS;
870
2.19k
  if (shift < 0) {
871
0
    iDet <<= (-shift);
872
0
    shift = 0;
873
0
  }
874
875
2.19k
  int64_t Px[2], Py[2];
876
  // These divided by the Det, are the least squares solutions
877
2.19k
  Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
878
2.19k
  Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
879
2.19k
  Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
880
2.19k
  Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];
881
882
2.19k
  wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
883
2.19k
  wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
884
2.19k
  wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
885
2.19k
  wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);
886
887
2.19k
  const int isuy = (mi_row * MI_SIZE + rsuy);
888
2.19k
  const int isux = (mi_col * MI_SIZE + rsux);
889
  // Note: In the vx, vy expressions below, the max value of each of the
890
  // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room
891
  // for the first term so that the overall sum in the worst case fits
892
  // within 32 bits overall.
893
2.19k
  const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
894
2.19k
                     (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
895
2.19k
                      isuy * wm->wmmat[3]);
896
2.19k
  const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
897
2.19k
                     (isux * wm->wmmat[4] +
898
2.19k
                      isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
899
2.19k
  wm->wmmat[0] =
900
2.19k
      clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
901
2.19k
  wm->wmmat[1] =
902
2.19k
      clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
903
2.19k
  return 0;
904
2.22k
}
905
906
int av1_find_projection(int np, const int *pts1, const int *pts2,
907
                        BLOCK_SIZE bsize, int mvy, int mvx,
908
2.22k
                        WarpedMotionParams *wm_params, int mi_row, int mi_col) {
909
2.22k
  assert(wm_params->wmtype == AFFINE);
910
911
2.22k
  if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row,
912
2.22k
                      mi_col))
913
30
    return 1;
914
915
  // check compatibility with the fast warp filter
916
2.19k
  if (!av1_get_shear_params(wm_params)) return 1;
917
918
2.00k
  return 0;
919
2.19k
}