Coverage Report

Created: 2023-06-07 06:31

/src/aom/av1/common/warped_motion.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <stdio.h>
13
#include <stdlib.h>
14
#include <memory.h>
15
#include <math.h>
16
#include <assert.h>
17
18
#include "config/av1_rtcd.h"
19
20
#include "av1/common/warped_motion.h"
21
#include "av1/common/scale.h"
22
23
// For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
24
// at a time. The zoom/rotation/shear in the model are applied to the
25
// "fractional" position of each pixel, which therefore varies within
26
// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
27
// We need an extra 2 taps to fit this in, for a total of 8 taps.
28
/* clang-format off */
29
const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
30
  // [-1, 0)
31
  { 0,   0, 127,   1,   0, 0, 0, 0 }, { 0, - 1, 127,   2,   0, 0, 0, 0 },
32
  { 1, - 3, 127,   4, - 1, 0, 0, 0 }, { 1, - 4, 126,   6, - 2, 1, 0, 0 },
33
  { 1, - 5, 126,   8, - 3, 1, 0, 0 }, { 1, - 6, 125,  11, - 4, 1, 0, 0 },
34
  { 1, - 7, 124,  13, - 4, 1, 0, 0 }, { 2, - 8, 123,  15, - 5, 1, 0, 0 },
35
  { 2, - 9, 122,  18, - 6, 1, 0, 0 }, { 2, -10, 121,  20, - 6, 1, 0, 0 },
36
  { 2, -11, 120,  22, - 7, 2, 0, 0 }, { 2, -12, 119,  25, - 8, 2, 0, 0 },
37
  { 3, -13, 117,  27, - 8, 2, 0, 0 }, { 3, -13, 116,  29, - 9, 2, 0, 0 },
38
  { 3, -14, 114,  32, -10, 3, 0, 0 }, { 3, -15, 113,  35, -10, 2, 0, 0 },
39
  { 3, -15, 111,  37, -11, 3, 0, 0 }, { 3, -16, 109,  40, -11, 3, 0, 0 },
40
  { 3, -16, 108,  42, -12, 3, 0, 0 }, { 4, -17, 106,  45, -13, 3, 0, 0 },
41
  { 4, -17, 104,  47, -13, 3, 0, 0 }, { 4, -17, 102,  50, -14, 3, 0, 0 },
42
  { 4, -17, 100,  52, -14, 3, 0, 0 }, { 4, -18,  98,  55, -15, 4, 0, 0 },
43
  { 4, -18,  96,  58, -15, 3, 0, 0 }, { 4, -18,  94,  60, -16, 4, 0, 0 },
44
  { 4, -18,  91,  63, -16, 4, 0, 0 }, { 4, -18,  89,  65, -16, 4, 0, 0 },
45
  { 4, -18,  87,  68, -17, 4, 0, 0 }, { 4, -18,  85,  70, -17, 4, 0, 0 },
46
  { 4, -18,  82,  73, -17, 4, 0, 0 }, { 4, -18,  80,  75, -17, 4, 0, 0 },
47
  { 4, -18,  78,  78, -18, 4, 0, 0 }, { 4, -17,  75,  80, -18, 4, 0, 0 },
48
  { 4, -17,  73,  82, -18, 4, 0, 0 }, { 4, -17,  70,  85, -18, 4, 0, 0 },
49
  { 4, -17,  68,  87, -18, 4, 0, 0 }, { 4, -16,  65,  89, -18, 4, 0, 0 },
50
  { 4, -16,  63,  91, -18, 4, 0, 0 }, { 4, -16,  60,  94, -18, 4, 0, 0 },
51
  { 3, -15,  58,  96, -18, 4, 0, 0 }, { 4, -15,  55,  98, -18, 4, 0, 0 },
52
  { 3, -14,  52, 100, -17, 4, 0, 0 }, { 3, -14,  50, 102, -17, 4, 0, 0 },
53
  { 3, -13,  47, 104, -17, 4, 0, 0 }, { 3, -13,  45, 106, -17, 4, 0, 0 },
54
  { 3, -12,  42, 108, -16, 3, 0, 0 }, { 3, -11,  40, 109, -16, 3, 0, 0 },
55
  { 3, -11,  37, 111, -15, 3, 0, 0 }, { 2, -10,  35, 113, -15, 3, 0, 0 },
56
  { 3, -10,  32, 114, -14, 3, 0, 0 }, { 2, - 9,  29, 116, -13, 3, 0, 0 },
57
  { 2, - 8,  27, 117, -13, 3, 0, 0 }, { 2, - 8,  25, 119, -12, 2, 0, 0 },
58
  { 2, - 7,  22, 120, -11, 2, 0, 0 }, { 1, - 6,  20, 121, -10, 2, 0, 0 },
59
  { 1, - 6,  18, 122, - 9, 2, 0, 0 }, { 1, - 5,  15, 123, - 8, 2, 0, 0 },
60
  { 1, - 4,  13, 124, - 7, 1, 0, 0 }, { 1, - 4,  11, 125, - 6, 1, 0, 0 },
61
  { 1, - 3,   8, 126, - 5, 1, 0, 0 }, { 1, - 2,   6, 126, - 4, 1, 0, 0 },
62
  { 0, - 1,   4, 127, - 3, 1, 0, 0 }, { 0,   0,   2, 127, - 1, 0, 0, 0 },
63
64
  // [0, 1)
65
  { 0,  0,   0, 127,   1,   0,  0,  0}, { 0,  0,  -1, 127,   2,   0,  0,  0},
66
  { 0,  1,  -3, 127,   4,  -2,  1,  0}, { 0,  1,  -5, 127,   6,  -2,  1,  0},
67
  { 0,  2,  -6, 126,   8,  -3,  1,  0}, {-1,  2,  -7, 126,  11,  -4,  2, -1},
68
  {-1,  3,  -8, 125,  13,  -5,  2, -1}, {-1,  3, -10, 124,  16,  -6,  3, -1},
69
  {-1,  4, -11, 123,  18,  -7,  3, -1}, {-1,  4, -12, 122,  20,  -7,  3, -1},
70
  {-1,  4, -13, 121,  23,  -8,  3, -1}, {-2,  5, -14, 120,  25,  -9,  4, -1},
71
  {-1,  5, -15, 119,  27, -10,  4, -1}, {-1,  5, -16, 118,  30, -11,  4, -1},
72
  {-2,  6, -17, 116,  33, -12,  5, -1}, {-2,  6, -17, 114,  35, -12,  5, -1},
73
  {-2,  6, -18, 113,  38, -13,  5, -1}, {-2,  7, -19, 111,  41, -14,  6, -2},
74
  {-2,  7, -19, 110,  43, -15,  6, -2}, {-2,  7, -20, 108,  46, -15,  6, -2},
75
  {-2,  7, -20, 106,  49, -16,  6, -2}, {-2,  7, -21, 104,  51, -16,  7, -2},
76
  {-2,  7, -21, 102,  54, -17,  7, -2}, {-2,  8, -21, 100,  56, -18,  7, -2},
77
  {-2,  8, -22,  98,  59, -18,  7, -2}, {-2,  8, -22,  96,  62, -19,  7, -2},
78
  {-2,  8, -22,  94,  64, -19,  7, -2}, {-2,  8, -22,  91,  67, -20,  8, -2},
79
  {-2,  8, -22,  89,  69, -20,  8, -2}, {-2,  8, -22,  87,  72, -21,  8, -2},
80
  {-2,  8, -21,  84,  74, -21,  8, -2}, {-2,  8, -22,  82,  77, -21,  8, -2},
81
  {-2,  8, -21,  79,  79, -21,  8, -2}, {-2,  8, -21,  77,  82, -22,  8, -2},
82
  {-2,  8, -21,  74,  84, -21,  8, -2}, {-2,  8, -21,  72,  87, -22,  8, -2},
83
  {-2,  8, -20,  69,  89, -22,  8, -2}, {-2,  8, -20,  67,  91, -22,  8, -2},
84
  {-2,  7, -19,  64,  94, -22,  8, -2}, {-2,  7, -19,  62,  96, -22,  8, -2},
85
  {-2,  7, -18,  59,  98, -22,  8, -2}, {-2,  7, -18,  56, 100, -21,  8, -2},
86
  {-2,  7, -17,  54, 102, -21,  7, -2}, {-2,  7, -16,  51, 104, -21,  7, -2},
87
  {-2,  6, -16,  49, 106, -20,  7, -2}, {-2,  6, -15,  46, 108, -20,  7, -2},
88
  {-2,  6, -15,  43, 110, -19,  7, -2}, {-2,  6, -14,  41, 111, -19,  7, -2},
89
  {-1,  5, -13,  38, 113, -18,  6, -2}, {-1,  5, -12,  35, 114, -17,  6, -2},
90
  {-1,  5, -12,  33, 116, -17,  6, -2}, {-1,  4, -11,  30, 118, -16,  5, -1},
91
  {-1,  4, -10,  27, 119, -15,  5, -1}, {-1,  4,  -9,  25, 120, -14,  5, -2},
92
  {-1,  3,  -8,  23, 121, -13,  4, -1}, {-1,  3,  -7,  20, 122, -12,  4, -1},
93
  {-1,  3,  -7,  18, 123, -11,  4, -1}, {-1,  3,  -6,  16, 124, -10,  3, -1},
94
  {-1,  2,  -5,  13, 125,  -8,  3, -1}, {-1,  2,  -4,  11, 126,  -7,  2, -1},
95
  { 0,  1,  -3,   8, 126,  -6,  2,  0}, { 0,  1,  -2,   6, 127,  -5,  1,  0},
96
  { 0,  1,  -2,   4, 127,  -3,  1,  0}, { 0,  0,   0,   2, 127,  -1,  0,  0},
97
98
  // [1, 2)
99
  { 0, 0, 0,   1, 127,   0,   0, 0 }, { 0, 0, 0, - 1, 127,   2,   0, 0 },
100
  { 0, 0, 1, - 3, 127,   4, - 1, 0 }, { 0, 0, 1, - 4, 126,   6, - 2, 1 },
101
  { 0, 0, 1, - 5, 126,   8, - 3, 1 }, { 0, 0, 1, - 6, 125,  11, - 4, 1 },
102
  { 0, 0, 1, - 7, 124,  13, - 4, 1 }, { 0, 0, 2, - 8, 123,  15, - 5, 1 },
103
  { 0, 0, 2, - 9, 122,  18, - 6, 1 }, { 0, 0, 2, -10, 121,  20, - 6, 1 },
104
  { 0, 0, 2, -11, 120,  22, - 7, 2 }, { 0, 0, 2, -12, 119,  25, - 8, 2 },
105
  { 0, 0, 3, -13, 117,  27, - 8, 2 }, { 0, 0, 3, -13, 116,  29, - 9, 2 },
106
  { 0, 0, 3, -14, 114,  32, -10, 3 }, { 0, 0, 3, -15, 113,  35, -10, 2 },
107
  { 0, 0, 3, -15, 111,  37, -11, 3 }, { 0, 0, 3, -16, 109,  40, -11, 3 },
108
  { 0, 0, 3, -16, 108,  42, -12, 3 }, { 0, 0, 4, -17, 106,  45, -13, 3 },
109
  { 0, 0, 4, -17, 104,  47, -13, 3 }, { 0, 0, 4, -17, 102,  50, -14, 3 },
110
  { 0, 0, 4, -17, 100,  52, -14, 3 }, { 0, 0, 4, -18,  98,  55, -15, 4 },
111
  { 0, 0, 4, -18,  96,  58, -15, 3 }, { 0, 0, 4, -18,  94,  60, -16, 4 },
112
  { 0, 0, 4, -18,  91,  63, -16, 4 }, { 0, 0, 4, -18,  89,  65, -16, 4 },
113
  { 0, 0, 4, -18,  87,  68, -17, 4 }, { 0, 0, 4, -18,  85,  70, -17, 4 },
114
  { 0, 0, 4, -18,  82,  73, -17, 4 }, { 0, 0, 4, -18,  80,  75, -17, 4 },
115
  { 0, 0, 4, -18,  78,  78, -18, 4 }, { 0, 0, 4, -17,  75,  80, -18, 4 },
116
  { 0, 0, 4, -17,  73,  82, -18, 4 }, { 0, 0, 4, -17,  70,  85, -18, 4 },
117
  { 0, 0, 4, -17,  68,  87, -18, 4 }, { 0, 0, 4, -16,  65,  89, -18, 4 },
118
  { 0, 0, 4, -16,  63,  91, -18, 4 }, { 0, 0, 4, -16,  60,  94, -18, 4 },
119
  { 0, 0, 3, -15,  58,  96, -18, 4 }, { 0, 0, 4, -15,  55,  98, -18, 4 },
120
  { 0, 0, 3, -14,  52, 100, -17, 4 }, { 0, 0, 3, -14,  50, 102, -17, 4 },
121
  { 0, 0, 3, -13,  47, 104, -17, 4 }, { 0, 0, 3, -13,  45, 106, -17, 4 },
122
  { 0, 0, 3, -12,  42, 108, -16, 3 }, { 0, 0, 3, -11,  40, 109, -16, 3 },
123
  { 0, 0, 3, -11,  37, 111, -15, 3 }, { 0, 0, 2, -10,  35, 113, -15, 3 },
124
  { 0, 0, 3, -10,  32, 114, -14, 3 }, { 0, 0, 2, - 9,  29, 116, -13, 3 },
125
  { 0, 0, 2, - 8,  27, 117, -13, 3 }, { 0, 0, 2, - 8,  25, 119, -12, 2 },
126
  { 0, 0, 2, - 7,  22, 120, -11, 2 }, { 0, 0, 1, - 6,  20, 121, -10, 2 },
127
  { 0, 0, 1, - 6,  18, 122, - 9, 2 }, { 0, 0, 1, - 5,  15, 123, - 8, 2 },
128
  { 0, 0, 1, - 4,  13, 124, - 7, 1 }, { 0, 0, 1, - 4,  11, 125, - 6, 1 },
129
  { 0, 0, 1, - 3,   8, 126, - 5, 1 }, { 0, 0, 1, - 2,   6, 126, - 4, 1 },
130
  { 0, 0, 0, - 1,   4, 127, - 3, 1 }, { 0, 0, 0,   0,   2, 127, - 1, 0 },
131
  // dummy (replicate row index 191)
132
  { 0, 0, 0,   0,   2, 127, - 1, 0 },
133
};
134
135
/* clang-format on */
136
137
1.05M
#define DIV_LUT_PREC_BITS 14
138
1.05M
#define DIV_LUT_BITS 8
139
#define DIV_LUT_NUM (1 << DIV_LUT_BITS)
140
141
static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
142
  16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
143
  15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
144
  15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
145
  14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
146
  13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
147
  13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
148
  13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
149
  12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
150
  12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
151
  11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
152
  11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
153
  11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
154
  10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
155
  10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
156
  10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
157
  9963,  9939,  9916,  9892,  9869,  9846,  9823,  9800,  9777,  9754,  9732,
158
  9709,  9687,  9664,  9642,  9620,  9598,  9576,  9554,  9533,  9511,  9489,
159
  9468,  9447,  9425,  9404,  9383,  9362,  9341,  9321,  9300,  9279,  9259,
160
  9239,  9218,  9198,  9178,  9158,  9138,  9118,  9098,  9079,  9059,  9039,
161
  9020,  9001,  8981,  8962,  8943,  8924,  8905,  8886,  8867,  8849,  8830,
162
  8812,  8793,  8775,  8756,  8738,  8720,  8702,  8684,  8666,  8648,  8630,
163
  8613,  8595,  8577,  8560,  8542,  8525,  8508,  8490,  8473,  8456,  8439,
164
  8422,  8405,  8389,  8372,  8355,  8339,  8322,  8306,  8289,  8273,  8257,
165
  8240,  8224,  8208,  8192,
166
};
167
168
// Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
169
// at precision of DIV_LUT_PREC_BITS along with the shift.
170
417k
static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
171
417k
  int64_t f;
172
417k
  *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
173
417k
                               : get_msb((unsigned int)D));
174
  // e is obtained from D after resetting the most significant 1 bit.
175
417k
  const int64_t e = D - ((uint64_t)1 << *shift);
176
  // Get the most significant DIV_LUT_BITS (8) bits of e into f
177
417k
  if (*shift > DIV_LUT_BITS)
178
417k
    f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
179
18.4E
  else
180
18.4E
    f = e << (DIV_LUT_BITS - *shift);
181
417k
  assert(f <= DIV_LUT_NUM);
182
417k
  *shift += DIV_LUT_PREC_BITS;
183
  // Use f as lookup into the precomputed table of multipliers
184
417k
  return div_lut[f];
185
417k
}
186
187
635k
static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
188
635k
  int32_t f;
189
635k
  *shift = get_msb(D);
190
  // e is obtained from D after resetting the most significant 1 bit.
191
635k
  const int32_t e = D - ((uint32_t)1 << *shift);
192
  // Get the most significant DIV_LUT_BITS (8) bits of e into f
193
635k
  if (*shift > DIV_LUT_BITS)
194
635k
    f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
195
58
  else
196
58
    f = e << (DIV_LUT_BITS - *shift);
197
635k
  assert(f <= DIV_LUT_NUM);
198
635k
  *shift += DIV_LUT_PREC_BITS;
199
  // Use f as lookup into the precomputed table of multipliers
200
635k
  return div_lut[f];
201
635k
}
202
203
636k
static int is_affine_valid(const WarpedMotionParams *const wm) {
204
636k
  const int32_t *mat = wm->wmmat;
205
636k
  return (mat[2] > 0);
206
636k
}
207
208
static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
209
635k
                                   int16_t delta) {
210
635k
  if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
211
635k
      (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS)))
212
32.5k
    return 0;
213
603k
  else
214
603k
    return 1;
215
635k
}
216
217
// Returns 1 on success or 0 on an invalid affine set
218
636k
int av1_get_shear_params(WarpedMotionParams *wm) {
219
636k
  const int32_t *mat = wm->wmmat;
220
636k
  if (!is_affine_valid(wm)) return 0;
221
635k
  wm->alpha =
222
635k
      clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
223
635k
  wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
224
635k
  int16_t shift;
225
635k
  int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
226
635k
  int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
227
635k
  wm->gamma =
228
635k
      clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX);
229
635k
  v = ((int64_t)mat[3] * mat[4]) * y;
230
635k
  wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) -
231
635k
                        (1 << WARPEDMODEL_PREC_BITS),
232
635k
                    INT16_MIN, INT16_MAX);
233
234
635k
  wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) *
235
635k
              (1 << WARP_PARAM_REDUCE_BITS);
236
635k
  wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) *
237
635k
             (1 << WARP_PARAM_REDUCE_BITS);
238
635k
  wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) *
239
635k
              (1 << WARP_PARAM_REDUCE_BITS);
240
635k
  wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) *
241
635k
              (1 << WARP_PARAM_REDUCE_BITS);
242
243
635k
  if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
244
32.5k
    return 0;
245
246
603k
  return 1;
247
635k
}
248
249
#if CONFIG_AV1_HIGHBITDEPTH
250
0
static INLINE int highbd_error_measure(int err, int bd) {
251
0
  const int b = bd - 8;
252
0
  const int bmask = (1 << b) - 1;
253
0
  const int v = (1 << b);
254
0
  err = abs(err);
255
0
  const int e1 = err >> b;
256
0
  const int e2 = err & bmask;
257
0
  return error_measure_lut[255 + e1] * (v - e2) +
258
0
         error_measure_lut[256 + e1] * e2;
259
0
}
260
261
/* Note: For an explanation of the warp algorithm, and some notes on bit widths
262
    for hardware implementations, see the comments above av1_warp_affine_c
263
*/
264
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
265
                              int width, int height, int stride, uint16_t *pred,
266
                              int p_col, int p_row, int p_width, int p_height,
267
                              int p_stride, int subsampling_x,
268
                              int subsampling_y, int bd,
269
                              ConvolveParams *conv_params, int16_t alpha,
270
0
                              int16_t beta, int16_t gamma, int16_t delta) {
271
0
  int32_t tmp[15 * 8];
272
0
  const int reduce_bits_horiz =
273
0
      conv_params->round_0 +
274
0
      AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
275
0
  const int reduce_bits_vert = conv_params->is_compound
276
0
                                   ? conv_params->round_1
277
0
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
278
0
  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
279
0
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
280
0
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
281
0
  const int round_bits =
282
0
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
283
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
284
0
  (void)max_bits_horiz;
285
0
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
286
287
0
  for (int i = p_row; i < p_row + p_height; i += 8) {
288
0
    for (int j = p_col; j < p_col + p_width; j += 8) {
289
      // Calculate the center of this 8x8 block,
290
      // project to luma coordinates (if in a subsampled chroma plane),
291
      // apply the affine transformation,
292
      // then convert back to the original coordinates (if necessary)
293
0
      const int32_t src_x = (j + 4) << subsampling_x;
294
0
      const int32_t src_y = (i + 4) << subsampling_y;
295
0
      const int64_t dst_x =
296
0
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
297
0
      const int64_t dst_y =
298
0
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
299
0
      const int64_t x4 = dst_x >> subsampling_x;
300
0
      const int64_t y4 = dst_y >> subsampling_y;
301
302
0
      const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
303
0
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
304
0
      const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
305
0
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
306
307
0
      sx4 += alpha * (-4) + beta * (-4);
308
0
      sy4 += gamma * (-4) + delta * (-4);
309
310
0
      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
311
0
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
312
313
      // Horizontal filter
314
0
      for (int k = -7; k < 8; ++k) {
315
0
        const int iy = clamp(iy4 + k, 0, height - 1);
316
317
0
        int sx = sx4 + beta * (k + 4);
318
0
        for (int l = -4; l < 4; ++l) {
319
0
          int ix = ix4 + l - 3;
320
0
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
321
0
                           WARPEDPIXEL_PREC_SHIFTS;
322
0
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
323
0
          const int16_t *coeffs = av1_warped_filter[offs];
324
325
0
          int32_t sum = 1 << offset_bits_horiz;
326
0
          for (int m = 0; m < 8; ++m) {
327
0
            const int sample_x = clamp(ix + m, 0, width - 1);
328
0
            sum += ref[iy * stride + sample_x] * coeffs[m];
329
0
          }
330
0
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
331
0
          assert(0 <= sum && sum < (1 << max_bits_horiz));
332
0
          tmp[(k + 7) * 8 + (l + 4)] = sum;
333
0
          sx += alpha;
334
0
        }
335
0
      }
336
337
      // Vertical filter
338
0
      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
339
0
        int sy = sy4 + delta * (k + 4);
340
0
        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
341
0
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
342
0
                           WARPEDPIXEL_PREC_SHIFTS;
343
0
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
344
0
          const int16_t *coeffs = av1_warped_filter[offs];
345
346
0
          int32_t sum = 1 << offset_bits_vert;
347
0
          for (int m = 0; m < 8; ++m) {
348
0
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
349
0
          }
350
351
0
          if (conv_params->is_compound) {
352
0
            CONV_BUF_TYPE *p =
353
0
                &conv_params
354
0
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
355
0
                           (j - p_col + l + 4)];
356
0
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
357
0
            if (conv_params->do_average) {
358
0
              uint16_t *dst16 =
359
0
                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
360
0
              int32_t tmp32 = *p;
361
0
              if (conv_params->use_dist_wtd_comp_avg) {
362
0
                tmp32 = tmp32 * conv_params->fwd_offset +
363
0
                        sum * conv_params->bck_offset;
364
0
                tmp32 = tmp32 >> DIST_PRECISION_BITS;
365
0
              } else {
366
0
                tmp32 += sum;
367
0
                tmp32 = tmp32 >> 1;
368
0
              }
369
0
              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
370
0
                      (1 << (offset_bits - conv_params->round_1 - 1));
371
0
              *dst16 =
372
0
                  clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd);
373
0
            } else {
374
0
              *p = sum;
375
0
            }
376
0
          } else {
377
0
            uint16_t *p =
378
0
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
379
0
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
380
0
            assert(0 <= sum && sum < (1 << (bd + 2)));
381
0
            *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
382
0
          }
383
0
          sy += gamma;
384
0
        }
385
0
      }
386
0
    }
387
0
  }
388
0
}
389
390
void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref,
391
                       int width, int height, int stride, uint16_t *const pred,
392
                       int p_col, int p_row, int p_width, int p_height,
393
                       int p_stride, int subsampling_x, int subsampling_y,
394
187k
                       int bd, ConvolveParams *conv_params) {
395
187k
  assert(wm->wmtype <= AFFINE);
396
187k
  if (wm->wmtype == ROTZOOM) {
397
41.4k
    wm->wmmat[5] = wm->wmmat[2];
398
41.4k
    wm->wmmat[4] = -wm->wmmat[3];
399
41.4k
  }
400
187k
  const int32_t *const mat = wm->wmmat;
401
187k
  const int16_t alpha = wm->alpha;
402
187k
  const int16_t beta = wm->beta;
403
187k
  const int16_t gamma = wm->gamma;
404
187k
  const int16_t delta = wm->delta;
405
406
187k
  av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
407
187k
                         p_width, p_height, p_stride, subsampling_x,
408
187k
                         subsampling_y, bd, conv_params, alpha, beta, gamma,
409
187k
                         delta);
410
187k
}
411
412
int64_t av1_calc_highbd_frame_error(const uint16_t *const ref, int stride,
413
                                    const uint16_t *const dst, int p_width,
414
0
                                    int p_height, int p_stride, int bd) {
415
0
  int64_t sum_error = 0;
416
0
  for (int i = 0; i < p_height; ++i) {
417
0
    for (int j = 0; j < p_width; ++j) {
418
0
      sum_error +=
419
0
          highbd_error_measure(dst[j + i * p_stride] - ref[j + i * stride], bd);
420
0
    }
421
0
  }
422
0
  return sum_error;
423
0
}
424
425
static int64_t highbd_segmented_frame_error(
426
    const uint16_t *const ref, int stride, const uint16_t *const dst,
427
    int p_width, int p_height, int p_stride, int bd, uint8_t *segment_map,
428
0
    int segment_map_stride) {
429
0
  int patch_w, patch_h;
430
0
  const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
431
0
  const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
432
0
  int64_t sum_error = 0;
433
0
  for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
434
0
    for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
435
0
      int seg_x = j >> WARP_ERROR_BLOCK_LOG;
436
0
      int seg_y = i >> WARP_ERROR_BLOCK_LOG;
437
      // Only compute the error if this block contains inliers from the motion
438
      // model
439
0
      if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
440
441
      // avoid computing error into the frame padding
442
0
      patch_w = AOMMIN(error_bsize_w, p_width - j);
443
0
      patch_h = AOMMIN(error_bsize_h, p_height - i);
444
0
      sum_error += av1_calc_highbd_frame_error(ref + j + i * stride, stride,
445
0
                                               dst + j + i * p_stride, patch_w,
446
0
                                               patch_h, p_stride, bd);
447
0
    }
448
0
  }
449
0
  return sum_error;
450
0
}
451
#endif  // CONFIG_AV1_HIGHBITDEPTH
452
453
/* The warp filter for ROTZOOM and AFFINE models works as follows:
454
   * Split the input into 8x8 blocks
455
   * For each block, project the point (4, 4) within the block, to get the
456
     overall block position. Split into integer and fractional coordinates,
457
     maintaining full WARPEDMODEL precision
458
   * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
459
     variable horizontal offset. This means that, while the rows of the
460
     intermediate buffer align with the rows of the *reference* image, the
461
     columns align with the columns of the *destination* image.
462
   * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
463
     destination is too small we crop the output at this stage). Each pixel has
464
     a variable vertical offset, so that the resulting rows are aligned with
465
     the rows of the destination image.
466
467
   To accomplish these alignments, we factor the warp matrix as a
468
   product of two shear / asymmetric zoom matrices:
469
   / a b \  = /   1       0    \ * / 1+alpha  beta \
470
   \ c d /    \ gamma  1+delta /   \    0      1   /
471
   where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
472
   The horizontal shear (with alpha and beta) is applied first,
473
   then the vertical shear (with gamma and delta) is applied second.
474
475
   The only limitation is that, to fit this in a fixed 8-tap filter size,
476
   the fractional pixel offsets must be at most +-1. Since the horizontal filter
477
   generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
478
   within the block, the parameters must satisfy
479
   4 * |alpha| + 7 * |beta| <= 1   and   4 * |gamma| + 4 * |delta| <= 1
480
   for this filter to be applicable.
481
482
   Note: This function assumes that the caller has done all of the relevant
483
   checks, ie. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
484
   are set appropriately (if using a ROTZOOM model), and that alpha, beta,
485
   gamma, delta are all in range.
486
487
   TODO(rachelbarker): Maybe support scaled references?
488
*/
489
/* A note on hardware implementation:
490
    The warp filter is intended to be implementable using the same hardware as
491
    the high-precision convolve filters from the loop-restoration and
492
    convolve-round experiments.
493
494
    For a single filter stage, considering all of the coefficient sets for the
495
    warp filter and the regular convolution filter, an input in the range
496
    [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
497
    before rounding.
498
499
    Allowing for some changes to the filter coefficient sets, call the range
500
    [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
501
    we can replace this by the range [0, 256 * 2^k], which can be stored in an
502
    unsigned value with 8 + k bits.
503
504
    This allows the derivation of the appropriate bit widths and offsets for
505
    the various intermediate values: If
506
507
    F := FILTER_BITS = 7 (or else the above ranges need adjusting)
508
         So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
509
         intermediate value.
510
    H := ROUND0_BITS
511
    V := VERSHEAR_REDUCE_PREC_BITS
512
    (and note that we must have H + V = 2*F for the output to have the same
513
     scale as the input)
514
515
    then we end up with the following offsets and ranges:
516
    Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
517
                       uint{bd + F + 1}
518
    After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
519
    Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
520
                     uint{bd + 2*F + 2 - H}
521
    After rounding: The final value, before undoing the offset, fits into a
522
                    uint{bd + 2}.
523
524
    Then we need to undo the offsets before clamping to a pixel. Note that,
525
    if we do this at the end, the amount to subtract is actually independent
526
    of H and V:
527
528
    offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
529
                         (1 << ((bd + 2*F - H) - V))
530
                      == (1 << (bd - 1)) + (1 << bd)
531
532
    This allows us to entirely avoid clamping in both the warp filter and
533
    the convolve-round experiment. As of the time of writing, the Wiener filter
534
    from loop-restoration can encode a central coefficient up to 216, which
535
    leads to a maximum value of about 282 * 2^k after applying the offset.
536
    So in that case we still need to clamp.
537
*/
538
// C reference implementation of the AV1 affine warp filter for 8-bit planes.
//
// The prediction area (p_width x p_height, at (p_col, p_row)) is processed in
// 8x8 tiles. For each tile the affine model 'mat' maps the tile centre into
// the reference frame; a separable 8-tap horizontal then vertical filter
// (coefficients from av1_warped_filter) produces the output. alpha/beta and
// gamma/delta are the per-pixel increments of the horizontal and vertical
// filter phases, derived from the model by av1_get_shear_params.
//
// Offsets of 1 << offset_bits_horiz / offset_bits_vert keep the intermediate
// sums non-negative so they fit in unsigned ranges (see the derivation in the
// comment block above this function); they are undone before clip_pixel.
void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
                       int height, int stride, uint8_t *pred, int p_col,
                       int p_row, int p_width, int p_height, int p_stride,
                       int subsampling_x, int subsampling_y,
                       ConvolveParams *conv_params, int16_t alpha, int16_t beta,
                       int16_t gamma, int16_t delta) {
  // Horizontal-pass output: 8 columns by 15 rows (8 output rows plus 7 extra
  // rows needed as vertical-filter support).
  int32_t tmp[15 * 8];
  const int bd = 8;
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  (void)max_bits_horiz;  // Only used by the assert below.
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  for (int i = p_row; i < p_row + p_height; i += 8) {
    for (int j = p_col; j < p_col + p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (j + 4) << subsampling_x;
      const int32_t src_y = (i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      // Split the warped position into integer pixel and fractional parts.
      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Move the phase from the block centre to its top-left sample.
      sx4 += alpha * (-4) + beta * (-4);
      sy4 += gamma * (-4) + delta * (-4);

      // Reduce phase precision to match the SIMD implementations.
      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      for (int k = -7; k < 8; ++k) {
        // Clamp to top/bottom edge of the frame
        const int iy = clamp(iy4 + k, 0, height - 1);

        int sx = sx4 + beta * (k + 4);

        for (int l = -4; l < 4; ++l) {
          int ix = ix4 + l - 3;
          // At this point, sx = sx4 + alpha * l + beta * k
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_horiz;
          for (int m = 0; m < 8; ++m) {
            // Clamp to left/right edge of the frame
            const int sample_x = clamp(ix + m, 0, width - 1);

            sum += ref[iy * stride + sample_x] * coeffs[m];
          }
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
          assert(0 <= sum && sum < (1 << max_bits_horiz));
          tmp[(k + 7) * 8 + (l + 4)] = sum;
          sx += alpha;
        }
      }

      // Vertical filter. The loop bounds stop at the edge of the prediction
      // block, so partial tiles on the right/bottom are handled correctly.
      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
          // At this point, sy = sy4 + gamma * l + delta * k
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_vert;
          for (int m = 0; m < 8; ++m) {
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
          }

          if (conv_params->is_compound) {
            // Compound path: write into the CONV_BUF_TYPE accumulator, or
            // average with its current contents into the 8-bit prediction.
            CONV_BUF_TYPE *p =
                &conv_params
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                           (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            if (conv_params->do_average) {
              uint8_t *dst8 =
                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
              int32_t tmp32 = *p;
              if (conv_params->use_dist_wtd_comp_avg) {
                // Distance-weighted average of the two predictions.
                tmp32 = tmp32 * conv_params->fwd_offset +
                        sum * conv_params->bck_offset;
                tmp32 = tmp32 >> DIST_PRECISION_BITS;
              } else {
                tmp32 += sum;
                tmp32 = tmp32 >> 1;
              }
              // Undo the horizontal+vertical offsets before final rounding.
              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
                      (1 << (offset_bits - conv_params->round_1 - 1));
              *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits));
            } else {
              *p = sum;
            }
          } else {
            // Single-reference path: round, undo the offsets
            // ((1 << (bd - 1)) + (1 << bd), see derivation above) and clamp.
            uint8_t *p =
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            assert(0 <= sum && sum < (1 << (bd + 2)));
            *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
          }
          sy += gamma;
        }
      }
    }
  }
}
667
668
// Warp one 8-bit plane with the model in 'wm' by delegating to the
// (possibly SIMD-accelerated) av1_warp_affine implementation.
// For a ROTZOOM model only wmmat[2]/wmmat[3] are meaningful, so the second
// row of the 2x2 matrix is reconstructed from them before use.
void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width,
                int height, int stride, uint8_t *pred, int p_col, int p_row,
                int p_width, int p_height, int p_stride, int subsampling_x,
                int subsampling_y, ConvolveParams *conv_params) {
  assert(wm->wmtype <= AFFINE);
  if (wm->wmtype == ROTZOOM) {
    // Rotation/zoom: the lower matrix row mirrors the upper one.
    wm->wmmat[5] = wm->wmmat[2];
    wm->wmmat[4] = -wm->wmmat[3];
  }
  av1_warp_affine(wm->wmmat, ref, width, height, stride, pred, p_col, p_row,
                  p_width, p_height, p_stride, subsampling_x, subsampling_y,
                  conv_params, wm->alpha, wm->beta, wm->gamma, wm->delta);
}
686
687
// Sum of per-pixel error_measure() values between 'dst' and 'ref' over a
// p_width x p_height region. C reference version of av1_calc_frame_error.
int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride,
                               const uint8_t *const dst, int p_width,
                               int p_height, int p_stride) {
  int64_t total = 0;
  for (int row = 0; row < p_height; ++row) {
    const uint8_t *const ref_row = ref + (ptrdiff_t)row * stride;
    const uint8_t *const dst_row = dst + (ptrdiff_t)row * p_stride;
    for (int col = 0; col < p_width; ++col) {
      total += (int64_t)error_measure(dst_row[col] - ref_row[col]);
    }
  }
  return total;
}
699
700
static int64_t segmented_frame_error(const uint8_t *const ref, int stride,
701
                                     const uint8_t *const dst, int p_width,
702
                                     int p_height, int p_stride,
703
                                     uint8_t *segment_map,
704
0
                                     int segment_map_stride) {
705
0
  int patch_w, patch_h;
706
0
  const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
707
0
  const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
708
0
  int64_t sum_error = 0;
709
0
  for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
710
0
    for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
711
0
      int seg_x = j >> WARP_ERROR_BLOCK_LOG;
712
0
      int seg_y = i >> WARP_ERROR_BLOCK_LOG;
713
      // Only compute the error if this block contains inliers from the motion
714
      // model
715
0
      if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
716
717
      // avoid computing error into the frame padding
718
0
      patch_w = AOMMIN(error_bsize_w, p_width - j);
719
0
      patch_h = AOMMIN(error_bsize_h, p_height - i);
720
0
      sum_error += av1_calc_frame_error(ref + j + i * stride, stride,
721
0
                                        dst + j + i * p_stride, patch_w,
722
0
                                        patch_h, p_stride);
723
0
    }
724
0
  }
725
0
  return sum_error;
726
0
}
727
728
// Bit-depth dispatching wrapper around av1_calc_frame_error /
// av1_calc_highbd_frame_error. 'use_hbd' selects the high-bit-depth path
// (where 'ref'/'dst' are CONVERT_TO_SHORTPTR-style pointers); 'bd' is the
// bit depth, used only on that path.
int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
                        uint8_t *dst, int p_width, int p_height, int p_stride) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (use_hbd) {
    return av1_calc_highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
                                       CONVERT_TO_SHORTPTR(dst), p_width,
                                       p_height, p_stride, bd);
  }
#endif
  // Low-bit-depth path (also taken when high-bit-depth support is compiled
  // out); use_hbd/bd are unused here.
  (void)use_hbd;
  (void)bd;
  return av1_calc_frame_error(ref, stride, dst, p_width, p_height, p_stride);
}
741
742
// Bit-depth dispatching wrapper around segmented_frame_error /
// highbd_segmented_frame_error: accumulates error only over blocks flagged
// in 'segment_map' (see segmented_frame_error above).
int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
                                  int stride, uint8_t *dst, int p_width,
                                  int p_height, int p_stride,
                                  uint8_t *segment_map,
                                  int segment_map_stride) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (use_hbd) {
    return highbd_segmented_frame_error(
        CONVERT_TO_SHORTPTR(ref), stride, CONVERT_TO_SHORTPTR(dst), p_width,
        p_height, p_stride, bd, segment_map, segment_map_stride);
  }
#endif
  // Low-bit-depth path; use_hbd/bd are unused here.
  (void)use_hbd;
  (void)bd;
  return segmented_frame_error(ref, stride, dst, p_width, p_height, p_stride,
                               segment_map, segment_map_stride);
}
759
760
// Public entry point for warping one plane: dispatches to the
// high-bit-depth or 8-bit implementation depending on 'use_hbd'.
void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
                    const uint8_t *ref, int width, int height, int stride,
                    uint8_t *pred, int p_col, int p_row, int p_width,
                    int p_height, int p_stride, int subsampling_x,
                    int subsampling_y, ConvolveParams *conv_params) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (use_hbd)
    highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride,
                      CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width,
                      p_height, p_stride, subsampling_x, subsampling_y, bd,
                      conv_params);
  else
    warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
               p_height, p_stride, subsampling_x, subsampling_y, conv_params);
#else
  // High-bit-depth support compiled out: always take the 8-bit path.
  (void)use_hbd;
  (void)bd;
  warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
             p_height, p_stride, subsampling_x, subsampling_y, conv_params);
#endif
}
781
782
2.68M
// Constants and helper macros for the integer least-squares fit in
// find_affine_int() below.
#define LS_MV_MAX 256  // max mv in 1/8-pel
// Use LS_STEP = 8 so that 2 less bits needed for A, Bx, By.
#define LS_STEP 8

// Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
// the precision needed is:
//   (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
//   (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
//   1 [for sign] +
//   LEAST_SQUARES_SAMPLES_MAX_BITS
//        [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
// The value is 23
#define LS_MAT_RANGE_BITS \
  ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)

// Bit-depth reduction from the full-range
#define LS_MAT_DOWN_BITS 2

// bits range of A, Bx and By after downshifting
#define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
#define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
#define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)

// The macros below accumulate (a + LS_STEP/2)(b + LS_STEP/2)-style products
// scaled by 4, then shift down. By setting LS_STEP = 8, the least 2 bits of
// every element in A, Bx, By are 0, so those 2 bits plus the
// LS_MAT_DOWN_BITS (= 2) range-reduction bits can be dropped here.
#define LS_SQUARE(a)                                          \
  (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
   (2 + LS_MAT_DOWN_BITS))
#define LS_PRODUCT1(a, b)                                           \
  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
   (2 + LS_MAT_DOWN_BITS))
#define LS_PRODUCT2(a, b)                                               \
  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
   (2 + LS_MAT_DOWN_BITS))

// When 1, the divisions in find_affine_int() use a reduced-precision
// 16-bit multiplier (see resolve_multiplier_64 below) instead of a full
// 64-bit multiply.
#define USE_LIMITED_PREC_MULT 0
818
819
#if USE_LIMITED_PREC_MULT

#define MUL_PREC_BITS 16
// Reduce a 64-bit unsigned value D to a multiplier of at most MUL_PREC_BITS
// significant bits. On return, (mult << *shift) approximates D (rounded via
// ROUND_POWER_OF_TWO_64). D == 0 yields mult == 0, *shift == 0.
static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
  int msb = 0;
  uint16_t mult = 0;
  *shift = 0;
  if (D != 0) {
    // Index of the most significant set bit, handling the upper 32 bits
    // separately since get_msb takes an unsigned int.
    msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
                              : get_msb((unsigned int)D));
    if (msb >= MUL_PREC_BITS) {
      // Too wide: keep the top MUL_PREC_BITS bits (rounded) and record how
      // far they were shifted down.
      mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS);
      *shift = msb + 1 - MUL_PREC_BITS;
    } else {
      // Already fits in MUL_PREC_BITS bits; use it unchanged.
      mult = (uint16_t)D;
      *shift = 0;
    }
  }
  return mult;
}
839
840
static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
841
  int32_t ret;
842
  int16_t mshift;
843
  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
844
  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
845
  shift -= mshift;
846
  if (shift > 0) {
847
    return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
848
                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
849
                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
850
  } else {
851
    return (int32_t)clamp(v * (1 << (-shift)),
852
                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
853
                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
854
  }
855
  return ret;
856
}
857
858
// Limited-precision computation of a diagonal affine parameter:
// approximately (Px * iDet) >> shift, clamped to a band around the identity
// value (1 << WARPEDMODEL_PREC_BITS). Px is reduced to a 16-bit multiplier
// first so the product fits in 32 bits.
static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
  int16_t mshift;
  // |Px| ~= Mul << mshift, with Mul < 2^MUL_PREC_BITS.
  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
  shift -= mshift;
  if (shift > 0) {
    return (int32_t)clamp(
        ROUND_POWER_OF_TWO_SIGNED(v, shift),
        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  } else {
    return (int32_t)clamp(
        v * (1 << (-shift)),
        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  }
}
875
876
#else
877
878
834k
static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
879
834k
  int64_t v = Px * (int64_t)iDet;
880
834k
  return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
881
834k
                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
882
834k
                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
883
834k
}
884
885
834k
static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
886
834k
  int64_t v = Px * (int64_t)iDet;
887
834k
  return (int32_t)clamp64(
888
834k
      ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
889
834k
      (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
890
834k
      (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
891
834k
}
892
#endif  // USE_LIMITED_PREC_MULT
893
894
// Fit an integer affine model to np correspondences (pts1 -> pts2, in
// 1/8-pel units) via least squares, writing the result into wm->wmmat.
// (mvy, mvx) is the block's transmitted motion vector (1/8-pel), which the
// model is constrained to reproduce at the block centre.
// Returns 0 on success, 1 if the normal-equation matrix is singular.
static int find_affine_int(int np, const int *pts1, const int *pts2,
                           BLOCK_SIZE bsize, int mvy, int mvx,
                           WarpedMotionParams *wm, int mi_row, int mi_col) {
  int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
  int32_t Bx[2] = { 0, 0 };
  int32_t By[2] = { 0, 0 };

  const int bw = block_size_wide[bsize];
  const int bh = block_size_high[bsize];
  // Block-centre offsets: rsu* in full pixels, su* in 1/8-pel; du* adds the
  // transmitted motion vector to get the destination-side centre.
  const int rsuy = bh / 2 - 1;
  const int rsux = bw / 2 - 1;
  const int suy = rsuy * 8;
  const int sux = rsux * 8;
  const int duy = suy + mvy;
  const int dux = sux + mvx;

  // Assume the center pixel of the block has exactly the same motion vector
  // as transmitted for the block. First shift the origin of the source
  // points to the block center, and the origin of the destination points to
  // the block center added to the motion vector transmitted.
  // Let (xi, yi) denote the source points and (xi', yi') denote destination
  // points after origin shfifting, for i = 0, 1, 2, .... n-1.
  // Then if  P = [x0, y0,
  //               x1, y1
  //               x2, y1,
  //                ....
  //              ]
  //          q = [x0', x1', x2', ... ]'
  //          r = [y0', y1', y2', ... ]'
  // the least squares problems that need to be solved are:
  //          [h1, h2]' = inv(P'P)P'q and
  //          [h3, h4]' = inv(P'P)P'r
  // where the affine transformation is given by:
  //          x' = h1.x + h2.y
  //          y' = h3.x + h4.y
  //
  // The loop below computes: A = P'P, Bx = P'q, By = P'r
  // We need to just compute inv(A).Bx and inv(A).By for the solutions.
  // Contribution from neighbor block
  for (int i = 0; i < np; i++) {
    const int dx = pts2[i * 2] - dux;
    const int dy = pts2[i * 2 + 1] - duy;
    const int sx = pts1[i * 2] - sux;
    const int sy = pts1[i * 2 + 1] - suy;
    // (TODO)yunqing: This comparison wouldn't be necessary if the sample
    // selection is done in find_samples(). Also, global offset can be removed
    // while collecting samples.
    if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
      A[0][0] += LS_SQUARE(sx);
      A[0][1] += LS_PRODUCT1(sx, sy);
      A[1][1] += LS_SQUARE(sy);
      Bx[0] += LS_PRODUCT2(sx, dx);
      Bx[1] += LS_PRODUCT1(sy, dx);
      By[0] += LS_PRODUCT1(sx, dy);
      By[1] += LS_PRODUCT2(sy, dy);
    }
  }

  // Just for debugging, and can be removed later.
  assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
  assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
  assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
  assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
  assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
  assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
  assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);

  // Compute Determinant of A
  const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
  if (Det == 0) return 1;

  // Replace division by Det with multiplication by its fixed-point
  // reciprocal iDet, plus a right shift.
  int16_t shift;
  int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
  shift -= WARPEDMODEL_PREC_BITS;
  if (shift < 0) {
    iDet <<= (-shift);
    shift = 0;
  }

  int64_t Px[2], Py[2];
  // These divided by the Det, are the least squares solutions
  Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
  Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
  Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
  Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];

  wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
  wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
  wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
  wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);

  // Derive the translation so that the model maps the block centre
  // (isux, isuy) in frame coordinates onto centre + mv.
  const int isuy = (mi_row * MI_SIZE + rsuy);
  const int isux = (mi_col * MI_SIZE + rsux);
  // Note: In the vx, vy expressions below, the max value of each of the
  // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room
  // for the first term so that the overall sum in the worst case fits
  // within 32 bits overall.
  const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
                     (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
                      isuy * wm->wmmat[3]);
  const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
                     (isux * wm->wmmat[4] +
                      isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
  wm->wmmat[0] =
      clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  wm->wmmat[1] =
      clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  return 0;
}
1003
1004
int av1_find_projection(int np, const int *pts1, const int *pts2,
1005
                        BLOCK_SIZE bsize, int mvy, int mvx,
1006
441k
                        WarpedMotionParams *wm_params, int mi_row, int mi_col) {
1007
441k
  assert(wm_params->wmtype == AFFINE);
1008
1009
441k
  if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row,
1010
441k
                      mi_col))
1011
24.7k
    return 1;
1012
1013
  // check compatibility with the fast warp filter
1014
417k
  if (!av1_get_shear_params(wm_params)) return 1;
1015
1016
390k
  return 0;
1017
417k
}