Coverage Report

Created: 2026-05-30 06:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/svt-av1/Source/Lib/Codec/warped_motion.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10
 */
11
12
#include <stdio.h>
13
#include <stdlib.h>
14
#include <assert.h>
15
#include "common_utils.h"
16
#include "aom_dsp_rtcd.h"
17
#include "warped_motion.h"
18
#include "bitstream_unit.h"
19
#include "definitions.h"
20
#include "convolve.h"
21
22
#define LS_MV_MAX 256 // max mv in 1/8-pel
23
// Use LS_STEP = 8 so that 2 less bits needed for A, bx, by.
24
#define LS_STEP 8
25
26
// Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
27
// the precision needed is:
28
//   (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
29
//   (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
30
//   1 [for sign] +
31
//   LEAST_SQUARES_SAMPLES_MAX_BITS
32
//        [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
33
// The value is 23
34
#define LS_MAT_RANGE_BITS ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)
35
36
// Bit-depth reduction from the full-range
37
#define LS_MAT_DOWN_BITS 2
38
39
// bits range of A, bx and by after downshifting
40
#define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
41
#define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
42
#define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)
43
44
// by setting LS_STEP = 8, the least 2 bits of every elements in A, bx, by are
45
// 0. So, we can reduce LS_MAT_RANGE_BITS(2) bits here.
46
#define LS_SQUARE(a) (((a) * (a) * 4 + (a) * 4 * LS_STEP + LS_STEP * LS_STEP * 2) >> (2 + LS_MAT_DOWN_BITS))
47
#define LS_PRODUCT1(a, b) (((a) * (b) * 4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> (2 + LS_MAT_DOWN_BITS))
48
#define LS_PRODUCT2(a, b) \
49
    (((a) * (b) * 4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> (2 + LS_MAT_DOWN_BITS))
50
51
// For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
52
// at a time. The zoom/rotation/shear in the model are applied to the
53
// "fractional" position of each pixel, which therefore varies within
54
// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
55
// We need an extra 2 taps to fit this in, for a total of 8 taps.
56
/* clang-format off */
57
EB_ALIGN(16) const int16_t svt_aom_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
58
#if WARPEDPIXEL_PREC_BITS == 6
59
    // [-1, 0)
60
    { 0,   0, 127,   1,   0, 0, 0, 0 }, { 0, - 1, 127,   2,   0, 0, 0, 0 },
61
    { 1, - 3, 127,   4, - 1, 0, 0, 0 }, { 1, - 4, 126,   6, - 2, 1, 0, 0 },
62
    { 1, - 5, 126,   8, - 3, 1, 0, 0 }, { 1, - 6, 125,  11, - 4, 1, 0, 0 },
63
    { 1, - 7, 124,  13, - 4, 1, 0, 0 }, { 2, - 8, 123,  15, - 5, 1, 0, 0 },
64
    { 2, - 9, 122,  18, - 6, 1, 0, 0 }, { 2, -10, 121,  20, - 6, 1, 0, 0 },
65
    { 2, -11, 120,  22, - 7, 2, 0, 0 }, { 2, -12, 119,  25, - 8, 2, 0, 0 },
66
    { 3, -13, 117,  27, - 8, 2, 0, 0 }, { 3, -13, 116,  29, - 9, 2, 0, 0 },
67
    { 3, -14, 114,  32, -10, 3, 0, 0 }, { 3, -15, 113,  35, -10, 2, 0, 0 },
68
    { 3, -15, 111,  37, -11, 3, 0, 0 }, { 3, -16, 109,  40, -11, 3, 0, 0 },
69
    { 3, -16, 108,  42, -12, 3, 0, 0 }, { 4, -17, 106,  45, -13, 3, 0, 0 },
70
    { 4, -17, 104,  47, -13, 3, 0, 0 }, { 4, -17, 102,  50, -14, 3, 0, 0 },
71
    { 4, -17, 100,  52, -14, 3, 0, 0 }, { 4, -18,  98,  55, -15, 4, 0, 0 },
72
    { 4, -18,  96,  58, -15, 3, 0, 0 }, { 4, -18,  94,  60, -16, 4, 0, 0 },
73
    { 4, -18,  91,  63, -16, 4, 0, 0 }, { 4, -18,  89,  65, -16, 4, 0, 0 },
74
    { 4, -18,  87,  68, -17, 4, 0, 0 }, { 4, -18,  85,  70, -17, 4, 0, 0 },
75
    { 4, -18,  82,  73, -17, 4, 0, 0 }, { 4, -18,  80,  75, -17, 4, 0, 0 },
76
    { 4, -18,  78,  78, -18, 4, 0, 0 }, { 4, -17,  75,  80, -18, 4, 0, 0 },
77
    { 4, -17,  73,  82, -18, 4, 0, 0 }, { 4, -17,  70,  85, -18, 4, 0, 0 },
78
    { 4, -17,  68,  87, -18, 4, 0, 0 }, { 4, -16,  65,  89, -18, 4, 0, 0 },
79
    { 4, -16,  63,  91, -18, 4, 0, 0 }, { 4, -16,  60,  94, -18, 4, 0, 0 },
80
    { 3, -15,  58,  96, -18, 4, 0, 0 }, { 4, -15,  55,  98, -18, 4, 0, 0 },
81
    { 3, -14,  52, 100, -17, 4, 0, 0 }, { 3, -14,  50, 102, -17, 4, 0, 0 },
82
    { 3, -13,  47, 104, -17, 4, 0, 0 }, { 3, -13,  45, 106, -17, 4, 0, 0 },
83
    { 3, -12,  42, 108, -16, 3, 0, 0 }, { 3, -11,  40, 109, -16, 3, 0, 0 },
84
    { 3, -11,  37, 111, -15, 3, 0, 0 }, { 2, -10,  35, 113, -15, 3, 0, 0 },
85
    { 3, -10,  32, 114, -14, 3, 0, 0 }, { 2, - 9,  29, 116, -13, 3, 0, 0 },
86
    { 2, - 8,  27, 117, -13, 3, 0, 0 }, { 2, - 8,  25, 119, -12, 2, 0, 0 },
87
    { 2, - 7,  22, 120, -11, 2, 0, 0 }, { 1, - 6,  20, 121, -10, 2, 0, 0 },
88
    { 1, - 6,  18, 122, - 9, 2, 0, 0 }, { 1, - 5,  15, 123, - 8, 2, 0, 0 },
89
    { 1, - 4,  13, 124, - 7, 1, 0, 0 }, { 1, - 4,  11, 125, - 6, 1, 0, 0 },
90
    { 1, - 3,   8, 126, - 5, 1, 0, 0 }, { 1, - 2,   6, 126, - 4, 1, 0, 0 },
91
    { 0, - 1,   4, 127, - 3, 1, 0, 0 }, { 0,   0,   2, 127, - 1, 0, 0, 0 },
92
    // [0, 1)
93
    { 0,  0,   0, 127,   1,   0,  0,  0}, { 0,  0,  -1, 127,   2,   0,  0,  0},
94
    { 0,  1,  -3, 127,   4,  -2,  1,  0}, { 0,  1,  -5, 127,   6,  -2,  1,  0},
95
    { 0,  2,  -6, 126,   8,  -3,  1,  0}, {-1,  2,  -7, 126,  11,  -4,  2, -1},
96
    {-1,  3,  -8, 125,  13,  -5,  2, -1}, {-1,  3, -10, 124,  16,  -6,  3, -1},
97
    {-1,  4, -11, 123,  18,  -7,  3, -1}, {-1,  4, -12, 122,  20,  -7,  3, -1},
98
    {-1,  4, -13, 121,  23,  -8,  3, -1}, {-2,  5, -14, 120,  25,  -9,  4, -1},
99
    {-1,  5, -15, 119,  27, -10,  4, -1}, {-1,  5, -16, 118,  30, -11,  4, -1},
100
    {-2,  6, -17, 116,  33, -12,  5, -1}, {-2,  6, -17, 114,  35, -12,  5, -1},
101
    {-2,  6, -18, 113,  38, -13,  5, -1}, {-2,  7, -19, 111,  41, -14,  6, -2},
102
    {-2,  7, -19, 110,  43, -15,  6, -2}, {-2,  7, -20, 108,  46, -15,  6, -2},
103
    {-2,  7, -20, 106,  49, -16,  6, -2}, {-2,  7, -21, 104,  51, -16,  7, -2},
104
    {-2,  7, -21, 102,  54, -17,  7, -2}, {-2,  8, -21, 100,  56, -18,  7, -2},
105
    {-2,  8, -22,  98,  59, -18,  7, -2}, {-2,  8, -22,  96,  62, -19,  7, -2},
106
    {-2,  8, -22,  94,  64, -19,  7, -2}, {-2,  8, -22,  91,  67, -20,  8, -2},
107
    {-2,  8, -22,  89,  69, -20,  8, -2}, {-2,  8, -22,  87,  72, -21,  8, -2},
108
    {-2,  8, -21,  84,  74, -21,  8, -2}, {-2,  8, -22,  82,  77, -21,  8, -2},
109
    {-2,  8, -21,  79,  79, -21,  8, -2}, {-2,  8, -21,  77,  82, -22,  8, -2},
110
    {-2,  8, -21,  74,  84, -21,  8, -2}, {-2,  8, -21,  72,  87, -22,  8, -2},
111
    {-2,  8, -20,  69,  89, -22,  8, -2}, {-2,  8, -20,  67,  91, -22,  8, -2},
112
    {-2,  7, -19,  64,  94, -22,  8, -2}, {-2,  7, -19,  62,  96, -22,  8, -2},
113
    {-2,  7, -18,  59,  98, -22,  8, -2}, {-2,  7, -18,  56, 100, -21,  8, -2},
114
    {-2,  7, -17,  54, 102, -21,  7, -2}, {-2,  7, -16,  51, 104, -21,  7, -2},
115
    {-2,  6, -16,  49, 106, -20,  7, -2}, {-2,  6, -15,  46, 108, -20,  7, -2},
116
    {-2,  6, -15,  43, 110, -19,  7, -2}, {-2,  6, -14,  41, 111, -19,  7, -2},
117
    {-1,  5, -13,  38, 113, -18,  6, -2}, {-1,  5, -12,  35, 114, -17,  6, -2},
118
    {-1,  5, -12,  33, 116, -17,  6, -2}, {-1,  4, -11,  30, 118, -16,  5, -1},
119
    {-1,  4, -10,  27, 119, -15,  5, -1}, {-1,  4,  -9,  25, 120, -14,  5, -2},
120
    {-1,  3,  -8,  23, 121, -13,  4, -1}, {-1,  3,  -7,  20, 122, -12,  4, -1},
121
    {-1,  3,  -7,  18, 123, -11,  4, -1}, {-1,  3,  -6,  16, 124, -10,  3, -1},
122
    {-1,  2,  -5,  13, 125,  -8,  3, -1}, {-1,  2,  -4,  11, 126,  -7,  2, -1},
123
    { 0,  1,  -3,   8, 126,  -6,  2,  0}, { 0,  1,  -2,   6, 127,  -5,  1,  0},
124
    { 0,  1,  -2,   4, 127,  -3,  1,  0}, { 0,  0,   0,   2, 127,  -1,  0,  0},
125
    // [1, 2)
126
    { 0, 0, 0,   1, 127,   0,   0, 0 }, { 0, 0, 0, - 1, 127,   2,   0, 0 },
127
    { 0, 0, 1, - 3, 127,   4, - 1, 0 }, { 0, 0, 1, - 4, 126,   6, - 2, 1 },
128
    { 0, 0, 1, - 5, 126,   8, - 3, 1 }, { 0, 0, 1, - 6, 125,  11, - 4, 1 },
129
    { 0, 0, 1, - 7, 124,  13, - 4, 1 }, { 0, 0, 2, - 8, 123,  15, - 5, 1 },
130
    { 0, 0, 2, - 9, 122,  18, - 6, 1 }, { 0, 0, 2, -10, 121,  20, - 6, 1 },
131
    { 0, 0, 2, -11, 120,  22, - 7, 2 }, { 0, 0, 2, -12, 119,  25, - 8, 2 },
132
    { 0, 0, 3, -13, 117,  27, - 8, 2 }, { 0, 0, 3, -13, 116,  29, - 9, 2 },
133
    { 0, 0, 3, -14, 114,  32, -10, 3 }, { 0, 0, 3, -15, 113,  35, -10, 2 },
134
    { 0, 0, 3, -15, 111,  37, -11, 3 }, { 0, 0, 3, -16, 109,  40, -11, 3 },
135
    { 0, 0, 3, -16, 108,  42, -12, 3 }, { 0, 0, 4, -17, 106,  45, -13, 3 },
136
    { 0, 0, 4, -17, 104,  47, -13, 3 }, { 0, 0, 4, -17, 102,  50, -14, 3 },
137
    { 0, 0, 4, -17, 100,  52, -14, 3 }, { 0, 0, 4, -18,  98,  55, -15, 4 },
138
    { 0, 0, 4, -18,  96,  58, -15, 3 }, { 0, 0, 4, -18,  94,  60, -16, 4 },
139
    { 0, 0, 4, -18,  91,  63, -16, 4 }, { 0, 0, 4, -18,  89,  65, -16, 4 },
140
    { 0, 0, 4, -18,  87,  68, -17, 4 }, { 0, 0, 4, -18,  85,  70, -17, 4 },
141
    { 0, 0, 4, -18,  82,  73, -17, 4 }, { 0, 0, 4, -18,  80,  75, -17, 4 },
142
    { 0, 0, 4, -18,  78,  78, -18, 4 }, { 0, 0, 4, -17,  75,  80, -18, 4 },
143
    { 0, 0, 4, -17,  73,  82, -18, 4 }, { 0, 0, 4, -17,  70,  85, -18, 4 },
144
    { 0, 0, 4, -17,  68,  87, -18, 4 }, { 0, 0, 4, -16,  65,  89, -18, 4 },
145
    { 0, 0, 4, -16,  63,  91, -18, 4 }, { 0, 0, 4, -16,  60,  94, -18, 4 },
146
    { 0, 0, 3, -15,  58,  96, -18, 4 }, { 0, 0, 4, -15,  55,  98, -18, 4 },
147
    { 0, 0, 3, -14,  52, 100, -17, 4 }, { 0, 0, 3, -14,  50, 102, -17, 4 },
148
    { 0, 0, 3, -13,  47, 104, -17, 4 }, { 0, 0, 3, -13,  45, 106, -17, 4 },
149
    { 0, 0, 3, -12,  42, 108, -16, 3 }, { 0, 0, 3, -11,  40, 109, -16, 3 },
150
    { 0, 0, 3, -11,  37, 111, -15, 3 }, { 0, 0, 2, -10,  35, 113, -15, 3 },
151
    { 0, 0, 3, -10,  32, 114, -14, 3 }, { 0, 0, 2, - 9,  29, 116, -13, 3 },
152
    { 0, 0, 2, - 8,  27, 117, -13, 3 }, { 0, 0, 2, - 8,  25, 119, -12, 2 },
153
    { 0, 0, 2, - 7,  22, 120, -11, 2 }, { 0, 0, 1, - 6,  20, 121, -10, 2 },
154
    { 0, 0, 1, - 6,  18, 122, - 9, 2 }, { 0, 0, 1, - 5,  15, 123, - 8, 2 },
155
    { 0, 0, 1, - 4,  13, 124, - 7, 1 }, { 0, 0, 1, - 4,  11, 125, - 6, 1 },
156
    { 0, 0, 1, - 3,   8, 126, - 5, 1 }, { 0, 0, 1, - 2,   6, 126, - 4, 1 },
157
    { 0, 0, 0, - 1,   4, 127, - 3, 1 }, { 0, 0, 0,   0,   2, 127, - 1, 0 },
158
    // dummy (replicate row index 191)
159
    { 0, 0, 0,   0,   2, 127, - 1, 0 },
160
#elif WARPEDPIXEL_PREC_BITS == 5
161
    // [-1, 0)
162
    {0,   0, 127,   1,   0, 0, 0, 0}, {1,  -3, 127,   4,  -1, 0, 0, 0},
163
    {1,  -5, 126,   8,  -3, 1, 0, 0}, {1,  -7, 124,  13,  -4, 1, 0, 0},
164
    {2,  -9, 122,  18,  -6, 1, 0, 0}, {2, -11, 120,  22,  -7, 2, 0, 0},
165
    {3, -13, 117,  27,  -8, 2, 0, 0}, {3, -14, 114,  32, -10, 3, 0, 0},
166
    {3, -15, 111,  37, -11, 3, 0, 0}, {3, -16, 108,  42, -12, 3, 0, 0},
167
    {4, -17, 104,  47, -13, 3, 0, 0}, {4, -17, 100,  52, -14, 3, 0, 0},
168
    {4, -18,  96,  58, -15, 3, 0, 0}, {4, -18,  91,  63, -16, 4, 0, 0},
169
    {4, -18,  87,  68, -17, 4, 0, 0}, {4, -18,  82,  73, -17, 4, 0, 0},
170
    {4, -18,  78,  78, -18, 4, 0, 0}, {4, -17,  73,  82, -18, 4, 0, 0},
171
    {4, -17,  68,  87, -18, 4, 0, 0}, {4, -16,  63,  91, -18, 4, 0, 0},
172
    {3, -15,  58,  96, -18, 4, 0, 0}, {3, -14,  52, 100, -17, 4, 0, 0},
173
    {3, -13,  47, 104, -17, 4, 0, 0}, {3, -12,  42, 108, -16, 3, 0, 0},
174
    {3, -11,  37, 111, -15, 3, 0, 0}, {3, -10,  32, 114, -14, 3, 0, 0},
175
    {2,  -8,  27, 117, -13, 3, 0, 0}, {2,  -7,  22, 120, -11, 2, 0, 0},
176
    {1,  -6,  18, 122,  -9, 2, 0, 0}, {1,  -4,  13, 124,  -7, 1, 0, 0},
177
    {1,  -3,   8, 126,  -5, 1, 0, 0}, {0,  -1,   4, 127,  -3, 1, 0, 0},
178
    // [0, 1)
179
    { 0,  0,   0, 127,   1,   0,   0,  0}, { 0,  1,  -3, 127,   4,  -2,   1,  0},
180
    { 0,  2,  -6, 126,   8,  -3,   1,  0}, {-1,  3,  -8, 125,  13,  -5,   2, -1},
181
    {-1,  4, -11, 123,  18,  -7,   3, -1}, {-1,  4, -13, 121,  23,  -8,   3, -1},
182
    {-1,  5, -15, 119,  27, -10,   4, -1}, {-2,  6, -17, 116,  33, -12,   5, -1},
183
    {-2,  6, -18, 113,  38, -13,   5, -1}, {-2,  7, -19, 110,  43, -15,   6, -2},
184
    {-2,  7, -20, 106,  49, -16,   6, -2}, {-2,  7, -21, 102,  54, -17,   7, -2},
185
    {-2,  8, -22,  98,  59, -18,   7, -2}, {-2,  8, -22,  94,  64, -19,   7, -2},
186
    {-2,  8, -22,  89,  69, -20,   8, -2}, {-2,  8, -21,  84,  74, -21,   8, -2},
187
    {-2,  8, -21,  79,  79, -21,   8, -2}, {-2,  8, -21,  74,  84, -21,   8, -2},
188
    {-2,  8, -20,  69,  89, -22,   8, -2}, {-2,  7, -19,  64,  94, -22,   8, -2},
189
    {-2,  7, -18,  59,  98, -22,   8, -2}, {-2,  7, -17,  54, 102, -21,   7, -2},
190
    {-2,  6, -16,  49, 106, -20,   7, -2}, {-2,  6, -15,  43, 110, -19,   7, -2},
191
    {-1,  5, -13,  38, 113, -18,   6, -2}, {-1,  5, -12,  33, 116, -17,   6, -2},
192
    {-1,  4, -10,  27, 119, -15,   5, -1}, {-1,  3,  -8,  23, 121, -13,   4, -1},
193
    {-1,  3,  -7,  18, 123, -11,   4, -1}, {-1,  2,  -5,  13, 125,  -8,   3, -1},
194
    { 0,  1,  -3,   8, 126,  -6,   2,  0}, { 0,  1,  -2,   4, 127,  -3,   1,  0},
195
    // [1, 2)
196
    {0, 0, 0,   1, 127,   0,   0, 0}, {0, 0, 1,  -3, 127,   4,  -1, 0},
197
    {0, 0, 1,  -5, 126,   8,  -3, 1}, {0, 0, 1,  -7, 124,  13,  -4, 1},
198
    {0, 0, 2,  -9, 122,  18,  -6, 1}, {0, 0, 2, -11, 120,  22,  -7, 2},
199
    {0, 0, 3, -13, 117,  27,  -8, 2}, {0, 0, 3, -14, 114,  32, -10, 3},
200
    {0, 0, 3, -15, 111,  37, -11, 3}, {0, 0, 3, -16, 108,  42, -12, 3},
201
    {0, 0, 4, -17, 104,  47, -13, 3}, {0, 0, 4, -17, 100,  52, -14, 3},
202
    {0, 0, 4, -18,  96,  58, -15, 3}, {0, 0, 4, -18,  91,  63, -16, 4},
203
    {0, 0, 4, -18,  87,  68, -17, 4}, {0, 0, 4, -18,  82,  73, -17, 4},
204
    {0, 0, 4, -18,  78,  78, -18, 4}, {0, 0, 4, -17,  73,  82, -18, 4},
205
    {0, 0, 4, -17,  68,  87, -18, 4}, {0, 0, 4, -16,  63,  91, -18, 4},
206
    {0, 0, 3, -15,  58,  96, -18, 4}, {0, 0, 3, -14,  52, 100, -17, 4},
207
    {0, 0, 3, -13,  47, 104, -17, 4}, {0, 0, 3, -12,  42, 108, -16, 3},
208
    {0, 0, 3, -11,  37, 111, -15, 3}, {0, 0, 3, -10,  32, 114, -14, 3},
209
    {0, 0, 2,  -8,  27, 117, -13, 3}, {0, 0, 2,  -7,  22, 120, -11, 2},
210
    {0, 0, 1,  -6,  18, 122,  -9, 2}, {0, 0, 1,  -4,  13, 124,  -7, 1},
211
    {0, 0, 1,  -3,   8, 126,  -5, 1}, {0, 0, 0,  -1,   4, 127,  -3, 1},
212
    // dummy (replicate row index 95)
213
    {0, 0, 0,  -1,   4, 127,  -3, 1},
214
#endif  // WARPEDPIXEL_PREC_BITS == 6
215
};
216
/* clang-format on */
217
218
#define USE_LIMITED_PREC_MULT 0
219
220
#if USE_LIMITED_PREC_MULT
221
222
#define MUL_PREC_BITS 16
223
224
static uint16_t resolve_multiplier_64(uint64_t D, int16_t* shift) {
225
    int      msb  = 0;
226
    uint16_t mult = 0;
227
    *shift        = 0;
228
    if (D != 0) {
229
        msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32 : get_msb((unsigned int)D));
230
        if (msb >= MUL_PREC_BITS) {
231
            mult   = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS);
232
            *shift = msb + 1 - MUL_PREC_BITS;
233
        } else {
234
            mult   = (uint16_t)D;
235
            *shift = 0;
236
        }
237
    }
238
    return mult;
239
}
240
241
static int32_t get_mult_shift_ndiag(int64_t p_x, int16_t i_det, int shift) {
242
    int32_t  ret;
243
    int16_t  mshift;
244
    uint16_t Mul = resolve_multiplier_64(llabs(p_x), &mshift);
245
    int32_t  v   = (int32_t)Mul * (int32_t)i_det * (p_x < 0 ? -1 : 1);
246
    shift -= mshift;
247
    if (shift > 0) {
248
        return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
249
                              -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
250
                              WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
251
    } else {
252
        return (int32_t)clamp(
253
            v * (1 << (-shift)), -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
254
    }
255
    return ret;
256
}
257
258
static int32_t get_mult_shift_diag(int64_t p_x, int16_t i_det, int shift) {
259
    int16_t  mshift;
260
    uint16_t Mul = resolve_multiplier_64(llabs(p_x), &mshift);
261
    int32_t  v   = (int32_t)Mul * (int32_t)i_det * (p_x < 0 ? -1 : 1);
262
    shift -= mshift;
263
    if (shift > 0) {
264
        return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
265
                              (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
266
                              (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
267
    } else {
268
        return (int32_t)clamp(v * (1 << (-shift)),
269
                              (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
270
                              (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
271
    }
272
}
273
274
#else
275
276
static int32_t get_mult_shift_ndiag(int64_t p_x, int16_t i_det, int shift) {
277
    int64_t v = p_x * (int64_t)i_det;
278
    return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
279
                            -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
280
                            WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
281
}
282
283
static int32_t get_mult_shift_diag(int64_t p_x, int16_t i_det, int shift) {
284
    int64_t v = p_x * (int64_t)i_det;
285
    return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
286
                            (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
287
                            (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
288
}
289
#endif // USE_LIMITED_PREC_MULT
290
291
#define DIV_LUT_PREC_BITS 14
292
#define DIV_LUT_BITS 8
293
#define DIV_LUT_NUM (1 << DIV_LUT_BITS)
294
295
static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
296
    16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768, 15709, 15650, 15592, 15534, 15477,
297
    15420, 15364, 15308, 15252, 15197, 15142, 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614,
298
    14564, 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028, 13981, 13935, 13888, 13843,
299
    13797, 13752, 13707, 13662, 13618, 13574, 13530, 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148,
300
    13107, 13066, 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633, 12596, 12558, 12520,
301
    12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228, 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950,
302
    11916, 11882, 11848, 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491, 11460, 11429,
303
    11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155, 11125, 11096, 11067, 11038, 11009, 10980, 10951,
304
    10923, 10894, 10866, 10838, 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538, 10512,
305
    10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255, 10230, 10205, 10180, 10156, 10131, 10107,
306
    10082, 10058, 10034, 10010, 9986,  9963,  9939,  9916,  9892,  9869,  9846,  9823,  9800,  9777,  9754,  9732,
307
    9709,  9687,  9664,  9642,  9620,  9598,  9576,  9554,  9533,  9511,  9489,  9468,  9447,  9425,  9404,  9383,
308
    9362,  9341,  9321,  9300,  9279,  9259,  9239,  9218,  9198,  9178,  9158,  9138,  9118,  9098,  9079,  9059,
309
    9039,  9020,  9001,  8981,  8962,  8943,  8924,  8905,  8886,  8867,  8849,  8830,  8812,  8793,  8775,  8756,
310
    8738,  8720,  8702,  8684,  8666,  8648,  8630,  8613,  8595,  8577,  8560,  8542,  8525,  8508,  8490,  8473,
311
    8456,  8439,  8422,  8405,  8389,  8372,  8355,  8339,  8322,  8306,  8289,  8273,  8257,  8240,  8224,  8208,
312
    8192,
313
};
314
315
// Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
316
// at precision of DIV_LUT_PREC_BITS along with the shift.
317
static int16_t resolve_divisor_64(uint64_t D, int16_t* shift) {
318
    int64_t f;
319
    *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32 : get_msb((unsigned int)D));
320
    // e is obtained from D after resetting the most significant 1 bit.
321
    const int64_t e = D - ((uint64_t)1 << *shift);
322
    // Get the most significant DIV_LUT_BITS (8) bits of e into f
323
    if (*shift > DIV_LUT_BITS) {
324
        f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
325
    } else {
326
        f = e << (DIV_LUT_BITS - *shift);
327
    }
328
    assert(f <= DIV_LUT_NUM);
329
    *shift += DIV_LUT_PREC_BITS;
330
    // Use f as lookup into the precomputed table of multipliers
331
    return div_lut[f];
332
}
333
334
static int16_t resolve_divisor_32(uint32_t D, int16_t* shift) {
335
    int32_t f;
336
    *shift = get_msb(D);
337
    // e is obtained from D after resetting the most significant 1 bit.
338
    const int32_t e = D - ((uint32_t)1 << *shift);
339
    // Get the most significant DIV_LUT_BITS (8) bits of e into f
340
    if (*shift > DIV_LUT_BITS) {
341
        f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
342
    } else {
343
        f = e << (DIV_LUT_BITS - *shift);
344
    }
345
    assert(f <= DIV_LUT_NUM);
346
    *shift += DIV_LUT_PREC_BITS;
347
    // Use f as lookup into the precomputed table of multipliers
348
    return div_lut[f];
349
}
350
351
static int is_affine_valid(const WarpedMotionParams* const wm) {
352
    const int32_t* mat = wm->wmmat;
353
    return (mat[2] > 0);
354
}
355
356
static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) {
357
    if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
358
        (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS))) {
359
        return 0;
360
    } else {
361
        return 1;
362
    }
363
}
364
365
static int find_affine_int(int np, const int* pts1, const int* pts2, BlockSize bsize, Mv mv, WarpedMotionParams* wm,
366
                           int mi_row, int mi_col) {
367
    int32_t A[2][2] = {{0, 0}, {0, 0}};
368
    int32_t bx[2]   = {0, 0};
369
    int32_t by[2]   = {0, 0};
370
    int     i;
371
372
    const int bw   = block_size_wide[bsize];
373
    const int bh   = block_size_high[bsize];
374
    const int rsuy = (AOMMAX(bh, MI_SIZE) / 2 - 1);
375
    const int rsux = (AOMMAX(bw, MI_SIZE) / 2 - 1);
376
    const int suy  = rsuy * 8;
377
    const int sux  = rsux * 8;
378
    const int duy  = suy + mv.y;
379
    const int dux  = sux + mv.x;
380
    const int isuy = (mi_row * MI_SIZE + rsuy);
381
    const int isux = (mi_col * MI_SIZE + rsux);
382
383
    // Assume the center pixel of the block has exactly the same motion vector
384
    // as transmitted for the block. First shift the origin of the source
385
    // points to the block center, and the origin of the destination points to
386
    // the block center added to the motion vector transmitted.
387
    // Let (xi, yi) denote the source points and (xi', yi') denote destination
388
    // points after origin shfifting, for i = 0, 1, 2, .... n-1.
389
    // Then if  P = [x0, y0,
390
    //               x1, y1
391
    //               x2, y1,
392
    //                ....
393
    //              ]
394
    //          q = [x0', x1', x2', ... ]'
395
    //          r = [y0', y1', y2', ... ]'
396
    // the least squares problems that need to be solved are:
397
    //          [h1, h2]' = inv(P'P)P'q and
398
    //          [h3, h4]' = inv(P'P)P'r
399
    // where the affine transformation is given by:
400
    //          x' = h1.x + h2.y
401
    //          y' = h3.x + h4.y
402
    //
403
    // The loop below computes: A = P'P, bx = P'q, by = P'r
404
    // We need to just compute inv(A).bx and inv(A).by for the solutions.
405
    // Contribution from neighbor block
406
    for (i = 0; i < np; i++) {
407
        const int dx = pts2[i * 2] - dux;
408
        const int dy = pts2[i * 2 + 1] - duy;
409
        const int sx = pts1[i * 2] - sux;
410
        const int sy = pts1[i * 2 + 1] - suy;
411
        if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
412
            A[0][0] += LS_SQUARE(sx);
413
            A[0][1] += LS_PRODUCT1(sx, sy);
414
            A[1][1] += LS_SQUARE(sy);
415
            bx[0] += LS_PRODUCT2(sx, dx);
416
            bx[1] += LS_PRODUCT1(sy, dx);
417
            by[0] += LS_PRODUCT1(sx, dy);
418
            by[1] += LS_PRODUCT2(sy, dy);
419
        }
420
    }
421
422
    // Just for debugging, and can be removed later.
423
    assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
424
    assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
425
    assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
426
    assert(bx[0] >= LS_MAT_MIN && bx[0] <= LS_MAT_MAX);
427
    assert(bx[1] >= LS_MAT_MIN && bx[1] <= LS_MAT_MAX);
428
    assert(by[0] >= LS_MAT_MIN && by[0] <= LS_MAT_MAX);
429
    assert(by[1] >= LS_MAT_MIN && by[1] <= LS_MAT_MAX);
430
431
    int64_t det;
432
    int16_t i_det, shift;
433
434
    // Compute Determinant of A
435
    det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
436
    if (det == 0) {
437
        return 1;
438
    }
439
    i_det = resolve_divisor_64(llabs(det), &shift) * (det < 0 ? -1 : 1);
440
    shift -= WARPEDMODEL_PREC_BITS;
441
    if (shift < 0) {
442
        i_det <<= (-shift);
443
        shift = 0;
444
    }
445
446
    int64_t p_x[2], p_y[2];
447
448
    // These divided by the det, are the least squares solutions
449
    p_x[0] = (int64_t)A[1][1] * bx[0] - (int64_t)A[0][1] * bx[1];
450
    p_x[1] = -(int64_t)A[0][1] * bx[0] + (int64_t)A[0][0] * bx[1];
451
    p_y[0] = (int64_t)A[1][1] * by[0] - (int64_t)A[0][1] * by[1];
452
    p_y[1] = -(int64_t)A[0][1] * by[0] + (int64_t)A[0][0] * by[1];
453
454
    wm->wmmat[2] = get_mult_shift_diag(p_x[0], i_det, shift);
455
    wm->wmmat[3] = get_mult_shift_ndiag(p_x[1], i_det, shift);
456
    wm->wmmat[4] = get_mult_shift_ndiag(p_y[0], i_det, shift);
457
    wm->wmmat[5] = get_mult_shift_diag(p_y[1], i_det, shift);
458
459
    // Note: In the vx, vy expressions below, the max value of each of the
460
    // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room
461
    // for the first term so that the overall sum in the worst case fits
462
    // within 32 bits overall.
463
    int32_t vx = mv.x * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
464
        (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) + isuy * wm->wmmat[3]);
465
    int32_t vy = mv.y * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
466
        (isux * wm->wmmat[4] + isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
467
    wm->wmmat[0] = clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
468
    wm->wmmat[1] = clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
469
470
    return 0;
471
}
472
473
bool svt_find_projection(int np, int* pts1, int* pts2, BlockSize bsize, Mv mv, WarpedMotionParams* wm_params,
474
0
                         int mi_row, int mi_col) {
475
0
    if (find_affine_int(np, pts1, pts2, bsize, mv, wm_params, mi_row, mi_col)) {
476
0
        return 1;
477
0
    }
478
479
    // check compatibility with the fast warp filter
480
0
    if (!svt_get_shear_params(wm_params)) {
481
0
        return 1;
482
0
    }
483
484
0
    return 0;
485
0
}
486
487
/* The warp filter for ROTZOOM and AFFINE models works as follows:
488
   * Split the input into 8x8 blocks
489
   * For each block, project the point (4, 4) within the block, to get the
490
     overall block position. Split into integer and fractional coordinates,
491
     maintaining full WARPEDMODEL precision
492
   * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
493
     variable horizontal offset. This means that, while the rows of the
494
     intermediate buffer align with the rows of the *reference* image, the
495
     columns align with the columns of the *destination* image.
496
   * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
497
     destination is too small we crop the output at this stage). Each pixel has
498
     a variable vertical offset, so that the resulting rows are aligned with
499
     the rows of the destination image.
500
501
   To accomplish these alignments, we factor the warp matrix as a
502
   product of two shear / asymmetric zoom matrices:
503
   / a b \  = /   1       0    \ * / 1+alpha  beta \
504
   \ c d /    \ gamma  1+delta /   \    0      1   /
505
   where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
506
   The horizontal shear (with alpha and beta) is applied first,
507
   then the vertical shear (with gamma and delta) is applied second.
508
509
   The only limitation is that, to fit this in a fixed 8-tap filter size,
510
   the fractional pixel offsets must be at most +-1. Since the horizontal filter
511
   generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
512
   within the block, the parameters must satisfy
513
   4 * |alpha| + 7 * |beta| <= 1   and   4 * |gamma| + 4 * |delta| <= 1
514
   for this filter to be applicable.
515
516
   Note: This function assumes that the caller has done all of the relevant
517
   checks, ie. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
518
   are set appropriately (if using a ROTZOOM model), and that alpha, beta,
519
   gamma, delta are all in range.
520
521
*/
522
/* A note on hardware implementation:
523
    The warp filter is intended to be implementable using the same hardware as
524
    the high-precision svt_aom_convolve filters from the loop-restoration and
525
    convolve-round experiments.
526
527
    For a single filter stage, considering all of the coefficient sets for the
528
    warp filter and the regular convolution filter, an input in the range
529
    [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
530
    before rounding.
531
532
    Allowing for some changes to the filter coefficient sets, call the range
533
    [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
534
    we can replace this by the range [0, 256 * 2^k], which can be stored in an
535
    unsigned value with 8 + k bits.
536
537
    This allows the derivation of the appropriate bit widths and offsets for
538
    the various intermediate values: If
539
540
    F := FILTER_BITS = 7 (or else the above ranges need adjusting)
541
         So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
542
         intermediate value.
543
    H := ROUND0_BITS
544
    V := VERSHEAR_REDUCE_PREC_BITS
545
    (and note that we must have H + V = 2*F for the output to have the same
546
     scale as the input)
547
548
    then we end up with the following offsets and ranges:
549
    Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
550
                       uint{bd + F + 1}
551
    After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
552
    Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
553
                     uint{bd + 2*F + 2 - H}
554
    After rounding: The final value, before undoing the offset, fits into a
555
                    uint{bd + 2}.
556
557
    Then we need to undo the offsets before clamping to a pixel. Note that,
558
    if we do this at the end, the amount to subtract is actually independent
559
    of H and V:
560
561
    offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
562
                         (1 << ((bd + 2*F - H) - V))
563
                      == (1 << (bd - 1)) + (1 << bd)
564
565
    This allows us to entirely avoid clamping in both the warp filter and
566
    the convolve-round experiment. As of the time of writing, the Wiener filter
567
    from loop-restoration can encode a central coefficient up to 216, which
568
    leads to a maximum value of about 282 * 2^k after applying the offset.
569
    So in that case we still need to clamp.
570
*/
571
void svt_av1_warp_affine_c(const int32_t* mat, const uint8_t* ref, int width, int height, int stride, uint8_t* pred,
572
                           int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x,
573
                           int subsampling_y, ConvolveParams* conv_params, int16_t alpha, int16_t beta, int16_t gamma,
574
0
                           int16_t delta) {
575
0
    int32_t   tmp[15 * 8];
576
0
    const int bd                = 8;
577
0
    const int reduce_bits_horiz = conv_params->round_0;
578
0
    const int reduce_bits_vert  = conv_params->is_compound ? conv_params->round_1 : 2 * FILTER_BITS - reduce_bits_horiz;
579
0
    const int max_bits_horiz    = bd + FILTER_BITS + 1 - reduce_bits_horiz;
580
0
    const int offset_bits_horiz = bd + FILTER_BITS - 1;
581
0
    const int offset_bits_vert  = bd + 2 * FILTER_BITS - reduce_bits_horiz;
582
0
    const int round_bits        = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
583
0
    const int offset_bits       = bd + 2 * FILTER_BITS - conv_params->round_0;
584
0
    (void)max_bits_horiz;
585
0
    assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
586
0
    assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
587
588
0
    for (int i = p_row; i < p_row + p_height; i += 8) {
589
0
        for (int j = p_col; j < p_col + p_width; j += 8) {
590
            // Calculate the center of this 8x8 block,
591
            // project to luma coordinates (if in a subsampled chroma plane),
592
            // apply the affine transformation,
593
            // then convert back to the original coordinates (if necessary)
594
0
            const int32_t src_x = (j + 4) << subsampling_x;
595
0
            const int32_t src_y = (i + 4) << subsampling_y;
596
0
            const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
597
0
            const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
598
0
            const int32_t x4    = dst_x >> subsampling_x;
599
0
            const int32_t y4    = dst_y >> subsampling_y;
600
601
0
            int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
602
0
            int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
603
0
            int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
604
0
            int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
605
606
0
            sx4 += alpha * (-4) + beta * (-4);
607
0
            sy4 += gamma * (-4) + delta * (-4);
608
609
0
            sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
610
0
            sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
611
612
            // Horizontal filter
613
0
            for (int k = -7; k < 8; ++k) {
614
                // Clamp to top/bottom edge of the frame
615
0
                const int iy = clamp(iy4 + k, 0, height - 1);
616
617
0
                int sx = sx4 + beta * (k + 4);
618
619
0
                for (int l = -4; l < 4; ++l) {
620
0
                    int ix = ix4 + l - 3;
621
                    // At this point, sx = sx4 + alpha * l + beta * k
622
0
                    const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS;
623
0
                    assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
624
0
                    const int16_t* coeffs = svt_aom_warped_filter[offs];
625
626
0
                    int32_t sum = 1 << offset_bits_horiz;
627
0
                    for (int m = 0; m < 8; ++m) {
628
                        // Clamp to left/right edge of the frame
629
0
                        const int sample_x = clamp(ix + m, 0, width - 1);
630
631
0
                        sum += ref[iy * stride + sample_x] * coeffs[m];
632
0
                    }
633
0
                    sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
634
0
                    assert(0 <= sum && sum < (1 << max_bits_horiz));
635
0
                    tmp[(k + 7) * 8 + (l + 4)] = sum;
636
0
                    sx += alpha;
637
0
                }
638
0
            }
639
640
            // Vertical filter
641
0
            for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
642
0
                int sy = sy4 + delta * (k + 4);
643
0
                for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
644
                    // At this point, sy = sy4 + gamma * l + delta * k
645
0
                    const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS;
646
0
                    assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
647
0
                    const int16_t* coeffs = svt_aom_warped_filter[offs];
648
649
0
                    int32_t sum = 1 << offset_bits_vert;
650
0
                    for (int m = 0; m < 8; ++m) {
651
0
                        sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
652
0
                    }
653
0
                    if (conv_params->is_compound) {
654
0
                        ConvBufType* p =
655
0
                            &conv_params->dst[(i - p_row + k + 4) * conv_params->dst_stride + (j - p_col + l + 4)];
656
0
                        sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
657
0
                        if (conv_params->do_average) {
658
0
                            uint8_t* dst8  = &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
659
0
                            int32_t  tmp32 = *p;
660
0
                            if (conv_params->use_jnt_comp_avg) {
661
0
                                tmp32 = tmp32 * conv_params->fwd_offset + sum * conv_params->bck_offset;
662
0
                                tmp32 = tmp32 >> DIST_PRECISION_BITS;
663
0
                            } else {
664
0
                                tmp32 += sum;
665
0
                                tmp32 = tmp32 >> 1;
666
0
                            }
667
0
                            tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
668
0
                                (1 << (offset_bits - conv_params->round_1 - 1));
669
0
                            *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits));
670
0
                        } else {
671
0
                            *p = sum;
672
0
                        }
673
0
                    } else {
674
0
                        uint8_t* p = &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
675
0
                        sum        = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
676
0
                        assert(0 <= sum && sum < (1 << (bd + 2)));
677
0
                        *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
678
0
                    }
679
0
                    sy += gamma;
680
0
                }
681
0
            }
682
0
        }
683
0
    }
684
0
}
685
686
void svt_warp_plane(WarpedMotionParams* wm, const uint8_t* const ref, int width, int height, int stride, uint8_t* pred,
687
                    int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y,
688
0
                    ConvolveParams* conv_params) {
689
0
    assert(wm->wmtype <= AFFINE);
690
0
    if (wm->wmtype == ROTZOOM) {
691
0
        wm->wmmat[5] = wm->wmmat[2];
692
0
        wm->wmmat[4] = -wm->wmmat[3];
693
0
    }
694
0
    const int32_t* const mat   = wm->wmmat;
695
0
    const int16_t        alpha = wm->alpha;
696
0
    const int16_t        beta  = wm->beta;
697
0
    const int16_t        gamma = wm->gamma;
698
0
    const int16_t        delta = wm->delta;
699
0
    svt_av1_warp_affine(mat,
700
0
                        ref,
701
0
                        width,
702
0
                        height,
703
0
                        stride,
704
0
                        pred,
705
0
                        p_col,
706
0
                        p_row,
707
0
                        p_width,
708
0
                        p_height,
709
0
                        p_stride,
710
0
                        subsampling_x,
711
0
                        subsampling_y,
712
0
                        conv_params,
713
0
                        alpha,
714
0
                        beta,
715
0
                        gamma,
716
0
                        delta);
717
0
}
718
719
void svt_av1_highbd_warp_affine_c(const int32_t* mat, const uint8_t* ref8b, const uint8_t* ref2b, int width, int height,
720
                                  int stride8b, int stride2b, uint16_t* pred, int p_col, int p_row, int p_width,
721
                                  int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd,
722
                                  ConvolveParams* conv_params, int16_t alpha, int16_t beta, int16_t gamma,
723
0
                                  int16_t delta) {
724
0
    int32_t   tmp[15 * 8];
725
0
    const int reduce_bits_horiz = conv_params->round_0 + AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
726
0
    const int reduce_bits_vert  = conv_params->is_compound ? conv_params->round_1 : 2 * FILTER_BITS - reduce_bits_horiz;
727
0
    const int max_bits_horiz    = bd + FILTER_BITS + 1 - reduce_bits_horiz;
728
0
    const int offset_bits_horiz = bd + FILTER_BITS - 1;
729
0
    const int offset_bits_vert  = bd + 2 * FILTER_BITS - reduce_bits_horiz;
730
0
    const int round_bits        = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
731
0
    const int offset_bits       = bd + 2 * FILTER_BITS - conv_params->round_0;
732
0
    (void)max_bits_horiz;
733
0
    assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
734
735
0
    for (int i = p_row; i < p_row + p_height; i += 8) {
736
0
        for (int j = p_col; j < p_col + p_width; j += 8) {
737
            // Calculate the center of this 8x8 block,
738
            // project to luma coordinates (if in a subsampled chroma plane),
739
            // apply the affine transformation,
740
            // then convert back to the original coordinates (if necessary)
741
0
            const int32_t src_x = (j + 4) << subsampling_x;
742
0
            const int32_t src_y = (i + 4) << subsampling_y;
743
0
            const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
744
0
            const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
745
0
            const int32_t x4    = dst_x >> subsampling_x;
746
0
            const int32_t y4    = dst_y >> subsampling_y;
747
748
0
            const int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
749
0
            int32_t       sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
750
0
            const int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
751
0
            int32_t       sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
752
753
0
            sx4 += alpha * (-4) + beta * (-4);
754
0
            sy4 += gamma * (-4) + delta * (-4);
755
756
0
            sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
757
0
            sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
758
759
            // Horizontal filter
760
0
            for (int k = -7; k < 8; ++k) {
761
0
                const int iy = clamp(iy4 + k, 0, height - 1);
762
763
0
                int sx = sx4 + beta * (k + 4);
764
0
                for (int l = -4; l < 4; ++l) {
765
0
                    int       ix   = ix4 + l - 3;
766
0
                    const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS;
767
0
                    assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
768
0
                    const int16_t* coeffs = svt_aom_warped_filter[offs];
769
770
0
                    int32_t sum = 1 << offset_bits_horiz;
771
0
                    for (int m = 0; m < 8; ++m) {
772
0
                        const int sample_x = clamp(ix + m, 0, width - 1);
773
0
                        uint16_t  ref      = (ref8b[iy * stride8b + sample_x] << 2) |
774
0
                            ((ref2b[iy * stride2b + sample_x] >> 6) & 3);
775
0
                        sum += ref * coeffs[m];
776
0
                    }
777
0
                    sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
778
0
                    assert(0 <= sum && sum < (1 << max_bits_horiz));
779
0
                    tmp[(k + 7) * 8 + (l + 4)] = sum;
780
0
                    sx += alpha;
781
0
                }
782
0
            }
783
784
            // Vertical filter
785
0
            for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
786
0
                int sy = sy4 + delta * (k + 4);
787
0
                for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
788
0
                    const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS;
789
0
                    assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
790
0
                    const int16_t* coeffs = svt_aom_warped_filter[offs];
791
792
0
                    int32_t sum = 1 << offset_bits_vert;
793
0
                    for (int m = 0; m < 8; ++m) {
794
0
                        sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
795
0
                    }
796
0
                    if (conv_params->is_compound) {
797
0
                        ConvBufType* p =
798
0
                            &conv_params->dst[(i - p_row + k + 4) * conv_params->dst_stride + (j - p_col + l + 4)];
799
0
                        sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
800
0
                        if (conv_params->do_average) {
801
0
                            uint16_t* dst16 = &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
802
0
                            int32_t   tmp32 = *p;
803
0
                            if (conv_params->use_jnt_comp_avg) {
804
0
                                tmp32 = tmp32 * conv_params->fwd_offset + sum * conv_params->bck_offset;
805
0
                                tmp32 = tmp32 >> DIST_PRECISION_BITS;
806
0
                            } else {
807
0
                                tmp32 += sum;
808
0
                                tmp32 = tmp32 >> 1;
809
0
                            }
810
0
                            tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
811
0
                                (1 << (offset_bits - conv_params->round_1 - 1));
812
0
                            *dst16 = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd);
813
0
                        } else {
814
0
                            *p = sum;
815
0
                        }
816
0
                    } else {
817
0
                        uint16_t* p = &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
818
0
                        sum         = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
819
0
                        assert(0 <= sum && sum < (1 << (bd + 2)));
820
0
                        *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
821
0
                    }
822
0
                    sy += gamma;
823
0
                }
824
0
            }
825
0
        }
826
0
    }
827
0
}
828
829
static void highbd_warp_plane(WarpedMotionParams* wm, const uint8_t* const ref8, const uint8_t* const ref_2b, int width,
830
                              int height, int stride, const uint8_t* const pred8, int p_col, int p_row, int p_width,
831
                              int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd,
832
0
                              ConvolveParams* conv_params) {
833
0
    assert(wm->wmtype <= AFFINE);
834
0
    if (wm->wmtype == ROTZOOM) {
835
0
        wm->wmmat[5] = wm->wmmat[2];
836
0
        wm->wmmat[4] = -wm->wmmat[3];
837
0
    }
838
0
    const int32_t* const mat   = wm->wmmat;
839
0
    const int16_t        alpha = wm->alpha;
840
0
    const int16_t        beta  = wm->beta;
841
0
    const int16_t        gamma = wm->gamma;
842
0
    const int16_t        delta = wm->delta;
843
844
0
    uint16_t* pred = (uint16_t*)pred8;
845
0
    svt_av1_highbd_warp_affine(mat,
846
0
                               ref8,
847
0
                               ref_2b,
848
0
                               width,
849
0
                               height,
850
0
                               stride,
851
0
                               stride,
852
0
                               pred,
853
0
                               p_col,
854
0
                               p_row,
855
0
                               p_width,
856
0
                               p_height,
857
0
                               p_stride,
858
0
                               subsampling_x,
859
0
                               subsampling_y,
860
0
                               bd,
861
0
                               conv_params,
862
0
                               alpha,
863
0
                               beta,
864
0
                               gamma,
865
0
                               delta);
866
0
}
867
868
void svt_av1_warp_plane(WarpedMotionParams* wm, int use_hbd, int bd, const uint8_t* ref, const uint8_t* ref_2b,
869
                        int width, int height, int stride, uint8_t* pred, int p_col, int p_row, int p_width,
870
0
                        int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams* conv_params) {
871
0
    if (use_hbd) {
872
0
        highbd_warp_plane(wm,
873
0
                          ref,
874
0
                          ref_2b,
875
0
                          width,
876
0
                          height,
877
0
                          stride,
878
0
                          pred,
879
0
                          p_col,
880
0
                          p_row,
881
0
                          p_width,
882
0
                          p_height,
883
0
                          p_stride,
884
0
                          subsampling_x,
885
0
                          subsampling_y,
886
0
                          bd,
887
0
                          conv_params);
888
0
    } else {
889
0
        svt_warp_plane(wm,
890
0
                       ref,
891
0
                       width,
892
0
                       height,
893
0
                       stride,
894
0
                       pred,
895
0
                       p_col,
896
0
                       p_row,
897
0
                       p_width,
898
0
                       p_height,
899
0
                       p_stride,
900
0
                       subsampling_x,
901
0
                       subsampling_y,
902
0
                       conv_params);
903
0
    }
904
0
}
905
906
// Returns 1 on success or 0 on an invalid affine set
907
0
int svt_get_shear_params(WarpedMotionParams* wm) {
908
0
    const int32_t* mat = wm->wmmat;
909
0
    if (!is_affine_valid(wm)) {
910
0
        return 0;
911
0
    }
912
0
    wm->alpha = clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
913
0
    wm->beta  = clamp(mat[3], INT16_MIN, INT16_MAX);
914
0
    int16_t shift;
915
0
    int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
916
0
    int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
917
0
    wm->gamma = clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX);
918
0
    v         = ((int64_t)mat[3] * mat[4]) * y;
919
0
    wm->delta = clamp(
920
0
        mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
921
922
0
    wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) * (1 << WARP_PARAM_REDUCE_BITS);
923
0
    wm->beta  = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) * (1 << WARP_PARAM_REDUCE_BITS);
924
0
    wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) * (1 << WARP_PARAM_REDUCE_BITS);
925
0
    wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) * (1 << WARP_PARAM_REDUCE_BITS);
926
927
0
    if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta)) {
928
0
        return 0;
929
0
    }
930
931
0
    return 1;
932
0
}
933
934
// Select samples according to the motion vector difference.
935
0
uint8_t svt_aom_select_samples(Mv mv, int* pts, int* pts_inref, int len, BlockSize bsize) {
936
0
    const int bw     = block_size_wide[bsize];
937
0
    const int bh     = block_size_high[bsize];
938
0
    const int thresh = clamp(AOMMAX(bw, bh), 16, 112);
939
0
    uint8_t   ret    = 0;
940
941
    // Only keep the samples with MV differences within threshold.
942
0
    for (int i = 0; i < len; ++i) {
943
0
        const int diff = abs(pts_inref[2 * i] - pts[2 * i] - mv.x) + abs(pts_inref[2 * i + 1] - pts[2 * i + 1] - mv.y);
944
0
        if (diff > thresh) {
945
0
            continue;
946
0
        }
947
0
        if (ret != i) {
948
0
            memcpy(pts + 2 * ret, pts + 2 * i, 2 * sizeof(pts[0]));
949
0
            memcpy(pts_inref + 2 * ret, pts_inref + 2 * i, 2 * sizeof(pts_inref[0]));
950
0
        }
951
0
        ++ret;
952
0
    }
953
    // Keep at least 1 sample.
954
0
    return AOMMAX(ret, 1);
955
0
}