Coverage Report

Created: 2026-05-16 06:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/svt-av1/Source/Lib/Codec/inter_prediction.c
Line
Count
Source
1
/*
2
* Copyright(c) 2019 Intel Corporation
3
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
4
*
5
* This source code is subject to the terms of the BSD 3-Clause Clear License and
6
* the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear License
7
* was not distributed with this source code in the LICENSE file, you can
8
* obtain it at https://www.aomedia.org/license. If the Alliance for Open
9
* Media Patent License 1.0 was not distributed with this source code in the
10
* PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
11
*/
12
13
#include <stdlib.h>
14
15
#include "inter_prediction.h"
16
#include "convolve.h"
17
#include "common_dsp_rtcd.h"
18
#include "utility.h"
19
#include "pic_operators.h"
20
21
0
// Number of fractional bits carried by positions and steps on the scaled-reference path.
#define SCALE_SUBPEL_BITS 10
22
0
// Count of distinct fractional positions at SCALE_SUBPEL_BITS precision.
#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
23
0
// Mask selecting the fractional part of a position held at SCALE_SUBPEL_BITS precision.
#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
24
0
// How many more fractional bits SCALE_SUBPEL_BITS has than SUBPEL_BITS.
#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
25
26
// Thin wrapper: hands every argument, unchanged and in the same order, to svt_aom_pack2d_src().
void svt_aom_pack_block(uint8_t* in8_bit_buffer, uint32_t in8_stride, uint8_t* inn_bit_buffer, uint32_t inn_stride,
                        uint16_t* out16_bit_buffer, uint32_t out_stride, uint32_t width, uint32_t height) {
    svt_aom_pack2d_src(in8_bit_buffer,
                       in8_stride,
                       inn_bit_buffer,
                       inn_stride,
                       out16_bit_buffer,
                       out_stride,
                       width,
                       height);
}
31
32
static WedgeMasksType wedge_masks[BLOCK_SIZES_ALL][2];
33
34
0
// Returns 1 when `type` is COMPOUND_WEDGE or COMPOUND_DIFFWTD, and 0 for any other value.
int svt_aom_is_masked_compound_type(COMPOUND_TYPE type) {
    switch (type) {
    case COMPOUND_WEDGE:
    case COMPOUND_DIFFWTD:
        return 1;
    default:
        return 0;
    }
}
37
38
// Residual of a rows x cols block whose samples are 16 bits wide: diff = src - pred.
//
// src8 and pred8 are reinterpreted as arrays of uint16_t, so src_stride and pred_stride
// (like diff_stride) are counted in samples, not bytes. bd is accepted for interface
// compatibility and is not used.
void svt_aom_highbd_subtract_block_c(int rows, int cols, int16_t* diff, ptrdiff_t diff_stride, const uint8_t* src8,
                                     ptrdiff_t src_stride, const uint8_t* pred8, ptrdiff_t pred_stride, int bd) {
    // Keep the const qualifier of the incoming buffers: they are only read here.
    const uint16_t* src  = (const uint16_t*)src8;
    const uint16_t* pred = (const uint16_t*)pred8;
    (void)bd;

    for (int r = 0; r < rows; r++) {
        for (int c = 0; c < cols; c++) {
            // The subtraction is carried out in int; the narrowing to int16_t is intentional.
            diff[c] = (int16_t)(src[c] - pred[c]);
        }

        diff += diff_stride;
        pred += pred_stride;
        src += src_stride;
    }
}
54
55
// Writes the per-pixel residual src - pred of a rows x cols 8-bit block into diff.
// Each buffer is walked with its own stride, expressed in elements of that buffer.
void svt_aom_subtract_block_c(int rows, int cols, int16_t* diff, ptrdiff_t diff_stride, const uint8_t* src,
                              ptrdiff_t src_stride, const uint8_t* pred, ptrdiff_t pred_stride) {
    for (int rows_left = rows; rows_left > 0; --rows_left) {
        for (int col = 0; col < cols; ++col) {
            diff[col] = src[col] - pred[col];
        }

        // move all three cursors down one row
        src += src_stride;
        pred += pred_stride;
        diff += diff_stride;
    }
}
67
68
static void diffwtd_mask(uint8_t* mask, int which_inverse, int mask_base, const uint8_t* src0, int src0_stride,
69
0
                         const uint8_t* src1, int src1_stride, int h, int w) {
70
0
    for (int i = 0; i < h; ++i) {
71
0
        for (int j = 0; j < w; ++j) {
72
0
            int diff        = abs((int)src0[i * src0_stride + j] - (int)src1[i * src1_stride + j]);
73
0
            int m           = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA);
74
0
            mask[i * w + j] = which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m;
75
0
        }
76
0
    }
77
0
}
78
79
static AOM_FORCE_INLINE void diffwtd_mask_highbd(uint8_t* mask, int which_inverse, int mask_base, const uint16_t* src0,
80
                                                 int src0_stride, const uint16_t* src1, int src1_stride, int h, int w,
81
0
                                                 const unsigned int bd) {
82
0
    assert(bd >= 8);
83
0
    if (bd == 8) {
84
0
        if (which_inverse) {
85
0
            for (int i = 0; i < h; ++i) {
86
0
                for (int j = 0; j < w; ++j) {
87
0
                    int          diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR;
88
0
                    unsigned int m    = negative_to_zero(mask_base + diff);
89
0
                    m                 = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
90
0
                    mask[j]           = AOM_BLEND_A64_MAX_ALPHA - m;
91
0
                }
92
0
                src0 += src0_stride;
93
0
                src1 += src1_stride;
94
0
                mask += w;
95
0
            }
96
0
        } else {
97
0
            for (int i = 0; i < h; ++i) {
98
0
                for (int j = 0; j < w; ++j) {
99
0
                    int          diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR;
100
0
                    unsigned int m    = negative_to_zero(mask_base + diff);
101
0
                    m                 = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
102
0
                    mask[j]           = m;
103
0
                }
104
0
                src0 += src0_stride;
105
0
                src1 += src1_stride;
106
0
                mask += w;
107
0
            }
108
0
        }
109
0
    } else {
110
0
        const unsigned int bd_shift = bd - 8;
111
0
        if (which_inverse) {
112
0
            for (int i = 0; i < h; ++i) {
113
0
                for (int j = 0; j < w; ++j) {
114
0
                    int          diff = (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR;
115
0
                    unsigned int m    = negative_to_zero(mask_base + diff);
116
0
                    m                 = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
117
0
                    mask[j]           = AOM_BLEND_A64_MAX_ALPHA - m;
118
0
                }
119
0
                src0 += src0_stride;
120
0
                src1 += src1_stride;
121
0
                mask += w;
122
0
            }
123
0
        } else {
124
0
            for (int i = 0; i < h; ++i) {
125
0
                for (int j = 0; j < w; ++j) {
126
0
                    int          diff = (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR;
127
0
                    unsigned int m    = negative_to_zero(mask_base + diff);
128
0
                    m                 = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
129
0
                    mask[j]           = m;
130
0
                }
131
0
                src0 += src0_stride;
132
0
                src1 += src1_stride;
133
0
                mask += w;
134
0
            }
135
0
        }
136
0
    }
137
0
}
138
139
// Builds a difference-weighted compound mask from two high bit-depth blocks.
//
// src0/src1 are passed as byte pointers but are read as 16-bit samples; they are forwarded
// to diffwtd_mask_highbd() with a base weight of 38. DIFFWTD_38 selects the direct mask and
// DIFFWTD_38_INV the inverted one. Any other mask_type trips an assert and leaves `mask`
// untouched.
void svt_av1_build_compound_diffwtd_mask_highbd_c(uint8_t* mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t* src0,
                                                  int src0_stride, const uint8_t* src1, int src1_stride, int h, int w,
                                                  int bd) {
    // Reinterpret without dropping const: the sources are read-only.
    const uint16_t* src0_16 = (const uint16_t*)src0;
    const uint16_t* src1_16 = (const uint16_t*)src1;

    switch (mask_type) {
    case DIFFWTD_38:
        diffwtd_mask_highbd(mask, 0, 38, src0_16, src0_stride, src1_16, src1_stride, h, w, bd);
        break;
    case DIFFWTD_38_INV:
        diffwtd_mask_highbd(mask, 1, 38, src0_16, src0_stride, src1_16, src1_stride, h, w, bd);
        break;
    default:
        assert(0);
    }
}
153
154
// Builds a difference-weighted compound mask from two 8-bit blocks via diffwtd_mask(), using
// a base weight of 38. DIFFWTD_38 yields the direct mask, DIFFWTD_38_INV the inverted one.
// Any other mask_type trips an assert and writes nothing.
void svt_av1_build_compound_diffwtd_mask_c(uint8_t* mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t* src0,
                                           int src0_stride, const uint8_t* src1, int src1_stride, int h, int w) {
    if (mask_type != DIFFWTD_38 && mask_type != DIFFWTD_38_INV) {
        assert(0);
        return;
    }

    const int inverse = (mask_type == DIFFWTD_38_INV) ? 1 : 0;
    diffwtd_mask(mask, inverse, 38, src0, src0_stride, src1, src1_stride, h, w);
}
167
168
// Note: Expect val to be in q4 precision
169
0
static INLINE int32_t scaled_x(int32_t val, const ScaleFactors* sf) {
170
0
    const int     off  = (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
171
0
    const int64_t tval = (int64_t)val * sf->x_scale_fp + off;
172
0
    return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval, REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
173
0
}
174
175
// Note: Expect val to be in q4 precision
176
0
static INLINE int32_t scaled_y(int32_t val, const ScaleFactors* sf) {
177
0
    const int32_t off  = (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
178
0
    const int64_t tval = (int64_t)val * sf->y_scale_fp + off;
179
0
    return (int32_t)ROUND_POWER_OF_TWO_SIGNED_64(tval, REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
180
0
}
181
182
// Note: Expect val to be in q4 precision
183
0
static int32_t unscaled_value(int32_t val, const ScaleFactors* sf) {
184
0
    (void)sf;
185
0
    return val << SCALE_EXTRA_BITS;
186
0
}
187
188
948
static int32_t get_fixed_point_scale_factor(int32_t other_size, int32_t this_size) {
189
    // Calculate scaling factor once for each reference frame
190
    // and use fixed point scaling factors in decoding and encoding routines.
191
    // Hardware implementations can calculate scale factor in device driver
192
    // and use multiplication and shifting on hardware instead of division.
193
948
    return ((other_size << REF_SCALE_SHIFT) + this_size / 2) / this_size;
194
948
}
195
196
// Given the fixed point scale, calculate coarse point scale.
197
948
static int32_t fixed_point_scale_to_coarse_point_scale(int32_t scale_fp) {
198
948
    return ROUND_POWER_OF_TWO(scale_fp, REF_SCALE_SHIFT - SCALE_SUBPEL_BITS);
199
948
}
200
201
474
// Fills `sf` for a reference of size other_w x other_h used by a frame of size this_w x this_h.
//
// If valid_ref_frame_size() rejects the size pair, only x_scale_fp / y_scale_fp are set
// (to REF_INVALID_SCALE) and the remaining fields are left as they were. Otherwise the
// fixed-point scales, the coarse per-pixel steps and the position-mapping callbacks are set.
// The callbacks are scaled_x / scaled_y when av1_is_scaled(sf) reports scaling, and
// unscaled_value for both axes otherwise.
void svt_av1_setup_scale_factors_for_frame(ScaleFactors* sf, int other_w, int other_h, int this_w, int this_h) {
    if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
        sf->x_scale_fp = REF_INVALID_SCALE;
        sf->y_scale_fp = REF_INVALID_SCALE;
        return;
    }

    // Horizontal scale and step
    sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
    // Vertical scale and step
    sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);

    sf->x_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->x_scale_fp);
    sf->y_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->y_scale_fp);

    if (!av1_is_scaled(sf)) {
        sf->scale_value_x = unscaled_value;
        sf->scale_value_y = unscaled_value;
        return;
    }
    sf->scale_value_x = scaled_x;
    sf->scale_value_y = scaled_y;
}
222
223
0
// Returns 1 unless both steps equal SCALE_SUBPEL_SHIFTS, in which case it returns 0.
static INLINE int32_t has_scale(int32_t xs, int32_t ys) {
    const int32_t both_unit_step = (xs == SCALE_SUBPEL_SHIFTS) && (ys == SCALE_SUBPEL_SHIFTS);
    return !both_unit_step;
}
226
227
0
// Drops the SCALE_EXTRA_BITS low bits from the sub-pel offsets and steps in `sp`, in place,
// then asserts that the offsets are below SUBPEL_SHIFTS and the steps do not exceed it.
static INLINE void revert_scale_extra_bits(SubpelParams* sp) {
    // horizontal offset and step
    sp->subpel_x >>= SCALE_EXTRA_BITS;
    sp->xs >>= SCALE_EXTRA_BITS;
    // vertical offset and step
    sp->subpel_y >>= SCALE_EXTRA_BITS;
    sp->ys >>= SCALE_EXTRA_BITS;

    assert(sp->subpel_x < SUBPEL_SHIFTS);
    assert(sp->subpel_y < SUBPEL_SHIFTS);
    assert(sp->xs <= SUBPEL_SHIFTS);
    assert(sp->ys <= SUBPEL_SHIFTS);
}
237
238
// Interpolation kernels with SUBPEL_SHIFTS rows of eight coefficients each.
// Row 0 is the pass-through kernel (single coefficient of 128); the coefficients of every
// row sum to 128.
DECLARE_ALIGNED(256, const InterpKernel, sub_pel_filters_8[SUBPEL_SHIFTS]) = {
    {0, 0, 0, 128, 0, 0, 0, 0},      //  0
    {0, 2, -6, 126, 8, -2, 0, 0},    //  1
    {0, 2, -10, 122, 18, -4, 0, 0},  //  2
    {0, 2, -12, 116, 28, -8, 2, 0},  //  3
    {0, 2, -14, 110, 38, -10, 2, 0}, //  4
    {0, 2, -14, 102, 48, -12, 2, 0}, //  5
    {0, 2, -16, 94, 58, -12, 2, 0},  //  6
    {0, 2, -14, 84, 66, -12, 2, 0},  //  7
    {0, 2, -14, 76, 76, -14, 2, 0},  //  8
    {0, 2, -12, 66, 84, -14, 2, 0},  //  9
    {0, 2, -12, 58, 94, -16, 2, 0},  // 10
    {0, 2, -12, 48, 102, -14, 2, 0}, // 11
    {0, 2, -10, 38, 110, -14, 2, 0}, // 12
    {0, 2, -8, 28, 116, -12, 2, 0},  // 13
    {0, 0, -4, 18, 122, -10, 2, 0},  // 14
    {0, 0, -2, 8, 126, -6, 2, 0}};   // 15
254
// Interpolation kernels with SUBPEL_SHIFTS rows of eight coefficients each, of which only
// the four centre ones (indices 2..5) are ever non-zero. Row 0 is the pass-through kernel;
// the coefficients of every row sum to 128.
DECLARE_ALIGNED(256, const InterpKernel, sub_pel_filters_4[SUBPEL_SHIFTS]) = {
    {0, 0, 0, 128, 0, 0, 0, 0},      //  0
    {0, 0, -4, 126, 8, -2, 0, 0},    //  1
    {0, 0, -8, 122, 18, -4, 0, 0},   //  2
    {0, 0, -10, 116, 28, -6, 0, 0},  //  3
    {0, 0, -12, 110, 38, -8, 0, 0},  //  4
    {0, 0, -12, 102, 48, -10, 0, 0}, //  5
    {0, 0, -14, 94, 58, -10, 0, 0},  //  6
    {0, 0, -12, 84, 66, -10, 0, 0},  //  7
    {0, 0, -12, 76, 76, -12, 0, 0},  //  8
    {0, 0, -10, 66, 84, -12, 0, 0},  //  9
    {0, 0, -10, 58, 94, -14, 0, 0},  // 10
    {0, 0, -10, 48, 102, -12, 0, 0}, // 11
    {0, 0, -8, 38, 110, -12, 0, 0},  // 12
    {0, 0, -6, 28, 116, -10, 0, 0},  // 13
    {0, 0, -4, 18, 122, -8, 0, 0},   // 14
    {0, 0, -2, 8, 126, -4, 0, 0}};   // 15
270
271
#define MAX_FILTER_TAP 8
272
273
0
// Signed distance ref_hint - order_hint, wrapped to order_hint_bits bits.
// The low (order_hint_bits - 1) bits of the raw difference are kept as magnitude and the
// bit above them is subtracted, which sign-extends the wrapped difference.
// Returns 0 when order hints are disabled in the sequence header.
int svt_aom_get_relative_dist_enc(SeqHeader* seq_header, int ref_hint, int order_hint) {
    if (!seq_header->order_hint_info.enable_order_hint) {
        return 0;
    }

    const int raw_diff = ref_hint - order_hint;
    const int sign_bit = 1 << (seq_header->order_hint_info.order_hint_bits - 1);
    return (raw_diff & (sign_bit - 1)) - (raw_diff & sign_bit);
}
283
284
static const int quant_dist_weight[4][2]          = {{2, 3}, {2, 5}, {2, 7}, {1, MAX_FRAME_DISTANCE}};
285
static const int quant_dist_lookup_table[2][4][2] = {
286
    {{9, 7}, {11, 5}, {12, 4}, {13, 3}},
287
    {{7, 9}, {5, 11}, {4, 12}, {3, 13}},
288
};
289
290
void svt_av1_dist_wtd_comp_weight_assign(SeqHeader* seq_header, int cur_frame_index, int bck_frame_index,
291
                                         int fwd_frame_index, int compound_idx, int order_idx, int* fwd_offset,
292
0
                                         int* bck_offset, int* use_dist_wtd_comp_avg, int is_compound) {
293
0
    assert(fwd_offset != NULL && bck_offset != NULL);
294
0
    if (!is_compound || compound_idx) {
295
0
        *use_dist_wtd_comp_avg = 0;
296
0
        return;
297
0
    }
298
299
0
    *use_dist_wtd_comp_avg = 1;
300
301
0
    int d0 = clamp(
302
0
        abs(svt_aom_get_relative_dist_enc(seq_header, fwd_frame_index, cur_frame_index)), 0, MAX_FRAME_DISTANCE);
303
0
    int d1 = clamp(
304
0
        abs(svt_aom_get_relative_dist_enc(seq_header, cur_frame_index, bck_frame_index)), 0, MAX_FRAME_DISTANCE);
305
306
0
    const int order = d0 <= d1;
307
308
0
    if (d0 == 0 || d1 == 0) {
309
0
        *fwd_offset = quant_dist_lookup_table[order_idx][3][order];
310
0
        *bck_offset = quant_dist_lookup_table[order_idx][3][1 - order];
311
0
        return;
312
0
    }
313
314
0
    int i;
315
0
    for (i = 0; i < 3; ++i) {
316
0
        int c0    = quant_dist_weight[i][order];
317
0
        int c1    = quant_dist_weight[i][!order];
318
0
        int d0_c0 = d0 * c0;
319
0
        int d1_c1 = d1 * c1;
320
0
        if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) {
321
0
            break;
322
0
        }
323
0
    }
324
325
0
    *fwd_offset = quant_dist_lookup_table[order_idx][i][order];
326
0
    *bck_offset = quant_dist_lookup_table[order_idx][i][1 - order];
327
0
}
328
329
// C reference for the 8-bit separable 2D sub-pel filter writing 8-bit pixels.
// A horizontal pass fills a 16-bit intermediate block (h + taps - 1 rows of width w); a
// vertical pass over that block removes the rounding offsets and stores clipped results.
void svt_av1_convolve_2d_sr_c(const uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t w,
                              int32_t h, const InterpFilterParams* filter_params_x,
                              const InterpFilterParams* filter_params_y, const int32_t subpel_x_q4,
                              const int32_t subpel_y_q4, ConvolveParams* conv_params) {
    int16_t       im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
    const int32_t bd        = 8;
    const int32_t fo_vert   = filter_params_y->taps / 2 - 1;
    const int32_t fo_horiz  = filter_params_x->taps / 2 - 1;
    const int32_t im_stride = w;
    const int32_t im_h      = h + filter_params_y->taps - 1;
    const int32_t bits      = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;

    // horizontal filter
    const uint8_t* const first_row = src - fo_vert * src_stride;
    const int16_t* const x_kernel  = av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    for (int32_t row = 0; row < im_h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            int32_t acc = (1 << (bd + FILTER_BITS - 1));
            for (int32_t tap = 0; tap < filter_params_x->taps; ++tap) {
                acc += x_kernel[tap] * first_row[row * src_stride + col - fo_horiz + tap];
            }
            assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
            im_block[row * im_stride + col] = (int16_t)ROUND_POWER_OF_TWO(acc, conv_params->round_0);
        }
    }

    // vertical filter
    const int16_t* const centre_rows = im_block + fo_vert * im_stride;
    const int16_t* const y_kernel    = av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    const int32_t        offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    // Combined offset introduced by the two passes, expressed after the round_1 shift
    const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
        (1 << (offset_bits - conv_params->round_1 - 1));
    for (int32_t row = 0; row < h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            int32_t acc = 1 << offset_bits;
            for (int32_t tap = 0; tap < filter_params_y->taps; ++tap) {
                acc += y_kernel[tap] * centre_rows[(row - fo_vert + tap) * im_stride + col];
            }
            assert(0 <= acc && acc < (1 << (offset_bits + 2)));
            int16_t res = (ConvBufType)(ROUND_POWER_OF_TWO(acc, conv_params->round_1) - round_offset);
            dst[row * dst_stride + col] = (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), 8);
        }
    }
}
373
374
// C reference for the 8-bit vertical-only sub-pel filter writing 8-bit pixels.
// Each output is the kernel applied down a column of src, rounded by FILTER_BITS and
// clipped. filter_params_x and subpel_x_q4 are unused; conv_params is read only by asserts.
void svt_av1_convolve_y_sr_c(const uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t w,
                             int32_t h, const InterpFilterParams* filter_params_x,
                             const InterpFilterParams* filter_params_y, const int32_t subpel_x_q4,
                             const int32_t subpel_y_q4, ConvolveParams* conv_params) {
    assert(filter_params_y != NULL);
    (void)filter_params_x;
    (void)subpel_x_q4;
    (void)conv_params;

    assert(conv_params->round_0 <= FILTER_BITS);
    assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
           ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));

    const int32_t fo_vert = filter_params_y->taps / 2 - 1;

    // vertical filter
    const int16_t* const kernel = av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            int32_t acc = 0;
            for (int32_t tap = 0; tap < filter_params_y->taps; ++tap) {
                acc += kernel[tap] * src[(row - fo_vert + tap) * src_stride + col];
            }
            dst[row * dst_stride + col] = (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), 8);
        }
    }
}
401
402
// C reference for the 8-bit horizontal-only sub-pel filter writing 8-bit pixels.
// Each output is the kernel applied along a row of src, rounded first by round_0 and then
// by the remaining FILTER_BITS - round_0 bits, and clipped. filter_params_y and
// subpel_y_q4 are unused.
void svt_av1_convolve_x_sr_c(const uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t w,
                             int32_t h, const InterpFilterParams* filter_params_x,
                             const InterpFilterParams* filter_params_y, const int32_t subpel_x_q4,
                             const int32_t subpel_y_q4, ConvolveParams* conv_params) {
    (void)filter_params_y;
    (void)subpel_y_q4;

    const int32_t fo_horiz = filter_params_x->taps / 2 - 1;
    const int32_t bits     = FILTER_BITS - conv_params->round_0;

    assert(bits >= 0);
    assert((FILTER_BITS - conv_params->round_1) >= 0 ||
           ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

    // horizontal filter
    const int16_t* const kernel = av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            int32_t acc = 0;
            for (int32_t tap = 0; tap < filter_params_x->taps; ++tap) {
                acc += kernel[tap] * src[row * src_stride + col - fo_horiz + tap];
            }
            // two-stage rounding: round_0 first, then the remaining bits
            acc                         = ROUND_POWER_OF_TWO(acc, conv_params->round_0);
            dst[row * dst_stride + col] = (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, bits), 8);
        }
    }
}
430
431
// Copies a w x h block of 8-bit pixels from src to dst, each with its own stride.
// The filter, sub-pel and conv_params arguments are ignored.
void svt_av1_convolve_2d_copy_sr_c(const uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t w,
                                   int32_t h, const InterpFilterParams* filter_params_x,
                                   const InterpFilterParams* filter_params_y, const int32_t subpel_x_q4,
                                   const int32_t subpel_y_q4, ConvolveParams* conv_params) {
    (void)filter_params_x;
    (void)filter_params_y;
    (void)subpel_x_q4;
    (void)subpel_y_q4;
    (void)conv_params;

    for (int32_t row = 0; row < h; ++row) {
        const int32_t src_off = row * src_stride;
        const int32_t dst_off = row * dst_stride;
        for (int32_t col = 0; col < w; ++col) {
            dst[dst_off + col] = src[src_off + col];
        }
    }
}
447
448
void svt_av1_convolve_2d_scale_c(const uint8_t* src, int src_stride, uint8_t* dst8, int dst8_stride, int w, int h,
449
                                 const InterpFilterParams* filter_params_x, const InterpFilterParams* filter_params_y,
450
                                 const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
451
0
                                 ConvolveParams* conv_params) {
452
0
    int16_t        im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
453
0
    int            im_h         = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + filter_params_y->taps;
454
0
    CONV_BUF_TYPE* dst16        = conv_params->dst;
455
0
    const int      dst16_stride = conv_params->dst_stride;
456
0
    const int      bits         = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
457
0
    assert(bits >= 0);
458
0
    int       im_stride = w;
459
0
    const int fo_vert   = filter_params_y->taps / 2 - 1;
460
0
    const int fo_horiz  = filter_params_x->taps / 2 - 1;
461
0
    const int bd        = 8;
462
463
    // horizontal filter
464
0
    const uint8_t* src_horiz = src - fo_vert * src_stride;
465
0
    for (int y = 0; y < im_h; ++y) {
466
0
        int x_qn = subpel_x_qn;
467
0
        for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
468
0
            const uint8_t* const src_x        = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
469
0
            const int            x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
470
0
            assert(x_filter_idx < SUBPEL_SHIFTS);
471
0
            const int16_t* x_filter = av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
472
0
            int32_t        sum      = (1 << (bd + FILTER_BITS - 1));
473
0
            for (int k = 0; k < filter_params_x->taps; ++k) {
474
0
                sum += x_filter[k] * src_x[k - fo_horiz];
475
0
            }
476
0
            assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
477
0
            im_block[y * im_stride + x] = (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
478
0
        }
479
0
        src_horiz += src_stride;
480
0
    }
481
482
    // vertical filter
483
0
    int16_t*  src_vert    = im_block + fo_vert * im_stride;
484
0
    const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
485
0
    for (int x = 0; x < w; ++x) {
486
0
        int y_qn = subpel_y_qn;
487
0
        for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
488
0
            const int16_t* src_y        = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
489
0
            const int      y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
490
0
            assert(y_filter_idx < SUBPEL_SHIFTS);
491
0
            const int16_t* y_filter = av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
492
0
            int32_t        sum      = 1 << offset_bits;
493
0
            for (int k = 0; k < filter_params_y->taps; ++k) {
494
0
                sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
495
0
            }
496
0
            assert(0 <= sum && sum < (1 << (offset_bits + 2)));
497
0
            CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
498
0
            if (conv_params->is_compound) {
499
0
                if (conv_params->do_average) {
500
0
                    int32_t tmp = dst16[y * dst16_stride + x];
501
0
                    if (conv_params->use_dist_wtd_comp_avg) {
502
0
                        tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
503
0
                        tmp = tmp >> DIST_PRECISION_BITS;
504
0
                    } else {
505
0
                        tmp += res;
506
0
                        tmp = tmp >> 1;
507
0
                    }
508
                    /* Subtract round offset and convolve round */
509
0
                    tmp = tmp -
510
0
                        ((1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)));
511
0
                    dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
512
0
                } else {
513
0
                    dst16[y * dst16_stride + x] = res;
514
0
                }
515
0
            } else {
516
                /* Subtract round offset and convolve round */
517
0
                int32_t tmp = res -
518
0
                    ((1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)));
519
0
                dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
520
0
            }
521
0
        }
522
0
        src_vert++;
523
0
    }
524
0
}
525
526
// C reference for the 8-bit separable 2D sub-pel filter used with compound prediction.
// A horizontal pass fills a 16-bit intermediate block and a vertical pass produces offset
// 16-bit results. Without do_average these are stored to conv_params->dst. With do_average
// they are blended with the values already in conv_params->dst (weighted by
// fwd_offset/bck_offset when use_jnt_comp_avg is set, plain average otherwise), the rounding
// offset is removed and the clipped pixel is written to dst8.
void svt_av1_jnt_convolve_2d_c(const uint8_t* src, int32_t src_stride, uint8_t* dst8, int32_t dst8_stride, int32_t w,
                               int32_t h, const InterpFilterParams* filter_params_x,
                               const InterpFilterParams* filter_params_y, const int32_t subpel_x_q4,
                               const int32_t subpel_y_q4, ConvolveParams* conv_params) {
    ConvBufType*  dst        = conv_params->dst;
    const int32_t dst_stride = conv_params->dst_stride;
    int16_t       im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
    const int32_t bd         = 8;
    const int32_t fo_vert    = filter_params_y->taps / 2 - 1;
    const int32_t fo_horiz   = filter_params_x->taps / 2 - 1;
    const int32_t im_stride  = w;
    const int32_t im_h       = h + filter_params_y->taps - 1;
    const int32_t round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;

    // horizontal filter
    const uint8_t* const first_row = src - fo_vert * src_stride;
    const int16_t* const x_kernel  = av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    for (int32_t row = 0; row < im_h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            int32_t acc = (1 << (bd + FILTER_BITS - 1));
            for (int32_t tap = 0; tap < filter_params_x->taps; ++tap) {
                acc += x_kernel[tap] * first_row[row * src_stride + col - fo_horiz + tap];
            }
            assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
            im_block[row * im_stride + col] = (int16_t)ROUND_POWER_OF_TWO(acc, conv_params->round_0);
        }
    }

    // vertical filter
    const int16_t* const centre_rows = im_block + fo_vert * im_stride;
    const int16_t* const y_kernel    = av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    const int32_t        offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    // Offset left in the data by both passes, expressed after the round_1 shift
    const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
        (1 << (offset_bits - conv_params->round_1 - 1));
    for (int32_t row = 0; row < h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            int32_t acc = 1 << offset_bits;
            for (int32_t tap = 0; tap < filter_params_y->taps; ++tap) {
                acc += y_kernel[tap] * centre_rows[(row - fo_vert + tap) * im_stride + col];
            }
            assert(0 <= acc && acc < (1 << (offset_bits + 2)));
            ConvBufType res = (ConvBufType)ROUND_POWER_OF_TWO(acc, conv_params->round_1);

            // First pass of a compound pair: just park the offset result.
            if (!conv_params->do_average) {
                dst[row * dst_stride + col] = res;
                continue;
            }

            int32_t blended = dst[row * dst_stride + col];
            if (conv_params->use_jnt_comp_avg) {
                blended = blended * conv_params->fwd_offset + res * conv_params->bck_offset;
                blended = blended >> DIST_PRECISION_BITS;
            } else {
                blended += res;
                blended = blended >> 1;
            }
            blended -= round_offset;
            dst8[row * dst8_stride + col] = (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, round_bits), 8);
        }
    }
}
583
584
/*
 * C reference: 8-bit compound convolution, vertical filter only.
 *
 * For each of the w x h positions the source column is filtered with the kernel that
 * av1_get_interp_filter_subpel_kernel() returns for subpel_y_q4, scaled by (1 << bits),
 * rounded by conv_params->round_1 and biased by round_offset.
 *  - do_average == 0: the biased value is stored in conv_params->dst.
 *  - do_average != 0: it is combined with the value already in conv_params->dst (weighted by
 *    fwd_offset / bck_offset when use_jnt_comp_avg is set, summed and halved otherwise), the
 *    bias is removed, and the value rounded by round_bits and passed through
 *    clip_pixel_highbd(..., 8) is written to dst8.
 * filter_params_x and subpel_x_q4 are unused.
 */
void svt_av1_jnt_convolve_y_c(const uint8_t* src, int32_t src_stride, uint8_t* dst8, int32_t dst8_stride, int32_t w,
                              int32_t h, const InterpFilterParams* filter_params_x,
                              const InterpFilterParams* filter_params_y, const int32_t subpel_x_q4,
                              const int32_t subpel_y_q4, ConvolveParams* conv_params) {
    (void)filter_params_x;
    (void)subpel_x_q4;

    ConvBufType*  comp_buf     = conv_params->dst;
    int32_t       comp_stride  = conv_params->dst_stride;
    const int32_t taps_above   = filter_params_y->taps / 2 - 1;
    const int32_t bits         = FILTER_BITS - conv_params->round_0;
    const int32_t bd           = 8;
    const int32_t offset_bits  = bd + 2 * FILTER_BITS - conv_params->round_0;
    const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
        (1 << (offset_bits - conv_params->round_1 - 1));
    const int32_t round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;

    // Kernel for the vertical sub-pel phase.
    const int16_t* kernel = av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        const int32_t comp_row = row * comp_stride;
        for (int32_t col = 0; col < w; ++col) {
            int32_t filtered = 0;
            for (int32_t tap = 0; tap < filter_params_y->taps; ++tap) {
                filtered += kernel[tap] * src[(row - taps_above + tap) * src_stride + col];
            }
            filtered *= (1 << bits);
            filtered = ROUND_POWER_OF_TWO(filtered, conv_params->round_1) + round_offset;

            if (!conv_params->do_average) {
                // First prediction of the pair: keep the biased intermediate.
                comp_buf[comp_row + col] = (ConvBufType)filtered;
                continue;
            }

            int32_t blended = comp_buf[comp_row + col];
            if (conv_params->use_jnt_comp_avg) {
                blended = blended * conv_params->fwd_offset + filtered * conv_params->bck_offset;
                blended >>= DIST_PRECISION_BITS;
            } else {
                blended += filtered;
                blended >>= 1;
            }
            blended -= round_offset;
            dst8[row * dst8_stride + col] = (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, round_bits), 8);
        }
    }
}
628
629
/*
 * C reference: 8-bit compound convolution, horizontal filter only.
 *
 * For each of the w x h positions the source row is filtered with the kernel that
 * av1_get_interp_filter_subpel_kernel() returns for subpel_x_q4, rounded by
 * conv_params->round_0, scaled by (1 << bits) and biased by round_offset.
 *  - do_average == 0: the biased value is stored in conv_params->dst.
 *  - do_average != 0: it is combined with the value already in conv_params->dst (weighted by
 *    fwd_offset / bck_offset when use_jnt_comp_avg is set, summed and halved otherwise), the
 *    bias is removed, and the value rounded by round_bits and passed through
 *    clip_pixel_highbd(..., 8) is written to dst8.
 * filter_params_y and subpel_y_q4 are unused.
 */
void svt_av1_jnt_convolve_x_c(const uint8_t* src, int32_t src_stride, uint8_t* dst8, int32_t dst8_stride, int32_t w,
                              int32_t h, const InterpFilterParams* filter_params_x,
                              const InterpFilterParams* filter_params_y, const int32_t subpel_x_q4,
                              const int32_t subpel_y_q4, ConvolveParams* conv_params) {
    (void)filter_params_y;
    (void)subpel_y_q4;

    ConvBufType*  comp_buf     = conv_params->dst;
    int32_t       comp_stride  = conv_params->dst_stride;
    const int32_t taps_left    = filter_params_x->taps / 2 - 1;
    const int32_t bits         = FILTER_BITS - conv_params->round_1;
    const int32_t bd           = 8;
    const int32_t offset_bits  = bd + 2 * FILTER_BITS - conv_params->round_0;
    const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
        (1 << (offset_bits - conv_params->round_1 - 1));
    const int32_t round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;

    // Kernel for the horizontal sub-pel phase.
    const int16_t* kernel = av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        const int32_t src_row  = row * src_stride;
        const int32_t comp_row = row * comp_stride;
        for (int32_t col = 0; col < w; ++col) {
            int32_t filtered = 0;
            for (int32_t tap = 0; tap < filter_params_x->taps; ++tap) {
                filtered += kernel[tap] * src[src_row + col - taps_left + tap];
            }
            filtered = (1 << bits) * ROUND_POWER_OF_TWO(filtered, conv_params->round_0);
            filtered += round_offset;

            if (!conv_params->do_average) {
                // First prediction of the pair: keep the biased intermediate.
                comp_buf[comp_row + col] = (ConvBufType)filtered;
                continue;
            }

            int32_t blended = comp_buf[comp_row + col];
            if (conv_params->use_jnt_comp_avg) {
                blended = blended * conv_params->fwd_offset + filtered * conv_params->bck_offset;
                blended >>= DIST_PRECISION_BITS;
            } else {
                blended += filtered;
                blended >>= 1;
            }
            blended -= round_offset;
            dst8[row * dst8_stride + col] = (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, round_bits), 8);
        }
    }
}
673
674
/*
 * C reference: 8-bit compound "copy" path (no filtering in either direction).
 *
 * Each source pixel is left-shifted by bits and biased by round_offset, with the result held
 * in a ConvBufType.
 *  - do_average == 0: the biased value is stored in conv_params->dst.
 *  - do_average != 0: it is combined with the value already in conv_params->dst (weighted by
 *    fwd_offset / bck_offset when use_jnt_comp_avg is set, summed and halved otherwise), the
 *    bias is removed, and the value rounded by bits and passed through
 *    clip_pixel_highbd(..., 8) is written to dst8.
 * Both filter descriptors and both sub-pel positions are unused.
 */
void svt_av1_jnt_convolve_2d_copy_c(const uint8_t* src, int32_t src_stride, uint8_t* dst8, int32_t dst8_stride,
                                    int32_t w, int32_t h, const InterpFilterParams* filter_params_x,
                                    const InterpFilterParams* filter_params_y, const int32_t subpel_x_q4,
                                    const int32_t subpel_y_q4, ConvolveParams* conv_params) {
    (void)filter_params_x;
    (void)filter_params_y;
    (void)subpel_x_q4;
    (void)subpel_y_q4;

    ConvBufType*  comp_buf     = conv_params->dst;
    int32_t       comp_stride  = conv_params->dst_stride;
    const int32_t bits         = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
    const int32_t bd           = 8;
    const int32_t offset_bits  = bd + 2 * FILTER_BITS - conv_params->round_0;
    const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
        (1 << (offset_bits - conv_params->round_1 - 1));

    for (int32_t row = 0; row < h; ++row) {
        const int32_t src_row  = row * src_stride;
        const int32_t comp_row = row * comp_stride;
        for (int32_t col = 0; col < w; ++col) {
            ConvBufType scaled = src[src_row + col] << bits;
            scaled += (ConvBufType)round_offset;

            if (!conv_params->do_average) {
                // First prediction of the pair: keep the biased intermediate.
                comp_buf[comp_row + col] = scaled;
                continue;
            }

            int32_t blended = comp_buf[comp_row + col];
            if (conv_params->use_jnt_comp_avg) {
                blended = blended * conv_params->fwd_offset + scaled * conv_params->bck_offset;
                blended >>= DIST_PRECISION_BITS;
            } else {
                blended += scaled;
                blended >>= 1;
            }
            blended -= round_offset;
            dst8[row * dst8_stride + col] = (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, bits), 8);
        }
    }
}
712
713
/*
 * C reference: high-bit-depth single-reference "copy" path.
 *
 * Copies a w x h block of 16-bit samples from src (row pitch src_stride) to dst (row pitch
 * dst_stride) unchanged. The filter descriptors, sub-pel positions, conv_params and bd are
 * accepted only to match the common convolve signature and are not used.
 */
void svt_av1_highbd_convolve_2d_copy_sr_c(const uint16_t* src, int32_t src_stride, uint16_t* dst, int32_t dst_stride,
                                          int32_t w, int32_t h, const InterpFilterParams* filter_params_x,
                                          const InterpFilterParams* filter_params_y, const int32_t subpel_x_q4,
                                          const int32_t subpel_y_q4, ConvolveParams* conv_params, int32_t bd) {
    (void)filter_params_x;
    (void)filter_params_y;
    (void)subpel_x_q4;
    (void)subpel_y_q4;
    (void)conv_params;
    (void)bd;

    for (int32_t row = 0; row < h; ++row) {
        const int32_t in_row  = row * src_stride;
        const int32_t out_row = row * dst_stride;
        for (int32_t col = 0; col < w; ++col) {
            dst[out_row + col] = src[in_row + col];
        }
    }
}
730
731
/*
 * C reference: high-bit-depth single-reference convolution, horizontal filter only.
 *
 * Each output sample is the horizontal filter sum (kernel chosen by subpel_x_q4), rounded
 * first by conv_params->round_0 and then by bits = FILTER_BITS - round_0, and finally passed
 * through clip_pixel_highbd(..., bd). filter_params_y and subpel_y_q4 are unused.
 */
void svt_av1_highbd_convolve_x_sr_c(const uint16_t* src, int32_t src_stride, uint16_t* dst, int32_t dst_stride,
                                    int32_t w, int32_t h, const InterpFilterParams* filter_params_x,
                                    const InterpFilterParams* filter_params_y, const int32_t subpel_x_q4,
                                    const int32_t subpel_y_q4, ConvolveParams* conv_params, int32_t bd) {
    (void)filter_params_y;
    (void)subpel_y_q4;

    const int32_t taps_left = filter_params_x->taps / 2 - 1;
    const int32_t bits      = FILTER_BITS - conv_params->round_0;

    assert(bits >= 0);
    assert((FILTER_BITS - conv_params->round_1) >= 0 ||
           ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

    // Kernel for the horizontal sub-pel phase.
    const int16_t* kernel = av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        const int32_t in_row  = row * src_stride;
        const int32_t out_row = row * dst_stride;
        for (int32_t col = 0; col < w; ++col) {
            int32_t acc = 0;
            for (int32_t tap = 0; tap < filter_params_x->taps; ++tap) {
                acc += kernel[tap] * src[in_row + col - taps_left + tap];
            }
            acc                = ROUND_POWER_OF_TWO(acc, conv_params->round_0);
            dst[out_row + col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, bits), bd);
        }
    }
}
757
758
/*
 * C reference: high-bit-depth single-reference convolution, vertical filter only.
 *
 * Each output sample is the vertical filter sum (kernel chosen by subpel_y_q4), rounded by
 * FILTER_BITS and passed through clip_pixel_highbd(..., bd). filter_params_x and subpel_x_q4
 * are unused; conv_params is only read by the assertions.
 */
void svt_av1_highbd_convolve_y_sr_c(const uint16_t* src, int32_t src_stride, uint16_t* dst, int32_t dst_stride,
                                    int32_t w, int32_t h, const InterpFilterParams* filter_params_x,
                                    const InterpFilterParams* filter_params_y, const int32_t subpel_x_q4,
                                    const int32_t subpel_y_q4, ConvolveParams* conv_params, int32_t bd) {
    (void)filter_params_x;
    (void)subpel_x_q4;
    (void)conv_params; // keeps the parameter "used" when assertions are compiled out

    assert(filter_params_y != NULL);
    assert(conv_params->round_0 <= FILTER_BITS);
    assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
           ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));

    const int32_t taps_above = filter_params_y->taps / 2 - 1;

    // Kernel for the vertical sub-pel phase.
    const int16_t* kernel = av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        const int32_t out_row = row * dst_stride;
        for (int32_t col = 0; col < w; ++col) {
            int32_t acc = 0;
            for (int32_t tap = 0; tap < filter_params_y->taps; ++tap) {
                acc += kernel[tap] * src[(row - taps_above + tap) * src_stride + col];
            }
            dst[out_row + col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), bd);
        }
    }
}
783
784
/*
 * C reference: high-bit-depth single-reference separable 2D convolution.
 *
 * Pass 1 filters h + taps_y - 1 source rows horizontally (starting fo_vert rows above the
 * block) into the int16_t scratch buffer im_block, rounding by conv_params->round_0. The sum
 * starts from 1 << (bd + FILTER_BITS - 1); the assertion checks that it stays non-negative
 * and below 1 << (bd + FILTER_BITS + 1).
 * Pass 2 filters im_block vertically, rounds by conv_params->round_1, removes round_offset,
 * rounds by bits and writes clip_pixel_highbd(..., bd) to dst.
 */
void svt_av1_highbd_convolve_2d_sr_c(const uint16_t* src, int32_t src_stride, uint16_t* dst, int32_t dst_stride,
                                     int32_t w, int32_t h, const InterpFilterParams* filter_params_x,
                                     const InterpFilterParams* filter_params_y, const int32_t subpel_x_q4,
                                     const int32_t subpel_y_q4, ConvolveParams* conv_params, int32_t bd) {
    int16_t       im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
    int32_t       im_h      = h + filter_params_y->taps - 1;
    int32_t       im_stride = w;
    const int32_t fo_vert   = filter_params_y->taps / 2 - 1;
    const int32_t fo_horiz  = filter_params_x->taps / 2 - 1;
    const int32_t bits      = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
    assert(bits >= 0);

    // horizontal filter
    const uint16_t* src_horiz = src - fo_vert * src_stride;
    const int16_t*  x_filter  = av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    for (int32_t y = 0; y < im_h; ++y) {
        for (int32_t x = 0; x < w; ++x) {
            int32_t sum = (1 << (bd + FILTER_BITS - 1));
            for (int32_t k = 0; k < filter_params_x->taps; ++k) {
                sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
            }
            assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
            // im_block holds int16_t, so cast to that type as the other 2D convolve paths do.
            im_block[y * im_stride + x] = (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
        }
    }

    // vertical filter
    int16_t*       src_vert    = im_block + fo_vert * im_stride;
    const int16_t* y_filter    = av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    const int32_t  offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    // Loop-invariant offset subtracted from every round_1-rounded vertical sum.
    const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
        (1 << (offset_bits - conv_params->round_1 - 1));
    for (int32_t y = 0; y < h; ++y) {
        for (int32_t x = 0; x < w; ++x) {
            int32_t sum = 1 << offset_bits;
            for (int32_t k = 0; k < filter_params_y->taps; ++k) {
                sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
            }
            assert(0 <= sum && sum < (1 << (offset_bits + 2)));
            int32_t res             = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - round_offset;
            dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
        }
    }
}
827
828
void svt_av1_highbd_convolve_2d_scale_c(const uint16_t* src, int src_stride, uint16_t* dst, int dst_stride, int w,
829
                                        int h, const InterpFilterParams* filter_params_x,
830
                                        const InterpFilterParams* filter_params_y, const int subpel_x_qn,
831
                                        const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
832
0
                                        ConvolveParams* conv_params, int bd) {
833
0
    int16_t        im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
834
0
    int            im_h         = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + filter_params_y->taps;
835
0
    int            im_stride    = w;
836
0
    const int      fo_vert      = filter_params_y->taps / 2 - 1;
837
0
    const int      fo_horiz     = filter_params_x->taps / 2 - 1;
838
0
    CONV_BUF_TYPE* dst16        = conv_params->dst;
839
0
    const int      dst16_stride = conv_params->dst_stride;
840
0
    const int      bits         = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
841
0
    assert(bits >= 0);
842
    // horizontal filter
843
0
    const uint16_t* src_horiz = src - fo_vert * src_stride;
844
0
    for (int y = 0; y < im_h; ++y) {
845
0
        int x_qn = subpel_x_qn;
846
0
        for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
847
0
            const uint16_t* const src_x        = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
848
0
            const int             x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
849
0
            assert(x_filter_idx < SUBPEL_SHIFTS);
850
0
            const int16_t* x_filter = av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
851
0
            int32_t        sum      = (1 << (bd + FILTER_BITS - 1));
852
0
            for (int k = 0; k < filter_params_x->taps; ++k) {
853
0
                sum += x_filter[k] * src_x[k - fo_horiz];
854
0
            }
855
0
            assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
856
0
            im_block[y * im_stride + x] = (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
857
0
        }
858
0
        src_horiz += src_stride;
859
0
    }
860
861
    // vertical filter
862
0
    int16_t*  src_vert    = im_block + fo_vert * im_stride;
863
0
    const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
864
0
    for (int x = 0; x < w; ++x) {
865
0
        int y_qn = subpel_y_qn;
866
0
        for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
867
0
            const int16_t* src_y        = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
868
0
            const int      y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
869
0
            assert(y_filter_idx < SUBPEL_SHIFTS);
870
0
            const int16_t* y_filter = av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
871
0
            int32_t        sum      = 1 << offset_bits;
872
0
            for (int k = 0; k < filter_params_y->taps; ++k) {
873
0
                sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
874
0
            }
875
0
            assert(0 <= sum && sum < (1 << (offset_bits + 2)));
876
0
            CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
877
0
            if (conv_params->is_compound) {
878
0
                if (conv_params->do_average) {
879
0
                    int32_t tmp = dst16[y * dst16_stride + x];
880
0
                    if (conv_params->use_dist_wtd_comp_avg) {
881
0
                        tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
882
0
                        tmp = tmp >> DIST_PRECISION_BITS;
883
0
                    } else {
884
0
                        tmp += res;
885
0
                        tmp = tmp >> 1;
886
0
                    }
887
                    /* Subtract round offset and convolve round */
888
0
                    tmp = tmp -
889
0
                        ((1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)));
890
0
                    dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
891
0
                } else {
892
0
                    dst16[y * dst16_stride + x] = res;
893
0
                }
894
0
            } else {
895
                /* Subtract round offset and convolve round */
896
0
                int32_t tmp = res -
897
0
                    ((1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)));
898
0
                dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
899
0
            }
900
0
        }
901
0
        src_vert++;
902
0
    }
903
0
}
904
905
/*
 * C reference: high-bit-depth compound convolution, horizontal filter only.
 *
 * For each of the w x h positions the source row is filtered with the kernel that
 * av1_get_interp_filter_subpel_kernel() returns for subpel_x_q4, rounded by
 * conv_params->round_0, scaled by (1 << bits) and biased by round_offset.
 *  - do_average == 0: the biased value is stored in conv_params->dst.
 *  - do_average != 0: it is combined with the value already in conv_params->dst (weighted by
 *    fwd_offset / bck_offset when use_jnt_comp_avg is set, summed and halved otherwise), the
 *    bias is removed, and the value rounded by round_bits and passed through
 *    clip_pixel_highbd(..., bd) is written to dst16.
 * filter_params_y and subpel_y_q4 are unused.
 */
void svt_av1_highbd_jnt_convolve_x_c(const uint16_t* src, int32_t src_stride, uint16_t* dst16, int32_t dst16_stride,
                                     int32_t w, int32_t h, const InterpFilterParams* filter_params_x,
                                     const InterpFilterParams* filter_params_y, const int32_t subpel_x_q4,
                                     const int32_t subpel_y_q4, ConvolveParams* conv_params, int32_t bd) {
    (void)filter_params_y;
    (void)subpel_y_q4;

    ConvBufType*  comp_buf     = conv_params->dst;
    int32_t       comp_stride  = conv_params->dst_stride;
    const int32_t taps_left    = filter_params_x->taps / 2 - 1;
    const int32_t bits         = FILTER_BITS - conv_params->round_1;
    const int32_t offset_bits  = bd + 2 * FILTER_BITS - conv_params->round_0;
    const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
        (1 << (offset_bits - conv_params->round_1 - 1));
    const int32_t round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    assert(round_bits >= 0);
    assert(bits >= 0);

    // Kernel for the horizontal sub-pel phase.
    const int16_t* kernel = av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        const int32_t src_row  = row * src_stride;
        const int32_t comp_row = row * comp_stride;
        for (int32_t col = 0; col < w; ++col) {
            int32_t filtered = 0;
            for (int32_t tap = 0; tap < filter_params_x->taps; ++tap) {
                filtered += kernel[tap] * src[src_row + col - taps_left + tap];
            }
            filtered = (1 << bits) * ROUND_POWER_OF_TWO(filtered, conv_params->round_0);
            filtered += round_offset;

            if (!conv_params->do_average) {
                // First prediction of the pair: keep the biased intermediate.
                comp_buf[comp_row + col] = (ConvBufType)filtered;
                continue;
            }

            int32_t blended = comp_buf[comp_row + col];
            if (conv_params->use_jnt_comp_avg) {
                blended = blended * conv_params->fwd_offset + filtered * conv_params->bck_offset;
                blended >>= DIST_PRECISION_BITS;
            } else {
                blended += filtered;
                blended >>= 1;
            }
            blended -= round_offset;
            dst16[row * dst16_stride + col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, round_bits), bd);
        }
    }
}
949
950
/*
 * C reference: high-bit-depth compound convolution, vertical filter only.
 *
 * For each of the w x h positions the source column is filtered with the kernel that
 * av1_get_interp_filter_subpel_kernel() returns for subpel_y_q4, scaled by (1 << bits),
 * rounded by conv_params->round_1 and biased by round_offset.
 *  - do_average == 0: the biased value is stored in conv_params->dst.
 *  - do_average != 0: it is combined with the value already in conv_params->dst (weighted by
 *    fwd_offset / bck_offset when use_jnt_comp_avg is set, summed and halved otherwise), the
 *    bias is removed, and the value rounded by round_bits and passed through
 *    clip_pixel_highbd(..., bd) is written to dst16.
 * filter_params_x and subpel_x_q4 are unused.
 */
void svt_av1_highbd_jnt_convolve_y_c(const uint16_t* src, int32_t src_stride, uint16_t* dst16, int32_t dst16_stride,
                                     int32_t w, int32_t h, const InterpFilterParams* filter_params_x,
                                     const InterpFilterParams* filter_params_y, const int32_t subpel_x_q4,
                                     const int32_t subpel_y_q4, ConvolveParams* conv_params, int32_t bd) {
    (void)filter_params_x;
    (void)subpel_x_q4;

    ConvBufType*  comp_buf     = conv_params->dst;
    int32_t       comp_stride  = conv_params->dst_stride;
    const int32_t taps_above   = filter_params_y->taps / 2 - 1;
    const int32_t bits         = FILTER_BITS - conv_params->round_0;
    const int32_t offset_bits  = bd + 2 * FILTER_BITS - conv_params->round_0;
    const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
        (1 << (offset_bits - conv_params->round_1 - 1));
    const int32_t round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    assert(round_bits >= 0);
    assert(bits >= 0);

    // Kernel for the vertical sub-pel phase.
    const int16_t* kernel = av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        const int32_t comp_row = row * comp_stride;
        for (int32_t col = 0; col < w; ++col) {
            int32_t filtered = 0;
            for (int32_t tap = 0; tap < filter_params_y->taps; ++tap) {
                filtered += kernel[tap] * src[(row - taps_above + tap) * src_stride + col];
            }
            filtered *= (1 << bits);
            filtered = ROUND_POWER_OF_TWO(filtered, conv_params->round_1) + round_offset;

            if (!conv_params->do_average) {
                // First prediction of the pair: keep the biased intermediate.
                comp_buf[comp_row + col] = (ConvBufType)filtered;
                continue;
            }

            int32_t blended = comp_buf[comp_row + col];
            if (conv_params->use_jnt_comp_avg) {
                blended = blended * conv_params->fwd_offset + filtered * conv_params->bck_offset;
                blended >>= DIST_PRECISION_BITS;
            } else {
                blended += filtered;
                blended >>= 1;
            }
            blended -= round_offset;
            dst16[row * dst16_stride + col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, round_bits), bd);
        }
    }
}
994
995
void svt_av1_highbd_jnt_convolve_2d_copy_c(const uint16_t* src, int32_t src_stride, uint16_t* dst16,
996
                                           int32_t dst16_stride, int32_t w, int32_t h,
997
                                           const InterpFilterParams* filter_params_x,
998
                                           const InterpFilterParams* filter_params_y, const int32_t subpel_x_q4,
999
0
                                           const int32_t subpel_y_q4, ConvolveParams* conv_params, int32_t bd) {
1000
0
    ConvBufType*  dst          = conv_params->dst;
1001
0
    int32_t       dst_stride   = conv_params->dst_stride;
1002
0
    const int32_t bits         = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
1003
0
    const int32_t offset_bits  = bd + 2 * FILTER_BITS - conv_params->round_0;
1004
0
    const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
1005
0
        (1 << (offset_bits - conv_params->round_1 - 1));
1006
0
    assert(bits >= 0);
1007
0
    (void)filter_params_x;
1008
0
    (void)filter_params_y;
1009
0
    (void)subpel_x_q4;
1010
0
    (void)subpel_y_q4;
1011
1012
0
    for (int32_t y = 0; y < h; ++y) {
1013
0
        for (int32_t x = 0; x < w; ++x) {
1014
0
            ConvBufType res = src[y * src_stride + x] << bits;
1015
0
            res += (ConvBufType)round_offset;
1016
0
            if (conv_params->do_average) {
1017
0
                int32_t tmp = dst[y * dst_stride + x];
1018
0
                if (conv_params->use_jnt_comp_avg) {
1019
0
                    tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
1020
0
                    tmp = tmp >> DIST_PRECISION_BITS;
1021
0
                } else {
1022
0
                    tmp += res;
1023
0
                    tmp = tmp >> 1;
1024
0
                }
1025
0
                tmp -= round_offset;
1026
0
                dst16[y * dst16_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
1027
0
            } else {
1028
0
                dst[y * dst_stride + x] = res;
1029
0
            }
1030
0
        }
1031
0
    }
1032
0
}
1033
1034
/*
 * C reference: high-bit-depth compound separable 2D convolution.
 *
 * Pass 1 filters h + taps_y - 1 source rows horizontally (starting fo_vert rows above the
 * block) into the int16_t scratch buffer im_block, rounding by conv_params->round_0.
 * Pass 2 filters im_block vertically and rounds by conv_params->round_1, then:
 *  - do_average == 0: stores the intermediate in conv_params->dst;
 *  - do_average != 0: combines it with the value already in conv_params->dst (weighted by
 *    fwd_offset / bck_offset when use_jnt_comp_avg is set, summed and halved otherwise),
 *    removes round_offset, rounds by round_bits and writes clip_pixel_highbd(..., bd) to
 *    dst16.
 */
void svt_av1_highbd_jnt_convolve_2d_c(const uint16_t* src, int32_t src_stride, uint16_t* dst16, int32_t dst16_stride,
                                      int32_t w, int32_t h, const InterpFilterParams* filter_params_x,
                                      const InterpFilterParams* filter_params_y, const int32_t subpel_x_q4,
                                      const int32_t subpel_y_q4, ConvolveParams* conv_params, int32_t bd) {
    int16_t       im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
    ConvBufType*  dst        = conv_params->dst;
    int32_t       dst_stride = conv_params->dst_stride;
    int32_t       im_h       = h + filter_params_y->taps - 1;
    int32_t       im_stride  = w;
    const int32_t fo_vert    = filter_params_y->taps / 2 - 1;
    const int32_t fo_horiz   = filter_params_x->taps / 2 - 1;

    const int32_t round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    assert(round_bits >= 0);

    // horizontal filter
    const uint16_t* src_horiz = src - fo_vert * src_stride;
    const int16_t*  x_filter  = av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    for (int y = 0; y < im_h; ++y) {
        for (int x = 0; x < w; ++x) {
            // Initial bias; the assertion below checks the sum stays non-negative and bounded.
            int32_t sum = (1 << (bd + FILTER_BITS - 1));
            for (int k = 0; k < filter_params_x->taps; ++k) {
                sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
            }
            assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
            im_block[y * im_stride + x] = (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
        }
    }

    // vertical filter
    int16_t*       src_vert    = im_block + fo_vert * im_stride;
    const int32_t  offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    const int16_t* y_filter    = av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    // Loop-invariant offset removed from the blended value before the final rounding.
    const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
        (1 << (offset_bits - conv_params->round_1 - 1));
    for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
            int32_t sum = 1 << offset_bits;
            for (int k = 0; k < filter_params_y->taps; ++k) {
                sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
            }
            assert(0 <= sum && sum < (1 << (offset_bits + 2)));
            ConvBufType res = (ConvBufType)ROUND_POWER_OF_TWO(sum, conv_params->round_1);
            if (conv_params->do_average) {
                int32_t tmp = dst[y * dst_stride + x];
                if (conv_params->use_jnt_comp_avg) {
                    tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
                    tmp = tmp >> DIST_PRECISION_BITS;
                } else {
                    tmp += res;
                    tmp = tmp >> 1;
                }
                tmp -= round_offset;
                dst16[y * dst16_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
            } else {
                dst[y * dst_stride + x] = res;
            }
        }
    }
}
1095
1096
aom_highbd_convolve_fn_t svt_aom_convolveHbd[/*subX*/ 2][/*subY*/ 2][/*bi*/ 2];
1097
1098
1
void svt_aom_asm_set_convolve_hbd_asm_table(void) {
1099
1
    svt_aom_convolveHbd[0][0][0] = svt_av1_highbd_convolve_2d_copy_sr;
1100
1
    svt_aom_convolveHbd[0][0][1] = svt_av1_highbd_jnt_convolve_2d_copy;
1101
1102
1
    svt_aom_convolveHbd[0][1][0] = svt_av1_highbd_convolve_y_sr;
1103
1
    svt_aom_convolveHbd[0][1][1] = svt_av1_highbd_jnt_convolve_y;
1104
1105
1
    svt_aom_convolveHbd[1][0][0] = svt_av1_highbd_convolve_x_sr;
1106
1
    svt_aom_convolveHbd[1][0][1] = svt_av1_highbd_jnt_convolve_x;
1107
1108
1
    svt_aom_convolveHbd[1][1][0] = svt_av1_highbd_convolve_2d_sr;
1109
1
    svt_aom_convolveHbd[1][1][1] = svt_av1_highbd_jnt_convolve_2d;
1110
1
}
1111
1112
AomConvolveFn svt_aom_convolve[/*subX*/ 2][/*subY*/ 2][/*bi*/ 2];
1113
1114
1
void svt_aom_asm_set_convolve_asm_table(void) {
1115
1
    svt_aom_convolve[0][0][0] = svt_av1_convolve_2d_copy_sr;
1116
1
    svt_aom_convolve[0][0][1] = svt_av1_jnt_convolve_2d_copy;
1117
1118
1
    svt_aom_convolve[0][1][0] = svt_av1_convolve_y_sr;
1119
1
    svt_aom_convolve[0][1][1] = svt_av1_jnt_convolve_y;
1120
1121
1
    svt_aom_convolve[1][0][0] = svt_av1_convolve_x_sr;
1122
1
    svt_aom_convolve[1][0][1] = svt_av1_jnt_convolve_x;
1123
1124
1
    svt_aom_convolve[1][1][0] = svt_av1_convolve_2d_sr;
1125
1
    svt_aom_convolve[1][1][1] = svt_av1_jnt_convolve_2d;
1126
1
}
1127
1128
/*
 * "Sharp" interpolation kernels: SUBPEL_SHIFTS rows of eight taps each.
 * The first row has a single non-zero tap of 128.
 */
DECLARE_ALIGNED(256, const InterpKernel, sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
    {0, 0, 0, 128, 0, 0, 0, 0},
    {-2, 2, -6, 126, 8, -2, 2, 0},
    {-2, 6, -12, 124, 16, -6, 4, -2},
    {-2, 8, -18, 120, 26, -10, 6, -2},
    {-4, 10, -22, 116, 38, -14, 6, -2},
    {-4, 10, -22, 108, 48, -18, 8, -2},
    {-4, 10, -24, 100, 60, -20, 8, -2},
    {-4, 10, -24, 90, 70, -22, 10, -2},
    {-4, 12, -24, 80, 80, -24, 12, -4},
    {-2, 10, -22, 70, 90, -24, 10, -4},
    {-2, 8, -20, 60, 100, -24, 10, -4},
    {-2, 8, -18, 48, 108, -22, 10, -4},
    {-2, 6, -14, 38, 116, -22, 10, -4},
    {-2, 6, -10, 26, 120, -18, 8, -2},
    {-2, 4, -6, 16, 124, -12, 6, -2},
    {0, 2, -2, 8, 126, -6, 2, -2},
};
1144
1145
/*
 * "Smooth" interpolation kernels: SUBPEL_SHIFTS rows of eight taps each.
 * The first row has a single non-zero tap of 128.
 */
DECLARE_ALIGNED(256, const InterpKernel, sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
    {0, 0, 0, 128, 0, 0, 0, 0},
    {0, 2, 28, 62, 34, 2, 0, 0},
    {0, 0, 26, 62, 36, 4, 0, 0},
    {0, 0, 22, 62, 40, 4, 0, 0},
    {0, 0, 20, 60, 42, 6, 0, 0},
    {0, 0, 18, 58, 44, 8, 0, 0},
    {0, 0, 16, 56, 46, 10, 0, 0},
    {0, -2, 16, 54, 48, 12, 0, 0},
    {0, -2, 14, 52, 52, 14, -2, 0},
    {0, 0, 12, 48, 54, 16, -2, 0},
    {0, 0, 10, 46, 56, 16, 0, 0},
    {0, 0, 8, 44, 58, 18, 0, 0},
    {0, 0, 6, 42, 60, 20, 0, 0},
    {0, 0, 4, 40, 62, 22, 0, 0},
    {0, 0, 4, 36, 62, 26, 0, 0},
    {0, 0, 2, 34, 62, 28, 2, 0},
};
1161
/*
 * Bilinear interpolation kernels: SUBPEL_SHIFTS rows of eight taps each.
 * Only the two centre taps are ever non-zero; from one row to the next the
 * first drops by 8 and the second rises by 8, starting from {128, 0}.
 */
DECLARE_ALIGNED(256, const InterpKernel, bilinear_filters[SUBPEL_SHIFTS]) = {
    {0, 0, 0, 128, 0, 0, 0, 0},
    {0, 0, 0, 120, 8, 0, 0, 0},
    {0, 0, 0, 112, 16, 0, 0, 0},
    {0, 0, 0, 104, 24, 0, 0, 0},
    {0, 0, 0, 96, 32, 0, 0, 0},
    {0, 0, 0, 88, 40, 0, 0, 0},
    {0, 0, 0, 80, 48, 0, 0, 0},
    {0, 0, 0, 72, 56, 0, 0, 0},
    {0, 0, 0, 64, 64, 0, 0, 0},
    {0, 0, 0, 56, 72, 0, 0, 0},
    {0, 0, 0, 48, 80, 0, 0, 0},
    {0, 0, 0, 40, 88, 0, 0, 0},
    {0, 0, 0, 32, 96, 0, 0, 0},
    {0, 0, 0, 24, 104, 0, 0, 0},
    {0, 0, 0, 16, 112, 0, 0, 0},
    {0, 0, 0, 8, 120, 0, 0, 0},
};
1177
/*
 * Four-tap "smooth" interpolation kernels stored in eight-tap rows:
 * the two outermost taps on each side are zero in every row.
 * The first row has a single non-zero tap of 128.
 */
DECLARE_ALIGNED(256, const InterpKernel, sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = {
    {0, 0, 0, 128, 0, 0, 0, 0},
    {0, 0, 30, 62, 34, 2, 0, 0},
    {0, 0, 26, 62, 36, 4, 0, 0},
    {0, 0, 22, 62, 40, 4, 0, 0},
    {0, 0, 20, 60, 42, 6, 0, 0},
    {0, 0, 18, 58, 44, 8, 0, 0},
    {0, 0, 16, 56, 46, 10, 0, 0},
    {0, 0, 14, 54, 48, 12, 0, 0},
    {0, 0, 12, 52, 52, 12, 0, 0},
    {0, 0, 12, 48, 54, 14, 0, 0},
    {0, 0, 10, 46, 56, 16, 0, 0},
    {0, 0, 8, 44, 58, 18, 0, 0},
    {0, 0, 6, 42, 60, 20, 0, 0},
    {0, 0, 4, 40, 62, 22, 0, 0},
    {0, 0, 4, 36, 62, 26, 0, 0},
    {0, 0, 2, 34, 62, 30, 0, 0},
};
1193
BlockSize svt_aom_scale_chroma_bsize(BlockSize bsize, int32_t subsampling_x, int32_t subsampling_y);
1194
1195
/*
 * 8-bit prediction for intra block copy with a fractional offset.
 *
 * A direction is filtered with the BILINEAR parameters only when its
 * sub-pel offset is non-zero; otherwise a NULL filter is passed for it.
 * The kernels are always given the fixed sub-pel position 8 for a filtered
 * direction and 0 for an unfiltered one, whatever the actual offset value.
 *
 * NOTE(review): when both offsets are zero the vertical kernel is still
 * invoked with a NULL vertical filter; the callers visible in this file only
 * call in when at least one offset is non-zero.
 */
void convolve_2d_for_intrabc(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int w, int h,
                             int subpel_x_q4, int subpel_y_q4, ConvolveParams* conv_params) {
    const int has_frac_x = subpel_x_q4 != 0;
    const int has_frac_y = subpel_y_q4 != 0;

    // The convolve entry points take non-const filter pointers, hence the cast.
    InterpFilterParams* horz = has_frac_x ? (InterpFilterParams*)&av1_interp_filter_params_list[BILINEAR] : NULL;
    InterpFilterParams* vert = has_frac_y ? (InterpFilterParams*)&av1_interp_filter_params_list[BILINEAR] : NULL;

    if (has_frac_x && has_frac_y) {
        svt_av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, horz, vert, 8, 8, conv_params);
        return;
    }
    if (has_frac_x) {
        svt_av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, horz, vert, 8, 0, conv_params);
        return;
    }
    svt_av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, horz, vert, 0, 8, conv_params);
}
1237
1238
/*
 * High bit-depth prediction for intra block copy with a fractional offset.
 *
 * A direction is filtered with the BILINEAR parameters only when its
 * sub-pel offset is non-zero; otherwise a NULL filter is passed for it.
 * The kernels are always given the fixed sub-pel position 8 for a filtered
 * direction and 0 for an unfiltered one, whatever the actual offset value.
 *
 * NOTE(review): when both offsets are zero the vertical kernel is still
 * invoked with a NULL vertical filter; the callers visible in this file only
 * call in when at least one offset is non-zero.
 */
void highbd_convolve_2d_for_intrabc(const uint16_t* src, int src_stride, uint16_t* dst, int dst_stride, int w, int h,
                                    int subpel_x_q4, int subpel_y_q4, ConvolveParams* conv_params, int bd) {
    const int has_frac_x = subpel_x_q4 != 0;
    const int has_frac_y = subpel_y_q4 != 0;

    const InterpFilterParams* horz = has_frac_x ? &av1_interp_filter_params_list[BILINEAR] : NULL;
    const InterpFilterParams* vert = has_frac_y ? &av1_interp_filter_params_list[BILINEAR] : NULL;

    if (has_frac_x && has_frac_y) {
        svt_av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, horz, vert, 8, 8, conv_params, bd);
        return;
    }
    if (has_frac_x) {
        svt_av1_highbd_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, horz, vert, 8, 0, conv_params, bd);
        return;
    }
    svt_av1_highbd_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, horz, vert, 0, 8, conv_params, bd);
}
1253
1254
/*
1255
*/
1256
/*
 * Light-weight 8-bit inter predictor.
 *
 * Unscaled reference: dispatches to the [0][0] (no sub-pel) table entry with
 * no filters and zero sub-pel positions, so any fractional part carried in
 * subpel_params is not applied.
 * Scaled reference: runs the scaling 2D convolve with EIGHTTAP_REGULAR
 * filters in both directions.
 */
void svt_inter_predictor_light_pd0(const uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t w,
                                   int32_t h, SubpelParams* subpel_params, ConvolveParams* conv_params) {
    if (!has_scale(subpel_params->xs, subpel_params->ys)) {
        UNUSED(subpel_params);
        svt_aom_convolve[0][0][conv_params->is_compound](
            src, src_stride, dst, dst_stride, w, h, 0, 0, 0, 0, conv_params);
        return;
    }

    InterpFilterParams horz_filter, vert_filter;
    av1_get_convolve_filter_params(
        av1_make_interp_filters(EIGHTTAP_REGULAR, EIGHTTAP_REGULAR), &horz_filter, &vert_filter, w, h);
    svt_av1_convolve_2d_scale(src,
                              src_stride,
                              dst,
                              dst_stride,
                              w,
                              h,
                              &horz_filter,
                              &vert_filter,
                              subpel_params->subpel_x,
                              subpel_params->xs,
                              subpel_params->subpel_y,
                              subpel_params->ys,
                              conv_params);
}
1282
1283
void svt_inter_predictor_light_pd1(uint8_t* src, uint8_t* src_2b, int32_t src_stride, uint8_t* dst, int32_t dst_stride,
1284
                                   int32_t w, int32_t h, InterpFilters interp_filters, SubpelParams* subpel_params,
1285
0
                                   ConvolveParams* conv_params, int32_t bd) {
1286
0
    InterpFilterParams filter_params_x, filter_params_y;
1287
0
    av1_get_convolve_filter_params(interp_filters, &filter_params_x, &filter_params_y, w, h);
1288
0
    const int32_t is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
1289
1290
0
    if (bd > EB_EIGHT_BIT) {
1291
        // for super-res, the reference frame block might be 2x than predictor in maximum
1292
        // for reference scaling, it might be 4x since both width and height is scaled 2x
1293
        // should pack enough buffer for scaled reference
1294
0
        DECLARE_ALIGNED(16, uint16_t, src16[PACKED_BUFFER_SIZE * 4]);
1295
0
        int32_t src_stride16;
1296
        // pack the reference into temp 16bit buffer
1297
0
        uint8_t  offset       = INTERPOLATION_OFFSET;
1298
0
        uint32_t width_scale  = 1;
1299
0
        uint32_t height_scale = 1;
1300
0
        if (is_scaled) {
1301
0
            width_scale  = subpel_params->xs != SCALE_SUBPEL_SHIFTS ? 2 : 1;
1302
0
            height_scale = subpel_params->ys != SCALE_SUBPEL_SHIFTS ? 2 : 1;
1303
0
        }
1304
        // optimize stride from MAX_SB_SIZE to bwidth to minimum the block buffer size
1305
0
        src_stride16 = w * width_scale + (offset << 1);
1306
        // 16-byte align of src16
1307
0
        if (src_stride16 % 8) {
1308
0
            src_stride16 = ALIGN_POWER_OF_TWO(src_stride16, 3);
1309
0
        }
1310
1311
0
        svt_aom_pack_block(src - offset - (offset * src_stride),
1312
0
                           src_stride,
1313
0
                           src_2b - offset - (offset * src_stride),
1314
0
                           src_stride,
1315
0
                           src16,
1316
0
                           src_stride16,
1317
0
                           w * width_scale + (offset << 1),
1318
0
                           h * height_scale + (offset << 1));
1319
0
        uint16_t* src_10b = src16 + offset + (offset * src_stride16);
1320
0
        uint16_t* dst16   = (uint16_t*)dst;
1321
1322
0
        if (is_scaled) {
1323
0
            svt_av1_highbd_convolve_2d_scale(src_10b,
1324
0
                                             src_stride16,
1325
0
                                             dst16,
1326
0
                                             dst_stride,
1327
0
                                             w,
1328
0
                                             h,
1329
0
                                             &filter_params_x,
1330
0
                                             &filter_params_y,
1331
0
                                             subpel_params->subpel_x,
1332
0
                                             subpel_params->xs,
1333
0
                                             subpel_params->subpel_y,
1334
0
                                             subpel_params->ys,
1335
0
                                             conv_params,
1336
0
                                             bd);
1337
0
        } else {
1338
0
            SubpelParams sp = *subpel_params;
1339
0
            revert_scale_extra_bits(&sp);
1340
0
            svt_aom_convolveHbd[sp.subpel_x != 0][sp.subpel_y != 0][conv_params->is_compound](src_10b,
1341
0
                                                                                              src_stride16,
1342
0
                                                                                              dst16,
1343
0
                                                                                              dst_stride,
1344
0
                                                                                              w,
1345
0
                                                                                              h,
1346
0
                                                                                              &filter_params_x,
1347
0
                                                                                              &filter_params_y,
1348
0
                                                                                              sp.subpel_x,
1349
0
                                                                                              sp.subpel_y,
1350
0
                                                                                              conv_params,
1351
0
                                                                                              bd);
1352
0
        }
1353
0
    } else {
1354
0
        if (is_scaled) {
1355
0
            svt_av1_convolve_2d_scale(src,
1356
0
                                      src_stride,
1357
0
                                      dst,
1358
0
                                      dst_stride,
1359
0
                                      w,
1360
0
                                      h,
1361
0
                                      &filter_params_x,
1362
0
                                      &filter_params_y,
1363
0
                                      subpel_params->subpel_x,
1364
0
                                      subpel_params->xs,
1365
0
                                      subpel_params->subpel_y,
1366
0
                                      subpel_params->ys,
1367
0
                                      conv_params);
1368
0
        } else {
1369
0
            SubpelParams sp = *subpel_params;
1370
0
            revert_scale_extra_bits(&sp);
1371
0
            svt_aom_convolve[sp.subpel_x != 0][sp.subpel_y != 0][conv_params->is_compound](src,
1372
0
                                                                                           src_stride,
1373
0
                                                                                           dst,
1374
0
                                                                                           dst_stride,
1375
0
                                                                                           w,
1376
0
                                                                                           h,
1377
0
                                                                                           &filter_params_x,
1378
0
                                                                                           &filter_params_y,
1379
0
                                                                                           sp.subpel_x,
1380
0
                                                                                           sp.subpel_y,
1381
0
                                                                                           conv_params);
1382
0
        }
1383
0
    }
1384
0
}
1385
1386
void svt_inter_predictor(const uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride,
1387
                         const SubpelParams* subpel_params, const ScaleFactors* sf, int32_t w, int32_t h,
1388
0
                         ConvolveParams* conv_params, InterpFilters interp_filters, int32_t is_intrabc) {
1389
0
    InterpFilterParams filter_params_x, filter_params_y;
1390
0
    const int32_t      is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
1391
1392
0
    av1_get_convolve_filter_params(interp_filters, &filter_params_x, &filter_params_y, w, h);
1393
1394
0
    assert(conv_params->do_average == 0 || conv_params->do_average == 1);
1395
0
    assert(sf);
1396
0
    UNUSED(sf);
1397
0
    assert(IMPLIES(is_intrabc, !is_scaled));
1398
1399
0
    if (is_scaled) {
1400
0
        if (is_intrabc && (subpel_params->subpel_x != 0 || subpel_params->subpel_y != 0)) {
1401
0
            convolve_2d_for_intrabc(
1402
0
                src, src_stride, dst, dst_stride, w, h, subpel_params->subpel_x, subpel_params->subpel_y, conv_params);
1403
0
            return;
1404
0
        }
1405
0
        if (conv_params->is_compound) {
1406
0
            assert(conv_params->dst != NULL);
1407
0
        }
1408
0
        svt_av1_convolve_2d_scale(src,
1409
0
                                  src_stride,
1410
0
                                  dst,
1411
0
                                  dst_stride,
1412
0
                                  w,
1413
0
                                  h,
1414
0
                                  &filter_params_x,
1415
0
                                  &filter_params_y,
1416
0
                                  subpel_params->subpel_x,
1417
0
                                  subpel_params->xs,
1418
0
                                  subpel_params->subpel_y,
1419
0
                                  subpel_params->ys,
1420
0
                                  conv_params);
1421
0
    } else {
1422
0
        SubpelParams sp = *subpel_params;
1423
0
        revert_scale_extra_bits(&sp);
1424
1425
0
        if (is_intrabc && (sp.subpel_x != 0 || sp.subpel_y != 0)) {
1426
0
            convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, sp.subpel_x, sp.subpel_y, conv_params);
1427
0
            return;
1428
0
        }
1429
1430
0
        svt_aom_convolve[sp.subpel_x != 0][sp.subpel_y != 0][conv_params->is_compound](src,
1431
0
                                                                                       src_stride,
1432
0
                                                                                       dst,
1433
0
                                                                                       dst_stride,
1434
0
                                                                                       w,
1435
0
                                                                                       h,
1436
0
                                                                                       &filter_params_x,
1437
0
                                                                                       &filter_params_y,
1438
0
                                                                                       sp.subpel_x,
1439
0
                                                                                       sp.subpel_y,
1440
0
                                                                                       conv_params);
1441
0
    }
1442
0
}
1443
1444
void svt_highbd_inter_predictor(const uint16_t* src, int32_t src_stride, uint16_t* dst, int32_t dst_stride,
1445
                                const SubpelParams* subpel_params, const ScaleFactors* sf, int32_t w, int32_t h,
1446
                                ConvolveParams* conv_params, InterpFilters interp_filters, int32_t is_intrabc,
1447
0
                                int32_t bd) {
1448
0
    InterpFilterParams filter_params_x, filter_params_y;
1449
0
    const int32_t      is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
1450
1451
0
    av1_get_convolve_filter_params(interp_filters, &filter_params_x, &filter_params_y, w, h);
1452
1453
0
    assert(conv_params->do_average == 0 || conv_params->do_average == 1);
1454
0
    assert(sf);
1455
0
    UNUSED(sf);
1456
0
    assert(IMPLIES(is_intrabc, !is_scaled));
1457
1458
0
    if (is_scaled) {
1459
0
        if (is_intrabc && (subpel_params->subpel_x != 0 || subpel_params->subpel_y != 0)) {
1460
0
            highbd_convolve_2d_for_intrabc(src,
1461
0
                                           src_stride,
1462
0
                                           dst,
1463
0
                                           dst_stride,
1464
0
                                           w,
1465
0
                                           h,
1466
0
                                           subpel_params->subpel_x,
1467
0
                                           subpel_params->subpel_y,
1468
0
                                           conv_params,
1469
0
                                           bd);
1470
0
            return;
1471
0
        }
1472
0
        if (conv_params->is_compound) {
1473
0
            assert(conv_params->dst != NULL);
1474
0
        }
1475
0
        svt_av1_highbd_convolve_2d_scale(src,
1476
0
                                         src_stride,
1477
0
                                         dst,
1478
0
                                         dst_stride,
1479
0
                                         w,
1480
0
                                         h,
1481
0
                                         &filter_params_x,
1482
0
                                         &filter_params_y,
1483
0
                                         subpel_params->subpel_x,
1484
0
                                         subpel_params->xs,
1485
0
                                         subpel_params->subpel_y,
1486
0
                                         subpel_params->ys,
1487
0
                                         conv_params,
1488
0
                                         bd);
1489
0
    } else {
1490
0
        SubpelParams sp = *subpel_params;
1491
0
        revert_scale_extra_bits(&sp);
1492
1493
0
        if (is_intrabc && (sp.subpel_x != 0 || sp.subpel_y != 0)) {
1494
0
            highbd_convolve_2d_for_intrabc(
1495
0
                src, src_stride, dst, dst_stride, w, h, sp.subpel_x, sp.subpel_y, conv_params, bd);
1496
0
            return;
1497
0
        }
1498
1499
0
        svt_aom_convolveHbd[sp.subpel_x != 0][sp.subpel_y != 0][conv_params->is_compound](src,
1500
0
                                                                                          src_stride,
1501
0
                                                                                          dst,
1502
0
                                                                                          dst_stride,
1503
0
                                                                                          w,
1504
0
                                                                                          h,
1505
0
                                                                                          &filter_params_x,
1506
0
                                                                                          &filter_params_y,
1507
0
                                                                                          sp.subpel_x,
1508
0
                                                                                          sp.subpel_y,
1509
0
                                                                                          conv_params,
1510
0
                                                                                          bd);
1511
0
    }
1512
0
}
1513
1514
#define USE_PRECOMPUTED_WEDGE_SIGN 1
1515
#define USE_PRECOMPUTED_WEDGE_MASK 1
1516
1517
#if USE_PRECOMPUTED_WEDGE_MASK
1518
/*
 * One-dimensional wedge ramps. Each holds a run of 0s, a short monotonic
 * transition, and then a run of 64s. The initializers below list 64 values,
 * eight per row.
 */

/* Oblique ramp, odd variant: transition 1, 2, 6, 18, 37, 53, 60, 63. */
static const uint8_t wedge_primary_oblique_odd[MASK_PRIMARY_SIZE] = {
    0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  1,  2,  6,  18,
    37, 53, 60, 63, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64,
};

/* Oblique ramp, even variant: transition 1, 4, 11, 27, 46, 58, 62, 63. */
static const uint8_t wedge_primary_oblique_even[MASK_PRIMARY_SIZE] = {
    0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  1,  4,  11, 27,
    46, 58, 62, 63, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64,
};

/* Vertical ramp: transition 2, 7, 21, 43, 57, 62. */
static const uint8_t wedge_primary_vertical[MASK_PRIMARY_SIZE] = {
    0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  2,  7,  21,
    43, 57, 62, 64, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64,
};
1533
1534
DECLARE_ALIGNED(16, static uint8_t, wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]) = {
1535
    {
1536
        0,
1537
        0,
1538
        0,
1539
        0,
1540
        0,
1541
        0,
1542
        0,
1543
        0,
1544
        0,
1545
        0,
1546
        0,
1547
        0,
1548
        0,
1549
        0,
1550
        0,
1551
        0,
1552
    }, // not used
1553
    {
1554
        0,
1555
        0,
1556
        0,
1557
        0,
1558
        0,
1559
        0,
1560
        0,
1561
        0,
1562
        0,
1563
        0,
1564
        0,
1565
        0,
1566
        0,
1567
        0,
1568
        0,
1569
        0,
1570
    }, // not used
1571
    {
1572
        0,
1573
        0,
1574
        0,
1575
        0,
1576
        0,
1577
        0,
1578
        0,
1579
        0,
1580
        0,
1581
        0,
1582
        0,
1583
        0,
1584
        0,
1585
        0,
1586
        0,
1587
        0,
1588
    }, // not used
1589
    {
1590
        1,
1591
        1,
1592
        1,
1593
        1,
1594
        1,
1595
        1,
1596
        1,
1597
        1,
1598
        1,
1599
        1,
1600
        0,
1601
        1,
1602
        1,
1603
        1,
1604
        0,
1605
        1,
1606
    },
1607
    {
1608
        1,
1609
        1,
1610
        1,
1611
        1,
1612
        0,
1613
        1,
1614
        1,
1615
        1,
1616
        1,
1617
        1,
1618
        0,
1619
        1,
1620
        1,
1621
        1,
1622
        0,
1623
        1,
1624
    },
1625
    {
1626
        1,
1627
        1,
1628
        1,
1629
        1,
1630
        0,
1631
        1,
1632
        1,
1633
        1,
1634
        1,
1635
        1,
1636
        0,
1637
        1,
1638
        1,
1639
        1,
1640
        0,
1641
        1,
1642
    },
1643
    {
1644
        1,
1645
        1,
1646
        1,
1647
        1,
1648
        1,
1649
        1,
1650
        1,
1651
        1,
1652
        1,
1653
        1,
1654
        0,
1655
        1,
1656
        1,
1657
        1,
1658
        0,
1659
        1,
1660
    },
1661
    {
1662
        1,
1663
        1,
1664
        1,
1665
        1,
1666
        0,
1667
        1,
1668
        1,
1669
        1,
1670
        1,
1671
        1,
1672
        0,
1673
        1,
1674
        1,
1675
        1,
1676
        0,
1677
        1,
1678
    },
1679
    {
1680
        1,
1681
        1,
1682
        1,
1683
        1,
1684
        0,
1685
        1,
1686
        1,
1687
        1,
1688
        1,
1689
        1,
1690
        0,
1691
        1,
1692
        1,
1693
        1,
1694
        0,
1695
        1,
1696
    },
1697
    {
1698
        1,
1699
        1,
1700
        1,
1701
        1,
1702
        1,
1703
        1,
1704
        1,
1705
        1,
1706
        1,
1707
        1,
1708
        0,
1709
        1,
1710
        1,
1711
        1,
1712
        0,
1713
        1,
1714
    },
1715
    {
1716
        0,
1717
        0,
1718
        0,
1719
        0,
1720
        0,
1721
        0,
1722
        0,
1723
        0,
1724
        0,
1725
        0,
1726
        0,
1727
        0,
1728
        0,
1729
        0,
1730
        0,
1731
        0,
1732
    }, // not used
1733
    {
1734
        0,
1735
        0,
1736
        0,
1737
        0,
1738
        0,
1739
        0,
1740
        0,
1741
        0,
1742
        0,
1743
        0,
1744
        0,
1745
        0,
1746
        0,
1747
        0,
1748
        0,
1749
        0,
1750
    }, // not used
1751
    {
1752
        0,
1753
        0,
1754
        0,
1755
        0,
1756
        0,
1757
        0,
1758
        0,
1759
        0,
1760
        0,
1761
        0,
1762
        0,
1763
        0,
1764
        0,
1765
        0,
1766
        0,
1767
        0,
1768
    }, // not used
1769
    {
1770
        0,
1771
        0,
1772
        0,
1773
        0,
1774
        0,
1775
        0,
1776
        0,
1777
        0,
1778
        0,
1779
        0,
1780
        0,
1781
        0,
1782
        0,
1783
        0,
1784
        0,
1785
        0,
1786
    }, // not used
1787
    {
1788
        0,
1789
        0,
1790
        0,
1791
        0,
1792
        0,
1793
        0,
1794
        0,
1795
        0,
1796
        0,
1797
        0,
1798
        0,
1799
        0,
1800
        0,
1801
        0,
1802
        0,
1803
        0,
1804
    }, // not used
1805
    {
1806
        0,
1807
        0,
1808
        0,
1809
        0,
1810
        0,
1811
        0,
1812
        0,
1813
        0,
1814
        0,
1815
        0,
1816
        0,
1817
        0,
1818
        0,
1819
        0,
1820
        0,
1821
        0,
1822
    }, // not used
1823
    {
1824
        0,
1825
        0,
1826
        0,
1827
        0,
1828
        0,
1829
        0,
1830
        0,
1831
        0,
1832
        0,
1833
        0,
1834
        0,
1835
        0,
1836
        0,
1837
        0,
1838
        0,
1839
        0,
1840
    }, // not used
1841
    {
1842
        0,
1843
        0,
1844
        0,
1845
        0,
1846
        0,
1847
        0,
1848
        0,
1849
        0,
1850
        0,
1851
        0,
1852
        0,
1853
        0,
1854
        0,
1855
        0,
1856
        0,
1857
        0,
1858
    }, // not used
1859
    {
1860
        1,
1861
        1,
1862
        1,
1863
        1,
1864
        0,
1865
        1,
1866
        1,
1867
        1,
1868
        0,
1869
        1,
1870
        0,
1871
        1,
1872
        1,
1873
        1,
1874
        0,
1875
        1,
1876
    },
1877
    {
1878
        1,
1879
        1,
1880
        1,
1881
        1,
1882
        0,
1883
        1,
1884
        1,
1885
        1,
1886
        1,
1887
        1,
1888
        0,
1889
        1,
1890
        0,
1891
        1,
1892
        0,
1893
        1,
1894
    },
1895
    {
1896
        0,
1897
        0,
1898
        0,
1899
        0,
1900
        0,
1901
        0,
1902
        0,
1903
        0,
1904
        0,
1905
        0,
1906
        0,
1907
        0,
1908
        0,
1909
        0,
1910
        0,
1911
        0,
1912
    }, // not used
1913
    {
1914
        0,
1915
        0,
1916
        0,
1917
        0,
1918
        0,
1919
        0,
1920
        0,
1921
        0,
1922
        0,
1923
        0,
1924
        0,
1925
        0,
1926
        0,
1927
        0,
1928
        0,
1929
        0,
1930
    }, // not used
1931
};
1932
1933
static const WedgeCodeType wedge_codebook_16_hgtw[16] = {
1934
    {WEDGE_OBLIQUE27, 4, 4},
1935
    {WEDGE_OBLIQUE63, 4, 4},
1936
    {WEDGE_OBLIQUE117, 4, 4},
1937
    {WEDGE_OBLIQUE153, 4, 4},
1938
    {WEDGE_HORIZONTAL, 4, 2},
1939
    {WEDGE_HORIZONTAL, 4, 4},
1940
    {WEDGE_HORIZONTAL, 4, 6},
1941
    {WEDGE_VERTICAL, 4, 4},
1942
    {WEDGE_OBLIQUE27, 4, 2},
1943
    {WEDGE_OBLIQUE27, 4, 6},
1944
    {WEDGE_OBLIQUE153, 4, 2},
1945
    {WEDGE_OBLIQUE153, 4, 6},
1946
    {WEDGE_OBLIQUE63, 2, 4},
1947
    {WEDGE_OBLIQUE63, 6, 4},
1948
    {WEDGE_OBLIQUE117, 2, 4},
1949
    {WEDGE_OBLIQUE117, 6, 4},
1950
};
1951
1952
static const WedgeCodeType wedge_codebook_16_hltw[16] = {
1953
    {WEDGE_OBLIQUE27, 4, 4},
1954
    {WEDGE_OBLIQUE63, 4, 4},
1955
    {WEDGE_OBLIQUE117, 4, 4},
1956
    {WEDGE_OBLIQUE153, 4, 4},
1957
    {WEDGE_VERTICAL, 2, 4},
1958
    {WEDGE_VERTICAL, 4, 4},
1959
    {WEDGE_VERTICAL, 6, 4},
1960
    {WEDGE_HORIZONTAL, 4, 4},
1961
    {WEDGE_OBLIQUE27, 4, 2},
1962
    {WEDGE_OBLIQUE27, 4, 6},
1963
    {WEDGE_OBLIQUE153, 4, 2},
1964
    {WEDGE_OBLIQUE153, 4, 6},
1965
    {WEDGE_OBLIQUE63, 2, 4},
1966
    {WEDGE_OBLIQUE63, 6, 4},
1967
    {WEDGE_OBLIQUE117, 2, 4},
1968
    {WEDGE_OBLIQUE117, 6, 4},
1969
};
1970
1971
static const WedgeCodeType wedge_codebook_16_heqw[16] = {
1972
    {WEDGE_OBLIQUE27, 4, 4},
1973
    {WEDGE_OBLIQUE63, 4, 4},
1974
    {WEDGE_OBLIQUE117, 4, 4},
1975
    {WEDGE_OBLIQUE153, 4, 4},
1976
    {WEDGE_HORIZONTAL, 4, 2},
1977
    {WEDGE_HORIZONTAL, 4, 6},
1978
    {WEDGE_VERTICAL, 2, 4},
1979
    {WEDGE_VERTICAL, 6, 4},
1980
    {WEDGE_OBLIQUE27, 4, 2},
1981
    {WEDGE_OBLIQUE27, 4, 6},
1982
    {WEDGE_OBLIQUE153, 4, 2},
1983
    {WEDGE_OBLIQUE153, 4, 6},
1984
    {WEDGE_OBLIQUE63, 2, 4},
1985
    {WEDGE_OBLIQUE63, 6, 4},
1986
    {WEDGE_OBLIQUE117, 2, 4},
1987
    {WEDGE_OBLIQUE117, 6, 4},
1988
};
1989
1990
static const WedgeParamsType wedge_params_lookup[BLOCK_SIZES_ALL] = {
1991
    {0, NULL, NULL, NULL},
1992
    {0, NULL, NULL, NULL},
1993
    {0, NULL, NULL, NULL},
1994
    {4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8], wedge_masks[BLOCK_8X8]},
1995
    {4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16], wedge_masks[BLOCK_8X16]},
1996
    {4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8], wedge_masks[BLOCK_16X8]},
1997
    {4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16], wedge_masks[BLOCK_16X16]},
1998
    {4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32], wedge_masks[BLOCK_16X32]},
1999
    {4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16], wedge_masks[BLOCK_32X16]},
2000
    {4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32], wedge_masks[BLOCK_32X32]},
2001
    {0, NULL, NULL, NULL},
2002
    {0, NULL, NULL, NULL},
2003
    {0, NULL, NULL, NULL},
2004
    {0, NULL, NULL, NULL},
2005
    {0, NULL, NULL, NULL},
2006
    {0, NULL, NULL, NULL},
2007
    {0, NULL, NULL, NULL},
2008
    {0, NULL, NULL, NULL},
2009
    {4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32], wedge_masks[BLOCK_8X32]},
2010
    {4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8], wedge_masks[BLOCK_32X8]},
2011
    {0, NULL, NULL, NULL},
2012
    {0, NULL, NULL, NULL},
2013
};
2014
2015
0
// Returns non-zero when wedge_params_lookup has a non-zero bit count for `bsize`,
// i.e. when wedge masks exist for that block size.
int svt_aom_is_interintra_wedge_used(BlockSize bsize) {
    const int wedge_bits = wedge_params_lookup[bsize].bits;
    return wedge_bits > 0;
}
2018
2019
0
// Returns the number of wedge index bits recorded in wedge_params_lookup for `bsize`
// (0 for block sizes without wedge support).
int32_t svt_aom_get_wedge_bits_lookup(BlockSize bsize) {
    const WedgeParamsType* const params = &wedge_params_lookup[bsize];
    return params->bits;
}
2022
2023
0
// Returns the mask pointer stored by init_wedge_masks() for the given wedge index and
// sign of block size `bsize`. The masks are written with a row stride equal to the
// block width. No bounds checking is done on any argument.
const uint8_t* svt_aom_get_contiguous_soft_mask(int wedge_index, int wedge_sign, BlockSize bsize) {
    const WedgeParamsType* const params = &wedge_params_lookup[bsize];
    return params->masks[wedge_sign][wedge_index];
}
2026
2027
// Copies an h-row by w-byte rectangle from `src` to `dst`, honouring each buffer's
// row stride. The filter arguments exist only to match the convolve signature and
// are ignored.
static void aom_convolve_copy_c(const uint8_t* src, ptrdiff_t src_stride, uint8_t* dst, ptrdiff_t dst_stride,
                                const int16_t* filter_x, int filter_x_stride, const int16_t* filter_y,
                                int filter_y_stride, int w, int h) {
    (void)filter_x;
    (void)filter_x_stride;
    (void)filter_y;
    (void)filter_y_stride;

    for (int row = 0; row < h; ++row) {
        svt_memcpy(dst + row * dst_stride, src + row * src_stride, w);
    }
}
2041
2042
64
// Writes `width` bytes to `dst`: a copy of `src` displaced by `shift` positions, with
// the vacated positions filled by replicating the nearest edge sample of `src`.
//   shift >= 0: dst[shift..] = src[0..], leading `shift` bytes = src[0]
//   shift <  0: dst[0..] = src[-shift..], trailing `-shift` bytes = src[width - 1]
static void shift_copy(const uint8_t* src, uint8_t* dst, int shift, int width) {
    if (shift < 0) {
        const int left = -shift;
        svt_memcpy(dst, src + left, width - left);
        memset(dst + width - left, src[width - 1], left);
        return;
    }

    svt_memcpy(dst + shift, src, width - shift);
    memset(dst, src[0], shift);
}
2052
2053
0
// Returns the wedge bit count stored in wedge_params_lookup for `bsize`.
// Same lookup as svt_aom_get_wedge_bits_lookup(), with an `int` return type.
int svt_aom_get_wedge_params_bits(BlockSize bsize) {
    const WedgeParamsType* const params = &wedge_params_lookup[bsize];
    return params->bits;
}
2056
2057
#endif // USE_PRECOMPUTED_WEDGE_MASK
2058
2059
// [negative][direction]
2060
DECLARE_ALIGNED(16, static uint8_t, wedge_mask_obl[2][WEDGE_DIRECTIONS][MASK_PRIMARY_SIZE * MASK_PRIMARY_SIZE]);
2061
2062
// 4 * MAX_WEDGE_SQUARE is an easy to compute and fairly tight upper bound
2063
// on the sum of all mask sizes up to and including MAX_WEDGE_SQUARE.
2064
DECLARE_ALIGNED(16, static uint8_t, wedge_mask_buf[2 * MAX_WEDGE_TYPES * 4 * MAX_WEDGE_SQUARE]);
2065
2066
1
static void init_wedge_primary_masks() {
2067
1
    const int w      = MASK_PRIMARY_SIZE;
2068
1
    const int h      = MASK_PRIMARY_SIZE;
2069
1
    const int stride = MASK_PRIMARY_STRIDE;
2070
    // Note: index [0] stores the primary, and [1] its complement.
2071
1
#if USE_PRECOMPUTED_WEDGE_MASK
2072
    // Generate prototype by shifting the primary
2073
1
    int shift = h / 4;
2074
33
    for (int i = 0; i < h; i += 2) {
2075
32
        shift_copy(
2076
32
            wedge_primary_oblique_even, &wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride], shift, MASK_PRIMARY_SIZE);
2077
32
        shift--;
2078
32
        shift_copy(
2079
32
            wedge_primary_oblique_odd, &wedge_mask_obl[0][WEDGE_OBLIQUE63][(i + 1) * stride], shift, MASK_PRIMARY_SIZE);
2080
32
        svt_memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][i * stride],
2081
32
                   wedge_primary_vertical,
2082
32
                   MASK_PRIMARY_SIZE * sizeof(wedge_primary_vertical[0]));
2083
32
        svt_memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][(i + 1) * stride],
2084
32
                   wedge_primary_vertical,
2085
32
                   MASK_PRIMARY_SIZE * sizeof(wedge_primary_vertical[0]));
2086
32
    }
2087
#else
2088
    static const double smoother_param = 2.85;
2089
    const int           a[2]           = {2, 1};
2090
    const double        asqrt          = sqrt(a[0] * a[0] + a[1] * a[1]);
2091
    for (int i = 0; i < h; i++) {
2092
        for (int j = 0; j < w; ++j) {
2093
            int       x                                        = (2 * j + 1 - w);
2094
            int       y                                        = (2 * i + 1 - h);
2095
            double    d                                        = (a[0] * x + a[1] * y) / asqrt;
2096
            const int msk                                      = (int)rint((1.0 + tanh(d / smoother_param)) * 32);
2097
            wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j] = msk;
2098
            const int mskx                                     = (int)rint((1.0 + tanh(x / smoother_param)) * 32);
2099
            wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j]  = mskx;
2100
        }
2101
    }
2102
#endif // USE_PRECOMPUTED_WEDGE_MASK
2103
65
    for (int i = 0; i < h; ++i) {
2104
4.16k
        for (int j = 0; j < w; ++j) {
2105
4.09k
            const int msk                                      = wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j];
2106
4.09k
            wedge_mask_obl[0][WEDGE_OBLIQUE27][j * stride + i] = msk;
2107
4.09k
            wedge_mask_obl[0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
2108
4.09k
                wedge_mask_obl[0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = (1 << WEDGE_WEIGHT_BITS) - msk;
2109
4.09k
            wedge_mask_obl[1][WEDGE_OBLIQUE63][i * stride + j] = wedge_mask_obl[1][WEDGE_OBLIQUE27][j * stride + i] =
2110
4.09k
                (1 << WEDGE_WEIGHT_BITS) - msk;
2111
4.09k
            wedge_mask_obl[1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
2112
4.09k
                wedge_mask_obl[1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = msk;
2113
4.09k
            const int mskx                                      = wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j];
2114
4.09k
            wedge_mask_obl[0][WEDGE_HORIZONTAL][j * stride + i] = mskx;
2115
4.09k
            wedge_mask_obl[1][WEDGE_VERTICAL][i * stride + j]   = wedge_mask_obl[1][WEDGE_HORIZONTAL][j * stride + i] =
2116
4.09k
                (1 << WEDGE_WEIGHT_BITS) - mskx;
2117
4.09k
        }
2118
64
    }
2119
1
}
2120
2121
#if !USE_PRECOMPUTED_WEDGE_SIGN
2122
// If the signs for the wedges for various BLOCK_SIZES are
2123
// inconsistent flip the sign flag. Do it only once for every
2124
// wedge codebook.
2125
static void init_wedge_signs() {
2126
    memset(wedge_signflip_lookup, 0, sizeof(wedge_signflip_lookup));
2127
    for (BlockSize bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) {
2128
        const int               bw           = block_size_wide[bsize];
2129
        const int               bh           = block_size_high[bsize];
2130
        const wedge_params_type wedge_params = wedge_params_lookup[bsize];
2131
        const int               wbits        = wedge_params.bits;
2132
        const int               wtypes       = 1 << wbits;
2133
2134
        if (wbits) {
2135
            for (int w = 0; w < wtypes; ++w) {
2136
                // Get the mask primary, i.e. index [0]
2137
                const uint8_t* mask = get_wedge_mask_inplace(w, 0, bsize);
2138
                int            avg  = 0;
2139
                for (int i = 0; i < bw; ++i) {
2140
                    avg += mask[i];
2141
                }
2142
                for (int i = 1; i < bh; ++i) {
2143
                    avg += mask[i * MASK_PRIMARY_STRIDE];
2144
                }
2145
                avg = (avg + (bw + bh - 1) / 2) / (bw + bh - 1);
2146
                // Default sign of this wedge is 1 if the average < 32, 0 otherwise.
2147
                // If default sign is 1:
2148
                //   If sign requested is 0, we need to flip the sign and return
2149
                //   the complement i.e. index [1] instead. If sign requested is 1
2150
                //   we need to flip the sign and return index [0] instead.
2151
                // If default sign is 0:
2152
                //   If sign requested is 0, we need to return index [0] the primary
2153
                //   if sign requested is 1, we need to return the complement index [1]
2154
                //   instead.
2155
                wedge_params.signflip[w] = (avg < 32);
2156
            }
2157
        }
2158
    }
2159
}
2160
#endif // !USE_PRECOMPUTED_WEDGE_SIGN
2161
2162
288
// Returns a pointer into the primary mask wedge_mask_obl[...] positioned so that it
// can be read as the bw x bh wedge mask for (wedge_index, bsize) with a row stride of
// MASK_PRIMARY_STRIDE. `neg` selects the complement mask and is XOR-ed with the
// codebook entry's sign-flip flag. The codebook offsets are scaled by the block
// dimensions in eighths (>> 3) and subtracted from the centre of the primary mask.
static const uint8_t* get_wedge_mask_inplace(int wedge_index, int neg, BlockSize bsize) {
    const int bw = block_size_wide[bsize];
    const int bh = block_size_high[bsize];

    assert(wedge_index >= 0 && wedge_index < (1 << svt_aom_get_wedge_bits_lookup(bsize)));

    const WedgeParamsType* const params = &wedge_params_lookup[bsize];
    const WedgeCodeType* const   code   = params->codebook + wedge_index;
    const uint8_t                flip   = params->signflip[wedge_index];

    const int col_off = (code->x_offset * bw) >> 3;
    const int row_off = (code->y_offset * bh) >> 3;

    const uint8_t* const primary = wedge_mask_obl[neg ^ flip][code->direction];
    return primary + MASK_PRIMARY_STRIDE * (MASK_PRIMARY_SIZE / 2 - row_off) + MASK_PRIMARY_SIZE / 2 - col_off;
}
2176
2177
1
static void init_wedge_masks() {
2178
1
    uint8_t* dst = wedge_mask_buf;
2179
1
    memset(wedge_masks, 0, sizeof(wedge_masks));
2180
23
    for (BlockSize bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) {
2181
22
        const int              bw           = block_size_wide[bsize];
2182
22
        const int              bh           = block_size_high[bsize];
2183
22
        const WedgeParamsType* wedge_params = &wedge_params_lookup[bsize];
2184
22
        const int              wbits        = wedge_params->bits;
2185
22
        const int              wtypes       = 1 << wbits;
2186
22
        if (wbits == 0) {
2187
13
            continue;
2188
13
        }
2189
153
        for (int w = 0; w < wtypes; ++w) {
2190
144
            const uint8_t* mask;
2191
144
            mask = get_wedge_mask_inplace(w, 0, bsize);
2192
144
            aom_convolve_copy_c(mask, MASK_PRIMARY_STRIDE, dst, bw, NULL, 0, NULL, 0, bw, bh);
2193
144
            wedge_params->masks[0][w] = dst;
2194
144
            dst += bw * bh;
2195
2196
144
            mask = get_wedge_mask_inplace(w, 1, bsize);
2197
144
            aom_convolve_copy_c(mask, MASK_PRIMARY_STRIDE, dst, bw, NULL, 0, NULL, 0, bw, bh);
2198
144
            wedge_params->masks[1][w] = dst;
2199
144
            dst += bw * bh;
2200
144
        }
2201
9
        assert(sizeof(wedge_mask_buf) >= (size_t)(dst - wedge_mask_buf));
2202
9
    }
2203
1
}
2204
2205
// Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0
2206
1
// Builds all wedge tables, in dependency order:
//   1. the primary direction masks,
//   2. the sign-flip flags (only when they are not precomputed),
//   3. the packed per-block-size masks.
void svt_av1_init_wedge_masks(void) {
    init_wedge_primary_masks();
#if !USE_PRECOMPUTED_WEDGE_SIGN
    init_wedge_signs();
#endif // !USE_PRECOMPUTED_WEDGE_SIGN
    init_wedge_masks();
}
2213
2214
int svt_aom_is_masked_compound_type(COMPOUND_TYPE type);
2215
2216
/* clang-format off */
2217
static const uint8_t ii_weights1d[MAX_SB_SIZE] = {
2218
    60, 58, 56, 54, 52, 50, 48, 47, 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32,
2219
    31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16,
2220
    16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10,  9,  9,  9,  8,
2221
    8,  8,  8,  7,  7,  7,  7,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  4,  4,
2222
    4,  4,  4,  4,  4,  4,  3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  2,  2,
2223
    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1,  1,  1,
2224
    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
2225
};
2226
static const uint8_t ii_size_scales[BLOCK_SIZES_ALL] = {
2227
    32, 16, 16, 16, 8, 8, 8, 4,
2228
    4,  4,  2,  2,  2, 1, 1, 1,
2229
    8,  8,  4,  4,  2, 2
2230
};
2231
/* clang-format on */
2232
2233
40
// Writes the smooth inter-intra blending mask for `plane_bsize` / `mode` into `mask`,
// one row every `stride` bytes. Weights come from ii_weights1d, stepped by the block
// size's entry in ii_size_scales:
//   II_V_PRED      - weight depends on the row only
//   II_H_PRED      - weight depends on the column only
//   II_SMOOTH_PRED - weight depends on min(row, column)
//   II_DC_PRED and any other mode - constant weight 32
static void build_smooth_interintra_mask(uint8_t* mask, int stride, BlockSize plane_bsize, InterIntraMode mode) {
    const int width  = block_size_wide[plane_bsize];
    const int height = block_size_high[plane_bsize];
    const int step   = ii_size_scales[plane_bsize];

    uint8_t* row = mask;
    for (int y = 0; y < height; ++y, row += stride) {
        switch (mode) {
        case II_V_PRED:
            memset(row, ii_weights1d[y * step], width * sizeof(row[0]));
            break;

        case II_H_PRED:
            for (int x = 0; x < width; ++x) {
                row[x] = ii_weights1d[x * step];
            }
            break;

        case II_SMOOTH_PRED:
            for (int x = 0; x < width; ++x) {
                const int nearest = y < x ? y : x;
                row[x]            = ii_weights1d[nearest * step];
            }
            break;

        case II_DC_PRED:
        default:
            memset(row, 32, width * sizeof(row[0]));
            break;
        }
    }
}
2273
2274
// ii_masks stores the actual masks. We use smooth_ii_masks to access ii_masks so that we can index the array
2275
// directly with the bsize (BlockSize that would be passed when doing the prediction) without using the extra memory
2276
// to store empty, unused masks for the BLOCK_SIZES that don't allow inter-intra
2277
static uint8_t  ii_masks[BLOCK_32X32 - BLOCK_4X4 + 1][INTERINTRA_MODES][MAX_INTERINTRA_SB_SQUARE];
2278
static uint8_t* smooth_ii_masks[BLOCK_SIZES_ALL][INTERINTRA_MODES];
2279
2280
// Initialize the masks used for inter-intra compound blending. Inter-intra is allowed for 8x8-32x32 blocks, but
2281
// masks must be generated down to 4x4 because of chroma. The stride of each mask is the block width.
2282
1
void init_ii_masks(void) {
2283
1
    memset(smooth_ii_masks, 0 /*NULL*/, sizeof(smooth_ii_masks));
2284
11
    for (BlockSize bsize = BLOCK_4X4; bsize <= BLOCK_32X32; ++bsize) {
2285
10
        const int bw = block_size_wide[bsize];
2286
50
        for (InterIntraMode ii_mode = II_DC_PRED; ii_mode < INTERINTRA_MODES; ii_mode++) {
2287
40
            build_smooth_interintra_mask(ii_masks[bsize - BLOCK_4X4][ii_mode], bw, bsize, ii_mode);
2288
40
            smooth_ii_masks[bsize][ii_mode] = ii_masks[bsize - BLOCK_4X4][ii_mode];
2289
40
        }
2290
10
    }
2291
1
}
2292
2293
// mask stride is block width
2294
0
// Looks up the smooth inter-intra mask for (bsize, ii_mode) in smooth_ii_masks.
// The entry is NULL for block sizes init_ii_masks() did not populate.
static uint8_t* get_ii_mask(BlockSize bsize, InterIntraMode ii_mode) {
    uint8_t* const mask = smooth_ii_masks[bsize][ii_mode];
    return mask;
}
2297
2298
// High bit-depth inter-intra blend of `intrapred8` and `interpred8` into `comppred8`.
//
// - use_wedge_interintra == 0: blends with the smooth mask for (plane_bsize, mode);
//   the mask stride is the plane block width and no mask subsampling is applied.
// - use_wedge_interintra != 0: blends with the wedge mask for (wedge_index,
//   wedge_sign, bsize) when `bsize` supports wedges; otherwise nothing is written.
//   The mask is stored at `bsize` resolution, so subw / subh are set when the plane
//   block is narrower / shorter than 2 * mi_size_{wide,high}[bsize] would give.
void svt_aom_combine_interintra_highbd(InterIntraMode mode, uint8_t use_wedge_interintra, uint8_t wedge_index,
                                       uint8_t wedge_sign, BlockSize bsize, BlockSize plane_bsize, uint8_t* comppred8,
                                       int compstride, const uint8_t* interpred8, int interstride,
                                       const uint8_t* intrapred8, int intrastride, int bd) {
    const int bw = block_size_wide[plane_bsize];
    const int bh = block_size_high[plane_bsize];

    if (!use_wedge_interintra) {
        uint8_t* mask = get_ii_mask(plane_bsize, mode);
        svt_aom_highbd_blend_a64_mask(
            comppred8, compstride, intrapred8, intrastride, interpred8, interstride, mask, bw, bw, bh, 0, 0, bd);
        return;
    }

    if (!svt_aom_is_interintra_wedge_used(bsize)) {
        return;
    }

    const uint8_t* mask = svt_aom_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
    const int      subw = 2 * mi_size_wide[bsize] == bw;
    const int      subh = 2 * mi_size_high[bsize] == bh;
    svt_aom_highbd_blend_a64_mask(comppred8,
                                  compstride,
                                  intrapred8,
                                  intrastride,
                                  interpred8,
                                  interstride,
                                  mask,
                                  block_size_wide[bsize],
                                  bw,
                                  bh,
                                  subw,
                                  subh,
                                  bd);
}
2331
2332
static const uint8_t* av1_get_compound_type_mask(const InterInterCompoundData* const comp_data, uint8_t* seg_mask,
2333
0
                                                 BlockSize bsize) {
2334
0
    assert(svt_aom_is_masked_compound_type(comp_data->type));
2335
0
    (void)bsize;
2336
0
    switch (comp_data->type) {
2337
0
    case COMPOUND_WEDGE:
2338
0
        return svt_aom_get_contiguous_soft_mask(comp_data->wedge_index, comp_data->wedge_sign, bsize);
2339
0
    case COMPOUND_DIFFWTD:
2340
0
        return seg_mask;
2341
0
    default:
2342
0
        assert(0);
2343
0
        return NULL;
2344
0
    }
2345
0
}
2346
2347
void svt_aom_build_masked_compound_no_round(uint8_t* dst, int dst_stride, const CONV_BUF_TYPE* src0, int src0_stride,
2348
                                            const CONV_BUF_TYPE* src1, int src1_stride,
2349
                                            const InterInterCompoundData* const comp_data, uint8_t* seg_mask,
2350
                                            BlockSize bsize, int h, int w, ConvolveParams* conv_params,
2351
0
                                            uint8_t bit_depth, bool is_16bit) {
2352
    // Derive subsampling from h and w passed in. May be refactored to
2353
    // pass in subsampling factors directly.
2354
0
    const int      subh = (2 << mi_size_high_log2[bsize]) == h;
2355
0
    const int      subw = (2 << mi_size_wide_log2[bsize]) == w;
2356
0
    const uint8_t* mask = av1_get_compound_type_mask(comp_data, seg_mask, bsize);
2357
2358
0
    if (is_16bit) {
2359
0
        svt_aom_highbd_blend_a64_d16_mask(dst,
2360
0
                                          dst_stride,
2361
0
                                          src0,
2362
0
                                          src0_stride,
2363
0
                                          src1,
2364
0
                                          src1_stride,
2365
0
                                          mask,
2366
0
                                          block_size_wide[bsize],
2367
0
                                          w,
2368
0
                                          h,
2369
0
                                          subw,
2370
0
                                          subh,
2371
0
                                          conv_params,
2372
0
                                          bit_depth);
2373
0
    } else {
2374
0
        svt_aom_lowbd_blend_a64_d16_mask(dst,
2375
0
                                         dst_stride,
2376
0
                                         src0,
2377
0
                                         src0_stride,
2378
0
                                         src1,
2379
0
                                         src1_stride,
2380
0
                                         mask,
2381
0
                                         block_size_wide[bsize],
2382
0
                                         w,
2383
0
                                         h,
2384
0
                                         subw,
2385
0
                                         subh,
2386
0
                                         conv_params);
2387
0
    }
2388
0
}
2389
2390
0
// Computes a default reference displacement vector into `ref_dv`.
// If moving up by `mib_size` mode-info rows would leave the tile
// (mi_row - mib_size < tile->mi_row_start), the vector points left by
// MI_SIZE * mib_size + INTRABC_DELAY_PIXELS; otherwise it points up by
// MI_SIZE * mib_size. Both components are then multiplied by 8
// (presumably converting pixels to 1/8-pel units -- confirm against Mv users).
// `mi_col` is unused.
void svt_aom_find_ref_dv(Mv* ref_dv, const TileInfo* const tile, int mib_size, int mi_row, int mi_col) {
    (void)mi_col;

    const int row_above_in_tile = (mi_row - mib_size) >= tile->mi_row_start;

    ref_dv->y = row_above_in_tile ? -MI_SIZE * mib_size : 0;
    ref_dv->x = row_above_in_tile ? 0 : -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS;

    ref_dv->y *= 8;
    ref_dv->x *= 8;
}
2402
#if CONFIG_ENABLE_OBMC
2403
0
// Returns 1 when the OBMC prediction should be skipped for the plane block obtained
// from `bsize` with the given subsampling, 0 otherwise.
// Only plane blocks of size 4X4, 8X4 or 4X8 can be skipped:
//   - with DISABLE_CHROMA_U8X8_OBMC they are always skipped;
//   - otherwise they are skipped only when dir == 0.
int svt_av1_skip_u4x4_pred_in_obmc(BlockSize bsize, int dir, int subsampling_x, int subsampling_y) {
    assert(is_motion_variation_allowed_bsize(bsize));

    const BlockSize bsize_plane = get_plane_block_size(bsize, subsampling_x, subsampling_y);
    const int       is_small    = bsize_plane == BLOCK_4X4 || bsize_plane == BLOCK_8X4 || bsize_plane == BLOCK_4X8;
    if (!is_small) {
        return 0;
    }

#if DISABLE_CHROMA_U8X8_OBMC
    (void)dir;
    return 1;
#else
    return dir == 0;
#endif
}
2425
#endif
2426
2427
0
#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
2428
2429
/**
2430
 * Computes SSE of a compound predictor constructed from 2 fundamental
2431
 * predictors p0 and p1 using blending with mask.
2432
 *
2433
 * r1:  Residuals of p1.
2434
 *      (source - p1)
2435
 * d:   Difference of p1 and p0.
2436
 *      (p1 - p0)
2437
 * m:   The blending mask
2438
 * N:   Number of pixels
2439
 *
2440
 * 'r1', 'd', and 'm' are contiguous.
2441
 *
2442
 * Computes:
2443
 *  Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to:
2444
 *  Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2),
2445
 *    where r0 is (source - p0), and r1 is (source - p1), which is in turn
2446
 *    is equivalent to:
2447
 *  Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2),
2448
 *    which is the SSE of the residuals of the compound predictor scaled up by
2449
 *    MAX_MASK_VALUE**2.
2450
 *
2451
 * Note that we clamp the partial term in the loop to 16 bits signed. This is
2452
 * to facilitate equivalent SIMD implementation. It should have no effect if
2453
 * residuals are within 16 - WEDGE_WEIGHT_BITS (=10) signed, which always
2454
 * holds for 8 bit input, and on real input, it should hold practically always,
2455
 * as residuals are expected to be small.
2456
 */
2457
0
uint64_t svt_av1_wedge_sse_from_residuals_c(const int16_t* r1, const int16_t* d, const uint8_t* m, int N) {
2458
0
    uint64_t csse = 0;
2459
2460
0
    for (int i = 0; i < N; i++) {
2461
0
        int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
2462
0
        t         = clamp(t, INT16_MIN, INT16_MAX);
2463
0
        csse += t * t;
2464
0
    }
2465
0
    return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
2466
0
}
2467
2468
// 8-bit inter-intra blend of `intrapred` and `interpred` into `comppred`.
//
// - use_wedge_interintra == 0: blends with the smooth mask for (plane_bsize, mode);
//   the mask stride is the plane block width and no mask subsampling is applied.
// - use_wedge_interintra != 0: blends with the wedge mask for (wedge_index,
//   wedge_sign, bsize) when `bsize` supports wedges; otherwise nothing is written.
//   The mask is stored at `bsize` resolution, so subw / subh are set when the plane
//   block is narrower / shorter than 2 * mi_size_{wide,high}[bsize] would give.
void svt_aom_combine_interintra(InterIntraMode mode, int8_t use_wedge_interintra, int wedge_index, int wedge_sign,
                                BlockSize bsize, BlockSize plane_bsize, uint8_t* comppred, int compstride,
                                const uint8_t* interpred, int interstride, const uint8_t* intrapred, int intrastride) {
    const int bw = block_size_wide[plane_bsize];
    const int bh = block_size_high[plane_bsize];

    if (!use_wedge_interintra) {
        uint8_t* mask = get_ii_mask(plane_bsize, mode);
        svt_aom_blend_a64_mask(
            comppred, compstride, intrapred, intrastride, interpred, interstride, mask, bw, bw, bh, 0, 0);
        return;
    }

    if (!svt_aom_is_interintra_wedge_used(bsize)) {
        return;
    }

    const uint8_t* mask = svt_aom_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
    const int      subw = 2 * mi_size_wide[bsize] == bw;
    const int      subh = 2 * mi_size_high[bsize] == bh;
    svt_aom_blend_a64_mask(comppred,
                           compstride,
                           intrapred,
                           intrastride,
                           interpred,
                           interstride,
                           mask,
                           block_size_wide[bsize],
                           bw,
                           bh,
                           subw,
                           subh);
}
2499
2500
// Blends two 16-bit sample planes with a horizontal mask: every row uses the same
// per-column weights mask[0..w-1], combined through AOM_BLEND_A64(mask, src0, src1).
// `dst` may alias `src0` or `src1` as long as the strides match (asserted).
// `w` and `h` must be powers of two >= 1; `bd` is only validated, not used.
void svt_aom_highbd_blend_a64_hmask_16bit_c(uint16_t* dst, uint32_t dst_stride, const uint16_t* src0,
                                            uint32_t src0_stride, const uint16_t* src1, uint32_t src1_stride,
                                            const uint8_t* mask, int w, int h, int bd) {
    (void)bd;

    assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
    assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

    assert(h >= 1);
    assert(w >= 1);
    assert(IS_POWER_OF_TWO(h));
    assert(IS_POWER_OF_TWO(w));

    assert(bd == 8 || bd == 10 || bd == 12);

    for (int row = 0; row < h; ++row) {
        uint16_t* const       out = dst + row * dst_stride;
        const uint16_t* const in0 = src0 + row * src0_stride;
        const uint16_t* const in1 = src1 + row * src1_stride;
        for (int col = 0; col < w; ++col) {
            out[col] = AOM_BLEND_A64(mask[col], in0[col], in1[col]);
        }
    }
}
2521
2522
0
/**
 * Returns the sum of the squares of `n` signed 16-bit samples.
 *
 * @param src  Pointer to `n` contiguous int16_t values (not dereferenced when n == 0).
 * @param n    Number of samples.
 * @return     Sum of src[i] * src[i] over all i, accumulated in 64 bits; 0 when n == 0.
 *
 * A counted loop is used so that n == 0 is handled: a `do { } while (--n)` form
 * would wrap the unsigned counter and read far past the end of `src`.
 */
uint64_t svt_aom_sum_squares_i16_c(const int16_t* src, uint32_t n) {
    uint64_t ss = 0;
    for (uint32_t i = 0; i < n; ++i) {
        // |v| <= 32768, so v * v <= 2^30 and cannot overflow a 32-bit int.
        const int32_t v = src[i];
        ss += (uint64_t)(v * v);
    }
    return ss;
}
2531
2532
// obmc_mask_N[overlap_position]
2533
static const uint8_t obmc_mask_1[1]                      = {64};
2534
DECLARE_ALIGNED(2, static const uint8_t, obmc_mask_2[2]) = {45, 64};
2535
2536
DECLARE_ALIGNED(4, static const uint8_t, obmc_mask_4[4]) = {39, 50, 59, 64};
2537
2538
static const uint8_t obmc_mask_8[8] = {36, 42, 48, 53, 57, 61, 64, 64};
2539
2540
static const uint8_t obmc_mask_16[16] = {34, 37, 40, 43, 46, 49, 52, 54, 56, 58, 60, 61, 64, 64, 64, 64};
2541
2542
static const uint8_t obmc_mask_32[32] = {33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55,
2543
                                         56, 57, 58, 59, 60, 60, 61, 62, 64, 64, 64, 64, 64, 64, 64, 64};
2544
2545
0
const uint8_t* svt_av1_get_obmc_mask(int length) {
2546
0
    switch (length) {
2547
0
    case 1:
2548
0
        return obmc_mask_1;
2549
0
    case 2:
2550
0
        return obmc_mask_2;
2551
0
    case 4:
2552
0
        return obmc_mask_4;
2553
0
    case 8:
2554
0
        return obmc_mask_8;
2555
0
    case 16:
2556
0
        return obmc_mask_16;
2557
0
    case 32:
2558
0
        return obmc_mask_32;
2559
0
    default:
2560
0
        assert(0);
2561
0
        return NULL;
2562
0
    }
2563
0
}
2564
2565
0
int16_t svt_aom_mode_context_analyzer(int16_t mode_context, const MvReferenceFrame* const rf) {
2566
0
    static unsigned svt_aom_compound_mode_ctx_map[3][COMP_NEWMV_CTXS] = {
2567
0
        {0, 1, 1, 1, 1},
2568
0
        {1, 2, 3, 4, 4},
2569
0
        {4, 4, 5, 6, 7},
2570
0
    };
2571
2572
0
    if (rf[1] <= INTRA_FRAME) {
2573
0
        return mode_context;
2574
0
    }
2575
2576
0
    const unsigned newmv_ctx = mode_context & NEWMV_CTX_MASK;
2577
0
    const unsigned refmv_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
2578
0
    assert((refmv_ctx >> 1) < 3);
2579
0
    const unsigned comp_ctx = svt_aom_compound_mode_ctx_map[refmv_ctx >> 1][AOMMIN(newmv_ctx, COMP_NEWMV_CTXS - 1)];
2580
0
    return comp_ctx;
2581
0
}