Coverage Report

Created: 2026-05-16 06:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/svt-av1/Source/Lib/Codec/convolve.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10
 */
11
12
#include <assert.h>
13
#include "convolve.h"
14
#include "common_dsp_rtcd.h"
15
16
// Note: Fixed size intermediate buffers, place limits on parameters
17
// of some functions. 2d filtering proceeds in 2 steps:
18
//   (1) Interpolate horizontally into an intermediate buffer, temp.
19
//   (2) Interpolate temp vertically to derive the sub-pixel result.
20
// Deriving the maximum number of rows in the temp buffer (263):
21
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
22
// --Largest block size is 128x128 pixels.
23
// --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
24
//   original frame (in 1/16th pixel units).
25
// --Must round-up because block may be located at sub-pixel position.
26
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
27
// --((128 - 1) * 32 + 15) >> 4 + 8 = 263.
28
#define WIENER_MAX_EXT_SIZE 263
29
30
0
// Multiply-accumulate of SUBPEL_TAPS consecutive 8-bit samples starting at `a`
// with the SUBPEL_TAPS coefficients starting at `b`. Returns the 32-bit sum.
static INLINE int32_t svt_aom_horz_scalar_product(const uint8_t* a, const int16_t* b) {
    int32_t              acc  = 0;
    const uint8_t* const last = a + SUBPEL_TAPS;
    // Walk samples and coefficients in lock-step, first tap to last.
    for (const uint8_t* sample = a; sample != last; ++sample, ++b) {
        acc += *sample * *b;
    }
    return acc;
}
37
38
0
// Multiply-accumulate of SUBPEL_TAPS consecutive 16-bit samples starting at `a`
// with the SUBPEL_TAPS coefficients starting at `b`. Returns the 32-bit sum.
static INLINE int32_t svt_aom_highbd_horz_scalar_product(const uint16_t* a, const int16_t* b) {
    int32_t               acc  = 0;
    const uint16_t* const last = a + SUBPEL_TAPS;
    // Walk samples and coefficients in lock-step, first tap to last.
    for (const uint16_t* sample = a; sample != last; ++sample, ++b) {
        acc += *sample * *b;
    }
    return acc;
}
45
46
// Multiply-accumulate of SUBPEL_TAPS 16-bit samples taken from `a` at a spacing
// of `a_stride` elements with the SUBPEL_TAPS coefficients starting at `b`.
static INLINE int32_t highbd_vert_scalar_product(const uint16_t* a, ptrdiff_t a_stride, const int16_t* b) {
    int32_t acc = 0;
    int32_t tap = 0;
    while (tap < SUBPEL_TAPS) {
        // Sample for this tap sits `tap` strides away from the first one.
        acc += a[tap * a_stride] * b[tap];
        ++tap;
    }
    return acc;
}
53
54
0
// Returns the address of `filter` with its low 8 bits cleared, reinterpreted as
// a pointer to the first kernel of the containing filter table.
static const InterpKernel* svt_aom_get_filter_base(const int16_t* filter) {
    // NOTE: This assumes that the filter table is 256-byte aligned.
    const intptr_t low_byte_mask = (intptr_t)0xFF;
    const intptr_t address       = (intptr_t)filter;
    return (const InterpKernel*)(address & ~low_byte_mask);
}
58
59
0
// Returns how many InterpKernel entries `f` lies past `base`, i.e. the index of
// the kernel `f` within the table that starts at `base`.
static int32_t svt_aom_get_filter_offset(const int16_t* f, const InterpKernel* base) {
    const InterpKernel* const kernel = (const InterpKernel*)(intptr_t)f;
    return (int32_t)(kernel - base);
}
62
63
static void svt_aom_convolve_add_src_horiz_hip(const uint8_t* src, ptrdiff_t src_stride, uint16_t* dst,
64
                                               ptrdiff_t dst_stride, const InterpKernel* x_filters, int32_t x0_q4,
65
0
                                               int32_t x_step_q4, int32_t w, int32_t h, int32_t round0_bits) {
66
0
    const int32_t bd = 8;
67
0
    src -= SUBPEL_TAPS / 2 - 1;
68
0
    for (int32_t y = 0; y < h; ++y) {
69
0
        int32_t x_q4 = x0_q4;
70
0
        for (int32_t x = 0; x < w; ++x) {
71
0
            const uint8_t* const src_x    = &src[x_q4 >> SUBPEL_BITS];
72
0
            const int16_t* const x_filter = x_filters[x_q4 & SUBPEL_MASK];
73
0
            const int32_t        rounding = ((int32_t)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
74
0
                (1 << (bd + FILTER_BITS - 1));
75
0
            const int32_t sum = svt_aom_horz_scalar_product(src_x, x_filter) + rounding;
76
0
            dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
77
0
            x_q4 += x_step_q4;
78
0
        }
79
0
        src += src_stride;
80
0
        dst += dst_stride;
81
0
    }
82
0
}
83
84
static void svt_aom_convolve_add_src_vert_hip(const uint16_t* src, ptrdiff_t src_stride, uint8_t* dst,
85
                                              ptrdiff_t dst_stride, const InterpKernel* y_filters, int32_t y0_q4,
86
0
                                              int32_t y_step_q4, int32_t w, int32_t h, int32_t round1_bits) {
87
0
    const int32_t bd = 8;
88
0
    src -= src_stride * (SUBPEL_TAPS / 2 - 1);
89
90
0
    for (int32_t x = 0; x < w; ++x) {
91
0
        int32_t y_q4 = y0_q4;
92
0
        for (int32_t y = 0; y < h; ++y) {
93
0
            const uint16_t*      src_y    = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
94
0
            const int16_t* const y_filter = y_filters[y_q4 & SUBPEL_MASK];
95
0
            const int32_t        rounding = ((int32_t)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
96
0
                (1 << (bd + round1_bits - 1));
97
0
            const int32_t sum   = highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
98
0
            dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
99
0
            y_q4 += y_step_q4;
100
0
        }
101
0
        ++src;
102
0
        ++dst;
103
0
    }
104
0
}
105
106
void svt_av1_wiener_convolve_add_src_c(const uint8_t* const src, const ptrdiff_t src_stride, uint8_t* const dst,
107
                                       const ptrdiff_t dst_stride, const int16_t* const filter_x,
108
                                       const int16_t* const filter_y, const int32_t w, const int32_t h,
109
0
                                       const ConvolveParams* const conv_params) {
110
0
    const int32_t             x_step_q4 = 16;
111
0
    const int32_t             y_step_q4 = 16;
112
0
    const InterpKernel* const filters_x = svt_aom_get_filter_base(filter_x);
113
0
    const int32_t             x0_q4     = svt_aom_get_filter_offset(filter_x, filters_x);
114
115
0
    const InterpKernel* const filters_y = svt_aom_get_filter_base(filter_y);
116
0
    const int32_t             y0_q4     = svt_aom_get_filter_offset(filter_y, filters_y);
117
118
0
    uint16_t      temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
119
0
    const int32_t intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
120
121
    // The last row is set to 0 to address an uninitialized memory access when
122
    // using the "C" code path.  In vert_scalar_product, where the wiener filter is applied to the pixels,
123
    // the bottom-edge pixels will need 3 padded pixels to perform a 7-tap filter. However, the filter is applied
124
    // over 8 (SUBPEL_TAPS) pixels, with the final 8th weight being zero. Therefore, the extra bottom-most pixel
125
    // will not affect the result, but will cause a sanitizer failure if not initialized.
126
0
    memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
127
128
0
    assert(w <= MAX_SB_SIZE);
129
0
    assert(h <= MAX_SB_SIZE);
130
0
    assert(y_step_q4 <= 32);
131
0
    assert(x_step_q4 <= 32);
132
133
0
    svt_aom_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
134
0
                                       src_stride,
135
0
                                       temp,
136
0
                                       MAX_SB_SIZE,
137
0
                                       filters_x,
138
0
                                       x0_q4,
139
0
                                       x_step_q4,
140
0
                                       w,
141
0
                                       intermediate_height,
142
0
                                       conv_params->round_0);
143
0
    svt_aom_convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
144
0
                                      MAX_SB_SIZE,
145
0
                                      dst,
146
0
                                      dst_stride,
147
0
                                      filters_y,
148
0
                                      y0_q4,
149
0
                                      y_step_q4,
150
0
                                      w,
151
0
                                      h,
152
0
                                      conv_params->round_1);
153
0
}
154
155
static void svt_aom_highbd_convolve_add_src_horiz_hip(const uint8_t* src8, ptrdiff_t src_stride, uint16_t* dst,
156
                                                      ptrdiff_t dst_stride, const InterpKernel* x_filters,
157
                                                      int32_t x0_q4, int32_t x_step_q4, int32_t w, int32_t h,
158
0
                                                      int32_t round0_bits, int32_t bd) {
159
0
    const int32_t extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
160
0
    uint16_t*     src                   = CONVERT_TO_SHORTPTR(src8);
161
0
    src -= SUBPEL_TAPS / 2 - 1;
162
0
    for (int32_t y = 0; y < h; ++y) {
163
0
        int32_t x_q4 = x0_q4;
164
0
        for (int32_t x = 0; x < w; ++x) {
165
0
            const uint16_t* const src_x    = &src[x_q4 >> SUBPEL_BITS];
166
0
            const int16_t* const  x_filter = x_filters[x_q4 & SUBPEL_MASK];
167
0
            const int32_t         rounding = ((int32_t)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
168
0
                (1 << (bd + FILTER_BITS - 1));
169
0
            const int32_t sum = svt_aom_highbd_horz_scalar_product(src_x, x_filter) + rounding;
170
0
            dst[x]            = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, extraprec_clamp_limit - 1);
171
0
            x_q4 += x_step_q4;
172
0
        }
173
0
        src += src_stride;
174
0
        dst += dst_stride;
175
0
    }
176
0
}
177
178
static void svt_aom_highbd_convolve_add_src_vert_hip(const uint16_t* src, ptrdiff_t src_stride, uint8_t* dst8,
179
                                                     ptrdiff_t dst_stride, const InterpKernel* y_filters, int32_t y0_q4,
180
                                                     int32_t y_step_q4, int32_t w, int32_t h, int32_t round1_bits,
181
0
                                                     int32_t bd) {
182
0
    uint16_t* dst = CONVERT_TO_SHORTPTR(dst8);
183
0
    src -= src_stride * (SUBPEL_TAPS / 2 - 1);
184
0
    for (int32_t x = 0; x < w; ++x) {
185
0
        int32_t y_q4 = y0_q4;
186
0
        for (int32_t y = 0; y < h; ++y) {
187
0
            const uint16_t*      src_y    = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
188
0
            const int16_t* const y_filter = y_filters[y_q4 & SUBPEL_MASK];
189
0
            const int32_t        rounding = ((int32_t)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
190
0
                (1 << (bd + round1_bits - 1));
191
0
            const int32_t sum   = highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
192
0
            dst[y * dst_stride] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
193
0
            y_q4 += y_step_q4;
194
0
        }
195
0
        ++src;
196
0
        ++dst;
197
0
    }
198
0
}
199
200
void svt_av1_highbd_wiener_convolve_add_src_c(const uint8_t* const src, const ptrdiff_t src_stride, uint8_t* const dst,
201
                                              const ptrdiff_t dst_stride, const int16_t* const filter_x,
202
                                              const int16_t* const filter_y, const int32_t w, const int32_t h,
203
0
                                              const ConvolveParams* const conv_params, const int32_t bd) {
204
0
    const int32_t             x_step_q4 = 16;
205
0
    const int32_t             y_step_q4 = 16;
206
0
    const InterpKernel* const filters_x = svt_aom_get_filter_base(filter_x);
207
0
    const int32_t             x0_q4     = svt_aom_get_filter_offset(filter_x, filters_x);
208
209
0
    const InterpKernel* const filters_y = svt_aom_get_filter_base(filter_y);
210
0
    const int32_t             y0_q4     = svt_aom_get_filter_offset(filter_y, filters_y);
211
212
0
    uint16_t      temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
213
0
    const int32_t intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
214
215
0
    assert(w <= MAX_SB_SIZE);
216
0
    assert(h <= MAX_SB_SIZE);
217
0
    assert(y_step_q4 <= 32);
218
0
    assert(x_step_q4 <= 32);
219
0
    assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
220
221
0
    svt_aom_highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
222
0
                                              src_stride,
223
0
                                              temp,
224
0
                                              MAX_SB_SIZE,
225
0
                                              filters_x,
226
0
                                              x0_q4,
227
0
                                              x_step_q4,
228
0
                                              w,
229
0
                                              intermediate_height,
230
0
                                              conv_params->round_0,
231
0
                                              bd);
232
0
    svt_aom_highbd_convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
233
0
                                             MAX_SB_SIZE,
234
0
                                             dst,
235
0
                                             dst_stride,
236
0
                                             filters_y,
237
0
                                             y0_q4,
238
0
                                             y_step_q4,
239
0
                                             w,
240
0
                                             h,
241
0
                                             conv_params->round_1,
242
0
                                             bd);
243
0
}
244
245
0
static INLINE int vert_scalar_product(const uint8_t* a, ptrdiff_t a_stride, const int16_t* b) {
246
0
    int sum = 0;
247
0
    for (int k = 0; k < SUBPEL_TAPS; ++k) {
248
0
        sum += a[k * a_stride] * b[k];
249
0
    }
250
0
    return sum;
251
0
}
252
253
static void svt_aom_convolve_horiz(const uint8_t* src, ptrdiff_t src_stride, uint8_t* dst, ptrdiff_t dst_stride,
254
0
                                   const InterpKernel* x_filters, int x0_q4, int x_step_q4, int w, int h) {
255
0
    src -= SUBPEL_TAPS / 2 - 1;
256
0
    for (int y = 0; y < h; ++y) {
257
0
        int x_q4 = x0_q4;
258
0
        for (int x = 0; x < w; ++x) {
259
0
            const uint8_t* const src_x    = &src[x_q4 >> SUBPEL_BITS];
260
0
            const int16_t* const x_filter = x_filters[x_q4 & SUBPEL_MASK];
261
0
            const int            sum      = svt_aom_horz_scalar_product(src_x, x_filter);
262
0
            dst[x]                        = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
263
0
            x_q4 += x_step_q4;
264
0
        }
265
0
        src += src_stride;
266
0
        dst += dst_stride;
267
0
    }
268
0
}
269
270
static void svt_aom_convolve_vert(const uint8_t* src, ptrdiff_t src_stride, uint8_t* dst, ptrdiff_t dst_stride,
271
0
                                  const InterpKernel* y_filters, int y0_q4, int y_step_q4, int w, int h) {
272
0
    src -= src_stride * (SUBPEL_TAPS / 2 - 1);
273
274
0
    for (int x = 0; x < w; ++x) {
275
0
        int y_q4 = y0_q4;
276
0
        for (int y = 0; y < h; ++y) {
277
0
            const unsigned char* src_y    = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
278
0
            const int16_t* const y_filter = y_filters[y_q4 & SUBPEL_MASK];
279
0
            const int            sum      = vert_scalar_product(src_y, src_stride, y_filter);
280
0
            dst[y * dst_stride]           = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
281
0
            y_q4 += y_step_q4;
282
0
        }
283
0
        ++src;
284
0
        ++dst;
285
0
    }
286
0
}
287
288
void svt_aom_convolve8_horiz_c(const uint8_t* src, ptrdiff_t src_stride, uint8_t* dst, ptrdiff_t dst_stride,
289
                               const int16_t* filter_x, int x_step_q4, const int16_t* filter_y, int y_step_q4, int w,
290
0
                               int h) {
291
0
    const InterpKernel* const filters_x = svt_aom_get_filter_base(filter_x);
292
0
    const int                 x0_q4     = svt_aom_get_filter_offset(filter_x, filters_x);
293
294
0
    (void)filter_y;
295
0
    (void)y_step_q4;
296
297
0
    svt_aom_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, w, h);
298
0
}
299
300
void svt_aom_convolve8_vert_c(const uint8_t* src, ptrdiff_t src_stride, uint8_t* dst, ptrdiff_t dst_stride,
301
                              const int16_t* filter_x, int x_step_q4, const int16_t* filter_y, int y_step_q4, int w,
302
0
                              int h) {
303
0
    const InterpKernel* const filters_y = svt_aom_get_filter_base(filter_y);
304
0
    const int                 y0_q4     = svt_aom_get_filter_offset(filter_y, filters_y);
305
306
0
    (void)filter_x;
307
0
    (void)x_step_q4;
308
309
0
    svt_aom_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, w, h);
310
0
}