/work/svt-av1/Source/Lib/Codec/convolve.c

Source
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
 */

#include <assert.h>
#include "convolve.h"
#include "common_dsp_rtcd.h"

// Note: Fixed size intermediate buffers, place limits on parameters
// of some functions. 2d filtering proceeds in 2 steps:
//   (1) Interpolate horizontally into an intermediate buffer, temp.
//   (2) Interpolate temp vertically to derive the sub-pixel result.
// Deriving the maximum number of rows in the temp buffer (263):
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
// --Largest block size is 128x128 pixels.
// --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
//   original frame (in 1/16th pixel units).
// --Must round-up because block may be located at sub-pixel position.
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --((128 - 1) * 32 + 15) >> 4 + 8 = 263.
#define WIENER_MAX_EXT_SIZE 263

// Returns the dot product of SUBPEL_TAPS consecutive 8-bit samples starting
// at `a` with the SUBPEL_TAPS filter coefficients starting at `b`.
static INLINE int32_t svt_aom_horz_scalar_product(const uint8_t* a, const int16_t* b) {
    int32_t              acc      = 0;
    const uint8_t*       sample   = a;
    const int16_t*       tap      = b;
    const int16_t* const taps_end = b + SUBPEL_TAPS;
    while (tap != taps_end) {
        acc += (*sample) * (*tap);
        ++sample;
        ++tap;
    }
    return acc;
}

// 16-bit sample variant of the horizontal dot product: multiplies
// SUBPEL_TAPS consecutive samples from `a` by the coefficients in `b` and
// returns the accumulated sum.
static INLINE int32_t svt_aom_highbd_horz_scalar_product(const uint16_t* a, const int16_t* b) {
    int32_t total = 0;
    int32_t tap   = 0;
    while (tap < SUBPEL_TAPS) {
        total += a[tap] * b[tap];
        ++tap;
    }
    return total;
}

// Vertical dot product over 16-bit samples: sample k is read at
// a[k * a_stride] and multiplied by b[k], for k in [0, SUBPEL_TAPS).
static INLINE int32_t highbd_vert_scalar_product(const uint16_t* a, ptrdiff_t a_stride, const int16_t* b) {
    int32_t   acc    = 0;
    int32_t   tap    = 0;
    ptrdiff_t offset = 0; // always equals tap * a_stride
    while (tap < SUBPEL_TAPS) {
        acc += a[offset] * b[tap];
        offset += a_stride;
        ++tap;
    }
    return acc;
}

// Recovers the start of the kernel table that `filter` points into by
// clearing the low 8 bits of its address.
// NOTE: This assumes that the filter table is 256-byte aligned.
static const InterpKernel* svt_aom_get_filter_base(const int16_t* filter) {
    const intptr_t low_bits = (intptr_t)0xFF;
    const intptr_t address  = (intptr_t)filter;
    return (const InterpKernel*)(address & ~low_bits);
}

// Returns the index of the kernel `f` within the table starting at `base`,
// i.e. the distance between the two pointers measured in InterpKernel units.
static int32_t svt_aom_get_filter_offset(const int16_t* f, const InterpKernel* base) {
    const InterpKernel* const kernel = (const InterpKernel*)(intptr_t)f;
    const ptrdiff_t           index  = kernel - base;
    return (int32_t)index;
}

// Horizontal pass of the 8-bit "add source" convolution.
// For each of the h rows and w columns, the SUBPEL_TAPS-sample window
// selected by the running q4 position is filtered with the kernel chosen by
// that position's sub-pel phase. The window's center sample (scaled by
// 1 << FILTER_BITS) and an offset of 1 << (bd + FILTER_BITS - 1) are added,
// and the result is rounded by round0_bits and clamped to
// [0, WIENER_CLAMP_LIMIT(round0_bits, bd) - 1] before being stored as uint16_t.
static void svt_aom_convolve_add_src_horiz_hip(const uint8_t* src, ptrdiff_t src_stride, uint16_t* dst,
                                               ptrdiff_t dst_stride, const InterpKernel* x_filters, int32_t x0_q4,
                                               int32_t x_step_q4, int32_t w, int32_t h, int32_t round0_bits) {
    const int32_t bd         = 8;
    const int32_t center_tap = SUBPEL_TAPS / 2 - 1;
    const int32_t bias       = 1 << (bd + FILTER_BITS - 1);
    const int32_t clamp_max  = WIENER_CLAMP_LIMIT(round0_bits, bd) - 1;

    // Step back so that the filter window is centered on the requested sample.
    const uint8_t* row_in  = src - center_tap;
    uint16_t*      row_out = dst;

    for (int32_t row = 0; row < h; ++row) {
        int32_t pos_q4 = x0_q4;
        for (int32_t col = 0; col < w; ++col) {
            const uint8_t* const window = row_in + (pos_q4 >> SUBPEL_BITS);
            const int16_t* const taps   = x_filters[pos_q4 & SUBPEL_MASK];
            const int32_t        center = (int32_t)window[center_tap] << FILTER_BITS;
            const int32_t        acc    = svt_aom_horz_scalar_product(window, taps) + (center + bias);
            row_out[col]                = (uint16_t)clamp(ROUND_POWER_OF_TWO(acc, round0_bits), 0, clamp_max);
            pos_q4 += x_step_q4;
        }
        row_in += src_stride;
        row_out += dst_stride;
    }
}

// Vertical pass of the 8-bit "add source" convolution.
// Reads the 16-bit intermediate produced by the horizontal pass, column by
// column. Each output is the vertical dot product plus the window's center
// sample scaled by 1 << FILTER_BITS, minus 1 << (bd + round1_bits - 1); the
// sum is rounded by round1_bits and clipped with clip_pixel().
static void svt_aom_convolve_add_src_vert_hip(const uint16_t* src, ptrdiff_t src_stride, uint8_t* dst,
                                              ptrdiff_t dst_stride, const InterpKernel* y_filters, int32_t y0_q4,
                                              int32_t y_step_q4, int32_t w, int32_t h, int32_t round1_bits) {
    const int32_t   bd            = 8;
    const ptrdiff_t center_offset = (SUBPEL_TAPS / 2 - 1) * src_stride;
    const int32_t   bias          = 1 << (bd + round1_bits - 1);

    // Step up so that the filter window is centered on the requested row.
    const uint16_t* col_in  = src - src_stride * (SUBPEL_TAPS / 2 - 1);
    uint8_t*        col_out = dst;

    for (int32_t col = 0; col < w; ++col) {
        int32_t pos_q4 = y0_q4;
        for (int32_t row = 0; row < h; ++row) {
            const uint16_t* const window = col_in + (pos_q4 >> SUBPEL_BITS) * src_stride;
            const int16_t* const  taps   = y_filters[pos_q4 & SUBPEL_MASK];
            const int32_t         center = (int32_t)window[center_offset] << FILTER_BITS;
            const int32_t         acc    = highbd_vert_scalar_product(window, src_stride, taps) + (center - bias);
            col_out[row * dst_stride]    = clip_pixel(ROUND_POWER_OF_TWO(acc, round1_bits));
            pos_q4 += y_step_q4;
        }
        ++col_in;
        ++col_out;
    }
}

// C reference for the 8-bit Wiener "add source" separable convolution.
//
// Runs a horizontal pass from `src` into a 16-bit stack buffer, then a
// vertical pass from that buffer into `dst`.
//
//   src, src_stride   : 8-bit input block and its stride; rows above and
//                       below the block are read for the vertical filter tails.
//   dst, dst_stride   : 8-bit output block and its stride.
//   filter_x/filter_y : pointers to kernels inside a kernel table; the table
//                       base and starting phase are recovered from the pointer
//                       value (see svt_aom_get_filter_base/_offset).
//   w, h              : block size; each must be <= MAX_SB_SIZE (asserted).
//   conv_params       : supplies round_0 (horizontal) and round_1 (vertical)
//                       rounding shifts.
void svt_av1_wiener_convolve_add_src_c(const uint8_t* const src, const ptrdiff_t src_stride, uint8_t* const dst,
                                       const ptrdiff_t dst_stride, const int16_t* const filter_x,
                                       const int16_t* const filter_y, const int32_t w, const int32_t h,
                                       const ConvolveParams* const conv_params) {
    const int32_t             x_step_q4 = 16;
    const int32_t             y_step_q4 = 16;
    const InterpKernel* const filters_x = svt_aom_get_filter_base(filter_x);
    const int32_t             x0_q4     = svt_aom_get_filter_offset(filter_x, filters_x);

    const InterpKernel* const filters_y = svt_aom_get_filter_base(filter_y);
    const int32_t             y0_q4     = svt_aom_get_filter_offset(filter_y, filters_y);

    uint16_t      temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
    const int32_t intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;

    assert(w <= MAX_SB_SIZE);
    assert(h <= MAX_SB_SIZE);
    assert(y_step_q4 <= 32);
    assert(x_step_q4 <= 32);

    // The last row is set to 0 to address an uninitialized memory access when
    // using the "C" code path.  In the vertical scalar product, where the wiener filter is applied to the pixels,
    // the bottom-edge pixels will need 3 padded pixels to perform a 7-tap filter. However, the filter is applied
    // over 8 (SUBPEL_TAPS) pixels, with the final 8th weight being zero. Therefore, the extra bottom-most pixel
    // will not affect the result, but will cause a sanitizer failure if not initialized.
    // memset() counts bytes, so scale by the element size to clear the whole
    // MAX_SB_SIZE-element row rather than only its first half.
    memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE * sizeof(*temp));

    // Horizontal pass: start SUBPEL_TAPS / 2 - 1 rows above the block so the
    // vertical pass has the rows it needs for its upper filter tail.
    svt_aom_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                                       src_stride,
                                       temp,
                                       MAX_SB_SIZE,
                                       filters_x,
                                       x0_q4,
                                       x_step_q4,
                                       w,
                                       intermediate_height,
                                       conv_params->round_0);
    // Vertical pass: begin at the intermediate row that corresponds to the
    // first output row.
    svt_aom_convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
                                      MAX_SB_SIZE,
                                      dst,
                                      dst_stride,
                                      filters_y,
                                      y0_q4,
                                      y_step_q4,
                                      w,
                                      h,
                                      conv_params->round_1);
}

// Horizontal pass of the high-bit-depth "add source" convolution.
// `src8` is converted with CONVERT_TO_SHORTPTR to reach the 16-bit samples.
// Each output is the horizontal dot product plus the window's center sample
// scaled by 1 << FILTER_BITS plus 1 << (bd + FILTER_BITS - 1), rounded by
// round0_bits and clamped to [0, WIENER_CLAMP_LIMIT(round0_bits, bd) - 1].
static void svt_aom_highbd_convolve_add_src_horiz_hip(const uint8_t* src8, ptrdiff_t src_stride, uint16_t* dst,
                                                      ptrdiff_t dst_stride, const InterpKernel* x_filters,
                                                      int32_t x0_q4, int32_t x_step_q4, int32_t w, int32_t h,
                                                      int32_t round0_bits, int32_t bd) {
    const int32_t center_tap = SUBPEL_TAPS / 2 - 1;
    const int32_t clamp_max  = WIENER_CLAMP_LIMIT(round0_bits, bd) - 1;
    const int32_t bias       = 1 << (bd + FILTER_BITS - 1);

    // Step back so that the filter window is centered on the requested sample.
    const uint16_t* row_in = CONVERT_TO_SHORTPTR(src8);
    row_in -= center_tap;
    uint16_t* row_out = dst;

    for (int32_t row = 0; row < h; ++row) {
        int32_t pos_q4 = x0_q4;
        for (int32_t col = 0; col < w; ++col) {
            const uint16_t* const window = row_in + (pos_q4 >> SUBPEL_BITS);
            const int16_t* const  taps   = x_filters[pos_q4 & SUBPEL_MASK];
            const int32_t         center = (int32_t)window[center_tap] << FILTER_BITS;
            const int32_t         acc    = svt_aom_highbd_horz_scalar_product(window, taps) + (center + bias);
            row_out[col]                 = (uint16_t)clamp(ROUND_POWER_OF_TWO(acc, round0_bits), 0, clamp_max);
            pos_q4 += x_step_q4;
        }
        row_in += src_stride;
        row_out += dst_stride;
    }
}

// Vertical pass of the high-bit-depth "add source" convolution.
// `dst8` is converted with CONVERT_TO_SHORTPTR to reach the 16-bit output.
// Each output is the vertical dot product plus the window's center sample
// scaled by 1 << FILTER_BITS, minus 1 << (bd + round1_bits - 1); the sum is
// rounded by round1_bits and clipped with clip_pixel_highbd(..., bd).
static void svt_aom_highbd_convolve_add_src_vert_hip(const uint16_t* src, ptrdiff_t src_stride, uint8_t* dst8,
                                                     ptrdiff_t dst_stride, const InterpKernel* y_filters, int32_t y0_q4,
                                                     int32_t y_step_q4, int32_t w, int32_t h, int32_t round1_bits,
                                                     int32_t bd) {
    const ptrdiff_t center_offset = (SUBPEL_TAPS / 2 - 1) * src_stride;
    const int32_t   bias          = 1 << (bd + round1_bits - 1);

    // Step up so that the filter window is centered on the requested row.
    const uint16_t* col_in  = src - src_stride * (SUBPEL_TAPS / 2 - 1);
    uint16_t*       col_out = CONVERT_TO_SHORTPTR(dst8);

    for (int32_t col = 0; col < w; ++col) {
        int32_t pos_q4 = y0_q4;
        for (int32_t row = 0; row < h; ++row) {
            const uint16_t* const window = col_in + (pos_q4 >> SUBPEL_BITS) * src_stride;
            const int16_t* const  taps   = y_filters[pos_q4 & SUBPEL_MASK];
            const int32_t         center = (int32_t)window[center_offset] << FILTER_BITS;
            const int32_t         acc    = highbd_vert_scalar_product(window, src_stride, taps) + (center - bias);
            col_out[row * dst_stride]    = clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, round1_bits), bd);
            pos_q4 += y_step_q4;
        }
        ++col_in;
        ++col_out;
    }
}

// C reference for the high-bit-depth Wiener "add source" separable convolution.
//
// Runs a horizontal pass from `src` into a 16-bit stack buffer, then a
// vertical pass from that buffer into `dst`. Both passes convert their 8-bit
// typed pointer with CONVERT_TO_SHORTPTR before touching samples.
//
//   filter_x/filter_y : pointers to kernels inside a kernel table; the table
//                       base and starting phase are recovered from the pointer
//                       value.
//   w, h              : block size; each must be <= MAX_SB_SIZE (asserted).
//   conv_params       : supplies round_0 (horizontal) and round_1 (vertical).
//   bd                : bit depth forwarded to both passes.
void svt_av1_highbd_wiener_convolve_add_src_c(const uint8_t* const src, const ptrdiff_t src_stride, uint8_t* const dst,
                                              const ptrdiff_t dst_stride, const int16_t* const filter_x,
                                              const int16_t* const filter_y, const int32_t w, const int32_t h,
                                              const ConvolveParams* const conv_params, const int32_t bd) {
    const int32_t x_step_q4 = 16;
    const int32_t y_step_q4 = 16;

    assert(w <= MAX_SB_SIZE);
    assert(h <= MAX_SB_SIZE);
    assert(y_step_q4 <= 32);
    assert(x_step_q4 <= 32);
    assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);

    // Split each kernel pointer into its table base and its starting phase.
    const InterpKernel* const x_kernels = svt_aom_get_filter_base(filter_x);
    const InterpKernel* const y_kernels = svt_aom_get_filter_base(filter_y);
    const int32_t             x_phase   = svt_aom_get_filter_offset(filter_x, x_kernels);
    const int32_t             y_phase   = svt_aom_get_filter_offset(filter_y, y_kernels);

    // Rows above the block that the vertical filter window reaches into.
    const int32_t rows_above = SUBPEL_TAPS / 2 - 1;
    const int32_t hpass_rows = (((h - 1) * y_step_q4 + y_phase) >> SUBPEL_BITS) + SUBPEL_TAPS;

    uint16_t hpass[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];

    svt_aom_highbd_convolve_add_src_horiz_hip(src - src_stride * rows_above,
                                              src_stride,
                                              hpass,
                                              MAX_SB_SIZE,
                                              x_kernels,
                                              x_phase,
                                              x_step_q4,
                                              w,
                                              hpass_rows,
                                              conv_params->round_0,
                                              bd);
    svt_aom_highbd_convolve_add_src_vert_hip(hpass + MAX_SB_SIZE * rows_above,
                                             MAX_SB_SIZE,
                                             dst,
                                             dst_stride,
                                             y_kernels,
                                             y_phase,
                                             y_step_q4,
                                             w,
                                             h,
                                             conv_params->round_1,
                                             bd);
}

// Vertical dot product over 8-bit samples: sample k is read at
// a[k * a_stride] and multiplied by b[k], for k in [0, SUBPEL_TAPS).
static INLINE int vert_scalar_product(const uint8_t* a, ptrdiff_t a_stride, const int16_t* b) {
    int       acc    = 0;
    ptrdiff_t offset = 0; // always equals tap * a_stride
    for (int tap = 0; tap < SUBPEL_TAPS; ++tap) {
        acc += a[offset] * b[tap];
        offset += a_stride;
    }
    return acc;
}

// Horizontal 8-bit convolution. For each of the h rows and w columns, the
// SUBPEL_TAPS-sample window selected by the running q4 position is filtered
// with the kernel chosen by that position's sub-pel phase, rounded by
// FILTER_BITS and clipped with clip_pixel().
static void svt_aom_convolve_horiz(const uint8_t* src, ptrdiff_t src_stride, uint8_t* dst, ptrdiff_t dst_stride,
                                   const InterpKernel* x_filters, int x0_q4, int x_step_q4, int w, int h) {
    // Step back so that the filter window is centered on the requested sample.
    const uint8_t* row_in  = src - (SUBPEL_TAPS / 2 - 1);
    uint8_t*       row_out = dst;

    for (int row = 0; row < h; ++row) {
        int pos_q4 = x0_q4;
        for (int col = 0; col < w; ++col) {
            const uint8_t* const window   = row_in + (pos_q4 >> SUBPEL_BITS);
            const int16_t* const taps     = x_filters[pos_q4 & SUBPEL_MASK];
            const int            filtered = svt_aom_horz_scalar_product(window, taps);
            row_out[col]                  = clip_pixel(ROUND_POWER_OF_TWO(filtered, FILTER_BITS));
            pos_q4 += x_step_q4;
        }
        row_in += src_stride;
        row_out += dst_stride;
    }
}

// Vertical 8-bit convolution, processed column by column. For each output
// sample, the SUBPEL_TAPS-row window selected by the running q4 position is
// filtered with the kernel chosen by that position's sub-pel phase, rounded
// by FILTER_BITS and clipped with clip_pixel().
static void svt_aom_convolve_vert(const uint8_t* src, ptrdiff_t src_stride, uint8_t* dst, ptrdiff_t dst_stride,
                                  const InterpKernel* y_filters, int y0_q4, int y_step_q4, int w, int h) {
    // Step up so that the filter window is centered on the requested row.
    const uint8_t* col_in  = src - src_stride * (SUBPEL_TAPS / 2 - 1);
    uint8_t*       col_out = dst;

    for (int col = 0; col < w; ++col) {
        int pos_q4 = y0_q4;
        for (int row = 0; row < h; ++row) {
            const unsigned char* window   = col_in + (pos_q4 >> SUBPEL_BITS) * src_stride;
            const int16_t* const taps     = y_filters[pos_q4 & SUBPEL_MASK];
            const int            filtered = vert_scalar_product(window, src_stride, taps);
            col_out[row * dst_stride]     = clip_pixel(ROUND_POWER_OF_TWO(filtered, FILTER_BITS));
            pos_q4 += y_step_q4;
        }
        ++col_in;
        ++col_out;
    }
}

// C reference for the horizontal-only 8-tap convolution.
// The kernel table base and starting phase are recovered from `filter_x`;
// `filter_y` and `y_step_q4` are accepted for signature compatibility and
// are not used.
void svt_aom_convolve8_horiz_c(const uint8_t* src, ptrdiff_t src_stride, uint8_t* dst, ptrdiff_t dst_stride,
                               const int16_t* filter_x, int x_step_q4, const int16_t* filter_y, int y_step_q4, int w,
                               int h) {
    (void)filter_y;
    (void)y_step_q4;

    const InterpKernel* const kernel_table = svt_aom_get_filter_base(filter_x);
    const int                 start_phase  = svt_aom_get_filter_offset(filter_x, kernel_table);

    svt_aom_convolve_horiz(src, src_stride, dst, dst_stride, kernel_table, start_phase, x_step_q4, w, h);
}

// C reference for the vertical-only 8-tap convolution.
// The kernel table base and starting phase are recovered from `filter_y`;
// `filter_x` and `x_step_q4` are accepted for signature compatibility and
// are not used.
void svt_aom_convolve8_vert_c(const uint8_t* src, ptrdiff_t src_stride, uint8_t* dst, ptrdiff_t dst_stride,
                              const int16_t* filter_x, int x_step_q4, const int16_t* filter_y, int y_step_q4, int w,
                              int h) {
    (void)filter_x;
    (void)x_step_q4;

    const InterpKernel* const kernel_table = svt_aom_get_filter_base(filter_y);
    const int                 start_phase  = svt_aom_get_filter_offset(filter_y, kernel_table);

    svt_aom_convolve_vert(src, src_stride, dst, dst_stride, kernel_table, start_phase, y_step_q4, w, h);
}

Coverage Report

Created: 2026-05-16 06:41

Line	Count	Source
1		/*
2		* Copyright (c) 2016, Alliance for Open Media. All rights reserved
3		*
4		* This source code is subject to the terms of the BSD 2 Clause License and
5		* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6		* was not distributed with this source code in the LICENSE file, you can
7		* obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8		* Media Patent License 1.0 was not distributed with this source code in the
9		* PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10		*/
11
12		#include <assert.h>
13		#include "convolve.h"
14		#include "common_dsp_rtcd.h"
15
16		// Note: Fixed size intermediate buffers, place limits on parameters
17		// of some functions. 2d filtering proceeds in 2 steps:
18		// (1) Interpolate horizontally into an intermediate buffer, temp.
19		// (2) Interpolate temp vertically to derive the sub-pixel result.
20		// Deriving the maximum number of rows in the temp buffer (135):
21		// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
22		// --Largest block size is 128x128 pixels.
23		// --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
24		// original frame (in 1/16th pixel units).
25		// --Must round-up because block may be located at sub-pixel position.
26		// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
27		// --((128 - 1) * 32 + 15) >> 4 + 8 = 263.
28		#define WIENER_MAX_EXT_SIZE 263
29
30	0	static INLINE int32_t svt_aom_horz_scalar_product(const uint8_t* a, const int16_t* b) {
31	0	int32_t sum = 0;
32	0	for (int32_t k = 0; k < SUBPEL_TAPS; ++k) {
33	0	sum += a[k] * b[k];
34	0	}
35	0	return sum;
36	0	}
37
38	0	static INLINE int32_t svt_aom_highbd_horz_scalar_product(const uint16_t* a, const int16_t* b) {
39	0	int32_t sum = 0;
40	0	for (int32_t k = 0; k < SUBPEL_TAPS; ++k) {
41	0	sum += a[k] * b[k];
42	0	}
43	0	return sum;
44	0	}
45
46		static INLINE int32_t highbd_vert_scalar_product(const uint16_t* a, ptrdiff_t a_stride, const int16_t* b) {
47		int32_t sum = 0;
48		for (int32_t k = 0; k < SUBPEL_TAPS; ++k) {
49		sum += a[k * a_stride] * b[k];
50		}
51		return sum;
52		}
53
54	0	static const InterpKernel* svt_aom_get_filter_base(const int16_t* filter) {
55		// NOTE: This assumes that the filter table is 256-byte aligned.
56	0	return (const InterpKernel*)(((intptr_t)filter) & ~((intptr_t)0xFF));
57	0	}
58
59	0	static int32_t svt_aom_get_filter_offset(const int16_t* f, const InterpKernel* base) {
60	0	return (int32_t)((const InterpKernel*)(intptr_t)f - base);
61	0	}
62
63		static void svt_aom_convolve_add_src_horiz_hip(const uint8_t* src, ptrdiff_t src_stride, uint16_t* dst,
64		ptrdiff_t dst_stride, const InterpKernel* x_filters, int32_t x0_q4,
65	0	int32_t x_step_q4, int32_t w, int32_t h, int32_t round0_bits) {
66	0	const int32_t bd = 8;
67	0	src -= SUBPEL_TAPS / 2 - 1;
68	0	for (int32_t y = 0; y < h; ++y) {
69	0	int32_t x_q4 = x0_q4;
70	0	for (int32_t x = 0; x < w; ++x) {
71	0	const uint8_t* const src_x = &src[x_q4 >> SUBPEL_BITS];
72	0	const int16_t* const x_filter = x_filters[x_q4 & SUBPEL_MASK];
73	0	const int32_t rounding = ((int32_t)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
74	0	(1 << (bd + FILTER_BITS - 1));
75	0	const int32_t sum = svt_aom_horz_scalar_product(src_x, x_filter) + rounding;
76	0	dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
77	0	x_q4 += x_step_q4;
78	0	}
79	0	src += src_stride;
80	0	dst += dst_stride;
81	0	}
82	0	}
83
84		static void svt_aom_convolve_add_src_vert_hip(const uint16_t* src, ptrdiff_t src_stride, uint8_t* dst,
85		ptrdiff_t dst_stride, const InterpKernel* y_filters, int32_t y0_q4,
86	0	int32_t y_step_q4, int32_t w, int32_t h, int32_t round1_bits) {
87	0	const int32_t bd = 8;
88	0	src -= src_stride * (SUBPEL_TAPS / 2 - 1);
89
90	0	for (int32_t x = 0; x < w; ++x) {
91	0	int32_t y_q4 = y0_q4;
92	0	for (int32_t y = 0; y < h; ++y) {
93	0	const uint16_t* src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
94	0	const int16_t* const y_filter = y_filters[y_q4 & SUBPEL_MASK];
95	0	const int32_t rounding = ((int32_t)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
96	0	(1 << (bd + round1_bits - 1));
97	0	const int32_t sum = highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
98	0	dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
99	0	y_q4 += y_step_q4;
100	0	}
101	0	++src;
102	0	++dst;
103	0	}
104	0	}
105
106		void svt_av1_wiener_convolve_add_src_c(const uint8_t* const src, const ptrdiff_t src_stride, uint8_t* const dst,
107		const ptrdiff_t dst_stride, const int16_t* const filter_x,
108		const int16_t* const filter_y, const int32_t w, const int32_t h,
109	0	const ConvolveParams* const conv_params) {
110	0	const int32_t x_step_q4 = 16;
111	0	const int32_t y_step_q4 = 16;
112	0	const InterpKernel* const filters_x = svt_aom_get_filter_base(filter_x);
113	0	const int32_t x0_q4 = svt_aom_get_filter_offset(filter_x, filters_x);
114
115	0	const InterpKernel* const filters_y = svt_aom_get_filter_base(filter_y);
116	0	const int32_t y0_q4 = svt_aom_get_filter_offset(filter_y, filters_y);
117
118	0	uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
119	0	const int32_t intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
120
121		// The last row is set to 0 to address an uninitialized memory access when
122		// using the "C" code path. In vert_scalar_product, where the wiener filter is applied to the pixels,
123		// the bottom-edge pixels will need 3 padded pixels to perform a 7-tap filter. However, the filter is applied
124		// over 8 (SUBPEL_TAPS) pixels, with the final 8th weight being zero. Therefore, the extra bottom-most pixel
125		// will not affect the result, but will cause a sanitizer failure if not initialized.
126	0	memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
127
128	0	assert(w <= MAX_SB_SIZE);
129	0	assert(h <= MAX_SB_SIZE);
130	0	assert(y_step_q4 <= 32);
131	0	assert(x_step_q4 <= 32);
132
133	0	svt_aom_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
134	0	src_stride,
135	0	temp,
136	0	MAX_SB_SIZE,
137	0	filters_x,
138	0	x0_q4,
139	0	x_step_q4,
140	0	w,
141	0	intermediate_height,
142	0	conv_params->round_0);
143	0	svt_aom_convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
144	0	MAX_SB_SIZE,
145	0	dst,
146	0	dst_stride,
147	0	filters_y,
148	0	y0_q4,
149	0	y_step_q4,
150	0	w,
151	0	h,
152	0	conv_params->round_1);
153	0	}
154
155		static void svt_aom_highbd_convolve_add_src_horiz_hip(const uint8_t* src8, ptrdiff_t src_stride, uint16_t* dst,
156		ptrdiff_t dst_stride, const InterpKernel* x_filters,
157		int32_t x0_q4, int32_t x_step_q4, int32_t w, int32_t h,
158	0	int32_t round0_bits, int32_t bd) {
159	0	const int32_t extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
160	0	uint16_t* src = CONVERT_TO_SHORTPTR(src8);
161	0	src -= SUBPEL_TAPS / 2 - 1;
162	0	for (int32_t y = 0; y < h; ++y) {
163	0	int32_t x_q4 = x0_q4;
164	0	for (int32_t x = 0; x < w; ++x) {
165	0	const uint16_t* const src_x = &src[x_q4 >> SUBPEL_BITS];
166	0	const int16_t* const x_filter = x_filters[x_q4 & SUBPEL_MASK];
167	0	const int32_t rounding = ((int32_t)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
168	0	(1 << (bd + FILTER_BITS - 1));
169	0	const int32_t sum = svt_aom_highbd_horz_scalar_product(src_x, x_filter) + rounding;
170	0	dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, extraprec_clamp_limit - 1);
171	0	x_q4 += x_step_q4;
172	0	}
173	0	src += src_stride;
174	0	dst += dst_stride;
175	0	}
176	0	}
177
178		static void svt_aom_highbd_convolve_add_src_vert_hip(const uint16_t* src, ptrdiff_t src_stride, uint8_t* dst8,
179		ptrdiff_t dst_stride, const InterpKernel* y_filters, int32_t y0_q4,
180		int32_t y_step_q4, int32_t w, int32_t h, int32_t round1_bits,
181	0	int32_t bd) {
182	0	uint16_t* dst = CONVERT_TO_SHORTPTR(dst8);
183	0	src -= src_stride * (SUBPEL_TAPS / 2 - 1);
184	0	for (int32_t x = 0; x < w; ++x) {
185	0	int32_t y_q4 = y0_q4;
186	0	for (int32_t y = 0; y < h; ++y) {
187	0	const uint16_t* src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
188	0	const int16_t* const y_filter = y_filters[y_q4 & SUBPEL_MASK];
189	0	const int32_t rounding = ((int32_t)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
190	0	(1 << (bd + round1_bits - 1));
191	0	const int32_t sum = highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
192	0	dst[y * dst_stride] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
193	0	y_q4 += y_step_q4;
194	0	}
195	0	++src;
196	0	++dst;
197	0	}
198	0	}
199
200		void svt_av1_highbd_wiener_convolve_add_src_c(const uint8_t* const src, const ptrdiff_t src_stride, uint8_t* const dst,
201		const ptrdiff_t dst_stride, const int16_t* const filter_x,
202		const int16_t* const filter_y, const int32_t w, const int32_t h,
203	0	const ConvolveParams* const conv_params, const int32_t bd) {
204	0	const int32_t x_step_q4 = 16;
205	0	const int32_t y_step_q4 = 16;
206	0	const InterpKernel* const filters_x = svt_aom_get_filter_base(filter_x);
207	0	const int32_t x0_q4 = svt_aom_get_filter_offset(filter_x, filters_x);
208
209	0	const InterpKernel* const filters_y = svt_aom_get_filter_base(filter_y);
210	0	const int32_t y0_q4 = svt_aom_get_filter_offset(filter_y, filters_y);
211
212	0	uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
213	0	const int32_t intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
214
215	0	assert(w <= MAX_SB_SIZE);
216	0	assert(h <= MAX_SB_SIZE);
217	0	assert(y_step_q4 <= 32);
218	0	assert(x_step_q4 <= 32);
219	0	assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
220
221	0	svt_aom_highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
222	0	src_stride,
223	0	temp,
224	0	MAX_SB_SIZE,
225	0	filters_x,
226	0	x0_q4,
227	0	x_step_q4,
228	0	w,
229	0	intermediate_height,
230	0	conv_params->round_0,
231	0	bd);
232	0	svt_aom_highbd_convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
233	0	MAX_SB_SIZE,
234	0	dst,
235	0	dst_stride,
236	0	filters_y,
237	0	y0_q4,
238	0	y_step_q4,
239	0	w,
240	0	h,
241	0	conv_params->round_1,
242	0	bd);
243	0	}
244
245	0	static INLINE int vert_scalar_product(const uint8_t* a, ptrdiff_t a_stride, const int16_t* b) {
246	0	int sum = 0;
247	0	for (int k = 0; k < SUBPEL_TAPS; ++k) {
248	0	sum += a[k * a_stride] * b[k];
249	0	}
250	0	return sum;
251	0	}
252
253		static void svt_aom_convolve_horiz(const uint8_t* src, ptrdiff_t src_stride, uint8_t* dst, ptrdiff_t dst_stride,
254	0	const InterpKernel* x_filters, int x0_q4, int x_step_q4, int w, int h) {
255	0	src -= SUBPEL_TAPS / 2 - 1;
256	0	for (int y = 0; y < h; ++y) {
257	0	int x_q4 = x0_q4;
258	0	for (int x = 0; x < w; ++x) {
259	0	const uint8_t* const src_x = &src[x_q4 >> SUBPEL_BITS];
260	0	const int16_t* const x_filter = x_filters[x_q4 & SUBPEL_MASK];
261	0	const int sum = svt_aom_horz_scalar_product(src_x, x_filter);
262	0	dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
263	0	x_q4 += x_step_q4;
264	0	}
265	0	src += src_stride;
266	0	dst += dst_stride;
267	0	}
268	0	}
269
270		static void svt_aom_convolve_vert(const uint8_t* src, ptrdiff_t src_stride, uint8_t* dst, ptrdiff_t dst_stride,
271	0	const InterpKernel* y_filters, int y0_q4, int y_step_q4, int w, int h) {
272	0	src -= src_stride * (SUBPEL_TAPS / 2 - 1);
273
274	0	for (int x = 0; x < w; ++x) {
275	0	int y_q4 = y0_q4;
276	0	for (int y = 0; y < h; ++y) {
277	0	const unsigned char* src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
278	0	const int16_t* const y_filter = y_filters[y_q4 & SUBPEL_MASK];
279	0	const int sum = vert_scalar_product(src_y, src_stride, y_filter);
280	0	dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
281	0	y_q4 += y_step_q4;
282	0	}
283	0	++src;
284	0	++dst;
285	0	}
286	0	}
287
288		void svt_aom_convolve8_horiz_c(const uint8_t* src, ptrdiff_t src_stride, uint8_t* dst, ptrdiff_t dst_stride,
289		const int16_t* filter_x, int x_step_q4, const int16_t* filter_y, int y_step_q4, int w,
290	0	int h) {
291	0	const InterpKernel* const filters_x = svt_aom_get_filter_base(filter_x);
292	0	const int x0_q4 = svt_aom_get_filter_offset(filter_x, filters_x);
293
294	0	(void)filter_y;
295	0	(void)y_step_q4;
296
297	0	svt_aom_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, w, h);
298	0	}
299
300		void svt_aom_convolve8_vert_c(const uint8_t* src, ptrdiff_t src_stride, uint8_t* dst, ptrdiff_t dst_stride,
301		const int16_t* filter_x, int x_step_q4, const int16_t* filter_y, int y_step_q4, int w,
302	0	int h) {
303	0	const InterpKernel* const filters_y = svt_aom_get_filter_base(filter_y);
304	0	const int y0_q4 = svt_aom_get_filter_offset(filter_y, filters_y);
305
306	0	(void)filter_x;
307	0	(void)x_step_q4;
308
309	0	svt_aom_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, w, h);
310	0	}