/work/svt-av1/Source/Lib/Codec/convolve.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license. |
10 | | */ |
11 | | |
12 | | #include <assert.h> |
13 | | #include "convolve.h" |
14 | | #include "common_dsp_rtcd.h" |
15 | | |
16 | | // Note: Fixed size intermediate buffers, place limits on parameters |
17 | | // of some functions. 2d filtering proceeds in 2 steps: |
18 | | // (1) Interpolate horizontally into an intermediate buffer, temp. |
19 | | // (2) Interpolate temp vertically to derive the sub-pixel result. |
20 | | // Deriving the maximum number of rows in the temp buffer (263): |
21 | | // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). |
22 | | // --Largest block size is 128x128 pixels. |
23 | | // --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the |
24 | | // original frame (in 1/16th pixel units). |
25 | | // --Must round-up because block may be located at sub-pixel position. |
26 | | // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. |
27 | | // --((128 - 1) * 32 + 15) >> 4 + 8 = 263. |
28 | | #define WIENER_MAX_EXT_SIZE 263 |
29 | | |
30 | 0 | static INLINE int32_t svt_aom_horz_scalar_product(const uint8_t* a, const int16_t* b) { |
31 | 0 | int32_t sum = 0; |
32 | 0 | for (int32_t k = 0; k < SUBPEL_TAPS; ++k) { |
33 | 0 | sum += a[k] * b[k]; |
34 | 0 | } |
35 | 0 | return sum; |
36 | 0 | } |
37 | | |
38 | 0 | static INLINE int32_t svt_aom_highbd_horz_scalar_product(const uint16_t* a, const int16_t* b) { |
39 | 0 | int32_t sum = 0; |
40 | 0 | for (int32_t k = 0; k < SUBPEL_TAPS; ++k) { |
41 | 0 | sum += a[k] * b[k]; |
42 | 0 | } |
43 | 0 | return sum; |
44 | 0 | } |
45 | | |
46 | | static INLINE int32_t highbd_vert_scalar_product(const uint16_t* a, ptrdiff_t a_stride, const int16_t* b) { |
47 | | int32_t sum = 0; |
48 | | for (int32_t k = 0; k < SUBPEL_TAPS; ++k) { |
49 | | sum += a[k * a_stride] * b[k]; |
50 | | } |
51 | | return sum; |
52 | | } |
53 | | |
54 | 0 | static const InterpKernel* svt_aom_get_filter_base(const int16_t* filter) { |
55 | | // NOTE: This assumes that the filter table is 256-byte aligned. |
56 | 0 | return (const InterpKernel*)(((intptr_t)filter) & ~((intptr_t)0xFF)); |
57 | 0 | } |
58 | | |
59 | 0 | static int32_t svt_aom_get_filter_offset(const int16_t* f, const InterpKernel* base) { |
60 | 0 | return (int32_t)((const InterpKernel*)(intptr_t)f - base); |
61 | 0 | } |
62 | | |
63 | | static void svt_aom_convolve_add_src_horiz_hip(const uint8_t* src, ptrdiff_t src_stride, uint16_t* dst, |
64 | | ptrdiff_t dst_stride, const InterpKernel* x_filters, int32_t x0_q4, |
65 | 0 | int32_t x_step_q4, int32_t w, int32_t h, int32_t round0_bits) { |
66 | 0 | const int32_t bd = 8; |
67 | 0 | src -= SUBPEL_TAPS / 2 - 1; |
68 | 0 | for (int32_t y = 0; y < h; ++y) { |
69 | 0 | int32_t x_q4 = x0_q4; |
70 | 0 | for (int32_t x = 0; x < w; ++x) { |
71 | 0 | const uint8_t* const src_x = &src[x_q4 >> SUBPEL_BITS]; |
72 | 0 | const int16_t* const x_filter = x_filters[x_q4 & SUBPEL_MASK]; |
73 | 0 | const int32_t rounding = ((int32_t)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + |
74 | 0 | (1 << (bd + FILTER_BITS - 1)); |
75 | 0 | const int32_t sum = svt_aom_horz_scalar_product(src_x, x_filter) + rounding; |
76 | 0 | dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, WIENER_CLAMP_LIMIT(round0_bits, bd) - 1); |
77 | 0 | x_q4 += x_step_q4; |
78 | 0 | } |
79 | 0 | src += src_stride; |
80 | 0 | dst += dst_stride; |
81 | 0 | } |
82 | 0 | } |
83 | | |
84 | | static void svt_aom_convolve_add_src_vert_hip(const uint16_t* src, ptrdiff_t src_stride, uint8_t* dst, |
85 | | ptrdiff_t dst_stride, const InterpKernel* y_filters, int32_t y0_q4, |
86 | 0 | int32_t y_step_q4, int32_t w, int32_t h, int32_t round1_bits) { |
87 | 0 | const int32_t bd = 8; |
88 | 0 | src -= src_stride * (SUBPEL_TAPS / 2 - 1); |
89 | |
|
90 | 0 | for (int32_t x = 0; x < w; ++x) { |
91 | 0 | int32_t y_q4 = y0_q4; |
92 | 0 | for (int32_t y = 0; y < h; ++y) { |
93 | 0 | const uint16_t* src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; |
94 | 0 | const int16_t* const y_filter = y_filters[y_q4 & SUBPEL_MASK]; |
95 | 0 | const int32_t rounding = ((int32_t)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - |
96 | 0 | (1 << (bd + round1_bits - 1)); |
97 | 0 | const int32_t sum = highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding; |
98 | 0 | dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits)); |
99 | 0 | y_q4 += y_step_q4; |
100 | 0 | } |
101 | 0 | ++src; |
102 | 0 | ++dst; |
103 | 0 | } |
104 | 0 | } |
105 | | |
106 | | void svt_av1_wiener_convolve_add_src_c(const uint8_t* const src, const ptrdiff_t src_stride, uint8_t* const dst, |
107 | | const ptrdiff_t dst_stride, const int16_t* const filter_x, |
108 | | const int16_t* const filter_y, const int32_t w, const int32_t h, |
109 | 0 | const ConvolveParams* const conv_params) { |
110 | 0 | const int32_t x_step_q4 = 16; |
111 | 0 | const int32_t y_step_q4 = 16; |
112 | 0 | const InterpKernel* const filters_x = svt_aom_get_filter_base(filter_x); |
113 | 0 | const int32_t x0_q4 = svt_aom_get_filter_offset(filter_x, filters_x); |
114 | |
|
115 | 0 | const InterpKernel* const filters_y = svt_aom_get_filter_base(filter_y); |
116 | 0 | const int32_t y0_q4 = svt_aom_get_filter_offset(filter_y, filters_y); |
117 | |
|
118 | 0 | uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]; |
119 | 0 | const int32_t intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1; |
120 | | |
121 | | // The last row is set to 0 to address an uninitialized memory access when |
122 | | // using the "C" code path. In vert_scalar_product, where the wiener filter is applied to the pixels, |
123 | | // the bottom-edge pixels will need 3 padded pixels to perform a 7-tap filter. However, the filter is applied |
124 | | // over 8 (SUBPEL_TAPS) pixels, with the final 8th weight being zero. Therefore, the extra bottom-most pixel |
125 | | // will not affect the result, but will cause a sanitizer failure if not initialized. |
126 | 0 | memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); |
127 | |
|
128 | 0 | assert(w <= MAX_SB_SIZE); |
129 | 0 | assert(h <= MAX_SB_SIZE); |
130 | 0 | assert(y_step_q4 <= 32); |
131 | 0 | assert(x_step_q4 <= 32); |
132 | |
|
133 | 0 | svt_aom_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1), |
134 | 0 | src_stride, |
135 | 0 | temp, |
136 | 0 | MAX_SB_SIZE, |
137 | 0 | filters_x, |
138 | 0 | x0_q4, |
139 | 0 | x_step_q4, |
140 | 0 | w, |
141 | 0 | intermediate_height, |
142 | 0 | conv_params->round_0); |
143 | 0 | svt_aom_convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), |
144 | 0 | MAX_SB_SIZE, |
145 | 0 | dst, |
146 | 0 | dst_stride, |
147 | 0 | filters_y, |
148 | 0 | y0_q4, |
149 | 0 | y_step_q4, |
150 | 0 | w, |
151 | 0 | h, |
152 | 0 | conv_params->round_1); |
153 | 0 | } |
154 | | |
155 | | static void svt_aom_highbd_convolve_add_src_horiz_hip(const uint8_t* src8, ptrdiff_t src_stride, uint16_t* dst, |
156 | | ptrdiff_t dst_stride, const InterpKernel* x_filters, |
157 | | int32_t x0_q4, int32_t x_step_q4, int32_t w, int32_t h, |
158 | 0 | int32_t round0_bits, int32_t bd) { |
159 | 0 | const int32_t extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd); |
160 | 0 | uint16_t* src = CONVERT_TO_SHORTPTR(src8); |
161 | 0 | src -= SUBPEL_TAPS / 2 - 1; |
162 | 0 | for (int32_t y = 0; y < h; ++y) { |
163 | 0 | int32_t x_q4 = x0_q4; |
164 | 0 | for (int32_t x = 0; x < w; ++x) { |
165 | 0 | const uint16_t* const src_x = &src[x_q4 >> SUBPEL_BITS]; |
166 | 0 | const int16_t* const x_filter = x_filters[x_q4 & SUBPEL_MASK]; |
167 | 0 | const int32_t rounding = ((int32_t)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + |
168 | 0 | (1 << (bd + FILTER_BITS - 1)); |
169 | 0 | const int32_t sum = svt_aom_highbd_horz_scalar_product(src_x, x_filter) + rounding; |
170 | 0 | dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, extraprec_clamp_limit - 1); |
171 | 0 | x_q4 += x_step_q4; |
172 | 0 | } |
173 | 0 | src += src_stride; |
174 | 0 | dst += dst_stride; |
175 | 0 | } |
176 | 0 | } |
177 | | |
178 | | static void svt_aom_highbd_convolve_add_src_vert_hip(const uint16_t* src, ptrdiff_t src_stride, uint8_t* dst8, |
179 | | ptrdiff_t dst_stride, const InterpKernel* y_filters, int32_t y0_q4, |
180 | | int32_t y_step_q4, int32_t w, int32_t h, int32_t round1_bits, |
181 | 0 | int32_t bd) { |
182 | 0 | uint16_t* dst = CONVERT_TO_SHORTPTR(dst8); |
183 | 0 | src -= src_stride * (SUBPEL_TAPS / 2 - 1); |
184 | 0 | for (int32_t x = 0; x < w; ++x) { |
185 | 0 | int32_t y_q4 = y0_q4; |
186 | 0 | for (int32_t y = 0; y < h; ++y) { |
187 | 0 | const uint16_t* src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; |
188 | 0 | const int16_t* const y_filter = y_filters[y_q4 & SUBPEL_MASK]; |
189 | 0 | const int32_t rounding = ((int32_t)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - |
190 | 0 | (1 << (bd + round1_bits - 1)); |
191 | 0 | const int32_t sum = highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding; |
192 | 0 | dst[y * dst_stride] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd); |
193 | 0 | y_q4 += y_step_q4; |
194 | 0 | } |
195 | 0 | ++src; |
196 | 0 | ++dst; |
197 | 0 | } |
198 | 0 | } |
199 | | |
200 | | void svt_av1_highbd_wiener_convolve_add_src_c(const uint8_t* const src, const ptrdiff_t src_stride, uint8_t* const dst, |
201 | | const ptrdiff_t dst_stride, const int16_t* const filter_x, |
202 | | const int16_t* const filter_y, const int32_t w, const int32_t h, |
203 | 0 | const ConvolveParams* const conv_params, const int32_t bd) { |
204 | 0 | const int32_t x_step_q4 = 16; |
205 | 0 | const int32_t y_step_q4 = 16; |
206 | 0 | const InterpKernel* const filters_x = svt_aom_get_filter_base(filter_x); |
207 | 0 | const int32_t x0_q4 = svt_aom_get_filter_offset(filter_x, filters_x); |
208 | |
|
209 | 0 | const InterpKernel* const filters_y = svt_aom_get_filter_base(filter_y); |
210 | 0 | const int32_t y0_q4 = svt_aom_get_filter_offset(filter_y, filters_y); |
211 | |
|
212 | 0 | uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]; |
213 | 0 | const int32_t intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; |
214 | |
|
215 | 0 | assert(w <= MAX_SB_SIZE); |
216 | 0 | assert(h <= MAX_SB_SIZE); |
217 | 0 | assert(y_step_q4 <= 32); |
218 | 0 | assert(x_step_q4 <= 32); |
219 | 0 | assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); |
220 | |
|
221 | 0 | svt_aom_highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1), |
222 | 0 | src_stride, |
223 | 0 | temp, |
224 | 0 | MAX_SB_SIZE, |
225 | 0 | filters_x, |
226 | 0 | x0_q4, |
227 | 0 | x_step_q4, |
228 | 0 | w, |
229 | 0 | intermediate_height, |
230 | 0 | conv_params->round_0, |
231 | 0 | bd); |
232 | 0 | svt_aom_highbd_convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), |
233 | 0 | MAX_SB_SIZE, |
234 | 0 | dst, |
235 | 0 | dst_stride, |
236 | 0 | filters_y, |
237 | 0 | y0_q4, |
238 | 0 | y_step_q4, |
239 | 0 | w, |
240 | 0 | h, |
241 | 0 | conv_params->round_1, |
242 | 0 | bd); |
243 | 0 | } |
244 | | |
245 | 0 | static INLINE int vert_scalar_product(const uint8_t* a, ptrdiff_t a_stride, const int16_t* b) { |
246 | 0 | int sum = 0; |
247 | 0 | for (int k = 0; k < SUBPEL_TAPS; ++k) { |
248 | 0 | sum += a[k * a_stride] * b[k]; |
249 | 0 | } |
250 | 0 | return sum; |
251 | 0 | } |
252 | | |
253 | | static void svt_aom_convolve_horiz(const uint8_t* src, ptrdiff_t src_stride, uint8_t* dst, ptrdiff_t dst_stride, |
254 | 0 | const InterpKernel* x_filters, int x0_q4, int x_step_q4, int w, int h) { |
255 | 0 | src -= SUBPEL_TAPS / 2 - 1; |
256 | 0 | for (int y = 0; y < h; ++y) { |
257 | 0 | int x_q4 = x0_q4; |
258 | 0 | for (int x = 0; x < w; ++x) { |
259 | 0 | const uint8_t* const src_x = &src[x_q4 >> SUBPEL_BITS]; |
260 | 0 | const int16_t* const x_filter = x_filters[x_q4 & SUBPEL_MASK]; |
261 | 0 | const int sum = svt_aom_horz_scalar_product(src_x, x_filter); |
262 | 0 | dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); |
263 | 0 | x_q4 += x_step_q4; |
264 | 0 | } |
265 | 0 | src += src_stride; |
266 | 0 | dst += dst_stride; |
267 | 0 | } |
268 | 0 | } |
269 | | |
270 | | static void svt_aom_convolve_vert(const uint8_t* src, ptrdiff_t src_stride, uint8_t* dst, ptrdiff_t dst_stride, |
271 | 0 | const InterpKernel* y_filters, int y0_q4, int y_step_q4, int w, int h) { |
272 | 0 | src -= src_stride * (SUBPEL_TAPS / 2 - 1); |
273 | |
|
274 | 0 | for (int x = 0; x < w; ++x) { |
275 | 0 | int y_q4 = y0_q4; |
276 | 0 | for (int y = 0; y < h; ++y) { |
277 | 0 | const unsigned char* src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; |
278 | 0 | const int16_t* const y_filter = y_filters[y_q4 & SUBPEL_MASK]; |
279 | 0 | const int sum = vert_scalar_product(src_y, src_stride, y_filter); |
280 | 0 | dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); |
281 | 0 | y_q4 += y_step_q4; |
282 | 0 | } |
283 | 0 | ++src; |
284 | 0 | ++dst; |
285 | 0 | } |
286 | 0 | } |
287 | | |
288 | | void svt_aom_convolve8_horiz_c(const uint8_t* src, ptrdiff_t src_stride, uint8_t* dst, ptrdiff_t dst_stride, |
289 | | const int16_t* filter_x, int x_step_q4, const int16_t* filter_y, int y_step_q4, int w, |
290 | 0 | int h) { |
291 | 0 | const InterpKernel* const filters_x = svt_aom_get_filter_base(filter_x); |
292 | 0 | const int x0_q4 = svt_aom_get_filter_offset(filter_x, filters_x); |
293 | |
|
294 | 0 | (void)filter_y; |
295 | 0 | (void)y_step_q4; |
296 | |
|
297 | 0 | svt_aom_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, w, h); |
298 | 0 | } |
299 | | |
300 | | void svt_aom_convolve8_vert_c(const uint8_t* src, ptrdiff_t src_stride, uint8_t* dst, ptrdiff_t dst_stride, |
301 | | const int16_t* filter_x, int x_step_q4, const int16_t* filter_y, int y_step_q4, int w, |
302 | 0 | int h) { |
303 | 0 | const InterpKernel* const filters_y = svt_aom_get_filter_base(filter_y); |
304 | 0 | const int y0_q4 = svt_aom_get_filter_offset(filter_y, filters_y); |
305 | |
|
306 | 0 | (void)filter_x; |
307 | 0 | (void)x_step_q4; |
308 | |
|
309 | 0 | svt_aom_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, w, h); |
310 | 0 | } |