/src/libvpx/vpx_dsp/vpx_convolve.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2013 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #include <assert.h> |
12 | | #include <string.h> |
13 | | |
14 | | #include "./vpx_config.h" |
15 | | #include "./vpx_dsp_rtcd.h" |
16 | | #include "vpx/vpx_integer.h" |
17 | | #include "vpx_dsp/vpx_convolve.h" |
18 | | #include "vpx_dsp/vpx_dsp_common.h" |
19 | | #include "vpx_dsp/vpx_filter.h" |
20 | | #include "vpx_ports/mem.h" |
21 | | |
22 | | static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, |
23 | | uint8_t *dst, ptrdiff_t dst_stride, |
24 | | const InterpKernel *x_filters, int x0_q4, |
25 | 338k | int x_step_q4, int w, int h) { |
26 | 338k | int x, y; |
27 | 338k | src -= SUBPEL_TAPS / 2 - 1; |
28 | | |
29 | 3.55M | for (y = 0; y < h; ++y) { |
30 | 3.21M | int x_q4 = x0_q4; |
31 | 45.6M | for (x = 0; x < w; ++x) { |
32 | 42.4M | const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; |
33 | 42.4M | const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; |
34 | 42.4M | int k, sum = 0; |
35 | 381M | for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; |
36 | 42.4M | dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); |
37 | 42.4M | x_q4 += x_step_q4; |
38 | 42.4M | } |
39 | 3.21M | src += src_stride; |
40 | 3.21M | dst += dst_stride; |
41 | 3.21M | } |
42 | 338k | } |
43 | | |
44 | | static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, |
45 | | uint8_t *dst, ptrdiff_t dst_stride, |
46 | | const InterpKernel *x_filters, int x0_q4, |
47 | 108k | int x_step_q4, int w, int h) { |
48 | 108k | int x, y; |
49 | 108k | src -= SUBPEL_TAPS / 2 - 1; |
50 | | |
51 | 822k | for (y = 0; y < h; ++y) { |
52 | 714k | int x_q4 = x0_q4; |
53 | 12.9M | for (x = 0; x < w; ++x) { |
54 | 12.2M | const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; |
55 | 12.2M | const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; |
56 | 12.2M | int k, sum = 0; |
57 | 110M | for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; |
58 | 12.2M | dst[x] = ROUND_POWER_OF_TWO( |
59 | 12.2M | dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); |
60 | 12.2M | x_q4 += x_step_q4; |
61 | 12.2M | } |
62 | 714k | src += src_stride; |
63 | 714k | dst += dst_stride; |
64 | 714k | } |
65 | 108k | } |
66 | | |
67 | | static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, |
68 | | uint8_t *dst, ptrdiff_t dst_stride, |
69 | | const InterpKernel *y_filters, int y0_q4, |
70 | 191k | int y_step_q4, int w, int h) { |
71 | 191k | int x, y; |
72 | 191k | src -= src_stride * (SUBPEL_TAPS / 2 - 1); |
73 | | |
74 | 1.79M | for (x = 0; x < w; ++x) { |
75 | 1.60M | int y_q4 = y0_q4; |
76 | 19.1M | for (y = 0; y < h; ++y) { |
77 | 17.5M | const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; |
78 | 17.5M | const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; |
79 | 17.5M | int k, sum = 0; |
80 | 157M | for (k = 0; k < SUBPEL_TAPS; ++k) |
81 | 140M | sum += src_y[k * src_stride] * y_filter[k]; |
82 | 17.5M | dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); |
83 | 17.5M | y_q4 += y_step_q4; |
84 | 17.5M | } |
85 | 1.60M | ++src; |
86 | 1.60M | ++dst; |
87 | 1.60M | } |
88 | 191k | } |
89 | | |
90 | | static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, |
91 | | uint8_t *dst, ptrdiff_t dst_stride, |
92 | | const InterpKernel *y_filters, int y0_q4, |
93 | 2.02k | int y_step_q4, int w, int h) { |
94 | 2.02k | int x, y; |
95 | 2.02k | src -= src_stride * (SUBPEL_TAPS / 2 - 1); |
96 | | |
97 | 16.7k | for (x = 0; x < w; ++x) { |
98 | 14.6k | int y_q4 = y0_q4; |
99 | 283k | for (y = 0; y < h; ++y) { |
100 | 268k | const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; |
101 | 268k | const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; |
102 | 268k | int k, sum = 0; |
103 | 2.41M | for (k = 0; k < SUBPEL_TAPS; ++k) |
104 | 2.14M | sum += src_y[k * src_stride] * y_filter[k]; |
105 | 268k | dst[y * dst_stride] = ROUND_POWER_OF_TWO( |
106 | 268k | dst[y * dst_stride] + |
107 | 268k | clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), |
108 | 268k | 1); |
109 | 268k | y_q4 += y_step_q4; |
110 | 268k | } |
111 | 14.6k | ++src; |
112 | 14.6k | ++dst; |
113 | 14.6k | } |
114 | 2.02k | } |
115 | | |
116 | | void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, |
117 | | uint8_t *dst, ptrdiff_t dst_stride, |
118 | | const InterpKernel *filter, int x0_q4, int x_step_q4, |
119 | 159k | int y0_q4, int y_step_q4, int w, int h) { |
120 | 159k | (void)y0_q4; |
121 | 159k | (void)y_step_q4; |
122 | 159k | convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, |
123 | 159k | h); |
124 | 159k | } |
125 | | |
126 | | void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, |
127 | | uint8_t *dst, ptrdiff_t dst_stride, |
128 | | const InterpKernel *filter, int x0_q4, |
129 | | int x_step_q4, int y0_q4, int y_step_q4, int w, |
130 | 108k | int h) { |
131 | 108k | (void)y0_q4; |
132 | 108k | (void)y_step_q4; |
133 | 108k | convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, |
134 | 108k | w, h); |
135 | 108k | } |
136 | | |
137 | | void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, |
138 | | uint8_t *dst, ptrdiff_t dst_stride, |
139 | | const InterpKernel *filter, int x0_q4, int x_step_q4, |
140 | 12.6k | int y0_q4, int y_step_q4, int w, int h) { |
141 | 12.6k | (void)x0_q4; |
142 | 12.6k | (void)x_step_q4; |
143 | 12.6k | convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, |
144 | 12.6k | h); |
145 | 12.6k | } |
146 | | |
147 | | void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, |
148 | | uint8_t *dst, ptrdiff_t dst_stride, |
149 | | const InterpKernel *filter, int x0_q4, |
150 | | int x_step_q4, int y0_q4, int y_step_q4, int w, |
151 | 2.02k | int h) { |
152 | 2.02k | (void)x0_q4; |
153 | 2.02k | (void)x_step_q4; |
154 | 2.02k | convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, |
155 | 2.02k | w, h); |
156 | 2.02k | } |
157 | | |
158 | | void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, |
159 | | ptrdiff_t dst_stride, const InterpKernel *filter, |
160 | | int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, |
161 | 178k | int h) { |
162 | | // Note: Fixed size intermediate buffer, temp, places limits on parameters. |
163 | | // 2d filtering proceeds in 2 steps: |
164 | | // (1) Interpolate horizontally into an intermediate buffer, temp. |
165 | | // (2) Interpolate temp vertically to derive the sub-pixel result. |
166 | | // Deriving the maximum number of rows in the temp buffer (135): |
167 | | // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). |
168 | | // --Largest block size is 64x64 pixels. |
169 | | // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the |
170 | | // original frame (in 1/16th pixel units). |
171 | | // --Must round-up because block may be located at sub-pixel position. |
172 | | // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. |
173 | | // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. |
174 | | // When calling in frame scaling function, the smallest scaling factor is x1/4 |
175 | | // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still |
176 | | // big enough. |
177 | 178k | uint8_t temp[64 * 135]; |
178 | 178k | const int intermediate_height = |
179 | 178k | (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; |
180 | | |
181 | 178k | assert(w <= 64); |
182 | 178k | assert(h <= 64); |
183 | 178k | assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); |
184 | 178k | assert(x_step_q4 <= 64); |
185 | | |
186 | 178k | convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, |
187 | 178k | filter, x0_q4, x_step_q4, w, intermediate_height); |
188 | 178k | convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter, |
189 | 178k | y0_q4, y_step_q4, w, h); |
190 | 178k | } |
191 | | |
192 | | void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, |
193 | | ptrdiff_t dst_stride, const InterpKernel *filter, |
194 | | int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, |
195 | 178k | int w, int h) { |
196 | | // Fixed size intermediate buffer places limits on parameters. |
197 | 178k | DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); |
198 | 178k | assert(w <= 64); |
199 | 178k | assert(h <= 64); |
200 | | |
201 | 178k | vpx_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4, |
202 | 178k | y_step_q4, w, h); |
203 | 178k | vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); |
204 | 178k | } |
205 | | |
206 | | void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, |
207 | | ptrdiff_t dst_stride, const InterpKernel *filter, |
208 | | int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, |
209 | 0 | int w, int h) { |
210 | 0 | int r; |
211 | |
|
212 | 0 | (void)filter; |
213 | 0 | (void)x0_q4; |
214 | 0 | (void)x_step_q4; |
215 | 0 | (void)y0_q4; |
216 | 0 | (void)y_step_q4; |
217 | |
|
218 | 0 | for (r = h; r > 0; --r) { |
219 | 0 | memcpy(dst, src, w); |
220 | 0 | src += src_stride; |
221 | 0 | dst += dst_stride; |
222 | 0 | } |
223 | 0 | } |
224 | | |
225 | | void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, |
226 | | ptrdiff_t dst_stride, const InterpKernel *filter, |
227 | | int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, |
228 | 178k | int w, int h) { |
229 | 178k | int x, y; |
230 | | |
231 | 178k | (void)filter; |
232 | 178k | (void)x0_q4; |
233 | 178k | (void)x_step_q4; |
234 | 178k | (void)y0_q4; |
235 | 178k | (void)y_step_q4; |
236 | | |
237 | 1.29M | for (y = 0; y < h; ++y) { |
238 | 16.7M | for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); |
239 | 1.12M | src += src_stride; |
240 | 1.12M | dst += dst_stride; |
241 | 1.12M | } |
242 | 178k | } |
243 | | |
244 | | void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, |
245 | | ptrdiff_t dst_stride, const InterpKernel *filter, |
246 | | int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, |
247 | 159k | int w, int h) { |
248 | 159k | vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, |
249 | 159k | x_step_q4, y0_q4, y_step_q4, w, h); |
250 | 159k | } |
251 | | |
252 | | void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, |
253 | | ptrdiff_t dst_stride, const InterpKernel *filter, |
254 | | int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, |
255 | 12.6k | int w, int h) { |
256 | 12.6k | vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, |
257 | 12.6k | x_step_q4, y0_q4, y_step_q4, w, h); |
258 | 12.6k | } |
259 | | |
260 | | void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, |
261 | | ptrdiff_t dst_stride, const InterpKernel *filter, |
262 | | int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, |
263 | 0 | int h) { |
264 | 0 | vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, |
265 | 0 | y0_q4, y_step_q4, w, h); |
266 | 0 | } |
267 | | |
268 | | void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, |
269 | | uint8_t *dst, ptrdiff_t dst_stride, |
270 | | const InterpKernel *filter, int x0_q4, |
271 | | int x_step_q4, int y0_q4, int y_step_q4, int w, |
272 | 108k | int h) { |
273 | 108k | vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, |
274 | 108k | x_step_q4, y0_q4, y_step_q4, w, h); |
275 | 108k | } |
276 | | |
277 | | void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, |
278 | | uint8_t *dst, ptrdiff_t dst_stride, |
279 | | const InterpKernel *filter, int x0_q4, int x_step_q4, |
280 | 2.02k | int y0_q4, int y_step_q4, int w, int h) { |
281 | 2.02k | vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, |
282 | 2.02k | x_step_q4, y0_q4, y_step_q4, w, h); |
283 | 2.02k | } |
284 | | |
285 | | void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, |
286 | | ptrdiff_t dst_stride, const InterpKernel *filter, |
287 | | int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, |
288 | 178k | int w, int h) { |
289 | 178k | vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, |
290 | 178k | x_step_q4, y0_q4, y_step_q4, w, h); |
291 | 178k | } |
292 | | |
293 | | #if CONFIG_VP9_HIGHBITDEPTH |
294 | | static void highbd_convolve_horiz(const uint16_t *src, ptrdiff_t src_stride, |
295 | | uint16_t *dst, ptrdiff_t dst_stride, |
296 | | const InterpKernel *x_filters, int x0_q4, |
297 | 98.3k | int x_step_q4, int w, int h, int bd) { |
298 | 98.3k | int x, y; |
299 | 98.3k | src -= SUBPEL_TAPS / 2 - 1; |
300 | | |
301 | 1.01M | for (y = 0; y < h; ++y) { |
302 | 914k | int x_q4 = x0_q4; |
303 | 8.23M | for (x = 0; x < w; ++x) { |
304 | 7.32M | const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; |
305 | 7.32M | const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; |
306 | 7.32M | int k, sum = 0; |
307 | 65.8M | for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; |
308 | 7.32M | dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); |
309 | 7.32M | x_q4 += x_step_q4; |
310 | 7.32M | } |
311 | 914k | src += src_stride; |
312 | 914k | dst += dst_stride; |
313 | 914k | } |
314 | 98.3k | } |
315 | | |
316 | | static void highbd_convolve_avg_horiz(const uint16_t *src, ptrdiff_t src_stride, |
317 | | uint16_t *dst, ptrdiff_t dst_stride, |
318 | | const InterpKernel *x_filters, int x0_q4, |
319 | 6.56k | int x_step_q4, int w, int h, int bd) { |
320 | 6.56k | int x, y; |
321 | 6.56k | src -= SUBPEL_TAPS / 2 - 1; |
322 | | |
323 | 42.5k | for (y = 0; y < h; ++y) { |
324 | 35.9k | int x_q4 = x0_q4; |
325 | 477k | for (x = 0; x < w; ++x) { |
326 | 441k | const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; |
327 | 441k | const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; |
328 | 441k | int k, sum = 0; |
329 | 3.97M | for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; |
330 | 441k | dst[x] = ROUND_POWER_OF_TWO( |
331 | 441k | dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), |
332 | 441k | 1); |
333 | 441k | x_q4 += x_step_q4; |
334 | 441k | } |
335 | 35.9k | src += src_stride; |
336 | 35.9k | dst += dst_stride; |
337 | 35.9k | } |
338 | 6.56k | } |
339 | | |
340 | | static void highbd_convolve_vert(const uint16_t *src, ptrdiff_t src_stride, |
341 | | uint16_t *dst, ptrdiff_t dst_stride, |
342 | | const InterpKernel *y_filters, int y0_q4, |
343 | 174k | int y_step_q4, int w, int h, int bd) { |
344 | 174k | int x, y; |
345 | 174k | src -= src_stride * (SUBPEL_TAPS / 2 - 1); |
346 | | |
347 | 1.40M | for (x = 0; x < w; ++x) { |
348 | 1.23M | int y_q4 = y0_q4; |
349 | 15.1M | for (y = 0; y < h; ++y) { |
350 | 13.9M | const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; |
351 | 13.9M | const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; |
352 | 13.9M | int k, sum = 0; |
353 | 125M | for (k = 0; k < SUBPEL_TAPS; ++k) |
354 | 111M | sum += src_y[k * src_stride] * y_filter[k]; |
355 | 13.9M | dst[y * dst_stride] = |
356 | 13.9M | clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); |
357 | 13.9M | y_q4 += y_step_q4; |
358 | 13.9M | } |
359 | 1.23M | ++src; |
360 | 1.23M | ++dst; |
361 | 1.23M | } |
362 | 174k | } |
363 | | |
364 | | static void highbd_convolve_avg_vert(const uint16_t *src, ptrdiff_t src_stride, |
365 | | uint16_t *dst, ptrdiff_t dst_stride, |
366 | | const InterpKernel *y_filters, int y0_q4, |
367 | 29.8k | int y_step_q4, int w, int h, int bd) { |
368 | 29.8k | int x, y; |
369 | 29.8k | src -= src_stride * (SUBPEL_TAPS / 2 - 1); |
370 | | |
371 | 247k | for (x = 0; x < w; ++x) { |
372 | 217k | int y_q4 = y0_q4; |
373 | 2.98M | for (y = 0; y < h; ++y) { |
374 | 2.76M | const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; |
375 | 2.76M | const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; |
376 | 2.76M | int k, sum = 0; |
377 | 24.8M | for (k = 0; k < SUBPEL_TAPS; ++k) |
378 | 22.1M | sum += src_y[k * src_stride] * y_filter[k]; |
379 | 2.76M | dst[y * dst_stride] = ROUND_POWER_OF_TWO( |
380 | 2.76M | dst[y * dst_stride] + |
381 | 2.76M | clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), |
382 | 2.76M | 1); |
383 | 2.76M | y_q4 += y_step_q4; |
384 | 2.76M | } |
385 | 217k | ++src; |
386 | 217k | ++dst; |
387 | 217k | } |
388 | 29.8k | } |
389 | | |
390 | | static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride, |
391 | | uint16_t *dst, ptrdiff_t dst_stride, |
392 | | const InterpKernel *filter, int x0_q4, |
393 | | int x_step_q4, int y0_q4, int y_step_q4, int w, |
394 | 90.8k | int h, int bd) { |
395 | | // Note: Fixed size intermediate buffer, temp, places limits on parameters. |
396 | | // 2d filtering proceeds in 2 steps: |
397 | | // (1) Interpolate horizontally into an intermediate buffer, temp. |
398 | | // (2) Interpolate temp vertically to derive the sub-pixel result. |
399 | | // Deriving the maximum number of rows in the temp buffer (135): |
400 | | // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). |
401 | | // --Largest block size is 64x64 pixels. |
402 | | // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the |
403 | | // original frame (in 1/16th pixel units). |
404 | | // --Must round-up because block may be located at sub-pixel position. |
405 | | // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. |
406 | | // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. |
407 | 90.8k | uint16_t temp[64 * 135]; |
408 | 90.8k | const int intermediate_height = |
409 | 90.8k | (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; |
410 | | |
411 | 90.8k | assert(w <= 64); |
412 | 90.8k | assert(h <= 64); |
413 | 90.8k | assert(y_step_q4 <= 32); |
414 | 90.8k | assert(x_step_q4 <= 32); |
415 | | |
416 | 90.8k | highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, |
417 | 90.8k | temp, 64, filter, x0_q4, x_step_q4, w, |
418 | 90.8k | intermediate_height, bd); |
419 | 90.8k | highbd_convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, |
420 | 90.8k | filter, y0_q4, y_step_q4, w, h, bd); |
421 | 90.8k | } |
422 | | |
423 | | void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, |
424 | | uint16_t *dst, ptrdiff_t dst_stride, |
425 | | const InterpKernel *filter, int x0_q4, |
426 | | int x_step_q4, int y0_q4, int y_step_q4, |
427 | 7.48k | int w, int h, int bd) { |
428 | 7.48k | (void)y0_q4; |
429 | 7.48k | (void)y_step_q4; |
430 | | |
431 | 7.48k | highbd_convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, |
432 | 7.48k | x_step_q4, w, h, bd); |
433 | 7.48k | } |
434 | | |
435 | | void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, |
436 | | uint16_t *dst, ptrdiff_t dst_stride, |
437 | | const InterpKernel *filter, int x0_q4, |
438 | | int x_step_q4, int y0_q4, int y_step_q4, |
439 | 6.56k | int w, int h, int bd) { |
440 | 6.56k | (void)y0_q4; |
441 | 6.56k | (void)y_step_q4; |
442 | | |
443 | 6.56k | highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, |
444 | 6.56k | x_step_q4, w, h, bd); |
445 | 6.56k | } |
446 | | |
447 | | void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, |
448 | | uint16_t *dst, ptrdiff_t dst_stride, |
449 | | const InterpKernel *filter, int x0_q4, |
450 | | int x_step_q4, int y0_q4, int y_step_q4, int w, |
451 | 83.6k | int h, int bd) { |
452 | 83.6k | (void)x0_q4; |
453 | 83.6k | (void)x_step_q4; |
454 | | |
455 | 83.6k | highbd_convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, |
456 | 83.6k | y_step_q4, w, h, bd); |
457 | 83.6k | } |
458 | | |
459 | | void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, |
460 | | uint16_t *dst, ptrdiff_t dst_stride, |
461 | | const InterpKernel *filter, int x0_q4, |
462 | | int x_step_q4, int y0_q4, int y_step_q4, |
463 | 29.8k | int w, int h, int bd) { |
464 | 29.8k | (void)x0_q4; |
465 | 29.8k | (void)x_step_q4; |
466 | | |
467 | 29.8k | highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, |
468 | 29.8k | y_step_q4, w, h, bd); |
469 | 29.8k | } |
470 | | |
471 | | void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, |
472 | | uint16_t *dst, ptrdiff_t dst_stride, |
473 | | const InterpKernel *filter, int x0_q4, |
474 | | int x_step_q4, int y0_q4, int y_step_q4, int w, |
475 | 90.8k | int h, int bd) { |
476 | 90.8k | highbd_convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, |
477 | 90.8k | y0_q4, y_step_q4, w, h, bd); |
478 | 90.8k | } |
479 | | |
480 | | void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, |
481 | | uint16_t *dst, ptrdiff_t dst_stride, |
482 | | const InterpKernel *filter, int x0_q4, |
483 | | int x_step_q4, int y0_q4, int y_step_q4, int w, |
484 | 27.7k | int h, int bd) { |
485 | | // Fixed size intermediate buffer places limits on parameters. |
486 | 27.7k | DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]); |
487 | 27.7k | assert(w <= 64); |
488 | 27.7k | assert(h <= 64); |
489 | | |
490 | 27.7k | vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, |
491 | 27.7k | y0_q4, y_step_q4, w, h, bd); |
492 | 27.7k | vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h, |
493 | 27.7k | bd); |
494 | 27.7k | } |
495 | | |
496 | | void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, |
497 | | uint16_t *dst, ptrdiff_t dst_stride, |
498 | | const InterpKernel *filter, int x0_q4, |
499 | | int x_step_q4, int y0_q4, int y_step_q4, int w, |
500 | 0 | int h, int bd) { |
501 | 0 | int r; |
502 | |
|
503 | 0 | (void)filter; |
504 | 0 | (void)x0_q4; |
505 | 0 | (void)x_step_q4; |
506 | 0 | (void)y0_q4; |
507 | 0 | (void)y_step_q4; |
508 | 0 | (void)bd; |
509 | |
|
510 | 0 | for (r = h; r > 0; --r) { |
511 | 0 | memcpy(dst, src, w * sizeof(uint16_t)); |
512 | 0 | src += src_stride; |
513 | 0 | dst += dst_stride; |
514 | 0 | } |
515 | 0 | } |
516 | | |
517 | | void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, |
518 | | uint16_t *dst, ptrdiff_t dst_stride, |
519 | | const InterpKernel *filter, int x0_q4, |
520 | | int x_step_q4, int y0_q4, int y_step_q4, int w, |
521 | 27.7k | int h, int bd) { |
522 | 27.7k | int x, y; |
523 | | |
524 | 27.7k | (void)filter; |
525 | 27.7k | (void)x0_q4; |
526 | 27.7k | (void)x_step_q4; |
527 | 27.7k | (void)y0_q4; |
528 | 27.7k | (void)y_step_q4; |
529 | 27.7k | (void)bd; |
530 | | |
531 | 202k | for (y = 0; y < h; ++y) { |
532 | 1.86M | for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); |
533 | 174k | src += src_stride; |
534 | 174k | dst += dst_stride; |
535 | 174k | } |
536 | 27.7k | } |
537 | | #endif |