/work/dav1d/src/cdef_tmpl.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright © 2018, VideoLAN and dav1d authors |
3 | | * Copyright © 2018, Two Orioles, LLC |
4 | | * All rights reserved. |
5 | | * |
6 | | * Redistribution and use in source and binary forms, with or without |
7 | | * modification, are permitted provided that the following conditions are met: |
8 | | * |
9 | | * 1. Redistributions of source code must retain the above copyright notice, this |
10 | | * list of conditions and the following disclaimer. |
11 | | * |
12 | | * 2. Redistributions in binary form must reproduce the above copyright notice, |
13 | | * this list of conditions and the following disclaimer in the documentation |
14 | | * and/or other materials provided with the distribution. |
15 | | * |
16 | | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
17 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
18 | | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
19 | | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
20 | | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
21 | | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
22 | | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
23 | | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
24 | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
25 | | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
26 | | */ |
27 | | |
28 | | #include "config.h" |
29 | | |
30 | | #include <stdlib.h> |
31 | | |
32 | | #include "common/intops.h" |
33 | | |
34 | | #include "src/cdef.h" |
35 | | #include "src/tables.h" |
36 | | |
/*
 * Limits the magnitude of a pixel difference.
 *
 * The allowed magnitude is threshold - (|diff| >> shift), floored at zero,
 * so it shrinks as |diff| grows. The smaller of |diff| and that allowance is
 * passed to apply_sign() together with the original diff (apply_sign() is
 * defined elsewhere; presumably it restores the sign of diff).
 */
static inline int constrain(const int diff, const int threshold,
                            const int shift)
{
    const int magnitude = abs(diff);
    const int allowance = imax(0, threshold - (magnitude >> shift));
    const int limited = imin(magnitude, allowance);
    return apply_sign(limited, diff);
}
43 | | |
/*
 * Writes INT16_MIN into a w x h rectangle whose top-left element is tmp and
 * whose rows are stride elements apart.
 *
 * INT16_MIN is a large positive number when interpreted as unsigned and a
 * large negative number when interpreted as signed.
 */
static inline void fill(int16_t *tmp, const ptrdiff_t stride,
                        const int w, const int h)
{
    int16_t *row = tmp;
    for (int rows_done = 0; rows_done < h; rows_done++, row += stride) {
        for (int col = 0; col < w; col++)
            row[col] = INT16_MIN;
    }
}
55 | | |
56 | | static void padding(int16_t *tmp, const ptrdiff_t tmp_stride, |
57 | | const pixel *src, const ptrdiff_t src_stride, |
58 | | const pixel (*left)[2], |
59 | | const pixel *top, const pixel *bottom, |
60 | | const int w, const int h, const enum CdefEdgeFlags edges) |
61 | 0 | { |
62 | | // fill extended input buffer |
63 | 0 | int x_start = -2, x_end = w + 2, y_start = -2, y_end = h + 2; |
64 | 0 | if (!(edges & CDEF_HAVE_TOP)) { |
65 | 0 | fill(tmp - 2 - 2 * tmp_stride, tmp_stride, w + 4, 2); |
66 | 0 | y_start = 0; |
67 | 0 | } |
68 | 0 | if (!(edges & CDEF_HAVE_BOTTOM)) { |
69 | 0 | fill(tmp + h * tmp_stride - 2, tmp_stride, w + 4, 2); |
70 | 0 | y_end -= 2; |
71 | 0 | } |
72 | 0 | if (!(edges & CDEF_HAVE_LEFT)) { |
73 | 0 | fill(tmp + y_start * tmp_stride - 2, tmp_stride, 2, y_end - y_start); |
74 | 0 | x_start = 0; |
75 | 0 | } |
76 | 0 | if (!(edges & CDEF_HAVE_RIGHT)) { |
77 | 0 | fill(tmp + y_start * tmp_stride + w, tmp_stride, 2, y_end - y_start); |
78 | 0 | x_end -= 2; |
79 | 0 | } |
80 | |
|
81 | 0 | for (int y = y_start; y < 0; y++) { |
82 | 0 | for (int x = x_start; x < x_end; x++) |
83 | 0 | tmp[x + y * tmp_stride] = top[x]; |
84 | 0 | top += PXSTRIDE(src_stride); |
85 | 0 | } |
86 | 0 | for (int y = 0; y < h; y++) |
87 | 0 | for (int x = x_start; x < 0; x++) |
88 | 0 | tmp[x + y * tmp_stride] = left[y][2 + x]; |
89 | 0 | for (int y = 0; y < h; y++) { |
90 | 0 | for (int x = (y < h) ? 0 : x_start; x < x_end; x++) |
91 | 0 | tmp[x] = src[x]; |
92 | 0 | src += PXSTRIDE(src_stride); |
93 | 0 | tmp += tmp_stride; |
94 | 0 | } |
95 | 0 | for (int y = h; y < y_end; y++) { |
96 | 0 | for (int x = x_start; x < x_end; x++) |
97 | 0 | tmp[x] = bottom[x]; |
98 | 0 | bottom += PXSTRIDE(src_stride); |
99 | 0 | tmp += tmp_stride; |
100 | 0 | } |
101 | |
|
102 | 0 | } |
103 | | |
104 | | static NOINLINE void |
105 | | cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride, |
106 | | const pixel (*left)[2], |
107 | | const pixel *const top, const pixel *const bottom, |
108 | | const int pri_strength, const int sec_strength, |
109 | | const int dir, const int damping, const int w, int h, |
110 | | const enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX) |
111 | 0 | { |
112 | 0 | const ptrdiff_t tmp_stride = 12; |
113 | 0 | assert((w == 4 || w == 8) && (h == 4 || h == 8)); |
114 | 0 | int16_t tmp_buf[144]; // 12*12 is the maximum value of tmp_stride * (h + 4) |
115 | 0 | int16_t *tmp = tmp_buf + 2 * tmp_stride + 2; |
116 | |
|
117 | 0 | padding(tmp, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges); |
118 | |
|
119 | 0 | if (pri_strength) { |
120 | 0 | const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; |
121 | 0 | const int pri_tap = 4 - ((pri_strength >> bitdepth_min_8) & 1); |
122 | 0 | const int pri_shift = imax(0, damping - ulog2(pri_strength)); |
123 | 0 | if (sec_strength) { |
124 | 0 | const int sec_shift = damping - ulog2(sec_strength); |
125 | 0 | do { |
126 | 0 | for (int x = 0; x < w; x++) { |
127 | 0 | const int px = dst[x]; |
128 | 0 | int sum = 0; |
129 | 0 | int max = px, min = px; |
130 | 0 | int pri_tap_k = pri_tap; |
131 | 0 | for (int k = 0; k < 2; k++) { |
132 | 0 | const int off1 = dav1d_cdef_directions[dir + 2][k]; // dir |
133 | 0 | const int p0 = tmp[x + off1]; |
134 | 0 | const int p1 = tmp[x - off1]; |
135 | 0 | sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift); |
136 | 0 | sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift); |
137 | | // if pri_tap_k == 4 then it becomes 2 else it remains 3 |
138 | 0 | pri_tap_k = (pri_tap_k & 3) | 2; |
139 | 0 | min = umin(p0, min); |
140 | 0 | max = imax(p0, max); |
141 | 0 | min = umin(p1, min); |
142 | 0 | max = imax(p1, max); |
143 | 0 | const int off2 = dav1d_cdef_directions[dir + 4][k]; // dir + 2 |
144 | 0 | const int off3 = dav1d_cdef_directions[dir + 0][k]; // dir - 2 |
145 | 0 | const int s0 = tmp[x + off2]; |
146 | 0 | const int s1 = tmp[x - off2]; |
147 | 0 | const int s2 = tmp[x + off3]; |
148 | 0 | const int s3 = tmp[x - off3]; |
149 | | // sec_tap starts at 2 and becomes 1 |
150 | 0 | const int sec_tap = 2 - k; |
151 | 0 | sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift); |
152 | 0 | sum += sec_tap * constrain(s1 - px, sec_strength, sec_shift); |
153 | 0 | sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift); |
154 | 0 | sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift); |
155 | 0 | min = umin(s0, min); |
156 | 0 | max = imax(s0, max); |
157 | 0 | min = umin(s1, min); |
158 | 0 | max = imax(s1, max); |
159 | 0 | min = umin(s2, min); |
160 | 0 | max = imax(s2, max); |
161 | 0 | min = umin(s3, min); |
162 | 0 | max = imax(s3, max); |
163 | 0 | } |
164 | 0 | dst[x] = iclip(px + ((sum - (sum < 0) + 8) >> 4), min, max); |
165 | 0 | } |
166 | 0 | dst += PXSTRIDE(dst_stride); |
167 | 0 | tmp += tmp_stride; |
168 | 0 | } while (--h); |
169 | 0 | } else { // pri_strength only |
170 | 0 | do { |
171 | 0 | for (int x = 0; x < w; x++) { |
172 | 0 | const int px = dst[x]; |
173 | 0 | int sum = 0; |
174 | 0 | int pri_tap_k = pri_tap; |
175 | 0 | for (int k = 0; k < 2; k++) { |
176 | 0 | const int off = dav1d_cdef_directions[dir + 2][k]; // dir |
177 | 0 | const int p0 = tmp[x + off]; |
178 | 0 | const int p1 = tmp[x - off]; |
179 | 0 | sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift); |
180 | 0 | sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift); |
181 | 0 | pri_tap_k = (pri_tap_k & 3) | 2; |
182 | 0 | } |
183 | 0 | dst[x] = px + ((sum - (sum < 0) + 8) >> 4); |
184 | 0 | } |
185 | 0 | dst += PXSTRIDE(dst_stride); |
186 | 0 | tmp += tmp_stride; |
187 | 0 | } while (--h); |
188 | 0 | } |
189 | 0 | } else { // sec_strength only |
190 | 0 | assert(sec_strength); |
191 | 0 | const int sec_shift = damping - ulog2(sec_strength); |
192 | 0 | do { |
193 | 0 | for (int x = 0; x < w; x++) { |
194 | 0 | const int px = dst[x]; |
195 | 0 | int sum = 0; |
196 | 0 | for (int k = 0; k < 2; k++) { |
197 | 0 | const int off1 = dav1d_cdef_directions[dir + 4][k]; // dir + 2 |
198 | 0 | const int off2 = dav1d_cdef_directions[dir + 0][k]; // dir - 2 |
199 | 0 | const int s0 = tmp[x + off1]; |
200 | 0 | const int s1 = tmp[x - off1]; |
201 | 0 | const int s2 = tmp[x + off2]; |
202 | 0 | const int s3 = tmp[x - off2]; |
203 | 0 | const int sec_tap = 2 - k; |
204 | 0 | sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift); |
205 | 0 | sum += sec_tap * constrain(s1 - px, sec_strength, sec_shift); |
206 | 0 | sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift); |
207 | 0 | sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift); |
208 | 0 | } |
209 | 0 | dst[x] = px + ((sum - (sum < 0) + 8) >> 4); |
210 | 0 | } |
211 | 0 | dst += PXSTRIDE(dst_stride); |
212 | 0 | tmp += tmp_stride; |
213 | 0 | } while (--h); |
214 | 0 | } |
215 | 0 | } |
216 | | |
217 | | #define cdef_fn(w, h) \ |
218 | | static void cdef_filter_block_##w##x##h##_c(pixel *const dst, \ |
219 | | const ptrdiff_t stride, \ |
220 | | const pixel (*left)[2], \ |
221 | | const pixel *const top, \ |
222 | | const pixel *const bottom, \ |
223 | | const int pri_strength, \ |
224 | | const int sec_strength, \ |
225 | | const int dir, \ |
226 | | const int damping, \ |
227 | | const enum CdefEdgeFlags edges \ |
228 | 0 | HIGHBD_DECL_SUFFIX) \ |
229 | 0 | { \ |
230 | 0 | cdef_filter_block_c(dst, stride, left, top, bottom, \ |
231 | 0 | pri_strength, sec_strength, dir, damping, w, h, edges HIGHBD_TAIL_SUFFIX); \ |
232 | 0 | } Unexecuted instantiation: cdef_tmpl.c:cdef_filter_block_8x8_c Unexecuted instantiation: cdef_tmpl.c:cdef_filter_block_4x8_c Unexecuted instantiation: cdef_tmpl.c:cdef_filter_block_4x4_c |
233 | | |
234 | | cdef_fn(4, 4); |
235 | | cdef_fn(4, 8); |
236 | | cdef_fn(8, 8); |
237 | | |
238 | | static int cdef_find_dir_c(const pixel *img, const ptrdiff_t stride, |
239 | | unsigned *const var HIGHBD_DECL_SUFFIX) |
240 | 0 | { |
241 | 0 | const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; |
242 | 0 | int partial_sum_hv[2][8] = { { 0 } }; |
243 | 0 | int partial_sum_diag[2][15] = { { 0 } }; |
244 | 0 | int partial_sum_alt[4][11] = { { 0 } }; |
245 | |
|
246 | 0 | for (int y = 0; y < 8; y++) { |
247 | 0 | for (int x = 0; x < 8; x++) { |
248 | 0 | const int px = (img[x] >> bitdepth_min_8) - 128; |
249 | |
|
250 | 0 | partial_sum_diag[0][ y + x ] += px; |
251 | 0 | partial_sum_alt [0][ y + (x >> 1)] += px; |
252 | 0 | partial_sum_hv [0][ y ] += px; |
253 | 0 | partial_sum_alt [1][3 + y - (x >> 1)] += px; |
254 | 0 | partial_sum_diag[1][7 + y - x ] += px; |
255 | 0 | partial_sum_alt [2][3 - (y >> 1) + x ] += px; |
256 | 0 | partial_sum_hv [1][ x ] += px; |
257 | 0 | partial_sum_alt [3][ (y >> 1) + x ] += px; |
258 | 0 | } |
259 | 0 | img += PXSTRIDE(stride); |
260 | 0 | } |
261 | |
|
262 | 0 | unsigned cost[8] = { 0 }; |
263 | 0 | for (int n = 0; n < 8; n++) { |
264 | 0 | cost[2] += partial_sum_hv[0][n] * partial_sum_hv[0][n]; |
265 | 0 | cost[6] += partial_sum_hv[1][n] * partial_sum_hv[1][n]; |
266 | 0 | } |
267 | 0 | cost[2] *= 105; |
268 | 0 | cost[6] *= 105; |
269 | |
|
270 | 0 | static const uint16_t div_table[7] = { 840, 420, 280, 210, 168, 140, 120 }; |
271 | 0 | for (int n = 0; n < 7; n++) { |
272 | 0 | const int d = div_table[n]; |
273 | 0 | cost[0] += (partial_sum_diag[0][n] * partial_sum_diag[0][n] + |
274 | 0 | partial_sum_diag[0][14 - n] * partial_sum_diag[0][14 - n]) * d; |
275 | 0 | cost[4] += (partial_sum_diag[1][n] * partial_sum_diag[1][n] + |
276 | 0 | partial_sum_diag[1][14 - n] * partial_sum_diag[1][14 - n]) * d; |
277 | 0 | } |
278 | 0 | cost[0] += partial_sum_diag[0][7] * partial_sum_diag[0][7] * 105; |
279 | 0 | cost[4] += partial_sum_diag[1][7] * partial_sum_diag[1][7] * 105; |
280 | |
|
281 | 0 | for (int n = 0; n < 4; n++) { |
282 | 0 | unsigned *const cost_ptr = &cost[n * 2 + 1]; |
283 | 0 | for (int m = 0; m < 5; m++) |
284 | 0 | *cost_ptr += partial_sum_alt[n][3 + m] * partial_sum_alt[n][3 + m]; |
285 | 0 | *cost_ptr *= 105; |
286 | 0 | for (int m = 0; m < 3; m++) { |
287 | 0 | const int d = div_table[2 * m + 1]; |
288 | 0 | *cost_ptr += (partial_sum_alt[n][m] * partial_sum_alt[n][m] + |
289 | 0 | partial_sum_alt[n][10 - m] * partial_sum_alt[n][10 - m]) * d; |
290 | 0 | } |
291 | 0 | } |
292 | |
|
293 | 0 | int best_dir = 0; |
294 | 0 | unsigned best_cost = cost[0]; |
295 | 0 | for (int n = 1; n < 8; n++) { |
296 | 0 | if (cost[n] > best_cost) { |
297 | 0 | best_cost = cost[n]; |
298 | 0 | best_dir = n; |
299 | 0 | } |
300 | 0 | } |
301 | |
|
302 | 0 | *var = (best_cost - (cost[best_dir ^ 4])) >> 10; |
303 | 0 | return best_dir; |
304 | 0 | } |
305 | | |
306 | | #if HAVE_ASM |
307 | | #if ARCH_AARCH64 || ARCH_ARM |
308 | | #include "src/arm/cdef.h" |
309 | | #elif ARCH_PPC64LE |
310 | | #include "src/ppc/cdef.h" |
311 | | #elif ARCH_RISCV |
312 | | #include "src/riscv/cdef.h" |
313 | | #elif ARCH_X86 |
314 | | #include "src/x86/cdef.h" |
315 | | #elif ARCH_LOONGARCH64 |
316 | | #include "src/loongarch/cdef.h" |
317 | | #endif |
318 | | #endif |
319 | | |
320 | 0 | COLD void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) { |
321 | 0 | c->dir = cdef_find_dir_c; |
322 | 0 | c->fb[0] = cdef_filter_block_8x8_c; |
323 | 0 | c->fb[1] = cdef_filter_block_4x8_c; |
324 | 0 | c->fb[2] = cdef_filter_block_4x4_c; |
325 | |
|
326 | | #if HAVE_ASM |
327 | | #if ARCH_AARCH64 || ARCH_ARM |
328 | | cdef_dsp_init_arm(c); |
329 | | #elif ARCH_PPC64LE |
330 | | cdef_dsp_init_ppc(c); |
331 | | #elif ARCH_RISCV |
332 | | cdef_dsp_init_riscv(c); |
333 | | #elif ARCH_X86 |
334 | | cdef_dsp_init_x86(c); |
335 | | #elif ARCH_LOONGARCH64 |
336 | | cdef_dsp_init_loongarch(c); |
337 | | #endif |
338 | | #endif |
339 | 0 | } Unexecuted instantiation: dav1d_cdef_dsp_init_8bpc Unexecuted instantiation: dav1d_cdef_dsp_init_16bpc |