/work/dav1d/src/looprestoration_tmpl.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright © 2018, VideoLAN and dav1d authors |
3 | | * Copyright © 2018, Two Orioles, LLC |
4 | | * All rights reserved. |
5 | | * |
6 | | * Redistribution and use in source and binary forms, with or without |
7 | | * modification, are permitted provided that the following conditions are met: |
8 | | * |
9 | | * 1. Redistributions of source code must retain the above copyright notice, this |
10 | | * list of conditions and the following disclaimer. |
11 | | * |
12 | | * 2. Redistributions in binary form must reproduce the above copyright notice, |
13 | | * this list of conditions and the following disclaimer in the documentation |
14 | | * and/or other materials provided with the distribution. |
15 | | * |
16 | | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
17 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
18 | | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
19 | | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
20 | | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
21 | | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
22 | | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
23 | | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
24 | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
25 | | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
26 | | */ |
27 | | |
28 | | #include "config.h" |
29 | | |
30 | | #include <stdint.h> |
31 | | #include <stdlib.h> |
32 | | #include <string.h> |
33 | | |
34 | | #include "common/attributes.h" |
35 | | #include "common/bitdepth.h" |
36 | | #include "common/intops.h" |
37 | | |
38 | | #include "src/looprestoration.h" |
39 | | #include "src/tables.h" |
40 | | |
41 | | // 256 * 1.5 + 3 + 3 = 390 |
42 | 0 | #define REST_UNIT_STRIDE (390) |
43 | | |
44 | | static void wiener_filter_h(uint16_t *dst, const pixel (*left)[4], |
45 | | const pixel *src, const int16_t fh[8], |
46 | | const int w, const enum LrEdgeFlags edges |
47 | | HIGHBD_DECL_SUFFIX) |
48 | 0 | { |
49 | 0 | const int bitdepth = bitdepth_from_max(bitdepth_max); |
50 | 0 | const int round_bits_h = 3 + (bitdepth == 12) * 2; |
51 | 0 | const int rounding_off_h = 1 << (round_bits_h - 1); |
52 | 0 | const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h); |
53 | |
|
54 | 0 | if (w < 6) { |
55 | | // For small widths, do the fully conditional loop with |
56 | | // conditions on each access. |
57 | 0 | for (int x = 0; x < w; x++) { |
58 | 0 | int sum = (1 << (bitdepth + 6)); |
59 | 0 | #if BITDEPTH == 8 |
60 | 0 | sum += src[x] * 128; |
61 | 0 | #endif |
62 | 0 | for (int i = 0; i < 7; i++) { |
63 | 0 | int idx = x + i - 3; |
64 | 0 | if (idx < 0) { |
65 | 0 | if (!(edges & LR_HAVE_LEFT)) |
66 | 0 | sum += src[0] * fh[i]; |
67 | 0 | else if (left) |
68 | 0 | sum += left[0][4 + idx] * fh[i]; |
69 | 0 | else |
70 | 0 | sum += src[idx] * fh[i]; |
71 | 0 | } else if (idx >= w && !(edges & LR_HAVE_RIGHT)) { |
72 | 0 | sum += src[w - 1] * fh[i]; |
73 | 0 | } else |
74 | 0 | sum += src[idx] * fh[i]; |
75 | 0 | } |
76 | 0 | sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1); |
77 | 0 | dst[x] = sum; |
78 | 0 | } |
79 | |
|
80 | 0 | return; |
81 | 0 | } |
82 | | |
83 | | // For larger widths, do separate loops with less conditions; first |
84 | | // handle the start of the row. |
85 | 0 | int start = 3; |
86 | 0 | if (!(edges & LR_HAVE_LEFT)) { |
87 | | // If there's no left edge, pad using the leftmost pixel. |
88 | 0 | for (int x = 0; x < 3; x++) { |
89 | 0 | int sum = (1 << (bitdepth + 6)); |
90 | 0 | #if BITDEPTH == 8 |
91 | 0 | sum += src[x] * 128; |
92 | 0 | #endif |
93 | 0 | for (int i = 0; i < 7; i++) { |
94 | 0 | int idx = x + i - 3; |
95 | 0 | if (idx < 0) |
96 | 0 | sum += src[0] * fh[i]; |
97 | 0 | else |
98 | 0 | sum += src[idx] * fh[i]; |
99 | 0 | } |
100 | 0 | sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1); |
101 | 0 | dst[x] = sum; |
102 | 0 | } |
103 | 0 | } else if (left) { |
104 | | // If we have the left edge and a separate left buffer, pad using that. |
105 | 0 | for (int x = 0; x < 3; x++) { |
106 | 0 | int sum = (1 << (bitdepth + 6)); |
107 | 0 | #if BITDEPTH == 8 |
108 | 0 | sum += src[x] * 128; |
109 | 0 | #endif |
110 | 0 | for (int i = 0; i < 7; i++) { |
111 | 0 | int idx = x + i - 3; |
112 | 0 | if (idx < 0) |
113 | 0 | sum += left[0][4 + idx] * fh[i]; |
114 | 0 | else |
115 | 0 | sum += src[idx] * fh[i]; |
116 | 0 | } |
117 | 0 | sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1); |
118 | 0 | dst[x] = sum; |
119 | 0 | } |
120 | 0 | } else { |
121 | | // If we have the left edge, but no separate left buffer, we're in the |
122 | | // top/bottom area (lpf) with the left edge existing in the same |
123 | | // buffer; just do the regular loop from the start. |
124 | 0 | start = 0; |
125 | 0 | } |
126 | 0 | int end = w - 3; |
127 | 0 | if (edges & LR_HAVE_RIGHT) |
128 | 0 | end = w; |
129 | | |
130 | | // Do a condititon free loop for the bulk of the row. |
131 | 0 | for (int x = start; x < end; x++) { |
132 | 0 | int sum = (1 << (bitdepth + 6)); |
133 | 0 | #if BITDEPTH == 8 |
134 | 0 | sum += src[x] * 128; |
135 | 0 | #endif |
136 | 0 | for (int i = 0; i < 7; i++) { |
137 | 0 | int idx = x + i - 3; |
138 | 0 | sum += src[idx] * fh[i]; |
139 | 0 | } |
140 | 0 | sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1); |
141 | 0 | dst[x] = sum; |
142 | 0 | } |
143 | | |
144 | | // If we need to, calculate the end of the row with a condition for |
145 | | // right edge padding. |
146 | 0 | for (int x = end; x < w; x++) { |
147 | 0 | int sum = (1 << (bitdepth + 6)); |
148 | 0 | #if BITDEPTH == 8 |
149 | 0 | sum += src[x] * 128; |
150 | 0 | #endif |
151 | 0 | for (int i = 0; i < 7; i++) { |
152 | 0 | int idx = x + i - 3; |
153 | 0 | if (idx >= w) |
154 | 0 | sum += src[w - 1] * fh[i]; |
155 | 0 | else |
156 | 0 | sum += src[idx] * fh[i]; |
157 | 0 | } |
158 | 0 | sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1); |
159 | 0 | dst[x] = sum; |
160 | 0 | } |
161 | 0 | } |
162 | | |
163 | | static void wiener_filter_v(pixel *p, uint16_t **ptrs, const int16_t fv[8], |
164 | | const int w HIGHBD_DECL_SUFFIX) |
165 | 0 | { |
166 | 0 | const int bitdepth = bitdepth_from_max(bitdepth_max); |
167 | |
|
168 | 0 | const int round_bits_v = 11 - (bitdepth == 12) * 2; |
169 | 0 | const int rounding_off_v = 1 << (round_bits_v - 1); |
170 | 0 | const int round_offset = 1 << (bitdepth + (round_bits_v - 1)); |
171 | |
|
172 | 0 | for (int i = 0; i < w; i++) { |
173 | 0 | int sum = -round_offset; |
174 | | |
175 | | // Only filter using 6 input rows. The 7th row is assumed to be |
176 | | // identical to the last one. |
177 | | // |
178 | | // This function is assumed to only be called at the end, when doing |
179 | | // padding at the bottom. |
180 | 0 | for (int k = 0; k < 6; k++) |
181 | 0 | sum += ptrs[k][i] * fv[k]; |
182 | 0 | sum += ptrs[5][i] * fv[6]; |
183 | |
|
184 | 0 | p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v); |
185 | 0 | } |
186 | | |
187 | | // Shift the pointers, but only update the first 5; the 6th pointer is kept |
188 | | // as it was before (and the 7th is implicitly identical to the 6th). |
189 | 0 | for (int i = 0; i < 5; i++) |
190 | 0 | ptrs[i] = ptrs[i + 1]; |
191 | 0 | } |
192 | | |
193 | | static void wiener_filter_hv(pixel *p, uint16_t **ptrs, const pixel (*left)[4], |
194 | | const pixel *src, const int16_t filter[2][8], |
195 | | const int w, const enum LrEdgeFlags edges |
196 | | HIGHBD_DECL_SUFFIX) |
197 | 0 | { |
198 | 0 | const int bitdepth = bitdepth_from_max(bitdepth_max); |
199 | |
|
200 | 0 | const int round_bits_v = 11 - (bitdepth == 12) * 2; |
201 | 0 | const int rounding_off_v = 1 << (round_bits_v - 1); |
202 | 0 | const int round_offset = 1 << (bitdepth + (round_bits_v - 1)); |
203 | |
|
204 | 0 | const int16_t *fh = filter[0]; |
205 | 0 | const int16_t *fv = filter[1]; |
206 | | |
207 | | // Do combined horziontal and vertical filtering; doing horizontal |
208 | | // filtering of one row, combined with vertical filtering of 6 |
209 | | // preexisting rows and the newly filtered row. |
210 | | |
211 | | // For simplicity in the C implementation, just do a separate call |
212 | | // of the horizontal filter, into a temporary buffer. |
213 | 0 | uint16_t tmp[REST_UNIT_STRIDE]; |
214 | 0 | wiener_filter_h(tmp, left, src, fh, w, edges HIGHBD_TAIL_SUFFIX); |
215 | |
|
216 | 0 | for (int i = 0; i < w; i++) { |
217 | 0 | int sum = -round_offset; |
218 | | |
219 | | // Filter using the 6 stored preexisting rows, and the newly |
220 | | // filtered one in tmp[]. |
221 | 0 | for (int k = 0; k < 6; k++) |
222 | 0 | sum += ptrs[k][i] * fv[k]; |
223 | 0 | sum += tmp[i] * fv[6]; |
224 | | // At this point, after having read all inputs at point [i], we |
225 | | // could overwrite [i] with the newly filtered data. |
226 | |
|
227 | 0 | p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v); |
228 | 0 | } |
229 | | |
230 | | // For simplicity in the C implementation, just memcpy the newly |
231 | | // filtered row into ptrs[6]. Normally, in steady state filtering, |
232 | | // this output row, ptrs[6], is equal to ptrs[0]. However at startup, |
233 | | // at the top of the filtered area, we may have ptrs[0] equal to ptrs[1], |
234 | | // so we can't assume we can write into ptrs[0] but we need to keep |
235 | | // a separate pointer for the next row to write into. |
236 | 0 | memcpy(ptrs[6], tmp, sizeof(uint16_t) * REST_UNIT_STRIDE); |
237 | | |
238 | | // Rotate the window of pointers. Shift the 6 pointers downwards one step. |
239 | 0 | for (int i = 0; i < 6; i++) |
240 | 0 | ptrs[i] = ptrs[i + 1]; |
241 | | // The topmost pointer, ptrs[6], which isn't used as input, is set to |
242 | | // ptrs[0], which will be used as output for the next _hv call. |
243 | | // At the start of the filtering, the caller may set ptrs[6] to the |
244 | | // right next buffer to fill in, instead. |
245 | 0 | ptrs[6] = ptrs[0]; |
246 | 0 | } |
247 | | |
248 | | // FIXME Could split into luma and chroma specific functions, |
249 | | // (since first and last taps are always 0 for chroma) |
250 | | static void wiener_c(pixel *p, const ptrdiff_t stride, |
251 | | const pixel (*left)[4], |
252 | | const pixel *lpf, const int w, int h, |
253 | | const LooprestorationParams *const params, |
254 | | const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) |
255 | 0 | { |
256 | | // Values stored between horizontal and vertical filtering don't |
257 | | // fit in a uint8_t. |
258 | 0 | uint16_t hor[6 * REST_UNIT_STRIDE]; |
259 | 0 | uint16_t *ptrs[7], *rows[6]; |
260 | 0 | for (int i = 0; i < 6; i++) |
261 | 0 | rows[i] = &hor[i * REST_UNIT_STRIDE]; |
262 | 0 | const int16_t (*const filter)[8] = params->filter; |
263 | 0 | const int16_t *fh = params->filter[0]; |
264 | 0 | const int16_t *fv = params->filter[1]; |
265 | 0 | const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); |
266 | |
|
267 | 0 | const pixel *src = p; |
268 | 0 | if (edges & LR_HAVE_TOP) { |
269 | 0 | ptrs[0] = rows[0]; |
270 | 0 | ptrs[1] = rows[0]; |
271 | 0 | ptrs[2] = rows[1]; |
272 | 0 | ptrs[3] = rows[2]; |
273 | 0 | ptrs[4] = rows[2]; |
274 | 0 | ptrs[5] = rows[2]; |
275 | |
|
276 | 0 | wiener_filter_h(rows[0], NULL, lpf, fh, w, edges HIGHBD_TAIL_SUFFIX); |
277 | 0 | lpf += PXSTRIDE(stride); |
278 | 0 | wiener_filter_h(rows[1], NULL, lpf, fh, w, edges HIGHBD_TAIL_SUFFIX); |
279 | |
|
280 | 0 | wiener_filter_h(rows[2], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX); |
281 | 0 | left++; |
282 | 0 | src += PXSTRIDE(stride); |
283 | |
|
284 | 0 | if (--h <= 0) |
285 | 0 | goto v1; |
286 | | |
287 | 0 | ptrs[4] = ptrs[5] = rows[3]; |
288 | 0 | wiener_filter_h(rows[3], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX); |
289 | 0 | left++; |
290 | 0 | src += PXSTRIDE(stride); |
291 | |
|
292 | 0 | if (--h <= 0) |
293 | 0 | goto v2; |
294 | | |
295 | 0 | ptrs[5] = rows[4]; |
296 | 0 | wiener_filter_h(rows[4], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX); |
297 | 0 | left++; |
298 | 0 | src += PXSTRIDE(stride); |
299 | |
|
300 | 0 | if (--h <= 0) |
301 | 0 | goto v3; |
302 | 0 | } else { |
303 | 0 | ptrs[0] = rows[0]; |
304 | 0 | ptrs[1] = rows[0]; |
305 | 0 | ptrs[2] = rows[0]; |
306 | 0 | ptrs[3] = rows[0]; |
307 | 0 | ptrs[4] = rows[0]; |
308 | 0 | ptrs[5] = rows[0]; |
309 | |
|
310 | 0 | wiener_filter_h(rows[0], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX); |
311 | 0 | left++; |
312 | 0 | src += PXSTRIDE(stride); |
313 | |
|
314 | 0 | if (--h <= 0) |
315 | 0 | goto v1; |
316 | | |
317 | 0 | ptrs[4] = ptrs[5] = rows[1]; |
318 | 0 | wiener_filter_h(rows[1], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX); |
319 | 0 | left++; |
320 | 0 | src += PXSTRIDE(stride); |
321 | |
|
322 | 0 | if (--h <= 0) |
323 | 0 | goto v2; |
324 | | |
325 | 0 | ptrs[5] = rows[2]; |
326 | 0 | wiener_filter_h(rows[2], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX); |
327 | 0 | left++; |
328 | 0 | src += PXSTRIDE(stride); |
329 | |
|
330 | 0 | if (--h <= 0) |
331 | 0 | goto v3; |
332 | | |
333 | 0 | ptrs[6] = rows[3]; |
334 | 0 | wiener_filter_hv(p, ptrs, left, src, filter, w, edges |
335 | 0 | HIGHBD_TAIL_SUFFIX); |
336 | 0 | left++; |
337 | 0 | src += PXSTRIDE(stride); |
338 | 0 | p += PXSTRIDE(stride); |
339 | |
|
340 | 0 | if (--h <= 0) |
341 | 0 | goto v3; |
342 | | |
343 | 0 | ptrs[6] = rows[4]; |
344 | 0 | wiener_filter_hv(p, ptrs, left, src, filter, w, edges |
345 | 0 | HIGHBD_TAIL_SUFFIX); |
346 | 0 | left++; |
347 | 0 | src += PXSTRIDE(stride); |
348 | 0 | p += PXSTRIDE(stride); |
349 | |
|
350 | 0 | if (--h <= 0) |
351 | 0 | goto v3; |
352 | 0 | } |
353 | | |
354 | 0 | ptrs[6] = ptrs[5] + REST_UNIT_STRIDE; |
355 | 0 | do { |
356 | 0 | wiener_filter_hv(p, ptrs, left, src, filter, w, edges |
357 | 0 | HIGHBD_TAIL_SUFFIX); |
358 | 0 | left++; |
359 | 0 | src += PXSTRIDE(stride); |
360 | 0 | p += PXSTRIDE(stride); |
361 | 0 | } while (--h > 0); |
362 | |
|
363 | 0 | if (!(edges & LR_HAVE_BOTTOM)) |
364 | 0 | goto v3; |
365 | | |
366 | 0 | wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges |
367 | 0 | HIGHBD_TAIL_SUFFIX); |
368 | 0 | lpf_bottom += PXSTRIDE(stride); |
369 | 0 | p += PXSTRIDE(stride); |
370 | |
|
371 | 0 | wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges |
372 | 0 | HIGHBD_TAIL_SUFFIX); |
373 | 0 | p += PXSTRIDE(stride); |
374 | 0 | v1: |
375 | 0 | wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX); |
376 | |
|
377 | 0 | return; |
378 | | |
379 | 0 | v3: |
380 | 0 | wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX); |
381 | 0 | p += PXSTRIDE(stride); |
382 | 0 | v2: |
383 | 0 | wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX); |
384 | 0 | p += PXSTRIDE(stride); |
385 | 0 | goto v1; |
386 | 0 | } |
387 | | |
388 | | // SGR |
389 | | static NOINLINE void rotate(int32_t **sumsq_ptrs, coef **sum_ptrs, int n) |
390 | 0 | { |
391 | 0 | int32_t *tmp32 = sumsq_ptrs[0]; |
392 | 0 | coef *tmpc = sum_ptrs[0]; |
393 | 0 | for (int i = 0; i < n - 1; i++) { |
394 | 0 | sumsq_ptrs[i] = sumsq_ptrs[i + 1]; |
395 | 0 | sum_ptrs[i] = sum_ptrs[i + 1]; |
396 | 0 | } |
397 | 0 | sumsq_ptrs[n - 1] = tmp32; |
398 | 0 | sum_ptrs[n - 1] = tmpc; |
399 | 0 | } |
400 | | |
401 | | static NOINLINE void rotate5_x2(int32_t **sumsq_ptrs, coef **sum_ptrs) |
402 | 0 | { |
403 | 0 | int32_t *tmp32[2]; |
404 | 0 | coef *tmpc[2]; |
405 | 0 | for (int i = 0; i < 2; i++) { |
406 | 0 | tmp32[i] = sumsq_ptrs[i]; |
407 | 0 | tmpc[i] = sum_ptrs[i]; |
408 | 0 | } |
409 | 0 | for (int i = 0; i < 3; i++) { |
410 | 0 | sumsq_ptrs[i] = sumsq_ptrs[i + 2]; |
411 | 0 | sum_ptrs[i] = sum_ptrs[i + 2]; |
412 | 0 | } |
413 | 0 | for (int i = 0; i < 2; i++) { |
414 | 0 | sumsq_ptrs[3 + i] = tmp32[i]; |
415 | 0 | sum_ptrs[3 + i] = tmpc[i]; |
416 | 0 | } |
417 | 0 | } |
418 | | |
419 | | static NOINLINE void sgr_box3_row_h(int32_t *sumsq, coef *sum, |
420 | | const pixel (*left)[4], |
421 | | const pixel *src, const int w, |
422 | | const enum LrEdgeFlags edges) |
423 | 0 | { |
424 | 0 | sumsq++; |
425 | 0 | sum++; |
426 | 0 | int a = edges & LR_HAVE_LEFT ? (left ? left[0][2] : src[-2]) : src[0]; |
427 | 0 | int b = edges & LR_HAVE_LEFT ? (left ? left[0][3] : src[-1]) : src[0]; |
428 | 0 | for (int x = -1; x < w + 1; x++) { |
429 | 0 | int c = (x + 1 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 1] : src[w - 1]; |
430 | 0 | sum[x] = a + b + c; |
431 | 0 | sumsq[x] = a * a + b * b + c * c; |
432 | 0 | a = b; |
433 | 0 | b = c; |
434 | 0 | } |
435 | 0 | } |
436 | | |
437 | | static NOINLINE void sgr_box5_row_h(int32_t *sumsq, coef *sum, |
438 | | const pixel (*left)[4], |
439 | | const pixel *src, const int w, |
440 | | const enum LrEdgeFlags edges) |
441 | 0 | { |
442 | 0 | sumsq++; |
443 | 0 | sum++; |
444 | 0 | int a = edges & LR_HAVE_LEFT ? (left ? left[0][1] : src[-3]) : src[0]; |
445 | 0 | int b = edges & LR_HAVE_LEFT ? (left ? left[0][2] : src[-2]) : src[0]; |
446 | 0 | int c = edges & LR_HAVE_LEFT ? (left ? left[0][3] : src[-1]) : src[0]; |
447 | 0 | int d = src[0]; |
448 | 0 | for (int x = -1; x < w + 1; x++) { |
449 | 0 | int e = (x + 2 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 2] : src[w - 1]; |
450 | 0 | sum[x] = a + b + c + d + e; |
451 | 0 | sumsq[x] = a * a + b * b + c * c + d * d + e * e; |
452 | 0 | a = b; |
453 | 0 | b = c; |
454 | 0 | c = d; |
455 | 0 | d = e; |
456 | 0 | } |
457 | 0 | } |
458 | | |
459 | | static void sgr_box35_row_h(int32_t *sumsq3, coef *sum3, |
460 | | int32_t *sumsq5, coef *sum5, |
461 | | const pixel (*left)[4], |
462 | | const pixel *src, const int w, |
463 | | const enum LrEdgeFlags edges) |
464 | 0 | { |
465 | 0 | sgr_box3_row_h(sumsq3, sum3, left, src, w, edges); |
466 | 0 | sgr_box5_row_h(sumsq5, sum5, left, src, w, edges); |
467 | 0 | } |
468 | | |
469 | | static NOINLINE void sgr_box3_row_v(int32_t **sumsq, coef **sum, |
470 | | int32_t *sumsq_out, coef *sum_out, |
471 | | const int w) |
472 | 0 | { |
473 | 0 | for (int x = 0; x < w + 2; x++) { |
474 | 0 | int sq_a = sumsq[0][x]; |
475 | 0 | int sq_b = sumsq[1][x]; |
476 | 0 | int sq_c = sumsq[2][x]; |
477 | 0 | int s_a = sum[0][x]; |
478 | 0 | int s_b = sum[1][x]; |
479 | 0 | int s_c = sum[2][x]; |
480 | 0 | sumsq_out[x] = sq_a + sq_b + sq_c; |
481 | 0 | sum_out[x] = s_a + s_b + s_c; |
482 | 0 | } |
483 | 0 | } |
484 | | |
485 | | static NOINLINE void sgr_box5_row_v(int32_t **sumsq, coef **sum, |
486 | | int32_t *sumsq_out, coef *sum_out, |
487 | | const int w) |
488 | 0 | { |
489 | 0 | for (int x = 0; x < w + 2; x++) { |
490 | 0 | int sq_a = sumsq[0][x]; |
491 | 0 | int sq_b = sumsq[1][x]; |
492 | 0 | int sq_c = sumsq[2][x]; |
493 | 0 | int sq_d = sumsq[3][x]; |
494 | 0 | int sq_e = sumsq[4][x]; |
495 | 0 | int s_a = sum[0][x]; |
496 | 0 | int s_b = sum[1][x]; |
497 | 0 | int s_c = sum[2][x]; |
498 | 0 | int s_d = sum[3][x]; |
499 | 0 | int s_e = sum[4][x]; |
500 | 0 | sumsq_out[x] = sq_a + sq_b + sq_c + sq_d + sq_e; |
501 | 0 | sum_out[x] = s_a + s_b + s_c + s_d + s_e; |
502 | 0 | } |
503 | 0 | } |
504 | | |
505 | | static NOINLINE void sgr_calc_row_ab(int32_t *AA, coef *BB, int w, int s, |
506 | | int bitdepth_max, int n, int sgr_one_by_x) |
507 | 0 | { |
508 | 0 | const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; |
509 | 0 | for (int i = 0; i < w + 2; i++) { |
510 | 0 | const int a = |
511 | 0 | (AA[i] + ((1 << (2 * bitdepth_min_8)) >> 1)) >> (2 * bitdepth_min_8); |
512 | 0 | const int b = |
513 | 0 | (BB[i] + ((1 << bitdepth_min_8) >> 1)) >> bitdepth_min_8; |
514 | |
|
515 | 0 | const unsigned p = imax(a * n - b * b, 0); |
516 | 0 | const unsigned z = (p * s + (1 << 19)) >> 20; |
517 | 0 | const unsigned x = dav1d_sgr_x_by_x[umin(z, 255)]; |
518 | | |
519 | | // This is where we invert A and B, so that B is of size coef. |
520 | 0 | AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12; |
521 | 0 | BB[i] = x; |
522 | 0 | } |
523 | 0 | } |
524 | | |
525 | | static void sgr_box3_vert(int32_t **sumsq, coef **sum, |
526 | | int32_t *sumsq_out, coef *sum_out, |
527 | | const int w, const int s, const int bitdepth_max) |
528 | 0 | { |
529 | 0 | sgr_box3_row_v(sumsq, sum, sumsq_out, sum_out, w); |
530 | 0 | sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, 9, 455); |
531 | 0 | rotate(sumsq, sum, 3); |
532 | 0 | } |
533 | | |
534 | | static void sgr_box5_vert(int32_t **sumsq, coef **sum, |
535 | | int32_t *sumsq_out, coef *sum_out, |
536 | | const int w, const int s, const int bitdepth_max) |
537 | 0 | { |
538 | 0 | sgr_box5_row_v(sumsq, sum, sumsq_out, sum_out, w); |
539 | 0 | sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, 25, 164); |
540 | 0 | rotate5_x2(sumsq, sum); |
541 | 0 | } |
542 | | |
543 | | static void sgr_box3_hv(int32_t **sumsq, coef **sum, |
544 | | int32_t *AA, coef *BB, |
545 | | const pixel (*left)[4], |
546 | | const pixel *src, const int w, |
547 | | const int s, |
548 | | const enum LrEdgeFlags edges, |
549 | | const int bitdepth_max) |
550 | 0 | { |
551 | 0 | sgr_box3_row_h(sumsq[2], sum[2], left, src, w, edges); |
552 | 0 | sgr_box3_vert(sumsq, sum, AA, BB, w, s, bitdepth_max); |
553 | 0 | } |
554 | | |
555 | | static NOINLINE void sgr_finish_filter_row1(coef *tmp, |
556 | | const pixel *src, |
557 | | int32_t **A_ptrs, coef **B_ptrs, |
558 | | const int w) |
559 | 0 | { |
560 | 0 | #define EIGHT_NEIGHBORS(P, i)\ |
561 | 0 | ((P[1][i] + P[1][i - 1] + P[1][i + 1] + P[0][i] + P[2][i]) * 4 + \ |
562 | 0 | (P[0][i - 1] + P[2][i - 1] + \ |
563 | 0 | P[0][i + 1] + P[2][i + 1]) * 3) |
564 | 0 | for (int i = 0; i < w; i++) { |
565 | 0 | const int a = EIGHT_NEIGHBORS(B_ptrs, i + 1); |
566 | 0 | const int b = EIGHT_NEIGHBORS(A_ptrs, i + 1); |
567 | 0 | tmp[i] = (b - a * src[i] + (1 << 8)) >> 9; |
568 | 0 | } |
569 | 0 | #undef EIGHT_NEIGHBORS |
570 | 0 | } |
571 | | |
572 | 0 | #define FILTER_OUT_STRIDE (384) |
573 | | |
574 | | static NOINLINE void sgr_finish_filter2(coef *tmp, |
575 | | const pixel *src, |
576 | | const ptrdiff_t src_stride, |
577 | | int32_t **A_ptrs, coef **B_ptrs, |
578 | | const int w, const int h) |
579 | 0 | { |
580 | 0 | #define SIX_NEIGHBORS(P, i)\ |
581 | 0 | ((P[0][i] + P[1][i]) * 6 + \ |
582 | 0 | (P[0][i - 1] + P[1][i - 1] + \ |
583 | 0 | P[0][i + 1] + P[1][i + 1]) * 5) |
584 | 0 | for (int i = 0; i < w; i++) { |
585 | 0 | const int a = SIX_NEIGHBORS(B_ptrs, i + 1); |
586 | 0 | const int b = SIX_NEIGHBORS(A_ptrs, i + 1); |
587 | 0 | tmp[i] = (b - a * src[i] + (1 << 8)) >> 9; |
588 | 0 | } |
589 | 0 | if (h <= 1) |
590 | 0 | return; |
591 | 0 | tmp += FILTER_OUT_STRIDE; |
592 | 0 | src += PXSTRIDE(src_stride); |
593 | 0 | const int32_t *A = &A_ptrs[1][1]; |
594 | 0 | const coef *B = &B_ptrs[1][1]; |
595 | 0 | for (int i = 0; i < w; i++) { |
596 | 0 | const int a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5; |
597 | 0 | const int b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5; |
598 | 0 | tmp[i] = (b - a * src[i] + (1 << 7)) >> 8; |
599 | 0 | } |
600 | 0 | #undef SIX_NEIGHBORS |
601 | 0 | } |
602 | | |
603 | | static NOINLINE void sgr_weighted_row1(pixel *dst, const coef *t1, |
604 | | const int w, const int w1 HIGHBD_DECL_SUFFIX) |
605 | 0 | { |
606 | 0 | for (int i = 0; i < w; i++) { |
607 | 0 | const int v = w1 * t1[i]; |
608 | 0 | dst[i] = iclip_pixel(dst[i] + ((v + (1 << 10)) >> 11)); |
609 | 0 | } |
610 | 0 | } |
611 | | |
612 | | static NOINLINE void sgr_weighted2(pixel *dst, const ptrdiff_t dst_stride, |
613 | | const coef *t1, const coef *t2, |
614 | | const int w, const int h, |
615 | | const int w0, const int w1 HIGHBD_DECL_SUFFIX) |
616 | 0 | { |
617 | 0 | for (int j = 0; j < h; j++) { |
618 | 0 | for (int i = 0; i < w; i++) { |
619 | 0 | const int v = w0 * t1[i] + w1 * t2[i]; |
620 | 0 | dst[i] = iclip_pixel(dst[i] + ((v + (1 << 10)) >> 11)); |
621 | 0 | } |
622 | 0 | dst += PXSTRIDE(dst_stride); |
623 | 0 | t1 += FILTER_OUT_STRIDE; |
624 | 0 | t2 += FILTER_OUT_STRIDE; |
625 | 0 | } |
626 | 0 | } |
627 | | |
628 | | static NOINLINE void sgr_finish1(pixel **dst, const ptrdiff_t stride, |
629 | | int32_t **A_ptrs, coef **B_ptrs, const int w, |
630 | | const int w1 HIGHBD_DECL_SUFFIX) |
631 | 0 | { |
632 | | // Only one single row, no stride needed |
633 | 0 | ALIGN_STK_16(coef, tmp, 384,); |
634 | |
|
635 | 0 | sgr_finish_filter_row1(tmp, *dst, A_ptrs, B_ptrs, w); |
636 | 0 | sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX); |
637 | 0 | *dst += PXSTRIDE(stride); |
638 | 0 | rotate(A_ptrs, B_ptrs, 3); |
639 | 0 | } |
640 | | |
641 | | static NOINLINE void sgr_finish2(pixel **dst, const ptrdiff_t stride, |
642 | | int32_t **A_ptrs, coef **B_ptrs, |
643 | | const int w, const int h, const int w1 |
644 | | HIGHBD_DECL_SUFFIX) |
645 | 0 | { |
646 | 0 | ALIGN_STK_16(coef, tmp, 2*FILTER_OUT_STRIDE,); |
647 | |
|
648 | 0 | sgr_finish_filter2(tmp, *dst, stride, A_ptrs, B_ptrs, w, h); |
649 | 0 | sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX); |
650 | 0 | *dst += PXSTRIDE(stride); |
651 | 0 | if (h > 1) { |
652 | 0 | sgr_weighted_row1(*dst, tmp + FILTER_OUT_STRIDE, w, w1 HIGHBD_TAIL_SUFFIX); |
653 | 0 | *dst += PXSTRIDE(stride); |
654 | 0 | } |
655 | 0 | rotate(A_ptrs, B_ptrs, 2); |
656 | 0 | } |
657 | | |
658 | | static NOINLINE void sgr_finish_mix(pixel **dst, const ptrdiff_t stride, |
659 | | int32_t **A5_ptrs, coef **B5_ptrs, |
660 | | int32_t **A3_ptrs, coef **B3_ptrs, |
661 | | const int w, const int h, |
662 | | const int w0, const int w1 HIGHBD_DECL_SUFFIX) |
663 | 0 | { |
664 | 0 | ALIGN_STK_16(coef, tmp5, 2*FILTER_OUT_STRIDE,); |
665 | 0 | ALIGN_STK_16(coef, tmp3, 2*FILTER_OUT_STRIDE,); |
666 | |
|
667 | 0 | sgr_finish_filter2(tmp5, *dst, stride, A5_ptrs, B5_ptrs, w, h); |
668 | 0 | sgr_finish_filter_row1(tmp3, *dst, A3_ptrs, B3_ptrs, w); |
669 | 0 | if (h > 1) |
670 | 0 | sgr_finish_filter_row1(tmp3 + FILTER_OUT_STRIDE, *dst + PXSTRIDE(stride), |
671 | 0 | &A3_ptrs[1], &B3_ptrs[1], w); |
672 | 0 | sgr_weighted2(*dst, stride, tmp5, tmp3, w, h, w0, w1 HIGHBD_TAIL_SUFFIX); |
673 | 0 | *dst += h*PXSTRIDE(stride); |
674 | 0 | rotate(A5_ptrs, B5_ptrs, 2); |
675 | 0 | rotate(A3_ptrs, B3_ptrs, 4); |
676 | 0 | } |
677 | | |
678 | | |
679 | | static void sgr_3x3_c(pixel *dst, const ptrdiff_t stride, |
680 | | const pixel (*left)[4], const pixel *lpf, |
681 | | const int w, int h, |
682 | | const LooprestorationParams *const params, |
683 | | const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) |
684 | 0 | { |
685 | 0 | #define BUF_STRIDE (384 + 16) |
686 | 0 | ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 3 + 16,); |
687 | 0 | ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 3 + 16,); |
688 | 0 | int32_t *sumsq_ptrs[3], *sumsq_rows[3]; |
689 | 0 | coef *sum_ptrs[3], *sum_rows[3]; |
690 | 0 | for (int i = 0; i < 3; i++) { |
691 | 0 | sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE]; |
692 | 0 | sum_rows[i] = &sum_buf[i * BUF_STRIDE]; |
693 | 0 | } |
694 | |
|
695 | 0 | ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 3 + 16,); |
696 | 0 | ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 3 + 16,); |
697 | 0 | int32_t *A_ptrs[3]; |
698 | 0 | coef *B_ptrs[3]; |
699 | 0 | for (int i = 0; i < 3; i++) { |
700 | 0 | A_ptrs[i] = &A_buf[i * BUF_STRIDE]; |
701 | 0 | B_ptrs[i] = &B_buf[i * BUF_STRIDE]; |
702 | 0 | } |
703 | 0 | const pixel *src = dst; |
704 | 0 | const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); |
705 | |
|
706 | 0 | if (edges & LR_HAVE_TOP) { |
707 | 0 | sumsq_ptrs[0] = sumsq_rows[0]; |
708 | 0 | sumsq_ptrs[1] = sumsq_rows[1]; |
709 | 0 | sumsq_ptrs[2] = sumsq_rows[2]; |
710 | 0 | sum_ptrs[0] = sum_rows[0]; |
711 | 0 | sum_ptrs[1] = sum_rows[1]; |
712 | 0 | sum_ptrs[2] = sum_rows[2]; |
713 | |
|
714 | 0 | sgr_box3_row_h(sumsq_rows[0], sum_rows[0], NULL, lpf, w, edges); |
715 | 0 | lpf += PXSTRIDE(stride); |
716 | 0 | sgr_box3_row_h(sumsq_rows[1], sum_rows[1], NULL, lpf, w, edges); |
717 | |
|
718 | 0 | sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
719 | 0 | left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); |
720 | 0 | left++; |
721 | 0 | src += PXSTRIDE(stride); |
722 | 0 | rotate(A_ptrs, B_ptrs, 3); |
723 | |
|
724 | 0 | if (--h <= 0) |
725 | 0 | goto vert_1; |
726 | | |
727 | 0 | sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
728 | 0 | left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); |
729 | 0 | left++; |
730 | 0 | src += PXSTRIDE(stride); |
731 | 0 | rotate(A_ptrs, B_ptrs, 3); |
732 | |
|
733 | 0 | if (--h <= 0) |
734 | 0 | goto vert_2; |
735 | 0 | } else { |
736 | 0 | sumsq_ptrs[0] = sumsq_rows[0]; |
737 | 0 | sumsq_ptrs[1] = sumsq_rows[0]; |
738 | 0 | sumsq_ptrs[2] = sumsq_rows[0]; |
739 | 0 | sum_ptrs[0] = sum_rows[0]; |
740 | 0 | sum_ptrs[1] = sum_rows[0]; |
741 | 0 | sum_ptrs[2] = sum_rows[0]; |
742 | |
|
743 | 0 | sgr_box3_row_h(sumsq_rows[0], sum_rows[0], left, src, w, edges); |
744 | 0 | left++; |
745 | 0 | src += PXSTRIDE(stride); |
746 | |
|
747 | 0 | sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
748 | 0 | w, params->sgr.s1, BITDEPTH_MAX); |
749 | 0 | rotate(A_ptrs, B_ptrs, 3); |
750 | |
|
751 | 0 | if (--h <= 0) |
752 | 0 | goto vert_1; |
753 | | |
754 | 0 | sumsq_ptrs[2] = sumsq_rows[1]; |
755 | 0 | sum_ptrs[2] = sum_rows[1]; |
756 | |
|
757 | 0 | sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
758 | 0 | left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); |
759 | 0 | left++; |
760 | 0 | src += PXSTRIDE(stride); |
761 | 0 | rotate(A_ptrs, B_ptrs, 3); |
762 | |
|
763 | 0 | if (--h <= 0) |
764 | 0 | goto vert_2; |
765 | | |
766 | 0 | sumsq_ptrs[2] = sumsq_rows[2]; |
767 | 0 | sum_ptrs[2] = sum_rows[2]; |
768 | 0 | } |
769 | | |
770 | 0 | do { |
771 | 0 | sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
772 | 0 | left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); |
773 | 0 | left++; |
774 | 0 | src += PXSTRIDE(stride); |
775 | |
|
776 | 0 | sgr_finish1(&dst, stride, A_ptrs, B_ptrs, |
777 | 0 | w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); |
778 | 0 | } while (--h > 0); |
779 | |
|
780 | 0 | if (!(edges & LR_HAVE_BOTTOM)) |
781 | 0 | goto vert_2; |
782 | | |
783 | 0 | sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
784 | 0 | NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX); |
785 | 0 | lpf_bottom += PXSTRIDE(stride); |
786 | |
|
787 | 0 | sgr_finish1(&dst, stride, A_ptrs, B_ptrs, |
788 | 0 | w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); |
789 | |
|
790 | 0 | sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
791 | 0 | NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX); |
792 | |
|
793 | 0 | sgr_finish1(&dst, stride, A_ptrs, B_ptrs, |
794 | 0 | w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); |
795 | 0 | return; |
796 | | |
797 | 0 | vert_2: |
798 | 0 | sumsq_ptrs[2] = sumsq_ptrs[1]; |
799 | 0 | sum_ptrs[2] = sum_ptrs[1]; |
800 | 0 | sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
801 | 0 | w, params->sgr.s1, BITDEPTH_MAX); |
802 | |
|
803 | 0 | sgr_finish1(&dst, stride, A_ptrs, B_ptrs, |
804 | 0 | w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); |
805 | |
|
806 | 0 | output_1: |
807 | 0 | sumsq_ptrs[2] = sumsq_ptrs[1]; |
808 | 0 | sum_ptrs[2] = sum_ptrs[1]; |
809 | 0 | sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
810 | 0 | w, params->sgr.s1, BITDEPTH_MAX); |
811 | |
|
812 | 0 | sgr_finish1(&dst, stride, A_ptrs, B_ptrs, |
813 | 0 | w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); |
814 | 0 | return; |
815 | | |
816 | 0 | vert_1: |
817 | 0 | sumsq_ptrs[2] = sumsq_ptrs[1]; |
818 | 0 | sum_ptrs[2] = sum_ptrs[1]; |
819 | 0 | sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
820 | 0 | w, params->sgr.s1, BITDEPTH_MAX); |
821 | 0 | rotate(A_ptrs, B_ptrs, 3); |
822 | 0 | goto output_1; |
823 | 0 | } |
824 | | |
// Self-guided restoration filter, 5x5-box variant (C implementation).
// Installed as c->sgr[0] by dav1d_loop_restoration_dsp_init.
//
// dst     rows to filter; also used as the input (src starts at dst) and
//         passed by address to sgr_finish2 (presumably the output writer,
//         helper not visible in this chunk).
// stride  row pitch; every row step uses PXSTRIDE(stride).
// left    per-row left-edge pixels; advanced by one entry each time src
//         advances by one row.
// lpf     edge rows: lpf and the row after it are read when LR_HAVE_TOP is
//         set; the rows at lpf + 6 rows and lpf + 7 rows are read when
//         LR_HAVE_BOTTOM is set.
// w, h    width and row count; h is decremented as source rows are consumed.
// params  params->sgr.s0 is passed to sgr_box5_vert, params->sgr.w0 to
//         sgr_finish2.
// edges   LR_HAVE_TOP / LR_HAVE_BOTTOM are tested here; the whole mask is
//         forwarded to sgr_box5_row_h.
//
// Overall shape: horizontal sums are produced one row at a time into a ring
// of five row buffers, a vertical pass runs once per two input rows into one
// of two A/B rows, and sgr_finish2 is called with a row count of 2 (or 1 for
// the final odd row). Missing rows at the top/bottom are emulated by aliasing
// several entries of sumsq_ptrs/sum_ptrs to the same row buffer.
//
// NOTE(review): sgr_box5_row_h, sgr_box5_vert, sgr_finish2 and rotate are
// defined outside this chunk. The existing comments below state that the
// sum pointers "are rotated by 2" after a vertical pass, so sgr_box5_vert
// presumably rotates sumsq_ptrs/sum_ptrs itself - confirm against its
// definition before reordering anything here.
825 | | static void sgr_5x5_c(pixel *dst, const ptrdiff_t stride, |
826 | | const pixel (*left)[4], const pixel *lpf, |
827 | | const int w, int h, |
828 | | const LooprestorationParams *const params, |
829 | | const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) |
830 | 0 | { |
// Five rows of horizontal sums (squared sums and plain sums), each
// BUF_STRIDE elements apart, with 16 extra elements of slack.
831 | 0 | ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 5 + 16,); |
832 | 0 | ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 5 + 16,); |
// *_rows are the fixed row buffers; *_ptrs is the (re-assignable) window of
// five rows handed to the vertical pass.
833 | 0 | int32_t *sumsq_ptrs[5], *sumsq_rows[5]; |
834 | 0 | coef *sum_ptrs[5], *sum_rows[5]; |
835 | 0 | for (int i = 0; i < 5; i++) { |
836 | 0 | sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE]; |
837 | 0 | sum_rows[i] = &sum_buf[i * BUF_STRIDE]; |
838 | 0 | } |
839 | |

// Two rows of vertical-pass output (A/B), swapped via rotate(..., 2).
840 | 0 | ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 2 + 16,); |
841 | 0 | ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 2 + 16,); |
842 | 0 | int32_t *A_ptrs[2]; |
843 | 0 | coef *B_ptrs[2]; |
844 | 0 | for (int i = 0; i < 2; i++) { |
845 | 0 | A_ptrs[i] = &A_buf[i * BUF_STRIDE]; |
846 | 0 | B_ptrs[i] = &B_buf[i * BUF_STRIDE]; |
847 | 0 | } |
848 | 0 | const pixel *src = dst; |
// Captured before lpf is advanced below, so it is relative to the original
// lpf argument.
849 | 0 | const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); |
850 | |

// Priming phase. With a top edge, the window starts with two lpf rows
// (the first one aliased into slots 0 and 1) followed by source rows.
851 | 0 | if (edges & LR_HAVE_TOP) { |
852 | 0 | sumsq_ptrs[0] = sumsq_rows[0]; |
853 | 0 | sumsq_ptrs[1] = sumsq_rows[0]; |
854 | 0 | sumsq_ptrs[2] = sumsq_rows[1]; |
855 | 0 | sumsq_ptrs[3] = sumsq_rows[2]; |
856 | 0 | sumsq_ptrs[4] = sumsq_rows[3]; |
857 | 0 | sum_ptrs[0] = sum_rows[0]; |
858 | 0 | sum_ptrs[1] = sum_rows[0]; |
859 | 0 | sum_ptrs[2] = sum_rows[1]; |
860 | 0 | sum_ptrs[3] = sum_rows[2]; |
861 | 0 | sum_ptrs[4] = sum_rows[3]; |
862 | |

// The lpf rows are passed with a NULL left pointer; source rows pass left.
863 | 0 | sgr_box5_row_h(sumsq_rows[0], sum_rows[0], NULL, lpf, w, edges); |
864 | 0 | lpf += PXSTRIDE(stride); |
865 | 0 | sgr_box5_row_h(sumsq_rows[1], sum_rows[1], NULL, lpf, w, edges); |
866 | |

867 | 0 | sgr_box5_row_h(sumsq_rows[2], sum_rows[2], left, src, w, edges); |
868 | 0 | left++; |
869 | 0 | src += PXSTRIDE(stride); |
870 | |

// Only one source row in total: finish via the single-row tail.
871 | 0 | if (--h <= 0) |
872 | 0 | goto vert_1; |
873 | |
874 | 0 | sgr_box5_row_h(sumsq_rows[3], sum_rows[3], left, src, w, edges); |
875 | 0 | left++; |
876 | 0 | src += PXSTRIDE(stride); |
877 | 0 | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
878 | 0 | w, params->sgr.s0, BITDEPTH_MAX); |
879 | 0 | rotate(A_ptrs, B_ptrs, 2); |
880 | |

// Exactly two source rows: no more input, pad downwards and output them.
881 | 0 | if (--h <= 0) |
882 | 0 | goto vert_2; |
883 | |
884 | | // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set |
885 | | // one of them to point at the previously unused rows[4]. |
886 | 0 | sumsq_ptrs[3] = sumsq_rows[4]; |
887 | 0 | sum_ptrs[3] = sum_rows[4]; |
888 | 0 | } else { |
// No top edge: every window slot initially aliases rows[0], which is filled
// from the first source row, so that row stands in for the missing rows.
889 | 0 | sumsq_ptrs[0] = sumsq_rows[0]; |
890 | 0 | sumsq_ptrs[1] = sumsq_rows[0]; |
891 | 0 | sumsq_ptrs[2] = sumsq_rows[0]; |
892 | 0 | sumsq_ptrs[3] = sumsq_rows[0]; |
893 | 0 | sumsq_ptrs[4] = sumsq_rows[0]; |
894 | 0 | sum_ptrs[0] = sum_rows[0]; |
895 | 0 | sum_ptrs[1] = sum_rows[0]; |
896 | 0 | sum_ptrs[2] = sum_rows[0]; |
897 | 0 | sum_ptrs[3] = sum_rows[0]; |
898 | 0 | sum_ptrs[4] = sum_rows[0]; |
899 | |

900 | 0 | sgr_box5_row_h(sumsq_rows[0], sum_rows[0], left, src, w, edges); |
901 | 0 | left++; |
902 | 0 | src += PXSTRIDE(stride); |
903 | |

904 | 0 | if (--h <= 0) |
905 | 0 | goto vert_1; |
906 | |
// Second source row goes into rows[1], exposed through the last window slot.
907 | 0 | sumsq_ptrs[4] = sumsq_rows[1]; |
908 | 0 | sum_ptrs[4] = sum_rows[1]; |
909 | |

910 | 0 | sgr_box5_row_h(sumsq_rows[1], sum_rows[1], left, src, w, edges); |
911 | 0 | left++; |
912 | 0 | src += PXSTRIDE(stride); |
913 | |

914 | 0 | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
915 | 0 | w, params->sgr.s0, BITDEPTH_MAX); |
916 | 0 | rotate(A_ptrs, B_ptrs, 2); |
917 | |

918 | 0 | if (--h <= 0) |
919 | 0 | goto vert_2; |
920 | |
// Point the two trailing window slots at fresh buffers for source rows 3, 4.
921 | 0 | sumsq_ptrs[3] = sumsq_rows[2]; |
922 | 0 | sumsq_ptrs[4] = sumsq_rows[3]; |
923 | 0 | sum_ptrs[3] = sum_rows[2]; |
924 | 0 | sum_ptrs[4] = sum_rows[3]; |
925 | |

926 | 0 | sgr_box5_row_h(sumsq_rows[2], sum_rows[2], left, src, w, edges); |
927 | 0 | left++; |
928 | 0 | src += PXSTRIDE(stride); |
929 | |

930 | 0 | if (--h <= 0) |
931 | 0 | goto odd; |
932 | |
933 | 0 | sgr_box5_row_h(sumsq_rows[3], sum_rows[3], left, src, w, edges); |
934 | 0 | left++; |
935 | 0 | src += PXSTRIDE(stride); |
936 | |

937 | 0 | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
938 | 0 | w, params->sgr.s0, BITDEPTH_MAX); |
939 | 0 | sgr_finish2(&dst, stride, A_ptrs, B_ptrs, |
940 | 0 | w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); |
941 | |

942 | 0 | if (--h <= 0) |
943 | 0 | goto vert_2; |
944 | |
945 | | // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set |
946 | | // one of them to point at the previously unused rows[4]. |
947 | 0 | sumsq_ptrs[3] = sumsq_rows[4]; |
948 | 0 | sum_ptrs[3] = sum_rows[4]; |
949 | 0 | } |
950 | |
// Steady state: consume two source rows per iteration (into window slots 3
// and 4), run one vertical pass, and call sgr_finish2 with a row count of 2.
// If h runs out after the first row of a pair, leave through "odd".
951 | 0 | do { |
952 | 0 | sgr_box5_row_h(sumsq_ptrs[3], sum_ptrs[3], left, src, w, edges); |
953 | 0 | left++; |
954 | 0 | src += PXSTRIDE(stride); |
955 | |

956 | 0 | if (--h <= 0) |
957 | 0 | goto odd; |
958 | |
959 | 0 | sgr_box5_row_h(sumsq_ptrs[4], sum_ptrs[4], left, src, w, edges); |
960 | 0 | left++; |
961 | 0 | src += PXSTRIDE(stride); |
962 | |

963 | 0 | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
964 | 0 | w, params->sgr.s0, BITDEPTH_MAX); |
965 | 0 | sgr_finish2(&dst, stride, A_ptrs, B_ptrs, |
966 | 0 | w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); |
967 | 0 | } while (--h > 0); |
968 | |
// All source rows consumed on a pair boundary. Without a bottom edge, pad by
// duplicating the last row; otherwise read the two rows at lpf_bottom.
969 | 0 | if (!(edges & LR_HAVE_BOTTOM)) |
970 | 0 | goto vert_2; |
971 | |
972 | 0 | sgr_box5_row_h(sumsq_ptrs[3], sum_ptrs[3], NULL, lpf_bottom, w, edges); |
973 | 0 | lpf_bottom += PXSTRIDE(stride); |
974 | 0 | sgr_box5_row_h(sumsq_ptrs[4], sum_ptrs[4], NULL, lpf_bottom, w, edges); |
975 | |

// Final vertical pass and final sgr_finish2 call with a row count of 2.
976 | 0 | output_2: |
977 | 0 | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
978 | 0 | w, params->sgr.s0, BITDEPTH_MAX); |
979 | 0 | sgr_finish2(&dst, stride, A_ptrs, B_ptrs, |
980 | 0 | w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); |
981 | 0 | return; |
982 | |
// Entered when no further input rows exist below a completed pair.
983 | 0 | vert_2: |
984 | | // Duplicate the last row twice more |
985 | 0 | sumsq_ptrs[3] = sumsq_ptrs[2]; |
986 | 0 | sumsq_ptrs[4] = sumsq_ptrs[2]; |
987 | 0 | sum_ptrs[3] = sum_ptrs[2]; |
988 | 0 | sum_ptrs[4] = sum_ptrs[2]; |
989 | 0 | goto output_2; |
990 | |
// Entered when h ran out after the first row of a pair: slot 3 holds the
// last real row and slot 4 has not been filled for this pair.
991 | 0 | odd: |
992 | | // Copy the last row as padding once |
993 | 0 | sumsq_ptrs[4] = sumsq_ptrs[3]; |
994 | 0 | sum_ptrs[4] = sum_ptrs[3]; |
995 | |

996 | 0 | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
997 | 0 | w, params->sgr.s0, BITDEPTH_MAX); |
998 | 0 | sgr_finish2(&dst, stride, A_ptrs, B_ptrs, |
999 | 0 | w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); |
1000 | |

// Shared tail for an odd number of rows; "odd" falls through into it and
// "vert_1" jumps to it.
1001 | 0 | output_1: |
1002 | | // Duplicate the last row twice more |
1003 | 0 | sumsq_ptrs[3] = sumsq_ptrs[2]; |
1004 | 0 | sumsq_ptrs[4] = sumsq_ptrs[2]; |
1005 | 0 | sum_ptrs[3] = sum_ptrs[2]; |
1006 | 0 | sum_ptrs[4] = sum_ptrs[2]; |
1007 | |

1008 | 0 | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
1009 | 0 | w, params->sgr.s0, BITDEPTH_MAX); |
1010 | | // Output only one row |
1011 | 0 | sgr_finish2(&dst, stride, A_ptrs, B_ptrs, |
1012 | 0 | w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX); |
1013 | 0 | return; |
1014 | |
// Entered from the priming phase when the block has a single source row; no
// vertical pass has been run yet, so one is done here before output_1.
1015 | 0 | vert_1: |
1016 | | // Copy the last row as padding once |
1017 | 0 | sumsq_ptrs[4] = sumsq_ptrs[3]; |
1018 | 0 | sum_ptrs[4] = sum_ptrs[3]; |
1019 | |

1020 | 0 | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
1021 | 0 | w, params->sgr.s0, BITDEPTH_MAX); |
1022 | 0 | rotate(A_ptrs, B_ptrs, 2); |
1023 | |

1024 | 0 | goto output_1; |
1025 | 0 | } |
1026 | | |
// Self-guided restoration filter combining the 5x5 and 3x3 box passes
// (C implementation). Installed as c->sgr[2] by
// dav1d_loop_restoration_dsp_init.
//
// dst     rows to filter; also used as the input (src starts at dst) and
//         passed by address to sgr_finish_mix (presumably the output writer,
//         helper not visible in this chunk).
// stride  row pitch; every row step uses PXSTRIDE(stride).
// left    per-row left-edge pixels; advanced by one entry each time src
//         advances by one row.
// lpf     edge rows: lpf and the row after it are read when LR_HAVE_TOP is
//         set; the rows at lpf + 6 rows and lpf + 7 rows are read when
//         LR_HAVE_BOTTOM is set.
// w, h    width and row count; h is decremented as source rows are consumed.
// params  sgr.s0 / sgr.s1 are passed to sgr_box5_vert / sgr_box3_vert;
//         sgr.w0 and sgr.w1 are passed to sgr_finish_mix.
// edges   LR_HAVE_TOP / LR_HAVE_BOTTOM are tested here; the whole mask is
//         forwarded to sgr_box35_row_h.
//
// Overall shape: sgr_box35_row_h fills one 3-row-window buffer and one
// 5-row-window buffer per input row. The 3x3 vertical pass runs once per
// input row (into A3/B3, a ring of four rows rotated with rotate(..., 4));
// the 5x5 vertical pass runs once per two input rows (into A5/B5, two rows).
// sgr_finish_mix is called with a row count of 2, or 1 for a final odd row.
//
// NOTE(review): the helpers are defined outside this chunk. The code reuses
// sumsq3_ptrs[2]/sum3_ptrs[2] as the destination for each new row and the
// existing comments say the 5-row pointers "are rotated by 2", so
// sgr_box3_vert / sgr_box5_vert presumably rotate their sum pointer arrays
// themselves - confirm before reordering anything here.
1027 | | static void sgr_mix_c(pixel *dst, const ptrdiff_t stride, |
1028 | | const pixel (*left)[4], const pixel *lpf, |
1029 | | const int w, int h, |
1030 | | const LooprestorationParams *const params, |
1031 | | const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) |
1032 | 0 | { |
// Horizontal sums for the 5x5 pass: five row buffers plus a pointer window.
1033 | 0 | ALIGN_STK_16(int32_t, sumsq5_buf, BUF_STRIDE * 5 + 16,); |
1034 | 0 | ALIGN_STK_16(coef, sum5_buf, BUF_STRIDE * 5 + 16,); |
1035 | 0 | int32_t *sumsq5_ptrs[5], *sumsq5_rows[5]; |
1036 | 0 | coef *sum5_ptrs[5], *sum5_rows[5]; |
1037 | 0 | for (int i = 0; i < 5; i++) { |
1038 | 0 | sumsq5_rows[i] = &sumsq5_buf[i * BUF_STRIDE]; |
1039 | 0 | sum5_rows[i] = &sum5_buf[i * BUF_STRIDE]; |
1040 | 0 | } |
// Horizontal sums for the 3x3 pass: three row buffers plus a pointer window.
1041 | 0 | ALIGN_STK_16(int32_t, sumsq3_buf, BUF_STRIDE * 3 + 16,); |
1042 | 0 | ALIGN_STK_16(coef, sum3_buf, BUF_STRIDE * 3 + 16,); |
1043 | 0 | int32_t *sumsq3_ptrs[3], *sumsq3_rows[3]; |
1044 | 0 | coef *sum3_ptrs[3], *sum3_rows[3]; |
1045 | 0 | for (int i = 0; i < 3; i++) { |
1046 | 0 | sumsq3_rows[i] = &sumsq3_buf[i * BUF_STRIDE]; |
1047 | 0 | sum3_rows[i] = &sum3_buf[i * BUF_STRIDE]; |
1048 | 0 | } |
1049 | |

// Vertical-pass outputs: two A/B rows for the 5x5 pass, four for the 3x3.
1050 | 0 | ALIGN_STK_16(int32_t, A5_buf, BUF_STRIDE * 2 + 16,); |
1051 | 0 | ALIGN_STK_16(coef, B5_buf, BUF_STRIDE * 2 + 16,); |
1052 | 0 | int32_t *A5_ptrs[2]; |
1053 | 0 | coef *B5_ptrs[2]; |
1054 | 0 | for (int i = 0; i < 2; i++) { |
1055 | 0 | A5_ptrs[i] = &A5_buf[i * BUF_STRIDE]; |
1056 | 0 | B5_ptrs[i] = &B5_buf[i * BUF_STRIDE]; |
1057 | 0 | } |
1058 | 0 | ALIGN_STK_16(int32_t, A3_buf, BUF_STRIDE * 4 + 16,); |
1059 | 0 | ALIGN_STK_16(coef, B3_buf, BUF_STRIDE * 4 + 16,); |
1060 | 0 | int32_t *A3_ptrs[4]; |
1061 | 0 | coef *B3_ptrs[4]; |
1062 | 0 | for (int i = 0; i < 4; i++) { |
1063 | 0 | A3_ptrs[i] = &A3_buf[i * BUF_STRIDE]; |
1064 | 0 | B3_ptrs[i] = &B3_buf[i * BUF_STRIDE]; |
1065 | 0 | } |
1066 | 0 | const pixel *src = dst; |
// Captured before lpf is advanced below, so it is relative to the original
// lpf argument.
1067 | 0 | const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); |
1068 | |

// Priming phase. With a top edge, both windows start with the two lpf rows
// (the 5-row window aliases the first one into slots 0 and 1).
1069 | 0 | if (edges & LR_HAVE_TOP) { |
1070 | 0 | sumsq5_ptrs[0] = sumsq5_rows[0]; |
1071 | 0 | sumsq5_ptrs[1] = sumsq5_rows[0]; |
1072 | 0 | sumsq5_ptrs[2] = sumsq5_rows[1]; |
1073 | 0 | sumsq5_ptrs[3] = sumsq5_rows[2]; |
1074 | 0 | sumsq5_ptrs[4] = sumsq5_rows[3]; |
1075 | 0 | sum5_ptrs[0] = sum5_rows[0]; |
1076 | 0 | sum5_ptrs[1] = sum5_rows[0]; |
1077 | 0 | sum5_ptrs[2] = sum5_rows[1]; |
1078 | 0 | sum5_ptrs[3] = sum5_rows[2]; |
1079 | 0 | sum5_ptrs[4] = sum5_rows[3]; |
1080 | |

1081 | 0 | sumsq3_ptrs[0] = sumsq3_rows[0]; |
1082 | 0 | sumsq3_ptrs[1] = sumsq3_rows[1]; |
1083 | 0 | sumsq3_ptrs[2] = sumsq3_rows[2]; |
1084 | 0 | sum3_ptrs[0] = sum3_rows[0]; |
1085 | 0 | sum3_ptrs[1] = sum3_rows[1]; |
1086 | 0 | sum3_ptrs[2] = sum3_rows[2]; |
1087 | |

// The lpf rows are passed with a NULL left pointer; source rows pass left.
1088 | 0 | sgr_box35_row_h(sumsq3_rows[0], sum3_rows[0], |
1089 | 0 | sumsq5_rows[0], sum5_rows[0], |
1090 | 0 | NULL, lpf, w, edges); |
1091 | 0 | lpf += PXSTRIDE(stride); |
1092 | 0 | sgr_box35_row_h(sumsq3_rows[1], sum3_rows[1], |
1093 | 0 | sumsq5_rows[1], sum5_rows[1], |
1094 | 0 | NULL, lpf, w, edges); |
1095 | |

1096 | 0 | sgr_box35_row_h(sumsq3_rows[2], sum3_rows[2], |
1097 | 0 | sumsq5_rows[2], sum5_rows[2], |
1098 | 0 | left, src, w, edges); |
1099 | 0 | left++; |
1100 | 0 | src += PXSTRIDE(stride); |
1101 | |

// The 3x3 vertical pass writes into the last A3/B3 slot, then the ring of
// four is rotated; this pattern repeats after every input row.
1102 | 0 | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1103 | 0 | w, params->sgr.s1, BITDEPTH_MAX); |
1104 | 0 | rotate(A3_ptrs, B3_ptrs, 4); |
1105 | |

// Only one source row in total: finish via the single-row tail.
1106 | 0 | if (--h <= 0) |
1107 | 0 | goto vert_1; |
1108 | |
1109 | 0 | sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], |
1110 | 0 | sumsq5_rows[3], sum5_rows[3], |
1111 | 0 | left, src, w, edges); |
1112 | 0 | left++; |
1113 | 0 | src += PXSTRIDE(stride); |
1114 | 0 | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1115 | 0 | w, params->sgr.s0, BITDEPTH_MAX); |
1116 | 0 | rotate(A5_ptrs, B5_ptrs, 2); |
1117 | 0 | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1118 | 0 | w, params->sgr.s1, BITDEPTH_MAX); |
1119 | 0 | rotate(A3_ptrs, B3_ptrs, 4); |
1120 | |

// Exactly two source rows: no more input, pad downwards and output them.
1121 | 0 | if (--h <= 0) |
1122 | 0 | goto vert_2; |
1123 | |
1124 | | // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set |
1125 | | // one of them to point at the previously unused rows[4]. |
1126 | 0 | sumsq5_ptrs[3] = sumsq5_rows[4]; |
1127 | 0 | sum5_ptrs[3] = sum5_rows[4]; |
1128 | 0 | } else { |
// No top edge: every slot of both windows initially aliases rows[0], which
// is filled from the first source row.
1129 | 0 | sumsq5_ptrs[0] = sumsq5_rows[0]; |
1130 | 0 | sumsq5_ptrs[1] = sumsq5_rows[0]; |
1131 | 0 | sumsq5_ptrs[2] = sumsq5_rows[0]; |
1132 | 0 | sumsq5_ptrs[3] = sumsq5_rows[0]; |
1133 | 0 | sumsq5_ptrs[4] = sumsq5_rows[0]; |
1134 | 0 | sum5_ptrs[0] = sum5_rows[0]; |
1135 | 0 | sum5_ptrs[1] = sum5_rows[0]; |
1136 | 0 | sum5_ptrs[2] = sum5_rows[0]; |
1137 | 0 | sum5_ptrs[3] = sum5_rows[0]; |
1138 | 0 | sum5_ptrs[4] = sum5_rows[0]; |
1139 | |

1140 | 0 | sumsq3_ptrs[0] = sumsq3_rows[0]; |
1141 | 0 | sumsq3_ptrs[1] = sumsq3_rows[0]; |
1142 | 0 | sumsq3_ptrs[2] = sumsq3_rows[0]; |
1143 | 0 | sum3_ptrs[0] = sum3_rows[0]; |
1144 | 0 | sum3_ptrs[1] = sum3_rows[0]; |
1145 | 0 | sum3_ptrs[2] = sum3_rows[0]; |
1146 | |

1147 | 0 | sgr_box35_row_h(sumsq3_rows[0], sum3_rows[0], |
1148 | 0 | sumsq5_rows[0], sum5_rows[0], |
1149 | 0 | left, src, w, edges); |
1150 | 0 | left++; |
1151 | 0 | src += PXSTRIDE(stride); |
1152 | |

1153 | 0 | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1154 | 0 | w, params->sgr.s1, BITDEPTH_MAX); |
1155 | 0 | rotate(A3_ptrs, B3_ptrs, 4); |
1156 | |

1157 | 0 | if (--h <= 0) |
1158 | 0 | goto vert_1; |
1159 | |
// Second source row goes into rows[1] of both buffers, exposed through the
// last slot of each window.
1160 | 0 | sumsq5_ptrs[4] = sumsq5_rows[1]; |
1161 | 0 | sum5_ptrs[4] = sum5_rows[1]; |
1162 | |

1163 | 0 | sumsq3_ptrs[2] = sumsq3_rows[1]; |
1164 | 0 | sum3_ptrs[2] = sum3_rows[1]; |
1165 | |

1166 | 0 | sgr_box35_row_h(sumsq3_rows[1], sum3_rows[1], |
1167 | 0 | sumsq5_rows[1], sum5_rows[1], |
1168 | 0 | left, src, w, edges); |
1169 | 0 | left++; |
1170 | 0 | src += PXSTRIDE(stride); |
1171 | |

1172 | 0 | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1173 | 0 | w, params->sgr.s0, BITDEPTH_MAX); |
1174 | 0 | rotate(A5_ptrs, B5_ptrs, 2); |
1175 | 0 | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1176 | 0 | w, params->sgr.s1, BITDEPTH_MAX); |
1177 | 0 | rotate(A3_ptrs, B3_ptrs, 4); |
1178 | |

1179 | 0 | if (--h <= 0) |
1180 | 0 | goto vert_2; |
1181 | |
// Point the trailing window slots at fresh buffers for the next source rows.
1182 | 0 | sumsq5_ptrs[3] = sumsq5_rows[2]; |
1183 | 0 | sumsq5_ptrs[4] = sumsq5_rows[3]; |
1184 | 0 | sum5_ptrs[3] = sum5_rows[2]; |
1185 | 0 | sum5_ptrs[4] = sum5_rows[3]; |
1186 | |

1187 | 0 | sumsq3_ptrs[2] = sumsq3_rows[2]; |
1188 | 0 | sum3_ptrs[2] = sum3_rows[2]; |
1189 | |

1190 | 0 | sgr_box35_row_h(sumsq3_rows[2], sum3_rows[2], |
1191 | 0 | sumsq5_rows[2], sum5_rows[2], |
1192 | 0 | left, src, w, edges); |
1193 | 0 | left++; |
1194 | 0 | src += PXSTRIDE(stride); |
1195 | |

1196 | 0 | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1197 | 0 | w, params->sgr.s1, BITDEPTH_MAX); |
1198 | 0 | rotate(A3_ptrs, B3_ptrs, 4); |
1199 | |

1200 | 0 | if (--h <= 0) |
1201 | 0 | goto odd; |
1202 | |
1203 | 0 | sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], |
1204 | 0 | sumsq5_rows[3], sum5_rows[3], |
1205 | 0 | left, src, w, edges); |
1206 | 0 | left++; |
1207 | 0 | src += PXSTRIDE(stride); |
1208 | |

1209 | 0 | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1210 | 0 | w, params->sgr.s0, BITDEPTH_MAX); |
1211 | 0 | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1212 | 0 | w, params->sgr.s1, BITDEPTH_MAX); |
1213 | 0 | sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, |
1214 | 0 | w, 2, params->sgr.w0, params->sgr.w1 |
1215 | 0 | HIGHBD_TAIL_SUFFIX); |
1216 | |

1217 | 0 | if (--h <= 0) |
1218 | 0 | goto vert_2; |
1219 | |
1220 | | // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set |
1221 | | // one of them to point at the previously unused rows[4]. |
1222 | 0 | sumsq5_ptrs[3] = sumsq5_rows[4]; |
1223 | 0 | sum5_ptrs[3] = sum5_rows[4]; |
1224 | 0 | } |
1225 | |
// Steady state: two source rows per iteration. Each row gets a 3x3 vertical
// pass; the 5x5 vertical pass and sgr_finish_mix (row count 2) run after the
// second row. If h runs out after the first row of a pair, leave via "odd".
1226 | 0 | do { |
1227 | 0 | sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], |
1228 | 0 | sumsq5_ptrs[3], sum5_ptrs[3], |
1229 | 0 | left, src, w, edges); |
1230 | 0 | left++; |
1231 | 0 | src += PXSTRIDE(stride); |
1232 | |

1233 | 0 | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1234 | 0 | w, params->sgr.s1, BITDEPTH_MAX); |
1235 | 0 | rotate(A3_ptrs, B3_ptrs, 4); |
1236 | |

1237 | 0 | if (--h <= 0) |
1238 | 0 | goto odd; |
1239 | |
1240 | 0 | sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], |
1241 | 0 | sumsq5_ptrs[4], sum5_ptrs[4], |
1242 | 0 | left, src, w, edges); |
1243 | 0 | left++; |
1244 | 0 | src += PXSTRIDE(stride); |
1245 | |

1246 | 0 | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1247 | 0 | w, params->sgr.s0, BITDEPTH_MAX); |
1248 | 0 | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1249 | 0 | w, params->sgr.s1, BITDEPTH_MAX); |
1250 | 0 | sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, |
1251 | 0 | w, 2, params->sgr.w0, params->sgr.w1 |
1252 | 0 | HIGHBD_TAIL_SUFFIX); |
1253 | 0 | } while (--h > 0); |
1254 | |
// All source rows consumed on a pair boundary. Without a bottom edge, pad by
// duplicating the last row; otherwise read the two rows at lpf_bottom.
1255 | 0 | if (!(edges & LR_HAVE_BOTTOM)) |
1256 | 0 | goto vert_2; |
1257 | |
1258 | 0 | sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], |
1259 | 0 | sumsq5_ptrs[3], sum5_ptrs[3], |
1260 | 0 | NULL, lpf_bottom, w, edges); |
1261 | 0 | lpf_bottom += PXSTRIDE(stride); |
1262 | 0 | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1263 | 0 | w, params->sgr.s1, BITDEPTH_MAX); |
1264 | 0 | rotate(A3_ptrs, B3_ptrs, 4); |
1265 | |

1266 | 0 | sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], |
1267 | 0 | sumsq5_ptrs[4], sum5_ptrs[4], |
1268 | 0 | NULL, lpf_bottom, w, edges); |
1269 | |

// Final vertical passes and final sgr_finish_mix call with a row count of 2.
// Note: no rotate of A3/B3 between the 3x3 pass and the finish call here,
// matching the in-loop sequence above.
1270 | 0 | output_2: |
1271 | 0 | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1272 | 0 | w, params->sgr.s0, BITDEPTH_MAX); |
1273 | 0 | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1274 | 0 | w, params->sgr.s1, BITDEPTH_MAX); |
1275 | 0 | sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, |
1276 | 0 | w, 2, params->sgr.w0, params->sgr.w1 |
1277 | 0 | HIGHBD_TAIL_SUFFIX); |
1278 | 0 | return; |
1279 | |
// Entered when no further input rows exist below a completed pair.
1280 | 0 | vert_2: |
1281 | | // Duplicate the last row twice more |
1282 | 0 | sumsq5_ptrs[3] = sumsq5_ptrs[2]; |
1283 | 0 | sumsq5_ptrs[4] = sumsq5_ptrs[2]; |
1284 | 0 | sum5_ptrs[3] = sum5_ptrs[2]; |
1285 | 0 | sum5_ptrs[4] = sum5_ptrs[2]; |
1286 | |

// 3x3 window: duplicate the last row once, run the pass for the first of
// the two padded rows, then duplicate again for the pass done at output_2.
1287 | 0 | sumsq3_ptrs[2] = sumsq3_ptrs[1]; |
1288 | 0 | sum3_ptrs[2] = sum3_ptrs[1]; |
1289 | 0 | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1290 | 0 | w, params->sgr.s1, BITDEPTH_MAX); |
1291 | 0 | rotate(A3_ptrs, B3_ptrs, 4); |
1292 | |

1293 | 0 | sumsq3_ptrs[2] = sumsq3_ptrs[1]; |
1294 | 0 | sum3_ptrs[2] = sum3_ptrs[1]; |
1295 | |

1296 | 0 | goto output_2; |
1297 | |
// Entered when h ran out after the first row of a pair.
1298 | 0 | odd: |
1299 | | // Copy the last row as padding once |
1300 | 0 | sumsq5_ptrs[4] = sumsq5_ptrs[3]; |
1301 | 0 | sum5_ptrs[4] = sum5_ptrs[3]; |
1302 | |

1303 | 0 | sumsq3_ptrs[2] = sumsq3_ptrs[1]; |
1304 | 0 | sum3_ptrs[2] = sum3_ptrs[1]; |
1305 | |

1306 | 0 | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1307 | 0 | w, params->sgr.s0, BITDEPTH_MAX); |
1308 | 0 | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1309 | 0 | w, params->sgr.s1, BITDEPTH_MAX); |
1310 | 0 | sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, |
1311 | 0 | w, 2, params->sgr.w0, params->sgr.w1 |
1312 | 0 | HIGHBD_TAIL_SUFFIX); |
1313 | |

// Shared tail for an odd number of rows; "odd" falls through into it and
// "vert_1" jumps to it.
1314 | 0 | output_1: |
1315 | | // Duplicate the last row twice more |
1316 | 0 | sumsq5_ptrs[3] = sumsq5_ptrs[2]; |
1317 | 0 | sumsq5_ptrs[4] = sumsq5_ptrs[2]; |
1318 | 0 | sum5_ptrs[3] = sum5_ptrs[2]; |
1319 | 0 | sum5_ptrs[4] = sum5_ptrs[2]; |
1320 | |

1321 | 0 | sumsq3_ptrs[2] = sumsq3_ptrs[1]; |
1322 | 0 | sum3_ptrs[2] = sum3_ptrs[1]; |
1323 | |

1324 | 0 | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1325 | 0 | w, params->sgr.s0, BITDEPTH_MAX); |
1326 | 0 | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1327 | 0 | w, params->sgr.s1, BITDEPTH_MAX); |
// Unlike output_2, A3/B3 are rotated before the finish call on this path.
1328 | 0 | rotate(A3_ptrs, B3_ptrs, 4); |
1329 | | // Output only one row |
1330 | 0 | sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, |
1331 | 0 | w, 1, params->sgr.w0, params->sgr.w1 |
1332 | 0 | HIGHBD_TAIL_SUFFIX); |
1333 | 0 | return; |
1334 | |
// Entered from the priming phase when the block has a single source row; no
// 5x5 vertical pass has been run yet, so one is done here before output_1.
1335 | 0 | vert_1: |
1336 | | // Copy the last row as padding once |
1337 | 0 | sumsq5_ptrs[4] = sumsq5_ptrs[3]; |
1338 | 0 | sum5_ptrs[4] = sum5_ptrs[3]; |
1339 | |

1340 | 0 | sumsq3_ptrs[2] = sumsq3_ptrs[1]; |
1341 | 0 | sum3_ptrs[2] = sum3_ptrs[1]; |
1342 | |

1343 | 0 | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1344 | 0 | w, params->sgr.s0, BITDEPTH_MAX); |
1345 | 0 | rotate(A5_ptrs, B5_ptrs, 2); |
1346 | 0 | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1347 | 0 | w, params->sgr.s1, BITDEPTH_MAX); |
1348 | 0 | rotate(A3_ptrs, B3_ptrs, 4); |
1349 | |

1350 | 0 | goto output_1; |
1351 | 0 | } |
1352 | | |
1353 | | #if HAVE_ASM |
1354 | | #if ARCH_AARCH64 || ARCH_ARM |
1355 | | #include "src/arm/looprestoration.h" |
1356 | | #elif ARCH_LOONGARCH64 |
1357 | | #include "src/loongarch/looprestoration.h" |
1358 | | #elif ARCH_PPC64LE |
1359 | | #include "src/ppc/looprestoration.h" |
1360 | | #elif ARCH_X86 |
1361 | | #include "src/x86/looprestoration.h" |
1362 | | #endif |
1363 | | #endif |
1364 | | |
// Fills the loop-restoration function table *c with the C implementations
// and then, when HAVE_ASM is set, hands the table to the architecture's own
// init routine.
//
// c    table to populate: both wiener[] slots get wiener_c; sgr[0], sgr[1]
//      and sgr[2] get the 5x5, 3x3 and mixed self-guided filters.
// bpc  forwarded unchanged to the architecture init; not otherwise used here.
//
// The C assignments must come first: the architecture init runs afterwards
// on the same table and presumably replaces entries with optimized versions
// (those routines live in the per-arch headers, not visible in this chunk).
// The coverage listing shows this template instantiated as
// dav1d_loop_restoration_dsp_init_8bpc and ..._16bpc via bitfn().
1365 | | COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c, |
1366 | | const int bpc) |
1367 | 4 | { |
// Portable defaults.
1368 | 4 | c->wiener[0] = c->wiener[1] = wiener_c; |
1369 | 4 | c->sgr[0] = sgr_5x5_c; |
1370 | 4 | c->sgr[1] = sgr_3x3_c; |
1371 | 4 | c->sgr[2] = sgr_mix_c; |
1372 | |
// At most one architecture branch is compiled in; with HAVE_ASM unset the
// table keeps the C defaults.
1373 | | #if HAVE_ASM |
1374 | | #if ARCH_AARCH64 || ARCH_ARM |
1375 | | loop_restoration_dsp_init_arm(c, bpc); |
1376 | | #elif ARCH_LOONGARCH64 |
1377 | | loop_restoration_dsp_init_loongarch(c, bpc); |
1378 | | #elif ARCH_PPC64LE |
1379 | | loop_restoration_dsp_init_ppc(c, bpc); |
1380 | | #elif ARCH_X86 |
1381 | | loop_restoration_dsp_init_x86(c, bpc); |
1382 | | #endif |
1383 | | #endif |
1384 | 4 | } dav1d_loop_restoration_dsp_init_8bpc Line | Count | Source | 1367 | 4 | { | 1368 | 4 | c->wiener[0] = c->wiener[1] = wiener_c; | 1369 | 4 | c->sgr[0] = sgr_5x5_c; | 1370 | 4 | c->sgr[1] = sgr_3x3_c; | 1371 | 4 | c->sgr[2] = sgr_mix_c; | 1372 | | | 1373 | | #if HAVE_ASM | 1374 | | #if ARCH_AARCH64 || ARCH_ARM | 1375 | | loop_restoration_dsp_init_arm(c, bpc); | 1376 | | #elif ARCH_LOONGARCH64 | 1377 | | loop_restoration_dsp_init_loongarch(c, bpc); | 1378 | | #elif ARCH_PPC64LE | 1379 | | loop_restoration_dsp_init_ppc(c, bpc); | 1380 | | #elif ARCH_X86 | 1381 | | loop_restoration_dsp_init_x86(c, bpc); | 1382 | | #endif | 1383 | | #endif | 1384 | 4 | } |
Unexecuted instantiation: dav1d_loop_restoration_dsp_init_16bpc |