/work/dav1d/src/looprestoration_tmpl.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright © 2018, VideoLAN and dav1d authors |
3 | | * Copyright © 2018, Two Orioles, LLC |
4 | | * All rights reserved. |
5 | | * |
6 | | * Redistribution and use in source and binary forms, with or without |
7 | | * modification, are permitted provided that the following conditions are met: |
8 | | * |
9 | | * 1. Redistributions of source code must retain the above copyright notice, this |
10 | | * list of conditions and the following disclaimer. |
11 | | * |
12 | | * 2. Redistributions in binary form must reproduce the above copyright notice, |
13 | | * this list of conditions and the following disclaimer in the documentation |
14 | | * and/or other materials provided with the distribution. |
15 | | * |
16 | | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
17 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
18 | | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
19 | | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
20 | | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
21 | | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
22 | | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
23 | | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
24 | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
25 | | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
26 | | */ |
27 | | |
28 | | #include "config.h" |
29 | | |
30 | | #include <stdint.h> |
31 | | #include <stdlib.h> |
32 | | #include <string.h> |
33 | | |
34 | | #include "common/attributes.h" |
35 | | #include "common/bitdepth.h" |
36 | | #include "common/intops.h" |
37 | | |
38 | | #include "src/looprestoration.h" |
39 | | #include "src/tables.h" |
40 | | |
41 | | // 256 * 1.5 + 3 + 3 = 390 |
42 | 4.17M | #define REST_UNIT_STRIDE (390) |
43 | | |
// Horizontal pass of the 7-tap Wiener filter over one row of w pixels.
//
// For every x in [0, w) this stores in dst[x] the sum of src[x - 3 .. x + 3]
// weighted by fh[0..6] (fh[7] is never read), biased by 1 << (bitdepth + 6)
// (plus src[x] * 128 when BITDEPTH == 8), then rounded by round_bits_h and
// clipped to [0, clip_limit - 1].
//
// Pixels left of src[0]:
//  - LR_HAVE_LEFT clear: src[0] is replicated;
//  - LR_HAVE_LEFT set and left != NULL: taken from left[0][1..3];
//  - LR_HAVE_LEFT set and left == NULL: read directly from src[-3..-1].
// Pixels at index >= w are read from src itself when LR_HAVE_RIGHT is set;
// otherwise src[w - 1] is replicated.
44 | | static void wiener_filter_h(uint16_t *dst, const pixel (*left)[4], |
45 | | const pixel *src, const int16_t fh[8], |
46 | | const int w, const enum LrEdgeFlags edges |
47 | | HIGHBD_DECL_SUFFIX) |
48 | 3.91M | { |
49 | 3.91M | const int bitdepth = bitdepth_from_max(bitdepth_max); |
// 12-bit input gets two extra bits of rounding in this pass.
50 | 3.91M | const int round_bits_h = 3 + (bitdepth == 12) * 2; |
51 | 3.91M | const int rounding_off_h = 1 << (round_bits_h - 1); |
52 | 3.91M | const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h); |
53 | | |
54 | 3.91M | if (w < 6) { |
55 | | // For small widths, do the fully conditional loop with |
56 | | // conditions on each access. |
57 | 2.93M | for (int x = 0; x < w; x++) { |
58 | 1.95M | int sum = (1 << (bitdepth + 6)); |
59 | 1.95M | #if BITDEPTH == 8 |
60 | 1.95M | sum += src[x] * 128; |
61 | 1.95M | #endif |
62 | 15.5M | for (int i = 0; i < 7; i++) { |
63 | 13.5M | int idx = x + i - 3; |
64 | 13.5M | if (idx < 0) { |
65 | 4.02M | if (!(edges & LR_HAVE_LEFT)) |
66 | 4.02M | sum += src[0] * fh[i]; |
67 | 18.4E | else if (left) |
// idx is in [-3, -1] here, so this reads left[0][1..3].
68 | 0 | sum += left[0][4 + idx] * fh[i]; |
69 | 18.4E | else |
70 | 18.4E | sum += src[idx] * fh[i]; |
71 | 9.54M | } else if (idx >= w && !(edges & LR_HAVE_RIGHT)) { |
72 | 3.97M | sum += src[w - 1] * fh[i]; |
73 | 3.97M | } else |
74 | 5.56M | sum += src[idx] * fh[i]; |
75 | 13.5M | } |
76 | 1.95M | sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1); |
77 | 1.95M | dst[x] = sum; |
78 | 1.95M | } |
79 | | |
80 | 973k | return; |
81 | 973k | } |
82 | | |
83 | | // For larger widths, do separate loops with less conditions; first |
84 | | // handle the start of the row. |
85 | 2.94M | int start = 3; |
86 | 2.94M | if (!(edges & LR_HAVE_LEFT)) { |
87 | | // If there's no left edge, pad using the leftmost pixel. |
88 | 4.12M | for (int x = 0; x < 3; x++) { |
89 | 3.09M | int sum = (1 << (bitdepth + 6)); |
90 | 3.09M | #if BITDEPTH == 8 |
91 | 3.09M | sum += src[x] * 128; |
92 | 3.09M | #endif |
93 | 24.6M | for (int i = 0; i < 7; i++) { |
94 | 21.6M | int idx = x + i - 3; |
95 | 21.6M | if (idx < 0) |
96 | 6.17M | sum += src[0] * fh[i]; |
97 | 15.4M | else |
98 | 15.4M | sum += src[idx] * fh[i]; |
99 | 21.6M | } |
100 | 3.09M | sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1); |
101 | 3.09M | dst[x] = sum; |
102 | 3.09M | } |
103 | 1.90M | } else if (left) { |
104 | | // If we have the left edge and a separate left buffer, pad using that. |
105 | 7.17M | for (int x = 0; x < 3; x++) { |
106 | 5.37M | int sum = (1 << (bitdepth + 6)); |
107 | 5.37M | #if BITDEPTH == 8 |
108 | 5.37M | sum += src[x] * 128; |
109 | 5.37M | #endif |
110 | 42.9M | for (int i = 0; i < 7; i++) { |
111 | 37.5M | int idx = x + i - 3; |
112 | 37.5M | if (idx < 0) |
113 | 10.7M | sum += left[0][4 + idx] * fh[i]; |
114 | 26.8M | else |
115 | 26.8M | sum += src[idx] * fh[i]; |
116 | 37.5M | } |
117 | 5.37M | sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1); |
118 | 5.37M | dst[x] = sum; |
119 | 5.37M | } |
120 | 1.79M | } else { |
121 | | // If we have the left edge, but no separate left buffer, we're in the |
122 | | // top/bottom area (lpf) with the left edge existing in the same |
123 | | // buffer; just do the regular loop from the start. |
124 | 112k | start = 0; |
125 | 112k | } |
// Without a right edge the last three outputs need padding and are left to
// the final loop; with one, the bulk loop runs to w and reads src[w .. w + 2].
126 | 2.94M | int end = w - 3; |
127 | 2.94M | if (edges & LR_HAVE_RIGHT) |
128 | 1.88M | end = w; |
129 | | |
130 | | // Do a condition free loop for the bulk of the row. |
131 | 269M | for (int x = start; x < end; x++) { |
132 | 266M | int sum = (1 << (bitdepth + 6)); |
133 | 266M | #if BITDEPTH == 8 |
134 | 266M | sum += src[x] * 128; |
135 | 266M | #endif |
136 | 2.11G | for (int i = 0; i < 7; i++) { |
137 | 1.85G | int idx = x + i - 3; |
138 | 1.85G | sum += src[idx] * fh[i]; |
139 | 1.85G | } |
140 | 266M | sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1); |
141 | 266M | dst[x] = sum; |
142 | 266M | } |
143 | | |
144 | | // If we need to, calculate the end of the row with a condition for |
145 | | // right edge padding. |
146 | 6.10M | for (int x = end; x < w; x++) { |
147 | 3.16M | int sum = (1 << (bitdepth + 6)); |
148 | 3.16M | #if BITDEPTH == 8 |
149 | 3.16M | sum += src[x] * 128; |
150 | 3.16M | #endif |
151 | 25.2M | for (int i = 0; i < 7; i++) { |
152 | 22.1M | int idx = x + i - 3; |
153 | 22.1M | if (idx >= w) |
154 | 6.31M | sum += src[w - 1] * fh[i]; |
155 | 15.7M | else |
156 | 15.7M | sum += src[idx] * fh[i]; |
157 | 22.1M | } |
158 | 3.16M | sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1); |
159 | 3.16M | dst[x] = sum; |
160 | 3.16M | } |
161 | 2.94M | } |
162 | | |
// Vertical-only pass of the 7-tap Wiener filter, producing one output row.
//
// p[i], for i in [0, w), is computed from the six intermediate rows
// ptrs[0..5] weighted by fv[0..5], with ptrs[5] used a second time for the
// seventh tap fv[6]. The sum starts at -round_offset, is rounded by
// round_bits_v and passed through iclip_pixel().
//
// On return ptrs[0..4] have been advanced by one row; ptrs[5] is unchanged.
163 | | static void wiener_filter_v(pixel *p, uint16_t **ptrs, const int16_t fv[8], |
164 | | const int w HIGHBD_DECL_SUFFIX) |
165 | 145k | { |
166 | 145k | const int bitdepth = bitdepth_from_max(bitdepth_max); |
167 | | |
// 12-bit input uses two fewer rounding bits in this pass.
168 | 145k | const int round_bits_v = 11 - (bitdepth == 12) * 2; |
169 | 145k | const int rounding_off_v = 1 << (round_bits_v - 1); |
170 | 145k | const int round_offset = 1 << (bitdepth + (round_bits_v - 1)); |
171 | | |
172 | 10.0M | for (int i = 0; i < w; i++) { |
173 | 9.87M | int sum = -round_offset; |
174 | | |
175 | | // Only filter using 6 input rows. The 7th row is assumed to be |
176 | | // identical to the last one. |
177 | | // |
178 | | // This function is assumed to only be called at the end, when doing |
179 | | // padding at the bottom. |
180 | 69.0M | for (int k = 0; k < 6; k++) |
181 | 59.2M | sum += ptrs[k][i] * fv[k]; |
182 | 9.87M | sum += ptrs[5][i] * fv[6]; |
183 | | |
184 | 9.87M | p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v); |
185 | 9.87M | } |
186 | | |
187 | | // Shift the pointers, but only update the first 5; the 6th pointer is kept |
188 | | // as it was before (and the 7th is implicitly identical to the 6th). |
189 | 870k | for (int i = 0; i < 5; i++) |
190 | 725k | ptrs[i] = ptrs[i + 1]; |
191 | 145k | } |
192 | | |
// Combined horizontal + vertical Wiener step for one new input row.
//
// The row at src is filtered horizontally (via wiener_filter_h, with the same
// left/edges handling) into a local buffer. One output row p[0..w-1] is then
// produced from ptrs[0..5] weighted by filter[1][0..5] plus the new row
// weighted by filter[1][6].
//
// Afterwards the new intermediate row is copied into the buffer ptrs[6]
// points at, ptrs[0..5] are shifted down by one, and ptrs[6] is set to the
// new ptrs[0]. The caller must therefore provide seven valid entries in ptrs.
193 | | static void wiener_filter_hv(pixel *p, uint16_t **ptrs, const pixel (*left)[4], |
194 | | const pixel *src, const int16_t filter[2][8], |
195 | | const int w, const enum LrEdgeFlags edges |
196 | | HIGHBD_DECL_SUFFIX) |
197 | 3.53M | { |
198 | 3.53M | const int bitdepth = bitdepth_from_max(bitdepth_max); |
199 | | |
200 | 3.53M | const int round_bits_v = 11 - (bitdepth == 12) * 2; |
201 | 3.53M | const int rounding_off_v = 1 << (round_bits_v - 1); |
202 | 3.53M | const int round_offset = 1 << (bitdepth + (round_bits_v - 1)); |
203 | | |
204 | 3.53M | const int16_t *fh = filter[0]; |
205 | 3.53M | const int16_t *fv = filter[1]; |
206 | | |
207 | | // Do combined horizontal and vertical filtering; doing horizontal |
208 | | // filtering of one row, combined with vertical filtering of 6 |
209 | | // preexisting rows and the newly filtered row. |
210 | | |
211 | | // For simplicity in the C implementation, just do a separate call |
212 | | // of the horizontal filter, into a temporary buffer. |
213 | 3.53M | uint16_t tmp[REST_UNIT_STRIDE]; |
214 | 3.53M | wiener_filter_h(tmp, left, src, fh, w, edges HIGHBD_TAIL_SUFFIX); |
215 | | |
216 | 258M | for (int i = 0; i < w; i++) { |
217 | 254M | int sum = -round_offset; |
218 | | |
219 | | // Filter using the 6 stored preexisting rows, and the newly |
220 | | // filtered one in tmp[]. |
221 | 1.77G | for (int k = 0; k < 6; k++) |
222 | 1.52G | sum += ptrs[k][i] * fv[k]; |
223 | 254M | sum += tmp[i] * fv[6]; |
224 | | // At this point, after having read all inputs at point [i], we |
225 | | // could overwrite [i] with the newly filtered data. |
226 | | |
227 | 254M | p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v); |
228 | 254M | } |
229 | | |
230 | | // For simplicity in the C implementation, just memcpy the newly |
231 | | // filtered row into ptrs[6]. Normally, in steady state filtering, |
232 | | // this output row, ptrs[6], is equal to ptrs[0]. However at startup, |
233 | | // at the top of the filtered area, we may have ptrs[0] equal to ptrs[1], |
234 | | // so we can't assume we can write into ptrs[0] but we need to keep |
235 | | // a separate pointer for the next row to write into. |
// NOTE(review): the copy covers the whole REST_UNIT_STRIDE buffer although
// wiener_filter_h() above only wrote tmp[0..w-1].
236 | 3.53M | memcpy(ptrs[6], tmp, sizeof(uint16_t) * REST_UNIT_STRIDE); |
237 | | |
238 | | // Rotate the window of pointers. Shift the 6 pointers downwards one step. |
239 | 24.7M | for (int i = 0; i < 6; i++) |
240 | 21.1M | ptrs[i] = ptrs[i + 1]; |
241 | | // The topmost pointer, ptrs[6], which isn't used as input, is set to |
242 | | // ptrs[0], which will be used as output for the next _hv call. |
243 | | // At the start of the filtering, the caller may set ptrs[6] to the |
244 | | // right next buffer to fill in, instead. |
245 | 3.53M | ptrs[6] = ptrs[0]; |
246 | 3.53M | } |
247 | | |
// Wiener filter entry point: filters h rows of w pixels in place at p.
//
// Intermediate (horizontally filtered) rows live in a local buffer of six
// rows of REST_UNIT_STRIDE entries; ptrs[] is the sliding window of rows fed
// to the vertical filter.
//
//  - LR_HAVE_TOP set: the two rows at lpf and lpf + stride are filtered
//    horizontally first (with left == NULL) and used as rows above the unit.
//  - LR_HAVE_TOP clear: every window entry initially aliases the first image
//    row, i.e. that row is replicated upwards.
//  - After the main loop, LR_HAVE_BOTTOM set: two more rows starting at
//    lpf + 6 * stride (lpf_bottom, computed before lpf is advanced) are
//    consumed through wiener_filter_hv().
//  - The labels v3 / v2 / v1 emit the last 3 / 2 / 1 output rows with
//    wiener_filter_v(), which reuses the newest intermediate row in place
//    of rows that do not exist.
//
// params->filter[0] holds the horizontal taps, params->filter[1] the
// vertical taps.
248 | | // FIXME Could split into luma and chroma specific functions, |
249 | | // (since first and last tops are always 0 for chroma) |
250 | | static void wiener_c(pixel *p, const ptrdiff_t stride, |
251 | | const pixel (*left)[4], |
252 | | const pixel *lpf, const int w, int h, |
253 | | const LooprestorationParams *const params, |
254 | | const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) |
255 | 92.6k | { |
256 | | // Values stored between horizontal and vertical filtering don't |
257 | | // fit in a uint8_t. |
258 | 92.6k | uint16_t hor[6 * REST_UNIT_STRIDE]; |
259 | 92.6k | uint16_t *ptrs[7], *rows[6]; |
260 | 648k | for (int i = 0; i < 6; i++) |
261 | 555k | rows[i] = &hor[i * REST_UNIT_STRIDE]; |
262 | 92.6k | const int16_t (*const filter)[8] = params->filter; |
263 | 92.6k | const int16_t *fh = params->filter[0]; |
264 | 92.6k | const int16_t *fv = params->filter[1]; |
265 | 92.6k | const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); |
266 | | |
// src walks the input rows; p only advances once output rows are written.
267 | 92.6k | const pixel *src = p; |
268 | 92.6k | if (edges & LR_HAVE_TOP) { |
// Window start: the first lpf row is used twice, then the second lpf row,
// then the first image row (replicated until more rows are available).
269 | 58.3k | ptrs[0] = rows[0]; |
270 | 58.3k | ptrs[1] = rows[0]; |
271 | 58.3k | ptrs[2] = rows[1]; |
272 | 58.3k | ptrs[3] = rows[2]; |
273 | 58.3k | ptrs[4] = rows[2]; |
274 | 58.3k | ptrs[5] = rows[2]; |
275 | | |
276 | 58.3k | wiener_filter_h(rows[0], NULL, lpf, fh, w, edges HIGHBD_TAIL_SUFFIX); |
277 | 58.3k | lpf += PXSTRIDE(stride); |
278 | 58.3k | wiener_filter_h(rows[1], NULL, lpf, fh, w, edges HIGHBD_TAIL_SUFFIX); |
279 | | |
280 | 58.3k | wiener_filter_h(rows[2], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX); |
281 | 58.3k | left++; |
282 | 58.3k | src += PXSTRIDE(stride); |
283 | | |
284 | 58.3k | if (--h <= 0) |
285 | 353 | goto v1; |
286 | | |
287 | 58.0k | ptrs[4] = ptrs[5] = rows[3]; |
288 | 58.0k | wiener_filter_h(rows[3], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX); |
289 | 58.0k | left++; |
290 | 58.0k | src += PXSTRIDE(stride); |
291 | | |
292 | 58.0k | if (--h <= 0) |
293 | 379 | goto v2; |
294 | | |
295 | 57.6k | ptrs[5] = rows[4]; |
296 | 57.6k | wiener_filter_h(rows[4], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX); |
297 | 57.6k | left++; |
298 | 57.6k | src += PXSTRIDE(stride); |
299 | | |
300 | 57.6k | if (--h <= 0) |
301 | 187 | goto v3; |
302 | 57.6k | } else { |
// No rows above: every window slot starts out aliasing the first image row.
303 | 34.2k | ptrs[0] = rows[0]; |
304 | 34.2k | ptrs[1] = rows[0]; |
305 | 34.2k | ptrs[2] = rows[0]; |
306 | 34.2k | ptrs[3] = rows[0]; |
307 | 34.2k | ptrs[4] = rows[0]; |
308 | 34.2k | ptrs[5] = rows[0]; |
309 | | |
310 | 34.2k | wiener_filter_h(rows[0], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX); |
311 | 34.2k | left++; |
312 | 34.2k | src += PXSTRIDE(stride); |
313 | | |
314 | 34.2k | if (--h <= 0) |
315 | 3.26k | goto v1; |
316 | | |
317 | 31.0k | ptrs[4] = ptrs[5] = rows[1]; |
318 | 31.0k | wiener_filter_h(rows[1], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX); |
319 | 31.0k | left++; |
320 | 31.0k | src += PXSTRIDE(stride); |
321 | | |
322 | 31.0k | if (--h <= 0) |
323 | 6.48k | goto v2; |
324 | | |
325 | 24.5k | ptrs[5] = rows[2]; |
326 | 24.5k | wiener_filter_h(rows[2], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX); |
327 | 24.5k | left++; |
328 | 24.5k | src += PXSTRIDE(stride); |
329 | | |
330 | 24.5k | if (--h <= 0) |
331 | 2.43k | goto v3; |
332 | | |
// ptrs[0] still aliases rows that are in use, so the next two hv calls are
// given explicit fresh buffers to write into (see wiener_filter_hv()).
333 | 22.0k | ptrs[6] = rows[3]; |
334 | 22.0k | wiener_filter_hv(p, ptrs, left, src, filter, w, edges |
335 | 22.0k | HIGHBD_TAIL_SUFFIX); |
336 | 22.0k | left++; |
337 | 22.0k | src += PXSTRIDE(stride); |
338 | 22.0k | p += PXSTRIDE(stride); |
339 | | |
340 | 22.0k | if (--h <= 0) |
341 | 1.67k | goto v3; |
342 | | |
343 | 20.4k | ptrs[6] = rows[4]; |
344 | 20.4k | wiener_filter_hv(p, ptrs, left, src, filter, w, edges |
345 | 20.4k | HIGHBD_TAIL_SUFFIX); |
346 | 20.4k | left++; |
347 | 20.4k | src += PXSTRIDE(stride); |
348 | 20.4k | p += PXSTRIDE(stride); |
349 | | |
350 | 20.4k | if (--h <= 0) |
351 | 1.49k | goto v3; |
352 | 20.4k | } |
353 | | |
// On both paths ptrs[5] is rows[4] here, so this selects rows[5], the one
// buffer not yet in the window.
354 | 76.3k | ptrs[6] = ptrs[5] + REST_UNIT_STRIDE; |
355 | 3.37M | do { |
356 | 3.37M | wiener_filter_hv(p, ptrs, left, src, filter, w, edges |
357 | 3.37M | HIGHBD_TAIL_SUFFIX); |
358 | 3.37M | left++; |
359 | 3.37M | src += PXSTRIDE(stride); |
360 | 3.37M | p += PXSTRIDE(stride); |
361 | 3.37M | } while (--h > 0); |
362 | | |
363 | 76.3k | if (!(edges & LR_HAVE_BOTTOM)) |
364 | 16.9k | goto v3; |
365 | | |
366 | 59.3k | wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges |
367 | 59.3k | HIGHBD_TAIL_SUFFIX); |
368 | 59.3k | lpf_bottom += PXSTRIDE(stride); |
369 | 59.3k | p += PXSTRIDE(stride); |
370 | | |
371 | 59.3k | wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges |
372 | 59.3k | HIGHBD_TAIL_SUFFIX); |
373 | 59.3k | p += PXSTRIDE(stride); |
// v1: emit the final output row (also reached by falling through from above).
374 | 92.6k | v1: |
375 | 92.6k | wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX); |
376 | | |
377 | 92.6k | return; |
378 | | |
// v3: three output rows remain; falls through into v2.
379 | 22.7k | v3: |
380 | 22.7k | wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX); |
381 | 22.7k | p += PXSTRIDE(stride); |
// v2: two output rows remain; finishes via v1.
382 | 29.6k | v2: |
383 | 29.6k | wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX); |
384 | 29.6k | p += PXSTRIDE(stride); |
385 | 29.6k | goto v1; |
386 | 22.7k | } |
387 | | |
388 | | // SGR |
// Rotates both n-entry pointer arrays left by one position: entry 0 moves to
// index n - 1 and every other entry moves down by one. The two arrays are
// rotated in lock-step.
389 | | static NOINLINE void rotate(int32_t **sumsq_ptrs, coef **sum_ptrs, int n) |
390 | 7.96M | { |
391 | 7.96M | int32_t *tmp32 = sumsq_ptrs[0]; |
392 | 7.96M | coef *tmpc = sum_ptrs[0]; |
393 | 24.5M | for (int i = 0; i < n - 1; i++) { |
394 | 16.5M | sumsq_ptrs[i] = sumsq_ptrs[i + 1]; |
395 | 16.5M | sum_ptrs[i] = sum_ptrs[i + 1]; |
396 | 16.5M | } |
397 | 7.96M | sumsq_ptrs[n - 1] = tmp32; |
398 | 7.96M | sum_ptrs[n - 1] = tmpc; |
399 | 7.96M | } |
400 | | |
// Rotates both 5-entry pointer arrays left by two positions: entries 2..4
// move to 0..2 and the former entries 0..1 end up at 3..4.
401 | | static NOINLINE void rotate5_x2(int32_t **sumsq_ptrs, coef **sum_ptrs) |
402 | 1.69M | { |
403 | 1.69M | int32_t *tmp32[2]; |
404 | 1.69M | coef *tmpc[2]; |
// Save the two oldest entries before they are overwritten.
405 | 5.08M | for (int i = 0; i < 2; i++) { |
406 | 3.39M | tmp32[i] = sumsq_ptrs[i]; |
407 | 3.39M | tmpc[i] = sum_ptrs[i]; |
408 | 3.39M | } |
409 | 6.77M | for (int i = 0; i < 3; i++) { |
410 | 5.07M | sumsq_ptrs[i] = sumsq_ptrs[i + 2]; |
411 | 5.07M | sum_ptrs[i] = sum_ptrs[i + 2]; |
412 | 5.07M | } |
413 | 5.08M | for (int i = 0; i < 2; i++) { |
414 | 3.39M | sumsq_ptrs[3 + i] = tmp32[i]; |
415 | 3.39M | sum_ptrs[3 + i] = tmpc[i]; |
416 | 3.39M | } |
417 | 1.69M | } |
418 | | |
// Horizontal 3-pixel box sums for one row.
//
// Writes w + 2 entries to both outputs: entry k (k in [0, w + 1]) of sum /
// sumsq holds the sum / sum of squares of the three pixels centred on column
// k - 1, i.e. columns -1 .. w are covered.
//
// Pixels left of src[0] come from left[0][2..3] when LR_HAVE_LEFT is set and
// left != NULL, from src[-2..-1] when it is set and left == NULL, and are
// replaced by src[0] otherwise. Pixels at index >= w are read from src when
// LR_HAVE_RIGHT is set and replaced by src[w - 1] otherwise.
419 | | static NOINLINE void sgr_box3_row_h(int32_t *sumsq, coef *sum, |
420 | | const pixel (*left)[4], |
421 | | const pixel *src, const int w, |
422 | | const enum LrEdgeFlags edges) |
423 | 3.20M | { |
// Offset the outputs by one so that index x below matches pixel column x.
424 | 3.20M | sumsq++; |
425 | 3.20M | sum++; |
426 | 3.20M | int a = edges & LR_HAVE_LEFT ? (left ? left[0][2] : src[-2]) : src[0]; |
427 | 3.20M | int b = edges & LR_HAVE_LEFT ? (left ? left[0][3] : src[-1]) : src[0]; |
// Sliding window: a, b, c hold the pixels at columns x - 1, x, x + 1.
428 | 251M | for (int x = -1; x < w + 1; x++) { |
429 | 248M | int c = (x + 1 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 1] : src[w - 1]; |
430 | 248M | sum[x] = a + b + c; |
431 | 248M | sumsq[x] = a * a + b * b + c * c; |
432 | 248M | a = b; |
433 | 248M | b = c; |
434 | 248M | } |
435 | 3.20M | } |
436 | | |
// Horizontal 5-pixel box sums for one row.
//
// Writes w + 2 entries to both outputs: entry k (k in [0, w + 1]) of sum /
// sumsq holds the sum / sum of squares of the five pixels centred on column
// k - 1, i.e. columns -1 .. w are covered.
//
// Pixels left of src[0] come from left[0][1..3] when LR_HAVE_LEFT is set and
// left != NULL, from src[-3..-1] when it is set and left == NULL, and are
// replaced by src[0] otherwise. Pixels at index >= w are read from src when
// LR_HAVE_RIGHT is set and replaced by src[w - 1] otherwise.
437 | | static NOINLINE void sgr_box5_row_h(int32_t *sumsq, coef *sum, |
438 | | const pixel (*left)[4], |
439 | | const pixel *src, const int w, |
440 | | const enum LrEdgeFlags edges) |
441 | 3.41M | { |
// Offset the outputs by one so that index x below matches pixel column x.
442 | 3.41M | sumsq++; |
443 | 3.41M | sum++; |
444 | 3.41M | int a = edges & LR_HAVE_LEFT ? (left ? left[0][1] : src[-3]) : src[0]; |
445 | 3.41M | int b = edges & LR_HAVE_LEFT ? (left ? left[0][2] : src[-2]) : src[0]; |
446 | 3.41M | int c = edges & LR_HAVE_LEFT ? (left ? left[0][3] : src[-1]) : src[0]; |
447 | 3.41M | int d = src[0]; |
// Sliding window: a..e hold the pixels at columns x - 2 .. x + 2.
448 | 263M | for (int x = -1; x < w + 1; x++) { |
449 | 260M | int e = (x + 2 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 2] : src[w - 1]; |
450 | 260M | sum[x] = a + b + c + d + e; |
451 | 260M | sumsq[x] = a * a + b * b + c * c + d * d + e * e; |
452 | 260M | a = b; |
453 | 260M | b = c; |
454 | 260M | c = d; |
455 | 260M | d = e; |
456 | 260M | } |
457 | 3.41M | } |
458 | | |
// Computes both the 3-pixel and the 5-pixel horizontal box sums of one row by
// calling sgr_box3_row_h() and sgr_box5_row_h() with the same left/src/w/edges.
459 | | static void sgr_box35_row_h(int32_t *sumsq3, coef *sum3, |
460 | | int32_t *sumsq5, coef *sum5, |
461 | | const pixel (*left)[4], |
462 | | const pixel *src, const int w, |
463 | | const enum LrEdgeFlags edges) |
464 | 2.39M | { |
465 | 2.39M | sgr_box3_row_h(sumsq3, sum3, left, src, w, edges); |
466 | 2.39M | sgr_box5_row_h(sumsq5, sum5, left, src, w, edges); |
467 | 2.39M | } |
468 | | |
// Vertical step of the 3x3 box: adds the three rows sumsq[0..2] / sum[0..2]
// element-wise into sumsq_out / sum_out, for w + 2 entries.
469 | | static NOINLINE void sgr_box3_row_v(int32_t **sumsq, coef **sum, |
470 | | int32_t *sumsq_out, coef *sum_out, |
471 | | const int w) |
472 | 3.16M | { |
473 | 248M | for (int x = 0; x < w + 2; x++) { |
474 | 245M | int sq_a = sumsq[0][x]; |
475 | 245M | int sq_b = sumsq[1][x]; |
476 | 245M | int sq_c = sumsq[2][x]; |
477 | 245M | int s_a = sum[0][x]; |
478 | 245M | int s_b = sum[1][x]; |
479 | 245M | int s_c = sum[2][x]; |
480 | 245M | sumsq_out[x] = sq_a + sq_b + sq_c; |
481 | 245M | sum_out[x] = s_a + s_b + s_c; |
482 | 245M | } |
483 | 3.16M | } |
484 | | |
// Vertical step of the 5x5 box: adds the five rows sumsq[0..4] / sum[0..4]
// element-wise into sumsq_out / sum_out, for w + 2 entries.
485 | | static NOINLINE void sgr_box5_row_v(int32_t **sumsq, coef **sum, |
486 | | int32_t *sumsq_out, coef *sum_out, |
487 | | const int w) |
488 | 1.70M | { |
489 | 132M | for (int x = 0; x < w + 2; x++) { |
490 | 130M | int sq_a = sumsq[0][x]; |
491 | 130M | int sq_b = sumsq[1][x]; |
492 | 130M | int sq_c = sumsq[2][x]; |
493 | 130M | int sq_d = sumsq[3][x]; |
494 | 130M | int sq_e = sumsq[4][x]; |
495 | 130M | int s_a = sum[0][x]; |
496 | 130M | int s_b = sum[1][x]; |
497 | 130M | int s_c = sum[2][x]; |
498 | 130M | int s_d = sum[3][x]; |
499 | 130M | int s_e = sum[4][x]; |
500 | 130M | sumsq_out[x] = sq_a + sq_b + sq_c + sq_d + sq_e; |
501 | 130M | sum_out[x] = s_a + s_b + s_c + s_d + s_e; |
502 | 130M | } |
503 | 1.70M | } |
504 | | |
// Converts one row of box sums into the per-pixel SGR coefficients, in place.
//
// On entry AA[i] holds a box sum of squares and BB[i] a box sum, for
// i in [0, w + 2). n is the number of pixels in the box (callers in this file
// pass 9 or 25) and s the strength parameter. sgr_one_by_x is a fixed-point
// reciprocal supplied by the caller (455 for n == 9, 164 for n == 25 —
// presumably round(2^12 / n); TODO confirm).
//
// On exit BB[i] holds the value looked up in dav1d_sgr_x_by_x[] and AA[i]
// the product of that value, the original BB[i] and sgr_one_by_x, rounded
// and shifted right by 12.
505 | | static NOINLINE void sgr_calc_row_ab(int32_t *AA, coef *BB, int w, int s, |
506 | | int bitdepth_max, int n, int sgr_one_by_x) |
507 | 4.85M | { |
508 | 4.85M | const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; |
509 | 370M | for (int i = 0; i < w + 2; i++) { |
// Scale both sums down by the bits above 8-bit depth, with rounding.
510 | 365M | const int a = |
511 | 365M | (AA[i] + ((1 << (2 * bitdepth_min_8)) >> 1)) >> (2 * bitdepth_min_8); |
512 | 365M | const int b = |
513 | 365M | (BB[i] + ((1 << bitdepth_min_8) >> 1)) >> bitdepth_min_8; |
514 | | |
// p = n * sumsq - sum^2, floored at zero; z indexes the lookup table and is
// capped at 255.
515 | 365M | const unsigned p = imax(a * n - b * b, 0); |
516 | 365M | const unsigned z = (p * s + (1 << 19)) >> 20; |
517 | 365M | const unsigned x = dav1d_sgr_x_by_x[umin(z, 255)]; |
518 | | |
519 | | // This is where we invert A and B, so that B is of size coef. |
520 | 365M | AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12; |
521 | 365M | BB[i] = x; |
522 | 365M | } |
523 | 4.85M | } |
524 | | |
// Vertical part of the 3x3 box for one row: sums the three rows in
// sumsq[] / sum[] into sumsq_out / sum_out, converts them in place with
// sgr_calc_row_ab() (n = 9, sgr_one_by_x = 455), then rotates the three-row
// window by one so the oldest row becomes the next one to be overwritten.
525 | | static void sgr_box3_vert(int32_t **sumsq, coef **sum, |
526 | | int32_t *sumsq_out, coef *sum_out, |
527 | | const int w, const int s, const int bitdepth_max) |
528 | 3.17M | { |
529 | 3.17M | sgr_box3_row_v(sumsq, sum, sumsq_out, sum_out, w); |
530 | 3.17M | sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, 9, 455); |
531 | 3.17M | rotate(sumsq, sum, 3); |
532 | 3.17M | } |
533 | | |
// Vertical part of the 5x5 box: sums the five rows in sumsq[] / sum[] into
// sumsq_out / sum_out, converts them in place with sgr_calc_row_ab()
// (n = 25, sgr_one_by_x = 164), then rotates the five-row window by two.
534 | | static void sgr_box5_vert(int32_t **sumsq, coef **sum, |
535 | | int32_t *sumsq_out, coef *sum_out, |
536 | | const int w, const int s, const int bitdepth_max) |
537 | 1.70M | { |
538 | 1.70M | sgr_box5_row_v(sumsq, sum, sumsq_out, sum_out, w); |
539 | 1.70M | sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, 25, 164); |
540 | 1.70M | rotate5_x2(sumsq, sum); |
541 | 1.70M | } |
542 | | |
// Combined horizontal + vertical 3x3 box step for one new input row: the
// horizontal sums of src are written into the newest window slot
// (sumsq[2] / sum[2]), then sgr_box3_vert() produces the coefficient row in
// AA / BB and rotates the window.
543 | | static void sgr_box3_hv(int32_t **sumsq, coef **sum, |
544 | | int32_t *AA, coef *BB, |
545 | | const pixel (*left)[4], |
546 | | const pixel *src, const int w, |
547 | | const int s, |
548 | | const enum LrEdgeFlags edges, |
549 | | const int bitdepth_max) |
550 | 783k | { |
551 | 783k | sgr_box3_row_h(sumsq[2], sum[2], left, src, w, edges); |
552 | 783k | sgr_box3_vert(sumsq, sum, AA, BB, w, s, bitdepth_max); |
553 | 783k | } |
554 | | |
// Final weighting for one output row, using three coefficient rows.
//
// For each i in [0, w), the 3x3 neighbourhood of A_ptrs[0..2] / B_ptrs[0..2]
// around column i + 1 is combined with weight 4 for the centre and its four
// edge neighbours and weight 3 for the four diagonal neighbours. The result
// is tmp[i] = (b - a * src[i] + (1 << 8)) >> 9, where a comes from B_ptrs and
// b from A_ptrs.
555 | | static NOINLINE void sgr_finish_filter_row1(coef *tmp, |
556 | | const pixel *src, |
557 | | int32_t **A_ptrs, coef **B_ptrs, |
558 | | const int w) |
559 | 3.01M | { |
560 | 3.01M | #define EIGHT_NEIGHBORS(P, i)\ |
561 | 450M | ((P[1][i] + P[1][i - 1] + P[1][i + 1] + P[0][i] + P[2][i]) * 4 + \ |
562 | 450M | (P[0][i - 1] + P[2][i - 1] + \ |
563 | 450M | P[0][i + 1] + P[2][i + 1]) * 3) |
564 | 228M | for (int i = 0; i < w; i++) { |
565 | 225M | const int a = EIGHT_NEIGHBORS(B_ptrs, i + 1); |
566 | 225M | const int b = EIGHT_NEIGHBORS(A_ptrs, i + 1); |
567 | 225M | tmp[i] = (b - a * src[i] + (1 << 8)) >> 9; |
568 | 225M | } |
569 | 3.01M | #undef EIGHT_NEIGHBORS |
570 | 3.01M | } |
571 | | |
572 | 7.70M | #define FILTER_OUT_STRIDE (384) |
573 | | |
// Final weighting for up to two output rows, using two coefficient rows.
//
// First row: for each i in [0, w), rows 0 and 1 of A_ptrs / B_ptrs are
// combined around column i + 1 with weight 6 for the centre column and 5 for
// the two adjacent columns; tmp[i] = (b - a * src[i] + (1 << 8)) >> 9.
//
// Second row (only when h > 1): uses row 1 alone with the same 6 / 5
// weights, reads src one stride further down, writes to
// tmp + FILTER_OUT_STRIDE and uses (… + (1 << 7)) >> 8 instead.
574 | | static NOINLINE void sgr_finish_filter2(coef *tmp, |
575 | | const pixel *src, |
576 | | const ptrdiff_t src_stride, |
577 | | int32_t **A_ptrs, coef **B_ptrs, |
578 | | const int w, const int h) |
579 | 1.61M | { |
580 | 1.61M | #define SIX_NEIGHBORS(P, i)\ |
581 | 241M | ((P[0][i] + P[1][i]) * 6 + \ |
582 | 241M | (P[0][i - 1] + P[1][i - 1] + \ |
583 | 241M | P[0][i + 1] + P[1][i + 1]) * 5) |
584 | 122M | for (int i = 0; i < w; i++) { |
585 | 120M | const int a = SIX_NEIGHBORS(B_ptrs, i + 1); |
586 | 120M | const int b = SIX_NEIGHBORS(A_ptrs, i + 1); |
587 | 120M | tmp[i] = (b - a * src[i] + (1 << 8)) >> 9; |
588 | 120M | } |
589 | 1.61M | if (h <= 1) |
590 | 18.9k | return; |
591 | 1.59M | tmp += FILTER_OUT_STRIDE; |
592 | 1.59M | src += PXSTRIDE(src_stride); |
// Point at column 1 of row 1 so that index i below matches output column i.
593 | 1.59M | const int32_t *A = &A_ptrs[1][1]; |
594 | 1.59M | const coef *B = &B_ptrs[1][1]; |
595 | 121M | for (int i = 0; i < w; i++) { |
596 | 119M | const int a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5; |
597 | 119M | const int b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5; |
598 | 119M | tmp[i] = (b - a * src[i] + (1 << 7)) >> 8; |
599 | 119M | } |
600 | 1.59M | #undef SIX_NEIGHBORS |
601 | 1.59M | } |
602 | | |
// Applies a single weighted correction to one row in place:
// dst[i] = iclip_pixel(dst[i] + ((w1 * t1[i] + (1 << 10)) >> 11)) for
// i in [0, w).
603 | | static NOINLINE void sgr_weighted_row1(pixel *dst, const coef *t1, |
604 | | const int w, const int w1 HIGHBD_DECL_SUFFIX) |
605 | 1.71M | { |
606 | 119M | for (int i = 0; i < w; i++) { |
607 | 118M | const int v = w1 * t1[i]; |
608 | 118M | dst[i] = iclip_pixel(dst[i] + ((v + (1 << 10)) >> 11)); |
609 | 118M | } |
610 | 1.71M | } |
611 | | |
// Applies a two-term weighted correction to h rows in place:
// dst[i] = iclip_pixel(dst[i] + ((w0 * t1[i] + w1 * t2[i] + (1 << 10)) >> 11)).
// dst advances by dst_stride per row; t1 and t2 advance by FILTER_OUT_STRIDE.
612 | | static NOINLINE void sgr_weighted2(pixel *dst, const ptrdiff_t dst_stride, |
613 | | const coef *t1, const coef *t2, |
614 | | const int w, const int h, |
615 | | const int w0, const int w1 HIGHBD_DECL_SUFFIX) |
616 | 1.13M | { |
617 | 3.39M | for (int j = 0; j < h; j++) { |
618 | 174M | for (int i = 0; i < w; i++) { |
619 | 172M | const int v = w0 * t1[i] + w1 * t2[i]; |
620 | 172M | dst[i] = iclip_pixel(dst[i] + ((v + (1 << 10)) >> 11)); |
621 | 172M | } |
622 | 2.25M | dst += PXSTRIDE(dst_stride); |
623 | 2.25M | t1 += FILTER_OUT_STRIDE; |
624 | 2.25M | t2 += FILTER_OUT_STRIDE; |
625 | 2.25M | } |
626 | 1.13M | } |
627 | | |
// Produces one output row from three coefficient rows: runs
// sgr_finish_filter_row1() on the row at *dst, applies it in place with
// weight w1, advances *dst by one row and rotates the three-entry
// A_ptrs / B_ptrs window by one.
628 | | static NOINLINE void sgr_finish1(pixel **dst, const ptrdiff_t stride, |
629 | | int32_t **A_ptrs, coef **B_ptrs, const int w, |
630 | | const int w1 HIGHBD_DECL_SUFFIX) |
631 | 763k | { |
632 | | // Only one single row, no stride needed |
633 | 763k | ALIGN_STK_16(coef, tmp, 384,); |
634 | | |
635 | 763k | sgr_finish_filter_row1(tmp, *dst, A_ptrs, B_ptrs, w); |
636 | 763k | sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX); |
637 | 763k | *dst += PXSTRIDE(stride); |
638 | 763k | rotate(A_ptrs, B_ptrs, 3); |
639 | 763k | } |
640 | | |
// Produces one output row (two when h > 1) from two coefficient rows: runs
// sgr_finish_filter2() starting at *dst, applies each resulting row in place
// with weight w1, advances *dst past the rows written and rotates the
// two-entry A_ptrs / B_ptrs window by one.
641 | | static NOINLINE void sgr_finish2(pixel **dst, const ptrdiff_t stride, |
642 | | int32_t **A_ptrs, coef **B_ptrs, |
643 | | const int w, const int h, const int w1 |
644 | | HIGHBD_DECL_SUFFIX) |
645 | 480k | { |
646 | 480k | ALIGN_STK_16(coef, tmp, 2*FILTER_OUT_STRIDE,); |
647 | | |
648 | 480k | sgr_finish_filter2(tmp, *dst, stride, A_ptrs, B_ptrs, w, h); |
649 | 480k | sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX); |
650 | 480k | *dst += PXSTRIDE(stride); |
651 | 480k | if (h > 1) { |
652 | 475k | sgr_weighted_row1(*dst, tmp + FILTER_OUT_STRIDE, w, w1 HIGHBD_TAIL_SUFFIX); |
653 | 475k | *dst += PXSTRIDE(stride); |
654 | 475k | } |
655 | 480k | rotate(A_ptrs, B_ptrs, 2); |
656 | 480k | } |
657 | | |
// Produces up to two output rows by mixing the 5x5 and 3x3 filters.
//
// tmp5 is filled by sgr_finish_filter2() from A5_ptrs / B5_ptrs. tmp3 is
// filled by sgr_finish_filter_row1() from A3_ptrs / B3_ptrs for the first
// row and, when h > 1, from the window starting one entry later
// (&A3_ptrs[1]) for the second row — so the 3x3 arrays must hold four
// entries. sgr_weighted2() then blends both into dst with weights w0 (tmp5)
// and w1 (tmp3).
//
// Afterwards *dst is advanced by h rows, the 5x5 window (two entries) is
// rotated by one and the 3x3 window (four entries) is rotated by one.
658 | | static NOINLINE void sgr_finish_mix(pixel **dst, const ptrdiff_t stride, |
659 | | int32_t **A5_ptrs, coef **B5_ptrs, |
660 | | int32_t **A3_ptrs, coef **B3_ptrs, |
661 | | const int w, const int h, |
662 | | const int w0, const int w1 HIGHBD_DECL_SUFFIX) |
663 | 1.13M | { |
664 | 1.13M | ALIGN_STK_16(coef, tmp5, 2*FILTER_OUT_STRIDE,); |
665 | 1.13M | ALIGN_STK_16(coef, tmp3, 2*FILTER_OUT_STRIDE,); |
666 | | |
667 | 1.13M | sgr_finish_filter2(tmp5, *dst, stride, A5_ptrs, B5_ptrs, w, h); |
668 | 1.13M | sgr_finish_filter_row1(tmp3, *dst, A3_ptrs, B3_ptrs, w); |
669 | 1.13M | if (h > 1) |
670 | 1.12M | sgr_finish_filter_row1(tmp3 + FILTER_OUT_STRIDE, *dst + PXSTRIDE(stride), |
671 | 1.12M | &A3_ptrs[1], &B3_ptrs[1], w); |
672 | 1.13M | sgr_weighted2(*dst, stride, tmp5, tmp3, w, h, w0, w1 HIGHBD_TAIL_SUFFIX); |
673 | 1.13M | *dst += h*PXSTRIDE(stride); |
674 | 1.13M | rotate(A5_ptrs, B5_ptrs, 2); |
675 | 1.13M | rotate(A3_ptrs, B3_ptrs, 4); |
676 | 1.13M | } |
677 | | |
678 | | |
// SGR filter entry point using 3x3 boxes: filters h rows of w pixels in place
// at dst, with strength params->sgr.s1 and weight params->sgr.w1.
//
// Two rolling windows of three rows each are kept on the stack: the
// horizontal box sums (sumsq_ptrs / sum_ptrs) and the derived coefficient
// rows (A_ptrs / B_ptrs). Each input row goes through sgr_box3_hv(), which
// writes a new coefficient row; sgr_finish1() then emits one output row.
//
//  - LR_HAVE_TOP set: the two rows at lpf and lpf + stride are box-summed
//    first (with left == NULL) and act as the rows above the unit.
//  - LR_HAVE_TOP clear: all three sum pointers initially alias the first
//    image row, i.e. that row is replicated upwards.
//  - LR_HAVE_BOTTOM set: after the main loop two rows starting at
//    lpf + 6 * stride are consumed to produce the last two output rows.
//  - Otherwise (or when h runs out during start-up) the vert_2 / vert_1
//    paths replicate the newest sum row (sum_ptrs[2] = sum_ptrs[1]) in
//    place of the missing rows below.
679 | | static void sgr_3x3_c(pixel *dst, const ptrdiff_t stride, |
680 | | const pixel (*left)[4], const pixel *lpf, |
681 | | const int w, int h, |
682 | | const LooprestorationParams *const params, |
683 | | const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) |
684 | 19.1k | { |
685 | 2.22M | #define BUF_STRIDE (384 + 16) |
686 | 19.1k | ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 3 + 16,); |
687 | 19.1k | ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 3 + 16,); |
688 | 19.1k | int32_t *sumsq_ptrs[3], *sumsq_rows[3]; |
689 | 19.1k | coef *sum_ptrs[3], *sum_rows[3]; |
690 | 76.6k | for (int i = 0; i < 3; i++) { |
691 | 57.4k | sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE]; |
692 | 57.4k | sum_rows[i] = &sum_buf[i * BUF_STRIDE]; |
693 | 57.4k | } |
694 | | |
695 | 19.1k | ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 3 + 16,); |
696 | 19.1k | ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 3 + 16,); |
697 | 19.1k | int32_t *A_ptrs[3]; |
698 | 19.1k | coef *B_ptrs[3]; |
699 | 76.6k | for (int i = 0; i < 3; i++) { |
700 | 57.4k | A_ptrs[i] = &A_buf[i * BUF_STRIDE]; |
701 | 57.4k | B_ptrs[i] = &B_buf[i * BUF_STRIDE]; |
702 | 57.4k | } |
// src walks the input rows; dst only advances inside sgr_finish1().
703 | 19.1k | const pixel *src = dst; |
704 | 19.1k | const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); |
705 | | |
706 | 19.1k | if (edges & LR_HAVE_TOP) { |
707 | 13.0k | sumsq_ptrs[0] = sumsq_rows[0]; |
708 | 13.0k | sumsq_ptrs[1] = sumsq_rows[1]; |
709 | 13.0k | sumsq_ptrs[2] = sumsq_rows[2]; |
710 | 13.0k | sum_ptrs[0] = sum_rows[0]; |
711 | 13.0k | sum_ptrs[1] = sum_rows[1]; |
712 | 13.0k | sum_ptrs[2] = sum_rows[2]; |
713 | | |
714 | 13.0k | sgr_box3_row_h(sumsq_rows[0], sum_rows[0], NULL, lpf, w, edges); |
715 | 13.0k | lpf += PXSTRIDE(stride); |
716 | 13.0k | sgr_box3_row_h(sumsq_rows[1], sum_rows[1], NULL, lpf, w, edges); |
717 | | |
718 | 13.0k | sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
719 | 13.0k | left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); |
720 | 13.0k | left++; |
721 | 13.0k | src += PXSTRIDE(stride); |
722 | 13.0k | rotate(A_ptrs, B_ptrs, 3); |
723 | | |
724 | 13.0k | if (--h <= 0) |
725 | 179 | goto vert_1; |
726 | | |
727 | 12.8k | sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
728 | 12.8k | left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); |
729 | 12.8k | left++; |
730 | 12.8k | src += PXSTRIDE(stride); |
731 | 12.8k | rotate(A_ptrs, B_ptrs, 3); |
732 | | |
733 | 12.8k | if (--h <= 0) |
734 | 103 | goto vert_2; |
735 | 12.8k | } else { |
// No rows above: all three window slots alias the first image row.
736 | 6.09k | sumsq_ptrs[0] = sumsq_rows[0]; |
737 | 6.09k | sumsq_ptrs[1] = sumsq_rows[0]; |
738 | 6.09k | sumsq_ptrs[2] = sumsq_rows[0]; |
739 | 6.09k | sum_ptrs[0] = sum_rows[0]; |
740 | 6.09k | sum_ptrs[1] = sum_rows[0]; |
741 | 6.09k | sum_ptrs[2] = sum_rows[0]; |
742 | | |
743 | 6.09k | sgr_box3_row_h(sumsq_rows[0], sum_rows[0], left, src, w, edges); |
744 | 6.09k | left++; |
745 | 6.09k | src += PXSTRIDE(stride); |
746 | | |
747 | 6.09k | sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
748 | 6.09k | w, params->sgr.s1, BITDEPTH_MAX); |
749 | 6.09k | rotate(A_ptrs, B_ptrs, 3); |
750 | | |
751 | 6.09k | if (--h <= 0) |
752 | 1.04k | goto vert_1; |
753 | | |
// Give the newest slot its own buffer before the next row is written into it.
754 | 5.04k | sumsq_ptrs[2] = sumsq_rows[1]; |
755 | 5.04k | sum_ptrs[2] = sum_rows[1]; |
756 | | |
757 | 5.04k | sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
758 | 5.04k | left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); |
759 | 5.04k | left++; |
760 | 5.04k | src += PXSTRIDE(stride); |
761 | 5.04k | rotate(A_ptrs, B_ptrs, 3); |
762 | | |
763 | 5.04k | if (--h <= 0) |
764 | 1.26k | goto vert_2; |
765 | | |
766 | 3.78k | sumsq_ptrs[2] = sumsq_rows[2]; |
767 | 3.78k | sum_ptrs[2] = sum_rows[2]; |
768 | 3.78k | } |
769 | | |
// Steady state: one new input row in, one output row out per iteration.
770 | 726k | do { |
771 | 726k | sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
772 | 726k | left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); |
773 | 726k | left++; |
774 | 726k | src += PXSTRIDE(stride); |
775 | | |
776 | 726k | sgr_finish1(&dst, stride, A_ptrs, B_ptrs, |
777 | 726k | w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); |
778 | 726k | } while (--h > 0); |
779 | | |
780 | 16.5k | if (!(edges & LR_HAVE_BOTTOM)) |
781 | 3.64k | goto vert_2; |
782 | | |
783 | 12.9k | sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
784 | 12.9k | NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX); |
785 | 12.9k | lpf_bottom += PXSTRIDE(stride); |
786 | | |
787 | 12.9k | sgr_finish1(&dst, stride, A_ptrs, B_ptrs, |
788 | 12.9k | w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); |
789 | | |
790 | 12.9k | sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
791 | 12.9k | NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX); |
792 | | |
793 | 12.9k | sgr_finish1(&dst, stride, A_ptrs, B_ptrs, |
794 | 12.9k | w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); |
795 | 12.9k | return; |
796 | | |
// vert_2: two output rows remain and no further input rows exist; the newest
// sum row is reused for each missing row. Falls through into output_1.
797 | 5.00k | vert_2: |
798 | 5.00k | sumsq_ptrs[2] = sumsq_ptrs[1]; |
799 | 5.00k | sum_ptrs[2] = sum_ptrs[1]; |
800 | 5.00k | sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
801 | 5.00k | w, params->sgr.s1, BITDEPTH_MAX); |
802 | | |
803 | 5.00k | sgr_finish1(&dst, stride, A_ptrs, B_ptrs, |
804 | 5.00k | w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); |
805 | | |
// output_1: emit the final output row.
806 | 6.23k | output_1: |
807 | 6.23k | sumsq_ptrs[2] = sumsq_ptrs[1]; |
808 | 6.23k | sum_ptrs[2] = sum_ptrs[1]; |
809 | 6.23k | sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
810 | 6.23k | w, params->sgr.s1, BITDEPTH_MAX); |
811 | | |
812 | 6.23k | sgr_finish1(&dst, stride, A_ptrs, B_ptrs, |
813 | 6.23k | w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); |
814 | 6.23k | return; |
815 | | |
// vert_1: only one output row in total; build one more coefficient row
// without emitting output, then finish through output_1.
816 | 1.22k | vert_1: |
817 | 1.22k | sumsq_ptrs[2] = sumsq_ptrs[1]; |
818 | 1.22k | sum_ptrs[2] = sum_ptrs[1]; |
819 | 1.22k | sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
820 | 1.22k | w, params->sgr.s1, BITDEPTH_MAX); |
821 | 1.22k | rotate(A_ptrs, B_ptrs, 3); |
822 | 1.22k | goto output_1; |
823 | 5.00k | } |
824 | | |
// sgr_5x5_c: C routine installed as c->sgr[0] by the DSP init further down.
//
// Visible data flow (helpers are defined outside this view, so their exact
// semantics are inferred from the call sites and the existing comments):
//   - sgr_box5_row_h() is given one sumsq/sum row plus one pixel row; pixel
//     rows come from lpf (above the picture), src (the picture itself, src
//     starts out aliasing dst) and lpf_bottom (below the picture).
//   - sgr_box5_vert() is given all five sumsq/sum row pointers and one A/B
//     row, together with params->sgr.s0.
//   - sgr_finish2() is given &dst, the two A/B rows, params->sgr.w0 and a row
//     count of 2 (or 1 for the last row of an odd-height block).
// Rows that do not exist at the top or bottom edge are emulated by pointing
// several sumsq_ptrs/sum_ptrs slots at the same row buffer.
//
// Exit paths:
//   output_2: one vertical pass, then two output rows.
//   vert_2:   slots [3] and [4] are copied from slot [2], then output_2.
//   odd:      h ran out right after a row was read into slot [3]; slot [4]
//             is copied from it, two rows are output, then falls into
//             output_1.
//   output_1: slots [3] and [4] are copied from slot [2], one row is output.
//   vert_1:   only reached from the two prologues when h hits 0 after the
//             first picture row; pads, runs one vertical pass, rotates A/B
//             and continues at output_1.
825 | | static void sgr_5x5_c(pixel *dst, const ptrdiff_t stride, |
826 | | const pixel (*left)[4], const pixel *lpf, |
827 | | const int w, int h, |
828 | | const LooprestorationParams *const params, |
829 | | const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) |
830 | 23.9k | { |
// Five BUF_STRIDE-sized rows of box sums; *_rows[] are the fixed row
// addresses, *_ptrs[] are the (reassignable) slots handed to the helpers.
831 | 23.9k | ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 5 + 16,); |
832 | 23.9k | ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 5 + 16,); |
833 | 23.9k | int32_t *sumsq_ptrs[5], *sumsq_rows[5]; |
834 | 23.9k | coef *sum_ptrs[5], *sum_rows[5]; |
835 | 143k | for (int i = 0; i < 5; i++) { |
836 | 119k | sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE]; |
837 | 119k | sum_rows[i] = &sum_buf[i * BUF_STRIDE]; |
838 | 119k | } |
839 | | |
// Two BUF_STRIDE-sized A/B rows, written by sgr_box5_vert() and read by
// sgr_finish2().
840 | 23.9k | ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 2 + 16,); |
841 | 23.9k | ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 2 + 16,); |
842 | 23.9k | int32_t *A_ptrs[2]; |
843 | 23.9k | coef *B_ptrs[2]; |
844 | 71.9k | for (int i = 0; i < 2; i++) { |
845 | 47.9k | A_ptrs[i] = &A_buf[i * BUF_STRIDE]; |
846 | 47.9k | B_ptrs[i] = &B_buf[i * BUF_STRIDE]; |
847 | 47.9k | } |
// Input is read from the same buffer the output is written to; the rows used
// below the picture start 6 strides into lpf.
848 | 23.9k | const pixel *src = dst; |
849 | 23.9k | const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); |
850 | | |
851 | 23.9k | if (edges & LR_HAVE_TOP) { |
// Slots [0] and [1] share rows[0], i.e. the first lpf row is used twice.
852 | 15.2k | sumsq_ptrs[0] = sumsq_rows[0]; |
853 | 15.2k | sumsq_ptrs[1] = sumsq_rows[0]; |
854 | 15.2k | sumsq_ptrs[2] = sumsq_rows[1]; |
855 | 15.2k | sumsq_ptrs[3] = sumsq_rows[2]; |
856 | 15.2k | sumsq_ptrs[4] = sumsq_rows[3]; |
857 | 15.2k | sum_ptrs[0] = sum_rows[0]; |
858 | 15.2k | sum_ptrs[1] = sum_rows[0]; |
859 | 15.2k | sum_ptrs[2] = sum_rows[1]; |
860 | 15.2k | sum_ptrs[3] = sum_rows[2]; |
861 | 15.2k | sum_ptrs[4] = sum_rows[3]; |
862 | | |
// Two rows from lpf (NULL is passed in place of left), then the first
// picture row.
863 | 15.2k | sgr_box5_row_h(sumsq_rows[0], sum_rows[0], NULL, lpf, w, edges); |
864 | 15.2k | lpf += PXSTRIDE(stride); |
865 | 15.2k | sgr_box5_row_h(sumsq_rows[1], sum_rows[1], NULL, lpf, w, edges); |
866 | | |
867 | 15.2k | sgr_box5_row_h(sumsq_rows[2], sum_rows[2], left, src, w, edges); |
868 | 15.2k | left++; |
869 | 15.2k | src += PXSTRIDE(stride); |
870 | | |
871 | 15.2k | if (--h <= 0) |
872 | 133 | goto vert_1; |
873 | | |
// Second picture row completes the five slots for the first vertical pass.
874 | 15.0k | sgr_box5_row_h(sumsq_rows[3], sum_rows[3], left, src, w, edges); |
875 | 15.0k | left++; |
876 | 15.0k | src += PXSTRIDE(stride); |
877 | 15.0k | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
878 | 15.0k | w, params->sgr.s0, BITDEPTH_MAX); |
879 | 15.0k | rotate(A_ptrs, B_ptrs, 2); |
880 | | |
881 | 15.0k | if (--h <= 0) |
882 | 123 | goto vert_2; |
883 | | |
884 | | // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set |
885 | | // one of them to point at the previously unused rows[4]. |
886 | 14.9k | sumsq_ptrs[3] = sumsq_rows[4]; |
887 | 14.9k | sum_ptrs[3] = sum_rows[4]; |
888 | 14.9k | } else { |
// No rows above the picture: every slot starts on rows[0], which is filled
// from the first picture row.
889 | 8.77k | sumsq_ptrs[0] = sumsq_rows[0]; |
890 | 8.77k | sumsq_ptrs[1] = sumsq_rows[0]; |
891 | 8.77k | sumsq_ptrs[2] = sumsq_rows[0]; |
892 | 8.77k | sumsq_ptrs[3] = sumsq_rows[0]; |
893 | 8.77k | sumsq_ptrs[4] = sumsq_rows[0]; |
894 | 8.77k | sum_ptrs[0] = sum_rows[0]; |
895 | 8.77k | sum_ptrs[1] = sum_rows[0]; |
896 | 8.77k | sum_ptrs[2] = sum_rows[0]; |
897 | 8.77k | sum_ptrs[3] = sum_rows[0]; |
898 | 8.77k | sum_ptrs[4] = sum_rows[0]; |
899 | | |
900 | 8.77k | sgr_box5_row_h(sumsq_rows[0], sum_rows[0], left, src, w, edges); |
901 | 8.77k | left++; |
902 | 8.77k | src += PXSTRIDE(stride); |
903 | | |
904 | 8.77k | if (--h <= 0) |
905 | 1.45k | goto vert_1; |
906 | | |
// Second picture row goes to rows[1] via slot [4].
907 | 7.31k | sumsq_ptrs[4] = sumsq_rows[1]; |
908 | 7.31k | sum_ptrs[4] = sum_rows[1]; |
909 | | |
910 | 7.31k | sgr_box5_row_h(sumsq_rows[1], sum_rows[1], left, src, w, edges); |
911 | 7.31k | left++; |
912 | 7.31k | src += PXSTRIDE(stride); |
913 | | |
914 | 7.31k | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
915 | 7.31k | w, params->sgr.s0, BITDEPTH_MAX); |
916 | 7.31k | rotate(A_ptrs, B_ptrs, 2); |
917 | | |
918 | 7.31k | if (--h <= 0) |
919 | 1.20k | goto vert_2; |
920 | | |
// Third and fourth picture rows use rows[2] and rows[3].
921 | 6.10k | sumsq_ptrs[3] = sumsq_rows[2]; |
922 | 6.10k | sumsq_ptrs[4] = sumsq_rows[3]; |
923 | 6.10k | sum_ptrs[3] = sum_rows[2]; |
924 | 6.10k | sum_ptrs[4] = sum_rows[3]; |
925 | | |
926 | 6.10k | sgr_box5_row_h(sumsq_rows[2], sum_rows[2], left, src, w, edges); |
927 | 6.10k | left++; |
928 | 6.10k | src += PXSTRIDE(stride); |
929 | | |
930 | 6.10k | if (--h <= 0) |
931 | 1.10k | goto odd; |
932 | | |
933 | 5.00k | sgr_box5_row_h(sumsq_rows[3], sum_rows[3], left, src, w, edges); |
934 | 5.00k | left++; |
935 | 5.00k | src += PXSTRIDE(stride); |
936 | | |
// First pair of output rows on the no-top path.
937 | 5.00k | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
938 | 5.00k | w, params->sgr.s0, BITDEPTH_MAX); |
939 | 5.00k | sgr_finish2(&dst, stride, A_ptrs, B_ptrs, |
940 | 5.00k | w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); |
941 | | |
942 | 5.00k | if (--h <= 0) |
943 | 488 | goto vert_2; |
944 | | |
945 | | // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set |
946 | | // one of them to point at the previously unused rows[4]. |
947 | 4.51k | sumsq_ptrs[3] = sumsq_rows[4]; |
948 | 4.51k | sum_ptrs[3] = sum_rows[4]; |
949 | 4.51k | } |
950 | | |
// Steady state: read two picture rows into slots [3] and [4], run one
// vertical pass and output two rows per iteration.
951 | 450k | do { |
952 | 450k | sgr_box5_row_h(sumsq_ptrs[3], sum_ptrs[3], left, src, w, edges); |
953 | 450k | left++; |
954 | 450k | src += PXSTRIDE(stride); |
955 | | |
956 | 450k | if (--h <= 0) |
957 | 2.29k | goto odd; |
958 | | |
959 | 447k | sgr_box5_row_h(sumsq_ptrs[4], sum_ptrs[4], left, src, w, edges); |
960 | 447k | left++; |
961 | 447k | src += PXSTRIDE(stride); |
962 | | |
963 | 447k | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
964 | 447k | w, params->sgr.s0, BITDEPTH_MAX); |
965 | 447k | sgr_finish2(&dst, stride, A_ptrs, B_ptrs, |
966 | 447k | w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); |
967 | 447k | } while (--h > 0); |
968 | | |
969 | 17.1k | if (!(edges & LR_HAVE_BOTTOM)) |
970 | 1.56k | goto vert_2; |
971 | | |
// Two rows below the picture are read from lpf_bottom (NULL in place of
// left).
972 | 15.6k | sgr_box5_row_h(sumsq_ptrs[3], sum_ptrs[3], NULL, lpf_bottom, w, edges); |
973 | 15.6k | lpf_bottom += PXSTRIDE(stride); |
974 | 15.6k | sgr_box5_row_h(sumsq_ptrs[4], sum_ptrs[4], NULL, lpf_bottom, w, edges); |
975 | | |
976 | 18.9k | output_2: |
977 | 18.9k | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
978 | 18.9k | w, params->sgr.s0, BITDEPTH_MAX); |
979 | 18.9k | sgr_finish2(&dst, stride, A_ptrs, B_ptrs, |
980 | 18.9k | w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); |
981 | 18.9k | return; |
982 | | |
983 | 3.38k | vert_2: |
984 | | // Duplicate the last row twice more |
985 | 3.38k | sumsq_ptrs[3] = sumsq_ptrs[2]; |
986 | 3.38k | sumsq_ptrs[4] = sumsq_ptrs[2]; |
987 | 3.38k | sum_ptrs[3] = sum_ptrs[2]; |
988 | 3.38k | sum_ptrs[4] = sum_ptrs[2]; |
989 | 3.38k | goto output_2; |
990 | | |
991 | 3.40k | odd: |
992 | | // Copy the last row as padding once |
993 | 3.40k | sumsq_ptrs[4] = sumsq_ptrs[3]; |
994 | 3.40k | sum_ptrs[4] = sum_ptrs[3]; |
995 | | |
996 | 3.40k | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
997 | 3.40k | w, params->sgr.s0, BITDEPTH_MAX); |
998 | 3.40k | sgr_finish2(&dst, stride, A_ptrs, B_ptrs, |
999 | 3.40k | w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); |
1000 | | |
// Falls through: one more output row remains after the pair above.
1001 | 4.99k | output_1: |
1002 | | // Duplicate the last row twice more |
1003 | 4.99k | sumsq_ptrs[3] = sumsq_ptrs[2]; |
1004 | 4.99k | sumsq_ptrs[4] = sumsq_ptrs[2]; |
1005 | 4.99k | sum_ptrs[3] = sum_ptrs[2]; |
1006 | 4.99k | sum_ptrs[4] = sum_ptrs[2]; |
1007 | | |
1008 | 4.99k | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
1009 | 4.99k | w, params->sgr.s0, BITDEPTH_MAX); |
1010 | | // Output only one row |
1011 | 4.99k | sgr_finish2(&dst, stride, A_ptrs, B_ptrs, |
1012 | 4.99k | w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX); |
1013 | 4.99k | return; |
1014 | | |
1015 | 1.58k | vert_1: |
1016 | | // Copy the last row as padding once |
1017 | 1.58k | sumsq_ptrs[4] = sumsq_ptrs[3]; |
1018 | 1.58k | sum_ptrs[4] = sum_ptrs[3]; |
1019 | | |
1020 | 1.58k | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
1021 | 1.58k | w, params->sgr.s0, BITDEPTH_MAX); |
1022 | 1.58k | rotate(A_ptrs, B_ptrs, 2); |
1023 | | |
1024 | 1.58k | goto output_1; |
1025 | 3.40k | } |
1026 | | |
// sgr_mix_c: C routine installed as c->sgr[2] by the DSP init further down.
// It runs a 5-row and a 3-row box pass side by side over the same pixel rows
// and combines them in sgr_finish_mix().
//
// Visible data flow (helpers are defined outside this view, so their exact
// semantics are inferred from the call sites and the existing comments):
//   - sgr_box35_row_h() is given one 3-row-buffer row and one 5-row-buffer
//     row plus a single pixel row (from lpf, src or lpf_bottom).
//   - sgr_box3_vert() is called after every input row with params->sgr.s1
//     and targets A3_ptrs[3]/B3_ptrs[3]; the 4-entry A3/B3 arrays are then
//     rotated.
//   - sgr_box5_vert() is called after every second input row with
//     params->sgr.s0 and targets A5_ptrs[1]/B5_ptrs[1].
//   - sgr_finish_mix() is given &dst, both A/B sets, params->sgr.w0 and
//     params->sgr.w1, and a row count of 2 (or 1 for the final odd row).
// Rows missing at the top or bottom edge are emulated by pointing several
// slots at the same row buffer.
//
// Exit paths mirror the labels below: output_2 (final pair of rows), vert_2
// (pad both buffers, one extra 3-row pass, then output_2), odd (pad, output
// a pair, fall into output_1), output_1 (pad, output a single row) and
// vert_1 (only reached from the two prologues when h hits 0 after the first
// picture row).
1027 | | static void sgr_mix_c(pixel *dst, const ptrdiff_t stride, |
1028 | | const pixel (*left)[4], const pixel *lpf, |
1029 | | const int w, int h, |
1030 | | const LooprestorationParams *const params, |
1031 | | const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) |
1032 | 59.1k | { |
// Box-sum rows for the 5-row pass: *_rows[] are fixed addresses, *_ptrs[]
// are the reassignable slots handed to the helpers.
1033 | 59.1k | ALIGN_STK_16(int32_t, sumsq5_buf, BUF_STRIDE * 5 + 16,); |
1034 | 59.1k | ALIGN_STK_16(coef, sum5_buf, BUF_STRIDE * 5 + 16,); |
1035 | 59.1k | int32_t *sumsq5_ptrs[5], *sumsq5_rows[5]; |
1036 | 59.1k | coef *sum5_ptrs[5], *sum5_rows[5]; |
1037 | 354k | for (int i = 0; i < 5; i++) { |
1038 | 295k | sumsq5_rows[i] = &sumsq5_buf[i * BUF_STRIDE]; |
1039 | 295k | sum5_rows[i] = &sum5_buf[i * BUF_STRIDE]; |
1040 | 295k | } |
// Same layout with three rows for the 3-row pass.
1041 | 59.1k | ALIGN_STK_16(int32_t, sumsq3_buf, BUF_STRIDE * 3 + 16,); |
1042 | 59.1k | ALIGN_STK_16(coef, sum3_buf, BUF_STRIDE * 3 + 16,); |
1043 | 59.1k | int32_t *sumsq3_ptrs[3], *sumsq3_rows[3]; |
1044 | 59.1k | coef *sum3_ptrs[3], *sum3_rows[3]; |
1045 | 236k | for (int i = 0; i < 3; i++) { |
1046 | 177k | sumsq3_rows[i] = &sumsq3_buf[i * BUF_STRIDE]; |
1047 | 177k | sum3_rows[i] = &sum3_buf[i * BUF_STRIDE]; |
1048 | 177k | } |
1049 | | |
// A/B rows: two for the 5-row pass, four for the 3-row pass.
1050 | 59.1k | ALIGN_STK_16(int32_t, A5_buf, BUF_STRIDE * 2 + 16,); |
1051 | 59.1k | ALIGN_STK_16(coef, B5_buf, BUF_STRIDE * 2 + 16,); |
1052 | 59.1k | int32_t *A5_ptrs[2]; |
1053 | 59.1k | coef *B5_ptrs[2]; |
1054 | 177k | for (int i = 0; i < 2; i++) { |
1055 | 118k | A5_ptrs[i] = &A5_buf[i * BUF_STRIDE]; |
1056 | 118k | B5_ptrs[i] = &B5_buf[i * BUF_STRIDE]; |
1057 | 118k | } |
1058 | 59.1k | ALIGN_STK_16(int32_t, A3_buf, BUF_STRIDE * 4 + 16,); |
1059 | 59.1k | ALIGN_STK_16(coef, B3_buf, BUF_STRIDE * 4 + 16,); |
1060 | 59.1k | int32_t *A3_ptrs[4]; |
1061 | 59.1k | coef *B3_ptrs[4]; |
1062 | 295k | for (int i = 0; i < 4; i++) { |
1063 | 236k | A3_ptrs[i] = &A3_buf[i * BUF_STRIDE]; |
1064 | 236k | B3_ptrs[i] = &B3_buf[i * BUF_STRIDE]; |
1065 | 236k | } |
// Input is read from the same buffer the output is written to; the rows used
// below the picture start 6 strides into lpf.
1066 | 59.1k | const pixel *src = dst; |
1067 | 59.1k | const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); |
1068 | | |
1069 | 59.1k | if (edges & LR_HAVE_TOP) { |
// 5-row slots [0] and [1] share rows[0] (first lpf row used twice); the
// 3-row slots map one-to-one onto their rows.
1070 | 37.0k | sumsq5_ptrs[0] = sumsq5_rows[0]; |
1071 | 37.0k | sumsq5_ptrs[1] = sumsq5_rows[0]; |
1072 | 37.0k | sumsq5_ptrs[2] = sumsq5_rows[1]; |
1073 | 37.0k | sumsq5_ptrs[3] = sumsq5_rows[2]; |
1074 | 37.0k | sumsq5_ptrs[4] = sumsq5_rows[3]; |
1075 | 37.0k | sum5_ptrs[0] = sum5_rows[0]; |
1076 | 37.0k | sum5_ptrs[1] = sum5_rows[0]; |
1077 | 37.0k | sum5_ptrs[2] = sum5_rows[1]; |
1078 | 37.0k | sum5_ptrs[3] = sum5_rows[2]; |
1079 | 37.0k | sum5_ptrs[4] = sum5_rows[3]; |
1080 | | |
1081 | 37.0k | sumsq3_ptrs[0] = sumsq3_rows[0]; |
1082 | 37.0k | sumsq3_ptrs[1] = sumsq3_rows[1]; |
1083 | 37.0k | sumsq3_ptrs[2] = sumsq3_rows[2]; |
1084 | 37.0k | sum3_ptrs[0] = sum3_rows[0]; |
1085 | 37.0k | sum3_ptrs[1] = sum3_rows[1]; |
1086 | 37.0k | sum3_ptrs[2] = sum3_rows[2]; |
1087 | | |
// Two rows from lpf (NULL is passed in place of left), then the first
// picture row.
1088 | 37.0k | sgr_box35_row_h(sumsq3_rows[0], sum3_rows[0], |
1089 | 37.0k | sumsq5_rows[0], sum5_rows[0], |
1090 | 37.0k | NULL, lpf, w, edges); |
1091 | 37.0k | lpf += PXSTRIDE(stride); |
1092 | 37.0k | sgr_box35_row_h(sumsq3_rows[1], sum3_rows[1], |
1093 | 37.0k | sumsq5_rows[1], sum5_rows[1], |
1094 | 37.0k | NULL, lpf, w, edges); |
1095 | | |
1096 | 37.0k | sgr_box35_row_h(sumsq3_rows[2], sum3_rows[2], |
1097 | 37.0k | sumsq5_rows[2], sum5_rows[2], |
1098 | 37.0k | left, src, w, edges); |
1099 | 37.0k | left++; |
1100 | 37.0k | src += PXSTRIDE(stride); |
1101 | | |
1102 | 37.0k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1103 | 37.0k | w, params->sgr.s1, BITDEPTH_MAX); |
1104 | 37.0k | rotate(A3_ptrs, B3_ptrs, 4); |
1105 | | |
1106 | 37.0k | if (--h <= 0) |
1107 | 272 | goto vert_1; |
1108 | | |
// Second picture row: feeds both passes, then one 5-row and one 3-row
// vertical pass.
1109 | 36.7k | sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], |
1110 | 36.7k | sumsq5_rows[3], sum5_rows[3], |
1111 | 36.7k | left, src, w, edges); |
1112 | 36.7k | left++; |
1113 | 36.7k | src += PXSTRIDE(stride); |
1114 | 36.7k | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1115 | 36.7k | w, params->sgr.s0, BITDEPTH_MAX); |
1116 | 36.7k | rotate(A5_ptrs, B5_ptrs, 2); |
1117 | 36.7k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1118 | 36.7k | w, params->sgr.s1, BITDEPTH_MAX); |
1119 | 36.7k | rotate(A3_ptrs, B3_ptrs, 4); |
1120 | | |
1121 | 36.7k | if (--h <= 0) |
1122 | 166 | goto vert_2; |
1123 | | |
1124 | | // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set |
1125 | | // one of them to point at the previously unused rows[4]. |
1126 | 36.6k | sumsq5_ptrs[3] = sumsq5_rows[4]; |
1127 | 36.6k | sum5_ptrs[3] = sum5_rows[4]; |
1128 | 36.6k | } else { |
// No rows above the picture: every slot of both buffers starts on rows[0],
// which is filled from the first picture row.
1129 | 22.1k | sumsq5_ptrs[0] = sumsq5_rows[0]; |
1130 | 22.1k | sumsq5_ptrs[1] = sumsq5_rows[0]; |
1131 | 22.1k | sumsq5_ptrs[2] = sumsq5_rows[0]; |
1132 | 22.1k | sumsq5_ptrs[3] = sumsq5_rows[0]; |
1133 | 22.1k | sumsq5_ptrs[4] = sumsq5_rows[0]; |
1134 | 22.1k | sum5_ptrs[0] = sum5_rows[0]; |
1135 | 22.1k | sum5_ptrs[1] = sum5_rows[0]; |
1136 | 22.1k | sum5_ptrs[2] = sum5_rows[0]; |
1137 | 22.1k | sum5_ptrs[3] = sum5_rows[0]; |
1138 | 22.1k | sum5_ptrs[4] = sum5_rows[0]; |
1139 | | |
1140 | 22.1k | sumsq3_ptrs[0] = sumsq3_rows[0]; |
1141 | 22.1k | sumsq3_ptrs[1] = sumsq3_rows[0]; |
1142 | 22.1k | sumsq3_ptrs[2] = sumsq3_rows[0]; |
1143 | 22.1k | sum3_ptrs[0] = sum3_rows[0]; |
1144 | 22.1k | sum3_ptrs[1] = sum3_rows[0]; |
1145 | 22.1k | sum3_ptrs[2] = sum3_rows[0]; |
1146 | | |
1147 | 22.1k | sgr_box35_row_h(sumsq3_rows[0], sum3_rows[0], |
1148 | 22.1k | sumsq5_rows[0], sum5_rows[0], |
1149 | 22.1k | left, src, w, edges); |
1150 | 22.1k | left++; |
1151 | 22.1k | src += PXSTRIDE(stride); |
1152 | | |
1153 | 22.1k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1154 | 22.1k | w, params->sgr.s1, BITDEPTH_MAX); |
1155 | 22.1k | rotate(A3_ptrs, B3_ptrs, 4); |
1156 | | |
1157 | 22.1k | if (--h <= 0) |
1158 | 3.42k | goto vert_1; |
1159 | | |
// Second picture row goes to rows[1] of both buffers.
1160 | 18.6k | sumsq5_ptrs[4] = sumsq5_rows[1]; |
1161 | 18.6k | sum5_ptrs[4] = sum5_rows[1]; |
1162 | | |
1163 | 18.6k | sumsq3_ptrs[2] = sumsq3_rows[1]; |
1164 | 18.6k | sum3_ptrs[2] = sum3_rows[1]; |
1165 | | |
1166 | 18.6k | sgr_box35_row_h(sumsq3_rows[1], sum3_rows[1], |
1167 | 18.6k | sumsq5_rows[1], sum5_rows[1], |
1168 | 18.6k | left, src, w, edges); |
1169 | 18.6k | left++; |
1170 | 18.6k | src += PXSTRIDE(stride); |
1171 | | |
1172 | 18.6k | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1173 | 18.6k | w, params->sgr.s0, BITDEPTH_MAX); |
1174 | 18.6k | rotate(A5_ptrs, B5_ptrs, 2); |
1175 | 18.6k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1176 | 18.6k | w, params->sgr.s1, BITDEPTH_MAX); |
1177 | 18.6k | rotate(A3_ptrs, B3_ptrs, 4); |
1178 | | |
1179 | 18.6k | if (--h <= 0) |
1180 | 3.66k | goto vert_2; |
1181 | | |
// Third and fourth picture rows use 5-row rows[2]/rows[3]; the third row
// uses 3-row rows[2].
1182 | 15.0k | sumsq5_ptrs[3] = sumsq5_rows[2]; |
1183 | 15.0k | sumsq5_ptrs[4] = sumsq5_rows[3]; |
1184 | 15.0k | sum5_ptrs[3] = sum5_rows[2]; |
1185 | 15.0k | sum5_ptrs[4] = sum5_rows[3]; |
1186 | | |
1187 | 15.0k | sumsq3_ptrs[2] = sumsq3_rows[2]; |
1188 | 15.0k | sum3_ptrs[2] = sum3_rows[2]; |
1189 | | |
1190 | 15.0k | sgr_box35_row_h(sumsq3_rows[2], sum3_rows[2], |
1191 | 15.0k | sumsq5_rows[2], sum5_rows[2], |
1192 | 15.0k | left, src, w, edges); |
1193 | 15.0k | left++; |
1194 | 15.0k | src += PXSTRIDE(stride); |
1195 | | |
1196 | 15.0k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1197 | 15.0k | w, params->sgr.s1, BITDEPTH_MAX); |
1198 | 15.0k | rotate(A3_ptrs, B3_ptrs, 4); |
1199 | | |
1200 | 15.0k | if (--h <= 0) |
1201 | 1.89k | goto odd; |
1202 | | |
1203 | 13.1k | sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], |
1204 | 13.1k | sumsq5_rows[3], sum5_rows[3], |
1205 | 13.1k | left, src, w, edges); |
1206 | 13.1k | left++; |
1207 | 13.1k | src += PXSTRIDE(stride); |
1208 | | |
// First pair of output rows on the no-top path.
1209 | 13.1k | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1210 | 13.1k | w, params->sgr.s0, BITDEPTH_MAX); |
1211 | 13.1k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1212 | 13.1k | w, params->sgr.s1, BITDEPTH_MAX); |
1213 | 13.1k | sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, |
1214 | 13.1k | w, 2, params->sgr.w0, params->sgr.w1 |
1215 | 13.1k | HIGHBD_TAIL_SUFFIX); |
1216 | | |
1217 | 13.1k | if (--h <= 0) |
1218 | 1.04k | goto vert_2; |
1219 | | |
1220 | | // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set |
1221 | | // one of them to point at the previously unused rows[4]. |
1222 | 12.0k | sumsq5_ptrs[3] = sumsq5_rows[4]; |
1223 | 12.0k | sum5_ptrs[3] = sum5_rows[4]; |
1224 | 12.0k | } |
1225 | | |
// Steady state: two picture rows per iteration. The 3-row vertical pass runs
// after each row, the 5-row vertical pass and the output after each pair.
1226 | 1.06M | do { |
1227 | 1.06M | sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], |
1228 | 1.06M | sumsq5_ptrs[3], sum5_ptrs[3], |
1229 | 1.06M | left, src, w, edges); |
1230 | 1.06M | left++; |
1231 | 1.06M | src += PXSTRIDE(stride); |
1232 | | |
1233 | 1.06M | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1234 | 1.06M | w, params->sgr.s1, BITDEPTH_MAX); |
1235 | 1.06M | rotate(A3_ptrs, B3_ptrs, 4); |
1236 | | |
1237 | 1.06M | if (--h <= 0) |
1238 | 8.37k | goto odd; |
1239 | | |
1240 | 1.05M | sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], |
1241 | 1.05M | sumsq5_ptrs[4], sum5_ptrs[4], |
1242 | 1.05M | left, src, w, edges); |
1243 | 1.05M | left++; |
1244 | 1.05M | src += PXSTRIDE(stride); |
1245 | | |
1246 | 1.05M | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1247 | 1.05M | w, params->sgr.s0, BITDEPTH_MAX); |
1248 | 1.05M | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1249 | 1.05M | w, params->sgr.s1, BITDEPTH_MAX); |
1250 | 1.05M | sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, |
1251 | 1.05M | w, 2, params->sgr.w0, params->sgr.w1 |
1252 | 1.05M | HIGHBD_TAIL_SUFFIX); |
1253 | 1.05M | } while (--h > 0); |
1254 | | |
1255 | 40.3k | if (!(edges & LR_HAVE_BOTTOM)) |
1256 | 2.78k | goto vert_2; |
1257 | | |
// Two rows below the picture are read from lpf_bottom (NULL in place of
// left); the 3-row pass for the first of them runs here, the rest happens at
// output_2.
1258 | 37.5k | sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], |
1259 | 37.5k | sumsq5_ptrs[3], sum5_ptrs[3], |
1260 | 37.5k | NULL, lpf_bottom, w, edges); |
1261 | 37.5k | lpf_bottom += PXSTRIDE(stride); |
1262 | 37.5k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1263 | 37.5k | w, params->sgr.s1, BITDEPTH_MAX); |
1264 | 37.5k | rotate(A3_ptrs, B3_ptrs, 4); |
1265 | | |
1266 | 37.5k | sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], |
1267 | 37.5k | sumsq5_ptrs[4], sum5_ptrs[4], |
1268 | 37.5k | NULL, lpf_bottom, w, edges); |
1269 | | |
1270 | 45.1k | output_2: |
1271 | 45.1k | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1272 | 45.1k | w, params->sgr.s0, BITDEPTH_MAX); |
1273 | 45.1k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1274 | 45.1k | w, params->sgr.s1, BITDEPTH_MAX); |
1275 | 45.1k | sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, |
1276 | 45.1k | w, 2, params->sgr.w0, params->sgr.w1 |
1277 | 45.1k | HIGHBD_TAIL_SUFFIX); |
1278 | 45.1k | return; |
1279 | | |
1280 | 7.65k | vert_2: |
1281 | | // Duplicate the last row twice more |
1282 | 7.65k | sumsq5_ptrs[3] = sumsq5_ptrs[2]; |
1283 | 7.65k | sumsq5_ptrs[4] = sumsq5_ptrs[2]; |
1284 | 7.65k | sum5_ptrs[3] = sum5_ptrs[2]; |
1285 | 7.65k | sum5_ptrs[4] = sum5_ptrs[2]; |
1286 | | |
// 3-row buffer: reuse slot [1] as the newest row, run one pass, then repeat
// the slot copy so output_2's 3-row pass sees the same padding.
1287 | 7.65k | sumsq3_ptrs[2] = sumsq3_ptrs[1]; |
1288 | 7.65k | sum3_ptrs[2] = sum3_ptrs[1]; |
1289 | 7.65k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1290 | 7.65k | w, params->sgr.s1, BITDEPTH_MAX); |
1291 | 7.65k | rotate(A3_ptrs, B3_ptrs, 4); |
1292 | | |
1293 | 7.65k | sumsq3_ptrs[2] = sumsq3_ptrs[1]; |
1294 | 7.65k | sum3_ptrs[2] = sum3_ptrs[1]; |
1295 | | |
1296 | 7.65k | goto output_2; |
1297 | | |
1298 | 10.2k | odd: |
1299 | | // Copy the last row as padding once |
1300 | 10.2k | sumsq5_ptrs[4] = sumsq5_ptrs[3]; |
1301 | 10.2k | sum5_ptrs[4] = sum5_ptrs[3]; |
1302 | | |
1303 | 10.2k | sumsq3_ptrs[2] = sumsq3_ptrs[1]; |
1304 | 10.2k | sum3_ptrs[2] = sum3_ptrs[1]; |
1305 | | |
1306 | 10.2k | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1307 | 10.2k | w, params->sgr.s0, BITDEPTH_MAX); |
1308 | 10.2k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1309 | 10.2k | w, params->sgr.s1, BITDEPTH_MAX); |
1310 | 10.2k | sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, |
1311 | 10.2k | w, 2, params->sgr.w0, params->sgr.w1 |
1312 | 10.2k | HIGHBD_TAIL_SUFFIX); |
1313 | | |
// Falls through: one more output row remains after the pair above.
1314 | 13.9k | output_1: |
1315 | | // Duplicate the last row twice more |
1316 | 13.9k | sumsq5_ptrs[3] = sumsq5_ptrs[2]; |
1317 | 13.9k | sumsq5_ptrs[4] = sumsq5_ptrs[2]; |
1318 | 13.9k | sum5_ptrs[3] = sum5_ptrs[2]; |
1319 | 13.9k | sum5_ptrs[4] = sum5_ptrs[2]; |
1320 | | |
1321 | 13.9k | sumsq3_ptrs[2] = sumsq3_ptrs[1]; |
1322 | 13.9k | sum3_ptrs[2] = sum3_ptrs[1]; |
1323 | | |
1324 | 13.9k | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1325 | 13.9k | w, params->sgr.s0, BITDEPTH_MAX); |
1326 | 13.9k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1327 | 13.9k | w, params->sgr.s1, BITDEPTH_MAX); |
1328 | 13.9k | rotate(A3_ptrs, B3_ptrs, 4); |
1329 | | // Output only one row |
1330 | 13.9k | sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, |
1331 | 13.9k | w, 1, params->sgr.w0, params->sgr.w1 |
1332 | 13.9k | HIGHBD_TAIL_SUFFIX); |
1333 | 13.9k | return; |
1334 | | |
1335 | 3.69k | vert_1: |
1336 | | // Copy the last row as padding once |
1337 | 3.69k | sumsq5_ptrs[4] = sumsq5_ptrs[3]; |
1338 | 3.69k | sum5_ptrs[4] = sum5_ptrs[3]; |
1339 | | |
1340 | 3.69k | sumsq3_ptrs[2] = sumsq3_ptrs[1]; |
1341 | 3.69k | sum3_ptrs[2] = sum3_ptrs[1]; |
1342 | | |
1343 | 3.69k | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1344 | 3.69k | w, params->sgr.s0, BITDEPTH_MAX); |
1345 | 3.69k | rotate(A5_ptrs, B5_ptrs, 2); |
1346 | 3.69k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1347 | 3.69k | w, params->sgr.s1, BITDEPTH_MAX); |
1348 | 3.69k | rotate(A3_ptrs, B3_ptrs, 4); |
1349 | | |
1350 | 3.69k | goto output_1; |
1351 | 10.2k | } |
1352 | | |
1353 | | #if HAVE_ASM |
1354 | | #if ARCH_AARCH64 || ARCH_ARM |
1355 | | #include "src/arm/looprestoration.h" |
1356 | | #elif ARCH_LOONGARCH64 |
1357 | | #include "src/loongarch/looprestoration.h" |
1358 | | #elif ARCH_PPC64LE |
1359 | | #include "src/ppc/looprestoration.h" |
1360 | | #elif ARCH_X86 |
1361 | | #include "src/x86/looprestoration.h" |
1362 | | #endif |
1363 | | #endif |
1364 | | |
// Fills the loop-restoration function table `c` with the C implementations
// from this file: both wiener[] entries use wiener_c, and sgr[0], sgr[1] and
// sgr[2] use the 5x5, 3x3 and mixed routines respectively.
// When HAVE_ASM is set, the init function of the matching architecture is
// then called with the same `c` and `bpc`; presumably it replaces some of
// these pointers with optimized versions (those functions are defined in the
// included arch headers, outside this view). `bpc` is only used by those
// calls.
1365 | | COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c, |
1366 | | const int bpc) |
1367 | 59.4k | { |
// Portable defaults.
1368 | 59.4k | c->wiener[0] = c->wiener[1] = wiener_c; |
1369 | 59.4k | c->sgr[0] = sgr_5x5_c; |
1370 | 59.4k | c->sgr[1] = sgr_3x3_c; |
1371 | 59.4k | c->sgr[2] = sgr_mix_c; |
1372 | | |
// Architecture-specific init, selected at compile time.
1373 | | #if HAVE_ASM |
1374 | | #if ARCH_AARCH64 || ARCH_ARM |
1375 | | loop_restoration_dsp_init_arm(c, bpc); |
1376 | | #elif ARCH_LOONGARCH64 |
1377 | | loop_restoration_dsp_init_loongarch(c, bpc); |
1378 | | #elif ARCH_PPC64LE |
1379 | | loop_restoration_dsp_init_ppc(c, bpc); |
1380 | | #elif ARCH_X86 |
1381 | | loop_restoration_dsp_init_x86(c, bpc); |
1382 | | #endif |
1383 | | #endif |
1384 | 59.4k | } dav1d_loop_restoration_dsp_init_8bpc Line | Count | Source | 1367 | 27.6k | { | 1368 | 27.6k | c->wiener[0] = c->wiener[1] = wiener_c; | 1369 | 27.6k | c->sgr[0] = sgr_5x5_c; | 1370 | 27.6k | c->sgr[1] = sgr_3x3_c; | 1371 | 27.6k | c->sgr[2] = sgr_mix_c; | 1372 | | | 1373 | | #if HAVE_ASM | 1374 | | #if ARCH_AARCH64 || ARCH_ARM | 1375 | | loop_restoration_dsp_init_arm(c, bpc); | 1376 | | #elif ARCH_LOONGARCH64 | 1377 | | loop_restoration_dsp_init_loongarch(c, bpc); | 1378 | | #elif ARCH_PPC64LE | 1379 | | loop_restoration_dsp_init_ppc(c, bpc); | 1380 | | #elif ARCH_X86 | 1381 | | loop_restoration_dsp_init_x86(c, bpc); | 1382 | | #endif | 1383 | | #endif | 1384 | 27.6k | } |
dav1d_loop_restoration_dsp_init_16bpc Line | Count | Source | 1367 | 31.7k | { | 1368 | 31.7k | c->wiener[0] = c->wiener[1] = wiener_c; | 1369 | 31.7k | c->sgr[0] = sgr_5x5_c; | 1370 | 31.7k | c->sgr[1] = sgr_3x3_c; | 1371 | 31.7k | c->sgr[2] = sgr_mix_c; | 1372 | | | 1373 | | #if HAVE_ASM | 1374 | | #if ARCH_AARCH64 || ARCH_ARM | 1375 | | loop_restoration_dsp_init_arm(c, bpc); | 1376 | | #elif ARCH_LOONGARCH64 | 1377 | | loop_restoration_dsp_init_loongarch(c, bpc); | 1378 | | #elif ARCH_PPC64LE | 1379 | | loop_restoration_dsp_init_ppc(c, bpc); | 1380 | | #elif ARCH_X86 | 1381 | | loop_restoration_dsp_init_x86(c, bpc); | 1382 | | #endif | 1383 | | #endif | 1384 | 31.7k | } |
|