/work/dav1d/src/looprestoration_tmpl.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright © 2018, VideoLAN and dav1d authors |
3 | | * Copyright © 2018, Two Orioles, LLC |
4 | | * All rights reserved. |
5 | | * |
6 | | * Redistribution and use in source and binary forms, with or without |
7 | | * modification, are permitted provided that the following conditions are met: |
8 | | * |
9 | | * 1. Redistributions of source code must retain the above copyright notice, this |
10 | | * list of conditions and the following disclaimer. |
11 | | * |
12 | | * 2. Redistributions in binary form must reproduce the above copyright notice, |
13 | | * this list of conditions and the following disclaimer in the documentation |
14 | | * and/or other materials provided with the distribution. |
15 | | * |
16 | | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
17 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
18 | | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
19 | | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
20 | | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
21 | | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
22 | | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
23 | | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
24 | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
25 | | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
26 | | */ |
27 | | |
28 | | #include "config.h" |
29 | | |
30 | | #include <stdint.h> |
31 | | #include <stdlib.h> |
32 | | #include <string.h> |
33 | | |
34 | | #include "common/attributes.h" |
35 | | #include "common/bitdepth.h" |
36 | | #include "common/intops.h" |
37 | | |
38 | | #include "src/looprestoration.h" |
39 | | #include "src/tables.h" |
40 | | |
41 | | // 256 * 1.5 + 3 + 3 = 390 |
42 | 5.10M | #define REST_UNIT_STRIDE (390) |
43 | | |
44 | | static void wiener_filter_h(uint16_t *dst, const pixel (*left)[4], |
45 | | const pixel *src, const int16_t fh[8], |
46 | | const int w, const enum LrEdgeFlags edges |
47 | | HIGHBD_DECL_SUFFIX) |
48 | 4.77M | { |
49 | 4.77M | const int bitdepth = bitdepth_from_max(bitdepth_max); |
50 | 4.77M | const int round_bits_h = 3 + (bitdepth == 12) * 2; |
51 | 4.77M | const int rounding_off_h = 1 << (round_bits_h - 1); |
52 | 4.77M | const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h); |
53 | | |
54 | 4.77M | if (w < 6) { |
55 | | // For small widths, do the fully conditional loop with |
56 | | // conditions on each access. |
57 | 2.58M | for (int x = 0; x < w; x++) { |
58 | 1.69M | int sum = (1 << (bitdepth + 6)); |
59 | 1.69M | #if BITDEPTH == 8 |
60 | 1.69M | sum += src[x] * 128; |
61 | 1.69M | #endif |
62 | 13.4M | for (int i = 0; i < 7; i++) { |
63 | 11.7M | int idx = x + i - 3; |
64 | 11.7M | if (idx < 0) { |
65 | 3.72M | if (!(edges & LR_HAVE_LEFT)) |
66 | 3.73M | sum += src[0] * fh[i]; |
67 | 18.4E | else if (left) |
68 | 0 | sum += left[0][4 + idx] * fh[i]; |
69 | 18.4E | else |
70 | 18.4E | sum += src[idx] * fh[i]; |
71 | 8.01M | } else if (idx >= w && !(edges & LR_HAVE_RIGHT)) { |
72 | 3.67M | sum += src[w - 1] * fh[i]; |
73 | 3.67M | } else |
74 | 4.34M | sum += src[idx] * fh[i]; |
75 | 11.7M | } |
76 | 1.69M | sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1); |
77 | 1.69M | dst[x] = sum; |
78 | 1.69M | } |
79 | | |
80 | 893k | return; |
81 | 893k | } |
82 | | |
83 | | // For larger widths, do separate loops with fewer conditions; first |
84 | | // handle the start of the row. |
85 | 3.87M | int start = 3; |
86 | 3.87M | if (!(edges & LR_HAVE_LEFT)) { |
87 | | // If there's no left edge, pad using the leftmost pixel. |
88 | 5.37M | for (int x = 0; x < 3; x++) { |
89 | 4.02M | int sum = (1 << (bitdepth + 6)); |
90 | 4.02M | #if BITDEPTH == 8 |
91 | 4.02M | sum += src[x] * 128; |
92 | 4.02M | #endif |
93 | 32.1M | for (int i = 0; i < 7; i++) { |
94 | 28.1M | int idx = x + i - 3; |
95 | 28.1M | if (idx < 0) |
96 | 8.05M | sum += src[0] * fh[i]; |
97 | 20.1M | else |
98 | 20.1M | sum += src[idx] * fh[i]; |
99 | 28.1M | } |
100 | 4.02M | sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1); |
101 | 4.02M | dst[x] = sum; |
102 | 4.02M | } |
103 | 2.53M | } else if (left) { |
104 | | // If we have the left edge and a separate left buffer, pad using that. |
105 | 9.47M | for (int x = 0; x < 3; x++) { |
106 | 7.09M | int sum = (1 << (bitdepth + 6)); |
107 | 7.09M | #if BITDEPTH == 8 |
108 | 7.09M | sum += src[x] * 128; |
109 | 7.09M | #endif |
110 | 56.6M | for (int i = 0; i < 7; i++) { |
111 | 49.5M | int idx = x + i - 3; |
112 | 49.5M | if (idx < 0) |
113 | 14.1M | sum += left[0][4 + idx] * fh[i]; |
114 | 35.3M | else |
115 | 35.3M | sum += src[idx] * fh[i]; |
116 | 49.5M | } |
117 | 7.09M | sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1); |
118 | 7.09M | dst[x] = sum; |
119 | 7.09M | } |
120 | 2.37M | } else { |
121 | | // If we have the left edge, but no separate left buffer, we're in the |
122 | | // top/bottom area (lpf) with the left edge existing in the same |
123 | | // buffer; just do the regular loop from the start. |
124 | 156k | start = 0; |
125 | 156k | } |
126 | 3.87M | int end = w - 3; |
127 | 3.87M | if (edges & LR_HAVE_RIGHT) |
128 | 2.49M | end = w; |
129 | | |
130 | | // Do a condition-free loop for the bulk of the row. |
131 | 371M | for (int x = start; x < end; x++) { |
132 | 367M | int sum = (1 << (bitdepth + 6)); |
133 | 367M | #if BITDEPTH == 8 |
134 | 367M | sum += src[x] * 128; |
135 | 367M | #endif |
136 | 2.93G | for (int i = 0; i < 7; i++) { |
137 | 2.56G | int idx = x + i - 3; |
138 | 2.56G | sum += src[idx] * fh[i]; |
139 | 2.56G | } |
140 | 367M | sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1); |
141 | 367M | dst[x] = sum; |
142 | 367M | } |
143 | | |
144 | | // If we need to, calculate the end of the row with a condition for |
145 | | // right edge padding. |
146 | 7.99M | for (int x = end; x < w; x++) { |
147 | 4.11M | int sum = (1 << (bitdepth + 6)); |
148 | 4.11M | #if BITDEPTH == 8 |
149 | 4.11M | sum += src[x] * 128; |
150 | 4.11M | #endif |
151 | 32.8M | for (int i = 0; i < 7; i++) { |
152 | 28.7M | int idx = x + i - 3; |
153 | 28.7M | if (idx >= w) |
154 | 8.22M | sum += src[w - 1] * fh[i]; |
155 | 20.5M | else |
156 | 20.5M | sum += src[idx] * fh[i]; |
157 | 28.7M | } |
158 | 4.11M | sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1); |
159 | 4.11M | dst[x] = sum; |
160 | 4.11M | } |
161 | 3.87M | } |
162 | | |
163 | | static void wiener_filter_v(pixel *p, uint16_t **ptrs, const int16_t fv[8], |
164 | | const int w HIGHBD_DECL_SUFFIX) |
165 | 186k | { |
166 | 186k | const int bitdepth = bitdepth_from_max(bitdepth_max); |
167 | | |
168 | 186k | const int round_bits_v = 11 - (bitdepth == 12) * 2; |
169 | 186k | const int rounding_off_v = 1 << (round_bits_v - 1); |
170 | 186k | const int round_offset = 1 << (bitdepth + (round_bits_v - 1)); |
171 | | |
172 | 13.1M | for (int i = 0; i < w; i++) { |
173 | 12.9M | int sum = -round_offset; |
174 | | |
175 | | // Only filter using 6 input rows. The 7th row is assumed to be |
176 | | // identical to the last one. |
177 | | // |
178 | | // This function is assumed to only be called at the end, when doing |
179 | | // padding at the bottom. |
180 | 90.4M | for (int k = 0; k < 6; k++) |
181 | 77.5M | sum += ptrs[k][i] * fv[k]; |
182 | 12.9M | sum += ptrs[5][i] * fv[6]; |
183 | | |
184 | 12.9M | p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v); |
185 | 12.9M | } |
186 | | |
187 | | // Shift the pointers, but only update the first 5; the 6th pointer is kept |
188 | | // as it was before (and the 7th is implicitly identical to the 6th). |
189 | 1.11M | for (int i = 0; i < 5; i++) |
190 | 932k | ptrs[i] = ptrs[i + 1]; |
191 | 186k | } |
192 | | |
193 | | static void wiener_filter_hv(pixel *p, uint16_t **ptrs, const pixel (*left)[4], |
194 | | const pixel *src, const int16_t filter[2][8], |
195 | | const int w, const enum LrEdgeFlags edges |
196 | | HIGHBD_DECL_SUFFIX) |
197 | 4.28M | { |
198 | 4.28M | const int bitdepth = bitdepth_from_max(bitdepth_max); |
199 | | |
200 | 4.28M | const int round_bits_v = 11 - (bitdepth == 12) * 2; |
201 | 4.28M | const int rounding_off_v = 1 << (round_bits_v - 1); |
202 | 4.28M | const int round_offset = 1 << (bitdepth + (round_bits_v - 1)); |
203 | | |
204 | 4.28M | const int16_t *fh = filter[0]; |
205 | 4.28M | const int16_t *fv = filter[1]; |
206 | | |
207 | | // Do combined horizontal and vertical filtering; doing horizontal |
208 | | // filtering of one row, combined with vertical filtering of 6 |
209 | | // preexisting rows and the newly filtered row. |
210 | | |
211 | | // For simplicity in the C implementation, just do a separate call |
212 | | // of the horizontal filter, into a temporary buffer. |
213 | 4.28M | uint16_t tmp[REST_UNIT_STRIDE]; |
214 | 4.28M | wiener_filter_h(tmp, left, src, fh, w, edges HIGHBD_TAIL_SUFFIX); |
215 | | |
216 | 353M | for (int i = 0; i < w; i++) { |
217 | 348M | int sum = -round_offset; |
218 | | |
219 | | // Filter using the 6 stored preexisting rows, and the newly |
220 | | // filtered one in tmp[]. |
221 | 2.42G | for (int k = 0; k < 6; k++) |
222 | 2.07G | sum += ptrs[k][i] * fv[k]; |
223 | 348M | sum += tmp[i] * fv[6]; |
224 | | // At this point, after having read all inputs at point [i], we |
225 | | // could overwrite [i] with the newly filtered data. |
226 | | |
227 | 348M | p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v); |
228 | 348M | } |
229 | | |
230 | | // For simplicity in the C implementation, just memcpy the newly |
231 | | // filtered row into ptrs[6]. Normally, in steady state filtering, |
232 | | // this output row, ptrs[6], is equal to ptrs[0]. However at startup, |
233 | | // at the top of the filtered area, we may have ptrs[0] equal to ptrs[1], |
234 | | // so we can't assume we can write into ptrs[0] but we need to keep |
235 | | // a separate pointer for the next row to write into. |
236 | 4.28M | memcpy(ptrs[6], tmp, sizeof(uint16_t) * REST_UNIT_STRIDE); |
237 | | |
238 | | // Rotate the window of pointers. Shift the 6 pointers downwards one step. |
239 | 29.9M | for (int i = 0; i < 6; i++) |
240 | 25.6M | ptrs[i] = ptrs[i + 1]; |
241 | | // The topmost pointer, ptrs[6], which isn't used as input, is set to |
242 | | // ptrs[0], which will be used as output for the next _hv call. |
243 | | // At the start of the filtering, the caller may set ptrs[6] to the |
244 | | // right next buffer to fill in, instead. |
245 | 4.28M | ptrs[6] = ptrs[0]; |
246 | 4.28M | } |
247 | | |
248 | | // FIXME Could split into luma and chroma specific functions, |
249 | | // (since first and last taps are always 0 for chroma) |
250 | | static void wiener_c(pixel *p, const ptrdiff_t stride, |
251 | | const pixel (*left)[4], |
252 | | const pixel *lpf, const int w, int h, |
253 | | const LooprestorationParams *const params, |
254 | | const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) |
255 | 119k | { |
256 | | // Values stored between horizontal and vertical filtering don't |
257 | | // fit in a uint8_t. |
258 | 119k | uint16_t hor[6 * REST_UNIT_STRIDE]; |
259 | 119k | uint16_t *ptrs[7], *rows[6]; |
260 | 834k | for (int i = 0; i < 6; i++) |
261 | 714k | rows[i] = &hor[i * REST_UNIT_STRIDE]; |
262 | 119k | const int16_t (*const filter)[8] = params->filter; |
263 | 119k | const int16_t *fh = params->filter[0]; |
264 | 119k | const int16_t *fv = params->filter[1]; |
265 | 119k | const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); |
266 | | |
267 | 119k | const pixel *src = p; |
268 | 119k | if (edges & LR_HAVE_TOP) { |
269 | 74.8k | ptrs[0] = rows[0]; |
270 | 74.8k | ptrs[1] = rows[0]; |
271 | 74.8k | ptrs[2] = rows[1]; |
272 | 74.8k | ptrs[3] = rows[2]; |
273 | 74.8k | ptrs[4] = rows[2]; |
274 | 74.8k | ptrs[5] = rows[2]; |
275 | | |
276 | 74.8k | wiener_filter_h(rows[0], NULL, lpf, fh, w, edges HIGHBD_TAIL_SUFFIX); |
277 | 74.8k | lpf += PXSTRIDE(stride); |
278 | 74.8k | wiener_filter_h(rows[1], NULL, lpf, fh, w, edges HIGHBD_TAIL_SUFFIX); |
279 | | |
280 | 74.8k | wiener_filter_h(rows[2], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX); |
281 | 74.8k | left++; |
282 | 74.8k | src += PXSTRIDE(stride); |
283 | | |
284 | 74.8k | if (--h <= 0) |
285 | 726 | goto v1; |
286 | | |
287 | 74.1k | ptrs[4] = ptrs[5] = rows[3]; |
288 | 74.1k | wiener_filter_h(rows[3], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX); |
289 | 74.1k | left++; |
290 | 74.1k | src += PXSTRIDE(stride); |
291 | | |
292 | 74.1k | if (--h <= 0) |
293 | 809 | goto v2; |
294 | | |
295 | 73.3k | ptrs[5] = rows[4]; |
296 | 73.3k | wiener_filter_h(rows[4], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX); |
297 | 73.3k | left++; |
298 | 73.3k | src += PXSTRIDE(stride); |
299 | | |
300 | 73.3k | if (--h <= 0) |
301 | 410 | goto v3; |
302 | 73.3k | } else { |
303 | 44.2k | ptrs[0] = rows[0]; |
304 | 44.2k | ptrs[1] = rows[0]; |
305 | 44.2k | ptrs[2] = rows[0]; |
306 | 44.2k | ptrs[3] = rows[0]; |
307 | 44.2k | ptrs[4] = rows[0]; |
308 | 44.2k | ptrs[5] = rows[0]; |
309 | | |
310 | 44.2k | wiener_filter_h(rows[0], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX); |
311 | 44.2k | left++; |
312 | 44.2k | src += PXSTRIDE(stride); |
313 | | |
314 | 44.2k | if (--h <= 0) |
315 | 3.55k | goto v1; |
316 | | |
317 | 40.7k | ptrs[4] = ptrs[5] = rows[1]; |
318 | 40.7k | wiener_filter_h(rows[1], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX); |
319 | 40.7k | left++; |
320 | 40.7k | src += PXSTRIDE(stride); |
321 | | |
322 | 40.7k | if (--h <= 0) |
323 | 9.48k | goto v2; |
324 | | |
325 | 31.2k | ptrs[5] = rows[2]; |
326 | 31.2k | wiener_filter_h(rows[2], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX); |
327 | 31.2k | left++; |
328 | 31.2k | src += PXSTRIDE(stride); |
329 | | |
330 | 31.2k | if (--h <= 0) |
331 | 2.28k | goto v3; |
332 | | |
333 | 28.9k | ptrs[6] = rows[3]; |
334 | 28.9k | wiener_filter_hv(p, ptrs, left, src, filter, w, edges |
335 | 28.9k | HIGHBD_TAIL_SUFFIX); |
336 | 28.9k | left++; |
337 | 28.9k | src += PXSTRIDE(stride); |
338 | 28.9k | p += PXSTRIDE(stride); |
339 | | |
340 | 28.9k | if (--h <= 0) |
341 | 2.57k | goto v3; |
342 | | |
343 | 26.3k | ptrs[6] = rows[4]; |
344 | 26.3k | wiener_filter_hv(p, ptrs, left, src, filter, w, edges |
345 | 26.3k | HIGHBD_TAIL_SUFFIX); |
346 | 26.3k | left++; |
347 | 26.3k | src += PXSTRIDE(stride); |
348 | 26.3k | p += PXSTRIDE(stride); |
349 | | |
350 | 26.3k | if (--h <= 0) |
351 | 1.33k | goto v3; |
352 | 26.3k | } |
353 | | |
354 | 97.9k | ptrs[6] = ptrs[5] + REST_UNIT_STRIDE; |
355 | 4.08M | do { |
356 | 4.08M | wiener_filter_hv(p, ptrs, left, src, filter, w, edges |
357 | 4.08M | HIGHBD_TAIL_SUFFIX); |
358 | 4.08M | left++; |
359 | 4.08M | src += PXSTRIDE(stride); |
360 | 4.08M | p += PXSTRIDE(stride); |
361 | 4.08M | } while (--h > 0); |
362 | | |
363 | 97.9k | if (!(edges & LR_HAVE_BOTTOM)) |
364 | 21.9k | goto v3; |
365 | | |
366 | 76.0k | wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges |
367 | 76.0k | HIGHBD_TAIL_SUFFIX); |
368 | 76.0k | lpf_bottom += PXSTRIDE(stride); |
369 | 76.0k | p += PXSTRIDE(stride); |
370 | | |
371 | 76.0k | wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges |
372 | 76.0k | HIGHBD_TAIL_SUFFIX); |
373 | 76.0k | p += PXSTRIDE(stride); |
374 | 119k | v1: |
375 | 119k | wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX); |
376 | | |
377 | 119k | return; |
378 | | |
379 | 28.5k | v3: |
380 | 28.5k | wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX); |
381 | 28.5k | p += PXSTRIDE(stride); |
382 | 38.8k | v2: |
383 | 38.8k | wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX); |
384 | 38.8k | p += PXSTRIDE(stride); |
385 | 38.8k | goto v1; |
386 | 28.5k | } |
387 | | |
388 | | // SGR |
389 | | static NOINLINE void rotate(int32_t **sumsq_ptrs, coef **sum_ptrs, int n) |
390 | 9.44M | { |
391 | 9.44M | int32_t *tmp32 = sumsq_ptrs[0]; |
392 | 9.44M | coef *tmpc = sum_ptrs[0]; |
393 | 29.0M | for (int i = 0; i < n - 1; i++) { |
394 | 19.5M | sumsq_ptrs[i] = sumsq_ptrs[i + 1]; |
395 | 19.5M | sum_ptrs[i] = sum_ptrs[i + 1]; |
396 | 19.5M | } |
397 | 9.44M | sumsq_ptrs[n - 1] = tmp32; |
398 | 9.44M | sum_ptrs[n - 1] = tmpc; |
399 | 9.44M | } |
400 | | |
401 | | static NOINLINE void rotate5_x2(int32_t **sumsq_ptrs, coef **sum_ptrs) |
402 | 1.97M | { |
403 | 1.97M | int32_t *tmp32[2]; |
404 | 1.97M | coef *tmpc[2]; |
405 | 5.93M | for (int i = 0; i < 2; i++) { |
406 | 3.95M | tmp32[i] = sumsq_ptrs[i]; |
407 | 3.95M | tmpc[i] = sum_ptrs[i]; |
408 | 3.95M | } |
409 | 7.90M | for (int i = 0; i < 3; i++) { |
410 | 5.92M | sumsq_ptrs[i] = sumsq_ptrs[i + 2]; |
411 | 5.92M | sum_ptrs[i] = sum_ptrs[i + 2]; |
412 | 5.92M | } |
413 | 5.93M | for (int i = 0; i < 2; i++) { |
414 | 3.95M | sumsq_ptrs[3 + i] = tmp32[i]; |
415 | 3.95M | sum_ptrs[3 + i] = tmpc[i]; |
416 | 3.95M | } |
417 | 1.97M | } |
418 | | |
419 | | static NOINLINE void sgr_box3_row_h(int32_t *sumsq, coef *sum, |
420 | | const pixel (*left)[4], |
421 | | const pixel *src, const int w, |
422 | | const enum LrEdgeFlags edges) |
423 | 3.82M | { |
424 | 3.82M | sumsq++; |
425 | 3.82M | sum++; |
426 | 3.82M | int a = edges & LR_HAVE_LEFT ? (left ? left[0][2] : src[-2]) : src[0]; |
427 | 3.82M | int b = edges & LR_HAVE_LEFT ? (left ? left[0][3] : src[-1]) : src[0]; |
428 | 334M | for (int x = -1; x < w + 1; x++) { |
429 | 330M | int c = (x + 1 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 1] : src[w - 1]; |
430 | 330M | sum[x] = a + b + c; |
431 | 330M | sumsq[x] = a * a + b * b + c * c; |
432 | 330M | a = b; |
433 | 330M | b = c; |
434 | 330M | } |
435 | 3.82M | } |
436 | | |
437 | | static NOINLINE void sgr_box5_row_h(int32_t *sumsq, coef *sum, |
438 | | const pixel (*left)[4], |
439 | | const pixel *src, const int w, |
440 | | const enum LrEdgeFlags edges) |
441 | 3.98M | { |
442 | 3.98M | sumsq++; |
443 | 3.98M | sum++; |
444 | 3.98M | int a = edges & LR_HAVE_LEFT ? (left ? left[0][1] : src[-3]) : src[0]; |
445 | 3.98M | int b = edges & LR_HAVE_LEFT ? (left ? left[0][2] : src[-2]) : src[0]; |
446 | 3.98M | int c = edges & LR_HAVE_LEFT ? (left ? left[0][3] : src[-1]) : src[0]; |
447 | 3.98M | int d = src[0]; |
448 | 335M | for (int x = -1; x < w + 1; x++) { |
449 | 331M | int e = (x + 2 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 2] : src[w - 1]; |
450 | 331M | sum[x] = a + b + c + d + e; |
451 | 331M | sumsq[x] = a * a + b * b + c * c + d * d + e * e; |
452 | 331M | a = b; |
453 | 331M | b = c; |
454 | 331M | c = d; |
455 | 331M | d = e; |
456 | 331M | } |
457 | 3.98M | } |
458 | | |
459 | | static void sgr_box35_row_h(int32_t *sumsq3, coef *sum3, |
460 | | int32_t *sumsq5, coef *sum5, |
461 | | const pixel (*left)[4], |
462 | | const pixel *src, const int w, |
463 | | const enum LrEdgeFlags edges) |
464 | 2.78M | { |
465 | 2.78M | sgr_box3_row_h(sumsq3, sum3, left, src, w, edges); |
466 | 2.78M | sgr_box5_row_h(sumsq5, sum5, left, src, w, edges); |
467 | 2.78M | } |
468 | | |
469 | | static NOINLINE void sgr_box3_row_v(int32_t **sumsq, coef **sum, |
470 | | int32_t *sumsq_out, coef *sum_out, |
471 | | const int w) |
472 | 3.77M | { |
473 | 329M | for (int x = 0; x < w + 2; x++) { |
474 | 325M | int sq_a = sumsq[0][x]; |
475 | 325M | int sq_b = sumsq[1][x]; |
476 | 325M | int sq_c = sumsq[2][x]; |
477 | 325M | int s_a = sum[0][x]; |
478 | 325M | int s_b = sum[1][x]; |
479 | 325M | int s_c = sum[2][x]; |
480 | 325M | sumsq_out[x] = sq_a + sq_b + sq_c; |
481 | 325M | sum_out[x] = s_a + s_b + s_c; |
482 | 325M | } |
483 | 3.77M | } |
484 | | |
485 | | static NOINLINE void sgr_box5_row_v(int32_t **sumsq, coef **sum, |
486 | | int32_t *sumsq_out, coef *sum_out, |
487 | | const int w) |
488 | 1.98M | { |
489 | 167M | for (int x = 0; x < w + 2; x++) { |
490 | 165M | int sq_a = sumsq[0][x]; |
491 | 165M | int sq_b = sumsq[1][x]; |
492 | 165M | int sq_c = sumsq[2][x]; |
493 | 165M | int sq_d = sumsq[3][x]; |
494 | 165M | int sq_e = sumsq[4][x]; |
495 | 165M | int s_a = sum[0][x]; |
496 | 165M | int s_b = sum[1][x]; |
497 | 165M | int s_c = sum[2][x]; |
498 | 165M | int s_d = sum[3][x]; |
499 | 165M | int s_e = sum[4][x]; |
500 | 165M | sumsq_out[x] = sq_a + sq_b + sq_c + sq_d + sq_e; |
501 | 165M | sum_out[x] = s_a + s_b + s_c + s_d + s_e; |
502 | 165M | } |
503 | 1.98M | } |
504 | | |
505 | | static NOINLINE void sgr_calc_row_ab(int32_t *AA, coef *BB, int w, int s, |
506 | | int bitdepth_max, int n, int sgr_one_by_x) |
507 | 5.73M | { |
508 | 5.73M | const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; |
509 | 481M | for (int i = 0; i < w + 2; i++) { |
510 | 475M | const int a = |
511 | 475M | (AA[i] + ((1 << (2 * bitdepth_min_8)) >> 1)) >> (2 * bitdepth_min_8); |
512 | 475M | const int b = |
513 | 475M | (BB[i] + ((1 << bitdepth_min_8) >> 1)) >> bitdepth_min_8; |
514 | | |
515 | 475M | const unsigned p = imax(a * n - b * b, 0); |
516 | 475M | const unsigned z = (p * s + (1 << 19)) >> 20; |
517 | 475M | const unsigned x = dav1d_sgr_x_by_x[umin(z, 255)]; |
518 | | |
519 | | // This is where we invert A and B, so that B is of size coef. |
520 | 475M | AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12; |
521 | 475M | BB[i] = x; |
522 | 475M | } |
523 | 5.73M | } |
524 | | |
525 | | static void sgr_box3_vert(int32_t **sumsq, coef **sum, |
526 | | int32_t *sumsq_out, coef *sum_out, |
527 | | const int w, const int s, const int bitdepth_max) |
528 | 3.77M | { |
529 | 3.77M | sgr_box3_row_v(sumsq, sum, sumsq_out, sum_out, w); |
530 | 3.77M | sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, 9, 455); |
531 | 3.77M | rotate(sumsq, sum, 3); |
532 | 3.77M | } |
533 | | |
534 | | static void sgr_box5_vert(int32_t **sumsq, coef **sum, |
535 | | int32_t *sumsq_out, coef *sum_out, |
536 | | const int w, const int s, const int bitdepth_max) |
537 | 1.98M | { |
538 | 1.98M | sgr_box5_row_v(sumsq, sum, sumsq_out, sum_out, w); |
539 | 1.98M | sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, 25, 164); |
540 | 1.98M | rotate5_x2(sumsq, sum); |
541 | 1.98M | } |
542 | | |
543 | | static void sgr_box3_hv(int32_t **sumsq, coef **sum, |
544 | | int32_t *AA, coef *BB, |
545 | | const pixel (*left)[4], |
546 | | const pixel *src, const int w, |
547 | | const int s, |
548 | | const enum LrEdgeFlags edges, |
549 | | const int bitdepth_max) |
550 | 1.00M | { |
551 | 1.00M | sgr_box3_row_h(sumsq[2], sum[2], left, src, w, edges); |
552 | 1.00M | sgr_box3_vert(sumsq, sum, AA, BB, w, s, bitdepth_max); |
553 | 1.00M | } |
554 | | |
555 | | static NOINLINE void sgr_finish_filter_row1(coef *tmp, |
556 | | const pixel *src, |
557 | | int32_t **A_ptrs, coef **B_ptrs, |
558 | | const int w) |
559 | 3.58M | { |
560 | 3.58M | #define EIGHT_NEIGHBORS(P, i)\ |
561 | 599M | ((P[1][i] + P[1][i - 1] + P[1][i + 1] + P[0][i] + P[2][i]) * 4 + \ |
562 | 599M | (P[0][i - 1] + P[2][i - 1] + \ |
563 | 599M | P[0][i + 1] + P[2][i + 1]) * 3) |
564 | 303M | for (int i = 0; i < w; i++) { |
565 | 299M | const int a = EIGHT_NEIGHBORS(B_ptrs, i + 1); |
566 | 299M | const int b = EIGHT_NEIGHBORS(A_ptrs, i + 1); |
567 | 299M | tmp[i] = (b - a * src[i] + (1 << 8)) >> 9; |
568 | 299M | } |
569 | 3.58M | #undef EIGHT_NEIGHBORS |
570 | 3.58M | } |
571 | | |
572 | 8.96M | #define FILTER_OUT_STRIDE (384) |
573 | | |
574 | | static NOINLINE void sgr_finish_filter2(coef *tmp, |
575 | | const pixel *src, |
576 | | const ptrdiff_t src_stride, |
577 | | int32_t **A_ptrs, coef **B_ptrs, |
578 | | const int w, const int h) |
579 | 1.88M | { |
580 | 1.88M | #define SIX_NEIGHBORS(P, i)\ |
581 | 307M | ((P[0][i] + P[1][i]) * 6 + \ |
582 | 307M | (P[0][i - 1] + P[1][i - 1] + \ |
583 | 307M | P[0][i + 1] + P[1][i + 1]) * 5) |
584 | 155M | for (int i = 0; i < w; i++) { |
585 | 153M | const int a = SIX_NEIGHBORS(B_ptrs, i + 1); |
586 | 153M | const int b = SIX_NEIGHBORS(A_ptrs, i + 1); |
587 | 153M | tmp[i] = (b - a * src[i] + (1 << 8)) >> 9; |
588 | 153M | } |
589 | 1.88M | if (h <= 1) |
590 | 20.2k | return; |
591 | 1.86M | tmp += FILTER_OUT_STRIDE; |
592 | 1.86M | src += PXSTRIDE(src_stride); |
593 | 1.86M | const int32_t *A = &A_ptrs[1][1]; |
594 | 1.86M | const coef *B = &B_ptrs[1][1]; |
595 | 154M | for (int i = 0; i < w; i++) { |
596 | 152M | const int a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5; |
597 | 152M | const int b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5; |
598 | 152M | tmp[i] = (b - a * src[i] + (1 << 7)) >> 8; |
599 | 152M | } |
600 | 1.86M | #undef SIX_NEIGHBORS |
601 | 1.86M | } |
602 | | |
603 | | static NOINLINE void sgr_weighted_row1(pixel *dst, const coef *t1, |
604 | | const int w, const int w1 HIGHBD_DECL_SUFFIX) |
605 | 2.10M | { |
606 | 160M | for (int i = 0; i < w; i++) { |
607 | 158M | const int v = w1 * t1[i]; |
608 | 158M | dst[i] = iclip_pixel(dst[i] + ((v + (1 << 10)) >> 11)); |
609 | 158M | } |
610 | 2.10M | } |
611 | | |
612 | | static NOINLINE void sgr_weighted2(pixel *dst, const ptrdiff_t dst_stride, |
613 | | const coef *t1, const coef *t2, |
614 | | const int w, const int h, |
615 | | const int w0, const int w1 HIGHBD_DECL_SUFFIX) |
616 | 1.32M | { |
617 | 3.93M | for (int j = 0; j < h; j++) { |
618 | 225M | for (int i = 0; i < w; i++) { |
619 | 222M | const int v = w0 * t1[i] + w1 * t2[i]; |
620 | 222M | dst[i] = iclip_pixel(dst[i] + ((v + (1 << 10)) >> 11)); |
621 | 222M | } |
622 | 2.61M | dst += PXSTRIDE(dst_stride); |
623 | 2.61M | t1 += FILTER_OUT_STRIDE; |
624 | 2.61M | t2 += FILTER_OUT_STRIDE; |
625 | 2.61M | } |
626 | 1.32M | } |
627 | | |
628 | | static NOINLINE void sgr_finish1(pixel **dst, const ptrdiff_t stride, |
629 | | int32_t **A_ptrs, coef **B_ptrs, const int w, |
630 | | const int w1 HIGHBD_DECL_SUFFIX) |
631 | 980k | { |
632 | | // Only one single row, no stride needed |
633 | 980k | ALIGN_STK_16(coef, tmp, 384,); |
634 | | |
635 | 980k | sgr_finish_filter_row1(tmp, *dst, A_ptrs, B_ptrs, w); |
636 | 980k | sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX); |
637 | 980k | *dst += PXSTRIDE(stride); |
638 | 980k | rotate(A_ptrs, B_ptrs, 3); |
639 | 980k | } |
640 | | |
641 | | static NOINLINE void sgr_finish2(pixel **dst, const ptrdiff_t stride, |
642 | | int32_t **A_ptrs, coef **B_ptrs, |
643 | | const int w, const int h, const int w1 |
644 | | HIGHBD_DECL_SUFFIX) |
645 | 565k | { |
646 | 565k | ALIGN_STK_16(coef, tmp, 2*FILTER_OUT_STRIDE,); |
647 | | |
648 | 565k | sgr_finish_filter2(tmp, *dst, stride, A_ptrs, B_ptrs, w, h); |
649 | 565k | sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX); |
650 | 565k | *dst += PXSTRIDE(stride); |
651 | 565k | if (h > 1) { |
652 | 559k | sgr_weighted_row1(*dst, tmp + FILTER_OUT_STRIDE, w, w1 HIGHBD_TAIL_SUFFIX); |
653 | 559k | *dst += PXSTRIDE(stride); |
654 | 559k | } |
655 | 565k | rotate(A_ptrs, B_ptrs, 2); |
656 | 565k | } |
657 | | |
658 | | static NOINLINE void sgr_finish_mix(pixel **dst, const ptrdiff_t stride, |
659 | | int32_t **A5_ptrs, coef **B5_ptrs, |
660 | | int32_t **A3_ptrs, coef **B3_ptrs, |
661 | | const int w, const int h, |
662 | | const int w0, const int w1 HIGHBD_DECL_SUFFIX) |
663 | 1.31M | { |
664 | 1.31M | ALIGN_STK_16(coef, tmp5, 2*FILTER_OUT_STRIDE,); |
665 | 1.31M | ALIGN_STK_16(coef, tmp3, 2*FILTER_OUT_STRIDE,); |
666 | | |
667 | 1.31M | sgr_finish_filter2(tmp5, *dst, stride, A5_ptrs, B5_ptrs, w, h); |
668 | 1.31M | sgr_finish_filter_row1(tmp3, *dst, A3_ptrs, B3_ptrs, w); |
669 | 1.31M | if (h > 1) |
670 | 1.30M | sgr_finish_filter_row1(tmp3 + FILTER_OUT_STRIDE, *dst + PXSTRIDE(stride), |
671 | 1.30M | &A3_ptrs[1], &B3_ptrs[1], w); |
672 | 1.31M | sgr_weighted2(*dst, stride, tmp5, tmp3, w, h, w0, w1 HIGHBD_TAIL_SUFFIX); |
673 | 1.31M | *dst += h*PXSTRIDE(stride); |
674 | 1.31M | rotate(A5_ptrs, B5_ptrs, 2); |
675 | 1.31M | rotate(A3_ptrs, B3_ptrs, 4); |
676 | 1.31M | } |
677 | | |
678 | | |
679 | | static void sgr_3x3_c(pixel *dst, const ptrdiff_t stride, |
680 | | const pixel (*left)[4], const pixel *lpf, |
681 | | const int w, int h, |
682 | | const LooprestorationParams *const params, |
683 | | const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) |
684 | 24.2k | { |
685 | 2.62M | #define BUF_STRIDE (384 + 16) |
686 | 24.2k | ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 3 + 16,); |
687 | 24.2k | ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 3 + 16,); |
688 | 24.2k | int32_t *sumsq_ptrs[3], *sumsq_rows[3]; |
689 | 24.2k | coef *sum_ptrs[3], *sum_rows[3]; |
690 | 96.9k | for (int i = 0; i < 3; i++) { |
691 | 72.7k | sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE]; |
692 | 72.7k | sum_rows[i] = &sum_buf[i * BUF_STRIDE]; |
693 | 72.7k | } |
694 | | |
695 | 24.2k | ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 3 + 16,); |
696 | 24.2k | ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 3 + 16,); |
697 | 24.2k | int32_t *A_ptrs[3]; |
698 | 24.2k | coef *B_ptrs[3]; |
699 | 96.9k | for (int i = 0; i < 3; i++) { |
700 | 72.7k | A_ptrs[i] = &A_buf[i * BUF_STRIDE]; |
701 | 72.7k | B_ptrs[i] = &B_buf[i * BUF_STRIDE]; |
702 | 72.7k | } |
703 | 24.2k | const pixel *src = dst; |
704 | 24.2k | const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); |
705 | | |
706 | 24.2k | if (edges & LR_HAVE_TOP) { |
707 | 16.8k | sumsq_ptrs[0] = sumsq_rows[0]; |
708 | 16.8k | sumsq_ptrs[1] = sumsq_rows[1]; |
709 | 16.8k | sumsq_ptrs[2] = sumsq_rows[2]; |
710 | 16.8k | sum_ptrs[0] = sum_rows[0]; |
711 | 16.8k | sum_ptrs[1] = sum_rows[1]; |
712 | 16.8k | sum_ptrs[2] = sum_rows[2]; |
713 | | |
714 | 16.8k | sgr_box3_row_h(sumsq_rows[0], sum_rows[0], NULL, lpf, w, edges); |
715 | 16.8k | lpf += PXSTRIDE(stride); |
716 | 16.8k | sgr_box3_row_h(sumsq_rows[1], sum_rows[1], NULL, lpf, w, edges); |
717 | | |
718 | 16.8k | sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
719 | 16.8k | left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); |
720 | 16.8k | left++; |
721 | 16.8k | src += PXSTRIDE(stride); |
722 | 16.8k | rotate(A_ptrs, B_ptrs, 3); |
723 | | |
724 | 16.8k | if (--h <= 0) |
725 | 308 | goto vert_1; |
726 | | |
727 | 16.5k | sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
728 | 16.5k | left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); |
729 | 16.5k | left++; |
730 | 16.5k | src += PXSTRIDE(stride); |
731 | 16.5k | rotate(A_ptrs, B_ptrs, 3); |
732 | | |
733 | 16.5k | if (--h <= 0) |
734 | 268 | goto vert_2; |
735 | 16.5k | } else { |
736 | 7.39k | sumsq_ptrs[0] = sumsq_rows[0]; |
737 | 7.39k | sumsq_ptrs[1] = sumsq_rows[0]; |
738 | 7.39k | sumsq_ptrs[2] = sumsq_rows[0]; |
739 | 7.39k | sum_ptrs[0] = sum_rows[0]; |
740 | 7.39k | sum_ptrs[1] = sum_rows[0]; |
741 | 7.39k | sum_ptrs[2] = sum_rows[0]; |
742 | | |
743 | 7.39k | sgr_box3_row_h(sumsq_rows[0], sum_rows[0], left, src, w, edges); |
744 | 7.39k | left++; |
745 | 7.39k | src += PXSTRIDE(stride); |
746 | | |
747 | 7.39k | sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
748 | 7.39k | w, params->sgr.s1, BITDEPTH_MAX); |
749 | 7.39k | rotate(A_ptrs, B_ptrs, 3); |
750 | | |
751 | 7.39k | if (--h <= 0) |
752 | 1.29k | goto vert_1; |
753 | | |
754 | 6.10k | sumsq_ptrs[2] = sumsq_rows[1]; |
755 | 6.10k | sum_ptrs[2] = sum_rows[1]; |
756 | | |
757 | 6.10k | sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
758 | 6.10k | left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); |
759 | 6.10k | left++; |
760 | 6.10k | src += PXSTRIDE(stride); |
761 | 6.10k | rotate(A_ptrs, B_ptrs, 3); |
762 | | |
763 | 6.10k | if (--h <= 0) |
764 | 1.48k | goto vert_2; |
765 | | |
766 | 4.62k | sumsq_ptrs[2] = sumsq_rows[2]; |
767 | 4.62k | sum_ptrs[2] = sum_rows[2]; |
768 | 4.62k | } |
769 | | |
770 | 935k | do { |
771 | 935k | sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
772 | 935k | left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); |
773 | 935k | left++; |
774 | 935k | src += PXSTRIDE(stride); |
775 | | |
776 | 935k | sgr_finish1(&dst, stride, A_ptrs, B_ptrs, |
777 | 935k | w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); |
778 | 935k | } while (--h > 0); |
779 | | |
780 | 20.8k | if (!(edges & LR_HAVE_BOTTOM)) |
781 | 4.42k | goto vert_2; |
782 | | |
783 | 16.4k | sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
784 | 16.4k | NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX); |
785 | 16.4k | lpf_bottom += PXSTRIDE(stride); |
786 | | |
787 | 16.4k | sgr_finish1(&dst, stride, A_ptrs, B_ptrs, |
788 | 16.4k | w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); |
789 | | |
790 | 16.4k | sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
791 | 16.4k | NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX); |
792 | | |
793 | 16.4k | sgr_finish1(&dst, stride, A_ptrs, B_ptrs, |
794 | 16.4k | w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); |
795 | 16.4k | return; |
796 | | |
797 | 6.17k | vert_2: |
798 | 6.17k | sumsq_ptrs[2] = sumsq_ptrs[1]; |
799 | 6.17k | sum_ptrs[2] = sum_ptrs[1]; |
800 | 6.17k | sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
801 | 6.17k | w, params->sgr.s1, BITDEPTH_MAX); |
802 | | |
803 | 6.17k | sgr_finish1(&dst, stride, A_ptrs, B_ptrs, |
804 | 6.17k | w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); |
805 | | |
806 | 7.77k | output_1: |
807 | 7.77k | sumsq_ptrs[2] = sumsq_ptrs[1]; |
808 | 7.77k | sum_ptrs[2] = sum_ptrs[1]; |
809 | 7.77k | sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
810 | 7.77k | w, params->sgr.s1, BITDEPTH_MAX); |
811 | | |
812 | 7.77k | sgr_finish1(&dst, stride, A_ptrs, B_ptrs, |
813 | 7.77k | w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); |
814 | 7.77k | return; |
815 | | |
816 | 1.60k | vert_1: |
817 | 1.60k | sumsq_ptrs[2] = sumsq_ptrs[1]; |
818 | 1.60k | sum_ptrs[2] = sum_ptrs[1]; |
819 | 1.60k | sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], |
820 | 1.60k | w, params->sgr.s1, BITDEPTH_MAX); |
821 | 1.60k | rotate(A_ptrs, B_ptrs, 3); |
822 | 1.60k | goto output_1; |
823 | 6.17k | } |
824 | | |
// sgr_5x5_c: C implementation installed as c->sgr[0] by the DSP init function
// in this file. Filters `h` rows of width `w` in place: `src` aliases `dst`,
// so input rows are read from the same buffer whose address is handed to
// sgr_finish2() for output.
//
// dst    - first row to filter; &dst is passed to sgr_finish2() (presumably so
//          the helper can advance it -- confirm in the helper).
// stride - row pitch; converted with PXSTRIDE() when stepping src and lpf.
// left   - array of 4-pixel entries, advanced once per picture row
//          (presumably the pixels left of the block -- confirm with caller);
//          NULL is passed instead for rows taken from lpf.
// lpf    - substitute rows for the block borders: two rows starting at lpf
//          when LR_HAVE_TOP is set, two rows starting 6 rows into lpf when
//          LR_HAVE_BOTTOM is set.
// w, h   - block width and height; h is counted down as picture rows are read.
// params - only params->sgr.s0 and params->sgr.w0 are read here.
// edges  - LR_HAVE_TOP / LR_HAVE_BOTTOM are tested here; the whole mask is
//          forwarded to sgr_box5_row_h().
825 | | static void sgr_5x5_c(pixel *dst, const ptrdiff_t stride, |
826 | | const pixel (*left)[4], const pixel *lpf, |
827 | | const int w, int h, |
828 | | const LooprestorationParams *const params, |
829 | | const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) |
830 | 29.5k | { |
// Five row slots per buffer. *_rows[] are the fixed slot addresses; *_ptrs[]
// is the working set of row pointers handed to sgr_box5_vert().
831 | 29.5k | ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 5 + 16,); |
832 | 29.5k | ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 5 + 16,); |
833 | 29.5k | int32_t *sumsq_ptrs[5], *sumsq_rows[5]; |
834 | 29.5k | coef *sum_ptrs[5], *sum_rows[5]; |
835 | 177k | for (int i = 0; i < 5; i++) { |
836 | 147k | sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE]; |
837 | 147k | sum_rows[i] = &sum_buf[i * BUF_STRIDE]; |
838 | 147k | } |
839 | | |
// Two row slots for the A/B outputs of sgr_box5_vert(); the pointer pairs are
// cycled with rotate(A_ptrs, B_ptrs, 2) and consumed by sgr_finish2().
840 | 29.5k | ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 2 + 16,); |
841 | 29.5k | ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 2 + 16,); |
842 | 29.5k | int32_t *A_ptrs[2]; |
843 | 29.5k | coef *B_ptrs[2]; |
844 | 88.5k | for (int i = 0; i < 2; i++) { |
845 | 59.0k | A_ptrs[i] = &A_buf[i * BUF_STRIDE]; |
846 | 59.0k | B_ptrs[i] = &B_buf[i * BUF_STRIDE]; |
847 | 59.0k | } |
// Input is read from the output buffer. lpf_bottom is derived before lpf is
// advanced below, so it is always 6 rows past the lpf value passed in.
848 | 29.5k | const pixel *src = dst; |
849 | 29.5k | const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); |
850 | | |
// Prologue: prime the row pointers before entering the two-rows-per-iteration
// loop. Each `--h <= 0` check leaves early through a tail label when the block
// is too short to reach the loop.
851 | 29.5k | if (edges & LR_HAVE_TOP) { |
// Top rows available: entries [0] and [1] share slot 0; slots 0 and 1 are
// filled from lpf, slot 2 from the first picture row.
852 | 18.2k | sumsq_ptrs[0] = sumsq_rows[0]; |
853 | 18.2k | sumsq_ptrs[1] = sumsq_rows[0]; |
854 | 18.2k | sumsq_ptrs[2] = sumsq_rows[1]; |
855 | 18.2k | sumsq_ptrs[3] = sumsq_rows[2]; |
856 | 18.2k | sumsq_ptrs[4] = sumsq_rows[3]; |
857 | 18.2k | sum_ptrs[0] = sum_rows[0]; |
858 | 18.2k | sum_ptrs[1] = sum_rows[0]; |
859 | 18.2k | sum_ptrs[2] = sum_rows[1]; |
860 | 18.2k | sum_ptrs[3] = sum_rows[2]; |
861 | 18.2k | sum_ptrs[4] = sum_rows[3]; |
862 | | |
863 | 18.2k | sgr_box5_row_h(sumsq_rows[0], sum_rows[0], NULL, lpf, w, edges); |
864 | 18.2k | lpf += PXSTRIDE(stride); |
865 | 18.2k | sgr_box5_row_h(sumsq_rows[1], sum_rows[1], NULL, lpf, w, edges); |
866 | | |
867 | 18.2k | sgr_box5_row_h(sumsq_rows[2], sum_rows[2], left, src, w, edges); |
868 | 18.2k | left++; |
869 | 18.2k | src += PXSTRIDE(stride); |
870 | | |
// Block was a single row: slot 3 was never filled, vert_1 pads it.
871 | 18.2k | if (--h <= 0) |
872 | 249 | goto vert_1; |
873 | | |
874 | 17.9k | sgr_box5_row_h(sumsq_rows[3], sum_rows[3], left, src, w, edges); |
875 | 17.9k | left++; |
876 | 17.9k | src += PXSTRIDE(stride); |
877 | 17.9k | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
878 | 17.9k | w, params->sgr.s0, BITDEPTH_MAX); |
879 | 17.9k | rotate(A_ptrs, B_ptrs, 2); |
880 | | |
// Block was exactly two rows.
881 | 17.9k | if (--h <= 0) |
882 | 669 | goto vert_2; |
883 | | |
884 | | // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set |
885 | | // one of them to point at the previously unused rows[4]. |
886 | 17.3k | sumsq_ptrs[3] = sumsq_rows[4]; |
887 | 17.3k | sum_ptrs[3] = sum_rows[4]; |
888 | 17.3k | } else { |
// No top rows: every entry starts on slot 0, which receives the first picture
// row, so that row stands in for the rows above the block.
889 | 11.3k | sumsq_ptrs[0] = sumsq_rows[0]; |
890 | 11.3k | sumsq_ptrs[1] = sumsq_rows[0]; |
891 | 11.3k | sumsq_ptrs[2] = sumsq_rows[0]; |
892 | 11.3k | sumsq_ptrs[3] = sumsq_rows[0]; |
893 | 11.3k | sumsq_ptrs[4] = sumsq_rows[0]; |
894 | 11.3k | sum_ptrs[0] = sum_rows[0]; |
895 | 11.3k | sum_ptrs[1] = sum_rows[0]; |
896 | 11.3k | sum_ptrs[2] = sum_rows[0]; |
897 | 11.3k | sum_ptrs[3] = sum_rows[0]; |
898 | 11.3k | sum_ptrs[4] = sum_rows[0]; |
899 | | |
900 | 11.3k | sgr_box5_row_h(sumsq_rows[0], sum_rows[0], left, src, w, edges); |
901 | 11.3k | left++; |
902 | 11.3k | src += PXSTRIDE(stride); |
903 | | |
904 | 11.3k | if (--h <= 0) |
905 | 1.84k | goto vert_1; |
906 | | |
907 | 9.46k | sumsq_ptrs[4] = sumsq_rows[1]; |
908 | 9.46k | sum_ptrs[4] = sum_rows[1]; |
909 | | |
910 | 9.46k | sgr_box5_row_h(sumsq_rows[1], sum_rows[1], left, src, w, edges); |
911 | 9.46k | left++; |
912 | 9.46k | src += PXSTRIDE(stride); |
913 | | |
914 | 9.46k | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
915 | 9.46k | w, params->sgr.s0, BITDEPTH_MAX); |
916 | 9.46k | rotate(A_ptrs, B_ptrs, 2); |
917 | | |
918 | 9.46k | if (--h <= 0) |
919 | 1.59k | goto vert_2; |
920 | | |
921 | 7.87k | sumsq_ptrs[3] = sumsq_rows[2]; |
922 | 7.87k | sumsq_ptrs[4] = sumsq_rows[3]; |
923 | 7.87k | sum_ptrs[3] = sum_rows[2]; |
924 | 7.87k | sum_ptrs[4] = sum_rows[3]; |
925 | | |
926 | 7.87k | sgr_box5_row_h(sumsq_rows[2], sum_rows[2], left, src, w, edges); |
927 | 7.87k | left++; |
928 | 7.87k | src += PXSTRIDE(stride); |
929 | | |
// Three picture rows in total: the pending pair has only one real row.
930 | 7.87k | if (--h <= 0) |
931 | 937 | goto odd; |
932 | | |
933 | 6.93k | sgr_box5_row_h(sumsq_rows[3], sum_rows[3], left, src, w, edges); |
934 | 6.93k | left++; |
935 | 6.93k | src += PXSTRIDE(stride); |
936 | | |
// First point in this branch where sgr_finish2() is called (row count 2).
937 | 6.93k | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
938 | 6.93k | w, params->sgr.s0, BITDEPTH_MAX); |
939 | 6.93k | sgr_finish2(&dst, stride, A_ptrs, B_ptrs, |
940 | 6.93k | w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); |
941 | | |
942 | 6.93k | if (--h <= 0) |
943 | 707 | goto vert_2; |
944 | | |
945 | | // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set |
946 | | // one of them to point at the previously unused rows[4]. |
947 | 6.22k | sumsq_ptrs[3] = sumsq_rows[4]; |
948 | 6.22k | sum_ptrs[3] = sum_rows[4]; |
949 | 6.22k | } |
950 | | |
// Steady state: read two picture rows into entries [3] and [4], run one
// vertical pass and call sgr_finish2() with a row count of 2. Running out of
// rows after the first read of a pair exits through `odd`.
951 | 527k | do { |
952 | 527k | sgr_box5_row_h(sumsq_ptrs[3], sum_ptrs[3], left, src, w, edges); |
953 | 527k | left++; |
954 | 527k | src += PXSTRIDE(stride); |
955 | | |
956 | 527k | if (--h <= 0) |
957 | 2.74k | goto odd; |
958 | | |
959 | 525k | sgr_box5_row_h(sumsq_ptrs[4], sum_ptrs[4], left, src, w, edges); |
960 | 525k | left++; |
961 | 525k | src += PXSTRIDE(stride); |
962 | | |
963 | 525k | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
964 | 525k | w, params->sgr.s0, BITDEPTH_MAX); |
965 | 525k | sgr_finish2(&dst, stride, A_ptrs, B_ptrs, |
966 | 525k | w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); |
967 | 525k | } while (--h > 0); |
968 | | |
// All picture rows consumed on a pair boundary. Without bottom rows, reuse
// the last row (vert_2); otherwise read the two rows at lpf_bottom.
969 | 20.7k | if (!(edges & LR_HAVE_BOTTOM)) |
970 | 2.02k | goto vert_2; |
971 | | |
972 | 18.7k | sgr_box5_row_h(sumsq_ptrs[3], sum_ptrs[3], NULL, lpf_bottom, w, edges); |
973 | 18.7k | lpf_bottom += PXSTRIDE(stride); |
974 | 18.7k | sgr_box5_row_h(sumsq_ptrs[4], sum_ptrs[4], NULL, lpf_bottom, w, edges); |
975 | | |
// Shared tail: final vertical pass and a last sgr_finish2() call with a row
// count of 2. Reached by fallthrough above and from vert_2.
976 | 23.7k | output_2: |
977 | 23.7k | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
978 | 23.7k | w, params->sgr.s0, BITDEPTH_MAX); |
979 | 23.7k | sgr_finish2(&dst, stride, A_ptrs, B_ptrs, |
980 | 23.7k | w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); |
981 | 23.7k | return; |
982 | | |
// Entered when no further input rows exist for the final pair.
983 | 4.99k | vert_2: |
984 | | // Duplicate the last row twice more |
985 | 4.99k | sumsq_ptrs[3] = sumsq_ptrs[2]; |
986 | 4.99k | sumsq_ptrs[4] = sumsq_ptrs[2]; |
987 | 4.99k | sum_ptrs[3] = sum_ptrs[2]; |
988 | 4.99k | sum_ptrs[4] = sum_ptrs[2]; |
989 | 4.99k | goto output_2; |
990 | | |
// Entered when the row count ran out after the first row of a pair. Falls
// through into output_1 after the sgr_finish2() call below.
991 | 3.68k | odd: |
992 | | // Copy the last row as padding once |
993 | 3.68k | sumsq_ptrs[4] = sumsq_ptrs[3]; |
994 | 3.68k | sum_ptrs[4] = sum_ptrs[3]; |
995 | | |
996 | 3.68k | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
997 | 3.68k | w, params->sgr.s0, BITDEPTH_MAX); |
998 | 3.68k | sgr_finish2(&dst, stride, A_ptrs, B_ptrs, |
999 | 3.68k | w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); |
1000 | | |
// Shared tail for the single remaining row; reached from odd (fallthrough)
// and from vert_1.
1001 | 5.77k | output_1: |
1002 | | // Duplicate the last row twice more |
1003 | 5.77k | sumsq_ptrs[3] = sumsq_ptrs[2]; |
1004 | 5.77k | sumsq_ptrs[4] = sumsq_ptrs[2]; |
1005 | 5.77k | sum_ptrs[3] = sum_ptrs[2]; |
1006 | 5.77k | sum_ptrs[4] = sum_ptrs[2]; |
1007 | | |
1008 | 5.77k | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
1009 | 5.77k | w, params->sgr.s0, BITDEPTH_MAX); |
1010 | | // Output only one row |
1011 | 5.77k | sgr_finish2(&dst, stride, A_ptrs, B_ptrs, |
1012 | 5.77k | w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX); |
1013 | 5.77k | return; |
1014 | | |
// Entered from the prologue when the whole block is one row high; no
// sgr_finish2() call has happened yet, so only the A/B rows are prepared here.
1015 | 2.09k | vert_1: |
1016 | | // Copy the last row as padding once |
1017 | 2.09k | sumsq_ptrs[4] = sumsq_ptrs[3]; |
1018 | 2.09k | sum_ptrs[4] = sum_ptrs[3]; |
1019 | | |
1020 | 2.09k | sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], |
1021 | 2.09k | w, params->sgr.s0, BITDEPTH_MAX); |
1022 | 2.09k | rotate(A_ptrs, B_ptrs, 2); |
1023 | | |
1024 | 2.09k | goto output_1; |
1025 | 3.68k | } |
1026 | | |
// sgr_mix_c: C implementation installed as c->sgr[2] by the DSP init function
// in this file. Runs a 5-row-slot pipeline (sumsq5/sum5 -> A5/B5, strength
// params->sgr.s0) and a 3-row-slot pipeline (sumsq3/sum3 -> A3/B3, strength
// params->sgr.s1) over the same input rows and hands both to sgr_finish_mix()
// together with params->sgr.w0 and params->sgr.w1. Filtering is in place:
// `src` aliases `dst`.
//
// dst    - first row to filter; &dst is passed to sgr_finish_mix() (presumably
//          so the helper can advance it -- confirm in the helper).
// stride - row pitch; converted with PXSTRIDE() when stepping src and lpf.
// left   - array of 4-pixel entries, advanced once per picture row
//          (presumably the pixels left of the block -- confirm with caller);
//          NULL is passed instead for rows taken from lpf.
// lpf    - substitute rows for the block borders: two rows starting at lpf
//          when LR_HAVE_TOP is set, two rows starting 6 rows into lpf when
//          LR_HAVE_BOTTOM is set.
// w, h   - block width and height; h is counted down as picture rows are read.
// params - params->sgr.s0, .s1, .w0 and .w1 are read here.
// edges  - LR_HAVE_TOP / LR_HAVE_BOTTOM are tested here; the whole mask is
//          forwarded to sgr_box35_row_h().
1027 | | static void sgr_mix_c(pixel *dst, const ptrdiff_t stride, |
1028 | | const pixel (*left)[4], const pixel *lpf, |
1029 | | const int w, int h, |
1030 | | const LooprestorationParams *const params, |
1031 | | const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) |
1032 | 68.6k | { |
// Row storage: five slots for the sumsq5/sum5 buffers, three for sumsq3/sum3.
// *_rows[] are the fixed slot addresses; *_ptrs[] are the working row
// pointers handed to the vertical helpers.
1033 | 68.6k | ALIGN_STK_16(int32_t, sumsq5_buf, BUF_STRIDE * 5 + 16,); |
1034 | 68.6k | ALIGN_STK_16(coef, sum5_buf, BUF_STRIDE * 5 + 16,); |
1035 | 68.6k | int32_t *sumsq5_ptrs[5], *sumsq5_rows[5]; |
1036 | 68.6k | coef *sum5_ptrs[5], *sum5_rows[5]; |
1037 | 411k | for (int i = 0; i < 5; i++) { |
1038 | 343k | sumsq5_rows[i] = &sumsq5_buf[i * BUF_STRIDE]; |
1039 | 343k | sum5_rows[i] = &sum5_buf[i * BUF_STRIDE]; |
1040 | 343k | } |
1041 | 68.6k | ALIGN_STK_16(int32_t, sumsq3_buf, BUF_STRIDE * 3 + 16,); |
1042 | 68.6k | ALIGN_STK_16(coef, sum3_buf, BUF_STRIDE * 3 + 16,); |
1043 | 68.6k | int32_t *sumsq3_ptrs[3], *sumsq3_rows[3]; |
1044 | 68.6k | coef *sum3_ptrs[3], *sum3_rows[3]; |
1045 | 274k | for (int i = 0; i < 3; i++) { |
1046 | 205k | sumsq3_rows[i] = &sumsq3_buf[i * BUF_STRIDE]; |
1047 | 205k | sum3_rows[i] = &sum3_buf[i * BUF_STRIDE]; |
1048 | 205k | } |
1049 | | |
// Output rows of the vertical helpers: two slots for A5/B5 (cycled with
// rotate(..., 2)) and four slots for A3/B3 (cycled with rotate(..., 4)).
1050 | 68.6k | ALIGN_STK_16(int32_t, A5_buf, BUF_STRIDE * 2 + 16,); |
1051 | 68.6k | ALIGN_STK_16(coef, B5_buf, BUF_STRIDE * 2 + 16,); |
1052 | 68.6k | int32_t *A5_ptrs[2]; |
1053 | 68.6k | coef *B5_ptrs[2]; |
1054 | 205k | for (int i = 0; i < 2; i++) { |
1055 | 137k | A5_ptrs[i] = &A5_buf[i * BUF_STRIDE]; |
1056 | 137k | B5_ptrs[i] = &B5_buf[i * BUF_STRIDE]; |
1057 | 137k | } |
1058 | 68.6k | ALIGN_STK_16(int32_t, A3_buf, BUF_STRIDE * 4 + 16,); |
1059 | 68.6k | ALIGN_STK_16(coef, B3_buf, BUF_STRIDE * 4 + 16,); |
1060 | 68.6k | int32_t *A3_ptrs[4]; |
1061 | 68.6k | coef *B3_ptrs[4]; |
1062 | 343k | for (int i = 0; i < 4; i++) { |
1063 | 274k | A3_ptrs[i] = &A3_buf[i * BUF_STRIDE]; |
1064 | 274k | B3_ptrs[i] = &B3_buf[i * BUF_STRIDE]; |
1065 | 274k | } |
// Input is read from the output buffer. lpf_bottom is derived before lpf is
// advanced below, so it is always 6 rows past the lpf value passed in.
1066 | 68.6k | const pixel *src = dst; |
1067 | 68.6k | const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); |
1068 | | |
// Prologue: prime both pipelines before the two-rows-per-iteration loop. Each
// `--h <= 0` check leaves early through a tail label for short blocks.
1069 | 68.6k | if (edges & LR_HAVE_TOP) { |
// Top rows available: slots 0 and 1 of both pipelines are filled from lpf,
// slot 2 from the first picture row. sumsq5/sum5 entries [0] and [1] share
// slot 0; the sumsq3/sum3 entries map one-to-one onto their slots.
1070 | 44.0k | sumsq5_ptrs[0] = sumsq5_rows[0]; |
1071 | 44.0k | sumsq5_ptrs[1] = sumsq5_rows[0]; |
1072 | 44.0k | sumsq5_ptrs[2] = sumsq5_rows[1]; |
1073 | 44.0k | sumsq5_ptrs[3] = sumsq5_rows[2]; |
1074 | 44.0k | sumsq5_ptrs[4] = sumsq5_rows[3]; |
1075 | 44.0k | sum5_ptrs[0] = sum5_rows[0]; |
1076 | 44.0k | sum5_ptrs[1] = sum5_rows[0]; |
1077 | 44.0k | sum5_ptrs[2] = sum5_rows[1]; |
1078 | 44.0k | sum5_ptrs[3] = sum5_rows[2]; |
1079 | 44.0k | sum5_ptrs[4] = sum5_rows[3]; |
1080 | | |
1081 | 44.0k | sumsq3_ptrs[0] = sumsq3_rows[0]; |
1082 | 44.0k | sumsq3_ptrs[1] = sumsq3_rows[1]; |
1083 | 44.0k | sumsq3_ptrs[2] = sumsq3_rows[2]; |
1084 | 44.0k | sum3_ptrs[0] = sum3_rows[0]; |
1085 | 44.0k | sum3_ptrs[1] = sum3_rows[1]; |
1086 | 44.0k | sum3_ptrs[2] = sum3_rows[2]; |
1087 | | |
1088 | 44.0k | sgr_box35_row_h(sumsq3_rows[0], sum3_rows[0], |
1089 | 44.0k | sumsq5_rows[0], sum5_rows[0], |
1090 | 44.0k | NULL, lpf, w, edges); |
1091 | 44.0k | lpf += PXSTRIDE(stride); |
1092 | 44.0k | sgr_box35_row_h(sumsq3_rows[1], sum3_rows[1], |
1093 | 44.0k | sumsq5_rows[1], sum5_rows[1], |
1094 | 44.0k | NULL, lpf, w, edges); |
1095 | | |
1096 | 44.0k | sgr_box35_row_h(sumsq3_rows[2], sum3_rows[2], |
1097 | 44.0k | sumsq5_rows[2], sum5_rows[2], |
1098 | 44.0k | left, src, w, edges); |
1099 | 44.0k | left++; |
1100 | 44.0k | src += PXSTRIDE(stride); |
1101 | | |
1102 | 44.0k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1103 | 44.0k | w, params->sgr.s1, BITDEPTH_MAX); |
1104 | 44.0k | rotate(A3_ptrs, B3_ptrs, 4); |
1105 | | |
// Block was a single row.
1106 | 44.0k | if (--h <= 0) |
1107 | 555 | goto vert_1; |
1108 | | |
// From here on the 3-row side is addressed through sumsq3_ptrs[2] rather than
// a fixed slot (presumably because sgr_box3_vert() rotates the pointer array
// -- confirm in the helper).
1109 | 43.4k | sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], |
1110 | 43.4k | sumsq5_rows[3], sum5_rows[3], |
1111 | 43.4k | left, src, w, edges); |
1112 | 43.4k | left++; |
1113 | 43.4k | src += PXSTRIDE(stride); |
1114 | 43.4k | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1115 | 43.4k | w, params->sgr.s0, BITDEPTH_MAX); |
1116 | 43.4k | rotate(A5_ptrs, B5_ptrs, 2); |
1117 | 43.4k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1118 | 43.4k | w, params->sgr.s1, BITDEPTH_MAX); |
1119 | 43.4k | rotate(A3_ptrs, B3_ptrs, 4); |
1120 | | |
// Block was exactly two rows.
1121 | 43.4k | if (--h <= 0) |
1122 | 496 | goto vert_2; |
1123 | | |
1124 | | // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set |
1125 | | // one of them to point at the previously unused rows[4]. |
1126 | 42.9k | sumsq5_ptrs[3] = sumsq5_rows[4]; |
1127 | 42.9k | sum5_ptrs[3] = sum5_rows[4]; |
1128 | 42.9k | } else { |
// No top rows: every entry of both pipelines starts on slot 0, which receives
// the first picture row, so that row stands in for the rows above the block.
1129 | 24.6k | sumsq5_ptrs[0] = sumsq5_rows[0]; |
1130 | 24.6k | sumsq5_ptrs[1] = sumsq5_rows[0]; |
1131 | 24.6k | sumsq5_ptrs[2] = sumsq5_rows[0]; |
1132 | 24.6k | sumsq5_ptrs[3] = sumsq5_rows[0]; |
1133 | 24.6k | sumsq5_ptrs[4] = sumsq5_rows[0]; |
1134 | 24.6k | sum5_ptrs[0] = sum5_rows[0]; |
1135 | 24.6k | sum5_ptrs[1] = sum5_rows[0]; |
1136 | 24.6k | sum5_ptrs[2] = sum5_rows[0]; |
1137 | 24.6k | sum5_ptrs[3] = sum5_rows[0]; |
1138 | 24.6k | sum5_ptrs[4] = sum5_rows[0]; |
1139 | | |
1140 | 24.6k | sumsq3_ptrs[0] = sumsq3_rows[0]; |
1141 | 24.6k | sumsq3_ptrs[1] = sumsq3_rows[0]; |
1142 | 24.6k | sumsq3_ptrs[2] = sumsq3_rows[0]; |
1143 | 24.6k | sum3_ptrs[0] = sum3_rows[0]; |
1144 | 24.6k | sum3_ptrs[1] = sum3_rows[0]; |
1145 | 24.6k | sum3_ptrs[2] = sum3_rows[0]; |
1146 | | |
1147 | 24.6k | sgr_box35_row_h(sumsq3_rows[0], sum3_rows[0], |
1148 | 24.6k | sumsq5_rows[0], sum5_rows[0], |
1149 | 24.6k | left, src, w, edges); |
1150 | 24.6k | left++; |
1151 | 24.6k | src += PXSTRIDE(stride); |
1152 | | |
1153 | 24.6k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1154 | 24.6k | w, params->sgr.s1, BITDEPTH_MAX); |
1155 | 24.6k | rotate(A3_ptrs, B3_ptrs, 4); |
1156 | | |
1157 | 24.6k | if (--h <= 0) |
1158 | 3.26k | goto vert_1; |
1159 | | |
1160 | 21.3k | sumsq5_ptrs[4] = sumsq5_rows[1]; |
1161 | 21.3k | sum5_ptrs[4] = sum5_rows[1]; |
1162 | | |
1163 | 21.3k | sumsq3_ptrs[2] = sumsq3_rows[1]; |
1164 | 21.3k | sum3_ptrs[2] = sum3_rows[1]; |
1165 | | |
1166 | 21.3k | sgr_box35_row_h(sumsq3_rows[1], sum3_rows[1], |
1167 | 21.3k | sumsq5_rows[1], sum5_rows[1], |
1168 | 21.3k | left, src, w, edges); |
1169 | 21.3k | left++; |
1170 | 21.3k | src += PXSTRIDE(stride); |
1171 | | |
1172 | 21.3k | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1173 | 21.3k | w, params->sgr.s0, BITDEPTH_MAX); |
1174 | 21.3k | rotate(A5_ptrs, B5_ptrs, 2); |
1175 | 21.3k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1176 | 21.3k | w, params->sgr.s1, BITDEPTH_MAX); |
1177 | 21.3k | rotate(A3_ptrs, B3_ptrs, 4); |
1178 | | |
1179 | 21.3k | if (--h <= 0) |
1180 | 5.22k | goto vert_2; |
1181 | | |
1182 | 16.1k | sumsq5_ptrs[3] = sumsq5_rows[2]; |
1183 | 16.1k | sumsq5_ptrs[4] = sumsq5_rows[3]; |
1184 | 16.1k | sum5_ptrs[3] = sum5_rows[2]; |
1185 | 16.1k | sum5_ptrs[4] = sum5_rows[3]; |
1186 | | |
1187 | 16.1k | sumsq3_ptrs[2] = sumsq3_rows[2]; |
1188 | 16.1k | sum3_ptrs[2] = sum3_rows[2]; |
1189 | | |
1190 | 16.1k | sgr_box35_row_h(sumsq3_rows[2], sum3_rows[2], |
1191 | 16.1k | sumsq5_rows[2], sum5_rows[2], |
1192 | 16.1k | left, src, w, edges); |
1193 | 16.1k | left++; |
1194 | 16.1k | src += PXSTRIDE(stride); |
1195 | | |
1196 | 16.1k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1197 | 16.1k | w, params->sgr.s1, BITDEPTH_MAX); |
1198 | 16.1k | rotate(A3_ptrs, B3_ptrs, 4); |
1199 | | |
// Three picture rows in total: the pending pair has only one real row.
1200 | 16.1k | if (--h <= 0) |
1201 | 1.96k | goto odd; |
1202 | | |
1203 | 14.1k | sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], |
1204 | 14.1k | sumsq5_rows[3], sum5_rows[3], |
1205 | 14.1k | left, src, w, edges); |
1206 | 14.1k | left++; |
1207 | 14.1k | src += PXSTRIDE(stride); |
1208 | | |
// First point in this branch where sgr_finish_mix() is called (row count 2).
1209 | 14.1k | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1210 | 14.1k | w, params->sgr.s0, BITDEPTH_MAX); |
1211 | 14.1k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1212 | 14.1k | w, params->sgr.s1, BITDEPTH_MAX); |
1213 | 14.1k | sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, |
1214 | 14.1k | w, 2, params->sgr.w0, params->sgr.w1 |
1215 | 14.1k | HIGHBD_TAIL_SUFFIX); |
1216 | | |
1217 | 14.1k | if (--h <= 0) |
1218 | 1.42k | goto vert_2; |
1219 | | |
1220 | | // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set |
1221 | | // one of them to point at the previously unused rows[4]. |
1222 | 12.7k | sumsq5_ptrs[3] = sumsq5_rows[4]; |
1223 | 12.7k | sum5_ptrs[3] = sum5_rows[4]; |
1224 | 12.7k | } |
1225 | | |
// Steady state, two picture rows per iteration. sgr_box3_vert() runs after
// every row; sgr_box5_vert() and sgr_finish_mix() (row count 2) run only
// after the second row of each pair. Running out of rows after the first row
// of a pair exits through `odd`.
1226 | 1.23M | do { |
1227 | 1.23M | sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], |
1228 | 1.23M | sumsq5_ptrs[3], sum5_ptrs[3], |
1229 | 1.23M | left, src, w, edges); |
1230 | 1.23M | left++; |
1231 | 1.23M | src += PXSTRIDE(stride); |
1232 | | |
1233 | 1.23M | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1234 | 1.23M | w, params->sgr.s1, BITDEPTH_MAX); |
1235 | 1.23M | rotate(A3_ptrs, B3_ptrs, 4); |
1236 | | |
1237 | 1.23M | if (--h <= 0) |
1238 | 8.72k | goto odd; |
1239 | | |
1240 | 1.22M | sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], |
1241 | 1.22M | sumsq5_ptrs[4], sum5_ptrs[4], |
1242 | 1.22M | left, src, w, edges); |
1243 | 1.22M | left++; |
1244 | 1.22M | src += PXSTRIDE(stride); |
1245 | | |
1246 | 1.22M | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1247 | 1.22M | w, params->sgr.s0, BITDEPTH_MAX); |
1248 | 1.22M | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1249 | 1.22M | w, params->sgr.s1, BITDEPTH_MAX); |
1250 | 1.22M | sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, |
1251 | 1.22M | w, 2, params->sgr.w0, params->sgr.w1 |
1252 | 1.22M | HIGHBD_TAIL_SUFFIX); |
1253 | 1.22M | } while (--h > 0); |
1254 | | |
// All picture rows consumed on a pair boundary. Without bottom rows, reuse
// the last row (vert_2); otherwise read the two rows at lpf_bottom, running
// the 3-row vertical pass after the first of them.
1255 | 46.9k | if (!(edges & LR_HAVE_BOTTOM)) |
1256 | 2.96k | goto vert_2; |
1257 | | |
1258 | 44.0k | sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], |
1259 | 44.0k | sumsq5_ptrs[3], sum5_ptrs[3], |
1260 | 44.0k | NULL, lpf_bottom, w, edges); |
1261 | 44.0k | lpf_bottom += PXSTRIDE(stride); |
1262 | 44.0k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1263 | 44.0k | w, params->sgr.s1, BITDEPTH_MAX); |
1264 | 44.0k | rotate(A3_ptrs, B3_ptrs, 4); |
1265 | | |
1266 | 44.0k | sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], |
1267 | 44.0k | sumsq5_ptrs[4], sum5_ptrs[4], |
1268 | 44.0k | NULL, lpf_bottom, w, edges); |
1269 | | |
// Shared tail: final vertical passes for both pipelines and a last
// sgr_finish_mix() call with a row count of 2. Reached by fallthrough above
// and from vert_2.
1270 | 54.1k | output_2: |
1271 | 54.1k | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1272 | 54.1k | w, params->sgr.s0, BITDEPTH_MAX); |
1273 | 54.1k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1274 | 54.1k | w, params->sgr.s1, BITDEPTH_MAX); |
1275 | 54.1k | sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, |
1276 | 54.1k | w, 2, params->sgr.w0, params->sgr.w1 |
1277 | 54.1k | HIGHBD_TAIL_SUFFIX); |
1278 | 54.1k | return; |
1279 | | |
// Entered when no further input rows exist for the final pair. The 3-row side
// needs one extra vertical pass here (with entry [2] aliased to [1]) because
// output_2 performs only one.
1280 | 10.1k | vert_2: |
1281 | | // Duplicate the last row twice more |
1282 | 10.1k | sumsq5_ptrs[3] = sumsq5_ptrs[2]; |
1283 | 10.1k | sumsq5_ptrs[4] = sumsq5_ptrs[2]; |
1284 | 10.1k | sum5_ptrs[3] = sum5_ptrs[2]; |
1285 | 10.1k | sum5_ptrs[4] = sum5_ptrs[2]; |
1286 | | |
1287 | 10.1k | sumsq3_ptrs[2] = sumsq3_ptrs[1]; |
1288 | 10.1k | sum3_ptrs[2] = sum3_ptrs[1]; |
1289 | 10.1k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1290 | 10.1k | w, params->sgr.s1, BITDEPTH_MAX); |
1291 | 10.1k | rotate(A3_ptrs, B3_ptrs, 4); |
1292 | | |
1293 | 10.1k | sumsq3_ptrs[2] = sumsq3_ptrs[1]; |
1294 | 10.1k | sum3_ptrs[2] = sum3_ptrs[1]; |
1295 | | |
1296 | 10.1k | goto output_2; |
1297 | | |
// Entered when the row count ran out after the first row of a pair. Falls
// through into output_1 after the sgr_finish_mix() call below.
1298 | 10.6k | odd: |
1299 | | // Copy the last row as padding once |
1300 | 10.6k | sumsq5_ptrs[4] = sumsq5_ptrs[3]; |
1301 | 10.6k | sum5_ptrs[4] = sum5_ptrs[3]; |
1302 | | |
1303 | 10.6k | sumsq3_ptrs[2] = sumsq3_ptrs[1]; |
1304 | 10.6k | sum3_ptrs[2] = sum3_ptrs[1]; |
1305 | | |
1306 | 10.6k | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1307 | 10.6k | w, params->sgr.s0, BITDEPTH_MAX); |
1308 | 10.6k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1309 | 10.6k | w, params->sgr.s1, BITDEPTH_MAX); |
1310 | 10.6k | sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, |
1311 | 10.6k | w, 2, params->sgr.w0, params->sgr.w1 |
1312 | 10.6k | HIGHBD_TAIL_SUFFIX); |
1313 | | |
// Shared tail for the single remaining row; reached from odd (fallthrough)
// and from vert_1. Unlike output_2, A3/B3 are rotated before the final
// sgr_finish_mix() call.
1314 | 14.5k | output_1: |
1315 | | // Duplicate the last row twice more |
1316 | 14.5k | sumsq5_ptrs[3] = sumsq5_ptrs[2]; |
1317 | 14.5k | sumsq5_ptrs[4] = sumsq5_ptrs[2]; |
1318 | 14.5k | sum5_ptrs[3] = sum5_ptrs[2]; |
1319 | 14.5k | sum5_ptrs[4] = sum5_ptrs[2]; |
1320 | | |
1321 | 14.5k | sumsq3_ptrs[2] = sumsq3_ptrs[1]; |
1322 | 14.5k | sum3_ptrs[2] = sum3_ptrs[1]; |
1323 | | |
1324 | 14.5k | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1325 | 14.5k | w, params->sgr.s0, BITDEPTH_MAX); |
1326 | 14.5k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1327 | 14.5k | w, params->sgr.s1, BITDEPTH_MAX); |
1328 | 14.5k | rotate(A3_ptrs, B3_ptrs, 4); |
1329 | | // Output only one row |
1330 | 14.5k | sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, |
1331 | 14.5k | w, 1, params->sgr.w0, params->sgr.w1 |
1332 | 14.5k | HIGHBD_TAIL_SUFFIX); |
1333 | 14.5k | return; |
1334 | | |
// Entered from the prologue when the whole block is one row high; no
// sgr_finish_mix() call has happened yet, so only the A/B rows of both
// pipelines are prepared here.
1335 | 3.82k | vert_1: |
1336 | | // Copy the last row as padding once |
1337 | 3.82k | sumsq5_ptrs[4] = sumsq5_ptrs[3]; |
1338 | 3.82k | sum5_ptrs[4] = sum5_ptrs[3]; |
1339 | | |
1340 | 3.82k | sumsq3_ptrs[2] = sumsq3_ptrs[1]; |
1341 | 3.82k | sum3_ptrs[2] = sum3_ptrs[1]; |
1342 | | |
1343 | 3.82k | sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], |
1344 | 3.82k | w, params->sgr.s0, BITDEPTH_MAX); |
1345 | 3.82k | rotate(A5_ptrs, B5_ptrs, 2); |
1346 | 3.82k | sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], |
1347 | 3.82k | w, params->sgr.s1, BITDEPTH_MAX); |
1348 | 3.82k | rotate(A3_ptrs, B3_ptrs, 4); |
1349 | | |
1350 | 3.82k | goto output_1; |
1351 | 10.6k | } |
1352 | | |
1353 | | #if HAVE_ASM |
1354 | | #if ARCH_AARCH64 || ARCH_ARM |
1355 | | #include "src/arm/looprestoration.h" |
1356 | | #elif ARCH_LOONGARCH64 |
1357 | | #include "src/loongarch/looprestoration.h" |
1358 | | #elif ARCH_PPC64LE |
1359 | | #include "src/ppc/looprestoration.h" |
1360 | | #elif ARCH_X86 |
1361 | | #include "src/x86/looprestoration.h" |
1362 | | #endif |
1363 | | #endif |
1364 | | |
// Fills the loop-restoration DSP function table `c` with the C implementations
// from this file, then, when HAVE_ASM is set, calls the init routine of the
// architecture selected at build time with the same `c` and `bpc` (presumably
// so it can replace entries with optimized versions -- confirm in the
// per-arch headers).
//
// c   - table to populate; both wiener[] entries and sgr[0..2] are written.
// bpc - not read here; only forwarded to the architecture-specific init.
//
// The name is wrapped in bitfn(); the coverage listing at the end of this
// function shows it instantiated as ..._8bpc and ..._16bpc.
1365 | | COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c, |
1366 | | const int bpc) |
1367 | 68.8k | { |
// Both wiener slots share one C routine; the sgr slots are ordered
// 5x5, 3x3, mix.
1368 | 68.8k | c->wiener[0] = c->wiener[1] = wiener_c; |
1369 | 68.8k | c->sgr[0] = sgr_5x5_c; |
1370 | 68.8k | c->sgr[1] = sgr_3x3_c; |
1371 | 68.8k | c->sgr[2] = sgr_mix_c; |
1372 | | |
// At most one of the following calls is compiled in.
1373 | | #if HAVE_ASM |
1374 | | #if ARCH_AARCH64 || ARCH_ARM |
1375 | | loop_restoration_dsp_init_arm(c, bpc); |
1376 | | #elif ARCH_LOONGARCH64 |
1377 | | loop_restoration_dsp_init_loongarch(c, bpc); |
1378 | | #elif ARCH_PPC64LE |
1379 | | loop_restoration_dsp_init_ppc(c, bpc); |
1380 | | #elif ARCH_X86 |
1381 | | loop_restoration_dsp_init_x86(c, bpc); |
1382 | | #endif |
1383 | | #endif |
1384 | 68.8k | } dav1d_loop_restoration_dsp_init_8bpc Line | Count | Source | 1367 | 30.8k | { | 1368 | 30.8k | c->wiener[0] = c->wiener[1] = wiener_c; | 1369 | 30.8k | c->sgr[0] = sgr_5x5_c; | 1370 | 30.8k | c->sgr[1] = sgr_3x3_c; | 1371 | 30.8k | c->sgr[2] = sgr_mix_c; | 1372 | | | 1373 | | #if HAVE_ASM | 1374 | | #if ARCH_AARCH64 || ARCH_ARM | 1375 | | loop_restoration_dsp_init_arm(c, bpc); | 1376 | | #elif ARCH_LOONGARCH64 | 1377 | | loop_restoration_dsp_init_loongarch(c, bpc); | 1378 | | #elif ARCH_PPC64LE | 1379 | | loop_restoration_dsp_init_ppc(c, bpc); | 1380 | | #elif ARCH_X86 | 1381 | | loop_restoration_dsp_init_x86(c, bpc); | 1382 | | #endif | 1383 | | #endif | 1384 | 30.8k | } |
dav1d_loop_restoration_dsp_init_16bpc Line | Count | Source | 1367 | 38.0k | { | 1368 | 38.0k | c->wiener[0] = c->wiener[1] = wiener_c; | 1369 | 38.0k | c->sgr[0] = sgr_5x5_c; | 1370 | 38.0k | c->sgr[1] = sgr_3x3_c; | 1371 | 38.0k | c->sgr[2] = sgr_mix_c; | 1372 | | | 1373 | | #if HAVE_ASM | 1374 | | #if ARCH_AARCH64 || ARCH_ARM | 1375 | | loop_restoration_dsp_init_arm(c, bpc); | 1376 | | #elif ARCH_LOONGARCH64 | 1377 | | loop_restoration_dsp_init_loongarch(c, bpc); | 1378 | | #elif ARCH_PPC64LE | 1379 | | loop_restoration_dsp_init_ppc(c, bpc); | 1380 | | #elif ARCH_X86 | 1381 | | loop_restoration_dsp_init_x86(c, bpc); | 1382 | | #endif | 1383 | | #endif | 1384 | 38.0k | } |
|