Coverage Report

Created: 2026-05-30 06:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/dav1d/src/looprestoration_tmpl.c
Line
Count
Source
1
/*
2
 * Copyright © 2018, VideoLAN and dav1d authors
3
 * Copyright © 2018, Two Orioles, LLC
4
 * All rights reserved.
5
 *
6
 * Redistribution and use in source and binary forms, with or without
7
 * modification, are permitted provided that the following conditions are met:
8
 *
9
 * 1. Redistributions of source code must retain the above copyright notice, this
10
 *    list of conditions and the following disclaimer.
11
 *
12
 * 2. Redistributions in binary form must reproduce the above copyright notice,
13
 *    this list of conditions and the following disclaimer in the documentation
14
 *    and/or other materials provided with the distribution.
15
 *
16
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
 */
27
28
#include "config.h"
29
30
#include <stdint.h>
31
#include <stdlib.h>
32
#include <string.h>
33
34
#include "common/attributes.h"
35
#include "common/bitdepth.h"
36
#include "common/intops.h"
37
38
#include "src/looprestoration.h"
39
#include "src/tables.h"
40
41
// 256 * 1.5 + 3 + 3 = 390
42
4.17M
// Row stride, in uint16_t elements, of the intermediate row buffers used by
// the Wiener filter functions in this file.
#define REST_UNIT_STRIDE (390)
43
44
static void wiener_filter_h(uint16_t *dst, const pixel (*left)[4],
45
                            const pixel *src, const int16_t fh[8],
46
                            const int w, const enum LrEdgeFlags edges
47
                            HIGHBD_DECL_SUFFIX)
48
3.91M
{
49
3.91M
    const int bitdepth = bitdepth_from_max(bitdepth_max);
50
3.91M
    const int round_bits_h = 3 + (bitdepth == 12) * 2;
51
3.91M
    const int rounding_off_h = 1 << (round_bits_h - 1);
52
3.91M
    const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h);
53
54
3.91M
    if (w < 6) {
55
        // For small widths, do the fully conditional loop with
56
        // conditions on each access.
57
2.93M
        for (int x = 0; x < w; x++) {
58
1.95M
            int sum = (1 << (bitdepth + 6));
59
1.95M
#if BITDEPTH == 8
60
1.95M
            sum += src[x] * 128;
61
1.95M
#endif
62
15.5M
            for (int i = 0; i < 7; i++) {
63
13.5M
                int idx = x + i - 3;
64
13.5M
                if (idx < 0) {
65
4.02M
                    if (!(edges & LR_HAVE_LEFT))
66
4.02M
                        sum += src[0] * fh[i];
67
18.4E
                    else if (left)
68
0
                        sum += left[0][4 + idx] * fh[i];
69
18.4E
                    else
70
18.4E
                        sum += src[idx] * fh[i];
71
9.54M
                } else if (idx >= w && !(edges & LR_HAVE_RIGHT)) {
72
3.97M
                    sum += src[w - 1] * fh[i];
73
3.97M
                } else
74
5.56M
                    sum += src[idx] * fh[i];
75
13.5M
            }
76
1.95M
            sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
77
1.95M
            dst[x] = sum;
78
1.95M
        }
79
80
973k
        return;
81
973k
    }
82
83
    // For larger widths, do separate loops with less conditions; first
84
    // handle the start of the row.
85
2.94M
    int start = 3;
86
2.94M
    if (!(edges & LR_HAVE_LEFT)) {
87
        // If there's no left edge, pad using the leftmost pixel.
88
4.12M
        for (int x = 0; x < 3; x++) {
89
3.09M
            int sum = (1 << (bitdepth + 6));
90
3.09M
#if BITDEPTH == 8
91
3.09M
            sum += src[x] * 128;
92
3.09M
#endif
93
24.6M
            for (int i = 0; i < 7; i++) {
94
21.6M
                int idx = x + i - 3;
95
21.6M
                if (idx < 0)
96
6.17M
                    sum += src[0] * fh[i];
97
15.4M
                else
98
15.4M
                    sum += src[idx] * fh[i];
99
21.6M
            }
100
3.09M
            sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
101
3.09M
            dst[x] = sum;
102
3.09M
        }
103
1.90M
    } else if (left) {
104
        // If we have the left edge and a separate left buffer, pad using that.
105
7.17M
        for (int x = 0; x < 3; x++) {
106
5.37M
            int sum = (1 << (bitdepth + 6));
107
5.37M
#if BITDEPTH == 8
108
5.37M
            sum += src[x] * 128;
109
5.37M
#endif
110
42.9M
            for (int i = 0; i < 7; i++) {
111
37.5M
                int idx = x + i - 3;
112
37.5M
                if (idx < 0)
113
10.7M
                    sum += left[0][4 + idx] * fh[i];
114
26.8M
                else
115
26.8M
                    sum += src[idx] * fh[i];
116
37.5M
            }
117
5.37M
            sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
118
5.37M
            dst[x] = sum;
119
5.37M
        }
120
1.79M
    } else {
121
        // If we have the left edge, but no separate left buffer, we're in the
122
        // top/bottom area (lpf) with the left edge existing in the same
123
        // buffer; just do the regular loop from the start.
124
112k
        start = 0;
125
112k
    }
126
2.94M
    int end = w - 3;
127
2.94M
    if (edges & LR_HAVE_RIGHT)
128
1.88M
        end = w;
129
130
    // Do a condititon free loop for the bulk of the row.
131
269M
    for (int x = start; x < end; x++) {
132
266M
        int sum = (1 << (bitdepth + 6));
133
266M
#if BITDEPTH == 8
134
266M
        sum += src[x] * 128;
135
266M
#endif
136
2.11G
        for (int i = 0; i < 7; i++) {
137
1.85G
            int idx = x + i - 3;
138
1.85G
            sum += src[idx] * fh[i];
139
1.85G
        }
140
266M
        sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
141
266M
        dst[x] = sum;
142
266M
    }
143
144
    // If we need to, calculate the end of the row with a condition for
145
    // right edge padding.
146
6.10M
    for (int x = end; x < w; x++) {
147
3.16M
        int sum = (1 << (bitdepth + 6));
148
3.16M
#if BITDEPTH == 8
149
3.16M
        sum += src[x] * 128;
150
3.16M
#endif
151
25.2M
        for (int i = 0; i < 7; i++) {
152
22.1M
            int idx = x + i - 3;
153
22.1M
            if (idx >= w)
154
6.31M
                sum += src[w - 1] * fh[i];
155
15.7M
            else
156
15.7M
                sum += src[idx] * fh[i];
157
22.1M
        }
158
3.16M
        sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
159
3.16M
        dst[x] = sum;
160
3.16M
    }
161
2.94M
}
162
163
static void wiener_filter_v(pixel *p, uint16_t **ptrs, const int16_t fv[8],
164
                            const int w HIGHBD_DECL_SUFFIX)
165
145k
{
166
145k
    const int bitdepth = bitdepth_from_max(bitdepth_max);
167
168
145k
    const int round_bits_v = 11 - (bitdepth == 12) * 2;
169
145k
    const int rounding_off_v = 1 << (round_bits_v - 1);
170
145k
    const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
171
172
10.0M
    for (int i = 0; i < w; i++) {
173
9.87M
        int sum = -round_offset;
174
175
        // Only filter using 6 input rows. The 7th row is assumed to be
176
        // identical to the last one.
177
        //
178
        // This function is assumed to only be called at the end, when doing
179
        // padding at the bottom.
180
69.0M
        for (int k = 0; k < 6; k++)
181
59.2M
            sum += ptrs[k][i] * fv[k];
182
9.87M
        sum += ptrs[5][i] * fv[6];
183
184
9.87M
        p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v);
185
9.87M
    }
186
187
    // Shift the pointers, but only update the first 5; the 6th pointer is kept
188
    // as it was before (and the 7th is implicitly identical to the 6th).
189
870k
    for (int i = 0; i < 5; i++)
190
725k
        ptrs[i] = ptrs[i + 1];
191
145k
}
192
193
static void wiener_filter_hv(pixel *p, uint16_t **ptrs, const pixel (*left)[4],
194
                             const pixel *src, const int16_t filter[2][8],
195
                             const int w, const enum LrEdgeFlags edges
196
                             HIGHBD_DECL_SUFFIX)
197
3.53M
{
198
3.53M
    const int bitdepth = bitdepth_from_max(bitdepth_max);
199
200
3.53M
    const int round_bits_v = 11 - (bitdepth == 12) * 2;
201
3.53M
    const int rounding_off_v = 1 << (round_bits_v - 1);
202
3.53M
    const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
203
204
3.53M
    const int16_t *fh = filter[0];
205
3.53M
    const int16_t *fv = filter[1];
206
207
    // Do combined horziontal and vertical filtering; doing horizontal
208
    // filtering of one row, combined with vertical filtering of 6
209
    // preexisting rows and the newly filtered row.
210
211
    // For simplicity in the C implementation, just do a separate call
212
    // of the horizontal filter, into a temporary buffer.
213
3.53M
    uint16_t tmp[REST_UNIT_STRIDE];
214
3.53M
    wiener_filter_h(tmp, left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
215
216
258M
    for (int i = 0; i < w; i++) {
217
254M
        int sum = -round_offset;
218
219
        // Filter using the 6 stored preexisting rows, and the newly
220
        // filtered one in tmp[].
221
1.77G
        for (int k = 0; k < 6; k++)
222
1.52G
            sum += ptrs[k][i] * fv[k];
223
254M
        sum += tmp[i] * fv[6];
224
        // At this point, after having read all inputs at point [i], we
225
        // could overwrite [i] with the newly filtered data.
226
227
254M
        p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v);
228
254M
    }
229
230
    // For simplicity in the C implementation, just memcpy the newly
231
    // filtered row into ptrs[6]. Normally, in steady state filtering,
232
    // this output row, ptrs[6], is equal to ptrs[0]. However at startup,
233
    // at the top of the filtered area, we may have ptrs[0] equal to ptrs[1],
234
    // so we can't assume we can write into ptrs[0] but we need to keep
235
    // a separate pointer for the next row to write into.
236
3.53M
    memcpy(ptrs[6], tmp, sizeof(uint16_t) * REST_UNIT_STRIDE);
237
238
    // Rotate the window of pointers. Shift the 6 pointers downwards one step.
239
24.7M
    for (int i = 0; i < 6; i++)
240
21.1M
        ptrs[i] = ptrs[i + 1];
241
    // The topmost pointer, ptrs[6], which isn't used as input, is set to
242
    // ptrs[0], which will be used as output for the next _hv call.
243
    // At the start of the filtering, the caller may set ptrs[6] to the
244
    // right next buffer to fill in, instead.
245
3.53M
    ptrs[6] = ptrs[0];
246
3.53M
}
247
248
// FIXME Could split into luma and chroma specific functions,
249
// (since first and last tops are always 0 for chroma)
250
static void wiener_c(pixel *p, const ptrdiff_t stride,
251
                     const pixel (*left)[4],
252
                     const pixel *lpf, const int w, int h,
253
                     const LooprestorationParams *const params,
254
                     const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
255
92.6k
{
256
    // Values stored between horizontal and vertical filtering don't
257
    // fit in a uint8_t.
258
92.6k
    uint16_t hor[6 * REST_UNIT_STRIDE];
259
92.6k
    uint16_t *ptrs[7], *rows[6];
260
648k
    for (int i = 0; i < 6; i++)
261
555k
        rows[i] = &hor[i * REST_UNIT_STRIDE];
262
92.6k
    const int16_t (*const filter)[8] = params->filter;
263
92.6k
    const int16_t *fh = params->filter[0];
264
92.6k
    const int16_t *fv = params->filter[1];
265
92.6k
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
266
267
92.6k
    const pixel *src = p;
268
92.6k
    if (edges & LR_HAVE_TOP) {
269
58.3k
        ptrs[0] = rows[0];
270
58.3k
        ptrs[1] = rows[0];
271
58.3k
        ptrs[2] = rows[1];
272
58.3k
        ptrs[3] = rows[2];
273
58.3k
        ptrs[4] = rows[2];
274
58.3k
        ptrs[5] = rows[2];
275
276
58.3k
        wiener_filter_h(rows[0], NULL, lpf, fh, w, edges HIGHBD_TAIL_SUFFIX);
277
58.3k
        lpf += PXSTRIDE(stride);
278
58.3k
        wiener_filter_h(rows[1], NULL, lpf, fh, w, edges HIGHBD_TAIL_SUFFIX);
279
280
58.3k
        wiener_filter_h(rows[2], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
281
58.3k
        left++;
282
58.3k
        src += PXSTRIDE(stride);
283
284
58.3k
        if (--h <= 0)
285
353
            goto v1;
286
287
58.0k
        ptrs[4] = ptrs[5] = rows[3];
288
58.0k
        wiener_filter_h(rows[3], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
289
58.0k
        left++;
290
58.0k
        src += PXSTRIDE(stride);
291
292
58.0k
        if (--h <= 0)
293
379
            goto v2;
294
295
57.6k
        ptrs[5] = rows[4];
296
57.6k
        wiener_filter_h(rows[4], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
297
57.6k
        left++;
298
57.6k
        src += PXSTRIDE(stride);
299
300
57.6k
        if (--h <= 0)
301
187
            goto v3;
302
57.6k
    } else {
303
34.2k
        ptrs[0] = rows[0];
304
34.2k
        ptrs[1] = rows[0];
305
34.2k
        ptrs[2] = rows[0];
306
34.2k
        ptrs[3] = rows[0];
307
34.2k
        ptrs[4] = rows[0];
308
34.2k
        ptrs[5] = rows[0];
309
310
34.2k
        wiener_filter_h(rows[0], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
311
34.2k
        left++;
312
34.2k
        src += PXSTRIDE(stride);
313
314
34.2k
        if (--h <= 0)
315
3.26k
            goto v1;
316
317
31.0k
        ptrs[4] = ptrs[5] = rows[1];
318
31.0k
        wiener_filter_h(rows[1], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
319
31.0k
        left++;
320
31.0k
        src += PXSTRIDE(stride);
321
322
31.0k
        if (--h <= 0)
323
6.48k
            goto v2;
324
325
24.5k
        ptrs[5] = rows[2];
326
24.5k
        wiener_filter_h(rows[2], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
327
24.5k
        left++;
328
24.5k
        src += PXSTRIDE(stride);
329
330
24.5k
        if (--h <= 0)
331
2.43k
            goto v3;
332
333
22.0k
        ptrs[6] = rows[3];
334
22.0k
        wiener_filter_hv(p, ptrs, left, src, filter, w, edges
335
22.0k
                         HIGHBD_TAIL_SUFFIX);
336
22.0k
        left++;
337
22.0k
        src += PXSTRIDE(stride);
338
22.0k
        p += PXSTRIDE(stride);
339
340
22.0k
        if (--h <= 0)
341
1.67k
            goto v3;
342
343
20.4k
        ptrs[6] = rows[4];
344
20.4k
        wiener_filter_hv(p, ptrs, left, src, filter, w, edges
345
20.4k
                         HIGHBD_TAIL_SUFFIX);
346
20.4k
        left++;
347
20.4k
        src += PXSTRIDE(stride);
348
20.4k
        p += PXSTRIDE(stride);
349
350
20.4k
        if (--h <= 0)
351
1.49k
            goto v3;
352
20.4k
    }
353
354
76.3k
    ptrs[6] = ptrs[5] + REST_UNIT_STRIDE;
355
3.37M
    do {
356
3.37M
        wiener_filter_hv(p, ptrs, left, src, filter, w, edges
357
3.37M
                         HIGHBD_TAIL_SUFFIX);
358
3.37M
        left++;
359
3.37M
        src += PXSTRIDE(stride);
360
3.37M
        p += PXSTRIDE(stride);
361
3.37M
    } while (--h > 0);
362
363
76.3k
    if (!(edges & LR_HAVE_BOTTOM))
364
16.9k
        goto v3;
365
366
59.3k
    wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges
367
59.3k
                     HIGHBD_TAIL_SUFFIX);
368
59.3k
    lpf_bottom += PXSTRIDE(stride);
369
59.3k
    p += PXSTRIDE(stride);
370
371
59.3k
    wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges
372
59.3k
                     HIGHBD_TAIL_SUFFIX);
373
59.3k
    p += PXSTRIDE(stride);
374
92.6k
v1:
375
92.6k
    wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
376
377
92.6k
    return;
378
379
22.7k
v3:
380
22.7k
    wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
381
22.7k
    p += PXSTRIDE(stride);
382
29.6k
v2:
383
29.6k
    wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
384
29.6k
    p += PXSTRIDE(stride);
385
29.6k
    goto v1;
386
22.7k
}
387
388
// SGR
389
/*
 * Rotate the first n entries of both pointer arrays one step towards
 * index 0; the former first entry of each array becomes its last entry.
 */
static NOINLINE void rotate(int32_t **sumsq_ptrs, coef **sum_ptrs, int n)
{
    int32_t *const first_sumsq = sumsq_ptrs[0];
    coef *const first_sum = sum_ptrs[0];

    for (int i = 1; i < n; i++) {
        sumsq_ptrs[i - 1] = sumsq_ptrs[i];
        sum_ptrs[i - 1] = sum_ptrs[i];
    }

    sumsq_ptrs[n - 1] = first_sumsq;
    sum_ptrs[n - 1] = first_sum;
}
400
401
/*
 * Rotate both 5-entry pointer arrays by two steps: entries 2..4 move to
 * 0..2, and the former entries 0 and 1 become entries 3 and 4.
 */
static NOINLINE void rotate5_x2(int32_t **sumsq_ptrs, coef **sum_ptrs)
{
    int32_t *const sumsq0 = sumsq_ptrs[0];
    int32_t *const sumsq1 = sumsq_ptrs[1];
    coef *const sum0 = sum_ptrs[0];
    coef *const sum1 = sum_ptrs[1];

    sumsq_ptrs[0] = sumsq_ptrs[2];
    sum_ptrs[0] = sum_ptrs[2];
    sumsq_ptrs[1] = sumsq_ptrs[3];
    sum_ptrs[1] = sum_ptrs[3];
    sumsq_ptrs[2] = sumsq_ptrs[4];
    sum_ptrs[2] = sum_ptrs[4];

    sumsq_ptrs[3] = sumsq0;
    sum_ptrs[3] = sum0;
    sumsq_ptrs[4] = sumsq1;
    sum_ptrs[4] = sum1;
}
418
419
static NOINLINE void sgr_box3_row_h(int32_t *sumsq, coef *sum,
420
                                    const pixel (*left)[4],
421
                                    const pixel *src, const int w,
422
                                    const enum LrEdgeFlags edges)
423
3.20M
{
424
3.20M
    sumsq++;
425
3.20M
    sum++;
426
3.20M
    int a = edges & LR_HAVE_LEFT ? (left ? left[0][2] : src[-2]) : src[0];
427
3.20M
    int b = edges & LR_HAVE_LEFT ? (left ? left[0][3] : src[-1]) : src[0];
428
251M
    for (int x = -1; x < w + 1; x++) {
429
248M
        int c = (x + 1 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 1] : src[w - 1];
430
248M
        sum[x] = a + b + c;
431
248M
        sumsq[x] = a * a + b * b + c * c;
432
248M
        a = b;
433
248M
        b = c;
434
248M
    }
435
3.20M
}
436
437
static NOINLINE void sgr_box5_row_h(int32_t *sumsq, coef *sum,
438
                                    const pixel (*left)[4],
439
                                    const pixel *src, const int w,
440
                                    const enum LrEdgeFlags edges)
441
3.41M
{
442
3.41M
    sumsq++;
443
3.41M
    sum++;
444
3.41M
    int a = edges & LR_HAVE_LEFT ? (left ? left[0][1] : src[-3]) : src[0];
445
3.41M
    int b = edges & LR_HAVE_LEFT ? (left ? left[0][2] : src[-2]) : src[0];
446
3.41M
    int c = edges & LR_HAVE_LEFT ? (left ? left[0][3] : src[-1]) : src[0];
447
3.41M
    int d = src[0];
448
263M
    for (int x = -1; x < w + 1; x++) {
449
260M
        int e = (x + 2 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 2] : src[w - 1];
450
260M
        sum[x] = a + b + c + d + e;
451
260M
        sumsq[x] = a * a + b * b + c * c + d * d + e * e;
452
260M
        a = b;
453
260M
        b = c;
454
260M
        c = d;
455
260M
        d = e;
456
260M
    }
457
3.41M
}
458
459
static void sgr_box35_row_h(int32_t *sumsq3, coef *sum3,
460
                            int32_t *sumsq5, coef *sum5,
461
                            const pixel (*left)[4],
462
                            const pixel *src, const int w,
463
                            const enum LrEdgeFlags edges)
464
2.39M
{
465
2.39M
    sgr_box3_row_h(sumsq3, sum3, left, src, w, edges);
466
2.39M
    sgr_box5_row_h(sumsq5, sum5, left, src, w, edges);
467
2.39M
}
468
469
/*
 * Vertical step of the 3x3 box sum: adds three rows of horizontal sums
 * (and of sums of squares) element by element, over w + 2 columns.
 */
static NOINLINE void sgr_box3_row_v(int32_t **sumsq, coef **sum,
                                    int32_t *sumsq_out, coef *sum_out,
                                    const int w)
{
    const int32_t *const sq0 = sumsq[0];
    const int32_t *const sq1 = sumsq[1];
    const int32_t *const sq2 = sumsq[2];
    const coef *const s0 = sum[0];
    const coef *const s1 = sum[1];
    const coef *const s2 = sum[2];

    for (int col = 0; col < w + 2; col++) {
        // Read all inputs for this column before writing the outputs.
        const int sq_total = sq0[col] + sq1[col] + sq2[col];
        const int s_total = s0[col] + s1[col] + s2[col];
        sumsq_out[col] = sq_total;
        sum_out[col] = s_total;
    }
}
484
485
/*
 * Vertical step of the 5x5 box sum: adds five rows of horizontal sums
 * (and of sums of squares) element by element, over w + 2 columns.
 */
static NOINLINE void sgr_box5_row_v(int32_t **sumsq, coef **sum,
                                    int32_t *sumsq_out, coef *sum_out,
                                    const int w)
{
    for (int col = 0; col < w + 2; col++) {
        int sq_total = 0;
        int s_total = 0;

        // Read all five input rows for this column before writing.
        for (int row = 0; row < 5; row++)
            sq_total += sumsq[row][col];
        for (int row = 0; row < 5; row++)
            s_total += sum[row][col];

        sumsq_out[col] = sq_total;
        sum_out[col] = s_total;
    }
}
504
505
/*
 * Convert one row of box sums into per-column filter coefficients, in
 * place, over w + 2 columns.
 *
 * On entry AA[] holds sums of squares and BB[] holds sums, both over n
 * pixels. Both are first scaled down (with rounding) according to the
 * bit depth; the result is mapped through the dav1d_sgr_x_by_x table using
 * strength s. On exit BB[] holds the table value and AA[] holds that value
 * multiplied by the original sum and sgr_one_by_x, rounded and shifted
 * right by 12.
 */
static NOINLINE void sgr_calc_row_ab(int32_t *AA, coef *BB, int w, int s,
                                     int bitdepth_max, int n, int sgr_one_by_x)
{
    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    const int shift_sum = bitdepth_min_8;
    const int shift_sq = 2 * bitdepth_min_8;
    const int round_sum = (1 << shift_sum) >> 1;
    const int round_sq = (1 << shift_sq) >> 1;

    for (int col = 0; col < w + 2; col++) {
        const int a = (AA[col] + round_sq) >> shift_sq;
        const int b = (BB[col] + round_sum) >> shift_sum;

        // Clamped at zero, then handled as unsigned from here on.
        const unsigned p = imax(a * n - b * b, 0);
        const unsigned z = (p * s + (1 << 19)) >> 20;
        const unsigned x = dav1d_sgr_x_by_x[umin(z, 255)];

        // This is where we invert A and B, so that B is of size coef.
        // BB[col] must be read before it is overwritten below.
        const unsigned scaled = x * BB[col] * sgr_one_by_x + (1 << 11);
        AA[col] = scaled >> 12;
        BB[col] = x;
    }
}
524
525
/*
 * Finish one row of the 3x3 box filter: sum the three stored rows
 * vertically into sumsq_out/sum_out, convert those sums into coefficients
 * in place (n = 9, one_by_x = 455), and rotate the 3-row window by one.
 */
static void sgr_box3_vert(int32_t **sumsq, coef **sum,
                          int32_t *sumsq_out, coef *sum_out,
                          const int w, const int s, const int bitdepth_max)
{
    // Vertical accumulation of the three rows currently in the window.
    sgr_box3_row_v(sumsq, sum, sumsq_out, sum_out, w);

    // 3x3 box: 9 pixels per sum.
    sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, 9, 455);

    // The oldest row becomes the next one to be overwritten.
    rotate(sumsq, sum, 3);
}
533
534
/*
 * Finish one row of the 5x5 box filter: sum the five stored rows
 * vertically into sumsq_out/sum_out, convert those sums into coefficients
 * in place (n = 25, one_by_x = 164), and rotate the 5-row window by two.
 */
static void sgr_box5_vert(int32_t **sumsq, coef **sum,
                          int32_t *sumsq_out, coef *sum_out,
                          const int w, const int s, const int bitdepth_max)
{
    // Vertical accumulation of the five rows currently in the window.
    sgr_box5_row_v(sumsq, sum, sumsq_out, sum_out, w);

    // 5x5 box: 25 pixels per sum.
    sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, 25, 164);

    // Advance the window by two rows at once.
    rotate5_x2(sumsq, sum);
}
542
543
static void sgr_box3_hv(int32_t **sumsq, coef **sum,
544
                        int32_t *AA, coef *BB,
545
                        const pixel (*left)[4],
546
                        const pixel *src, const int w,
547
                        const int s,
548
                        const enum LrEdgeFlags edges,
549
                        const int bitdepth_max)
550
783k
{
551
783k
    sgr_box3_row_h(sumsq[2], sum[2], left, src, w, edges);
552
783k
    sgr_box3_vert(sumsq, sum, AA, BB, w, s, bitdepth_max);
553
783k
}
554
555
static NOINLINE void sgr_finish_filter_row1(coef *tmp,
556
                                            const pixel *src,
557
                                            int32_t **A_ptrs, coef **B_ptrs,
558
                                            const int w)
559
3.01M
{
560
3.01M
#define EIGHT_NEIGHBORS(P, i)\
561
450M
    ((P[1][i] + P[1][i - 1] + P[1][i + 1] + P[0][i] + P[2][i]) * 4 + \
562
450M
     (P[0][i - 1] + P[2][i - 1] +                           \
563
450M
      P[0][i + 1] + P[2][i + 1]) * 3)
564
228M
    for (int i = 0; i < w; i++) {
565
225M
        const int a = EIGHT_NEIGHBORS(B_ptrs, i + 1);
566
225M
        const int b = EIGHT_NEIGHBORS(A_ptrs, i + 1);
567
225M
        tmp[i] = (b - a * src[i] + (1 << 8)) >> 9;
568
225M
    }
569
3.01M
#undef EIGHT_NEIGHBORS
570
3.01M
}
571
572
7.70M
// Row stride, in coef elements, of the temporary buffers that hold rows of
// SGR filter output before they are weighted into the destination.
#define FILTER_OUT_STRIDE (384)
573
574
static NOINLINE void sgr_finish_filter2(coef *tmp,
575
                                        const pixel *src,
576
                                        const ptrdiff_t src_stride,
577
                                        int32_t **A_ptrs, coef **B_ptrs,
578
                                        const int w, const int h)
579
1.61M
{
580
1.61M
#define SIX_NEIGHBORS(P, i)\
581
241M
    ((P[0][i]     + P[1][i]) * 6 +   \
582
241M
     (P[0][i - 1] + P[1][i - 1] +    \
583
241M
      P[0][i + 1] + P[1][i + 1]) * 5)
584
122M
    for (int i = 0; i < w; i++) {
585
120M
        const int a = SIX_NEIGHBORS(B_ptrs, i + 1);
586
120M
        const int b = SIX_NEIGHBORS(A_ptrs, i + 1);
587
120M
        tmp[i] = (b - a * src[i] + (1 << 8)) >> 9;
588
120M
    }
589
1.61M
    if (h <= 1)
590
18.9k
        return;
591
1.59M
    tmp += FILTER_OUT_STRIDE;
592
1.59M
    src += PXSTRIDE(src_stride);
593
1.59M
    const int32_t *A = &A_ptrs[1][1];
594
1.59M
    const coef *B = &B_ptrs[1][1];
595
121M
    for (int i = 0; i < w; i++) {
596
119M
        const int a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
597
119M
        const int b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
598
119M
        tmp[i] = (b - a * src[i] + (1 << 7)) >> 8;
599
119M
    }
600
1.59M
#undef SIX_NEIGHBORS
601
1.59M
}
602
603
/*
 * Add one row of filter output, scaled by weight w1, onto the destination
 * row: dst[i] = clip(dst[i] + ((w1 * t1[i] + 1024) >> 11)).
 */
static NOINLINE void sgr_weighted_row1(pixel *dst, const coef *t1,
                                       const int w, const int w1 HIGHBD_DECL_SUFFIX)
{
    for (int x = 0; x < w; x++) {
        const int weighted = w1 * t1[x];
        const int delta = (weighted + (1 << 10)) >> 11;
        dst[x] = iclip_pixel(dst[x] + delta);
    }
}
611
612
/*
 * Add two filter outputs, scaled by weights w0 and w1, onto h destination
 * rows: dst[i] = clip(dst[i] + ((w0 * t1[i] + w1 * t2[i] + 1024) >> 11)).
 * Consecutive rows of t1 and t2 are FILTER_OUT_STRIDE elements apart.
 */
static NOINLINE void sgr_weighted2(pixel *dst, const ptrdiff_t dst_stride,
                                   const coef *t1, const coef *t2,
                                   const int w, const int h,
                                   const int w0, const int w1 HIGHBD_DECL_SUFFIX)
{
    for (int y = 0; y < h; y++,
         dst += PXSTRIDE(dst_stride),
         t1 += FILTER_OUT_STRIDE,
         t2 += FILTER_OUT_STRIDE)
    {
        for (int x = 0; x < w; x++) {
            const int weighted = w0 * t1[x] + w1 * t2[x];
            const int delta = (weighted + (1 << 10)) >> 11;
            dst[x] = iclip_pixel(dst[x] + delta);
        }
    }
}
627
628
static NOINLINE void sgr_finish1(pixel **dst, const ptrdiff_t stride,
629
                                 int32_t **A_ptrs, coef **B_ptrs, const int w,
630
                                 const int w1 HIGHBD_DECL_SUFFIX)
631
763k
{
632
    // Only one single row, no stride needed
633
763k
    ALIGN_STK_16(coef, tmp, 384,);
634
635
763k
    sgr_finish_filter_row1(tmp, *dst, A_ptrs, B_ptrs, w);
636
763k
    sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX);
637
763k
    *dst += PXSTRIDE(stride);
638
763k
    rotate(A_ptrs, B_ptrs, 3);
639
763k
}
640
641
static NOINLINE void sgr_finish2(pixel **dst, const ptrdiff_t stride,
642
                                 int32_t **A_ptrs, coef **B_ptrs,
643
                                 const int w, const int h, const int w1
644
                                 HIGHBD_DECL_SUFFIX)
645
480k
{
646
480k
    ALIGN_STK_16(coef, tmp, 2*FILTER_OUT_STRIDE,);
647
648
480k
    sgr_finish_filter2(tmp, *dst, stride, A_ptrs, B_ptrs, w, h);
649
480k
    sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX);
650
480k
    *dst += PXSTRIDE(stride);
651
480k
    if (h > 1) {
652
475k
        sgr_weighted_row1(*dst, tmp + FILTER_OUT_STRIDE, w, w1 HIGHBD_TAIL_SUFFIX);
653
475k
        *dst += PXSTRIDE(stride);
654
475k
    }
655
480k
    rotate(A_ptrs, B_ptrs, 2);
656
480k
}
657
658
/*
 * Filter and write back h destination rows using both coefficient sets:
 * the A5/B5 set through sgr_finish_filter2() and the A3/B3 set through
 * sgr_finish_filter_row1() (once per row, at most two rows). The two
 * results are blended onto the destination with weights w0 and w1, *dst is
 * advanced by h rows, and the A5/B5 and A3/B3 windows are rotated as
 * 2-entry and 4-entry windows respectively.
 */
static NOINLINE void sgr_finish_mix(pixel **dst, const ptrdiff_t stride,
                                    int32_t **A5_ptrs, coef **B5_ptrs,
                                    int32_t **A3_ptrs, coef **B3_ptrs,
                                    const int w, const int h,
                                    const int w0, const int w1 HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(coef, tmp5, 2*FILTER_OUT_STRIDE,);
    ALIGN_STK_16(coef, tmp3, 2*FILTER_OUT_STRIDE,);

    pixel *const row0 = *dst;

    sgr_finish_filter2(tmp5, row0, stride, A5_ptrs, B5_ptrs, w, h);

    sgr_finish_filter_row1(tmp3, row0, A3_ptrs, B3_ptrs, w);
    if (h > 1) {
        // Second row: same filter, one coefficient row further down.
        sgr_finish_filter_row1(tmp3 + FILTER_OUT_STRIDE,
                               row0 + PXSTRIDE(stride),
                               A3_ptrs + 1, B3_ptrs + 1, w);
    }

    sgr_weighted2(row0, stride, tmp5, tmp3, w, h, w0, w1 HIGHBD_TAIL_SUFFIX);

    *dst = row0 + h*PXSTRIDE(stride);
    rotate(A5_ptrs, B5_ptrs, 2);
    rotate(A3_ptrs, B3_ptrs, 4);
}
677
678
679
// SGR loop-restoration entry point built on the 3x3 box helpers only
// (sgr_box3_row_h / sgr_box3_hv / sgr_box3_vert with params->sgr.s1, output
// through sgr_finish1 with params->sgr.w1). Registered as c->sgr[1] by the
// DSP init function at the end of this file.
//
// dst is both input and output: src starts at dst and is advanced one row per
// consumed source row, while dst is handed to sgr_finish1 by address.
// left is advanced in step with src. lpf supplies two rows that are consumed
// before the unit when LR_HAVE_TOP is set, and lpf + 6 strides supplies two
// rows consumed after it when LR_HAVE_BOTTOM is set; when an edge is missing
// the ring slots are aliased so an already-computed row is reused instead.
//
// NOTE(review): the helper semantics (what each one writes, and whether
// sgr_box3_hv / sgr_box3_vert rotate sumsq_ptrs/sum_ptrs themselves) are
// defined outside this function -- confirm there before changing call order.
static void sgr_3x3_c(pixel *dst, const ptrdiff_t stride,
680
                      const pixel (*left)[4], const pixel *lpf,
681
                      const int w, int h,
682
                      const LooprestorationParams *const params,
683
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
684
19.1k
{
685
2.22M
// Row pitch of every scratch buffer below. The macro is not #undef'd in this
// function; sgr_5x5_c and sgr_mix_c further down rely on it as well.
#define BUF_STRIDE (384 + 16)
686
19.1k
    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 3 + 16,);
687
19.1k
    ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 3 + 16,);
688
19.1k
    // *_rows are the fixed row starts inside the buffers; *_ptrs is the ring
    // that is actually handed to the helpers and may alias rows.
    int32_t *sumsq_ptrs[3], *sumsq_rows[3];
689
19.1k
    coef *sum_ptrs[3], *sum_rows[3];
690
76.6k
    for (int i = 0; i < 3; i++) {
691
57.4k
        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
692
57.4k
        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
693
57.4k
    }
694
695
19.1k
    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 3 + 16,);
696
19.1k
    ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 3 + 16,);
697
19.1k
    // Three-slot ring of A/B rows; slot [2] is always the one being written
    // and the ring is turned with rotate(A_ptrs, B_ptrs, 3).
    int32_t *A_ptrs[3];
698
19.1k
    coef *B_ptrs[3];
699
76.6k
    for (int i = 0; i < 3; i++) {
700
57.4k
        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
701
57.4k
        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
702
57.4k
    }
703
19.1k
    // Source rows are read from the destination plane itself.
    const pixel *src = dst;
704
19.1k
    // Rows used after the last source row (only read when LR_HAVE_BOTTOM).
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
705
706
19.1k
    if (edges & LR_HAVE_TOP) {
707
13.0k
        // Top rows available: each ring slot gets its own row.
        sumsq_ptrs[0] = sumsq_rows[0];
708
13.0k
        sumsq_ptrs[1] = sumsq_rows[1];
709
13.0k
        sumsq_ptrs[2] = sumsq_rows[2];
710
13.0k
        sum_ptrs[0] = sum_rows[0];
711
13.0k
        sum_ptrs[1] = sum_rows[1];
712
13.0k
        sum_ptrs[2] = sum_rows[2];
713
714
13.0k
        // Two rows from lpf prime rows 0 and 1 (no left-edge data: NULL).
        sgr_box3_row_h(sumsq_rows[0], sum_rows[0], NULL, lpf, w, edges);
715
13.0k
        lpf += PXSTRIDE(stride);
716
13.0k
        sgr_box3_row_h(sumsq_rows[1], sum_rows[1], NULL, lpf, w, edges);
717
718
13.0k
        // First source row.
        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
719
13.0k
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
720
13.0k
        left++;
721
13.0k
        src += PXSTRIDE(stride);
722
13.0k
        rotate(A_ptrs, B_ptrs, 3);
723
724
13.0k
        // The unit had exactly one source row.
        if (--h <= 0)
725
179
            goto vert_1;
726
727
12.8k
        // Second source row.
        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
728
12.8k
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
729
12.8k
        left++;
730
12.8k
        src += PXSTRIDE(stride);
731
12.8k
        rotate(A_ptrs, B_ptrs, 3);
732
733
12.8k
        // The unit had exactly two source rows.
        if (--h <= 0)
734
103
            goto vert_2;
735
12.8k
    } else {
736
6.09k
        // No top rows: all three ring slots alias rows[0], so the first source
        // row is used in place of the missing rows above it.
        sumsq_ptrs[0] = sumsq_rows[0];
737
6.09k
        sumsq_ptrs[1] = sumsq_rows[0];
738
6.09k
        sumsq_ptrs[2] = sumsq_rows[0];
739
6.09k
        sum_ptrs[0] = sum_rows[0];
740
6.09k
        sum_ptrs[1] = sum_rows[0];
741
6.09k
        sum_ptrs[2] = sum_rows[0];
742
743
6.09k
        sgr_box3_row_h(sumsq_rows[0], sum_rows[0], left, src, w, edges);
744
6.09k
        left++;
745
6.09k
        src += PXSTRIDE(stride);
746
747
6.09k
        sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
748
6.09k
                      w, params->sgr.s1, BITDEPTH_MAX);
749
6.09k
        rotate(A_ptrs, B_ptrs, 3);
750
751
6.09k
        if (--h <= 0)
752
1.04k
            goto vert_1;
753
754
5.04k
        // Give slot 2 a row of its own before the second source row lands.
        sumsq_ptrs[2] = sumsq_rows[1];
755
5.04k
        sum_ptrs[2] = sum_rows[1];
756
757
5.04k
        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
758
5.04k
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
759
5.04k
        left++;
760
5.04k
        src += PXSTRIDE(stride);
761
5.04k
        rotate(A_ptrs, B_ptrs, 3);
762
763
5.04k
        if (--h <= 0)
764
1.26k
            goto vert_2;
765
766
3.78k
        // From here on slot 2 uses the third, so far unused, row.
        sumsq_ptrs[2] = sumsq_rows[2];
767
3.78k
        sum_ptrs[2] = sum_rows[2];
768
3.78k
    }
769
770
726k
    // Steady state: one source row in, one sgr_finish1 call out per pass.
    do {
771
726k
        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
772
726k
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
773
726k
        left++;
774
726k
        src += PXSTRIDE(stride);
775
776
726k
        sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
777
726k
                    w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
778
726k
    } while (--h > 0);
779
780
16.5k
    // No rows below the unit: finish by repeating the last row instead.
    if (!(edges & LR_HAVE_BOTTOM))
781
3.64k
        goto vert_2;
782
783
12.9k
    // Two rows from lpf_bottom drive the last two sgr_finish1 calls.
    sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
784
12.9k
                NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
785
12.9k
    lpf_bottom += PXSTRIDE(stride);
786
787
12.9k
    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
788
12.9k
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
789
790
12.9k
    sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
791
12.9k
                NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
792
793
12.9k
    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
794
12.9k
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
795
12.9k
    return;
796
797
5.00k
// Two sgr_finish1 calls remain and there is no further input: alias slot 2
// to slot 1, run the vertical step, emit, then fall through into output_1.
vert_2:
798
5.00k
    sumsq_ptrs[2] = sumsq_ptrs[1];
799
5.00k
    sum_ptrs[2] = sum_ptrs[1];
800
5.00k
    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
801
5.00k
                  w, params->sgr.s1, BITDEPTH_MAX);
802
803
5.00k
    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
804
5.00k
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
805
806
6.23k
// Final sgr_finish1 call, again with slot 2 aliased to slot 1.
output_1:
807
6.23k
    sumsq_ptrs[2] = sumsq_ptrs[1];
808
6.23k
    sum_ptrs[2] = sum_ptrs[1];
809
6.23k
    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
810
6.23k
                  w, params->sgr.s1, BITDEPTH_MAX);
811
812
6.23k
    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
813
6.23k
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
814
6.23k
    return;
815
816
1.22k
// Reached when the unit held a single source row: one extra vertical step
// plus ring rotation, then the single output is produced at output_1.
vert_1:
817
1.22k
    sumsq_ptrs[2] = sumsq_ptrs[1];
818
1.22k
    sum_ptrs[2] = sum_ptrs[1];
819
1.22k
    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
820
1.22k
                  w, params->sgr.s1, BITDEPTH_MAX);
821
1.22k
    rotate(A_ptrs, B_ptrs, 3);
822
1.22k
    goto output_1;
823
5.00k
}
824
825
// SGR loop-restoration entry point built on the 5x5 box helpers only
// (sgr_box5_row_h / sgr_box5_vert with params->sgr.s0, output through
// sgr_finish2 with params->sgr.w0). Registered as c->sgr[0] by the DSP init
// function at the end of this file.
//
// In the main loop source rows are consumed in pairs: two sgr_box5_row_h
// calls are followed by one sgr_box5_vert and one sgr_finish2(..., 2, ...).
// The labelled tails handle an odd row left over ("odd"), a unit that ends
// before the main loop ("vert_1", "vert_2") and a missing bottom edge.
// As in sgr_3x3_c, src starts at dst, lpf provides two rows used before the
// unit when LR_HAVE_TOP is set and lpf + 6 strides two rows used after it
// when LR_HAVE_BOTTOM is set.
//
// NOTE(review): sgr_box5_vert and sgr_finish2 are defined outside this
// function; the in-code remarks about the ptrs being "rotated by 2" describe
// behaviour of those helpers and should be confirmed there.
static void sgr_5x5_c(pixel *dst, const ptrdiff_t stride,
826
                      const pixel (*left)[4], const pixel *lpf,
827
                      const int w, int h,
828
                      const LooprestorationParams *const params,
829
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
830
23.9k
{
831
23.9k
    // BUF_STRIDE is the macro defined inside sgr_3x3_c above.
    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 5 + 16,);
832
23.9k
    ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 5 + 16,);
833
23.9k
    // *_rows are fixed row starts; *_ptrs is the five-slot ring handed to the
    // helpers and may alias rows.
    int32_t *sumsq_ptrs[5], *sumsq_rows[5];
834
23.9k
    coef *sum_ptrs[5], *sum_rows[5];
835
143k
    for (int i = 0; i < 5; i++) {
836
119k
        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
837
119k
        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
838
119k
    }
839
840
23.9k
    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 2 + 16,);
841
23.9k
    ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 2 + 16,);
842
23.9k
    // Two-slot A/B ring; slot [1] is the one written by sgr_box5_vert.
    int32_t *A_ptrs[2];
843
23.9k
    coef *B_ptrs[2];
844
71.9k
    for (int i = 0; i < 2; i++) {
845
47.9k
        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
846
47.9k
        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
847
47.9k
    }
848
23.9k
    // Source rows are read from the destination plane itself.
    const pixel *src = dst;
849
23.9k
    // Rows used after the last source row (only read when LR_HAVE_BOTTOM).
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
850
851
23.9k
    if (edges & LR_HAVE_TOP) {
852
15.2k
        // Only two rows exist above the unit, so slots 0 and 1 share rows[0].
        sumsq_ptrs[0] = sumsq_rows[0];
853
15.2k
        sumsq_ptrs[1] = sumsq_rows[0];
854
15.2k
        sumsq_ptrs[2] = sumsq_rows[1];
855
15.2k
        sumsq_ptrs[3] = sumsq_rows[2];
856
15.2k
        sumsq_ptrs[4] = sumsq_rows[3];
857
15.2k
        sum_ptrs[0] = sum_rows[0];
858
15.2k
        sum_ptrs[1] = sum_rows[0];
859
15.2k
        sum_ptrs[2] = sum_rows[1];
860
15.2k
        sum_ptrs[3] = sum_rows[2];
861
15.2k
        sum_ptrs[4] = sum_rows[3];
862
863
15.2k
        // Two rows from lpf (no left-edge data: NULL) ...
        sgr_box5_row_h(sumsq_rows[0], sum_rows[0], NULL, lpf, w, edges);
864
15.2k
        lpf += PXSTRIDE(stride);
865
15.2k
        sgr_box5_row_h(sumsq_rows[1], sum_rows[1], NULL, lpf, w, edges);
866
867
15.2k
        // ... followed by the first source row.
        sgr_box5_row_h(sumsq_rows[2], sum_rows[2], left, src, w, edges);
868
15.2k
        left++;
869
15.2k
        src += PXSTRIDE(stride);
870
871
15.2k
        // The unit had exactly one source row.
        if (--h <= 0)
872
133
            goto vert_1;
873
874
15.0k
        sgr_box5_row_h(sumsq_rows[3], sum_rows[3], left, src, w, edges);
875
15.0k
        left++;
876
15.0k
        src += PXSTRIDE(stride);
877
15.0k
        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
878
15.0k
                      w, params->sgr.s0, BITDEPTH_MAX);
879
15.0k
        rotate(A_ptrs, B_ptrs, 2);
880
881
15.0k
        // The unit had exactly two source rows.
        if (--h <= 0)
882
123
            goto vert_2;
883
884
        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
885
        // one of them to point at the previously unused rows[4].
886
14.9k
        sumsq_ptrs[3] = sumsq_rows[4];
887
14.9k
        sum_ptrs[3] = sum_rows[4];
888
14.9k
    } else {
889
8.77k
        // No top rows: every ring slot aliases rows[0], i.e. the first source
        // row is used in place of the missing rows above it.
        sumsq_ptrs[0] = sumsq_rows[0];
890
8.77k
        sumsq_ptrs[1] = sumsq_rows[0];
891
8.77k
        sumsq_ptrs[2] = sumsq_rows[0];
892
8.77k
        sumsq_ptrs[3] = sumsq_rows[0];
893
8.77k
        sumsq_ptrs[4] = sumsq_rows[0];
894
8.77k
        sum_ptrs[0] = sum_rows[0];
895
8.77k
        sum_ptrs[1] = sum_rows[0];
896
8.77k
        sum_ptrs[2] = sum_rows[0];
897
8.77k
        sum_ptrs[3] = sum_rows[0];
898
8.77k
        sum_ptrs[4] = sum_rows[0];
899
900
8.77k
        sgr_box5_row_h(sumsq_rows[0], sum_rows[0], left, src, w, edges);
901
8.77k
        left++;
902
8.77k
        src += PXSTRIDE(stride);
903
904
8.77k
        if (--h <= 0)
905
1.45k
            goto vert_1;
906
907
7.31k
        // Second source row gets its own storage in slot 4.
        sumsq_ptrs[4] = sumsq_rows[1];
908
7.31k
        sum_ptrs[4] = sum_rows[1];
909
910
7.31k
        sgr_box5_row_h(sumsq_rows[1], sum_rows[1], left, src, w, edges);
911
7.31k
        left++;
912
7.31k
        src += PXSTRIDE(stride);
913
914
7.31k
        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
915
7.31k
                      w, params->sgr.s0, BITDEPTH_MAX);
916
7.31k
        rotate(A_ptrs, B_ptrs, 2);
917
918
7.31k
        if (--h <= 0)
919
1.20k
            goto vert_2;
920
921
6.10k
        // Third and fourth source rows go to rows[2] and rows[3].
        sumsq_ptrs[3] = sumsq_rows[2];
922
6.10k
        sumsq_ptrs[4] = sumsq_rows[3];
923
6.10k
        sum_ptrs[3] = sum_rows[2];
924
6.10k
        sum_ptrs[4] = sum_rows[3];
925
926
6.10k
        sgr_box5_row_h(sumsq_rows[2], sum_rows[2], left, src, w, edges);
927
6.10k
        left++;
928
6.10k
        src += PXSTRIDE(stride);
929
930
6.10k
        // Three source rows in total: one row is left without a partner.
        if (--h <= 0)
931
1.10k
            goto odd;
932
933
5.00k
        sgr_box5_row_h(sumsq_rows[3], sum_rows[3], left, src, w, edges);
934
5.00k
        left++;
935
5.00k
        src += PXSTRIDE(stride);
936
937
5.00k
        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
938
5.00k
                      w, params->sgr.s0, BITDEPTH_MAX);
939
5.00k
        sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
940
5.00k
                    w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
941
942
5.00k
        if (--h <= 0)
943
488
            goto vert_2;
944
945
        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
946
        // one of them to point at the previously unused rows[4].
947
4.51k
        sumsq_ptrs[3] = sumsq_rows[4];
948
4.51k
        sum_ptrs[3] = sum_rows[4];
949
4.51k
    }
950
951
450k
    // Steady state: two source rows in, one sgr_finish2 call for two rows out.
    do {
952
450k
        sgr_box5_row_h(sumsq_ptrs[3], sum_ptrs[3], left, src, w, edges);
953
450k
        left++;
954
450k
        src += PXSTRIDE(stride);
955
956
450k
        // Source rows ran out in the middle of a pair.
        if (--h <= 0)
957
2.29k
            goto odd;
958
959
447k
        sgr_box5_row_h(sumsq_ptrs[4], sum_ptrs[4], left, src, w, edges);
960
447k
        left++;
961
447k
        src += PXSTRIDE(stride);
962
963
447k
        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
964
447k
                      w, params->sgr.s0, BITDEPTH_MAX);
965
447k
        sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
966
447k
                    w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
967
447k
    } while (--h > 0);
968
969
17.1k
    // No rows below the unit: pad by repeating the last row instead.
    if (!(edges & LR_HAVE_BOTTOM))
970
1.56k
        goto vert_2;
971
972
15.6k
    // Two rows from lpf_bottom form the final pair.
    sgr_box5_row_h(sumsq_ptrs[3], sum_ptrs[3], NULL, lpf_bottom, w, edges);
973
15.6k
    lpf_bottom += PXSTRIDE(stride);
974
15.6k
    sgr_box5_row_h(sumsq_ptrs[4], sum_ptrs[4], NULL, lpf_bottom, w, edges);
975
976
18.9k
// Final vertical step and a two-row sgr_finish2 call.
output_2:
977
18.9k
    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
978
18.9k
                  w, params->sgr.s0, BITDEPTH_MAX);
979
18.9k
    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
980
18.9k
                w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
981
18.9k
    return;
982
983
3.38k
vert_2:
984
    // Duplicate the last row twice more
985
3.38k
    sumsq_ptrs[3] = sumsq_ptrs[2];
986
3.38k
    sumsq_ptrs[4] = sumsq_ptrs[2];
987
3.38k
    sum_ptrs[3] = sum_ptrs[2];
988
3.38k
    sum_ptrs[4] = sum_ptrs[2];
989
3.38k
    goto output_2;
990
991
3.40k
odd:
992
    // Copy the last row as padding once
993
3.40k
    sumsq_ptrs[4] = sumsq_ptrs[3];
994
3.40k
    sum_ptrs[4] = sum_ptrs[3];
995
996
3.40k
    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
997
3.40k
                  w, params->sgr.s0, BITDEPTH_MAX);
998
3.40k
    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
999
3.40k
                w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
1000
1001
4.99k
// Entered by fall-through from "odd" and by goto from "vert_1".
output_1:
1002
    // Duplicate the last row twice more
1003
4.99k
    sumsq_ptrs[3] = sumsq_ptrs[2];
1004
4.99k
    sumsq_ptrs[4] = sumsq_ptrs[2];
1005
4.99k
    sum_ptrs[3] = sum_ptrs[2];
1006
4.99k
    sum_ptrs[4] = sum_ptrs[2];
1007
1008
4.99k
    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
1009
4.99k
                  w, params->sgr.s0, BITDEPTH_MAX);
1010
    // Output only one row
1011
4.99k
    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
1012
4.99k
                w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
1013
4.99k
    return;
1014
1015
1.58k
// Reached when the unit held a single source row.
vert_1:
1016
    // Copy the last row as padding once
1017
1.58k
    sumsq_ptrs[4] = sumsq_ptrs[3];
1018
1.58k
    sum_ptrs[4] = sum_ptrs[3];
1019
1020
1.58k
    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
1021
1.58k
                  w, params->sgr.s0, BITDEPTH_MAX);
1022
1.58k
    rotate(A_ptrs, B_ptrs, 2);
1023
1024
1.58k
    goto output_1;
1025
3.40k
}
1026
1027
// SGR loop-restoration entry point that runs the 3x3 and the 5x5 box paths
// together and combines them through sgr_finish_mix (weights params->sgr.w0
// and params->sgr.w1). Registered as c->sgr[2] by the DSP init function at
// the end of this file.
//
// Every source row goes through sgr_box35_row_h, which is handed one row of
// each of the 3x3 and 5x5 sum buffers. sgr_box3_vert (strength s1) follows
// every row and writes slot [3] of a four-slot A3/B3 ring; sgr_box5_vert
// (strength s0) follows every second row and writes slot [1] of a two-slot
// A5/B5 ring. The overall row/edge bookkeeping mirrors sgr_5x5_c: rows are
// emitted in pairs, with "odd", "vert_1", "vert_2", "output_1" and
// "output_2" covering short units and missing edges.
//
// NOTE(review): the helpers are defined outside this function; which of them
// rotate the sumsq/sum rings internally should be confirmed there before
// reordering any call.
static void sgr_mix_c(pixel *dst, const ptrdiff_t stride,
1028
                      const pixel (*left)[4], const pixel *lpf,
1029
                      const int w, int h,
1030
                      const LooprestorationParams *const params,
1031
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
1032
59.1k
{
1033
59.1k
    // 5x5 sums: five rows of BUF_STRIDE (macro defined inside sgr_3x3_c).
    ALIGN_STK_16(int32_t, sumsq5_buf, BUF_STRIDE * 5 + 16,);
1034
59.1k
    ALIGN_STK_16(coef, sum5_buf, BUF_STRIDE * 5 + 16,);
1035
59.1k
    int32_t *sumsq5_ptrs[5], *sumsq5_rows[5];
1036
59.1k
    coef *sum5_ptrs[5], *sum5_rows[5];
1037
354k
    for (int i = 0; i < 5; i++) {
1038
295k
        sumsq5_rows[i] = &sumsq5_buf[i * BUF_STRIDE];
1039
295k
        sum5_rows[i] = &sum5_buf[i * BUF_STRIDE];
1040
295k
    }
1041
59.1k
    // 3x3 sums: three rows.
    ALIGN_STK_16(int32_t, sumsq3_buf, BUF_STRIDE * 3 + 16,);
1042
59.1k
    ALIGN_STK_16(coef, sum3_buf, BUF_STRIDE * 3 + 16,);
1043
59.1k
    int32_t *sumsq3_ptrs[3], *sumsq3_rows[3];
1044
59.1k
    coef *sum3_ptrs[3], *sum3_rows[3];
1045
236k
    for (int i = 0; i < 3; i++) {
1046
177k
        sumsq3_rows[i] = &sumsq3_buf[i * BUF_STRIDE];
1047
177k
        sum3_rows[i] = &sum3_buf[i * BUF_STRIDE];
1048
177k
    }
1049
1050
59.1k
    // A/B ring for the 5x5 path: two slots.
    ALIGN_STK_16(int32_t, A5_buf, BUF_STRIDE * 2 + 16,);
1051
59.1k
    ALIGN_STK_16(coef, B5_buf, BUF_STRIDE * 2 + 16,);
1052
59.1k
    int32_t *A5_ptrs[2];
1053
59.1k
    coef *B5_ptrs[2];
1054
177k
    for (int i = 0; i < 2; i++) {
1055
118k
        A5_ptrs[i] = &A5_buf[i * BUF_STRIDE];
1056
118k
        B5_ptrs[i] = &B5_buf[i * BUF_STRIDE];
1057
118k
    }
1058
59.1k
    // A/B ring for the 3x3 path: four slots (sgr_3x3_c uses three).
    ALIGN_STK_16(int32_t, A3_buf, BUF_STRIDE * 4 + 16,);
1059
59.1k
    ALIGN_STK_16(coef, B3_buf, BUF_STRIDE * 4 + 16,);
1060
59.1k
    int32_t *A3_ptrs[4];
1061
59.1k
    coef *B3_ptrs[4];
1062
295k
    for (int i = 0; i < 4; i++) {
1063
236k
        A3_ptrs[i] = &A3_buf[i * BUF_STRIDE];
1064
236k
        B3_ptrs[i] = &B3_buf[i * BUF_STRIDE];
1065
236k
    }
1066
59.1k
    // Source rows are read from the destination plane itself.
    const pixel *src = dst;
1067
59.1k
    // Rows used after the last source row (only read when LR_HAVE_BOTTOM).
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
1068
1069
59.1k
    if (edges & LR_HAVE_TOP) {
1070
37.0k
        // 5x5 ring: only two rows exist above, so slots 0 and 1 share rows[0].
        sumsq5_ptrs[0] = sumsq5_rows[0];
1071
37.0k
        sumsq5_ptrs[1] = sumsq5_rows[0];
1072
37.0k
        sumsq5_ptrs[2] = sumsq5_rows[1];
1073
37.0k
        sumsq5_ptrs[3] = sumsq5_rows[2];
1074
37.0k
        sumsq5_ptrs[4] = sumsq5_rows[3];
1075
37.0k
        sum5_ptrs[0] = sum5_rows[0];
1076
37.0k
        sum5_ptrs[1] = sum5_rows[0];
1077
37.0k
        sum5_ptrs[2] = sum5_rows[1];
1078
37.0k
        sum5_ptrs[3] = sum5_rows[2];
1079
37.0k
        sum5_ptrs[4] = sum5_rows[3];
1080
1081
37.0k
        // 3x3 ring: one distinct row per slot.
        sumsq3_ptrs[0] = sumsq3_rows[0];
1082
37.0k
        sumsq3_ptrs[1] = sumsq3_rows[1];
1083
37.0k
        sumsq3_ptrs[2] = sumsq3_rows[2];
1084
37.0k
        sum3_ptrs[0] = sum3_rows[0];
1085
37.0k
        sum3_ptrs[1] = sum3_rows[1];
1086
37.0k
        sum3_ptrs[2] = sum3_rows[2];
1087
1088
37.0k
        // Two rows from lpf (no left-edge data: NULL) ...
        sgr_box35_row_h(sumsq3_rows[0], sum3_rows[0],
1089
37.0k
                        sumsq5_rows[0], sum5_rows[0],
1090
37.0k
                        NULL, lpf, w, edges);
1091
37.0k
        lpf += PXSTRIDE(stride);
1092
37.0k
        sgr_box35_row_h(sumsq3_rows[1], sum3_rows[1],
1093
37.0k
                        sumsq5_rows[1], sum5_rows[1],
1094
37.0k
                        NULL, lpf, w, edges);
1095
1096
37.0k
        // ... followed by the first source row.
        sgr_box35_row_h(sumsq3_rows[2], sum3_rows[2],
1097
37.0k
                        sumsq5_rows[2], sum5_rows[2],
1098
37.0k
                        left, src, w, edges);
1099
37.0k
        left++;
1100
37.0k
        src += PXSTRIDE(stride);
1101
1102
37.0k
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1103
37.0k
                      w, params->sgr.s1, BITDEPTH_MAX);
1104
37.0k
        rotate(A3_ptrs, B3_ptrs, 4);
1105
1106
37.0k
        // The unit had exactly one source row.
        if (--h <= 0)
1107
272
            goto vert_1;
1108
1109
36.7k
        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1110
36.7k
                        sumsq5_rows[3], sum5_rows[3],
1111
36.7k
                        left, src, w, edges);
1112
36.7k
        left++;
1113
36.7k
        src += PXSTRIDE(stride);
1114
36.7k
        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1115
36.7k
                      w, params->sgr.s0, BITDEPTH_MAX);
1116
36.7k
        rotate(A5_ptrs, B5_ptrs, 2);
1117
36.7k
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1118
36.7k
                      w, params->sgr.s1, BITDEPTH_MAX);
1119
36.7k
        rotate(A3_ptrs, B3_ptrs, 4);
1120
1121
36.7k
        // The unit had exactly two source rows.
        if (--h <= 0)
1122
166
            goto vert_2;
1123
1124
        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
1125
        // one of them to point at the previously unused rows[4].
1126
36.6k
        sumsq5_ptrs[3] = sumsq5_rows[4];
1127
36.6k
        sum5_ptrs[3] = sum5_rows[4];
1128
36.6k
    } else {
1129
22.1k
        // No top rows: every slot of both rings aliases rows[0], i.e. the
        // first source row is used in place of the missing rows above it.
        sumsq5_ptrs[0] = sumsq5_rows[0];
1130
22.1k
        sumsq5_ptrs[1] = sumsq5_rows[0];
1131
22.1k
        sumsq5_ptrs[2] = sumsq5_rows[0];
1132
22.1k
        sumsq5_ptrs[3] = sumsq5_rows[0];
1133
22.1k
        sumsq5_ptrs[4] = sumsq5_rows[0];
1134
22.1k
        sum5_ptrs[0] = sum5_rows[0];
1135
22.1k
        sum5_ptrs[1] = sum5_rows[0];
1136
22.1k
        sum5_ptrs[2] = sum5_rows[0];
1137
22.1k
        sum5_ptrs[3] = sum5_rows[0];
1138
22.1k
        sum5_ptrs[4] = sum5_rows[0];
1139
1140
22.1k
        sumsq3_ptrs[0] = sumsq3_rows[0];
1141
22.1k
        sumsq3_ptrs[1] = sumsq3_rows[0];
1142
22.1k
        sumsq3_ptrs[2] = sumsq3_rows[0];
1143
22.1k
        sum3_ptrs[0] = sum3_rows[0];
1144
22.1k
        sum3_ptrs[1] = sum3_rows[0];
1145
22.1k
        sum3_ptrs[2] = sum3_rows[0];
1146
1147
22.1k
        sgr_box35_row_h(sumsq3_rows[0], sum3_rows[0],
1148
22.1k
                        sumsq5_rows[0], sum5_rows[0],
1149
22.1k
                        left, src, w, edges);
1150
22.1k
        left++;
1151
22.1k
        src += PXSTRIDE(stride);
1152
1153
22.1k
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1154
22.1k
                      w, params->sgr.s1, BITDEPTH_MAX);
1155
22.1k
        rotate(A3_ptrs, B3_ptrs, 4);
1156
1157
22.1k
        if (--h <= 0)
1158
3.42k
            goto vert_1;
1159
1160
18.6k
        // Second source row gets its own storage in both rings.
        sumsq5_ptrs[4] = sumsq5_rows[1];
1161
18.6k
        sum5_ptrs[4] = sum5_rows[1];
1162
1163
18.6k
        sumsq3_ptrs[2] = sumsq3_rows[1];
1164
18.6k
        sum3_ptrs[2] = sum3_rows[1];
1165
1166
18.6k
        sgr_box35_row_h(sumsq3_rows[1], sum3_rows[1],
1167
18.6k
                        sumsq5_rows[1], sum5_rows[1],
1168
18.6k
                        left, src, w, edges);
1169
18.6k
        left++;
1170
18.6k
        src += PXSTRIDE(stride);
1171
1172
18.6k
        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1173
18.6k
                      w, params->sgr.s0, BITDEPTH_MAX);
1174
18.6k
        rotate(A5_ptrs, B5_ptrs, 2);
1175
18.6k
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1176
18.6k
                      w, params->sgr.s1, BITDEPTH_MAX);
1177
18.6k
        rotate(A3_ptrs, B3_ptrs, 4);
1178
1179
18.6k
        if (--h <= 0)
1180
3.66k
            goto vert_2;
1181
1182
15.0k
        // Third and fourth source rows: 5x5 rows[2]/rows[3], 3x3 rows[2].
        sumsq5_ptrs[3] = sumsq5_rows[2];
1183
15.0k
        sumsq5_ptrs[4] = sumsq5_rows[3];
1184
15.0k
        sum5_ptrs[3] = sum5_rows[2];
1185
15.0k
        sum5_ptrs[4] = sum5_rows[3];
1186
1187
15.0k
        sumsq3_ptrs[2] = sumsq3_rows[2];
1188
15.0k
        sum3_ptrs[2] = sum3_rows[2];
1189
1190
15.0k
        sgr_box35_row_h(sumsq3_rows[2], sum3_rows[2],
1191
15.0k
                        sumsq5_rows[2], sum5_rows[2],
1192
15.0k
                        left, src, w, edges);
1193
15.0k
        left++;
1194
15.0k
        src += PXSTRIDE(stride);
1195
1196
15.0k
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1197
15.0k
                      w, params->sgr.s1, BITDEPTH_MAX);
1198
15.0k
        rotate(A3_ptrs, B3_ptrs, 4);
1199
1200
15.0k
        // Three source rows in total: one row is left without a partner.
        if (--h <= 0)
1201
1.89k
            goto odd;
1202
1203
13.1k
        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1204
13.1k
                        sumsq5_rows[3], sum5_rows[3],
1205
13.1k
                        left, src, w, edges);
1206
13.1k
        left++;
1207
13.1k
        src += PXSTRIDE(stride);
1208
1209
13.1k
        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1210
13.1k
                      w, params->sgr.s0, BITDEPTH_MAX);
1211
13.1k
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1212
13.1k
                      w, params->sgr.s1, BITDEPTH_MAX);
1213
13.1k
        sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1214
13.1k
                       w, 2, params->sgr.w0, params->sgr.w1
1215
13.1k
                       HIGHBD_TAIL_SUFFIX);
1216
1217
13.1k
        if (--h <= 0)
1218
1.04k
            goto vert_2;
1219
1220
        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
1221
        // one of them to point at the previously unused rows[4].
1222
12.0k
        sumsq5_ptrs[3] = sumsq5_rows[4];
1223
12.0k
        sum5_ptrs[3] = sum5_rows[4];
1224
12.0k
    }
1225
1226
1.06M
    // Steady state: two source rows in, one two-row sgr_finish_mix call out.
    // The 3x3 vertical step runs after each row, the 5x5 one after the pair.
    do {
1227
1.06M
        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1228
1.06M
                        sumsq5_ptrs[3], sum5_ptrs[3],
1229
1.06M
                        left, src, w, edges);
1230
1.06M
        left++;
1231
1.06M
        src += PXSTRIDE(stride);
1232
1233
1.06M
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1234
1.06M
                      w, params->sgr.s1, BITDEPTH_MAX);
1235
1.06M
        rotate(A3_ptrs, B3_ptrs, 4);
1236
1237
1.06M
        // Source rows ran out in the middle of a pair.
        if (--h <= 0)
1238
8.37k
            goto odd;
1239
1240
1.05M
        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1241
1.05M
                        sumsq5_ptrs[4], sum5_ptrs[4],
1242
1.05M
                        left, src, w, edges);
1243
1.05M
        left++;
1244
1.05M
        src += PXSTRIDE(stride);
1245
1246
1.05M
        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1247
1.05M
                      w, params->sgr.s0, BITDEPTH_MAX);
1248
1.05M
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1249
1.05M
                      w, params->sgr.s1, BITDEPTH_MAX);
1250
1.05M
        sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1251
1.05M
                       w, 2, params->sgr.w0, params->sgr.w1
1252
1.05M
                       HIGHBD_TAIL_SUFFIX);
1253
1.05M
    } while (--h > 0);
1254
1255
40.3k
    // No rows below the unit: pad by repeating the last row instead.
    if (!(edges & LR_HAVE_BOTTOM))
1256
2.78k
        goto vert_2;
1257
1258
37.5k
    // Two rows from lpf_bottom form the final pair.
    sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1259
37.5k
                    sumsq5_ptrs[3], sum5_ptrs[3],
1260
37.5k
                    NULL, lpf_bottom, w, edges);
1261
37.5k
    lpf_bottom += PXSTRIDE(stride);
1262
37.5k
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1263
37.5k
                  w, params->sgr.s1, BITDEPTH_MAX);
1264
37.5k
    rotate(A3_ptrs, B3_ptrs, 4);
1265
1266
37.5k
    sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1267
37.5k
                    sumsq5_ptrs[4], sum5_ptrs[4],
1268
37.5k
                    NULL, lpf_bottom, w, edges);
1269
1270
45.1k
// Final vertical steps for both paths and a two-row sgr_finish_mix call.
output_2:
1271
45.1k
    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1272
45.1k
                  w, params->sgr.s0, BITDEPTH_MAX);
1273
45.1k
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1274
45.1k
                  w, params->sgr.s1, BITDEPTH_MAX);
1275
45.1k
    sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1276
45.1k
                   w, 2, params->sgr.w0, params->sgr.w1
1277
45.1k
                   HIGHBD_TAIL_SUFFIX);
1278
45.1k
    return;
1279
1280
7.65k
vert_2:
1281
    // Duplicate the last row twice more
1282
7.65k
    sumsq5_ptrs[3] = sumsq5_ptrs[2];
1283
7.65k
    sumsq5_ptrs[4] = sumsq5_ptrs[2];
1284
7.65k
    sum5_ptrs[3] = sum5_ptrs[2];
1285
7.65k
    sum5_ptrs[4] = sum5_ptrs[2];
1286
1287
7.65k
    // 3x3 path: first of the two padded rows is computed here, the second
    // one by the sgr_box3_vert call at output_2.
    sumsq3_ptrs[2] = sumsq3_ptrs[1];
1288
7.65k
    sum3_ptrs[2] = sum3_ptrs[1];
1289
7.65k
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1290
7.65k
                  w, params->sgr.s1, BITDEPTH_MAX);
1291
7.65k
    rotate(A3_ptrs, B3_ptrs, 4);
1292
1293
7.65k
    sumsq3_ptrs[2] = sumsq3_ptrs[1];
1294
7.65k
    sum3_ptrs[2] = sum3_ptrs[1];
1295
1296
7.65k
    goto output_2;
1297
1298
10.2k
odd:
1299
    // Copy the last row as padding once
1300
10.2k
    sumsq5_ptrs[4] = sumsq5_ptrs[3];
1301
10.2k
    sum5_ptrs[4] = sum5_ptrs[3];
1302
1303
10.2k
    sumsq3_ptrs[2] = sumsq3_ptrs[1];
1304
10.2k
    sum3_ptrs[2] = sum3_ptrs[1];
1305
1306
10.2k
    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1307
10.2k
                  w, params->sgr.s0, BITDEPTH_MAX);
1308
10.2k
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1309
10.2k
                  w, params->sgr.s1, BITDEPTH_MAX);
1310
10.2k
    sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1311
10.2k
                   w, 2, params->sgr.w0, params->sgr.w1
1312
10.2k
                   HIGHBD_TAIL_SUFFIX);
1313
1314
13.9k
// Entered by fall-through from "odd" and by goto from "vert_1".
output_1:
1315
    // Duplicate the last row twice more
1316
13.9k
    sumsq5_ptrs[3] = sumsq5_ptrs[2];
1317
13.9k
    sumsq5_ptrs[4] = sumsq5_ptrs[2];
1318
13.9k
    sum5_ptrs[3] = sum5_ptrs[2];
1319
13.9k
    sum5_ptrs[4] = sum5_ptrs[2];
1320
1321
13.9k
    sumsq3_ptrs[2] = sumsq3_ptrs[1];
1322
13.9k
    sum3_ptrs[2] = sum3_ptrs[1];
1323
1324
13.9k
    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1325
13.9k
                  w, params->sgr.s0, BITDEPTH_MAX);
1326
13.9k
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1327
13.9k
                  w, params->sgr.s1, BITDEPTH_MAX);
1328
13.9k
    rotate(A3_ptrs, B3_ptrs, 4);
1329
    // Output only one row
1330
13.9k
    sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1331
13.9k
                   w, 1, params->sgr.w0, params->sgr.w1
1332
13.9k
                   HIGHBD_TAIL_SUFFIX);
1333
13.9k
    return;
1334
1335
3.69k
// Reached when the unit held a single source row.
vert_1:
1336
    // Copy the last row as padding once
1337
3.69k
    sumsq5_ptrs[4] = sumsq5_ptrs[3];
1338
3.69k
    sum5_ptrs[4] = sum5_ptrs[3];
1339
1340
3.69k
    sumsq3_ptrs[2] = sumsq3_ptrs[1];
1341
3.69k
    sum3_ptrs[2] = sum3_ptrs[1];
1342
1343
3.69k
    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1344
3.69k
                  w, params->sgr.s0, BITDEPTH_MAX);
1345
3.69k
    rotate(A5_ptrs, B5_ptrs, 2);
1346
3.69k
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1347
3.69k
                  w, params->sgr.s1, BITDEPTH_MAX);
1348
3.69k
    rotate(A3_ptrs, B3_ptrs, 4);
1349
1350
3.69k
    goto output_1;
1351
10.2k
}
1352
1353
#if HAVE_ASM
1354
#if ARCH_AARCH64 || ARCH_ARM
1355
#include "src/arm/looprestoration.h"
1356
#elif ARCH_LOONGARCH64
1357
#include "src/loongarch/looprestoration.h"
1358
#elif ARCH_PPC64LE
1359
#include "src/ppc/looprestoration.h"
1360
#elif ARCH_X86
1361
#include "src/x86/looprestoration.h"
1362
#endif
1363
#endif
1364
1365
// Populate the loop-restoration function table for the current bit-depth
// template instantiation.
//
//   c    table to fill; both wiener slots and all three sgr slots are written.
//   bpc  forwarded untouched to the architecture-specific initialiser (only
//        referenced when HAVE_ASM is enabled).
//
// The C implementations are installed first so that every slot is valid; an
// architecture initialiser, when compiled in, runs afterwards on the same
// table.
COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c,
                                                 const int bpc)
{
    // Both wiener slots share the single C implementation.
    c->wiener[0] = wiener_c;
    c->wiener[1] = wiener_c;

    // sgr slot order: [0] 5x5 only, [1] 3x3 only, [2] mixed.
    c->sgr[0] = sgr_5x5_c;
    c->sgr[1] = sgr_3x3_c;
    c->sgr[2] = sgr_mix_c;

#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
    loop_restoration_dsp_init_arm(c, bpc);
#elif ARCH_LOONGARCH64
    loop_restoration_dsp_init_loongarch(c, bpc);
#elif ARCH_PPC64LE
    loop_restoration_dsp_init_ppc(c, bpc);
#elif ARCH_X86
    loop_restoration_dsp_init_x86(c, bpc);
#endif
#endif
}
dav1d_loop_restoration_dsp_init_8bpc
Line
Count
Source
1367
27.6k
{
1368
27.6k
    c->wiener[0] = c->wiener[1] = wiener_c;
1369
27.6k
    c->sgr[0] = sgr_5x5_c;
1370
27.6k
    c->sgr[1] = sgr_3x3_c;
1371
27.6k
    c->sgr[2] = sgr_mix_c;
1372
1373
#if HAVE_ASM
1374
#if ARCH_AARCH64 || ARCH_ARM
1375
    loop_restoration_dsp_init_arm(c, bpc);
1376
#elif ARCH_LOONGARCH64
1377
    loop_restoration_dsp_init_loongarch(c, bpc);
1378
#elif ARCH_PPC64LE
1379
    loop_restoration_dsp_init_ppc(c, bpc);
1380
#elif ARCH_X86
1381
    loop_restoration_dsp_init_x86(c, bpc);
1382
#endif
1383
#endif
1384
27.6k
}
dav1d_loop_restoration_dsp_init_16bpc
Line
Count
Source
1367
31.7k
{
1368
31.7k
    c->wiener[0] = c->wiener[1] = wiener_c;
1369
31.7k
    c->sgr[0] = sgr_5x5_c;
1370
31.7k
    c->sgr[1] = sgr_3x3_c;
1371
31.7k
    c->sgr[2] = sgr_mix_c;
1372
1373
#if HAVE_ASM
1374
#if ARCH_AARCH64 || ARCH_ARM
1375
    loop_restoration_dsp_init_arm(c, bpc);
1376
#elif ARCH_LOONGARCH64
1377
    loop_restoration_dsp_init_loongarch(c, bpc);
1378
#elif ARCH_PPC64LE
1379
    loop_restoration_dsp_init_ppc(c, bpc);
1380
#elif ARCH_X86
1381
    loop_restoration_dsp_init_x86(c, bpc);
1382
#endif
1383
#endif
1384
31.7k
}