Coverage Report

Created: 2026-06-10 07:00

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/dav1d/src/looprestoration_tmpl.c
Line
Count
Source
1
/*
2
 * Copyright © 2018, VideoLAN and dav1d authors
3
 * Copyright © 2018, Two Orioles, LLC
4
 * All rights reserved.
5
 *
6
 * Redistribution and use in source and binary forms, with or without
7
 * modification, are permitted provided that the following conditions are met:
8
 *
9
 * 1. Redistributions of source code must retain the above copyright notice, this
10
 *    list of conditions and the following disclaimer.
11
 *
12
 * 2. Redistributions in binary form must reproduce the above copyright notice,
13
 *    this list of conditions and the following disclaimer in the documentation
14
 *    and/or other materials provided with the distribution.
15
 *
16
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
 */
27
28
#include "config.h"
29
30
#include <stdint.h>
31
#include <stdlib.h>
32
#include <string.h>
33
34
#include "common/attributes.h"
35
#include "common/bitdepth.h"
36
#include "common/intops.h"
37
38
#include "src/looprestoration.h"
39
#include "src/tables.h"
40
41
// Number of uint16_t elements in one row of the intermediate Wiener
// buffers: 256 * 1.5 + 3 + 3 = 390
#define REST_UNIT_STRIDE (256 * 3 / 2 + 3 + 3)
43
44
static void wiener_filter_h(uint16_t *dst, const pixel (*left)[4],
45
                            const pixel *src, const int16_t fh[8],
46
                            const int w, const enum LrEdgeFlags edges
47
                            HIGHBD_DECL_SUFFIX)
48
4.74M
{
49
4.74M
    const int bitdepth = bitdepth_from_max(bitdepth_max);
50
4.74M
    const int round_bits_h = 3 + (bitdepth == 12) * 2;
51
4.74M
    const int rounding_off_h = 1 << (round_bits_h - 1);
52
4.74M
    const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h);
53
54
4.74M
    if (w < 6) {
55
        // For small widths, do the fully conditional loop with
56
        // conditions on each access.
57
2.58M
        for (int x = 0; x < w; x++) {
58
1.67M
            int sum = (1 << (bitdepth + 6));
59
1.67M
#if BITDEPTH == 8
60
1.67M
            sum += src[x] * 128;
61
1.67M
#endif
62
13.2M
            for (int i = 0; i < 7; i++) {
63
11.5M
                int idx = x + i - 3;
64
11.5M
                if (idx < 0) {
65
3.69M
                    if (!(edges & LR_HAVE_LEFT))
66
3.69M
                        sum += src[0] * fh[i];
67
68
                    else if (left)
68
0
                        sum += left[0][4 + idx] * fh[i];
69
68
                    else
70
68
                        sum += src[idx] * fh[i];
71
7.83M
                } else if (idx >= w && !(edges & LR_HAVE_RIGHT)) {
72
3.62M
                    sum += src[w - 1] * fh[i];
73
3.62M
                } else
74
4.20M
                    sum += src[idx] * fh[i];
75
11.5M
            }
76
1.67M
            sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
77
1.67M
            dst[x] = sum;
78
1.67M
        }
79
80
909k
        return;
81
909k
    }
82
83
    // For larger widths, do separate loops with less conditions; first
84
    // handle the start of the row.
85
3.83M
    int start = 3;
86
3.83M
    if (!(edges & LR_HAVE_LEFT)) {
87
        // If there's no left edge, pad using the leftmost pixel.
88
5.24M
        for (int x = 0; x < 3; x++) {
89
3.92M
            int sum = (1 << (bitdepth + 6));
90
3.92M
#if BITDEPTH == 8
91
3.92M
            sum += src[x] * 128;
92
3.92M
#endif
93
31.4M
            for (int i = 0; i < 7; i++) {
94
27.4M
                int idx = x + i - 3;
95
27.4M
                if (idx < 0)
96
7.87M
                    sum += src[0] * fh[i];
97
19.6M
                else
98
19.6M
                    sum += src[idx] * fh[i];
99
27.4M
            }
100
3.92M
            sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
101
3.92M
            dst[x] = sum;
102
3.92M
        }
103
2.50M
    } else if (left) {
104
        // If we have the left edge and a separate left buffer, pad using that.
105
9.35M
        for (int x = 0; x < 3; x++) {
106
6.99M
            int sum = (1 << (bitdepth + 6));
107
6.99M
#if BITDEPTH == 8
108
6.99M
            sum += src[x] * 128;
109
6.99M
#endif
110
55.9M
            for (int i = 0; i < 7; i++) {
111
48.9M
                int idx = x + i - 3;
112
48.9M
                if (idx < 0)
113
14.0M
                    sum += left[0][4 + idx] * fh[i];
114
34.9M
                else
115
34.9M
                    sum += src[idx] * fh[i];
116
48.9M
            }
117
6.99M
            sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
118
6.99M
            dst[x] = sum;
119
6.99M
        }
120
2.35M
    } else {
121
        // If we have the left edge, but no separate left buffer, we're in the
122
        // top/bottom area (lpf) with the left edge existing in the same
123
        // buffer; just do the regular loop from the start.
124
156k
        start = 0;
125
156k
    }
126
3.83M
    int end = w - 3;
127
3.83M
    if (edges & LR_HAVE_RIGHT)
128
2.44M
        end = w;
129
130
    // Do a condititon free loop for the bulk of the row.
131
350M
    for (int x = start; x < end; x++) {
132
346M
        int sum = (1 << (bitdepth + 6));
133
346M
#if BITDEPTH == 8
134
346M
        sum += src[x] * 128;
135
346M
#endif
136
2.75G
        for (int i = 0; i < 7; i++) {
137
2.40G
            int idx = x + i - 3;
138
2.40G
            sum += src[idx] * fh[i];
139
2.40G
        }
140
346M
        sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
141
346M
        dst[x] = sum;
142
346M
    }
143
144
    // If we need to, calculate the end of the row with a condition for
145
    // right edge padding.
146
7.79M
    for (int x = end; x < w; x++) {
147
3.96M
        int sum = (1 << (bitdepth + 6));
148
3.96M
#if BITDEPTH == 8
149
3.96M
        sum += src[x] * 128;
150
3.96M
#endif
151
31.6M
        for (int i = 0; i < 7; i++) {
152
27.7M
            int idx = x + i - 3;
153
27.7M
            if (idx >= w)
154
7.89M
                sum += src[w - 1] * fh[i];
155
19.8M
            else
156
19.8M
                sum += src[idx] * fh[i];
157
27.7M
        }
158
3.96M
        sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
159
3.96M
        dst[x] = sum;
160
3.96M
    }
161
3.83M
}
162
163
static void wiener_filter_v(pixel *p, uint16_t **ptrs, const int16_t fv[8],
164
                            const int w HIGHBD_DECL_SUFFIX)
165
187k
{
166
187k
    const int bitdepth = bitdepth_from_max(bitdepth_max);
167
168
187k
    const int round_bits_v = 11 - (bitdepth == 12) * 2;
169
187k
    const int rounding_off_v = 1 << (round_bits_v - 1);
170
187k
    const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
171
172
13.0M
    for (int i = 0; i < w; i++) {
173
12.8M
        int sum = -round_offset;
174
175
        // Only filter using 6 input rows. The 7th row is assumed to be
176
        // identical to the last one.
177
        //
178
        // This function is assumed to only be called at the end, when doing
179
        // padding at the bottom.
180
90.1M
        for (int k = 0; k < 6; k++)
181
77.2M
            sum += ptrs[k][i] * fv[k];
182
12.8M
        sum += ptrs[5][i] * fv[6];
183
184
12.8M
        p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v);
185
12.8M
    }
186
187
    // Shift the pointers, but only update the first 5; the 6th pointer is kept
188
    // as it was before (and the 7th is implicitly identical to the 6th).
189
1.12M
    for (int i = 0; i < 5; i++)
190
939k
        ptrs[i] = ptrs[i + 1];
191
187k
}
192
193
static void wiener_filter_hv(pixel *p, uint16_t **ptrs, const pixel (*left)[4],
194
                             const pixel *src, const int16_t filter[2][8],
195
                             const int w, const enum LrEdgeFlags edges
196
                             HIGHBD_DECL_SUFFIX)
197
4.26M
{
198
4.26M
    const int bitdepth = bitdepth_from_max(bitdepth_max);
199
200
4.26M
    const int round_bits_v = 11 - (bitdepth == 12) * 2;
201
4.26M
    const int rounding_off_v = 1 << (round_bits_v - 1);
202
4.26M
    const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
203
204
4.26M
    const int16_t *fh = filter[0];
205
4.26M
    const int16_t *fv = filter[1];
206
207
    // Do combined horziontal and vertical filtering; doing horizontal
208
    // filtering of one row, combined with vertical filtering of 6
209
    // preexisting rows and the newly filtered row.
210
211
    // For simplicity in the C implementation, just do a separate call
212
    // of the horizontal filter, into a temporary buffer.
213
4.26M
    uint16_t tmp[REST_UNIT_STRIDE];
214
4.26M
    wiener_filter_h(tmp, left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
215
216
314M
    for (int i = 0; i < w; i++) {
217
310M
        int sum = -round_offset;
218
219
        // Filter using the 6 stored preexisting rows, and the newly
220
        // filtered one in tmp[].
221
2.16G
        for (int k = 0; k < 6; k++)
222
1.85G
            sum += ptrs[k][i] * fv[k];
223
310M
        sum += tmp[i] * fv[6];
224
        // At this point, after having read all inputs at point [i], we
225
        // could overwrite [i] with the newly filtered data.
226
227
310M
        p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v);
228
310M
    }
229
230
    // For simplicity in the C implementation, just memcpy the newly
231
    // filtered row into ptrs[6]. Normally, in steady state filtering,
232
    // this output row, ptrs[6], is equal to ptrs[0]. However at startup,
233
    // at the top of the filtered area, we may have ptrs[0] equal to ptrs[1],
234
    // so we can't assume we can write into ptrs[0] but we need to keep
235
    // a separate pointer for the next row to write into.
236
4.26M
    memcpy(ptrs[6], tmp, sizeof(uint16_t) * REST_UNIT_STRIDE);
237
238
    // Rotate the window of pointers. Shift the 6 pointers downwards one step.
239
29.8M
    for (int i = 0; i < 6; i++)
240
25.5M
        ptrs[i] = ptrs[i + 1];
241
    // The topmost pointer, ptrs[6], which isn't used as input, is set to
242
    // ptrs[0], which will be used as output for the next _hv call.
243
    // At the start of the filtering, the caller may set ptrs[6] to the
244
    // right next buffer to fill in, instead.
245
4.26M
    ptrs[6] = ptrs[0];
246
4.26M
}
247
248
// FIXME Could split into luma and chroma specific functions,
249
// (since first and last tops are always 0 for chroma)
250
static void wiener_c(pixel *p, const ptrdiff_t stride,
251
                     const pixel (*left)[4],
252
                     const pixel *lpf, const int w, int h,
253
                     const LooprestorationParams *const params,
254
                     const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
255
120k
{
256
    // Values stored between horizontal and vertical filtering don't
257
    // fit in a uint8_t.
258
120k
    uint16_t hor[6 * REST_UNIT_STRIDE];
259
120k
    uint16_t *ptrs[7], *rows[6];
260
842k
    for (int i = 0; i < 6; i++)
261
722k
        rows[i] = &hor[i * REST_UNIT_STRIDE];
262
120k
    const int16_t (*const filter)[8] = params->filter;
263
120k
    const int16_t *fh = params->filter[0];
264
120k
    const int16_t *fv = params->filter[1];
265
120k
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
266
267
120k
    const pixel *src = p;
268
120k
    if (edges & LR_HAVE_TOP) {
269
75.9k
        ptrs[0] = rows[0];
270
75.9k
        ptrs[1] = rows[0];
271
75.9k
        ptrs[2] = rows[1];
272
75.9k
        ptrs[3] = rows[2];
273
75.9k
        ptrs[4] = rows[2];
274
75.9k
        ptrs[5] = rows[2];
275
276
75.9k
        wiener_filter_h(rows[0], NULL, lpf, fh, w, edges HIGHBD_TAIL_SUFFIX);
277
75.9k
        lpf += PXSTRIDE(stride);
278
75.9k
        wiener_filter_h(rows[1], NULL, lpf, fh, w, edges HIGHBD_TAIL_SUFFIX);
279
280
75.9k
        wiener_filter_h(rows[2], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
281
75.9k
        left++;
282
75.9k
        src += PXSTRIDE(stride);
283
284
75.9k
        if (--h <= 0)
285
737
            goto v1;
286
287
75.1k
        ptrs[4] = ptrs[5] = rows[3];
288
75.1k
        wiener_filter_h(rows[3], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
289
75.1k
        left++;
290
75.1k
        src += PXSTRIDE(stride);
291
292
75.1k
        if (--h <= 0)
293
823
            goto v2;
294
295
74.3k
        ptrs[5] = rows[4];
296
74.3k
        wiener_filter_h(rows[4], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
297
74.3k
        left++;
298
74.3k
        src += PXSTRIDE(stride);
299
300
74.3k
        if (--h <= 0)
301
350
            goto v3;
302
74.3k
    } else {
303
44.4k
        ptrs[0] = rows[0];
304
44.4k
        ptrs[1] = rows[0];
305
44.4k
        ptrs[2] = rows[0];
306
44.4k
        ptrs[3] = rows[0];
307
44.4k
        ptrs[4] = rows[0];
308
44.4k
        ptrs[5] = rows[0];
309
310
44.4k
        wiener_filter_h(rows[0], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
311
44.4k
        left++;
312
44.4k
        src += PXSTRIDE(stride);
313
314
44.4k
        if (--h <= 0)
315
3.51k
            goto v1;
316
317
40.9k
        ptrs[4] = ptrs[5] = rows[1];
318
40.9k
        wiener_filter_h(rows[1], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
319
40.9k
        left++;
320
40.9k
        src += PXSTRIDE(stride);
321
322
40.9k
        if (--h <= 0)
323
9.76k
            goto v2;
324
325
31.1k
        ptrs[5] = rows[2];
326
31.1k
        wiener_filter_h(rows[2], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
327
31.1k
        left++;
328
31.1k
        src += PXSTRIDE(stride);
329
330
31.1k
        if (--h <= 0)
331
2.45k
            goto v3;
332
333
28.7k
        ptrs[6] = rows[3];
334
28.7k
        wiener_filter_hv(p, ptrs, left, src, filter, w, edges
335
28.7k
                         HIGHBD_TAIL_SUFFIX);
336
28.7k
        left++;
337
28.7k
        src += PXSTRIDE(stride);
338
28.7k
        p += PXSTRIDE(stride);
339
340
28.7k
        if (--h <= 0)
341
2.60k
            goto v3;
342
343
26.1k
        ptrs[6] = rows[4];
344
26.1k
        wiener_filter_hv(p, ptrs, left, src, filter, w, edges
345
26.1k
                         HIGHBD_TAIL_SUFFIX);
346
26.1k
        left++;
347
26.1k
        src += PXSTRIDE(stride);
348
26.1k
        p += PXSTRIDE(stride);
349
350
26.1k
        if (--h <= 0)
351
1.21k
            goto v3;
352
26.1k
    }
353
354
98.9k
    ptrs[6] = ptrs[5] + REST_UNIT_STRIDE;
355
4.06M
    do {
356
4.06M
        wiener_filter_hv(p, ptrs, left, src, filter, w, edges
357
4.06M
                         HIGHBD_TAIL_SUFFIX);
358
4.06M
        left++;
359
4.06M
        src += PXSTRIDE(stride);
360
4.06M
        p += PXSTRIDE(stride);
361
4.06M
    } while (--h > 0);
362
363
98.9k
    if (!(edges & LR_HAVE_BOTTOM))
364
21.9k
        goto v3;
365
366
76.9k
    wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges
367
76.9k
                     HIGHBD_TAIL_SUFFIX);
368
76.9k
    lpf_bottom += PXSTRIDE(stride);
369
76.9k
    p += PXSTRIDE(stride);
370
371
76.9k
    wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges
372
76.9k
                     HIGHBD_TAIL_SUFFIX);
373
76.9k
    p += PXSTRIDE(stride);
374
120k
v1:
375
120k
    wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
376
377
120k
    return;
378
379
28.5k
v3:
380
28.5k
    wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
381
28.5k
    p += PXSTRIDE(stride);
382
39.1k
v2:
383
39.1k
    wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
384
39.1k
    p += PXSTRIDE(stride);
385
39.1k
    goto v1;
386
28.5k
}
387
388
// SGR
389
/*
 * Rotates the first n entries of both pointer arrays left by one position:
 * entry i receives entry i + 1, and the former first entry becomes the
 * last one.
 */
static NOINLINE void rotate(int32_t **sumsq_ptrs, coef **sum_ptrs, int n)
{
    int32_t *const first_sumsq = sumsq_ptrs[0];
    coef *const first_sum = sum_ptrs[0];
    const int last = n - 1;

    for (int i = 0; i < last; i++)
        sumsq_ptrs[i] = sumsq_ptrs[i + 1];
    for (int i = 0; i < last; i++)
        sum_ptrs[i] = sum_ptrs[i + 1];

    sumsq_ptrs[last] = first_sumsq;
    sum_ptrs[last] = first_sum;
}
400
401
/*
 * Rotates both 5-entry pointer arrays left by two positions: entries 2..4
 * move to 0..2, and the former entries 0 and 1 become entries 3 and 4.
 */
static NOINLINE void rotate5_x2(int32_t **sumsq_ptrs, coef **sum_ptrs)
{
    int32_t *const sumsq0 = sumsq_ptrs[0];
    int32_t *const sumsq1 = sumsq_ptrs[1];
    coef *const sum0 = sum_ptrs[0];
    coef *const sum1 = sum_ptrs[1];

    sumsq_ptrs[0] = sumsq_ptrs[2];
    sumsq_ptrs[1] = sumsq_ptrs[3];
    sumsq_ptrs[2] = sumsq_ptrs[4];
    sumsq_ptrs[3] = sumsq0;
    sumsq_ptrs[4] = sumsq1;

    sum_ptrs[0] = sum_ptrs[2];
    sum_ptrs[1] = sum_ptrs[3];
    sum_ptrs[2] = sum_ptrs[4];
    sum_ptrs[3] = sum0;
    sum_ptrs[4] = sum1;
}
418
419
static NOINLINE void sgr_box3_row_h(int32_t *sumsq, coef *sum,
420
                                    const pixel (*left)[4],
421
                                    const pixel *src, const int w,
422
                                    const enum LrEdgeFlags edges)
423
3.70M
{
424
3.70M
    sumsq++;
425
3.70M
    sum++;
426
3.70M
    int a = edges & LR_HAVE_LEFT ? (left ? left[0][2] : src[-2]) : src[0];
427
3.70M
    int b = edges & LR_HAVE_LEFT ? (left ? left[0][3] : src[-1]) : src[0];
428
328M
    for (int x = -1; x < w + 1; x++) {
429
324M
        int c = (x + 1 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 1] : src[w - 1];
430
324M
        sum[x] = a + b + c;
431
324M
        sumsq[x] = a * a + b * b + c * c;
432
324M
        a = b;
433
324M
        b = c;
434
324M
    }
435
3.70M
}
436
437
static NOINLINE void sgr_box5_row_h(int32_t *sumsq, coef *sum,
438
                                    const pixel (*left)[4],
439
                                    const pixel *src, const int w,
440
                                    const enum LrEdgeFlags edges)
441
3.93M
{
442
3.93M
    sumsq++;
443
3.93M
    sum++;
444
3.93M
    int a = edges & LR_HAVE_LEFT ? (left ? left[0][1] : src[-3]) : src[0];
445
3.93M
    int b = edges & LR_HAVE_LEFT ? (left ? left[0][2] : src[-2]) : src[0];
446
3.93M
    int c = edges & LR_HAVE_LEFT ? (left ? left[0][3] : src[-1]) : src[0];
447
3.93M
    int d = src[0];
448
330M
    for (int x = -1; x < w + 1; x++) {
449
326M
        int e = (x + 2 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 2] : src[w - 1];
450
326M
        sum[x] = a + b + c + d + e;
451
326M
        sumsq[x] = a * a + b * b + c * c + d * d + e * e;
452
326M
        a = b;
453
326M
        b = c;
454
326M
        c = d;
455
326M
        d = e;
456
326M
    }
457
3.93M
}
458
459
static void sgr_box35_row_h(int32_t *sumsq3, coef *sum3,
460
                            int32_t *sumsq5, coef *sum5,
461
                            const pixel (*left)[4],
462
                            const pixel *src, const int w,
463
                            const enum LrEdgeFlags edges)
464
2.75M
{
465
2.75M
    sgr_box3_row_h(sumsq3, sum3, left, src, w, edges);
466
2.75M
    sgr_box5_row_h(sumsq5, sum5, left, src, w, edges);
467
2.75M
}
468
469
/*
 * Vertical step of the 3-row box sum: for each of the w + 2 columns, adds
 * up the entries of the three rows sumsq[0..2] / sum[0..2] and stores the
 * totals in sumsq_out / sum_out.
 */
static NOINLINE void sgr_box3_row_v(int32_t **sumsq, coef **sum,
                                    int32_t *sumsq_out, coef *sum_out,
                                    const int w)
{
    const int32_t *const sq0 = sumsq[0];
    const int32_t *const sq1 = sumsq[1];
    const int32_t *const sq2 = sumsq[2];
    const coef *const s0 = sum[0];
    const coef *const s1 = sum[1];
    const coef *const s2 = sum[2];
    const int cols = w + 2;

    for (int x = 0; x < cols; x++) {
        const int sq_total = sq0[x] + sq1[x] + sq2[x];
        const int s_total = s0[x] + s1[x] + s2[x];
        sumsq_out[x] = sq_total;
        sum_out[x] = s_total;
    }
}
484
485
/*
 * Vertical step of the 5-row box sum: for each of the w + 2 columns, adds
 * up the entries of the five rows sumsq[0..4] / sum[0..4] and stores the
 * totals in sumsq_out / sum_out.
 */
static NOINLINE void sgr_box5_row_v(int32_t **sumsq, coef **sum,
                                    int32_t *sumsq_out, coef *sum_out,
                                    const int w)
{
    const int cols = w + 2;

    for (int x = 0; x < cols; x++) {
        // Read all ten inputs for this column before writing the outputs.
        const int sq0 = sumsq[0][x], sq1 = sumsq[1][x], sq2 = sumsq[2][x];
        const int sq3 = sumsq[3][x], sq4 = sumsq[4][x];
        const int s0 = sum[0][x], s1 = sum[1][x], s2 = sum[2][x];
        const int s3 = sum[3][x], s4 = sum[4][x];

        sumsq_out[x] = sq0 + sq1 + sq2 + sq3 + sq4;
        sum_out[x] = s0 + s1 + s2 + s3 + s4;
    }
}
504
505
/*
 * Converts one row of box sums into filter coefficients, in place, for
 * w + 2 columns.
 *
 * On entry AA[i] and BB[i] hold the box sums (the callers in this file pass
 * the sum of squares in AA and the plain sum in BB). Both are first scaled
 * down with rounding according to the bit depth; p = max(a * n - b * b, 0)
 * is scaled by s and used, clamped to 255, as an index into
 * dav1d_sgr_x_by_x. On exit BB[i] holds the looked-up value x and AA[i]
 * holds (x * BB_in[i] * sgr_one_by_x + 2048) >> 12.
 */
static NOINLINE void sgr_calc_row_ab(int32_t *AA, coef *BB, int w, int s,
                                     int bitdepth_max, int n, int sgr_one_by_x)
{
    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    const int cols = w + 2;

    for (int col = 0; col < cols; col++) {
        const int a =
            (AA[col] + ((1 << (2 * bitdepth_min_8)) >> 1)) >> (2 * bitdepth_min_8);
        const int b =
            (BB[col] + ((1 << bitdepth_min_8) >> 1)) >> bitdepth_min_8;

        // Arithmetic below is deliberately carried out in unsigned.
        const unsigned p = imax(a * n - b * b, 0);
        const unsigned z = (p * s + (1 << 19)) >> 20;
        const unsigned x = dav1d_sgr_x_by_x[umin(z, 255)];

        // This is where A and B are inverted, so that B is of size coef.
        // AA must be computed before BB is overwritten.
        AA[col] = (x * BB[col] * sgr_one_by_x + (1 << 11)) >> 12;
        BB[col] = x;
    }
}
524
525
/*
 * Finishes one row of the 3x3 box pass: sums the three stored rows
 * vertically into sumsq_out / sum_out, converts the result in place with
 * sgr_calc_row_ab() (n = 9, sgr_one_by_x = 455), and rotates the three row
 * pointers by one.
 */
static void sgr_box3_vert(int32_t **sumsq, coef **sum,
                          int32_t *sumsq_out, coef *sum_out,
                          const int w, const int s, const int bitdepth_max)
{
    const int box_n = 9;
    const int one_by_x = 455;

    sgr_box3_row_v(sumsq, sum, sumsq_out, sum_out, w);
    sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, box_n, one_by_x);
    rotate(sumsq, sum, 3);
}
533
534
/*
 * Finishes one row of the 5x5 box pass: sums the five stored rows
 * vertically into sumsq_out / sum_out, converts the result in place with
 * sgr_calc_row_ab() (n = 25, sgr_one_by_x = 164), and rotates the five row
 * pointers by two.
 */
static void sgr_box5_vert(int32_t **sumsq, coef **sum,
                          int32_t *sumsq_out, coef *sum_out,
                          const int w, const int s, const int bitdepth_max)
{
    const int box_n = 25;
    const int one_by_x = 164;

    sgr_box5_row_v(sumsq, sum, sumsq_out, sum_out, w);
    sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, box_n, one_by_x);
    rotate5_x2(sumsq, sum);
}
542
543
static void sgr_box3_hv(int32_t **sumsq, coef **sum,
544
                        int32_t *AA, coef *BB,
545
                        const pixel (*left)[4],
546
                        const pixel *src, const int w,
547
                        const int s,
548
                        const enum LrEdgeFlags edges,
549
                        const int bitdepth_max)
550
984k
{
551
984k
    sgr_box3_row_h(sumsq[2], sum[2], left, src, w, edges);
552
984k
    sgr_box3_vert(sumsq, sum, AA, BB, w, s, bitdepth_max);
553
984k
}
554
555
static NOINLINE void sgr_finish_filter_row1(coef *tmp,
556
                                            const pixel *src,
557
                                            int32_t **A_ptrs, coef **B_ptrs,
558
                                            const int w)
559
3.54M
{
560
3.54M
#define EIGHT_NEIGHBORS(P, i)\
561
596M
    ((P[1][i] + P[1][i - 1] + P[1][i + 1] + P[0][i] + P[2][i]) * 4 + \
562
596M
     (P[0][i - 1] + P[2][i - 1] +                           \
563
596M
      P[0][i + 1] + P[2][i + 1]) * 3)
564
301M
    for (int i = 0; i < w; i++) {
565
298M
        const int a = EIGHT_NEIGHBORS(B_ptrs, i + 1);
566
298M
        const int b = EIGHT_NEIGHBORS(A_ptrs, i + 1);
567
298M
        tmp[i] = (b - a * src[i] + (1 << 8)) >> 9;
568
298M
    }
569
3.54M
#undef EIGHT_NEIGHBORS
570
3.54M
}
571
572
9.12M
// Distance, in coef elements, between consecutive rows of the temporary
// filter output buffers written by sgr_finish_filter2() and read by
// sgr_weighted_row1() / sgr_weighted2().
#define FILTER_OUT_STRIDE (384)
573
574
static NOINLINE void sgr_finish_filter2(coef *tmp,
575
                                        const pixel *src,
576
                                        const ptrdiff_t src_stride,
577
                                        int32_t **A_ptrs, coef **B_ptrs,
578
                                        const int w, const int h)
579
1.91M
{
580
1.91M
#define SIX_NEIGHBORS(P, i)\
581
305M
    ((P[0][i]     + P[1][i]) * 6 +   \
582
305M
     (P[0][i - 1] + P[1][i - 1] +    \
583
305M
      P[0][i + 1] + P[1][i + 1]) * 5)
584
154M
    for (int i = 0; i < w; i++) {
585
152M
        const int a = SIX_NEIGHBORS(B_ptrs, i + 1);
586
152M
        const int b = SIX_NEIGHBORS(A_ptrs, i + 1);
587
152M
        tmp[i] = (b - a * src[i] + (1 << 8)) >> 9;
588
152M
    }
589
1.91M
    if (h <= 1)
590
20.0k
        return;
591
1.89M
    tmp += FILTER_OUT_STRIDE;
592
1.89M
    src += PXSTRIDE(src_stride);
593
1.89M
    const int32_t *A = &A_ptrs[1][1];
594
1.89M
    const coef *B = &B_ptrs[1][1];
595
153M
    for (int i = 0; i < w; i++) {
596
151M
        const int a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
597
151M
        const int b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
598
151M
        tmp[i] = (b - a * src[i] + (1 << 7)) >> 8;
599
151M
    }
600
1.89M
#undef SIX_NEIGHBORS
601
1.89M
}
602
603
/*
 * Applies a single weighted correction to one row of w pixels in place:
 *   dst[i] = clip(dst[i] + ((w1 * t1[i] + 1024) >> 11))
 */
static NOINLINE void sgr_weighted_row1(pixel *dst, const coef *t1,
                                       const int w, const int w1 HIGHBD_DECL_SUFFIX)
{
    for (int x = 0; x < w; x++) {
        const int weighted = w1 * t1[x];
        const int delta = (weighted + (1 << 10)) >> 11;
        dst[x] = iclip_pixel(dst[x] + delta);
    }
}
611
612
/*
 * Applies a two-term weighted correction to h rows of w pixels in place:
 *   dst[i] = clip(dst[i] + ((w0 * t1[i] + w1 * t2[i] + 1024) >> 11))
 * Consecutive rows are dst_stride apart in dst and FILTER_OUT_STRIDE
 * elements apart in t1 and t2.
 */
static NOINLINE void sgr_weighted2(pixel *dst, const ptrdiff_t dst_stride,
                                   const coef *t1, const coef *t2,
                                   const int w, const int h,
                                   const int w0, const int w1 HIGHBD_DECL_SUFFIX)
{
    for (int y = 0; y < h; y++) {
        pixel *const out = dst + y * PXSTRIDE(dst_stride);
        const coef *const in1 = t1 + y * FILTER_OUT_STRIDE;
        const coef *const in2 = t2 + y * FILTER_OUT_STRIDE;

        for (int x = 0; x < w; x++) {
            const int weighted = w0 * in1[x] + w1 * in2[x];
            out[x] = iclip_pixel(out[x] + ((weighted + (1 << 10)) >> 11));
        }
    }
}
627
628
static NOINLINE void sgr_finish1(pixel **dst, const ptrdiff_t stride,
629
                                 int32_t **A_ptrs, coef **B_ptrs, const int w,
630
                                 const int w1 HIGHBD_DECL_SUFFIX)
631
956k
{
632
    // Only one single row, no stride needed
633
956k
    ALIGN_STK_16(coef, tmp, 384,);
634
635
956k
    sgr_finish_filter_row1(tmp, *dst, A_ptrs, B_ptrs, w);
636
956k
    sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX);
637
956k
    *dst += PXSTRIDE(stride);
638
956k
    rotate(A_ptrs, B_ptrs, 3);
639
956k
}
640
641
static NOINLINE void sgr_finish2(pixel **dst, const ptrdiff_t stride,
642
                                 int32_t **A_ptrs, coef **B_ptrs,
643
                                 const int w, const int h, const int w1
644
                                 HIGHBD_DECL_SUFFIX)
645
583k
{
646
583k
    ALIGN_STK_16(coef, tmp, 2*FILTER_OUT_STRIDE,);
647
648
583k
    sgr_finish_filter2(tmp, *dst, stride, A_ptrs, B_ptrs, w, h);
649
583k
    sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX);
650
583k
    *dst += PXSTRIDE(stride);
651
583k
    if (h > 1) {
652
574k
        sgr_weighted_row1(*dst, tmp + FILTER_OUT_STRIDE, w, w1 HIGHBD_TAIL_SUFFIX);
653
574k
        *dst += PXSTRIDE(stride);
654
574k
    }
655
583k
    rotate(A_ptrs, B_ptrs, 2);
656
583k
}
657
658
/*
 * Filters and writes back h (one or two) output rows starting at *dst,
 * mixing two filter outputs:
 *   - tmp5 from sgr_finish_filter2() on the A5/B5 coefficient rows,
 *   - tmp3 from sgr_finish_filter_row1() on the A3/B3 coefficient rows (the
 *     second row, when h > 1, uses the window starting one row further).
 * sgr_weighted2() blends both into the pixels with weights w0 (tmp5) and
 * w1 (tmp3). Afterwards *dst is advanced by h rows, the A5/B5 pointers are
 * rotated as a set of 2 and the A3/B3 pointers as a set of 4.
 */
static NOINLINE void sgr_finish_mix(pixel **dst, const ptrdiff_t stride,
                                    int32_t **A5_ptrs, coef **B5_ptrs,
                                    int32_t **A3_ptrs, coef **B3_ptrs,
                                    const int w, const int h,
                                    const int w0, const int w1 HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(coef, tmp5, 2*FILTER_OUT_STRIDE,);
    ALIGN_STK_16(coef, tmp3, 2*FILTER_OUT_STRIDE,);
    pixel *const first_row = *dst;

    sgr_finish_filter2(tmp5, first_row, stride, A5_ptrs, B5_ptrs, w, h);
    sgr_finish_filter_row1(tmp3, first_row, A3_ptrs, B3_ptrs, w);
    if (h > 1) {
        sgr_finish_filter_row1(tmp3 + FILTER_OUT_STRIDE,
                               first_row + PXSTRIDE(stride),
                               &A3_ptrs[1], &B3_ptrs[1], w);
    }

    sgr_weighted2(first_row, stride, tmp5, tmp3, w, h, w0, w1 HIGHBD_TAIL_SUFFIX);

    *dst = first_row + h*PXSTRIDE(stride);
    rotate(A5_ptrs, B5_ptrs, 2);
    rotate(A3_ptrs, B3_ptrs, 4);
}
677
678
679
static void sgr_3x3_c(pixel *dst, const ptrdiff_t stride,
680
                      const pixel (*left)[4], const pixel *lpf,
681
                      const int w, int h,
682
                      const LooprestorationParams *const params,
683
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
684
24.5k
{
685
2.68M
#define BUF_STRIDE (384 + 16)
686
24.5k
    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 3 + 16,);
687
24.5k
    ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 3 + 16,);
688
24.5k
    int32_t *sumsq_ptrs[3], *sumsq_rows[3];
689
24.5k
    coef *sum_ptrs[3], *sum_rows[3];
690
98.0k
    for (int i = 0; i < 3; i++) {
691
73.5k
        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
692
73.5k
        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
693
73.5k
    }
694
695
24.5k
    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 3 + 16,);
696
24.5k
    ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 3 + 16,);
697
24.5k
    int32_t *A_ptrs[3];
698
24.5k
    coef *B_ptrs[3];
699
98.0k
    for (int i = 0; i < 3; i++) {
700
73.5k
        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
701
73.5k
        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
702
73.5k
    }
703
24.5k
    const pixel *src = dst;
704
24.5k
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
705
706
24.5k
    if (edges & LR_HAVE_TOP) {
707
17.3k
        sumsq_ptrs[0] = sumsq_rows[0];
708
17.3k
        sumsq_ptrs[1] = sumsq_rows[1];
709
17.3k
        sumsq_ptrs[2] = sumsq_rows[2];
710
17.3k
        sum_ptrs[0] = sum_rows[0];
711
17.3k
        sum_ptrs[1] = sum_rows[1];
712
17.3k
        sum_ptrs[2] = sum_rows[2];
713
714
17.3k
        sgr_box3_row_h(sumsq_rows[0], sum_rows[0], NULL, lpf, w, edges);
715
17.3k
        lpf += PXSTRIDE(stride);
716
17.3k
        sgr_box3_row_h(sumsq_rows[1], sum_rows[1], NULL, lpf, w, edges);
717
718
17.3k
        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
719
17.3k
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
720
17.3k
        left++;
721
17.3k
        src += PXSTRIDE(stride);
722
17.3k
        rotate(A_ptrs, B_ptrs, 3);
723
724
17.3k
        if (--h <= 0)
725
311
            goto vert_1;
726
727
17.0k
        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
728
17.0k
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
729
17.0k
        left++;
730
17.0k
        src += PXSTRIDE(stride);
731
17.0k
        rotate(A_ptrs, B_ptrs, 3);
732
733
17.0k
        if (--h <= 0)
734
250
            goto vert_2;
735
17.0k
    } else {
736
7.17k
        sumsq_ptrs[0] = sumsq_rows[0];
737
7.17k
        sumsq_ptrs[1] = sumsq_rows[0];
738
7.17k
        sumsq_ptrs[2] = sumsq_rows[0];
739
7.17k
        sum_ptrs[0] = sum_rows[0];
740
7.17k
        sum_ptrs[1] = sum_rows[0];
741
7.17k
        sum_ptrs[2] = sum_rows[0];
742
743
7.17k
        sgr_box3_row_h(sumsq_rows[0], sum_rows[0], left, src, w, edges);
744
7.17k
        left++;
745
7.17k
        src += PXSTRIDE(stride);
746
747
7.17k
        sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
748
7.17k
                      w, params->sgr.s1, BITDEPTH_MAX);
749
7.17k
        rotate(A_ptrs, B_ptrs, 3);
750
751
7.17k
        if (--h <= 0)
752
1.18k
            goto vert_1;
753
754
5.98k
        sumsq_ptrs[2] = sumsq_rows[1];
755
5.98k
        sum_ptrs[2] = sum_rows[1];
756
757
5.98k
        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
758
5.98k
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
759
5.98k
        left++;
760
5.98k
        src += PXSTRIDE(stride);
761
5.98k
        rotate(A_ptrs, B_ptrs, 3);
762
763
5.98k
        if (--h <= 0)
764
1.43k
            goto vert_2;
765
766
4.55k
        sumsq_ptrs[2] = sumsq_rows[2];
767
4.55k
        sum_ptrs[2] = sum_rows[2];
768
4.55k
    }
769
770
920k
    do {
771
920k
        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
772
920k
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
773
920k
        left++;
774
920k
        src += PXSTRIDE(stride);
775
776
920k
        sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
777
920k
                    w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
778
920k
    } while (--h > 0);
779
780
21.3k
    if (!(edges & LR_HAVE_BOTTOM))
781
4.41k
        goto vert_2;
782
783
16.9k
    sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
784
16.9k
                NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
785
16.9k
    lpf_bottom += PXSTRIDE(stride);
786
787
16.9k
    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
788
16.9k
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
789
790
16.9k
    sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
791
16.9k
                NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
792
793
16.9k
    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
794
16.9k
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
795
16.9k
    return;
796
797
6.09k
vert_2:
798
6.09k
    sumsq_ptrs[2] = sumsq_ptrs[1];
799
6.09k
    sum_ptrs[2] = sum_ptrs[1];
800
6.09k
    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
801
6.09k
                  w, params->sgr.s1, BITDEPTH_MAX);
802
803
6.09k
    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
804
6.09k
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
805
806
7.59k
output_1:
807
7.59k
    sumsq_ptrs[2] = sumsq_ptrs[1];
808
7.59k
    sum_ptrs[2] = sum_ptrs[1];
809
7.59k
    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
810
7.59k
                  w, params->sgr.s1, BITDEPTH_MAX);
811
812
7.59k
    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
813
7.59k
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
814
7.59k
    return;
815
816
1.50k
vert_1:
817
1.50k
    sumsq_ptrs[2] = sumsq_ptrs[1];
818
1.50k
    sum_ptrs[2] = sum_ptrs[1];
819
1.50k
    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
820
1.50k
                  w, params->sgr.s1, BITDEPTH_MAX);
821
1.50k
    rotate(A_ptrs, B_ptrs, 3);
822
1.50k
    goto output_1;
823
6.09k
}
824
825
static void sgr_5x5_c(pixel *dst, const ptrdiff_t stride,
826
                      const pixel (*left)[4], const pixel *lpf,
827
                      const int w, int h,
828
                      const LooprestorationParams *const params,
829
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
830
30.1k
{
831
30.1k
    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 5 + 16,);
832
30.1k
    ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 5 + 16,);
833
30.1k
    int32_t *sumsq_ptrs[5], *sumsq_rows[5];
834
30.1k
    coef *sum_ptrs[5], *sum_rows[5];
835
181k
    for (int i = 0; i < 5; i++) {
836
150k
        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
837
150k
        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
838
150k
    }
839
840
30.1k
    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 2 + 16,);
841
30.1k
    ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 2 + 16,);
842
30.1k
    int32_t *A_ptrs[2];
843
30.1k
    coef *B_ptrs[2];
844
90.5k
    for (int i = 0; i < 2; i++) {
845
60.4k
        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
846
60.4k
        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
847
60.4k
    }
848
30.1k
    const pixel *src = dst;
849
30.1k
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
850
851
30.1k
    if (edges & LR_HAVE_TOP) {
852
19.0k
        sumsq_ptrs[0] = sumsq_rows[0];
853
19.0k
        sumsq_ptrs[1] = sumsq_rows[0];
854
19.0k
        sumsq_ptrs[2] = sumsq_rows[1];
855
19.0k
        sumsq_ptrs[3] = sumsq_rows[2];
856
19.0k
        sumsq_ptrs[4] = sumsq_rows[3];
857
19.0k
        sum_ptrs[0] = sum_rows[0];
858
19.0k
        sum_ptrs[1] = sum_rows[0];
859
19.0k
        sum_ptrs[2] = sum_rows[1];
860
19.0k
        sum_ptrs[3] = sum_rows[2];
861
19.0k
        sum_ptrs[4] = sum_rows[3];
862
863
19.0k
        sgr_box5_row_h(sumsq_rows[0], sum_rows[0], NULL, lpf, w, edges);
864
19.0k
        lpf += PXSTRIDE(stride);
865
19.0k
        sgr_box5_row_h(sumsq_rows[1], sum_rows[1], NULL, lpf, w, edges);
866
867
19.0k
        sgr_box5_row_h(sumsq_rows[2], sum_rows[2], left, src, w, edges);
868
19.0k
        left++;
869
19.0k
        src += PXSTRIDE(stride);
870
871
19.0k
        if (--h <= 0)
872
266
            goto vert_1;
873
874
18.7k
        sgr_box5_row_h(sumsq_rows[3], sum_rows[3], left, src, w, edges);
875
18.7k
        left++;
876
18.7k
        src += PXSTRIDE(stride);
877
18.7k
        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
878
18.7k
                      w, params->sgr.s0, BITDEPTH_MAX);
879
18.7k
        rotate(A_ptrs, B_ptrs, 2);
880
881
18.7k
        if (--h <= 0)
882
622
            goto vert_2;
883
884
        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
885
        // one of them to point at the previously unused rows[4].
886
18.1k
        sumsq_ptrs[3] = sumsq_rows[4];
887
18.1k
        sum_ptrs[3] = sum_rows[4];
888
18.1k
    } else {
889
11.1k
        sumsq_ptrs[0] = sumsq_rows[0];
890
11.1k
        sumsq_ptrs[1] = sumsq_rows[0];
891
11.1k
        sumsq_ptrs[2] = sumsq_rows[0];
892
11.1k
        sumsq_ptrs[3] = sumsq_rows[0];
893
11.1k
        sumsq_ptrs[4] = sumsq_rows[0];
894
11.1k
        sum_ptrs[0] = sum_rows[0];
895
11.1k
        sum_ptrs[1] = sum_rows[0];
896
11.1k
        sum_ptrs[2] = sum_rows[0];
897
11.1k
        sum_ptrs[3] = sum_rows[0];
898
11.1k
        sum_ptrs[4] = sum_rows[0];
899
900
11.1k
        sgr_box5_row_h(sumsq_rows[0], sum_rows[0], left, src, w, edges);
901
11.1k
        left++;
902
11.1k
        src += PXSTRIDE(stride);
903
904
11.1k
        if (--h <= 0)
905
1.72k
            goto vert_1;
906
907
9.46k
        sumsq_ptrs[4] = sumsq_rows[1];
908
9.46k
        sum_ptrs[4] = sum_rows[1];
909
910
9.46k
        sgr_box5_row_h(sumsq_rows[1], sum_rows[1], left, src, w, edges);
911
9.46k
        left++;
912
9.46k
        src += PXSTRIDE(stride);
913
914
9.46k
        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
915
9.46k
                      w, params->sgr.s0, BITDEPTH_MAX);
916
9.46k
        rotate(A_ptrs, B_ptrs, 2);
917
918
9.46k
        if (--h <= 0)
919
1.53k
            goto vert_2;
920
921
7.93k
        sumsq_ptrs[3] = sumsq_rows[2];
922
7.93k
        sumsq_ptrs[4] = sumsq_rows[3];
923
7.93k
        sum_ptrs[3] = sum_rows[2];
924
7.93k
        sum_ptrs[4] = sum_rows[3];
925
926
7.93k
        sgr_box5_row_h(sumsq_rows[2], sum_rows[2], left, src, w, edges);
927
7.93k
        left++;
928
7.93k
        src += PXSTRIDE(stride);
929
930
7.93k
        if (--h <= 0)
931
1.03k
            goto odd;
932
933
6.89k
        sgr_box5_row_h(sumsq_rows[3], sum_rows[3], left, src, w, edges);
934
6.89k
        left++;
935
6.89k
        src += PXSTRIDE(stride);
936
937
6.89k
        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
938
6.89k
                      w, params->sgr.s0, BITDEPTH_MAX);
939
6.89k
        sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
940
6.89k
                    w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
941
942
6.89k
        if (--h <= 0)
943
691
            goto vert_2;
944
945
        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
946
        // one of them to point at the previously unused rows[4].
947
6.20k
        sumsq_ptrs[3] = sumsq_rows[4];
948
6.20k
        sum_ptrs[3] = sum_rows[4];
949
6.20k
    }
950
951
541k
    do {
952
541k
        sgr_box5_row_h(sumsq_ptrs[3], sum_ptrs[3], left, src, w, edges);
953
541k
        left++;
954
541k
        src += PXSTRIDE(stride);
955
956
541k
        if (--h <= 0)
957
2.81k
            goto odd;
958
959
538k
        sgr_box5_row_h(sumsq_ptrs[4], sum_ptrs[4], left, src, w, edges);
960
538k
        left++;
961
538k
        src += PXSTRIDE(stride);
962
963
538k
        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
964
538k
                      w, params->sgr.s0, BITDEPTH_MAX);
965
538k
        sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
966
538k
                    w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
967
538k
    } while (--h > 0);
968
969
21.5k
    if (!(edges & LR_HAVE_BOTTOM))
970
1.96k
        goto vert_2;
971
972
19.5k
    sgr_box5_row_h(sumsq_ptrs[3], sum_ptrs[3], NULL, lpf_bottom, w, edges);
973
19.5k
    lpf_bottom += PXSTRIDE(stride);
974
19.5k
    sgr_box5_row_h(sumsq_ptrs[4], sum_ptrs[4], NULL, lpf_bottom, w, edges);
975
976
24.3k
output_2:
977
24.3k
    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
978
24.3k
                  w, params->sgr.s0, BITDEPTH_MAX);
979
24.3k
    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
980
24.3k
                w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
981
24.3k
    return;
982
983
4.81k
vert_2:
984
    // Duplicate the last row twice more
985
4.81k
    sumsq_ptrs[3] = sumsq_ptrs[2];
986
4.81k
    sumsq_ptrs[4] = sumsq_ptrs[2];
987
4.81k
    sum_ptrs[3] = sum_ptrs[2];
988
4.81k
    sum_ptrs[4] = sum_ptrs[2];
989
4.81k
    goto output_2;
990
991
3.85k
odd:
992
    // Copy the last row as padding once
993
3.85k
    sumsq_ptrs[4] = sumsq_ptrs[3];
994
3.85k
    sum_ptrs[4] = sum_ptrs[3];
995
996
3.85k
    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
997
3.85k
                  w, params->sgr.s0, BITDEPTH_MAX);
998
3.85k
    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
999
3.85k
                w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
1000
1001
5.84k
output_1:
1002
    // Duplicate the last row twice more
1003
5.84k
    sumsq_ptrs[3] = sumsq_ptrs[2];
1004
5.84k
    sumsq_ptrs[4] = sumsq_ptrs[2];
1005
5.84k
    sum_ptrs[3] = sum_ptrs[2];
1006
5.84k
    sum_ptrs[4] = sum_ptrs[2];
1007
1008
5.84k
    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
1009
5.84k
                  w, params->sgr.s0, BITDEPTH_MAX);
1010
    // Output only one row
1011
5.84k
    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
1012
5.84k
                w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
1013
5.84k
    return;
1014
1015
1.98k
vert_1:
1016
    // Copy the last row as padding once
1017
1.98k
    sumsq_ptrs[4] = sumsq_ptrs[3];
1018
1.98k
    sum_ptrs[4] = sum_ptrs[3];
1019
1020
1.98k
    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
1021
1.98k
                  w, params->sgr.s0, BITDEPTH_MAX);
1022
1.98k
    rotate(A_ptrs, B_ptrs, 2);
1023
1024
1.98k
    goto output_1;
1025
3.85k
}
1026
1027
static void sgr_mix_c(pixel *dst, const ptrdiff_t stride,
1028
                      const pixel (*left)[4], const pixel *lpf,
1029
                      const int w, int h,
1030
                      const LooprestorationParams *const params,
1031
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
1032
70.1k
{
1033
70.1k
    ALIGN_STK_16(int32_t, sumsq5_buf, BUF_STRIDE * 5 + 16,);
1034
70.1k
    ALIGN_STK_16(coef, sum5_buf, BUF_STRIDE * 5 + 16,);
1035
70.1k
    int32_t *sumsq5_ptrs[5], *sumsq5_rows[5];
1036
70.1k
    coef *sum5_ptrs[5], *sum5_rows[5];
1037
420k
    for (int i = 0; i < 5; i++) {
1038
350k
        sumsq5_rows[i] = &sumsq5_buf[i * BUF_STRIDE];
1039
350k
        sum5_rows[i] = &sum5_buf[i * BUF_STRIDE];
1040
350k
    }
1041
70.1k
    ALIGN_STK_16(int32_t, sumsq3_buf, BUF_STRIDE * 3 + 16,);
1042
70.1k
    ALIGN_STK_16(coef, sum3_buf, BUF_STRIDE * 3 + 16,);
1043
70.1k
    int32_t *sumsq3_ptrs[3], *sumsq3_rows[3];
1044
70.1k
    coef *sum3_ptrs[3], *sum3_rows[3];
1045
280k
    for (int i = 0; i < 3; i++) {
1046
210k
        sumsq3_rows[i] = &sumsq3_buf[i * BUF_STRIDE];
1047
210k
        sum3_rows[i] = &sum3_buf[i * BUF_STRIDE];
1048
210k
    }
1049
1050
70.1k
    ALIGN_STK_16(int32_t, A5_buf, BUF_STRIDE * 2 + 16,);
1051
70.1k
    ALIGN_STK_16(coef, B5_buf, BUF_STRIDE * 2 + 16,);
1052
70.1k
    int32_t *A5_ptrs[2];
1053
70.1k
    coef *B5_ptrs[2];
1054
210k
    for (int i = 0; i < 2; i++) {
1055
140k
        A5_ptrs[i] = &A5_buf[i * BUF_STRIDE];
1056
140k
        B5_ptrs[i] = &B5_buf[i * BUF_STRIDE];
1057
140k
    }
1058
70.1k
    ALIGN_STK_16(int32_t, A3_buf, BUF_STRIDE * 4 + 16,);
1059
70.1k
    ALIGN_STK_16(coef, B3_buf, BUF_STRIDE * 4 + 16,);
1060
70.1k
    int32_t *A3_ptrs[4];
1061
70.1k
    coef *B3_ptrs[4];
1062
350k
    for (int i = 0; i < 4; i++) {
1063
280k
        A3_ptrs[i] = &A3_buf[i * BUF_STRIDE];
1064
280k
        B3_ptrs[i] = &B3_buf[i * BUF_STRIDE];
1065
280k
    }
1066
70.1k
    const pixel *src = dst;
1067
70.1k
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
1068
1069
70.1k
    if (edges & LR_HAVE_TOP) {
1070
45.8k
        sumsq5_ptrs[0] = sumsq5_rows[0];
1071
45.8k
        sumsq5_ptrs[1] = sumsq5_rows[0];
1072
45.8k
        sumsq5_ptrs[2] = sumsq5_rows[1];
1073
45.8k
        sumsq5_ptrs[3] = sumsq5_rows[2];
1074
45.8k
        sumsq5_ptrs[4] = sumsq5_rows[3];
1075
45.8k
        sum5_ptrs[0] = sum5_rows[0];
1076
45.8k
        sum5_ptrs[1] = sum5_rows[0];
1077
45.8k
        sum5_ptrs[2] = sum5_rows[1];
1078
45.8k
        sum5_ptrs[3] = sum5_rows[2];
1079
45.8k
        sum5_ptrs[4] = sum5_rows[3];
1080
1081
45.8k
        sumsq3_ptrs[0] = sumsq3_rows[0];
1082
45.8k
        sumsq3_ptrs[1] = sumsq3_rows[1];
1083
45.8k
        sumsq3_ptrs[2] = sumsq3_rows[2];
1084
45.8k
        sum3_ptrs[0] = sum3_rows[0];
1085
45.8k
        sum3_ptrs[1] = sum3_rows[1];
1086
45.8k
        sum3_ptrs[2] = sum3_rows[2];
1087
1088
45.8k
        sgr_box35_row_h(sumsq3_rows[0], sum3_rows[0],
1089
45.8k
                        sumsq5_rows[0], sum5_rows[0],
1090
45.8k
                        NULL, lpf, w, edges);
1091
45.8k
        lpf += PXSTRIDE(stride);
1092
45.8k
        sgr_box35_row_h(sumsq3_rows[1], sum3_rows[1],
1093
45.8k
                        sumsq5_rows[1], sum5_rows[1],
1094
45.8k
                        NULL, lpf, w, edges);
1095
1096
45.8k
        sgr_box35_row_h(sumsq3_rows[2], sum3_rows[2],
1097
45.8k
                        sumsq5_rows[2], sum5_rows[2],
1098
45.8k
                        left, src, w, edges);
1099
45.8k
        left++;
1100
45.8k
        src += PXSTRIDE(stride);
1101
1102
45.8k
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1103
45.8k
                      w, params->sgr.s1, BITDEPTH_MAX);
1104
45.8k
        rotate(A3_ptrs, B3_ptrs, 4);
1105
1106
45.8k
        if (--h <= 0)
1107
571
            goto vert_1;
1108
1109
45.3k
        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1110
45.3k
                        sumsq5_rows[3], sum5_rows[3],
1111
45.3k
                        left, src, w, edges);
1112
45.3k
        left++;
1113
45.3k
        src += PXSTRIDE(stride);
1114
45.3k
        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1115
45.3k
                      w, params->sgr.s0, BITDEPTH_MAX);
1116
45.3k
        rotate(A5_ptrs, B5_ptrs, 2);
1117
45.3k
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1118
45.3k
                      w, params->sgr.s1, BITDEPTH_MAX);
1119
45.3k
        rotate(A3_ptrs, B3_ptrs, 4);
1120
1121
45.3k
        if (--h <= 0)
1122
419
            goto vert_2;
1123
1124
        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
1125
        // one of them to point at the previously unused rows[4].
1126
44.8k
        sumsq5_ptrs[3] = sumsq5_rows[4];
1127
44.8k
        sum5_ptrs[3] = sum5_rows[4];
1128
44.8k
    } else {
1129
24.2k
        sumsq5_ptrs[0] = sumsq5_rows[0];
1130
24.2k
        sumsq5_ptrs[1] = sumsq5_rows[0];
1131
24.2k
        sumsq5_ptrs[2] = sumsq5_rows[0];
1132
24.2k
        sumsq5_ptrs[3] = sumsq5_rows[0];
1133
24.2k
        sumsq5_ptrs[4] = sumsq5_rows[0];
1134
24.2k
        sum5_ptrs[0] = sum5_rows[0];
1135
24.2k
        sum5_ptrs[1] = sum5_rows[0];
1136
24.2k
        sum5_ptrs[2] = sum5_rows[0];
1137
24.2k
        sum5_ptrs[3] = sum5_rows[0];
1138
24.2k
        sum5_ptrs[4] = sum5_rows[0];
1139
1140
24.2k
        sumsq3_ptrs[0] = sumsq3_rows[0];
1141
24.2k
        sumsq3_ptrs[1] = sumsq3_rows[0];
1142
24.2k
        sumsq3_ptrs[2] = sumsq3_rows[0];
1143
24.2k
        sum3_ptrs[0] = sum3_rows[0];
1144
24.2k
        sum3_ptrs[1] = sum3_rows[0];
1145
24.2k
        sum3_ptrs[2] = sum3_rows[0];
1146
1147
24.2k
        sgr_box35_row_h(sumsq3_rows[0], sum3_rows[0],
1148
24.2k
                        sumsq5_rows[0], sum5_rows[0],
1149
24.2k
                        left, src, w, edges);
1150
24.2k
        left++;
1151
24.2k
        src += PXSTRIDE(stride);
1152
1153
24.2k
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1154
24.2k
                      w, params->sgr.s1, BITDEPTH_MAX);
1155
24.2k
        rotate(A3_ptrs, B3_ptrs, 4);
1156
1157
24.2k
        if (--h <= 0)
1158
2.94k
            goto vert_1;
1159
1160
21.2k
        sumsq5_ptrs[4] = sumsq5_rows[1];
1161
21.2k
        sum5_ptrs[4] = sum5_rows[1];
1162
1163
21.2k
        sumsq3_ptrs[2] = sumsq3_rows[1];
1164
21.2k
        sum3_ptrs[2] = sum3_rows[1];
1165
1166
21.2k
        sgr_box35_row_h(sumsq3_rows[1], sum3_rows[1],
1167
21.2k
                        sumsq5_rows[1], sum5_rows[1],
1168
21.2k
                        left, src, w, edges);
1169
21.2k
        left++;
1170
21.2k
        src += PXSTRIDE(stride);
1171
1172
21.2k
        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1173
21.2k
                      w, params->sgr.s0, BITDEPTH_MAX);
1174
21.2k
        rotate(A5_ptrs, B5_ptrs, 2);
1175
21.2k
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1176
21.2k
                      w, params->sgr.s1, BITDEPTH_MAX);
1177
21.2k
        rotate(A3_ptrs, B3_ptrs, 4);
1178
1179
21.2k
        if (--h <= 0)
1180
5.15k
            goto vert_2;
1181
1182
16.1k
        sumsq5_ptrs[3] = sumsq5_rows[2];
1183
16.1k
        sumsq5_ptrs[4] = sumsq5_rows[3];
1184
16.1k
        sum5_ptrs[3] = sum5_rows[2];
1185
16.1k
        sum5_ptrs[4] = sum5_rows[3];
1186
1187
16.1k
        sumsq3_ptrs[2] = sumsq3_rows[2];
1188
16.1k
        sum3_ptrs[2] = sum3_rows[2];
1189
1190
16.1k
        sgr_box35_row_h(sumsq3_rows[2], sum3_rows[2],
1191
16.1k
                        sumsq5_rows[2], sum5_rows[2],
1192
16.1k
                        left, src, w, edges);
1193
16.1k
        left++;
1194
16.1k
        src += PXSTRIDE(stride);
1195
1196
16.1k
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1197
16.1k
                      w, params->sgr.s1, BITDEPTH_MAX);
1198
16.1k
        rotate(A3_ptrs, B3_ptrs, 4);
1199
1200
16.1k
        if (--h <= 0)
1201
1.89k
            goto odd;
1202
1203
14.2k
        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1204
14.2k
                        sumsq5_rows[3], sum5_rows[3],
1205
14.2k
                        left, src, w, edges);
1206
14.2k
        left++;
1207
14.2k
        src += PXSTRIDE(stride);
1208
1209
14.2k
        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1210
14.2k
                      w, params->sgr.s0, BITDEPTH_MAX);
1211
14.2k
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1212
14.2k
                      w, params->sgr.s1, BITDEPTH_MAX);
1213
14.2k
        sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1214
14.2k
                       w, 2, params->sgr.w0, params->sgr.w1
1215
14.2k
                       HIGHBD_TAIL_SUFFIX);
1216
1217
14.2k
        if (--h <= 0)
1218
1.36k
            goto vert_2;
1219
1220
        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
1221
        // one of them to point at the previously unused rows[4].
1222
12.8k
        sumsq5_ptrs[3] = sumsq5_rows[4];
1223
12.8k
        sum5_ptrs[3] = sum5_rows[4];
1224
12.8k
    }
1225
1226
1.25M
    do {
1227
1.25M
        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1228
1.25M
                        sumsq5_ptrs[3], sum5_ptrs[3],
1229
1.25M
                        left, src, w, edges);
1230
1.25M
        left++;
1231
1.25M
        src += PXSTRIDE(stride);
1232
1233
1.25M
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1234
1.25M
                      w, params->sgr.s1, BITDEPTH_MAX);
1235
1.25M
        rotate(A3_ptrs, B3_ptrs, 4);
1236
1237
1.25M
        if (--h <= 0)
1238
8.77k
            goto odd;
1239
1240
1.24M
        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1241
1.24M
                        sumsq5_ptrs[4], sum5_ptrs[4],
1242
1.24M
                        left, src, w, edges);
1243
1.24M
        left++;
1244
1.24M
        src += PXSTRIDE(stride);
1245
1246
1.24M
        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1247
1.24M
                      w, params->sgr.s0, BITDEPTH_MAX);
1248
1.24M
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1249
1.24M
                      w, params->sgr.s1, BITDEPTH_MAX);
1250
1.24M
        sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1251
1.24M
                       w, 2, params->sgr.w0, params->sgr.w1
1252
1.24M
                       HIGHBD_TAIL_SUFFIX);
1253
1.24M
    } while (--h > 0);
1254
1255
48.9k
    if (!(edges & LR_HAVE_BOTTOM))
1256
3.10k
        goto vert_2;
1257
1258
45.8k
    sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1259
45.8k
                    sumsq5_ptrs[3], sum5_ptrs[3],
1260
45.8k
                    NULL, lpf_bottom, w, edges);
1261
45.8k
    lpf_bottom += PXSTRIDE(stride);
1262
45.8k
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1263
45.8k
                  w, params->sgr.s1, BITDEPTH_MAX);
1264
45.8k
    rotate(A3_ptrs, B3_ptrs, 4);
1265
1266
45.8k
    sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1267
45.8k
                    sumsq5_ptrs[4], sum5_ptrs[4],
1268
45.8k
                    NULL, lpf_bottom, w, edges);
1269
1270
55.8k
output_2:
1271
55.8k
    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1272
55.8k
                  w, params->sgr.s0, BITDEPTH_MAX);
1273
55.8k
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1274
55.8k
                  w, params->sgr.s1, BITDEPTH_MAX);
1275
55.8k
    sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1276
55.8k
                   w, 2, params->sgr.w0, params->sgr.w1
1277
55.8k
                   HIGHBD_TAIL_SUFFIX);
1278
55.8k
    return;
1279
1280
10.0k
vert_2:
1281
    // Duplicate the last row twice more
1282
10.0k
    sumsq5_ptrs[3] = sumsq5_ptrs[2];
1283
10.0k
    sumsq5_ptrs[4] = sumsq5_ptrs[2];
1284
10.0k
    sum5_ptrs[3] = sum5_ptrs[2];
1285
10.0k
    sum5_ptrs[4] = sum5_ptrs[2];
1286
1287
10.0k
    sumsq3_ptrs[2] = sumsq3_ptrs[1];
1288
10.0k
    sum3_ptrs[2] = sum3_ptrs[1];
1289
10.0k
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1290
10.0k
                  w, params->sgr.s1, BITDEPTH_MAX);
1291
10.0k
    rotate(A3_ptrs, B3_ptrs, 4);
1292
1293
10.0k
    sumsq3_ptrs[2] = sumsq3_ptrs[1];
1294
10.0k
    sum3_ptrs[2] = sum3_ptrs[1];
1295
1296
10.0k
    goto output_2;
1297
1298
10.6k
odd:
1299
    // Copy the last row as padding once
1300
10.6k
    sumsq5_ptrs[4] = sumsq5_ptrs[3];
1301
10.6k
    sum5_ptrs[4] = sum5_ptrs[3];
1302
1303
10.6k
    sumsq3_ptrs[2] = sumsq3_ptrs[1];
1304
10.6k
    sum3_ptrs[2] = sum3_ptrs[1];
1305
1306
10.6k
    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1307
10.6k
                  w, params->sgr.s0, BITDEPTH_MAX);
1308
10.6k
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1309
10.6k
                  w, params->sgr.s1, BITDEPTH_MAX);
1310
10.6k
    sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1311
10.6k
                   w, 2, params->sgr.w0, params->sgr.w1
1312
10.6k
                   HIGHBD_TAIL_SUFFIX);
1313
1314
14.1k
output_1:
1315
    // Duplicate the last row twice more
1316
14.1k
    sumsq5_ptrs[3] = sumsq5_ptrs[2];
1317
14.1k
    sumsq5_ptrs[4] = sumsq5_ptrs[2];
1318
14.1k
    sum5_ptrs[3] = sum5_ptrs[2];
1319
14.1k
    sum5_ptrs[4] = sum5_ptrs[2];
1320
1321
14.1k
    sumsq3_ptrs[2] = sumsq3_ptrs[1];
1322
14.1k
    sum3_ptrs[2] = sum3_ptrs[1];
1323
1324
14.1k
    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1325
14.1k
                  w, params->sgr.s0, BITDEPTH_MAX);
1326
14.1k
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1327
14.1k
                  w, params->sgr.s1, BITDEPTH_MAX);
1328
14.1k
    rotate(A3_ptrs, B3_ptrs, 4);
1329
    // Output only one row
1330
14.1k
    sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1331
14.1k
                   w, 1, params->sgr.w0, params->sgr.w1
1332
14.1k
                   HIGHBD_TAIL_SUFFIX);
1333
14.1k
    return;
1334
1335
3.51k
vert_1:
1336
    // Copy the last row as padding once
1337
3.51k
    sumsq5_ptrs[4] = sumsq5_ptrs[3];
1338
3.51k
    sum5_ptrs[4] = sum5_ptrs[3];
1339
1340
3.51k
    sumsq3_ptrs[2] = sumsq3_ptrs[1];
1341
3.51k
    sum3_ptrs[2] = sum3_ptrs[1];
1342
1343
3.51k
    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1344
3.51k
                  w, params->sgr.s0, BITDEPTH_MAX);
1345
3.51k
    rotate(A5_ptrs, B5_ptrs, 2);
1346
3.51k
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1347
3.51k
                  w, params->sgr.s1, BITDEPTH_MAX);
1348
3.51k
    rotate(A3_ptrs, B3_ptrs, 4);
1349
1350
3.51k
    goto output_1;
1351
10.6k
}
1352
1353
#if HAVE_ASM
1354
#if ARCH_AARCH64 || ARCH_ARM
1355
#include "src/arm/looprestoration.h"
1356
#elif ARCH_LOONGARCH64
1357
#include "src/loongarch/looprestoration.h"
1358
#elif ARCH_PPC64LE
1359
#include "src/ppc/looprestoration.h"
1360
#elif ARCH_X86
1361
#include "src/x86/looprestoration.h"
1362
#endif
1363
#endif
1364
1365
COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c,
1366
                                                 const int bpc)
1367
68.7k
{
1368
68.7k
    c->wiener[0] = c->wiener[1] = wiener_c;
1369
68.7k
    c->sgr[0] = sgr_5x5_c;
1370
68.7k
    c->sgr[1] = sgr_3x3_c;
1371
68.7k
    c->sgr[2] = sgr_mix_c;
1372
1373
#if HAVE_ASM
1374
#if ARCH_AARCH64 || ARCH_ARM
1375
    loop_restoration_dsp_init_arm(c, bpc);
1376
#elif ARCH_LOONGARCH64
1377
    loop_restoration_dsp_init_loongarch(c, bpc);
1378
#elif ARCH_PPC64LE
1379
    loop_restoration_dsp_init_ppc(c, bpc);
1380
#elif ARCH_X86
1381
    loop_restoration_dsp_init_x86(c, bpc);
1382
#endif
1383
#endif
1384
68.7k
}
dav1d_loop_restoration_dsp_init_8bpc
Line
Count
Source
1367
30.8k
{
1368
30.8k
    c->wiener[0] = c->wiener[1] = wiener_c;
1369
30.8k
    c->sgr[0] = sgr_5x5_c;
1370
30.8k
    c->sgr[1] = sgr_3x3_c;
1371
30.8k
    c->sgr[2] = sgr_mix_c;
1372
1373
#if HAVE_ASM
1374
#if ARCH_AARCH64 || ARCH_ARM
1375
    loop_restoration_dsp_init_arm(c, bpc);
1376
#elif ARCH_LOONGARCH64
1377
    loop_restoration_dsp_init_loongarch(c, bpc);
1378
#elif ARCH_PPC64LE
1379
    loop_restoration_dsp_init_ppc(c, bpc);
1380
#elif ARCH_X86
1381
    loop_restoration_dsp_init_x86(c, bpc);
1382
#endif
1383
#endif
1384
30.8k
}
dav1d_loop_restoration_dsp_init_16bpc
Line
Count
Source
1367
37.9k
{
1368
37.9k
    c->wiener[0] = c->wiener[1] = wiener_c;
1369
37.9k
    c->sgr[0] = sgr_5x5_c;
1370
37.9k
    c->sgr[1] = sgr_3x3_c;
1371
37.9k
    c->sgr[2] = sgr_mix_c;
1372
1373
#if HAVE_ASM
1374
#if ARCH_AARCH64 || ARCH_ARM
1375
    loop_restoration_dsp_init_arm(c, bpc);
1376
#elif ARCH_LOONGARCH64
1377
    loop_restoration_dsp_init_loongarch(c, bpc);
1378
#elif ARCH_PPC64LE
1379
    loop_restoration_dsp_init_ppc(c, bpc);
1380
#elif ARCH_X86
1381
    loop_restoration_dsp_init_x86(c, bpc);
1382
#endif
1383
#endif
1384
37.9k
}