Coverage Report

Created: 2026-06-15 06:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/dav1d/src/looprestoration_tmpl.c
Line
Count
Source
1
/*
2
 * Copyright © 2018, VideoLAN and dav1d authors
3
 * Copyright © 2018, Two Orioles, LLC
4
 * All rights reserved.
5
 *
6
 * Redistribution and use in source and binary forms, with or without
7
 * modification, are permitted provided that the following conditions are met:
8
 *
9
 * 1. Redistributions of source code must retain the above copyright notice, this
10
 *    list of conditions and the following disclaimer.
11
 *
12
 * 2. Redistributions in binary form must reproduce the above copyright notice,
13
 *    this list of conditions and the following disclaimer in the documentation
14
 *    and/or other materials provided with the distribution.
15
 *
16
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
 */
27
28
#include "config.h"
29
30
#include <stdint.h>
31
#include <stdlib.h>
32
#include <string.h>
33
34
#include "common/attributes.h"
35
#include "common/bitdepth.h"
36
#include "common/intops.h"
37
38
#include "src/looprestoration.h"
39
#include "src/tables.h"
40
41
// 256 * 1.5 + 3 + 3 = 390
42
5.10M
#define REST_UNIT_STRIDE (390)
43
44
static void wiener_filter_h(uint16_t *dst, const pixel (*left)[4],
45
                            const pixel *src, const int16_t fh[8],
46
                            const int w, const enum LrEdgeFlags edges
47
                            HIGHBD_DECL_SUFFIX)
48
4.77M
{
49
4.77M
    const int bitdepth = bitdepth_from_max(bitdepth_max);
50
4.77M
    const int round_bits_h = 3 + (bitdepth == 12) * 2;
51
4.77M
    const int rounding_off_h = 1 << (round_bits_h - 1);
52
4.77M
    const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h);
53
54
4.77M
    if (w < 6) {
55
        // For small widths, do the fully conditional loop with
56
        // conditions on each access.
57
2.58M
        for (int x = 0; x < w; x++) {
58
1.69M
            int sum = (1 << (bitdepth + 6));
59
1.69M
#if BITDEPTH == 8
60
1.69M
            sum += src[x] * 128;
61
1.69M
#endif
62
13.4M
            for (int i = 0; i < 7; i++) {
63
11.7M
                int idx = x + i - 3;
64
11.7M
                if (idx < 0) {
65
3.72M
                    if (!(edges & LR_HAVE_LEFT))
66
3.73M
                        sum += src[0] * fh[i];
67
18.4E
                    else if (left)
68
0
                        sum += left[0][4 + idx] * fh[i];
69
18.4E
                    else
70
18.4E
                        sum += src[idx] * fh[i];
71
8.01M
                } else if (idx >= w && !(edges & LR_HAVE_RIGHT)) {
72
3.67M
                    sum += src[w - 1] * fh[i];
73
3.67M
                } else
74
4.34M
                    sum += src[idx] * fh[i];
75
11.7M
            }
76
1.69M
            sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
77
1.69M
            dst[x] = sum;
78
1.69M
        }
79
80
893k
        return;
81
893k
    }
82
83
    // For larger widths, do separate loops with fewer conditions; first
84
    // handle the start of the row.
85
3.87M
    int start = 3;
86
3.87M
    if (!(edges & LR_HAVE_LEFT)) {
87
        // If there's no left edge, pad using the leftmost pixel.
88
5.37M
        for (int x = 0; x < 3; x++) {
89
4.02M
            int sum = (1 << (bitdepth + 6));
90
4.02M
#if BITDEPTH == 8
91
4.02M
            sum += src[x] * 128;
92
4.02M
#endif
93
32.1M
            for (int i = 0; i < 7; i++) {
94
28.1M
                int idx = x + i - 3;
95
28.1M
                if (idx < 0)
96
8.05M
                    sum += src[0] * fh[i];
97
20.1M
                else
98
20.1M
                    sum += src[idx] * fh[i];
99
28.1M
            }
100
4.02M
            sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
101
4.02M
            dst[x] = sum;
102
4.02M
        }
103
2.53M
    } else if (left) {
104
        // If we have the left edge and a separate left buffer, pad using that.
105
9.47M
        for (int x = 0; x < 3; x++) {
106
7.09M
            int sum = (1 << (bitdepth + 6));
107
7.09M
#if BITDEPTH == 8
108
7.09M
            sum += src[x] * 128;
109
7.09M
#endif
110
56.6M
            for (int i = 0; i < 7; i++) {
111
49.5M
                int idx = x + i - 3;
112
49.5M
                if (idx < 0)
113
14.1M
                    sum += left[0][4 + idx] * fh[i];
114
35.3M
                else
115
35.3M
                    sum += src[idx] * fh[i];
116
49.5M
            }
117
7.09M
            sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
118
7.09M
            dst[x] = sum;
119
7.09M
        }
120
2.37M
    } else {
121
        // If we have the left edge, but no separate left buffer, we're in the
122
        // top/bottom area (lpf) with the left edge existing in the same
123
        // buffer; just do the regular loop from the start.
124
156k
        start = 0;
125
156k
    }
126
3.87M
    int end = w - 3;
127
3.87M
    if (edges & LR_HAVE_RIGHT)
128
2.49M
        end = w;
129
130
    // Do a condition-free loop for the bulk of the row.
131
371M
    for (int x = start; x < end; x++) {
132
367M
        int sum = (1 << (bitdepth + 6));
133
367M
#if BITDEPTH == 8
134
367M
        sum += src[x] * 128;
135
367M
#endif
136
2.93G
        for (int i = 0; i < 7; i++) {
137
2.56G
            int idx = x + i - 3;
138
2.56G
            sum += src[idx] * fh[i];
139
2.56G
        }
140
367M
        sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
141
367M
        dst[x] = sum;
142
367M
    }
143
144
    // If we need to, calculate the end of the row with a condition for
145
    // right edge padding.
146
7.99M
    for (int x = end; x < w; x++) {
147
4.11M
        int sum = (1 << (bitdepth + 6));
148
4.11M
#if BITDEPTH == 8
149
4.11M
        sum += src[x] * 128;
150
4.11M
#endif
151
32.8M
        for (int i = 0; i < 7; i++) {
152
28.7M
            int idx = x + i - 3;
153
28.7M
            if (idx >= w)
154
8.22M
                sum += src[w - 1] * fh[i];
155
20.5M
            else
156
20.5M
                sum += src[idx] * fh[i];
157
28.7M
        }
158
4.11M
        sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
159
4.11M
        dst[x] = sum;
160
4.11M
    }
161
3.87M
}
162
163
static void wiener_filter_v(pixel *p, uint16_t **ptrs, const int16_t fv[8],
164
                            const int w HIGHBD_DECL_SUFFIX)
165
186k
{
166
186k
    const int bitdepth = bitdepth_from_max(bitdepth_max);
167
168
186k
    const int round_bits_v = 11 - (bitdepth == 12) * 2;
169
186k
    const int rounding_off_v = 1 << (round_bits_v - 1);
170
186k
    const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
171
172
13.1M
    for (int i = 0; i < w; i++) {
173
12.9M
        int sum = -round_offset;
174
175
        // Only filter using 6 input rows. The 7th row is assumed to be
176
        // identical to the last one.
177
        //
178
        // This function is assumed to only be called at the end, when doing
179
        // padding at the bottom.
180
90.4M
        for (int k = 0; k < 6; k++)
181
77.5M
            sum += ptrs[k][i] * fv[k];
182
12.9M
        sum += ptrs[5][i] * fv[6];
183
184
12.9M
        p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v);
185
12.9M
    }
186
187
    // Shift the pointers, but only update the first 5; the 6th pointer is kept
188
    // as it was before (and the 7th is implicitly identical to the 6th).
189
1.11M
    for (int i = 0; i < 5; i++)
190
932k
        ptrs[i] = ptrs[i + 1];
191
186k
}
192
193
static void wiener_filter_hv(pixel *p, uint16_t **ptrs, const pixel (*left)[4],
194
                             const pixel *src, const int16_t filter[2][8],
195
                             const int w, const enum LrEdgeFlags edges
196
                             HIGHBD_DECL_SUFFIX)
197
4.28M
{
198
4.28M
    const int bitdepth = bitdepth_from_max(bitdepth_max);
199
200
4.28M
    const int round_bits_v = 11 - (bitdepth == 12) * 2;
201
4.28M
    const int rounding_off_v = 1 << (round_bits_v - 1);
202
4.28M
    const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
203
204
4.28M
    const int16_t *fh = filter[0];
205
4.28M
    const int16_t *fv = filter[1];
206
207
    // Do combined horizontal and vertical filtering; doing horizontal
208
    // filtering of one row, combined with vertical filtering of 6
209
    // preexisting rows and the newly filtered row.
210
211
    // For simplicity in the C implementation, just do a separate call
212
    // of the horizontal filter, into a temporary buffer.
213
4.28M
    uint16_t tmp[REST_UNIT_STRIDE];
214
4.28M
    wiener_filter_h(tmp, left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
215
216
353M
    for (int i = 0; i < w; i++) {
217
348M
        int sum = -round_offset;
218
219
        // Filter using the 6 stored preexisting rows, and the newly
220
        // filtered one in tmp[].
221
2.42G
        for (int k = 0; k < 6; k++)
222
2.07G
            sum += ptrs[k][i] * fv[k];
223
348M
        sum += tmp[i] * fv[6];
224
        // At this point, after having read all inputs at point [i], we
225
        // could overwrite [i] with the newly filtered data.
226
227
348M
        p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v);
228
348M
    }
229
230
    // For simplicity in the C implementation, just memcpy the newly
231
    // filtered row into ptrs[6]. Normally, in steady state filtering,
232
    // this output row, ptrs[6], is equal to ptrs[0]. However at startup,
233
    // at the top of the filtered area, we may have ptrs[0] equal to ptrs[1],
234
    // so we can't assume we can write into ptrs[0] but we need to keep
235
    // a separate pointer for the next row to write into.
236
4.28M
    memcpy(ptrs[6], tmp, sizeof(uint16_t) * REST_UNIT_STRIDE);
237
238
    // Rotate the window of pointers. Shift the 6 pointers downwards one step.
239
29.9M
    for (int i = 0; i < 6; i++)
240
25.6M
        ptrs[i] = ptrs[i + 1];
241
    // The topmost pointer, ptrs[6], which isn't used as input, is set to
242
    // ptrs[0], which will be used as output for the next _hv call.
243
    // At the start of the filtering, the caller may set ptrs[6] to the
244
    // right next buffer to fill in, instead.
245
4.28M
    ptrs[6] = ptrs[0];
246
4.28M
}
247
248
// FIXME Could split into luma and chroma specific functions,
249
// (since first and last taps are always 0 for chroma)
250
static void wiener_c(pixel *p, const ptrdiff_t stride,
251
                     const pixel (*left)[4],
252
                     const pixel *lpf, const int w, int h,
253
                     const LooprestorationParams *const params,
254
                     const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
255
119k
{
256
    // Values stored between horizontal and vertical filtering don't
257
    // fit in a uint8_t.
258
119k
    uint16_t hor[6 * REST_UNIT_STRIDE];
259
119k
    uint16_t *ptrs[7], *rows[6];
260
834k
    for (int i = 0; i < 6; i++)
261
714k
        rows[i] = &hor[i * REST_UNIT_STRIDE];
262
119k
    const int16_t (*const filter)[8] = params->filter;
263
119k
    const int16_t *fh = params->filter[0];
264
119k
    const int16_t *fv = params->filter[1];
265
119k
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
266
267
119k
    const pixel *src = p;
268
119k
    if (edges & LR_HAVE_TOP) {
269
74.8k
        ptrs[0] = rows[0];
270
74.8k
        ptrs[1] = rows[0];
271
74.8k
        ptrs[2] = rows[1];
272
74.8k
        ptrs[3] = rows[2];
273
74.8k
        ptrs[4] = rows[2];
274
74.8k
        ptrs[5] = rows[2];
275
276
74.8k
        wiener_filter_h(rows[0], NULL, lpf, fh, w, edges HIGHBD_TAIL_SUFFIX);
277
74.8k
        lpf += PXSTRIDE(stride);
278
74.8k
        wiener_filter_h(rows[1], NULL, lpf, fh, w, edges HIGHBD_TAIL_SUFFIX);
279
280
74.8k
        wiener_filter_h(rows[2], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
281
74.8k
        left++;
282
74.8k
        src += PXSTRIDE(stride);
283
284
74.8k
        if (--h <= 0)
285
726
            goto v1;
286
287
74.1k
        ptrs[4] = ptrs[5] = rows[3];
288
74.1k
        wiener_filter_h(rows[3], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
289
74.1k
        left++;
290
74.1k
        src += PXSTRIDE(stride);
291
292
74.1k
        if (--h <= 0)
293
809
            goto v2;
294
295
73.3k
        ptrs[5] = rows[4];
296
73.3k
        wiener_filter_h(rows[4], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
297
73.3k
        left++;
298
73.3k
        src += PXSTRIDE(stride);
299
300
73.3k
        if (--h <= 0)
301
410
            goto v3;
302
73.3k
    } else {
303
44.2k
        ptrs[0] = rows[0];
304
44.2k
        ptrs[1] = rows[0];
305
44.2k
        ptrs[2] = rows[0];
306
44.2k
        ptrs[3] = rows[0];
307
44.2k
        ptrs[4] = rows[0];
308
44.2k
        ptrs[5] = rows[0];
309
310
44.2k
        wiener_filter_h(rows[0], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
311
44.2k
        left++;
312
44.2k
        src += PXSTRIDE(stride);
313
314
44.2k
        if (--h <= 0)
315
3.55k
            goto v1;
316
317
40.7k
        ptrs[4] = ptrs[5] = rows[1];
318
40.7k
        wiener_filter_h(rows[1], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
319
40.7k
        left++;
320
40.7k
        src += PXSTRIDE(stride);
321
322
40.7k
        if (--h <= 0)
323
9.48k
            goto v2;
324
325
31.2k
        ptrs[5] = rows[2];
326
31.2k
        wiener_filter_h(rows[2], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
327
31.2k
        left++;
328
31.2k
        src += PXSTRIDE(stride);
329
330
31.2k
        if (--h <= 0)
331
2.28k
            goto v3;
332
333
28.9k
        ptrs[6] = rows[3];
334
28.9k
        wiener_filter_hv(p, ptrs, left, src, filter, w, edges
335
28.9k
                         HIGHBD_TAIL_SUFFIX);
336
28.9k
        left++;
337
28.9k
        src += PXSTRIDE(stride);
338
28.9k
        p += PXSTRIDE(stride);
339
340
28.9k
        if (--h <= 0)
341
2.57k
            goto v3;
342
343
26.3k
        ptrs[6] = rows[4];
344
26.3k
        wiener_filter_hv(p, ptrs, left, src, filter, w, edges
345
26.3k
                         HIGHBD_TAIL_SUFFIX);
346
26.3k
        left++;
347
26.3k
        src += PXSTRIDE(stride);
348
26.3k
        p += PXSTRIDE(stride);
349
350
26.3k
        if (--h <= 0)
351
1.33k
            goto v3;
352
26.3k
    }
353
354
97.9k
    ptrs[6] = ptrs[5] + REST_UNIT_STRIDE;
355
4.08M
    do {
356
4.08M
        wiener_filter_hv(p, ptrs, left, src, filter, w, edges
357
4.08M
                         HIGHBD_TAIL_SUFFIX);
358
4.08M
        left++;
359
4.08M
        src += PXSTRIDE(stride);
360
4.08M
        p += PXSTRIDE(stride);
361
4.08M
    } while (--h > 0);
362
363
97.9k
    if (!(edges & LR_HAVE_BOTTOM))
364
21.9k
        goto v3;
365
366
76.0k
    wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges
367
76.0k
                     HIGHBD_TAIL_SUFFIX);
368
76.0k
    lpf_bottom += PXSTRIDE(stride);
369
76.0k
    p += PXSTRIDE(stride);
370
371
76.0k
    wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges
372
76.0k
                     HIGHBD_TAIL_SUFFIX);
373
76.0k
    p += PXSTRIDE(stride);
374
119k
v1:
375
119k
    wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
376
377
119k
    return;
378
379
28.5k
v3:
380
28.5k
    wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
381
28.5k
    p += PXSTRIDE(stride);
382
38.8k
v2:
383
38.8k
    wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
384
38.8k
    p += PXSTRIDE(stride);
385
38.8k
    goto v1;
386
28.5k
}
387
388
// SGR
389
static NOINLINE void rotate(int32_t **sumsq_ptrs, coef **sum_ptrs, int n)
390
9.44M
{
391
9.44M
    int32_t *tmp32 = sumsq_ptrs[0];
392
9.44M
    coef *tmpc = sum_ptrs[0];
393
29.0M
    for (int i = 0; i < n - 1; i++) {
394
19.5M
        sumsq_ptrs[i] = sumsq_ptrs[i + 1];
395
19.5M
        sum_ptrs[i] = sum_ptrs[i + 1];
396
19.5M
    }
397
9.44M
    sumsq_ptrs[n - 1] = tmp32;
398
9.44M
    sum_ptrs[n - 1] = tmpc;
399
9.44M
}
400
401
static NOINLINE void rotate5_x2(int32_t **sumsq_ptrs, coef **sum_ptrs)
402
1.97M
{
403
1.97M
    int32_t *tmp32[2];
404
1.97M
    coef *tmpc[2];
405
5.93M
    for (int i = 0; i < 2; i++) {
406
3.95M
        tmp32[i] = sumsq_ptrs[i];
407
3.95M
        tmpc[i] = sum_ptrs[i];
408
3.95M
    }
409
7.90M
    for (int i = 0; i < 3; i++) {
410
5.92M
        sumsq_ptrs[i] = sumsq_ptrs[i + 2];
411
5.92M
        sum_ptrs[i] = sum_ptrs[i + 2];
412
5.92M
    }
413
5.93M
    for (int i = 0; i < 2; i++) {
414
3.95M
        sumsq_ptrs[3 + i] = tmp32[i];
415
3.95M
        sum_ptrs[3 + i] = tmpc[i];
416
3.95M
    }
417
1.97M
}
418
419
static NOINLINE void sgr_box3_row_h(int32_t *sumsq, coef *sum,
420
                                    const pixel (*left)[4],
421
                                    const pixel *src, const int w,
422
                                    const enum LrEdgeFlags edges)
423
3.82M
{
424
3.82M
    sumsq++;
425
3.82M
    sum++;
426
3.82M
    int a = edges & LR_HAVE_LEFT ? (left ? left[0][2] : src[-2]) : src[0];
427
3.82M
    int b = edges & LR_HAVE_LEFT ? (left ? left[0][3] : src[-1]) : src[0];
428
334M
    for (int x = -1; x < w + 1; x++) {
429
330M
        int c = (x + 1 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 1] : src[w - 1];
430
330M
        sum[x] = a + b + c;
431
330M
        sumsq[x] = a * a + b * b + c * c;
432
330M
        a = b;
433
330M
        b = c;
434
330M
    }
435
3.82M
}
436
437
static NOINLINE void sgr_box5_row_h(int32_t *sumsq, coef *sum,
438
                                    const pixel (*left)[4],
439
                                    const pixel *src, const int w,
440
                                    const enum LrEdgeFlags edges)
441
3.98M
{
442
3.98M
    sumsq++;
443
3.98M
    sum++;
444
3.98M
    int a = edges & LR_HAVE_LEFT ? (left ? left[0][1] : src[-3]) : src[0];
445
3.98M
    int b = edges & LR_HAVE_LEFT ? (left ? left[0][2] : src[-2]) : src[0];
446
3.98M
    int c = edges & LR_HAVE_LEFT ? (left ? left[0][3] : src[-1]) : src[0];
447
3.98M
    int d = src[0];
448
335M
    for (int x = -1; x < w + 1; x++) {
449
331M
        int e = (x + 2 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 2] : src[w - 1];
450
331M
        sum[x] = a + b + c + d + e;
451
331M
        sumsq[x] = a * a + b * b + c * c + d * d + e * e;
452
331M
        a = b;
453
331M
        b = c;
454
331M
        c = d;
455
331M
        d = e;
456
331M
    }
457
3.98M
}
458
459
static void sgr_box35_row_h(int32_t *sumsq3, coef *sum3,
460
                            int32_t *sumsq5, coef *sum5,
461
                            const pixel (*left)[4],
462
                            const pixel *src, const int w,
463
                            const enum LrEdgeFlags edges)
464
2.78M
{
465
2.78M
    sgr_box3_row_h(sumsq3, sum3, left, src, w, edges);
466
2.78M
    sgr_box5_row_h(sumsq5, sum5, left, src, w, edges);
467
2.78M
}
468
469
static NOINLINE void sgr_box3_row_v(int32_t **sumsq, coef **sum,
470
                                    int32_t *sumsq_out, coef *sum_out,
471
                                    const int w)
472
3.77M
{
473
329M
    for (int x = 0; x < w + 2; x++) {
474
325M
        int sq_a = sumsq[0][x];
475
325M
        int sq_b = sumsq[1][x];
476
325M
        int sq_c = sumsq[2][x];
477
325M
        int s_a = sum[0][x];
478
325M
        int s_b = sum[1][x];
479
325M
        int s_c = sum[2][x];
480
325M
        sumsq_out[x] = sq_a + sq_b + sq_c;
481
325M
        sum_out[x] = s_a + s_b + s_c;
482
325M
    }
483
3.77M
}
484
485
static NOINLINE void sgr_box5_row_v(int32_t **sumsq, coef **sum,
486
                                    int32_t *sumsq_out, coef *sum_out,
487
                                    const int w)
488
1.98M
{
489
167M
    for (int x = 0; x < w + 2; x++) {
490
165M
        int sq_a = sumsq[0][x];
491
165M
        int sq_b = sumsq[1][x];
492
165M
        int sq_c = sumsq[2][x];
493
165M
        int sq_d = sumsq[3][x];
494
165M
        int sq_e = sumsq[4][x];
495
165M
        int s_a = sum[0][x];
496
165M
        int s_b = sum[1][x];
497
165M
        int s_c = sum[2][x];
498
165M
        int s_d = sum[3][x];
499
165M
        int s_e = sum[4][x];
500
165M
        sumsq_out[x] = sq_a + sq_b + sq_c + sq_d + sq_e;
501
165M
        sum_out[x] = s_a + s_b + s_c + s_d + s_e;
502
165M
    }
503
1.98M
}
504
505
static NOINLINE void sgr_calc_row_ab(int32_t *AA, coef *BB, int w, int s,
506
                                     int bitdepth_max, int n, int sgr_one_by_x)
507
5.73M
{
508
5.73M
    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
509
481M
    for (int i = 0; i < w + 2; i++) {
510
475M
        const int a =
511
475M
            (AA[i] + ((1 << (2 * bitdepth_min_8)) >> 1)) >> (2 * bitdepth_min_8);
512
475M
        const int b =
513
475M
            (BB[i] + ((1 << bitdepth_min_8) >> 1)) >> bitdepth_min_8;
514
515
475M
        const unsigned p = imax(a * n - b * b, 0);
516
475M
        const unsigned z = (p * s + (1 << 19)) >> 20;
517
475M
        const unsigned x = dav1d_sgr_x_by_x[umin(z, 255)];
518
519
        // This is where we invert A and B, so that B is of size coef.
520
475M
        AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
521
475M
        BB[i] = x;
522
475M
    }
523
5.73M
}
524
525
static void sgr_box3_vert(int32_t **sumsq, coef **sum,
526
                          int32_t *sumsq_out, coef *sum_out,
527
                          const int w, const int s, const int bitdepth_max)
528
3.77M
{
529
3.77M
    sgr_box3_row_v(sumsq, sum, sumsq_out, sum_out, w);
530
3.77M
    sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, 9, 455);
531
3.77M
    rotate(sumsq, sum, 3);
532
3.77M
}
533
534
static void sgr_box5_vert(int32_t **sumsq, coef **sum,
535
                          int32_t *sumsq_out, coef *sum_out,
536
                          const int w, const int s, const int bitdepth_max)
537
1.98M
{
538
1.98M
    sgr_box5_row_v(sumsq, sum, sumsq_out, sum_out, w);
539
1.98M
    sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, 25, 164);
540
1.98M
    rotate5_x2(sumsq, sum);
541
1.98M
}
542
543
static void sgr_box3_hv(int32_t **sumsq, coef **sum,
544
                        int32_t *AA, coef *BB,
545
                        const pixel (*left)[4],
546
                        const pixel *src, const int w,
547
                        const int s,
548
                        const enum LrEdgeFlags edges,
549
                        const int bitdepth_max)
550
1.00M
{
551
1.00M
    sgr_box3_row_h(sumsq[2], sum[2], left, src, w, edges);
552
1.00M
    sgr_box3_vert(sumsq, sum, AA, BB, w, s, bitdepth_max);
553
1.00M
}
554
555
static NOINLINE void sgr_finish_filter_row1(coef *tmp,
556
                                            const pixel *src,
557
                                            int32_t **A_ptrs, coef **B_ptrs,
558
                                            const int w)
559
3.58M
{
560
3.58M
#define EIGHT_NEIGHBORS(P, i)\
561
599M
    ((P[1][i] + P[1][i - 1] + P[1][i + 1] + P[0][i] + P[2][i]) * 4 + \
562
599M
     (P[0][i - 1] + P[2][i - 1] +                           \
563
599M
      P[0][i + 1] + P[2][i + 1]) * 3)
564
303M
    for (int i = 0; i < w; i++) {
565
299M
        const int a = EIGHT_NEIGHBORS(B_ptrs, i + 1);
566
299M
        const int b = EIGHT_NEIGHBORS(A_ptrs, i + 1);
567
299M
        tmp[i] = (b - a * src[i] + (1 << 8)) >> 9;
568
299M
    }
569
3.58M
#undef EIGHT_NEIGHBORS
570
3.58M
}
571
572
8.96M
#define FILTER_OUT_STRIDE (384)
573
574
static NOINLINE void sgr_finish_filter2(coef *tmp,
575
                                        const pixel *src,
576
                                        const ptrdiff_t src_stride,
577
                                        int32_t **A_ptrs, coef **B_ptrs,
578
                                        const int w, const int h)
579
1.88M
{
580
1.88M
#define SIX_NEIGHBORS(P, i)\
581
307M
    ((P[0][i]     + P[1][i]) * 6 +   \
582
307M
     (P[0][i - 1] + P[1][i - 1] +    \
583
307M
      P[0][i + 1] + P[1][i + 1]) * 5)
584
155M
    for (int i = 0; i < w; i++) {
585
153M
        const int a = SIX_NEIGHBORS(B_ptrs, i + 1);
586
153M
        const int b = SIX_NEIGHBORS(A_ptrs, i + 1);
587
153M
        tmp[i] = (b - a * src[i] + (1 << 8)) >> 9;
588
153M
    }
589
1.88M
    if (h <= 1)
590
20.2k
        return;
591
1.86M
    tmp += FILTER_OUT_STRIDE;
592
1.86M
    src += PXSTRIDE(src_stride);
593
1.86M
    const int32_t *A = &A_ptrs[1][1];
594
1.86M
    const coef *B = &B_ptrs[1][1];
595
154M
    for (int i = 0; i < w; i++) {
596
152M
        const int a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
597
152M
        const int b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
598
152M
        tmp[i] = (b - a * src[i] + (1 << 7)) >> 8;
599
152M
    }
600
1.86M
#undef SIX_NEIGHBORS
601
1.86M
}
602
603
static NOINLINE void sgr_weighted_row1(pixel *dst, const coef *t1,
604
                                       const int w, const int w1 HIGHBD_DECL_SUFFIX)
605
2.10M
{
606
160M
    for (int i = 0; i < w; i++) {
607
158M
        const int v = w1 * t1[i];
608
158M
        dst[i] = iclip_pixel(dst[i] + ((v + (1 << 10)) >> 11));
609
158M
    }
610
2.10M
}
611
612
static NOINLINE void sgr_weighted2(pixel *dst, const ptrdiff_t dst_stride,
613
                                   const coef *t1, const coef *t2,
614
                                   const int w, const int h,
615
                                   const int w0, const int w1 HIGHBD_DECL_SUFFIX)
616
1.32M
{
617
3.93M
    for (int j = 0; j < h; j++) {
618
225M
        for (int i = 0; i < w; i++) {
619
222M
            const int v = w0 * t1[i] + w1 * t2[i];
620
222M
            dst[i] = iclip_pixel(dst[i] + ((v + (1 << 10)) >> 11));
621
222M
        }
622
2.61M
        dst += PXSTRIDE(dst_stride);
623
2.61M
        t1 += FILTER_OUT_STRIDE;
624
2.61M
        t2 += FILTER_OUT_STRIDE;
625
2.61M
    }
626
1.32M
}
627
628
static NOINLINE void sgr_finish1(pixel **dst, const ptrdiff_t stride,
629
                                 int32_t **A_ptrs, coef **B_ptrs, const int w,
630
                                 const int w1 HIGHBD_DECL_SUFFIX)
631
980k
{
632
    // Only one single row, no stride needed
633
980k
    ALIGN_STK_16(coef, tmp, 384,);
634
635
980k
    sgr_finish_filter_row1(tmp, *dst, A_ptrs, B_ptrs, w);
636
980k
    sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX);
637
980k
    *dst += PXSTRIDE(stride);
638
980k
    rotate(A_ptrs, B_ptrs, 3);
639
980k
}
640
641
static NOINLINE void sgr_finish2(pixel **dst, const ptrdiff_t stride,
642
                                 int32_t **A_ptrs, coef **B_ptrs,
643
                                 const int w, const int h, const int w1
644
                                 HIGHBD_DECL_SUFFIX)
645
565k
{
646
565k
    ALIGN_STK_16(coef, tmp, 2*FILTER_OUT_STRIDE,);
647
648
565k
    sgr_finish_filter2(tmp, *dst, stride, A_ptrs, B_ptrs, w, h);
649
565k
    sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX);
650
565k
    *dst += PXSTRIDE(stride);
651
565k
    if (h > 1) {
652
559k
        sgr_weighted_row1(*dst, tmp + FILTER_OUT_STRIDE, w, w1 HIGHBD_TAIL_SUFFIX);
653
559k
        *dst += PXSTRIDE(stride);
654
559k
    }
655
565k
    rotate(A_ptrs, B_ptrs, 2);
656
565k
}
657
658
static NOINLINE void sgr_finish_mix(pixel **dst, const ptrdiff_t stride,
659
                                    int32_t **A5_ptrs, coef **B5_ptrs,
660
                                    int32_t **A3_ptrs, coef **B3_ptrs,
661
                                    const int w, const int h,
662
                                    const int w0, const int w1 HIGHBD_DECL_SUFFIX)
663
1.31M
{
664
1.31M
    ALIGN_STK_16(coef, tmp5, 2*FILTER_OUT_STRIDE,);
665
1.31M
    ALIGN_STK_16(coef, tmp3, 2*FILTER_OUT_STRIDE,);
666
667
1.31M
    sgr_finish_filter2(tmp5, *dst, stride, A5_ptrs, B5_ptrs, w, h);
668
1.31M
    sgr_finish_filter_row1(tmp3, *dst, A3_ptrs, B3_ptrs, w);
669
1.31M
    if (h > 1)
670
1.30M
        sgr_finish_filter_row1(tmp3 + FILTER_OUT_STRIDE, *dst + PXSTRIDE(stride),
671
1.30M
                               &A3_ptrs[1], &B3_ptrs[1], w);
672
1.31M
    sgr_weighted2(*dst, stride, tmp5, tmp3, w, h, w0, w1 HIGHBD_TAIL_SUFFIX);
673
1.31M
    *dst += h*PXSTRIDE(stride);
674
1.31M
    rotate(A5_ptrs, B5_ptrs, 2);
675
1.31M
    rotate(A3_ptrs, B3_ptrs, 4);
676
1.31M
}
677
678
679
static void sgr_3x3_c(pixel *dst, const ptrdiff_t stride,
680
                      const pixel (*left)[4], const pixel *lpf,
681
                      const int w, int h,
682
                      const LooprestorationParams *const params,
683
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
684
24.2k
{
685
2.62M
#define BUF_STRIDE (384 + 16)
686
24.2k
    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 3 + 16,);
687
24.2k
    ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 3 + 16,);
688
24.2k
    int32_t *sumsq_ptrs[3], *sumsq_rows[3];
689
24.2k
    coef *sum_ptrs[3], *sum_rows[3];
690
96.9k
    for (int i = 0; i < 3; i++) {
691
72.7k
        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
692
72.7k
        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
693
72.7k
    }
694
695
24.2k
    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 3 + 16,);
696
24.2k
    ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 3 + 16,);
697
24.2k
    int32_t *A_ptrs[3];
698
24.2k
    coef *B_ptrs[3];
699
96.9k
    for (int i = 0; i < 3; i++) {
700
72.7k
        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
701
72.7k
        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
702
72.7k
    }
703
24.2k
    const pixel *src = dst;
704
24.2k
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
705
706
24.2k
    if (edges & LR_HAVE_TOP) {
707
16.8k
        sumsq_ptrs[0] = sumsq_rows[0];
708
16.8k
        sumsq_ptrs[1] = sumsq_rows[1];
709
16.8k
        sumsq_ptrs[2] = sumsq_rows[2];
710
16.8k
        sum_ptrs[0] = sum_rows[0];
711
16.8k
        sum_ptrs[1] = sum_rows[1];
712
16.8k
        sum_ptrs[2] = sum_rows[2];
713
714
16.8k
        sgr_box3_row_h(sumsq_rows[0], sum_rows[0], NULL, lpf, w, edges);
715
16.8k
        lpf += PXSTRIDE(stride);
716
16.8k
        sgr_box3_row_h(sumsq_rows[1], sum_rows[1], NULL, lpf, w, edges);
717
718
16.8k
        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
719
16.8k
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
720
16.8k
        left++;
721
16.8k
        src += PXSTRIDE(stride);
722
16.8k
        rotate(A_ptrs, B_ptrs, 3);
723
724
16.8k
        if (--h <= 0)
725
308
            goto vert_1;
726
727
16.5k
        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
728
16.5k
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
729
16.5k
        left++;
730
16.5k
        src += PXSTRIDE(stride);
731
16.5k
        rotate(A_ptrs, B_ptrs, 3);
732
733
16.5k
        if (--h <= 0)
734
268
            goto vert_2;
735
16.5k
    } else {
736
7.39k
        sumsq_ptrs[0] = sumsq_rows[0];
737
7.39k
        sumsq_ptrs[1] = sumsq_rows[0];
738
7.39k
        sumsq_ptrs[2] = sumsq_rows[0];
739
7.39k
        sum_ptrs[0] = sum_rows[0];
740
7.39k
        sum_ptrs[1] = sum_rows[0];
741
7.39k
        sum_ptrs[2] = sum_rows[0];
742
743
7.39k
        sgr_box3_row_h(sumsq_rows[0], sum_rows[0], left, src, w, edges);
744
7.39k
        left++;
745
7.39k
        src += PXSTRIDE(stride);
746
747
7.39k
        sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
748
7.39k
                      w, params->sgr.s1, BITDEPTH_MAX);
749
7.39k
        rotate(A_ptrs, B_ptrs, 3);
750
751
7.39k
        if (--h <= 0)
752
1.29k
            goto vert_1;
753
754
6.10k
        sumsq_ptrs[2] = sumsq_rows[1];
755
6.10k
        sum_ptrs[2] = sum_rows[1];
756
757
6.10k
        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
758
6.10k
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
759
6.10k
        left++;
760
6.10k
        src += PXSTRIDE(stride);
761
6.10k
        rotate(A_ptrs, B_ptrs, 3);
762
763
6.10k
        if (--h <= 0)
764
1.48k
            goto vert_2;
765
766
4.62k
        sumsq_ptrs[2] = sumsq_rows[2];
767
4.62k
        sum_ptrs[2] = sum_rows[2];
768
4.62k
    }
769
770
935k
    do {
771
935k
        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
772
935k
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
773
935k
        left++;
774
935k
        src += PXSTRIDE(stride);
775
776
935k
        sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
777
935k
                    w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
778
935k
    } while (--h > 0);
779
780
20.8k
    if (!(edges & LR_HAVE_BOTTOM))
781
4.42k
        goto vert_2;
782
783
16.4k
    sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
784
16.4k
                NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
785
16.4k
    lpf_bottom += PXSTRIDE(stride);
786
787
16.4k
    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
788
16.4k
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
789
790
16.4k
    sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
791
16.4k
                NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
792
793
16.4k
    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
794
16.4k
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
795
16.4k
    return;
796
797
6.17k
vert_2:
798
6.17k
    sumsq_ptrs[2] = sumsq_ptrs[1];
799
6.17k
    sum_ptrs[2] = sum_ptrs[1];
800
6.17k
    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
801
6.17k
                  w, params->sgr.s1, BITDEPTH_MAX);
802
803
6.17k
    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
804
6.17k
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
805
806
7.77k
output_1:
807
7.77k
    sumsq_ptrs[2] = sumsq_ptrs[1];
808
7.77k
    sum_ptrs[2] = sum_ptrs[1];
809
7.77k
    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
810
7.77k
                  w, params->sgr.s1, BITDEPTH_MAX);
811
812
7.77k
    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
813
7.77k
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
814
7.77k
    return;
815
816
1.60k
vert_1:
817
1.60k
    sumsq_ptrs[2] = sumsq_ptrs[1];
818
1.60k
    sum_ptrs[2] = sum_ptrs[1];
819
1.60k
    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
820
1.60k
                  w, params->sgr.s1, BITDEPTH_MAX);
821
1.60k
    rotate(A_ptrs, B_ptrs, 3);
822
1.60k
    goto output_1;
823
6.17k
}
824
825
static void sgr_5x5_c(pixel *dst, const ptrdiff_t stride,
826
                      const pixel (*left)[4], const pixel *lpf,
827
                      const int w, int h,
828
                      const LooprestorationParams *const params,
829
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
830
29.5k
{
831
29.5k
    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 5 + 16,);
832
29.5k
    ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 5 + 16,);
833
29.5k
    int32_t *sumsq_ptrs[5], *sumsq_rows[5];
834
29.5k
    coef *sum_ptrs[5], *sum_rows[5];
835
177k
    for (int i = 0; i < 5; i++) {
836
147k
        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
837
147k
        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
838
147k
    }
839
840
29.5k
    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 2 + 16,);
841
29.5k
    ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 2 + 16,);
842
29.5k
    int32_t *A_ptrs[2];
843
29.5k
    coef *B_ptrs[2];
844
88.5k
    for (int i = 0; i < 2; i++) {
845
59.0k
        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
846
59.0k
        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
847
59.0k
    }
848
29.5k
    const pixel *src = dst;
849
29.5k
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
850
851
29.5k
    if (edges & LR_HAVE_TOP) {
852
18.2k
        sumsq_ptrs[0] = sumsq_rows[0];
853
18.2k
        sumsq_ptrs[1] = sumsq_rows[0];
854
18.2k
        sumsq_ptrs[2] = sumsq_rows[1];
855
18.2k
        sumsq_ptrs[3] = sumsq_rows[2];
856
18.2k
        sumsq_ptrs[4] = sumsq_rows[3];
857
18.2k
        sum_ptrs[0] = sum_rows[0];
858
18.2k
        sum_ptrs[1] = sum_rows[0];
859
18.2k
        sum_ptrs[2] = sum_rows[1];
860
18.2k
        sum_ptrs[3] = sum_rows[2];
861
18.2k
        sum_ptrs[4] = sum_rows[3];
862
863
18.2k
        sgr_box5_row_h(sumsq_rows[0], sum_rows[0], NULL, lpf, w, edges);
864
18.2k
        lpf += PXSTRIDE(stride);
865
18.2k
        sgr_box5_row_h(sumsq_rows[1], sum_rows[1], NULL, lpf, w, edges);
866
867
18.2k
        sgr_box5_row_h(sumsq_rows[2], sum_rows[2], left, src, w, edges);
868
18.2k
        left++;
869
18.2k
        src += PXSTRIDE(stride);
870
871
18.2k
        if (--h <= 0)
872
249
            goto vert_1;
873
874
17.9k
        sgr_box5_row_h(sumsq_rows[3], sum_rows[3], left, src, w, edges);
875
17.9k
        left++;
876
17.9k
        src += PXSTRIDE(stride);
877
17.9k
        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
878
17.9k
                      w, params->sgr.s0, BITDEPTH_MAX);
879
17.9k
        rotate(A_ptrs, B_ptrs, 2);
880
881
17.9k
        if (--h <= 0)
882
669
            goto vert_2;
883
884
        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
885
        // one of them to point at the previously unused rows[4].
886
17.3k
        sumsq_ptrs[3] = sumsq_rows[4];
887
17.3k
        sum_ptrs[3] = sum_rows[4];
888
17.3k
    } else {
889
11.3k
        sumsq_ptrs[0] = sumsq_rows[0];
890
11.3k
        sumsq_ptrs[1] = sumsq_rows[0];
891
11.3k
        sumsq_ptrs[2] = sumsq_rows[0];
892
11.3k
        sumsq_ptrs[3] = sumsq_rows[0];
893
11.3k
        sumsq_ptrs[4] = sumsq_rows[0];
894
11.3k
        sum_ptrs[0] = sum_rows[0];
895
11.3k
        sum_ptrs[1] = sum_rows[0];
896
11.3k
        sum_ptrs[2] = sum_rows[0];
897
11.3k
        sum_ptrs[3] = sum_rows[0];
898
11.3k
        sum_ptrs[4] = sum_rows[0];
899
900
11.3k
        sgr_box5_row_h(sumsq_rows[0], sum_rows[0], left, src, w, edges);
901
11.3k
        left++;
902
11.3k
        src += PXSTRIDE(stride);
903
904
11.3k
        if (--h <= 0)
905
1.84k
            goto vert_1;
906
907
9.46k
        sumsq_ptrs[4] = sumsq_rows[1];
908
9.46k
        sum_ptrs[4] = sum_rows[1];
909
910
9.46k
        sgr_box5_row_h(sumsq_rows[1], sum_rows[1], left, src, w, edges);
911
9.46k
        left++;
912
9.46k
        src += PXSTRIDE(stride);
913
914
9.46k
        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
915
9.46k
                      w, params->sgr.s0, BITDEPTH_MAX);
916
9.46k
        rotate(A_ptrs, B_ptrs, 2);
917
918
9.46k
        if (--h <= 0)
919
1.59k
            goto vert_2;
920
921
7.87k
        sumsq_ptrs[3] = sumsq_rows[2];
922
7.87k
        sumsq_ptrs[4] = sumsq_rows[3];
923
7.87k
        sum_ptrs[3] = sum_rows[2];
924
7.87k
        sum_ptrs[4] = sum_rows[3];
925
926
7.87k
        sgr_box5_row_h(sumsq_rows[2], sum_rows[2], left, src, w, edges);
927
7.87k
        left++;
928
7.87k
        src += PXSTRIDE(stride);
929
930
7.87k
        if (--h <= 0)
931
937
            goto odd;
932
933
6.93k
        sgr_box5_row_h(sumsq_rows[3], sum_rows[3], left, src, w, edges);
934
6.93k
        left++;
935
6.93k
        src += PXSTRIDE(stride);
936
937
6.93k
        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
938
6.93k
                      w, params->sgr.s0, BITDEPTH_MAX);
939
6.93k
        sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
940
6.93k
                    w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
941
942
6.93k
        if (--h <= 0)
943
707
            goto vert_2;
944
945
        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
946
        // one of them to point at the previously unused rows[4].
947
6.22k
        sumsq_ptrs[3] = sumsq_rows[4];
948
6.22k
        sum_ptrs[3] = sum_rows[4];
949
6.22k
    }
950
951
527k
    do {
952
527k
        sgr_box5_row_h(sumsq_ptrs[3], sum_ptrs[3], left, src, w, edges);
953
527k
        left++;
954
527k
        src += PXSTRIDE(stride);
955
956
527k
        if (--h <= 0)
957
2.74k
            goto odd;
958
959
525k
        sgr_box5_row_h(sumsq_ptrs[4], sum_ptrs[4], left, src, w, edges);
960
525k
        left++;
961
525k
        src += PXSTRIDE(stride);
962
963
525k
        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
964
525k
                      w, params->sgr.s0, BITDEPTH_MAX);
965
525k
        sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
966
525k
                    w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
967
525k
    } while (--h > 0);
968
969
20.7k
    if (!(edges & LR_HAVE_BOTTOM))
970
2.02k
        goto vert_2;
971
972
18.7k
    sgr_box5_row_h(sumsq_ptrs[3], sum_ptrs[3], NULL, lpf_bottom, w, edges);
973
18.7k
    lpf_bottom += PXSTRIDE(stride);
974
18.7k
    sgr_box5_row_h(sumsq_ptrs[4], sum_ptrs[4], NULL, lpf_bottom, w, edges);
975
976
23.7k
output_2:
977
23.7k
    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
978
23.7k
                  w, params->sgr.s0, BITDEPTH_MAX);
979
23.7k
    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
980
23.7k
                w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
981
23.7k
    return;
982
983
4.99k
vert_2:
984
    // Duplicate the last row twice more
985
4.99k
    sumsq_ptrs[3] = sumsq_ptrs[2];
986
4.99k
    sumsq_ptrs[4] = sumsq_ptrs[2];
987
4.99k
    sum_ptrs[3] = sum_ptrs[2];
988
4.99k
    sum_ptrs[4] = sum_ptrs[2];
989
4.99k
    goto output_2;
990
991
3.68k
odd:
992
    // Copy the last row as padding once
993
3.68k
    sumsq_ptrs[4] = sumsq_ptrs[3];
994
3.68k
    sum_ptrs[4] = sum_ptrs[3];
995
996
3.68k
    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
997
3.68k
                  w, params->sgr.s0, BITDEPTH_MAX);
998
3.68k
    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
999
3.68k
                w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
1000
1001
5.77k
output_1:
1002
    // Duplicate the last row twice more
1003
5.77k
    sumsq_ptrs[3] = sumsq_ptrs[2];
1004
5.77k
    sumsq_ptrs[4] = sumsq_ptrs[2];
1005
5.77k
    sum_ptrs[3] = sum_ptrs[2];
1006
5.77k
    sum_ptrs[4] = sum_ptrs[2];
1007
1008
5.77k
    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
1009
5.77k
                  w, params->sgr.s0, BITDEPTH_MAX);
1010
    // Output only one row
1011
5.77k
    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
1012
5.77k
                w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
1013
5.77k
    return;
1014
1015
2.09k
vert_1:
1016
    // Copy the last row as padding once
1017
2.09k
    sumsq_ptrs[4] = sumsq_ptrs[3];
1018
2.09k
    sum_ptrs[4] = sum_ptrs[3];
1019
1020
2.09k
    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
1021
2.09k
                  w, params->sgr.s0, BITDEPTH_MAX);
1022
2.09k
    rotate(A_ptrs, B_ptrs, 2);
1023
1024
2.09k
    goto output_1;
1025
3.68k
}
1026
1027
static void sgr_mix_c(pixel *dst, const ptrdiff_t stride,
1028
                      const pixel (*left)[4], const pixel *lpf,
1029
                      const int w, int h,
1030
                      const LooprestorationParams *const params,
1031
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
1032
68.6k
{
1033
68.6k
    ALIGN_STK_16(int32_t, sumsq5_buf, BUF_STRIDE * 5 + 16,);
1034
68.6k
    ALIGN_STK_16(coef, sum5_buf, BUF_STRIDE * 5 + 16,);
1035
68.6k
    int32_t *sumsq5_ptrs[5], *sumsq5_rows[5];
1036
68.6k
    coef *sum5_ptrs[5], *sum5_rows[5];
1037
411k
    for (int i = 0; i < 5; i++) {
1038
343k
        sumsq5_rows[i] = &sumsq5_buf[i * BUF_STRIDE];
1039
343k
        sum5_rows[i] = &sum5_buf[i * BUF_STRIDE];
1040
343k
    }
1041
68.6k
    ALIGN_STK_16(int32_t, sumsq3_buf, BUF_STRIDE * 3 + 16,);
1042
68.6k
    ALIGN_STK_16(coef, sum3_buf, BUF_STRIDE * 3 + 16,);
1043
68.6k
    int32_t *sumsq3_ptrs[3], *sumsq3_rows[3];
1044
68.6k
    coef *sum3_ptrs[3], *sum3_rows[3];
1045
274k
    for (int i = 0; i < 3; i++) {
1046
205k
        sumsq3_rows[i] = &sumsq3_buf[i * BUF_STRIDE];
1047
205k
        sum3_rows[i] = &sum3_buf[i * BUF_STRIDE];
1048
205k
    }
1049
1050
68.6k
    ALIGN_STK_16(int32_t, A5_buf, BUF_STRIDE * 2 + 16,);
1051
68.6k
    ALIGN_STK_16(coef, B5_buf, BUF_STRIDE * 2 + 16,);
1052
68.6k
    int32_t *A5_ptrs[2];
1053
68.6k
    coef *B5_ptrs[2];
1054
205k
    for (int i = 0; i < 2; i++) {
1055
137k
        A5_ptrs[i] = &A5_buf[i * BUF_STRIDE];
1056
137k
        B5_ptrs[i] = &B5_buf[i * BUF_STRIDE];
1057
137k
    }
1058
68.6k
    ALIGN_STK_16(int32_t, A3_buf, BUF_STRIDE * 4 + 16,);
1059
68.6k
    ALIGN_STK_16(coef, B3_buf, BUF_STRIDE * 4 + 16,);
1060
68.6k
    int32_t *A3_ptrs[4];
1061
68.6k
    coef *B3_ptrs[4];
1062
343k
    for (int i = 0; i < 4; i++) {
1063
274k
        A3_ptrs[i] = &A3_buf[i * BUF_STRIDE];
1064
274k
        B3_ptrs[i] = &B3_buf[i * BUF_STRIDE];
1065
274k
    }
1066
68.6k
    const pixel *src = dst;
1067
68.6k
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
1068
1069
68.6k
    if (edges & LR_HAVE_TOP) {
1070
44.0k
        sumsq5_ptrs[0] = sumsq5_rows[0];
1071
44.0k
        sumsq5_ptrs[1] = sumsq5_rows[0];
1072
44.0k
        sumsq5_ptrs[2] = sumsq5_rows[1];
1073
44.0k
        sumsq5_ptrs[3] = sumsq5_rows[2];
1074
44.0k
        sumsq5_ptrs[4] = sumsq5_rows[3];
1075
44.0k
        sum5_ptrs[0] = sum5_rows[0];
1076
44.0k
        sum5_ptrs[1] = sum5_rows[0];
1077
44.0k
        sum5_ptrs[2] = sum5_rows[1];
1078
44.0k
        sum5_ptrs[3] = sum5_rows[2];
1079
44.0k
        sum5_ptrs[4] = sum5_rows[3];
1080
1081
44.0k
        sumsq3_ptrs[0] = sumsq3_rows[0];
1082
44.0k
        sumsq3_ptrs[1] = sumsq3_rows[1];
1083
44.0k
        sumsq3_ptrs[2] = sumsq3_rows[2];
1084
44.0k
        sum3_ptrs[0] = sum3_rows[0];
1085
44.0k
        sum3_ptrs[1] = sum3_rows[1];
1086
44.0k
        sum3_ptrs[2] = sum3_rows[2];
1087
1088
44.0k
        sgr_box35_row_h(sumsq3_rows[0], sum3_rows[0],
1089
44.0k
                        sumsq5_rows[0], sum5_rows[0],
1090
44.0k
                        NULL, lpf, w, edges);
1091
44.0k
        lpf += PXSTRIDE(stride);
1092
44.0k
        sgr_box35_row_h(sumsq3_rows[1], sum3_rows[1],
1093
44.0k
                        sumsq5_rows[1], sum5_rows[1],
1094
44.0k
                        NULL, lpf, w, edges);
1095
1096
44.0k
        sgr_box35_row_h(sumsq3_rows[2], sum3_rows[2],
1097
44.0k
                        sumsq5_rows[2], sum5_rows[2],
1098
44.0k
                        left, src, w, edges);
1099
44.0k
        left++;
1100
44.0k
        src += PXSTRIDE(stride);
1101
1102
44.0k
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1103
44.0k
                      w, params->sgr.s1, BITDEPTH_MAX);
1104
44.0k
        rotate(A3_ptrs, B3_ptrs, 4);
1105
1106
44.0k
        if (--h <= 0)
1107
555
            goto vert_1;
1108
1109
43.4k
        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1110
43.4k
                        sumsq5_rows[3], sum5_rows[3],
1111
43.4k
                        left, src, w, edges);
1112
43.4k
        left++;
1113
43.4k
        src += PXSTRIDE(stride);
1114
43.4k
        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1115
43.4k
                      w, params->sgr.s0, BITDEPTH_MAX);
1116
43.4k
        rotate(A5_ptrs, B5_ptrs, 2);
1117
43.4k
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1118
43.4k
                      w, params->sgr.s1, BITDEPTH_MAX);
1119
43.4k
        rotate(A3_ptrs, B3_ptrs, 4);
1120
1121
43.4k
        if (--h <= 0)
1122
496
            goto vert_2;
1123
1124
        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
1125
        // one of them to point at the previously unused rows[4].
1126
42.9k
        sumsq5_ptrs[3] = sumsq5_rows[4];
1127
42.9k
        sum5_ptrs[3] = sum5_rows[4];
1128
42.9k
    } else {
1129
24.6k
        sumsq5_ptrs[0] = sumsq5_rows[0];
1130
24.6k
        sumsq5_ptrs[1] = sumsq5_rows[0];
1131
24.6k
        sumsq5_ptrs[2] = sumsq5_rows[0];
1132
24.6k
        sumsq5_ptrs[3] = sumsq5_rows[0];
1133
24.6k
        sumsq5_ptrs[4] = sumsq5_rows[0];
1134
24.6k
        sum5_ptrs[0] = sum5_rows[0];
1135
24.6k
        sum5_ptrs[1] = sum5_rows[0];
1136
24.6k
        sum5_ptrs[2] = sum5_rows[0];
1137
24.6k
        sum5_ptrs[3] = sum5_rows[0];
1138
24.6k
        sum5_ptrs[4] = sum5_rows[0];
1139
1140
24.6k
        sumsq3_ptrs[0] = sumsq3_rows[0];
1141
24.6k
        sumsq3_ptrs[1] = sumsq3_rows[0];
1142
24.6k
        sumsq3_ptrs[2] = sumsq3_rows[0];
1143
24.6k
        sum3_ptrs[0] = sum3_rows[0];
1144
24.6k
        sum3_ptrs[1] = sum3_rows[0];
1145
24.6k
        sum3_ptrs[2] = sum3_rows[0];
1146
1147
24.6k
        sgr_box35_row_h(sumsq3_rows[0], sum3_rows[0],
1148
24.6k
                        sumsq5_rows[0], sum5_rows[0],
1149
24.6k
                        left, src, w, edges);
1150
24.6k
        left++;
1151
24.6k
        src += PXSTRIDE(stride);
1152
1153
24.6k
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1154
24.6k
                      w, params->sgr.s1, BITDEPTH_MAX);
1155
24.6k
        rotate(A3_ptrs, B3_ptrs, 4);
1156
1157
24.6k
        if (--h <= 0)
1158
3.26k
            goto vert_1;
1159
1160
21.3k
        sumsq5_ptrs[4] = sumsq5_rows[1];
1161
21.3k
        sum5_ptrs[4] = sum5_rows[1];
1162
1163
21.3k
        sumsq3_ptrs[2] = sumsq3_rows[1];
1164
21.3k
        sum3_ptrs[2] = sum3_rows[1];
1165
1166
21.3k
        sgr_box35_row_h(sumsq3_rows[1], sum3_rows[1],
1167
21.3k
                        sumsq5_rows[1], sum5_rows[1],
1168
21.3k
                        left, src, w, edges);
1169
21.3k
        left++;
1170
21.3k
        src += PXSTRIDE(stride);
1171
1172
21.3k
        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1173
21.3k
                      w, params->sgr.s0, BITDEPTH_MAX);
1174
21.3k
        rotate(A5_ptrs, B5_ptrs, 2);
1175
21.3k
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1176
21.3k
                      w, params->sgr.s1, BITDEPTH_MAX);
1177
21.3k
        rotate(A3_ptrs, B3_ptrs, 4);
1178
1179
21.3k
        if (--h <= 0)
1180
5.22k
            goto vert_2;
1181
1182
16.1k
        sumsq5_ptrs[3] = sumsq5_rows[2];
1183
16.1k
        sumsq5_ptrs[4] = sumsq5_rows[3];
1184
16.1k
        sum5_ptrs[3] = sum5_rows[2];
1185
16.1k
        sum5_ptrs[4] = sum5_rows[3];
1186
1187
16.1k
        sumsq3_ptrs[2] = sumsq3_rows[2];
1188
16.1k
        sum3_ptrs[2] = sum3_rows[2];
1189
1190
16.1k
        sgr_box35_row_h(sumsq3_rows[2], sum3_rows[2],
1191
16.1k
                        sumsq5_rows[2], sum5_rows[2],
1192
16.1k
                        left, src, w, edges);
1193
16.1k
        left++;
1194
16.1k
        src += PXSTRIDE(stride);
1195
1196
16.1k
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1197
16.1k
                      w, params->sgr.s1, BITDEPTH_MAX);
1198
16.1k
        rotate(A3_ptrs, B3_ptrs, 4);
1199
1200
16.1k
        if (--h <= 0)
1201
1.96k
            goto odd;
1202
1203
14.1k
        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1204
14.1k
                        sumsq5_rows[3], sum5_rows[3],
1205
14.1k
                        left, src, w, edges);
1206
14.1k
        left++;
1207
14.1k
        src += PXSTRIDE(stride);
1208
1209
14.1k
        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1210
14.1k
                      w, params->sgr.s0, BITDEPTH_MAX);
1211
14.1k
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1212
14.1k
                      w, params->sgr.s1, BITDEPTH_MAX);
1213
14.1k
        sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1214
14.1k
                       w, 2, params->sgr.w0, params->sgr.w1
1215
14.1k
                       HIGHBD_TAIL_SUFFIX);
1216
1217
14.1k
        if (--h <= 0)
1218
1.42k
            goto vert_2;
1219
1220
        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
1221
        // one of them to point at the previously unused rows[4].
1222
12.7k
        sumsq5_ptrs[3] = sumsq5_rows[4];
1223
12.7k
        sum5_ptrs[3] = sum5_rows[4];
1224
12.7k
    }
1225
1226
1.23M
    do {
1227
1.23M
        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1228
1.23M
                        sumsq5_ptrs[3], sum5_ptrs[3],
1229
1.23M
                        left, src, w, edges);
1230
1.23M
        left++;
1231
1.23M
        src += PXSTRIDE(stride);
1232
1233
1.23M
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1234
1.23M
                      w, params->sgr.s1, BITDEPTH_MAX);
1235
1.23M
        rotate(A3_ptrs, B3_ptrs, 4);
1236
1237
1.23M
        if (--h <= 0)
1238
8.72k
            goto odd;
1239
1240
1.22M
        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1241
1.22M
                        sumsq5_ptrs[4], sum5_ptrs[4],
1242
1.22M
                        left, src, w, edges);
1243
1.22M
        left++;
1244
1.22M
        src += PXSTRIDE(stride);
1245
1246
1.22M
        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1247
1.22M
                      w, params->sgr.s0, BITDEPTH_MAX);
1248
1.22M
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1249
1.22M
                      w, params->sgr.s1, BITDEPTH_MAX);
1250
1.22M
        sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1251
1.22M
                       w, 2, params->sgr.w0, params->sgr.w1
1252
1.22M
                       HIGHBD_TAIL_SUFFIX);
1253
1.22M
    } while (--h > 0);
1254
1255
46.9k
    if (!(edges & LR_HAVE_BOTTOM))
1256
2.96k
        goto vert_2;
1257
1258
44.0k
    sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1259
44.0k
                    sumsq5_ptrs[3], sum5_ptrs[3],
1260
44.0k
                    NULL, lpf_bottom, w, edges);
1261
44.0k
    lpf_bottom += PXSTRIDE(stride);
1262
44.0k
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1263
44.0k
                  w, params->sgr.s1, BITDEPTH_MAX);
1264
44.0k
    rotate(A3_ptrs, B3_ptrs, 4);
1265
1266
44.0k
    sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1267
44.0k
                    sumsq5_ptrs[4], sum5_ptrs[4],
1268
44.0k
                    NULL, lpf_bottom, w, edges);
1269
1270
54.1k
output_2:
1271
54.1k
    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1272
54.1k
                  w, params->sgr.s0, BITDEPTH_MAX);
1273
54.1k
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1274
54.1k
                  w, params->sgr.s1, BITDEPTH_MAX);
1275
54.1k
    sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1276
54.1k
                   w, 2, params->sgr.w0, params->sgr.w1
1277
54.1k
                   HIGHBD_TAIL_SUFFIX);
1278
54.1k
    return;
1279
1280
10.1k
vert_2:
1281
    // Duplicate the last row twice more
1282
10.1k
    sumsq5_ptrs[3] = sumsq5_ptrs[2];
1283
10.1k
    sumsq5_ptrs[4] = sumsq5_ptrs[2];
1284
10.1k
    sum5_ptrs[3] = sum5_ptrs[2];
1285
10.1k
    sum5_ptrs[4] = sum5_ptrs[2];
1286
1287
10.1k
    sumsq3_ptrs[2] = sumsq3_ptrs[1];
1288
10.1k
    sum3_ptrs[2] = sum3_ptrs[1];
1289
10.1k
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1290
10.1k
                  w, params->sgr.s1, BITDEPTH_MAX);
1291
10.1k
    rotate(A3_ptrs, B3_ptrs, 4);
1292
1293
10.1k
    sumsq3_ptrs[2] = sumsq3_ptrs[1];
1294
10.1k
    sum3_ptrs[2] = sum3_ptrs[1];
1295
1296
10.1k
    goto output_2;
1297
1298
10.6k
odd:
1299
    // Copy the last row as padding once
1300
10.6k
    sumsq5_ptrs[4] = sumsq5_ptrs[3];
1301
10.6k
    sum5_ptrs[4] = sum5_ptrs[3];
1302
1303
10.6k
    sumsq3_ptrs[2] = sumsq3_ptrs[1];
1304
10.6k
    sum3_ptrs[2] = sum3_ptrs[1];
1305
1306
10.6k
    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1307
10.6k
                  w, params->sgr.s0, BITDEPTH_MAX);
1308
10.6k
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1309
10.6k
                  w, params->sgr.s1, BITDEPTH_MAX);
1310
10.6k
    sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1311
10.6k
                   w, 2, params->sgr.w0, params->sgr.w1
1312
10.6k
                   HIGHBD_TAIL_SUFFIX);
1313
1314
14.5k
output_1:
1315
    // Duplicate the last row twice more
1316
14.5k
    sumsq5_ptrs[3] = sumsq5_ptrs[2];
1317
14.5k
    sumsq5_ptrs[4] = sumsq5_ptrs[2];
1318
14.5k
    sum5_ptrs[3] = sum5_ptrs[2];
1319
14.5k
    sum5_ptrs[4] = sum5_ptrs[2];
1320
1321
14.5k
    sumsq3_ptrs[2] = sumsq3_ptrs[1];
1322
14.5k
    sum3_ptrs[2] = sum3_ptrs[1];
1323
1324
14.5k
    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1325
14.5k
                  w, params->sgr.s0, BITDEPTH_MAX);
1326
14.5k
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1327
14.5k
                  w, params->sgr.s1, BITDEPTH_MAX);
1328
14.5k
    rotate(A3_ptrs, B3_ptrs, 4);
1329
    // Output only one row
1330
14.5k
    sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1331
14.5k
                   w, 1, params->sgr.w0, params->sgr.w1
1332
14.5k
                   HIGHBD_TAIL_SUFFIX);
1333
14.5k
    return;
1334
1335
3.82k
vert_1:
1336
    // Copy the last row as padding once
1337
3.82k
    sumsq5_ptrs[4] = sumsq5_ptrs[3];
1338
3.82k
    sum5_ptrs[4] = sum5_ptrs[3];
1339
1340
3.82k
    sumsq3_ptrs[2] = sumsq3_ptrs[1];
1341
3.82k
    sum3_ptrs[2] = sum3_ptrs[1];
1342
1343
3.82k
    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1344
3.82k
                  w, params->sgr.s0, BITDEPTH_MAX);
1345
3.82k
    rotate(A5_ptrs, B5_ptrs, 2);
1346
3.82k
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1347
3.82k
                  w, params->sgr.s1, BITDEPTH_MAX);
1348
3.82k
    rotate(A3_ptrs, B3_ptrs, 4);
1349
1350
3.82k
    goto output_1;
1351
10.6k
}
1352
1353
#if HAVE_ASM
1354
#if ARCH_AARCH64 || ARCH_ARM
1355
#include "src/arm/looprestoration.h"
1356
#elif ARCH_LOONGARCH64
1357
#include "src/loongarch/looprestoration.h"
1358
#elif ARCH_PPC64LE
1359
#include "src/ppc/looprestoration.h"
1360
#elif ARCH_X86
1361
#include "src/x86/looprestoration.h"
1362
#endif
1363
#endif
1364
1365
COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c,
1366
                                                 const int bpc)
1367
68.8k
{
1368
68.8k
    c->wiener[0] = c->wiener[1] = wiener_c;
1369
68.8k
    c->sgr[0] = sgr_5x5_c;
1370
68.8k
    c->sgr[1] = sgr_3x3_c;
1371
68.8k
    c->sgr[2] = sgr_mix_c;
1372
1373
#if HAVE_ASM
1374
#if ARCH_AARCH64 || ARCH_ARM
1375
    loop_restoration_dsp_init_arm(c, bpc);
1376
#elif ARCH_LOONGARCH64
1377
    loop_restoration_dsp_init_loongarch(c, bpc);
1378
#elif ARCH_PPC64LE
1379
    loop_restoration_dsp_init_ppc(c, bpc);
1380
#elif ARCH_X86
1381
    loop_restoration_dsp_init_x86(c, bpc);
1382
#endif
1383
#endif
1384
68.8k
}
dav1d_loop_restoration_dsp_init_8bpc
Line
Count
Source
1367
30.8k
{
1368
30.8k
    c->wiener[0] = c->wiener[1] = wiener_c;
1369
30.8k
    c->sgr[0] = sgr_5x5_c;
1370
30.8k
    c->sgr[1] = sgr_3x3_c;
1371
30.8k
    c->sgr[2] = sgr_mix_c;
1372
1373
#if HAVE_ASM
1374
#if ARCH_AARCH64 || ARCH_ARM
1375
    loop_restoration_dsp_init_arm(c, bpc);
1376
#elif ARCH_LOONGARCH64
1377
    loop_restoration_dsp_init_loongarch(c, bpc);
1378
#elif ARCH_PPC64LE
1379
    loop_restoration_dsp_init_ppc(c, bpc);
1380
#elif ARCH_X86
1381
    loop_restoration_dsp_init_x86(c, bpc);
1382
#endif
1383
#endif
1384
30.8k
}
dav1d_loop_restoration_dsp_init_16bpc
Line
Count
Source
1367
38.0k
{
1368
38.0k
    c->wiener[0] = c->wiener[1] = wiener_c;
1369
38.0k
    c->sgr[0] = sgr_5x5_c;
1370
38.0k
    c->sgr[1] = sgr_3x3_c;
1371
38.0k
    c->sgr[2] = sgr_mix_c;
1372
1373
#if HAVE_ASM
1374
#if ARCH_AARCH64 || ARCH_ARM
1375
    loop_restoration_dsp_init_arm(c, bpc);
1376
#elif ARCH_LOONGARCH64
1377
    loop_restoration_dsp_init_loongarch(c, bpc);
1378
#elif ARCH_PPC64LE
1379
    loop_restoration_dsp_init_ppc(c, bpc);
1380
#elif ARCH_X86
1381
    loop_restoration_dsp_init_x86(c, bpc);
1382
#endif
1383
#endif
1384
38.0k
}