Coverage Report

Created: 2026-05-16 06:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/dav1d/src/looprestoration_tmpl.c
Line
Count
Source
1
/*
2
 * Copyright © 2018, VideoLAN and dav1d authors
3
 * Copyright © 2018, Two Orioles, LLC
4
 * All rights reserved.
5
 *
6
 * Redistribution and use in source and binary forms, with or without
7
 * modification, are permitted provided that the following conditions are met:
8
 *
9
 * 1. Redistributions of source code must retain the above copyright notice, this
10
 *    list of conditions and the following disclaimer.
11
 *
12
 * 2. Redistributions in binary form must reproduce the above copyright notice,
13
 *    this list of conditions and the following disclaimer in the documentation
14
 *    and/or other materials provided with the distribution.
15
 *
16
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
 */
27
28
#include "config.h"
29
30
#include <stdint.h>
31
#include <stdlib.h>
32
#include <string.h>
33
34
#include "common/attributes.h"
35
#include "common/bitdepth.h"
36
#include "common/intops.h"
37
38
#include "src/looprestoration.h"
39
#include "src/tables.h"
40
41
// 256 * 1.5 + 3 + 3 = 390
42
0
#define REST_UNIT_STRIDE (390)
43
44
static void wiener_filter_h(uint16_t *dst, const pixel (*left)[4],
45
                            const pixel *src, const int16_t fh[8],
46
                            const int w, const enum LrEdgeFlags edges
47
                            HIGHBD_DECL_SUFFIX)
48
0
{
49
0
    const int bitdepth = bitdepth_from_max(bitdepth_max);
50
0
    const int round_bits_h = 3 + (bitdepth == 12) * 2;
51
0
    const int rounding_off_h = 1 << (round_bits_h - 1);
52
0
    const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h);
53
54
0
    if (w < 6) {
55
        // For small widths, do the fully conditional loop with
56
        // conditions on each access.
57
0
        for (int x = 0; x < w; x++) {
58
0
            int sum = (1 << (bitdepth + 6));
59
0
#if BITDEPTH == 8
60
0
            sum += src[x] * 128;
61
0
#endif
62
0
            for (int i = 0; i < 7; i++) {
63
0
                int idx = x + i - 3;
64
0
                if (idx < 0) {
65
0
                    if (!(edges & LR_HAVE_LEFT))
66
0
                        sum += src[0] * fh[i];
67
0
                    else if (left)
68
0
                        sum += left[0][4 + idx] * fh[i];
69
0
                    else
70
0
                        sum += src[idx] * fh[i];
71
0
                } else if (idx >= w && !(edges & LR_HAVE_RIGHT)) {
72
0
                    sum += src[w - 1] * fh[i];
73
0
                } else
74
0
                    sum += src[idx] * fh[i];
75
0
            }
76
0
            sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
77
0
            dst[x] = sum;
78
0
        }
79
80
0
        return;
81
0
    }
82
83
    // For larger widths, do separate loops with less conditions; first
84
    // handle the start of the row.
85
0
    int start = 3;
86
0
    if (!(edges & LR_HAVE_LEFT)) {
87
        // If there's no left edge, pad using the leftmost pixel.
88
0
        for (int x = 0; x < 3; x++) {
89
0
            int sum = (1 << (bitdepth + 6));
90
0
#if BITDEPTH == 8
91
0
            sum += src[x] * 128;
92
0
#endif
93
0
            for (int i = 0; i < 7; i++) {
94
0
                int idx = x + i - 3;
95
0
                if (idx < 0)
96
0
                    sum += src[0] * fh[i];
97
0
                else
98
0
                    sum += src[idx] * fh[i];
99
0
            }
100
0
            sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
101
0
            dst[x] = sum;
102
0
        }
103
0
    } else if (left) {
104
        // If we have the left edge and a separate left buffer, pad using that.
105
0
        for (int x = 0; x < 3; x++) {
106
0
            int sum = (1 << (bitdepth + 6));
107
0
#if BITDEPTH == 8
108
0
            sum += src[x] * 128;
109
0
#endif
110
0
            for (int i = 0; i < 7; i++) {
111
0
                int idx = x + i - 3;
112
0
                if (idx < 0)
113
0
                    sum += left[0][4 + idx] * fh[i];
114
0
                else
115
0
                    sum += src[idx] * fh[i];
116
0
            }
117
0
            sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
118
0
            dst[x] = sum;
119
0
        }
120
0
    } else {
121
        // If we have the left edge, but no separate left buffer, we're in the
122
        // top/bottom area (lpf) with the left edge existing in the same
123
        // buffer; just do the regular loop from the start.
124
0
        start = 0;
125
0
    }
126
0
    int end = w - 3;
127
0
    if (edges & LR_HAVE_RIGHT)
128
0
        end = w;
129
130
    // Do a condititon free loop for the bulk of the row.
131
0
    for (int x = start; x < end; x++) {
132
0
        int sum = (1 << (bitdepth + 6));
133
0
#if BITDEPTH == 8
134
0
        sum += src[x] * 128;
135
0
#endif
136
0
        for (int i = 0; i < 7; i++) {
137
0
            int idx = x + i - 3;
138
0
            sum += src[idx] * fh[i];
139
0
        }
140
0
        sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
141
0
        dst[x] = sum;
142
0
    }
143
144
    // If we need to, calculate the end of the row with a condition for
145
    // right edge padding.
146
0
    for (int x = end; x < w; x++) {
147
0
        int sum = (1 << (bitdepth + 6));
148
0
#if BITDEPTH == 8
149
0
        sum += src[x] * 128;
150
0
#endif
151
0
        for (int i = 0; i < 7; i++) {
152
0
            int idx = x + i - 3;
153
0
            if (idx >= w)
154
0
                sum += src[w - 1] * fh[i];
155
0
            else
156
0
                sum += src[idx] * fh[i];
157
0
        }
158
0
        sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
159
0
        dst[x] = sum;
160
0
    }
161
0
}
162
163
static void wiener_filter_v(pixel *p, uint16_t **ptrs, const int16_t fv[8],
164
                            const int w HIGHBD_DECL_SUFFIX)
165
0
{
166
0
    const int bitdepth = bitdepth_from_max(bitdepth_max);
167
168
0
    const int round_bits_v = 11 - (bitdepth == 12) * 2;
169
0
    const int rounding_off_v = 1 << (round_bits_v - 1);
170
0
    const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
171
172
0
    for (int i = 0; i < w; i++) {
173
0
        int sum = -round_offset;
174
175
        // Only filter using 6 input rows. The 7th row is assumed to be
176
        // identical to the last one.
177
        //
178
        // This function is assumed to only be called at the end, when doing
179
        // padding at the bottom.
180
0
        for (int k = 0; k < 6; k++)
181
0
            sum += ptrs[k][i] * fv[k];
182
0
        sum += ptrs[5][i] * fv[6];
183
184
0
        p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v);
185
0
    }
186
187
    // Shift the pointers, but only update the first 5; the 6th pointer is kept
188
    // as it was before (and the 7th is implicitly identical to the 6th).
189
0
    for (int i = 0; i < 5; i++)
190
0
        ptrs[i] = ptrs[i + 1];
191
0
}
192
193
static void wiener_filter_hv(pixel *p, uint16_t **ptrs, const pixel (*left)[4],
194
                             const pixel *src, const int16_t filter[2][8],
195
                             const int w, const enum LrEdgeFlags edges
196
                             HIGHBD_DECL_SUFFIX)
197
0
{
198
0
    const int bitdepth = bitdepth_from_max(bitdepth_max);
199
200
0
    const int round_bits_v = 11 - (bitdepth == 12) * 2;
201
0
    const int rounding_off_v = 1 << (round_bits_v - 1);
202
0
    const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
203
204
0
    const int16_t *fh = filter[0];
205
0
    const int16_t *fv = filter[1];
206
207
    // Do combined horziontal and vertical filtering; doing horizontal
208
    // filtering of one row, combined with vertical filtering of 6
209
    // preexisting rows and the newly filtered row.
210
211
    // For simplicity in the C implementation, just do a separate call
212
    // of the horizontal filter, into a temporary buffer.
213
0
    uint16_t tmp[REST_UNIT_STRIDE];
214
0
    wiener_filter_h(tmp, left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
215
216
0
    for (int i = 0; i < w; i++) {
217
0
        int sum = -round_offset;
218
219
        // Filter using the 6 stored preexisting rows, and the newly
220
        // filtered one in tmp[].
221
0
        for (int k = 0; k < 6; k++)
222
0
            sum += ptrs[k][i] * fv[k];
223
0
        sum += tmp[i] * fv[6];
224
        // At this point, after having read all inputs at point [i], we
225
        // could overwrite [i] with the newly filtered data.
226
227
0
        p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v);
228
0
    }
229
230
    // For simplicity in the C implementation, just memcpy the newly
231
    // filtered row into ptrs[6]. Normally, in steady state filtering,
232
    // this output row, ptrs[6], is equal to ptrs[0]. However at startup,
233
    // at the top of the filtered area, we may have ptrs[0] equal to ptrs[1],
234
    // so we can't assume we can write into ptrs[0] but we need to keep
235
    // a separate pointer for the next row to write into.
236
0
    memcpy(ptrs[6], tmp, sizeof(uint16_t) * REST_UNIT_STRIDE);
237
238
    // Rotate the window of pointers. Shift the 6 pointers downwards one step.
239
0
    for (int i = 0; i < 6; i++)
240
0
        ptrs[i] = ptrs[i + 1];
241
    // The topmost pointer, ptrs[6], which isn't used as input, is set to
242
    // ptrs[0], which will be used as output for the next _hv call.
243
    // At the start of the filtering, the caller may set ptrs[6] to the
244
    // right next buffer to fill in, instead.
245
0
    ptrs[6] = ptrs[0];
246
0
}
247
248
// FIXME Could split into luma and chroma specific functions,
249
// (since first and last tops are always 0 for chroma)
250
static void wiener_c(pixel *p, const ptrdiff_t stride,
251
                     const pixel (*left)[4],
252
                     const pixel *lpf, const int w, int h,
253
                     const LooprestorationParams *const params,
254
                     const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
255
0
{
256
    // Values stored between horizontal and vertical filtering don't
257
    // fit in a uint8_t.
258
0
    uint16_t hor[6 * REST_UNIT_STRIDE];
259
0
    uint16_t *ptrs[7], *rows[6];
260
0
    for (int i = 0; i < 6; i++)
261
0
        rows[i] = &hor[i * REST_UNIT_STRIDE];
262
0
    const int16_t (*const filter)[8] = params->filter;
263
0
    const int16_t *fh = params->filter[0];
264
0
    const int16_t *fv = params->filter[1];
265
0
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
266
267
0
    const pixel *src = p;
268
0
    if (edges & LR_HAVE_TOP) {
269
0
        ptrs[0] = rows[0];
270
0
        ptrs[1] = rows[0];
271
0
        ptrs[2] = rows[1];
272
0
        ptrs[3] = rows[2];
273
0
        ptrs[4] = rows[2];
274
0
        ptrs[5] = rows[2];
275
276
0
        wiener_filter_h(rows[0], NULL, lpf, fh, w, edges HIGHBD_TAIL_SUFFIX);
277
0
        lpf += PXSTRIDE(stride);
278
0
        wiener_filter_h(rows[1], NULL, lpf, fh, w, edges HIGHBD_TAIL_SUFFIX);
279
280
0
        wiener_filter_h(rows[2], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
281
0
        left++;
282
0
        src += PXSTRIDE(stride);
283
284
0
        if (--h <= 0)
285
0
            goto v1;
286
287
0
        ptrs[4] = ptrs[5] = rows[3];
288
0
        wiener_filter_h(rows[3], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
289
0
        left++;
290
0
        src += PXSTRIDE(stride);
291
292
0
        if (--h <= 0)
293
0
            goto v2;
294
295
0
        ptrs[5] = rows[4];
296
0
        wiener_filter_h(rows[4], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
297
0
        left++;
298
0
        src += PXSTRIDE(stride);
299
300
0
        if (--h <= 0)
301
0
            goto v3;
302
0
    } else {
303
0
        ptrs[0] = rows[0];
304
0
        ptrs[1] = rows[0];
305
0
        ptrs[2] = rows[0];
306
0
        ptrs[3] = rows[0];
307
0
        ptrs[4] = rows[0];
308
0
        ptrs[5] = rows[0];
309
310
0
        wiener_filter_h(rows[0], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
311
0
        left++;
312
0
        src += PXSTRIDE(stride);
313
314
0
        if (--h <= 0)
315
0
            goto v1;
316
317
0
        ptrs[4] = ptrs[5] = rows[1];
318
0
        wiener_filter_h(rows[1], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
319
0
        left++;
320
0
        src += PXSTRIDE(stride);
321
322
0
        if (--h <= 0)
323
0
            goto v2;
324
325
0
        ptrs[5] = rows[2];
326
0
        wiener_filter_h(rows[2], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
327
0
        left++;
328
0
        src += PXSTRIDE(stride);
329
330
0
        if (--h <= 0)
331
0
            goto v3;
332
333
0
        ptrs[6] = rows[3];
334
0
        wiener_filter_hv(p, ptrs, left, src, filter, w, edges
335
0
                         HIGHBD_TAIL_SUFFIX);
336
0
        left++;
337
0
        src += PXSTRIDE(stride);
338
0
        p += PXSTRIDE(stride);
339
340
0
        if (--h <= 0)
341
0
            goto v3;
342
343
0
        ptrs[6] = rows[4];
344
0
        wiener_filter_hv(p, ptrs, left, src, filter, w, edges
345
0
                         HIGHBD_TAIL_SUFFIX);
346
0
        left++;
347
0
        src += PXSTRIDE(stride);
348
0
        p += PXSTRIDE(stride);
349
350
0
        if (--h <= 0)
351
0
            goto v3;
352
0
    }
353
354
0
    ptrs[6] = ptrs[5] + REST_UNIT_STRIDE;
355
0
    do {
356
0
        wiener_filter_hv(p, ptrs, left, src, filter, w, edges
357
0
                         HIGHBD_TAIL_SUFFIX);
358
0
        left++;
359
0
        src += PXSTRIDE(stride);
360
0
        p += PXSTRIDE(stride);
361
0
    } while (--h > 0);
362
363
0
    if (!(edges & LR_HAVE_BOTTOM))
364
0
        goto v3;
365
366
0
    wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges
367
0
                     HIGHBD_TAIL_SUFFIX);
368
0
    lpf_bottom += PXSTRIDE(stride);
369
0
    p += PXSTRIDE(stride);
370
371
0
    wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges
372
0
                     HIGHBD_TAIL_SUFFIX);
373
0
    p += PXSTRIDE(stride);
374
0
v1:
375
0
    wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
376
377
0
    return;
378
379
0
v3:
380
0
    wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
381
0
    p += PXSTRIDE(stride);
382
0
v2:
383
0
    wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
384
0
    p += PXSTRIDE(stride);
385
0
    goto v1;
386
0
}
387
388
// SGR
389
/*
 * Rotate two parallel arrays of n row pointers by one position: every
 * entry moves one slot towards index 0 and the former first entry becomes
 * the last one.
 */
static NOINLINE void rotate(int32_t **sumsq_ptrs, coef **sum_ptrs, int n)
{
    int32_t *const first_sumsq = sumsq_ptrs[0];
    coef *const first_sum = sum_ptrs[0];

    for (int dst = 0, from = 1; from < n; dst++, from++) {
        sumsq_ptrs[dst] = sumsq_ptrs[from];
        sum_ptrs[dst] = sum_ptrs[from];
    }

    sumsq_ptrs[n - 1] = first_sumsq;
    sum_ptrs[n - 1] = first_sum;
}
400
401
/*
 * Rotate two parallel arrays of 5 row pointers by two positions:
 * entries 2..4 move to 0..2 and the former entries 0..1 become 3..4.
 */
static NOINLINE void rotate5_x2(int32_t **sumsq_ptrs, coef **sum_ptrs)
{
    int32_t *const sumsq_0 = sumsq_ptrs[0];
    int32_t *const sumsq_1 = sumsq_ptrs[1];
    coef *const sum_0 = sum_ptrs[0];
    coef *const sum_1 = sum_ptrs[1];

    sumsq_ptrs[0] = sumsq_ptrs[2];
    sum_ptrs[0]   = sum_ptrs[2];
    sumsq_ptrs[1] = sumsq_ptrs[3];
    sum_ptrs[1]   = sum_ptrs[3];
    sumsq_ptrs[2] = sumsq_ptrs[4];
    sum_ptrs[2]   = sum_ptrs[4];

    sumsq_ptrs[3] = sumsq_0;
    sum_ptrs[3]   = sum_0;
    sumsq_ptrs[4] = sumsq_1;
    sum_ptrs[4]   = sum_1;
}
418
419
static NOINLINE void sgr_box3_row_h(int32_t *sumsq, coef *sum,
420
                                    const pixel (*left)[4],
421
                                    const pixel *src, const int w,
422
                                    const enum LrEdgeFlags edges)
423
0
{
424
0
    sumsq++;
425
0
    sum++;
426
0
    int a = edges & LR_HAVE_LEFT ? (left ? left[0][2] : src[-2]) : src[0];
427
0
    int b = edges & LR_HAVE_LEFT ? (left ? left[0][3] : src[-1]) : src[0];
428
0
    for (int x = -1; x < w + 1; x++) {
429
0
        int c = (x + 1 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 1] : src[w - 1];
430
0
        sum[x] = a + b + c;
431
0
        sumsq[x] = a * a + b * b + c * c;
432
0
        a = b;
433
0
        b = c;
434
0
    }
435
0
}
436
437
static NOINLINE void sgr_box5_row_h(int32_t *sumsq, coef *sum,
438
                                    const pixel (*left)[4],
439
                                    const pixel *src, const int w,
440
                                    const enum LrEdgeFlags edges)
441
0
{
442
0
    sumsq++;
443
0
    sum++;
444
0
    int a = edges & LR_HAVE_LEFT ? (left ? left[0][1] : src[-3]) : src[0];
445
0
    int b = edges & LR_HAVE_LEFT ? (left ? left[0][2] : src[-2]) : src[0];
446
0
    int c = edges & LR_HAVE_LEFT ? (left ? left[0][3] : src[-1]) : src[0];
447
0
    int d = src[0];
448
0
    for (int x = -1; x < w + 1; x++) {
449
0
        int e = (x + 2 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 2] : src[w - 1];
450
0
        sum[x] = a + b + c + d + e;
451
0
        sumsq[x] = a * a + b * b + c * c + d * d + e * e;
452
0
        a = b;
453
0
        b = c;
454
0
        c = d;
455
0
        d = e;
456
0
    }
457
0
}
458
459
static void sgr_box35_row_h(int32_t *sumsq3, coef *sum3,
460
                            int32_t *sumsq5, coef *sum5,
461
                            const pixel (*left)[4],
462
                            const pixel *src, const int w,
463
                            const enum LrEdgeFlags edges)
464
0
{
465
0
    sgr_box3_row_h(sumsq3, sum3, left, src, w, edges);
466
0
    sgr_box5_row_h(sumsq5, sum5, left, src, w, edges);
467
0
}
468
469
/*
 * Vertical part of the 3x3 box sum: for each of the w + 2 columns, add up
 * the entries of the three rows sumsq[0..2] / sum[0..2] and store the
 * totals in sumsq_out / sum_out.
 */
static NOINLINE void sgr_box3_row_v(int32_t **sumsq, coef **sum,
                                    int32_t *sumsq_out, coef *sum_out,
                                    const int w)
{
    for (int x = 0; x < w + 2; x++) {
        int sq_total = 0;
        int s_total = 0;
        // Read all inputs for this column before writing any output.
        for (int row = 0; row < 3; row++)
            sq_total += sumsq[row][x];
        for (int row = 0; row < 3; row++)
            s_total += sum[row][x];
        sumsq_out[x] = sq_total;
        sum_out[x] = s_total;
    }
}
484
485
/*
 * Vertical part of the 5x5 box sum: for each of the w + 2 columns, add up
 * the entries of the five rows sumsq[0..4] / sum[0..4] and store the
 * totals in sumsq_out / sum_out.
 */
static NOINLINE void sgr_box5_row_v(int32_t **sumsq, coef **sum,
                                    int32_t *sumsq_out, coef *sum_out,
                                    const int w)
{
    for (int x = 0; x < w + 2; x++) {
        int sq_total = 0;
        int s_total = 0;
        // Read all inputs for this column before writing any output.
        for (int row = 0; row < 5; row++)
            sq_total += sumsq[row][x];
        for (int row = 0; row < 5; row++)
            s_total += sum[row][x];
        sumsq_out[x] = sq_total;
        sum_out[x] = s_total;
    }
}
504
505
/*
 * Convert one row of box sums into the self-guided filter coefficients,
 * in place, for w + 2 columns.
 *
 * On entry AA[i] holds a sum of squares and BB[i] a plain sum over a box
 * of n pixels. Both are first scaled down with rounding by the number of
 * bits above 8 (twice that for the squares). Then
 *     p = max(a * n - b * b, 0)
 *     z = (p * s + (1 << 19)) >> 20
 *     x = dav1d_sgr_x_by_x[min(z, 255)]
 * and the results are stored as
 *     AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12
 *     BB[i] = x
 */
static NOINLINE void sgr_calc_row_ab(int32_t *AA, coef *BB, int w, int s,
                                     int bitdepth_max, int n, int sgr_one_by_x)
{
    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    const int shift_sq = 2 * bitdepth_min_8;
    const int round_sq = (1 << shift_sq) >> 1;
    const int shift_s = bitdepth_min_8;
    const int round_s = (1 << shift_s) >> 1;

    for (int col = 0; col < w + 2; col++) {
        const int a = (AA[col] + round_sq) >> shift_sq;
        const int b = (BB[col] + round_s) >> shift_s;

        const unsigned p = imax(a * n - b * b, 0);
        const unsigned z = (p * s + (1 << 19)) >> 20;
        const unsigned x = dav1d_sgr_x_by_x[umin(z, 255)];

        // This is where we invert A and B, so that B is of size coef.
        AA[col] = (x * BB[col] * sgr_one_by_x + (1 << 11)) >> 12;
        BB[col] = x;
    }
}
524
525
/*
 * Finish one row of the 3x3 box filter: sum the three stored rows
 * vertically into (sumsq_out, sum_out), convert those sums to filter
 * coefficients in place (n = 9, one_by_x = 455), and rotate the three
 * input row pointers by one.
 */
static void sgr_box3_vert(int32_t **sumsq, coef **sum,
                          int32_t *sumsq_out, coef *sum_out,
                          const int w, const int s, const int bitdepth_max)
{
    // Vertical sums of the current 3-row window.
    sgr_box3_row_v(sumsq, sum, sumsq_out, sum_out, w);
    // Box of 3 * 3 = 9 pixels.
    sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, 9, 455);
    // The oldest row pointer becomes the last entry.
    rotate(sumsq, sum, 3);
}
533
534
/*
 * Finish one row of the 5x5 box filter: sum the five stored rows
 * vertically into (sumsq_out, sum_out), convert those sums to filter
 * coefficients in place (n = 25, one_by_x = 164), and rotate the five
 * input row pointers by two.
 */
static void sgr_box5_vert(int32_t **sumsq, coef **sum,
                          int32_t *sumsq_out, coef *sum_out,
                          const int w, const int s, const int bitdepth_max)
{
    // Vertical sums of the current 5-row window.
    sgr_box5_row_v(sumsq, sum, sumsq_out, sum_out, w);
    // Box of 5 * 5 = 25 pixels.
    sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, 25, 164);
    // The two oldest row pointers become the last two entries.
    rotate5_x2(sumsq, sum);
}
542
543
static void sgr_box3_hv(int32_t **sumsq, coef **sum,
544
                        int32_t *AA, coef *BB,
545
                        const pixel (*left)[4],
546
                        const pixel *src, const int w,
547
                        const int s,
548
                        const enum LrEdgeFlags edges,
549
                        const int bitdepth_max)
550
0
{
551
0
    sgr_box3_row_h(sumsq[2], sum[2], left, src, w, edges);
552
0
    sgr_box3_vert(sumsq, sum, AA, BB, w, s, bitdepth_max);
553
0
}
554
555
static NOINLINE void sgr_finish_filter_row1(coef *tmp,
556
                                            const pixel *src,
557
                                            int32_t **A_ptrs, coef **B_ptrs,
558
                                            const int w)
559
0
{
560
0
#define EIGHT_NEIGHBORS(P, i)\
561
0
    ((P[1][i] + P[1][i - 1] + P[1][i + 1] + P[0][i] + P[2][i]) * 4 + \
562
0
     (P[0][i - 1] + P[2][i - 1] +                           \
563
0
      P[0][i + 1] + P[2][i + 1]) * 3)
564
0
    for (int i = 0; i < w; i++) {
565
0
        const int a = EIGHT_NEIGHBORS(B_ptrs, i + 1);
566
0
        const int b = EIGHT_NEIGHBORS(A_ptrs, i + 1);
567
0
        tmp[i] = (b - a * src[i] + (1 << 8)) >> 9;
568
0
    }
569
0
#undef EIGHT_NEIGHBORS
570
0
}
571
572
0
#define FILTER_OUT_STRIDE (384)
573
574
static NOINLINE void sgr_finish_filter2(coef *tmp,
575
                                        const pixel *src,
576
                                        const ptrdiff_t src_stride,
577
                                        int32_t **A_ptrs, coef **B_ptrs,
578
                                        const int w, const int h)
579
0
{
580
0
#define SIX_NEIGHBORS(P, i)\
581
0
    ((P[0][i]     + P[1][i]) * 6 +   \
582
0
     (P[0][i - 1] + P[1][i - 1] +    \
583
0
      P[0][i + 1] + P[1][i + 1]) * 5)
584
0
    for (int i = 0; i < w; i++) {
585
0
        const int a = SIX_NEIGHBORS(B_ptrs, i + 1);
586
0
        const int b = SIX_NEIGHBORS(A_ptrs, i + 1);
587
0
        tmp[i] = (b - a * src[i] + (1 << 8)) >> 9;
588
0
    }
589
0
    if (h <= 1)
590
0
        return;
591
0
    tmp += FILTER_OUT_STRIDE;
592
0
    src += PXSTRIDE(src_stride);
593
0
    const int32_t *A = &A_ptrs[1][1];
594
0
    const coef *B = &B_ptrs[1][1];
595
0
    for (int i = 0; i < w; i++) {
596
0
        const int a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
597
0
        const int b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
598
0
        tmp[i] = (b - a * src[i] + (1 << 7)) >> 8;
599
0
    }
600
0
#undef SIX_NEIGHBORS
601
0
}
602
603
/*
 * Blend one row of filter output into the picture:
 * dst[i] = clip(dst[i] + ((w1 * t1[i] + (1 << 10)) >> 11)) for i < w.
 */
static NOINLINE void sgr_weighted_row1(pixel *dst, const coef *t1,
                                       const int w, const int w1 HIGHBD_DECL_SUFFIX)
{
    for (int x = 0; x < w; x++) {
        const int weighted = w1 * t1[x];
        const int delta = (weighted + (1 << 10)) >> 11;
        dst[x] = iclip_pixel(dst[x] + delta);
    }
}
611
612
/*
 * Blend two filter outputs into h rows of the picture:
 * dst[i] = clip(dst[i] + ((w0 * t1[i] + w1 * t2[i] + (1 << 10)) >> 11)).
 * dst advances by the picture stride per row, t1 and t2 by
 * FILTER_OUT_STRIDE entries.
 */
static NOINLINE void sgr_weighted2(pixel *dst, const ptrdiff_t dst_stride,
                                   const coef *t1, const coef *t2,
                                   const int w, const int h,
                                   const int w0, const int w1 HIGHBD_DECL_SUFFIX)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            const int weighted = w0 * t1[x] + w1 * t2[x];
            const int delta = (weighted + (1 << 10)) >> 11;
            dst[x] = iclip_pixel(dst[x] + delta);
        }
        // Step to the next row in all three buffers.
        t1 += FILTER_OUT_STRIDE;
        t2 += FILTER_OUT_STRIDE;
        dst += PXSTRIDE(dst_stride);
    }
}
627
628
static NOINLINE void sgr_finish1(pixel **dst, const ptrdiff_t stride,
629
                                 int32_t **A_ptrs, coef **B_ptrs, const int w,
630
                                 const int w1 HIGHBD_DECL_SUFFIX)
631
0
{
632
    // Only one single row, no stride needed
633
0
    ALIGN_STK_16(coef, tmp, 384,);
634
635
0
    sgr_finish_filter_row1(tmp, *dst, A_ptrs, B_ptrs, w);
636
0
    sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX);
637
0
    *dst += PXSTRIDE(stride);
638
0
    rotate(A_ptrs, B_ptrs, 3);
639
0
}
640
641
static NOINLINE void sgr_finish2(pixel **dst, const ptrdiff_t stride,
642
                                 int32_t **A_ptrs, coef **B_ptrs,
643
                                 const int w, const int h, const int w1
644
                                 HIGHBD_DECL_SUFFIX)
645
0
{
646
0
    ALIGN_STK_16(coef, tmp, 2*FILTER_OUT_STRIDE,);
647
648
0
    sgr_finish_filter2(tmp, *dst, stride, A_ptrs, B_ptrs, w, h);
649
0
    sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX);
650
0
    *dst += PXSTRIDE(stride);
651
0
    if (h > 1) {
652
0
        sgr_weighted_row1(*dst, tmp + FILTER_OUT_STRIDE, w, w1 HIGHBD_TAIL_SUFFIX);
653
0
        *dst += PXSTRIDE(stride);
654
0
    }
655
0
    rotate(A_ptrs, B_ptrs, 2);
656
0
}
657
658
/*
 * Produce up to two output rows by mixing the 5x5 and 3x3 filters.
 *
 * The 5x5 coefficients (A5/B5, two rows) go through sgr_finish_filter2();
 * the 3x3 coefficients (A3/B3) go through sgr_finish_filter_row1(), once
 * per output row, with the window shifted by one row pointer for the
 * second row. Both results are blended into the picture with weights w0
 * (5x5) and w1 (3x3). Afterwards *dst is advanced by h rows, the 5x5
 * pointers are rotated by one within 2 entries and the 3x3 pointers by
 * one within 4 entries.
 */
static NOINLINE void sgr_finish_mix(pixel **dst, const ptrdiff_t stride,
                                    int32_t **A5_ptrs, coef **B5_ptrs,
                                    int32_t **A3_ptrs, coef **B3_ptrs,
                                    const int w, const int h,
                                    const int w0, const int w1 HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(coef, tmp5, 2*FILTER_OUT_STRIDE,);
    ALIGN_STK_16(coef, tmp3, 2*FILTER_OUT_STRIDE,);

    pixel *const row = *dst;

    sgr_finish_filter2(tmp5, row, stride, A5_ptrs, B5_ptrs, w, h);
    sgr_finish_filter_row1(tmp3, row, A3_ptrs, B3_ptrs, w);
    if (h > 1) {
        sgr_finish_filter_row1(tmp3 + FILTER_OUT_STRIDE,
                               row + PXSTRIDE(stride),
                               A3_ptrs + 1, B3_ptrs + 1, w);
    }

    sgr_weighted2(row, stride, tmp5, tmp3, w, h, w0, w1 HIGHBD_TAIL_SUFFIX);
    *dst = row + h*PXSTRIDE(stride);

    rotate(A5_ptrs, B5_ptrs, 2);
    rotate(A3_ptrs, B3_ptrs, 4);
}
677
678
679
static void sgr_3x3_c(pixel *dst, const ptrdiff_t stride,
680
                      const pixel (*left)[4], const pixel *lpf,
681
                      const int w, int h,
682
                      const LooprestorationParams *const params,
683
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
684
0
{
685
0
#define BUF_STRIDE (384 + 16)
686
0
    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 3 + 16,);
687
0
    ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 3 + 16,);
688
0
    int32_t *sumsq_ptrs[3], *sumsq_rows[3];
689
0
    coef *sum_ptrs[3], *sum_rows[3];
690
0
    for (int i = 0; i < 3; i++) {
691
0
        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
692
0
        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
693
0
    }
694
695
0
    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 3 + 16,);
696
0
    ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 3 + 16,);
697
0
    int32_t *A_ptrs[3];
698
0
    coef *B_ptrs[3];
699
0
    for (int i = 0; i < 3; i++) {
700
0
        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
701
0
        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
702
0
    }
703
0
    const pixel *src = dst;
704
0
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
705
706
0
    if (edges & LR_HAVE_TOP) {
707
0
        sumsq_ptrs[0] = sumsq_rows[0];
708
0
        sumsq_ptrs[1] = sumsq_rows[1];
709
0
        sumsq_ptrs[2] = sumsq_rows[2];
710
0
        sum_ptrs[0] = sum_rows[0];
711
0
        sum_ptrs[1] = sum_rows[1];
712
0
        sum_ptrs[2] = sum_rows[2];
713
714
0
        sgr_box3_row_h(sumsq_rows[0], sum_rows[0], NULL, lpf, w, edges);
715
0
        lpf += PXSTRIDE(stride);
716
0
        sgr_box3_row_h(sumsq_rows[1], sum_rows[1], NULL, lpf, w, edges);
717
718
0
        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
719
0
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
720
0
        left++;
721
0
        src += PXSTRIDE(stride);
722
0
        rotate(A_ptrs, B_ptrs, 3);
723
724
0
        if (--h <= 0)
725
0
            goto vert_1;
726
727
0
        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
728
0
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
729
0
        left++;
730
0
        src += PXSTRIDE(stride);
731
0
        rotate(A_ptrs, B_ptrs, 3);
732
733
0
        if (--h <= 0)
734
0
            goto vert_2;
735
0
    } else {
736
0
        sumsq_ptrs[0] = sumsq_rows[0];
737
0
        sumsq_ptrs[1] = sumsq_rows[0];
738
0
        sumsq_ptrs[2] = sumsq_rows[0];
739
0
        sum_ptrs[0] = sum_rows[0];
740
0
        sum_ptrs[1] = sum_rows[0];
741
0
        sum_ptrs[2] = sum_rows[0];
742
743
0
        sgr_box3_row_h(sumsq_rows[0], sum_rows[0], left, src, w, edges);
744
0
        left++;
745
0
        src += PXSTRIDE(stride);
746
747
0
        sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
748
0
                      w, params->sgr.s1, BITDEPTH_MAX);
749
0
        rotate(A_ptrs, B_ptrs, 3);
750
751
0
        if (--h <= 0)
752
0
            goto vert_1;
753
754
0
        sumsq_ptrs[2] = sumsq_rows[1];
755
0
        sum_ptrs[2] = sum_rows[1];
756
757
0
        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
758
0
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
759
0
        left++;
760
0
        src += PXSTRIDE(stride);
761
0
        rotate(A_ptrs, B_ptrs, 3);
762
763
0
        if (--h <= 0)
764
0
            goto vert_2;
765
766
0
        sumsq_ptrs[2] = sumsq_rows[2];
767
0
        sum_ptrs[2] = sum_rows[2];
768
0
    }
769
770
0
    do {
771
0
        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
772
0
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
773
0
        left++;
774
0
        src += PXSTRIDE(stride);
775
776
0
        sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
777
0
                    w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
778
0
    } while (--h > 0);
779
780
0
    if (!(edges & LR_HAVE_BOTTOM))
781
0
        goto vert_2;
782
783
0
    sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
784
0
                NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
785
0
    lpf_bottom += PXSTRIDE(stride);
786
787
0
    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
788
0
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
789
790
0
    sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
791
0
                NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
792
793
0
    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
794
0
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
795
0
    return;
796
797
0
vert_2:
798
0
    sumsq_ptrs[2] = sumsq_ptrs[1];
799
0
    sum_ptrs[2] = sum_ptrs[1];
800
0
    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
801
0
                  w, params->sgr.s1, BITDEPTH_MAX);
802
803
0
    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
804
0
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
805
806
0
output_1:
807
0
    sumsq_ptrs[2] = sumsq_ptrs[1];
808
0
    sum_ptrs[2] = sum_ptrs[1];
809
0
    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
810
0
                  w, params->sgr.s1, BITDEPTH_MAX);
811
812
0
    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
813
0
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
814
0
    return;
815
816
0
vert_1:
817
0
    sumsq_ptrs[2] = sumsq_ptrs[1];
818
0
    sum_ptrs[2] = sum_ptrs[1];
819
0
    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
820
0
                  w, params->sgr.s1, BITDEPTH_MAX);
821
0
    rotate(A_ptrs, B_ptrs, 3);
822
0
    goto output_1;
823
0
}
824
825
// Self-guided restoration entry point that dav1d_loop_restoration_dsp_init()
// installs as c->sgr[0]. Judging by the sgr_box5_* helpers it drives, this is
// presumably the 5x5-box variant of the filter (helpers are defined elsewhere).
//
//   dst, stride  buffer being filtered; input rows are read through src
//                (an alias of dst) and &dst is handed to sgr_finish2.
//   left         advanced once per source row and passed to sgr_box5_row_h
//                together with src; NULL is passed for rows taken from lpf.
//   lpf          rows used above the unit when LR_HAVE_TOP is set; rows used
//                below the unit (LR_HAVE_BOTTOM) start 6 rows further in.
//   w, h         unit size; h is counted down as source rows are consumed.
//   params       only params->sgr.s0 and params->sgr.w0 are used here.
//   edges        LR_HAVE_TOP / LR_HAVE_BOTTOM pick the paths below; the whole
//                mask is also forwarded to sgr_box5_row_h.
static void sgr_5x5_c(pixel *dst, const ptrdiff_t stride,
826
                      const pixel (*left)[4], const pixel *lpf,
827
                      const int w, int h,
828
                      const LooprestorationParams *const params,
829
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
830
0
{
831
0
    // Scratch storage, carved into five BUF_STRIDE-sized rows below.
    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 5 + 16,);
832
0
    ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 5 + 16,);
833
0
    // *_rows are the fixed row starts; *_ptrs is the five-entry window that
    // is handed to sgr_box5_vert and re-pointed as rows are consumed.
    int32_t *sumsq_ptrs[5], *sumsq_rows[5];
834
0
    coef *sum_ptrs[5], *sum_rows[5];
835
0
    for (int i = 0; i < 5; i++) {
836
0
        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
837
0
        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
838
0
    }
839
840
0
    // Two rows of A/B values: sgr_box5_vert writes into slot [1], and the
    // pair is passed to sgr_finish2.
    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 2 + 16,);
841
0
    ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 2 + 16,);
842
0
    int32_t *A_ptrs[2];
843
0
    coef *B_ptrs[2];
844
0
    for (int i = 0; i < 2; i++) {
845
0
        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
846
0
        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
847
0
    }
848
0
    // Source rows come from the same buffer that is being filtered.
    const pixel *src = dst;
849
0
    // Rows used below the unit start six rows into lpf.
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
850
851
0
    if (edges & LR_HAVE_TOP) {
852
0
        // Window starts as {lpf row 0, lpf row 0, lpf row 1, src row 0,
        // src row 1}: the first lpf row is used twice.
        sumsq_ptrs[0] = sumsq_rows[0];
853
0
        sumsq_ptrs[1] = sumsq_rows[0];
854
0
        sumsq_ptrs[2] = sumsq_rows[1];
855
0
        sumsq_ptrs[3] = sumsq_rows[2];
856
0
        sumsq_ptrs[4] = sumsq_rows[3];
857
0
        sum_ptrs[0] = sum_rows[0];
858
0
        sum_ptrs[1] = sum_rows[0];
859
0
        sum_ptrs[2] = sum_rows[1];
860
0
        sum_ptrs[3] = sum_rows[2];
861
0
        sum_ptrs[4] = sum_rows[3];
862
863
0
        // Two rows from lpf (no left data is passed for these) ...
        sgr_box5_row_h(sumsq_rows[0], sum_rows[0], NULL, lpf, w, edges);
864
0
        lpf += PXSTRIDE(stride);
865
0
        sgr_box5_row_h(sumsq_rows[1], sum_rows[1], NULL, lpf, w, edges);
866
867
0
        // ... followed by the first source row.
        sgr_box5_row_h(sumsq_rows[2], sum_rows[2], left, src, w, edges);
868
0
        left++;
869
0
        src += PXSTRIDE(stride);
870
871
0
        // The unit has a single source row.
        if (--h <= 0)
872
0
            goto vert_1;
873
874
0
        sgr_box5_row_h(sumsq_rows[3], sum_rows[3], left, src, w, edges);
875
0
        left++;
876
0
        src += PXSTRIDE(stride);
877
0
        // First A/B row; nothing is output yet, so only the A/B slots are
        // swapped afterwards.
        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
878
0
                      w, params->sgr.s0, BITDEPTH_MAX);
879
0
        rotate(A_ptrs, B_ptrs, 2);
880
881
0
        // The unit has exactly two source rows.
        if (--h <= 0)
882
0
            goto vert_2;
883
884
        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
885
        // one of them to point at the previously unused rows[4].
886
0
        sumsq_ptrs[3] = sumsq_rows[4];
887
0
        sum_ptrs[3] = sum_rows[4];
888
0
    } else {
889
0
        // No rows above the unit: every window slot starts out pointing at
        // the first source row (rows[0]).
        sumsq_ptrs[0] = sumsq_rows[0];
890
0
        sumsq_ptrs[1] = sumsq_rows[0];
891
0
        sumsq_ptrs[2] = sumsq_rows[0];
892
0
        sumsq_ptrs[3] = sumsq_rows[0];
893
0
        sumsq_ptrs[4] = sumsq_rows[0];
894
0
        sum_ptrs[0] = sum_rows[0];
895
0
        sum_ptrs[1] = sum_rows[0];
896
0
        sum_ptrs[2] = sum_rows[0];
897
0
        sum_ptrs[3] = sum_rows[0];
898
0
        sum_ptrs[4] = sum_rows[0];
899
900
0
        sgr_box5_row_h(sumsq_rows[0], sum_rows[0], left, src, w, edges);
901
0
        left++;
902
0
        src += PXSTRIDE(stride);
903
904
0
        // The unit has a single source row.
        if (--h <= 0)
905
0
            goto vert_1;
906
907
0
        // The second source row becomes the last window entry.
        sumsq_ptrs[4] = sumsq_rows[1];
908
0
        sum_ptrs[4] = sum_rows[1];
909
910
0
        sgr_box5_row_h(sumsq_rows[1], sum_rows[1], left, src, w, edges);
911
0
        left++;
912
0
        src += PXSTRIDE(stride);
913
914
0
        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
915
0
                      w, params->sgr.s0, BITDEPTH_MAX);
916
0
        rotate(A_ptrs, B_ptrs, 2);
917
918
0
        // The unit has exactly two source rows.
        if (--h <= 0)
919
0
            goto vert_2;
920
921
0
        // Source rows three and four go into rows[2] and rows[3].
        sumsq_ptrs[3] = sumsq_rows[2];
922
0
        sumsq_ptrs[4] = sumsq_rows[3];
923
0
        sum_ptrs[3] = sum_rows[2];
924
0
        sum_ptrs[4] = sum_rows[3];
925
926
0
        sgr_box5_row_h(sumsq_rows[2], sum_rows[2], left, src, w, edges);
927
0
        left++;
928
0
        src += PXSTRIDE(stride);
929
930
0
        // The unit has exactly three source rows.
        if (--h <= 0)
931
0
            goto odd;
932
933
0
        sgr_box5_row_h(sumsq_rows[3], sum_rows[3], left, src, w, edges);
934
0
        left++;
935
0
        src += PXSTRIDE(stride);
936
937
0
        // First call that asks sgr_finish2 for two rows on this path.
        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
938
0
                      w, params->sgr.s0, BITDEPTH_MAX);
939
0
        sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
940
0
                    w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
941
942
0
        // The unit has exactly four source rows.
        if (--h <= 0)
943
0
            goto vert_2;
944
945
        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
946
        // one of them to point at the previously unused rows[4].
947
0
        sumsq_ptrs[3] = sumsq_rows[4];
948
0
        sum_ptrs[3] = sum_rows[4];
949
0
    }
950
951
0
    // Steady state: take two source rows per iteration into window slots
    // [3] and [4], then run one vertical pass and request two output rows.
    do {
952
0
        sgr_box5_row_h(sumsq_ptrs[3], sum_ptrs[3], left, src, w, edges);
953
0
        left++;
954
0
        src += PXSTRIDE(stride);
955
956
0
        // Source ran out after the first row of the pair.
        if (--h <= 0)
957
0
            goto odd;
958
959
0
        sgr_box5_row_h(sumsq_ptrs[4], sum_ptrs[4], left, src, w, edges);
960
0
        left++;
961
0
        src += PXSTRIDE(stride);
962
963
0
        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
964
0
                      w, params->sgr.s0, BITDEPTH_MAX);
965
0
        sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
966
0
                    w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
967
0
    } while (--h > 0);
968
969
0
    // All source rows consumed; without rows below, pad by duplication.
    if (!(edges & LR_HAVE_BOTTOM))
970
0
        goto vert_2;
971
972
0
    // Otherwise the last two window rows come from lpf_bottom.
    sgr_box5_row_h(sumsq_ptrs[3], sum_ptrs[3], NULL, lpf_bottom, w, edges);
973
0
    lpf_bottom += PXSTRIDE(stride);
974
0
    sgr_box5_row_h(sumsq_ptrs[4], sum_ptrs[4], NULL, lpf_bottom, w, edges);
975
976
0
// Final vertical pass followed by a two-row sgr_finish2 call.
output_2:
977
0
    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
978
0
                  w, params->sgr.s0, BITDEPTH_MAX);
979
0
    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
980
0
                w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
981
0
    return;
982
983
0
vert_2:
984
    // Duplicate the last row twice more
985
0
    sumsq_ptrs[3] = sumsq_ptrs[2];
986
0
    sumsq_ptrs[4] = sumsq_ptrs[2];
987
0
    sum_ptrs[3] = sum_ptrs[2];
988
0
    sum_ptrs[4] = sum_ptrs[2];
989
0
    goto output_2;
990
991
0
odd:
992
    // Copy the last row as padding once
993
0
    sumsq_ptrs[4] = sumsq_ptrs[3];
994
0
    sum_ptrs[4] = sum_ptrs[3];
995
996
0
    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
997
0
                  w, params->sgr.s0, BITDEPTH_MAX);
998
0
    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
999
0
                w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
1000
1001
0
// Reached by falling through from odd, or by the jump at the end of vert_1.
output_1:
1002
    // Duplicate the last row twice more
1003
0
    sumsq_ptrs[3] = sumsq_ptrs[2];
1004
0
    sumsq_ptrs[4] = sumsq_ptrs[2];
1005
0
    sum_ptrs[3] = sum_ptrs[2];
1006
0
    sum_ptrs[4] = sum_ptrs[2];
1007
1008
0
    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
1009
0
                  w, params->sgr.s0, BITDEPTH_MAX);
1010
    // Output only one row
1011
0
    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
1012
0
                w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
1013
0
    return;
1014
1015
0
// Entered when the unit has only one source row.
vert_1:
1016
    // Copy the last row as padding once
1017
0
    sumsq_ptrs[4] = sumsq_ptrs[3];
1018
0
    sum_ptrs[4] = sum_ptrs[3];
1019
1020
0
    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
1021
0
                  w, params->sgr.s0, BITDEPTH_MAX);
1022
0
    rotate(A_ptrs, B_ptrs, 2);
1023
1024
0
    goto output_1;
1025
0
}
1026
1027
// Self-guided restoration entry point that dav1d_loop_restoration_dsp_init()
// installs as c->sgr[2]. It runs two pipelines side by side over the same
// input rows and hands both sets of A/B rows to sgr_finish_mix:
//   - the "5" pipeline (sgr_box5_vert, params->sgr.s0, weight sgr.w0), which
//     does one vertical pass per two input rows;
//   - the "3" pipeline (sgr_box3_vert, params->sgr.s1, weight sgr.w1), which
//     does one vertical pass per input row.
// Each input row is processed once by sgr_box35_row_h, which receives the
// destination rows of both pipelines (helpers are defined elsewhere).
//
//   dst, stride  buffer being filtered; input rows are read through src
//                (an alias of dst) and &dst is handed to sgr_finish_mix.
//   left         advanced once per source row; NULL is passed for lpf rows.
//   lpf          rows used above the unit when LR_HAVE_TOP is set; rows used
//                below the unit (LR_HAVE_BOTTOM) start 6 rows further in.
//   w, h         unit size; h is counted down as source rows are consumed.
//   edges        LR_HAVE_TOP / LR_HAVE_BOTTOM pick the paths below; the whole
//                mask is also forwarded to sgr_box35_row_h.
static void sgr_mix_c(pixel *dst, const ptrdiff_t stride,
1028
                      const pixel (*left)[4], const pixel *lpf,
1029
                      const int w, int h,
1030
                      const LooprestorationParams *const params,
1031
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
1032
0
{
1033
0
    // Scratch for the "5" pipeline: five BUF_STRIDE-sized rows.
    ALIGN_STK_16(int32_t, sumsq5_buf, BUF_STRIDE * 5 + 16,);
1034
0
    ALIGN_STK_16(coef, sum5_buf, BUF_STRIDE * 5 + 16,);
1035
0
    // *_rows are the fixed row starts; *_ptrs is the window handed to the
    // vertical pass and re-pointed as rows are consumed.
    int32_t *sumsq5_ptrs[5], *sumsq5_rows[5];
1036
0
    coef *sum5_ptrs[5], *sum5_rows[5];
1037
0
    for (int i = 0; i < 5; i++) {
1038
0
        sumsq5_rows[i] = &sumsq5_buf[i * BUF_STRIDE];
1039
0
        sum5_rows[i] = &sum5_buf[i * BUF_STRIDE];
1040
0
    }
1041
0
    // Scratch for the "3" pipeline: three BUF_STRIDE-sized rows.
    ALIGN_STK_16(int32_t, sumsq3_buf, BUF_STRIDE * 3 + 16,);
1042
0
    ALIGN_STK_16(coef, sum3_buf, BUF_STRIDE * 3 + 16,);
1043
0
    int32_t *sumsq3_ptrs[3], *sumsq3_rows[3];
1044
0
    coef *sum3_ptrs[3], *sum3_rows[3];
1045
0
    for (int i = 0; i < 3; i++) {
1046
0
        sumsq3_rows[i] = &sumsq3_buf[i * BUF_STRIDE];
1047
0
        sum3_rows[i] = &sum3_buf[i * BUF_STRIDE];
1048
0
    }
1049
1050
0
    // A/B rows of the "5" pipeline: two slots, sgr_box5_vert writes slot [1].
    ALIGN_STK_16(int32_t, A5_buf, BUF_STRIDE * 2 + 16,);
1051
0
    ALIGN_STK_16(coef, B5_buf, BUF_STRIDE * 2 + 16,);
1052
0
    int32_t *A5_ptrs[2];
1053
0
    coef *B5_ptrs[2];
1054
0
    for (int i = 0; i < 2; i++) {
1055
0
        A5_ptrs[i] = &A5_buf[i * BUF_STRIDE];
1056
0
        B5_ptrs[i] = &B5_buf[i * BUF_STRIDE];
1057
0
    }
1058
0
    // A/B rows of the "3" pipeline: four slots, sgr_box3_vert writes slot [3].
    ALIGN_STK_16(int32_t, A3_buf, BUF_STRIDE * 4 + 16,);
1059
0
    ALIGN_STK_16(coef, B3_buf, BUF_STRIDE * 4 + 16,);
1060
0
    int32_t *A3_ptrs[4];
1061
0
    coef *B3_ptrs[4];
1062
0
    for (int i = 0; i < 4; i++) {
1063
0
        A3_ptrs[i] = &A3_buf[i * BUF_STRIDE];
1064
0
        B3_ptrs[i] = &B3_buf[i * BUF_STRIDE];
1065
0
    }
1066
0
    // Source rows come from the same buffer that is being filtered.
    const pixel *src = dst;
1067
0
    // Rows used below the unit start six rows into lpf.
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
1068
1069
0
    if (edges & LR_HAVE_TOP) {
1070
0
        // "5" window starts as {lpf row 0, lpf row 0, lpf row 1, src row 0,
        // src row 1}: the first lpf row is used twice.
        sumsq5_ptrs[0] = sumsq5_rows[0];
1071
0
        sumsq5_ptrs[1] = sumsq5_rows[0];
1072
0
        sumsq5_ptrs[2] = sumsq5_rows[1];
1073
0
        sumsq5_ptrs[3] = sumsq5_rows[2];
1074
0
        sumsq5_ptrs[4] = sumsq5_rows[3];
1075
0
        sum5_ptrs[0] = sum5_rows[0];
1076
0
        sum5_ptrs[1] = sum5_rows[0];
1077
0
        sum5_ptrs[2] = sum5_rows[1];
1078
0
        sum5_ptrs[3] = sum5_rows[2];
1079
0
        sum5_ptrs[4] = sum5_rows[3];
1080
1081
0
        // "3" window starts as {lpf row 0, lpf row 1, src row 0}.
        sumsq3_ptrs[0] = sumsq3_rows[0];
1082
0
        sumsq3_ptrs[1] = sumsq3_rows[1];
1083
0
        sumsq3_ptrs[2] = sumsq3_rows[2];
1084
0
        sum3_ptrs[0] = sum3_rows[0];
1085
0
        sum3_ptrs[1] = sum3_rows[1];
1086
0
        sum3_ptrs[2] = sum3_rows[2];
1087
1088
0
        // Two rows from lpf (no left data is passed for these) ...
        sgr_box35_row_h(sumsq3_rows[0], sum3_rows[0],
1089
0
                        sumsq5_rows[0], sum5_rows[0],
1090
0
                        NULL, lpf, w, edges);
1091
0
        lpf += PXSTRIDE(stride);
1092
0
        sgr_box35_row_h(sumsq3_rows[1], sum3_rows[1],
1093
0
                        sumsq5_rows[1], sum5_rows[1],
1094
0
                        NULL, lpf, w, edges);
1095
1096
0
        // ... followed by the first source row.
        sgr_box35_row_h(sumsq3_rows[2], sum3_rows[2],
1097
0
                        sumsq5_rows[2], sum5_rows[2],
1098
0
                        left, src, w, edges);
1099
0
        left++;
1100
0
        src += PXSTRIDE(stride);
1101
1102
0
        // The "3" pipeline does a vertical pass after every input row.
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1103
0
                      w, params->sgr.s1, BITDEPTH_MAX);
1104
0
        rotate(A3_ptrs, B3_ptrs, 4);
1105
1106
0
        // The unit has a single source row.
        if (--h <= 0)
1107
0
            goto vert_1;
1108
1109
0
        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1110
0
                        sumsq5_rows[3], sum5_rows[3],
1111
0
                        left, src, w, edges);
1112
0
        left++;
1113
0
        src += PXSTRIDE(stride);
1114
0
        // First A/B row of the "5" pipeline; nothing is output yet.
        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1115
0
                      w, params->sgr.s0, BITDEPTH_MAX);
1116
0
        rotate(A5_ptrs, B5_ptrs, 2);
1117
0
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1118
0
                      w, params->sgr.s1, BITDEPTH_MAX);
1119
0
        rotate(A3_ptrs, B3_ptrs, 4);
1120
1121
0
        // The unit has exactly two source rows.
        if (--h <= 0)
1122
0
            goto vert_2;
1123
1124
        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
1125
        // one of them to point at the previously unused rows[4].
1126
0
        sumsq5_ptrs[3] = sumsq5_rows[4];
1127
0
        sum5_ptrs[3] = sum5_rows[4];
1128
0
    } else {
1129
0
        // No rows above the unit: every slot of both windows starts out
        // pointing at the first source row (rows[0]).
        sumsq5_ptrs[0] = sumsq5_rows[0];
1130
0
        sumsq5_ptrs[1] = sumsq5_rows[0];
1131
0
        sumsq5_ptrs[2] = sumsq5_rows[0];
1132
0
        sumsq5_ptrs[3] = sumsq5_rows[0];
1133
0
        sumsq5_ptrs[4] = sumsq5_rows[0];
1134
0
        sum5_ptrs[0] = sum5_rows[0];
1135
0
        sum5_ptrs[1] = sum5_rows[0];
1136
0
        sum5_ptrs[2] = sum5_rows[0];
1137
0
        sum5_ptrs[3] = sum5_rows[0];
1138
0
        sum5_ptrs[4] = sum5_rows[0];
1139
1140
0
        sumsq3_ptrs[0] = sumsq3_rows[0];
1141
0
        sumsq3_ptrs[1] = sumsq3_rows[0];
1142
0
        sumsq3_ptrs[2] = sumsq3_rows[0];
1143
0
        sum3_ptrs[0] = sum3_rows[0];
1144
0
        sum3_ptrs[1] = sum3_rows[0];
1145
0
        sum3_ptrs[2] = sum3_rows[0];
1146
1147
0
        sgr_box35_row_h(sumsq3_rows[0], sum3_rows[0],
1148
0
                        sumsq5_rows[0], sum5_rows[0],
1149
0
                        left, src, w, edges);
1150
0
        left++;
1151
0
        src += PXSTRIDE(stride);
1152
1153
0
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1154
0
                      w, params->sgr.s1, BITDEPTH_MAX);
1155
0
        rotate(A3_ptrs, B3_ptrs, 4);
1156
1157
0
        // The unit has a single source row.
        if (--h <= 0)
1158
0
            goto vert_1;
1159
1160
0
        // The second source row goes into rows[1] of both pipelines.
        sumsq5_ptrs[4] = sumsq5_rows[1];
1161
0
        sum5_ptrs[4] = sum5_rows[1];
1162
1163
0
        sumsq3_ptrs[2] = sumsq3_rows[1];
1164
0
        sum3_ptrs[2] = sum3_rows[1];
1165
1166
0
        sgr_box35_row_h(sumsq3_rows[1], sum3_rows[1],
1167
0
                        sumsq5_rows[1], sum5_rows[1],
1168
0
                        left, src, w, edges);
1169
0
        left++;
1170
0
        src += PXSTRIDE(stride);
1171
1172
0
        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1173
0
                      w, params->sgr.s0, BITDEPTH_MAX);
1174
0
        rotate(A5_ptrs, B5_ptrs, 2);
1175
0
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1176
0
                      w, params->sgr.s1, BITDEPTH_MAX);
1177
0
        rotate(A3_ptrs, B3_ptrs, 4);
1178
1179
0
        // The unit has exactly two source rows.
        if (--h <= 0)
1180
0
            goto vert_2;
1181
1182
0
        // Source rows three and four go into rows[2] and rows[3] of the "5"
        // pipeline; the "3" pipeline takes row three into its rows[2].
        sumsq5_ptrs[3] = sumsq5_rows[2];
1183
0
        sumsq5_ptrs[4] = sumsq5_rows[3];
1184
0
        sum5_ptrs[3] = sum5_rows[2];
1185
0
        sum5_ptrs[4] = sum5_rows[3];
1186
1187
0
        sumsq3_ptrs[2] = sumsq3_rows[2];
1188
0
        sum3_ptrs[2] = sum3_rows[2];
1189
1190
0
        sgr_box35_row_h(sumsq3_rows[2], sum3_rows[2],
1191
0
                        sumsq5_rows[2], sum5_rows[2],
1192
0
                        left, src, w, edges);
1193
0
        left++;
1194
0
        src += PXSTRIDE(stride);
1195
1196
0
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1197
0
                      w, params->sgr.s1, BITDEPTH_MAX);
1198
0
        rotate(A3_ptrs, B3_ptrs, 4);
1199
1200
0
        // The unit has exactly three source rows.
        if (--h <= 0)
1201
0
            goto odd;
1202
1203
0
        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1204
0
                        sumsq5_rows[3], sum5_rows[3],
1205
0
                        left, src, w, edges);
1206
0
        left++;
1207
0
        src += PXSTRIDE(stride);
1208
1209
0
        // First call that asks sgr_finish_mix for two rows on this path.
        // Note there is no explicit rotate of A3/B3 between sgr_box3_vert
        // and sgr_finish_mix here.
        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1210
0
                      w, params->sgr.s0, BITDEPTH_MAX);
1211
0
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1212
0
                      w, params->sgr.s1, BITDEPTH_MAX);
1213
0
        sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1214
0
                       w, 2, params->sgr.w0, params->sgr.w1
1215
0
                       HIGHBD_TAIL_SUFFIX);
1216
1217
0
        // The unit has exactly four source rows.
        if (--h <= 0)
1218
0
            goto vert_2;
1219
1220
        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
1221
        // one of them to point at the previously unused rows[4].
1222
0
        sumsq5_ptrs[3] = sumsq5_rows[4];
1223
0
        sum5_ptrs[3] = sum5_rows[4];
1224
0
    }
1225
1226
0
    // Steady state: two source rows per iteration. The "3" pipeline runs a
    // vertical pass after each row, the "5" pipeline after the second one,
    // then sgr_finish_mix is asked for two output rows.
    do {
1227
0
        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1228
0
                        sumsq5_ptrs[3], sum5_ptrs[3],
1229
0
                        left, src, w, edges);
1230
0
        left++;
1231
0
        src += PXSTRIDE(stride);
1232
1233
0
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1234
0
                      w, params->sgr.s1, BITDEPTH_MAX);
1235
0
        rotate(A3_ptrs, B3_ptrs, 4);
1236
1237
0
        // Source ran out after the first row of the pair.
        if (--h <= 0)
1238
0
            goto odd;
1239
1240
0
        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1241
0
                        sumsq5_ptrs[4], sum5_ptrs[4],
1242
0
                        left, src, w, edges);
1243
0
        left++;
1244
0
        src += PXSTRIDE(stride);
1245
1246
0
        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1247
0
                      w, params->sgr.s0, BITDEPTH_MAX);
1248
0
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1249
0
                      w, params->sgr.s1, BITDEPTH_MAX);
1250
0
        sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1251
0
                       w, 2, params->sgr.w0, params->sgr.w1
1252
0
                       HIGHBD_TAIL_SUFFIX);
1253
0
    } while (--h > 0);
1254
1255
0
    // All source rows consumed; without rows below, pad by duplication.
    if (!(edges & LR_HAVE_BOTTOM))
1256
0
        goto vert_2;
1257
1258
0
    // Otherwise the last two input rows come from lpf_bottom.
    sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1259
0
                    sumsq5_ptrs[3], sum5_ptrs[3],
1260
0
                    NULL, lpf_bottom, w, edges);
1261
0
    lpf_bottom += PXSTRIDE(stride);
1262
0
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1263
0
                  w, params->sgr.s1, BITDEPTH_MAX);
1264
0
    rotate(A3_ptrs, B3_ptrs, 4);
1265
1266
0
    sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
1267
0
                    sumsq5_ptrs[4], sum5_ptrs[4],
1268
0
                    NULL, lpf_bottom, w, edges);
1269
1270
0
// Final vertical passes of both pipelines, then a two-row sgr_finish_mix call.
output_2:
1271
0
    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1272
0
                  w, params->sgr.s0, BITDEPTH_MAX);
1273
0
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1274
0
                  w, params->sgr.s1, BITDEPTH_MAX);
1275
0
    sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1276
0
                   w, 2, params->sgr.w0, params->sgr.w1
1277
0
                   HIGHBD_TAIL_SUFFIX);
1278
0
    return;
1279
1280
0
vert_2:
1281
    // Duplicate the last row twice more
1282
0
    sumsq5_ptrs[3] = sumsq5_ptrs[2];
1283
0
    sumsq5_ptrs[4] = sumsq5_ptrs[2];
1284
0
    sum5_ptrs[3] = sum5_ptrs[2];
1285
0
    sum5_ptrs[4] = sum5_ptrs[2];
1286
1287
0
    // Same padding for the "3" window: reuse the previous row, run the
    // vertical pass that the missing row would have triggered ...
    sumsq3_ptrs[2] = sumsq3_ptrs[1];
1288
0
    sum3_ptrs[2] = sum3_ptrs[1];
1289
0
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1290
0
                  w, params->sgr.s1, BITDEPTH_MAX);
1291
0
    rotate(A3_ptrs, B3_ptrs, 4);
1292
1293
0
    // ... and pad once more for the pass performed at output_2.
    sumsq3_ptrs[2] = sumsq3_ptrs[1];
1294
0
    sum3_ptrs[2] = sum3_ptrs[1];
1295
1296
0
    goto output_2;
1297
1298
0
odd:
1299
    // Copy the last row as padding once
1300
0
    sumsq5_ptrs[4] = sumsq5_ptrs[3];
1301
0
    sum5_ptrs[4] = sum5_ptrs[3];
1302
1303
0
    sumsq3_ptrs[2] = sumsq3_ptrs[1];
1304
0
    sum3_ptrs[2] = sum3_ptrs[1];
1305
1306
0
    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1307
0
                  w, params->sgr.s0, BITDEPTH_MAX);
1308
0
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1309
0
                  w, params->sgr.s1, BITDEPTH_MAX);
1310
0
    sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1311
0
                   w, 2, params->sgr.w0, params->sgr.w1
1312
0
                   HIGHBD_TAIL_SUFFIX);
1313
1314
0
// Reached by falling through from odd, or by the jump at the end of vert_1.
output_1:
1315
    // Duplicate the last row twice more
1316
0
    sumsq5_ptrs[3] = sumsq5_ptrs[2];
1317
0
    sumsq5_ptrs[4] = sumsq5_ptrs[2];
1318
0
    sum5_ptrs[3] = sum5_ptrs[2];
1319
0
    sum5_ptrs[4] = sum5_ptrs[2];
1320
1321
0
    sumsq3_ptrs[2] = sumsq3_ptrs[1];
1322
0
    sum3_ptrs[2] = sum3_ptrs[1];
1323
1324
0
    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1325
0
                  w, params->sgr.s0, BITDEPTH_MAX);
1326
0
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1327
0
                  w, params->sgr.s1, BITDEPTH_MAX);
1328
0
    rotate(A3_ptrs, B3_ptrs, 4);
1329
    // Output only one row
1330
0
    sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
1331
0
                   w, 1, params->sgr.w0, params->sgr.w1
1332
0
                   HIGHBD_TAIL_SUFFIX);
1333
0
    return;
1334
1335
0
// Entered when the unit has only one source row.
vert_1:
1336
    // Copy the last row as padding once
1337
0
    sumsq5_ptrs[4] = sumsq5_ptrs[3];
1338
0
    sum5_ptrs[4] = sum5_ptrs[3];
1339
1340
0
    sumsq3_ptrs[2] = sumsq3_ptrs[1];
1341
0
    sum3_ptrs[2] = sum3_ptrs[1];
1342
1343
0
    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
1344
0
                  w, params->sgr.s0, BITDEPTH_MAX);
1345
0
    rotate(A5_ptrs, B5_ptrs, 2);
1346
0
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
1347
0
                  w, params->sgr.s1, BITDEPTH_MAX);
1348
0
    rotate(A3_ptrs, B3_ptrs, 4);
1349
1350
0
    goto output_1;
1351
0
}
1352
1353
#if HAVE_ASM
1354
#if ARCH_AARCH64 || ARCH_ARM
1355
#include "src/arm/looprestoration.h"
1356
#elif ARCH_LOONGARCH64
1357
#include "src/loongarch/looprestoration.h"
1358
#elif ARCH_PPC64LE
1359
#include "src/ppc/looprestoration.h"
1360
#elif ARCH_X86
1361
#include "src/x86/looprestoration.h"
1362
#endif
1363
#endif
1364
1365
// Fills the loop-restoration DSP function table with the C implementations
// from this file, then, when built with HAVE_ASM, calls the init routine of
// the target architecture (which presumably replaces entries with optimized
// versions; those routines live in the per-arch headers included above).
//
// bitfn() expands the name per bit depth: this report lists the
// dav1d_loop_restoration_dsp_init_8bpc and _16bpc instantiations.
//
//   c    table to fill in.
//   bpc  not used by the C setup itself; only forwarded to the arch init.
COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c,
1366
                                                 const int bpc)
1367
4
{
1368
4
    // Both wiener slots share the single C implementation.
    c->wiener[0] = c->wiener[1] = wiener_c;
1369
4
    // sgr slots: [0] = 5x5, [1] = 3x3, [2] = mix.
    c->sgr[0] = sgr_5x5_c;
1370
4
    c->sgr[1] = sgr_3x3_c;
1371
4
    c->sgr[2] = sgr_mix_c;
1372
1373
#if HAVE_ASM
1374
#if ARCH_AARCH64 || ARCH_ARM
1375
    loop_restoration_dsp_init_arm(c, bpc);
1376
#elif ARCH_LOONGARCH64
1377
    loop_restoration_dsp_init_loongarch(c, bpc);
1378
#elif ARCH_PPC64LE
1379
    loop_restoration_dsp_init_ppc(c, bpc);
1380
#elif ARCH_X86
1381
    loop_restoration_dsp_init_x86(c, bpc);
1382
#endif
1383
#endif
1384
4
}
dav1d_loop_restoration_dsp_init_8bpc
Line
Count
Source
1367
4
{
1368
4
    c->wiener[0] = c->wiener[1] = wiener_c;
1369
4
    c->sgr[0] = sgr_5x5_c;
1370
4
    c->sgr[1] = sgr_3x3_c;
1371
4
    c->sgr[2] = sgr_mix_c;
1372
1373
#if HAVE_ASM
1374
#if ARCH_AARCH64 || ARCH_ARM
1375
    loop_restoration_dsp_init_arm(c, bpc);
1376
#elif ARCH_LOONGARCH64
1377
    loop_restoration_dsp_init_loongarch(c, bpc);
1378
#elif ARCH_PPC64LE
1379
    loop_restoration_dsp_init_ppc(c, bpc);
1380
#elif ARCH_X86
1381
    loop_restoration_dsp_init_x86(c, bpc);
1382
#endif
1383
#endif
1384
4
}
Unexecuted instantiation: dav1d_loop_restoration_dsp_init_16bpc