Coverage Report

Created: 2026-05-16 06:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/svt-av1/Source/Lib/Codec/restoration.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10
 *
11
 */
12
#include "av1_common.h"
13
#include "common_dsp_rtcd.h"
14
#include "restoration.h"
15
#include "pic_buffer_desc.h"
16
#include "utility.h"
17
#include "svt_log.h"
18
#include "intra_prediction.h"
19
#include "pcs.h"
20
#include "super_res.h"
21
#include "pic_operators.h"
22
#include "convolve.h"
23
24
// The 's' values are calculated based on original 'r' and 'e' values in the
25
// spec using GenSgrprojVtable().
26
// Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
27
//n = (2 * r + 1) * (2 * r + 1);
28
//n2e = n * n * ep;
29
//s = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
30
31
// Table of self-guided filter parameter sets, one row per set.
// Each row is written as {{r0, r1}, {s0, s1}}: the first inner pair holds the
// two radii (read elsewhere in this file as params->r[0] / params->r[1]), the
// second inner pair holds the matching precomputed 's' values. Where a radius
// is 0 the matching 's' entry is -1, i.e. that filter pass is skipped.
const SgrParamsType svt_aom_eb_sgr_params[SGRPROJ_PARAMS] = {
32
    // Trailing comment on each row: set index, then the original
    // { r0, e0, r1, e1 } values that the 's' entries were derived from.
33
    {{2, 1}, {140, 3236}}, // 0  { 2, 12, 1, 4  }
34
    {{2, 1}, {112, 2158}}, // 1  { 2, 15, 1, 6  }
35
    {{2, 1}, {93, 1618}}, // 2  { 2, 18, 1, 8  }
36
    {{2, 1}, {80, 1438}}, // 3  { 2, 21, 1, 9  }
37
    {{2, 1}, {70, 1295}}, // 4  { 2, 24, 1, 10 }
38
    {{2, 1}, {58, 1177}}, // 5  { 2, 29, 1, 11 }
39
    {{2, 1}, {47, 1079}}, // 6  { 2, 36, 1, 12 }
40
    {{2, 1}, {37, 996}}, // 7  { 2, 45, 1, 13 }
41
    {{2, 1}, {30, 925}}, // 8  { 2, 56, 1, 14 }
42
    {{2, 1}, {25, 863}}, // 9  { 2, 68, 1, 15 }
43
    {{0, 1}, {-1, 2589}}, // 10 { 0, 0,  1, 5  }
44
    {{0, 1}, {-1, 1618}}, // 11 { 0, 0,  1, 8  }
45
    {{0, 1}, {-1, 1177}}, // 12 { 0, 0,  1, 11 }
46
    {{0, 1}, {-1, 925}}, // 13 { 0, 0,  1, 14 }
47
    {{2, 0}, {56, -1}}, // 14 { 2, 30, 0, 0  }
48
    {{2, 0}, {22, -1}}, // 15 { 2, 75, 0, 0  }
49
};
50
51
0
// Builds the pixel rectangle that spans the entire frame for one plane.
// The rectangle starts at (0, 0); its right edge comes from the super-res
// upscaled width and its bottom edge from the frame height, each passed
// through ROUND_POWER_OF_TWO with the plane's subsampling shift.
//   frm_size     : frame dimensions (only read)
//   sub_x, sub_y : chroma subsampling flags
//   is_uv        : non-zero for a chroma plane; luma never applies a shift
static Av1PixelRect whole_frame_rect(FrameSize* frm_size, int32_t sub_x, int32_t sub_y, int32_t is_uv) {
    // The shift is 1 only when this is a chroma plane AND that axis is subsampled.
    const int32_t shift_x = is_uv && sub_x;
    const int32_t shift_y = is_uv && sub_y;

    Av1PixelRect frame;
    frame.left   = 0;
    frame.top    = 0;
    frame.right  = ROUND_POWER_OF_TWO(frm_size->superres_upscaled_width, shift_x);
    frame.bottom = ROUND_POWER_OF_TWO(frm_size->frame_height, shift_y);
    return frame;
}
63
64
// Count horizontal or vertical units per tile (use a width or height for
65
// tile_size, respectively). We basically want to divide the tile size by the
66
// size of a restoration unit. Rather than rounding up unconditionally as you
67
// might expect, we round to nearest, which models the way a right or bottom
68
// restoration unit can extend to up to 150% its normal width or height. The
69
// max with 1 is to deal with tiles that are smaller than half of a restoration
70
// unit.
71
0
// Number of restoration units needed along one dimension of a tile.
//   unit_size : nominal size of one restoration unit
//   tile_size : tile width or height
// Returns tile_size / unit_size rounded to nearest, but never less than 1.
static int32_t count_units_in_tile(int32_t unit_size, int32_t tile_size) {
    // Adding half a unit before the integer division rounds to nearest.
    const int32_t rounded_units = (tile_size + (unit_size >> 1)) / unit_size;

    // A tile smaller than half a unit would otherwise yield zero units.
    return rounded_units > 1 ? rounded_units : 1;
}
74
75
0
// Sizes and allocates the per-unit restoration info array for one plane.
// The whole frame (see whole_frame_rect()) is treated as a single tile: the
// horizontal and vertical unit counts are derived from the frame rectangle
// and rsi->restoration_unit_size, stored back into rsi, and rsi->unit_info is
// allocated with one entry per unit.
//   cm    : supplies the frame size and chroma subsampling
//   rsi   : restoration info to fill in
//   is_uv : non-zero for a chroma plane
// Returns EB_ErrorNone once the allocation statement has completed.
// NOTE(review): EB_MALLOC_ARRAY is defined outside this file; presumably it
// returns an error from this function on allocation failure - confirm.
EbErrorType svt_av1_alloc_restoration_struct(struct Av1Common* cm, RestorationInfo* rsi, int32_t is_uv) {
    const Av1PixelRect frame_rect = whole_frame_rect(&cm->frm_size, cm->subsampling_x, cm->subsampling_y, is_uv);
    const int32_t      frame_w    = frame_rect.right - frame_rect.left;
    const int32_t      frame_h    = frame_rect.bottom - frame_rect.top;

    // Unit counts are rounded to nearest (minimum 1), so a trailing strip
    // narrower than half a unit is absorbed by its neighbouring unit.
    const int32_t ru_size      = rsi->restoration_unit_size;
    const int32_t units_across = count_units_in_tile(ru_size, frame_w);
    const int32_t units_down   = count_units_in_tile(ru_size, frame_h);

    rsi->horz_units_per_tile = units_across;
    rsi->vert_units_per_tile = units_down;
    rsi->units_per_tile      = units_across * units_down;

    // There is exactly one tile, so the total equals the per-tile count.
    const int32_t tile_count = 1;
    const int32_t unit_count = tile_count * rsi->units_per_tile;

    EB_MALLOC_ARRAY(rsi->unit_info, unit_count);

    return EB_ErrorNone;
}
109
110
// Pads an 8-bit plane by edge replication.
//   data        : pointer to the top-left pixel of the width x height area
//   stride      : distance between rows, in pixels
//   border_horz : number of columns to fill on the left and on the right
//   border_vert : number of rows to fill above and below
// The caller must provide writable storage for the border around `data`.
static void extend_frame_lowbd(uint8_t* data, int32_t width, int32_t height, int32_t stride, int32_t border_horz,
                               int32_t border_vert) {
    // Horizontal padding: repeat the first / last pixel of every row.
    for (int32_t row = 0; row < height; ++row) {
        uint8_t* const line = data + row * stride;
        memset(line - border_horz, line[0], border_horz);
        memset(line + width, line[width - 1], border_horz);
    }

    // Vertical padding works on full padded rows, so address rows from the
    // left edge of the horizontal border.
    uint8_t* const padded       = data - border_horz;
    const int32_t  padded_width = width + 2 * border_horz;

    // Rows above the plane are copies of the (already padded) first row.
    for (int32_t row = -border_vert; row < 0; ++row) {
        svt_memcpy(padded + row * stride, padded, padded_width);
    }

    // Rows below the plane are copies of the (already padded) last row.
    for (int32_t row = height; row < height + border_vert; ++row) {
        svt_memcpy(padded + row * stride, padded + (height - 1) * stride, padded_width);
    }
}
127
128
// 16-bit counterpart of extend_frame_lowbd(): pads a plane of uint16_t
// samples by edge replication.
//   data        : pointer to the top-left sample of the width x height area
//   stride      : distance between rows, in samples
//   border_horz : number of columns to fill on the left and on the right
//   border_vert : number of rows to fill above and below
static void extend_frame_highbd(uint16_t* data, int32_t width, int32_t height, int32_t stride, int32_t border_horz,
                                int32_t border_vert) {
    // Horizontal padding: repeat the first / last sample of every row.
    for (int32_t row = 0; row < height; ++row) {
        uint16_t* const line = data + row * stride;
        for (int32_t k = 1; k <= border_horz; ++k) {
            line[-k] = line[0];
        }
        for (int32_t k = 0; k < border_horz; ++k) {
            line[width + k] = line[width - 1];
        }
    }

    // Vertical padding copies whole padded rows, starting at the left border.
    uint16_t* const padded         = data - border_horz;
    const int32_t   padded_samples = width + 2 * border_horz;

    // Rows above the plane replicate the first row.
    for (int32_t row = -border_vert; row < 0; ++row) {
        svt_memcpy(padded + row * stride, padded, padded_samples * sizeof(uint16_t));
    }

    // Rows below the plane replicate the last row.
    for (int32_t row = height; row < height + border_vert; ++row) {
        svt_memcpy(padded + row * stride, padded + (height - 1) * stride, padded_samples * sizeof(uint16_t));
    }
}
149
150
// Pads a plane by edge replication, dispatching on sample depth.
// When `highbd` is non-zero, `data` is converted with CONVERT_TO_SHORTPTR and
// handled by extend_frame_highbd(); otherwise it is handled as plain 8-bit
// data by extend_frame_lowbd(). All other arguments are forwarded unchanged.
void svt_extend_frame(uint8_t* data, int32_t width, int32_t height, int32_t stride, int32_t border_horz,
                      int32_t border_vert, int32_t highbd) {
    if (!highbd) {
        extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
        return;
    }

    extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride, border_horz, border_vert);
}
158
159
// Copies a width x height block of 8-bit pixels from src to dst, one row at
// a time; each buffer has its own stride.
static void copy_tile_lowbd(int32_t width, int32_t height, const uint8_t* src, int32_t src_stride, uint8_t* dst,
                            int32_t dst_stride) {
    for (int32_t row = 0; row < height; ++row) {
        const uint8_t* const from = src + row * src_stride;
        uint8_t* const       to   = dst + row * dst_stride;
        svt_memcpy(to, from, width);
    }
}
165
166
// Copies a width x height block of 16-bit samples from src to dst, one row
// at a time; strides are expressed in samples.
static void copy_tile_highbd(int32_t width, int32_t height, const uint16_t* src, int32_t src_stride, uint16_t* dst,
                             int32_t dst_stride) {
    for (int32_t row = 0; row < height; ++row) {
        const uint16_t* const from = src + row * src_stride;
        uint16_t* const       to   = dst + row * dst_stride;
        // The byte count is the sample count times the sample size.
        svt_memcpy(to, from, width * sizeof(*dst));
    }
}
172
173
// Copies a width x height block from src to dst, dispatching on sample depth.
// When `highbd` is non-zero both pointers are converted with
// CONVERT_TO_SHORTPTR and copied as 16-bit samples; otherwise they are copied
// as 8-bit pixels.
static void copy_tile(int32_t width, int32_t height, const uint8_t* src, int32_t src_stride, uint8_t* dst,
                      int32_t dst_stride, int32_t highbd) {
    if (!highbd) {
        copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
        return;
    }

    copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride, CONVERT_TO_SHORTPTR(dst), dst_stride);
}
181
182
// With striped loop restoration, the filtering for each 64-pixel stripe gets
183
// most of its input from the output of CDEF (stored in data8), but we need to
184
// fill out a border of 3 pixels above/below the stripe according to the
185
// following
186
// rules:
187
//
188
// * At a frame boundary, we copy the outermost row of CDEF pixels three times.
189
//   This extension is done by a call to svt_extend_frame() at the start of the loop
190
//   restoration process, so the value of copy_above/copy_below doesn't strictly
191
//   matter.
192
//   However, by setting *copy_above = *copy_below = 1 whenever loop filtering
193
//   across tiles is disabled, we can allow
194
//   {setup,restore}_processing_stripe_boundary to assume that the top/bottom
195
//   data has always been copied, simplifying the behaviour at the left and
196
//   right edges of tiles.
197
//
198
// * If we're at a tile boundary and loop filtering across tiles is enabled,
199
//   then there is a logical stripe which is 64 pixels high, but which is split
200
//   into an 8px high and a 56px high stripe so that the processing (and
201
//   coefficient set usage) can be aligned to tiles.
202
//   In this case, we use the 3 rows of CDEF output across the boundary for
203
//   context; this corresponds to leaving the frame buffer as-is.
204
//
205
// * If we're at a tile boundary and loop filtering across tiles is disabled,
206
//   then we take the outermost row of CDEF pixels *within the current tile*
207
//   and copy it three times. Thus we behave exactly as if the tile were a full
208
//   frame.
209
//
210
// * Otherwise, we're at a stripe boundary within a tile. In that case, we
211
//   take 2 rows of deblocked pixels and extend them to 3 rows of context.
212
//
213
// The distinction between the latter two cases is handled by the
214
// svt_av1_loop_restoration_save_boundary_lines() function, so here we just need
215
// to decide if we're overwriting the above/below boundary pixels or not.
216
static void get_stripe_boundary_info(const RestorationTileLimits* limits, const Av1PixelRect* tile_rect, int32_t ss_y,
217
                                     int32_t* copy_above, int32_t* copy_below) {
218
    *copy_above = 1;
219
    *copy_below = 1;
220
221
    const int32_t full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
222
    const int32_t runit_offset       = RESTORATION_UNIT_OFFSET >> ss_y;
223
224
    const int32_t first_stripe_in_tile = (limits->v_start == tile_rect->top);
225
    const int32_t this_stripe_height   = full_stripe_height - (first_stripe_in_tile ? runit_offset : 0);
226
    const int32_t last_stripe_in_tile  = (limits->v_start + this_stripe_height >= tile_rect->bottom);
227
228
    if (first_stripe_in_tile) {
229
        *copy_above = 0;
230
    }
231
    if (last_stripe_in_tile) {
232
        *copy_below = 0;
233
    }
234
}
235
236
// Overwrite the border pixels around a processing stripe so that the conditions
237
// listed above get_stripe_boundary_info() are preserved.
238
// We save the pixels which get overwritten into a temporary buffer, so that
239
// they can be restored by restore_processing_stripe_boundary() after we've
240
// processed the stripe.
241
//
242
// limits gives the rectangular limits of the remaining stripes for the current
243
// restoration unit. rsb is the stored stripe boundaries (taken from either
244
// deblock or CDEF output as necessary).
245
//
246
// tile_rect is the limits of the current tile and tile_stripe0 is the index of
247
// the first stripe in this tile (needed to convert the tile-relative stripe
248
// index we get from limits into something we can look up in rsb).
249
static void setup_processing_stripe_boundary(const RestorationTileLimits*       limits,
250
                                             const RestorationStripeBoundaries* rsb, int32_t rsb_row,
251
                                             int32_t use_highbd, int32_t h, uint8_t* data8, int32_t data_stride,
252
                                             RestorationLineBuffers* rlbs, int32_t copy_above, int32_t copy_below,
253
                                             int32_t opt) {
254
    // Offsets within the line buffers. The buffer logically starts at column
255
    // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
256
    // has column x0 in the buffer.
257
    const int32_t buf_stride = rsb->stripe_boundary_stride;
258
    const int32_t buf_x0_off = limits->h_start;
259
    const int32_t line_width = (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
260
    const int32_t line_size  = line_width << use_highbd;
261
262
    const int32_t data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
263
264
    // Replace RESTORATION_BORDER pixels above the top of the stripe
265
    // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
266
    // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
267
    // duplicating the topmost of the 2 lines (see the AOMMAX call when
268
    // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
269
    //
270
    // Special case: If we're at the top of a tile, which isn't on the topmost
271
    // tile row, and we're allowed to loop filter across tiles, then we have a
272
    // logical 64-pixel-high stripe which has been split into an 8-pixel high
273
    // stripe and a 56-pixel high stripe (the current one). So, in this case,
274
    // we want to leave the boundary alone!
275
    if (!opt) {
276
        if (copy_above) {
277
            uint8_t* data8_tl = data8 + data_x0 + limits->v_start * data_stride;
278
279
            for (int32_t i = -RESTORATION_BORDER; i < 0; ++i) {
280
                const int32_t  buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
281
                const int32_t  buf_off = buf_x0_off + buf_row * buf_stride;
282
                const uint8_t* buf     = rsb->stripe_boundary_above + (buf_off << use_highbd);
283
                uint8_t*       dst8    = data8_tl + i * data_stride;
284
                // Save old pixels, then replace with data from stripe_boundary_above
285
                svt_memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER], REAL_PTR(use_highbd, dst8), line_size);
286
                svt_memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
287
            }
288
        }
289
290
        // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
291
        // The second buffer row is repeated, so src_row gets the values 0, 1, 1
292
        // for i = 0, 1, 2.
293
        if (copy_below) {
294
            const int32_t stripe_end = limits->v_start + h;
295
            uint8_t*      data8_bl   = data8 + data_x0 + stripe_end * data_stride;
296
297
            for (int32_t i = 0; i < RESTORATION_BORDER; ++i) {
298
                const int32_t  buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
299
                const int32_t  buf_off = buf_x0_off + buf_row * buf_stride;
300
                const uint8_t* src     = rsb->stripe_boundary_below + (buf_off << use_highbd);
301
302
                uint8_t* dst8 = data8_bl + i * data_stride;
303
                // Save old pixels, then replace with data from stripe_boundary_below
304
                svt_memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
305
                svt_memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
306
            }
307
        }
308
    } else {
309
        if (copy_above) {
310
            uint8_t* data8_tl = data8 + data_x0 + limits->v_start * data_stride;
311
312
            // Only save and overwrite i=-RESTORATION_BORDER line.
313
            uint8_t* dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
314
            // Save old pixels, then replace with data from stripe_boundary_above
315
            svt_memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
316
            svt_memcpy(REAL_PTR(use_highbd, dst8),
317
                       REAL_PTR(use_highbd, data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
318
                       line_size);
319
        }
320
321
        if (copy_below) {
322
            const int32_t stripe_end = limits->v_start + h;
323
            uint8_t*      data8_bl   = data8 + data_x0 + stripe_end * data_stride;
324
325
            // Only save and overwrite i=2 line.
326
            uint8_t* dst8 = data8_bl + 2 * data_stride;
327
            // Save old pixels, then replace with data from stripe_boundary_below
328
            svt_memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
329
            svt_memcpy(REAL_PTR(use_highbd, dst8), REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
330
        }
331
    }
332
}
333
334
// This function restores the boundary lines modified by
335
// setup_processing_stripe_boundary().
336
//
337
// Note: We need to be careful when handling the corners of the processing
338
// unit, because (eg.) the top-left corner is considered to be part of
339
// both the left and top borders. This means that, depending on the
340
// loop_filter_across_tiles_enabled flag, the corner pixels might get
341
// overwritten twice, once as part of the "top" border and once as part
342
// of the "left" border (or similar for other corners).
343
//
344
// Everything works out fine as long as we make sure to reverse the order
345
// when restoring, ie. we need to restore the left/right borders followed
346
// by the top/bottom borders.
347
static void restore_processing_stripe_boundary(const RestorationTileLimits* limits, const RestorationLineBuffers* rlbs,
348
                                               int32_t use_highbd, int32_t h, uint8_t* data8, int32_t data_stride,
349
                                               int32_t copy_above, int32_t copy_below, int32_t opt) {
350
    const int32_t line_width = (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
351
    const int32_t line_size  = line_width << use_highbd;
352
353
    const int32_t data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
354
355
    if (!opt) {
356
        if (copy_above) {
357
            uint8_t* data8_tl = data8 + data_x0 + limits->v_start * data_stride;
358
            for (int32_t i = -RESTORATION_BORDER; i < 0; ++i) {
359
                uint8_t* dst8 = data8_tl + i * data_stride;
360
                svt_memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
361
            }
362
        }
363
364
        if (copy_below) {
365
            const int32_t stripe_bottom = limits->v_start + h;
366
            uint8_t*      data8_bl      = data8 + data_x0 + stripe_bottom * data_stride;
367
368
            for (int32_t i = 0; i < RESTORATION_BORDER; ++i) {
369
                if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) {
370
                    break;
371
                }
372
373
                uint8_t* dst8 = data8_bl + i * data_stride;
374
                svt_memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
375
            }
376
        }
377
    } else {
378
        if (copy_above) {
379
            uint8_t* data8_tl = data8 + data_x0 + limits->v_start * data_stride;
380
381
            // Only restore i=-RESTORATION_BORDER line.
382
            uint8_t* dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
383
            svt_memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
384
        }
385
386
        if (copy_below) {
387
            const int32_t stripe_bottom = limits->v_start + h;
388
            uint8_t*      data8_bl      = data8 + data_x0 + stripe_bottom * data_stride;
389
390
            // Only restore i=2 line.
391
            if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
392
                uint8_t* dst8 = data8_bl + 2 * data_stride;
393
                svt_memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
394
            }
395
        }
396
    }
397
}
398
399
static void wiener_filter_stripe(const RestorationUnitInfo* rui, int32_t stripe_width, int32_t stripe_height,
400
                                 int32_t procunit_width, const uint8_t* src, int32_t src_stride, uint8_t* dst,
401
                                 int32_t dst_stride, int32_t* tmpbuf, int32_t bit_depth) {
402
    (void)tmpbuf;
403
    (void)bit_depth;
404
    assert(bit_depth == 8);
405
    const ConvolveParams conv_params = get_conv_params_wiener(8);
406
407
    for (int32_t j = 0; j < stripe_width; j += procunit_width) {
408
        int32_t        w     = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
409
        const uint8_t* src_p = src + j;
410
        uint8_t*       dst_p = dst + j; //CHKN  SSE
411
        svt_av1_wiener_convolve_add_src(src_p,
412
                                        src_stride,
413
                                        dst_p,
414
                                        dst_stride,
415
                                        rui->wiener_info.hfilter,
416
                                        rui->wiener_info.vfilter,
417
                                        w,
418
                                        stripe_height,
419
                                        &conv_params);
420
    }
421
}
422
423
/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
424
   over the input. The window is of size (2r + 1)x(2r + 1), and we
425
   specialize to r = 1 and r = 2. boxsum() asserts on any other radius.
426
427
   Each loop follows the same format: We keep a window's worth of input
428
   in individual variables and select data out of that as appropriate.
429
*/
430
static void boxsum1(int32_t* src, int32_t width, int32_t height, int32_t src_stride, int32_t sqr, int32_t* dst,
431
                    int32_t dst_stride) {
432
    int32_t i, j, a, b, c;
433
    assert(width > 2 * SGRPROJ_BORDER_HORZ);
434
    assert(height > 2 * SGRPROJ_BORDER_VERT);
435
436
    // Vertical sum over 3-pixel regions, from src into dst.
437
    if (!sqr) {
438
        for (j = 0; j < width; ++j) {
439
            a = src[j];
440
            b = src[src_stride + j];
441
            c = src[2 * src_stride + j];
442
443
            dst[j] = a + b;
444
            for (i = 1; i < height - 2; ++i) {
445
                // Loop invariant: At the start of each iteration,
446
                // a = src[(i - 1) * src_stride + j]
447
                // b = src[(i    ) * src_stride + j]
448
                // c = src[(i + 1) * src_stride + j]
449
                dst[i * dst_stride + j] = a + b + c;
450
                a                       = b;
451
                b                       = c;
452
                c                       = src[(i + 2) * src_stride + j];
453
            }
454
            dst[i * dst_stride + j]       = a + b + c;
455
            dst[(i + 1) * dst_stride + j] = b + c;
456
        }
457
    } else {
458
        for (j = 0; j < width; ++j) {
459
            a = src[j] * src[j];
460
            b = src[src_stride + j] * src[src_stride + j];
461
            c = src[2 * src_stride + j] * src[2 * src_stride + j];
462
463
            dst[j] = a + b;
464
            for (i = 1; i < height - 2; ++i) {
465
                dst[i * dst_stride + j] = a + b + c;
466
                a                       = b;
467
                b                       = c;
468
                c                       = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
469
            }
470
            dst[i * dst_stride + j]       = a + b + c;
471
            dst[(i + 1) * dst_stride + j] = b + c;
472
        }
473
    }
474
475
    // Horizontal sum over 3-pixel regions of dst
476
    for (i = 0; i < height; ++i) {
477
        a = dst[i * dst_stride];
478
        b = dst[i * dst_stride + 1];
479
        c = dst[i * dst_stride + 2];
480
481
        dst[i * dst_stride] = a + b;
482
        for (j = 1; j < width - 2; ++j) {
483
            // Loop invariant: At the start of each iteration,
484
            // a = src[i * src_stride + (j - 1)]
485
            // b = src[i * src_stride + (j    )]
486
            // c = src[i * src_stride + (j + 1)]
487
            dst[i * dst_stride + j] = a + b + c;
488
            a                       = b;
489
            b                       = c;
490
            c                       = dst[i * dst_stride + (j + 2)];
491
        }
492
        dst[i * dst_stride + j]       = a + b + c;
493
        dst[i * dst_stride + (j + 1)] = b + c;
494
    }
495
}
496
497
static void boxsum2(int32_t* src, int32_t width, int32_t height, int32_t src_stride, int32_t sqr, int32_t* dst,
498
                    int32_t dst_stride) {
499
    int32_t i, j, a, b, c, d, e;
500
    assert(width > 2 * SGRPROJ_BORDER_HORZ);
501
    assert(height > 2 * SGRPROJ_BORDER_VERT);
502
503
    // Vertical sum over 5-pixel regions, from src into dst.
504
    if (!sqr) {
505
        for (j = 0; j < width; ++j) {
506
            a = src[j];
507
            b = src[src_stride + j];
508
            c = src[2 * src_stride + j];
509
            d = src[3 * src_stride + j];
510
            e = src[4 * src_stride + j];
511
512
            dst[j]              = a + b + c;
513
            dst[dst_stride + j] = a + b + c + d;
514
            for (i = 2; i < height - 3; ++i) {
515
                // Loop invariant: At the start of each iteration,
516
                // a = src[(i - 2) * src_stride + j]
517
                // b = src[(i - 1) * src_stride + j]
518
                // c = src[(i    ) * src_stride + j]
519
                // d = src[(i + 1) * src_stride + j]
520
                // e = src[(i + 2) * src_stride + j]
521
                dst[i * dst_stride + j] = a + b + c + d + e;
522
                a                       = b;
523
                b                       = c;
524
                c                       = d;
525
                d                       = e;
526
                e                       = src[(i + 3) * src_stride + j];
527
            }
528
            dst[i * dst_stride + j]       = a + b + c + d + e;
529
            dst[(i + 1) * dst_stride + j] = b + c + d + e;
530
            dst[(i + 2) * dst_stride + j] = c + d + e;
531
        }
532
    } else {
533
        for (j = 0; j < width; ++j) {
534
            a = src[j] * src[j];
535
            b = src[src_stride + j] * src[src_stride + j];
536
            c = src[2 * src_stride + j] * src[2 * src_stride + j];
537
            d = src[3 * src_stride + j] * src[3 * src_stride + j];
538
            e = src[4 * src_stride + j] * src[4 * src_stride + j];
539
540
            dst[j]              = a + b + c;
541
            dst[dst_stride + j] = a + b + c + d;
542
            for (i = 2; i < height - 3; ++i) {
543
                dst[i * dst_stride + j] = a + b + c + d + e;
544
                a                       = b;
545
                b                       = c;
546
                c                       = d;
547
                d                       = e;
548
                e                       = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
549
            }
550
            dst[i * dst_stride + j]       = a + b + c + d + e;
551
            dst[(i + 1) * dst_stride + j] = b + c + d + e;
552
            dst[(i + 2) * dst_stride + j] = c + d + e;
553
        }
554
    }
555
556
    // Horizontal sum over 5-pixel regions of dst
557
    for (i = 0; i < height; ++i) {
558
        a = dst[i * dst_stride];
559
        b = dst[i * dst_stride + 1];
560
        c = dst[i * dst_stride + 2];
561
        d = dst[i * dst_stride + 3];
562
        e = dst[i * dst_stride + 4];
563
564
        dst[i * dst_stride]     = a + b + c;
565
        dst[i * dst_stride + 1] = a + b + c + d;
566
        for (j = 2; j < width - 3; ++j) {
567
            // Loop invariant: At the start of each iteration,
568
            // a = src[i * src_stride + (j - 2)]
569
            // b = src[i * src_stride + (j - 1)]
570
            // c = src[i * src_stride + (j    )]
571
            // d = src[i * src_stride + (j + 1)]
572
            // e = src[i * src_stride + (j + 2)]
573
            dst[i * dst_stride + j] = a + b + c + d + e;
574
            a                       = b;
575
            b                       = c;
576
            c                       = d;
577
            d                       = e;
578
            e                       = dst[i * dst_stride + (j + 3)];
579
        }
580
        dst[i * dst_stride + j]       = a + b + c + d + e;
581
        dst[i * dst_stride + (j + 1)] = b + c + d + e;
582
        dst[i * dst_stride + (j + 2)] = c + d + e;
583
    }
584
}
585
586
// Windowed sum (sqr == 0) or sum of squares (sqr != 0) with window radius r.
// Only r == 1 (3x3) and r == 2 (5x5) are implemented; any other radius trips
// the assert and leaves dst untouched.
static void boxsum(int32_t* src, int32_t width, int32_t height, int32_t src_stride, int32_t r, int32_t sqr,
                   int32_t* dst, int32_t dst_stride) {
    switch (r) {
    case 1:
        boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
        break;
    case 2:
        boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
        break;
    default:
        assert(0 && "Invalid value of r in self-guided filter");
        break;
    }
}
596
597
0
void svt_decode_xq(const int32_t* xqd, int32_t* xq, const SgrParamsType* params) {
598
0
    if (params->r[0] == 0) {
599
0
        xq[0] = 0;
600
0
        xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
601
0
    } else if (params->r[1] == 0) {
602
0
        xq[0] = xqd[0];
603
0
        xq[1] = 0;
604
0
    } else {
605
0
        xq[0] = xqd[0];
606
0
        xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
607
0
    }
608
0
}
609
610
// 256-entry lookup table indexed by x. As the name suggests, the entries
// track x / (x + 1) on a scale of 256 (e.g. 128 at x = 1, 171 at x = 2,
// 192 at x = 3). Entry 0 is special-cased to 1 and the final entry is 256.
const int32_t svt_aom_eb_x_by_xplus1[256] = {
611
    // Special case: Map 0 -> 1 (corresponding to a value of 1/256)
612
    // instead of 0. See comments in selfguided_restoration_internal() for why
613
    1,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, 240, 241, 242, 243, 243, 244, 244,
614
    245, 245, 246, 246, 247, 247, 247, 247, 248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250,
615
    250, 250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
616
    252, 252, 252, 252, 252, 252, 252, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
617
    253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254, 254, 254, 254, 254, 254,
618
    254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
619
    254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
620
    254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255,
621
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
622
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
623
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
624
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 256,
625
};
626
627
// Fixed-point reciprocal table: entry [n - 1] holds round(2^12 / n).
// The self-guided filter kernels index it with the box pixel count
// n = (2 * r + 1)^2 to divide a box sum by n without an integer division.
const int32_t svt_aom_eb_one_by_x[MAX_NELEM] = {
628
    4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
629
    293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
630
};
631
632
static void selfguided_restoration_fast_internal(int32_t* dgd, int32_t width, int32_t height, int32_t dgd_stride,
633
                                                 int32_t* dst, int32_t dst_stride, int32_t bit_depth,
634
                                                 int32_t sgr_params_idx, int32_t radius_idx) {
635
    const SgrParamsType* const params     = &svt_aom_eb_sgr_params[sgr_params_idx];
636
    const int32_t              r          = params->r[radius_idx];
637
    const int32_t              width_ext  = width + 2 * SGRPROJ_BORDER_HORZ;
638
    const int32_t              height_ext = height + 2 * SGRPROJ_BORDER_VERT;
639
    // Adjusting the stride of A and B here appears to avoid bad cache effects,
640
    // leading to a significant speed improvement.
641
    // We also align the stride to a multiple of 16 bytes, for consistency
642
    // with the SIMD version of this function.
643
    int32_t  buf_stride = ((width_ext + 3) & ~3) + 16;
644
    int32_t  a_[RESTORATION_PROC_UNIT_PELS];
645
    int32_t  b_[RESTORATION_PROC_UNIT_PELS];
646
    int32_t* A = a_;
647
    int32_t* B = b_;
648
    int32_t  i, j;
649
650
    assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
651
    assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && "Need SGRPROJ_BORDER_* >= r+1");
652
653
    boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
654
           width_ext,
655
           height_ext,
656
           dgd_stride,
657
           r,
658
           0,
659
           B,
660
           buf_stride);
661
    boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
662
           width_ext,
663
           height_ext,
664
           dgd_stride,
665
           r,
666
           1,
667
           A,
668
           buf_stride);
669
    A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
670
    B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
671
    // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
672
    // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
673
    for (i = -1; i < height + 1; i += 2) {
674
        for (j = -1; j < width + 1; ++j) {
675
            const int32_t k = i * buf_stride + j;
676
            const int32_t n = (2 * r + 1) * (2 * r + 1);
677
678
            // a < 2^16 * n < 2^22 regardless of bit depth
679
            uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
680
            // b < 2^8 * n < 2^14 regardless of bit depth
681
            uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
682
683
            // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
684
            // and p itself satisfies p < 2^14 * n^2 < 2^26.
685
            // This bound on p is due to:
686
            // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
687
            //
688
            // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
689
            // This is an artefact of rounding, and can only happen if all pixels
690
            // are (almost) identical, so in this case we saturate to p=0.
691
            uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
692
693
            const uint32_t s = params->s[radius_idx];
694
695
            // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
696
            // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
697
            // (this holds even after accounting for the rounding in s)
698
            const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
699
700
            // Note: We have to be quite careful about the value of A[k].
701
            // This is used as a blend factor between individual pixel values and the
702
            // local mean. So it logically has a range of [0, 256], including both
703
            // endpoints.
704
            //
705
            // This is a pain for hardware, as we'd like something which can be stored
706
            // in exactly 8 bits.
707
            // Further, in the calculation of B[k] below, if z == 0 and r == 2,
708
            // then A[k] "should be" 0. But then we can end up setting B[k] to a value
709
            // slightly above 2^(8 + bit depth), due to rounding in the value of
710
            // svt_aom_eb_one_by_x[25-1].
711
            //
712
            // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
713
            // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
714
            // overflow), without significantly affecting the final result: z == 0
715
            // implies that the image is essentially "flat", so the local mean and
716
            // individual pixel values are very similar.
717
            //
718
            // Note that saturating on the other side, ie. requring A[k] <= 255,
719
            // would be a bad idea, as that corresponds to the case where the image
720
            // is very variable, when we want to preserve the local pixel value as
721
            // much as possible.
722
            A[k] = svt_aom_eb_x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256]
723
724
            // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
725
            // svt_aom_eb_one_by_x[n - 1] = round(2^12 / n)
726
            // => the product here is < 2^(20 + bit_depth) <= 2^32,
727
            // and B[k] is set to a value < 2^(8 + bit depth)
728
            // This holds even with the rounding in svt_aom_eb_one_by_x and in the overall
729
            // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
730
            B[k] = (int32_t)ROUND_POWER_OF_TWO(
731
                (uint32_t)(SGRPROJ_SGR - A[k]) * (uint32_t)B[k] * (uint32_t)svt_aom_eb_one_by_x[n - 1],
732
                SGRPROJ_RECIP_BITS);
733
        }
734
    }
735
    // Use the A[] and B[] arrays to calculate the filtered image
736
    assert(r == 2);
737
    for (i = 0; i < height; ++i) {
738
        if (!(i & 1)) { // even row
739
            for (j = 0; j < width; ++j) {
740
                const int32_t k  = i * buf_stride + j;
741
                const int32_t l  = i * dgd_stride + j;
742
                const int32_t m  = i * dst_stride + j;
743
                const int32_t nb = 5;
744
                const int32_t a  = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
745
                    (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] + A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) * 5;
746
                const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
747
                    (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] + B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) * 5;
748
                const int32_t v = a * dgd[l] + b;
749
                dst[m]          = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
750
            }
751
        } else { // odd row
752
            for (j = 0; j < width; ++j) {
753
                const int32_t k  = i * buf_stride + j;
754
                const int32_t l  = i * dgd_stride + j;
755
                const int32_t m  = i * dst_stride + j;
756
                const int32_t nb = 4;
757
                const int32_t a  = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
758
                const int32_t b  = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
759
                const int32_t v  = a * dgd[l] + b;
760
                dst[m]           = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
761
            }
762
        }
763
    }
764
}
765
766
static void selfguided_restoration_internal(int32_t* dgd, int32_t width, int32_t height, int32_t dgd_stride,
767
                                            int32_t* dst, int32_t dst_stride, int32_t bit_depth, int32_t sgr_params_idx,
768
                                            int32_t radius_idx) {
769
    const SgrParamsType* const params     = &svt_aom_eb_sgr_params[sgr_params_idx];
770
    const int32_t              r          = params->r[radius_idx];
771
    const int32_t              width_ext  = width + 2 * SGRPROJ_BORDER_HORZ;
772
    const int32_t              height_ext = height + 2 * SGRPROJ_BORDER_VERT;
773
    // Adjusting the stride of A and B here appears to avoid bad cache effects,
774
    // leading to a significant speed improvement.
775
    // We also align the stride to a multiple of 16 bytes, for consistency
776
    // with the SIMD version of this function.
777
    int32_t  buf_stride = ((width_ext + 3) & ~3) + 16;
778
    int32_t  a_[RESTORATION_PROC_UNIT_PELS];
779
    int32_t  b_[RESTORATION_PROC_UNIT_PELS];
780
    int32_t* A = a_;
781
    int32_t* B = b_;
782
    int32_t  i, j;
783
784
    assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
785
    assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && "Need SGRPROJ_BORDER_* >= r+1");
786
787
    boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
788
           width_ext,
789
           height_ext,
790
           dgd_stride,
791
           r,
792
           0,
793
           B,
794
           buf_stride);
795
    boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
796
           width_ext,
797
           height_ext,
798
           dgd_stride,
799
           r,
800
           1,
801
           A,
802
           buf_stride);
803
    A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
804
    B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
805
    // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
806
    // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
807
    for (i = -1; i < height + 1; ++i) {
808
        for (j = -1; j < width + 1; ++j) {
809
            const int32_t k = i * buf_stride + j;
810
            const int32_t n = (2 * r + 1) * (2 * r + 1);
811
812
            // a < 2^16 * n < 2^22 regardless of bit depth
813
            uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
814
            // b < 2^8 * n < 2^14 regardless of bit depth
815
            uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
816
817
            // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
818
            // and p itself satisfies p < 2^14 * n^2 < 2^26.
819
            // This bound on p is due to:
820
            // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
821
            //
822
            // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
823
            // This is an artefact of rounding, and can only happen if all pixels
824
            // are (almost) identical, so in this case we saturate to p=0.
825
            uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
826
827
            const uint32_t s = params->s[radius_idx];
828
829
            // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
830
            // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
831
            // (this holds even after accounting for the rounding in s)
832
            const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
833
834
            // Note: We have to be quite careful about the value of A[k].
835
            // This is used as a blend factor between individual pixel values and the
836
            // local mean. So it logically has a range of [0, 256], including both
837
            // endpoints.
838
            //
839
            // This is a pain for hardware, as we'd like something which can be stored
840
            // in exactly 8 bits.
841
            // Further, in the calculation of B[k] below, if z == 0 and r == 2,
842
            // then A[k] "should be" 0. But then we can end up setting B[k] to a value
843
            // slightly above 2^(8 + bit depth), due to rounding in the value of
844
            // svt_aom_eb_one_by_x[25-1].
845
            //
846
            // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
847
            // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
848
            // overflow), without significantly affecting the final result: z == 0
849
            // implies that the image is essentially "flat", so the local mean and
850
            // individual pixel values are very similar.
851
            //
852
            // Note that saturating on the other side, ie. requring A[k] <= 255,
853
            // would be a bad idea, as that corresponds to the case where the image
854
            // is very variable, when we want to preserve the local pixel value as
855
            // much as possible.
856
            A[k] = svt_aom_eb_x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256]
857
858
            // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
859
            // svt_aom_eb_one_by_x[n - 1] = round(2^12 / n)
860
            // => the product here is < 2^(20 + bit_depth) <= 2^32,
861
            // and B[k] is set to a value < 2^(8 + bit depth)
862
            // This holds even with the rounding in svt_aom_eb_one_by_x and in the overall
863
            // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
864
            B[k] = (int32_t)ROUND_POWER_OF_TWO(
865
                (uint32_t)(SGRPROJ_SGR - A[k]) * (uint32_t)B[k] * (uint32_t)svt_aom_eb_one_by_x[n - 1],
866
                SGRPROJ_RECIP_BITS);
867
        }
868
    }
869
    // Use the A[] and B[] arrays to calculate the filtered image
870
    for (i = 0; i < height; ++i) {
871
        for (j = 0; j < width; ++j) {
872
            const int32_t k  = i * buf_stride + j;
873
            const int32_t l  = i * dgd_stride + j;
874
            const int32_t m  = i * dst_stride + j;
875
            const int32_t nb = 5;
876
            const int32_t a  = (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) * 4 +
877
                (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] + A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) * 3;
878
            const int32_t b = (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) * 4 +
879
                (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] + B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) * 3;
880
            const int32_t v = a * dgd[l] + b;
881
            dst[m]          = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
882
        }
883
    }
884
}
885
886
void svt_av1_selfguided_restoration_c(const uint8_t* dgd8, int32_t width, int32_t height, int32_t dgd_stride,
887
                                      int32_t* flt0, int32_t* flt1, int32_t flt_stride, int32_t sgr_params_idx,
888
0
                                      int32_t bit_depth, int32_t highbd) {
889
0
    int32_t       dgd32_[RESTORATION_PROC_UNIT_PELS];
890
0
    const int32_t dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
891
0
    int32_t*      dgd32        = dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
892
893
0
    if (highbd) {
894
0
        const uint16_t* dgd16 = CONVERT_TO_SHORTPTR(dgd8);
895
0
        for (int32_t i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
896
0
            for (int32_t j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
897
0
                dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
898
0
            }
899
0
        }
900
0
    } else {
901
0
        for (int32_t i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
902
0
            for (int32_t j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
903
0
                dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
904
0
            }
905
0
        }
906
0
    }
907
908
0
    const SgrParamsType* const params = &svt_aom_eb_sgr_params[sgr_params_idx];
909
    // If params->r == 0 we skip the corresponding filter. We only allow one of
910
    // the radii to be 0, as having both equal to 0 would be equivalent to
911
    // skipping SGR entirely.
912
0
    assert(!(params->r[0] == 0 && params->r[1] == 0));
913
914
0
    if (params->r[0] > 0) {
915
0
        selfguided_restoration_fast_internal(
916
0
            dgd32, width, height, dgd32_stride, flt0, flt_stride, bit_depth, sgr_params_idx, 0);
917
0
    }
918
0
    if (params->r[1] > 0) {
919
0
        selfguided_restoration_internal(
920
0
            dgd32, width, height, dgd32_stride, flt1, flt_stride, bit_depth, sgr_params_idx, 1);
921
0
    }
922
0
}
923
924
void svt_apply_selfguided_restoration_c(const uint8_t* dat8, int32_t width, int32_t height, int32_t stride, int32_t eps,
925
                                        const int32_t* xqd, uint8_t* dst8, int32_t dst_stride, int32_t* tmpbuf,
926
0
                                        int32_t bit_depth, int32_t highbd) {
927
0
    int32_t* flt0 = tmpbuf;
928
0
    int32_t* flt1 = flt0 + RESTORATION_UNITPELS_MAX;
929
0
    assert(width * height <= RESTORATION_UNITPELS_MAX);
930
931
0
    svt_av1_selfguided_restoration_c(dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
932
0
    const SgrParamsType* const params = &svt_aom_eb_sgr_params[eps];
933
0
    int32_t                    xq[2];
934
0
    svt_decode_xq(xqd, xq, params);
935
0
    for (int32_t i = 0; i < height; ++i) {
936
0
        for (int32_t j = 0; j < width; ++j) {
937
0
            const int32_t  k      = i * width + j;
938
0
            uint8_t*       dst8ij = dst8 + i * dst_stride + j;
939
0
            const uint8_t* dat8ij = dat8 + i * stride + j;
940
941
0
            const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
942
0
            const int32_t  u     = (int32_t)pre_u << SGRPROJ_RST_BITS;
943
0
            int32_t        v     = u << SGRPROJ_PRJ_BITS;
944
            // If params->r == 0 then we skipped the filtering in
945
            // svt_av1_selfguided_restoration_c, i.e. flt[k] == u
946
0
            if (params->r[0] > 0) {
947
0
                v += xq[0] * (flt0[k] - u);
948
0
            }
949
0
            if (params->r[1] > 0) {
950
0
                v += xq[1] * (flt1[k] - u);
951
0
            }
952
0
            const int16_t w = (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
953
954
0
            const uint16_t out = clip_pixel_highbd(w, bit_depth);
955
0
            if (highbd) {
956
0
                *CONVERT_TO_SHORTPTR(dst8ij) = out;
957
0
            } else {
958
0
                *dst8ij = (uint8_t)out;
959
0
            }
960
0
        }
961
0
    }
962
0
}
963
964
static void sgrproj_filter_stripe(const RestorationUnitInfo* rui, int32_t stripe_width, int32_t stripe_height,
965
                                  int32_t procunit_width, const uint8_t* src, int32_t src_stride, uint8_t* dst,
966
                                  int32_t dst_stride, int32_t* tmpbuf, int32_t bit_depth) {
967
    (void)bit_depth;
968
    assert(bit_depth == 8);
969
970
    for (int32_t j = 0; j < stripe_width; j += procunit_width) {
971
        int32_t w = AOMMIN(procunit_width, stripe_width - j);
972
        //CHKN SSE
973
        svt_apply_selfguided_restoration(src + j,
974
                                         w,
975
                                         stripe_height,
976
                                         src_stride,
977
                                         rui->sgrproj_info.ep,
978
                                         rui->sgrproj_info.xqd,
979
                                         dst + j,
980
                                         dst_stride,
981
                                         tmpbuf,
982
                                         bit_depth,
983
                                         0);
984
    }
985
}
986
987
static void wiener_filter_stripe_highbd(const RestorationUnitInfo* rui, int32_t stripe_width, int32_t stripe_height,
988
                                        int32_t procunit_width, const uint8_t* src8, int32_t src_stride, uint8_t* dst8,
989
                                        int32_t dst_stride, int32_t* tmpbuf, int32_t bit_depth) {
990
    (void)tmpbuf;
991
    const ConvolveParams conv_params = get_conv_params_wiener(bit_depth);
992
993
    for (int32_t j = 0; j < stripe_width; j += procunit_width) {
994
        int32_t        w      = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
995
        const uint8_t* src8_p = src8 + j;
996
        uint8_t*       dst8_p = dst8 + j;
997
        svt_av1_highbd_wiener_convolve_add_src(src8_p,
998
                                               src_stride,
999
                                               dst8_p,
1000
                                               dst_stride, //CHKN  SSE
1001
                                               rui->wiener_info.hfilter,
1002
                                               rui->wiener_info.vfilter,
1003
                                               w,
1004
                                               stripe_height,
1005
                                               &conv_params,
1006
                                               bit_depth);
1007
    }
1008
}
1009
1010
static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo* rui, int32_t stripe_width, int32_t stripe_height,
1011
                                         int32_t procunit_width, const uint8_t* src8, int32_t src_stride, uint8_t* dst8,
1012
                                         int32_t dst_stride, int32_t* tmpbuf, int32_t bit_depth) {
1013
    for (int32_t j = 0; j < stripe_width; j += procunit_width) {
1014
        int32_t w = AOMMIN(procunit_width, stripe_width - j);
1015
1016
        //CHKN SSE
1017
        svt_apply_selfguided_restoration(src8 + j,
1018
                                         w,
1019
                                         stripe_height,
1020
                                         src_stride,
1021
                                         rui->sgrproj_info.ep,
1022
                                         rui->sgrproj_info.xqd,
1023
                                         dst8 + j,
1024
                                         dst_stride,
1025
                                         tmpbuf,
1026
                                         bit_depth,
1027
                                         1);
1028
    }
1029
}
1030
1031
#define NUM_STRIPE_FILTERS 4
1032
typedef void (*StripeFilterFun)(const RestorationUnitInfo* rui, int32_t stripe_width, int32_t stripe_height,
1033
                                int32_t procunit_width, const uint8_t* src, int32_t src_stride, uint8_t* dst,
1034
                                int32_t dst_stride, int32_t* tmpbuf, int32_t bit_depth);
1035
1036
static const StripeFilterFun stripe_filters[NUM_STRIPE_FILTERS] = {
1037
    wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd, sgrproj_filter_stripe_highbd};
1038
1039
// Filter one restoration unit
1040
void svt_av1_loop_restoration_filter_unit(uint8_t need_bounadaries, const RestorationTileLimits* limits,
1041
                                          const RestorationUnitInfo* rui, const RestorationStripeBoundaries* rsb,
1042
                                          RestorationLineBuffers* rlbs, const Av1PixelRect* tile_rect,
1043
                                          int32_t tile_stripe0, int32_t ss_x, int32_t ss_y, int32_t highbd,
1044
                                          int32_t bit_depth, uint8_t* data8, int32_t stride, uint8_t* dst8,
1045
0
                                          int32_t dst_stride, int32_t* tmpbuf, int32_t optimized_lr) {
1046
0
    RestorationType unit_rtype = rui->restoration_type;
1047
1048
0
    int32_t  unit_h   = limits->v_end - limits->v_start;
1049
0
    int32_t  unit_w   = limits->h_end - limits->h_start;
1050
0
    uint8_t* data8_tl = data8 + limits->v_start * stride + limits->h_start;
1051
0
    uint8_t* dst8_tl  = dst8 + limits->v_start * dst_stride + limits->h_start;
1052
1053
0
    if (unit_rtype == RESTORE_NONE) {
1054
0
        copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd);
1055
0
        return;
1056
0
    }
1057
1058
0
    const int32_t filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
1059
0
    assert(filter_idx < NUM_STRIPE_FILTERS);
1060
0
    const StripeFilterFun stripe_filter = stripe_filters[filter_idx];
1061
1062
0
    const int32_t procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
1063
1064
    // Convolve the whole tile one stripe at a time
1065
0
    RestorationTileLimits remaining_stripes = *limits;
1066
0
    int32_t               i                 = 0;
1067
0
    while (i < unit_h) {
1068
0
        int32_t copy_above, copy_below;
1069
0
        remaining_stripes.v_start = limits->v_start + i;
1070
1071
0
        get_stripe_boundary_info(&remaining_stripes, tile_rect, ss_y, &copy_above, &copy_below);
1072
1073
0
        const int32_t full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1074
0
        const int32_t runit_offset       = RESTORATION_UNIT_OFFSET >> ss_y;
1075
1076
        // Work out where this stripe's boundaries are within
1077
        // rsb->stripe_boundary_{above,below}
1078
0
        const int32_t tile_stripe  = (remaining_stripes.v_start - tile_rect->top + runit_offset) / full_stripe_height;
1079
0
        const int32_t frame_stripe = tile_stripe0 + tile_stripe;
1080
0
        const int32_t rsb_row      = RESTORATION_CTX_VERT * frame_stripe;
1081
1082
        // Calculate this stripe's height, based on two rules:
1083
        // * The topmost stripe in each tile is 8 luma pixels shorter than usual.
1084
        // * We can't extend past the end of the current restoration unit
1085
0
        const int32_t nominal_stripe_height = full_stripe_height - ((tile_stripe == 0) ? runit_offset : 0);
1086
0
        const int32_t h = AOMMIN(nominal_stripe_height, remaining_stripes.v_end - remaining_stripes.v_start);
1087
1088
0
        if (need_bounadaries) {
1089
0
            setup_processing_stripe_boundary(
1090
0
                &remaining_stripes, rsb, rsb_row, highbd, h, data8, stride, rlbs, copy_above, copy_below, optimized_lr);
1091
0
        }
1092
1093
0
        stripe_filter(rui,
1094
0
                      unit_w,
1095
0
                      h,
1096
0
                      procunit_width,
1097
0
                      data8_tl + i * stride,
1098
0
                      stride,
1099
0
                      dst8_tl + i * dst_stride,
1100
0
                      dst_stride,
1101
0
                      tmpbuf,
1102
0
                      bit_depth);
1103
0
        if (need_bounadaries) {
1104
0
            restore_processing_stripe_boundary(
1105
0
                &remaining_stripes, rlbs, highbd, h, data8, stride, copy_above, copy_below, optimized_lr);
1106
0
        }
1107
1108
0
        i += h;
1109
0
    }
1110
0
}
1111
1112
typedef struct {
1113
    const RestorationInfo*  rsi;
1114
    RestorationLineBuffers* rlbs;
1115
    const Av1Common*        cm;
1116
    int32_t                 tile_stripe0;
1117
    int32_t                 ss_x, ss_y;
1118
    int32_t                 highbd, bit_depth;
1119
    uint8_t *               data8, *dst8;
1120
    int32_t                 data_stride, dst_stride;
1121
    int32_t*                tmpbuf;
1122
} FilterFrameCtxt;
1123
1124
0
static void filter_frame_on_tile(int32_t tile_row, int32_t tile_col, void* priv) {
1125
0
    (void)tile_col;
1126
0
    FilterFrameCtxt* ctxt = (FilterFrameCtxt*)priv;
1127
0
    ctxt->tile_stripe0    = (tile_row == 0) ? 0 : ctxt->cm->child_pcs->rst_end_stripe[tile_row - 1];
1128
0
}
1129
1130
static void filter_frame_on_unit(const RestorationTileLimits* limits, const Av1PixelRect* tile_rect,
1131
                                 int32_t rest_unit_idx, void* priv) {
1132
    FilterFrameCtxt*       ctxt = (FilterFrameCtxt*)priv;
1133
    const RestorationInfo* rsi  = ctxt->rsi;
1134
1135
    svt_av1_loop_restoration_filter_unit(1,
1136
                                         limits,
1137
                                         &rsi->unit_info[rest_unit_idx],
1138
                                         &rsi->boundaries,
1139
                                         ctxt->rlbs,
1140
                                         tile_rect,
1141
                                         ctxt->tile_stripe0,
1142
                                         ctxt->ss_x,
1143
                                         ctxt->ss_y,
1144
                                         ctxt->highbd,
1145
                                         ctxt->bit_depth,
1146
                                         ctxt->data8,
1147
                                         ctxt->data_stride,
1148
                                         ctxt->dst8,
1149
                                         ctxt->dst_stride,
1150
                                         ctxt->tmpbuf,
1151
                                         rsi->optimized_lr);
1152
}
1153
1154
/*
 * Run loop restoration over all three planes of `frame`, in place.
 *
 * A temporary frame buffer (cm->rst_frame) is allocated as the filter
 * destination. Every plane whose frame restoration type is not RESTORE_NONE
 * is border-extended, filtered unit by unit into the temporary buffer and
 * copied back into `frame`. The temporary buffer is released before
 * returning.
 *
 *   rst_tmpbuf    - scratch buffer handed down to the unit filters
 *   frame         - frame to restore (modified in place)
 *   cm            - common state; supplies subsampling, bit depth and the
 *                   per-plane restoration info
 *   optimized_lr  - stored into each plane's restoration info and forwarded
 *                   to the stripe boundary helpers
 *
 * If the temporary buffer cannot be allocated the failure is logged and the
 * frame is left unfiltered.
 */
void svt_av1_loop_restoration_filter_frame(int32_t* rst_tmpbuf, Yv12BufferConfig* frame, Av1Common* cm,
                                           int32_t optimized_lr) {
    // assert(!cm->all_lossless);
    const int32_t num_planes = 3; // av1_num_planes(cm);
    typedef void (*CopyFun)(const Yv12BufferConfig* src, Yv12BufferConfig* dst);
    static const CopyFun copy_funs[3] = {
        svt_aom_yv12_copy_y_c, svt_aom_yv12_copy_u_c, svt_aom_yv12_copy_v_c}; //CHKN SSE

    Yv12BufferConfig* dst = &cm->rst_frame;

    const int32_t frame_width  = frame->crop_widths[0];
    const int32_t frame_height = frame->crop_heights[0];
    if (svt_aom_realloc_frame_buffer(dst,
                                     frame_width,
                                     frame_height,
                                     cm->subsampling_x,
                                     cm->subsampling_y,
                                     cm->use_highbitdepth,
                                     AOM_RESTORATION_FRAME_BORDER,
                                     cm->byte_alignment,
                                     NULL,
                                     NULL,
                                     NULL) < 0) {
        SVT_LOG("Failed to allocate restoration dst buffer\n");
        // Without a destination buffer the filters below would write through,
        // and the copy-back would read from, invalid plane pointers. Bail out
        // and leave the frame unfiltered instead.
        // NOTE(review): assumes a failed realloc leaves nothing that this
        // function would have to free - confirm against
        // svt_aom_realloc_frame_buffer().
        return;
    }

    RestorationLineBuffers rlbs;
    const int32_t          bit_depth = cm->bit_depth;
    const int32_t          highbd    = cm->use_highbitdepth;

    for (int32_t plane = 0; plane < num_planes; ++plane) {
        RestorationInfo* rsi   = &cm->child_pcs->rst_info[plane];
        RestorationType  rtype = rsi->frame_restoration_type;
        rsi->optimized_lr      = optimized_lr;

        if (rtype == RESTORE_NONE) {
            continue;
        }
        const int32_t is_uv        = plane > 0;
        const int32_t plane_width  = frame->crop_widths[is_uv];
        const int32_t plane_height = frame->crop_heights[is_uv];

        // The filters read beyond the plane edges, so pad the plane first.
        svt_extend_frame(frame->buffers[plane],
                         plane_width,
                         plane_height,
                         frame->strides[is_uv],
                         RESTORATION_BORDER,
                         RESTORATION_BORDER,
                         highbd);

        FilterFrameCtxt ctxt;
        ctxt.rsi         = rsi;
        ctxt.rlbs        = &rlbs;
        ctxt.cm          = cm;
        ctxt.ss_x        = is_uv && cm->subsampling_x;
        ctxt.ss_y        = is_uv && cm->subsampling_y;
        ctxt.highbd      = highbd;
        ctxt.bit_depth   = bit_depth;
        ctxt.data8       = frame->buffers[plane];
        ctxt.dst8        = dst->buffers[plane];
        ctxt.data_stride = frame->strides[is_uv];
        ctxt.dst_stride  = dst->strides[is_uv];
        ctxt.tmpbuf      = rst_tmpbuf;
        // tile_stripe0 is filled in by filter_frame_on_tile before any unit
        // of the tile is filtered.
        svt_aom_foreach_rest_unit_in_frame(cm, plane, filter_frame_on_tile, filter_frame_on_unit, &ctxt);

        // Copy the restored plane back into the frame.
        copy_funs[plane](dst, frame);
    }
    // Release the temporary destination buffer.
    if (dst->buffer_alloc_sz) {
        dst->buffer_alloc_sz = 0;
        EB_FREE_ARRAY(dst->buffer_alloc);
    }
}
1226
1227
/*
 * Walk the restoration units of one tile in raster order (unit rows from top
 * to bottom, units from left to right inside a row) and pass each of them to
 * on_rest_unit.
 *
 * Units nominally measure unit_size along each axis. Once fewer than
 * 1.5 * unit_size pixels remain along an axis, the whole remainder becomes the
 * last unit on that axis. After the vertical extent is chosen it is shifted up
 * by (RESTORATION_UNIT_OFFSET >> ss_y): the start is clamped to the tile top,
 * and the end is only shifted when it does not already sit on the tile bottom.
 *
 * tile_rect        rectangle covered by the tile
 * tile_row/col     position of the tile, tile_cols the number of tile columns;
 *                  together with units_per_tile they select the first unit index
 * hunits_per_tile  row pitch used when computing the unit index
 * on_rest_unit     visitor, called as (limits, tile_rect, unit_idx, priv)
 */
static void foreach_rest_unit_in_tile(const Av1PixelRect* tile_rect, int32_t tile_row, int32_t tile_col,
                                      int32_t tile_cols, int32_t hunits_per_tile, int32_t units_per_tile,
                                      int32_t unit_size, int32_t ss_y, RestUnitVisitor on_rest_unit, void* priv) {
    const int32_t width        = tile_rect->right - tile_rect->left;
    const int32_t height       = tile_rect->bottom - tile_rect->top;
    const int32_t merge_size   = unit_size * 3 / 2;
    const int32_t first_unit   = (tile_col + tile_row * tile_cols) * units_per_tile;
    const int32_t stripe_shift = RESTORATION_UNIT_OFFSET >> ss_y;

    int32_t unit_h = 0;
    for (int32_t unit_row = 0, y = 0; y < height; y += unit_h, ++unit_row) {
        // A short tail is absorbed into the final unit instead of forming its own.
        const int32_t rows_left = height - y;
        unit_h                  = (rows_left < merge_size) ? rows_left : unit_size;

        RestorationTileLimits limits;
        limits.v_start = tile_rect->top + y;
        limits.v_end   = tile_rect->top + y + unit_h;
        assert(limits.v_end <= tile_rect->bottom);

        // Move the unit upwards so that it lines up with the processing stripes.
        limits.v_start = AOMMAX(tile_rect->top, limits.v_start - stripe_shift);
        if (limits.v_end < tile_rect->bottom) {
            limits.v_end -= stripe_shift;
        }

        int32_t unit_w = 0;
        for (int32_t unit_col = 0, x = 0; x < width; x += unit_w, ++unit_col) {
            const int32_t cols_left = width - x;
            unit_w                  = (cols_left < merge_size) ? cols_left : unit_size;

            limits.h_start = tile_rect->left + x;
            limits.h_end   = tile_rect->left + x + unit_w;
            assert(limits.h_end <= tile_rect->right);

            on_rest_unit(&limits, tile_rect, first_unit + unit_row * hunits_per_tile + unit_col, priv);
        }
    }
}
1273
1274
void svt_aom_foreach_rest_unit_in_frame(Av1Common* cm, int32_t plane, RestTileStartVisitor on_tile,
1275
0
                                        RestUnitVisitor on_rest_unit, void* priv) {
1276
0
    const int32_t is_uv = plane > 0;
1277
0
    const int32_t ss_y  = is_uv && cm->subsampling_y;
1278
1279
0
    const RestorationInfo* rsi = &cm->child_pcs->rst_info[plane];
1280
1281
0
    const Av1PixelRect tile_rect = whole_frame_rect(&cm->frm_size, cm->subsampling_x, cm->subsampling_y, is_uv);
1282
1283
0
    if (on_tile) {
1284
0
        on_tile(0, 0, priv);
1285
0
    }
1286
1287
0
    foreach_rest_unit_in_tile(&tile_rect,
1288
0
                              0,
1289
0
                              0,
1290
0
                              1,
1291
0
                              rsi->horz_units_per_tile,
1292
0
                              rsi->units_per_tile,
1293
0
                              rsi->restoration_unit_size,
1294
0
                              ss_y,
1295
0
                              on_rest_unit,
1296
0
                              priv);
1297
0
}
1298
1299
/*
 * Segment-restricted variant of the per-tile restoration unit walk: only the
 * units that belong to segment `segment_index` are handed to on_rest_unit.
 *
 * The tile is split into rest_segments_column_count x rest_segments_row_count
 * segments. SEGMENT_CONVERT_IDX_TO_XY turns segment_index into a segment
 * column/row, and SEGMENT_START_IDX / SEGMENT_END_IDX (defined elsewhere) are
 * used as the first and one-past-last unit column/row of that segment.
 *
 * Unit geometry follows the same rules as the unsegmented walk: nominal size
 * unit_size, a remainder shorter than 1.5 * unit_size is folded into the last
 * unit, and vertical limits are shifted up by (RESTORATION_UNIT_OFFSET >> ss_y).
 *
 * The only caller visible in this file passes tile_row = 0, tile_col = 0 and
 * tile_cols = 1, which makes the first unit index 0 and the tile rectangle the
 * whole picture.
 */
static void foreach_rest_unit_in_tile_seg(const Av1PixelRect* tile_rect, int32_t tile_row, int32_t tile_col,
                                          int32_t tile_cols, int32_t hunits_per_tile, int32_t units_per_tile,
                                          int32_t unit_size, int32_t ss_y, RestUnitVisitor on_rest_unit, void* priv,
                                          int32_t vunits_per_tile, uint8_t rest_segments_column_count,
                                          uint8_t rest_segments_row_count, uint32_t segment_index) {
    const int32_t width        = tile_rect->right - tile_rect->left;
    const int32_t height       = tile_rect->bottom - tile_rect->top;
    const int32_t merge_size   = unit_size * 3 / 2;
    const int32_t first_unit   = (tile_col + tile_row * tile_cols) * units_per_tile;
    const int32_t stripe_shift = RESTORATION_UNIT_OFFSET >> ss_y;

    uint32_t units_across = hunits_per_tile;
    uint32_t units_down   = vunits_per_tile;

    // Locate the segment and the range of unit columns / rows it owns.
    uint32_t seg_col;
    uint32_t seg_row;
    SEGMENT_CONVERT_IDX_TO_XY(segment_index, seg_col, seg_row, rest_segments_column_count);
    uint32_t col_first = SEGMENT_START_IDX(seg_col, units_across, rest_segments_column_count);
    uint32_t col_last  = SEGMENT_END_IDX(seg_col, units_across, rest_segments_column_count);
    uint32_t row_first = SEGMENT_START_IDX(seg_row, units_down, rest_segments_row_count);
    uint32_t row_last  = SEGMENT_END_IDX(seg_row, units_down, rest_segments_row_count);

    // Pixel range covered by the segment. A segment that owns the last unit
    // column / row runs to the tile edge so that the merged tail is included.
    const int32_t x_begin = col_first * unit_size;
    const int32_t x_stop  = ((int32_t)col_last == (int32_t)units_across)
         ? width
         : AOMMIN((int32_t)col_last * (int32_t)unit_size, width);
    const int32_t y_stop  = ((int32_t)row_last == (int32_t)units_down) ? height
                                                                       : (int32_t)row_last * (int32_t)unit_size;

    int32_t unit_row = row_first;
    int32_t unit_h   = 0;
    for (int32_t y = row_first * unit_size; y < y_stop; y += unit_h, ++unit_row) {
        // Whatever is left at the picture boundary only forms its own unit when
        // it is at least half a unit tall; otherwise it is merged into this
        // unit, which can therefore grow to almost 3/2 of unit_size.
        const int32_t rows_left = height - y;
        unit_h                  = (rows_left < merge_size) ? rows_left : unit_size;

        RestorationTileLimits limits;
        limits.v_start = tile_rect->top + y;
        limits.v_end   = tile_rect->top + y + unit_h;
        assert(limits.v_end <= tile_rect->bottom);

        // Move the unit upwards so that it lines up with the processing stripes.
        limits.v_start = AOMMAX(tile_rect->top, limits.v_start - stripe_shift);
        if (limits.v_end < tile_rect->bottom) {
            limits.v_end -= stripe_shift;
        }

        int32_t unit_col = col_first;
        int32_t unit_w   = 0;
        for (int32_t x = x_begin; x < x_stop; x += unit_w, ++unit_col) {
            const int32_t cols_left = width - x;
            unit_w                  = (cols_left < merge_size) ? cols_left : unit_size;

            limits.h_start = tile_rect->left + x;
            limits.h_end   = tile_rect->left + x + unit_w;
            assert(limits.h_end <= tile_rect->right);

            on_rest_unit(&limits, tile_rect, first_unit + unit_row * hunits_per_tile + unit_col, priv);
        }
    }
}
1375
1376
/* For each restoration unit in the frame, get the best filter parameters and distortions
1377
   for the passed filter type.
1378
*/
1379
void svt_aom_foreach_rest_unit_in_frame_seg(Av1Common* cm, int32_t plane, RestTileStartVisitor on_tile,
1380
                                            RestUnitVisitor on_rest_unit, void* priv,
1381
                                            uint8_t rest_segments_column_count, uint8_t rest_segments_row_count,
1382
0
                                            uint32_t segment_index) {
1383
0
    const int32_t is_uv = plane > 0;
1384
0
    const int32_t ss_y  = is_uv && cm->subsampling_y;
1385
1386
0
    const RestorationInfo* rsi = &cm->child_pcs->rst_info[plane];
1387
1388
0
    const Av1PixelRect tile_rect = whole_frame_rect(&cm->frm_size, cm->subsampling_x, cm->subsampling_y, is_uv);
1389
1390
0
    if (on_tile) {
1391
0
        on_tile(0, 0, priv); //will set rsc->tile_strip0=0;
1392
0
    }
1393
1394
0
    foreach_rest_unit_in_tile_seg(&tile_rect,
1395
0
                                  0,
1396
0
                                  0,
1397
0
                                  1,
1398
0
                                  rsi->horz_units_per_tile,
1399
0
                                  rsi->units_per_tile,
1400
0
                                  rsi->restoration_unit_size,
1401
0
                                  ss_y,
1402
0
                                  on_rest_unit,
1403
0
                                  priv,
1404
0
                                  rsi->vert_units_per_tile,
1405
0
                                  rest_segments_column_count,
1406
0
                                  rest_segments_row_count,
1407
0
                                  segment_index);
1408
0
}
1409
1410
/*
 * Compute the range of restoration units whose top-left corner lies inside
 * the superblock at (mi_row, mi_col) for the given plane.
 *
 * On success the half-open unit ranges [*rcol0, *rcol1) x [*rrow0, *rrow1) and
 * *tile_tl_idx (index of the tile's first unit; the frame is one tile here)
 * are written.
 *
 * Returns 0 without touching the outputs when bsize is not the sequence's
 * superblock size or when the plane's frame restoration type is RESTORE_NONE.
 * Otherwise returns non-zero iff both unit ranges are non-empty.
 */
int32_t svt_av1_loop_restoration_corners_in_sb(Av1Common* cm, SeqHeader* seq_header_p, int32_t plane, int32_t mi_row,
                                               int32_t mi_col, BlockSize bsize, int32_t* rcol0, int32_t* rcol1,
                                               int32_t* rrow0, int32_t* rrow1, int32_t* tile_tl_idx) {
    assert(rcol0 && rcol1 && rrow0 && rrow1);

    if (bsize != seq_header_p->sb_size ||
        cm->child_pcs->rst_info[plane].frame_restoration_type == RESTORE_NONE) {
        return 0;
    }

    const int32_t chroma = plane > 0;
    const int32_t ss_x   = chroma && cm->subsampling_x;
    const int32_t ss_y   = chroma && cm->subsampling_y;

    const Av1PixelRect frame_rect = whole_frame_rect(&cm->frm_size, cm->subsampling_x, cm->subsampling_y, chroma);
    const int32_t      frame_w    = frame_rect.right - frame_rect.left;
    const int32_t      frame_h    = frame_rect.bottom - frame_rect.top;

    // The tile origin is mi position (0, 0), so the superblock corners relative
    // to the tile are simply its absolute mi coordinates.
    const int32_t sb_row0 = mi_row;
    const int32_t sb_col0 = mi_col;
    const int32_t sb_row1 = sb_row0 + mi_size_high[bsize];
    const int32_t sb_col1 = sb_col0 + mi_size_wide[bsize];

    const RestorationInfo* info      = &cm->child_pcs->rst_info[plane];
    const int32_t          unit_size = info->restoration_unit_size;

    // Number of units actually present in this tile; this may be smaller than
    // info->horz_units_per_tile / info->vert_units_per_tile.
    const int32_t units_across = count_units_in_tile(unit_size, frame_w);
    const int32_t units_down   = count_units_in_tile(unit_size, frame_h);

    // Size of one mi unit, in pixels, on this plane.
    const int32_t mi_px_x = MI_SIZE >> ss_x;
    const int32_t mi_px_y = MI_SIZE >> ss_y;

    // With m the relative mi column, D the superres denominator and N the
    // superres numerator, the downscaled pixel offset MI_SIZE * m equals
    // (N / D) * u, where u is the upscaled pixel offset. Hence
    // u = D * MI_SIZE * m / N, and the unit column is u / unit_size. Superres
    // only scales horizontally, so rows need no such correction.
    const int     scaled  = !av1_superres_unscaled(&cm->frm_size);
    const int     num_x   = scaled ? mi_px_x * cm->frm_size.superres_denominator : mi_px_x;
    const int     num_y   = mi_px_y;
    const int     den_x   = scaled ? unit_size * SCALE_NUMERATOR : unit_size;
    const int32_t den_y   = unit_size;
    const int32_t round_x = den_x - 1;
    const int32_t round_y = den_y - 1;

    // First unit column/row that does not start left of / above the superblock.
    // The division rounds up: a superblock that starts at unit column 10.1
    // first covers the corner of unit column 11.
    *rcol0 = (sb_col0 * num_x + round_x) / den_x;
    *rrow0 = (sb_row0 * num_y + round_y) / den_y;

    // Same computation for the superblock to the lower right. At the right or
    // bottom edge that unit may not exist, so clamp to the unit counts.
    *rcol1 = AOMMIN((sb_col1 * num_x + round_x) / den_x, units_across);
    *rrow1 = AOMMIN((sb_row1 * num_y + round_y) / den_y, units_down);

    const int32_t whole_frame_tile = 0;
    *tile_tl_idx                   = whole_frame_tile * info->units_per_tile;

    return *rcol0 < *rcol1 && *rrow0 < *rrow1;
}
1490
1491
/*
 * Pad `height` rows horizontally by replicating edge samples: the `extend`
 * samples before each row are filled with the row's first sample and the
 * `extend` samples after it with the row's last sample.
 *
 * buf points at the first sample of the first row and advances by `stride`
 * bytes per row. When use_highbitdepth is non-zero the rows are accessed as
 * uint16_t samples, otherwise as bytes; width and extend are counted in
 * samples in both cases.
 */
static void extend_lines(uint8_t* buf, int32_t width, int32_t height, int32_t stride, int32_t extend,
                         int32_t use_highbitdepth) {
    if (use_highbitdepth) {
        for (int32_t row = 0; row < height; ++row, buf += stride) {
            uint16_t* samples = (uint16_t*)buf;
            svt_aom_memset16(samples - extend, samples[0], extend);
            svt_aom_memset16(samples + width, samples[width - 1], extend);
        }
        return;
    }

    for (int32_t row = 0; row < height; ++row, buf += stride) {
        memset(buf - extend, buf[0], extend);
        memset(buf + width, buf[width - 1], extend);
    }
}
1506
1507
void svt_aom_save_deblock_boundary_lines(uint8_t* src_buf, int32_t src_stride, int32_t src_width, int32_t src_height,
1508
                                         const Av1Common* cm, int32_t plane, int32_t row, int32_t stripe,
1509
                                         int32_t use_highbd, int32_t is_above,
1510
0
                                         RestorationStripeBoundaries* boundaries) {
1511
0
    const int32_t is_uv     = plane > 0;
1512
0
    src_stride              = src_stride << use_highbd;
1513
0
    const uint8_t* src_rows = src_buf + row * src_stride;
1514
1515
0
    uint8_t*      bdry_buf    = is_above ? boundaries->stripe_boundary_above : boundaries->stripe_boundary_below;
1516
0
    uint8_t*      bdry_start  = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
1517
0
    const int32_t bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
1518
0
    uint8_t*      bdry_rows   = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
1519
1520
    // There is a rare case in which a processing stripe can end 1px above the
1521
    // crop border. In this case, we do want to use deblocked pixels from below
1522
    // the stripe (hence why we ended up in this function), but instead of
1523
    // fetching 2 "below" rows we need to fetch one and duplicate it.
1524
    // This is equivalent to clamping the sample locations against the crop border
1525
0
    const int32_t lines_to_save = AOMMIN(RESTORATION_CTX_VERT, src_height - row);
1526
1527
0
    assert(lines_to_save == 1 || lines_to_save == 2);
1528
1529
0
    int32_t upscaled_width;
1530
0
    int32_t line_bytes;
1531
1532
0
    if (!av1_superres_unscaled(&cm->frm_size)) {
1533
0
        int32_t sx     = is_uv && cm->subsampling_x;
1534
0
        upscaled_width = (cm->frm_size.superres_upscaled_width + sx) >> sx;
1535
0
        line_bytes     = upscaled_width << use_highbd;
1536
1537
0
        svt_av1_upscale_normative_rows(cm,
1538
0
                                       (src_rows),
1539
0
                                       src_stride >> use_highbd,
1540
0
                                       (bdry_rows),
1541
0
                                       boundaries->stripe_boundary_stride,
1542
0
                                       lines_to_save,
1543
0
                                       sx,
1544
0
                                       cm->bit_depth,
1545
0
                                       use_highbd);
1546
0
    } else {
1547
0
        upscaled_width = src_width;
1548
0
        line_bytes     = upscaled_width << use_highbd;
1549
0
        for (int32_t i = 0; i < lines_to_save; i++) {
1550
0
            svt_memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride, line_bytes);
1551
0
        }
1552
0
    }
1553
    // If we only saved one line, then copy it into the second line buffer
1554
0
    if (lines_to_save == 1) {
1555
0
        svt_memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
1556
0
    }
1557
1558
0
    extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride, RESTORATION_EXTRA_HORZ, use_highbd);
1559
0
}
1560
1561
void svt_aom_save_cdef_boundary_lines(uint8_t* src_buf, int32_t src_stride, int32_t src_width, const Av1Common* cm,
1562
                                      int32_t plane, int32_t row, int32_t stripe, int32_t use_highbd, int32_t is_above,
1563
0
                                      RestorationStripeBoundaries* boundaries) {
1564
0
    const int32_t is_uv     = plane > 0;
1565
0
    src_stride              = src_stride << use_highbd;
1566
0
    const uint8_t* src_rows = src_buf + row * src_stride;
1567
1568
0
    uint8_t*      bdry_buf    = is_above ? boundaries->stripe_boundary_above : boundaries->stripe_boundary_below;
1569
0
    uint8_t*      bdry_start  = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
1570
0
    const int32_t bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
1571
0
    uint8_t*      bdry_rows   = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
1572
1573
    // At the point where this function is called, we've already applied
1574
    // superres. So we don't need to extend the lines here, we can just
1575
    // pull directly from the topmost row of the upscaled frame.
1576
0
    const int32_t ss_x           = is_uv && cm->subsampling_x;
1577
0
    const int32_t upscaled_width = av1_superres_unscaled(&cm->frm_size)
1578
0
        ? src_width
1579
0
        : (cm->frm_size.superres_upscaled_width + ss_x) >> ss_x;
1580
0
    const int32_t line_bytes     = upscaled_width << use_highbd;
1581
0
    for (int32_t i = 0; i < RESTORATION_CTX_VERT; i++) {
1582
        // Copy the line at 'row' into both context lines. This is because
1583
        // we want to (effectively) extend the outermost row of CDEF data
1584
        // from this tile to produce a border, rather than using deblocked
1585
        // pixels from the tile above/below.
1586
0
        svt_memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
1587
0
    }
1588
0
    extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride, RESTORATION_EXTRA_HORZ, use_highbd);
1589
0
}
1590
1591
void svt_aom_save_tile_row_boundary_lines(uint8_t* src, int32_t src_stride, int32_t src_width, int32_t src_height,
1592
                                          int32_t use_highbd, int32_t plane, Av1Common* cm, int32_t after_cdef,
1593
0
                                          RestorationStripeBoundaries* boundaries) {
1594
0
    const int32_t is_uv         = plane > 0;
1595
0
    const int32_t ss_y          = is_uv && cm->subsampling_y;
1596
0
    const int32_t stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1597
0
    const int32_t stripe_off    = RESTORATION_UNIT_OFFSET >> ss_y;
1598
1599
    // Get the tile rectangle, with height rounded up to the next multiple of 8
1600
    // luma pixels (only relevant for the bottom tile of the frame)
1601
0
    const Av1PixelRect tile_rect = whole_frame_rect(&cm->frm_size, cm->subsampling_x, cm->subsampling_y, is_uv);
1602
0
    const int32_t      stripe0   = 0;
1603
1604
0
    int32_t plane_height = ROUND_POWER_OF_TWO(cm->frm_size.frame_height, ss_y);
1605
1606
0
    int32_t tile_stripe;
1607
0
    for (tile_stripe = 0;; ++tile_stripe) {
1608
0
        const int32_t rel_y0 = AOMMAX(0, tile_stripe * stripe_height - stripe_off);
1609
0
        const int32_t y0     = tile_rect.top + rel_y0;
1610
0
        if (y0 >= tile_rect.bottom) {
1611
0
            break;
1612
0
        }
1613
1614
0
        const int32_t rel_y1 = (tile_stripe + 1) * stripe_height - stripe_off;
1615
0
        const int32_t y1     = AOMMIN(tile_rect.top + rel_y1, tile_rect.bottom);
1616
1617
0
        const int32_t frame_stripe = stripe0 + tile_stripe;
1618
1619
0
        int32_t use_deblock_above, use_deblock_below;
1620
        // In this case, we should only use CDEF pixels at the top
1621
        // and bottom of the frame as a whole; internal tile boundaries
1622
        // can use deblocked pixels from adjacent tiles for context.
1623
0
        use_deblock_above = (frame_stripe > 0);
1624
0
        use_deblock_below = (y1 < plane_height);
1625
1626
0
        if (!after_cdef) {
1627
            // Save deblocked context where needed.
1628
0
            if (use_deblock_above) {
1629
0
                svt_aom_save_deblock_boundary_lines(src,
1630
0
                                                    src_stride,
1631
0
                                                    src_width,
1632
0
                                                    src_height,
1633
0
                                                    cm,
1634
0
                                                    plane,
1635
0
                                                    y0 - RESTORATION_CTX_VERT,
1636
0
                                                    frame_stripe,
1637
0
                                                    use_highbd,
1638
0
                                                    1,
1639
0
                                                    boundaries);
1640
0
            }
1641
0
            if (use_deblock_below) {
1642
0
                svt_aom_save_deblock_boundary_lines(
1643
0
                    src, src_stride, src_width, src_height, cm, plane, y1, frame_stripe, use_highbd, 0, boundaries);
1644
0
            }
1645
0
        } else {
1646
            // Save CDEF context where needed. Note that we need to save the CDEF
1647
            // context for a particular boundary iff we *didn't* save deblocked
1648
            // context for that boundary.
1649
            //
1650
            // In addition, we need to save copies of the outermost line within
1651
            // the tile, rather than using data from outside the tile.
1652
0
            if (!use_deblock_above) {
1653
0
                svt_aom_save_cdef_boundary_lines(
1654
0
                    src, src_stride, src_width, cm, plane, y0, frame_stripe, use_highbd, 1, boundaries);
1655
0
            }
1656
0
            if (!use_deblock_below) {
1657
0
                svt_aom_save_cdef_boundary_lines(
1658
0
                    src, src_stride, src_width, cm, plane, y1 - 1, frame_stripe, use_highbd, 0, boundaries);
1659
0
            }
1660
0
        }
1661
0
    }
1662
0
}
1663
1664
// For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
1665
// lines to be used as boundary in the loop restoration process. The
1666
// lines are saved in rst_internal.stripe_boundary_lines
1667
0
void svt_av1_loop_restoration_save_boundary_lines(const Yv12BufferConfig* frame, Av1Common* cm, int32_t after_cdef) {
1668
0
    const int32_t num_planes = 3; // av1_num_planes(cm);
1669
0
    const int32_t use_highbd = cm->use_highbitdepth;
1670
1671
0
    for (int32_t p = 0; p < num_planes; ++p) {
1672
0
        const int32_t                is_uv       = p > 0;
1673
0
        int32_t                      crop_width  = frame->crop_widths[is_uv];
1674
0
        int32_t                      crop_height = frame->crop_heights[is_uv];
1675
0
        uint8_t*                     src_buf     = REAL_PTR(use_highbd, frame->buffers[p]);
1676
0
        int32_t                      src_stride  = frame->strides[is_uv];
1677
0
        RestorationStripeBoundaries* boundaries  = &cm->child_pcs->rst_info[p].boundaries;
1678
1679
0
        svt_aom_save_tile_row_boundary_lines(
1680
0
            src_buf, src_stride, crop_width, crop_height, use_highbd, p, cm, after_cdef, boundaries);
1681
0
    }
1682
0
}
1683
1684
// Assumes cm->rst_info[p].restoration_unit_size is already initialized
1685
0
EbErrorType svt_av1_alloc_restoration_buffers(PictureControlSet* pcs, Av1Common* cm) {
1686
0
    EbErrorType   return_error = EB_ErrorNone;
1687
0
    const int32_t num_planes   = 3; // av1_num_planes(cm);
1688
0
    for (int32_t p = 0; p < num_planes; ++p) {
1689
0
        return_error |= svt_av1_alloc_restoration_struct(cm, &pcs->rst_info[p], p > 0);
1690
0
    }
1691
1692
    // For striped loop restoration, we divide each row of tiles into "stripes",
1693
    // of height 64 luma pixels but with an offset by RESTORATION_UNIT_OFFSET
1694
    // luma pixels to match the output from CDEF. We will need to store 2 *
1695
    // RESTORATION_CTX_VERT lines of data for each stripe, and also need to be
1696
    // able to quickly answer the question "Where is the <n>'th stripe for tile
1697
    // row <m>?" To make that efficient, we generate the rst_last_stripe array.
1698
0
    int32_t num_stripes = 0;
1699
0
    for (int32_t i = 0; i < 1 /*cm->tile_rows*/; ++i) {
1700
        //TileInfo tile_info;
1701
        //svt_av1_tile_set_row(&tile_info, cm, i);
1702
1703
0
        const int32_t mi_h         = cm->mi_rows; // tile_info.mi_row_end - tile_info.mi_row_start;
1704
0
        const int32_t ext_h        = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2);
1705
0
        const int32_t tile_stripes = (ext_h + 63) / 64;
1706
0
        num_stripes += tile_stripes;
1707
0
        pcs->rst_end_stripe[i] = num_stripes;
1708
0
    }
1709
1710
    // Now we need to allocate enough space to store the line buffers for the
1711
    // stripes
1712
0
    const int32_t frame_w = cm->frm_size.superres_upscaled_width;
1713
1714
0
    for (int32_t p = 0; p < num_planes; ++p) {
1715
0
        const int32_t                is_uv      = p > 0;
1716
0
        const int32_t                ss_x       = is_uv && cm->subsampling_x;
1717
0
        const int32_t                plane_w    = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ;
1718
0
        const int32_t                stride     = ALIGN_POWER_OF_TWO(plane_w, 5);
1719
0
        const int32_t                buf_size   = num_stripes * stride * RESTORATION_CTX_VERT << 1;
1720
0
        RestorationStripeBoundaries* boundaries = &pcs->rst_info[p].boundaries;
1721
1722
0
        {
1723
0
            EB_MALLOC(boundaries->stripe_boundary_above, buf_size);
1724
0
            EB_MALLOC(boundaries->stripe_boundary_below, buf_size);
1725
1726
0
            boundaries->stripe_boundary_size = buf_size;
1727
0
        }
1728
0
        boundaries->stripe_boundary_stride = stride;
1729
0
    }
1730
1731
0
    return return_error;
1732
0
}