Coverage Report

Created: 2025-11-16 07:22

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/av1/common/restoration.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 *
11
 */
12
13
#include <math.h>
14
15
#include "config/aom_config.h"
16
#include "config/aom_dsp_rtcd.h"
17
#include "config/aom_scale_rtcd.h"
18
19
#include "aom_mem/aom_mem.h"
20
#include "av1/common/av1_common_int.h"
21
#include "av1/common/resize.h"
22
#include "av1/common/restoration.h"
23
#include "aom_dsp/aom_dsp_common.h"
24
#include "aom_mem/aom_mem.h"
25
26
#include "aom_ports/mem.h"
27
28
// The 's' values are calculated based on original 'r' and 'e' values in the
29
// spec using GenSgrprojVtable().
30
// Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
31
// One entry per self-guided parameter set. Going by the note above this
// table, the first pair of each entry holds the radii r[0], r[1] and the
// second pair the matching 's' values; a radius of 0 is paired with s = -1.
const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
  // Sets 0-9: both radii non-zero.
  { { 2, 1 }, { 140, 3236 } },  // 0
  { { 2, 1 }, { 112, 2158 } },  // 1
  { { 2, 1 }, { 93, 1618 } },   // 2
  { { 2, 1 }, { 80, 1438 } },   // 3
  { { 2, 1 }, { 70, 1295 } },   // 4
  { { 2, 1 }, { 58, 1177 } },   // 5
  { { 2, 1 }, { 47, 1079 } },   // 6
  { { 2, 1 }, { 37, 996 } },    // 7
  { { 2, 1 }, { 30, 925 } },    // 8
  { { 2, 1 }, { 25, 863 } },    // 9
  // Sets 10-13: r[0] == 0.
  { { 0, 1 }, { -1, 2589 } },  // 10
  { { 0, 1 }, { -1, 1618 } },  // 11
  { { 0, 1 }, { -1, 1177 } },  // 12
  { { 0, 1 }, { -1, 925 } },   // 13
  // Sets 14-15: r[1] == 0.
  { { 2, 0 }, { 56, -1 } },  // 14
  { { 2, 0 }, { 22, -1 } },  // 15
};
41
42
534k
AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
43
534k
  AV1PixelRect rect;
44
45
534k
  int ss_x = is_uv && cm->seq_params->subsampling_x;
46
534k
  int ss_y = is_uv && cm->seq_params->subsampling_y;
47
48
534k
  rect.top = 0;
49
534k
  rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y);
50
534k
  rect.left = 0;
51
534k
  rect.right = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
52
534k
  return rect;
53
534k
}
54
55
// Count horizontal or vertical units per tile (use a width or height for
56
// tile_size, respectively). We basically want to divide the tile size by the
57
// size of a restoration unit. Rather than rounding up unconditionally as you
58
// might expect, we round to nearest, which models the way a right or bottom
59
// restoration unit can extend to up to 150% its normal width or height. The
60
// max with 1 is to deal with tiles that are smaller than half of a restoration
61
// unit.
62
973k
// Returns tile_size / unit_size rounded to the nearest integer, but never
// less than 1 (a tile smaller than half a unit still gets one unit).
int av1_lr_count_units_in_tile(int unit_size, int tile_size) {
  const int half_unit = unit_size >> 1;
  const int rounded_units = (tile_size + half_unit) / unit_size;
  if (rounded_units < 1) return 1;
  return rounded_units;
}
65
66
// (Re)allocates rsi->unit_info so that it can hold one RestorationUnitInfo
// per restoration unit, and records the unit counts in rsi.
//
// The whole frame rectangle (see av1_whole_frame_rect()) is used as the
// single "tile" to be covered. Any previous rsi->unit_info buffer is freed
// first; allocation failure is reported through CHECK_MEM_ERROR on cm.
void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
                                  int is_uv) {
  const AV1PixelRect frame_rect = av1_whole_frame_rect(cm, is_uv);
  const int frame_w = frame_rect.right - frame_rect.left;
  const int frame_h = frame_rect.bottom - frame_rect.top;
  const int unit_size = rsi->restoration_unit_size;

  // Units per row/column use round-to-nearest with a minimum of 1; see
  // av1_lr_count_units_in_tile().
  rsi->horz_units_per_tile = av1_lr_count_units_in_tile(unit_size, frame_w);
  rsi->vert_units_per_tile = av1_lr_count_units_in_tile(unit_size, frame_h);
  rsi->units_per_tile = rsi->horz_units_per_tile * rsi->vert_units_per_tile;

  // There is exactly one such "tile", so the total equals units_per_tile.
  const int num_units = rsi->units_per_tile;

  aom_free(rsi->unit_info);
  CHECK_MEM_ERROR(cm, rsi->unit_info,
                  (RestorationUnitInfo *)aom_memalign(
                      16, sizeof(*rsi->unit_info) * num_units));
}
100
101
103k
// Releases the unit_info array owned by rst_info and clears the pointer so
// that a later free or re-allocation starts from NULL.
void av1_free_restoration_struct(RestorationInfo *rst_info) {
  RestorationUnitInfo *const units = rst_info->unit_info;
  rst_info->unit_info = NULL;
  aom_free(units);
}
105
106
#if 0
// Disabled reference code: regenerates a multiplier table from the radii and
// 'e' values of each parameter set.
//
// Pair of values for each sgrproj parameter:
// Index 0 corresponds to r[0], e[0]
// Index 1 corresponds to r[1], e[1]
int sgrproj_mtable[SGRPROJ_PARAMS][2];

static void GenSgrprojVtable(void) {
  for (int i = 0; i < SGRPROJ_PARAMS; ++i) {
    const sgr_params_type *const params = &av1_sgr_params[i];
    for (int j = 0; j < 2; ++j) {
      const int e = params->e[j];
      const int r = params->r[j];
      if (r == 0) {                 // filter is disabled
        sgrproj_mtable[i][j] = -1;  // mark invalid
      } else {                      // filter is enabled
        // n is the number of pixels in the (2r + 1) x (2r + 1) window.
        const int n = (2 * r + 1) * (2 * r + 1);
        const int n2e = n * n * e;
        assert(n2e != 0);
        // Rounded division: (1 << SGRPROJ_MTABLE_BITS) / n2e.
        sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
      }
    }
  }
}
#endif

// Precalculation hook for loop restoration. With the table generator above
// compiled out, this function currently does nothing.
//
// Declared with an explicit (void) parameter list so the definition is a
// proper prototype rather than an old-style declarator.
void av1_loop_restoration_precal(void) {
#if 0
  GenSgrprojVtable();
#endif
}
136
137
// Pads an 8-bit image in place by edge replication.
//
// data points at the top-left pixel of a width x height image with the given
// row stride. Each row is extended by border_horz pixels on both sides using
// its first/last pixel. The (already widened) first and last rows are then
// replicated border_vert times above and below the image.
static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride,
                               int border_horz, int border_vert) {
  // Left/right padding, one row at a time.
  for (int y = 0; y < height; ++y) {
    uint8_t *const row = data + y * stride;
    memset(row - border_horz, row[0], border_horz);
    memset(row + width, row[width - 1], border_horz);
  }

  // From here on, rows are addressed from the start of the left border.
  uint8_t *const first_row = data - border_horz;
  const int ext_width = width + 2 * border_horz;

  // Top padding: copies of the first row.
  for (int k = 1; k <= border_vert; ++k) {
    memcpy(first_row - k * stride, first_row, ext_width);
  }

  // Bottom padding: copies of the last row.
  for (int k = 0; k < border_vert; ++k) {
    memcpy(first_row + (height + k) * stride,
           first_row + (height - 1) * stride, ext_width);
  }
}
155
156
#if CONFIG_AV1_HIGHBITDEPTH
157
// 16-bit counterpart of the low-bit-depth frame extension: pads a
// width x height image of uint16_t pixels in place by edge replication.
//
// Rows are first widened by border_horz pixels on each side using their
// first/last pixel. The widened first and last rows are then copied
// border_vert times above and below the image. stride is in pixels.
static void extend_frame_highbd(uint16_t *data, int width, int height,
                                int stride, int border_horz, int border_vert) {
  // Left/right padding, one row at a time.
  for (int y = 0; y < height; ++y) {
    uint16_t *const row = data + y * stride;
    for (int x = -border_horz; x < 0; ++x) row[x] = row[0];
    for (int x = 0; x < border_horz; ++x) row[width + x] = row[width - 1];
  }

  // From here on, rows are addressed from the start of the left border.
  uint16_t *const first_row = data - border_horz;
  const size_t ext_bytes = (width + 2 * border_horz) * sizeof(uint16_t);

  // Top padding: copies of the first row.
  for (int k = 1; k <= border_vert; ++k) {
    memcpy(first_row - k * stride, first_row, ext_bytes);
  }

  // Bottom padding: copies of the last row.
  for (int k = 0; k < border_vert; ++k) {
    memcpy(first_row + (height + k) * stride,
           first_row + (height - 1) * stride, ext_bytes);
  }
}
176
177
// Copies a width x height rectangle of 16-bit pixels from src to dst, row by
// row. Strides are in pixels; the regions must not overlap (memcpy is used).
static void copy_tile_highbd(int width, int height, const uint16_t *src,
                             int src_stride, uint16_t *dst, int dst_stride) {
  const size_t row_bytes = width * sizeof(*dst);
  for (int row = 0; row < height; ++row) {
    const uint16_t *const from = src + row * src_stride;
    uint16_t *const to = dst + row * dst_stride;
    memcpy(to, from, row_bytes);
  }
}
182
#endif
183
184
// Pads a frame plane in place by edge replication, dispatching on bit depth.
//
// When high bit depth support is compiled in and highbd is nonzero, data is
// converted with CONVERT_TO_SHORTPTR() and handled as 16-bit pixels;
// otherwise it is handled as 8-bit pixels.
void av1_extend_frame(uint8_t *data, int width, int height, int stride,
                      int border_horz, int border_vert, int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (highbd) {
    uint16_t *const data16 = CONVERT_TO_SHORTPTR(data);
    extend_frame_highbd(data16, width, height, stride, border_horz,
                        border_vert);
    return;
  }
#endif
  // Silences the unused-parameter warning in low-bit-depth-only builds.
  (void)highbd;
  extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
}
196
197
// Copies a width x height rectangle of 8-bit pixels from src to dst, row by
// row. The regions must not overlap (memcpy is used).
static void copy_tile_lowbd(int width, int height, const uint8_t *src,
                            int src_stride, uint8_t *dst, int dst_stride) {
  for (int row = 0; row < height; ++row) {
    const uint8_t *const from = src + row * src_stride;
    uint8_t *const to = dst + row * dst_stride;
    memcpy(to, from, width);
  }
}
202
203
// Copies a width x height pixel rectangle from src to dst, dispatching on
// bit depth. When high bit depth support is compiled in and highbd is
// nonzero, both pointers are converted with CONVERT_TO_SHORTPTR() and the
// 16-bit copy is used; otherwise the 8-bit copy is used.
static void copy_tile(int width, int height, const uint8_t *src, int src_stride,
                      uint8_t *dst, int dst_stride, int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (highbd) {
    const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src);
    uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
    copy_tile_highbd(width, height, src16, src_stride, dst16, dst_stride);
    return;
  }
#endif
  // Silences the unused-parameter warning in low-bit-depth-only builds.
  (void)highbd;
  copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
}
215
216
904k
// Yields the uint8_t * that is passed to memcpy() for pixel pointer d: when
// hbd is nonzero, d is run through CONVERT_TO_SHORTPTR() and cast back to
// uint8_t *; otherwise d is used unchanged.
#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
217
218
// With striped loop restoration, the filtering for each 64-pixel stripe gets
219
// most of its input from the output of CDEF (stored in data8), but we need to
220
// fill out a border of 3 pixels above/below the stripe according to the
221
// following
222
// rules:
223
//
224
// * At a frame boundary, we copy the outermost row of CDEF pixels three times.
225
//   This extension is done by a call to av1_extend_frame() at the start of the
226
//   loop restoration process, so the value of copy_above/copy_below doesn't
227
//   strictly matter. However, by setting *copy_above = *copy_below = 1 whenever
228
//   loop filtering across tiles is disabled, we can allow
229
//   {setup,restore}_processing_stripe_boundary to assume that the top/bottom
230
//   data has always been copied, simplifying the behaviour at the left and
231
//   right edges of tiles.
232
//
233
// * If we're at a tile boundary and loop filtering across tiles is enabled,
234
//   then there is a logical stripe which is 64 pixels high, but which is split
235
//   into an 8px high and a 56px high stripe so that the processing (and
236
//   coefficient set usage) can be aligned to tiles.
237
//   In this case, we use the 3 rows of CDEF output across the boundary for
238
//   context; this corresponds to leaving the frame buffer as-is.
239
//
240
// * If we're at a tile boundary and loop filtering across tiles is disabled,
241
//   then we take the outermost row of CDEF pixels *within the current tile*
242
//   and copy it three times. Thus we behave exactly as if the tile were a full
243
//   frame.
244
//
245
// * Otherwise, we're at a stripe boundary within a tile. In that case, we
246
//   take 2 rows of deblocked pixels and extend them to 3 rows of context.
247
//
248
// The distinction between the latter two cases is handled by the
249
// av1_loop_restoration_save_boundary_lines() function, so here we just need
250
// to decide if we're overwriting the above/below boundary pixels or not.
251
static void get_stripe_boundary_info(const RestorationTileLimits *limits,
252
                                     const AV1PixelRect *tile_rect, int ss_y,
253
54.4k
                                     int *copy_above, int *copy_below) {
254
54.4k
  *copy_above = 1;
255
54.4k
  *copy_below = 1;
256
257
54.4k
  const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
258
54.4k
  const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
259
260
54.4k
  const int first_stripe_in_tile = (limits->v_start == tile_rect->top);
261
54.4k
  const int this_stripe_height =
262
54.4k
      full_stripe_height - (first_stripe_in_tile ? runit_offset : 0);
263
54.4k
  const int last_stripe_in_tile =
264
54.4k
      (limits->v_start + this_stripe_height >= tile_rect->bottom);
265
266
54.4k
  if (first_stripe_in_tile) *copy_above = 0;
267
54.4k
  if (last_stripe_in_tile) *copy_below = 0;
268
54.4k
}
269
270
// Overwrite the border pixels around a processing stripe so that the conditions
271
// listed above get_stripe_boundary_info() are preserved.
272
// We save the pixels which get overwritten into a temporary buffer, so that
273
// they can be restored by restore_processing_stripe_boundary() after we've
274
// processed the stripe.
275
//
276
// limits gives the rectangular limits of the remaining stripes for the current
277
// restoration unit. rsb is the stored stripe boundaries (taken from either
278
// deblock or CDEF output as necessary).
279
//
280
// tile_rect is the limits of the current tile and tile_stripe0 is the index of
281
// the first stripe in this tile (needed to convert the tile-relative stripe
282
// index we get from limits into something we can look up in rsb).
283
static void setup_processing_stripe_boundary(
284
    const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
285
    int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
286
54.4k
    RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
287
  // Offsets within the line buffers. The buffer logically starts at column
288
  // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
289
  // has column x0 in the buffer.
290
54.4k
  const int buf_stride = rsb->stripe_boundary_stride;
291
54.4k
  const int buf_x0_off = limits->h_start;
292
54.4k
  const int line_width =
293
54.4k
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
294
54.4k
  const int line_size = line_width << use_highbd;
295
296
54.4k
  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
297
298
  // Replace RESTORATION_BORDER pixels above the top of the stripe
299
  // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
300
  // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
301
  // duplicating the topmost of the 2 lines (see the AOMMAX call when
302
  // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
303
  //
304
  // Special case: If we're at the top of a tile, which isn't on the topmost
305
  // tile row, and we're allowed to loop filter across tiles, then we have a
306
  // logical 64-pixel-high stripe which has been split into an 8-pixel high
307
  // stripe and a 56-pixel high stripe (the current one). So, in this case,
308
  // we want to leave the boundary alone!
309
54.4k
  if (!opt) {
310
40.1k
    if (copy_above) {
311
34.5k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
312
313
138k
      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
314
103k
        const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
315
103k
        const int buf_off = buf_x0_off + buf_row * buf_stride;
316
103k
        const uint8_t *buf =
317
103k
            rsb->stripe_boundary_above + (buf_off << use_highbd);
318
103k
        uint8_t *dst8 = data8_tl + i * data_stride;
319
        // Save old pixels, then replace with data from stripe_boundary_above
320
103k
        memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
321
103k
               REAL_PTR(use_highbd, dst8), line_size);
322
103k
        memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
323
103k
      }
324
34.5k
    }
325
326
    // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
327
    // The second buffer row is repeated, so src_row gets the values 0, 1, 1
328
    // for i = 0, 1, 2.
329
40.1k
    if (copy_below) {
330
31.3k
      const int stripe_end = limits->v_start + h;
331
31.3k
      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
332
333
125k
      for (int i = 0; i < RESTORATION_BORDER; ++i) {
334
94.1k
        const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
335
94.1k
        const int buf_off = buf_x0_off + buf_row * buf_stride;
336
94.1k
        const uint8_t *src =
337
94.1k
            rsb->stripe_boundary_below + (buf_off << use_highbd);
338
339
94.1k
        uint8_t *dst8 = data8_bl + i * data_stride;
340
        // Save old pixels, then replace with data from stripe_boundary_below
341
94.1k
        memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
342
94.1k
        memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
343
94.1k
      }
344
31.3k
    }
345
40.1k
  } else {
346
14.2k
    if (copy_above) {
347
13.1k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
348
349
      // Only save and overwrite i=-RESTORATION_BORDER line.
350
13.1k
      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
351
      // Save old pixels, then replace with data from stripe_boundary_above
352
13.1k
      memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
353
13.1k
      memcpy(REAL_PTR(use_highbd, dst8),
354
13.1k
             REAL_PTR(use_highbd,
355
13.1k
                      data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
356
13.1k
             line_size);
357
13.1k
    }
358
359
14.2k
    if (copy_below) {
360
13.0k
      const int stripe_end = limits->v_start + h;
361
13.0k
      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
362
363
      // Only save and overwrite i=2 line.
364
13.0k
      uint8_t *dst8 = data8_bl + 2 * data_stride;
365
      // Save old pixels, then replace with data from stripe_boundary_below
366
13.0k
      memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
367
13.0k
      memcpy(REAL_PTR(use_highbd, dst8),
368
13.0k
             REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
369
13.0k
    }
370
14.2k
  }
371
54.4k
}
372
373
// This function restores the boundary lines modified by
374
// setup_processing_stripe_boundary.
375
//
376
// Note: We need to be careful when handling the corners of the processing
377
// unit, because (eg.) the top-left corner is considered to be part of
378
// both the left and top borders. This means that, depending on the
379
// loop_filter_across_tiles_enabled flag, the corner pixels might get
380
// overwritten twice, once as part of the "top" border and once as part
381
// of the "left" border (or similar for other corners).
382
//
383
// Everything works out fine as long as we make sure to reverse the order
384
// when restoring, ie. we need to restore the left/right borders followed
385
// by the top/bottom borders.
386
static void restore_processing_stripe_boundary(
387
    const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
388
    int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
389
54.4k
    int copy_below, int opt) {
390
54.4k
  const int line_width =
391
54.4k
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
392
54.4k
  const int line_size = line_width << use_highbd;
393
394
54.4k
  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
395
396
54.4k
  if (!opt) {
397
40.1k
    if (copy_above) {
398
34.5k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
399
138k
      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
400
103k
        uint8_t *dst8 = data8_tl + i * data_stride;
401
103k
        memcpy(REAL_PTR(use_highbd, dst8),
402
103k
               rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
403
103k
      }
404
34.5k
    }
405
406
40.1k
    if (copy_below) {
407
31.3k
      const int stripe_bottom = limits->v_start + h;
408
31.3k
      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
409
410
125k
      for (int i = 0; i < RESTORATION_BORDER; ++i) {
411
94.1k
        if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
412
413
94.1k
        uint8_t *dst8 = data8_bl + i * data_stride;
414
94.1k
        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
415
94.1k
      }
416
31.3k
    }
417
40.1k
  } else {
418
14.2k
    if (copy_above) {
419
13.1k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
420
421
      // Only restore i=-RESTORATION_BORDER line.
422
13.1k
      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
423
13.1k
      memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
424
13.1k
    }
425
426
14.2k
    if (copy_below) {
427
13.0k
      const int stripe_bottom = limits->v_start + h;
428
13.0k
      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
429
430
      // Only restore i=2 line.
431
13.0k
      if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
432
13.0k
        uint8_t *dst8 = data8_bl + 2 * data_stride;
433
13.0k
        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
434
13.0k
      }
435
13.0k
    }
436
14.2k
  }
437
54.4k
}
438
439
static void wiener_filter_stripe(const RestorationUnitInfo *rui,
440
                                 int stripe_width, int stripe_height,
441
                                 int procunit_width, const uint8_t *src,
442
                                 int src_stride, uint8_t *dst, int dst_stride,
443
11.2k
                                 int32_t *tmpbuf, int bit_depth) {
444
11.2k
  (void)tmpbuf;
445
11.2k
  (void)bit_depth;
446
11.2k
  assert(bit_depth == 8);
447
11.2k
  const ConvolveParams conv_params = get_conv_params_wiener(8);
448
449
30.1k
  for (int j = 0; j < stripe_width; j += procunit_width) {
450
18.9k
    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
451
18.9k
    const uint8_t *src_p = src + j;
452
18.9k
    uint8_t *dst_p = dst + j;
453
18.9k
    av1_wiener_convolve_add_src(
454
18.9k
        src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
455
18.9k
        rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
456
18.9k
  }
457
11.2k
}
458
459
/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
   over the input. The window is of size (2r + 1)x(2r + 1), and we
   specialize to r = 1 (boxsum1) and r = 2 (boxsum2). Any other radius
   is rejected by the assert in boxsum().

   Each loop follows the same format: We keep a window's worth of input
   in individual variables and select data out of that as appropriate.
*/
466
static void boxsum1(int32_t *src, int width, int height, int src_stride,
467
55.6k
                    int sqr, int32_t *dst, int dst_stride) {
468
55.6k
  int i, j, a, b, c;
469
55.6k
  assert(width > 2 * SGRPROJ_BORDER_HORZ);
470
55.6k
  assert(height > 2 * SGRPROJ_BORDER_VERT);
471
472
  // Vertical sum over 3-pixel regions, from src into dst.
473
55.6k
  if (!sqr) {
474
1.28M
    for (j = 0; j < width; ++j) {
475
1.26M
      a = src[j];
476
1.26M
      b = src[src_stride + j];
477
1.26M
      c = src[2 * src_stride + j];
478
479
1.26M
      dst[j] = a + b;
480
67.0M
      for (i = 1; i < height - 2; ++i) {
481
        // Loop invariant: At the start of each iteration,
482
        // a = src[(i - 1) * src_stride + j]
483
        // b = src[(i    ) * src_stride + j]
484
        // c = src[(i + 1) * src_stride + j]
485
65.8M
        dst[i * dst_stride + j] = a + b + c;
486
65.8M
        a = b;
487
65.8M
        b = c;
488
65.8M
        c = src[(i + 2) * src_stride + j];
489
65.8M
      }
490
1.26M
      dst[i * dst_stride + j] = a + b + c;
491
1.26M
      dst[(i + 1) * dst_stride + j] = b + c;
492
1.26M
    }
493
27.8k
  } else {
494
1.28M
    for (j = 0; j < width; ++j) {
495
1.26M
      a = src[j] * src[j];
496
1.26M
      b = src[src_stride + j] * src[src_stride + j];
497
1.26M
      c = src[2 * src_stride + j] * src[2 * src_stride + j];
498
499
1.26M
      dst[j] = a + b;
500
67.0M
      for (i = 1; i < height - 2; ++i) {
501
65.8M
        dst[i * dst_stride + j] = a + b + c;
502
65.8M
        a = b;
503
65.8M
        b = c;
504
65.8M
        c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
505
65.8M
      }
506
1.26M
      dst[i * dst_stride + j] = a + b + c;
507
1.26M
      dst[(i + 1) * dst_stride + j] = b + c;
508
1.26M
    }
509
27.8k
  }
510
511
  // Horizontal sum over 3-pixel regions of dst
512
3.15M
  for (i = 0; i < height; ++i) {
513
3.10M
    a = dst[i * dst_stride];
514
3.10M
    b = dst[i * dst_stride + 1];
515
3.10M
    c = dst[i * dst_stride + 2];
516
517
3.10M
    dst[i * dst_stride] = a + b;
518
133M
    for (j = 1; j < width - 2; ++j) {
519
      // Loop invariant: At the start of each iteration,
520
      // a = src[i * src_stride + (j - 1)]
521
      // b = src[i * src_stride + (j    )]
522
      // c = src[i * src_stride + (j + 1)]
523
129M
      dst[i * dst_stride + j] = a + b + c;
524
129M
      a = b;
525
129M
      b = c;
526
129M
      c = dst[i * dst_stride + (j + 2)];
527
129M
    }
528
3.10M
    dst[i * dst_stride + j] = a + b + c;
529
3.10M
    dst[i * dst_stride + (j + 1)] = b + c;
530
3.10M
  }
531
55.6k
}
532
533
static void boxsum2(int32_t *src, int width, int height, int src_stride,
534
48.6k
                    int sqr, int32_t *dst, int dst_stride) {
535
48.6k
  int i, j, a, b, c, d, e;
536
48.6k
  assert(width > 2 * SGRPROJ_BORDER_HORZ);
537
48.6k
  assert(height > 2 * SGRPROJ_BORDER_VERT);
538
539
  // Vertical sum over 5-pixel regions, from src into dst.
540
48.6k
  if (!sqr) {
541
1.10M
    for (j = 0; j < width; ++j) {
542
1.07M
      a = src[j];
543
1.07M
      b = src[src_stride + j];
544
1.07M
      c = src[2 * src_stride + j];
545
1.07M
      d = src[3 * src_stride + j];
546
1.07M
      e = src[4 * src_stride + j];
547
548
1.07M
      dst[j] = a + b + c;
549
1.07M
      dst[dst_stride + j] = a + b + c + d;
550
54.6M
      for (i = 2; i < height - 3; ++i) {
551
        // Loop invariant: At the start of each iteration,
552
        // a = src[(i - 2) * src_stride + j]
553
        // b = src[(i - 1) * src_stride + j]
554
        // c = src[(i    ) * src_stride + j]
555
        // d = src[(i + 1) * src_stride + j]
556
        // e = src[(i + 2) * src_stride + j]
557
53.6M
        dst[i * dst_stride + j] = a + b + c + d + e;
558
53.6M
        a = b;
559
53.6M
        b = c;
560
53.6M
        c = d;
561
53.6M
        d = e;
562
53.6M
        e = src[(i + 3) * src_stride + j];
563
53.6M
      }
564
1.07M
      dst[i * dst_stride + j] = a + b + c + d + e;
565
1.07M
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
566
1.07M
      dst[(i + 2) * dst_stride + j] = c + d + e;
567
1.07M
    }
568
24.3k
  } else {
569
1.10M
    for (j = 0; j < width; ++j) {
570
1.07M
      a = src[j] * src[j];
571
1.07M
      b = src[src_stride + j] * src[src_stride + j];
572
1.07M
      c = src[2 * src_stride + j] * src[2 * src_stride + j];
573
1.07M
      d = src[3 * src_stride + j] * src[3 * src_stride + j];
574
1.07M
      e = src[4 * src_stride + j] * src[4 * src_stride + j];
575
576
1.07M
      dst[j] = a + b + c;
577
1.07M
      dst[dst_stride + j] = a + b + c + d;
578
54.6M
      for (i = 2; i < height - 3; ++i) {
579
53.6M
        dst[i * dst_stride + j] = a + b + c + d + e;
580
53.6M
        a = b;
581
53.6M
        b = c;
582
53.6M
        c = d;
583
53.6M
        d = e;
584
53.6M
        e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
585
53.6M
      }
586
1.07M
      dst[i * dst_stride + j] = a + b + c + d + e;
587
1.07M
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
588
1.07M
      dst[(i + 2) * dst_stride + j] = c + d + e;
589
1.07M
    }
590
24.3k
  }
591
592
  // Horizontal sum over 5-pixel regions of dst
593
2.75M
  for (i = 0; i < height; ++i) {
594
2.70M
    a = dst[i * dst_stride];
595
2.70M
    b = dst[i * dst_stride + 1];
596
2.70M
    c = dst[i * dst_stride + 2];
597
2.70M
    d = dst[i * dst_stride + 3];
598
2.70M
    e = dst[i * dst_stride + 4];
599
600
2.70M
    dst[i * dst_stride] = a + b + c;
601
2.70M
    dst[i * dst_stride + 1] = a + b + c + d;
602
107M
    for (j = 2; j < width - 3; ++j) {
603
      // Loop invariant: At the start of each iteration,
604
      // a = src[i * src_stride + (j - 2)]
605
      // b = src[i * src_stride + (j - 1)]
606
      // c = src[i * src_stride + (j    )]
607
      // d = src[i * src_stride + (j + 1)]
608
      // e = src[i * src_stride + (j + 2)]
609
104M
      dst[i * dst_stride + j] = a + b + c + d + e;
610
104M
      a = b;
611
104M
      b = c;
612
104M
      c = d;
613
104M
      d = e;
614
104M
      e = dst[i * dst_stride + (j + 3)];
615
104M
    }
616
2.70M
    dst[i * dst_stride + j] = a + b + c + d + e;
617
2.70M
    dst[i * dst_stride + (j + 1)] = b + c + d + e;
618
2.70M
    dst[i * dst_stride + (j + 2)] = c + d + e;
619
2.70M
  }
620
48.6k
}
621
622
// Dispatches to the box-sum routine for radius r. Only r == 1 and r == 2 are
// supported; any other value trips an assert (and does nothing when asserts
// are compiled out).
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                   int sqr, int32_t *dst, int dst_stride) {
  switch (r) {
    case 1:
      boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    case 2:
      boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    default: assert(0 && "Invalid value of r in self-guided filter"); break;
  }
}
631
632
32.9k
void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
633
32.9k
  if (params->r[0] == 0) {
634
8.61k
    xq[0] = 0;
635
8.61k
    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
636
24.3k
  } else if (params->r[1] == 0) {
637
5.08k
    xq[0] = xqd[0];
638
5.08k
    xq[1] = 0;
639
19.2k
  } else {
640
19.2k
    xq[0] = xqd[0];
641
19.2k
    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
642
19.2k
  }
643
32.9k
}
644
645
// Lookup table indexed by x in [0, 255]. For 1 <= x <= 254 the entries match
// 256 * x / (x + 1) rounded to the nearest integer; the final entry is 256.
const int32_t av1_x_by_xplus1[256] = {
  // Entry 0 is deliberately 1 (i.e. a value of 1/256) rather than 0; the
  // reason is given in the comments in selfguided_restoration_internal().
  1,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
  240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
  248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
  250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  256,
};
667
668
// Reciprocal table: entry i holds 4096 / (i + 1) rounded to the nearest
// integer, for i + 1 = 1 .. 25.
const int32_t av1_one_by_x[MAX_NELEM] = {
  4096, 2048, 1365, 1024, 819,  // 1 .. 5
  683,  585,  512,  455,  410,  // 6 .. 10
  372,  341,  315,  293,  273,  // 11 .. 15
  256,  241,  228,  216,  205,  // 16 .. 20
  195,  186,  178,  171,  164,  // 21 .. 25
};
672
673
static void calculate_intermediate_result(int32_t *dgd, int width, int height,
674
                                          int dgd_stride, int bit_depth,
675
                                          int sgr_params_idx, int radius_idx,
676
52.1k
                                          int pass, int32_t *A, int32_t *B) {
677
52.1k
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
678
52.1k
  const int r = params->r[radius_idx];
679
52.1k
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
680
52.1k
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
681
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
682
  // leading to a significant speed improvement.
683
  // We also align the stride to a multiple of 16 bytes, for consistency
684
  // with the SIMD version of this function.
685
52.1k
  int buf_stride = ((width_ext + 3) & ~3) + 16;
686
52.1k
  const int step = pass == 0 ? 1 : 2;
687
52.1k
  int i, j;
688
689
52.1k
  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
690
52.1k
  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
691
52.1k
         "Need SGRPROJ_BORDER_* >= r+1");
692
693
52.1k
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
694
52.1k
         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
695
52.1k
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
696
52.1k
         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
697
52.1k
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
698
52.1k
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
699
  // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
700
  // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
701
2.12M
  for (i = -1; i < height + 1; i += step) {
702
85.7M
    for (j = -1; j < width + 1; ++j) {
703
83.6M
      const int k = i * buf_stride + j;
704
83.6M
      const int n = (2 * r + 1) * (2 * r + 1);
705
706
      // a < 2^16 * n < 2^22 regardless of bit depth
707
83.6M
      uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
708
      // b < 2^8 * n < 2^14 regardless of bit depth
709
83.6M
      uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
710
711
      // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
712
      // and p itself satisfies p < 2^14 * n^2 < 2^26.
713
      // This bound on p is due to:
714
      // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
715
      //
716
      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
717
      // This is an artefact of rounding, and can only happen if all pixels
718
      // are (almost) identical, so in this case we saturate to p=0.
719
83.6M
      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
720
721
83.6M
      const uint32_t s = params->s[radius_idx];
722
723
      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
724
      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
725
      // (this holds even after accounting for the rounding in s)
726
83.6M
      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
727
728
      // Note: We have to be quite careful about the value of A[k].
729
      // This is used as a blend factor between individual pixel values and the
730
      // local mean. So it logically has a range of [0, 256], including both
731
      // endpoints.
732
      //
733
      // This is a pain for hardware, as we'd like something which can be stored
734
      // in exactly 8 bits.
735
      // Further, in the calculation of B[k] below, if z == 0 and r == 2,
736
      // then A[k] "should be" 0. But then we can end up setting B[k] to a value
737
      // slightly above 2^(8 + bit depth), due to rounding in the value of
738
      // av1_one_by_x[25-1].
739
      //
740
      // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
741
      // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
742
      // overflow), without significantly affecting the final result: z == 0
743
      // implies that the image is essentially "flat", so the local mean and
744
      // individual pixel values are very similar.
745
      //
746
      // Note that saturating on the other side, ie. requring A[k] <= 255,
747
      // would be a bad idea, as that corresponds to the case where the image
748
      // is very variable, when we want to preserve the local pixel value as
749
      // much as possible.
750
83.6M
      A[k] = av1_x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
751
752
      // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
753
      // av1_one_by_x[n - 1] = round(2^12 / n)
754
      // => the product here is < 2^(20 + bit_depth) <= 2^32,
755
      // and B[k] is set to a value < 2^(8 + bit depth)
756
      // This holds even with the rounding in av1_one_by_x and in the overall
757
      // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
758
83.6M
      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
759
83.6M
                                             (uint32_t)B[k] *
760
83.6M
                                             (uint32_t)av1_one_by_x[n - 1],
761
83.6M
                                         SGRPROJ_RECIP_BITS);
762
83.6M
    }
763
2.06M
  }
764
52.1k
}
765
766
static void selfguided_restoration_fast_internal(
767
    int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
768
24.3k
    int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
769
24.3k
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
770
24.3k
  const int r = params->r[radius_idx];
771
24.3k
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
772
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
773
  // leading to a significant speed improvement.
774
  // We also align the stride to a multiple of 16 bytes, for consistency
775
  // with the SIMD version of this function.
776
24.3k
  int buf_stride = ((width_ext + 3) & ~3) + 16;
777
24.3k
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
778
24.3k
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
779
24.3k
  int32_t *A = A_;
780
24.3k
  int32_t *B = B_;
781
24.3k
  int i, j;
782
24.3k
  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
783
24.3k
                                sgr_params_idx, radius_idx, 1, A, B);
784
24.3k
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
785
24.3k
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
786
787
  // Use the A[] and B[] arrays to calculate the filtered image
788
24.3k
  (void)r;
789
24.3k
  assert(r == 2);
790
1.23M
  for (i = 0; i < height; ++i) {
791
1.20M
    if (!(i & 1)) {  // even row
792
23.2M
      for (j = 0; j < width; ++j) {
793
22.6M
        const int k = i * buf_stride + j;
794
22.6M
        const int l = i * dgd_stride + j;
795
22.6M
        const int m = i * dst_stride + j;
796
22.6M
        const int nb = 5;
797
22.6M
        const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
798
22.6M
                          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
799
22.6M
                           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
800
22.6M
                              5;
801
22.6M
        const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
802
22.6M
                          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
803
22.6M
                           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
804
22.6M
                              5;
805
22.6M
        const int32_t v = a * dgd[l] + b;
806
22.6M
        dst[m] =
807
22.6M
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
808
22.6M
      }
809
604k
    } else {  // odd row
810
23.2M
      for (j = 0; j < width; ++j) {
811
22.6M
        const int k = i * buf_stride + j;
812
22.6M
        const int l = i * dgd_stride + j;
813
22.6M
        const int m = i * dst_stride + j;
814
22.6M
        const int nb = 4;
815
22.6M
        const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
816
22.6M
        const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
817
22.6M
        const int32_t v = a * dgd[l] + b;
818
22.6M
        dst[m] =
819
22.6M
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
820
22.6M
      }
821
603k
    }
822
1.20M
  }
823
24.3k
}
824
825
static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
826
                                            int dgd_stride, int32_t *dst,
827
                                            int dst_stride, int bit_depth,
828
                                            int sgr_params_idx,
829
27.8k
                                            int radius_idx) {
830
27.8k
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
831
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
832
  // leading to a significant speed improvement.
833
  // We also align the stride to a multiple of 16 bytes, for consistency
834
  // with the SIMD version of this function.
835
27.8k
  int buf_stride = ((width_ext + 3) & ~3) + 16;
836
27.8k
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
837
27.8k
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
838
27.8k
  int32_t *A = A_;
839
27.8k
  int32_t *B = B_;
840
27.8k
  int i, j;
841
27.8k
  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
842
27.8k
                                sgr_params_idx, radius_idx, 0, A, B);
843
27.8k
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
844
27.8k
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
845
846
  // Use the A[] and B[] arrays to calculate the filtered image
847
1.41M
  for (i = 0; i < height; ++i) {
848
55.1M
    for (j = 0; j < width; ++j) {
849
53.7M
      const int k = i * buf_stride + j;
850
53.7M
      const int l = i * dgd_stride + j;
851
53.7M
      const int m = i * dst_stride + j;
852
53.7M
      const int nb = 5;
853
53.7M
      const int32_t a =
854
53.7M
          (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
855
53.7M
              4 +
856
53.7M
          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
857
53.7M
           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
858
53.7M
              3;
859
53.7M
      const int32_t b =
860
53.7M
          (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
861
53.7M
              4 +
862
53.7M
          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
863
53.7M
           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
864
53.7M
              3;
865
53.7M
      const int32_t v = a * dgd[l] + b;
866
53.7M
      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
867
53.7M
    }
868
1.38M
  }
869
27.8k
}
870
871
int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
872
                                 int dgd_stride, int32_t *flt0, int32_t *flt1,
873
                                 int flt_stride, int sgr_params_idx,
874
32.9k
                                 int bit_depth, int highbd) {
875
32.9k
  int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
876
32.9k
  const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
877
32.9k
  int32_t *dgd32 =
878
32.9k
      dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
879
880
32.9k
  if (highbd) {
881
14.1k
    const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
882
753k
    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
883
32.9M
      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
884
32.2M
        dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
885
32.2M
      }
886
739k
    }
887
18.8k
  } else {
888
1.10M
    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
889
48.7M
      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
890
47.7M
        dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
891
47.7M
      }
892
1.08M
    }
893
18.8k
  }
894
895
32.9k
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
896
  // If params->r == 0 we skip the corresponding filter. We only allow one of
897
  // the radii to be 0, as having both equal to 0 would be equivalent to
898
  // skipping SGR entirely.
899
32.9k
  assert(!(params->r[0] == 0 && params->r[1] == 0));
900
901
32.9k
  if (params->r[0] > 0)
902
24.3k
    selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
903
24.3k
                                         flt0, flt_stride, bit_depth,
904
24.3k
                                         sgr_params_idx, 0);
905
32.9k
  if (params->r[1] > 0)
906
27.8k
    selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
907
27.8k
                                    flt_stride, bit_depth, sgr_params_idx, 1);
908
32.9k
  return 0;
909
32.9k
}
910
911
void av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
912
                                        int height, int stride, int eps,
913
                                        const int *xqd, uint8_t *dst8,
914
                                        int dst_stride, int32_t *tmpbuf,
915
32.9k
                                        int bit_depth, int highbd) {
916
32.9k
  int32_t *flt0 = tmpbuf;
917
32.9k
  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
918
32.9k
  assert(width * height <= RESTORATION_UNITPELS_MAX);
919
920
32.9k
  const int ret = av1_selfguided_restoration_c(
921
32.9k
      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
922
32.9k
  (void)ret;
923
32.9k
  assert(!ret);
924
32.9k
  const sgr_params_type *const params = &av1_sgr_params[eps];
925
32.9k
  int xq[2];
926
32.9k
  av1_decode_xq(xqd, xq, params);
927
1.66M
  for (int i = 0; i < height; ++i) {
928
62.9M
    for (int j = 0; j < width; ++j) {
929
61.3M
      const int k = i * width + j;
930
61.3M
      uint8_t *dst8ij = dst8 + i * dst_stride + j;
931
61.3M
      const uint8_t *dat8ij = dat8 + i * stride + j;
932
933
61.3M
      const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
934
61.3M
      const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
935
61.3M
      int32_t v = u << SGRPROJ_PRJ_BITS;
936
      // If params->r == 0 then we skipped the filtering in
937
      // av1_selfguided_restoration_c, i.e. flt[k] == u
938
61.3M
      if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
939
61.3M
      if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
940
61.3M
      const int16_t w =
941
61.3M
          (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
942
943
61.3M
      const uint16_t out = clip_pixel_highbd(w, bit_depth);
944
61.3M
      if (highbd)
945
24.4M
        *CONVERT_TO_SHORTPTR(dst8ij) = out;
946
36.8M
      else
947
36.8M
        *dst8ij = (uint8_t)out;
948
61.3M
    }
949
1.62M
  }
950
32.9k
}
951
952
static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
953
                                  int stripe_width, int stripe_height,
954
                                  int procunit_width, const uint8_t *src,
955
                                  int src_stride, uint8_t *dst, int dst_stride,
956
10.1k
                                  int32_t *tmpbuf, int bit_depth) {
957
10.1k
  (void)bit_depth;
958
10.1k
  assert(bit_depth == 8);
959
960
28.9k
  for (int j = 0; j < stripe_width; j += procunit_width) {
961
18.8k
    int w = AOMMIN(procunit_width, stripe_width - j);
962
18.8k
    av1_apply_selfguided_restoration(
963
18.8k
        src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
964
18.8k
        rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth, 0);
965
18.8k
  }
966
10.1k
}
967
968
#if CONFIG_AV1_HIGHBITDEPTH
969
static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui,
970
                                        int stripe_width, int stripe_height,
971
                                        int procunit_width, const uint8_t *src8,
972
                                        int src_stride, uint8_t *dst8,
973
                                        int dst_stride, int32_t *tmpbuf,
974
25.4k
                                        int bit_depth) {
975
25.4k
  (void)tmpbuf;
976
25.4k
  const ConvolveParams conv_params = get_conv_params_wiener(bit_depth);
977
978
60.0k
  for (int j = 0; j < stripe_width; j += procunit_width) {
979
34.6k
    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
980
34.6k
    const uint8_t *src8_p = src8 + j;
981
34.6k
    uint8_t *dst8_p = dst8 + j;
982
34.6k
    av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
983
34.6k
                                       rui->wiener_info.hfilter, 16,
984
34.6k
                                       rui->wiener_info.vfilter, 16, w,
985
34.6k
                                       stripe_height, &conv_params, bit_depth);
986
34.6k
  }
987
25.4k
}
988
989
static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui,
990
                                         int stripe_width, int stripe_height,
991
                                         int procunit_width,
992
                                         const uint8_t *src8, int src_stride,
993
                                         uint8_t *dst8, int dst_stride,
994
7.57k
                                         int32_t *tmpbuf, int bit_depth) {
995
21.6k
  for (int j = 0; j < stripe_width; j += procunit_width) {
996
14.1k
    int w = AOMMIN(procunit_width, stripe_width - j);
997
14.1k
    av1_apply_selfguided_restoration(
998
14.1k
        src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
999
14.1k
        rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
1000
14.1k
  }
1001
7.57k
}
1002
#endif  // CONFIG_AV1_HIGHBITDEPTH
1003
1004
// Signature shared by all per-stripe filters stored in stripe_filters[]. A
// filter processes a stripe of stripe_width x stripe_height pixels from src
// into dst, in chunks of at most procunit_width columns. tmpbuf is scratch
// memory (unused by the Wiener filter visible in this file); bit_depth is the
// sample bit depth.
typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
1005
                                  int stripe_width, int stripe_height,
1006
                                  int procunit_width, const uint8_t *src,
1007
                                  int src_stride, uint8_t *dst, int dst_stride,
1008
                                  int32_t *tmpbuf, int bit_depth);
1009
1010
// Dispatch table for av1_loop_restoration_filter_unit(), which indexes it with
// 2 * highbd + (restoration type == RESTORE_SGRPROJ). The high-bit-depth
// entries only exist when CONFIG_AV1_HIGHBITDEPTH is enabled.
#if CONFIG_AV1_HIGHBITDEPTH
1011
#define NUM_STRIPE_FILTERS 4
1012
static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
1013
  wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
1014
  sgrproj_filter_stripe_highbd
1015
};
1016
#else
1017
#define NUM_STRIPE_FILTERS 2
1018
static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
1019
  wiener_filter_stripe, sgrproj_filter_stripe
1020
};
1021
#endif  // CONFIG_AV1_HIGHBITDEPTH
1022
1023
// Filter one restoration unit
1024
void av1_loop_restoration_filter_unit(
1025
    const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
1026
    const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
1027
    const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
1028
    int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
1029
44.1k
    int dst_stride, int32_t *tmpbuf, int optimized_lr) {
1030
44.1k
  RestorationType unit_rtype = rui->restoration_type;
1031
1032
44.1k
  int unit_h = limits->v_end - limits->v_start;
1033
44.1k
  int unit_w = limits->h_end - limits->h_start;
1034
44.1k
  uint8_t *data8_tl = data8 + limits->v_start * stride + limits->h_start;
1035
44.1k
  uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start;
1036
1037
44.1k
  if (unit_rtype == RESTORE_NONE) {
1038
16.9k
    copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd);
1039
16.9k
    return;
1040
16.9k
  }
1041
1042
27.1k
  const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
1043
27.1k
  assert(filter_idx < NUM_STRIPE_FILTERS);
1044
27.1k
  const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
1045
1046
27.1k
  const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
1047
1048
  // Convolve the whole tile one stripe at a time
1049
27.1k
  RestorationTileLimits remaining_stripes = *limits;
1050
27.1k
  int i = 0;
1051
81.5k
  while (i < unit_h) {
1052
54.4k
    int copy_above, copy_below;
1053
54.4k
    remaining_stripes.v_start = limits->v_start + i;
1054
1055
54.4k
    get_stripe_boundary_info(&remaining_stripes, tile_rect, ss_y, &copy_above,
1056
54.4k
                             &copy_below);
1057
1058
54.4k
    const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1059
54.4k
    const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
1060
1061
    // Work out where this stripe's boundaries are within
1062
    // rsb->stripe_boundary_{above,below}
1063
54.4k
    const int tile_stripe =
1064
54.4k
        (remaining_stripes.v_start - tile_rect->top + runit_offset) /
1065
54.4k
        full_stripe_height;
1066
54.4k
    const int frame_stripe = tile_stripe0 + tile_stripe;
1067
54.4k
    const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
1068
1069
    // Calculate this stripe's height, based on two rules:
1070
    // * The topmost stripe in each tile is 8 luma pixels shorter than usual.
1071
    // * We can't extend past the end of the current restoration unit
1072
54.4k
    const int nominal_stripe_height =
1073
54.4k
        full_stripe_height - ((tile_stripe == 0) ? runit_offset : 0);
1074
54.4k
    const int h = AOMMIN(nominal_stripe_height,
1075
54.4k
                         remaining_stripes.v_end - remaining_stripes.v_start);
1076
1077
54.4k
    setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
1078
54.4k
                                     h, data8, stride, rlbs, copy_above,
1079
54.4k
                                     copy_below, optimized_lr);
1080
1081
54.4k
    stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
1082
54.4k
                  dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth);
1083
1084
54.4k
    restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
1085
54.4k
                                       data8, stride, copy_above, copy_below,
1086
54.4k
                                       optimized_lr);
1087
1088
54.4k
    i += h;
1089
54.4k
  }
1090
27.1k
}
1091
1092
static void filter_frame_on_unit(const RestorationTileLimits *limits,
1093
                                 const AV1PixelRect *tile_rect,
1094
                                 int rest_unit_idx, void *priv, int32_t *tmpbuf,
1095
44.1k
                                 RestorationLineBuffers *rlbs) {
1096
44.1k
  FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
1097
44.1k
  const RestorationInfo *rsi = ctxt->rsi;
1098
1099
44.1k
  av1_loop_restoration_filter_unit(
1100
44.1k
      limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs, tile_rect,
1101
44.1k
      ctxt->tile_stripe0, ctxt->ss_x, ctxt->ss_y, ctxt->highbd, ctxt->bit_depth,
1102
44.1k
      ctxt->data8, ctxt->data_stride, ctxt->dst8, ctxt->dst_stride, tmpbuf,
1103
44.1k
      rsi->optimized_lr);
1104
44.1k
}
1105
1106
void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
1107
                                            YV12_BUFFER_CONFIG *frame,
1108
                                            AV1_COMMON *cm, int optimized_lr,
1109
7.01k
                                            int num_planes) {
1110
7.01k
  const SequenceHeader *const seq_params = cm->seq_params;
1111
7.01k
  const int bit_depth = seq_params->bit_depth;
1112
7.01k
  const int highbd = seq_params->use_highbitdepth;
1113
7.01k
  lr_ctxt->dst = &cm->rst_frame;
1114
1115
7.01k
  const int frame_width = frame->crop_widths[0];
1116
7.01k
  const int frame_height = frame->crop_heights[0];
1117
7.01k
  if (aom_realloc_frame_buffer(
1118
7.01k
          lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
1119
7.01k
          seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
1120
7.01k
          cm->features.byte_alignment, NULL, NULL, NULL, 0) < 0)
1121
0
    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
1122
0
                       "Failed to allocate restoration dst buffer");
1123
1124
7.01k
  lr_ctxt->on_rest_unit = filter_frame_on_unit;
1125
7.01k
  lr_ctxt->frame = frame;
1126
27.0k
  for (int plane = 0; plane < num_planes; ++plane) {
1127
20.0k
    RestorationInfo *rsi = &cm->rst_info[plane];
1128
20.0k
    RestorationType rtype = rsi->frame_restoration_type;
1129
20.0k
    rsi->optimized_lr = optimized_lr;
1130
1131
20.0k
    if (rtype == RESTORE_NONE) {
1132
8.31k
      continue;
1133
8.31k
    }
1134
1135
11.7k
    const int is_uv = plane > 0;
1136
11.7k
    const int plane_width = frame->crop_widths[is_uv];
1137
11.7k
    const int plane_height = frame->crop_heights[is_uv];
1138
11.7k
    FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
1139
1140
11.7k
    av1_extend_frame(frame->buffers[plane], plane_width, plane_height,
1141
11.7k
                     frame->strides[is_uv], RESTORATION_BORDER,
1142
11.7k
                     RESTORATION_BORDER, highbd);
1143
1144
11.7k
    lr_plane_ctxt->rsi = rsi;
1145
11.7k
    lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
1146
11.7k
    lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
1147
11.7k
    lr_plane_ctxt->highbd = highbd;
1148
11.7k
    lr_plane_ctxt->bit_depth = bit_depth;
1149
11.7k
    lr_plane_ctxt->data8 = frame->buffers[plane];
1150
11.7k
    lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
1151
11.7k
    lr_plane_ctxt->data_stride = frame->strides[is_uv];
1152
11.7k
    lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
1153
11.7k
    lr_plane_ctxt->tile_rect = av1_whole_frame_rect(cm, is_uv);
1154
11.7k
    lr_plane_ctxt->tile_stripe0 = 0;
1155
11.7k
  }
1156
7.01k
}
1157
1158
void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
1159
7.01k
                                      AV1_COMMON *cm, int num_planes) {
1160
7.01k
  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
1161
7.01k
                           YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
1162
7.01k
                           int vstart, int vend);
1163
7.01k
  static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y,
1164
7.01k
                                         aom_yv12_partial_coloc_copy_u,
1165
7.01k
                                         aom_yv12_partial_coloc_copy_v };
1166
7.01k
  assert(num_planes <= 3);
1167
27.0k
  for (int plane = 0; plane < num_planes; ++plane) {
1168
20.0k
    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
1169
11.7k
    AV1PixelRect tile_rect = loop_rest_ctxt->ctxt[plane].tile_rect;
1170
11.7k
    copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, tile_rect.left,
1171
11.7k
                     tile_rect.right, tile_rect.top, tile_rect.bottom);
1172
11.7k
  }
1173
7.01k
}
1174
1175
// Runs the configured restoration-unit visitor over every unit of every plane
// whose frame restoration type is not RESTORE_NONE.
static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
                                        int num_planes) {
  FilterFrameCtxt *const plane_ctxts = lr_ctxt->ctxt;

  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
      FilterFrameCtxt *const plane_ctxt = &plane_ctxts[plane];
      av1_foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
                                     plane_ctxt, &plane_ctxt->tile_rect,
                                     cm->rst_tmpbuf, cm->rlbs);
    }
  }
}
1189
1190
void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
1191
                                       AV1_COMMON *cm, int optimized_lr,
1192
7.01k
                                       void *lr_ctxt) {
1193
7.01k
  assert(!cm->features.all_lossless);
1194
7.01k
  const int num_planes = av1_num_planes(cm);
1195
1196
7.01k
  AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
1197
1198
7.01k
  av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
1199
7.01k
                                         optimized_lr, num_planes);
1200
1201
7.01k
  foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes);
1202
1203
7.01k
  av1_loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes);
1204
7.01k
}
1205
1206
void av1_foreach_rest_unit_in_row(
1207
    RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
1208
    rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
1209
    int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
1210
    void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
1211
    sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
1212
33.4k
    struct AV1LrSyncData *const lr_sync) {
1213
33.4k
  const int tile_w = tile_rect->right - tile_rect->left;
1214
33.4k
  const int ext_size = unit_size * 3 / 2;
1215
33.4k
  int x0 = 0, j = 0;
1216
77.5k
  while (x0 < tile_w) {
1217
44.1k
    int remaining_w = tile_w - x0;
1218
44.1k
    int w = (remaining_w < ext_size) ? remaining_w : unit_size;
1219
1220
44.1k
    limits->h_start = tile_rect->left + x0;
1221
44.1k
    limits->h_end = tile_rect->left + x0 + w;
1222
44.1k
    assert(limits->h_end <= tile_rect->right);
1223
1224
44.1k
    const int unit_idx = unit_idx0 + row_number * hunits_per_tile + j;
1225
1226
    // No sync for even numbered rows
1227
    // For odd numbered rows, Loop Restoration of current block requires the LR
1228
    // of top-right and bottom-right blocks to be completed
1229
1230
    // top-right sync
1231
44.1k
    on_sync_read(lr_sync, row_number, j, plane);
1232
44.1k
    if ((row_number + 1) < vunits_per_tile)
1233
      // bottom-right sync
1234
25.1k
      on_sync_read(lr_sync, row_number + 2, j, plane);
1235
1236
44.1k
    on_rest_unit(limits, tile_rect, unit_idx, priv, tmpbuf, rlbs);
1237
1238
44.1k
    on_sync_write(lr_sync, row_number, j, hunits_per_tile, plane);
1239
1240
44.1k
    x0 += w;
1241
44.1k
    ++j;
1242
44.1k
  }
1243
33.4k
}
1244
1245
69.2k
// No-op read-sync hook for single-threaded loop restoration. All arguments
// are ignored.
void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
  (void)plane;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1251
1252
// No-op write-sync hook for single-threaded loop restoration. All arguments
// are ignored.
void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
                             const int sb_cols, int plane) {
  (void)plane;
  (void)sb_cols;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1260
1261
static void foreach_rest_unit_in_tile(
1262
    const AV1PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols,
1263
    int hunits_per_tile, int vunits_per_tile, int units_per_tile, int unit_size,
1264
    int ss_y, int plane, rest_unit_visitor_t on_rest_unit, void *priv,
1265
11.7k
    int32_t *tmpbuf, RestorationLineBuffers *rlbs) {
1266
11.7k
  const int tile_h = tile_rect->bottom - tile_rect->top;
1267
11.7k
  const int ext_size = unit_size * 3 / 2;
1268
1269
11.7k
  const int tile_idx = tile_col + tile_row * tile_cols;
1270
11.7k
  const int unit_idx0 = tile_idx * units_per_tile;
1271
1272
11.7k
  int y0 = 0, i = 0;
1273
45.2k
  while (y0 < tile_h) {
1274
33.4k
    int remaining_h = tile_h - y0;
1275
33.4k
    int h = (remaining_h < ext_size) ? remaining_h : unit_size;
1276
1277
33.4k
    RestorationTileLimits limits;
1278
33.4k
    limits.v_start = tile_rect->top + y0;
1279
33.4k
    limits.v_end = tile_rect->top + y0 + h;
1280
33.4k
    assert(limits.v_end <= tile_rect->bottom);
1281
    // Offset the tile upwards to align with the restoration processing stripe
1282
33.4k
    const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
1283
33.4k
    limits.v_start = AOMMAX(tile_rect->top, limits.v_start - voffset);
1284
33.4k
    if (limits.v_end < tile_rect->bottom) limits.v_end -= voffset;
1285
1286
33.4k
    av1_foreach_rest_unit_in_row(
1287
33.4k
        &limits, tile_rect, on_rest_unit, i, unit_size, unit_idx0,
1288
33.4k
        hunits_per_tile, vunits_per_tile, plane, priv, tmpbuf, rlbs,
1289
33.4k
        av1_lr_sync_read_dummy, av1_lr_sync_write_dummy, NULL);
1290
1291
33.4k
    y0 += h;
1292
33.4k
    ++i;
1293
33.4k
  }
1294
11.7k
}
1295
1296
void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
1297
                                    rest_unit_visitor_t on_rest_unit,
1298
                                    void *priv, AV1PixelRect *tile_rect,
1299
                                    int32_t *tmpbuf,
1300
11.7k
                                    RestorationLineBuffers *rlbs) {
1301
11.7k
  const int is_uv = plane > 0;
1302
11.7k
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
1303
1304
11.7k
  const RestorationInfo *rsi = &cm->rst_info[plane];
1305
1306
11.7k
  foreach_rest_unit_in_tile(tile_rect, LR_TILE_ROW, LR_TILE_COL, LR_TILE_COLS,
1307
11.7k
                            rsi->horz_units_per_tile, rsi->vert_units_per_tile,
1308
11.7k
                            rsi->units_per_tile, rsi->restoration_unit_size,
1309
11.7k
                            ss_y, plane, on_rest_unit, priv, tmpbuf, rlbs);
1310
11.7k
}
1311
1312
// Computes the half-open range of restoration-unit columns [*rcol0, *rcol1)
// and rows [*rrow0, *rrow1) associated with the superblock at
// (mi_row, mi_col) on the given plane.
//
// Returns 0 without writing the outputs when bsize is not the sequence's
// superblock size or when the plane's frame restoration type is RESTORE_NONE.
// Otherwise writes all four outputs and returns nonzero iff both ranges are
// non-empty.
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
                                       int *rcol0, int *rcol1, int *rrow0,
                                       int *rrow1) {
  assert(rcol0 && rcol1 && rrow0 && rrow1);

  if (bsize != cm->seq_params->sb_size ||
      cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) {
    return 0;
  }

  assert(!cm->features.all_lossless);

  const int is_uv = plane > 0;

  // The whole frame acts as the single "tile", so its top-left corner sits at
  // mi position (0, 0) and mi_row/mi_col are already tile-relative.
  const AV1PixelRect frame_rect = av1_whole_frame_rect(cm, is_uv);
  const int unit_size = cm->rst_info[plane].restoration_unit_size;

  // Number of restoration units in this tile (which might be strictly less
  // than horz_units_per_tile / vert_units_per_tile); used as the upper clamp.
  const int max_rcol =
      av1_lr_count_units_in_tile(unit_size, frame_rect.right - frame_rect.left);
  const int max_rrow =
      av1_lr_count_units_in_tile(unit_size, frame_rect.bottom - frame_rect.top);

  // Mi-unit position of the superblock's right and bottom edges.
  const int mi_col_end = mi_col + mi_size_wide[bsize];
  const int mi_row_end = mi_row + mi_size_high[bsize];

  // The size of an MI-unit on this plane of the image.
  const int ss_x = is_uv && cm->seq_params->subsampling_x;
  const int ss_y = is_uv && cm->seq_params->subsampling_y;

  // Write m for the relative mi column, D for the superres denominator and N
  // for the superres numerator. If u is the upscaled pixel offset then
  //
  //   MI_SIZE * m = N / D u   =>   u = D * MI_SIZE * m / N
  //
  // so with superres active the horizontal numerator gains a factor D and the
  // horizontal denominator a factor N. Vertical positions are never scaled.
  int num_x = MI_SIZE >> ss_x;
  int den_x = unit_size;
  if (av1_superres_scaled(cm)) {
    num_x *= cm->superres_scale_denominator;
    den_x *= SCALE_NUMERATOR;
  }
  const int num_y = MI_SIZE >> ss_y;
  const int den_y = unit_size;

  // rcol0/rrow0 is the first column/row of restoration units that doesn't
  // start left of/above mi_col/mi_row, hence the rounded-up division (if the
  // sb starts at runit column 10.1, the first matching runit is column 11).
  *rcol0 = (mi_col * num_x + (den_x - 1)) / den_x;
  *rrow0 = (mi_row * num_y + (den_y - 1)) / den_y;

  // rcol1/rrow1 is the same calculation for the superblock below-right. At
  // the bottom or right of the tile that unit might not exist, so clamp to
  // the number of units present.
  *rcol1 = AOMMIN((mi_col_end * num_x + (den_x - 1)) / den_x, max_rcol);
  *rrow1 = AOMMIN((mi_row_end * num_y + (den_y - 1)) / den_y, max_rrow);

  return *rcol0 < *rcol1 && *rrow0 < *rrow1;
}
1386
1387
// Pads 'height' rows horizontally. In each row the first pixel is replicated
// into the 'extend' pixels before the row and the last pixel into the
// 'extend' pixels after it. 'width' and 'extend' are counted in pixels
// (16-bit when use_highbitdepth is set); 'stride' is always applied to the
// byte pointer.
static void extend_lines(uint8_t *buf, int width, int height, int stride,
                         int extend, int use_highbitdepth) {
  uint8_t *row = buf;

  if (use_highbitdepth) {
    for (int y = 0; y < height; ++y) {
      uint16_t *const px = (uint16_t *)row;
      aom_memset16(px - extend, px[0], extend);
      aom_memset16(px + width, px[width - 1], extend);
      row += stride;
    }
    return;
  }

  for (int y = 0; y < height; ++y) {
    memset(row - extend, row[0], extend);
    memset(row + width, row[width - 1], extend);
    row += stride;
  }
}
1402
1403
static void save_deblock_boundary_lines(
1404
    const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
1405
    int stripe, int use_highbd, int is_above,
1406
170k
    RestorationStripeBoundaries *boundaries) {
1407
170k
  const int is_uv = plane > 0;
1408
170k
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
1409
170k
  const int src_stride = frame->strides[is_uv] << use_highbd;
1410
170k
  const uint8_t *src_rows = src_buf + row * src_stride;
1411
1412
170k
  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
1413
170k
                               : boundaries->stripe_boundary_below;
1414
170k
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
1415
170k
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
1416
170k
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
1417
1418
  // There is a rare case in which a processing stripe can end 1px above the
1419
  // crop border. In this case, we do want to use deblocked pixels from below
1420
  // the stripe (hence why we ended up in this function), but instead of
1421
  // fetching 2 "below" rows we need to fetch one and duplicate it.
1422
  // This is equivalent to clamping the sample locations against the crop border
1423
170k
  const int lines_to_save =
1424
170k
      AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row);
1425
170k
  assert(lines_to_save == 1 || lines_to_save == 2);
1426
1427
170k
  int upscaled_width;
1428
170k
  int line_bytes;
1429
170k
  if (av1_superres_scaled(cm)) {
1430
144k
    const int ss_x = is_uv && cm->seq_params->subsampling_x;
1431
144k
    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
1432
144k
    line_bytes = upscaled_width << use_highbd;
1433
144k
    if (use_highbd)
1434
120k
      av1_upscale_normative_rows(
1435
120k
          cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv],
1436
120k
          CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride,
1437
120k
          plane, lines_to_save);
1438
24.9k
    else
1439
24.9k
      av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows,
1440
24.9k
                                 boundaries->stripe_boundary_stride, plane,
1441
24.9k
                                 lines_to_save);
1442
144k
  } else {
1443
25.2k
    upscaled_width = frame->crop_widths[is_uv];
1444
25.2k
    line_bytes = upscaled_width << use_highbd;
1445
75.8k
    for (int i = 0; i < lines_to_save; i++) {
1446
50.5k
      memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride,
1447
50.5k
             line_bytes);
1448
50.5k
    }
1449
25.2k
  }
1450
  // If we only saved one line, then copy it into the second line buffer
1451
170k
  if (lines_to_save == 1)
1452
27
    memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
1453
1454
170k
  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
1455
170k
               RESTORATION_EXTRA_HORZ, use_highbd);
1456
170k
}
1457
1458
static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1459
                                     const AV1_COMMON *cm, int plane, int row,
1460
                                     int stripe, int use_highbd, int is_above,
1461
36.3k
                                     RestorationStripeBoundaries *boundaries) {
1462
36.3k
  const int is_uv = plane > 0;
1463
36.3k
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
1464
36.3k
  const int src_stride = frame->strides[is_uv] << use_highbd;
1465
36.3k
  const uint8_t *src_rows = src_buf + row * src_stride;
1466
1467
36.3k
  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
1468
36.3k
                               : boundaries->stripe_boundary_below;
1469
36.3k
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
1470
36.3k
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
1471
36.3k
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
1472
36.3k
  const int src_width = frame->crop_widths[is_uv];
1473
1474
  // At the point where this function is called, we've already applied
1475
  // superres. So we don't need to extend the lines here, we can just
1476
  // pull directly from the topmost row of the upscaled frame.
1477
36.3k
  const int ss_x = is_uv && cm->seq_params->subsampling_x;
1478
36.3k
  const int upscaled_width = av1_superres_scaled(cm)
1479
36.3k
                                 ? (cm->superres_upscaled_width + ss_x) >> ss_x
1480
36.3k
                                 : src_width;
1481
36.3k
  const int line_bytes = upscaled_width << use_highbd;
1482
109k
  for (int i = 0; i < RESTORATION_CTX_VERT; i++) {
1483
    // Copy the line at 'row' into both context lines. This is because
1484
    // we want to (effectively) extend the outermost row of CDEF data
1485
    // from this tile to produce a border, rather than using deblocked
1486
    // pixels from the tile above/below.
1487
72.7k
    memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
1488
72.7k
  }
1489
36.3k
  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
1490
36.3k
               RESTORATION_EXTRA_HORZ, use_highbd);
1491
36.3k
}
1492
1493
static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1494
                                         int use_highbd, int plane,
1495
36.3k
                                         AV1_COMMON *cm, int after_cdef) {
1496
36.3k
  const int is_uv = plane > 0;
1497
36.3k
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
1498
36.3k
  const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1499
36.3k
  const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
1500
1501
  // Get the tile rectangle, with height rounded up to the next multiple of 8
1502
  // luma pixels (only relevant for the bottom tile of the frame)
1503
36.3k
  const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
1504
36.3k
  const int stripe0 = 0;
1505
1506
36.3k
  RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries;
1507
1508
36.3k
  const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);
1509
1510
36.3k
  int tile_stripe;
1511
243k
  for (tile_stripe = 0;; ++tile_stripe) {
1512
243k
    const int rel_y0 = AOMMAX(0, tile_stripe * stripe_height - stripe_off);
1513
243k
    const int y0 = tile_rect.top + rel_y0;
1514
243k
    if (y0 >= tile_rect.bottom) break;
1515
1516
206k
    const int rel_y1 = (tile_stripe + 1) * stripe_height - stripe_off;
1517
206k
    const int y1 = AOMMIN(tile_rect.top + rel_y1, tile_rect.bottom);
1518
1519
206k
    const int frame_stripe = stripe0 + tile_stripe;
1520
1521
    // In this case, we should only use CDEF pixels at the top
1522
    // and bottom of the frame as a whole; internal tile boundaries
1523
    // can use deblocked pixels from adjacent tiles for context.
1524
206k
    const int use_deblock_above = (frame_stripe > 0);
1525
206k
    const int use_deblock_below = (y1 < plane_height);
1526
1527
206k
    if (!after_cdef) {
1528
      // Save deblocked context where needed.
1529
103k
      if (use_deblock_above) {
1530
85.1k
        save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
1531
85.1k
                                    frame_stripe, use_highbd, 1, boundaries);
1532
85.1k
      }
1533
103k
      if (use_deblock_below) {
1534
85.1k
        save_deblock_boundary_lines(frame, cm, plane, y1, frame_stripe,
1535
85.1k
                                    use_highbd, 0, boundaries);
1536
85.1k
      }
1537
103k
    } else {
1538
      // Save CDEF context where needed. Note that we need to save the CDEF
1539
      // context for a particular boundary iff we *didn't* save deblocked
1540
      // context for that boundary.
1541
      //
1542
      // In addition, we need to save copies of the outermost line within
1543
      // the tile, rather than using data from outside the tile.
1544
103k
      if (!use_deblock_above) {
1545
18.1k
        save_cdef_boundary_lines(frame, cm, plane, y0, frame_stripe, use_highbd,
1546
18.1k
                                 1, boundaries);
1547
18.1k
      }
1548
103k
      if (!use_deblock_below) {
1549
18.1k
        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, frame_stripe,
1550
18.1k
                                 use_highbd, 0, boundaries);
1551
18.1k
      }
1552
103k
    }
1553
206k
  }
1554
36.3k
}
1555
1556
// For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
1557
// lines to be used as boundary in the loop restoration process. The
1558
// lines are saved in rst_internal.stripe_boundary_lines
1559
void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1560
12.2k
                                              AV1_COMMON *cm, int after_cdef) {
1561
12.2k
  const int num_planes = av1_num_planes(cm);
1562
12.2k
  const int use_highbd = cm->seq_params->use_highbitdepth;
1563
48.6k
  for (int p = 0; p < num_planes; ++p) {
1564
36.3k
    save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef);
1565
36.3k
  }
1566
12.2k
}