Coverage Report

Created: 2025-12-03 07:28

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/av1/common/restoration.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 *
11
 */
12
13
#include <math.h>
14
15
#include "config/aom_config.h"
16
#include "config/aom_dsp_rtcd.h"
17
#include "config/aom_scale_rtcd.h"
18
19
#include "aom_mem/aom_mem.h"
20
#include "av1/common/av1_common_int.h"
21
#include "av1/common/resize.h"
22
#include "av1/common/restoration.h"
23
#include "aom_dsp/aom_dsp_common.h"
24
#include "aom_mem/aom_mem.h"
25
26
#include "aom_ports/mem.h"
27
28
// The 's' values are calculated based on original 'r' and 'e' values in the
29
// spec using GenSgrprojVtable().
30
// Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
31
const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
  // One parameter set per line, written as { { r0, r1 }, { s0, s1 } }.
  // Wherever a radius is 0 the matching s entry is -1.
  { { 2, 1 }, { 140, 3236 } },
  { { 2, 1 }, { 112, 2158 } },
  { { 2, 1 }, { 93, 1618 } },
  { { 2, 1 }, { 80, 1438 } },
  { { 2, 1 }, { 70, 1295 } },
  { { 2, 1 }, { 58, 1177 } },
  { { 2, 1 }, { 47, 1079 } },
  { { 2, 1 }, { 37, 996 } },
  { { 2, 1 }, { 30, 925 } },
  { { 2, 1 }, { 25, 863 } },
  // First radius is 0.
  { { 0, 1 }, { -1, 2589 } },
  { { 0, 1 }, { -1, 1618 } },
  { { 0, 1 }, { -1, 1177 } },
  { { 0, 1 }, { -1, 925 } },
  // Second radius is 0.
  { { 2, 0 }, { 56, -1 } },
  { { 2, 0 }, { 22, -1 } },
};
41
42
512k
AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
43
512k
  AV1PixelRect rect;
44
45
512k
  int ss_x = is_uv && cm->seq_params->subsampling_x;
46
512k
  int ss_y = is_uv && cm->seq_params->subsampling_y;
47
48
512k
  rect.top = 0;
49
512k
  rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y);
50
512k
  rect.left = 0;
51
512k
  rect.right = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
52
512k
  return rect;
53
512k
}
54
55
// Count horizontal or vertical units per tile (use a width or height for
56
// tile_size, respectively). We basically want to divide the tile size by the
57
// size of a restoration unit. Rather than rounding up unconditionally as you
58
// might expect, we round to nearest, which models the way a right or bottom
59
// restoration unit can extend to up to 150% its normal width or height. The
60
// max with 1 is to deal with tiles that are smaller than half of a restoration
61
// unit.
62
925k
int av1_lr_count_units_in_tile(int unit_size, int tile_size) {
  // Adding half a unit before the integer division rounds to nearest
  // instead of rounding down.
  const int rounded_units = (tile_size + (unit_size >> 1)) / unit_size;
  // Always report at least one unit, however small the tile is.
  return rounded_units > 1 ? rounded_units : 1;
}
65
66
// (Re)allocates rsi->unit_info so that it holds one RestorationUnitInfo per
// restoration unit of the plane selected by is_uv, and records the unit
// counts in rsi. Any previous rsi->unit_info buffer is freed first.
void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
                                  int is_uv) {
  // The area to cover is the whole frame of this plane.
  const AV1PixelRect frame_rect = av1_whole_frame_rect(cm, is_uv);
  const int frame_w = frame_rect.right - frame_rect.left;
  const int frame_h = frame_rect.bottom - frame_rect.top;

  // Unit counts are rounded to nearest and clamped to at least 1 by
  // av1_lr_count_units_in_tile().
  const int ru_size = rsi->restoration_unit_size;
  const int units_horz = av1_lr_count_units_in_tile(ru_size, frame_w);
  const int units_vert = av1_lr_count_units_in_tile(ru_size, frame_h);

  rsi->horz_units_per_tile = units_horz;
  rsi->vert_units_per_tile = units_vert;
  rsi->units_per_tile = units_horz * units_vert;

  // The frame is handled as a single tile here.
  const int num_tiles = 1;
  const int num_units = num_tiles * rsi->units_per_tile;

  aom_free(rsi->unit_info);
  CHECK_MEM_ERROR(cm, rsi->unit_info,
                  (RestorationUnitInfo *)aom_memalign(
                      16, sizeof(*rsi->unit_info) * num_units));
}
100
101
104k
// Releases the unit_info array owned by rst_info and clears the pointer so
// that a later free or re-allocation does not see a stale address.
void av1_free_restoration_struct(RestorationInfo *rst_info) {
  RestorationUnitInfo **const units = &rst_info->unit_info;
  aom_free(*units);
  *units = NULL;
}
105
106
#if 0
// Disabled table generator, kept for reference only.
// sgrproj_mtable holds one value per radius slot of each parameter set:
// column 0 is derived from r[0], e[0] and column 1 from r[1], e[1].
int sgrproj_mtable[SGRPROJ_PARAMS][2];

static void GenSgrprojVtable() {
  for (int set = 0; set < SGRPROJ_PARAMS; ++set) {
    const sgr_params_type *const params = &av1_sgr_params[set];
    for (int slot = 0; slot < 2; ++slot) {
      const int e = params->e[slot];
      const int r = params->r[slot];
      if (r == 0) {
        // Filter disabled for this slot: mark the entry invalid.
        sgrproj_mtable[set][slot] = -1;
        continue;
      }
      // n is the number of samples in the (2r + 1) x (2r + 1) window.
      const int n = (2 * r + 1) * (2 * r + 1);
      const int n2e = n * n * e;
      assert(n2e != 0);
      // Rounded division of 2^SGRPROJ_MTABLE_BITS by n2e.
      sgrproj_mtable[set][slot] =
          (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
    }
  }
}
#endif
130
131
34.8k
// Precalculation hook for loop restoration. The only work it could perform,
// regenerating the sgrproj table through GenSgrprojVtable(), is compiled out
// with #if 0, so the function currently does nothing.
//
// The parameter list is spelled (void) so that this definition is a real
// prototype; in C an empty list "()" leaves the arguments unspecified.
void av1_loop_restoration_precal(void) {
#if 0
  GenSgrprojVtable();
#endif
}
136
137
// Pads an 8-bit picture in place by replicating its edge samples.
//
// data points at the top-left sample of a width x height picture stored with
// the given stride. Each row gains border_horz copies of its first sample on
// the left and of its last sample on the right; afterwards the (already
// padded) first and last rows are copied border_vert times above and below.
// The caller must provide writable storage for those borders.
static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride,
                               int border_horz, int border_vert) {
  // Horizontal padding, one picture row at a time.
  for (int y = 0; y < height; ++y) {
    uint8_t *const row = data + y * stride;
    memset(row - border_horz, row[0], border_horz);
    memset(row + width, row[width - 1], border_horz);
  }

  // From here on rows are addressed from the start of their left border, so
  // the vertical copies carry the horizontal padding with them.
  const int padded_width = width + 2 * border_horz;
  uint8_t *const first_row = data - border_horz;
  const uint8_t *const last_row = first_row + (height - 1) * stride;

  // Rows above the picture, written farthest-first.
  for (int k = border_vert; k > 0; --k) {
    memcpy(first_row - k * stride, first_row, padded_width);
  }
  // Rows below the picture.
  for (int k = 0; k < border_vert; ++k) {
    memcpy(first_row + (height + k) * stride, last_row, padded_width);
  }
}
155
156
#if CONFIG_AV1_HIGHBITDEPTH
157
// 16-bit counterpart of the 8-bit frame extension: pads a width x height
// picture in place by replicating edge samples. Each row gets border_horz
// copies of its first/last sample on the left/right, then the padded first
// and last rows are duplicated border_vert times above and below.
// stride is counted in 16-bit samples.
static void extend_frame_highbd(uint16_t *data, int width, int height,
                                int stride, int border_horz, int border_vert) {
  // Horizontal padding, one picture row at a time.
  for (int y = 0; y < height; ++y) {
    uint16_t *const row = data + y * stride;
    for (int x = -border_horz; x < 0; ++x) row[x] = row[0];
    for (int x = 0; x < border_horz; ++x) row[width + x] = row[width - 1];
  }

  // Vertical padding copies whole padded rows, so sizes are in bytes.
  const size_t padded_bytes = (width + 2 * border_horz) * sizeof(uint16_t);
  uint16_t *const first_row = data - border_horz;
  const uint16_t *const last_row = first_row + (height - 1) * stride;

  // Rows above the picture, written farthest-first.
  for (int k = border_vert; k > 0; --k) {
    memcpy(first_row - k * stride, first_row, padded_bytes);
  }
  // Rows below the picture.
  for (int k = 0; k < border_vert; ++k) {
    memcpy(first_row + (height + k) * stride, last_row, padded_bytes);
  }
}
176
177
// Copies a width x height block of 16-bit samples from src to dst, one row
// at a time. Both strides are counted in samples; rows must not overlap.
static void copy_tile_highbd(int width, int height, const uint16_t *src,
                             int src_stride, uint16_t *dst, int dst_stride) {
  const size_t row_bytes = width * sizeof(*dst);
  for (int y = 0; y < height; ++y) {
    const uint16_t *const src_row = src + y * src_stride;
    uint16_t *const dst_row = dst + y * dst_stride;
    memcpy(dst_row, src_row, row_bytes);
  }
}
182
#endif
183
184
// Pads a picture in place by replicating its edge samples into a border of
// border_horz columns and border_vert rows. When high bit-depth support is
// compiled in and highbd is non-zero, data is converted with
// CONVERT_TO_SHORTPTR and handled as 16-bit samples; otherwise it is
// processed as 8-bit samples.
void av1_extend_frame(uint8_t *data, int width, int height, int stride,
                      int border_horz, int border_vert, int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (highbd) {
    uint16_t *const data16 = CONVERT_TO_SHORTPTR(data);
    extend_frame_highbd(data16, width, height, stride, border_horz,
                        border_vert);
    return;
  }
#endif
  // Without high bit-depth support the flag is unused.
  (void)highbd;
  extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
}
196
197
// Copies a width x height block of 8-bit samples from src to dst, one row at
// a time. Rows of source and destination must not overlap.
static void copy_tile_lowbd(int width, int height, const uint8_t *src,
                            int src_stride, uint8_t *dst, int dst_stride) {
  for (int y = 0; y < height; ++y) {
    const uint8_t *const src_row = src + y * src_stride;
    uint8_t *const dst_row = dst + y * dst_stride;
    memcpy(dst_row, src_row, width);
  }
}
202
203
// Copies a width x height block from src to dst. When high bit-depth
// support is compiled in and highbd is non-zero, both pointers are converted
// with CONVERT_TO_SHORTPTR and the block is copied as 16-bit samples;
// otherwise it is copied as 8-bit samples.
static void copy_tile(int width, int height, const uint8_t *src, int src_stride,
                      uint8_t *dst, int dst_stride, int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (highbd) {
    const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src);
    uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
    copy_tile_highbd(width, height, src16, src_stride, dst16, dst_stride);
    return;
  }
#endif
  // Without high bit-depth support the flag is unused.
  (void)highbd;
  copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
}
215
216
938k
// Yields the byte pointer to hand to memcpy for frame pointer d: when hbd is
// non-zero, d is first converted with CONVERT_TO_SHORTPTR and the result is
// viewed as uint8_t *; otherwise d is used unchanged.
#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
217
218
// With striped loop restoration, the filtering for each 64-pixel stripe gets
219
// most of its input from the output of CDEF (stored in data8), but we need to
220
// fill out a border of 3 pixels above/below the stripe according to the
221
// following
222
// rules:
223
//
224
// * At a frame boundary, we copy the outermost row of CDEF pixels three times.
225
//   This extension is done by a call to av1_extend_frame() at the start of the
226
//   loop restoration process, so the value of copy_above/copy_below doesn't
227
//   strictly matter. However, by setting *copy_above = *copy_below = 1 whenever
228
//   loop filtering across tiles is disabled, we can allow
229
//   {setup,restore}_processing_stripe_boundary to assume that the top/bottom
230
//   data has always been copied, simplifying the behaviour at the left and
231
//   right edges of tiles.
232
//
233
// * If we're at a tile boundary and loop filtering across tiles is enabled,
234
//   then there is a logical stripe which is 64 pixels high, but which is split
235
//   into an 8px high and a 56px high stripe so that the processing (and
236
//   coefficient set usage) can be aligned to tiles.
237
//   In this case, we use the 3 rows of CDEF output across the boundary for
238
//   context; this corresponds to leaving the frame buffer as-is.
239
//
240
// * If we're at a tile boundary and loop filtering across tiles is disabled,
241
//   then we take the outermost row of CDEF pixels *within the current tile*
242
//   and copy it three times. Thus we behave exactly as if the tile were a full
243
//   frame.
244
//
245
// * Otherwise, we're at a stripe boundary within a tile. In that case, we
246
//   take 2 rows of deblocked pixels and extend them to 3 rows of context.
247
//
248
// The distinction between the latter two cases is handled by the
249
// av1_loop_restoration_save_boundary_lines() function, so here we just need
250
// to decide if we're overwriting the above/below boundary pixels or not.
251
static void get_stripe_boundary_info(const RestorationTileLimits *limits,
252
                                     const AV1PixelRect *tile_rect, int ss_y,
253
56.4k
                                     int *copy_above, int *copy_below) {
254
56.4k
  *copy_above = 1;
255
56.4k
  *copy_below = 1;
256
257
56.4k
  const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
258
56.4k
  const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
259
260
56.4k
  const int first_stripe_in_tile = (limits->v_start == tile_rect->top);
261
56.4k
  const int this_stripe_height =
262
56.4k
      full_stripe_height - (first_stripe_in_tile ? runit_offset : 0);
263
56.4k
  const int last_stripe_in_tile =
264
56.4k
      (limits->v_start + this_stripe_height >= tile_rect->bottom);
265
266
56.4k
  if (first_stripe_in_tile) *copy_above = 0;
267
56.4k
  if (last_stripe_in_tile) *copy_below = 0;
268
56.4k
}
269
270
// Overwrite the border pixels around a processing stripe so that the conditions
271
// listed above get_stripe_boundary_info() are preserved.
272
// We save the pixels which get overwritten into a temporary buffer, so that
273
// they can be restored by restore_processing_stripe_boundary() after we've
274
// processed the stripe.
275
//
276
// limits gives the rectangular limits of the remaining stripes for the current
277
// restoration unit. rsb is the stored stripe boundaries (taken from either
278
// deblock or CDEF output as necessary).
279
//
280
// tile_rect is the limits of the current tile and tile_stripe0 is the index of
281
// the first stripe in this tile (needed to convert the tile-relative stripe
282
// index we get from limits into something we can look up in rsb).
283
static void setup_processing_stripe_boundary(
284
    const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
285
    int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
286
56.4k
    RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
287
  // Offsets within the line buffers. The buffer logically starts at column
288
  // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
289
  // has column x0 in the buffer.
290
56.4k
  const int buf_stride = rsb->stripe_boundary_stride;
291
56.4k
  const int buf_x0_off = limits->h_start;
292
56.4k
  const int line_width =
293
56.4k
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
294
56.4k
  const int line_size = line_width << use_highbd;
295
296
56.4k
  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
297
298
  // Replace RESTORATION_BORDER pixels above the top of the stripe
299
  // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
300
  // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
301
  // duplicating the topmost of the 2 lines (see the AOMMAX call when
302
  // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
303
  //
304
  // Special case: If we're at the top of a tile, which isn't on the topmost
305
  // tile row, and we're allowed to loop filter across tiles, then we have a
306
  // logical 64-pixel-high stripe which has been split into an 8-pixel high
307
  // stripe and a 56-pixel high stripe (the current one). So, in this case,
308
  // we want to leave the boundary alone!
309
56.4k
  if (!opt) {
310
42.0k
    if (copy_above) {
311
36.1k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
312
313
144k
      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
314
108k
        const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
315
108k
        const int buf_off = buf_x0_off + buf_row * buf_stride;
316
108k
        const uint8_t *buf =
317
108k
            rsb->stripe_boundary_above + (buf_off << use_highbd);
318
108k
        uint8_t *dst8 = data8_tl + i * data_stride;
319
        // Save old pixels, then replace with data from stripe_boundary_above
320
108k
        memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
321
108k
               REAL_PTR(use_highbd, dst8), line_size);
322
108k
        memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
323
108k
      }
324
36.1k
    }
325
326
    // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
327
    // The second buffer row is repeated, so src_row gets the values 0, 1, 1
328
    // for i = 0, 1, 2.
329
42.0k
    if (copy_below) {
330
32.9k
      const int stripe_end = limits->v_start + h;
331
32.9k
      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
332
333
131k
      for (int i = 0; i < RESTORATION_BORDER; ++i) {
334
98.8k
        const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
335
98.8k
        const int buf_off = buf_x0_off + buf_row * buf_stride;
336
98.8k
        const uint8_t *src =
337
98.8k
            rsb->stripe_boundary_below + (buf_off << use_highbd);
338
339
98.8k
        uint8_t *dst8 = data8_bl + i * data_stride;
340
        // Save old pixels, then replace with data from stripe_boundary_below
341
98.8k
        memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
342
98.8k
        memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
343
98.8k
      }
344
32.9k
    }
345
42.0k
  } else {
346
14.4k
    if (copy_above) {
347
13.2k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
348
349
      // Only save and overwrite i=-RESTORATION_BORDER line.
350
13.2k
      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
351
      // Save old pixels, then replace with data from stripe_boundary_above
352
13.2k
      memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
353
13.2k
      memcpy(REAL_PTR(use_highbd, dst8),
354
13.2k
             REAL_PTR(use_highbd,
355
13.2k
                      data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
356
13.2k
             line_size);
357
13.2k
    }
358
359
14.4k
    if (copy_below) {
360
13.2k
      const int stripe_end = limits->v_start + h;
361
13.2k
      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
362
363
      // Only save and overwrite i=2 line.
364
13.2k
      uint8_t *dst8 = data8_bl + 2 * data_stride;
365
      // Save old pixels, then replace with data from stripe_boundary_below
366
13.2k
      memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
367
13.2k
      memcpy(REAL_PTR(use_highbd, dst8),
368
13.2k
             REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
369
13.2k
    }
370
14.4k
  }
371
56.4k
}
372
373
// This function restores the boundary lines modified by
374
// setup_processing_stripe_boundary.
375
//
376
// Note: We need to be careful when handling the corners of the processing
377
// unit, because (eg.) the top-left corner is considered to be part of
378
// both the left and top borders. This means that, depending on the
379
// loop_filter_across_tiles_enabled flag, the corner pixels might get
380
// overwritten twice, once as part of the "top" border and once as part
381
// of the "left" border (or similar for other corners).
382
//
383
// Everything works out fine as long as we make sure to reverse the order
384
// when restoring, ie. we need to restore the left/right borders followed
385
// by the top/bottom borders.
386
static void restore_processing_stripe_boundary(
387
    const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
388
    int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
389
56.4k
    int copy_below, int opt) {
390
56.4k
  const int line_width =
391
56.4k
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
392
56.4k
  const int line_size = line_width << use_highbd;
393
394
56.4k
  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
395
396
56.4k
  if (!opt) {
397
42.0k
    if (copy_above) {
398
36.1k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
399
144k
      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
400
108k
        uint8_t *dst8 = data8_tl + i * data_stride;
401
108k
        memcpy(REAL_PTR(use_highbd, dst8),
402
108k
               rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
403
108k
      }
404
36.1k
    }
405
406
42.0k
    if (copy_below) {
407
32.9k
      const int stripe_bottom = limits->v_start + h;
408
32.9k
      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
409
410
131k
      for (int i = 0; i < RESTORATION_BORDER; ++i) {
411
98.8k
        if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
412
413
98.8k
        uint8_t *dst8 = data8_bl + i * data_stride;
414
98.8k
        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
415
98.8k
      }
416
32.9k
    }
417
42.0k
  } else {
418
14.4k
    if (copy_above) {
419
13.2k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
420
421
      // Only restore i=-RESTORATION_BORDER line.
422
13.2k
      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
423
13.2k
      memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
424
13.2k
    }
425
426
14.4k
    if (copy_below) {
427
13.2k
      const int stripe_bottom = limits->v_start + h;
428
13.2k
      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
429
430
      // Only restore i=2 line.
431
13.2k
      if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
432
13.2k
        uint8_t *dst8 = data8_bl + 2 * data_stride;
433
13.2k
        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
434
13.2k
      }
435
13.2k
    }
436
14.4k
  }
437
56.4k
}
438
439
static void wiener_filter_stripe(const RestorationUnitInfo *rui,
440
                                 int stripe_width, int stripe_height,
441
                                 int procunit_width, const uint8_t *src,
442
                                 int src_stride, uint8_t *dst, int dst_stride,
443
12.0k
                                 int32_t *tmpbuf, int bit_depth) {
444
12.0k
  (void)tmpbuf;
445
12.0k
  (void)bit_depth;
446
12.0k
  assert(bit_depth == 8);
447
12.0k
  const ConvolveParams conv_params = get_conv_params_wiener(8);
448
449
31.6k
  for (int j = 0; j < stripe_width; j += procunit_width) {
450
19.6k
    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
451
19.6k
    const uint8_t *src_p = src + j;
452
19.6k
    uint8_t *dst_p = dst + j;
453
19.6k
    av1_wiener_convolve_add_src(
454
19.6k
        src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
455
19.6k
        rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
456
19.6k
  }
457
12.0k
}
458
459
/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
460
   over the input. The window is of size (2r + 1)x(2r + 1), and we
461
   specialize to r = 1 and r = 2. Any other r is rejected by boxsum().
462
463
   Each loop follows the same format: We keep a window's worth of input
464
   in individual variables and select data out of that as appropriate.
465
*/
466
static void boxsum1(int32_t *src, int width, int height, int src_stride,
467
56.4k
                    int sqr, int32_t *dst, int dst_stride) {
468
56.4k
  int i, j, a, b, c;
469
56.4k
  assert(width > 2 * SGRPROJ_BORDER_HORZ);
470
56.4k
  assert(height > 2 * SGRPROJ_BORDER_VERT);
471
472
  // Vertical sum over 3-pixel regions, from src into dst.
473
56.4k
  if (!sqr) {
474
1.29M
    for (j = 0; j < width; ++j) {
475
1.26M
      a = src[j];
476
1.26M
      b = src[src_stride + j];
477
1.26M
      c = src[2 * src_stride + j];
478
479
1.26M
      dst[j] = a + b;
480
66.4M
      for (i = 1; i < height - 2; ++i) {
481
        // Loop invariant: At the start of each iteration,
482
        // a = src[(i - 1) * src_stride + j]
483
        // b = src[(i    ) * src_stride + j]
484
        // c = src[(i + 1) * src_stride + j]
485
65.1M
        dst[i * dst_stride + j] = a + b + c;
486
65.1M
        a = b;
487
65.1M
        b = c;
488
65.1M
        c = src[(i + 2) * src_stride + j];
489
65.1M
      }
490
1.26M
      dst[i * dst_stride + j] = a + b + c;
491
1.26M
      dst[(i + 1) * dst_stride + j] = b + c;
492
1.26M
    }
493
28.2k
  } else {
494
1.29M
    for (j = 0; j < width; ++j) {
495
1.26M
      a = src[j] * src[j];
496
1.26M
      b = src[src_stride + j] * src[src_stride + j];
497
1.26M
      c = src[2 * src_stride + j] * src[2 * src_stride + j];
498
499
1.26M
      dst[j] = a + b;
500
66.4M
      for (i = 1; i < height - 2; ++i) {
501
65.1M
        dst[i * dst_stride + j] = a + b + c;
502
65.1M
        a = b;
503
65.1M
        b = c;
504
65.1M
        c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
505
65.1M
      }
506
1.26M
      dst[i * dst_stride + j] = a + b + c;
507
1.26M
      dst[(i + 1) * dst_stride + j] = b + c;
508
1.26M
    }
509
28.2k
  }
510
511
  // Horizontal sum over 3-pixel regions of dst
512
3.15M
  for (i = 0; i < height; ++i) {
513
3.10M
    a = dst[i * dst_stride];
514
3.10M
    b = dst[i * dst_stride + 1];
515
3.10M
    c = dst[i * dst_stride + 2];
516
517
3.10M
    dst[i * dst_stride] = a + b;
518
131M
    for (j = 1; j < width - 2; ++j) {
519
      // Loop invariant: At the start of each iteration,
520
      // a = src[i * src_stride + (j - 1)]
521
      // b = src[i * src_stride + (j    )]
522
      // c = src[i * src_stride + (j + 1)]
523
128M
      dst[i * dst_stride + j] = a + b + c;
524
128M
      a = b;
525
128M
      b = c;
526
128M
      c = dst[i * dst_stride + (j + 2)];
527
128M
    }
528
3.10M
    dst[i * dst_stride + j] = a + b + c;
529
3.10M
    dst[i * dst_stride + (j + 1)] = b + c;
530
3.10M
  }
531
56.4k
}
532
533
static void boxsum2(int32_t *src, int width, int height, int src_stride,
534
49.5k
                    int sqr, int32_t *dst, int dst_stride) {
535
49.5k
  int i, j, a, b, c, d, e;
536
49.5k
  assert(width > 2 * SGRPROJ_BORDER_HORZ);
537
49.5k
  assert(height > 2 * SGRPROJ_BORDER_VERT);
538
539
  // Vertical sum over 5-pixel regions, from src into dst.
540
49.5k
  if (!sqr) {
541
1.11M
    for (j = 0; j < width; ++j) {
542
1.09M
      a = src[j];
543
1.09M
      b = src[src_stride + j];
544
1.09M
      c = src[2 * src_stride + j];
545
1.09M
      d = src[3 * src_stride + j];
546
1.09M
      e = src[4 * src_stride + j];
547
548
1.09M
      dst[j] = a + b + c;
549
1.09M
      dst[dst_stride + j] = a + b + c + d;
550
54.7M
      for (i = 2; i < height - 3; ++i) {
551
        // Loop invariant: At the start of each iteration,
552
        // a = src[(i - 2) * src_stride + j]
553
        // b = src[(i - 1) * src_stride + j]
554
        // c = src[(i    ) * src_stride + j]
555
        // d = src[(i + 1) * src_stride + j]
556
        // e = src[(i + 2) * src_stride + j]
557
53.6M
        dst[i * dst_stride + j] = a + b + c + d + e;
558
53.6M
        a = b;
559
53.6M
        b = c;
560
53.6M
        c = d;
561
53.6M
        d = e;
562
53.6M
        e = src[(i + 3) * src_stride + j];
563
53.6M
      }
564
1.09M
      dst[i * dst_stride + j] = a + b + c + d + e;
565
1.09M
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
566
1.09M
      dst[(i + 2) * dst_stride + j] = c + d + e;
567
1.09M
    }
568
24.7k
  } else {
569
1.11M
    for (j = 0; j < width; ++j) {
570
1.09M
      a = src[j] * src[j];
571
1.09M
      b = src[src_stride + j] * src[src_stride + j];
572
1.09M
      c = src[2 * src_stride + j] * src[2 * src_stride + j];
573
1.09M
      d = src[3 * src_stride + j] * src[3 * src_stride + j];
574
1.09M
      e = src[4 * src_stride + j] * src[4 * src_stride + j];
575
576
1.09M
      dst[j] = a + b + c;
577
1.09M
      dst[dst_stride + j] = a + b + c + d;
578
54.7M
      for (i = 2; i < height - 3; ++i) {
579
53.6M
        dst[i * dst_stride + j] = a + b + c + d + e;
580
53.6M
        a = b;
581
53.6M
        b = c;
582
53.6M
        c = d;
583
53.6M
        d = e;
584
53.6M
        e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
585
53.6M
      }
586
1.09M
      dst[i * dst_stride + j] = a + b + c + d + e;
587
1.09M
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
588
1.09M
      dst[(i + 2) * dst_stride + j] = c + d + e;
589
1.09M
    }
590
24.7k
  }
591
592
  // Horizontal sum over 5-pixel regions of dst
593
2.77M
  for (i = 0; i < height; ++i) {
594
2.72M
    a = dst[i * dst_stride];
595
2.72M
    b = dst[i * dst_stride + 1];
596
2.72M
    c = dst[i * dst_stride + 2];
597
2.72M
    d = dst[i * dst_stride + 3];
598
2.72M
    e = dst[i * dst_stride + 4];
599
600
2.72M
    dst[i * dst_stride] = a + b + c;
601
2.72M
    dst[i * dst_stride + 1] = a + b + c + d;
602
107M
    for (j = 2; j < width - 3; ++j) {
603
      // Loop invariant: At the start of each iteration,
604
      // a = src[i * src_stride + (j - 2)]
605
      // b = src[i * src_stride + (j - 1)]
606
      // c = src[i * src_stride + (j    )]
607
      // d = src[i * src_stride + (j + 1)]
608
      // e = src[i * src_stride + (j + 2)]
609
104M
      dst[i * dst_stride + j] = a + b + c + d + e;
610
104M
      a = b;
611
104M
      b = c;
612
104M
      c = d;
613
104M
      d = e;
614
104M
      e = dst[i * dst_stride + (j + 3)];
615
104M
    }
616
2.72M
    dst[i * dst_stride + j] = a + b + c + d + e;
617
2.72M
    dst[i * dst_stride + (j + 1)] = b + c + d + e;
618
2.72M
    dst[i * dst_stride + (j + 2)] = c + d + e;
619
2.72M
  }
620
49.5k
}
621
622
// Dispatches to the box-sum routine for radius r. Only r == 1 and r == 2
// are implemented; any other value trips an assert and leaves dst untouched.
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                   int sqr, int32_t *dst, int dst_stride) {
  switch (r) {
    case 1:
      boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    case 2:
      boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    default:
      assert(0 && "Invalid value of r in self-guided filter");
      break;
  }
}
631
632
33.4k
void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
633
33.4k
  if (params->r[0] == 0) {
634
8.68k
    xq[0] = 0;
635
8.68k
    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
636
24.7k
  } else if (params->r[1] == 0) {
637
5.23k
    xq[0] = xqd[0];
638
5.23k
    xq[1] = 0;
639
19.5k
  } else {
640
19.5k
    xq[0] = xqd[0];
641
19.5k
    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
642
19.5k
  }
643
33.4k
}
644
645
const int32_t av1_x_by_xplus1[256] = {
  // Index 0 is a special case: it maps to 1 (i.e. a value of 1/256) rather
  // than 0. The reason is given in the comments of
  // selfguided_restoration_internal().
  1,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
  240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
  248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
  250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  // Final entry (index 255).
  256,
};
667
668
// Reciprocal table: the entry at index i equals 4096 / (i + 1) rounded to
// the nearest integer, for the 25 values listed (divisors 1 through 25).
const int32_t av1_one_by_x[MAX_NELEM] = {
  4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
  293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
};
672
673
static void calculate_intermediate_result(int32_t *dgd, int width, int height,
674
                                          int dgd_stride, int bit_depth,
675
                                          int sgr_params_idx, int radius_idx,
676
52.9k
                                          int pass, int32_t *A, int32_t *B) {
677
52.9k
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
678
52.9k
  const int r = params->r[radius_idx];
679
52.9k
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
680
52.9k
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
681
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
682
  // leading to a significant speed improvement.
683
  // We also align the stride to a multiple of 16 bytes, for consistency
684
  // with the SIMD version of this function.
685
52.9k
  int buf_stride = ((width_ext + 3) & ~3) + 16;
686
52.9k
  const int step = pass == 0 ? 1 : 2;
687
52.9k
  int i, j;
688
689
52.9k
  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
690
52.9k
  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
691
52.9k
         "Need SGRPROJ_BORDER_* >= r+1");
692
693
52.9k
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
694
52.9k
         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
695
52.9k
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
696
52.9k
         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
697
52.9k
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
698
52.9k
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
699
  // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
700
  // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
701
2.12M
  for (i = -1; i < height + 1; i += step) {
702
85.0M
    for (j = -1; j < width + 1; ++j) {
703
83.0M
      const int k = i * buf_stride + j;
704
83.0M
      const int n = (2 * r + 1) * (2 * r + 1);
705
706
      // a < 2^16 * n < 2^22 regardless of bit depth
707
83.0M
      uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
708
      // b < 2^8 * n < 2^14 regardless of bit depth
709
83.0M
      uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
710
711
      // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
712
      // and p itself satisfies p < 2^14 * n^2 < 2^26.
713
      // This bound on p is due to:
714
      // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
715
      //
716
      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
717
      // This is an artefact of rounding, and can only happen if all pixels
718
      // are (almost) identical, so in this case we saturate to p=0.
719
83.0M
      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
720
721
83.0M
      const uint32_t s = params->s[radius_idx];
722
723
      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
724
      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
725
      // (this holds even after accounting for the rounding in s)
726
83.0M
      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
727
728
      // Note: We have to be quite careful about the value of A[k].
729
      // This is used as a blend factor between individual pixel values and the
730
      // local mean. So it logically has a range of [0, 256], including both
731
      // endpoints.
732
      //
733
      // This is a pain for hardware, as we'd like something which can be stored
734
      // in exactly 8 bits.
735
      // Further, in the calculation of B[k] below, if z == 0 and r == 2,
736
      // then A[k] "should be" 0. But then we can end up setting B[k] to a value
737
      // slightly above 2^(8 + bit depth), due to rounding in the value of
738
      // av1_one_by_x[25-1].
739
      //
740
      // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
741
      // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
742
      // overflow), without significantly affecting the final result: z == 0
743
      // implies that the image is essentially "flat", so the local mean and
744
      // individual pixel values are very similar.
745
      //
746
      // Note that saturating on the other side, ie. requring A[k] <= 255,
747
      // would be a bad idea, as that corresponds to the case where the image
748
      // is very variable, when we want to preserve the local pixel value as
749
      // much as possible.
750
83.0M
      A[k] = av1_x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
751
752
      // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
753
      // av1_one_by_x[n - 1] = round(2^12 / n)
754
      // => the product here is < 2^(20 + bit_depth) <= 2^32,
755
      // and B[k] is set to a value < 2^(8 + bit depth)
756
      // This holds even with the rounding in av1_one_by_x and in the overall
757
      // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
758
83.0M
      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
759
83.0M
                                             (uint32_t)B[k] *
760
83.0M
                                             (uint32_t)av1_one_by_x[n - 1],
761
83.0M
                                         SGRPROJ_RECIP_BITS);
762
83.0M
    }
763
2.07M
  }
764
52.9k
}
765
766
static void selfguided_restoration_fast_internal(
767
    int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
768
24.7k
    int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
769
24.7k
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
770
24.7k
  const int r = params->r[radius_idx];
771
24.7k
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
772
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
773
  // leading to a significant speed improvement.
774
  // We also align the stride to a multiple of 16 bytes, for consistency
775
  // with the SIMD version of this function.
776
24.7k
  int buf_stride = ((width_ext + 3) & ~3) + 16;
777
24.7k
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
778
24.7k
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
779
24.7k
  int32_t *A = A_;
780
24.7k
  int32_t *B = B_;
781
24.7k
  int i, j;
782
24.7k
  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
783
24.7k
                                sgr_params_idx, radius_idx, 1, A, B);
784
24.7k
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
785
24.7k
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
786
787
  // Use the A[] and B[] arrays to calculate the filtered image
788
24.7k
  (void)r;
789
24.7k
  assert(r == 2);
790
1.24M
  for (i = 0; i < height; ++i) {
791
1.21M
    if (!(i & 1)) {  // even row
792
23.2M
      for (j = 0; j < width; ++j) {
793
22.6M
        const int k = i * buf_stride + j;
794
22.6M
        const int l = i * dgd_stride + j;
795
22.6M
        const int m = i * dst_stride + j;
796
22.6M
        const int nb = 5;
797
22.6M
        const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
798
22.6M
                          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
799
22.6M
                           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
800
22.6M
                              5;
801
22.6M
        const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
802
22.6M
                          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
803
22.6M
                           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
804
22.6M
                              5;
805
22.6M
        const int32_t v = a * dgd[l] + b;
806
22.6M
        dst[m] =
807
22.6M
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
808
22.6M
      }
809
608k
    } else {  // odd row
810
23.1M
      for (j = 0; j < width; ++j) {
811
22.5M
        const int k = i * buf_stride + j;
812
22.5M
        const int l = i * dgd_stride + j;
813
22.5M
        const int m = i * dst_stride + j;
814
22.5M
        const int nb = 4;
815
22.5M
        const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
816
22.5M
        const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
817
22.5M
        const int32_t v = a * dgd[l] + b;
818
22.5M
        dst[m] =
819
22.5M
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
820
22.5M
      }
821
606k
    }
822
1.21M
  }
823
24.7k
}
824
825
static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
826
                                            int dgd_stride, int32_t *dst,
827
                                            int dst_stride, int bit_depth,
828
                                            int sgr_params_idx,
829
28.2k
                                            int radius_idx) {
830
28.2k
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
831
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
832
  // leading to a significant speed improvement.
833
  // We also align the stride to a multiple of 16 bytes, for consistency
834
  // with the SIMD version of this function.
835
28.2k
  int buf_stride = ((width_ext + 3) & ~3) + 16;
836
28.2k
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
837
28.2k
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
838
28.2k
  int32_t *A = A_;
839
28.2k
  int32_t *B = B_;
840
28.2k
  int i, j;
841
28.2k
  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
842
28.2k
                                sgr_params_idx, radius_idx, 0, A, B);
843
28.2k
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
844
28.2k
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
845
846
  // Use the A[] and B[] arrays to calculate the filtered image
847
1.40M
  for (i = 0; i < height; ++i) {
848
54.4M
    for (j = 0; j < width; ++j) {
849
53.0M
      const int k = i * buf_stride + j;
850
53.0M
      const int l = i * dgd_stride + j;
851
53.0M
      const int m = i * dst_stride + j;
852
53.0M
      const int nb = 5;
853
53.0M
      const int32_t a =
854
53.0M
          (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
855
53.0M
              4 +
856
53.0M
          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
857
53.0M
           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
858
53.0M
              3;
859
53.0M
      const int32_t b =
860
53.0M
          (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
861
53.0M
              4 +
862
53.0M
          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
863
53.0M
           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
864
53.0M
              3;
865
53.0M
      const int32_t v = a * dgd[l] + b;
866
53.0M
      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
867
53.0M
    }
868
1.38M
  }
869
28.2k
}
870
871
int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
872
                                 int dgd_stride, int32_t *flt0, int32_t *flt1,
873
                                 int flt_stride, int sgr_params_idx,
874
33.4k
                                 int bit_depth, int highbd) {
875
33.4k
  int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
876
33.4k
  const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
877
33.4k
  int32_t *dgd32 =
878
33.4k
      dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
879
880
33.4k
  if (highbd) {
881
14.7k
    const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
882
783k
    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
883
34.5M
      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
884
33.7M
        dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
885
33.7M
      }
886
768k
    }
887
18.6k
  } else {
888
1.08M
    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
889
46.9M
      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
890
45.9M
        dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
891
45.9M
      }
892
1.06M
    }
893
18.6k
  }
894
895
33.4k
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
896
  // If params->r == 0 we skip the corresponding filter. We only allow one of
897
  // the radii to be 0, as having both equal to 0 would be equivalent to
898
  // skipping SGR entirely.
899
33.4k
  assert(!(params->r[0] == 0 && params->r[1] == 0));
900
901
33.4k
  if (params->r[0] > 0)
902
24.7k
    selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
903
24.7k
                                         flt0, flt_stride, bit_depth,
904
24.7k
                                         sgr_params_idx, 0);
905
33.4k
  if (params->r[1] > 0)
906
28.2k
    selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
907
28.2k
                                    flt_stride, bit_depth, sgr_params_idx, 1);
908
33.4k
  return 0;
909
33.4k
}
910
911
void av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
912
                                        int height, int stride, int eps,
913
                                        const int *xqd, uint8_t *dst8,
914
                                        int dst_stride, int32_t *tmpbuf,
915
33.4k
                                        int bit_depth, int highbd) {
916
33.4k
  int32_t *flt0 = tmpbuf;
917
33.4k
  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
918
33.4k
  assert(width * height <= RESTORATION_UNITPELS_MAX);
919
920
33.4k
  const int ret = av1_selfguided_restoration_c(
921
33.4k
      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
922
33.4k
  (void)ret;
923
33.4k
  assert(!ret);
924
33.4k
  const sgr_params_type *const params = &av1_sgr_params[eps];
925
33.4k
  int xq[2];
926
33.4k
  av1_decode_xq(xqd, xq, params);
927
1.66M
  for (int i = 0; i < height; ++i) {
928
62.6M
    for (int j = 0; j < width; ++j) {
929
60.9M
      const int k = i * width + j;
930
60.9M
      uint8_t *dst8ij = dst8 + i * dst_stride + j;
931
60.9M
      const uint8_t *dat8ij = dat8 + i * stride + j;
932
933
60.9M
      const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
934
60.9M
      const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
935
60.9M
      int32_t v = u << SGRPROJ_PRJ_BITS;
936
      // If params->r == 0 then we skipped the filtering in
937
      // av1_selfguided_restoration_c, i.e. flt[k] == u
938
60.9M
      if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
939
60.9M
      if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
940
60.9M
      const int16_t w =
941
60.9M
          (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
942
943
60.9M
      const uint16_t out = clip_pixel_highbd(w, bit_depth);
944
60.9M
      if (highbd)
945
25.6M
        *CONVERT_TO_SHORTPTR(dst8ij) = out;
946
35.2M
      else
947
35.2M
        *dst8ij = (uint8_t)out;
948
60.9M
    }
949
1.63M
  }
950
33.4k
}
951
952
static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
953
                                  int stripe_width, int stripe_height,
954
                                  int procunit_width, const uint8_t *src,
955
                                  int src_stride, uint8_t *dst, int dst_stride,
956
10.3k
                                  int32_t *tmpbuf, int bit_depth) {
957
10.3k
  (void)bit_depth;
958
10.3k
  assert(bit_depth == 8);
959
960
29.0k
  for (int j = 0; j < stripe_width; j += procunit_width) {
961
18.6k
    int w = AOMMIN(procunit_width, stripe_width - j);
962
18.6k
    av1_apply_selfguided_restoration(
963
18.6k
        src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
964
18.6k
        rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth, 0);
965
18.6k
  }
966
10.3k
}
967
968
#if CONFIG_AV1_HIGHBITDEPTH
969
static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui,
970
                                        int stripe_width, int stripe_height,
971
                                        int procunit_width, const uint8_t *src8,
972
                                        int src_stride, uint8_t *dst8,
973
                                        int dst_stride, int32_t *tmpbuf,
974
26.1k
                                        int bit_depth) {
975
26.1k
  (void)tmpbuf;
976
26.1k
  const ConvolveParams conv_params = get_conv_params_wiener(bit_depth);
977
978
61.5k
  for (int j = 0; j < stripe_width; j += procunit_width) {
979
35.4k
    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
980
35.4k
    const uint8_t *src8_p = src8 + j;
981
35.4k
    uint8_t *dst8_p = dst8 + j;
982
35.4k
    av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
983
35.4k
                                       rui->wiener_info.hfilter, 16,
984
35.4k
                                       rui->wiener_info.vfilter, 16, w,
985
35.4k
                                       stripe_height, &conv_params, bit_depth);
986
35.4k
  }
987
26.1k
}
988
989
static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui,
990
                                         int stripe_width, int stripe_height,
991
                                         int procunit_width,
992
                                         const uint8_t *src8, int src_stride,
993
                                         uint8_t *dst8, int dst_stride,
994
8.03k
                                         int32_t *tmpbuf, int bit_depth) {
995
22.7k
  for (int j = 0; j < stripe_width; j += procunit_width) {
996
14.7k
    int w = AOMMIN(procunit_width, stripe_width - j);
997
14.7k
    av1_apply_selfguided_restoration(
998
14.7k
        src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
999
14.7k
        rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
1000
14.7k
  }
1001
8.03k
}
1002
#endif  // CONFIG_AV1_HIGHBITDEPTH
1003
1004
// Common signature of the per-stripe filter functions stored in
// stripe_filters[]: filter a stripe_width x stripe_height stripe from src to
// dst, in column chunks of at most procunit_width pixels. tmpbuf is scratch
// space that individual filters may ignore.
typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
1005
                                  int stripe_width, int stripe_height,
1006
                                  int procunit_width, const uint8_t *src,
1007
                                  int src_stride, uint8_t *dst, int dst_stride,
1008
                                  int32_t *tmpbuf, int bit_depth);
1009
1010
// Dispatch table for the stripe filters. av1_loop_restoration_filter_unit()
// indexes it with 2 * highbd + (restoration type == RESTORE_SGRPROJ), so the
// order is: Wiener, SGR, Wiener high-bit-depth, SGR high-bit-depth. The two
// high-bit-depth entries only exist when CONFIG_AV1_HIGHBITDEPTH is set.
#if CONFIG_AV1_HIGHBITDEPTH
1011
#define NUM_STRIPE_FILTERS 4
1012
static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
1013
  wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
1014
  sgrproj_filter_stripe_highbd
1015
};
1016
#else
1017
#define NUM_STRIPE_FILTERS 2
1018
static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
1019
  wiener_filter_stripe, sgrproj_filter_stripe
1020
};
1021
#endif  // CONFIG_AV1_HIGHBITDEPTH
1022
1023
// Filter one restoration unit
1024
void av1_loop_restoration_filter_unit(
1025
    const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
1026
    const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
1027
    const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
1028
    int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
1029
46.0k
    int dst_stride, int32_t *tmpbuf, int optimized_lr) {
1030
46.0k
  RestorationType unit_rtype = rui->restoration_type;
1031
1032
46.0k
  int unit_h = limits->v_end - limits->v_start;
1033
46.0k
  int unit_w = limits->h_end - limits->h_start;
1034
46.0k
  uint8_t *data8_tl = data8 + limits->v_start * stride + limits->h_start;
1035
46.0k
  uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start;
1036
1037
46.0k
  if (unit_rtype == RESTORE_NONE) {
1038
17.6k
    copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd);
1039
17.6k
    return;
1040
17.6k
  }
1041
1042
28.3k
  const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
1043
28.3k
  assert(filter_idx < NUM_STRIPE_FILTERS);
1044
28.3k
  const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
1045
1046
28.3k
  const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
1047
1048
  // Convolve the whole tile one stripe at a time
1049
28.3k
  RestorationTileLimits remaining_stripes = *limits;
1050
28.3k
  int i = 0;
1051
84.8k
  while (i < unit_h) {
1052
56.4k
    int copy_above, copy_below;
1053
56.4k
    remaining_stripes.v_start = limits->v_start + i;
1054
1055
56.4k
    get_stripe_boundary_info(&remaining_stripes, tile_rect, ss_y, &copy_above,
1056
56.4k
                             &copy_below);
1057
1058
56.4k
    const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1059
56.4k
    const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
1060
1061
    // Work out where this stripe's boundaries are within
1062
    // rsb->stripe_boundary_{above,below}
1063
56.4k
    const int tile_stripe =
1064
56.4k
        (remaining_stripes.v_start - tile_rect->top + runit_offset) /
1065
56.4k
        full_stripe_height;
1066
56.4k
    const int frame_stripe = tile_stripe0 + tile_stripe;
1067
56.4k
    const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
1068
1069
    // Calculate this stripe's height, based on two rules:
1070
    // * The topmost stripe in each tile is 8 luma pixels shorter than usual.
1071
    // * We can't extend past the end of the current restoration unit
1072
56.4k
    const int nominal_stripe_height =
1073
56.4k
        full_stripe_height - ((tile_stripe == 0) ? runit_offset : 0);
1074
56.4k
    const int h = AOMMIN(nominal_stripe_height,
1075
56.4k
                         remaining_stripes.v_end - remaining_stripes.v_start);
1076
1077
56.4k
    setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
1078
56.4k
                                     h, data8, stride, rlbs, copy_above,
1079
56.4k
                                     copy_below, optimized_lr);
1080
1081
56.4k
    stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
1082
56.4k
                  dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth);
1083
1084
56.4k
    restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
1085
56.4k
                                       data8, stride, copy_above, copy_below,
1086
56.4k
                                       optimized_lr);
1087
1088
56.4k
    i += h;
1089
56.4k
  }
1090
28.3k
}
1091
1092
static void filter_frame_on_unit(const RestorationTileLimits *limits,
1093
                                 const AV1PixelRect *tile_rect,
1094
                                 int rest_unit_idx, void *priv, int32_t *tmpbuf,
1095
46.0k
                                 RestorationLineBuffers *rlbs) {
1096
46.0k
  FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
1097
46.0k
  const RestorationInfo *rsi = ctxt->rsi;
1098
1099
46.0k
  av1_loop_restoration_filter_unit(
1100
46.0k
      limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs, tile_rect,
1101
46.0k
      ctxt->tile_stripe0, ctxt->ss_x, ctxt->ss_y, ctxt->highbd, ctxt->bit_depth,
1102
46.0k
      ctxt->data8, ctxt->data_stride, ctxt->dst8, ctxt->dst_stride, tmpbuf,
1103
46.0k
      rsi->optimized_lr);
1104
46.0k
}
1105
1106
void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
1107
                                            YV12_BUFFER_CONFIG *frame,
1108
                                            AV1_COMMON *cm, int optimized_lr,
1109
7.24k
                                            int num_planes) {
1110
7.24k
  const SequenceHeader *const seq_params = cm->seq_params;
1111
7.24k
  const int bit_depth = seq_params->bit_depth;
1112
7.24k
  const int highbd = seq_params->use_highbitdepth;
1113
7.24k
  lr_ctxt->dst = &cm->rst_frame;
1114
1115
7.24k
  const int frame_width = frame->crop_widths[0];
1116
7.24k
  const int frame_height = frame->crop_heights[0];
1117
7.24k
  if (aom_realloc_frame_buffer(
1118
7.24k
          lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
1119
7.24k
          seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
1120
7.24k
          cm->features.byte_alignment, NULL, NULL, NULL, 0) < 0)
1121
0
    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
1122
0
                       "Failed to allocate restoration dst buffer");
1123
1124
7.24k
  lr_ctxt->on_rest_unit = filter_frame_on_unit;
1125
7.24k
  lr_ctxt->frame = frame;
1126
27.9k
  for (int plane = 0; plane < num_planes; ++plane) {
1127
20.7k
    RestorationInfo *rsi = &cm->rst_info[plane];
1128
20.7k
    RestorationType rtype = rsi->frame_restoration_type;
1129
20.7k
    rsi->optimized_lr = optimized_lr;
1130
1131
20.7k
    if (rtype == RESTORE_NONE) {
1132
8.42k
      continue;
1133
8.42k
    }
1134
1135
12.2k
    const int is_uv = plane > 0;
1136
12.2k
    const int plane_width = frame->crop_widths[is_uv];
1137
12.2k
    const int plane_height = frame->crop_heights[is_uv];
1138
12.2k
    FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
1139
1140
12.2k
    av1_extend_frame(frame->buffers[plane], plane_width, plane_height,
1141
12.2k
                     frame->strides[is_uv], RESTORATION_BORDER,
1142
12.2k
                     RESTORATION_BORDER, highbd);
1143
1144
12.2k
    lr_plane_ctxt->rsi = rsi;
1145
12.2k
    lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
1146
12.2k
    lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
1147
12.2k
    lr_plane_ctxt->highbd = highbd;
1148
12.2k
    lr_plane_ctxt->bit_depth = bit_depth;
1149
12.2k
    lr_plane_ctxt->data8 = frame->buffers[plane];
1150
12.2k
    lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
1151
12.2k
    lr_plane_ctxt->data_stride = frame->strides[is_uv];
1152
12.2k
    lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
1153
12.2k
    lr_plane_ctxt->tile_rect = av1_whole_frame_rect(cm, is_uv);
1154
12.2k
    lr_plane_ctxt->tile_stripe0 = 0;
1155
12.2k
  }
1156
7.24k
}
1157
1158
void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
1159
7.24k
                                      AV1_COMMON *cm, int num_planes) {
1160
7.24k
  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
1161
7.24k
                           YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
1162
7.24k
                           int vstart, int vend);
1163
7.24k
  static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y,
1164
7.24k
                                         aom_yv12_partial_coloc_copy_u,
1165
7.24k
                                         aom_yv12_partial_coloc_copy_v };
1166
7.24k
  assert(num_planes <= 3);
1167
27.9k
  for (int plane = 0; plane < num_planes; ++plane) {
1168
20.7k
    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
1169
12.2k
    AV1PixelRect tile_rect = loop_rest_ctxt->ctxt[plane].tile_rect;
1170
12.2k
    copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, tile_rect.left,
1171
12.2k
                     tile_rect.right, tile_rect.top, tile_rect.bottom);
1172
12.2k
  }
1173
7.24k
}
1174
1175
// Runs lr_ctxt->on_rest_unit over every restoration unit of every plane
// whose frame restoration type is not RESTORE_NONE, using cm->rst_tmpbuf and
// cm->rlbs as scratch storage.
static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
                                        int num_planes) {
  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
      FilterFrameCtxt *const plane_ctxt = &lr_ctxt->ctxt[plane];
      av1_foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
                                     plane_ctxt, &plane_ctxt->tile_rect,
                                     cm->rst_tmpbuf, cm->rlbs);
    }
  }
}
1189
1190
void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
1191
                                       AV1_COMMON *cm, int optimized_lr,
1192
7.24k
                                       void *lr_ctxt) {
1193
7.24k
  assert(!cm->features.all_lossless);
1194
7.24k
  const int num_planes = av1_num_planes(cm);
1195
1196
7.24k
  AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
1197
1198
7.24k
  av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
1199
7.24k
                                         optimized_lr, num_planes);
1200
1201
7.24k
  foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes);
1202
1203
7.24k
  av1_loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes);
1204
7.24k
}
1205
1206
void av1_foreach_rest_unit_in_row(
1207
    RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
1208
    rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
1209
    int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
1210
    void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
1211
    sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
1212
34.5k
    struct AV1LrSyncData *const lr_sync) {
1213
34.5k
  const int tile_w = tile_rect->right - tile_rect->left;
1214
34.5k
  const int ext_size = unit_size * 3 / 2;
1215
34.5k
  int x0 = 0, j = 0;
1216
80.6k
  while (x0 < tile_w) {
1217
46.0k
    int remaining_w = tile_w - x0;
1218
46.0k
    int w = (remaining_w < ext_size) ? remaining_w : unit_size;
1219
1220
46.0k
    limits->h_start = tile_rect->left + x0;
1221
46.0k
    limits->h_end = tile_rect->left + x0 + w;
1222
46.0k
    assert(limits->h_end <= tile_rect->right);
1223
1224
46.0k
    const int unit_idx = unit_idx0 + row_number * hunits_per_tile + j;
1225
1226
    // No sync for even numbered rows
1227
    // For odd numbered rows, Loop Restoration of current block requires the LR
1228
    // of top-right and bottom-right blocks to be completed
1229
1230
    // top-right sync
1231
46.0k
    on_sync_read(lr_sync, row_number, j, plane);
1232
46.0k
    if ((row_number + 1) < vunits_per_tile)
1233
      // bottom-right sync
1234
26.3k
      on_sync_read(lr_sync, row_number + 2, j, plane);
1235
1236
46.0k
    on_rest_unit(limits, tile_rect, unit_idx, priv, tmpbuf, rlbs);
1237
1238
46.0k
    on_sync_write(lr_sync, row_number, j, hunits_per_tile, plane);
1239
1240
46.0k
    x0 += w;
1241
46.0k
    ++j;
1242
46.0k
  }
1243
34.5k
}
1244
1245
72.3k
// No-op sync-read hook: accepts the same arguments as a real hook and
// ignores all of them.
void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
  // Nothing to wait for; the casts only silence unused-parameter warnings.
  (void)plane;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1251
1252
// No-op sync-write hook: accepts the same arguments as a real hook and
// ignores all of them.
void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
                             const int sb_cols, int plane) {
  // Nothing to signal; the casts only silence unused-parameter warnings.
  (void)plane;
  (void)sb_cols;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1260
1261
static void foreach_rest_unit_in_tile(
1262
    const AV1PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols,
1263
    int hunits_per_tile, int vunits_per_tile, int units_per_tile, int unit_size,
1264
    int ss_y, int plane, rest_unit_visitor_t on_rest_unit, void *priv,
1265
12.2k
    int32_t *tmpbuf, RestorationLineBuffers *rlbs) {
1266
12.2k
  const int tile_h = tile_rect->bottom - tile_rect->top;
1267
12.2k
  const int ext_size = unit_size * 3 / 2;
1268
1269
12.2k
  const int tile_idx = tile_col + tile_row * tile_cols;
1270
12.2k
  const int unit_idx0 = tile_idx * units_per_tile;
1271
1272
12.2k
  int y0 = 0, i = 0;
1273
46.8k
  while (y0 < tile_h) {
1274
34.5k
    int remaining_h = tile_h - y0;
1275
34.5k
    int h = (remaining_h < ext_size) ? remaining_h : unit_size;
1276
1277
34.5k
    RestorationTileLimits limits;
1278
34.5k
    limits.v_start = tile_rect->top + y0;
1279
34.5k
    limits.v_end = tile_rect->top + y0 + h;
1280
34.5k
    assert(limits.v_end <= tile_rect->bottom);
1281
    // Offset the tile upwards to align with the restoration processing stripe
1282
34.5k
    const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
1283
34.5k
    limits.v_start = AOMMAX(tile_rect->top, limits.v_start - voffset);
1284
34.5k
    if (limits.v_end < tile_rect->bottom) limits.v_end -= voffset;
1285
1286
34.5k
    av1_foreach_rest_unit_in_row(
1287
34.5k
        &limits, tile_rect, on_rest_unit, i, unit_size, unit_idx0,
1288
34.5k
        hunits_per_tile, vunits_per_tile, plane, priv, tmpbuf, rlbs,
1289
34.5k
        av1_lr_sync_read_dummy, av1_lr_sync_write_dummy, NULL);
1290
1291
34.5k
    y0 += h;
1292
34.5k
    ++i;
1293
34.5k
  }
1294
12.2k
}
1295
1296
void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
1297
                                    rest_unit_visitor_t on_rest_unit,
1298
                                    void *priv, AV1PixelRect *tile_rect,
1299
                                    int32_t *tmpbuf,
1300
12.2k
                                    RestorationLineBuffers *rlbs) {
1301
12.2k
  const int is_uv = plane > 0;
1302
12.2k
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
1303
1304
12.2k
  const RestorationInfo *rsi = &cm->rst_info[plane];
1305
1306
12.2k
  foreach_rest_unit_in_tile(tile_rect, LR_TILE_ROW, LR_TILE_COL, LR_TILE_COLS,
1307
12.2k
                            rsi->horz_units_per_tile, rsi->vert_units_per_tile,
1308
12.2k
                            rsi->units_per_tile, rsi->restoration_unit_size,
1309
12.2k
                            ss_y, plane, on_rest_unit, priv, tmpbuf, rlbs);
1310
12.2k
}
1311
1312
// Computes the half-open range of restoration units,
// [*rcol0, *rcol1) x [*rrow0, *rrow1), whose top-left corners fall inside the
// superblock at (mi_row, mi_col) of the given plane.
//
// Returns 0 without touching the outputs when bsize is not the sequence's
// superblock size or when the plane's frame restoration type is RESTORE_NONE.
// Otherwise the four outputs are written and the result is nonzero iff the
// range is non-empty in both directions.
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
                                       int *rcol0, int *rcol1, int *rrow0,
                                       int *rrow1) {
  assert(rcol0 && rcol1 && rrow0 && rrow1);

  if (bsize != cm->seq_params->sb_size) return 0;

  const RestorationInfo *const rsi = &cm->rst_info[plane];
  if (rsi->frame_restoration_type == RESTORE_NONE) return 0;

  assert(!cm->features.all_lossless);

  const int is_uv = plane > 0;

  // The whole frame acts as the one and only tile, so its origin is mi (0, 0)
  // and mi_row / mi_col are already tile-relative.
  const AV1PixelRect frame_rect = av1_whole_frame_rect(cm, is_uv);
  const int frame_w = frame_rect.right - frame_rect.left;
  const int frame_h = frame_rect.bottom - frame_rect.top;

  const int sb_row_end = mi_row + mi_size_high[bsize];
  const int sb_col_end = mi_col + mi_size_wide[bsize];

  // Number of units actually present, which may be fewer than
  // rsi->horz_units_per_tile / rsi->vert_units_per_tile.
  const int unit_size = rsi->restoration_unit_size;
  const int max_cols = av1_lr_count_units_in_tile(unit_size, frame_w);
  const int max_rows = av1_lr_count_units_in_tile(unit_size, frame_h);

  // Size of one MI unit, in pixels of this plane.
  const int ss_x = is_uv && cm->seq_params->subsampling_x;
  const int ss_y = is_uv && cm->seq_params->subsampling_y;

  // Unit index = mi * num / den. Horizontally, superres maps a downscaled
  // offset MI_SIZE * m to the upscaled offset u = D * MI_SIZE * m / N (D the
  // superres denominator, N the numerator), so D goes into the numerator and
  // N into the denominator. The vertical direction is never scaled.
  int num_x = MI_SIZE >> ss_x;
  int den_x = unit_size;
  if (av1_superres_scaled(cm)) {
    num_x = num_x * cm->superres_scale_denominator;
    den_x = den_x * SCALE_NUMERATOR;
  }
  const int num_y = MI_SIZE >> ss_y;
  const int den_y = unit_size;

  // Divisions round up: a superblock starting at unit column 10.1 has unit 11
  // as the first unit that starts inside it.
  const int round_x = den_x - 1;
  const int round_y = den_y - 1;
  const int col_begin = (mi_col * num_x + round_x) / den_x;
  const int row_begin = (mi_row * num_y + round_y) / den_y;

  // Same calculation for the superblock to the below-right; that unit may not
  // exist at the right/bottom edge, hence the clamp further down.
  const int col_limit = (sb_col_end * num_x + round_x) / den_x;
  const int row_limit = (sb_row_end * num_y + round_y) / den_y;

  *rcol0 = col_begin;
  *rrow0 = row_begin;
  *rcol1 = AOMMIN(col_limit, max_cols);
  *rrow1 = AOMMIN(row_limit, max_rows);

  return *rcol0 < *rcol1 && *rrow0 < *rrow1;
}
1386
1387
// Pads `height` rows on both sides by replicating each row's first and last
// pixel `extend` times. `width` and `extend` are counted in pixels, whereas
// `stride` is added to the uint8_t pointer and is therefore in bytes. With
// use_highbitdepth set, each row is accessed as uint16_t samples.
static void extend_lines(uint8_t *buf, int width, int height, int stride,
                         int extend, int use_highbitdepth) {
  if (use_highbitdepth) {
    for (int row = 0; row < height; ++row, buf += stride) {
      uint16_t *const px = (uint16_t *)buf;
      aom_memset16(px - extend, px[0], extend);
      aom_memset16(px + width, px[width - 1], extend);
    }
    return;
  }

  for (int row = 0; row < height; ++row, buf += stride) {
    memset(buf - extend, buf[0], extend);
    memset(buf + width, buf[width - 1], extend);
  }
}
1402
1403
static void save_deblock_boundary_lines(
1404
    const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
1405
    int stripe, int use_highbd, int is_above,
1406
173k
    RestorationStripeBoundaries *boundaries) {
1407
173k
  const int is_uv = plane > 0;
1408
173k
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
1409
173k
  const int src_stride = frame->strides[is_uv] << use_highbd;
1410
173k
  const uint8_t *src_rows = src_buf + row * src_stride;
1411
1412
173k
  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
1413
173k
                               : boundaries->stripe_boundary_below;
1414
173k
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
1415
173k
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
1416
173k
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
1417
1418
  // There is a rare case in which a processing stripe can end 1px above the
1419
  // crop border. In this case, we do want to use deblocked pixels from below
1420
  // the stripe (hence why we ended up in this function), but instead of
1421
  // fetching 2 "below" rows we need to fetch one and duplicate it.
1422
  // This is equivalent to clamping the sample locations against the crop border
1423
173k
  const int lines_to_save =
1424
173k
      AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row);
1425
173k
  assert(lines_to_save == 1 || lines_to_save == 2);
1426
1427
173k
  int upscaled_width;
1428
173k
  int line_bytes;
1429
173k
  if (av1_superres_scaled(cm)) {
1430
147k
    const int ss_x = is_uv && cm->seq_params->subsampling_x;
1431
147k
    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
1432
147k
    line_bytes = upscaled_width << use_highbd;
1433
147k
    if (use_highbd)
1434
119k
      av1_upscale_normative_rows(
1435
119k
          cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv],
1436
119k
          CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride,
1437
119k
          plane, lines_to_save);
1438
28.0k
    else
1439
28.0k
      av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows,
1440
28.0k
                                 boundaries->stripe_boundary_stride, plane,
1441
28.0k
                                 lines_to_save);
1442
147k
  } else {
1443
25.4k
    upscaled_width = frame->crop_widths[is_uv];
1444
25.4k
    line_bytes = upscaled_width << use_highbd;
1445
76.2k
    for (int i = 0; i < lines_to_save; i++) {
1446
50.8k
      memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride,
1447
50.8k
             line_bytes);
1448
50.8k
    }
1449
25.4k
  }
1450
  // If we only saved one line, then copy it into the second line buffer
1451
173k
  if (lines_to_save == 1)
1452
36
    memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
1453
1454
173k
  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
1455
173k
               RESTORATION_EXTRA_HORZ, use_highbd);
1456
173k
}
1457
1458
static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1459
                                     const AV1_COMMON *cm, int plane, int row,
1460
                                     int stripe, int use_highbd, int is_above,
1461
37.5k
                                     RestorationStripeBoundaries *boundaries) {
1462
37.5k
  const int is_uv = plane > 0;
1463
37.5k
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
1464
37.5k
  const int src_stride = frame->strides[is_uv] << use_highbd;
1465
37.5k
  const uint8_t *src_rows = src_buf + row * src_stride;
1466
1467
37.5k
  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
1468
37.5k
                               : boundaries->stripe_boundary_below;
1469
37.5k
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
1470
37.5k
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
1471
37.5k
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
1472
37.5k
  const int src_width = frame->crop_widths[is_uv];
1473
1474
  // At the point where this function is called, we've already applied
1475
  // superres. So we don't need to extend the lines here, we can just
1476
  // pull directly from the topmost row of the upscaled frame.
1477
37.5k
  const int ss_x = is_uv && cm->seq_params->subsampling_x;
1478
37.5k
  const int upscaled_width = av1_superres_scaled(cm)
1479
37.5k
                                 ? (cm->superres_upscaled_width + ss_x) >> ss_x
1480
37.5k
                                 : src_width;
1481
37.5k
  const int line_bytes = upscaled_width << use_highbd;
1482
112k
  for (int i = 0; i < RESTORATION_CTX_VERT; i++) {
1483
    // Copy the line at 'row' into both context lines. This is because
1484
    // we want to (effectively) extend the outermost row of CDEF data
1485
    // from this tile to produce a border, rather than using deblocked
1486
    // pixels from the tile above/below.
1487
75.1k
    memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
1488
75.1k
  }
1489
37.5k
  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
1490
37.5k
               RESTORATION_EXTRA_HORZ, use_highbd);
1491
37.5k
}
1492
1493
static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1494
                                         int use_highbd, int plane,
1495
37.5k
                                         AV1_COMMON *cm, int after_cdef) {
1496
37.5k
  const int is_uv = plane > 0;
1497
37.5k
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
1498
37.5k
  const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1499
37.5k
  const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
1500
1501
  // Get the tile rectangle, with height rounded up to the next multiple of 8
1502
  // luma pixels (only relevant for the bottom tile of the frame)
1503
37.5k
  const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
1504
37.5k
  const int stripe0 = 0;
1505
1506
37.5k
  RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries;
1507
1508
37.5k
  const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);
1509
1510
37.5k
  int tile_stripe;
1511
248k
  for (tile_stripe = 0;; ++tile_stripe) {
1512
248k
    const int rel_y0 = AOMMAX(0, tile_stripe * stripe_height - stripe_off);
1513
248k
    const int y0 = tile_rect.top + rel_y0;
1514
248k
    if (y0 >= tile_rect.bottom) break;
1515
1516
210k
    const int rel_y1 = (tile_stripe + 1) * stripe_height - stripe_off;
1517
210k
    const int y1 = AOMMIN(tile_rect.top + rel_y1, tile_rect.bottom);
1518
1519
210k
    const int frame_stripe = stripe0 + tile_stripe;
1520
1521
    // In this case, we should only use CDEF pixels at the top
1522
    // and bottom of the frame as a whole; internal tile boundaries
1523
    // can use deblocked pixels from adjacent tiles for context.
1524
210k
    const int use_deblock_above = (frame_stripe > 0);
1525
210k
    const int use_deblock_below = (y1 < plane_height);
1526
1527
210k
    if (!after_cdef) {
1528
      // Save deblocked context where needed.
1529
105k
      if (use_deblock_above) {
1530
86.5k
        save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
1531
86.5k
                                    frame_stripe, use_highbd, 1, boundaries);
1532
86.5k
      }
1533
105k
      if (use_deblock_below) {
1534
86.5k
        save_deblock_boundary_lines(frame, cm, plane, y1, frame_stripe,
1535
86.5k
                                    use_highbd, 0, boundaries);
1536
86.5k
      }
1537
105k
    } else {
1538
      // Save CDEF context where needed. Note that we need to save the CDEF
1539
      // context for a particular boundary iff we *didn't* save deblocked
1540
      // context for that boundary.
1541
      //
1542
      // In addition, we need to save copies of the outermost line within
1543
      // the tile, rather than using data from outside the tile.
1544
105k
      if (!use_deblock_above) {
1545
18.7k
        save_cdef_boundary_lines(frame, cm, plane, y0, frame_stripe, use_highbd,
1546
18.7k
                                 1, boundaries);
1547
18.7k
      }
1548
105k
      if (!use_deblock_below) {
1549
18.7k
        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, frame_stripe,
1550
18.7k
                                 use_highbd, 0, boundaries);
1551
18.7k
      }
1552
105k
    }
1553
210k
  }
1554
37.5k
}
1555
1556
// For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
1557
// lines to be used as boundary in the loop restoration process. The
1558
// lines are saved in rst_internal.stripe_boundary_lines
1559
void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1560
12.6k
                                              AV1_COMMON *cm, int after_cdef) {
1561
12.6k
  const int num_planes = av1_num_planes(cm);
1562
12.6k
  const int use_highbd = cm->seq_params->use_highbitdepth;
1563
50.2k
  for (int p = 0; p < num_planes; ++p) {
1564
37.5k
    save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef);
1565
37.5k
  }
1566
12.6k
}