Coverage Report

Created: 2022-08-24 06:17

/src/aom/av1/common/restoration.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 *
11
 */
12
13
#include <math.h>
14
15
#include "config/aom_config.h"
16
#include "config/aom_dsp_rtcd.h"
17
#include "config/aom_scale_rtcd.h"
18
19
#include "aom_mem/aom_mem.h"
20
#include "av1/common/av1_common_int.h"
21
#include "av1/common/resize.h"
22
#include "av1/common/restoration.h"
23
#include "aom_dsp/aom_dsp_common.h"
24
#include "aom_mem/aom_mem.h"
25
26
#include "aom_ports/mem.h"
27
28
// The 's' values are calculated based on original 'r' and 'e' values in the
29
// spec using GenSgrprojVtable().
30
// Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
31
// Table of SGRPROJ_PARAMS self-guided filter parameter sets. Judging by the
// params->r[...] accesses elsewhere in this file and by the way every r of 0
// lines up with an s of -1, each entry reads { { r[0], r[1] }, { s[0], s[1] } }.
// NOTE(review): field order inferred from usage; confirm against the
// sgr_params_type declaration.
const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
32
  { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
33
  { { 2, 1 }, { 93, 1618 } },  { { 2, 1 }, { 80, 1438 } },
34
  { { 2, 1 }, { 70, 1295 } },  { { 2, 1 }, { 58, 1177 } },
35
  { { 2, 1 }, { 47, 1079 } },  { { 2, 1 }, { 37, 996 } },
36
  { { 2, 1 }, { 30, 925 } },   { { 2, 1 }, { 25, 863 } },
37
  { { 0, 1 }, { -1, 2589 } },  { { 0, 1 }, { -1, 1618 } },
38
  { { 0, 1 }, { -1, 1177 } },  { { 0, 1 }, { -1, 925 } },
39
  { { 2, 0 }, { 56, -1 } },    { { 2, 0 }, { 22, -1 } },
40
};
41
42
93.6k
AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
43
93.6k
  AV1PixelRect rect;
44
45
93.6k
  int ss_x = is_uv && cm->seq_params->subsampling_x;
46
93.6k
  int ss_y = is_uv && cm->seq_params->subsampling_y;
47
48
93.6k
  rect.top = 0;
49
93.6k
  rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y);
50
93.6k
  rect.left = 0;
51
93.6k
  rect.right = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
52
93.6k
  return rect;
53
93.6k
}
54
55
// Count horizontal or vertical units per tile (use a width or height for
56
// tile_size, respectively). We basically want to divide the tile size by the
57
// size of a restoration unit. Rather than rounding up unconditionally as you
58
// might expect, we round to nearest, which models the way a right or bottom
59
// restoration unit can extend to up to 150% its normal width or height. The
60
// max with 1 is to deal with tiles that are smaller than half of a restoration
61
// unit.
62
187k
int av1_lr_count_units_in_tile(int unit_size, int tile_size) {
  // Adding half a unit before dividing rounds the quotient to nearest.
  const int rounded_units = (tile_size + (unit_size >> 1)) / unit_size;
  // A tile always holds at least one unit, however small it is.
  return rounded_units > 1 ? rounded_units : 1;
}
65
66
// (Re)allocates rsi->unit_info for one plane and records the restoration unit
// grid dimensions in rsi.
//   cm:    common state; used for the frame dimensions and for error
//          reporting through CHECK_MEM_ERROR.
//   rsi:   restoration info to update; restoration_unit_size must already be
//          set. Any previous unit_info array is freed first.
//   is_uv: non-zero for a chroma plane.
void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
                                  int is_uv) {
  // The unit grid is sized from the whole-frame rectangle, which plays the
  // role of the (single) tile here.
  const AV1PixelRect frame_rect = av1_whole_frame_rect(cm, is_uv);
  const int frame_w = frame_rect.right - frame_rect.left;
  const int frame_h = frame_rect.bottom - frame_rect.top;

  // Units per row and per column; av1_lr_count_units_in_tile() rounds to
  // nearest and never returns less than 1.
  const int ru_size = rsi->restoration_unit_size;
  const int units_across = av1_lr_count_units_in_tile(ru_size, frame_w);
  const int units_down = av1_lr_count_units_in_tile(ru_size, frame_h);

  rsi->horz_units_per_tile = units_across;
  rsi->vert_units_per_tile = units_down;
  rsi->units_per_tile = units_across * units_down;

  // Exactly one "tile" is accounted for, so the total equals units_per_tile.
  const int nunits = 1 * rsi->units_per_tile;

  aom_free(rsi->unit_info);
  CHECK_MEM_ERROR(cm, rsi->unit_info,
                  (RestorationUnitInfo *)aom_memalign(
                      16, sizeof(*rsi->unit_info) * nunits));
}
100
101
5.96k
// Releases the per-unit array owned by rst_info and resets the pointer to
// NULL so that no stale address is left behind in the structure.
void av1_free_restoration_struct(RestorationInfo *rst_info) {
  aom_free(rst_info->unit_info);
  rst_info->unit_info = NULL;
}
105
106
#if 0
// Disabled table generator, kept for reference only (never compiled).
//
// Pair of values for each sgrproj parameter:
// Index 0 corresponds to r[0], e[0]
// Index 1 corresponds to r[1], e[1]
int sgrproj_mtable[SGRPROJ_PARAMS][2];

static void GenSgrprojVtable() {
  for (int set = 0; set < SGRPROJ_PARAMS; ++set) {
    const sgr_params_type *const params = &av1_sgr_params[set];
    for (int pass = 0; pass < 2; ++pass) {
      const int eps = params->e[pass];
      const int radius = params->r[pass];
      if (radius == 0) {
        // Filter is disabled for this pass: mark the entry invalid.
        sgrproj_mtable[set][pass] = -1;
        continue;
      }
      // Filter is enabled: n is the pixel count of the (2r+1)x(2r+1) window.
      const int side = 2 * radius + 1;
      const int n = side * side;
      const int n2e = n * n * eps;
      assert(n2e != 0);
      // Rounded division of 2^SGRPROJ_MTABLE_BITS by n^2 * e.
      sgrproj_mtable[set][pass] =
          (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
    }
  }
}
#endif
130
131
1.98k
// Precalculation hook for loop restoration. Takes no arguments and returns
// nothing. The only work it would do, regenerating the sgrproj table through
// GenSgrprojVtable(), is compiled out with #if 0, so the function is
// currently a no-op.
//
// The parameter list is spelled (void) so that the definition is a proper
// prototype; an empty () in C leaves the parameters unspecified and lets
// calls with stray arguments compile silently.
void av1_loop_restoration_precal(void) {
#if 0
  GenSgrprojVtable();
#endif
}
136
137
// Pads an 8-bit image in place by replicating its edge pixels.
//   data:        pointer to the top-left pixel of the image; the buffer must
//                have room for the borders on every side.
//   width/height: image dimensions in pixels.
//   stride:      distance between rows, in pixels.
//   border_horz: number of columns to fill on the left and on the right.
//   border_vert: number of rows to fill above and below.
static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride,
                               int border_horz, int border_vert) {
  // Step 1: widen every image row with copies of its first and last pixel.
  for (int row = 0; row < height; ++row) {
    uint8_t *const line = data + row * stride;
    memset(line - border_horz, line[0], border_horz);
    memset(line + width, line[width - 1], border_horz);
  }

  // Step 2: replicate the already-widened first and last rows vertically, so
  // the corners are filled as well.
  uint8_t *const padded_origin = data - border_horz;
  for (int k = 1; k <= border_vert; ++k) {
    memcpy(padded_origin - k * stride, padded_origin, width + 2 * border_horz);
  }
  for (int k = 0; k < border_vert; ++k) {
    memcpy(padded_origin + (height + k) * stride,
           padded_origin + (height - 1) * stride, width + 2 * border_horz);
  }
}
155
156
#if CONFIG_AV1_HIGHBITDEPTH
157
// 16-bit counterpart of the low-bitdepth frame extension: pads the image in
// place by replicating edge pixels.
//   data:        pointer to the top-left pixel; the buffer must have room for
//                the borders on every side.
//   width/height: image dimensions in pixels.
//   stride:      distance between rows, in pixels (not bytes).
//   border_horz: columns to fill on each side.
//   border_vert: rows to fill above and below.
static void extend_frame_highbd(uint16_t *data, int width, int height,
                                int stride, int border_horz, int border_vert) {
  // Step 1: widen every image row with copies of its first and last pixel.
  for (int row = 0; row < height; ++row) {
    uint16_t *const line = data + row * stride;
    for (int k = border_horz; k > 0; --k) line[-k] = line[0];
    for (int k = 0; k < border_horz; ++k) line[width + k] = line[width - 1];
  }

  // Step 2: replicate the widened first and last rows vertically.
  uint16_t *const padded_origin = data - border_horz;
  for (int k = 1; k <= border_vert; ++k) {
    memcpy(padded_origin - k * stride, padded_origin,
           (width + 2 * border_horz) * sizeof(uint16_t));
  }
  for (int k = 0; k < border_vert; ++k) {
    memcpy(padded_origin + (height + k) * stride,
           padded_origin + (height - 1) * stride,
           (width + 2 * border_horz) * sizeof(uint16_t));
  }
}
176
177
// Copies a width x height block of 16-bit pixels from src to dst, one row at
// a time. Both strides are measured in pixels. The regions must not overlap
// because the rows are moved with memcpy().
static void copy_tile_highbd(int width, int height, const uint16_t *src,
                             int src_stride, uint16_t *dst, int dst_stride) {
  for (int row = 0; row < height; ++row) {
    const uint16_t *const from = src + row * src_stride;
    uint16_t *const to = dst + row * dst_stride;
    memcpy(to, from, width * sizeof(*dst));
  }
}
182
#endif
183
184
// Pads a frame plane in place by replicating its edge pixels, dispatching on
// the pixel depth.
//   data:   top-left pixel of the plane; when highbd is non-zero (and
//           high-bitdepth support is compiled in) it is first converted with
//           CONVERT_TO_SHORTPTR.
//   highbd: non-zero selects the 16-bit path; ignored when
//           CONFIG_AV1_HIGHBITDEPTH is off.
// The remaining arguments are forwarded unchanged to the depth-specific
// helper.
void av1_extend_frame(uint8_t *data, int width, int height, int stride,
                      int border_horz, int border_vert, int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (highbd) {
    extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
                        border_horz, border_vert);
  } else {
    extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
  }
#else
  (void)highbd;
  extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
#endif
}
196
197
// Copies a width x height block of 8-bit pixels from src to dst, one row at a
// time. The regions must not overlap because the rows are moved with
// memcpy().
static void copy_tile_lowbd(int width, int height, const uint8_t *src,
                            int src_stride, uint8_t *dst, int dst_stride) {
  for (int row = 0; row < height; ++row) {
    const uint8_t *const from = src + row * src_stride;
    uint8_t *const to = dst + row * dst_stride;
    memcpy(to, from, width);
  }
}
202
203
// Copies a width x height block of pixels from src to dst, dispatching on the
// pixel depth. When highbd is non-zero (and high-bitdepth support is compiled
// in) both pointers are converted with CONVERT_TO_SHORTPTR and the 16-bit
// copy is used; otherwise the 8-bit copy is used.
static void copy_tile(int width, int height, const uint8_t *src, int src_stride,
                      uint8_t *dst, int dst_stride, int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (highbd) {
    copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
                     CONVERT_TO_SHORTPTR(dst), dst_stride);
  } else {
    copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
  }
#else
  (void)highbd;
  copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
#endif
}
215
216
928
// Yields the address to hand to memcpy() for frame pointer d: when hbd is
// non-zero, d is passed through CONVERT_TO_SHORTPTR() and cast back to
// uint8_t *; otherwise d is used unchanged.
#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
217
218
// With striped loop restoration, the filtering for each 64-pixel stripe gets
219
// most of its input from the output of CDEF (stored in data8), but we need to
220
// fill out a border of 3 pixels above/below the stripe according to the
221
// following
222
// rules:
223
//
224
// * At a frame boundary, we copy the outermost row of CDEF pixels three times.
225
//   This extension is done by a call to av1_extend_frame() at the start of the
226
//   loop restoration process, so the value of copy_above/copy_below doesn't
227
//   strictly matter. However, by setting *copy_above = *copy_below = 1 whenever
228
//   loop filtering across tiles is disabled, we can allow
229
//   {setup,restore}_processing_stripe_boundary to assume that the top/bottom
230
//   data has always been copied, simplifying the behaviour at the left and
231
//   right edges of tiles.
232
//
233
// * If we're at a tile boundary and loop filtering across tiles is enabled,
234
//   then there is a logical stripe which is 64 pixels high, but which is split
235
//   into an 8px high and a 56px high stripe so that the processing (and
236
//   coefficient set usage) can be aligned to tiles.
237
//   In this case, we use the 3 rows of CDEF output across the boundary for
238
//   context; this corresponds to leaving the frame buffer as-is.
239
//
240
// * If we're at a tile boundary and loop filtering across tiles is disabled,
241
//   then we take the outermost row of CDEF pixels *within the current tile*
242
//   and copy it three times. Thus we behave exactly as if the tile were a full
243
//   frame.
244
//
245
// * Otherwise, we're at a stripe boundary within a tile. In that case, we
246
//   take 2 rows of deblocked pixels and extend them to 3 rows of context.
247
//
248
// The distinction between the latter two cases is handled by the
249
// av1_loop_restoration_save_boundary_lines() function, so here we just need
250
// to decide if we're overwriting the above/below boundary pixels or not.
251
static void get_stripe_boundary_info(const RestorationTileLimits *limits,
252
                                     const AV1PixelRect *tile_rect, int ss_y,
253
174
                                     int *copy_above, int *copy_below) {
254
174
  *copy_above = 1;
255
174
  *copy_below = 1;
256
257
174
  const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
258
174
  const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
259
260
174
  const int first_stripe_in_tile = (limits->v_start == tile_rect->top);
261
174
  const int this_stripe_height =
262
174
      full_stripe_height - (first_stripe_in_tile ? runit_offset : 0);
263
174
  const int last_stripe_in_tile =
264
174
      (limits->v_start + this_stripe_height >= tile_rect->bottom);
265
266
174
  if (first_stripe_in_tile) *copy_above = 0;
267
174
  if (last_stripe_in_tile) *copy_below = 0;
268
174
}
269
270
// Overwrite the border pixels around a processing stripe so that the conditions
271
// listed above get_stripe_boundary_info() are preserved.
272
// We save the pixels which get overwritten into a temporary buffer, so that
273
// they can be restored by restore_processing_stripe_boundary() after we've
274
// processed the stripe.
275
//
276
// limits gives the rectangular limits of the remaining stripes for the current
277
// restoration unit. rsb is the stored stripe boundaries (taken from either
278
// deblock or CDEF output as necessary).
279
//
280
// tile_rect is the limits of the current tile and tile_stripe0 is the index of
281
// the first stripe in this tile (needed to convert the tile-relative stripe
282
// index we get from limits into something we can look up in rsb).
283
static void setup_processing_stripe_boundary(
284
    const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
285
    int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
286
174
    RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
287
  // Offsets within the line buffers. The buffer logically starts at column
288
  // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
289
  // has column x0 in the buffer.
290
174
  const int buf_stride = rsb->stripe_boundary_stride;
291
174
  const int buf_x0_off = limits->h_start;
292
174
  const int line_width =
293
174
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
294
174
  const int line_size = line_width << use_highbd;
295
296
174
  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
297
298
  // Replace RESTORATION_BORDER pixels above the top of the stripe
299
  // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
300
  // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
301
  // duplicating the topmost of the 2 lines (see the AOMMAX call when
302
  // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
303
  //
304
  // Special case: If we're at the top of a tile, which isn't on the topmost
305
  // tile row, and we're allowed to loop filter across tiles, then we have a
306
  // logical 64-pixel-high stripe which has been split into an 8-pixel high
307
  // stripe and a 56-pixel high stripe (the current one). So, in this case,
308
  // we want to leave the boundary alone!
309
174
  if (!opt) {
310
0
    if (copy_above) {
311
0
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
312
313
0
      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
314
0
        const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
315
0
        const int buf_off = buf_x0_off + buf_row * buf_stride;
316
0
        const uint8_t *buf =
317
0
            rsb->stripe_boundary_above + (buf_off << use_highbd);
318
0
        uint8_t *dst8 = data8_tl + i * data_stride;
319
        // Save old pixels, then replace with data from stripe_boundary_above
320
0
        memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
321
0
               REAL_PTR(use_highbd, dst8), line_size);
322
0
        memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
323
0
      }
324
0
    }
325
326
    // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
327
    // The second buffer row is repeated, so src_row gets the values 0, 1, 1
328
    // for i = 0, 1, 2.
329
0
    if (copy_below) {
330
0
      const int stripe_end = limits->v_start + h;
331
0
      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
332
333
0
      for (int i = 0; i < RESTORATION_BORDER; ++i) {
334
0
        const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
335
0
        const int buf_off = buf_x0_off + buf_row * buf_stride;
336
0
        const uint8_t *src =
337
0
            rsb->stripe_boundary_below + (buf_off << use_highbd);
338
339
0
        uint8_t *dst8 = data8_bl + i * data_stride;
340
        // Save old pixels, then replace with data from stripe_boundary_below
341
0
        memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
342
0
        memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
343
0
      }
344
0
    }
345
174
  } else {
346
174
    if (copy_above) {
347
116
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
348
349
      // Only save and overwrite i=-RESTORATION_BORDER line.
350
116
      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
351
      // Save old pixels, then replace with data from stripe_boundary_above
352
116
      memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
353
116
      memcpy(REAL_PTR(use_highbd, dst8),
354
116
             REAL_PTR(use_highbd,
355
116
                      data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
356
116
             line_size);
357
116
    }
358
359
174
    if (copy_below) {
360
116
      const int stripe_end = limits->v_start + h;
361
116
      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
362
363
      // Only save and overwrite i=2 line.
364
116
      uint8_t *dst8 = data8_bl + 2 * data_stride;
365
      // Save old pixels, then replace with data from stripe_boundary_below
366
116
      memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
367
116
      memcpy(REAL_PTR(use_highbd, dst8),
368
116
             REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
369
116
    }
370
174
  }
371
174
}
372
373
// This function restores the boundary lines modified by
374
// setup_processing_stripe_boundary.
375
//
376
// Note: We need to be careful when handling the corners of the processing
377
// unit, because (eg.) the top-left corner is considered to be part of
378
// both the left and top borders. This means that, depending on the
379
// loop_filter_across_tiles_enabled flag, the corner pixels might get
380
// overwritten twice, once as part of the "top" border and once as part
381
// of the "left" border (or similar for other corners).
382
//
383
// Everything works out fine as long as we make sure to reverse the order
384
// when restoring, ie. we need to restore the left/right borders followed
385
// by the top/bottom borders.
386
static void restore_processing_stripe_boundary(
387
    const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
388
    int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
389
174
    int copy_below, int opt) {
390
174
  const int line_width =
391
174
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
392
174
  const int line_size = line_width << use_highbd;
393
394
174
  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
395
396
174
  if (!opt) {
397
0
    if (copy_above) {
398
0
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
399
0
      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
400
0
        uint8_t *dst8 = data8_tl + i * data_stride;
401
0
        memcpy(REAL_PTR(use_highbd, dst8),
402
0
               rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
403
0
      }
404
0
    }
405
406
0
    if (copy_below) {
407
0
      const int stripe_bottom = limits->v_start + h;
408
0
      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
409
410
0
      for (int i = 0; i < RESTORATION_BORDER; ++i) {
411
0
        if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
412
413
0
        uint8_t *dst8 = data8_bl + i * data_stride;
414
0
        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
415
0
      }
416
0
    }
417
174
  } else {
418
174
    if (copy_above) {
419
116
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
420
421
      // Only restore i=-RESTORATION_BORDER line.
422
116
      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
423
116
      memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
424
116
    }
425
426
174
    if (copy_below) {
427
116
      const int stripe_bottom = limits->v_start + h;
428
116
      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
429
430
      // Only restore i=2 line.
431
116
      if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
432
116
        uint8_t *dst8 = data8_bl + 2 * data_stride;
433
116
        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
434
116
      }
435
116
    }
436
174
  }
437
174
}
438
439
static void wiener_filter_stripe(const RestorationUnitInfo *rui,
440
                                 int stripe_width, int stripe_height,
441
                                 int procunit_width, const uint8_t *src,
442
                                 int src_stride, uint8_t *dst, int dst_stride,
443
0
                                 int32_t *tmpbuf, int bit_depth) {
444
0
  (void)tmpbuf;
445
0
  (void)bit_depth;
446
0
  assert(bit_depth == 8);
447
0
  const ConvolveParams conv_params = get_conv_params_wiener(8);
448
449
0
  for (int j = 0; j < stripe_width; j += procunit_width) {
450
0
    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
451
0
    const uint8_t *src_p = src + j;
452
0
    uint8_t *dst_p = dst + j;
453
0
    av1_wiener_convolve_add_src(
454
0
        src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
455
0
        rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
456
0
  }
457
0
}
458
459
/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
460
   over the input. The window is of size (2r + 1)x(2r + 1), and we
461
   specialize to r = 1, 2. Any other r trips the assert in boxsum().
462
463
   Each loop follows the same format: We keep a window's worth of input
464
   in individual variables and select data out of that as appropriate.
465
*/
466
static void boxsum1(int32_t *src, int width, int height, int src_stride,
467
1.05k
                    int sqr, int32_t *dst, int dst_stride) {
468
1.05k
  int i, j, a, b, c;
469
1.05k
  assert(width > 2 * SGRPROJ_BORDER_HORZ);
470
1.05k
  assert(height > 2 * SGRPROJ_BORDER_VERT);
471
472
  // Vertical sum over 3-pixel regions, from src into dst.
473
1.05k
  if (!sqr) {
474
26.2k
    for (j = 0; j < width; ++j) {
475
25.7k
      a = src[j];
476
25.7k
      b = src[src_stride + j];
477
25.7k
      c = src[2 * src_stride + j];
478
479
25.7k
      dst[j] = a + b;
480
1.25M
      for (i = 1; i < height - 2; ++i) {
481
        // Loop invariant: At the start of each iteration,
482
        // a = src[(i - 1) * src_stride + j]
483
        // b = src[(i    ) * src_stride + j]
484
        // c = src[(i + 1) * src_stride + j]
485
1.22M
        dst[i * dst_stride + j] = a + b + c;
486
1.22M
        a = b;
487
1.22M
        b = c;
488
1.22M
        c = src[(i + 2) * src_stride + j];
489
1.22M
      }
490
25.7k
      dst[i * dst_stride + j] = a + b + c;
491
25.7k
      dst[(i + 1) * dst_stride + j] = b + c;
492
25.7k
    }
493
528
  } else {
494
26.2k
    for (j = 0; j < width; ++j) {
495
25.7k
      a = src[j] * src[j];
496
25.7k
      b = src[src_stride + j] * src[src_stride + j];
497
25.7k
      c = src[2 * src_stride + j] * src[2 * src_stride + j];
498
499
25.7k
      dst[j] = a + b;
500
1.25M
      for (i = 1; i < height - 2; ++i) {
501
1.22M
        dst[i * dst_stride + j] = a + b + c;
502
1.22M
        a = b;
503
1.22M
        b = c;
504
1.22M
        c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
505
1.22M
      }
506
25.7k
      dst[i * dst_stride + j] = a + b + c;
507
25.7k
      dst[(i + 1) * dst_stride + j] = b + c;
508
25.7k
    }
509
528
  }
510
511
  // Horizontal sum over 3-pixel regions of dst
512
49.9k
  for (i = 0; i < height; ++i) {
513
48.9k
    a = dst[i * dst_stride];
514
48.9k
    b = dst[i * dst_stride + 1];
515
48.9k
    c = dst[i * dst_stride + 2];
516
517
48.9k
    dst[i * dst_stride] = a + b;
518
2.50M
    for (j = 1; j < width - 2; ++j) {
519
      // Loop invariant: At the start of each iteration,
520
      // a = src[i * src_stride + (j - 1)]
521
      // b = src[i * src_stride + (j    )]
522
      // c = src[i * src_stride + (j + 1)]
523
2.45M
      dst[i * dst_stride + j] = a + b + c;
524
2.45M
      a = b;
525
2.45M
      b = c;
526
2.45M
      c = dst[i * dst_stride + (j + 2)];
527
2.45M
    }
528
48.9k
    dst[i * dst_stride + j] = a + b + c;
529
48.9k
    dst[i * dst_stride + (j + 1)] = b + c;
530
48.9k
  }
531
1.05k
}
532
533
static void boxsum2(int32_t *src, int width, int height, int src_stride,
534
1.20k
                    int sqr, int32_t *dst, int dst_stride) {
535
1.20k
  int i, j, a, b, c, d, e;
536
1.20k
  assert(width > 2 * SGRPROJ_BORDER_HORZ);
537
1.20k
  assert(height > 2 * SGRPROJ_BORDER_VERT);
538
539
  // Vertical sum over 5-pixel regions, from src into dst.
540
1.20k
  if (!sqr) {
541
29.8k
    for (j = 0; j < width; ++j) {
542
29.2k
      a = src[j];
543
29.2k
      b = src[src_stride + j];
544
29.2k
      c = src[2 * src_stride + j];
545
29.2k
      d = src[3 * src_stride + j];
546
29.2k
      e = src[4 * src_stride + j];
547
548
29.2k
      dst[j] = a + b + c;
549
29.2k
      dst[dst_stride + j] = a + b + c + d;
550
1.35M
      for (i = 2; i < height - 3; ++i) {
551
        // Loop invariant: At the start of each iteration,
552
        // a = src[(i - 2) * src_stride + j]
553
        // b = src[(i - 1) * src_stride + j]
554
        // c = src[(i    ) * src_stride + j]
555
        // d = src[(i + 1) * src_stride + j]
556
        // e = src[(i + 2) * src_stride + j]
557
1.32M
        dst[i * dst_stride + j] = a + b + c + d + e;
558
1.32M
        a = b;
559
1.32M
        b = c;
560
1.32M
        c = d;
561
1.32M
        d = e;
562
1.32M
        e = src[(i + 3) * src_stride + j];
563
1.32M
      }
564
29.2k
      dst[i * dst_stride + j] = a + b + c + d + e;
565
29.2k
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
566
29.2k
      dst[(i + 2) * dst_stride + j] = c + d + e;
567
29.2k
    }
568
600
  } else {
569
29.8k
    for (j = 0; j < width; ++j) {
570
29.2k
      a = src[j] * src[j];
571
29.2k
      b = src[src_stride + j] * src[src_stride + j];
572
29.2k
      c = src[2 * src_stride + j] * src[2 * src_stride + j];
573
29.2k
      d = src[3 * src_stride + j] * src[3 * src_stride + j];
574
29.2k
      e = src[4 * src_stride + j] * src[4 * src_stride + j];
575
576
29.2k
      dst[j] = a + b + c;
577
29.2k
      dst[dst_stride + j] = a + b + c + d;
578
1.35M
      for (i = 2; i < height - 3; ++i) {
579
1.32M
        dst[i * dst_stride + j] = a + b + c + d + e;
580
1.32M
        a = b;
581
1.32M
        b = c;
582
1.32M
        c = d;
583
1.32M
        d = e;
584
1.32M
        e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
585
1.32M
      }
586
29.2k
      dst[i * dst_stride + j] = a + b + c + d + e;
587
29.2k
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
588
29.2k
      dst[(i + 2) * dst_stride + j] = c + d + e;
589
29.2k
    }
590
600
  }
591
592
  // Horizontal sum over 5-pixel regions of dst
593
56.6k
  for (i = 0; i < height; ++i) {
594
55.4k
    a = dst[i * dst_stride];
595
55.4k
    b = dst[i * dst_stride + 1];
596
55.4k
    c = dst[i * dst_stride + 2];
597
55.4k
    d = dst[i * dst_stride + 3];
598
55.4k
    e = dst[i * dst_stride + 4];
599
600
55.4k
    dst[i * dst_stride] = a + b + c;
601
55.4k
    dst[i * dst_stride + 1] = a + b + c + d;
602
2.73M
    for (j = 2; j < width - 3; ++j) {
603
      // Loop invariant: At the start of each iteration,
604
      // a = src[i * src_stride + (j - 2)]
605
      // b = src[i * src_stride + (j - 1)]
606
      // c = src[i * src_stride + (j    )]
607
      // d = src[i * src_stride + (j + 1)]
608
      // e = src[i * src_stride + (j + 2)]
609
2.67M
      dst[i * dst_stride + j] = a + b + c + d + e;
610
2.67M
      a = b;
611
2.67M
      b = c;
612
2.67M
      c = d;
613
2.67M
      d = e;
614
2.67M
      e = dst[i * dst_stride + (j + 3)];
615
2.67M
    }
616
55.4k
    dst[i * dst_stride + j] = a + b + c + d + e;
617
55.4k
    dst[i * dst_stride + (j + 1)] = b + c + d + e;
618
55.4k
    dst[i * dst_stride + (j + 2)] = c + d + e;
619
55.4k
  }
620
1.20k
}
621
622
// Dispatches to the box-sum routine specialised for radius r. Only r = 1 and
// r = 2 are supported; any other value trips the assert (and does nothing in
// builds where asserts are disabled).
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                   int sqr, int32_t *dst, int dst_stride) {
  switch (r) {
    case 1:
      boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    case 2:
      boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    default:
      assert(0 && "Invalid value of r in self-guided filter");
      break;
  }
}
631
632
696
void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
633
696
  if (params->r[0] == 0) {
634
96
    xq[0] = 0;
635
96
    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
636
600
  } else if (params->r[1] == 0) {
637
168
    xq[0] = xqd[0];
638
168
    xq[1] = 0;
639
432
  } else {
640
432
    xq[0] = xqd[0];
641
432
    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
642
432
  }
643
696
}
644
645
// 256-entry lookup table indexed by x. Apart from the special-cased entry 0,
// the leading values agree with round(256 * x / (x + 1)) (x = 1 -> 128,
// x = 2 -> 171, x = 3 -> 192, x = 4 -> 205); the final entry (x = 255) is
// stored as 256.
const int32_t av1_x_by_xplus1[256] = {
646
  // Special case: Map 0 -> 1 (corresponding to a value of 1/256)
647
  // instead of 0. See comments in selfguided_restoration_internal() for why
648
  1,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
649
  240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
650
  248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
651
  250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
652
  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
653
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
654
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
655
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
656
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
657
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
658
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
659
  254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
660
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
661
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
662
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
663
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
664
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
665
  256,
666
};
667
668
// Reciprocal lookup table with 25 listed entries. Entry i matches
// round(4096 / (i + 1)): 4096 for i = 0, 2048 for i = 1, ... 164 for i = 24.
// NOTE(review): MAX_NELEM is defined elsewhere; presumably 25 to match the
// initializer count - confirm.
const int32_t av1_one_by_x[MAX_NELEM] = {
669
  4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
670
  293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
671
};
672
673
static void calculate_intermediate_result(int32_t *dgd, int width, int height,
674
                                          int dgd_stride, int bit_depth,
675
                                          int sgr_params_idx, int radius_idx,
676
1.12k
                                          int pass, int32_t *A, int32_t *B) {
677
1.12k
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
678
1.12k
  const int r = params->r[radius_idx];
679
1.12k
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
680
1.12k
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
681
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
682
  // leading to a significant speed improvement.
683
  // We also align the stride to a multiple of 16 bytes, for consistency
684
  // with the SIMD version of this function.
685
1.12k
  int buf_stride = ((width_ext + 3) & ~3) + 16;
686
1.12k
  const int step = pass == 0 ? 1 : 2;
687
1.12k
  int i, j;
688
689
1.12k
  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
690
1.12k
  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
691
1.12k
         "Need SGRPROJ_BORDER_* >= r+1");
692
693
1.12k
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
694
1.12k
         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
695
1.12k
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
696
1.12k
         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
697
1.12k
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
698
1.12k
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
699
  // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
700
  // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
701
36.2k
  for (i = -1; i < height + 1; i += step) {
702
1.77M
    for (j = -1; j < width + 1; ++j) {
703
1.74M
      const int k = i * buf_stride + j;
704
1.74M
      const int n = (2 * r + 1) * (2 * r + 1);
705
706
      // a < 2^16 * n < 2^22 regardless of bit depth
707
1.74M
      uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
708
      // b < 2^8 * n < 2^14 regardless of bit depth
709
1.74M
      uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
710
711
      // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
712
      // and p itself satisfies p < 2^14 * n^2 < 2^26.
713
      // This bound on p is due to:
714
      // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
715
      //
716
      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
717
      // This is an artefact of rounding, and can only happen if all pixels
718
      // are (almost) identical, so in this case we saturate to p=0.
719
1.74M
      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
720
721
1.74M
      const uint32_t s = params->s[radius_idx];
722
723
      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
724
      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
725
      // (this holds even after accounting for the rounding in s)
726
1.74M
      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
727
728
      // Note: We have to be quite careful about the value of A[k].
729
      // This is used as a blend factor between individual pixel values and the
730
      // local mean. So it logically has a range of [0, 256], including both
731
      // endpoints.
732
      //
733
      // This is a pain for hardware, as we'd like something which can be stored
734
      // in exactly 8 bits.
735
      // Further, in the calculation of B[k] below, if z == 0 and r == 2,
736
      // then A[k] "should be" 0. But then we can end up setting B[k] to a value
737
      // slightly above 2^(8 + bit depth), due to rounding in the value of
738
      // av1_one_by_x[25-1].
739
      //
740
      // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
741
      // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
742
      // overflow), without significantly affecting the final result: z == 0
743
      // implies that the image is essentially "flat", so the local mean and
744
      // individual pixel values are very similar.
745
      //
746
      // Note that saturating on the other side, ie. requring A[k] <= 255,
747
      // would be a bad idea, as that corresponds to the case where the image
748
      // is very variable, when we want to preserve the local pixel value as
749
      // much as possible.
750
1.74M
      A[k] = av1_x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
751
752
      // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
753
      // av1_one_by_x[n - 1] = round(2^12 / n)
754
      // => the product here is < 2^(20 + bit_depth) <= 2^32,
755
      // and B[k] is set to a value < 2^(8 + bit depth)
756
      // This holds even with the rounding in av1_one_by_x and in the overall
757
      // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
758
1.74M
      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
759
1.74M
                                             (uint32_t)B[k] *
760
1.74M
                                             (uint32_t)av1_one_by_x[n - 1],
761
1.74M
                                         SGRPROJ_RECIP_BITS);
762
1.74M
    }
763
35.1k
  }
764
1.12k
}
765
766
static void selfguided_restoration_fast_internal(
767
    int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
768
600
    int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
769
600
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
770
600
  const int r = params->r[radius_idx];
771
600
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
772
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
773
  // leading to a significant speed improvement.
774
  // We also align the stride to a multiple of 16 bytes, for consistency
775
  // with the SIMD version of this function.
776
600
  int buf_stride = ((width_ext + 3) & ~3) + 16;
777
600
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
778
600
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
779
600
  int32_t *A = A_;
780
600
  int32_t *B = B_;
781
600
  int i, j;
782
600
  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
783
600
                                sgr_params_idx, radius_idx, 1, A, B);
784
600
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
785
600
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
786
787
  // Use the A[] and B[] arrays to calculate the filtered image
788
600
  (void)r;
789
600
  assert(r == 2);
790
24.7k
  for (i = 0; i < height; ++i) {
791
24.1k
    if (!(i & 1)) {  // even row
792
594k
      for (j = 0; j < width; ++j) {
793
582k
        const int k = i * buf_stride + j;
794
582k
        const int l = i * dgd_stride + j;
795
582k
        const int m = i * dst_stride + j;
796
582k
        const int nb = 5;
797
582k
        const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
798
582k
                          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
799
582k
                           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
800
582k
                              5;
801
582k
        const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
802
582k
                          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
803
582k
                           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
804
582k
                              5;
805
582k
        const int32_t v = a * dgd[l] + b;
806
582k
        dst[m] =
807
582k
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
808
582k
      }
809
12.1k
    } else {  // odd row
810
585k
      for (j = 0; j < width; ++j) {
811
573k
        const int k = i * buf_stride + j;
812
573k
        const int l = i * dgd_stride + j;
813
573k
        const int m = i * dst_stride + j;
814
573k
        const int nb = 4;
815
573k
        const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
816
573k
        const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
817
573k
        const int32_t v = a * dgd[l] + b;
818
573k
        dst[m] =
819
573k
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
820
573k
      }
821
11.9k
    }
822
24.1k
  }
823
600
}
824
825
static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
826
                                            int dgd_stride, int32_t *dst,
827
                                            int dst_stride, int bit_depth,
828
                                            int sgr_params_idx,
829
528
                                            int radius_idx) {
830
528
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
831
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
832
  // leading to a significant speed improvement.
833
  // We also align the stride to a multiple of 16 bytes, for consistency
834
  // with the SIMD version of this function.
835
528
  int buf_stride = ((width_ext + 3) & ~3) + 16;
836
528
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
837
528
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
838
528
  int32_t *A = A_;
839
528
  int32_t *B = B_;
840
528
  int i, j;
841
528
  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
842
528
                                sgr_params_idx, radius_idx, 0, A, B);
843
528
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
844
528
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
845
846
  // Use the A[] and B[] arrays to calculate the filtered image
847
21.8k
  for (i = 0; i < height; ++i) {
848
1.04M
    for (j = 0; j < width; ++j) {
849
1.01M
      const int k = i * buf_stride + j;
850
1.01M
      const int l = i * dgd_stride + j;
851
1.01M
      const int m = i * dst_stride + j;
852
1.01M
      const int nb = 5;
853
1.01M
      const int32_t a =
854
1.01M
          (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
855
1.01M
              4 +
856
1.01M
          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
857
1.01M
           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
858
1.01M
              3;
859
1.01M
      const int32_t b =
860
1.01M
          (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
861
1.01M
              4 +
862
1.01M
          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
863
1.01M
           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
864
1.01M
              3;
865
1.01M
      const int32_t v = a * dgd[l] + b;
866
1.01M
      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
867
1.01M
    }
868
21.2k
  }
869
528
}
870
871
int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
872
                                 int dgd_stride, int32_t *flt0, int32_t *flt1,
873
                                 int flt_stride, int sgr_params_idx,
874
696
                                 int bit_depth, int highbd) {
875
696
  int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
876
696
  const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
877
696
  int32_t *dgd32 =
878
696
      dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
879
880
696
  if (highbd) {
881
0
    const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
882
0
    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
883
0
      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
884
0
        dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
885
0
      }
886
0
    }
887
696
  } else {
888
33.2k
    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
889
1.79M
      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
890
1.75M
        dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
891
1.75M
      }
892
32.5k
    }
893
696
  }
894
895
696
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
896
  // If params->r == 0 we skip the corresponding filter. We only allow one of
897
  // the radii to be 0, as having both equal to 0 would be equivalent to
898
  // skipping SGR entirely.
899
696
  assert(!(params->r[0] == 0 && params->r[1] == 0));
900
901
696
  if (params->r[0] > 0)
902
600
    selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
903
600
                                         flt0, flt_stride, bit_depth,
904
600
                                         sgr_params_idx, 0);
905
696
  if (params->r[1] > 0)
906
528
    selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
907
528
                                    flt_stride, bit_depth, sgr_params_idx, 1);
908
696
  return 0;
909
696
}
910
911
void av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
912
                                        int height, int stride, int eps,
913
                                        const int *xqd, uint8_t *dst8,
914
                                        int dst_stride, int32_t *tmpbuf,
915
696
                                        int bit_depth, int highbd) {
916
696
  int32_t *flt0 = tmpbuf;
917
696
  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
918
696
  assert(width * height <= RESTORATION_UNITPELS_MAX);
919
920
696
  const int ret = av1_selfguided_restoration_c(
921
696
      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
922
696
  (void)ret;
923
696
  assert(!ret);
924
696
  const sgr_params_type *const params = &av1_sgr_params[eps];
925
696
  int xq[2];
926
696
  av1_decode_xq(xqd, xq, params);
927
29.0k
  for (int i = 0; i < height; ++i) {
928
1.41M
    for (int j = 0; j < width; ++j) {
929
1.38M
      const int k = i * width + j;
930
1.38M
      uint8_t *dst8ij = dst8 + i * dst_stride + j;
931
1.38M
      const uint8_t *dat8ij = dat8 + i * stride + j;
932
933
1.38M
      const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
934
1.38M
      const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
935
1.38M
      int32_t v = u << SGRPROJ_PRJ_BITS;
936
      // If params->r == 0 then we skipped the filtering in
937
      // av1_selfguided_restoration_c, i.e. flt[k] == u
938
1.38M
      if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
939
1.38M
      if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
940
1.38M
      const int16_t w =
941
1.38M
          (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
942
943
1.38M
      const uint16_t out = clip_pixel_highbd(w, bit_depth);
944
1.38M
      if (highbd)
945
0
        *CONVERT_TO_SHORTPTR(dst8ij) = out;
946
1.38M
      else
947
1.38M
        *dst8ij = (uint8_t)out;
948
1.38M
    }
949
28.3k
  }
950
696
}
951
952
static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
953
                                  int stripe_width, int stripe_height,
954
                                  int procunit_width, const uint8_t *src,
955
                                  int src_stride, uint8_t *dst, int dst_stride,
956
174
                                  int32_t *tmpbuf, int bit_depth) {
957
174
  (void)bit_depth;
958
174
  assert(bit_depth == 8);
959
960
870
  for (int j = 0; j < stripe_width; j += procunit_width) {
961
696
    int w = AOMMIN(procunit_width, stripe_width - j);
962
696
    av1_apply_selfguided_restoration(
963
696
        src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
964
696
        rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth, 0);
965
696
  }
966
174
}
967
968
#if CONFIG_AV1_HIGHBITDEPTH
969
static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui,
970
                                        int stripe_width, int stripe_height,
971
                                        int procunit_width, const uint8_t *src8,
972
                                        int src_stride, uint8_t *dst8,
973
                                        int dst_stride, int32_t *tmpbuf,
974
0
                                        int bit_depth) {
975
0
  (void)tmpbuf;
976
0
  const ConvolveParams conv_params = get_conv_params_wiener(bit_depth);
977
978
0
  for (int j = 0; j < stripe_width; j += procunit_width) {
979
0
    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
980
0
    const uint8_t *src8_p = src8 + j;
981
0
    uint8_t *dst8_p = dst8 + j;
982
0
    av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
983
0
                                       rui->wiener_info.hfilter, 16,
984
0
                                       rui->wiener_info.vfilter, 16, w,
985
0
                                       stripe_height, &conv_params, bit_depth);
986
0
  }
987
0
}
988
989
static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui,
990
                                         int stripe_width, int stripe_height,
991
                                         int procunit_width,
992
                                         const uint8_t *src8, int src_stride,
993
                                         uint8_t *dst8, int dst_stride,
994
0
                                         int32_t *tmpbuf, int bit_depth) {
995
0
  for (int j = 0; j < stripe_width; j += procunit_width) {
996
0
    int w = AOMMIN(procunit_width, stripe_width - j);
997
0
    av1_apply_selfguided_restoration(
998
0
        src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
999
0
        rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
1000
0
  }
1001
0
}
1002
#endif  // CONFIG_AV1_HIGHBITDEPTH
1003
1004
// Common signature of the per-stripe filters stored in stripe_filters[]:
// filter a stripe of stripe_width x stripe_height pixels from src to dst,
// working in column chunks of procunit_width.
typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
1005
                                  int stripe_width, int stripe_height,
1006
                                  int procunit_width, const uint8_t *src,
1007
                                  int src_stride, uint8_t *dst, int dst_stride,
1008
                                  int32_t *tmpbuf, int bit_depth);
1009
1010
// Stripe filter dispatch table. av1_loop_restoration_filter_unit() indexes it
// with 2 * highbd + (restoration type == RESTORE_SGRPROJ), so the order of
// the entries matters. The high-bit-depth pair exists only when
// CONFIG_AV1_HIGHBITDEPTH is enabled.
#if CONFIG_AV1_HIGHBITDEPTH
1011
#define NUM_STRIPE_FILTERS 4
1012
static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
1013
  wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
1014
  sgrproj_filter_stripe_highbd
1015
};
1016
#else
1017
#define NUM_STRIPE_FILTERS 2
1018
static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
1019
  wiener_filter_stripe, sgrproj_filter_stripe
1020
};
1021
#endif  // CONFIG_AV1_HIGHBITDEPTH
1022
1023
// Filter one restoration unit
1024
void av1_loop_restoration_filter_unit(
1025
    const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
1026
    const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
1027
    const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
1028
    int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
1029
90
    int dst_stride, int32_t *tmpbuf, int optimized_lr) {
1030
90
  RestorationType unit_rtype = rui->restoration_type;
1031
1032
90
  int unit_h = limits->v_end - limits->v_start;
1033
90
  int unit_w = limits->h_end - limits->h_start;
1034
90
  uint8_t *data8_tl = data8 + limits->v_start * stride + limits->h_start;
1035
90
  uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start;
1036
1037
90
  if (unit_rtype == RESTORE_NONE) {
1038
32
    copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd);
1039
32
    return;
1040
32
  }
1041
1042
58
  const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
1043
58
  assert(filter_idx < NUM_STRIPE_FILTERS);
1044
58
  const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
1045
1046
58
  const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
1047
1048
  // Convolve the whole tile one stripe at a time
1049
58
  RestorationTileLimits remaining_stripes = *limits;
1050
58
  int i = 0;
1051
232
  while (i < unit_h) {
1052
174
    int copy_above, copy_below;
1053
174
    remaining_stripes.v_start = limits->v_start + i;
1054
1055
174
    get_stripe_boundary_info(&remaining_stripes, tile_rect, ss_y, &copy_above,
1056
174
                             &copy_below);
1057
1058
174
    const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1059
174
    const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
1060
1061
    // Work out where this stripe's boundaries are within
1062
    // rsb->stripe_boundary_{above,below}
1063
174
    const int tile_stripe =
1064
174
        (remaining_stripes.v_start - tile_rect->top + runit_offset) /
1065
174
        full_stripe_height;
1066
174
    const int frame_stripe = tile_stripe0 + tile_stripe;
1067
174
    const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
1068
1069
    // Calculate this stripe's height, based on two rules:
1070
    // * The topmost stripe in each tile is 8 luma pixels shorter than usual.
1071
    // * We can't extend past the end of the current restoration unit
1072
174
    const int nominal_stripe_height =
1073
174
        full_stripe_height - ((tile_stripe == 0) ? runit_offset : 0);
1074
174
    const int h = AOMMIN(nominal_stripe_height,
1075
174
                         remaining_stripes.v_end - remaining_stripes.v_start);
1076
1077
174
    setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
1078
174
                                     h, data8, stride, rlbs, copy_above,
1079
174
                                     copy_below, optimized_lr);
1080
1081
174
    stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
1082
174
                  dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth);
1083
1084
174
    restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
1085
174
                                       data8, stride, copy_above, copy_below,
1086
174
                                       optimized_lr);
1087
1088
174
    i += h;
1089
174
  }
1090
58
}
1091
1092
static void filter_frame_on_unit(const RestorationTileLimits *limits,
1093
                                 const AV1PixelRect *tile_rect,
1094
                                 int rest_unit_idx, void *priv, int32_t *tmpbuf,
1095
90
                                 RestorationLineBuffers *rlbs) {
1096
90
  FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
1097
90
  const RestorationInfo *rsi = ctxt->rsi;
1098
1099
90
  av1_loop_restoration_filter_unit(
1100
90
      limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs, tile_rect,
1101
90
      ctxt->tile_stripe0, ctxt->ss_x, ctxt->ss_y, ctxt->highbd, ctxt->bit_depth,
1102
90
      ctxt->data8, ctxt->data_stride, ctxt->dst8, ctxt->dst_stride, tmpbuf,
1103
90
      rsi->optimized_lr);
1104
90
}
1105
1106
void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
1107
                                            YV12_BUFFER_CONFIG *frame,
1108
                                            AV1_COMMON *cm, int optimized_lr,
1109
2
                                            int num_planes) {
1110
2
  const SequenceHeader *const seq_params = cm->seq_params;
1111
2
  const int bit_depth = seq_params->bit_depth;
1112
2
  const int highbd = seq_params->use_highbitdepth;
1113
2
  lr_ctxt->dst = &cm->rst_frame;
1114
1115
2
  const int frame_width = frame->crop_widths[0];
1116
2
  const int frame_height = frame->crop_heights[0];
1117
2
  if (aom_realloc_frame_buffer(
1118
2
          lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
1119
2
          seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
1120
2
          cm->features.byte_alignment, NULL, NULL, NULL, 0) < 0)
1121
0
    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
1122
0
                       "Failed to allocate restoration dst buffer");
1123
1124
2
  lr_ctxt->on_rest_unit = filter_frame_on_unit;
1125
2
  lr_ctxt->frame = frame;
1126
8
  for (int plane = 0; plane < num_planes; ++plane) {
1127
6
    RestorationInfo *rsi = &cm->rst_info[plane];
1128
6
    RestorationType rtype = rsi->frame_restoration_type;
1129
6
    rsi->optimized_lr = optimized_lr;
1130
1131
6
    if (rtype == RESTORE_NONE) {
1132
0
      continue;
1133
0
    }
1134
1135
6
    const int is_uv = plane > 0;
1136
6
    const int plane_width = frame->crop_widths[is_uv];
1137
6
    const int plane_height = frame->crop_heights[is_uv];
1138
6
    FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
1139
1140
6
    av1_extend_frame(frame->buffers[plane], plane_width, plane_height,
1141
6
                     frame->strides[is_uv], RESTORATION_BORDER,
1142
6
                     RESTORATION_BORDER, highbd);
1143
1144
6
    lr_plane_ctxt->rsi = rsi;
1145
6
    lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
1146
6
    lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
1147
6
    lr_plane_ctxt->highbd = highbd;
1148
6
    lr_plane_ctxt->bit_depth = bit_depth;
1149
6
    lr_plane_ctxt->data8 = frame->buffers[plane];
1150
6
    lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
1151
6
    lr_plane_ctxt->data_stride = frame->strides[is_uv];
1152
6
    lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
1153
6
    lr_plane_ctxt->tile_rect = av1_whole_frame_rect(cm, is_uv);
1154
6
    lr_plane_ctxt->tile_stripe0 = 0;
1155
6
  }
1156
2
}
1157
1158
void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
1159
2
                                      AV1_COMMON *cm, int num_planes) {
1160
2
  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
1161
2
                           YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
1162
2
                           int vstart, int vend);
1163
2
  static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y,
1164
2
                                         aom_yv12_partial_coloc_copy_u,
1165
2
                                         aom_yv12_partial_coloc_copy_v };
1166
2
  assert(num_planes <= 3);
1167
8
  for (int plane = 0; plane < num_planes; ++plane) {
1168
6
    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
1169
6
    AV1PixelRect tile_rect = loop_rest_ctxt->ctxt[plane].tile_rect;
1170
6
    copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, tile_rect.left,
1171
6
                     tile_rect.right, tile_rect.top, tile_rect.bottom);
1172
6
  }
1173
2
}
1174
1175
// Visits every restoration unit of every plane that has restoration enabled,
// using the visitor stored in lr_ctxt and the scratch buffers owned by cm.
static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
                                        int num_planes) {
  FilterFrameCtxt *const plane_ctxts = lr_ctxt->ctxt;

  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
      FilterFrameCtxt *const plane_ctxt = &plane_ctxts[plane];
      av1_foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
                                     plane_ctxt, &plane_ctxt->tile_rect,
                                     cm->rst_tmpbuf, cm->rlbs);
    }
  }
}
1189
1190
void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
1191
                                       AV1_COMMON *cm, int optimized_lr,
1192
2
                                       void *lr_ctxt) {
1193
2
  assert(!cm->features.all_lossless);
1194
2
  const int num_planes = av1_num_planes(cm);
1195
1196
2
  AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
1197
1198
2
  av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
1199
2
                                         optimized_lr, num_planes);
1200
1201
2
  foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes);
1202
1203
2
  av1_loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes);
1204
2
}
1205
1206
void av1_foreach_rest_unit_in_row(
1207
    RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
1208
    rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
1209
    int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
1210
    void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
1211
    sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
1212
6
    struct AV1LrSyncData *const lr_sync) {
1213
6
  const int tile_w = tile_rect->right - tile_rect->left;
1214
6
  const int ext_size = unit_size * 3 / 2;
1215
6
  int x0 = 0, j = 0;
1216
96
  while (x0 < tile_w) {
1217
90
    int remaining_w = tile_w - x0;
1218
90
    int w = (remaining_w < ext_size) ? remaining_w : unit_size;
1219
1220
90
    limits->h_start = tile_rect->left + x0;
1221
90
    limits->h_end = tile_rect->left + x0 + w;
1222
90
    assert(limits->h_end <= tile_rect->right);
1223
1224
90
    const int unit_idx = unit_idx0 + row_number * hunits_per_tile + j;
1225
1226
    // No sync for even numbered rows
1227
    // For odd numbered rows, Loop Restoration of current block requires the LR
1228
    // of top-right and bottom-right blocks to be completed
1229
1230
    // top-right sync
1231
90
    on_sync_read(lr_sync, row_number, j, plane);
1232
90
    if ((row_number + 1) < vunits_per_tile)
1233
      // bottom-right sync
1234
0
      on_sync_read(lr_sync, row_number + 2, j, plane);
1235
1236
90
    on_rest_unit(limits, tile_rect, unit_idx, priv, tmpbuf, rlbs);
1237
1238
90
    on_sync_write(lr_sync, row_number, j, hunits_per_tile, plane);
1239
1240
90
    x0 += w;
1241
90
    ++j;
1242
90
  }
1243
6
}
1244
1245
90
// No-op read-synchronisation hook for single-threaded loop restoration; all
// arguments are ignored.
void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
  (void)plane;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1251
1252
// No-op write-synchronisation hook for single-threaded loop restoration; all
// arguments are ignored.
void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
                             const int sb_cols, int plane) {
  (void)plane;
  (void)sb_cols;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1260
1261
static void foreach_rest_unit_in_tile(
1262
    const AV1PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols,
1263
    int hunits_per_tile, int vunits_per_tile, int units_per_tile, int unit_size,
1264
    int ss_y, int plane, rest_unit_visitor_t on_rest_unit, void *priv,
1265
6
    int32_t *tmpbuf, RestorationLineBuffers *rlbs) {
1266
6
  const int tile_h = tile_rect->bottom - tile_rect->top;
1267
6
  const int ext_size = unit_size * 3 / 2;
1268
1269
6
  const int tile_idx = tile_col + tile_row * tile_cols;
1270
6
  const int unit_idx0 = tile_idx * units_per_tile;
1271
1272
6
  int y0 = 0, i = 0;
1273
12
  while (y0 < tile_h) {
1274
6
    int remaining_h = tile_h - y0;
1275
6
    int h = (remaining_h < ext_size) ? remaining_h : unit_size;
1276
1277
6
    RestorationTileLimits limits;
1278
6
    limits.v_start = tile_rect->top + y0;
1279
6
    limits.v_end = tile_rect->top + y0 + h;
1280
6
    assert(limits.v_end <= tile_rect->bottom);
1281
    // Offset the tile upwards to align with the restoration processing stripe
1282
6
    const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
1283
6
    limits.v_start = AOMMAX(tile_rect->top, limits.v_start - voffset);
1284
6
    if (limits.v_end < tile_rect->bottom) limits.v_end -= voffset;
1285
1286
6
    av1_foreach_rest_unit_in_row(
1287
6
        &limits, tile_rect, on_rest_unit, i, unit_size, unit_idx0,
1288
6
        hunits_per_tile, vunits_per_tile, plane, priv, tmpbuf, rlbs,
1289
6
        av1_lr_sync_read_dummy, av1_lr_sync_write_dummy, NULL);
1290
1291
6
    y0 += h;
1292
6
    ++i;
1293
6
  }
1294
6
}
1295
1296
void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
1297
                                    rest_unit_visitor_t on_rest_unit,
1298
                                    void *priv, AV1PixelRect *tile_rect,
1299
                                    int32_t *tmpbuf,
1300
6
                                    RestorationLineBuffers *rlbs) {
1301
6
  const int is_uv = plane > 0;
1302
6
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
1303
1304
6
  const RestorationInfo *rsi = &cm->rst_info[plane];
1305
1306
6
  foreach_rest_unit_in_tile(tile_rect, LR_TILE_ROW, LR_TILE_COL, LR_TILE_COLS,
1307
6
                            rsi->horz_units_per_tile, rsi->vert_units_per_tile,
1308
6
                            rsi->units_per_tile, rsi->restoration_unit_size,
1309
6
                            ss_y, plane, on_rest_unit, priv, tmpbuf, rlbs);
1310
6
}
1311
1312
// Computes the half-open range of restoration-unit columns [*rcol0, *rcol1)
// and rows [*rrow0, *rrow1) whose top-left corner lies inside the superblock
// at (mi_row, mi_col) on the given plane.
//
// Returns 0 without writing the outputs when bsize is not the sequence
// superblock size or when the plane's frame restoration type is RESTORE_NONE.
// Otherwise writes all four outputs and returns nonzero iff both ranges are
// non-empty.
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
                                       int *rcol0, int *rcol1, int *rrow0,
                                       int *rrow1) {
  assert(rcol0 && rcol1 && rrow0 && rrow1);

  if (bsize != cm->seq_params->sb_size) return 0;

  const RestorationInfo *const rsi = &cm->rst_info[plane];
  if (rsi->frame_restoration_type == RESTORE_NONE) return 0;

  assert(!cm->features.all_lossless);

  const int is_uv = plane > 0;
  const int unit_size = rsi->restoration_unit_size;

  // The whole frame is treated as a single tile, so the superblock's mi
  // position relative to the tile origin is simply (mi_row, mi_col).
  const AV1PixelRect frame_rect = av1_whole_frame_rect(cm, is_uv);
  const int frame_w = frame_rect.right - frame_rect.left;
  const int frame_h = frame_rect.bottom - frame_rect.top;

  // Number of restoration units actually present in each direction; used to
  // clamp the exclusive end indices below.
  const int max_cols = av1_lr_count_units_in_tile(unit_size, frame_w);
  const int max_rows = av1_lr_count_units_in_tile(unit_size, frame_h);

  // Exclusive end of the superblock, in mi units.
  const int mi_row_end = mi_row + mi_size_high[bsize];
  const int mi_col_end = mi_col + mi_size_wide[bsize];

  // Pixel size of one mi unit on this plane.
  const int ss_x = is_uv && cm->seq_params->subsampling_x;
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
  const int mi_px_x = MI_SIZE >> ss_x;
  const int mi_px_y = MI_SIZE >> ss_y;

  // With superres, restoration units are laid out on the upscaled frame while
  // mi positions refer to the downscaled one. For mi offset m, superres
  // denominator D and numerator N, the upscaled pixel offset u satisfies
  //
  //   MI_SIZE * m = (N / D) * u   =>   u = D * MI_SIZE * m / N
  //
  // so horizontally the numerator is scaled by D and the divisor by N. The
  // vertical direction is never scaled.
  const int superres = av1_superres_scaled(cm);
  const int num_x =
      superres ? mi_px_x * cm->superres_scale_denominator : mi_px_x;
  const int num_y = mi_px_y;
  const int den_x = superres ? unit_size * SCALE_NUMERATOR : unit_size;
  const int den_y = unit_size;

  // Adding (divisor - 1) before dividing rounds the quotient up: a superblock
  // that starts at unit position 10.1 first contains the unit with index 11.
  const int round_x = den_x - 1;
  const int round_y = den_y - 1;

  const int col_first = (mi_col * num_x + round_x) / den_x;
  const int row_first = (mi_row * num_y + round_y) / den_y;

  // Same computation for the superblock below/right. At the right or bottom
  // edge that unit may not exist, so clamp to the available unit count.
  const int col_limit = (mi_col_end * num_x + round_x) / den_x;
  const int row_limit = (mi_row_end * num_y + round_y) / den_y;

  *rcol0 = col_first;
  *rrow0 = row_first;
  *rcol1 = AOMMIN(col_limit, max_cols);
  *rrow1 = AOMMIN(row_limit, max_rows);

  return *rcol0 < *rcol1 && *rrow0 < *rrow1;
}
1386
1387
// Pads each of `height` rows horizontally by replicating its edge pixels:
// the `extend` pixels before the row take the value of the first pixel, and
// the `extend` pixels after the row take the value of the last pixel.
//
// `width` and `extend` are counted in pixels. `stride` is added to the byte
// pointer to step between rows. When use_highbitdepth is nonzero the rows are
// accessed as uint16_t samples, otherwise as bytes.
static void extend_lines(uint8_t *buf, int width, int height, int stride,
                         int extend, int use_highbitdepth) {
  uint8_t *row = buf;
  for (int y = 0; y < height; ++y, row += stride) {
    if (!use_highbitdepth) {
      memset(row - extend, row[0], extend);
      memset(row + width, row[width - 1], extend);
      continue;
    }

    uint16_t *const row16 = (uint16_t *)row;
    aom_memset16(row16 - extend, row16[0], extend);
    aom_memset16(row16 + width, row16[width - 1], extend);
  }
}
1402
1403
static void save_deblock_boundary_lines(
1404
    const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
1405
    int stripe, int use_highbd, int is_above,
1406
0
    RestorationStripeBoundaries *boundaries) {
1407
0
  const int is_uv = plane > 0;
1408
0
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
1409
0
  const int src_stride = frame->strides[is_uv] << use_highbd;
1410
0
  const uint8_t *src_rows = src_buf + row * src_stride;
1411
1412
0
  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
1413
0
                               : boundaries->stripe_boundary_below;
1414
0
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
1415
0
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
1416
0
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
1417
1418
  // There is a rare case in which a processing stripe can end 1px above the
1419
  // crop border. In this case, we do want to use deblocked pixels from below
1420
  // the stripe (hence why we ended up in this function), but instead of
1421
  // fetching 2 "below" rows we need to fetch one and duplicate it.
1422
  // This is equivalent to clamping the sample locations against the crop border
1423
0
  const int lines_to_save =
1424
0
      AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row);
1425
0
  assert(lines_to_save == 1 || lines_to_save == 2);
1426
1427
0
  int upscaled_width;
1428
0
  int line_bytes;
1429
0
  if (av1_superres_scaled(cm)) {
1430
0
    const int ss_x = is_uv && cm->seq_params->subsampling_x;
1431
0
    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
1432
0
    line_bytes = upscaled_width << use_highbd;
1433
0
    if (use_highbd)
1434
0
      av1_upscale_normative_rows(
1435
0
          cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv],
1436
0
          CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride,
1437
0
          plane, lines_to_save);
1438
0
    else
1439
0
      av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows,
1440
0
                                 boundaries->stripe_boundary_stride, plane,
1441
0
                                 lines_to_save);
1442
0
  } else {
1443
0
    upscaled_width = frame->crop_widths[is_uv];
1444
0
    line_bytes = upscaled_width << use_highbd;
1445
0
    for (int i = 0; i < lines_to_save; i++) {
1446
0
      memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride,
1447
0
             line_bytes);
1448
0
    }
1449
0
  }
1450
  // If we only saved one line, then copy it into the second line buffer
1451
0
  if (lines_to_save == 1)
1452
0
    memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
1453
1454
0
  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
1455
0
               RESTORATION_EXTRA_HORZ, use_highbd);
1456
0
}
1457
1458
static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1459
                                     const AV1_COMMON *cm, int plane, int row,
1460
                                     int stripe, int use_highbd, int is_above,
1461
0
                                     RestorationStripeBoundaries *boundaries) {
1462
0
  const int is_uv = plane > 0;
1463
0
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
1464
0
  const int src_stride = frame->strides[is_uv] << use_highbd;
1465
0
  const uint8_t *src_rows = src_buf + row * src_stride;
1466
1467
0
  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
1468
0
                               : boundaries->stripe_boundary_below;
1469
0
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
1470
0
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
1471
0
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
1472
0
  const int src_width = frame->crop_widths[is_uv];
1473
1474
  // At the point where this function is called, we've already applied
1475
  // superres. So we don't need to extend the lines here, we can just
1476
  // pull directly from the topmost row of the upscaled frame.
1477
0
  const int ss_x = is_uv && cm->seq_params->subsampling_x;
1478
0
  const int upscaled_width = av1_superres_scaled(cm)
1479
0
                                 ? (cm->superres_upscaled_width + ss_x) >> ss_x
1480
0
                                 : src_width;
1481
0
  const int line_bytes = upscaled_width << use_highbd;
1482
0
  for (int i = 0; i < RESTORATION_CTX_VERT; i++) {
1483
    // Copy the line at 'row' into both context lines. This is because
1484
    // we want to (effectively) extend the outermost row of CDEF data
1485
    // from this tile to produce a border, rather than using deblocked
1486
    // pixels from the tile above/below.
1487
0
    memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
1488
0
  }
1489
0
  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
1490
0
               RESTORATION_EXTRA_HORZ, use_highbd);
1491
0
}
1492
1493
static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1494
                                         int use_highbd, int plane,
1495
0
                                         AV1_COMMON *cm, int after_cdef) {
1496
0
  const int is_uv = plane > 0;
1497
0
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
1498
0
  const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1499
0
  const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
1500
1501
  // Get the tile rectangle, with height rounded up to the next multiple of 8
1502
  // luma pixels (only relevant for the bottom tile of the frame)
1503
0
  const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
1504
0
  const int stripe0 = 0;
1505
1506
0
  RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries;
1507
1508
0
  const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);
1509
1510
0
  int tile_stripe;
1511
0
  for (tile_stripe = 0;; ++tile_stripe) {
1512
0
    const int rel_y0 = AOMMAX(0, tile_stripe * stripe_height - stripe_off);
1513
0
    const int y0 = tile_rect.top + rel_y0;
1514
0
    if (y0 >= tile_rect.bottom) break;
1515
1516
0
    const int rel_y1 = (tile_stripe + 1) * stripe_height - stripe_off;
1517
0
    const int y1 = AOMMIN(tile_rect.top + rel_y1, tile_rect.bottom);
1518
1519
0
    const int frame_stripe = stripe0 + tile_stripe;
1520
1521
    // In this case, we should only use CDEF pixels at the top
1522
    // and bottom of the frame as a whole; internal tile boundaries
1523
    // can use deblocked pixels from adjacent tiles for context.
1524
0
    const int use_deblock_above = (frame_stripe > 0);
1525
0
    const int use_deblock_below = (y1 < plane_height);
1526
1527
0
    if (!after_cdef) {
1528
      // Save deblocked context where needed.
1529
0
      if (use_deblock_above) {
1530
0
        save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
1531
0
                                    frame_stripe, use_highbd, 1, boundaries);
1532
0
      }
1533
0
      if (use_deblock_below) {
1534
0
        save_deblock_boundary_lines(frame, cm, plane, y1, frame_stripe,
1535
0
                                    use_highbd, 0, boundaries);
1536
0
      }
1537
0
    } else {
1538
      // Save CDEF context where needed. Note that we need to save the CDEF
1539
      // context for a particular boundary iff we *didn't* save deblocked
1540
      // context for that boundary.
1541
      //
1542
      // In addition, we need to save copies of the outermost line within
1543
      // the tile, rather than using data from outside the tile.
1544
0
      if (!use_deblock_above) {
1545
0
        save_cdef_boundary_lines(frame, cm, plane, y0, frame_stripe, use_highbd,
1546
0
                                 1, boundaries);
1547
0
      }
1548
0
      if (!use_deblock_below) {
1549
0
        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, frame_stripe,
1550
0
                                 use_highbd, 0, boundaries);
1551
0
      }
1552
0
    }
1553
0
  }
1554
0
}
1555
1556
// For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
1557
// lines to be used as boundary in the loop restoration process. The
1558
// lines are saved in cm->rst_info[plane].boundaries (stripe_boundary_above and
// stripe_boundary_below)
1559
void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1560
0
                                              AV1_COMMON *cm, int after_cdef) {
1561
0
  const int num_planes = av1_num_planes(cm);
1562
0
  const int use_highbd = cm->seq_params->use_highbitdepth;
1563
0
  for (int p = 0; p < num_planes; ++p) {
1564
0
    save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef);
1565
0
  }
1566
0
}