Coverage Report

Created: 2025-07-16 07:53

/src/aom/av1/common/restoration.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 *
11
 */
12
13
#include <math.h>
14
#include <stddef.h>
15
16
#include "config/aom_config.h"
17
#include "config/aom_scale_rtcd.h"
18
19
#include "aom/internal/aom_codec_internal.h"
20
#include "aom_mem/aom_mem.h"
21
#include "aom_dsp/aom_dsp_common.h"
22
#include "aom_mem/aom_mem.h"
23
#include "aom_ports/mem.h"
24
#include "aom_util/aom_pthread.h"
25
26
#include "av1/common/av1_common_int.h"
27
#include "av1/common/convolve.h"
28
#include "av1/common/enums.h"
29
#include "av1/common/resize.h"
30
#include "av1/common/restoration.h"
31
#include "av1/common/thread_common.h"
32
33
// The 's' values are calculated based on original 'r' and 'e' values in the
34
// spec using GenSgrprojVtable().
35
// Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
36
// Each entry is { { r[0], r[1] }, { s[0], s[1] } }, i.e. the box radius and
// the matching scale factor for radius index 0 and radius index 1.
// A radius of 0 disables that filter and is paired with the sentinel s = -1.
const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
  { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
  { { 2, 1 }, { 93, 1618 } },  { { 2, 1 }, { 80, 1438 } },
  { { 2, 1 }, { 70, 1295 } },  { { 2, 1 }, { 58, 1177 } },
  { { 2, 1 }, { 47, 1079 } },  { { 2, 1 }, { 37, 996 } },
  { { 2, 1 }, { 30, 925 } },   { { 2, 1 }, { 25, 863 } },
  { { 0, 1 }, { -1, 2589 } },  { { 0, 1 }, { -1, 1618 } },
  { { 0, 1 }, { -1, 1177 } },  { { 0, 1 }, { -1, 925 } },
  { { 2, 0 }, { 56, -1 } },    { { 2, 0 }, { 22, -1 } },
};
46
47
void av1_get_upsampled_plane_size(const AV1_COMMON *cm, int is_uv, int *plane_w,
48
62.9k
                                  int *plane_h) {
49
62.9k
  int ss_x = is_uv && cm->seq_params->subsampling_x;
50
62.9k
  int ss_y = is_uv && cm->seq_params->subsampling_y;
51
62.9k
  *plane_w = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
52
62.9k
  *plane_h = ROUND_POWER_OF_TWO(cm->height, ss_y);
53
62.9k
}
54
55
// Count horizontal or vertical units in a plane (use a width or height for
56
// plane_size, respectively). We basically want to divide the plane size by the
57
// size of a restoration unit. Rather than rounding up unconditionally as you
58
// might expect, we round to nearest, which models the way a right or bottom
59
// restoration unit can extend to up to 150% its normal width or height.
60
//
61
// The max with 1 is to deal with small frames, which may be smaller than
62
// half of an LR unit in size.
63
71.9k
// Returns the number of restoration units needed to span plane_size pixels
// when each unit nominally covers unit_size pixels.
//
// The quotient is rounded to nearest (half a unit is added before dividing),
// not rounded up, and the result is never less than 1.
int av1_lr_count_units(int unit_size, int plane_size) {
  // Evaluate the division once; the clamp below only compares the result.
  const int units = (plane_size + (unit_size >> 1)) / unit_size;
  return units > 1 ? units : 1;
}
66
67
// (Re)allocates rsi->unit_info for the plane selected by is_uv.
//
// The unit grid is derived from the plane size reported by
// av1_get_upsampled_plane_size() and rsi->restoration_unit_size; the counts
// are stored in rsi->horz_units, rsi->vert_units and rsi->num_rest_units.
// Any previous rsi->unit_info is freed first. Allocation failure is reported
// through CHECK_MEM_ERROR on cm.
void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
                                  int is_uv) {
  int plane_w, plane_h;
  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);

  const int unit_size = rsi->restoration_unit_size;
  const int horz_units = av1_lr_count_units(unit_size, plane_w);
  const int vert_units = av1_lr_count_units(unit_size, plane_h);

  rsi->num_rest_units = horz_units * vert_units;
  rsi->horz_units = horz_units;
  rsi->vert_units = vert_units;

  aom_free(rsi->unit_info);
  CHECK_MEM_ERROR(cm, rsi->unit_info,
                  (RestorationUnitInfo *)aom_memalign(
                      16, sizeof(*rsi->unit_info) * rsi->num_rest_units));
}
85
86
67.6k
// Releases rst_info->unit_info and clears the pointer so that a later free or
// reallocation does not see a stale value.
void av1_free_restoration_struct(RestorationInfo *rst_info) {
  aom_free(rst_info->unit_info);
  rst_info->unit_info = NULL;
}
90
91
#if 0
// Disabled generator for the scale values stored in av1_sgr_params.
// NOTE(review): this code reads params->e[], a field that is not visible in
// this part of the file - confirm it exists before re-enabling the block.

// Pair of values for each sgrproj parameter:
// Index 0 corresponds to r[0], e[0]
// Index 1 corresponds to r[1], e[1]
int sgrproj_mtable[SGRPROJ_PARAMS][2];

static void GenSgrprojVtable(void) {
  for (int i = 0; i < SGRPROJ_PARAMS; ++i) {
    const sgr_params_type *const params = &av1_sgr_params[i];
    for (int j = 0; j < 2; ++j) {
      const int e = params->e[j];
      const int r = params->r[j];
      if (r == 0) {                 // filter is disabled
        sgrproj_mtable[i][j] = -1;  // mark invalid
      } else {                      // filter is enabled
        const int n = (2 * r + 1) * (2 * r + 1);
        const int n2e = n * n * e;
        assert(n2e != 0);
        // Rounded division: (2^SGRPROJ_MTABLE_BITS) / (n^2 * e).
        sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
      }
    }
  }
}
#endif
115
116
22.5k
// Precalculation hook for loop restoration. Currently a no-op: the only
// work it would do (regenerating the sgrproj table) is compiled out.
void av1_loop_restoration_precal(void) {
#if 0
  GenSgrprojVtable();
#endif
}
121
122
// Pads an 8-bit plane in place by replicating its edge pixels.
//
//   data        - pointer to the top-left pixel of the width x height area.
//   stride      - distance, in pixels, between vertically adjacent pixels.
//   border_horz - number of columns to fill on the left and on the right.
//   border_vert - number of rows to fill above and below.
//
// The caller must provide writable memory for the border on every side.
static void extend_frame_lowbd(uint8_t *data, int width, int height,
                               ptrdiff_t stride, int border_horz,
                               int border_vert) {
  // Left/right: repeat the first and the last pixel of every row.
  for (int i = 0; i < height; ++i) {
    uint8_t *const row = data + i * stride;
    memset(row - border_horz, row[0], border_horz);
    memset(row + width, row[width - 1], border_horz);
  }

  // Top/bottom: repeat the first and the last row. The rows copied here
  // already include their horizontal border, so the corners are filled too.
  uint8_t *const ext = data - border_horz;
  const size_t ext_width = (size_t)(width + 2 * border_horz);
  for (int i = -border_vert; i < 0; ++i) {
    memcpy(ext + i * stride, ext, ext_width);
  }
  for (int i = height; i < height + border_vert; ++i) {
    memcpy(ext + i * stride, ext + (height - 1) * stride, ext_width);
  }
}
141
142
#if CONFIG_AV1_HIGHBITDEPTH
143
// 16-bit counterpart of extend_frame_lowbd(): pads a plane in place by
// replicating its edge pixels.
//
//   data        - pointer to the top-left pixel of the width x height area.
//   stride      - distance, in pixels, between vertically adjacent pixels.
//   border_horz - number of columns to fill on the left and on the right.
//   border_vert - number of rows to fill above and below.
//
// The caller must provide writable memory for the border on every side.
static void extend_frame_highbd(uint16_t *data, int width, int height,
                                ptrdiff_t stride, int border_horz,
                                int border_vert) {
  // Left/right: repeat the first and the last pixel of every row. memset()
  // cannot be used here because each pixel is two bytes wide.
  for (int i = 0; i < height; ++i) {
    uint16_t *const row = data + i * stride;
    for (int j = -border_horz; j < 0; ++j) row[j] = row[0];
    for (int j = width; j < width + border_horz; ++j) row[j] = row[width - 1];
  }

  // Top/bottom: repeat the first and the last row. The rows copied here
  // already include their horizontal border, so the corners are filled too.
  uint16_t *const ext = data - border_horz;
  const size_t ext_bytes = (width + 2 * border_horz) * sizeof(uint16_t);
  for (int i = -border_vert; i < 0; ++i) {
    memcpy(ext + i * stride, ext, ext_bytes);
  }
  for (int i = height; i < height + border_vert; ++i) {
    memcpy(ext + i * stride, ext + (height - 1) * stride, ext_bytes);
  }
}
163
164
// Copies a width x height rectangle of 16-bit pixels from src to dst.
// Strides are given in pixels; the two rectangles must not overlap.
static void copy_rest_unit_highbd(int width, int height, const uint16_t *src,
                                  int src_stride, uint16_t *dst,
                                  int dst_stride) {
  for (int i = 0; i < height; ++i) {
    memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst));
  }
}
170
#endif
171
172
// Pads a plane in place by replicating its edge pixels, dispatching on the
// sample width.
//
//   data   - top-left pixel of the plane; when highbd is non-zero it is
//            converted with CONVERT_TO_SHORTPTR before use.
//   stride - row pitch in pixels.
//   highbd - non-zero selects the 16-bit path; the parameter is ignored when
//            CONFIG_AV1_HIGHBITDEPTH is disabled.
void av1_extend_frame(uint8_t *data, int width, int height, int stride,
                      int border_horz, int border_vert, int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (highbd) {
    extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
                        border_horz, border_vert);
    return;
  }
#endif
  (void)highbd;
  extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
}
184
185
// Copies a width x height rectangle of 8-bit pixels from src to dst.
// Strides are given in pixels; the two rectangles must not overlap.
static void copy_rest_unit_lowbd(int width, int height, const uint8_t *src,
                                 int src_stride, uint8_t *dst, int dst_stride) {
  for (int i = 0; i < height; ++i) {
    memcpy(dst + i * dst_stride, src + i * src_stride, width);
  }
}
190
191
// Copies a width x height rectangle from src to dst, dispatching on the
// sample width. When highbd is non-zero both pointers are converted with
// CONVERT_TO_SHORTPTR; the parameter is ignored when CONFIG_AV1_HIGHBITDEPTH
// is disabled.
static void copy_rest_unit(int width, int height, const uint8_t *src,
                           int src_stride, uint8_t *dst, int dst_stride,
                           int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (highbd) {
    copy_rest_unit_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
                          CONVERT_TO_SHORTPTR(dst), dst_stride);
    return;
  }
#endif
  (void)highbd;
  copy_rest_unit_lowbd(width, height, src, src_stride, dst, dst_stride);
}
204
205
552k
// Selects the pointer handed to memcpy(): in high-bitdepth mode the frame
// pointer is first converted with CONVERT_TO_SHORTPTR, otherwise it is used
// unchanged.
#define REAL_PTR(is_hbd, ptr) \
  ((is_hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(ptr) : (ptr))
206
207
// With striped loop restoration, the filtering for each 64-pixel stripe gets
208
// most of its input from the output of CDEF (stored in data8), but we need to
209
// fill out a border of 3 pixels above/below the stripe according to the
210
// following rules:
211
//
212
// * At the top and bottom of the frame, we copy the outermost row of CDEF
213
//   pixels three times. This extension is done by a call to av1_extend_frame()
214
//   at the start of the loop restoration process, so the value of
215
//   copy_above/copy_below doesn't strictly matter.
216
//
217
// * All other boundaries are stripe boundaries within the frame. In that case,
218
//   we take 2 rows of deblocked pixels and extend them to 3 rows of context.
219
static void get_stripe_boundary_info(const RestorationTileLimits *limits,
220
                                     int plane_w, int plane_h, int ss_y,
221
36.8k
                                     int *copy_above, int *copy_below) {
222
36.8k
  (void)plane_w;
223
224
36.8k
  *copy_above = 1;
225
36.8k
  *copy_below = 1;
226
227
36.8k
  const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
228
36.8k
  const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
229
230
36.8k
  const int first_stripe_in_plane = (limits->v_start == 0);
231
36.8k
  const int this_stripe_height =
232
36.8k
      full_stripe_height - (first_stripe_in_plane ? runit_offset : 0);
233
36.8k
  const int last_stripe_in_plane =
234
36.8k
      (limits->v_start + this_stripe_height >= plane_h);
235
236
36.8k
  if (first_stripe_in_plane) *copy_above = 0;
237
36.8k
  if (last_stripe_in_plane) *copy_below = 0;
238
36.8k
}
239
240
// Overwrite the border pixels around a processing stripe so that the conditions
241
// listed above get_stripe_boundary_info() are preserved.
242
// We save the pixels which get overwritten into a temporary buffer, so that
243
// they can be restored by restore_processing_stripe_boundary() after we've
244
// processed the stripe.
245
//
246
// limits gives the rectangular limits of the remaining stripes for the current
247
// restoration unit. rsb is the stored stripe boundaries (taken from either
248
// deblock or CDEF output as necessary).
249
static void setup_processing_stripe_boundary(
250
    const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
251
    int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
252
36.8k
    RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
253
  // Offsets within the line buffers. The buffer logically starts at column
254
  // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
255
  // has column x0 in the buffer.
256
36.8k
  const int buf_stride = rsb->stripe_boundary_stride;
257
36.8k
  const int buf_x0_off = limits->h_start;
258
36.8k
  const int line_width =
259
36.8k
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
260
36.8k
  const int line_size = line_width << use_highbd;
261
262
36.8k
  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
263
264
  // Replace RESTORATION_BORDER pixels above the top of the stripe
265
  // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
266
  // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
267
  // duplicating the topmost of the 2 lines (see the AOMMAX call when
268
  // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
269
36.8k
  if (!opt) {
270
26.6k
    if (copy_above) {
271
19.8k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
272
273
79.1k
      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
274
59.3k
        const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
275
59.3k
        const int buf_off = buf_x0_off + buf_row * buf_stride;
276
59.3k
        const uint8_t *buf =
277
59.3k
            rsb->stripe_boundary_above + (buf_off << use_highbd);
278
59.3k
        uint8_t *dst8 = data8_tl + i * data_stride;
279
        // Save old pixels, then replace with data from stripe_boundary_above
280
59.3k
        memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
281
59.3k
               REAL_PTR(use_highbd, dst8), line_size);
282
59.3k
        memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
283
59.3k
      }
284
19.8k
    }
285
286
    // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
287
    // The second buffer row is repeated, so src_row gets the values 0, 1, 1
288
    // for i = 0, 1, 2.
289
26.6k
    if (copy_below) {
290
17.7k
      const int stripe_end = limits->v_start + h;
291
17.7k
      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
292
293
71.0k
      for (int i = 0; i < RESTORATION_BORDER; ++i) {
294
53.3k
        const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
295
53.3k
        const int buf_off = buf_x0_off + buf_row * buf_stride;
296
53.3k
        const uint8_t *src =
297
53.3k
            rsb->stripe_boundary_below + (buf_off << use_highbd);
298
299
53.3k
        uint8_t *dst8 = data8_bl + i * data_stride;
300
        // Save old pixels, then replace with data from stripe_boundary_below
301
53.3k
        memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
302
53.3k
        memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
303
53.3k
      }
304
17.7k
    }
305
26.6k
  } else {
306
10.2k
    if (copy_above) {
307
9.69k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
308
309
      // Only save and overwrite i=-RESTORATION_BORDER line.
310
9.69k
      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
311
      // Save old pixels, then replace with data from stripe_boundary_above
312
9.69k
      memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
313
9.69k
      memcpy(REAL_PTR(use_highbd, dst8),
314
9.69k
             REAL_PTR(use_highbd,
315
9.69k
                      data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
316
9.69k
             line_size);
317
9.69k
    }
318
319
10.2k
    if (copy_below) {
320
9.67k
      const int stripe_end = limits->v_start + h;
321
9.67k
      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
322
323
      // Only save and overwrite i=2 line.
324
9.67k
      uint8_t *dst8 = data8_bl + 2 * data_stride;
325
      // Save old pixels, then replace with data from stripe_boundary_below
326
9.67k
      memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
327
9.67k
      memcpy(REAL_PTR(use_highbd, dst8),
328
9.67k
             REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
329
9.67k
    }
330
10.2k
  }
331
36.8k
}
332
333
// Once a processing stripe is finished, this function sets the boundary
334
// pixels which were overwritten by setup_processing_stripe_boundary()
335
// back to their original values
336
static void restore_processing_stripe_boundary(
337
    const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
338
    int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
339
36.8k
    int copy_below, int opt) {
340
36.8k
  const int line_width =
341
36.8k
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
342
36.8k
  const int line_size = line_width << use_highbd;
343
344
36.8k
  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
345
346
36.8k
  if (!opt) {
347
26.6k
    if (copy_above) {
348
19.8k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
349
79.3k
      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
350
59.5k
        uint8_t *dst8 = data8_tl + i * data_stride;
351
59.5k
        memcpy(REAL_PTR(use_highbd, dst8),
352
59.5k
               rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
353
59.5k
      }
354
19.8k
    }
355
356
26.6k
    if (copy_below) {
357
17.8k
      const int stripe_bottom = limits->v_start + h;
358
17.8k
      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
359
360
71.2k
      for (int i = 0; i < RESTORATION_BORDER; ++i) {
361
53.4k
        if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
362
363
53.4k
        uint8_t *dst8 = data8_bl + i * data_stride;
364
53.4k
        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
365
53.4k
      }
366
17.8k
    }
367
26.6k
  } else {
368
10.1k
    if (copy_above) {
369
9.67k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
370
371
      // Only restore i=-RESTORATION_BORDER line.
372
9.67k
      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
373
9.67k
      memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
374
9.67k
    }
375
376
10.1k
    if (copy_below) {
377
9.65k
      const int stripe_bottom = limits->v_start + h;
378
9.65k
      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
379
380
      // Only restore i=2 line.
381
9.65k
      if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
382
9.65k
        uint8_t *dst8 = data8_bl + 2 * data_stride;
383
9.65k
        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
384
9.65k
      }
385
9.65k
    }
386
10.1k
  }
387
36.8k
}
388
389
static void wiener_filter_stripe(const RestorationUnitInfo *rui,
390
                                 int stripe_width, int stripe_height,
391
                                 int procunit_width, const uint8_t *src,
392
                                 int src_stride, uint8_t *dst, int dst_stride,
393
                                 int32_t *tmpbuf, int bit_depth,
394
5.37k
                                 struct aom_internal_error_info *error_info) {
395
5.37k
  (void)tmpbuf;
396
5.37k
  (void)bit_depth;
397
5.37k
  (void)error_info;
398
5.37k
  assert(bit_depth == 8);
399
5.37k
  const WienerConvolveParams conv_params = get_conv_params_wiener(8);
400
401
11.3k
  for (int j = 0; j < stripe_width; j += procunit_width) {
402
6.01k
    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
403
6.01k
    const uint8_t *src_p = src + j;
404
6.01k
    uint8_t *dst_p = dst + j;
405
6.01k
    av1_wiener_convolve_add_src(
406
6.01k
        src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
407
6.01k
        rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
408
6.01k
  }
409
5.37k
}
410
411
/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
   over the input. The window is of size (2r + 1)x(2r + 1), and we
   specialize to r = 1 (boxsum1) and r = 2 (boxsum2). boxsum() asserts
   on any other value of r.

   Each loop follows the same format: We keep a window's worth of input
   in individual variables and select data out of that as appropriate.
*/
418
static void boxsum1(int32_t *src, int width, int height, int src_stride,
419
33.5k
                    int sqr, int32_t *dst, int dst_stride) {
420
33.5k
  int i, j, a, b, c;
421
33.5k
  assert(width > 2 * SGRPROJ_BORDER_HORZ);
422
33.5k
  assert(height > 2 * SGRPROJ_BORDER_VERT);
423
424
  // Vertical sum over 3-pixel regions, from src into dst.
425
33.5k
  if (!sqr) {
426
612k
    for (j = 0; j < width; ++j) {
427
595k
      a = src[j];
428
595k
      b = src[src_stride + j];
429
595k
      c = src[2 * src_stride + j];
430
431
595k
      dst[j] = a + b;
432
24.1M
      for (i = 1; i < height - 2; ++i) {
433
        // Loop invariant: At the start of each iteration,
434
        // a = src[(i - 1) * src_stride + j]
435
        // b = src[(i    ) * src_stride + j]
436
        // c = src[(i + 1) * src_stride + j]
437
23.5M
        dst[i * dst_stride + j] = a + b + c;
438
23.5M
        a = b;
439
23.5M
        b = c;
440
23.5M
        c = src[(i + 2) * src_stride + j];
441
23.5M
      }
442
595k
      dst[i * dst_stride + j] = a + b + c;
443
595k
      dst[(i + 1) * dst_stride + j] = b + c;
444
595k
    }
445
16.7k
  } else {
446
612k
    for (j = 0; j < width; ++j) {
447
596k
      a = src[j] * src[j];
448
596k
      b = src[src_stride + j] * src[src_stride + j];
449
596k
      c = src[2 * src_stride + j] * src[2 * src_stride + j];
450
451
596k
      dst[j] = a + b;
452
24.1M
      for (i = 1; i < height - 2; ++i) {
453
23.5M
        dst[i * dst_stride + j] = a + b + c;
454
23.5M
        a = b;
455
23.5M
        b = c;
456
23.5M
        c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
457
23.5M
      }
458
596k
      dst[i * dst_stride + j] = a + b + c;
459
596k
      dst[(i + 1) * dst_stride + j] = b + c;
460
596k
    }
461
16.7k
  }
462
463
  // Horizontal sum over 3-pixel regions of dst
464
1.59M
  for (i = 0; i < height; ++i) {
465
1.55M
    a = dst[i * dst_stride];
466
1.55M
    b = dst[i * dst_stride + 1];
467
1.55M
    c = dst[i * dst_stride + 2];
468
469
1.55M
    dst[i * dst_stride] = a + b;
470
47.2M
    for (j = 1; j < width - 2; ++j) {
471
      // Loop invariant: At the start of each iteration,
472
      // a = src[i * src_stride + (j - 1)]
473
      // b = src[i * src_stride + (j    )]
474
      // c = src[i * src_stride + (j + 1)]
475
45.7M
      dst[i * dst_stride + j] = a + b + c;
476
45.7M
      a = b;
477
45.7M
      b = c;
478
45.7M
      c = dst[i * dst_stride + (j + 2)];
479
45.7M
    }
480
1.55M
    dst[i * dst_stride + j] = a + b + c;
481
1.55M
    dst[i * dst_stride + (j + 1)] = b + c;
482
1.55M
  }
483
33.5k
}
484
485
static void boxsum2(int32_t *src, int width, int height, int src_stride,
486
30.9k
                    int sqr, int32_t *dst, int dst_stride) {
487
30.9k
  int i, j, a, b, c, d, e;
488
30.9k
  assert(width > 2 * SGRPROJ_BORDER_HORZ);
489
30.9k
  assert(height > 2 * SGRPROJ_BORDER_VERT);
490
491
  // Vertical sum over 5-pixel regions, from src into dst.
492
30.9k
  if (!sqr) {
493
552k
    for (j = 0; j < width; ++j) {
494
537k
      a = src[j];
495
537k
      b = src[src_stride + j];
496
537k
      c = src[2 * src_stride + j];
497
537k
      d = src[3 * src_stride + j];
498
537k
      e = src[4 * src_stride + j];
499
500
537k
      dst[j] = a + b + c;
501
537k
      dst[dst_stride + j] = a + b + c + d;
502
19.9M
      for (i = 2; i < height - 3; ++i) {
503
        // Loop invariant: At the start of each iteration,
504
        // a = src[(i - 2) * src_stride + j]
505
        // b = src[(i - 1) * src_stride + j]
506
        // c = src[(i    ) * src_stride + j]
507
        // d = src[(i + 1) * src_stride + j]
508
        // e = src[(i + 2) * src_stride + j]
509
19.3M
        dst[i * dst_stride + j] = a + b + c + d + e;
510
19.3M
        a = b;
511
19.3M
        b = c;
512
19.3M
        c = d;
513
19.3M
        d = e;
514
19.3M
        e = src[(i + 3) * src_stride + j];
515
19.3M
      }
516
537k
      dst[i * dst_stride + j] = a + b + c + d + e;
517
537k
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
518
537k
      dst[(i + 2) * dst_stride + j] = c + d + e;
519
537k
    }
520
15.4k
  } else {
521
552k
    for (j = 0; j < width; ++j) {
522
537k
      a = src[j] * src[j];
523
537k
      b = src[src_stride + j] * src[src_stride + j];
524
537k
      c = src[2 * src_stride + j] * src[2 * src_stride + j];
525
537k
      d = src[3 * src_stride + j] * src[3 * src_stride + j];
526
537k
      e = src[4 * src_stride + j] * src[4 * src_stride + j];
527
528
537k
      dst[j] = a + b + c;
529
537k
      dst[dst_stride + j] = a + b + c + d;
530
20.0M
      for (i = 2; i < height - 3; ++i) {
531
19.4M
        dst[i * dst_stride + j] = a + b + c + d + e;
532
19.4M
        a = b;
533
19.4M
        b = c;
534
19.4M
        c = d;
535
19.4M
        d = e;
536
19.4M
        e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
537
19.4M
      }
538
537k
      dst[i * dst_stride + j] = a + b + c + d + e;
539
537k
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
540
537k
      dst[(i + 2) * dst_stride + j] = c + d + e;
541
537k
    }
542
15.4k
  }
543
544
  // Horizontal sum over 5-pixel regions of dst
545
1.38M
  for (i = 0; i < height; ++i) {
546
1.35M
    a = dst[i * dst_stride];
547
1.35M
    b = dst[i * dst_stride + 1];
548
1.35M
    c = dst[i * dst_stride + 2];
549
1.35M
    d = dst[i * dst_stride + 3];
550
1.35M
    e = dst[i * dst_stride + 4];
551
552
1.35M
    dst[i * dst_stride] = a + b + c;
553
1.35M
    dst[i * dst_stride + 1] = a + b + c + d;
554
38.4M
    for (j = 2; j < width - 3; ++j) {
555
      // Loop invariant: At the start of each iteration,
556
      // a = src[i * src_stride + (j - 2)]
557
      // b = src[i * src_stride + (j - 1)]
558
      // c = src[i * src_stride + (j    )]
559
      // d = src[i * src_stride + (j + 1)]
560
      // e = src[i * src_stride + (j + 2)]
561
37.1M
      dst[i * dst_stride + j] = a + b + c + d + e;
562
37.1M
      a = b;
563
37.1M
      b = c;
564
37.1M
      c = d;
565
37.1M
      d = e;
566
37.1M
      e = dst[i * dst_stride + (j + 3)];
567
37.1M
    }
568
1.35M
    dst[i * dst_stride + j] = a + b + c + d + e;
569
1.35M
    dst[i * dst_stride + (j + 1)] = b + c + d + e;
570
1.35M
    dst[i * dst_stride + (j + 2)] = c + d + e;
571
1.35M
  }
572
30.9k
}
573
574
// Dispatches to the box-sum routine for radius r. Only r == 1 and r == 2 are
// implemented; any other value trips an assertion (and does nothing when
// assertions are compiled out).
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                   int sqr, int32_t *dst, int dst_stride) {
  if (r == 1) {
    boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
  } else if (r == 2) {
    boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
  } else {
    assert(0 && "Invalid value of r in self-guided filter");
  }
}
583
584
20.5k
void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
585
20.5k
  if (params->r[0] == 0) {
586
5.12k
    xq[0] = 0;
587
5.12k
    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
588
15.4k
  } else if (params->r[1] == 0) {
589
3.82k
    xq[0] = xqd[0];
590
3.82k
    xq[1] = 0;
591
11.6k
  } else {
592
11.6k
    xq[0] = xqd[0];
593
11.6k
    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
594
11.6k
  }
595
20.5k
}
596
597
// Lookup table for round(256 * x / (x + 1)), indexed by x in [0, 255].
const int32_t av1_x_by_xplus1[256] = {
  // Special case: Map 0 -> 1 (corresponding to a value of 1/256)
  // instead of 0. See the comments in calculate_intermediate_result() for
  // why. The last entry (index 255, where the index saturates) is 256.
  1,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
  240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
  248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
  250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  256,
};
619
620
// Reciprocal table: entry n - 1 holds round(2^12 / n), for n = 1..25.
const int32_t av1_one_by_x[MAX_NELEM] = {
  4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
  293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
};
624
625
static void calculate_intermediate_result(int32_t *dgd, int width, int height,
626
                                          int dgd_stride, int bit_depth,
627
                                          int sgr_params_idx, int radius_idx,
628
32.2k
                                          int pass, int32_t *A, int32_t *B) {
629
32.2k
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
630
32.2k
  const int r = params->r[radius_idx];
631
32.2k
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
632
32.2k
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
633
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
634
  // leading to a significant speed improvement.
635
  // We also align the stride to a multiple of 16 bytes, for consistency
636
  // with the SIMD version of this function.
637
32.2k
  int buf_stride = ((width_ext + 3) & ~3) + 16;
638
32.2k
  const int step = pass == 0 ? 1 : 2;
639
32.2k
  int i, j;
640
641
32.2k
  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
642
32.2k
  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
643
32.2k
         "Need SGRPROJ_BORDER_* >= r+1");
644
645
32.2k
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
646
32.2k
         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
647
32.2k
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
648
32.2k
         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
649
32.2k
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
650
32.2k
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
651
  // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
652
  // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
653
1.08M
  for (i = -1; i < height + 1; i += step) {
654
30.0M
    for (j = -1; j < width + 1; ++j) {
655
29.0M
      const int k = i * buf_stride + j;
656
29.0M
      const int n = (2 * r + 1) * (2 * r + 1);
657
658
      // a < 2^16 * n < 2^22 regardless of bit depth
659
29.0M
      uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
660
      // b < 2^8 * n < 2^14 regardless of bit depth
661
29.0M
      uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
662
663
      // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
664
      // and p itself satisfies p < 2^14 * n^2 < 2^26.
665
      // This bound on p is due to:
666
      // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
667
      //
668
      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
669
      // This is an artefact of rounding, and can only happen if all pixels
670
      // are (almost) identical, so in this case we saturate to p=0.
671
29.0M
      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
672
673
29.0M
      const uint32_t s = params->s[radius_idx];
674
675
      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
676
      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
677
      // (this holds even after accounting for the rounding in s)
678
29.0M
      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
679
680
      // Note: We have to be quite careful about the value of A[k].
681
      // This is used as a blend factor between individual pixel values and the
682
      // local mean. So it logically has a range of [0, 256], including both
683
      // endpoints.
684
      //
685
      // This is a pain for hardware, as we'd like something which can be stored
686
      // in exactly 8 bits.
687
      // Further, in the calculation of B[k] below, if z == 0 and r == 2,
688
      // then A[k] "should be" 0. But then we can end up setting B[k] to a value
689
      // slightly above 2^(8 + bit depth), due to rounding in the value of
690
      // av1_one_by_x[25-1].
691
      //
692
      // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
693
      // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
694
      // overflow), without significantly affecting the final result: z == 0
695
      // implies that the image is essentially "flat", so the local mean and
696
      // individual pixel values are very similar.
697
      //
698
      // Note that saturating on the other side, ie. requring A[k] <= 255,
699
      // would be a bad idea, as that corresponds to the case where the image
700
      // is very variable, when we want to preserve the local pixel value as
701
      // much as possible.
702
29.0M
      A[k] = av1_x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
703
704
      // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
705
      // av1_one_by_x[n - 1] = round(2^12 / n)
706
      // => the product here is < 2^(20 + bit_depth) <= 2^32,
707
      // and B[k] is set to a value < 2^(8 + bit depth)
708
      // This holds even with the rounding in av1_one_by_x and in the overall
709
      // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
710
29.0M
      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
711
29.0M
                                             (uint32_t)B[k] *
712
29.0M
                                             (uint32_t)av1_one_by_x[n - 1],
713
29.0M
                                         SGRPROJ_RECIP_BITS);
714
29.0M
    }
715
1.05M
  }
716
32.2k
}
717
718
static void selfguided_restoration_fast_internal(
719
    int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
720
15.4k
    int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
721
15.4k
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
722
15.4k
  const int r = params->r[radius_idx];
723
15.4k
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
724
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
725
  // leading to a significant speed improvement.
726
  // We also align the stride to a multiple of 16 bytes, for consistency
727
  // with the SIMD version of this function.
728
15.4k
  int buf_stride = ((width_ext + 3) & ~3) + 16;
729
15.4k
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
730
15.4k
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
731
15.4k
  int32_t *A = A_;
732
15.4k
  int32_t *B = B_;
733
15.4k
  int i, j;
734
15.4k
  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
735
15.4k
                                sgr_params_idx, radius_idx, 1, A, B);
736
15.4k
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
737
15.4k
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
738
739
  // Use the A[] and B[] arrays to calculate the filtered image
740
15.4k
  (void)r;
741
15.4k
  assert(r == 2);
742
663k
  for (i = 0; i < height; ++i) {
743
647k
    if (!(i & 1)) {  // even row
744
8.65M
      for (j = 0; j < width; ++j) {
745
8.33M
        const int k = i * buf_stride + j;
746
8.33M
        const int l = i * dgd_stride + j;
747
8.33M
        const int m = i * dst_stride + j;
748
8.33M
        const int nb = 5;
749
8.33M
        const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
750
8.33M
                          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
751
8.33M
                           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
752
8.33M
                              5;
753
8.33M
        const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
754
8.33M
                          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
755
8.33M
                           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
756
8.33M
                              5;
757
8.33M
        const int32_t v = a * dgd[l] + b;
758
8.33M
        dst[m] =
759
8.33M
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
760
8.33M
      }
761
324k
    } else {  // odd row
762
8.55M
      for (j = 0; j < width; ++j) {
763
8.23M
        const int k = i * buf_stride + j;
764
8.23M
        const int l = i * dgd_stride + j;
765
8.23M
        const int m = i * dst_stride + j;
766
8.23M
        const int nb = 4;
767
8.23M
        const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
768
8.23M
        const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
769
8.23M
        const int32_t v = a * dgd[l] + b;
770
8.23M
        dst[m] =
771
8.23M
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
772
8.23M
      }
773
323k
    }
774
647k
  }
775
15.4k
}
776
777
static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
778
                                            int dgd_stride, int32_t *dst,
779
                                            int dst_stride, int bit_depth,
780
                                            int sgr_params_idx,
781
16.7k
                                            int radius_idx) {
782
16.7k
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
783
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
784
  // leading to a significant speed improvement.
785
  // We also align the stride to a multiple of 16 bytes, for consistency
786
  // with the SIMD version of this function.
787
16.7k
  int buf_stride = ((width_ext + 3) & ~3) + 16;
788
16.7k
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
789
16.7k
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
790
16.7k
  int32_t *A = A_;
791
16.7k
  int32_t *B = B_;
792
16.7k
  int i, j;
793
16.7k
  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
794
16.7k
                                sgr_params_idx, radius_idx, 0, A, B);
795
16.7k
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
796
16.7k
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
797
798
  // Use the A[] and B[] arrays to calculate the filtered image
799
678k
  for (i = 0; i < height; ++i) {
800
17.7M
    for (j = 0; j < width; ++j) {
801
17.1M
      const int k = i * buf_stride + j;
802
17.1M
      const int l = i * dgd_stride + j;
803
17.1M
      const int m = i * dst_stride + j;
804
17.1M
      const int nb = 5;
805
17.1M
      const int32_t a =
806
17.1M
          (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
807
17.1M
              4 +
808
17.1M
          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
809
17.1M
           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
810
17.1M
              3;
811
17.1M
      const int32_t b =
812
17.1M
          (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
813
17.1M
              4 +
814
17.1M
          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
815
17.1M
           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
816
17.1M
              3;
817
17.1M
      const int32_t v = a * dgd[l] + b;
818
17.1M
      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
819
17.1M
    }
820
661k
  }
821
16.7k
}
822
823
int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
824
                                 int dgd_stride, int32_t *flt0, int32_t *flt1,
825
                                 int flt_stride, int sgr_params_idx,
826
20.5k
                                 int bit_depth, int highbd) {
827
20.5k
  int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
828
20.5k
  const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
829
20.5k
  int32_t *dgd32 =
830
20.5k
      dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
831
832
20.5k
  if (highbd) {
833
13.9k
    const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
834
583k
    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
835
21.5M
      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
836
20.9M
        dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
837
20.9M
      }
838
569k
    }
839
13.9k
  } else {
840
368k
    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
841
9.59M
      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
842
9.23M
        dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
843
9.23M
      }
844
361k
    }
845
6.66k
  }
846
847
20.5k
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
848
  // If params->r == 0 we skip the corresponding filter. We only allow one of
849
  // the radii to be 0, as having both equal to 0 would be equivalent to
850
  // skipping SGR entirely.
851
20.5k
  assert(!(params->r[0] == 0 && params->r[1] == 0));
852
853
20.5k
  if (params->r[0] > 0)
854
15.4k
    selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
855
15.4k
                                         flt0, flt_stride, bit_depth,
856
15.4k
                                         sgr_params_idx, 0);
857
20.5k
  if (params->r[1] > 0)
858
16.7k
    selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
859
16.7k
                                    flt_stride, bit_depth, sgr_params_idx, 1);
860
20.5k
  return 0;
861
20.5k
}
862
863
int av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
864
                                       int height, int stride, int eps,
865
                                       const int *xqd, uint8_t *dst8,
866
                                       int dst_stride, int32_t *tmpbuf,
867
20.5k
                                       int bit_depth, int highbd) {
868
20.5k
  int32_t *flt0 = tmpbuf;
869
20.5k
  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
870
20.5k
  assert(width * height <= RESTORATION_UNITPELS_MAX);
871
872
20.5k
  const int ret = av1_selfguided_restoration_c(
873
20.5k
      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
874
20.5k
  if (ret != 0) return ret;
875
20.5k
  const sgr_params_type *const params = &av1_sgr_params[eps];
876
20.5k
  int xq[2];
877
20.5k
  av1_decode_xq(xqd, xq, params);
878
794k
  for (int i = 0; i < height; ++i) {
879
19.9M
    for (int j = 0; j < width; ++j) {
880
19.1M
      const int k = i * width + j;
881
19.1M
      uint8_t *dst8ij = dst8 + i * dst_stride + j;
882
19.1M
      const uint8_t *dat8ij = dat8 + i * stride + j;
883
884
19.1M
      const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
885
19.1M
      const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
886
19.1M
      int32_t v = u << SGRPROJ_PRJ_BITS;
887
      // If params->r == 0 then we skipped the filtering in
888
      // av1_selfguided_restoration_c, i.e. flt[k] == u
889
19.1M
      if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
890
19.1M
      if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
891
19.1M
      const int16_t w =
892
19.1M
          (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
893
894
19.1M
      const uint16_t out = clip_pixel_highbd(w, bit_depth);
895
19.1M
      if (highbd)
896
14.0M
        *CONVERT_TO_SHORTPTR(dst8ij) = out;
897
5.13M
      else
898
5.13M
        *dst8ij = (uint8_t)out;
899
19.1M
    }
900
774k
  }
901
20.5k
  return 0;
902
20.5k
}
903
904
static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
905
                                  int stripe_width, int stripe_height,
906
                                  int procunit_width, const uint8_t *src,
907
                                  int src_stride, uint8_t *dst, int dst_stride,
908
                                  int32_t *tmpbuf, int bit_depth,
909
5.83k
                                  struct aom_internal_error_info *error_info) {
910
5.83k
  (void)bit_depth;
911
5.83k
  assert(bit_depth == 8);
912
913
12.5k
  for (int j = 0; j < stripe_width; j += procunit_width) {
914
6.67k
    int w = AOMMIN(procunit_width, stripe_width - j);
915
6.67k
    if (av1_apply_selfguided_restoration(
916
6.67k
            src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
917
6.67k
            rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth,
918
6.67k
            0) != 0) {
919
0
      aom_internal_error(
920
0
          error_info, AOM_CODEC_MEM_ERROR,
921
0
          "Error allocating buffer in av1_apply_selfguided_restoration");
922
0
    }
923
6.67k
  }
924
5.83k
}
925
926
#if CONFIG_AV1_HIGHBITDEPTH
927
static void wiener_filter_stripe_highbd(
928
    const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
929
    int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
930
    int dst_stride, int32_t *tmpbuf, int bit_depth,
931
15.7k
    struct aom_internal_error_info *error_info) {
932
15.7k
  (void)tmpbuf;
933
15.7k
  (void)error_info;
934
15.7k
  const WienerConvolveParams conv_params = get_conv_params_wiener(bit_depth);
935
936
33.9k
  for (int j = 0; j < stripe_width; j += procunit_width) {
937
18.1k
    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
938
18.1k
    const uint8_t *src8_p = src8 + j;
939
18.1k
    uint8_t *dst8_p = dst8 + j;
940
18.1k
    av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
941
18.1k
                                       rui->wiener_info.hfilter, 16,
942
18.1k
                                       rui->wiener_info.vfilter, 16, w,
943
18.1k
                                       stripe_height, &conv_params, bit_depth);
944
18.1k
  }
945
15.7k
}
946
947
static void sgrproj_filter_stripe_highbd(
948
    const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
949
    int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
950
    int dst_stride, int32_t *tmpbuf, int bit_depth,
951
9.82k
    struct aom_internal_error_info *error_info) {
952
23.7k
  for (int j = 0; j < stripe_width; j += procunit_width) {
953
13.9k
    int w = AOMMIN(procunit_width, stripe_width - j);
954
13.9k
    if (av1_apply_selfguided_restoration(
955
13.9k
            src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
956
13.9k
            rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth,
957
13.9k
            1) != 0) {
958
0
      aom_internal_error(
959
0
          error_info, AOM_CODEC_MEM_ERROR,
960
0
          "Error allocating buffer in av1_apply_selfguided_restoration");
961
0
    }
962
13.9k
  }
963
9.82k
}
964
#endif  // CONFIG_AV1_HIGHBITDEPTH
965
966
typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
967
                                  int stripe_width, int stripe_height,
968
                                  int procunit_width, const uint8_t *src,
969
                                  int src_stride, uint8_t *dst, int dst_stride,
970
                                  int32_t *tmpbuf, int bit_depth,
971
                                  struct aom_internal_error_info *error_info);
972
973
#if CONFIG_AV1_HIGHBITDEPTH
974
#define NUM_STRIPE_FILTERS 4
975
static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
976
  wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
977
  sgrproj_filter_stripe_highbd
978
};
979
#else
980
#define NUM_STRIPE_FILTERS 2
981
static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
982
  wiener_filter_stripe, sgrproj_filter_stripe
983
};
984
#endif  // CONFIG_AV1_HIGHBITDEPTH
985
986
// Filter one restoration unit
987
void av1_loop_restoration_filter_unit(
988
    const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
989
    const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
990
    int plane_w, int plane_h, int ss_x, int ss_y, int highbd, int bit_depth,
991
    uint8_t *data8, int stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf,
992
38.5k
    int optimized_lr, struct aom_internal_error_info *error_info) {
993
38.5k
  RestorationType unit_rtype = rui->restoration_type;
994
995
38.5k
  int unit_h = limits->v_end - limits->v_start;
996
38.5k
  int unit_w = limits->h_end - limits->h_start;
997
38.5k
  uint8_t *data8_tl =
998
38.5k
      data8 + limits->v_start * (ptrdiff_t)stride + limits->h_start;
999
38.5k
  uint8_t *dst8_tl =
1000
38.5k
      dst8 + limits->v_start * (ptrdiff_t)dst_stride + limits->h_start;
1001
1002
38.5k
  if (unit_rtype == RESTORE_NONE) {
1003
17.6k
    copy_rest_unit(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride,
1004
17.6k
                   highbd);
1005
17.6k
    return;
1006
17.6k
  }
1007
1008
20.8k
  const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
1009
20.8k
  assert(filter_idx < NUM_STRIPE_FILTERS);
1010
20.8k
  const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
1011
1012
20.8k
  const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
1013
1014
  // Filter the whole image one stripe at a time
1015
20.8k
  RestorationTileLimits remaining_stripes = *limits;
1016
20.8k
  int i = 0;
1017
57.7k
  while (i < unit_h) {
1018
36.8k
    int copy_above, copy_below;
1019
36.8k
    remaining_stripes.v_start = limits->v_start + i;
1020
1021
36.8k
    get_stripe_boundary_info(&remaining_stripes, plane_w, plane_h, ss_y,
1022
36.8k
                             &copy_above, &copy_below);
1023
1024
36.8k
    const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1025
36.8k
    const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
1026
1027
    // Work out where this stripe's boundaries are within
1028
    // rsb->stripe_boundary_{above,below}
1029
36.8k
    const int frame_stripe =
1030
36.8k
        (remaining_stripes.v_start + runit_offset) / full_stripe_height;
1031
36.8k
    const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
1032
1033
    // Calculate this stripe's height, based on two rules:
1034
    // * The topmost stripe in the frame is 8 luma pixels shorter than usual.
1035
    // * We can't extend past the end of the current restoration unit
1036
36.8k
    const int nominal_stripe_height =
1037
36.8k
        full_stripe_height - ((frame_stripe == 0) ? runit_offset : 0);
1038
36.8k
    const int h = AOMMIN(nominal_stripe_height,
1039
36.8k
                         remaining_stripes.v_end - remaining_stripes.v_start);
1040
1041
36.8k
    setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
1042
36.8k
                                     h, data8, stride, rlbs, copy_above,
1043
36.8k
                                     copy_below, optimized_lr);
1044
1045
36.8k
    stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
1046
36.8k
                  dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth,
1047
36.8k
                  error_info);
1048
1049
36.8k
    restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
1050
36.8k
                                       data8, stride, copy_above, copy_below,
1051
36.8k
                                       optimized_lr);
1052
1053
36.8k
    i += h;
1054
36.8k
  }
1055
20.8k
}
1056
1057
static void filter_frame_on_unit(const RestorationTileLimits *limits,
1058
                                 int rest_unit_idx, void *priv, int32_t *tmpbuf,
1059
                                 RestorationLineBuffers *rlbs,
1060
38.5k
                                 struct aom_internal_error_info *error_info) {
1061
38.5k
  FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
1062
38.5k
  const RestorationInfo *rsi = ctxt->rsi;
1063
1064
38.5k
  av1_loop_restoration_filter_unit(
1065
38.5k
      limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs,
1066
38.5k
      ctxt->plane_w, ctxt->plane_h, ctxt->ss_x, ctxt->ss_y, ctxt->highbd,
1067
38.5k
      ctxt->bit_depth, ctxt->data8, ctxt->data_stride, ctxt->dst8,
1068
38.5k
      ctxt->dst_stride, tmpbuf, rsi->optimized_lr, error_info);
1069
38.5k
}
1070
1071
void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
1072
                                            YV12_BUFFER_CONFIG *frame,
1073
                                            AV1_COMMON *cm, int optimized_lr,
1074
5.64k
                                            int num_planes) {
1075
5.64k
  const SequenceHeader *const seq_params = cm->seq_params;
1076
5.64k
  const int bit_depth = seq_params->bit_depth;
1077
5.64k
  const int highbd = seq_params->use_highbitdepth;
1078
5.64k
  lr_ctxt->dst = &cm->rst_frame;
1079
1080
5.64k
  const int frame_width = frame->crop_widths[0];
1081
5.64k
  const int frame_height = frame->crop_heights[0];
1082
5.64k
  if (aom_realloc_frame_buffer(
1083
5.64k
          lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
1084
5.64k
          seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
1085
5.64k
          cm->features.byte_alignment, NULL, NULL, NULL, false,
1086
5.64k
          0) != AOM_CODEC_OK)
1087
0
    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
1088
0
                       "Failed to allocate restoration dst buffer");
1089
1090
5.64k
  lr_ctxt->on_rest_unit = filter_frame_on_unit;
1091
5.64k
  lr_ctxt->frame = frame;
1092
19.5k
  for (int plane = 0; plane < num_planes; ++plane) {
1093
13.9k
    RestorationInfo *rsi = &cm->rst_info[plane];
1094
13.9k
    RestorationType rtype = rsi->frame_restoration_type;
1095
13.9k
    rsi->optimized_lr = optimized_lr;
1096
13.9k
    lr_ctxt->ctxt[plane].rsi = rsi;
1097
1098
13.9k
    if (rtype == RESTORE_NONE) {
1099
4.97k
      continue;
1100
4.97k
    }
1101
1102
8.96k
    const int is_uv = plane > 0;
1103
8.96k
    int plane_w, plane_h;
1104
8.96k
    av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
1105
8.96k
    assert(plane_w == frame->crop_widths[is_uv]);
1106
8.96k
    assert(plane_h == frame->crop_heights[is_uv]);
1107
1108
8.96k
    av1_extend_frame(frame->buffers[plane], plane_w, plane_h,
1109
8.96k
                     frame->strides[is_uv], RESTORATION_BORDER,
1110
8.96k
                     RESTORATION_BORDER, highbd);
1111
1112
8.96k
    FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
1113
8.96k
    lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
1114
8.96k
    lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
1115
8.96k
    lr_plane_ctxt->plane_w = plane_w;
1116
8.96k
    lr_plane_ctxt->plane_h = plane_h;
1117
8.96k
    lr_plane_ctxt->highbd = highbd;
1118
8.96k
    lr_plane_ctxt->bit_depth = bit_depth;
1119
8.96k
    lr_plane_ctxt->data8 = frame->buffers[plane];
1120
8.96k
    lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
1121
8.96k
    lr_plane_ctxt->data_stride = frame->strides[is_uv];
1122
8.96k
    lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
1123
8.96k
  }
1124
5.64k
}
1125
1126
static void loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
1127
1.64k
                                         AV1_COMMON *cm, int num_planes) {
1128
1.64k
  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
1129
1.64k
                           YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
1130
1.64k
                           int vstart, int vend);
1131
1.64k
  static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y,
1132
1.64k
                                         aom_yv12_partial_coloc_copy_u,
1133
1.64k
                                         aom_yv12_partial_coloc_copy_v };
1134
1.64k
  assert(num_planes <= 3);
1135
3.85k
  for (int plane = 0; plane < num_planes; ++plane) {
1136
2.21k
    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
1137
1.98k
    FilterFrameCtxt *lr_plane_ctxt = &loop_rest_ctxt->ctxt[plane];
1138
1.98k
    copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, 0,
1139
1.98k
                     lr_plane_ctxt->plane_w, 0, lr_plane_ctxt->plane_h);
1140
1.98k
  }
1141
1.64k
}
1142
1143
// Call on_rest_unit for each loop restoration unit in the plane.
1144
static void foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
1145
                                       rest_unit_visitor_t on_rest_unit,
1146
                                       void *priv, int32_t *tmpbuf,
1147
1.98k
                                       RestorationLineBuffers *rlbs) {
1148
1.98k
  const RestorationInfo *rsi = &cm->rst_info[plane];
1149
1.98k
  const int hnum_rest_units = rsi->horz_units;
1150
1.98k
  const int vnum_rest_units = rsi->vert_units;
1151
1.98k
  const int unit_size = rsi->restoration_unit_size;
1152
1153
1.98k
  const int is_uv = plane > 0;
1154
1.98k
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
1155
1.98k
  const int ext_size = unit_size * 3 / 2;
1156
1.98k
  int plane_w, plane_h;
1157
1.98k
  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
1158
1159
1.98k
  int y0 = 0, i = 0;
1160
4.92k
  while (y0 < plane_h) {
1161
2.94k
    int remaining_h = plane_h - y0;
1162
2.94k
    int h = (remaining_h < ext_size) ? remaining_h : unit_size;
1163
1164
2.94k
    RestorationTileLimits limits;
1165
2.94k
    limits.v_start = y0;
1166
2.94k
    limits.v_end = y0 + h;
1167
2.94k
    assert(limits.v_end <= plane_h);
1168
    // Offset upwards to align with the restoration processing stripe
1169
2.94k
    const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
1170
2.94k
    limits.v_start = AOMMAX(0, limits.v_start - voffset);
1171
2.94k
    if (limits.v_end < plane_h) limits.v_end -= voffset;
1172
1173
2.94k
    av1_foreach_rest_unit_in_row(&limits, plane_w, on_rest_unit, i, unit_size,
1174
2.94k
                                 hnum_rest_units, vnum_rest_units, plane, priv,
1175
2.94k
                                 tmpbuf, rlbs, av1_lr_sync_read_dummy,
1176
2.94k
                                 av1_lr_sync_write_dummy, NULL, cm->error);
1177
1178
2.94k
    y0 += h;
1179
2.94k
    ++i;
1180
2.94k
  }
1181
1.98k
}
1182
1183
// Visits every restoration unit of every plane that has restoration enabled,
// using the visitor stored in lr_ctxt and the scratch buffers owned by cm.
static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
                                        int num_planes) {
  FilterFrameCtxt *const plane_ctxts = lr_ctxt->ctxt;

  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
      foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
                                 &plane_ctxts[plane], cm->rst_tmpbuf, cm->rlbs);
    }
  }
}
1196
1197
void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
1198
                                       AV1_COMMON *cm, int optimized_lr,
1199
1.64k
                                       void *lr_ctxt) {
1200
1.64k
  assert(!cm->features.all_lossless);
1201
1.64k
  const int num_planes = av1_num_planes(cm);
1202
1203
1.64k
  AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
1204
1205
1.64k
  av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
1206
1.64k
                                         optimized_lr, num_planes);
1207
1208
1.64k
  foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes);
1209
1210
1.64k
  loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes);
1211
1.64k
}
1212
1213
void av1_foreach_rest_unit_in_row(
1214
    RestorationTileLimits *limits, int plane_w,
1215
    rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
1216
    int hnum_rest_units, int vnum_rest_units, int plane, void *priv,
1217
    int32_t *tmpbuf, RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read,
1218
    sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync,
1219
27.7k
    struct aom_internal_error_info *error_info) {
1220
27.7k
  const int ext_size = unit_size * 3 / 2;
1221
27.7k
  int x0 = 0, j = 0;
1222
66.2k
  while (x0 < plane_w) {
1223
38.5k
    int remaining_w = plane_w - x0;
1224
38.5k
    int w = (remaining_w < ext_size) ? remaining_w : unit_size;
1225
1226
38.5k
    limits->h_start = x0;
1227
38.5k
    limits->h_end = x0 + w;
1228
38.5k
    assert(limits->h_end <= plane_w);
1229
1230
38.5k
    const int unit_idx = row_number * hnum_rest_units + j;
1231
1232
    // No sync for even numbered rows
1233
    // For odd numbered rows, Loop Restoration of current block requires the LR
1234
    // of top-right and bottom-right blocks to be completed
1235
1236
    // top-right sync
1237
38.5k
    on_sync_read(lr_sync, row_number, j, plane);
1238
38.5k
    if ((row_number + 1) < vnum_rest_units)
1239
      // bottom-right sync
1240
20.3k
      on_sync_read(lr_sync, row_number + 2, j, plane);
1241
1242
38.5k
#if CONFIG_MULTITHREAD
1243
38.5k
    if (lr_sync && lr_sync->num_workers > 1) {
1244
26.5k
      pthread_mutex_lock(lr_sync->job_mutex);
1245
26.5k
      const bool lr_mt_exit = lr_sync->lr_mt_exit;
1246
26.5k
      pthread_mutex_unlock(lr_sync->job_mutex);
1247
      // Exit in case any worker has encountered an error.
1248
26.5k
      if (lr_mt_exit) return;
1249
26.5k
    }
1250
38.5k
#endif
1251
1252
38.5k
    on_rest_unit(limits, unit_idx, priv, tmpbuf, rlbs, error_info);
1253
1254
38.5k
    on_sync_write(lr_sync, row_number, j, hnum_rest_units, plane);
1255
1256
38.5k
    x0 += w;
1257
38.5k
    ++j;
1258
38.5k
  }
1259
27.7k
}
1260
1261
40.3k
// No-op read-side sync callback: every argument is ignored.
void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
  (void)plane;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1267
1268
// No-op write-side sync callback: every argument is ignored.
void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
                             const int sb_cols, int plane) {
  (void)plane;
  (void)sb_cols;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1276
1277
// Computes the half-open range of restoration units, [*rcol0, *rcol1) by
// [*rrow0, *rrow1), whose top-left corners fall inside the superblock at
// (mi_row, mi_col) on the given plane.
//
// Returns 0 without touching the outputs when bsize is not the sequence's
// superblock size. Otherwise writes all four outputs and returns non-zero iff
// the resulting range is non-empty.
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
                                       int *rcol0, int *rcol1, int *rrow0,
                                       int *rrow1) {
  assert(rcol0 && rcol1 && rrow0 && rrow1);

  if (bsize != cm->seq_params->sb_size) return 0;

  assert(!cm->features.all_lossless);

  const int is_uv = plane > 0;
  const RestorationInfo *const rsi = &cm->rst_info[plane];
  const int unit_size = rsi->restoration_unit_size;

  // The size of an MI-unit on this plane of the image.
  const int ss_x = is_uv && cm->seq_params->subsampling_x;
  const int ss_y = is_uv && cm->seq_params->subsampling_y;

  // Write m for the relative mi column or row, D for the superres denominator
  // and N for the superres numerator. If u is the upscaled pixel offset then
  // the downscaled pixel offset can be written in two ways:
  //
  //   MI_SIZE * m = N / D u
  //
  // which gives u = D * MI_SIZE * m / N. Superres only scales horizontally,
  // so the vertical factors are never adjusted.
  int mi_to_num_x = MI_SIZE >> ss_x;
  int denom_x = unit_size;
  if (av1_superres_scaled(cm)) {
    mi_to_num_x *= cm->superres_scale_denominator;
    denom_x *= SCALE_NUMERATOR;
  }
  const int mi_to_num_y = MI_SIZE >> ss_y;
  const int denom_y = unit_size;

  // Adding (denominator - 1) before dividing rounds the quotient up.
  const int rnd_x = denom_x - 1;
  const int rnd_y = denom_y - 1;

  // mi-unit coordinates just past the right and bottom edges of the
  // superblock.
  const int mi_col_end = mi_col + mi_size_wide[bsize];
  const int mi_row_end = mi_row + mi_size_high[bsize];

  // rcol0/rrow0 are the first column/row of restoration units that do not
  // start left of / above the superblock, hence the rounded-up division (if
  // the sb starts at runit column 10.1, the first matching runit has column
  // index 11).
  *rcol0 = (mi_col * mi_to_num_x + rnd_x) / denom_x;
  *rrow0 = (mi_row * mi_to_num_y + rnd_y) / denom_y;

  // rcol1/rrow1 are the same calculation for the superblock below-right. At
  // the bottom or right of the frame that restoration unit might not exist,
  // so the result is clamped to the number of units.
  *rcol1 = AOMMIN((mi_col_end * mi_to_num_x + rnd_x) / denom_x,
                  rsi->horz_units);
  *rrow1 = AOMMIN((mi_row_end * mi_to_num_y + rnd_y) / denom_y,
                  rsi->vert_units);

  return *rcol0 < *rcol1 && *rrow0 < *rrow1;
}
1338
1339
// Extend to left and right.
//
// For each of `height` rows starting at buf, replicates the first sample into
// the `extend` samples before the row and the last sample into the `extend`
// samples after it. `width` and `extend` are counted in samples (16-bit
// samples when use_highbitdepth is set); `stride` is always applied to the
// byte pointer.
static void extend_lines(uint8_t *buf, int width, int height, int stride,
                         int extend, int use_highbitdepth) {
  uint8_t *row = buf;
  for (int y = 0; y < height; ++y, row += stride) {
    if (!use_highbitdepth) {
      memset(row - extend, row[0], extend);
      memset(row + width, row[width - 1], extend);
      continue;
    }
    uint16_t *const row16 = (uint16_t *)row;
    aom_memset16(row16 - extend, row16[0], extend);
    aom_memset16(row16 + width, row16[width - 1], extend);
  }
}
1354
1355
static void save_deblock_boundary_lines(
1356
    const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
1357
    int stripe, int use_highbd, int is_above,
1358
113k
    RestorationStripeBoundaries *boundaries) {
1359
113k
  const int is_uv = plane > 0;
1360
113k
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
1361
113k
  const int src_stride = frame->strides[is_uv] << use_highbd;
1362
113k
  const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride;
1363
1364
113k
  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
1365
113k
                               : boundaries->stripe_boundary_below;
1366
113k
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
1367
113k
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
1368
113k
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
1369
1370
  // There is a rare case in which a processing stripe can end 1px above the
1371
  // crop border. In this case, we do want to use deblocked pixels from below
1372
  // the stripe (hence why we ended up in this function), but instead of
1373
  // fetching 2 "below" rows we need to fetch one and duplicate it.
1374
  // This is equivalent to clamping the sample locations against the crop border
1375
113k
  const int lines_to_save =
1376
113k
      AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row);
1377
113k
  assert(lines_to_save == 1 || lines_to_save == 2);
1378
1379
113k
  int upscaled_width;
1380
113k
  int line_bytes;
1381
113k
  if (av1_superres_scaled(cm)) {
1382
102k
    const int ss_x = is_uv && cm->seq_params->subsampling_x;
1383
102k
    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
1384
102k
    line_bytes = upscaled_width << use_highbd;
1385
102k
    if (use_highbd)
1386
80.2k
      av1_upscale_normative_rows(
1387
80.2k
          cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv],
1388
80.2k
          CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride,
1389
80.2k
          plane, lines_to_save);
1390
21.8k
    else
1391
21.8k
      av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows,
1392
21.8k
                                 boundaries->stripe_boundary_stride, plane,
1393
21.8k
                                 lines_to_save);
1394
102k
  } else {
1395
11.8k
    upscaled_width = frame->crop_widths[is_uv];
1396
11.8k
    line_bytes = upscaled_width << use_highbd;
1397
35.3k
    for (int i = 0; i < lines_to_save; i++) {
1398
23.5k
      memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride,
1399
23.5k
             line_bytes);
1400
23.5k
    }
1401
11.8k
  }
1402
  // If we only saved one line, then copy it into the second line buffer
1403
113k
  if (lines_to_save == 1)
1404
109
    memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
1405
1406
113k
  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
1407
113k
               RESTORATION_EXTRA_HORZ, use_highbd);
1408
113k
}
1409
1410
static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1411
                                     const AV1_COMMON *cm, int plane, int row,
1412
                                     int stripe, int use_highbd, int is_above,
1413
22.9k
                                     RestorationStripeBoundaries *boundaries) {
1414
22.9k
  const int is_uv = plane > 0;
1415
22.9k
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
1416
22.9k
  const int src_stride = frame->strides[is_uv] << use_highbd;
1417
22.9k
  const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride;
1418
1419
22.9k
  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
1420
22.9k
                               : boundaries->stripe_boundary_below;
1421
22.9k
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
1422
22.9k
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
1423
22.9k
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
1424
22.9k
  const int src_width = frame->crop_widths[is_uv];
1425
1426
  // At the point where this function is called, we've already applied
1427
  // superres. So we don't need to extend the lines here, we can just
1428
  // pull directly from the topmost row of the upscaled frame.
1429
22.9k
  const int ss_x = is_uv && cm->seq_params->subsampling_x;
1430
22.9k
  const int upscaled_width = av1_superres_scaled(cm)
1431
22.9k
                                 ? (cm->superres_upscaled_width + ss_x) >> ss_x
1432
22.9k
                                 : src_width;
1433
22.9k
  const int line_bytes = upscaled_width << use_highbd;
1434
68.8k
  for (int i = 0; i < RESTORATION_CTX_VERT; i++) {
1435
    // Copy the line at 'src_rows' into both context lines
1436
45.8k
    memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
1437
45.8k
  }
1438
22.9k
  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
1439
22.9k
               RESTORATION_EXTRA_HORZ, use_highbd);
1440
22.9k
}
1441
1442
// Walks all processing stripes of one plane and saves their context rows into
// cm->rst_info[plane].boundaries.
//
// The function is meant to be run in two passes selected by 'after_cdef':
//  - after_cdef == 0: for every stripe edge that is internal to the frame,
//    rows are saved with save_deblock_boundary_lines().
//  - after_cdef != 0: for the top edge of the first stripe and for a bottom
//    edge that reaches the bottom of the frame, rows are saved with
//    save_cdef_boundary_lines().
static void save_boundary_lines(const YV12_BUFFER_CONFIG *frame, int use_highbd,
                                int plane, AV1_COMMON *cm, int after_cdef) {
  const int is_uv = plane > 0;
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
  const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
  const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;

  // Only the height is used below; the width is a required output argument.
  int plane_w, plane_h;
  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);

  RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries;

  // Height derived from cm->height, used to decide whether a stripe's bottom
  // edge lies inside the frame.
  const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);

  for (int stripe_idx = 0;; ++stripe_idx) {
    // Nominal stripe extent; the first stripe is shortened by 'stripe_off',
    // so its nominal top is negative and gets clamped to row 0.
    const int nominal_top = stripe_idx * stripe_height - stripe_off;
    const int y0 = AOMMAX(0, nominal_top);
    if (y0 >= plane_h) break;

    const int nominal_bottom = nominal_top + stripe_height;
    const int y1 = AOMMIN(nominal_bottom, plane_h);

    // Extend using CDEF pixels at the top and bottom of the frame, and
    // deblocked pixels at internal stripe boundaries.
    const int is_first_stripe = (stripe_idx == 0);
    const int touches_frame_bottom = (y1 >= plane_height);

    if (after_cdef) {
      // Save CDEF context at frame boundaries.
      if (is_first_stripe) {
        save_cdef_boundary_lines(frame, cm, plane, y0, stripe_idx, use_highbd,
                                 1, boundaries);
      }
      if (touches_frame_bottom) {
        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, stripe_idx,
                                 use_highbd, 0, boundaries);
      }
    } else {
      // Save deblocked context at internal stripe boundaries.
      if (!is_first_stripe) {
        save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
                                    stripe_idx, use_highbd, 1, boundaries);
      }
      if (!touches_frame_bottom) {
        save_deblock_boundary_lines(frame, cm, plane, y1, stripe_idx,
                                    use_highbd, 0, boundaries);
      }
    }
  }
}
1493
1494
// For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
1495
// lines to be used as boundary in the loop restoration process. The
1496
// lines are saved in rst_internal.stripe_boundary_lines
1497
void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1498
7.71k
                                              AV1_COMMON *cm, int after_cdef) {
1499
7.71k
  const int num_planes = av1_num_planes(cm);
1500
7.71k
  const int use_highbd = cm->seq_params->use_highbitdepth;
1501
30.6k
  for (int p = 0; p < num_planes; ++p) {
1502
22.9k
    save_boundary_lines(frame, use_highbd, p, cm, after_cdef);
1503
22.9k
  }
1504
7.71k
}