Coverage Report

Created: 2025-06-22 08:04

/src/aom/av1/common/restoration.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 *
11
 */
12
13
#include <math.h>
14
#include <stddef.h>
15
16
#include "config/aom_config.h"
17
#include "config/aom_scale_rtcd.h"
18
19
#include "aom/internal/aom_codec_internal.h"
20
#include "aom_mem/aom_mem.h"
21
#include "aom_dsp/aom_dsp_common.h"
22
#include "aom_mem/aom_mem.h"
23
#include "aom_ports/mem.h"
24
#include "aom_util/aom_pthread.h"
25
26
#include "av1/common/av1_common_int.h"
27
#include "av1/common/convolve.h"
28
#include "av1/common/enums.h"
29
#include "av1/common/resize.h"
30
#include "av1/common/restoration.h"
31
#include "av1/common/thread_common.h"
32
33
// The 's' values are calculated based on original 'r' and 'e' values in the
34
// spec using GenSgrprojVtable().
35
// Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
36
// Table of self-guided filter parameter sets, indexed by sgr_params_idx.
// Each entry is written as { { r[0], r[1] }, { s[0], s[1] } }: the box radius
// and the matching 's' value for each of the two radius indices. Wherever
// r == 0 the paired s is -1 (entries 10-13 for index 0, entries 14-15 for
// index 1).
const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
37
  { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
38
  { { 2, 1 }, { 93, 1618 } },  { { 2, 1 }, { 80, 1438 } },
39
  { { 2, 1 }, { 70, 1295 } },  { { 2, 1 }, { 58, 1177 } },
40
  { { 2, 1 }, { 47, 1079 } },  { { 2, 1 }, { 37, 996 } },
41
  { { 2, 1 }, { 30, 925 } },   { { 2, 1 }, { 25, 863 } },
42
  { { 0, 1 }, { -1, 2589 } },  { { 0, 1 }, { -1, 1618 } },
43
  { { 0, 1 }, { -1, 1177 } },  { { 0, 1 }, { -1, 925 } },
44
  { { 2, 0 }, { 56, -1 } },    { { 2, 0 }, { 22, -1 } },
45
};
46
47
void av1_get_upsampled_plane_size(const AV1_COMMON *cm, int is_uv, int *plane_w,
48
75.1k
                                  int *plane_h) {
49
75.1k
  int ss_x = is_uv && cm->seq_params->subsampling_x;
50
75.1k
  int ss_y = is_uv && cm->seq_params->subsampling_y;
51
75.1k
  *plane_w = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
52
75.1k
  *plane_h = ROUND_POWER_OF_TWO(cm->height, ss_y);
53
75.1k
}
54
55
// Count horizontal or vertical units in a plane (use a width or height for
56
// plane_size, respectively). We basically want to divide the plane size by the
57
// size of a restoration unit. Rather than rounding up unconditionally as you
58
// might expect, we round to nearest, which models the way a right or bottom
59
// restoration unit can extend to up to 150% its normal width or height.
60
//
61
// The max with 1 is to deal with small frames, which may be smaller than
62
// half of an LR unit in size.
63
85.2k
int av1_lr_count_units(int unit_size, int plane_size) {
  // Adding half a unit before the division rounds to the nearest unit count.
  const int rounded_units = (plane_size + (unit_size >> 1)) / unit_size;
  // A plane smaller than half a unit still gets exactly one unit.
  return rounded_units < 1 ? 1 : rounded_units;
}
66
67
// (Re)allocates rsi->unit_info for one plane.
//
// The plane size is obtained from av1_get_upsampled_plane_size() and divided
// into restoration units of rsi->restoration_unit_size using
// av1_lr_count_units(). The resulting counts are stored in rsi->horz_units,
// rsi->vert_units and rsi->num_rest_units. Any previous unit_info array is
// freed before the new one is allocated; allocation goes through
// CHECK_MEM_ERROR with cm as its first argument.
void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
                                  int is_uv) {
  int width, height;
  av1_get_upsampled_plane_size(cm, is_uv, &width, &height);

  const int lr_unit = rsi->restoration_unit_size;
  rsi->horz_units = av1_lr_count_units(lr_unit, width);
  rsi->vert_units = av1_lr_count_units(lr_unit, height);
  rsi->num_rest_units = rsi->horz_units * rsi->vert_units;

  // Drop the old array first; its size may no longer match.
  aom_free(rsi->unit_info);
  CHECK_MEM_ERROR(cm, rsi->unit_info,
                  (RestorationUnitInfo *)aom_memalign(
                      16, sizeof(*rsi->unit_info) * rsi->num_rest_units));
}
85
86
84.7k
// Releases the unit_info array owned by rst_info and leaves the pointer NULL
// so a later free or re-allocation is safe.
void av1_free_restoration_struct(RestorationInfo *rst_info) {
  RestorationUnitInfo *const units = rst_info->unit_info;
  rst_info->unit_info = NULL;
  aom_free(units);
}
90
91
#if 0
92
// Pair of values for each sgrproj parameter:
93
// Index 0 corresponds to r[0], e[0]
94
// Index 1 corresponds to r[1], e[1]
95
int sgrproj_mtable[SGRPROJ_PARAMS][2];
96
97
static void GenSgrprojVtable(void) {
98
  for (int i = 0; i < SGRPROJ_PARAMS; ++i) {
99
    const sgr_params_type *const params = &av1_sgr_params[i];
100
    for (int j = 0; j < 2; ++j) {
101
      const int e = params->e[j];
102
      const int r = params->r[j];
103
      if (r == 0) {                 // filter is disabled
104
        sgrproj_mtable[i][j] = -1;  // mark invalid
105
      } else {                      // filter is enabled
106
        const int n = (2 * r + 1) * (2 * r + 1);
107
        const int n2e = n * n * e;
108
        assert(n2e != 0);
109
        sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
110
      }
111
    }
112
  }
113
}
114
#endif
115
116
28.2k
// Precalculation hook for loop restoration. Currently a no-op: the only
// work it could do, regenerating the self-guided table through
// GenSgrprojVtable(), is compiled out.
void av1_loop_restoration_precal(void) {
#if 0
  GenSgrprojVtable();
#endif
}
121
122
// Pads an 8-bit plane in place by edge replication.
//
// data points at the top-left pixel of a width x height region whose rows are
// stride bytes apart. border_horz pixels are written to the left and right of
// every row, then border_vert full (padded) rows are written above the first
// row and below the last row. The caller must own that surrounding storage.
static void extend_frame_lowbd(uint8_t *data, int width, int height,
                               ptrdiff_t stride, int border_horz,
                               int border_vert) {
  // Horizontal padding: repeat the first/last pixel of each row.
  for (int y = 0; y < height; ++y) {
    uint8_t *const row = data + y * stride;
    memset(row - border_horz, row[0], border_horz);
    memset(row + width, row[width - 1], border_horz);
  }

  // Vertical padding: repeat the already-padded first and last rows.
  uint8_t *const padded_origin = data - border_horz;
  for (int y = -border_vert; y < 0; ++y) {
    memcpy(padded_origin + y * stride, padded_origin, width + 2 * border_horz);
  }
  for (int y = height; y < height + border_vert; ++y) {
    memcpy(padded_origin + y * stride, padded_origin + (height - 1) * stride,
           width + 2 * border_horz);
  }
}
141
142
#if CONFIG_AV1_HIGHBITDEPTH
143
// 16-bit counterpart of extend_frame_lowbd(): pads a width x height plane of
// uint16_t samples in place by edge replication. stride is measured in
// samples. border_horz samples are added on each side of every row and
// border_vert padded rows are added above and below.
static void extend_frame_highbd(uint16_t *data, int width, int height,
                                ptrdiff_t stride, int border_horz,
                                int border_vert) {
  // Horizontal padding: repeat the first/last sample of each row.
  for (int y = 0; y < height; ++y) {
    uint16_t *const row = data + y * stride;
    for (int x = -border_horz; x < 0; ++x) {
      row[x] = row[0];
    }
    for (int x = width; x < width + border_horz; ++x) {
      row[x] = row[width - 1];
    }
  }

  // Vertical padding: repeat the already-padded first and last rows.
  uint16_t *const padded_origin = data - border_horz;
  for (int y = -border_vert; y < 0; ++y) {
    memcpy(padded_origin + y * stride, padded_origin,
           (width + 2 * border_horz) * sizeof(uint16_t));
  }
  for (int y = height; y < height + border_vert; ++y) {
    memcpy(padded_origin + y * stride, padded_origin + (height - 1) * stride,
           (width + 2 * border_horz) * sizeof(uint16_t));
  }
}
163
164
// Copies a width x height rectangle of 16-bit samples from src to dst, one
// row at a time. Both strides are measured in samples.
static void copy_rest_unit_highbd(int width, int height, const uint16_t *src,
                                  int src_stride, uint16_t *dst,
                                  int dst_stride) {
  for (int row = 0; row < height; ++row) {
    const uint16_t *const from = src + row * src_stride;
    uint16_t *const to = dst + row * dst_stride;
    memcpy(to, from, width * sizeof(*dst));
  }
}
170
#endif
171
172
// Pads a plane by edge replication, dispatching on sample size.
//
// When the build has CONFIG_AV1_HIGHBITDEPTH and highbd is non-zero, data is
// converted with CONVERT_TO_SHORTPTR and handled by extend_frame_highbd().
// In every other case it is treated as 8-bit data and handled by
// extend_frame_lowbd().
void av1_extend_frame(uint8_t *data, int width, int height, int stride,
                      int border_horz, int border_vert, int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (highbd) {
    extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
                        border_horz, border_vert);
  } else {
    extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
  }
#else
  // Without high-bitdepth support the flag is ignored.
  (void)highbd;
  extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
#endif
}
184
185
// Copies a width x height rectangle of 8-bit samples from src to dst, one row
// at a time. Both strides are measured in bytes.
static void copy_rest_unit_lowbd(int width, int height, const uint8_t *src,
                                 int src_stride, uint8_t *dst, int dst_stride) {
  for (int row = 0; row < height; ++row) {
    const uint8_t *const from = src + row * src_stride;
    uint8_t *const to = dst + row * dst_stride;
    memcpy(to, from, width);
  }
}
190
191
// Copies a width x height rectangle from src to dst, dispatching on sample
// size. When the build has CONFIG_AV1_HIGHBITDEPTH and highbd is non-zero,
// both pointers are converted with CONVERT_TO_SHORTPTR and the 16-bit copy is
// used. Otherwise the 8-bit copy is used and highbd is ignored.
static void copy_rest_unit(int width, int height, const uint8_t *src,
                           int src_stride, uint8_t *dst, int dst_stride,
                           int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (highbd) {
    copy_rest_unit_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
                          CONVERT_TO_SHORTPTR(dst), dst_stride);
  } else {
    copy_rest_unit_lowbd(width, height, src, src_stride, dst, dst_stride);
  }
#else
  (void)highbd;
  copy_rest_unit_lowbd(width, height, src, src_stride, dst, dst_stride);
#endif
}
204
205
691k
// Yields the pointer to hand to memcpy() for frame pointer d: when hbd is
// non-zero, d is first passed through CONVERT_TO_SHORTPTR and cast back to
// uint8_t *; otherwise d is used unchanged. Note that d is only evaluated in
// one of the two arms, but hbd is always evaluated.
#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
206
207
// With striped loop restoration, the filtering for each 64-pixel stripe gets
208
// most of its input from the output of CDEF (stored in data8), but we need to
209
// fill out a border of 3 pixels above/below the stripe according to the
210
// following rules:
211
//
212
// * At the top and bottom of the frame, we copy the outermost row of CDEF
213
//   pixels three times. This extension is done by a call to av1_extend_frame()
214
//   at the start of the loop restoration process, so the value of
215
//   copy_above/copy_below doesn't strictly matter.
216
//
217
// * All other boundaries are stripe boundaries within the frame. In that case,
218
//   we take 2 rows of deblocked pixels and extend them to 3 rows of context.
219
static void get_stripe_boundary_info(const RestorationTileLimits *limits,
220
                                     int plane_w, int plane_h, int ss_y,
221
47.9k
                                     int *copy_above, int *copy_below) {
222
47.9k
  (void)plane_w;
223
224
47.9k
  *copy_above = 1;
225
47.9k
  *copy_below = 1;
226
227
47.9k
  const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
228
47.9k
  const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
229
230
47.9k
  const int first_stripe_in_plane = (limits->v_start == 0);
231
47.9k
  const int this_stripe_height =
232
47.9k
      full_stripe_height - (first_stripe_in_plane ? runit_offset : 0);
233
47.9k
  const int last_stripe_in_plane =
234
47.9k
      (limits->v_start + this_stripe_height >= plane_h);
235
236
47.9k
  if (first_stripe_in_plane) *copy_above = 0;
237
47.9k
  if (last_stripe_in_plane) *copy_below = 0;
238
47.9k
}
239
240
// Overwrite the border pixels around a processing stripe so that the conditions
241
// listed above get_stripe_boundary_info() are preserved.
242
// We save the pixels which get overwritten into a temporary buffer, so that
243
// they can be restored by restore_processing_stripe_boundary() after we've
244
// processed the stripe.
245
//
246
// limits gives the rectangular limits of the remaining stripes for the current
247
// restoration unit. rsb is the stored stripe boundaries (taken from either
248
// deblock or CDEF output as necessary).
249
static void setup_processing_stripe_boundary(
250
    const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
251
    int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
252
47.9k
    RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
253
  // Offsets within the line buffers. The buffer logically starts at column
254
  // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
255
  // has column x0 in the buffer.
256
47.9k
  const int buf_stride = rsb->stripe_boundary_stride;
257
47.9k
  const int buf_x0_off = limits->h_start;
258
47.9k
  const int line_width =
259
47.9k
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
260
47.9k
  const int line_size = line_width << use_highbd;
261
262
47.9k
  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
263
264
  // Replace RESTORATION_BORDER pixels above the top of the stripe
265
  // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
266
  // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
267
  // duplicating the topmost of the 2 lines (see the AOMMAX call when
268
  // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
269
47.9k
  if (!opt) {
270
32.1k
    if (copy_above) {
271
23.6k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
272
273
94.5k
      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
274
70.8k
        const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
275
70.8k
        const int buf_off = buf_x0_off + buf_row * buf_stride;
276
70.8k
        const uint8_t *buf =
277
70.8k
            rsb->stripe_boundary_above + (buf_off << use_highbd);
278
70.8k
        uint8_t *dst8 = data8_tl + i * data_stride;
279
        // Save old pixels, then replace with data from stripe_boundary_above
280
70.8k
        memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
281
70.8k
               REAL_PTR(use_highbd, dst8), line_size);
282
70.8k
        memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
283
70.8k
      }
284
23.6k
    }
285
286
    // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
287
    // The second buffer row is repeated, so src_row gets the values 0, 1, 1
288
    // for i = 0, 1, 2.
289
32.1k
    if (copy_below) {
290
21.3k
      const int stripe_end = limits->v_start + h;
291
21.3k
      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
292
293
85.1k
      for (int i = 0; i < RESTORATION_BORDER; ++i) {
294
63.7k
        const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
295
63.7k
        const int buf_off = buf_x0_off + buf_row * buf_stride;
296
63.7k
        const uint8_t *src =
297
63.7k
            rsb->stripe_boundary_below + (buf_off << use_highbd);
298
299
63.7k
        uint8_t *dst8 = data8_bl + i * data_stride;
300
        // Save old pixels, then replace with data from stripe_boundary_below
301
63.7k
        memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
302
63.7k
        memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
303
63.7k
      }
304
21.3k
    }
305
32.1k
  } else {
306
15.8k
    if (copy_above) {
307
15.1k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
308
309
      // Only save and overwrite i=-RESTORATION_BORDER line.
310
15.1k
      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
311
      // Save old pixels, then replace with data from stripe_boundary_above
312
15.1k
      memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
313
15.1k
      memcpy(REAL_PTR(use_highbd, dst8),
314
15.1k
             REAL_PTR(use_highbd,
315
15.1k
                      data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
316
15.1k
             line_size);
317
15.1k
    }
318
319
15.8k
    if (copy_below) {
320
15.2k
      const int stripe_end = limits->v_start + h;
321
15.2k
      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
322
323
      // Only save and overwrite i=2 line.
324
15.2k
      uint8_t *dst8 = data8_bl + 2 * data_stride;
325
      // Save old pixels, then replace with data from stripe_boundary_below
326
15.2k
      memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
327
15.2k
      memcpy(REAL_PTR(use_highbd, dst8),
328
15.2k
             REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
329
15.2k
    }
330
15.8k
  }
331
47.9k
}
332
333
// Once a processing stripe is finished, this function sets the boundary
334
// pixels which were overwritten by setup_processing_stripe_boundary()
335
// back to their original values
336
static void restore_processing_stripe_boundary(
337
    const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
338
    int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
339
47.3k
    int copy_below, int opt) {
340
47.3k
  const int line_width =
341
47.3k
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
342
47.3k
  const int line_size = line_width << use_highbd;
343
344
47.3k
  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
345
346
47.3k
  if (!opt) {
347
32.1k
    if (copy_above) {
348
23.6k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
349
94.6k
      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
350
70.9k
        uint8_t *dst8 = data8_tl + i * data_stride;
351
70.9k
        memcpy(REAL_PTR(use_highbd, dst8),
352
70.9k
               rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
353
70.9k
      }
354
23.6k
    }
355
356
32.1k
    if (copy_below) {
357
21.3k
      const int stripe_bottom = limits->v_start + h;
358
21.3k
      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
359
360
85.3k
      for (int i = 0; i < RESTORATION_BORDER; ++i) {
361
63.9k
        if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
362
363
63.9k
        uint8_t *dst8 = data8_bl + i * data_stride;
364
63.9k
        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
365
63.9k
      }
366
21.3k
    }
367
32.1k
  } else {
368
15.2k
    if (copy_above) {
369
14.6k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
370
371
      // Only restore i=-RESTORATION_BORDER line.
372
14.6k
      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
373
14.6k
      memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
374
14.6k
    }
375
376
15.2k
    if (copy_below) {
377
14.6k
      const int stripe_bottom = limits->v_start + h;
378
14.6k
      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
379
380
      // Only restore i=2 line.
381
14.6k
      if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
382
14.6k
        uint8_t *dst8 = data8_bl + 2 * data_stride;
383
14.6k
        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
384
14.6k
      }
385
14.6k
    }
386
15.2k
  }
387
47.3k
}
388
389
static void wiener_filter_stripe(const RestorationUnitInfo *rui,
390
                                 int stripe_width, int stripe_height,
391
                                 int procunit_width, const uint8_t *src,
392
                                 int src_stride, uint8_t *dst, int dst_stride,
393
                                 int32_t *tmpbuf, int bit_depth,
394
7.29k
                                 struct aom_internal_error_info *error_info) {
395
7.29k
  (void)tmpbuf;
396
7.29k
  (void)bit_depth;
397
7.29k
  (void)error_info;
398
7.29k
  assert(bit_depth == 8);
399
7.29k
  const WienerConvolveParams conv_params = get_conv_params_wiener(8);
400
401
15.8k
  for (int j = 0; j < stripe_width; j += procunit_width) {
402
8.50k
    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
403
8.50k
    const uint8_t *src_p = src + j;
404
8.50k
    uint8_t *dst_p = dst + j;
405
8.50k
    av1_wiener_convolve_add_src(
406
8.50k
        src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
407
8.50k
        rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
408
8.50k
  }
409
7.29k
}
410
411
/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
412
   over the input. The window is of size (2r + 1)x(2r + 1), and we
413
   specialize to r = 1, 2. Any other radius trips the assert in boxsum().
414
415
   Each loop follows the same format: We keep a window's worth of input
416
   in individual variables and select data out of that as appropriate.
417
*/
418
static void boxsum1(int32_t *src, int width, int height, int src_stride,
419
45.7k
                    int sqr, int32_t *dst, int dst_stride) {
420
45.7k
  int i, j, a, b, c;
421
45.7k
  assert(width > 2 * SGRPROJ_BORDER_HORZ);
422
45.7k
  assert(height > 2 * SGRPROJ_BORDER_VERT);
423
424
  // Vertical sum over 3-pixel regions, from src into dst.
425
45.7k
  if (!sqr) {
426
824k
    for (j = 0; j < width; ++j) {
427
801k
      a = src[j];
428
801k
      b = src[src_stride + j];
429
801k
      c = src[2 * src_stride + j];
430
431
801k
      dst[j] = a + b;
432
33.1M
      for (i = 1; i < height - 2; ++i) {
433
        // Loop invariant: At the start of each iteration,
434
        // a = src[(i - 1) * src_stride + j]
435
        // b = src[(i    ) * src_stride + j]
436
        // c = src[(i + 1) * src_stride + j]
437
32.3M
        dst[i * dst_stride + j] = a + b + c;
438
32.3M
        a = b;
439
32.3M
        b = c;
440
32.3M
        c = src[(i + 2) * src_stride + j];
441
32.3M
      }
442
801k
      dst[i * dst_stride + j] = a + b + c;
443
801k
      dst[(i + 1) * dst_stride + j] = b + c;
444
801k
    }
445
22.8k
  } else {
446
824k
    for (j = 0; j < width; ++j) {
447
801k
      a = src[j] * src[j];
448
801k
      b = src[src_stride + j] * src[src_stride + j];
449
801k
      c = src[2 * src_stride + j] * src[2 * src_stride + j];
450
451
801k
      dst[j] = a + b;
452
33.1M
      for (i = 1; i < height - 2; ++i) {
453
32.3M
        dst[i * dst_stride + j] = a + b + c;
454
32.3M
        a = b;
455
32.3M
        b = c;
456
32.3M
        c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
457
32.3M
      }
458
801k
      dst[i * dst_stride + j] = a + b + c;
459
801k
      dst[(i + 1) * dst_stride + j] = b + c;
460
801k
    }
461
22.8k
  }
462
463
  // Horizontal sum over 3-pixel regions of dst
464
1.71M
  for (i = 0; i < height; ++i) {
465
1.67M
    a = dst[i * dst_stride];
466
1.67M
    b = dst[i * dst_stride + 1];
467
1.67M
    c = dst[i * dst_stride + 2];
468
469
1.67M
    dst[i * dst_stride] = a + b;
470
58.6M
    for (j = 1; j < width - 2; ++j) {
471
      // Loop invariant: At the start of each iteration,
472
      // a = src[i * src_stride + (j - 1)]
473
      // b = src[i * src_stride + (j    )]
474
      // c = src[i * src_stride + (j + 1)]
475
56.9M
      dst[i * dst_stride + j] = a + b + c;
476
56.9M
      a = b;
477
56.9M
      b = c;
478
56.9M
      c = dst[i * dst_stride + (j + 2)];
479
56.9M
    }
480
1.67M
    dst[i * dst_stride + j] = a + b + c;
481
1.67M
    dst[i * dst_stride + (j + 1)] = b + c;
482
1.67M
  }
483
45.7k
}
484
485
static void boxsum2(int32_t *src, int width, int height, int src_stride,
486
42.0k
                    int sqr, int32_t *dst, int dst_stride) {
487
42.0k
  int i, j, a, b, c, d, e;
488
42.0k
  assert(width > 2 * SGRPROJ_BORDER_HORZ);
489
42.0k
  assert(height > 2 * SGRPROJ_BORDER_VERT);
490
491
  // Vertical sum over 5-pixel regions, from src into dst.
492
42.0k
  if (!sqr) {
493
723k
    for (j = 0; j < width; ++j) {
494
702k
      a = src[j];
495
702k
      b = src[src_stride + j];
496
702k
      c = src[2 * src_stride + j];
497
702k
      d = src[3 * src_stride + j];
498
702k
      e = src[4 * src_stride + j];
499
500
702k
      dst[j] = a + b + c;
501
702k
      dst[dst_stride + j] = a + b + c + d;
502
27.0M
      for (i = 2; i < height - 3; ++i) {
503
        // Loop invariant: At the start of each iteration,
504
        // a = src[(i - 2) * src_stride + j]
505
        // b = src[(i - 1) * src_stride + j]
506
        // c = src[(i    ) * src_stride + j]
507
        // d = src[(i + 1) * src_stride + j]
508
        // e = src[(i + 2) * src_stride + j]
509
26.3M
        dst[i * dst_stride + j] = a + b + c + d + e;
510
26.3M
        a = b;
511
26.3M
        b = c;
512
26.3M
        c = d;
513
26.3M
        d = e;
514
26.3M
        e = src[(i + 3) * src_stride + j];
515
26.3M
      }
516
702k
      dst[i * dst_stride + j] = a + b + c + d + e;
517
702k
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
518
702k
      dst[(i + 2) * dst_stride + j] = c + d + e;
519
702k
    }
520
21.1k
  } else {
521
724k
    for (j = 0; j < width; ++j) {
522
703k
      a = src[j] * src[j];
523
703k
      b = src[src_stride + j] * src[src_stride + j];
524
703k
      c = src[2 * src_stride + j] * src[2 * src_stride + j];
525
703k
      d = src[3 * src_stride + j] * src[3 * src_stride + j];
526
703k
      e = src[4 * src_stride + j] * src[4 * src_stride + j];
527
528
703k
      dst[j] = a + b + c;
529
703k
      dst[dst_stride + j] = a + b + c + d;
530
27.1M
      for (i = 2; i < height - 3; ++i) {
531
26.4M
        dst[i * dst_stride + j] = a + b + c + d + e;
532
26.4M
        a = b;
533
26.4M
        b = c;
534
26.4M
        c = d;
535
26.4M
        d = e;
536
26.4M
        e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
537
26.4M
      }
538
703k
      dst[i * dst_stride + j] = a + b + c + d + e;
539
703k
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
540
703k
      dst[(i + 2) * dst_stride + j] = c + d + e;
541
703k
    }
542
20.9k
  }
543
544
  // Horizontal sum over 5-pixel regions of dst
545
1.47M
  for (i = 0; i < height; ++i) {
546
1.43M
    a = dst[i * dst_stride];
547
1.43M
    b = dst[i * dst_stride + 1];
548
1.43M
    c = dst[i * dst_stride + 2];
549
1.43M
    d = dst[i * dst_stride + 3];
550
1.43M
    e = dst[i * dst_stride + 4];
551
552
1.43M
    dst[i * dst_stride] = a + b + c;
553
1.43M
    dst[i * dst_stride + 1] = a + b + c + d;
554
48.1M
    for (j = 2; j < width - 3; ++j) {
555
      // Loop invariant: At the start of each iteration,
556
      // a = src[i * src_stride + (j - 2)]
557
      // b = src[i * src_stride + (j - 1)]
558
      // c = src[i * src_stride + (j    )]
559
      // d = src[i * src_stride + (j + 1)]
560
      // e = src[i * src_stride + (j + 2)]
561
46.7M
      dst[i * dst_stride + j] = a + b + c + d + e;
562
46.7M
      a = b;
563
46.7M
      b = c;
564
46.7M
      c = d;
565
46.7M
      d = e;
566
46.7M
      e = dst[i * dst_stride + (j + 3)];
567
46.7M
    }
568
1.43M
    dst[i * dst_stride + j] = a + b + c + d + e;
569
1.43M
    dst[i * dst_stride + (j + 1)] = b + c + d + e;
570
1.43M
    dst[i * dst_stride + (j + 2)] = c + d + e;
571
1.43M
  }
572
42.0k
}
573
574
// Dispatches to the box-sum routine for window radius r (1 -> 3x3 window,
// 2 -> 5x5 window). Any other radius is a programming error and trips the
// assert; with asserts compiled out dst is left untouched.
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                   int sqr, int32_t *dst, int dst_stride) {
  switch (r) {
    case 1:
      boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    case 2:
      boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    default:
      assert(0 && "Invalid value of r in self-guided filter");
      break;
  }
}
583
584
28.3k
void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
585
28.3k
  if (params->r[0] == 0) {
586
7.15k
    xq[0] = 0;
587
7.15k
    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
588
21.1k
  } else if (params->r[1] == 0) {
589
5.41k
    xq[0] = xqd[0];
590
5.41k
    xq[1] = 0;
591
15.7k
  } else {
592
15.7k
    xq[0] = xqd[0];
593
15.7k
    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
594
15.7k
  }
595
28.3k
}
596
597
// Lookup table indexed by z clamped to [0, 255]; calculate_intermediate_result()
// uses it to produce the blend weight A[k], which lies in [1, 256].
// Apart from the first and last entries, the values appear to follow
// round(256 * z / (z + 1)).
const int32_t av1_x_by_xplus1[256] = {
598
  // Special case: Map 0 -> 1 (corresponding to a value of 1/256)
599
  // instead of 0. See the comments in calculate_intermediate_result() for why
600
  1,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
601
  240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
602
  248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
603
  250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
604
  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
605
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
606
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
607
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
608
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
609
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
610
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
611
  254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
612
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
613
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
614
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
615
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
616
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
617
  256,
618
};
619
620
// Fixed-point reciprocals: entry n - 1 holds round(2^12 / n). The 25 values
// listed cover n = 1..25; calculate_intermediate_result() indexes the table
// with n - 1, where n = (2r + 1)^2 is the number of pixels in the box window.
const int32_t av1_one_by_x[MAX_NELEM] = {
621
  4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
622
  293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
623
};
624
625
static void calculate_intermediate_result(int32_t *dgd, int width, int height,
626
                                          int dgd_stride, int bit_depth,
627
                                          int sgr_params_idx, int radius_idx,
628
44.0k
                                          int pass, int32_t *A, int32_t *B) {
629
44.0k
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
630
44.0k
  const int r = params->r[radius_idx];
631
44.0k
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
632
44.0k
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
633
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
634
  // leading to a significant speed improvement.
635
  // We also align the stride to a multiple of 16 bytes, for consistency
636
  // with the SIMD version of this function.
637
44.0k
  int buf_stride = ((width_ext + 3) & ~3) + 16;
638
44.0k
  const int step = pass == 0 ? 1 : 2;
639
44.0k
  int i, j;
640
641
44.0k
  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
642
44.0k
  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
643
44.0k
         "Need SGRPROJ_BORDER_* >= r+1");
644
645
44.0k
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
646
44.0k
         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
647
44.0k
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
648
44.0k
         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
649
44.0k
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
650
44.0k
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
651
  // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
652
  // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
653
1.49M
  for (i = -1; i < height + 1; i += step) {
654
40.3M
    for (j = -1; j < width + 1; ++j) {
655
38.8M
      const int k = i * buf_stride + j;
656
38.8M
      const int n = (2 * r + 1) * (2 * r + 1);
657
658
      // a < 2^16 * n < 2^22 regardless of bit depth
659
38.8M
      uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
660
      // b < 2^8 * n < 2^14 regardless of bit depth
661
38.8M
      uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
662
663
      // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
664
      // and p itself satisfies p < 2^14 * n^2 < 2^26.
665
      // This bound on p is due to:
666
      // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
667
      //
668
      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
669
      // This is an artefact of rounding, and can only happen if all pixels
670
      // are (almost) identical, so in this case we saturate to p=0.
671
38.8M
      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
672
673
38.8M
      const uint32_t s = params->s[radius_idx];
674
675
      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
676
      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
677
      // (this holds even after accounting for the rounding in s)
678
38.8M
      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
679
680
      // Note: We have to be quite careful about the value of A[k].
681
      // This is used as a blend factor between individual pixel values and the
682
      // local mean. So it logically has a range of [0, 256], including both
683
      // endpoints.
684
      //
685
      // This is a pain for hardware, as we'd like something which can be stored
686
      // in exactly 8 bits.
687
      // Further, in the calculation of B[k] below, if z == 0 and r == 2,
688
      // then A[k] "should be" 0. But then we can end up setting B[k] to a value
689
      // slightly above 2^(8 + bit depth), due to rounding in the value of
690
      // av1_one_by_x[25-1].
691
      //
692
      // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
693
      // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
694
      // overflow), without significantly affecting the final result: z == 0
695
      // implies that the image is essentially "flat", so the local mean and
696
      // individual pixel values are very similar.
697
      //
698
      // Note that saturating on the other side, ie. requring A[k] <= 255,
699
      // would be a bad idea, as that corresponds to the case where the image
700
      // is very variable, when we want to preserve the local pixel value as
701
      // much as possible.
702
38.8M
      A[k] = av1_x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
703
704
      // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
705
      // av1_one_by_x[n - 1] = round(2^12 / n)
706
      // => the product here is < 2^(20 + bit_depth) <= 2^32,
707
      // and B[k] is set to a value < 2^(8 + bit depth)
708
      // This holds even with the rounding in av1_one_by_x and in the overall
709
      // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
710
38.8M
      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
711
38.8M
                                             (uint32_t)B[k] *
712
38.8M
                                             (uint32_t)av1_one_by_x[n - 1],
713
38.8M
                                         SGRPROJ_RECIP_BITS);
714
38.8M
    }
715
1.44M
  }
716
44.0k
}
717
718
static void selfguided_restoration_fast_internal(
719
    int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
720
21.1k
    int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
721
21.1k
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
722
21.1k
  const int r = params->r[radius_idx];
723
21.1k
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
724
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
725
  // leading to a significant speed improvement.
726
  // We also align the stride to a multiple of 16 bytes, for consistency
727
  // with the SIMD version of this function.
728
21.1k
  int buf_stride = ((width_ext + 3) & ~3) + 16;
729
21.1k
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
730
21.1k
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
731
21.1k
  int32_t *A = A_;
732
21.1k
  int32_t *B = B_;
733
21.1k
  int i, j;
734
21.1k
  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
735
21.1k
                                sgr_params_idx, radius_idx, 1, A, B);
736
21.1k
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
737
21.1k
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
738
739
  // Use the A[] and B[] arrays to calculate the filtered image
740
21.1k
  (void)r;
741
21.1k
  assert(r == 2);
742
932k
  for (i = 0; i < height; ++i) {
743
911k
    if (!(i & 1)) {  // even row
744
11.5M
      for (j = 0; j < width; ++j) {
745
11.0M
        const int k = i * buf_stride + j;
746
11.0M
        const int l = i * dgd_stride + j;
747
11.0M
        const int m = i * dst_stride + j;
748
11.0M
        const int nb = 5;
749
11.0M
        const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
750
11.0M
                          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
751
11.0M
                           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
752
11.0M
                              5;
753
11.0M
        const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
754
11.0M
                          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
755
11.0M
                           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
756
11.0M
                              5;
757
11.0M
        const int32_t v = a * dgd[l] + b;
758
11.0M
        dst[m] =
759
11.0M
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
760
11.0M
      }
761
457k
    } else {  // odd row
762
11.4M
      for (j = 0; j < width; ++j) {
763
11.0M
        const int k = i * buf_stride + j;
764
11.0M
        const int l = i * dgd_stride + j;
765
11.0M
        const int m = i * dst_stride + j;
766
11.0M
        const int nb = 4;
767
11.0M
        const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
768
11.0M
        const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
769
11.0M
        const int32_t v = a * dgd[l] + b;
770
11.0M
        dst[m] =
771
11.0M
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
772
11.0M
      }
773
453k
    }
774
911k
  }
775
21.1k
}
776
777
static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
778
                                            int dgd_stride, int32_t *dst,
779
                                            int dst_stride, int bit_depth,
780
                                            int sgr_params_idx,
781
22.9k
                                            int radius_idx) {
782
22.9k
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
783
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
784
  // leading to a significant speed improvement.
785
  // We also align the stride to a multiple of 16 bytes, for consistency
786
  // with the SIMD version of this function.
787
22.9k
  int buf_stride = ((width_ext + 3) & ~3) + 16;
788
22.9k
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
789
22.9k
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
790
22.9k
  int32_t *A = A_;
791
22.9k
  int32_t *B = B_;
792
22.9k
  int i, j;
793
22.9k
  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
794
22.9k
                                sgr_params_idx, radius_idx, 0, A, B);
795
22.9k
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
796
22.9k
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
797
798
  // Use the A[] and B[] arrays to calculate the filtered image
799
992k
  for (i = 0; i < height; ++i) {
800
25.1M
    for (j = 0; j < width; ++j) {
801
24.2M
      const int k = i * buf_stride + j;
802
24.2M
      const int l = i * dgd_stride + j;
803
24.2M
      const int m = i * dst_stride + j;
804
24.2M
      const int nb = 5;
805
24.2M
      const int32_t a =
806
24.2M
          (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
807
24.2M
              4 +
808
24.2M
          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
809
24.2M
           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
810
24.2M
              3;
811
24.2M
      const int32_t b =
812
24.2M
          (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
813
24.2M
              4 +
814
24.2M
          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
815
24.2M
           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
816
24.2M
              3;
817
24.2M
      const int32_t v = a * dgd[l] + b;
818
24.2M
      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
819
24.2M
    }
820
969k
  }
821
22.9k
}
822
823
int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
824
                                 int dgd_stride, int32_t *flt0, int32_t *flt1,
825
                                 int flt_stride, int sgr_params_idx,
826
28.2k
                                 int bit_depth, int highbd) {
827
28.2k
  int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
828
28.2k
  const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
829
28.2k
  int32_t *dgd32 =
830
28.2k
      dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
831
832
28.2k
  if (highbd) {
833
17.0k
    const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
834
677k
    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
835
25.4M
      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
836
24.8M
        dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
837
24.8M
      }
838
660k
    }
839
17.0k
  } else {
840
603k
    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
841
16.0M
      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
842
15.4M
        dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
843
15.4M
      }
844
591k
    }
845
11.2k
  }
846
847
28.2k
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
848
  // If params->r == 0 we skip the corresponding filter. We only allow one of
849
  // the radii to be 0, as having both equal to 0 would be equivalent to
850
  // skipping SGR entirely.
851
28.2k
  assert(!(params->r[0] == 0 && params->r[1] == 0));
852
853
28.2k
  if (params->r[0] > 0)
854
21.1k
    selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
855
21.1k
                                         flt0, flt_stride, bit_depth,
856
21.1k
                                         sgr_params_idx, 0);
857
28.2k
  if (params->r[1] > 0)
858
22.9k
    selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
859
22.9k
                                    flt_stride, bit_depth, sgr_params_idx, 1);
860
28.2k
  return 0;
861
28.2k
}
862
863
int av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
864
                                       int height, int stride, int eps,
865
                                       const int *xqd, uint8_t *dst8,
866
                                       int dst_stride, int32_t *tmpbuf,
867
28.2k
                                       int bit_depth, int highbd) {
868
28.2k
  int32_t *flt0 = tmpbuf;
869
28.2k
  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
870
28.2k
  assert(width * height <= RESTORATION_UNITPELS_MAX);
871
872
28.2k
  const int ret = av1_selfguided_restoration_c(
873
28.2k
      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
874
28.2k
  if (ret != 0) return ret;
875
28.2k
  const sgr_params_type *const params = &av1_sgr_params[eps];
876
28.2k
  int xq[2];
877
28.2k
  av1_decode_xq(xqd, xq, params);
878
780k
  for (int i = 0; i < height; ++i) {
879
24.6M
    for (int j = 0; j < width; ++j) {
880
23.9M
      const int k = i * width + j;
881
23.9M
      uint8_t *dst8ij = dst8 + i * dst_stride + j;
882
23.9M
      const uint8_t *dat8ij = dat8 + i * stride + j;
883
884
23.9M
      const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
885
23.9M
      const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
886
23.9M
      int32_t v = u << SGRPROJ_PRJ_BITS;
887
      // If params->r == 0 then we skipped the filtering in
888
      // av1_selfguided_restoration_c, i.e. flt[k] == u
889
23.9M
      if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
890
23.9M
      if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
891
23.9M
      const int16_t w =
892
23.9M
          (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
893
894
23.9M
      const uint16_t out = clip_pixel_highbd(w, bit_depth);
895
23.9M
      if (highbd)
896
15.9M
        *CONVERT_TO_SHORTPTR(dst8ij) = out;
897
8.03M
      else
898
8.03M
        *dst8ij = (uint8_t)out;
899
23.9M
    }
900
752k
  }
901
28.2k
  return 0;
902
28.2k
}
903
904
static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
905
                                  int stripe_width, int stripe_height,
906
                                  int procunit_width, const uint8_t *src,
907
                                  int src_stride, uint8_t *dst, int dst_stride,
908
                                  int32_t *tmpbuf, int bit_depth,
909
9.75k
                                  struct aom_internal_error_info *error_info) {
910
9.75k
  (void)bit_depth;
911
9.75k
  assert(bit_depth == 8);
912
913
21.0k
  for (int j = 0; j < stripe_width; j += procunit_width) {
914
11.2k
    int w = AOMMIN(procunit_width, stripe_width - j);
915
11.2k
    if (av1_apply_selfguided_restoration(
916
11.2k
            src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
917
11.2k
            rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth,
918
11.2k
            0) != 0) {
919
0
      aom_internal_error(
920
0
          error_info, AOM_CODEC_MEM_ERROR,
921
0
          "Error allocating buffer in av1_apply_selfguided_restoration");
922
0
    }
923
11.2k
  }
924
9.75k
}
925
926
#if CONFIG_AV1_HIGHBITDEPTH
927
static void wiener_filter_stripe_highbd(
928
    const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
929
    int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
930
    int dst_stride, int32_t *tmpbuf, int bit_depth,
931
18.9k
    struct aom_internal_error_info *error_info) {
932
18.9k
  (void)tmpbuf;
933
18.9k
  (void)error_info;
934
18.9k
  const WienerConvolveParams conv_params = get_conv_params_wiener(bit_depth);
935
936
41.1k
  for (int j = 0; j < stripe_width; j += procunit_width) {
937
22.1k
    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
938
22.1k
    const uint8_t *src8_p = src8 + j;
939
22.1k
    uint8_t *dst8_p = dst8 + j;
940
22.1k
    av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
941
22.1k
                                       rui->wiener_info.hfilter, 16,
942
22.1k
                                       rui->wiener_info.vfilter, 16, w,
943
22.1k
                                       stripe_height, &conv_params, bit_depth);
944
22.1k
  }
945
18.9k
}
946
947
static void sgrproj_filter_stripe_highbd(
948
    const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
949
    int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
950
    int dst_stride, int32_t *tmpbuf, int bit_depth,
951
12.1k
    struct aom_internal_error_info *error_info) {
952
29.1k
  for (int j = 0; j < stripe_width; j += procunit_width) {
953
17.0k
    int w = AOMMIN(procunit_width, stripe_width - j);
954
17.0k
    if (av1_apply_selfguided_restoration(
955
17.0k
            src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
956
17.0k
            rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth,
957
17.0k
            1) != 0) {
958
0
      aom_internal_error(
959
0
          error_info, AOM_CODEC_MEM_ERROR,
960
0
          "Error allocating buffer in av1_apply_selfguided_restoration");
961
0
    }
962
17.0k
  }
963
12.1k
}
964
#endif  // CONFIG_AV1_HIGHBITDEPTH
965
966
typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
967
                                  int stripe_width, int stripe_height,
968
                                  int procunit_width, const uint8_t *src,
969
                                  int src_stride, uint8_t *dst, int dst_stride,
970
                                  int32_t *tmpbuf, int bit_depth,
971
                                  struct aom_internal_error_info *error_info);
972
973
#if CONFIG_AV1_HIGHBITDEPTH
974
#define NUM_STRIPE_FILTERS 4
975
static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
976
  wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
977
  sgrproj_filter_stripe_highbd
978
};
979
#else
980
#define NUM_STRIPE_FILTERS 2
981
static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
982
  wiener_filter_stripe, sgrproj_filter_stripe
983
};
984
#endif  // CONFIG_AV1_HIGHBITDEPTH
985
986
// Filter one restoration unit
987
void av1_loop_restoration_filter_unit(
988
    const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
989
    const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
990
    int plane_w, int plane_h, int ss_x, int ss_y, int highbd, int bit_depth,
991
    uint8_t *data8, int stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf,
992
49.9k
    int optimized_lr, struct aom_internal_error_info *error_info) {
993
49.9k
  RestorationType unit_rtype = rui->restoration_type;
994
995
49.9k
  int unit_h = limits->v_end - limits->v_start;
996
49.9k
  int unit_w = limits->h_end - limits->h_start;
997
49.9k
  uint8_t *data8_tl =
998
49.9k
      data8 + limits->v_start * (ptrdiff_t)stride + limits->h_start;
999
49.9k
  uint8_t *dst8_tl =
1000
49.9k
      dst8 + limits->v_start * (ptrdiff_t)dst_stride + limits->h_start;
1001
1002
49.9k
  if (unit_rtype == RESTORE_NONE) {
1003
23.2k
    copy_rest_unit(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride,
1004
23.2k
                   highbd);
1005
23.2k
    return;
1006
23.2k
  }
1007
1008
26.6k
  const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
1009
26.6k
  assert(filter_idx < NUM_STRIPE_FILTERS);
1010
26.6k
  const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
1011
1012
26.6k
  const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
1013
1014
  // Filter the whole image one stripe at a time
1015
26.6k
  RestorationTileLimits remaining_stripes = *limits;
1016
26.6k
  int i = 0;
1017
74.5k
  while (i < unit_h) {
1018
47.9k
    int copy_above, copy_below;
1019
47.9k
    remaining_stripes.v_start = limits->v_start + i;
1020
1021
47.9k
    get_stripe_boundary_info(&remaining_stripes, plane_w, plane_h, ss_y,
1022
47.9k
                             &copy_above, &copy_below);
1023
1024
47.9k
    const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1025
47.9k
    const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
1026
1027
    // Work out where this stripe's boundaries are within
1028
    // rsb->stripe_boundary_{above,below}
1029
47.9k
    const int frame_stripe =
1030
47.9k
        (remaining_stripes.v_start + runit_offset) / full_stripe_height;
1031
47.9k
    const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
1032
1033
    // Calculate this stripe's height, based on two rules:
1034
    // * The topmost stripe in the frame is 8 luma pixels shorter than usual.
1035
    // * We can't extend past the end of the current restoration unit
1036
47.9k
    const int nominal_stripe_height =
1037
47.9k
        full_stripe_height - ((frame_stripe == 0) ? runit_offset : 0);
1038
47.9k
    const int h = AOMMIN(nominal_stripe_height,
1039
47.9k
                         remaining_stripes.v_end - remaining_stripes.v_start);
1040
1041
47.9k
    setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
1042
47.9k
                                     h, data8, stride, rlbs, copy_above,
1043
47.9k
                                     copy_below, optimized_lr);
1044
1045
47.9k
    stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
1046
47.9k
                  dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth,
1047
47.9k
                  error_info);
1048
1049
47.9k
    restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
1050
47.9k
                                       data8, stride, copy_above, copy_below,
1051
47.9k
                                       optimized_lr);
1052
1053
47.9k
    i += h;
1054
47.9k
  }
1055
26.6k
}
1056
1057
static void filter_frame_on_unit(const RestorationTileLimits *limits,
1058
                                 int rest_unit_idx, void *priv, int32_t *tmpbuf,
1059
                                 RestorationLineBuffers *rlbs,
1060
50.0k
                                 struct aom_internal_error_info *error_info) {
1061
50.0k
  FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
1062
50.0k
  const RestorationInfo *rsi = ctxt->rsi;
1063
1064
50.0k
  av1_loop_restoration_filter_unit(
1065
50.0k
      limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs,
1066
50.0k
      ctxt->plane_w, ctxt->plane_h, ctxt->ss_x, ctxt->ss_y, ctxt->highbd,
1067
50.0k
      ctxt->bit_depth, ctxt->data8, ctxt->data_stride, ctxt->dst8,
1068
50.0k
      ctxt->dst_stride, tmpbuf, rsi->optimized_lr, error_info);
1069
50.0k
}
1070
1071
void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
1072
                                            YV12_BUFFER_CONFIG *frame,
1073
                                            AV1_COMMON *cm, int optimized_lr,
1074
6.77k
                                            int num_planes) {
1075
6.77k
  const SequenceHeader *const seq_params = cm->seq_params;
1076
6.77k
  const int bit_depth = seq_params->bit_depth;
1077
6.77k
  const int highbd = seq_params->use_highbitdepth;
1078
6.77k
  lr_ctxt->dst = &cm->rst_frame;
1079
1080
6.77k
  const int frame_width = frame->crop_widths[0];
1081
6.77k
  const int frame_height = frame->crop_heights[0];
1082
6.77k
  if (aom_realloc_frame_buffer(
1083
6.77k
          lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
1084
6.77k
          seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
1085
6.77k
          cm->features.byte_alignment, NULL, NULL, NULL, false,
1086
6.77k
          0) != AOM_CODEC_OK)
1087
0
    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
1088
0
                       "Failed to allocate restoration dst buffer");
1089
1090
6.77k
  lr_ctxt->on_rest_unit = filter_frame_on_unit;
1091
6.77k
  lr_ctxt->frame = frame;
1092
23.6k
  for (int plane = 0; plane < num_planes; ++plane) {
1093
16.9k
    RestorationInfo *rsi = &cm->rst_info[plane];
1094
16.9k
    RestorationType rtype = rsi->frame_restoration_type;
1095
16.9k
    rsi->optimized_lr = optimized_lr;
1096
16.9k
    lr_ctxt->ctxt[plane].rsi = rsi;
1097
1098
16.9k
    if (rtype == RESTORE_NONE) {
1099
5.93k
      continue;
1100
5.93k
    }
1101
1102
10.9k
    const int is_uv = plane > 0;
1103
10.9k
    int plane_w, plane_h;
1104
10.9k
    av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
1105
10.9k
    assert(plane_w == frame->crop_widths[is_uv]);
1106
10.9k
    assert(plane_h == frame->crop_heights[is_uv]);
1107
1108
10.9k
    av1_extend_frame(frame->buffers[plane], plane_w, plane_h,
1109
10.9k
                     frame->strides[is_uv], RESTORATION_BORDER,
1110
10.9k
                     RESTORATION_BORDER, highbd);
1111
1112
10.9k
    FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
1113
10.9k
    lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
1114
10.9k
    lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
1115
10.9k
    lr_plane_ctxt->plane_w = plane_w;
1116
10.9k
    lr_plane_ctxt->plane_h = plane_h;
1117
10.9k
    lr_plane_ctxt->highbd = highbd;
1118
10.9k
    lr_plane_ctxt->bit_depth = bit_depth;
1119
10.9k
    lr_plane_ctxt->data8 = frame->buffers[plane];
1120
10.9k
    lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
1121
10.9k
    lr_plane_ctxt->data_stride = frame->strides[is_uv];
1122
10.9k
    lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
1123
10.9k
  }
1124
6.77k
}
1125
1126
static void loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
1127
1.85k
                                         AV1_COMMON *cm, int num_planes) {
1128
1.85k
  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
1129
1.85k
                           YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
1130
1.85k
                           int vstart, int vend);
1131
1.85k
  static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y,
1132
1.85k
                                         aom_yv12_partial_coloc_copy_u,
1133
1.85k
                                         aom_yv12_partial_coloc_copy_v };
1134
1.85k
  assert(num_planes <= 3);
1135
4.43k
  for (int plane = 0; plane < num_planes; ++plane) {
1136
2.57k
    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
1137
2.30k
    FilterFrameCtxt *lr_plane_ctxt = &loop_rest_ctxt->ctxt[plane];
1138
2.30k
    copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, 0,
1139
2.30k
                     lr_plane_ctxt->plane_w, 0, lr_plane_ctxt->plane_h);
1140
2.30k
  }
1141
1.85k
}
1142
1143
// Call on_rest_unit for each loop restoration unit in the plane.
1144
static void foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
1145
                                       rest_unit_visitor_t on_rest_unit,
1146
                                       void *priv, int32_t *tmpbuf,
1147
2.30k
                                       RestorationLineBuffers *rlbs) {
1148
2.30k
  const RestorationInfo *rsi = &cm->rst_info[plane];
1149
2.30k
  const int hnum_rest_units = rsi->horz_units;
1150
2.30k
  const int vnum_rest_units = rsi->vert_units;
1151
2.30k
  const int unit_size = rsi->restoration_unit_size;
1152
1153
2.30k
  const int is_uv = plane > 0;
1154
2.30k
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
1155
2.30k
  const int ext_size = unit_size * 3 / 2;
1156
2.30k
  int plane_w, plane_h;
1157
2.30k
  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
1158
1159
2.30k
  int y0 = 0, i = 0;
1160
5.77k
  while (y0 < plane_h) {
1161
3.47k
    int remaining_h = plane_h - y0;
1162
3.47k
    int h = (remaining_h < ext_size) ? remaining_h : unit_size;
1163
1164
3.47k
    RestorationTileLimits limits;
1165
3.47k
    limits.v_start = y0;
1166
3.47k
    limits.v_end = y0 + h;
1167
3.47k
    assert(limits.v_end <= plane_h);
1168
    // Offset upwards to align with the restoration processing stripe
1169
3.47k
    const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
1170
3.47k
    limits.v_start = AOMMAX(0, limits.v_start - voffset);
1171
3.47k
    if (limits.v_end < plane_h) limits.v_end -= voffset;
1172
1173
3.47k
    av1_foreach_rest_unit_in_row(&limits, plane_w, on_rest_unit, i, unit_size,
1174
3.47k
                                 hnum_rest_units, vnum_rest_units, plane, priv,
1175
3.47k
                                 tmpbuf, rlbs, av1_lr_sync_read_dummy,
1176
3.47k
                                 av1_lr_sync_write_dummy, NULL, cm->error);
1177
1178
3.47k
    y0 += h;
1179
3.47k
    ++i;
1180
3.47k
  }
1181
2.30k
}
1182
1183
// Runs the unit visitor stored in lr_ctxt over every plane whose frame
// restoration type is not RESTORE_NONE, using the shared scratch buffer and
// line buffers owned by cm.
static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
                                        int num_planes) {
  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
      foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
                                 &lr_ctxt->ctxt[plane], cm->rst_tmpbuf,
                                 cm->rlbs);
    }
  }
}
1196
1197
void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
1198
                                       AV1_COMMON *cm, int optimized_lr,
1199
1.85k
                                       void *lr_ctxt) {
1200
1.85k
  assert(!cm->features.all_lossless);
1201
1.85k
  const int num_planes = av1_num_planes(cm);
1202
1203
1.85k
  AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
1204
1205
1.85k
  av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
1206
1.85k
                                         optimized_lr, num_planes);
1207
1208
1.85k
  foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes);
1209
1210
1.85k
  loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes);
1211
1.85k
}
1212
1213
void av1_foreach_rest_unit_in_row(
1214
    RestorationTileLimits *limits, int plane_w,
1215
    rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
1216
    int hnum_rest_units, int vnum_rest_units, int plane, void *priv,
1217
    int32_t *tmpbuf, RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read,
1218
    sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync,
1219
36.6k
    struct aom_internal_error_info *error_info) {
1220
36.6k
  const int ext_size = unit_size * 3 / 2;
1221
36.6k
  int x0 = 0, j = 0;
1222
86.7k
  while (x0 < plane_w) {
1223
50.0k
    int remaining_w = plane_w - x0;
1224
50.0k
    int w = (remaining_w < ext_size) ? remaining_w : unit_size;
1225
1226
50.0k
    limits->h_start = x0;
1227
50.0k
    limits->h_end = x0 + w;
1228
50.0k
    assert(limits->h_end <= plane_w);
1229
1230
50.0k
    const int unit_idx = row_number * hnum_rest_units + j;
1231
1232
    // No sync for even numbered rows
1233
    // For odd numbered rows, Loop Restoration of current block requires the LR
1234
    // of top-right and bottom-right blocks to be completed
1235
1236
    // top-right sync
1237
50.0k
    on_sync_read(lr_sync, row_number, j, plane);
1238
50.0k
    if ((row_number + 1) < vnum_rest_units)
1239
      // bottom-right sync
1240
27.5k
      on_sync_read(lr_sync, row_number + 2, j, plane);
1241
1242
50.0k
#if CONFIG_MULTITHREAD
1243
50.0k
    if (lr_sync && lr_sync->num_workers > 1) {
1244
35.2k
      pthread_mutex_lock(lr_sync->job_mutex);
1245
35.2k
      const bool lr_mt_exit = lr_sync->lr_mt_exit;
1246
35.2k
      pthread_mutex_unlock(lr_sync->job_mutex);
1247
      // Exit in case any worker has encountered an error.
1248
35.2k
      if (lr_mt_exit) return;
1249
35.2k
    }
1250
50.0k
#endif
1251
1252
50.0k
    on_rest_unit(limits, unit_idx, priv, tmpbuf, rlbs, error_info);
1253
1254
50.0k
    on_sync_write(lr_sync, row_number, j, hnum_rest_units, plane);
1255
1256
50.0k
    x0 += w;
1257
50.0k
    ++j;
1258
50.0k
  }
1259
36.6k
}
1260
1261
52.3k
// No-op stand-in for the sync-read callback: ignores all of its arguments and
// returns immediately.
void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
  (void)plane;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1267
1268
// No-op stand-in for the sync-write callback: ignores all of its arguments and
// returns immediately.
void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
                             const int sb_cols, int plane) {
  (void)plane;
  (void)sb_cols;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1276
1277
// Computes the half-open range of restoration units, [*rcol0, *rcol1) x
// [*rrow0, *rrow1), associated with the superblock at (mi_row, mi_col) on the
// given plane.
//
// Returns 0 without touching the outputs when bsize is not the sequence's
// superblock size. Otherwise writes all four outputs and returns non-zero
// iff the resulting range is non-empty.
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
                                       int *rcol0, int *rcol1, int *rrow0,
                                       int *rrow1) {
  assert(rcol0 && rcol1 && rrow0 && rrow1);

  if (bsize != cm->seq_params->sb_size) return 0;

  assert(!cm->features.all_lossless);

  const int is_uv = plane > 0;
  const RestorationInfo *const rsi = &cm->rst_info[plane];
  const int size = rsi->restoration_unit_size;

  // End (exclusive) of the superblock in mi units.
  const int mi_row_end = mi_row + mi_size_high[bsize];
  const int mi_col_end = mi_col + mi_size_wide[bsize];

  // The size of an MI-unit on this plane of the image.
  const int ss_x = is_uv && cm->seq_params->subsampling_x;
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
  const int mi_size_x = MI_SIZE >> ss_x;
  const int mi_size_y = MI_SIZE >> ss_y;

  // Write m for the relative mi column or row, D for the superres denominator
  // and N for the superres numerator. If u is the upscaled pixel offset then
  // the downscaled pixel offset can be written in two ways:
  //
  //   MI_SIZE * m = N / D u
  //
  // which gives u = D * MI_SIZE * m / N. Superres only scales horizontally,
  // so the vertical conversion is always the plain one.
  const int superres = av1_superres_scaled(cm);
  const int mi_to_num_x =
      superres ? mi_size_x * cm->superres_scale_denominator : mi_size_x;
  const int denom_x = superres ? size * SCALE_NUMERATOR : size;
  const int mi_to_num_y = mi_size_y;
  const int denom_y = size;

  // Adding (denominator - 1) before dividing rounds the quotient up.
  const int rnd_x = denom_x - 1;
  const int rnd_y = denom_y - 1;

  // First restoration unit column/row that does not start before the
  // superblock's own start: a start at unit column 10.1 maps to index 11.
  *rcol0 = (mi_col * mi_to_num_x + rnd_x) / denom_x;
  *rrow0 = (mi_row * mi_to_num_y + rnd_y) / denom_y;

  // The same calculation for the superblock below-right. At the bottom or
  // right edge of the frame that unit might not exist, so clamp to the number
  // of units in the plane.
  const int col_end_unit = (mi_col_end * mi_to_num_x + rnd_x) / denom_x;
  const int row_end_unit = (mi_row_end * mi_to_num_y + rnd_y) / denom_y;
  *rcol1 = col_end_unit < rsi->horz_units ? col_end_unit : rsi->horz_units;
  *rrow1 = row_end_unit < rsi->vert_units ? row_end_unit : rsi->vert_units;

  return *rcol0 < *rcol1 && *rrow0 < *rrow1;
}
1338
1339
// Extend to left and right: for each of `height` rows, replicate the first
// pixel into the `extend` positions before the row and the last pixel into the
// `extend` positions after it.
//
// `width` and `extend` are counted in pixels. When use_highbitdepth is
// non-zero, `buf` is reinterpreted as 16-bit pixels. In both cases the row
// pointer advances by `stride` bytes per row.
static void extend_lines(uint8_t *buf, int width, int height, int stride,
                         int extend, int use_highbitdepth) {
  uint8_t *row = buf;

  if (use_highbitdepth) {
    for (int y = 0; y < height; ++y, row += stride) {
      uint16_t *const px = (uint16_t *)row;
      aom_memset16(px - extend, px[0], extend);
      aom_memset16(px + width, px[width - 1], extend);
    }
    return;
  }

  for (int y = 0; y < height; ++y, row += stride) {
    memset(row - extend, row[0], extend);
    memset(row + width, row[width - 1], extend);
  }
}
1354
1355
static void save_deblock_boundary_lines(
1356
    const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
1357
    int stripe, int use_highbd, int is_above,
1358
138k
    RestorationStripeBoundaries *boundaries) {
1359
138k
  const int is_uv = plane > 0;
1360
138k
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
1361
138k
  const int src_stride = frame->strides[is_uv] << use_highbd;
1362
138k
  const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride;
1363
1364
138k
  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
1365
138k
                               : boundaries->stripe_boundary_below;
1366
138k
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
1367
138k
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
1368
138k
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
1369
1370
  // There is a rare case in which a processing stripe can end 1px above the
1371
  // crop border. In this case, we do want to use deblocked pixels from below
1372
  // the stripe (hence why we ended up in this function), but instead of
1373
  // fetching 2 "below" rows we need to fetch one and duplicate it.
1374
  // This is equivalent to clamping the sample locations against the crop border
1375
138k
  const int lines_to_save =
1376
138k
      AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row);
1377
138k
  assert(lines_to_save == 1 || lines_to_save == 2);
1378
1379
138k
  int upscaled_width;
1380
138k
  int line_bytes;
1381
138k
  if (av1_superres_scaled(cm)) {
1382
123k
    const int ss_x = is_uv && cm->seq_params->subsampling_x;
1383
123k
    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
1384
123k
    line_bytes = upscaled_width << use_highbd;
1385
123k
    if (use_highbd)
1386
98.0k
      av1_upscale_normative_rows(
1387
98.0k
          cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv],
1388
98.0k
          CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride,
1389
98.0k
          plane, lines_to_save);
1390
24.9k
    else
1391
24.9k
      av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows,
1392
24.9k
                                 boundaries->stripe_boundary_stride, plane,
1393
24.9k
                                 lines_to_save);
1394
123k
  } else {
1395
15.3k
    upscaled_width = frame->crop_widths[is_uv];
1396
15.3k
    line_bytes = upscaled_width << use_highbd;
1397
45.8k
    for (int i = 0; i < lines_to_save; i++) {
1398
30.5k
      memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride,
1399
30.5k
             line_bytes);
1400
30.5k
    }
1401
15.3k
  }
1402
  // If we only saved one line, then copy it into the second line buffer
1403
138k
  if (lines_to_save == 1)
1404
167
    memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
1405
1406
138k
  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
1407
138k
               RESTORATION_EXTRA_HORZ, use_highbd);
1408
138k
}
1409
1410
static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1411
                                     const AV1_COMMON *cm, int plane, int row,
1412
                                     int stripe, int use_highbd, int is_above,
1413
27.9k
                                     RestorationStripeBoundaries *boundaries) {
1414
27.9k
  const int is_uv = plane > 0;
1415
27.9k
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
1416
27.9k
  const int src_stride = frame->strides[is_uv] << use_highbd;
1417
27.9k
  const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride;
1418
1419
27.9k
  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
1420
27.9k
                               : boundaries->stripe_boundary_below;
1421
27.9k
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
1422
27.9k
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
1423
27.9k
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
1424
27.9k
  const int src_width = frame->crop_widths[is_uv];
1425
1426
  // At the point where this function is called, we've already applied
1427
  // superres. So we don't need to extend the lines here, we can just
1428
  // pull directly from the topmost row of the upscaled frame.
1429
27.9k
  const int ss_x = is_uv && cm->seq_params->subsampling_x;
1430
27.9k
  const int upscaled_width = av1_superres_scaled(cm)
1431
27.9k
                                 ? (cm->superres_upscaled_width + ss_x) >> ss_x
1432
27.9k
                                 : src_width;
1433
27.9k
  const int line_bytes = upscaled_width << use_highbd;
1434
83.7k
  for (int i = 0; i < RESTORATION_CTX_VERT; i++) {
1435
    // Copy the line at 'src_rows' into both context lines
1436
55.8k
    memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
1437
55.8k
  }
1438
27.9k
  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
1439
27.9k
               RESTORATION_EXTRA_HORZ, use_highbd);
1440
27.9k
}
1441
1442
// Walks the processing stripes of one plane and saves the context rows needed
// above and below each stripe into cm->rst_info[plane].boundaries.
//
// Stripes are RESTORATION_PROC_UNIT_SIZE rows tall, shifted up by
// RESTORATION_UNIT_OFFSET rows (both scaled by the plane's vertical
// subsampling). The function is meant to be called twice per frame:
//  - after_cdef == 0: saves rows at stripe boundaries inside the frame via
//    save_deblock_boundary_lines();
//  - after_cdef != 0: saves rows at the top and bottom of the frame via
//    save_cdef_boundary_lines().
static void save_boundary_lines(const YV12_BUFFER_CONFIG *frame, int use_highbd,
                                int plane, AV1_COMMON *cm, int after_cdef) {
  const int is_uv = plane > 0;
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
  const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
  const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;

  // Only the height is used below; the width is a required out-parameter.
  int plane_w, plane_h;
  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);

  const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);
  RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries;

  for (int stripe_idx = 0;; ++stripe_idx) {
    // First row of this stripe, clamped so that stripe 0 starts at row 0.
    const int y0 = AOMMAX(0, stripe_idx * stripe_height - stripe_off);
    if (y0 >= plane_h) break;

    // One past the last row of this stripe, clamped to the plane.
    const int y1 = AOMMIN((stripe_idx + 1) * stripe_height - stripe_off, plane_h);

    // The topmost stripe and the stripe reaching the bottom of the frame take
    // their outer context from CDEF output; all other stripe edges use
    // deblocked pixels.
    const int at_frame_top = !(stripe_idx > 0);
    const int at_frame_bottom = !(y1 < plane_height);

    if (after_cdef) {
      // Save CDEF context at the frame boundaries.
      if (at_frame_top) {
        save_cdef_boundary_lines(frame, cm, plane, y0, stripe_idx, use_highbd,
                                 1, boundaries);
      }
      if (at_frame_bottom) {
        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, stripe_idx,
                                 use_highbd, 0, boundaries);
      }
    } else {
      // Save deblocked context at the internal stripe boundaries.
      if (!at_frame_top) {
        save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
                                    stripe_idx, use_highbd, 1, boundaries);
      }
      if (!at_frame_bottom) {
        save_deblock_boundary_lines(frame, cm, plane, y1, stripe_idx,
                                    use_highbd, 0, boundaries);
      }
    }
  }
}
1493
1494
// For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
1495
// lines to be used as boundary in the loop restoration process. The
1496
// lines are saved in rst_internal.stripe_boundary_lines
1497
void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1498
9.41k
                                              AV1_COMMON *cm, int after_cdef) {
1499
9.41k
  const int num_planes = av1_num_planes(cm);
1500
9.41k
  const int use_highbd = cm->seq_params->use_highbitdepth;
1501
37.3k
  for (int p = 0; p < num_planes; ++p) {
1502
27.9k
    save_boundary_lines(frame, use_highbd, p, cm, after_cdef);
1503
27.9k
  }
1504
9.41k
}