Coverage Report

Created: 2026-06-14 06:57

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/av1/common/restoration.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 *
11
 */
12
13
#include <math.h>
14
#include <stddef.h>
15
16
#include "config/aom_config.h"
17
#include "config/aom_scale_rtcd.h"
18
19
#include "aom/internal/aom_codec_internal.h"
20
#include "aom_mem/aom_mem.h"
21
#include "aom_dsp/aom_dsp_common.h"
22
#include "aom_mem/aom_mem.h"
23
#include "aom_ports/mem.h"
24
#include "aom_util/aom_pthread.h"
25
26
#include "av1/common/av1_common_int.h"
27
#include "av1/common/convolve.h"
28
#include "av1/common/enums.h"
29
#include "av1/common/resize.h"
30
#include "av1/common/restoration.h"
31
#include "av1/common/thread_common.h"
32
33
// The 's' values are calculated based on original 'r' and 'e' values in the
34
// spec using GenSgrprojVtable().
35
// Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
36
const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
37
  { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
38
  { { 2, 1 }, { 93, 1618 } },  { { 2, 1 }, { 80, 1438 } },
39
  { { 2, 1 }, { 70, 1295 } },  { { 2, 1 }, { 58, 1177 } },
40
  { { 2, 1 }, { 47, 1079 } },  { { 2, 1 }, { 37, 996 } },
41
  { { 2, 1 }, { 30, 925 } },   { { 2, 1 }, { 25, 863 } },
42
  { { 0, 1 }, { -1, 2589 } },  { { 0, 1 }, { -1, 1618 } },
43
  { { 0, 1 }, { -1, 1177 } },  { { 0, 1 }, { -1, 925 } },
44
  { { 2, 0 }, { 56, -1 } },    { { 2, 0 }, { 22, -1 } },
45
};
46
47
void av1_get_upsampled_plane_size(const AV1_COMMON *cm, int is_uv, int *plane_w,
48
47.9k
                                  int *plane_h) {
49
47.9k
  int ss_x = is_uv && cm->seq_params->subsampling_x;
50
47.9k
  int ss_y = is_uv && cm->seq_params->subsampling_y;
51
47.9k
  *plane_w = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
52
47.9k
  *plane_h = ROUND_POWER_OF_TWO(cm->height, ss_y);
53
47.9k
}
54
55
// Returns the number of restoration units along one axis of a plane; pass the
// plane width for a horizontal count or the plane height for a vertical one.
//
// The plane size is divided by the unit size with round-to-nearest rather
// than round-up. This models the way a right or bottom restoration unit can
// extend to up to 150% of its normal width or height.
//
// The result is clamped to at least 1 so that small frames, which may be
// smaller than half of an LR unit, still get one unit.
int av1_lr_count_units(int unit_size, int plane_size) {
  const int half_unit = unit_size >> 1;
  const int nearest_count = (plane_size + half_unit) / unit_size;
  return nearest_count > 1 ? nearest_count : 1;
}
66
67
// Sizes and (re)allocates the per-unit info array of `rsi` for one plane.
// The unit grid is derived from the plane size reported by
// av1_get_upsampled_plane_size() and rsi->restoration_unit_size. Any existing
// rsi->unit_info is freed first; the new array is 16-byte aligned and an
// allocation failure is handled through CHECK_MEM_ERROR on `cm`.
void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
                                  int is_uv) {
  int width, height;
  av1_get_upsampled_plane_size(cm, is_uv, &width, &height);

  const int lr_unit_size = rsi->restoration_unit_size;
  const int units_across = av1_lr_count_units(lr_unit_size, width);
  const int units_down = av1_lr_count_units(lr_unit_size, height);

  rsi->horz_units = units_across;
  rsi->vert_units = units_down;
  rsi->num_rest_units = units_across * units_down;

  // Drop the previous array before allocating one sized for the new grid.
  aom_free(rsi->unit_info);
  CHECK_MEM_ERROR(cm, rsi->unit_info,
                  (RestorationUnitInfo *)aom_memalign(
                      16, sizeof(*rsi->unit_info) * rsi->num_rest_units));
}
85
86
55.6k
// Releases the unit_info array owned by `rst_info` and leaves the field NULL
// so that a later free or re-allocation is safe.
void av1_free_restoration_struct(RestorationInfo *rst_info) {
  RestorationUnitInfo *const units = rst_info->unit_info;
  rst_info->unit_info = NULL;
  aom_free(units);
}
90
91
#if 0
// Compiled-out generator for the sgrproj 's' values.
// One pair of values per sgrproj parameter set:
//   sgrproj_mtable[i][0] corresponds to r[0], e[0]
//   sgrproj_mtable[i][1] corresponds to r[1], e[1]
int sgrproj_mtable[SGRPROJ_PARAMS][2];

static void GenSgrprojVtable(void) {
  for (int set = 0; set < SGRPROJ_PARAMS; ++set) {
    const sgr_params_type *const params = &av1_sgr_params[set];
    for (int which = 0; which < 2; ++which) {
      const int e = params->e[which];
      const int r = params->r[which];
      if (r == 0) {
        // Filter is disabled: mark the entry invalid.
        sgrproj_mtable[set][which] = -1;
        continue;
      }
      // Filter is enabled: n is the number of pixels in the box window.
      const int side = 2 * r + 1;
      const int n = side * side;
      const int n2e = n * n * e;
      assert(n2e != 0);
      // Rounded division of 2^SGRPROJ_MTABLE_BITS by n^2 * e.
      sgrproj_mtable[set][which] =
          (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
    }
  }
}
#endif
115
116
18.5k
// Precalculation hook for loop restoration. The only work it would do, the
// call to GenSgrprojVtable(), is compiled out, so this is currently a no-op.
void av1_loop_restoration_precal(void) {
#if 0
  GenSgrprojVtable();
#endif
}
121
122
// Pads an 8-bit plane in place by edge replication.
// `data` points at the top-left pixel of a width x height region whose rows
// are `stride` bytes apart. Each row is extended by border_horz pixels on both
// sides, then the (already widened) first and last rows are copied into
// border_vert rows above and below. The caller must provide the border
// storage around the region.
static void extend_frame_lowbd(uint8_t *data, int width, int height,
                               ptrdiff_t stride, int border_horz,
                               int border_vert) {
  // Left/right: repeat the first and last pixel of every row.
  for (int row = 0; row < height; ++row) {
    uint8_t *const line = data + row * stride;
    memset(line - border_horz, line[0], border_horz);
    memset(line + width, line[width - 1], border_horz);
  }

  // Top/bottom: replicate the full extended first and last rows.
  const int ext_width = width + 2 * border_horz;
  uint8_t *const first_line = data - border_horz;
  uint8_t *const last_line = first_line + (height - 1) * stride;
  for (int k = 1; k <= border_vert; ++k) {
    memcpy(first_line - k * stride, first_line, ext_width);
  }
  for (int k = 1; k <= border_vert; ++k) {
    memcpy(last_line + k * stride, last_line, ext_width);
  }
}
141
142
#if CONFIG_AV1_HIGHBITDEPTH
143
// 16-bit counterpart of the low-bitdepth frame extension: pads a
// width x height region of uint16_t samples in place by edge replication.
// `stride` is in samples. Rows are widened by border_horz samples on each
// side, then the widened first and last rows are copied into border_vert rows
// above and below.
static void extend_frame_highbd(uint16_t *data, int width, int height,
                                ptrdiff_t stride, int border_horz,
                                int border_vert) {
  // Left/right: repeat the first and last sample of every row.
  for (int row = 0; row < height; ++row) {
    uint16_t *const line = data + row * stride;
    for (int k = -border_horz; k < 0; ++k) {
      line[k] = line[0];
    }
    for (int k = 0; k < border_horz; ++k) {
      line[width + k] = line[width - 1];
    }
  }

  // Top/bottom: replicate the full extended first and last rows.
  const size_t ext_bytes = (width + 2 * border_horz) * sizeof(uint16_t);
  uint16_t *const first_line = data - border_horz;
  uint16_t *const last_line = first_line + (height - 1) * stride;
  for (int k = 1; k <= border_vert; ++k) {
    memcpy(first_line - k * stride, first_line, ext_bytes);
  }
  for (int k = 1; k <= border_vert; ++k) {
    memcpy(last_line + k * stride, last_line, ext_bytes);
  }
}
163
164
// Copies a width x height block of 16-bit samples from src to dst, one row at
// a time. Both strides are in samples; the rows must not overlap.
static void copy_rest_unit_highbd(int width, int height, const uint16_t *src,
                                  int src_stride, uint16_t *dst,
                                  int dst_stride) {
  for (int row = 0; row < height; ++row) {
    const uint16_t *const from = src + row * src_stride;
    uint16_t *const to = dst + row * dst_stride;
    memcpy(to, from, width * sizeof(*dst));
  }
}
170
#endif
171
172
// Pads a plane in place by edge replication, dispatching on bit depth.
// When built with CONFIG_AV1_HIGHBITDEPTH and `highbd` is nonzero, `data` is
// converted with CONVERT_TO_SHORTPTR and handled by the 16-bit routine;
// otherwise the 8-bit routine is used.
void av1_extend_frame(uint8_t *data, int width, int height, int stride,
                      int border_horz, int border_vert, int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (highbd) {
    uint16_t *const data16 = CONVERT_TO_SHORTPTR(data);
    extend_frame_highbd(data16, width, height, stride, border_horz,
                        border_vert);
    return;
  }
#else
  (void)highbd;  // Only meaningful in high-bitdepth builds.
#endif
  extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
}
184
185
// Copies a width x height block of 8-bit pixels from src to dst, one row at a
// time. Both strides are in bytes; the rows must not overlap.
static void copy_rest_unit_lowbd(int width, int height, const uint8_t *src,
                                 int src_stride, uint8_t *dst, int dst_stride) {
  for (int row = 0; row < height; ++row) {
    const uint8_t *const from = src + row * src_stride;
    uint8_t *const to = dst + row * dst_stride;
    memcpy(to, from, width);
  }
}
190
191
// Copies a width x height restoration block from src to dst, dispatching on
// bit depth. When built with CONFIG_AV1_HIGHBITDEPTH and `highbd` is nonzero,
// both pointers are converted with CONVERT_TO_SHORTPTR and copied as 16-bit
// samples; otherwise the block is copied as bytes.
static void copy_rest_unit(int width, int height, const uint8_t *src,
                           int src_stride, uint8_t *dst, int dst_stride,
                           int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (highbd) {
    copy_rest_unit_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
                          CONVERT_TO_SHORTPTR(dst), dst_stride);
    return;
  }
#else
  (void)highbd;  // Only meaningful in high-bitdepth builds.
#endif
  copy_rest_unit_lowbd(width, height, src, src_stride, dst, dst_stride);
}
204
205
505k
// Yields the pointer to hand to memcpy for a plane pointer `d`: when `hbd` is
// nonzero, `d` is passed through CONVERT_TO_SHORTPTR and cast back to
// uint8_t *; otherwise `d` is used unchanged. Note that `d` appears in both
// arms, so it should be an expression without side effects.
#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
206
207
// With striped loop restoration, the filtering for each 64-pixel stripe gets
208
// most of its input from the output of CDEF (stored in data8), but we need to
209
// fill out a border of 3 pixels above/below the stripe according to the
210
// following rules:
211
//
212
// * At the top and bottom of the frame, we copy the outermost row of CDEF
213
//   pixels three times. This extension is done by a call to av1_extend_frame()
214
//   at the start of the loop restoration process, so the value of
215
//   copy_above/copy_below doesn't strictly matter.
216
//
217
// * All other boundaries are stripe boundaries within the frame. In that case,
218
//   we take 2 rows of deblocked pixels and extend them to 3 rows of context.
219
static void get_stripe_boundary_info(const RestorationTileLimits *limits,
220
                                     int plane_w, int plane_h, int ss_y,
221
31.6k
                                     int *copy_above, int *copy_below) {
222
31.6k
  (void)plane_w;
223
224
31.6k
  *copy_above = 1;
225
31.6k
  *copy_below = 1;
226
227
31.6k
  const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
228
31.6k
  const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
229
230
31.6k
  const int first_stripe_in_plane = (limits->v_start == 0);
231
31.6k
  const int this_stripe_height =
232
31.6k
      full_stripe_height - (first_stripe_in_plane ? runit_offset : 0);
233
31.6k
  const int last_stripe_in_plane =
234
31.6k
      (limits->v_start + this_stripe_height >= plane_h);
235
236
31.6k
  if (first_stripe_in_plane) *copy_above = 0;
237
31.6k
  if (last_stripe_in_plane) *copy_below = 0;
238
31.6k
}
239
240
// Overwrite the border pixels around a processing stripe so that the conditions
241
// listed above get_stripe_boundary_info() are preserved.
242
// We save the pixels which get overwritten into a temporary buffer, so that
243
// they can be restored by restore_processing_stripe_boundary() after we've
244
// processed the stripe.
245
//
246
// limits gives the rectangular limits of the remaining stripes for the current
247
// restoration unit. rsb is the stored stripe boundaries (taken from either
248
// deblock or CDEF output as necessary).
249
static void setup_processing_stripe_boundary(
250
    const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
251
    int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
252
31.6k
    RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
253
  // Offsets within the line buffers. The buffer logically starts at column
254
  // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
255
  // has column x0 in the buffer.
256
31.6k
  const int buf_stride = rsb->stripe_boundary_stride;
257
31.6k
  const int buf_x0_off = limits->h_start;
258
31.6k
  const int line_width =
259
31.6k
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
260
31.6k
  const int line_size = line_width << use_highbd;
261
262
31.6k
  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
263
264
  // Replace RESTORATION_BORDER pixels above the top of the stripe
265
  // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
266
  // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
267
  // duplicating the topmost of the 2 lines (see the AOMMAX call when
268
  // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
269
31.6k
  if (!opt) {
270
22.9k
    if (copy_above) {
271
19.1k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
272
273
76.7k
      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
274
57.5k
        const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
275
57.5k
        const int buf_off = buf_x0_off + buf_row * buf_stride;
276
57.5k
        const uint8_t *buf =
277
57.5k
            rsb->stripe_boundary_above + (buf_off << use_highbd);
278
57.5k
        uint8_t *dst8 = data8_tl + i * data_stride;
279
        // Save old pixels, then replace with data from stripe_boundary_above
280
57.5k
        memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
281
57.5k
               REAL_PTR(use_highbd, dst8), line_size);
282
57.5k
        memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
283
57.5k
      }
284
19.1k
    }
285
286
    // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
287
    // The second buffer row is repeated, so src_row gets the values 0, 1, 1
288
    // for i = 0, 1, 2.
289
22.9k
    if (copy_below) {
290
17.5k
      const int stripe_end = limits->v_start + h;
291
17.5k
      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
292
293
70.2k
      for (int i = 0; i < RESTORATION_BORDER; ++i) {
294
52.6k
        const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
295
52.6k
        const int buf_off = buf_x0_off + buf_row * buf_stride;
296
52.6k
        const uint8_t *src =
297
52.6k
            rsb->stripe_boundary_below + (buf_off << use_highbd);
298
299
52.6k
        uint8_t *dst8 = data8_bl + i * data_stride;
300
        // Save old pixels, then replace with data from stripe_boundary_below
301
52.6k
        memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
302
52.6k
        memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
303
52.6k
      }
304
17.5k
    }
305
22.9k
  } else {
306
8.69k
    if (copy_above) {
307
8.06k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
308
309
      // Only save and overwrite i=-RESTORATION_BORDER line.
310
8.06k
      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
311
      // Save old pixels, then replace with data from stripe_boundary_above
312
8.06k
      memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
313
8.06k
      memcpy(REAL_PTR(use_highbd, dst8),
314
8.06k
             REAL_PTR(use_highbd,
315
8.06k
                      data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
316
8.06k
             line_size);
317
8.06k
    }
318
319
8.69k
    if (copy_below) {
320
8.03k
      const int stripe_end = limits->v_start + h;
321
8.03k
      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
322
323
      // Only save and overwrite i=2 line.
324
8.03k
      uint8_t *dst8 = data8_bl + 2 * data_stride;
325
      // Save old pixels, then replace with data from stripe_boundary_below
326
8.03k
      memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
327
8.03k
      memcpy(REAL_PTR(use_highbd, dst8),
328
8.03k
             REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
329
8.03k
    }
330
8.69k
  }
331
31.6k
}
332
333
// Once a processing stripe is finished, this function sets the boundary
334
// pixels which were overwritten by setup_processing_stripe_boundary()
335
// back to their original values
336
static void restore_processing_stripe_boundary(
337
    const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
338
    int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
339
31.5k
    int copy_below, int opt) {
340
31.5k
  const int line_width =
341
31.5k
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
342
31.5k
  const int line_size = line_width << use_highbd;
343
344
31.5k
  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
345
346
31.5k
  if (!opt) {
347
22.9k
    if (copy_above) {
348
19.2k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
349
76.8k
      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
350
57.6k
        uint8_t *dst8 = data8_tl + i * data_stride;
351
57.6k
        memcpy(REAL_PTR(use_highbd, dst8),
352
57.6k
               rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
353
57.6k
      }
354
19.2k
    }
355
356
22.9k
    if (copy_below) {
357
17.5k
      const int stripe_bottom = limits->v_start + h;
358
17.5k
      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
359
360
70.3k
      for (int i = 0; i < RESTORATION_BORDER; ++i) {
361
52.7k
        if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
362
363
52.7k
        uint8_t *dst8 = data8_bl + i * data_stride;
364
52.7k
        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
365
52.7k
      }
366
17.5k
    }
367
22.9k
  } else {
368
8.68k
    if (copy_above) {
369
8.05k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
370
371
      // Only restore i=-RESTORATION_BORDER line.
372
8.05k
      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
373
8.05k
      memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
374
8.05k
    }
375
376
8.68k
    if (copy_below) {
377
8.02k
      const int stripe_bottom = limits->v_start + h;
378
8.02k
      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
379
380
      // Only restore i=2 line.
381
8.02k
      if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
382
8.02k
        uint8_t *dst8 = data8_bl + 2 * data_stride;
383
8.02k
        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
384
8.02k
      }
385
8.02k
    }
386
8.68k
  }
387
31.5k
}
388
389
static void wiener_filter_stripe(const RestorationUnitInfo *rui,
390
                                 int stripe_width, int stripe_height,
391
                                 int procunit_width, const uint8_t *src,
392
                                 int src_stride, uint8_t *dst, int dst_stride,
393
                                 int32_t *tmpbuf, int bit_depth,
394
7.19k
                                 struct aom_internal_error_info *error_info) {
395
7.19k
  (void)tmpbuf;
396
7.19k
  (void)bit_depth;
397
7.19k
  (void)error_info;
398
7.19k
  assert(bit_depth == 8);
399
7.19k
  const WienerConvolveParams conv_params = get_conv_params_wiener(8);
400
401
20.8k
  for (int j = 0; j < stripe_width; j += procunit_width) {
402
13.6k
    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
403
13.6k
    const uint8_t *src_p = src + j;
404
13.6k
    uint8_t *dst_p = dst + j;
405
13.6k
    av1_wiener_convolve_add_src(
406
13.6k
        src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
407
13.6k
        rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
408
13.6k
  }
409
7.19k
}
410
411
/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
412
   over the input. The window is of size (2r + 1)x(2r + 1), and we
413
   specialize to r = 1, 2, 3. A default function is used for r > 3.
414
415
   Each loop follows the same format: We keep a window's worth of input
416
   in individual variables and select data out of that as appropriate.
417
*/
418
static void boxsum1(int32_t *src, int width, int height, int src_stride,
419
32.9k
                    int sqr, int32_t *dst, int dst_stride) {
420
32.9k
  int i, j, a, b, c;
421
32.9k
  assert(width > 2 * SGRPROJ_BORDER_HORZ);
422
32.9k
  assert(height > 2 * SGRPROJ_BORDER_VERT);
423
424
  // Vertical sum over 3-pixel regions, from src into dst.
425
32.9k
  if (!sqr) {
426
777k
    for (j = 0; j < width; ++j) {
427
760k
      a = src[j];
428
760k
      b = src[src_stride + j];
429
760k
      c = src[2 * src_stride + j];
430
431
760k
      dst[j] = a + b;
432
40.6M
      for (i = 1; i < height - 2; ++i) {
433
        // Loop invariant: At the start of each iteration,
434
        // a = src[(i - 1) * src_stride + j]
435
        // b = src[(i    ) * src_stride + j]
436
        // c = src[(i + 1) * src_stride + j]
437
39.9M
        dst[i * dst_stride + j] = a + b + c;
438
39.9M
        a = b;
439
39.9M
        b = c;
440
39.9M
        c = src[(i + 2) * src_stride + j];
441
39.9M
      }
442
760k
      dst[i * dst_stride + j] = a + b + c;
443
760k
      dst[(i + 1) * dst_stride + j] = b + c;
444
760k
    }
445
16.4k
  } else {
446
777k
    for (j = 0; j < width; ++j) {
447
761k
      a = src[j] * src[j];
448
761k
      b = src[src_stride + j] * src[src_stride + j];
449
761k
      c = src[2 * src_stride + j] * src[2 * src_stride + j];
450
451
761k
      dst[j] = a + b;
452
40.5M
      for (i = 1; i < height - 2; ++i) {
453
39.7M
        dst[i * dst_stride + j] = a + b + c;
454
39.7M
        a = b;
455
39.7M
        b = c;
456
39.7M
        c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
457
39.7M
      }
458
761k
      dst[i * dst_stride + j] = a + b + c;
459
761k
      dst[(i + 1) * dst_stride + j] = b + c;
460
761k
    }
461
16.4k
  }
462
463
  // Horizontal sum over 3-pixel regions of dst
464
1.85M
  for (i = 0; i < height; ++i) {
465
1.82M
    a = dst[i * dst_stride];
466
1.82M
    b = dst[i * dst_stride + 1];
467
1.82M
    c = dst[i * dst_stride + 2];
468
469
1.82M
    dst[i * dst_stride] = a + b;
470
79.8M
    for (j = 1; j < width - 2; ++j) {
471
      // Loop invariant: At the start of each iteration,
472
      // a = src[i * src_stride + (j - 1)]
473
      // b = src[i * src_stride + (j    )]
474
      // c = src[i * src_stride + (j + 1)]
475
78.0M
      dst[i * dst_stride + j] = a + b + c;
476
78.0M
      a = b;
477
78.0M
      b = c;
478
78.0M
      c = dst[i * dst_stride + (j + 2)];
479
78.0M
    }
480
1.82M
    dst[i * dst_stride + j] = a + b + c;
481
1.82M
    dst[i * dst_stride + (j + 1)] = b + c;
482
1.82M
  }
483
32.9k
}
484
485
static void boxsum2(int32_t *src, int width, int height, int src_stride,
486
39.7k
                    int sqr, int32_t *dst, int dst_stride) {
487
39.7k
  int i, j, a, b, c, d, e;
488
39.7k
  assert(width > 2 * SGRPROJ_BORDER_HORZ);
489
39.7k
  assert(height > 2 * SGRPROJ_BORDER_VERT);
490
491
  // Vertical sum over 5-pixel regions, from src into dst.
492
39.7k
  if (!sqr) {
493
1.05M
    for (j = 0; j < width; ++j) {
494
1.03M
      a = src[j];
495
1.03M
      b = src[src_stride + j];
496
1.03M
      c = src[2 * src_stride + j];
497
1.03M
      d = src[3 * src_stride + j];
498
1.03M
      e = src[4 * src_stride + j];
499
500
1.03M
      dst[j] = a + b + c;
501
1.03M
      dst[dst_stride + j] = a + b + c + d;
502
59.5M
      for (i = 2; i < height - 3; ++i) {
503
        // Loop invariant: At the start of each iteration,
504
        // a = src[(i - 2) * src_stride + j]
505
        // b = src[(i - 1) * src_stride + j]
506
        // c = src[(i    ) * src_stride + j]
507
        // d = src[(i + 1) * src_stride + j]
508
        // e = src[(i + 2) * src_stride + j]
509
58.5M
        dst[i * dst_stride + j] = a + b + c + d + e;
510
58.5M
        a = b;
511
58.5M
        b = c;
512
58.5M
        c = d;
513
58.5M
        d = e;
514
58.5M
        e = src[(i + 3) * src_stride + j];
515
58.5M
      }
516
1.03M
      dst[i * dst_stride + j] = a + b + c + d + e;
517
1.03M
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
518
1.03M
      dst[(i + 2) * dst_stride + j] = c + d + e;
519
1.03M
    }
520
19.8k
  } else {
521
1.05M
    for (j = 0; j < width; ++j) {
522
1.03M
      a = src[j] * src[j];
523
1.03M
      b = src[src_stride + j] * src[src_stride + j];
524
1.03M
      c = src[2 * src_stride + j] * src[2 * src_stride + j];
525
1.03M
      d = src[3 * src_stride + j] * src[3 * src_stride + j];
526
1.03M
      e = src[4 * src_stride + j] * src[4 * src_stride + j];
527
528
1.03M
      dst[j] = a + b + c;
529
1.03M
      dst[dst_stride + j] = a + b + c + d;
530
57.2M
      for (i = 2; i < height - 3; ++i) {
531
56.1M
        dst[i * dst_stride + j] = a + b + c + d + e;
532
56.1M
        a = b;
533
56.1M
        b = c;
534
56.1M
        c = d;
535
56.1M
        d = e;
536
56.1M
        e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
537
56.1M
      }
538
1.03M
      dst[i * dst_stride + j] = a + b + c + d + e;
539
1.03M
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
540
1.03M
      dst[(i + 2) * dst_stride + j] = c + d + e;
541
1.03M
    }
542
19.8k
  }
543
544
  // Horizontal sum over 5-pixel regions of dst
545
2.34M
  for (i = 0; i < height; ++i) {
546
2.30M
    a = dst[i * dst_stride];
547
2.30M
    b = dst[i * dst_stride + 1];
548
2.30M
    c = dst[i * dst_stride + 2];
549
2.30M
    d = dst[i * dst_stride + 3];
550
2.30M
    e = dst[i * dst_stride + 4];
551
552
2.30M
    dst[i * dst_stride] = a + b + c;
553
2.30M
    dst[i * dst_stride + 1] = a + b + c + d;
554
108M
    for (j = 2; j < width - 3; ++j) {
555
      // Loop invariant: At the start of each iteration,
556
      // a = src[i * src_stride + (j - 2)]
557
      // b = src[i * src_stride + (j - 1)]
558
      // c = src[i * src_stride + (j    )]
559
      // d = src[i * src_stride + (j + 1)]
560
      // e = src[i * src_stride + (j + 2)]
561
106M
      dst[i * dst_stride + j] = a + b + c + d + e;
562
106M
      a = b;
563
106M
      b = c;
564
106M
      c = d;
565
106M
      d = e;
566
106M
      e = dst[i * dst_stride + (j + 3)];
567
106M
    }
568
2.30M
    dst[i * dst_stride + j] = a + b + c + d + e;
569
2.30M
    dst[i * dst_stride + (j + 1)] = b + c + d + e;
570
2.30M
    dst[i * dst_stride + (j + 2)] = c + d + e;
571
2.30M
  }
572
39.7k
}
573
574
// Dispatches to the box-sum implementation for radius r: boxsum1() for r == 1
// and boxsum2() for r == 2. Any other radius trips an assertion in debug
// builds and leaves dst untouched.
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                   int sqr, int32_t *dst, int dst_stride) {
  switch (r) {
    case 1:
      boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    case 2:
      boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    default: assert(0 && "Invalid value of r in self-guided filter"); break;
  }
}
583
584
25.1k
void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
585
25.1k
  if (params->r[0] == 0) {
586
5.30k
    xq[0] = 0;
587
5.30k
    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
588
19.8k
  } else if (params->r[1] == 0) {
589
8.68k
    xq[0] = xqd[0];
590
8.68k
    xq[1] = 0;
591
11.1k
  } else {
592
11.1k
    xq[0] = xqd[0];
593
11.1k
    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
594
11.1k
  }
595
25.1k
}
596
597
// Lookup table indexed by x in [0, 255]; 16 entries per row, with the index
// of the first entry of each row given in the leading comment.
// Special case: index 0 maps to 1 (corresponding to a value of 1/256)
// instead of 0. See the comments in calculate_intermediate_result() where
// this table is read for why.
const int32_t av1_x_by_xplus1[256] = {
  /*   0 */ 1,   128, 171, 192, 205, 213, 219, 224,
            228, 230, 233, 235, 236, 238, 239, 240,
  /*  16 */ 241, 242, 243, 243, 244, 244, 245, 245,
            246, 246, 247, 247, 247, 247, 248, 248,
  /*  32 */ 248, 248, 249, 249, 249, 249, 249, 250,
            250, 250, 250, 250, 250, 250, 251, 251,
  /*  48 */ 251, 251, 251, 251, 251, 251, 251, 251,
            252, 252, 252, 252, 252, 252, 252, 252,
  /*  64 */ 252, 252, 252, 252, 252, 252, 252, 252,
            252, 253, 253, 253, 253, 253, 253, 253,
  /*  80 */ 253, 253, 253, 253, 253, 253, 253, 253,
            253, 253, 253, 253, 253, 253, 253, 253,
  /*  96 */ 253, 253, 253, 253, 253, 253, 254, 254,
            254, 254, 254, 254, 254, 254, 254, 254,
  /* 112 */ 254, 254, 254, 254, 254, 254, 254, 254,
            254, 254, 254, 254, 254, 254, 254, 254,
  /* 128 */ 254, 254, 254, 254, 254, 254, 254, 254,
            254, 254, 254, 254, 254, 254, 254, 254,
  /* 144 */ 254, 254, 254, 254, 254, 254, 254, 254,
            254, 254, 254, 254, 254, 254, 254, 254,
  /* 160 */ 254, 254, 254, 254, 254, 254, 254, 254,
            254, 254, 255, 255, 255, 255, 255, 255,
  /* 176 */ 255, 255, 255, 255, 255, 255, 255, 255,
            255, 255, 255, 255, 255, 255, 255, 255,
  /* 192 */ 255, 255, 255, 255, 255, 255, 255, 255,
            255, 255, 255, 255, 255, 255, 255, 255,
  /* 208 */ 255, 255, 255, 255, 255, 255, 255, 255,
            255, 255, 255, 255, 255, 255, 255, 255,
  /* 224 */ 255, 255, 255, 255, 255, 255, 255, 255,
            255, 255, 255, 255, 255, 255, 255, 255,
  /* 240 */ 255, 255, 255, 255, 255, 255, 255, 255,
            255, 255, 255, 255, 255, 255, 255, 256,
};
619
620
// Reciprocal table: entry [x - 1] is used as round(2^12 / x) when normalising
// box sums over x pixels. Five entries per row, with the x range of each row
// given in the trailing comment.
const int32_t av1_one_by_x[MAX_NELEM] = {
  4096, 2048, 1365, 1024, 819,  // x =  1..5
  683,  585,  512,  455,  410,  // x =  6..10
  372,  341,  315,  293,  273,  // x = 11..15
  256,  241,  228,  216,  205,  // x = 16..20
  195,  186,  178,  171,  164,  // x = 21..25
};
624
625
static void calculate_intermediate_result(int32_t *dgd, int width, int height,
626
                                          int dgd_stride, int bit_depth,
627
                                          int sgr_params_idx, int radius_idx,
628
36.3k
                                          int pass, int32_t *A, int32_t *B) {
629
36.3k
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
630
36.3k
  const int r = params->r[radius_idx];
631
36.3k
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
632
36.3k
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
633
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
634
  // leading to a significant speed improvement.
635
  // We also align the stride to a multiple of 16 bytes, for consistency
636
  // with the SIMD version of this function.
637
36.3k
  int buf_stride = ((width_ext + 3) & ~3) + 16;
638
36.3k
  const int step = pass == 0 ? 1 : 2;
639
36.3k
  int i, j;
640
641
36.3k
  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
642
36.3k
  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
643
36.3k
         "Need SGRPROJ_BORDER_* >= r+1");
644
645
36.3k
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
646
36.3k
         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
647
36.3k
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
648
36.3k
         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
649
36.3k
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
650
36.3k
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
651
  // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
652
  // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
653
1.45M
  for (i = -1; i < height + 1; i += step) {
654
63.3M
    for (j = -1; j < width + 1; ++j) {
655
61.9M
      const int k = i * buf_stride + j;
656
61.9M
      const int n = (2 * r + 1) * (2 * r + 1);
657
658
      // a < 2^16 * n < 2^22 regardless of bit depth
659
61.9M
      uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
660
      // b < 2^8 * n < 2^14 regardless of bit depth
661
61.9M
      uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
662
663
      // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
664
      // and p itself satisfies p < 2^14 * n^2 < 2^26.
665
      // This bound on p is due to:
666
      // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
667
      //
668
      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
669
      // This is an artefact of rounding, and can only happen if all pixels
670
      // are (almost) identical, so in this case we saturate to p=0.
671
61.9M
      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
672
673
61.9M
      const uint32_t s = params->s[radius_idx];
674
675
      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
676
      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
677
      // (this holds even after accounting for the rounding in s)
678
61.9M
      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
679
680
      // Note: We have to be quite careful about the value of A[k].
681
      // This is used as a blend factor between individual pixel values and the
682
      // local mean. So it logically has a range of [0, 256], including both
683
      // endpoints.
684
      //
685
      // This is a pain for hardware, as we'd like something which can be stored
686
      // in exactly 8 bits.
687
      // Further, in the calculation of B[k] below, if z == 0 and r == 2,
688
      // then A[k] "should be" 0. But then we can end up setting B[k] to a value
689
      // slightly above 2^(8 + bit depth), due to rounding in the value of
690
      // av1_one_by_x[25-1].
691
      //
692
      // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
693
      // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
694
      // overflow), without significantly affecting the final result: z == 0
695
      // implies that the image is essentially "flat", so the local mean and
696
      // individual pixel values are very similar.
697
      //
698
      // Note that saturating on the other side, ie. requring A[k] <= 255,
699
      // would be a bad idea, as that corresponds to the case where the image
700
      // is very variable, when we want to preserve the local pixel value as
701
      // much as possible.
702
61.9M
      A[k] = av1_x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
703
704
      // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
705
      // av1_one_by_x[n - 1] = round(2^12 / n)
706
      // => the product here is < 2^(20 + bit_depth) <= 2^32,
707
      // and B[k] is set to a value < 2^(8 + bit depth)
708
      // This holds even with the rounding in av1_one_by_x and in the overall
709
      // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
710
61.9M
      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
711
61.9M
                                             (uint32_t)B[k] *
712
61.9M
                                             (uint32_t)av1_one_by_x[n - 1],
713
61.9M
                                         SGRPROJ_RECIP_BITS);
714
61.9M
    }
715
1.41M
  }
716
36.3k
}
717
718
static void selfguided_restoration_fast_internal(
719
    int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
720
19.8k
    int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
721
19.8k
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
722
19.8k
  const int r = params->r[radius_idx];
723
19.8k
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
724
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
725
  // leading to a significant speed improvement.
726
  // We also align the stride to a multiple of 16 bytes, for consistency
727
  // with the SIMD version of this function.
728
19.8k
  int buf_stride = ((width_ext + 3) & ~3) + 16;
729
19.8k
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
730
19.8k
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
731
19.8k
  int32_t *A = A_;
732
19.8k
  int32_t *B = B_;
733
19.8k
  int i, j;
734
19.8k
  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
735
19.8k
                                sgr_params_idx, radius_idx, 1, A, B);
736
19.8k
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
737
19.8k
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
738
739
  // Use the A[] and B[] arrays to calculate the filtered image
740
19.8k
  (void)r;
741
19.8k
  assert(r == 2);
742
1.11M
  for (i = 0; i < height; ++i) {
743
1.09M
    if (!(i & 1)) {  // even row
744
26.1M
      for (j = 0; j < width; ++j) {
745
25.5M
        const int k = i * buf_stride + j;
746
25.5M
        const int l = i * dgd_stride + j;
747
25.5M
        const int m = i * dst_stride + j;
748
25.5M
        const int nb = 5;
749
25.5M
        const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
750
25.5M
                          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
751
25.5M
                           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
752
25.5M
                              5;
753
25.5M
        const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
754
25.5M
                          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
755
25.5M
                           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
756
25.5M
                              5;
757
25.5M
        const int32_t v = a * dgd[l] + b;
758
25.5M
        dst[m] =
759
25.5M
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
760
25.5M
      }
761
549k
    } else {  // odd row
762
26.2M
      for (j = 0; j < width; ++j) {
763
25.6M
        const int k = i * buf_stride + j;
764
25.6M
        const int l = i * dgd_stride + j;
765
25.6M
        const int m = i * dst_stride + j;
766
25.6M
        const int nb = 4;
767
25.6M
        const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
768
25.6M
        const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
769
25.6M
        const int32_t v = a * dgd[l] + b;
770
25.6M
        dst[m] =
771
25.6M
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
772
25.6M
      }
773
546k
    }
774
1.09M
  }
775
19.8k
}
776
777
static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
778
                                            int dgd_stride, int32_t *dst,
779
                                            int dst_stride, int bit_depth,
780
                                            int sgr_params_idx,
781
16.4k
                                            int radius_idx) {
782
16.4k
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
783
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
784
  // leading to a significant speed improvement.
785
  // We also align the stride to a multiple of 16 bytes, for consistency
786
  // with the SIMD version of this function.
787
16.4k
  int buf_stride = ((width_ext + 3) & ~3) + 16;
788
16.4k
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
789
16.4k
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
790
16.4k
  int32_t *A = A_;
791
16.4k
  int32_t *B = B_;
792
16.4k
  int i, j;
793
16.4k
  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
794
16.4k
                                sgr_params_idx, radius_idx, 0, A, B);
795
16.4k
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
796
16.4k
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
797
798
  // Use the A[] and B[] arrays to calculate the filtered image
799
834k
  for (i = 0; i < height; ++i) {
800
33.2M
    for (j = 0; j < width; ++j) {
801
32.4M
      const int k = i * buf_stride + j;
802
32.4M
      const int l = i * dgd_stride + j;
803
32.4M
      const int m = i * dst_stride + j;
804
32.4M
      const int nb = 5;
805
32.4M
      const int32_t a =
806
32.4M
          (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
807
32.4M
              4 +
808
32.4M
          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
809
32.4M
           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
810
32.4M
              3;
811
32.4M
      const int32_t b =
812
32.4M
          (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
813
32.4M
              4 +
814
32.4M
          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
815
32.4M
           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
816
32.4M
              3;
817
32.4M
      const int32_t v = a * dgd[l] + b;
818
32.4M
      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
819
32.4M
    }
820
817k
  }
821
16.4k
}
822
823
int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
824
                                 int dgd_stride, int32_t *flt0, int32_t *flt1,
825
                                 int flt_stride, int sgr_params_idx,
826
25.1k
                                 int bit_depth, int highbd) {
827
25.1k
  int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
828
25.1k
  const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
829
25.1k
  int32_t *dgd32 =
830
25.1k
      dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
831
832
25.1k
  if (highbd) {
833
6.39k
    const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
834
326k
    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
835
12.4M
      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
836
12.1M
        dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
837
12.1M
      }
838
320k
    }
839
18.7k
  } else {
840
1.11M
    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
841
63.2M
      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
842
62.1M
        dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
843
62.1M
      }
844
1.09M
    }
845
18.7k
  }
846
847
25.1k
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
848
  // If params->r == 0 we skip the corresponding filter. We only allow one of
849
  // the radii to be 0, as having both equal to 0 would be equivalent to
850
  // skipping SGR entirely.
851
25.1k
  assert(!(params->r[0] == 0 && params->r[1] == 0));
852
853
25.1k
  if (params->r[0] > 0)
854
19.8k
    selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
855
19.8k
                                         flt0, flt_stride, bit_depth,
856
19.8k
                                         sgr_params_idx, 0);
857
25.1k
  if (params->r[1] > 0)
858
16.4k
    selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
859
16.4k
                                    flt_stride, bit_depth, sgr_params_idx, 1);
860
25.1k
  return 0;
861
25.1k
}
862
863
int av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
864
                                       int height, int stride, int eps,
865
                                       const int *xqd, uint8_t *dst8,
866
                                       int dst_stride, int32_t *tmpbuf,
867
25.1k
                                       int bit_depth, int highbd) {
868
25.1k
  int32_t *flt0 = tmpbuf;
869
25.1k
  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
870
25.1k
  assert(width * height <= RESTORATION_UNITPELS_MAX);
871
872
25.1k
  const int ret = av1_selfguided_restoration_c(
873
25.1k
      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
874
25.1k
  if (ret != 0) return ret;
875
25.1k
  const sgr_params_type *const params = &av1_sgr_params[eps];
876
25.1k
  int xq[2];
877
25.1k
  av1_decode_xq(xqd, xq, params);
878
1.28M
  for (int i = 0; i < height; ++i) {
879
53.1M
    for (int j = 0; j < width; ++j) {
880
51.9M
      const int k = i * width + j;
881
51.9M
      uint8_t *dst8ij = dst8 + i * dst_stride + j;
882
51.9M
      const uint8_t *dat8ij = dat8 + i * stride + j;
883
884
51.9M
      const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
885
51.9M
      const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
886
51.9M
      int32_t v = u << SGRPROJ_PRJ_BITS;
887
      // If params->r == 0 then we skipped the filtering in
888
      // av1_selfguided_restoration_c, i.e. flt[k] == u
889
51.9M
      if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
890
51.9M
      if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
891
51.9M
      const int16_t w =
892
51.9M
          (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
893
894
51.9M
      const uint16_t out = clip_pixel_highbd(w, bit_depth);
895
51.9M
      if (highbd)
896
8.22M
        *CONVERT_TO_SHORTPTR(dst8ij) = out;
897
43.6M
      else
898
43.6M
        *dst8ij = (uint8_t)out;
899
51.9M
    }
900
1.26M
  }
901
25.1k
  return 0;
902
25.1k
}
903
904
static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
905
                                  int stripe_width, int stripe_height,
906
                                  int procunit_width, const uint8_t *src,
907
                                  int src_stride, uint8_t *dst, int dst_stride,
908
                                  int32_t *tmpbuf, int bit_depth,
909
8.39k
                                  struct aom_internal_error_info *error_info) {
910
8.39k
  (void)bit_depth;
911
8.39k
  assert(bit_depth == 8);
912
913
27.1k
  for (int j = 0; j < stripe_width; j += procunit_width) {
914
18.7k
    int w = AOMMIN(procunit_width, stripe_width - j);
915
18.7k
    if (av1_apply_selfguided_restoration(
916
18.7k
            src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
917
18.7k
            rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth,
918
18.7k
            0) != 0) {
919
0
      aom_internal_error(
920
0
          error_info, AOM_CODEC_MEM_ERROR,
921
0
          "Error allocating buffer in av1_apply_selfguided_restoration");
922
0
    }
923
18.7k
  }
924
8.39k
}
925
926
#if CONFIG_AV1_HIGHBITDEPTH
927
static void wiener_filter_stripe_highbd(
928
    const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
929
    int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
930
    int dst_stride, int32_t *tmpbuf, int bit_depth,
931
11.7k
    struct aom_internal_error_info *error_info) {
932
11.7k
  (void)tmpbuf;
933
11.7k
  (void)error_info;
934
11.7k
  const WienerConvolveParams conv_params = get_conv_params_wiener(bit_depth);
935
936
25.0k
  for (int j = 0; j < stripe_width; j += procunit_width) {
937
13.2k
    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
938
13.2k
    const uint8_t *src8_p = src8 + j;
939
13.2k
    uint8_t *dst8_p = dst8 + j;
940
13.2k
    av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
941
13.2k
                                       rui->wiener_info.hfilter, 16,
942
13.2k
                                       rui->wiener_info.vfilter, 16, w,
943
13.2k
                                       stripe_height, &conv_params, bit_depth);
944
13.2k
  }
945
11.7k
}
946
947
static void sgrproj_filter_stripe_highbd(
948
    const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
949
    int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
950
    int dst_stride, int32_t *tmpbuf, int bit_depth,
951
4.21k
    struct aom_internal_error_info *error_info) {
952
10.6k
  for (int j = 0; j < stripe_width; j += procunit_width) {
953
6.39k
    int w = AOMMIN(procunit_width, stripe_width - j);
954
6.39k
    if (av1_apply_selfguided_restoration(
955
6.39k
            src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
956
6.39k
            rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth,
957
6.39k
            1) != 0) {
958
0
      aom_internal_error(
959
0
          error_info, AOM_CODEC_MEM_ERROR,
960
0
          "Error allocating buffer in av1_apply_selfguided_restoration");
961
0
    }
962
6.39k
  }
963
4.21k
}
964
#endif  // CONFIG_AV1_HIGHBITDEPTH
965
966
// Common signature of the per-stripe filter functions (Wiener and self-guided,
// low and high bitdepth) that are stored in the stripe_filters[] table.
typedef void (*stripe_filter_fun)(
    const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
    int procunit_width, const uint8_t *src, int src_stride, uint8_t *dst,
    int dst_stride, int32_t *tmpbuf, int bit_depth,
    struct aom_internal_error_info *error_info);
972
973
#if CONFIG_AV1_HIGHBITDEPTH
974
#define NUM_STRIPE_FILTERS 4
975
static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
976
  wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
977
  sgrproj_filter_stripe_highbd
978
};
979
#else
980
#define NUM_STRIPE_FILTERS 2
981
static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
982
  wiener_filter_stripe, sgrproj_filter_stripe
983
};
984
#endif  // CONFIG_AV1_HIGHBITDEPTH
985
986
// Filter one restoration unit
987
void av1_loop_restoration_filter_unit(
988
    const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
989
    const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
990
    int plane_w, int plane_h, int ss_x, int ss_y, int highbd, int bit_depth,
991
    uint8_t *data8, int stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf,
992
28.8k
    int optimized_lr, struct aom_internal_error_info *error_info) {
993
28.8k
  RestorationType unit_rtype = rui->restoration_type;
994
995
28.8k
  int unit_h = limits->v_end - limits->v_start;
996
28.8k
  int unit_w = limits->h_end - limits->h_start;
997
28.8k
  uint8_t *data8_tl =
998
28.8k
      data8 + limits->v_start * (ptrdiff_t)stride + limits->h_start;
999
28.8k
  uint8_t *dst8_tl =
1000
28.8k
      dst8 + limits->v_start * (ptrdiff_t)dst_stride + limits->h_start;
1001
1002
28.8k
  if (unit_rtype == RESTORE_NONE) {
1003
13.2k
    copy_rest_unit(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride,
1004
13.2k
                   highbd);
1005
13.2k
    return;
1006
13.2k
  }
1007
1008
15.5k
  const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
1009
15.5k
  assert(filter_idx < NUM_STRIPE_FILTERS);
1010
15.5k
  const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
1011
1012
15.5k
  const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
1013
1014
  // Filter the whole image one stripe at a time
1015
15.5k
  RestorationTileLimits remaining_stripes = *limits;
1016
15.5k
  int i = 0;
1017
47.1k
  while (i < unit_h) {
1018
31.6k
    int copy_above, copy_below;
1019
31.6k
    remaining_stripes.v_start = limits->v_start + i;
1020
1021
31.6k
    get_stripe_boundary_info(&remaining_stripes, plane_w, plane_h, ss_y,
1022
31.6k
                             &copy_above, &copy_below);
1023
1024
31.6k
    const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1025
31.6k
    const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
1026
1027
    // Work out where this stripe's boundaries are within
1028
    // rsb->stripe_boundary_{above,below}
1029
31.6k
    const int frame_stripe =
1030
31.6k
        (remaining_stripes.v_start + runit_offset) / full_stripe_height;
1031
31.6k
    const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
1032
1033
    // Calculate this stripe's height, based on two rules:
1034
    // * The topmost stripe in the frame is 8 luma pixels shorter than usual.
1035
    // * We can't extend past the end of the current restoration unit
1036
31.6k
    const int nominal_stripe_height =
1037
31.6k
        full_stripe_height - ((frame_stripe == 0) ? runit_offset : 0);
1038
31.6k
    const int h = AOMMIN(nominal_stripe_height,
1039
31.6k
                         remaining_stripes.v_end - remaining_stripes.v_start);
1040
1041
31.6k
    setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
1042
31.6k
                                     h, data8, stride, rlbs, copy_above,
1043
31.6k
                                     copy_below, optimized_lr);
1044
1045
31.6k
    stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
1046
31.6k
                  dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth,
1047
31.6k
                  error_info);
1048
1049
31.6k
    restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
1050
31.6k
                                       data8, stride, copy_above, copy_below,
1051
31.6k
                                       optimized_lr);
1052
1053
31.6k
    i += h;
1054
31.6k
  }
1055
15.5k
}
1056
1057
static void filter_frame_on_unit(const RestorationTileLimits *limits,
1058
                                 int rest_unit_idx, void *priv, int32_t *tmpbuf,
1059
                                 RestorationLineBuffers *rlbs,
1060
28.9k
                                 struct aom_internal_error_info *error_info) {
1061
28.9k
  FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
1062
28.9k
  const RestorationInfo *rsi = ctxt->rsi;
1063
1064
28.9k
  av1_loop_restoration_filter_unit(
1065
28.9k
      limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs,
1066
28.9k
      ctxt->plane_w, ctxt->plane_h, ctxt->ss_x, ctxt->ss_y, ctxt->highbd,
1067
28.9k
      ctxt->bit_depth, ctxt->data8, ctxt->data_stride, ctxt->dst8,
1068
28.9k
      ctxt->dst_stride, tmpbuf, rsi->optimized_lr, error_info);
1069
28.9k
}
1070
1071
void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
1072
                                            YV12_BUFFER_CONFIG *frame,
1073
                                            AV1_COMMON *cm, int optimized_lr,
1074
4.16k
                                            int num_planes) {
1075
4.16k
  const SequenceHeader *const seq_params = cm->seq_params;
1076
4.16k
  const int bit_depth = seq_params->bit_depth;
1077
4.16k
  const int highbd = seq_params->use_highbitdepth;
1078
4.16k
  lr_ctxt->dst = &cm->rst_frame;
1079
1080
4.16k
  const int frame_width = frame->crop_widths[0];
1081
4.16k
  const int frame_height = frame->crop_heights[0];
1082
4.16k
  if (aom_realloc_frame_buffer(
1083
4.16k
          lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
1084
4.16k
          seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
1085
4.16k
          cm->features.byte_alignment, NULL, NULL, NULL, false,
1086
4.16k
          0) != AOM_CODEC_OK)
1087
0
    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
1088
0
                       "Failed to allocate restoration dst buffer");
1089
1090
4.16k
  lr_ctxt->on_rest_unit = filter_frame_on_unit;
1091
4.16k
  lr_ctxt->frame = frame;
1092
15.8k
  for (int plane = 0; plane < num_planes; ++plane) {
1093
11.6k
    RestorationInfo *rsi = &cm->rst_info[plane];
1094
11.6k
    RestorationType rtype = rsi->frame_restoration_type;
1095
11.6k
    rsi->optimized_lr = optimized_lr;
1096
11.6k
    lr_ctxt->ctxt[plane].rsi = rsi;
1097
1098
11.6k
    if (rtype == RESTORE_NONE) {
1099
4.06k
      continue;
1100
4.06k
    }
1101
1102
7.61k
    const int is_uv = plane > 0;
1103
7.61k
    int plane_w, plane_h;
1104
7.61k
    av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
1105
7.61k
    assert(plane_w == frame->crop_widths[is_uv]);
1106
7.61k
    assert(plane_h == frame->crop_heights[is_uv]);
1107
1108
7.61k
    av1_extend_frame(frame->buffers[plane], plane_w, plane_h,
1109
7.61k
                     frame->strides[is_uv], RESTORATION_BORDER,
1110
7.61k
                     RESTORATION_BORDER, highbd);
1111
1112
7.61k
    FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
1113
7.61k
    lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
1114
7.61k
    lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
1115
7.61k
    lr_plane_ctxt->plane_w = plane_w;
1116
7.61k
    lr_plane_ctxt->plane_h = plane_h;
1117
7.61k
    lr_plane_ctxt->highbd = highbd;
1118
7.61k
    lr_plane_ctxt->bit_depth = bit_depth;
1119
7.61k
    lr_plane_ctxt->data8 = frame->buffers[plane];
1120
7.61k
    lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
1121
7.61k
    lr_plane_ctxt->data_stride = frame->strides[is_uv];
1122
7.61k
    lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
1123
7.61k
  }
1124
4.16k
}
1125
1126
static void loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
1127
407
                                         AV1_COMMON *cm, int num_planes) {
1128
407
  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
1129
407
                           YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
1130
407
                           int vstart, int vend);
1131
407
  static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y,
1132
407
                                         aom_yv12_partial_coloc_copy_u,
1133
407
                                         aom_yv12_partial_coloc_copy_v };
1134
407
  assert(num_planes <= 3);
1135
1.21k
  for (int plane = 0; plane < num_planes; ++plane) {
1136
807
    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
1137
576
    FilterFrameCtxt *lr_plane_ctxt = &loop_rest_ctxt->ctxt[plane];
1138
576
    copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, 0,
1139
576
                     lr_plane_ctxt->plane_w, 0, lr_plane_ctxt->plane_h);
1140
576
  }
1141
407
}
1142
1143
// Call on_rest_unit for each loop restoration unit in the plane.
1144
static void foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
1145
                                       rest_unit_visitor_t on_rest_unit,
1146
                                       void *priv, int32_t *tmpbuf,
1147
576
                                       RestorationLineBuffers *rlbs) {
1148
576
  const RestorationInfo *rsi = &cm->rst_info[plane];
1149
576
  const int hnum_rest_units = rsi->horz_units;
1150
576
  const int vnum_rest_units = rsi->vert_units;
1151
576
  const int unit_size = rsi->restoration_unit_size;
1152
1153
576
  const int is_uv = plane > 0;
1154
576
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
1155
576
  const int ext_size = unit_size * 3 / 2;
1156
576
  int plane_w, plane_h;
1157
576
  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
1158
1159
576
  int y0 = 0, i = 0;
1160
1.20k
  while (y0 < plane_h) {
1161
630
    int remaining_h = plane_h - y0;
1162
630
    int h = (remaining_h < ext_size) ? remaining_h : unit_size;
1163
1164
630
    RestorationTileLimits limits;
1165
630
    limits.v_start = y0;
1166
630
    limits.v_end = y0 + h;
1167
630
    assert(limits.v_end <= plane_h);
1168
    // Offset upwards to align with the restoration processing stripe
1169
630
    const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
1170
630
    limits.v_start = AOMMAX(0, limits.v_start - voffset);
1171
630
    if (limits.v_end < plane_h) limits.v_end -= voffset;
1172
1173
630
    av1_foreach_rest_unit_in_row(&limits, plane_w, on_rest_unit, i, unit_size,
1174
630
                                 hnum_rest_units, vnum_rest_units, plane, priv,
1175
630
                                 tmpbuf, rlbs, av1_lr_sync_read_dummy,
1176
630
                                 av1_lr_sync_write_dummy, NULL, cm->error);
1177
1178
630
    y0 += h;
1179
630
    ++i;
1180
630
  }
1181
576
}
1182
1183
// Visits the restoration units of every plane whose frame restoration type is
// not RESTORE_NONE, using the visitor stored in lr_ctxt and the shared
// temporary/line buffers held by cm.
static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
                                        int num_planes) {
  FilterFrameCtxt *const plane_ctxts = lr_ctxt->ctxt;

  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
      foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
                                 &plane_ctxts[plane], cm->rst_tmpbuf, cm->rlbs);
    }
  }
}
1196
1197
void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
1198
                                       AV1_COMMON *cm, int optimized_lr,
1199
407
                                       void *lr_ctxt) {
1200
407
  assert(!cm->features.all_lossless);
1201
407
  const int num_planes = av1_num_planes(cm);
1202
1203
407
  AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
1204
1205
407
  av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
1206
407
                                         optimized_lr, num_planes);
1207
1208
407
  foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes);
1209
1210
407
  loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes);
1211
407
}
1212
1213
void av1_foreach_rest_unit_in_row(
1214
    RestorationTileLimits *limits, int plane_w,
1215
    rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
1216
    int hnum_rest_units, int vnum_rest_units, int plane, void *priv,
1217
    int32_t *tmpbuf, RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read,
1218
    sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync,
1219
23.4k
    struct aom_internal_error_info *error_info) {
1220
23.4k
  const int ext_size = unit_size * 3 / 2;
1221
23.4k
  int x0 = 0, j = 0;
1222
52.3k
  while (x0 < plane_w) {
1223
28.8k
    int remaining_w = plane_w - x0;
1224
28.8k
    int w = (remaining_w < ext_size) ? remaining_w : unit_size;
1225
1226
28.8k
    limits->h_start = x0;
1227
28.8k
    limits->h_end = x0 + w;
1228
28.8k
    assert(limits->h_end <= plane_w);
1229
1230
28.8k
    const int unit_idx = row_number * hnum_rest_units + j;
1231
1232
    // No sync for even numbered rows
1233
    // For odd numbered rows, Loop Restoration of current block requires the LR
1234
    // of top-right and bottom-right blocks to be completed
1235
1236
    // top-right sync
1237
28.8k
    on_sync_read(lr_sync, row_number, j, plane);
1238
28.8k
    if ((row_number + 1) < vnum_rest_units)
1239
      // bottom-right sync
1240
17.7k
      on_sync_read(lr_sync, row_number + 2, j, plane);
1241
1242
28.8k
#if CONFIG_MULTITHREAD
1243
28.8k
    if (lr_sync && lr_sync->num_workers > 1) {
1244
25.4k
      pthread_mutex_lock(lr_sync->job_mutex);
1245
25.4k
      const bool lr_mt_exit = lr_sync->lr_mt_exit;
1246
25.4k
      pthread_mutex_unlock(lr_sync->job_mutex);
1247
      // Exit in case any worker has encountered an error.
1248
25.4k
      if (lr_mt_exit) return;
1249
25.4k
    }
1250
28.8k
#endif
1251
1252
28.8k
    on_rest_unit(limits, unit_idx, priv, tmpbuf, rlbs, error_info);
1253
1254
28.8k
    on_sync_write(lr_sync, row_number, j, hnum_rest_units, plane);
1255
1256
28.8k
    x0 += w;
1257
28.8k
    ++j;
1258
28.8k
  }
1259
23.4k
}
1260
1261
29.3k
// No-op sync-read callback: every argument is deliberately ignored.
void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
  (void)plane;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1267
1268
// No-op sync-write callback: every argument is deliberately ignored.
void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
                             const int sb_cols, int plane) {
  (void)plane;
  (void)sb_cols;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1276
1277
// Computes the half-open range of restoration units, [*rcol0, *rcol1) by
// [*rrow0, *rrow1), whose top-left corners lie inside the given superblock.
//
// Returns 0 without touching the outputs when bsize is not the sequence's
// superblock size. Otherwise writes all four outputs and returns non-zero iff
// the range is non-empty in both directions.
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
                                       int *rcol0, int *rcol1, int *rrow0,
                                       int *rrow1) {
  assert(rcol0 && rcol1 && rrow0 && rrow1);

  if (bsize != cm->seq_params->sb_size) return 0;

  assert(!cm->features.all_lossless);

  const int is_uv = plane > 0;
  const RestorationInfo *const rsi = &cm->rst_info[plane];
  const int size = rsi->restoration_unit_size;

  // The mi-unit corners of the superblock (start is mi_row / mi_col).
  const int mi_row_end = mi_row + mi_size_high[bsize];
  const int mi_col_end = mi_col + mi_size_wide[bsize];

  // The size of an MI-unit on this plane of the image
  const int ss_x = is_uv && cm->seq_params->subsampling_x;
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
  const int mi_size_x = MI_SIZE >> ss_x;
  const int mi_size_y = MI_SIZE >> ss_y;

  // Write m for the relative mi column or row, D for the superres denominator
  // and N for the superres numerator. If u is the upscaled pixel offset then
  // we can write the downscaled pixel offset in two ways as:
  //
  //   MI_SIZE * m = N / D u
  //
  // from which we get u = D * MI_SIZE * m / N
  const int superres = av1_superres_scaled(cm);
  const int mi_to_num_x =
      superres ? mi_size_x * cm->superres_scale_denominator : mi_size_x;
  const int denom_x = superres ? size * SCALE_NUMERATOR : size;
  const int mi_to_num_y = mi_size_y;
  const int denom_y = size;

  // Adding (denominator - 1) before dividing rounds the quotient up.
  const int rnd_x = denom_x - 1;
  const int rnd_y = denom_y - 1;

  // rcol0/rrow0 should be the first column/row of restoration units that
  // doesn't start left/below of mi_col/mi_row. For this calculation, we need
  // to round up the division (if the sb starts at runit column 10.1, the first
  // matching runit has column index 11)
  *rcol0 = (mi_col * mi_to_num_x + rnd_x) / denom_x;
  *rrow0 = (mi_row * mi_to_num_y + rnd_y) / denom_y;

  // rcol1/rrow1 is the equivalent calculation, but for the superblock
  // below-right. If we're at the bottom or right of the frame, this restoration
  // unit might not exist, in which case we'll clamp accordingly.
  const int col_limit = (mi_col_end * mi_to_num_x + rnd_x) / denom_x;
  const int row_limit = (mi_row_end * mi_to_num_y + rnd_y) / denom_y;
  *rcol1 = AOMMIN(col_limit, rsi->horz_units);
  *rrow1 = AOMMIN(row_limit, rsi->vert_units);

  return *rcol0 < *rcol1 && *rrow0 < *rrow1;
}
1338
1339
// Extend to left and right: for each of `height` rows, replicate the first
// sample into the `extend` samples before the row and the last sample into
// the `extend` samples after it. With use_highbitdepth set, buf is treated as
// an array of 16-bit samples; stride is always applied to buf in bytes.
static void extend_lines(uint8_t *buf, int width, int height, int stride,
                         int extend, int use_highbitdepth) {
  if (use_highbitdepth) {
    for (int row = 0; row < height; ++row, buf += stride) {
      uint16_t *const row16 = (uint16_t *)buf;
      aom_memset16(row16 - extend, row16[0], extend);
      aom_memset16(row16 + width, row16[width - 1], extend);
    }
  } else {
    for (int row = 0; row < height; ++row, buf += stride) {
      memset(buf - extend, buf[0], extend);
      memset(buf + width, buf[width - 1], extend);
    }
  }
}
1354
1355
static void save_deblock_boundary_lines(
1356
    const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
1357
    int stripe, int use_highbd, int is_above,
1358
89.0k
    RestorationStripeBoundaries *boundaries) {
1359
89.0k
  const int is_uv = plane > 0;
1360
89.0k
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
1361
89.0k
  const int src_stride = frame->strides[is_uv] << use_highbd;
1362
89.0k
  const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride;
1363
1364
89.0k
  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
1365
89.0k
                               : boundaries->stripe_boundary_below;
1366
89.0k
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
1367
89.0k
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
1368
89.0k
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
1369
1370
  // There is a rare case in which a processing stripe can end 1px above the
1371
  // crop border. In this case, we do want to use deblocked pixels from below
1372
  // the stripe (hence why we ended up in this function), but instead of
1373
  // fetching 2 "below" rows we need to fetch one and duplicate it.
1374
  // This is equivalent to clamping the sample locations against the crop border
1375
89.0k
  const int lines_to_save =
1376
89.0k
      AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row);
1377
89.0k
  assert(lines_to_save == 1 || lines_to_save == 2);
1378
1379
89.0k
  int upscaled_width;
1380
89.0k
  int line_bytes;
1381
89.0k
  if (av1_superres_scaled(cm)) {
1382
76.9k
    const int ss_x = is_uv && cm->seq_params->subsampling_x;
1383
76.9k
    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
1384
76.9k
    line_bytes = upscaled_width << use_highbd;
1385
76.9k
    if (use_highbd)
1386
56.4k
      av1_upscale_normative_rows(
1387
56.4k
          cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv],
1388
56.4k
          CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride,
1389
56.4k
          plane, lines_to_save);
1390
20.5k
    else
1391
20.5k
      av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows,
1392
20.5k
                                 boundaries->stripe_boundary_stride, plane,
1393
20.5k
                                 lines_to_save);
1394
76.9k
  } else {
1395
12.1k
    upscaled_width = frame->crop_widths[is_uv];
1396
12.1k
    line_bytes = upscaled_width << use_highbd;
1397
36.2k
    for (int i = 0; i < lines_to_save; i++) {
1398
24.1k
      memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride,
1399
24.1k
             line_bytes);
1400
24.1k
    }
1401
12.1k
  }
1402
  // If we only saved one line, then copy it into the second line buffer
1403
89.0k
  if (lines_to_save == 1)
1404
193
    memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
1405
1406
89.0k
  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
1407
89.0k
               RESTORATION_EXTRA_HORZ, use_highbd);
1408
89.0k
}
1409
1410
static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1411
                                     const AV1_COMMON *cm, int plane, int row,
1412
                                     int stripe, int use_highbd, int is_above,
1413
21.3k
                                     RestorationStripeBoundaries *boundaries) {
1414
21.3k
  const int is_uv = plane > 0;
1415
21.3k
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
1416
21.3k
  const int src_stride = frame->strides[is_uv] << use_highbd;
1417
21.3k
  const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride;
1418
1419
21.3k
  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
1420
21.3k
                               : boundaries->stripe_boundary_below;
1421
21.3k
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
1422
21.3k
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
1423
21.3k
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
1424
21.3k
  const int src_width = frame->crop_widths[is_uv];
1425
1426
  // At the point where this function is called, we've already applied
1427
  // superres. So we don't need to extend the lines here, we can just
1428
  // pull directly from the topmost row of the upscaled frame.
1429
21.3k
  const int ss_x = is_uv && cm->seq_params->subsampling_x;
1430
21.3k
  const int upscaled_width = av1_superres_scaled(cm)
1431
21.3k
                                 ? (cm->superres_upscaled_width + ss_x) >> ss_x
1432
21.3k
                                 : src_width;
1433
21.3k
  const int line_bytes = upscaled_width << use_highbd;
1434
64.1k
  for (int i = 0; i < RESTORATION_CTX_VERT; i++) {
1435
    // Copy the line at 'src_rows' into both context lines
1436
42.7k
    memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
1437
42.7k
  }
1438
21.3k
  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
1439
21.3k
               RESTORATION_EXTRA_HORZ, use_highbd);
1440
21.3k
}
1441
1442
// Walks the processing stripes of 'plane' from the top of the frame down and
// saves the boundary context each stripe needs into
// cm->rst_info[plane].boundaries.
//   after_cdef == 0: for stripes that have a neighbouring stripe above and/or
//                    below, save rows from 'frame' via
//                    save_deblock_boundary_lines().
//   after_cdef != 0: for the stripes at the top and bottom of the frame, save
//                    the frame's edge row via save_cdef_boundary_lines().
static void save_boundary_lines(const YV12_BUFFER_CONFIG *frame, int use_highbd,
                                int plane, AV1_COMMON *cm, int after_cdef) {
  const int is_uv = plane > 0;
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
  const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
  const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;

  // Only the height is used below; the width is a required output argument.
  int plane_w, plane_h;
  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);

  RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries;

  const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);

  for (int stripe_idx = 0;; ++stripe_idx) {
    // Unclamped top of this stripe; the first stripe starts above row 0 by
    // 'stripe_off' and is clamped to the frame.
    const int stripe_top = stripe_idx * stripe_height - stripe_off;
    const int y0 = AOMMAX(0, stripe_top);
    if (y0 >= plane_h) break;

    const int y1 = AOMMIN(stripe_top + stripe_height, plane_h);

    // Frame top/bottom use CDEF pixels; internal stripe boundaries use
    // deblocked pixels.
    const int has_stripe_above = (stripe_idx > 0);
    const int has_stripe_below = (y1 < plane_height);

    if (after_cdef) {
      // Save CDEF context at frame boundaries.
      if (!has_stripe_above) {
        save_cdef_boundary_lines(frame, cm, plane, y0, stripe_idx, use_highbd,
                                 1, boundaries);
      }
      if (!has_stripe_below) {
        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, stripe_idx,
                                 use_highbd, 0, boundaries);
      }
      continue;
    }

    // Save deblocked context at internal stripe boundaries.
    if (has_stripe_above) {
      save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
                                  stripe_idx, use_highbd, 1, boundaries);
    }
    if (has_stripe_below) {
      save_deblock_boundary_lines(frame, cm, plane, y1, stripe_idx, use_highbd,
                                  0, boundaries);
    }
  }
}
1493
1494
// For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
1495
// lines to be used as boundary in the loop restoration process. The
1496
// lines are saved in rst_internal.stripe_boundary_lines
1497
void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1498
7.46k
                                              AV1_COMMON *cm, int after_cdef) {
1499
7.46k
  const int num_planes = av1_num_planes(cm);
1500
7.46k
  const int use_highbd = cm->seq_params->use_highbitdepth;
1501
28.8k
  for (int p = 0; p < num_planes; ++p) {
1502
21.3k
    save_boundary_lines(frame, use_highbd, p, cm, after_cdef);
1503
21.3k
  }
1504
7.46k
}