Coverage Report

Created: 2026-02-14 07:09

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/av1/common/restoration.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 *
11
 */
12
13
#include <math.h>
14
#include <stddef.h>
15
16
#include "config/aom_config.h"
17
#include "config/aom_scale_rtcd.h"
18
19
#include "aom/internal/aom_codec_internal.h"
20
#include "aom_mem/aom_mem.h"
21
#include "aom_dsp/aom_dsp_common.h"
22
#include "aom_mem/aom_mem.h"
23
#include "aom_ports/mem.h"
24
#include "aom_util/aom_pthread.h"
25
26
#include "av1/common/av1_common_int.h"
27
#include "av1/common/convolve.h"
28
#include "av1/common/enums.h"
29
#include "av1/common/resize.h"
30
#include "av1/common/restoration.h"
31
#include "av1/common/thread_common.h"
32
33
// The 's' values are calculated based on original 'r' and 'e' values in the
34
// spec using GenSgrprojVtable().
35
// Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
36
const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
37
  { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
38
  { { 2, 1 }, { 93, 1618 } },  { { 2, 1 }, { 80, 1438 } },
39
  { { 2, 1 }, { 70, 1295 } },  { { 2, 1 }, { 58, 1177 } },
40
  { { 2, 1 }, { 47, 1079 } },  { { 2, 1 }, { 37, 996 } },
41
  { { 2, 1 }, { 30, 925 } },   { { 2, 1 }, { 25, 863 } },
42
  { { 0, 1 }, { -1, 2589 } },  { { 0, 1 }, { -1, 1618 } },
43
  { { 0, 1 }, { -1, 1177 } },  { { 0, 1 }, { -1, 925 } },
44
  { { 2, 0 }, { 56, -1 } },    { { 2, 0 }, { 22, -1 } },
45
};
46
47
void av1_get_upsampled_plane_size(const AV1_COMMON *cm, int is_uv, int *plane_w,
48
37.3k
                                  int *plane_h) {
49
37.3k
  int ss_x = is_uv && cm->seq_params->subsampling_x;
50
37.3k
  int ss_y = is_uv && cm->seq_params->subsampling_y;
51
37.3k
  *plane_w = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
52
37.3k
  *plane_h = ROUND_POWER_OF_TWO(cm->height, ss_y);
53
37.3k
}
54
55
// Returns how many restoration units of size unit_size fit along one
// dimension of a plane (pass a width to count horizontally, a height to count
// vertically).
//
// The division rounds to nearest rather than up: this models the way the
// right-most or bottom-most restoration unit can extend to up to 150% of its
// normal width or height.
//
// The result is clamped to at least 1 so that small frames, which may be
// smaller than half of an LR unit, still get one unit.
int av1_lr_count_units(int unit_size, int plane_size) {
  const int half_unit = unit_size >> 1;
  const int nearest_count = (plane_size + half_unit) / unit_size;
  return AOMMAX(nearest_count, 1);
}
66
67
// (Re)allocates rsi->unit_info for one plane.
//
// The unit grid is computed from the plane size (see
// av1_get_upsampled_plane_size()) and rsi->restoration_unit_size, and is
// stored in rsi->horz_units, rsi->vert_units and rsi->num_rest_units. Any
// previous unit_info array is freed before the new, 16-byte aligned array is
// allocated. Allocation failure is handled by CHECK_MEM_ERROR
// (NOTE(review): macro is defined elsewhere; confirm its failure behavior).
void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
                                  int is_uv) {
  int width;
  int height;
  av1_get_upsampled_plane_size(cm, is_uv, &width, &height);

  const int lr_unit_size = rsi->restoration_unit_size;
  const int units_across = av1_lr_count_units(lr_unit_size, width);
  const int units_down = av1_lr_count_units(lr_unit_size, height);

  rsi->horz_units = units_across;
  rsi->vert_units = units_down;
  rsi->num_rest_units = units_across * units_down;

  // Drop the old array first; the macro then overwrites the pointer.
  aom_free(rsi->unit_info);
  CHECK_MEM_ERROR(cm, rsi->unit_info,
                  (RestorationUnitInfo *)aom_memalign(
                      16, sizeof(*rsi->unit_info) * rsi->num_rest_units));
}
85
86
46.6k
// Releases the unit_info array owned by rst_info and clears the pointer so a
// later free or re-allocation does not see a stale address.
void av1_free_restoration_struct(RestorationInfo *rst_info) {
  RestorationUnitInfo *const units = rst_info->unit_info;
  rst_info->unit_info = NULL;
  aom_free(units);
}
90
91
#if 0
// Disabled reference code: regenerates the 's' values stored in
// av1_sgr_params from 'r' and 'e'. It is compiled out and kept only to
// document how the table was derived.
//
// Pair of values for each sgrproj parameter:
// Index 0 corresponds to r[0], e[0]
// Index 1 corresponds to r[1], e[1]
int sgrproj_mtable[SGRPROJ_PARAMS][2];

static void GenSgrprojVtable(void) {
  for (int idx = 0; idx < SGRPROJ_PARAMS; ++idx) {
    const sgr_params_type *const params = &av1_sgr_params[idx];
    for (int which = 0; which < 2; ++which) {
      const int eps = params->e[which];
      const int radius = params->r[which];
      if (radius == 0) {
        // Filter is disabled: mark the entry invalid.
        sgrproj_mtable[idx][which] = -1;
        continue;
      }
      // Filter is enabled: n is the number of pixels in the box window.
      const int side = 2 * radius + 1;
      const int n = side * side;
      const int n2e = n * n * eps;
      assert(n2e != 0);
      sgrproj_mtable[idx][which] =
          (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
    }
  }
}
#endif
115
116
15.5k
// One-time precalculation hook for loop restoration.
//
// Currently does nothing: the only work it would do, regenerating the sgrproj
// table through GenSgrprojVtable(), is compiled out.
void av1_loop_restoration_precal(void) {
#if 0
  GenSgrprojVtable();
#endif
}
121
122
// Pads an 8-bit plane in place by replicating its edge pixels.
//
// data points at the top-left pixel of the width x height image area inside a
// larger buffer with row pitch `stride`. Each row is first extended by
// border_horz pixels on both sides (copies of the row's first and last
// pixel). Then the first extended row is copied border_vert times above the
// image and the last extended row border_vert times below it. The caller
// must provide the surrounding border memory.
static void extend_frame_lowbd(uint8_t *data, int width, int height,
                               ptrdiff_t stride, int border_horz,
                               int border_vert) {
  // Horizontal padding, row by row.
  for (int y = 0; y < height; ++y) {
    uint8_t *const row = data + y * stride;
    memset(row - border_horz, row[0], border_horz);
    memset(row + width, row[width - 1], border_horz);
  }

  // Vertical padding uses the rows including their horizontal borders.
  uint8_t *const padded_origin = data - border_horz;
  const size_t padded_width = (size_t)(width + 2 * border_horz);

  for (int y = 1; y <= border_vert; ++y) {
    memcpy(padded_origin - y * stride, padded_origin, padded_width);
  }
  for (int y = 0; y < border_vert; ++y) {
    memcpy(padded_origin + (height + y) * stride,
           padded_origin + (height - 1) * stride, padded_width);
  }
}
141
142
#if CONFIG_AV1_HIGHBITDEPTH
143
// 16-bit counterpart of extend_frame_lowbd(): pads a plane of uint16_t
// samples in place by replicating its edge pixels.
//
// Each row is extended by border_horz samples on both sides, then the first
// and last extended rows are replicated border_vert times above and below
// the image. stride is measured in samples, not bytes.
static void extend_frame_highbd(uint16_t *data, int width, int height,
                                ptrdiff_t stride, int border_horz,
                                int border_vert) {
  // Horizontal padding, row by row.
  for (int y = 0; y < height; ++y) {
    uint16_t *const row = data + y * stride;
    const uint16_t left = row[0];
    for (int k = 1; k <= border_horz; ++k) row[-k] = left;
    const uint16_t right = row[width - 1];
    for (int k = 0; k < border_horz; ++k) row[width + k] = right;
  }

  // Vertical padding uses the rows including their horizontal borders.
  uint16_t *const padded_origin = data - border_horz;
  const size_t padded_bytes = (width + 2 * border_horz) * sizeof(uint16_t);

  for (int y = 1; y <= border_vert; ++y) {
    memcpy(padded_origin - y * stride, padded_origin, padded_bytes);
  }
  for (int y = 0; y < border_vert; ++y) {
    memcpy(padded_origin + (height + y) * stride,
           padded_origin + (height - 1) * stride, padded_bytes);
  }
}
163
164
// Copies a width x height rectangle of 16-bit samples from src to dst.
// Strides are measured in samples; only `width` samples per row are touched.
static void copy_rest_unit_highbd(int width, int height, const uint16_t *src,
                                  int src_stride, uint16_t *dst,
                                  int dst_stride) {
  const size_t row_bytes = width * sizeof(*dst);
  for (int row = 0; row < height; ++row) {
    const uint16_t *const from = src + row * src_stride;
    uint16_t *const to = dst + row * dst_stride;
    memcpy(to, from, row_bytes);
  }
}
170
#endif
171
172
// Pads a plane in place by edge replication, dispatching on bit depth.
//
// When high bit-depth support is compiled in and highbd is non-zero, data is
// converted with CONVERT_TO_SHORTPTR and handled by extend_frame_highbd();
// otherwise the 8-bit extend_frame_lowbd() is used. In builds without high
// bit-depth support the highbd argument is ignored.
void av1_extend_frame(uint8_t *data, int width, int height, int stride,
                      int border_horz, int border_vert, int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (!highbd) {
    extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
  } else {
    extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
                        border_horz, border_vert);
  }
#else
  (void)highbd;
  extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
#endif
}
184
185
// Copies a width x height rectangle of 8-bit pixels from src to dst.
// Only `width` bytes per row are touched; strides may exceed the width.
static void copy_rest_unit_lowbd(int width, int height, const uint8_t *src,
                                 int src_stride, uint8_t *dst, int dst_stride) {
  for (int row = 0; row < height; ++row) {
    const uint8_t *const from = src + row * src_stride;
    uint8_t *const to = dst + row * dst_stride;
    memcpy(to, from, width);
  }
}
190
191
// Copies a width x height rectangle from src to dst, dispatching on bit
// depth.
//
// When high bit-depth support is compiled in and highbd is non-zero, both
// pointers are converted with CONVERT_TO_SHORTPTR and the 16-bit copy is
// used; otherwise the 8-bit copy is used. In builds without high bit-depth
// support the highbd argument is ignored.
static void copy_rest_unit(int width, int height, const uint8_t *src,
                           int src_stride, uint8_t *dst, int dst_stride,
                           int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (!highbd) {
    copy_rest_unit_lowbd(width, height, src, src_stride, dst, dst_stride);
  } else {
    copy_rest_unit_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
                          CONVERT_TO_SHORTPTR(dst), dst_stride);
  }
#else
  (void)highbd;
  copy_rest_unit_lowbd(width, height, src, src_stride, dst, dst_stride);
#endif
}
204
205
336k
// Yields the pointer to hand to memcpy() for frame pointer `d`: `d` itself
// when hbd is zero, otherwise CONVERT_TO_SHORTPTR(d) viewed as uint8_t *.
#define REAL_PTR(hbd, d) (!(hbd) ? (d) : (uint8_t *)CONVERT_TO_SHORTPTR(d))
206
207
// With striped loop restoration, the filtering for each 64-pixel stripe gets
208
// most of its input from the output of CDEF (stored in data8), but we need to
209
// fill out a border of 3 pixels above/below the stripe according to the
210
// following rules:
211
//
212
// * At the top and bottom of the frame, we copy the outermost row of CDEF
213
//   pixels three times. This extension is done by a call to av1_extend_frame()
214
//   at the start of the loop restoration process, so the value of
215
//   copy_above/copy_below doesn't strictly matter.
216
//
217
// * All other boundaries are stripe boundaries within the frame. In that case,
218
//   we take 2 rows of deblocked pixels and extend them to 3 rows of context.
219
static void get_stripe_boundary_info(const RestorationTileLimits *limits,
220
                                     int plane_w, int plane_h, int ss_y,
221
20.1k
                                     int *copy_above, int *copy_below) {
222
20.1k
  (void)plane_w;
223
224
20.1k
  *copy_above = 1;
225
20.1k
  *copy_below = 1;
226
227
20.1k
  const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
228
20.1k
  const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
229
230
20.1k
  const int first_stripe_in_plane = (limits->v_start == 0);
231
20.1k
  const int this_stripe_height =
232
20.1k
      full_stripe_height - (first_stripe_in_plane ? runit_offset : 0);
233
20.1k
  const int last_stripe_in_plane =
234
20.1k
      (limits->v_start + this_stripe_height >= plane_h);
235
236
20.1k
  if (first_stripe_in_plane) *copy_above = 0;
237
20.1k
  if (last_stripe_in_plane) *copy_below = 0;
238
20.1k
}
239
240
// Overwrite the border pixels around a processing stripe so that the conditions
241
// listed above get_stripe_boundary_info() are preserved.
242
// We save the pixels which get overwritten into a temporary buffer, so that
243
// they can be restored by restore_processing_stripe_boundary() after we've
244
// processed the stripe.
245
//
246
// limits gives the rectangular limits of the remaining stripes for the current
247
// restoration unit. rsb is the stored stripe boundaries (taken from either
248
// deblock or CDEF output as necessary).
249
static void setup_processing_stripe_boundary(
250
    const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
251
    int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
252
20.1k
    RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
253
  // Offsets within the line buffers. The buffer logically starts at column
254
  // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
255
  // has column x0 in the buffer.
256
20.1k
  const int buf_stride = rsb->stripe_boundary_stride;
257
20.1k
  const int buf_x0_off = limits->h_start;
258
20.1k
  const int line_width =
259
20.1k
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
260
20.1k
  const int line_size = line_width << use_highbd;
261
262
20.1k
  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
263
264
  // Replace RESTORATION_BORDER pixels above the top of the stripe
265
  // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
266
  // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
267
  // duplicating the topmost of the 2 lines (see the AOMMAX call when
268
  // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
269
20.1k
  if (!opt) {
270
14.2k
    if (copy_above) {
271
11.8k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
272
273
47.4k
      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
274
35.5k
        const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
275
35.5k
        const int buf_off = buf_x0_off + buf_row * buf_stride;
276
35.5k
        const uint8_t *buf =
277
35.5k
            rsb->stripe_boundary_above + (buf_off << use_highbd);
278
35.5k
        uint8_t *dst8 = data8_tl + i * data_stride;
279
        // Save old pixels, then replace with data from stripe_boundary_above
280
35.5k
        memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
281
35.5k
               REAL_PTR(use_highbd, dst8), line_size);
282
35.5k
        memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
283
35.5k
      }
284
11.8k
    }
285
286
    // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
287
    // The second buffer row is repeated, so src_row gets the values 0, 1, 1
288
    // for i = 0, 1, 2.
289
14.2k
    if (copy_below) {
290
10.5k
      const int stripe_end = limits->v_start + h;
291
10.5k
      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
292
293
42.1k
      for (int i = 0; i < RESTORATION_BORDER; ++i) {
294
31.5k
        const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
295
31.5k
        const int buf_off = buf_x0_off + buf_row * buf_stride;
296
31.5k
        const uint8_t *src =
297
31.5k
            rsb->stripe_boundary_below + (buf_off << use_highbd);
298
299
31.5k
        uint8_t *dst8 = data8_bl + i * data_stride;
300
        // Save old pixels, then replace with data from stripe_boundary_below
301
31.5k
        memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
302
31.5k
        memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
303
31.5k
      }
304
10.5k
    }
305
14.2k
  } else {
306
5.92k
    if (copy_above) {
307
5.52k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
308
309
      // Only save and overwrite i=-RESTORATION_BORDER line.
310
5.52k
      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
311
      // Save old pixels, then replace with data from stripe_boundary_above
312
5.52k
      memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
313
5.52k
      memcpy(REAL_PTR(use_highbd, dst8),
314
5.52k
             REAL_PTR(use_highbd,
315
5.52k
                      data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
316
5.52k
             line_size);
317
5.52k
    }
318
319
5.92k
    if (copy_below) {
320
5.50k
      const int stripe_end = limits->v_start + h;
321
5.50k
      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
322
323
      // Only save and overwrite i=2 line.
324
5.50k
      uint8_t *dst8 = data8_bl + 2 * data_stride;
325
      // Save old pixels, then replace with data from stripe_boundary_below
326
5.50k
      memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
327
5.50k
      memcpy(REAL_PTR(use_highbd, dst8),
328
5.50k
             REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
329
5.50k
    }
330
5.92k
  }
331
20.1k
}
332
333
// Once a processing stripe is finished, this function sets the boundary
334
// pixels which were overwritten by setup_processing_stripe_boundary()
335
// back to their original values
336
static void restore_processing_stripe_boundary(
337
    const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
338
    int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
339
20.1k
    int copy_below, int opt) {
340
20.1k
  const int line_width =
341
20.1k
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
342
20.1k
  const int line_size = line_width << use_highbd;
343
344
20.1k
  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
345
346
20.1k
  if (!opt) {
347
14.2k
    if (copy_above) {
348
11.8k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
349
47.5k
      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
350
35.6k
        uint8_t *dst8 = data8_tl + i * data_stride;
351
35.6k
        memcpy(REAL_PTR(use_highbd, dst8),
352
35.6k
               rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
353
35.6k
      }
354
11.8k
    }
355
356
14.2k
    if (copy_below) {
357
10.5k
      const int stripe_bottom = limits->v_start + h;
358
10.5k
      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
359
360
42.2k
      for (int i = 0; i < RESTORATION_BORDER; ++i) {
361
31.6k
        if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
362
363
31.6k
        uint8_t *dst8 = data8_bl + i * data_stride;
364
31.6k
        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
365
31.6k
      }
366
10.5k
    }
367
14.2k
  } else {
368
5.92k
    if (copy_above) {
369
5.52k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
370
371
      // Only restore i=-RESTORATION_BORDER line.
372
5.52k
      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
373
5.52k
      memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
374
5.52k
    }
375
376
5.92k
    if (copy_below) {
377
5.51k
      const int stripe_bottom = limits->v_start + h;
378
5.51k
      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
379
380
      // Only restore i=2 line.
381
5.51k
      if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
382
5.51k
        uint8_t *dst8 = data8_bl + 2 * data_stride;
383
5.51k
        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
384
5.51k
      }
385
5.51k
    }
386
5.92k
  }
387
20.1k
}
388
389
static void wiener_filter_stripe(const RestorationUnitInfo *rui,
390
                                 int stripe_width, int stripe_height,
391
                                 int procunit_width, const uint8_t *src,
392
                                 int src_stride, uint8_t *dst, int dst_stride,
393
                                 int32_t *tmpbuf, int bit_depth,
394
3.84k
                                 struct aom_internal_error_info *error_info) {
395
3.84k
  (void)tmpbuf;
396
3.84k
  (void)bit_depth;
397
3.84k
  (void)error_info;
398
3.84k
  assert(bit_depth == 8);
399
3.84k
  const WienerConvolveParams conv_params = get_conv_params_wiener(8);
400
401
7.78k
  for (int j = 0; j < stripe_width; j += procunit_width) {
402
3.93k
    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
403
3.93k
    const uint8_t *src_p = src + j;
404
3.93k
    uint8_t *dst_p = dst + j;
405
3.93k
    av1_wiener_convolve_add_src(
406
3.93k
        src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
407
3.93k
        rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
408
3.93k
  }
409
3.84k
}
410
411
/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
412
   over the input. The window is of size (2r + 1)x(2r + 1), and we
413
   specialize to r = 1, 2, 3. A default function is used for r > 3.
414
415
   Each loop follows the same format: We keep a window's worth of input
416
   in individual variables and select data out of that as appropriate.
417
*/
418
static void boxsum1(int32_t *src, int width, int height, int src_stride,
419
13.8k
                    int sqr, int32_t *dst, int dst_stride) {
420
13.8k
  int i, j, a, b, c;
421
13.8k
  assert(width > 2 * SGRPROJ_BORDER_HORZ);
422
13.8k
  assert(height > 2 * SGRPROJ_BORDER_VERT);
423
424
  // Vertical sum over 3-pixel regions, from src into dst.
425
13.8k
  if (!sqr) {
426
235k
    for (j = 0; j < width; ++j) {
427
228k
      a = src[j];
428
228k
      b = src[src_stride + j];
429
228k
      c = src[2 * src_stride + j];
430
431
228k
      dst[j] = a + b;
432
10.1M
      for (i = 1; i < height - 2; ++i) {
433
        // Loop invariant: At the start of each iteration,
434
        // a = src[(i - 1) * src_stride + j]
435
        // b = src[(i    ) * src_stride + j]
436
        // c = src[(i + 1) * src_stride + j]
437
9.87M
        dst[i * dst_stride + j] = a + b + c;
438
9.87M
        a = b;
439
9.87M
        b = c;
440
9.87M
        c = src[(i + 2) * src_stride + j];
441
9.87M
      }
442
228k
      dst[i * dst_stride + j] = a + b + c;
443
228k
      dst[(i + 1) * dst_stride + j] = b + c;
444
228k
    }
445
6.93k
  } else {
446
235k
    for (j = 0; j < width; ++j) {
447
228k
      a = src[j] * src[j];
448
228k
      b = src[src_stride + j] * src[src_stride + j];
449
228k
      c = src[2 * src_stride + j] * src[2 * src_stride + j];
450
451
228k
      dst[j] = a + b;
452
10.0M
      for (i = 1; i < height - 2; ++i) {
453
9.87M
        dst[i * dst_stride + j] = a + b + c;
454
9.87M
        a = b;
455
9.87M
        b = c;
456
9.87M
        c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
457
9.87M
      }
458
228k
      dst[i * dst_stride + j] = a + b + c;
459
228k
      dst[(i + 1) * dst_stride + j] = b + c;
460
228k
    }
461
6.93k
  }
462
463
  // Horizontal sum over 3-pixel regions of dst
464
722k
  for (i = 0; i < height; ++i) {
465
708k
    a = dst[i * dst_stride];
466
708k
    b = dst[i * dst_stride + 1];
467
708k
    c = dst[i * dst_stride + 2];
468
469
708k
    dst[i * dst_stride] = a + b;
470
19.4M
    for (j = 1; j < width - 2; ++j) {
471
      // Loop invariant: At the start of each iteration,
472
      // a = src[i * src_stride + (j - 1)]
473
      // b = src[i * src_stride + (j    )]
474
      // c = src[i * src_stride + (j + 1)]
475
18.7M
      dst[i * dst_stride + j] = a + b + c;
476
18.7M
      a = b;
477
18.7M
      b = c;
478
18.7M
      c = dst[i * dst_stride + (j + 2)];
479
18.7M
    }
480
708k
    dst[i * dst_stride + j] = a + b + c;
481
708k
    dst[i * dst_stride + (j + 1)] = b + c;
482
708k
  }
483
13.8k
}
484
485
static void boxsum2(int32_t *src, int width, int height, int src_stride,
486
13.3k
                    int sqr, int32_t *dst, int dst_stride) {
487
13.3k
  int i, j, a, b, c, d, e;
488
13.3k
  assert(width > 2 * SGRPROJ_BORDER_HORZ);
489
13.3k
  assert(height > 2 * SGRPROJ_BORDER_VERT);
490
491
  // Vertical sum over 5-pixel regions, from src into dst.
492
13.3k
  if (!sqr) {
493
224k
    for (j = 0; j < width; ++j) {
494
218k
      a = src[j];
495
218k
      b = src[src_stride + j];
496
218k
      c = src[2 * src_stride + j];
497
218k
      d = src[3 * src_stride + j];
498
218k
      e = src[4 * src_stride + j];
499
500
218k
      dst[j] = a + b + c;
501
218k
      dst[dst_stride + j] = a + b + c + d;
502
9.49M
      for (i = 2; i < height - 3; ++i) {
503
        // Loop invariant: At the start of each iteration,
504
        // a = src[(i - 2) * src_stride + j]
505
        // b = src[(i - 1) * src_stride + j]
506
        // c = src[(i    ) * src_stride + j]
507
        // d = src[(i + 1) * src_stride + j]
508
        // e = src[(i + 2) * src_stride + j]
509
9.27M
        dst[i * dst_stride + j] = a + b + c + d + e;
510
9.27M
        a = b;
511
9.27M
        b = c;
512
9.27M
        c = d;
513
9.27M
        d = e;
514
9.27M
        e = src[(i + 3) * src_stride + j];
515
9.27M
      }
516
218k
      dst[i * dst_stride + j] = a + b + c + d + e;
517
218k
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
518
218k
      dst[(i + 2) * dst_stride + j] = c + d + e;
519
218k
    }
520
6.66k
  } else {
521
225k
    for (j = 0; j < width; ++j) {
522
218k
      a = src[j] * src[j];
523
218k
      b = src[src_stride + j] * src[src_stride + j];
524
218k
      c = src[2 * src_stride + j] * src[2 * src_stride + j];
525
218k
      d = src[3 * src_stride + j] * src[3 * src_stride + j];
526
218k
      e = src[4 * src_stride + j] * src[4 * src_stride + j];
527
528
218k
      dst[j] = a + b + c;
529
218k
      dst[dst_stride + j] = a + b + c + d;
530
9.47M
      for (i = 2; i < height - 3; ++i) {
531
9.25M
        dst[i * dst_stride + j] = a + b + c + d + e;
532
9.25M
        a = b;
533
9.25M
        b = c;
534
9.25M
        c = d;
535
9.25M
        d = e;
536
9.25M
        e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
537
9.25M
      }
538
218k
      dst[i * dst_stride + j] = a + b + c + d + e;
539
218k
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
540
218k
      dst[(i + 2) * dst_stride + j] = c + d + e;
541
218k
    }
542
6.65k
  }
543
544
  // Horizontal sum over 5-pixel regions of dst
545
650k
  for (i = 0; i < height; ++i) {
546
637k
    a = dst[i * dst_stride];
547
637k
    b = dst[i * dst_stride + 1];
548
637k
    c = dst[i * dst_stride + 2];
549
637k
    d = dst[i * dst_stride + 3];
550
637k
    e = dst[i * dst_stride + 4];
551
552
637k
    dst[i * dst_stride] = a + b + c;
553
637k
    dst[i * dst_stride + 1] = a + b + c + d;
554
16.4M
    for (j = 2; j < width - 3; ++j) {
555
      // Loop invariant: At the start of each iteration,
556
      // a = src[i * src_stride + (j - 2)]
557
      // b = src[i * src_stride + (j - 1)]
558
      // c = src[i * src_stride + (j    )]
559
      // d = src[i * src_stride + (j + 1)]
560
      // e = src[i * src_stride + (j + 2)]
561
15.7M
      dst[i * dst_stride + j] = a + b + c + d + e;
562
15.7M
      a = b;
563
15.7M
      b = c;
564
15.7M
      c = d;
565
15.7M
      d = e;
566
15.7M
      e = dst[i * dst_stride + (j + 3)];
567
15.7M
    }
568
637k
    dst[i * dst_stride + j] = a + b + c + d + e;
569
637k
    dst[i * dst_stride + (j + 1)] = b + c + d + e;
570
637k
    dst[i * dst_stride + (j + 2)] = c + d + e;
571
637k
  }
572
13.3k
}
573
574
// Dispatches to the box-sum implementation for radius r: boxsum1() for
// r == 1 and boxsum2() for r == 2. Any other radius is a programming error
// and trips an assertion; with assertions disabled it does nothing.
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                   int sqr, int32_t *dst, int dst_stride) {
  switch (r) {
    case 1:
      boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    case 2:
      boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    default:
      assert(0 && "Invalid value of r in self-guided filter");
      break;
  }
}
583
584
8.89k
void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
585
8.89k
  if (params->r[0] == 0) {
586
2.22k
    xq[0] = 0;
587
2.22k
    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
588
6.67k
  } else if (params->r[1] == 0) {
589
1.95k
    xq[0] = xqd[0];
590
1.95k
    xq[1] = 0;
591
4.71k
  } else {
592
4.71k
    xq[0] = xqd[0];
593
4.71k
    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
594
4.71k
  }
595
8.89k
}
596
597
// Lookup table used by the self-guided filter to map z in [0, 255] to a blend
// factor in [1, 256].
// Special case: entry 0 maps to 1 (corresponding to a value of 1/256)
// instead of 0. See the comments in calculate_intermediate_result() for why.
const int32_t av1_x_by_xplus1[256] = {
  // 0 .. 15
  1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, 240,
  // 16 .. 31
  241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247, 248,
  248,
  // 32 .. 47
  248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250, 250, 251,
  251,
  // 48 .. 63
  251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252, 252, 252, 252,
  252,
  // 64 .. 79
  252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253, 253, 253, 253, 253,
  253,
  // 80 .. 95
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
  253,
  // 96 .. 111
  253, 253, 253, 253, 253, 253, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254,
  // 112 .. 127
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254,
  // 128 .. 143
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254,
  // 144 .. 159
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254,
  // 160 .. 175
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 255, 255, 255, 255, 255,
  255,
  // 176 .. 191
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255,
  // 192 .. 207
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255,
  // 208 .. 223
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255,
  // 224 .. 239
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255,
  // 240 .. 255
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  256,
};
619
620
// Reciprocal table: entry n - 1 holds round(2^12 / n), for n = 1 .. 25.
const int32_t av1_one_by_x[MAX_NELEM] = {
  4096, 2048, 1365, 1024, 819,  // n = 1 .. 5
  683,  585,  512,  455,  410,  // n = 6 .. 10
  372,  341,  315,  293,  273,  // n = 11 .. 15
  256,  241,  228,  216,  205,  // n = 16 .. 20
  195,  186,  178,  171,  164,  // n = 21 .. 25
};
624
625
static void calculate_intermediate_result(int32_t *dgd, int width, int height,
626
                                          int dgd_stride, int bit_depth,
627
                                          int sgr_params_idx, int radius_idx,
628
13.5k
                                          int pass, int32_t *A, int32_t *B) {
629
13.5k
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
630
13.5k
  const int r = params->r[radius_idx];
631
13.5k
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
632
13.5k
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
633
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
634
  // leading to a significant speed improvement.
635
  // We also align the stride to a multiple of 16 bytes, for consistency
636
  // with the SIMD version of this function.
637
13.5k
  int buf_stride = ((width_ext + 3) & ~3) + 16;
638
13.5k
  const int step = pass == 0 ? 1 : 2;
639
13.5k
  int i, j;
640
641
13.5k
  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
642
13.5k
  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
643
13.5k
         "Need SGRPROJ_BORDER_* >= r+1");
644
645
13.5k
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
646
13.5k
         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
647
13.5k
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
648
13.5k
         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
649
13.5k
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
650
13.5k
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
651
  // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
652
  // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
653
504k
  for (i = -1; i < height + 1; i += step) {
654
12.6M
    for (j = -1; j < width + 1; ++j) {
655
12.1M
      const int k = i * buf_stride + j;
656
12.1M
      const int n = (2 * r + 1) * (2 * r + 1);
657
658
      // a < 2^16 * n < 2^22 regardless of bit depth
659
12.1M
      uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
660
      // b < 2^8 * n < 2^14 regardless of bit depth
661
12.1M
      uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
662
663
      // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
664
      // and p itself satisfies p < 2^14 * n^2 < 2^26.
665
      // This bound on p is due to:
666
      // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
667
      //
668
      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
669
      // This is an artefact of rounding, and can only happen if all pixels
670
      // are (almost) identical, so in this case we saturate to p=0.
671
12.1M
      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
672
673
12.1M
      const uint32_t s = params->s[radius_idx];
674
675
      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
676
      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
677
      // (this holds even after accounting for the rounding in s)
678
12.1M
      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
679
680
      // Note: We have to be quite careful about the value of A[k].
681
      // This is used as a blend factor between individual pixel values and the
682
      // local mean. So it logically has a range of [0, 256], including both
683
      // endpoints.
684
      //
685
      // This is a pain for hardware, as we'd like something which can be stored
686
      // in exactly 8 bits.
687
      // Further, in the calculation of B[k] below, if z == 0 and r == 2,
688
      // then A[k] "should be" 0. But then we can end up setting B[k] to a value
689
      // slightly above 2^(8 + bit depth), due to rounding in the value of
690
      // av1_one_by_x[25-1].
691
      //
692
      // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
693
      // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
694
      // overflow), without significantly affecting the final result: z == 0
695
      // implies that the image is essentially "flat", so the local mean and
696
      // individual pixel values are very similar.
697
      //
698
      // Note that saturating on the other side, ie. requring A[k] <= 255,
699
      // would be a bad idea, as that corresponds to the case where the image
700
      // is very variable, when we want to preserve the local pixel value as
701
      // much as possible.
702
12.1M
      A[k] = av1_x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
703
704
      // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
705
      // av1_one_by_x[n - 1] = round(2^12 / n)
706
      // => the product here is < 2^(20 + bit_depth) <= 2^32,
707
      // and B[k] is set to a value < 2^(8 + bit depth)
708
      // This holds even with the rounding in av1_one_by_x and in the overall
709
      // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
710
12.1M
      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
711
12.1M
                                             (uint32_t)B[k] *
712
12.1M
                                             (uint32_t)av1_one_by_x[n - 1],
713
12.1M
                                         SGRPROJ_RECIP_BITS);
714
12.1M
    }
715
490k
  }
716
13.5k
}
717
718
static void selfguided_restoration_fast_internal(
719
    int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
720
6.66k
    int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
721
6.66k
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
722
6.66k
  const int r = params->r[radius_idx];
723
6.66k
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
724
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
725
  // leading to a significant speed improvement.
726
  // We also align the stride to a multiple of 16 bytes, for consistency
727
  // with the SIMD version of this function.
728
6.66k
  int buf_stride = ((width_ext + 3) & ~3) + 16;
729
6.66k
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
730
6.66k
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
731
6.66k
  int32_t *A = A_;
732
6.66k
  int32_t *B = B_;
733
6.66k
  int i, j;
734
6.66k
  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
735
6.66k
                                sgr_params_idx, radius_idx, 1, A, B);
736
6.66k
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
737
6.66k
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
738
739
  // Use the A[] and B[] arrays to calculate the filtered image
740
6.66k
  (void)r;
741
6.66k
  assert(r == 2);
742
320k
  for (i = 0; i < height; ++i) {
743
313k
    if (!(i & 1)) {  // even row
744
3.82M
      for (j = 0; j < width; ++j) {
745
3.66M
        const int k = i * buf_stride + j;
746
3.66M
        const int l = i * dgd_stride + j;
747
3.66M
        const int m = i * dst_stride + j;
748
3.66M
        const int nb = 5;
749
3.66M
        const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
750
3.66M
                          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
751
3.66M
                           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
752
3.66M
                              5;
753
3.66M
        const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
754
3.66M
                          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
755
3.66M
                           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
756
3.66M
                              5;
757
3.66M
        const int32_t v = a * dgd[l] + b;
758
3.66M
        dst[m] =
759
3.66M
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
760
3.66M
      }
761
157k
    } else {  // odd row
762
3.83M
      for (j = 0; j < width; ++j) {
763
3.67M
        const int k = i * buf_stride + j;
764
3.67M
        const int l = i * dgd_stride + j;
765
3.67M
        const int m = i * dst_stride + j;
766
3.67M
        const int nb = 4;
767
3.67M
        const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
768
3.67M
        const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
769
3.67M
        const int32_t v = a * dgd[l] + b;
770
3.67M
        dst[m] =
771
3.67M
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
772
3.67M
      }
773
156k
    }
774
313k
  }
775
6.66k
}
776
777
static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
778
                                            int dgd_stride, int32_t *dst,
779
                                            int dst_stride, int bit_depth,
780
                                            int sgr_params_idx,
781
6.93k
                                            int radius_idx) {
782
6.93k
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
783
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
784
  // leading to a significant speed improvement.
785
  // We also align the stride to a multiple of 16 bytes, for consistency
786
  // with the SIMD version of this function.
787
6.93k
  int buf_stride = ((width_ext + 3) & ~3) + 16;
788
6.93k
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
789
6.93k
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
790
6.93k
  int32_t *A = A_;
791
6.93k
  int32_t *B = B_;
792
6.93k
  int i, j;
793
6.93k
  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
794
6.93k
                                sgr_params_idx, radius_idx, 0, A, B);
795
6.93k
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
796
6.93k
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
797
798
  // Use the A[] and B[] arrays to calculate the filtered image
799
323k
  for (i = 0; i < height; ++i) {
800
7.49M
    for (j = 0; j < width; ++j) {
801
7.17M
      const int k = i * buf_stride + j;
802
7.17M
      const int l = i * dgd_stride + j;
803
7.17M
      const int m = i * dst_stride + j;
804
7.17M
      const int nb = 5;
805
7.17M
      const int32_t a =
806
7.17M
          (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
807
7.17M
              4 +
808
7.17M
          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
809
7.17M
           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
810
7.17M
              3;
811
7.17M
      const int32_t b =
812
7.17M
          (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
813
7.17M
              4 +
814
7.17M
          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
815
7.17M
           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
816
7.17M
              3;
817
7.17M
      const int32_t v = a * dgd[l] + b;
818
7.17M
      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
819
7.17M
    }
820
316k
  }
821
6.93k
}
822
823
int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
824
                                 int dgd_stride, int32_t *flt0, int32_t *flt1,
825
                                 int flt_stride, int sgr_params_idx,
826
8.88k
                                 int bit_depth, int highbd) {
827
8.88k
  int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
828
8.88k
  const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
829
8.88k
  int32_t *dgd32 =
830
8.88k
      dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
831
832
8.88k
  if (highbd) {
833
4.75k
    const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
834
224k
    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
835
8.49M
      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
836
8.27M
        dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
837
8.27M
      }
838
219k
    }
839
4.75k
  } else {
840
226k
    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
841
5.19M
      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
842
4.97M
        dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
843
4.97M
      }
844
222k
    }
845
4.12k
  }
846
847
8.88k
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
848
  // If params->r == 0 we skip the corresponding filter. We only allow one of
849
  // the radii to be 0, as having both equal to 0 would be equivalent to
850
  // skipping SGR entirely.
851
8.88k
  assert(!(params->r[0] == 0 && params->r[1] == 0));
852
853
8.88k
  if (params->r[0] > 0)
854
6.66k
    selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
855
6.66k
                                         flt0, flt_stride, bit_depth,
856
6.66k
                                         sgr_params_idx, 0);
857
8.88k
  if (params->r[1] > 0)
858
6.93k
    selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
859
6.93k
                                    flt_stride, bit_depth, sgr_params_idx, 1);
860
8.88k
  return 0;
861
8.88k
}
862
863
int av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
864
                                       int height, int stride, int eps,
865
                                       const int *xqd, uint8_t *dst8,
866
                                       int dst_stride, int32_t *tmpbuf,
867
8.88k
                                       int bit_depth, int highbd) {
868
8.88k
  int32_t *flt0 = tmpbuf;
869
8.88k
  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
870
8.88k
  assert(width * height <= RESTORATION_UNITPELS_MAX);
871
872
8.88k
  const int ret = av1_selfguided_restoration_c(
873
8.88k
      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
874
8.88k
  if (ret != 0) return ret;
875
8.88k
  const sgr_params_type *const params = &av1_sgr_params[eps];
876
8.88k
  int xq[2];
877
8.88k
  av1_decode_xq(xqd, xq, params);
878
376k
  for (int i = 0; i < height; ++i) {
879
8.44M
    for (int j = 0; j < width; ++j) {
880
8.07M
      const int k = i * width + j;
881
8.07M
      uint8_t *dst8ij = dst8 + i * dst_stride + j;
882
8.07M
      const uint8_t *dat8ij = dat8 + i * stride + j;
883
884
8.07M
      const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
885
8.07M
      const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
886
8.07M
      int32_t v = u << SGRPROJ_PRJ_BITS;
887
      // If params->r == 0 then we skipped the filtering in
888
      // av1_selfguided_restoration_c, i.e. flt[k] == u
889
8.07M
      if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
890
8.07M
      if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
891
8.07M
      const int16_t w =
892
8.07M
          (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
893
894
8.07M
      const uint16_t out = clip_pixel_highbd(w, bit_depth);
895
8.07M
      if (highbd)
896
5.51M
        *CONVERT_TO_SHORTPTR(dst8ij) = out;
897
2.56M
      else
898
2.56M
        *dst8ij = (uint8_t)out;
899
8.07M
    }
900
367k
  }
901
8.88k
  return 0;
902
8.88k
}
903
904
static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
905
                                  int stripe_width, int stripe_height,
906
                                  int procunit_width, const uint8_t *src,
907
                                  int src_stride, uint8_t *dst, int dst_stride,
908
                                  int32_t *tmpbuf, int bit_depth,
909
3.78k
                                  struct aom_internal_error_info *error_info) {
910
3.78k
  (void)bit_depth;
911
3.78k
  assert(bit_depth == 8);
912
913
7.90k
  for (int j = 0; j < stripe_width; j += procunit_width) {
914
4.12k
    int w = AOMMIN(procunit_width, stripe_width - j);
915
4.12k
    if (av1_apply_selfguided_restoration(
916
4.12k
            src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
917
4.12k
            rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth,
918
4.12k
            0) != 0) {
919
0
      aom_internal_error(
920
0
          error_info, AOM_CODEC_MEM_ERROR,
921
0
          "Error allocating buffer in av1_apply_selfguided_restoration");
922
0
    }
923
4.12k
  }
924
3.78k
}
925
926
#if CONFIG_AV1_HIGHBITDEPTH
927
static void wiener_filter_stripe_highbd(
928
    const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
929
    int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
930
    int dst_stride, int32_t *tmpbuf, int bit_depth,
931
9.60k
    struct aom_internal_error_info *error_info) {
932
9.60k
  (void)tmpbuf;
933
9.60k
  (void)error_info;
934
9.60k
  const WienerConvolveParams conv_params = get_conv_params_wiener(bit_depth);
935
936
19.7k
  for (int j = 0; j < stripe_width; j += procunit_width) {
937
10.1k
    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
938
10.1k
    const uint8_t *src8_p = src8 + j;
939
10.1k
    uint8_t *dst8_p = dst8 + j;
940
10.1k
    av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
941
10.1k
                                       rui->wiener_info.hfilter, 16,
942
10.1k
                                       rui->wiener_info.vfilter, 16, w,
943
10.1k
                                       stripe_height, &conv_params, bit_depth);
944
10.1k
  }
945
9.60k
}
946
947
static void sgrproj_filter_stripe_highbd(
948
    const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
949
    int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
950
    int dst_stride, int32_t *tmpbuf, int bit_depth,
951
2.93k
    struct aom_internal_error_info *error_info) {
952
7.69k
  for (int j = 0; j < stripe_width; j += procunit_width) {
953
4.75k
    int w = AOMMIN(procunit_width, stripe_width - j);
954
4.75k
    if (av1_apply_selfguided_restoration(
955
4.75k
            src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
956
4.75k
            rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth,
957
4.75k
            1) != 0) {
958
0
      aom_internal_error(
959
0
          error_info, AOM_CODEC_MEM_ERROR,
960
0
          "Error allocating buffer in av1_apply_selfguided_restoration");
961
0
    }
962
4.75k
  }
963
2.93k
}
964
#endif  // CONFIG_AV1_HIGHBITDEPTH
965
966
typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
967
                                  int stripe_width, int stripe_height,
968
                                  int procunit_width, const uint8_t *src,
969
                                  int src_stride, uint8_t *dst, int dst_stride,
970
                                  int32_t *tmpbuf, int bit_depth,
971
                                  struct aom_internal_error_info *error_info);
972
973
#if CONFIG_AV1_HIGHBITDEPTH
974
#define NUM_STRIPE_FILTERS 4
975
static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
976
  wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
977
  sgrproj_filter_stripe_highbd
978
};
979
#else
980
#define NUM_STRIPE_FILTERS 2
981
static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
982
  wiener_filter_stripe, sgrproj_filter_stripe
983
};
984
#endif  // CONFIG_AV1_HIGHBITDEPTH
985
986
// Filter one restoration unit
987
void av1_loop_restoration_filter_unit(
988
    const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
989
    const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
990
    int plane_w, int plane_h, int ss_x, int ss_y, int highbd, int bit_depth,
991
    uint8_t *data8, int stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf,
992
19.3k
    int optimized_lr, struct aom_internal_error_info *error_info) {
993
19.3k
  RestorationType unit_rtype = rui->restoration_type;
994
995
19.3k
  int unit_h = limits->v_end - limits->v_start;
996
19.3k
  int unit_w = limits->h_end - limits->h_start;
997
19.3k
  uint8_t *data8_tl =
998
19.3k
      data8 + limits->v_start * (ptrdiff_t)stride + limits->h_start;
999
19.3k
  uint8_t *dst8_tl =
1000
19.3k
      dst8 + limits->v_start * (ptrdiff_t)dst_stride + limits->h_start;
1001
1002
19.3k
  if (unit_rtype == RESTORE_NONE) {
1003
8.62k
    copy_rest_unit(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride,
1004
8.62k
                   highbd);
1005
8.62k
    return;
1006
8.62k
  }
1007
1008
10.6k
  const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
1009
10.6k
  assert(filter_idx < NUM_STRIPE_FILTERS);
1010
10.6k
  const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
1011
1012
10.6k
  const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
1013
1014
  // Filter the whole image one stripe at a time
1015
10.6k
  RestorationTileLimits remaining_stripes = *limits;
1016
10.6k
  int i = 0;
1017
30.8k
  while (i < unit_h) {
1018
20.1k
    int copy_above, copy_below;
1019
20.1k
    remaining_stripes.v_start = limits->v_start + i;
1020
1021
20.1k
    get_stripe_boundary_info(&remaining_stripes, plane_w, plane_h, ss_y,
1022
20.1k
                             &copy_above, &copy_below);
1023
1024
20.1k
    const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1025
20.1k
    const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
1026
1027
    // Work out where this stripe's boundaries are within
1028
    // rsb->stripe_boundary_{above,below}
1029
20.1k
    const int frame_stripe =
1030
20.1k
        (remaining_stripes.v_start + runit_offset) / full_stripe_height;
1031
20.1k
    const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
1032
1033
    // Calculate this stripe's height, based on two rules:
1034
    // * The topmost stripe in the frame is 8 luma pixels shorter than usual.
1035
    // * We can't extend past the end of the current restoration unit
1036
20.1k
    const int nominal_stripe_height =
1037
20.1k
        full_stripe_height - ((frame_stripe == 0) ? runit_offset : 0);
1038
20.1k
    const int h = AOMMIN(nominal_stripe_height,
1039
20.1k
                         remaining_stripes.v_end - remaining_stripes.v_start);
1040
1041
20.1k
    setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
1042
20.1k
                                     h, data8, stride, rlbs, copy_above,
1043
20.1k
                                     copy_below, optimized_lr);
1044
1045
20.1k
    stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
1046
20.1k
                  dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth,
1047
20.1k
                  error_info);
1048
1049
20.1k
    restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
1050
20.1k
                                       data8, stride, copy_above, copy_below,
1051
20.1k
                                       optimized_lr);
1052
1053
20.1k
    i += h;
1054
20.1k
  }
1055
10.6k
}
1056
1057
static void filter_frame_on_unit(const RestorationTileLimits *limits,
1058
                                 int rest_unit_idx, void *priv, int32_t *tmpbuf,
1059
                                 RestorationLineBuffers *rlbs,
1060
19.3k
                                 struct aom_internal_error_info *error_info) {
1061
19.3k
  FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
1062
19.3k
  const RestorationInfo *rsi = ctxt->rsi;
1063
1064
19.3k
  av1_loop_restoration_filter_unit(
1065
19.3k
      limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs,
1066
19.3k
      ctxt->plane_w, ctxt->plane_h, ctxt->ss_x, ctxt->ss_y, ctxt->highbd,
1067
19.3k
      ctxt->bit_depth, ctxt->data8, ctxt->data_stride, ctxt->dst8,
1068
19.3k
      ctxt->dst_stride, tmpbuf, rsi->optimized_lr, error_info);
1069
19.3k
}
1070
1071
void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
1072
                                            YV12_BUFFER_CONFIG *frame,
1073
                                            AV1_COMMON *cm, int optimized_lr,
1074
3.23k
                                            int num_planes) {
1075
3.23k
  const SequenceHeader *const seq_params = cm->seq_params;
1076
3.23k
  const int bit_depth = seq_params->bit_depth;
1077
3.23k
  const int highbd = seq_params->use_highbitdepth;
1078
3.23k
  lr_ctxt->dst = &cm->rst_frame;
1079
1080
3.23k
  const int frame_width = frame->crop_widths[0];
1081
3.23k
  const int frame_height = frame->crop_heights[0];
1082
3.23k
  if (aom_realloc_frame_buffer(
1083
3.23k
          lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
1084
3.23k
          seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
1085
3.23k
          cm->features.byte_alignment, NULL, NULL, NULL, false,
1086
3.23k
          0) != AOM_CODEC_OK)
1087
0
    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
1088
0
                       "Failed to allocate restoration dst buffer");
1089
1090
3.23k
  lr_ctxt->on_rest_unit = filter_frame_on_unit;
1091
3.23k
  lr_ctxt->frame = frame;
1092
12.6k
  for (int plane = 0; plane < num_planes; ++plane) {
1093
9.38k
    RestorationInfo *rsi = &cm->rst_info[plane];
1094
9.38k
    RestorationType rtype = rsi->frame_restoration_type;
1095
9.38k
    rsi->optimized_lr = optimized_lr;
1096
9.38k
    lr_ctxt->ctxt[plane].rsi = rsi;
1097
1098
9.38k
    if (rtype == RESTORE_NONE) {
1099
3.32k
      continue;
1100
3.32k
    }
1101
1102
6.05k
    const int is_uv = plane > 0;
1103
6.05k
    int plane_w, plane_h;
1104
6.05k
    av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
1105
6.05k
    assert(plane_w == frame->crop_widths[is_uv]);
1106
6.05k
    assert(plane_h == frame->crop_heights[is_uv]);
1107
1108
6.05k
    av1_extend_frame(frame->buffers[plane], plane_w, plane_h,
1109
6.05k
                     frame->strides[is_uv], RESTORATION_BORDER,
1110
6.05k
                     RESTORATION_BORDER, highbd);
1111
1112
6.05k
    FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
1113
6.05k
    lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
1114
6.05k
    lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
1115
6.05k
    lr_plane_ctxt->plane_w = plane_w;
1116
6.05k
    lr_plane_ctxt->plane_h = plane_h;
1117
6.05k
    lr_plane_ctxt->highbd = highbd;
1118
6.05k
    lr_plane_ctxt->bit_depth = bit_depth;
1119
6.05k
    lr_plane_ctxt->data8 = frame->buffers[plane];
1120
6.05k
    lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
1121
6.05k
    lr_plane_ctxt->data_stride = frame->strides[is_uv];
1122
6.05k
    lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
1123
6.05k
  }
1124
3.23k
}
1125
1126
static void loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
1127
10
                                         AV1_COMMON *cm, int num_planes) {
1128
10
  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
1129
10
                           YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
1130
10
                           int vstart, int vend);
1131
10
  static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y,
1132
10
                                         aom_yv12_partial_coloc_copy_u,
1133
10
                                         aom_yv12_partial_coloc_copy_v };
1134
10
  assert(num_planes <= 3);
1135
40
  for (int plane = 0; plane < num_planes; ++plane) {
1136
30
    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
1137
10
    FilterFrameCtxt *lr_plane_ctxt = &loop_rest_ctxt->ctxt[plane];
1138
10
    copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, 0,
1139
10
                     lr_plane_ctxt->plane_w, 0, lr_plane_ctxt->plane_h);
1140
10
  }
1141
10
}
1142
1143
// Call on_rest_unit for each loop restoration unit in the plane.
1144
static void foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
1145
                                       rest_unit_visitor_t on_rest_unit,
1146
                                       void *priv, int32_t *tmpbuf,
1147
10
                                       RestorationLineBuffers *rlbs) {
1148
10
  const RestorationInfo *rsi = &cm->rst_info[plane];
1149
10
  const int hnum_rest_units = rsi->horz_units;
1150
10
  const int vnum_rest_units = rsi->vert_units;
1151
10
  const int unit_size = rsi->restoration_unit_size;
1152
1153
10
  const int is_uv = plane > 0;
1154
10
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
1155
10
  const int ext_size = unit_size * 3 / 2;
1156
10
  int plane_w, plane_h;
1157
10
  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
1158
1159
10
  int y0 = 0, i = 0;
1160
20
  while (y0 < plane_h) {
1161
10
    int remaining_h = plane_h - y0;
1162
10
    int h = (remaining_h < ext_size) ? remaining_h : unit_size;
1163
1164
10
    RestorationTileLimits limits;
1165
10
    limits.v_start = y0;
1166
10
    limits.v_end = y0 + h;
1167
10
    assert(limits.v_end <= plane_h);
1168
    // Offset upwards to align with the restoration processing stripe
1169
10
    const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
1170
10
    limits.v_start = AOMMAX(0, limits.v_start - voffset);
1171
10
    if (limits.v_end < plane_h) limits.v_end -= voffset;
1172
1173
10
    av1_foreach_rest_unit_in_row(&limits, plane_w, on_rest_unit, i, unit_size,
1174
10
                                 hnum_rest_units, vnum_rest_units, plane, priv,
1175
10
                                 tmpbuf, rlbs, av1_lr_sync_read_dummy,
1176
10
                                 av1_lr_sync_write_dummy, NULL, cm->error);
1177
1178
10
    y0 += h;
1179
10
    ++i;
1180
10
  }
1181
10
}
1182
1183
// Runs the installed per-unit visitor over every restoration unit of every
// plane whose frame restoration type is not RESTORE_NONE, using the shared
// scratch buffer and line buffers stored on `cm`.
static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
                                        int num_planes) {
  FilterFrameCtxt *const plane_ctxts = lr_ctxt->ctxt;

  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
      foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
                                 plane_ctxts + plane, cm->rst_tmpbuf, cm->rlbs);
    }
  }
}
1196
1197
void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
1198
                                       AV1_COMMON *cm, int optimized_lr,
1199
10
                                       void *lr_ctxt) {
1200
10
  assert(!cm->features.all_lossless);
1201
10
  const int num_planes = av1_num_planes(cm);
1202
1203
10
  AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
1204
1205
10
  av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
1206
10
                                         optimized_lr, num_planes);
1207
1208
10
  foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes);
1209
1210
10
  loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes);
1211
10
}
1212
1213
void av1_foreach_rest_unit_in_row(
1214
    RestorationTileLimits *limits, int plane_w,
1215
    rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
1216
    int hnum_rest_units, int vnum_rest_units, int plane, void *priv,
1217
    int32_t *tmpbuf, RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read,
1218
    sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync,
1219
18.1k
    struct aom_internal_error_info *error_info) {
1220
18.1k
  const int ext_size = unit_size * 3 / 2;
1221
18.1k
  int x0 = 0, j = 0;
1222
37.4k
  while (x0 < plane_w) {
1223
19.3k
    int remaining_w = plane_w - x0;
1224
19.3k
    int w = (remaining_w < ext_size) ? remaining_w : unit_size;
1225
1226
19.3k
    limits->h_start = x0;
1227
19.3k
    limits->h_end = x0 + w;
1228
19.3k
    assert(limits->h_end <= plane_w);
1229
1230
19.3k
    const int unit_idx = row_number * hnum_rest_units + j;
1231
1232
    // No sync for even numbered rows
1233
    // For odd numbered rows, Loop Restoration of current block requires the LR
1234
    // of top-right and bottom-right blocks to be completed
1235
1236
    // top-right sync
1237
19.3k
    on_sync_read(lr_sync, row_number, j, plane);
1238
19.3k
    if ((row_number + 1) < vnum_rest_units)
1239
      // bottom-right sync
1240
12.3k
      on_sync_read(lr_sync, row_number + 2, j, plane);
1241
1242
19.3k
#if CONFIG_MULTITHREAD
1243
19.3k
    if (lr_sync && lr_sync->num_workers > 1) {
1244
19.1k
      pthread_mutex_lock(lr_sync->job_mutex);
1245
19.1k
      const bool lr_mt_exit = lr_sync->lr_mt_exit;
1246
19.1k
      pthread_mutex_unlock(lr_sync->job_mutex);
1247
      // Exit in case any worker has encountered an error.
1248
19.1k
      if (lr_mt_exit) return;
1249
19.1k
    }
1250
19.3k
#endif
1251
1252
19.3k
    on_rest_unit(limits, unit_idx, priv, tmpbuf, rlbs, error_info);
1253
1254
19.3k
    on_sync_write(lr_sync, row_number, j, hnum_rest_units, plane);
1255
1256
19.3k
    x0 += w;
1257
19.3k
    ++j;
1258
19.3k
  }
1259
18.1k
}
1260
1261
19.3k
// No-op read-side sync callback, used when loop restoration runs without
// row synchronisation. All arguments are ignored.
void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
  (void)lr_sync, (void)r, (void)c, (void)plane;
}
1267
1268
// No-op write-side sync callback, used when loop restoration runs without
// row synchronisation. All arguments are ignored.
void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
                             const int sb_cols, int plane) {
  (void)lr_sync, (void)r, (void)c, (void)sb_cols, (void)plane;
}
1276
1277
// Computes the half-open range of restoration units, [*rcol0, *rcol1) x
// [*rrow0, *rrow1), whose top-left corners fall inside the superblock at
// (mi_row, mi_col) on `plane`.
//
// Returns 0 without touching the outputs when `bsize` is not the sequence's
// superblock size. Otherwise writes all four outputs and returns non-zero iff
// the range is non-empty. All four output pointers must be non-NULL.
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
                                       int *rcol0, int *rcol1, int *rrow0,
                                       int *rrow1) {
  assert(rcol0 && rcol1 && rrow0 && rrow1);

  if (bsize != cm->seq_params->sb_size) return 0;

  assert(!cm->features.all_lossless);

  const RestorationInfo *const rsi = &cm->rst_info[plane];
  const int unit_size = rsi->restoration_unit_size;

  // The size of an MI-unit on this plane of the image.
  const int is_uv = plane > 0;
  const int ss_x = is_uv && cm->seq_params->subsampling_x;
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
  const int mi_size_x = MI_SIZE >> ss_x;
  const int mi_size_y = MI_SIZE >> ss_y;

  // Write m for the relative mi column or row, D for the superres denominator
  // and N for the superres numerator. If u is the upscaled pixel offset then
  // the downscaled pixel offset can be written in two ways:
  //
  //   MI_SIZE * m = N / D u
  //
  // from which we get u = D * MI_SIZE * m / N. Only the horizontal direction
  // is affected by superres.
  int mi_to_num_x = mi_size_x;
  int denom_x = unit_size;
  if (av1_superres_scaled(cm)) {
    mi_to_num_x = mi_size_x * cm->superres_scale_denominator;
    denom_x = unit_size * SCALE_NUMERATOR;
  }
  const int mi_to_num_y = mi_size_y;
  const int denom_y = unit_size;

  // Adding (denominator - 1) before dividing rounds the quotient up.
  const int rnd_x = denom_x - 1;
  const int rnd_y = denom_y - 1;

  // The mi-unit corners of the superblock below-right of this one.
  const int mi_row_end = mi_row + mi_size_high[bsize];
  const int mi_col_end = mi_col + mi_size_wide[bsize];

  // rcol0/rrow0 should be the first column/row of restoration units that
  // doesn't start left/below of mi_col/mi_row, hence the rounding up (if the
  // sb starts at runit column 10.1, the first matching runit has column
  // index 11).
  *rcol0 = (mi_col * mi_to_num_x + rnd_x) / denom_x;
  *rrow0 = (mi_row * mi_to_num_y + rnd_y) / denom_y;

  // The same calculation for the superblock below-right. At the bottom or
  // right of the frame that restoration unit might not exist, so clamp to the
  // number of units actually present.
  const int col_end = (mi_col_end * mi_to_num_x + rnd_x) / denom_x;
  const int row_end = (mi_row_end * mi_to_num_y + rnd_y) / denom_y;
  *rcol1 = (col_end < rsi->horz_units) ? col_end : rsi->horz_units;
  *rrow1 = (row_end < rsi->vert_units) ? row_end : rsi->vert_units;

  return *rcol0 < *rcol1 && *rrow0 < *rrow1;
}
1338
1339
// Extend to left and right: for each of `height` rows, replicate the row's
// first sample into the `extend` samples before it and its last sample into
// the `extend` samples after it. `buf` points at the first sample of the first
// row and is advanced by `stride` bytes per row; with `use_highbitdepth` set
// the rows are accessed as 16-bit samples.
static void extend_lines(uint8_t *buf, int width, int height, int stride,
                         int extend, int use_highbitdepth) {
  uint8_t *row = buf;
  for (int rows_left = height; rows_left > 0; --rows_left, row += stride) {
    if (!use_highbitdepth) {
      memset(row - extend, row[0], extend);
      memset(row + width, row[width - 1], extend);
      continue;
    }
    uint16_t *const row16 = (uint16_t *)row;
    aom_memset16(row16 - extend, row16[0], extend);
    aom_memset16(row16 + width, row16[width - 1], extend);
  }
}
1354
1355
static void save_deblock_boundary_lines(
1356
    const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
1357
    int stripe, int use_highbd, int is_above,
1358
72.9k
    RestorationStripeBoundaries *boundaries) {
1359
72.9k
  const int is_uv = plane > 0;
1360
72.9k
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
1361
72.9k
  const int src_stride = frame->strides[is_uv] << use_highbd;
1362
72.9k
  const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride;
1363
1364
72.9k
  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
1365
72.9k
                               : boundaries->stripe_boundary_below;
1366
72.9k
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
1367
72.9k
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
1368
72.9k
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
1369
1370
  // There is a rare case in which a processing stripe can end 1px above the
1371
  // crop border. In this case, we do want to use deblocked pixels from below
1372
  // the stripe (hence why we ended up in this function), but instead of
1373
  // fetching 2 "below" rows we need to fetch one and duplicate it.
1374
  // This is equivalent to clamping the sample locations against the crop border
1375
72.9k
  const int lines_to_save =
1376
72.9k
      AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row);
1377
72.9k
  assert(lines_to_save == 1 || lines_to_save == 2);
1378
1379
72.9k
  int upscaled_width;
1380
72.9k
  int line_bytes;
1381
72.9k
  if (av1_superres_scaled(cm)) {
1382
62.9k
    const int ss_x = is_uv && cm->seq_params->subsampling_x;
1383
62.9k
    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
1384
62.9k
    line_bytes = upscaled_width << use_highbd;
1385
62.9k
    if (use_highbd)
1386
49.1k
      av1_upscale_normative_rows(
1387
49.1k
          cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv],
1388
49.1k
          CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride,
1389
49.1k
          plane, lines_to_save);
1390
13.7k
    else
1391
13.7k
      av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows,
1392
13.7k
                                 boundaries->stripe_boundary_stride, plane,
1393
13.7k
                                 lines_to_save);
1394
62.9k
  } else {
1395
9.96k
    upscaled_width = frame->crop_widths[is_uv];
1396
9.96k
    line_bytes = upscaled_width << use_highbd;
1397
29.8k
    for (int i = 0; i < lines_to_save; i++) {
1398
19.8k
      memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride,
1399
19.8k
             line_bytes);
1400
19.8k
    }
1401
9.96k
  }
1402
  // If we only saved one line, then copy it into the second line buffer
1403
72.9k
  if (lines_to_save == 1)
1404
107
    memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
1405
1406
72.9k
  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
1407
72.9k
               RESTORATION_EXTRA_HORZ, use_highbd);
1408
72.9k
}
1409
1410
static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1411
                                     const AV1_COMMON *cm, int plane, int row,
1412
                                     int stripe, int use_highbd, int is_above,
1413
17.4k
                                     RestorationStripeBoundaries *boundaries) {
1414
17.4k
  const int is_uv = plane > 0;
1415
17.4k
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
1416
17.4k
  const int src_stride = frame->strides[is_uv] << use_highbd;
1417
17.4k
  const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride;
1418
1419
17.4k
  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
1420
17.4k
                               : boundaries->stripe_boundary_below;
1421
17.4k
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
1422
17.4k
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
1423
17.4k
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
1424
17.4k
  const int src_width = frame->crop_widths[is_uv];
1425
1426
  // At the point where this function is called, we've already applied
1427
  // superres. So we don't need to extend the lines here, we can just
1428
  // pull directly from the topmost row of the upscaled frame.
1429
17.4k
  const int ss_x = is_uv && cm->seq_params->subsampling_x;
1430
17.4k
  const int upscaled_width = av1_superres_scaled(cm)
1431
17.4k
                                 ? (cm->superres_upscaled_width + ss_x) >> ss_x
1432
17.4k
                                 : src_width;
1433
17.4k
  const int line_bytes = upscaled_width << use_highbd;
1434
52.3k
  for (int i = 0; i < RESTORATION_CTX_VERT; i++) {
1435
    // Copy the line at 'src_rows' into both context lines
1436
34.8k
    memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
1437
34.8k
  }
1438
17.4k
  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
1439
17.4k
               RESTORATION_EXTRA_HORZ, use_highbd);
1440
17.4k
}
1441
1442
// Walks the processing stripes of one plane and saves the context rows that
// belong to the current pass:
//  - after_cdef == 0: rows from 'frame' at internal stripe boundaries, via
//    save_deblock_boundary_lines().
//  - after_cdef != 0: rows from 'frame' at the top of the first stripe and at
//    the bottom of any stripe that reaches the bottom of the plane, via
//    save_cdef_boundary_lines().
static void save_boundary_lines(const YV12_BUFFER_CONFIG *frame, int use_highbd,
                                int plane, AV1_COMMON *cm, int after_cdef) {
  const int is_uv = plane > 0;
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
  const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
  const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;

  int plane_w, plane_h;
  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);

  RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries;

  const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);

  int stripe_idx = 0;
  for (;;) {
    // Stripe rows are [y0, y1): shifted up by 'stripe_off' and clamped to the
    // plane.
    const int y0 = AOMMAX(0, stripe_idx * stripe_height - stripe_off);
    if (y0 >= plane_h) {
      break;
    }
    const int y1 =
        AOMMIN((stripe_idx + 1) * stripe_height - stripe_off, plane_h);

    // Frame-edge boundaries take CDEF pixels; internal stripe boundaries take
    // deblocked pixels.
    const int at_frame_top = !(stripe_idx > 0);
    const int at_frame_bottom = !(y1 < plane_height);

    if (after_cdef) {
      // Save CDEF context at frame boundaries
      if (at_frame_top) {
        save_cdef_boundary_lines(frame, cm, plane, y0, stripe_idx, use_highbd,
                                 1, boundaries);
      }
      if (at_frame_bottom) {
        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, stripe_idx,
                                 use_highbd, 0, boundaries);
      }
    } else {
      // Save deblocked context at internal stripe boundaries
      if (!at_frame_top) {
        save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
                                    stripe_idx, use_highbd, 1, boundaries);
      }
      if (!at_frame_bottom) {
        save_deblock_boundary_lines(frame, cm, plane, y1, stripe_idx,
                                    use_highbd, 0, boundaries);
      }
    }

    ++stripe_idx;
  }
}
1493
1494
// For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
1495
// lines to be used as boundary in the loop restoration process. The
1496
// lines are saved in rst_internal.stripe_boundary_lines
1497
void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1498
5.88k
                                              AV1_COMMON *cm, int after_cdef) {
1499
5.88k
  const int num_planes = av1_num_planes(cm);
1500
5.88k
  const int use_highbd = cm->seq_params->use_highbitdepth;
1501
23.3k
  for (int p = 0; p < num_planes; ++p) {
1502
17.4k
    save_boundary_lines(frame, use_highbd, p, cm, after_cdef);
1503
17.4k
  }
1504
5.88k
}