Coverage Report

Created: 2025-07-18 06:57

/src/libavif/ext/aom/av1/common/restoration.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 *
11
 */
12
13
#include <math.h>
14
#include <stddef.h>
15
16
#include "config/aom_config.h"
17
#include "config/aom_scale_rtcd.h"
18
19
#include "aom/internal/aom_codec_internal.h"
20
#include "aom_mem/aom_mem.h"
21
#include "aom_dsp/aom_dsp_common.h"
22
#include "aom_mem/aom_mem.h"
23
#include "aom_ports/mem.h"
24
#include "aom_util/aom_pthread.h"
25
26
#include "av1/common/av1_common_int.h"
27
#include "av1/common/convolve.h"
28
#include "av1/common/enums.h"
29
#include "av1/common/resize.h"
30
#include "av1/common/restoration.h"
31
#include "av1/common/thread_common.h"
32
33
// The 's' values are calculated based on original 'r' and 'e' values in the
34
// spec using GenSgrprojVtable().
35
// Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
36
const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
37
  { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
38
  { { 2, 1 }, { 93, 1618 } },  { { 2, 1 }, { 80, 1438 } },
39
  { { 2, 1 }, { 70, 1295 } },  { { 2, 1 }, { 58, 1177 } },
40
  { { 2, 1 }, { 47, 1079 } },  { { 2, 1 }, { 37, 996 } },
41
  { { 2, 1 }, { 30, 925 } },   { { 2, 1 }, { 25, 863 } },
42
  { { 0, 1 }, { -1, 2589 } },  { { 0, 1 }, { -1, 1618 } },
43
  { { 0, 1 }, { -1, 1177 } },  { { 0, 1 }, { -1, 925 } },
44
  { { 2, 0 }, { 56, -1 } },    { { 2, 0 }, { 22, -1 } },
45
};
46
47
void av1_get_upsampled_plane_size(const AV1_COMMON *cm, int is_uv, int *plane_w,
48
223k
                                  int *plane_h) {
49
223k
  int ss_x = is_uv && cm->seq_params->subsampling_x;
50
223k
  int ss_y = is_uv && cm->seq_params->subsampling_y;
51
223k
  *plane_w = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
52
223k
  *plane_h = ROUND_POWER_OF_TWO(cm->height, ss_y);
53
223k
}
54
55
// Count horizontal or vertical units in a plane (use a width or height for
56
// plane_size, respectively). We basically want to divide the plane size by the
57
// size of a restoration unit. Rather than rounding up unconditionally as you
58
// might expect, we round to nearest, which models the way a right or bottom
59
// restoration unit can extend to up to 150% of its normal width or height.
60
//
61
// The max with 1 is to deal with small frames, which may be smaller than
62
// half of an LR unit in size.
63
187k
int av1_lr_count_units(int unit_size, int plane_size) {
64
187k
  return AOMMAX((plane_size + (unit_size >> 1)) / unit_size, 1);
65
187k
}
66
67
void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
68
9.17k
                                  int is_uv) {
69
9.17k
  int plane_w, plane_h;
70
9.17k
  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
71
72
9.17k
  const int unit_size = rsi->restoration_unit_size;
73
9.17k
  const int horz_units = av1_lr_count_units(unit_size, plane_w);
74
9.17k
  const int vert_units = av1_lr_count_units(unit_size, plane_h);
75
76
9.17k
  rsi->num_rest_units = horz_units * vert_units;
77
9.17k
  rsi->horz_units = horz_units;
78
9.17k
  rsi->vert_units = vert_units;
79
80
9.17k
  aom_free(rsi->unit_info);
81
9.17k
  CHECK_MEM_ERROR(cm, rsi->unit_info,
82
9.17k
                  (RestorationUnitInfo *)aom_memalign(
83
9.17k
                      16, sizeof(*rsi->unit_info) * rsi->num_rest_units));
84
9.17k
}
85
86
385k
void av1_free_restoration_struct(RestorationInfo *rst_info) {
87
385k
  aom_free(rst_info->unit_info);
88
385k
  rst_info->unit_info = NULL;
89
385k
}
90
91
#if 0
92
// Pair of values for each sgrproj parameter:
93
// Index 0 corresponds to r[0], e[0]
94
// Index 1 corresponds to r[1], e[1]
95
int sgrproj_mtable[SGRPROJ_PARAMS][2];
96
97
static void GenSgrprojVtable(void) {
98
  for (int i = 0; i < SGRPROJ_PARAMS; ++i) {
99
    const sgr_params_type *const params = &av1_sgr_params[i];
100
    for (int j = 0; j < 2; ++j) {
101
      const int e = params->e[j];
102
      const int r = params->r[j];
103
      if (r == 0) {                 // filter is disabled
104
        sgrproj_mtable[i][j] = -1;  // mark invalid
105
      } else {                      // filter is enabled
106
        const int n = (2 * r + 1) * (2 * r + 1);
107
        const int n2e = n * n * e;
108
        assert(n2e != 0);
109
        sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
110
      }
111
    }
112
  }
113
}
114
#endif
115
116
128k
void av1_loop_restoration_precal(void) {
117
#if 0
118
  GenSgrprojVtable();
119
#endif
120
128k
}
121
122
static void extend_frame_lowbd(uint8_t *data, int width, int height,
123
                               ptrdiff_t stride, int border_horz,
124
19.8k
                               int border_vert) {
125
19.8k
  uint8_t *data_p;
126
19.8k
  int i;
127
1.32M
  for (i = 0; i < height; ++i) {
128
1.30M
    data_p = data + i * stride;
129
1.30M
    memset(data_p - border_horz, data_p[0], border_horz);
130
1.30M
    memset(data_p + width, data_p[width - 1], border_horz);
131
1.30M
  }
132
19.8k
  data_p = data - border_horz;
133
79.3k
  for (i = -border_vert; i < 0; ++i) {
134
59.5k
    memcpy(data_p + i * stride, data_p, width + 2 * border_horz);
135
59.5k
  }
136
79.3k
  for (i = height; i < height + border_vert; ++i) {
137
59.5k
    memcpy(data_p + i * stride, data_p + (height - 1) * stride,
138
59.5k
           width + 2 * border_horz);
139
59.5k
  }
140
19.8k
}
141
142
#if CONFIG_AV1_HIGHBITDEPTH
143
static void extend_frame_highbd(uint16_t *data, int width, int height,
144
                                ptrdiff_t stride, int border_horz,
145
7.04k
                                int border_vert) {
146
7.04k
  uint16_t *data_p;
147
7.04k
  int i, j;
148
777k
  for (i = 0; i < height; ++i) {
149
770k
    data_p = data + i * stride;
150
3.08M
    for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0];
151
3.08M
    for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1];
152
770k
  }
153
7.04k
  data_p = data - border_horz;
154
28.1k
  for (i = -border_vert; i < 0; ++i) {
155
21.1k
    memcpy(data_p + i * stride, data_p,
156
21.1k
           (width + 2 * border_horz) * sizeof(uint16_t));
157
21.1k
  }
158
28.1k
  for (i = height; i < height + border_vert; ++i) {
159
21.1k
    memcpy(data_p + i * stride, data_p + (height - 1) * stride,
160
21.1k
           (width + 2 * border_horz) * sizeof(uint16_t));
161
21.1k
  }
162
7.04k
}
163
164
static void copy_rest_unit_highbd(int width, int height, const uint16_t *src,
165
                                  int src_stride, uint16_t *dst,
166
3.20k
                                  int dst_stride) {
167
351k
  for (int i = 0; i < height; ++i)
168
347k
    memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst));
169
3.20k
}
170
#endif
171
172
void av1_extend_frame(uint8_t *data, int width, int height, int stride,
173
26.8k
                      int border_horz, int border_vert, int highbd) {
174
26.8k
#if CONFIG_AV1_HIGHBITDEPTH
175
26.8k
  if (highbd) {
176
7.04k
    extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
177
7.04k
                        border_horz, border_vert);
178
7.04k
    return;
179
7.04k
  }
180
19.8k
#endif
181
19.8k
  (void)highbd;
182
19.8k
  extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
183
19.8k
}
184
185
static void copy_rest_unit_lowbd(int width, int height, const uint8_t *src,
186
5.47k
                                 int src_stride, uint8_t *dst, int dst_stride) {
187
495k
  for (int i = 0; i < height; ++i)
188
489k
    memcpy(dst + i * dst_stride, src + i * src_stride, width);
189
5.47k
}
190
191
static void copy_rest_unit(int width, int height, const uint8_t *src,
192
                           int src_stride, uint8_t *dst, int dst_stride,
193
8.67k
                           int highbd) {
194
8.67k
#if CONFIG_AV1_HIGHBITDEPTH
195
8.67k
  if (highbd) {
196
3.20k
    copy_rest_unit_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
197
3.20k
                          CONVERT_TO_SHORTPTR(dst), dst_stride);
198
3.20k
    return;
199
3.20k
  }
200
5.47k
#endif
201
5.47k
  (void)highbd;
202
5.47k
  copy_rest_unit_lowbd(width, height, src, src_stride, dst, dst_stride);
203
5.47k
}
204
205
551k
#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
206
207
// With striped loop restoration, the filtering for each 64-pixel stripe gets
208
// most of its input from the output of CDEF (stored in data8), but we need to
209
// fill out a border of 3 pixels above/below the stripe according to the
210
// following rules:
211
//
212
// * At the top and bottom of the frame, we copy the outermost row of CDEF
213
//   pixels three times. This extension is done by a call to av1_extend_frame()
214
//   at the start of the loop restoration process, so the value of
215
//   copy_above/copy_below doesn't strictly matter.
216
//
217
// * All other boundaries are stripe boundaries within the frame. In that case,
218
//   we take 2 rows of deblocked pixels and extend them to 3 rows of context.
219
static void get_stripe_boundary_info(const RestorationTileLimits *limits,
220
                                     int plane_w, int plane_h, int ss_y,
221
35.0k
                                     int *copy_above, int *copy_below) {
222
35.0k
  (void)plane_w;
223
224
35.0k
  *copy_above = 1;
225
35.0k
  *copy_below = 1;
226
227
35.0k
  const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
228
35.0k
  const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
229
230
35.0k
  const int first_stripe_in_plane = (limits->v_start == 0);
231
35.0k
  const int this_stripe_height =
232
35.0k
      full_stripe_height - (first_stripe_in_plane ? runit_offset : 0);
233
35.0k
  const int last_stripe_in_plane =
234
35.0k
      (limits->v_start + this_stripe_height >= plane_h);
235
236
35.0k
  if (first_stripe_in_plane) *copy_above = 0;
237
35.0k
  if (last_stripe_in_plane) *copy_below = 0;
238
35.0k
}
239
240
// Overwrite the border pixels around a processing stripe so that the conditions
241
// listed above get_stripe_boundary_info() are preserved.
242
// We save the pixels which get overwritten into a temporary buffer, so that
243
// they can be restored by restore_processing_stripe_boundary() after we've
244
// processed the stripe.
245
//
246
// limits gives the rectangular limits of the remaining stripes for the current
247
// restoration unit. rsb is the stored stripe boundaries (taken from either
248
// deblock or CDEF output as necessary).
249
static void setup_processing_stripe_boundary(
250
    const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
251
    int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
252
35.0k
    RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
253
  // Offsets within the line buffers. The buffer logically starts at column
254
  // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
255
  // has column x0 in the buffer.
256
35.0k
  const int buf_stride = rsb->stripe_boundary_stride;
257
35.0k
  const int buf_x0_off = limits->h_start;
258
35.0k
  const int line_width =
259
35.0k
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
260
35.0k
  const int line_size = line_width << use_highbd;
261
262
35.0k
  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
263
264
  // Replace RESTORATION_BORDER pixels above the top of the stripe
265
  // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
266
  // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
267
  // duplicating the topmost of the 2 lines (see the AOMMAX call when
268
  // calculating buf_row, which is rsb_row + 0, 0, 1 for i = -3, -2, -1).
269
35.0k
  if (!opt) {
270
34.2k
    if (copy_above) {
271
23.1k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
272
273
92.5k
      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
274
69.3k
        const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
275
69.3k
        const int buf_off = buf_x0_off + buf_row * buf_stride;
276
69.3k
        const uint8_t *buf =
277
69.3k
            rsb->stripe_boundary_above + (buf_off << use_highbd);
278
69.3k
        uint8_t *dst8 = data8_tl + i * data_stride;
279
        // Save old pixels, then replace with data from stripe_boundary_above
280
69.3k
        memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
281
69.3k
               REAL_PTR(use_highbd, dst8), line_size);
282
69.3k
        memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
283
69.3k
      }
284
23.1k
    }
285
286
    // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
287
    // The second buffer row is repeated, so buf_row is rsb_row + 0, 1, 1
288
    // for i = 0, 1, 2.
289
34.2k
    if (copy_below) {
290
23.0k
      const int stripe_end = limits->v_start + h;
291
23.0k
      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
292
293
92.2k
      for (int i = 0; i < RESTORATION_BORDER; ++i) {
294
69.1k
        const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
295
69.1k
        const int buf_off = buf_x0_off + buf_row * buf_stride;
296
69.1k
        const uint8_t *src =
297
69.1k
            rsb->stripe_boundary_below + (buf_off << use_highbd);
298
299
69.1k
        uint8_t *dst8 = data8_bl + i * data_stride;
300
        // Save old pixels, then replace with data from stripe_boundary_below
301
69.1k
        memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
302
69.1k
        memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
303
69.1k
      }
304
23.0k
    }
305
34.2k
  } else {
306
818
    if (copy_above) {
307
654
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
308
309
      // Only save and overwrite i=-RESTORATION_BORDER line.
310
654
      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
311
      // Save old pixels, then replace with a copy of the row just below
312
654
      memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
313
654
      memcpy(REAL_PTR(use_highbd, dst8),
314
654
             REAL_PTR(use_highbd,
315
654
                      data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
316
654
             line_size);
317
654
    }
318
319
818
    if (copy_below) {
320
674
      const int stripe_end = limits->v_start + h;
321
674
      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
322
323
      // Only save and overwrite i=2 line.
324
674
      uint8_t *dst8 = data8_bl + 2 * data_stride;
325
      // Save old pixels, then replace with a copy of the row just above
326
674
      memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
327
674
      memcpy(REAL_PTR(use_highbd, dst8),
328
674
             REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
329
674
    }
330
818
  }
331
35.0k
}
332
333
// Once a processing stripe is finished, this function sets the boundary
334
// pixels which were overwritten by setup_processing_stripe_boundary()
335
// back to their original values
336
static void restore_processing_stripe_boundary(
337
    const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
338
    int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
339
34.9k
    int copy_below, int opt) {
340
34.9k
  const int line_width =
341
34.9k
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
342
34.9k
  const int line_size = line_width << use_highbd;
343
344
34.9k
  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
345
346
34.9k
  if (!opt) {
347
34.1k
    if (copy_above) {
348
23.1k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
349
92.4k
      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
350
69.3k
        uint8_t *dst8 = data8_tl + i * data_stride;
351
69.3k
        memcpy(REAL_PTR(use_highbd, dst8),
352
69.3k
               rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
353
69.3k
      }
354
23.1k
    }
355
356
34.1k
    if (copy_below) {
357
23.1k
      const int stripe_bottom = limits->v_start + h;
358
23.1k
      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
359
360
92.4k
      for (int i = 0; i < RESTORATION_BORDER; ++i) {
361
69.3k
        if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
362
363
69.3k
        uint8_t *dst8 = data8_bl + i * data_stride;
364
69.3k
        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
365
69.3k
      }
366
23.1k
    }
367
34.1k
  } else {
368
814
    if (copy_above) {
369
651
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
370
371
      // Only restore i=-RESTORATION_BORDER line.
372
651
      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
373
651
      memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
374
651
    }
375
376
814
    if (copy_below) {
377
671
      const int stripe_bottom = limits->v_start + h;
378
671
      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
379
380
      // Only restore i=2 line.
381
671
      if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
382
671
        uint8_t *dst8 = data8_bl + 2 * data_stride;
383
671
        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
384
671
      }
385
671
    }
386
814
  }
387
34.9k
}
388
389
static void wiener_filter_stripe(const RestorationUnitInfo *rui,
390
                                 int stripe_width, int stripe_height,
391
                                 int procunit_width, const uint8_t *src,
392
                                 int src_stride, uint8_t *dst, int dst_stride,
393
                                 int32_t *tmpbuf, int bit_depth,
394
14.5k
                                 struct aom_internal_error_info *error_info) {
395
14.5k
  (void)tmpbuf;
396
14.5k
  (void)bit_depth;
397
14.5k
  (void)error_info;
398
14.5k
  assert(bit_depth == 8);
399
14.5k
  const WienerConvolveParams conv_params = get_conv_params_wiener(8);
400
401
40.9k
  for (int j = 0; j < stripe_width; j += procunit_width) {
402
26.3k
    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
403
26.3k
    const uint8_t *src_p = src + j;
404
26.3k
    uint8_t *dst_p = dst + j;
405
26.3k
    av1_wiener_convolve_add_src(
406
26.3k
        src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
407
26.3k
        rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
408
26.3k
  }
409
14.5k
}
410
411
/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
412
   over the input. The window is of size (2r + 1)x(2r + 1), and we
413
   specialize to r = 1, 2. Any other r trips the assert in boxsum().
414
415
   Each loop follows the same format: We keep a window's worth of input
416
   in individual variables and select data out of that as appropriate.
417
*/
418
static void boxsum1(int32_t *src, int width, int height, int src_stride,
419
0
                    int sqr, int32_t *dst, int dst_stride) {
420
0
  int i, j, a, b, c;
421
0
  assert(width > 2 * SGRPROJ_BORDER_HORZ);
422
0
  assert(height > 2 * SGRPROJ_BORDER_VERT);
423
424
  // Vertical sum over 3-pixel regions, from src into dst.
425
0
  if (!sqr) {
426
0
    for (j = 0; j < width; ++j) {
427
0
      a = src[j];
428
0
      b = src[src_stride + j];
429
0
      c = src[2 * src_stride + j];
430
431
0
      dst[j] = a + b;
432
0
      for (i = 1; i < height - 2; ++i) {
433
        // Loop invariant: At the start of each iteration,
434
        // a = src[(i - 1) * src_stride + j]
435
        // b = src[(i    ) * src_stride + j]
436
        // c = src[(i + 1) * src_stride + j]
437
0
        dst[i * dst_stride + j] = a + b + c;
438
0
        a = b;
439
0
        b = c;
440
0
        c = src[(i + 2) * src_stride + j];
441
0
      }
442
0
      dst[i * dst_stride + j] = a + b + c;
443
0
      dst[(i + 1) * dst_stride + j] = b + c;
444
0
    }
445
0
  } else {
446
0
    for (j = 0; j < width; ++j) {
447
0
      a = src[j] * src[j];
448
0
      b = src[src_stride + j] * src[src_stride + j];
449
0
      c = src[2 * src_stride + j] * src[2 * src_stride + j];
450
451
0
      dst[j] = a + b;
452
0
      for (i = 1; i < height - 2; ++i) {
453
0
        dst[i * dst_stride + j] = a + b + c;
454
0
        a = b;
455
0
        b = c;
456
0
        c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
457
0
      }
458
0
      dst[i * dst_stride + j] = a + b + c;
459
0
      dst[(i + 1) * dst_stride + j] = b + c;
460
0
    }
461
0
  }
462
463
  // Horizontal sum over 3-pixel regions of dst
464
0
  for (i = 0; i < height; ++i) {
465
0
    a = dst[i * dst_stride];
466
0
    b = dst[i * dst_stride + 1];
467
0
    c = dst[i * dst_stride + 2];
468
469
0
    dst[i * dst_stride] = a + b;
470
0
    for (j = 1; j < width - 2; ++j) {
471
      // Loop invariant at the start of each iteration (dst before overwrite):
472
      // a = dst[i * dst_stride + (j - 1)]
473
      // b = dst[i * dst_stride + (j    )]
474
      // c = dst[i * dst_stride + (j + 1)]
475
0
      dst[i * dst_stride + j] = a + b + c;
476
0
      a = b;
477
0
      b = c;
478
0
      c = dst[i * dst_stride + (j + 2)];
479
0
    }
480
0
    dst[i * dst_stride + j] = a + b + c;
481
0
    dst[i * dst_stride + (j + 1)] = b + c;
482
0
  }
483
0
}
484
485
static void boxsum2(int32_t *src, int width, int height, int src_stride,
486
0
                    int sqr, int32_t *dst, int dst_stride) {
487
0
  int i, j, a, b, c, d, e;
488
0
  assert(width > 2 * SGRPROJ_BORDER_HORZ);
489
0
  assert(height > 2 * SGRPROJ_BORDER_VERT);
490
491
  // Vertical sum over 5-pixel regions, from src into dst.
492
0
  if (!sqr) {
493
0
    for (j = 0; j < width; ++j) {
494
0
      a = src[j];
495
0
      b = src[src_stride + j];
496
0
      c = src[2 * src_stride + j];
497
0
      d = src[3 * src_stride + j];
498
0
      e = src[4 * src_stride + j];
499
500
0
      dst[j] = a + b + c;
501
0
      dst[dst_stride + j] = a + b + c + d;
502
0
      for (i = 2; i < height - 3; ++i) {
503
        // Loop invariant: At the start of each iteration,
504
        // a = src[(i - 2) * src_stride + j]
505
        // b = src[(i - 1) * src_stride + j]
506
        // c = src[(i    ) * src_stride + j]
507
        // d = src[(i + 1) * src_stride + j]
508
        // e = src[(i + 2) * src_stride + j]
509
0
        dst[i * dst_stride + j] = a + b + c + d + e;
510
0
        a = b;
511
0
        b = c;
512
0
        c = d;
513
0
        d = e;
514
0
        e = src[(i + 3) * src_stride + j];
515
0
      }
516
0
      dst[i * dst_stride + j] = a + b + c + d + e;
517
0
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
518
0
      dst[(i + 2) * dst_stride + j] = c + d + e;
519
0
    }
520
0
  } else {
521
0
    for (j = 0; j < width; ++j) {
522
0
      a = src[j] * src[j];
523
0
      b = src[src_stride + j] * src[src_stride + j];
524
0
      c = src[2 * src_stride + j] * src[2 * src_stride + j];
525
0
      d = src[3 * src_stride + j] * src[3 * src_stride + j];
526
0
      e = src[4 * src_stride + j] * src[4 * src_stride + j];
527
528
0
      dst[j] = a + b + c;
529
0
      dst[dst_stride + j] = a + b + c + d;
530
0
      for (i = 2; i < height - 3; ++i) {
531
0
        dst[i * dst_stride + j] = a + b + c + d + e;
532
0
        a = b;
533
0
        b = c;
534
0
        c = d;
535
0
        d = e;
536
0
        e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
537
0
      }
538
0
      dst[i * dst_stride + j] = a + b + c + d + e;
539
0
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
540
0
      dst[(i + 2) * dst_stride + j] = c + d + e;
541
0
    }
542
0
  }
543
544
  // Horizontal sum over 5-pixel regions of dst
545
0
  for (i = 0; i < height; ++i) {
546
0
    a = dst[i * dst_stride];
547
0
    b = dst[i * dst_stride + 1];
548
0
    c = dst[i * dst_stride + 2];
549
0
    d = dst[i * dst_stride + 3];
550
0
    e = dst[i * dst_stride + 4];
551
552
0
    dst[i * dst_stride] = a + b + c;
553
0
    dst[i * dst_stride + 1] = a + b + c + d;
554
0
    for (j = 2; j < width - 3; ++j) {
555
      // Loop invariant at the start of each iteration (dst before overwrite):
556
      // a = dst[i * dst_stride + (j - 2)]
557
      // b = dst[i * dst_stride + (j - 1)]
558
      // c = dst[i * dst_stride + (j    )]
559
      // d = dst[i * dst_stride + (j + 1)]
560
      // e = dst[i * dst_stride + (j + 2)]
561
0
      dst[i * dst_stride + j] = a + b + c + d + e;
562
0
      a = b;
563
0
      b = c;
564
0
      c = d;
565
0
      d = e;
566
0
      e = dst[i * dst_stride + (j + 3)];
567
0
    }
568
0
    dst[i * dst_stride + j] = a + b + c + d + e;
569
0
    dst[i * dst_stride + (j + 1)] = b + c + d + e;
570
0
    dst[i * dst_stride + (j + 2)] = c + d + e;
571
0
  }
572
0
}
573
574
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
575
0
                   int sqr, int32_t *dst, int dst_stride) {
576
0
  if (r == 1)
577
0
    boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
578
0
  else if (r == 2)
579
0
    boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
580
0
  else
581
0
    assert(0 && "Invalid value of r in self-guided filter");
582
0
}
583
584
25.8k
void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
585
25.8k
  if (params->r[0] == 0) {
586
4.38k
    xq[0] = 0;
587
4.38k
    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
588
21.4k
  } else if (params->r[1] == 0) {
589
7.88k
    xq[0] = xqd[0];
590
7.88k
    xq[1] = 0;
591
13.5k
  } else {
592
13.5k
    xq[0] = xqd[0];
593
13.5k
    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
594
13.5k
  }
595
25.8k
}
596
597
const int32_t av1_x_by_xplus1[256] = {
598
  // Special case: Map 0 -> 1 (corresponding to a value of 1/256)
599
  // instead of 0. See comments in selfguided_restoration_internal() for why
600
  1,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
601
  240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
602
  248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
603
  250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
604
  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
605
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
606
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
607
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
608
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
609
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
610
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
611
  254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
612
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
613
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
614
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
615
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
616
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
617
  256,
618
};
619
620
const int32_t av1_one_by_x[MAX_NELEM] = {
621
  4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
622
  293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
623
};
624
625
static void calculate_intermediate_result(int32_t *dgd, int width, int height,
626
                                          int dgd_stride, int bit_depth,
627
                                          int sgr_params_idx, int radius_idx,
628
0
                                          int pass, int32_t *A, int32_t *B) {
629
0
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
630
0
  const int r = params->r[radius_idx];
631
0
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
632
0
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
633
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
634
  // leading to a significant speed improvement.
635
  // We also align the stride to a multiple of 16 bytes, for consistency
636
  // with the SIMD version of this function.
637
0
  int buf_stride = ((width_ext + 3) & ~3) + 16;
638
0
  const int step = pass == 0 ? 1 : 2;
639
0
  int i, j;
640
641
0
  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
642
0
  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
643
0
         "Need SGRPROJ_BORDER_* >= r+1");
644
645
0
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
646
0
         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
647
0
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
648
0
         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
649
0
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
650
0
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
651
  // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
652
  // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
653
0
  for (i = -1; i < height + 1; i += step) {
654
0
    for (j = -1; j < width + 1; ++j) {
655
0
      const int k = i * buf_stride + j;
656
0
      const int n = (2 * r + 1) * (2 * r + 1);
657
658
      // a < 2^16 * n < 2^22 regardless of bit depth
659
0
      uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
660
      // b < 2^8 * n < 2^14 regardless of bit depth
661
0
      uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
662
663
      // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
664
      // and p itself satisfies p < 2^14 * n^2 < 2^26.
665
      // This bound on p is due to:
666
      // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
667
      //
668
      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
669
      // This is an artefact of rounding, and can only happen if all pixels
670
      // are (almost) identical, so in this case we saturate to p=0.
671
0
      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
672
673
0
      const uint32_t s = params->s[radius_idx];
674
675
      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
676
      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
677
      // (this holds even after accounting for the rounding in s)
678
0
      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
679
680
      // Note: We have to be quite careful about the value of A[k].
681
      // This is used as a blend factor between individual pixel values and the
682
      // local mean. So it logically has a range of [0, 256], including both
683
      // endpoints.
684
      //
685
      // This is a pain for hardware, as we'd like something which can be stored
686
      // in exactly 8 bits.
687
      // Further, in the calculation of B[k] below, if z == 0 and r == 2,
688
      // then A[k] "should be" 0. But then we can end up setting B[k] to a value
689
      // slightly above 2^(8 + bit depth), due to rounding in the value of
690
      // av1_one_by_x[25-1].
691
      //
692
      // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
693
      // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
694
      // overflow), without significantly affecting the final result: z == 0
695
      // implies that the image is essentially "flat", so the local mean and
696
      // individual pixel values are very similar.
697
      //
698
      // Note that saturating on the other side, i.e. requiring A[k] <= 255,
699
      // would be a bad idea, as that corresponds to the case where the image
700
      // is very variable, when we want to preserve the local pixel value as
701
      // much as possible.
702
0
      A[k] = av1_x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
703
704
      // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
705
      // av1_one_by_x[n - 1] = round(2^12 / n)
706
      // => the product here is < 2^(20 + bit_depth) <= 2^32,
707
      // and B[k] is set to a value < 2^(8 + bit depth)
708
      // This holds even with the rounding in av1_one_by_x and in the overall
709
      // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
710
0
      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
711
0
                                             (uint32_t)B[k] *
712
0
                                             (uint32_t)av1_one_by_x[n - 1],
713
0
                                         SGRPROJ_RECIP_BITS);
714
0
    }
715
0
  }
716
0
}
717
718
static void selfguided_restoration_fast_internal(
719
    int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
720
0
    int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
721
0
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
722
0
  const int r = params->r[radius_idx];
723
0
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
724
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
725
  // leading to a significant speed improvement.
726
  // We also align the stride to a multiple of 16 bytes, for consistency
727
  // with the SIMD version of this function.
728
0
  int buf_stride = ((width_ext + 3) & ~3) + 16;
729
0
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
730
0
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
731
0
  int32_t *A = A_;
732
0
  int32_t *B = B_;
733
0
  int i, j;
734
0
  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
735
0
                                sgr_params_idx, radius_idx, 1, A, B);
736
0
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
737
0
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
738
739
  // Use the A[] and B[] arrays to calculate the filtered image
740
0
  (void)r;
741
0
  assert(r == 2);
742
0
  for (i = 0; i < height; ++i) {
743
0
    if (!(i & 1)) {  // even row
744
0
      for (j = 0; j < width; ++j) {
745
0
        const int k = i * buf_stride + j;
746
0
        const int l = i * dgd_stride + j;
747
0
        const int m = i * dst_stride + j;
748
0
        const int nb = 5;
749
0
        const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
750
0
                          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
751
0
                           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
752
0
                              5;
753
0
        const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
754
0
                          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
755
0
                           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
756
0
                              5;
757
0
        const int32_t v = a * dgd[l] + b;
758
0
        dst[m] =
759
0
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
760
0
      }
761
0
    } else {  // odd row
762
0
      for (j = 0; j < width; ++j) {
763
0
        const int k = i * buf_stride + j;
764
0
        const int l = i * dgd_stride + j;
765
0
        const int m = i * dst_stride + j;
766
0
        const int nb = 4;
767
0
        const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
768
0
        const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
769
0
        const int32_t v = a * dgd[l] + b;
770
0
        dst[m] =
771
0
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
772
0
      }
773
0
    }
774
0
  }
775
0
}
776
777
static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
778
                                            int dgd_stride, int32_t *dst,
779
                                            int dst_stride, int bit_depth,
780
                                            int sgr_params_idx,
781
0
                                            int radius_idx) {
782
0
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
783
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
784
  // leading to a significant speed improvement.
785
  // We also align the stride to a multiple of 16 bytes, for consistency
786
  // with the SIMD version of this function.
787
0
  int buf_stride = ((width_ext + 3) & ~3) + 16;
788
0
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
789
0
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
790
0
  int32_t *A = A_;
791
0
  int32_t *B = B_;
792
0
  int i, j;
793
0
  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
794
0
                                sgr_params_idx, radius_idx, 0, A, B);
795
0
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
796
0
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
797
798
  // Use the A[] and B[] arrays to calculate the filtered image
799
0
  for (i = 0; i < height; ++i) {
800
0
    for (j = 0; j < width; ++j) {
801
0
      const int k = i * buf_stride + j;
802
0
      const int l = i * dgd_stride + j;
803
0
      const int m = i * dst_stride + j;
804
0
      const int nb = 5;
805
0
      const int32_t a =
806
0
          (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
807
0
              4 +
808
0
          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
809
0
           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
810
0
              3;
811
0
      const int32_t b =
812
0
          (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
813
0
              4 +
814
0
          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
815
0
           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
816
0
              3;
817
0
      const int32_t v = a * dgd[l] + b;
818
0
      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
819
0
    }
820
0
  }
821
0
}
822
823
int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
824
                                 int dgd_stride, int32_t *flt0, int32_t *flt1,
825
                                 int flt_stride, int sgr_params_idx,
826
0
                                 int bit_depth, int highbd) {
827
0
  int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
828
0
  const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
829
0
  int32_t *dgd32 =
830
0
      dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
831
832
0
  if (highbd) {
833
0
    const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
834
0
    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
835
0
      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
836
0
        dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
837
0
      }
838
0
    }
839
0
  } else {
840
0
    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
841
0
      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
842
0
        dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
843
0
      }
844
0
    }
845
0
  }
846
847
0
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
848
  // If params->r == 0 we skip the corresponding filter. We only allow one of
849
  // the radii to be 0, as having both equal to 0 would be equivalent to
850
  // skipping SGR entirely.
851
0
  assert(!(params->r[0] == 0 && params->r[1] == 0));
852
853
0
  if (params->r[0] > 0)
854
0
    selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
855
0
                                         flt0, flt_stride, bit_depth,
856
0
                                         sgr_params_idx, 0);
857
0
  if (params->r[1] > 0)
858
0
    selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
859
0
                                    flt_stride, bit_depth, sgr_params_idx, 1);
860
0
  return 0;
861
0
}
862
863
int av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
864
                                       int height, int stride, int eps,
865
                                       const int *xqd, uint8_t *dst8,
866
                                       int dst_stride, int32_t *tmpbuf,
867
0
                                       int bit_depth, int highbd) {
868
0
  int32_t *flt0 = tmpbuf;
869
0
  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
870
0
  assert(width * height <= RESTORATION_UNITPELS_MAX);
871
872
0
  const int ret = av1_selfguided_restoration_c(
873
0
      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
874
0
  if (ret != 0) return ret;
875
0
  const sgr_params_type *const params = &av1_sgr_params[eps];
876
0
  int xq[2];
877
0
  av1_decode_xq(xqd, xq, params);
878
0
  for (int i = 0; i < height; ++i) {
879
0
    for (int j = 0; j < width; ++j) {
880
0
      const int k = i * width + j;
881
0
      uint8_t *dst8ij = dst8 + i * dst_stride + j;
882
0
      const uint8_t *dat8ij = dat8 + i * stride + j;
883
884
0
      const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
885
0
      const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
886
0
      int32_t v = u << SGRPROJ_PRJ_BITS;
887
      // If params->r == 0 then we skipped the filtering in
888
      // av1_selfguided_restoration_c, i.e. flt[k] == u
889
0
      if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
890
0
      if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
891
0
      const int16_t w =
892
0
          (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
893
894
0
      const uint16_t out = clip_pixel_highbd(w, bit_depth);
895
0
      if (highbd)
896
0
        *CONVERT_TO_SHORTPTR(dst8ij) = out;
897
0
      else
898
0
        *dst8ij = (uint8_t)out;
899
0
    }
900
0
  }
901
0
  return 0;
902
0
}
903
904
static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
905
                                  int stripe_width, int stripe_height,
906
                                  int procunit_width, const uint8_t *src,
907
                                  int src_stride, uint8_t *dst, int dst_stride,
908
                                  int32_t *tmpbuf, int bit_depth,
909
6.19k
                                  struct aom_internal_error_info *error_info) {
910
6.19k
  (void)bit_depth;
911
6.19k
  assert(bit_depth == 8);
912
913
16.8k
  for (int j = 0; j < stripe_width; j += procunit_width) {
914
10.6k
    int w = AOMMIN(procunit_width, stripe_width - j);
915
10.6k
    if (av1_apply_selfguided_restoration(
916
10.6k
            src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
917
10.6k
            rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth,
918
10.6k
            0) != 0) {
919
0
      aom_internal_error(
920
0
          error_info, AOM_CODEC_MEM_ERROR,
921
0
          "Error allocating buffer in av1_apply_selfguided_restoration");
922
0
    }
923
10.6k
  }
924
6.19k
}
925
926
#if CONFIG_AV1_HIGHBITDEPTH
927
static void wiener_filter_stripe_highbd(
928
    const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
929
    int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
930
    int dst_stride, int32_t *tmpbuf, int bit_depth,
931
7.99k
    struct aom_internal_error_info *error_info) {
932
7.99k
  (void)tmpbuf;
933
7.99k
  (void)error_info;
934
7.99k
  const WienerConvolveParams conv_params = get_conv_params_wiener(bit_depth);
935
936
20.6k
  for (int j = 0; j < stripe_width; j += procunit_width) {
937
12.6k
    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
938
12.6k
    const uint8_t *src8_p = src8 + j;
939
12.6k
    uint8_t *dst8_p = dst8 + j;
940
12.6k
    av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
941
12.6k
                                       rui->wiener_info.hfilter, 16,
942
12.6k
                                       rui->wiener_info.vfilter, 16, w,
943
12.6k
                                       stripe_height, &conv_params, bit_depth);
944
12.6k
  }
945
7.99k
}
946
947
static void sgrproj_filter_stripe_highbd(
948
    const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
949
    int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
950
    int dst_stride, int32_t *tmpbuf, int bit_depth,
951
6.34k
    struct aom_internal_error_info *error_info) {
952
21.5k
  for (int j = 0; j < stripe_width; j += procunit_width) {
953
15.1k
    int w = AOMMIN(procunit_width, stripe_width - j);
954
15.1k
    if (av1_apply_selfguided_restoration(
955
15.1k
            src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
956
15.1k
            rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth,
957
15.1k
            1) != 0) {
958
0
      aom_internal_error(
959
0
          error_info, AOM_CODEC_MEM_ERROR,
960
0
          "Error allocating buffer in av1_apply_selfguided_restoration");
961
0
    }
962
15.1k
  }
963
6.34k
}
964
#endif  // CONFIG_AV1_HIGHBITDEPTH
965
966
typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
967
                                  int stripe_width, int stripe_height,
968
                                  int procunit_width, const uint8_t *src,
969
                                  int src_stride, uint8_t *dst, int dst_stride,
970
                                  int32_t *tmpbuf, int bit_depth,
971
                                  struct aom_internal_error_info *error_info);
972
973
#if CONFIG_AV1_HIGHBITDEPTH
974
#define NUM_STRIPE_FILTERS 4
975
static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
976
  wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
977
  sgrproj_filter_stripe_highbd
978
};
979
#else
980
#define NUM_STRIPE_FILTERS 2
981
static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
982
  wiener_filter_stripe, sgrproj_filter_stripe
983
};
984
#endif  // CONFIG_AV1_HIGHBITDEPTH
985
986
// Filter one restoration unit
987
void av1_loop_restoration_filter_unit(
988
    const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
989
    const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
990
    int plane_w, int plane_h, int ss_x, int ss_y, int highbd, int bit_depth,
991
    uint8_t *data8, int stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf,
992
28.7k
    int optimized_lr, struct aom_internal_error_info *error_info) {
993
28.7k
  RestorationType unit_rtype = rui->restoration_type;
994
995
28.7k
  int unit_h = limits->v_end - limits->v_start;
996
28.7k
  int unit_w = limits->h_end - limits->h_start;
997
28.7k
  uint8_t *data8_tl =
998
28.7k
      data8 + limits->v_start * (ptrdiff_t)stride + limits->h_start;
999
28.7k
  uint8_t *dst8_tl =
1000
28.7k
      dst8 + limits->v_start * (ptrdiff_t)dst_stride + limits->h_start;
1001
1002
28.7k
  if (unit_rtype == RESTORE_NONE) {
1003
8.67k
    copy_rest_unit(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride,
1004
8.67k
                   highbd);
1005
8.67k
    return;
1006
8.67k
  }
1007
1008
20.0k
  const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
1009
20.0k
  assert(filter_idx < NUM_STRIPE_FILTERS);
1010
20.0k
  const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
1011
1012
20.0k
  const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
1013
1014
  // Filter the whole image one stripe at a time
1015
20.0k
  RestorationTileLimits remaining_stripes = *limits;
1016
20.0k
  int i = 0;
1017
55.1k
  while (i < unit_h) {
1018
35.0k
    int copy_above, copy_below;
1019
35.0k
    remaining_stripes.v_start = limits->v_start + i;
1020
1021
35.0k
    get_stripe_boundary_info(&remaining_stripes, plane_w, plane_h, ss_y,
1022
35.0k
                             &copy_above, &copy_below);
1023
1024
35.0k
    const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1025
35.0k
    const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
1026
1027
    // Work out where this stripe's boundaries are within
1028
    // rsb->stripe_boundary_{above,below}
1029
35.0k
    const int frame_stripe =
1030
35.0k
        (remaining_stripes.v_start + runit_offset) / full_stripe_height;
1031
35.0k
    const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
1032
1033
    // Calculate this stripe's height, based on two rules:
1034
    // * The topmost stripe in the frame is 8 luma pixels shorter than usual.
1035
    // * We can't extend past the end of the current restoration unit
1036
35.0k
    const int nominal_stripe_height =
1037
35.0k
        full_stripe_height - ((frame_stripe == 0) ? runit_offset : 0);
1038
35.0k
    const int h = AOMMIN(nominal_stripe_height,
1039
35.0k
                         remaining_stripes.v_end - remaining_stripes.v_start);
1040
1041
35.0k
    setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
1042
35.0k
                                     h, data8, stride, rlbs, copy_above,
1043
35.0k
                                     copy_below, optimized_lr);
1044
1045
35.0k
    stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
1046
35.0k
                  dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth,
1047
35.0k
                  error_info);
1048
1049
35.0k
    restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
1050
35.0k
                                       data8, stride, copy_above, copy_below,
1051
35.0k
                                       optimized_lr);
1052
1053
35.0k
    i += h;
1054
35.0k
  }
1055
20.0k
}
1056
1057
static void filter_frame_on_unit(const RestorationTileLimits *limits,
1058
                                 int rest_unit_idx, void *priv, int32_t *tmpbuf,
1059
                                 RestorationLineBuffers *rlbs,
1060
21.3k
                                 struct aom_internal_error_info *error_info) {
1061
21.3k
  FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
1062
21.3k
  const RestorationInfo *rsi = ctxt->rsi;
1063
1064
21.3k
  av1_loop_restoration_filter_unit(
1065
21.3k
      limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs,
1066
21.3k
      ctxt->plane_w, ctxt->plane_h, ctxt->ss_x, ctxt->ss_y, ctxt->highbd,
1067
21.3k
      ctxt->bit_depth, ctxt->data8, ctxt->data_stride, ctxt->dst8,
1068
21.3k
      ctxt->dst_stride, tmpbuf, rsi->optimized_lr, error_info);
1069
21.3k
}
1070
1071
void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
1072
                                            YV12_BUFFER_CONFIG *frame,
1073
                                            AV1_COMMON *cm, int optimized_lr,
1074
1.56k
                                            int num_planes) {
1075
1.56k
  const SequenceHeader *const seq_params = cm->seq_params;
1076
1.56k
  const int bit_depth = seq_params->bit_depth;
1077
1.56k
  const int highbd = seq_params->use_highbitdepth;
1078
1.56k
  lr_ctxt->dst = &cm->rst_frame;
1079
1080
1.56k
  const int frame_width = frame->crop_widths[0];
1081
1.56k
  const int frame_height = frame->crop_heights[0];
1082
1.56k
  if (aom_realloc_frame_buffer(
1083
1.56k
          lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
1084
1.56k
          seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
1085
1.56k
          cm->features.byte_alignment, NULL, NULL, NULL, false,
1086
1.56k
          0) != AOM_CODEC_OK)
1087
0
    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
1088
0
                       "Failed to allocate restoration dst buffer");
1089
1090
1.56k
  lr_ctxt->on_rest_unit = filter_frame_on_unit;
1091
1.56k
  lr_ctxt->frame = frame;
1092
5.71k
  for (int plane = 0; plane < num_planes; ++plane) {
1093
4.15k
    RestorationInfo *rsi = &cm->rst_info[plane];
1094
4.15k
    RestorationType rtype = rsi->frame_restoration_type;
1095
4.15k
    rsi->optimized_lr = optimized_lr;
1096
4.15k
    lr_ctxt->ctxt[plane].rsi = rsi;
1097
1098
4.15k
    if (rtype == RESTORE_NONE) {
1099
927
      continue;
1100
927
    }
1101
1102
3.22k
    const int is_uv = plane > 0;
1103
3.22k
    int plane_w, plane_h;
1104
3.22k
    av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
1105
3.22k
    assert(plane_w == frame->crop_widths[is_uv]);
1106
3.22k
    assert(plane_h == frame->crop_heights[is_uv]);
1107
1108
3.22k
    av1_extend_frame(frame->buffers[plane], plane_w, plane_h,
1109
3.22k
                     frame->strides[is_uv], RESTORATION_BORDER,
1110
3.22k
                     RESTORATION_BORDER, highbd);
1111
1112
3.22k
    FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
1113
3.22k
    lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
1114
3.22k
    lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
1115
3.22k
    lr_plane_ctxt->plane_w = plane_w;
1116
3.22k
    lr_plane_ctxt->plane_h = plane_h;
1117
3.22k
    lr_plane_ctxt->highbd = highbd;
1118
3.22k
    lr_plane_ctxt->bit_depth = bit_depth;
1119
3.22k
    lr_plane_ctxt->data8 = frame->buffers[plane];
1120
3.22k
    lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
1121
3.22k
    lr_plane_ctxt->data_stride = frame->strides[is_uv];
1122
3.22k
    lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
1123
3.22k
  }
1124
1.56k
}
1125
1126
static void loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
1127
609
                                         AV1_COMMON *cm, int num_planes) {
1128
609
  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
1129
609
                           YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
1130
609
                           int vstart, int vend);
1131
609
  static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y,
1132
609
                                         aom_yv12_partial_coloc_copy_u,
1133
609
                                         aom_yv12_partial_coloc_copy_v };
1134
609
  assert(num_planes <= 3);
1135
2.10k
  for (int plane = 0; plane < num_planes; ++plane) {
1136
1.49k
    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
1137
1.02k
    FilterFrameCtxt *lr_plane_ctxt = &loop_rest_ctxt->ctxt[plane];
1138
1.02k
    copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, 0,
1139
1.02k
                     lr_plane_ctxt->plane_w, 0, lr_plane_ctxt->plane_h);
1140
1.02k
  }
1141
609
}
1142
1143
// Call on_rest_unit for each loop restoration unit in the plane.
1144
static void foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
1145
                                       rest_unit_visitor_t on_rest_unit,
1146
                                       void *priv, int32_t *tmpbuf,
1147
1.02k
                                       RestorationLineBuffers *rlbs) {
1148
1.02k
  const RestorationInfo *rsi = &cm->rst_info[plane];
1149
1.02k
  const int hnum_rest_units = rsi->horz_units;
1150
1.02k
  const int vnum_rest_units = rsi->vert_units;
1151
1.02k
  const int unit_size = rsi->restoration_unit_size;
1152
1153
1.02k
  const int is_uv = plane > 0;
1154
1.02k
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
1155
1.02k
  const int ext_size = unit_size * 3 / 2;
1156
1.02k
  int plane_w, plane_h;
1157
1.02k
  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
1158
1159
1.02k
  int y0 = 0, i = 0;
1160
3.38k
  while (y0 < plane_h) {
1161
2.35k
    int remaining_h = plane_h - y0;
1162
2.35k
    int h = (remaining_h < ext_size) ? remaining_h : unit_size;
1163
1164
2.35k
    RestorationTileLimits limits;
1165
2.35k
    limits.v_start = y0;
1166
2.35k
    limits.v_end = y0 + h;
1167
2.35k
    assert(limits.v_end <= plane_h);
1168
    // Offset upwards to align with the restoration processing stripe
1169
2.35k
    const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
1170
2.35k
    limits.v_start = AOMMAX(0, limits.v_start - voffset);
1171
2.35k
    if (limits.v_end < plane_h) limits.v_end -= voffset;
1172
1173
2.35k
    av1_foreach_rest_unit_in_row(&limits, plane_w, on_rest_unit, i, unit_size,
1174
2.35k
                                 hnum_rest_units, vnum_rest_units, plane, priv,
1175
2.35k
                                 tmpbuf, rlbs, av1_lr_sync_read_dummy,
1176
2.35k
                                 av1_lr_sync_write_dummy, NULL, cm->error);
1177
1178
2.35k
    y0 += h;
1179
2.35k
    ++i;
1180
2.35k
  }
1181
1.02k
}
1182
1183
// Runs lr_ctxt->on_rest_unit over every restoration unit of every plane whose
// frame restoration type is not RESTORE_NONE, using the plane's
// FilterFrameCtxt as the callback's private data and cm->rst_tmpbuf /
// cm->rlbs as scratch buffers.
static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
                                        int num_planes) {
  FilterFrameCtxt *ctxt = lr_ctxt->ctxt;

  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) {
      continue;
    }

    foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit, &ctxt[plane],
                               cm->rst_tmpbuf, cm->rlbs);
  }
}
1196
1197
void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
1198
                                       AV1_COMMON *cm, int optimized_lr,
1199
609
                                       void *lr_ctxt) {
1200
609
  assert(!cm->features.all_lossless);
1201
609
  const int num_planes = av1_num_planes(cm);
1202
1203
609
  AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
1204
1205
609
  av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
1206
609
                                         optimized_lr, num_planes);
1207
1208
609
  foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes);
1209
1210
609
  loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes);
1211
609
}
1212
1213
void av1_foreach_rest_unit_in_row(
1214
    RestorationTileLimits *limits, int plane_w,
1215
    rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
1216
    int hnum_rest_units, int vnum_rest_units, int plane, void *priv,
1217
    int32_t *tmpbuf, RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read,
1218
    sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync,
1219
12.9k
    struct aom_internal_error_info *error_info) {
1220
12.9k
  const int ext_size = unit_size * 3 / 2;
1221
12.9k
  int x0 = 0, j = 0;
1222
34.2k
  while (x0 < plane_w) {
1223
21.3k
    int remaining_w = plane_w - x0;
1224
21.3k
    int w = (remaining_w < ext_size) ? remaining_w : unit_size;
1225
1226
21.3k
    limits->h_start = x0;
1227
21.3k
    limits->h_end = x0 + w;
1228
21.3k
    assert(limits->h_end <= plane_w);
1229
1230
21.3k
    const int unit_idx = row_number * hnum_rest_units + j;
1231
1232
    // No sync for even numbered rows
1233
    // For odd numbered rows, Loop Restoration of current block requires the LR
1234
    // of top-right and bottom-right blocks to be completed
1235
1236
    // top-right sync
1237
21.3k
    on_sync_read(lr_sync, row_number, j, plane);
1238
21.3k
    if ((row_number + 1) < vnum_rest_units)
1239
      // bottom-right sync
1240
15.7k
      on_sync_read(lr_sync, row_number + 2, j, plane);
1241
1242
21.3k
#if CONFIG_MULTITHREAD
1243
21.3k
    if (lr_sync && lr_sync->num_workers > 1) {
1244
18.1k
      pthread_mutex_lock(lr_sync->job_mutex);
1245
18.1k
      const bool lr_mt_exit = lr_sync->lr_mt_exit;
1246
18.1k
      pthread_mutex_unlock(lr_sync->job_mutex);
1247
      // Exit in case any worker has encountered an error.
1248
18.1k
      if (lr_mt_exit) return;
1249
18.1k
    }
1250
21.3k
#endif
1251
1252
21.3k
    on_rest_unit(limits, unit_idx, priv, tmpbuf, rlbs, error_info);
1253
1254
21.3k
    on_sync_write(lr_sync, row_number, j, hnum_rest_units, plane);
1255
1256
21.3k
    x0 += w;
1257
21.3k
    ++j;
1258
21.3k
  }
1259
12.9k
}
1260
1261
23.0k
// No-op sync-read hook for callers that need no synchronisation between rows
// of restoration units. All arguments are ignored.
void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
  (void)lr_sync;
  (void)r;
  (void)c;
  (void)plane;
}
1267
1268
// No-op sync-write hook for callers that need no synchronisation between rows
// of restoration units. All arguments are ignored.
void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
                             const int sb_cols, int plane) {
  (void)lr_sync;
  (void)r;
  (void)c;
  (void)sb_cols;
  (void)plane;
}
1276
1277
// Computes the half-open range of restoration units [*rcol0, *rcol1) x
// [*rrow0, *rrow1) associated with the superblock at (mi_row, mi_col) on the
// given plane.
//
// Returns 0 without touching the outputs when bsize is not the sequence's
// superblock size. Otherwise fills in the four outputs and returns nonzero
// iff the range is non-empty in both directions. All four output pointers
// must be non-NULL (asserted).
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
                                       int *rcol0, int *rcol1, int *rrow0,
                                       int *rrow1) {
  assert(rcol0 && rcol1 && rrow0 && rrow1);

  if (bsize != cm->seq_params->sb_size) return 0;

  assert(!cm->features.all_lossless);

  const int is_uv = plane > 0;

  // Compute the mi-unit corners of the superblock
  const int mi_row0 = mi_row;
  const int mi_col0 = mi_col;
  const int mi_row1 = mi_row0 + mi_size_high[bsize];
  const int mi_col1 = mi_col0 + mi_size_wide[bsize];

  const RestorationInfo *rsi = &cm->rst_info[plane];
  const int size = rsi->restoration_unit_size;
  const int horz_units = rsi->horz_units;
  const int vert_units = rsi->vert_units;

  // The size of an MI-unit on this plane of the image
  const int ss_x = is_uv && cm->seq_params->subsampling_x;
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
  const int mi_size_x = MI_SIZE >> ss_x;
  const int mi_size_y = MI_SIZE >> ss_y;

  // Write m for the relative mi column or row, D for the superres denominator
  // and N for the superres numerator. If u is the upscaled pixel offset then
  // we can write the downscaled pixel offset in two ways as:
  //
  //   MI_SIZE * m = N / D u
  //
  // from which we get u = D * MI_SIZE * m / N
  const int mi_to_num_x = av1_superres_scaled(cm)
                              ? mi_size_x * cm->superres_scale_denominator
                              : mi_size_x;
  const int mi_to_num_y = mi_size_y;
  const int denom_x = av1_superres_scaled(cm) ? size * SCALE_NUMERATOR : size;
  const int denom_y = size;

  // Adding (denominator - 1) before dividing rounds the quotient up.
  const int rnd_x = denom_x - 1;
  const int rnd_y = denom_y - 1;

  // rcol0/rrow0 should be the first column/row of restoration units that
  // doesn't start left/below of mi_col/mi_row. For this calculation, we need
  // to round up the division (if the sb starts at runit column 10.1, the first
  // matching runit has column index 11)
  *rcol0 = (mi_col0 * mi_to_num_x + rnd_x) / denom_x;
  *rrow0 = (mi_row0 * mi_to_num_y + rnd_y) / denom_y;

  // rel_col1/rel_row1 is the equivalent calculation, but for the superblock
  // below-right. If we're at the bottom or right of the frame, this restoration
  // unit might not exist, in which case we'll clamp accordingly.
  *rcol1 = AOMMIN((mi_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units);
  *rrow1 = AOMMIN((mi_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units);

  return *rcol0 < *rcol1 && *rrow0 < *rrow1;
}
1338
1339
// Extend to left and right
1340
// Extend to left and right
//
// For each of `height` rows, replicates the row's first pixel into the
// `extend` pixels before it and the row's last pixel into the `extend` pixels
// after it.
//
// buf     First pixel of the first row. When use_highbitdepth is nonzero the
//         pointer is cast directly to uint16_t * and pixels are 16 bits wide.
// width   Row length in pixels.
// stride  Distance between rows in bytes (buf is advanced as a uint8_t *).
// extend  Number of pixels to write on each side; that much space must be
//         writable before and after every row.
static void extend_lines(uint8_t *buf, int width, int height, int stride,
                         int extend, int use_highbitdepth) {
  for (int i = 0; i < height; ++i) {
    if (use_highbitdepth) {
      uint16_t *buf16 = (uint16_t *)buf;
      aom_memset16(buf16 - extend, buf16[0], extend);
      aom_memset16(buf16 + width, buf16[width - 1], extend);
    } else {
      memset(buf - extend, buf[0], extend);
      memset(buf + width, buf[width - 1], extend);
    }
    buf += stride;
  }
}
1354
1355
static void save_deblock_boundary_lines(
1356
    const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
1357
    int stripe, int use_highbd, int is_above,
1358
70.4k
    RestorationStripeBoundaries *boundaries) {
1359
70.4k
  const int is_uv = plane > 0;
1360
70.4k
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
1361
70.4k
  const int src_stride = frame->strides[is_uv] << use_highbd;
1362
70.4k
  const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride;
1363
1364
70.4k
  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
1365
70.4k
                               : boundaries->stripe_boundary_below;
1366
70.4k
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
1367
70.4k
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
1368
70.4k
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
1369
1370
  // There is a rare case in which a processing stripe can end 1px above the
1371
  // crop border. In this case, we do want to use deblocked pixels from below
1372
  // the stripe (hence why we ended up in this function), but instead of
1373
  // fetching 2 "below" rows we need to fetch one and duplicate it.
1374
  // This is equivalent to clamping the sample locations against the crop border
1375
70.4k
  const int lines_to_save =
1376
70.4k
      AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row);
1377
70.4k
  assert(lines_to_save == 1 || lines_to_save == 2);
1378
1379
70.4k
  int upscaled_width;
1380
70.4k
  int line_bytes;
1381
70.4k
  if (av1_superres_scaled(cm)) {
1382
9.06k
    const int ss_x = is_uv && cm->seq_params->subsampling_x;
1383
9.06k
    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
1384
9.06k
    line_bytes = upscaled_width << use_highbd;
1385
9.06k
    if (use_highbd)
1386
5.36k
      av1_upscale_normative_rows(
1387
5.36k
          cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv],
1388
5.36k
          CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride,
1389
5.36k
          plane, lines_to_save);
1390
3.69k
    else
1391
3.69k
      av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows,
1392
3.69k
                                 boundaries->stripe_boundary_stride, plane,
1393
3.69k
                                 lines_to_save);
1394
61.4k
  } else {
1395
61.4k
    upscaled_width = frame->crop_widths[is_uv];
1396
61.4k
    line_bytes = upscaled_width << use_highbd;
1397
184k
    for (int i = 0; i < lines_to_save; i++) {
1398
122k
      memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride,
1399
122k
             line_bytes);
1400
122k
    }
1401
61.4k
  }
1402
  // If we only saved one line, then copy it into the second line buffer
1403
70.4k
  if (lines_to_save == 1)
1404
237
    memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
1405
1406
70.4k
  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
1407
70.4k
               RESTORATION_EXTRA_HORZ, use_highbd);
1408
70.4k
}
1409
1410
static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1411
                                     const AV1_COMMON *cm, int plane, int row,
1412
                                     int stripe, int use_highbd, int is_above,
1413
60.2k
                                     RestorationStripeBoundaries *boundaries) {
1414
60.2k
  const int is_uv = plane > 0;
1415
60.2k
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
1416
60.2k
  const int src_stride = frame->strides[is_uv] << use_highbd;
1417
60.2k
  const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride;
1418
1419
60.2k
  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
1420
60.2k
                               : boundaries->stripe_boundary_below;
1421
60.2k
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
1422
60.2k
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
1423
60.2k
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
1424
60.2k
  const int src_width = frame->crop_widths[is_uv];
1425
1426
  // At the point where this function is called, we've already applied
1427
  // superres. So we don't need to extend the lines here, we can just
1428
  // pull directly from the topmost row of the upscaled frame.
1429
60.2k
  const int ss_x = is_uv && cm->seq_params->subsampling_x;
1430
60.2k
  const int upscaled_width = av1_superres_scaled(cm)
1431
60.2k
                                 ? (cm->superres_upscaled_width + ss_x) >> ss_x
1432
60.2k
                                 : src_width;
1433
60.2k
  const int line_bytes = upscaled_width << use_highbd;
1434
180k
  for (int i = 0; i < RESTORATION_CTX_VERT; i++) {
1435
    // Copy the line at 'src_rows' into both context lines
1436
120k
    memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
1437
120k
  }
1438
60.2k
  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
1439
60.2k
               RESTORATION_EXTRA_HORZ, use_highbd);
1440
60.2k
}
1441
1442
// Walks the processing stripes of one plane from top to bottom and saves the
// context rows above and below each stripe into cm->rst_info[plane].boundaries.
//
// With 'after_cdef' zero, rows are saved with save_deblock_boundary_lines() at
// boundaries that have a stripe above (every stripe but the first) and at
// stripe bottoms that lie above the plane height. With 'after_cdef' nonzero,
// the complementary boundaries (top of the first stripe, and stripe bottoms
// not above the plane height) are saved with save_cdef_boundary_lines().
static void save_boundary_lines(const YV12_BUFFER_CONFIG *frame, int use_highbd,
                                int plane, AV1_COMMON *cm, int after_cdef) {
  const int is_uv = plane > 0;
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
  const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
  const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
  const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);

  RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries;

  // Only the height is used below; the width is a required out-parameter.
  int plane_w, plane_h;
  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);

  for (int stripe_idx = 0;; ++stripe_idx) {
    // Stripe rows are [y0, y1): shifted up by stripe_off and clamped to the
    // plane.
    const int y0 = AOMMAX(0, stripe_idx * stripe_height - stripe_off);
    if (y0 >= plane_h) break;
    const int y1 =
        AOMMIN((stripe_idx + 1) * stripe_height - stripe_off, plane_h);

    // Extend using CDEF pixels at the top and bottom of the frame,
    // and deblocked pixels at internal stripe boundaries.
    const int deblock_above = (stripe_idx > 0);
    const int deblock_below = (y1 < plane_height);

    if (after_cdef) {
      // Save CDEF context at frame boundaries.
      if (!deblock_above) {
        save_cdef_boundary_lines(frame, cm, plane, y0, stripe_idx, use_highbd,
                                 1, boundaries);
      }
      if (!deblock_below) {
        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, stripe_idx,
                                 use_highbd, 0, boundaries);
      }
    } else {
      // Save deblocked context at internal stripe boundaries.
      if (deblock_above) {
        save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
                                    stripe_idx, use_highbd, 1, boundaries);
      }
      if (deblock_below) {
        save_deblock_boundary_lines(frame, cm, plane, y1, stripe_idx,
                                    use_highbd, 0, boundaries);
      }
    }
  }
}
1493
1494
// For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
1495
// lines to be used as boundary in the loop restoration process. The
1496
// lines are saved in rst_internal.stripe_boundary_lines
1497
void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1498
30.7k
                                              AV1_COMMON *cm, int after_cdef) {
1499
30.7k
  const int num_planes = av1_num_planes(cm);
1500
30.7k
  const int use_highbd = cm->seq_params->use_highbitdepth;
1501
91.0k
  for (int p = 0; p < num_planes; ++p) {
1502
60.2k
    save_boundary_lines(frame, use_highbd, p, cm, after_cdef);
1503
60.2k
  }
1504
30.7k
}