Coverage Report

Created: 2026-04-01 07:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/av1/common/restoration.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 *
11
 */
12
13
#include <math.h>
14
#include <stddef.h>
15
16
#include "config/aom_config.h"
17
#include "config/aom_scale_rtcd.h"
18
19
#include "aom/internal/aom_codec_internal.h"
20
#include "aom_mem/aom_mem.h"
21
#include "aom_dsp/aom_dsp_common.h"
22
#include "aom_mem/aom_mem.h"
23
#include "aom_ports/mem.h"
24
#include "aom_util/aom_pthread.h"
25
26
#include "av1/common/av1_common_int.h"
27
#include "av1/common/convolve.h"
28
#include "av1/common/enums.h"
29
#include "av1/common/resize.h"
30
#include "av1/common/restoration.h"
31
#include "av1/common/thread_common.h"
32
33
// The 's' values are calculated based on original 'r' and 'e' values in the
34
// spec using GenSgrprojVtable().
35
// Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
36
// Self-guided filter parameter sets, one entry per parameter index.
// Code in this file reads each entry through params->r[radius_idx] and
// params->s[radius_idx]; the first inner pair therefore appears to be the two
// radii and the second the matching 's' values (-1 wherever the radius is 0).
const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
37
  { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
38
  { { 2, 1 }, { 93, 1618 } },  { { 2, 1 }, { 80, 1438 } },
39
  { { 2, 1 }, { 70, 1295 } },  { { 2, 1 }, { 58, 1177 } },
40
  { { 2, 1 }, { 47, 1079 } },  { { 2, 1 }, { 37, 996 } },
41
  { { 2, 1 }, { 30, 925 } },   { { 2, 1 }, { 25, 863 } },
42
  { { 0, 1 }, { -1, 2589 } },  { { 0, 1 }, { -1, 1618 } },
43
  { { 0, 1 }, { -1, 1177 } },  { { 0, 1 }, { -1, 925 } },
44
  { { 2, 0 }, { 56, -1 } },    { { 2, 0 }, { 22, -1 } },
45
};
46
47
void av1_get_upsampled_plane_size(const AV1_COMMON *cm, int is_uv, int *plane_w,
48
107k
                                  int *plane_h) {
49
107k
  int ss_x = is_uv && cm->seq_params->subsampling_x;
50
107k
  int ss_y = is_uv && cm->seq_params->subsampling_y;
51
107k
  *plane_w = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
52
107k
  *plane_h = ROUND_POWER_OF_TWO(cm->height, ss_y);
53
107k
}
54
55
// Returns how many restoration units span plane_size pixels along one axis
// (pass a width for the horizontal count, a height for the vertical count).
//
// The division rounds to nearest rather than up: the last unit in a row or
// column may stretch to as much as 150% of unit_size, so a remainder smaller
// than half a unit is absorbed by its neighbour instead of forming a new unit.
//
// The result is clamped to at least 1 so that frames smaller than half a
// restoration unit still get one unit.
int av1_lr_count_units(int unit_size, int plane_size) {
  const int rounded_count = (plane_size + (unit_size >> 1)) / unit_size;
  return (rounded_count > 1) ? rounded_count : 1;
}
66
67
// Sizes and (re)allocates the per-unit restoration info array of one plane.
// The unit grid is derived from the plane size and
// rsi->restoration_unit_size, and is stored in rsi->horz_units,
// rsi->vert_units and rsi->num_rest_units. Any previous rsi->unit_info is
// freed first; the new array is requested with 16-byte alignment and the
// allocation result is handed to CHECK_MEM_ERROR.
void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
                                  int is_uv) {
  int plane_w, plane_h;
  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);

  const int units_across =
      av1_lr_count_units(rsi->restoration_unit_size, plane_w);
  const int units_down =
      av1_lr_count_units(rsi->restoration_unit_size, plane_h);

  rsi->num_rest_units = units_across * units_down;
  rsi->horz_units = units_across;
  rsi->vert_units = units_down;

  // Drop the old array before requesting the new one.
  aom_free(rsi->unit_info);
  CHECK_MEM_ERROR(cm, rsi->unit_info,
                  (RestorationUnitInfo *)aom_memalign(
                      16, sizeof(*rsi->unit_info) * rsi->num_rest_units));
}
85
86
118k
// Releases the per-unit info array owned by rst_info and clears the pointer
// so that a later free or reallocation does not see a stale address.
void av1_free_restoration_struct(RestorationInfo *rst_info) {
  RestorationUnitInfo *const units = rst_info->unit_info;
  rst_info->unit_info = NULL;
  aom_free(units);
}
90
91
#if 0
92
// Pair of values for each sgrproj parameter:
93
// Index 0 corresponds to r[0], e[0]
94
// Index 1 corresponds to r[1], e[1]
95
int sgrproj_mtable[SGRPROJ_PARAMS][2];
96
97
static void GenSgrprojVtable(void) {
98
  for (int i = 0; i < SGRPROJ_PARAMS; ++i) {
99
    const sgr_params_type *const params = &av1_sgr_params[i];
100
    for (int j = 0; j < 2; ++j) {
101
      const int e = params->e[j];
102
      const int r = params->r[j];
103
      if (r == 0) {                 // filter is disabled
104
        sgrproj_mtable[i][j] = -1;  // mark invalid
105
      } else {                      // filter is enabled
106
        const int n = (2 * r + 1) * (2 * r + 1);
107
        const int n2e = n * n * e;
108
        assert(n2e != 0);
109
        sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
110
      }
111
    }
112
  }
113
}
114
#endif
115
116
39.3k
// Precalculation hook for loop restoration. The only statement in the body is
// a call to GenSgrprojVtable() that is compiled out with #if 0, so as built
// this function does nothing.
void av1_loop_restoration_precal(void) {
117
#if 0
118
  GenSgrprojVtable();
119
#endif
120
39.3k
}
121
122
// Pads an 8-bit image in place by replicating its edge pixels.
// data points at the top-left pixel of a width x height image whose rows are
// stride bytes apart. Every row is first extended by border_horz pixels on
// both sides; the resulting top and bottom rows (including their horizontal
// padding) are then copied border_vert times above and below the image.
// The caller must own the memory for that padding.
static void extend_frame_lowbd(uint8_t *data, int width, int height,
                               ptrdiff_t stride, int border_horz,
                               int border_vert) {
  // Left/right padding, one image row at a time.
  for (int row = 0; row < height; ++row) {
    uint8_t *const line = data + row * stride;
    memset(line - border_horz, line[0], border_horz);
    memset(line + width, line[width - 1], border_horz);
  }

  // From here on, rows are addressed from the start of the left padding.
  uint8_t *const padded = data - border_horz;

  // Rows above the image all replicate the first padded row.
  for (int row = -border_vert; row < 0; ++row) {
    memcpy(padded + row * stride, padded, width + 2 * border_horz);
  }

  // Rows below the image all replicate the last padded row.
  for (int extra = 0; extra < border_vert; ++extra) {
    memcpy(padded + (height + extra) * stride, padded + (height - 1) * stride,
           width + 2 * border_horz);
  }
}
141
142
#if CONFIG_AV1_HIGHBITDEPTH
143
// 16-bit counterpart of the low-bitdepth frame extension.
// data points at the top-left sample of a width x height image whose rows are
// stride samples apart. Each row is extended by border_horz samples on both
// sides using its first/last sample; the padded top and bottom rows are then
// replicated border_vert times above and below the image.
static void extend_frame_highbd(uint16_t *data, int width, int height,
                                ptrdiff_t stride, int border_horz,
                                int border_vert) {
  // Left/right padding, one image row at a time.
  for (int row = 0; row < height; ++row) {
    uint16_t *const line = data + row * stride;
    for (int col = -border_horz; col < 0; ++col) {
      line[col] = line[0];
    }
    for (int col = width; col < width + border_horz; ++col) {
      line[col] = line[width - 1];
    }
  }

  // From here on, rows are addressed from the start of the left padding.
  uint16_t *const padded = data - border_horz;

  // Rows above the image all replicate the first padded row.
  for (int row = -border_vert; row < 0; ++row) {
    memcpy(padded + row * stride, padded,
           (width + 2 * border_horz) * sizeof(uint16_t));
  }

  // Rows below the image all replicate the last padded row.
  for (int extra = 0; extra < border_vert; ++extra) {
    memcpy(padded + (height + extra) * stride, padded + (height - 1) * stride,
           (width + 2 * border_horz) * sizeof(uint16_t));
  }
}
163
164
// Copies a width x height rectangle of 16-bit samples from src to dst.
// Strides are expressed in samples; only the first `width` samples of each
// destination row are written.
static void copy_rest_unit_highbd(int width, int height, const uint16_t *src,
                                  int src_stride, uint16_t *dst,
                                  int dst_stride) {
  const size_t row_bytes = width * sizeof(*dst);
  for (int row = 0; row < height; ++row) {
    const uint16_t *const from = src + row * src_stride;
    uint16_t *const to = dst + row * dst_stride;
    memcpy(to, from, row_bytes);
  }
}
170
#endif
171
172
// Extends a frame buffer by border_horz columns and border_vert rows on each
// side, dispatching on sample size. When high-bitdepth support is compiled in
// and highbd is nonzero, data is converted with CONVERT_TO_SHORTPTR and the
// 16-bit routine runs; in every other case the 8-bit routine runs on data
// directly.
void av1_extend_frame(uint8_t *data, int width, int height, int stride,
                      int border_horz, int border_vert, int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (!highbd) {
    extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
  } else {
    extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
                        border_horz, border_vert);
  }
#else
  (void)highbd;
  extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
#endif
}
184
185
// Copies a width x height rectangle of 8-bit pixels from src to dst.
// Strides are in bytes; only the first `width` bytes of each destination row
// are written.
static void copy_rest_unit_lowbd(int width, int height, const uint8_t *src,
                                 int src_stride, uint8_t *dst, int dst_stride) {
  for (int row = 0; row < height; ++row) {
    const uint8_t *const from = src + row * src_stride;
    uint8_t *const to = dst + row * dst_stride;
    memcpy(to, from, width);
  }
}
190
191
// Copies a width x height rectangle from src to dst without filtering,
// dispatching on sample size. When high-bitdepth support is compiled in and
// highbd is nonzero, both pointers are converted with CONVERT_TO_SHORTPTR and
// the 16-bit copy runs; otherwise the 8-bit copy runs on the pointers as
// given.
static void copy_rest_unit(int width, int height, const uint8_t *src,
                           int src_stride, uint8_t *dst, int dst_stride,
                           int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (!highbd) {
    copy_rest_unit_lowbd(width, height, src, src_stride, dst, dst_stride);
  } else {
    copy_rest_unit_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
                          CONVERT_TO_SHORTPTR(dst), dst_stride);
  }
#else
  (void)highbd;
  copy_rest_unit_lowbd(width, height, src, src_stride, dst, dst_stride);
#endif
}
204
205
1.03M
// Yields the pointer to hand to memcpy() for buffer `d`: when `hbd` is
// nonzero, `d` is passed through CONVERT_TO_SHORTPTR and cast back to
// uint8_t *; otherwise `d` is used unchanged.
#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
206
207
// With striped loop restoration, the filtering for each 64-pixel stripe gets
208
// most of its input from the output of CDEF (stored in data8), but we need to
209
// fill out a border of 3 pixels above/below the stripe according to the
210
// following rules:
211
//
212
// * At the top and bottom of the frame, we copy the outermost row of CDEF
213
//   pixels three times. This extension is done by a call to av1_extend_frame()
214
//   at the start of the loop restoration process, so the value of
215
//   copy_above/copy_below doesn't strictly matter.
216
//
217
// * All other boundaries are stripe boundaries within the frame. In that case,
218
//   we take 2 rows of deblocked pixels and extend them to 3 rows of context.
219
static void get_stripe_boundary_info(const RestorationTileLimits *limits,
220
                                     int plane_w, int plane_h, int ss_y,
221
55.7k
                                     int *copy_above, int *copy_below) {
222
55.7k
  (void)plane_w;
223
224
55.7k
  *copy_above = 1;
225
55.7k
  *copy_below = 1;
226
227
55.7k
  const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
228
55.7k
  const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
229
230
55.7k
  const int first_stripe_in_plane = (limits->v_start == 0);
231
55.7k
  const int this_stripe_height =
232
55.7k
      full_stripe_height - (first_stripe_in_plane ? runit_offset : 0);
233
55.7k
  const int last_stripe_in_plane =
234
55.7k
      (limits->v_start + this_stripe_height >= plane_h);
235
236
55.7k
  if (first_stripe_in_plane) *copy_above = 0;
237
55.7k
  if (last_stripe_in_plane) *copy_below = 0;
238
55.7k
}
239
240
// Overwrite the border pixels around a processing stripe so that the conditions
241
// listed above get_stripe_boundary_info() are preserved.
242
// We save the pixels which get overwritten into a temporary buffer, so that
243
// they can be restored by restore_processing_stripe_boundary() after we've
244
// processed the stripe.
245
//
246
// limits gives the rectangular limits of the remaining stripes for the current
247
// restoration unit. rsb is the stored stripe boundaries (taken from either
248
// deblock or CDEF output as necessary).
249
static void setup_processing_stripe_boundary(
250
    const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
251
    int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
252
55.7k
    RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
253
  // Offsets within the line buffers. The buffer logically starts at column
254
  // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
255
  // has column x0 in the buffer.
256
55.7k
  const int buf_stride = rsb->stripe_boundary_stride;
257
55.7k
  const int buf_x0_off = limits->h_start;
258
55.7k
  const int line_width =
259
55.7k
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
260
55.7k
  const int line_size = line_width << use_highbd;
261
262
55.7k
  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
263
264
  // Replace RESTORATION_BORDER pixels above the top of the stripe
265
  // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
266
  // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
267
  // duplicating the topmost of the 2 lines (see the AOMMAX call when
268
  // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
269
55.7k
  if (!opt) {
270
42.6k
    if (copy_above) {
271
39.6k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
272
273
158k
      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
274
118k
        const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
275
118k
        const int buf_off = buf_x0_off + buf_row * buf_stride;
276
118k
        const uint8_t *buf =
277
118k
            rsb->stripe_boundary_above + (buf_off << use_highbd);
278
118k
        uint8_t *dst8 = data8_tl + i * data_stride;
279
        // Save old pixels, then replace with data from stripe_boundary_above
280
118k
        memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
281
118k
               REAL_PTR(use_highbd, dst8), line_size);
282
118k
        memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
283
118k
      }
284
39.6k
    }
285
286
    // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
287
    // The second buffer row is repeated, so src_row gets the values 0, 1, 1
288
    // for i = 0, 1, 2.
289
42.6k
    if (copy_below) {
290
35.1k
      const int stripe_end = limits->v_start + h;
291
35.1k
      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
292
293
140k
      for (int i = 0; i < RESTORATION_BORDER; ++i) {
294
105k
        const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
295
105k
        const int buf_off = buf_x0_off + buf_row * buf_stride;
296
105k
        const uint8_t *src =
297
105k
            rsb->stripe_boundary_below + (buf_off << use_highbd);
298
299
105k
        uint8_t *dst8 = data8_bl + i * data_stride;
300
        // Save old pixels, then replace with data from stripe_boundary_below
301
105k
        memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
302
105k
        memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
303
105k
      }
304
35.1k
    }
305
42.6k
  } else {
306
13.1k
    if (copy_above) {
307
12.9k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
308
309
      // Only save and overwrite i=-RESTORATION_BORDER line.
310
12.9k
      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
311
      // Save old pixels, then replace with data from stripe_boundary_above
312
12.9k
      memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
313
12.9k
      memcpy(REAL_PTR(use_highbd, dst8),
314
12.9k
             REAL_PTR(use_highbd,
315
12.9k
                      data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
316
12.9k
             line_size);
317
12.9k
    }
318
319
13.1k
    if (copy_below) {
320
12.9k
      const int stripe_end = limits->v_start + h;
321
12.9k
      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
322
323
      // Only save and overwrite i=2 line.
324
12.9k
      uint8_t *dst8 = data8_bl + 2 * data_stride;
325
      // Save old pixels, then replace with data from stripe_boundary_below
326
12.9k
      memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
327
12.9k
      memcpy(REAL_PTR(use_highbd, dst8),
328
12.9k
             REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
329
12.9k
    }
330
13.1k
  }
331
55.7k
}
332
333
// Once a processing stripe is finished, this function sets the boundary
334
// pixels which were overwritten by setup_processing_stripe_boundary()
335
// back to their original values
336
static void restore_processing_stripe_boundary(
337
    const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
338
    int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
339
55.7k
    int copy_below, int opt) {
340
55.7k
  const int line_width =
341
55.7k
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
342
55.7k
  const int line_size = line_width << use_highbd;
343
344
55.7k
  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
345
346
55.7k
  if (!opt) {
347
42.6k
    if (copy_above) {
348
39.6k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
349
158k
      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
350
118k
        uint8_t *dst8 = data8_tl + i * data_stride;
351
118k
        memcpy(REAL_PTR(use_highbd, dst8),
352
118k
               rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
353
118k
      }
354
39.6k
    }
355
356
42.6k
    if (copy_below) {
357
35.1k
      const int stripe_bottom = limits->v_start + h;
358
35.1k
      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
359
360
140k
      for (int i = 0; i < RESTORATION_BORDER; ++i) {
361
105k
        if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
362
363
105k
        uint8_t *dst8 = data8_bl + i * data_stride;
364
105k
        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
365
105k
      }
366
35.1k
    }
367
42.6k
  } else {
368
13.1k
    if (copy_above) {
369
12.9k
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
370
371
      // Only restore i=-RESTORATION_BORDER line.
372
12.9k
      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
373
12.9k
      memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
374
12.9k
    }
375
376
13.1k
    if (copy_below) {
377
12.9k
      const int stripe_bottom = limits->v_start + h;
378
12.9k
      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
379
380
      // Only restore i=2 line.
381
12.9k
      if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
382
12.9k
        uint8_t *dst8 = data8_bl + 2 * data_stride;
383
12.9k
        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
384
12.9k
      }
385
12.9k
    }
386
13.1k
  }
387
55.7k
}
388
389
static void wiener_filter_stripe(const RestorationUnitInfo *rui,
390
                                 int stripe_width, int stripe_height,
391
                                 int procunit_width, const uint8_t *src,
392
                                 int src_stride, uint8_t *dst, int dst_stride,
393
                                 int32_t *tmpbuf, int bit_depth,
394
13.5k
                                 struct aom_internal_error_info *error_info) {
395
13.5k
  (void)tmpbuf;
396
13.5k
  (void)bit_depth;
397
13.5k
  (void)error_info;
398
13.5k
  assert(bit_depth == 8);
399
13.5k
  const WienerConvolveParams conv_params = get_conv_params_wiener(8);
400
401
29.7k
  for (int j = 0; j < stripe_width; j += procunit_width) {
402
16.2k
    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
403
16.2k
    const uint8_t *src_p = src + j;
404
16.2k
    uint8_t *dst_p = dst + j;
405
16.2k
    av1_wiener_convolve_add_src(
406
16.2k
        src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
407
16.2k
        rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
408
16.2k
  }
409
13.5k
}
410
411
/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
   over the input. The window is of size (2r + 1)x(2r + 1), and we
   specialize to r = 1 and r = 2. Any other radius is rejected by an
   assertion in boxsum().

   Each loop follows the same format: We keep a window's worth of input
   in individual variables and select data out of that as appropriate.
*/
418
static void boxsum1(int32_t *src, int width, int height, int src_stride,
419
29.6k
                    int sqr, int32_t *dst, int dst_stride) {
420
29.6k
  int i, j, a, b, c;
421
29.6k
  assert(width > 2 * SGRPROJ_BORDER_HORZ);
422
29.6k
  assert(height > 2 * SGRPROJ_BORDER_VERT);
423
424
  // Vertical sum over 3-pixel regions, from src into dst.
425
29.6k
  if (!sqr) {
426
456k
    for (j = 0; j < width; ++j) {
427
441k
      a = src[j];
428
441k
      b = src[src_stride + j];
429
441k
      c = src[2 * src_stride + j];
430
431
441k
      dst[j] = a + b;
432
21.7M
      for (i = 1; i < height - 2; ++i) {
433
        // Loop invariant: At the start of each iteration,
434
        // a = src[(i - 1) * src_stride + j]
435
        // b = src[(i    ) * src_stride + j]
436
        // c = src[(i + 1) * src_stride + j]
437
21.3M
        dst[i * dst_stride + j] = a + b + c;
438
21.3M
        a = b;
439
21.3M
        b = c;
440
21.3M
        c = src[(i + 2) * src_stride + j];
441
21.3M
      }
442
441k
      dst[i * dst_stride + j] = a + b + c;
443
441k
      dst[(i + 1) * dst_stride + j] = b + c;
444
441k
    }
445
14.8k
  } else {
446
456k
    for (j = 0; j < width; ++j) {
447
441k
      a = src[j] * src[j];
448
441k
      b = src[src_stride + j] * src[src_stride + j];
449
441k
      c = src[2 * src_stride + j] * src[2 * src_stride + j];
450
451
441k
      dst[j] = a + b;
452
21.7M
      for (i = 1; i < height - 2; ++i) {
453
21.3M
        dst[i * dst_stride + j] = a + b + c;
454
21.3M
        a = b;
455
21.3M
        b = c;
456
21.3M
        c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
457
21.3M
      }
458
441k
      dst[i * dst_stride + j] = a + b + c;
459
441k
      dst[(i + 1) * dst_stride + j] = b + c;
460
441k
    }
461
14.8k
  }
462
463
  // Horizontal sum over 3-pixel regions of dst
464
1.66M
  for (i = 0; i < height; ++i) {
465
1.63M
    a = dst[i * dst_stride];
466
1.63M
    b = dst[i * dst_stride + 1];
467
1.63M
    c = dst[i * dst_stride + 2];
468
469
1.63M
    dst[i * dst_stride] = a + b;
470
42.0M
    for (j = 1; j < width - 2; ++j) {
471
      // Loop invariant: At the start of each iteration,
472
      // a = src[i * src_stride + (j - 1)]
473
      // b = src[i * src_stride + (j    )]
474
      // c = src[i * src_stride + (j + 1)]
475
40.4M
      dst[i * dst_stride + j] = a + b + c;
476
40.4M
      a = b;
477
40.4M
      b = c;
478
40.4M
      c = dst[i * dst_stride + (j + 2)];
479
40.4M
    }
480
1.63M
    dst[i * dst_stride + j] = a + b + c;
481
1.63M
    dst[i * dst_stride + (j + 1)] = b + c;
482
1.63M
  }
483
29.6k
}
484
485
static void boxsum2(int32_t *src, int width, int height, int src_stride,
486
26.4k
                    int sqr, int32_t *dst, int dst_stride) {
487
26.4k
  int i, j, a, b, c, d, e;
488
26.4k
  assert(width > 2 * SGRPROJ_BORDER_HORZ);
489
26.4k
  assert(height > 2 * SGRPROJ_BORDER_VERT);
490
491
  // Vertical sum over 5-pixel regions, from src into dst.
492
26.4k
  if (!sqr) {
493
393k
    for (j = 0; j < width; ++j) {
494
380k
      a = src[j];
495
380k
      b = src[src_stride + j];
496
380k
      c = src[2 * src_stride + j];
497
380k
      d = src[3 * src_stride + j];
498
380k
      e = src[4 * src_stride + j];
499
500
380k
      dst[j] = a + b + c;
501
380k
      dst[dst_stride + j] = a + b + c + d;
502
18.8M
      for (i = 2; i < height - 3; ++i) {
503
        // Loop invariant: At the start of each iteration,
504
        // a = src[(i - 2) * src_stride + j]
505
        // b = src[(i - 1) * src_stride + j]
506
        // c = src[(i    ) * src_stride + j]
507
        // d = src[(i + 1) * src_stride + j]
508
        // e = src[(i + 2) * src_stride + j]
509
18.4M
        dst[i * dst_stride + j] = a + b + c + d + e;
510
18.4M
        a = b;
511
18.4M
        b = c;
512
18.4M
        c = d;
513
18.4M
        d = e;
514
18.4M
        e = src[(i + 3) * src_stride + j];
515
18.4M
      }
516
380k
      dst[i * dst_stride + j] = a + b + c + d + e;
517
380k
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
518
380k
      dst[(i + 2) * dst_stride + j] = c + d + e;
519
380k
    }
520
13.2k
  } else {
521
393k
    for (j = 0; j < width; ++j) {
522
380k
      a = src[j] * src[j];
523
380k
      b = src[src_stride + j] * src[src_stride + j];
524
380k
      c = src[2 * src_stride + j] * src[2 * src_stride + j];
525
380k
      d = src[3 * src_stride + j] * src[3 * src_stride + j];
526
380k
      e = src[4 * src_stride + j] * src[4 * src_stride + j];
527
528
380k
      dst[j] = a + b + c;
529
380k
      dst[dst_stride + j] = a + b + c + d;
530
18.8M
      for (i = 2; i < height - 3; ++i) {
531
18.4M
        dst[i * dst_stride + j] = a + b + c + d + e;
532
18.4M
        a = b;
533
18.4M
        b = c;
534
18.4M
        c = d;
535
18.4M
        d = e;
536
18.4M
        e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
537
18.4M
      }
538
380k
      dst[i * dst_stride + j] = a + b + c + d + e;
539
380k
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
540
380k
      dst[(i + 2) * dst_stride + j] = c + d + e;
541
380k
    }
542
13.2k
  }
543
544
  // Horizontal sum over 5-pixel regions of dst
545
1.54M
  for (i = 0; i < height; ++i) {
546
1.51M
    a = dst[i * dst_stride];
547
1.51M
    b = dst[i * dst_stride + 1];
548
1.51M
    c = dst[i * dst_stride + 2];
549
1.51M
    d = dst[i * dst_stride + 3];
550
1.51M
    e = dst[i * dst_stride + 4];
551
552
1.51M
    dst[i * dst_stride] = a + b + c;
553
1.51M
    dst[i * dst_stride + 1] = a + b + c + d;
554
34.5M
    for (j = 2; j < width - 3; ++j) {
555
      // Loop invariant: At the start of each iteration,
556
      // a = src[i * src_stride + (j - 2)]
557
      // b = src[i * src_stride + (j - 1)]
558
      // c = src[i * src_stride + (j    )]
559
      // d = src[i * src_stride + (j + 1)]
560
      // e = src[i * src_stride + (j + 2)]
561
33.0M
      dst[i * dst_stride + j] = a + b + c + d + e;
562
33.0M
      a = b;
563
33.0M
      b = c;
564
33.0M
      c = d;
565
33.0M
      d = e;
566
33.0M
      e = dst[i * dst_stride + (j + 3)];
567
33.0M
    }
568
1.51M
    dst[i * dst_stride + j] = a + b + c + d + e;
569
1.51M
    dst[i * dst_stride + (j + 1)] = b + c + d + e;
570
1.51M
    dst[i * dst_stride + (j + 2)] = c + d + e;
571
1.51M
  }
572
26.4k
}
573
574
// Dispatches to the box-sum routine for radius r. Only r == 1 and r == 2 are
// implemented; any other radius trips an assertion and otherwise leaves dst
// untouched.
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                   int sqr, int32_t *dst, int dst_stride) {
  switch (r) {
    case 1:
      boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    case 2:
      boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    default: assert(0 && "Invalid value of r in self-guided filter"); break;
  }
}
583
584
17.6k
void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
585
17.6k
  if (params->r[0] == 0) {
586
4.46k
    xq[0] = 0;
587
4.46k
    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
588
13.2k
  } else if (params->r[1] == 0) {
589
2.86k
    xq[0] = xqd[0];
590
2.86k
    xq[1] = 0;
591
10.3k
  } else {
592
10.3k
    xq[0] = xqd[0];
593
10.3k
    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
594
10.3k
  }
595
17.6k
}
596
597
// Lookup table indexed by z in [0, 255]. Apart from the first and last
// entries, the values match round(256 * z / (z + 1)). Entry 0 is 1 (see the
// note inside the table) and entry 255 is 256. The visible user in this file
// clamps the index with AOMMIN(z, 255) and treats the result as a value in
// [1, 256].
const int32_t av1_x_by_xplus1[256] = {
598
  // Special case: Map 0 -> 1 (corresponding to a value of 1/256)
599
  // instead of 0. See comments in selfguided_restoration_internal() for why
600
  1,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
601
  240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
602
  248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
603
  250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
604
  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
605
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
606
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
607
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
608
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
609
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
610
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
611
  254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
612
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
613
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
614
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
615
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
616
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
617
  256,
618
};
619
620
// Fixed-point reciprocals: entry n - 1 holds round(2^12 / n). The initializer
// lists 25 values (n = 1..25); the visible user indexes it with n - 1 where
// n = (2r + 1)^2 is the number of pixels in the filter window.
const int32_t av1_one_by_x[MAX_NELEM] = {
621
  4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
622
  293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
623
};
624
625
static void calculate_intermediate_result(int32_t *dgd, int width, int height,
626
                                          int dgd_stride, int bit_depth,
627
                                          int sgr_params_idx, int radius_idx,
628
28.0k
                                          int pass, int32_t *A, int32_t *B) {
629
28.0k
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
630
28.0k
  const int r = params->r[radius_idx];
631
28.0k
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
632
28.0k
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
633
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
634
  // leading to a significant speed improvement.
635
  // We also align the stride to a multiple of 16 bytes, for consistency
636
  // with the SIMD version of this function.
637
28.0k
  int buf_stride = ((width_ext + 3) & ~3) + 16;
638
28.0k
  const int step = pass == 0 ? 1 : 2;
639
28.0k
  int i, j;
640
641
28.0k
  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
642
28.0k
  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
643
28.0k
         "Need SGRPROJ_BORDER_* >= r+1");
644
645
28.0k
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
646
28.0k
         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
647
28.0k
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
648
28.0k
         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
649
28.0k
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
650
28.0k
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
651
  // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
652
  // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
653
1.14M
  for (i = -1; i < height + 1; i += step) {
654
26.9M
    for (j = -1; j < width + 1; ++j) {
655
25.8M
      const int k = i * buf_stride + j;
656
25.8M
      const int n = (2 * r + 1) * (2 * r + 1);
657
658
      // a < 2^16 * n < 2^22 regardless of bit depth
659
25.8M
      uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
660
      // b < 2^8 * n < 2^14 regardless of bit depth
661
25.8M
      uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
662
663
      // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
664
      // and p itself satisfies p < 2^14 * n^2 < 2^26.
665
      // This bound on p is due to:
666
      // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
667
      //
668
      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
669
      // This is an artefact of rounding, and can only happen if all pixels
670
      // are (almost) identical, so in this case we saturate to p=0.
671
25.8M
      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
672
673
25.8M
      const uint32_t s = params->s[radius_idx];
674
675
      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
676
      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
677
      // (this holds even after accounting for the rounding in s)
678
25.8M
      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
679
680
      // Note: We have to be quite careful about the value of A[k].
681
      // This is used as a blend factor between individual pixel values and the
682
      // local mean. So it logically has a range of [0, 256], including both
683
      // endpoints.
684
      //
685
      // This is a pain for hardware, as we'd like something which can be stored
686
      // in exactly 8 bits.
687
      // Further, in the calculation of B[k] below, if z == 0 and r == 2,
688
      // then A[k] "should be" 0. But then we can end up setting B[k] to a value
689
      // slightly above 2^(8 + bit depth), due to rounding in the value of
690
      // av1_one_by_x[25-1].
691
      //
692
      // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
693
      // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
694
      // overflow), without significantly affecting the final result: z == 0
695
      // implies that the image is essentially "flat", so the local mean and
696
      // individual pixel values are very similar.
697
      //
698
      // Note that saturating on the other side, ie. requring A[k] <= 255,
699
      // would be a bad idea, as that corresponds to the case where the image
700
      // is very variable, when we want to preserve the local pixel value as
701
      // much as possible.
702
25.8M
      A[k] = av1_x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
703
704
      // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
705
      // av1_one_by_x[n - 1] = round(2^12 / n)
706
      // => the product here is < 2^(20 + bit_depth) <= 2^32,
707
      // and B[k] is set to a value < 2^(8 + bit depth)
708
      // This holds even with the rounding in av1_one_by_x and in the overall
709
      // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
710
25.8M
      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
711
25.8M
                                             (uint32_t)B[k] *
712
25.8M
                                             (uint32_t)av1_one_by_x[n - 1],
713
25.8M
                                         SGRPROJ_RECIP_BITS);
714
25.8M
    }
715
1.11M
  }
716
28.0k
}
717
718
static void selfguided_restoration_fast_internal(
719
    int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
720
13.2k
    int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
721
13.2k
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
722
13.2k
  const int r = params->r[radius_idx];
723
13.2k
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
724
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
725
  // leading to a significant speed improvement.
726
  // We also align the stride to a multiple of 16 bytes, for consistency
727
  // with the SIMD version of this function.
728
13.2k
  int buf_stride = ((width_ext + 3) & ~3) + 16;
729
13.2k
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
730
13.2k
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
731
13.2k
  int32_t *A = A_;
732
13.2k
  int32_t *B = B_;
733
13.2k
  int i, j;
734
13.2k
  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
735
13.2k
                                sgr_params_idx, radius_idx, 1, A, B);
736
13.2k
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
737
13.2k
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
738
739
  // Use the A[] and B[] arrays to calculate the filtered image
740
13.2k
  (void)r;
741
13.2k
  assert(r == 2);
742
693k
  for (i = 0; i < height; ++i) {
743
680k
    if (!(i & 1)) {  // even row
744
7.32M
      for (j = 0; j < width; ++j) {
745
6.98M
        const int k = i * buf_stride + j;
746
6.98M
        const int l = i * dgd_stride + j;
747
6.98M
        const int m = i * dst_stride + j;
748
6.98M
        const int nb = 5;
749
6.98M
        const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
750
6.98M
                          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
751
6.98M
                           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
752
6.98M
                              5;
753
6.98M
        const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
754
6.98M
                          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
755
6.98M
                           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
756
6.98M
                              5;
757
6.98M
        const int32_t v = a * dgd[l] + b;
758
6.98M
        dst[m] =
759
6.98M
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
760
6.98M
      }
761
340k
    } else {  // odd row
762
7.31M
      for (j = 0; j < width; ++j) {
763
6.97M
        const int k = i * buf_stride + j;
764
6.97M
        const int l = i * dgd_stride + j;
765
6.97M
        const int m = i * dst_stride + j;
766
6.97M
        const int nb = 4;
767
6.97M
        const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
768
6.97M
        const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
769
6.97M
        const int32_t v = a * dgd[l] + b;
770
6.97M
        dst[m] =
771
6.97M
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
772
6.97M
      }
773
339k
    }
774
680k
  }
775
13.2k
}
776
777
static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
778
                                            int dgd_stride, int32_t *dst,
779
                                            int dst_stride, int bit_depth,
780
                                            int sgr_params_idx,
781
14.8k
                                            int radius_idx) {
782
14.8k
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
783
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
784
  // leading to a significant speed improvement.
785
  // We also align the stride to a multiple of 16 bytes, for consistency
786
  // with the SIMD version of this function.
787
14.8k
  int buf_stride = ((width_ext + 3) & ~3) + 16;
788
14.8k
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
789
14.8k
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
790
14.8k
  int32_t *A = A_;
791
14.8k
  int32_t *B = B_;
792
14.8k
  int i, j;
793
14.8k
  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
794
14.8k
                                sgr_params_idx, radius_idx, 0, A, B);
795
14.8k
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
796
14.8k
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
797
798
  // Use the A[] and B[] arrays to calculate the filtered image
799
744k
  for (i = 0; i < height; ++i) {
800
16.3M
    for (j = 0; j < width; ++j) {
801
15.6M
      const int k = i * buf_stride + j;
802
15.6M
      const int l = i * dgd_stride + j;
803
15.6M
      const int m = i * dst_stride + j;
804
15.6M
      const int nb = 5;
805
15.6M
      const int32_t a =
806
15.6M
          (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
807
15.6M
              4 +
808
15.6M
          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
809
15.6M
           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
810
15.6M
              3;
811
15.6M
      const int32_t b =
812
15.6M
          (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
813
15.6M
              4 +
814
15.6M
          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
815
15.6M
           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
816
15.6M
              3;
817
15.6M
      const int32_t v = a * dgd[l] + b;
818
15.6M
      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
819
15.6M
    }
820
729k
  }
821
14.8k
}
822
823
int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
824
                                 int dgd_stride, int32_t *flt0, int32_t *flt1,
825
                                 int flt_stride, int sgr_params_idx,
826
17.6k
                                 int bit_depth, int highbd) {
827
17.6k
  int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
828
17.6k
  const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
829
17.6k
  int32_t *dgd32 =
830
17.6k
      dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
831
832
17.6k
  if (highbd) {
833
5.94k
    const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
834
332k
    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
835
9.50M
      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
836
9.18M
        dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
837
9.18M
      }
838
326k
    }
839
11.7k
  } else {
840
688k
    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
841
18.5M
      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
842
17.9M
        dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
843
17.9M
      }
844
677k
    }
845
11.7k
  }
846
847
17.6k
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
848
  // If params->r == 0 we skip the corresponding filter. We only allow one of
849
  // the radii to be 0, as having both equal to 0 would be equivalent to
850
  // skipping SGR entirely.
851
17.6k
  assert(!(params->r[0] == 0 && params->r[1] == 0));
852
853
17.6k
  if (params->r[0] > 0)
854
13.2k
    selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
855
13.2k
                                         flt0, flt_stride, bit_depth,
856
13.2k
                                         sgr_params_idx, 0);
857
17.6k
  if (params->r[1] > 0)
858
14.8k
    selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
859
14.8k
                                    flt_stride, bit_depth, sgr_params_idx, 1);
860
17.6k
  return 0;
861
17.6k
}
862
863
int av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
864
                                       int height, int stride, int eps,
865
                                       const int *xqd, uint8_t *dst8,
866
                                       int dst_stride, int32_t *tmpbuf,
867
17.6k
                                       int bit_depth, int highbd) {
868
17.6k
  int32_t *flt0 = tmpbuf;
869
17.6k
  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
870
17.6k
  assert(width * height <= RESTORATION_UNITPELS_MAX);
871
872
17.6k
  const int ret = av1_selfguided_restoration_c(
873
17.6k
      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
874
17.6k
  if (ret != 0) return ret;
875
17.6k
  const sgr_params_type *const params = &av1_sgr_params[eps];
876
17.6k
  int xq[2];
877
17.6k
  av1_decode_xq(xqd, xq, params);
878
915k
  for (int i = 0; i < height; ++i) {
879
19.5M
    for (int j = 0; j < width; ++j) {
880
18.6M
      const int k = i * width + j;
881
18.6M
      uint8_t *dst8ij = dst8 + i * dst_stride + j;
882
18.6M
      const uint8_t *dat8ij = dat8 + i * stride + j;
883
884
18.6M
      const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
885
18.6M
      const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
886
18.6M
      int32_t v = u << SGRPROJ_PRJ_BITS;
887
      // If params->r == 0 then we skipped the filtering in
888
      // av1_selfguided_restoration_c, i.e. flt[k] == u
889
18.6M
      if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
890
18.6M
      if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
891
18.6M
      const int16_t w =
892
18.6M
          (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
893
894
18.6M
      const uint16_t out = clip_pixel_highbd(w, bit_depth);
895
18.6M
      if (highbd)
896
6.36M
        *CONVERT_TO_SHORTPTR(dst8ij) = out;
897
12.2M
      else
898
12.2M
        *dst8ij = (uint8_t)out;
899
18.6M
    }
900
897k
  }
901
17.6k
  return 0;
902
17.6k
}
903
904
static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
905
                                  int stripe_width, int stripe_height,
906
                                  int procunit_width, const uint8_t *src,
907
                                  int src_stride, uint8_t *dst, int dst_stride,
908
                                  int32_t *tmpbuf, int bit_depth,
909
9.44k
                                  struct aom_internal_error_info *error_info) {
910
9.44k
  (void)bit_depth;
911
9.44k
  assert(bit_depth == 8);
912
913
21.1k
  for (int j = 0; j < stripe_width; j += procunit_width) {
914
11.7k
    int w = AOMMIN(procunit_width, stripe_width - j);
915
11.7k
    if (av1_apply_selfguided_restoration(
916
11.7k
            src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
917
11.7k
            rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth,
918
11.7k
            0) != 0) {
919
0
      aom_internal_error(
920
0
          error_info, AOM_CODEC_MEM_ERROR,
921
0
          "Error allocating buffer in av1_apply_selfguided_restoration");
922
0
    }
923
11.7k
  }
924
9.44k
}
925
926
#if CONFIG_AV1_HIGHBITDEPTH
927
static void wiener_filter_stripe_highbd(
928
    const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
929
    int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
930
    int dst_stride, int32_t *tmpbuf, int bit_depth,
931
28.4k
    struct aom_internal_error_info *error_info) {
932
28.4k
  (void)tmpbuf;
933
28.4k
  (void)error_info;
934
28.4k
  const WienerConvolveParams conv_params = get_conv_params_wiener(bit_depth);
935
936
59.2k
  for (int j = 0; j < stripe_width; j += procunit_width) {
937
30.8k
    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
938
30.8k
    const uint8_t *src8_p = src8 + j;
939
30.8k
    uint8_t *dst8_p = dst8 + j;
940
30.8k
    av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
941
30.8k
                                       rui->wiener_info.hfilter, 16,
942
30.8k
                                       rui->wiener_info.vfilter, 16, w,
943
30.8k
                                       stripe_height, &conv_params, bit_depth);
944
30.8k
  }
945
28.4k
}
946
947
static void sgrproj_filter_stripe_highbd(
948
    const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
949
    int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
950
    int dst_stride, int32_t *tmpbuf, int bit_depth,
951
4.39k
    struct aom_internal_error_info *error_info) {
952
10.3k
  for (int j = 0; j < stripe_width; j += procunit_width) {
953
5.94k
    int w = AOMMIN(procunit_width, stripe_width - j);
954
5.94k
    if (av1_apply_selfguided_restoration(
955
5.94k
            src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
956
5.94k
            rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth,
957
5.94k
            1) != 0) {
958
0
      aom_internal_error(
959
0
          error_info, AOM_CODEC_MEM_ERROR,
960
0
          "Error allocating buffer in av1_apply_selfguided_restoration");
961
0
    }
962
5.94k
  }
963
4.39k
}
964
#endif  // CONFIG_AV1_HIGHBITDEPTH
965
966
// Common signature of the per-stripe filter routines (Wiener / self-guided,
// 8-bit / high bitdepth). A routine filters a stripe of
// stripe_width x stripe_height samples from `in` to `out`, working in column
// chunks of at most procunit_width. scratch is a caller-provided work buffer
// and err is used for error reporting.
typedef void (*stripe_filter_fun)(const RestorationUnitInfo *unit_info,
                                  int stripe_width, int stripe_height,
                                  int procunit_width, const uint8_t *in,
                                  int in_stride, uint8_t *out, int out_stride,
                                  int32_t *scratch, int bit_depth,
                                  struct aom_internal_error_info *err);
972
973
#if CONFIG_AV1_HIGHBITDEPTH
974
#define NUM_STRIPE_FILTERS 4
975
static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
976
  wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
977
  sgrproj_filter_stripe_highbd
978
};
979
#else
980
#define NUM_STRIPE_FILTERS 2
981
static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
982
  wiener_filter_stripe, sgrproj_filter_stripe
983
};
984
#endif  // CONFIG_AV1_HIGHBITDEPTH
985
986
// Filter one restoration unit
987
void av1_loop_restoration_filter_unit(
988
    const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
989
    const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
990
    int plane_w, int plane_h, int ss_x, int ss_y, int highbd, int bit_depth,
991
    uint8_t *data8, int stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf,
992
43.0k
    int optimized_lr, struct aom_internal_error_info *error_info) {
993
43.0k
  RestorationType unit_rtype = rui->restoration_type;
994
995
43.0k
  int unit_h = limits->v_end - limits->v_start;
996
43.0k
  int unit_w = limits->h_end - limits->h_start;
997
43.0k
  uint8_t *data8_tl =
998
43.0k
      data8 + limits->v_start * (ptrdiff_t)stride + limits->h_start;
999
43.0k
  uint8_t *dst8_tl =
1000
43.0k
      dst8 + limits->v_start * (ptrdiff_t)dst_stride + limits->h_start;
1001
1002
43.0k
  if (unit_rtype == RESTORE_NONE) {
1003
14.5k
    copy_rest_unit(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride,
1004
14.5k
                   highbd);
1005
14.5k
    return;
1006
14.5k
  }
1007
1008
28.4k
  const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
1009
28.4k
  assert(filter_idx < NUM_STRIPE_FILTERS);
1010
28.4k
  const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
1011
1012
28.4k
  const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
1013
1014
  // Filter the whole image one stripe at a time
1015
28.4k
  RestorationTileLimits remaining_stripes = *limits;
1016
28.4k
  int i = 0;
1017
84.2k
  while (i < unit_h) {
1018
55.7k
    int copy_above, copy_below;
1019
55.7k
    remaining_stripes.v_start = limits->v_start + i;
1020
1021
55.7k
    get_stripe_boundary_info(&remaining_stripes, plane_w, plane_h, ss_y,
1022
55.7k
                             &copy_above, &copy_below);
1023
1024
55.7k
    const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1025
55.7k
    const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
1026
1027
    // Work out where this stripe's boundaries are within
1028
    // rsb->stripe_boundary_{above,below}
1029
55.7k
    const int frame_stripe =
1030
55.7k
        (remaining_stripes.v_start + runit_offset) / full_stripe_height;
1031
55.7k
    const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
1032
1033
    // Calculate this stripe's height, based on two rules:
1034
    // * The topmost stripe in the frame is 8 luma pixels shorter than usual.
1035
    // * We can't extend past the end of the current restoration unit
1036
55.7k
    const int nominal_stripe_height =
1037
55.7k
        full_stripe_height - ((frame_stripe == 0) ? runit_offset : 0);
1038
55.7k
    const int h = AOMMIN(nominal_stripe_height,
1039
55.7k
                         remaining_stripes.v_end - remaining_stripes.v_start);
1040
1041
55.7k
    setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
1042
55.7k
                                     h, data8, stride, rlbs, copy_above,
1043
55.7k
                                     copy_below, optimized_lr);
1044
1045
55.7k
    stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
1046
55.7k
                  dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth,
1047
55.7k
                  error_info);
1048
1049
55.7k
    restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
1050
55.7k
                                       data8, stride, copy_above, copy_below,
1051
55.7k
                                       optimized_lr);
1052
1053
55.7k
    i += h;
1054
55.7k
  }
1055
28.4k
}
1056
1057
static void filter_frame_on_unit(const RestorationTileLimits *limits,
1058
                                 int rest_unit_idx, void *priv, int32_t *tmpbuf,
1059
                                 RestorationLineBuffers *rlbs,
1060
43.0k
                                 struct aom_internal_error_info *error_info) {
1061
43.0k
  FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
1062
43.0k
  const RestorationInfo *rsi = ctxt->rsi;
1063
1064
43.0k
  av1_loop_restoration_filter_unit(
1065
43.0k
      limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs,
1066
43.0k
      ctxt->plane_w, ctxt->plane_h, ctxt->ss_x, ctxt->ss_y, ctxt->highbd,
1067
43.0k
      ctxt->bit_depth, ctxt->data8, ctxt->data_stride, ctxt->dst8,
1068
43.0k
      ctxt->dst_stride, tmpbuf, rsi->optimized_lr, error_info);
1069
43.0k
}
1070
1071
void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
1072
                                            YV12_BUFFER_CONFIG *frame,
1073
                                            AV1_COMMON *cm, int optimized_lr,
1074
8.19k
                                            int num_planes) {
1075
8.19k
  const SequenceHeader *const seq_params = cm->seq_params;
1076
8.19k
  const int bit_depth = seq_params->bit_depth;
1077
8.19k
  const int highbd = seq_params->use_highbitdepth;
1078
8.19k
  lr_ctxt->dst = &cm->rst_frame;
1079
1080
8.19k
  const int frame_width = frame->crop_widths[0];
1081
8.19k
  const int frame_height = frame->crop_heights[0];
1082
8.19k
  if (aom_realloc_frame_buffer(
1083
8.19k
          lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
1084
8.19k
          seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
1085
8.19k
          cm->features.byte_alignment, NULL, NULL, NULL, false,
1086
8.19k
          0) != AOM_CODEC_OK)
1087
0
    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
1088
0
                       "Failed to allocate restoration dst buffer");
1089
1090
8.19k
  lr_ctxt->on_rest_unit = filter_frame_on_unit;
1091
8.19k
  lr_ctxt->frame = frame;
1092
31.2k
  for (int plane = 0; plane < num_planes; ++plane) {
1093
23.0k
    RestorationInfo *rsi = &cm->rst_info[plane];
1094
23.0k
    RestorationType rtype = rsi->frame_restoration_type;
1095
23.0k
    rsi->optimized_lr = optimized_lr;
1096
23.0k
    lr_ctxt->ctxt[plane].rsi = rsi;
1097
1098
23.0k
    if (rtype == RESTORE_NONE) {
1099
9.83k
      continue;
1100
9.83k
    }
1101
1102
13.2k
    const int is_uv = plane > 0;
1103
13.2k
    int plane_w, plane_h;
1104
13.2k
    av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
1105
13.2k
    assert(plane_w == frame->crop_widths[is_uv]);
1106
13.2k
    assert(plane_h == frame->crop_heights[is_uv]);
1107
1108
13.2k
    av1_extend_frame(frame->buffers[plane], plane_w, plane_h,
1109
13.2k
                     frame->strides[is_uv], RESTORATION_BORDER,
1110
13.2k
                     RESTORATION_BORDER, highbd);
1111
1112
13.2k
    FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
1113
13.2k
    lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
1114
13.2k
    lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
1115
13.2k
    lr_plane_ctxt->plane_w = plane_w;
1116
13.2k
    lr_plane_ctxt->plane_h = plane_h;
1117
13.2k
    lr_plane_ctxt->highbd = highbd;
1118
13.2k
    lr_plane_ctxt->bit_depth = bit_depth;
1119
13.2k
    lr_plane_ctxt->data8 = frame->buffers[plane];
1120
13.2k
    lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
1121
13.2k
    lr_plane_ctxt->data_stride = frame->strides[is_uv];
1122
13.2k
    lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
1123
13.2k
  }
1124
8.19k
}
1125
1126
static void loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
1127
8.19k
                                         AV1_COMMON *cm, int num_planes) {
1128
8.19k
  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
1129
8.19k
                           YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
1130
8.19k
                           int vstart, int vend);
1131
8.19k
  static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y,
1132
8.19k
                                         aom_yv12_partial_coloc_copy_u,
1133
8.19k
                                         aom_yv12_partial_coloc_copy_v };
1134
8.19k
  assert(num_planes <= 3);
1135
31.2k
  for (int plane = 0; plane < num_planes; ++plane) {
1136
23.0k
    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
1137
13.2k
    FilterFrameCtxt *lr_plane_ctxt = &loop_rest_ctxt->ctxt[plane];
1138
13.2k
    copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, 0,
1139
13.2k
                     lr_plane_ctxt->plane_w, 0, lr_plane_ctxt->plane_h);
1140
13.2k
  }
1141
8.19k
}
1142
1143
// Call on_rest_unit for each loop restoration unit in the plane.
1144
static void foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
1145
                                       rest_unit_visitor_t on_rest_unit,
1146
                                       void *priv, int32_t *tmpbuf,
1147
13.2k
                                       RestorationLineBuffers *rlbs) {
1148
13.2k
  const RestorationInfo *rsi = &cm->rst_info[plane];
1149
13.2k
  const int hnum_rest_units = rsi->horz_units;
1150
13.2k
  const int vnum_rest_units = rsi->vert_units;
1151
13.2k
  const int unit_size = rsi->restoration_unit_size;
1152
1153
13.2k
  const int is_uv = plane > 0;
1154
13.2k
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
1155
13.2k
  const int ext_size = unit_size * 3 / 2;
1156
13.2k
  int plane_w, plane_h;
1157
13.2k
  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
1158
1159
13.2k
  int y0 = 0, i = 0;
1160
52.5k
  while (y0 < plane_h) {
1161
39.3k
    int remaining_h = plane_h - y0;
1162
39.3k
    int h = (remaining_h < ext_size) ? remaining_h : unit_size;
1163
1164
39.3k
    RestorationTileLimits limits;
1165
39.3k
    limits.v_start = y0;
1166
39.3k
    limits.v_end = y0 + h;
1167
39.3k
    assert(limits.v_end <= plane_h);
1168
    // Offset upwards to align with the restoration processing stripe
1169
39.3k
    const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
1170
39.3k
    limits.v_start = AOMMAX(0, limits.v_start - voffset);
1171
39.3k
    if (limits.v_end < plane_h) limits.v_end -= voffset;
1172
1173
39.3k
    av1_foreach_rest_unit_in_row(&limits, plane_w, on_rest_unit, i, unit_size,
1174
39.3k
                                 hnum_rest_units, vnum_rest_units, plane, priv,
1175
39.3k
                                 tmpbuf, rlbs, av1_lr_sync_read_dummy,
1176
39.3k
                                 av1_lr_sync_write_dummy, NULL, cm->error);
1177
1178
39.3k
    y0 += h;
1179
39.3k
    ++i;
1180
39.3k
  }
1181
13.2k
}
1182
1183
// Visits the restoration units of every plane whose frame restoration type is
// not RESTORE_NONE, using the visitor stored in lr_ctxt and the per-plane
// context as the visitor's private data.
static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
                                        int num_planes) {
  FilterFrameCtxt *const plane_ctxts = lr_ctxt->ctxt;

  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
      foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
                                 &plane_ctxts[plane], cm->rst_tmpbuf,
                                 cm->rlbs);
    }
  }
}
1196
1197
void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
1198
                                       AV1_COMMON *cm, int optimized_lr,
1199
8.19k
                                       void *lr_ctxt) {
1200
8.19k
  assert(!cm->features.all_lossless);
1201
8.19k
  const int num_planes = av1_num_planes(cm);
1202
1203
8.19k
  AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
1204
1205
8.19k
  av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
1206
8.19k
                                         optimized_lr, num_planes);
1207
1208
8.19k
  foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes);
1209
1210
8.19k
  loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes);
1211
8.19k
}
1212
1213
void av1_foreach_rest_unit_in_row(
1214
    RestorationTileLimits *limits, int plane_w,
1215
    rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
1216
    int hnum_rest_units, int vnum_rest_units, int plane, void *priv,
1217
    int32_t *tmpbuf, RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read,
1218
    sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync,
1219
39.3k
    struct aom_internal_error_info *error_info) {
1220
39.3k
  const int ext_size = unit_size * 3 / 2;
1221
39.3k
  int x0 = 0, j = 0;
1222
82.3k
  while (x0 < plane_w) {
1223
43.0k
    int remaining_w = plane_w - x0;
1224
43.0k
    int w = (remaining_w < ext_size) ? remaining_w : unit_size;
1225
1226
43.0k
    limits->h_start = x0;
1227
43.0k
    limits->h_end = x0 + w;
1228
43.0k
    assert(limits->h_end <= plane_w);
1229
1230
43.0k
    const int unit_idx = row_number * hnum_rest_units + j;
1231
1232
    // No sync for even numbered rows
1233
    // For odd numbered rows, Loop Restoration of current block requires the LR
1234
    // of top-right and bottom-right blocks to be completed
1235
1236
    // top-right sync
1237
43.0k
    on_sync_read(lr_sync, row_number, j, plane);
1238
43.0k
    if ((row_number + 1) < vnum_rest_units)
1239
      // bottom-right sync
1240
28.7k
      on_sync_read(lr_sync, row_number + 2, j, plane);
1241
1242
43.0k
#if CONFIG_MULTITHREAD
1243
43.0k
    if (lr_sync && lr_sync->num_workers > 1) {
1244
0
      pthread_mutex_lock(lr_sync->job_mutex);
1245
0
      const bool lr_mt_exit = lr_sync->lr_mt_exit;
1246
0
      pthread_mutex_unlock(lr_sync->job_mutex);
1247
      // Exit in case any worker has encountered an error.
1248
0
      if (lr_mt_exit) return;
1249
0
    }
1250
43.0k
#endif
1251
1252
43.0k
    on_rest_unit(limits, unit_idx, priv, tmpbuf, rlbs, error_info);
1253
1254
43.0k
    on_sync_write(lr_sync, row_number, j, hnum_rest_units, plane);
1255
1256
43.0k
    x0 += w;
1257
43.0k
    ++j;
1258
43.0k
  }
1259
39.3k
}
1260
1261
71.7k
// No-op read-sync callback, used when loop restoration runs without row
// synchronisation. All arguments are ignored.
void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
  // Silence unused-parameter warnings; nothing is read or written.
  (void)plane;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1267
1268
// No-op write-sync callback, used when loop restoration runs without row
// synchronisation. All arguments are ignored.
void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
                             const int sb_cols, int plane) {
  // Silence unused-parameter warnings; nothing is read or written.
  (void)plane;
  (void)sb_cols;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1276
1277
// Computes the half-open range of restoration units
// [*rcol0, *rcol1) x [*rrow0, *rrow1) whose top-left corner lies in the
// superblock at (mi_row, mi_col) on the given plane.
//
// Returns 0 without touching the outputs when bsize is not the sequence's
// superblock size. Otherwise writes all four outputs and returns non-zero iff
// the range is non-empty. The upper bounds are clamped to the plane's
// horz_units / vert_units.
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
                                       int *rcol0, int *rcol1, int *rrow0,
                                       int *rrow1) {
  assert(rcol0 && rcol1 && rrow0 && rrow1);

  if (bsize != cm->seq_params->sb_size) return 0;

  assert(!cm->features.all_lossless);

  const int is_uv = plane > 0;
  const RestorationInfo *const rsi = &cm->rst_info[plane];
  const int size = rsi->restoration_unit_size;

  // Compute the mi-unit corners of the superblock
  const int mi_col_begin = mi_col;
  const int mi_row_begin = mi_row;
  const int mi_col_end = mi_col_begin + mi_size_wide[bsize];
  const int mi_row_end = mi_row_begin + mi_size_high[bsize];

  // The size of an MI-unit on this plane of the image
  const int ss_x = is_uv && cm->seq_params->subsampling_x;
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
  const int mi_size_x = MI_SIZE >> ss_x;
  const int mi_size_y = MI_SIZE >> ss_y;

  // Write m for the relative mi column or row, D for the superres denominator
  // and N for the superres numerator. If u is the upscaled pixel offset then
  // we can write the downscaled pixel offset in two ways as:
  //
  //   MI_SIZE * m = N / D u
  //
  // from which we get u = D * MI_SIZE * m / N
  int mi_to_num_x = mi_size_x;
  if (av1_superres_scaled(cm)) mi_to_num_x *= cm->superres_scale_denominator;
  const int mi_to_num_y = mi_size_y;

  int denom_x = size;
  if (av1_superres_scaled(cm)) denom_x *= SCALE_NUMERATOR;
  const int denom_y = size;

  // Adding (denominator - 1) before dividing rounds the quotient up.
  const int rnd_x = denom_x - 1;
  const int rnd_y = denom_y - 1;

  // rcol0/rrow0 should be the first column/row of restoration units that
  // doesn't start left/below of mi_col/mi_row. For this calculation, we need
  // to round up the division (if the sb starts at runit column 10.1, the first
  // matching runit has column index 11)
  *rcol0 = (mi_col_begin * mi_to_num_x + rnd_x) / denom_x;
  *rrow0 = (mi_row_begin * mi_to_num_y + rnd_y) / denom_y;

  // rel_col1/rel_row1 is the equivalent calculation, but for the superblock
  // below-right. If we're at the bottom or right of the frame, this restoration
  // unit might not exist, in which case we'll clamp accordingly.
  const int col_limit = (mi_col_end * mi_to_num_x + rnd_x) / denom_x;
  const int row_limit = (mi_row_end * mi_to_num_y + rnd_y) / denom_y;
  *rcol1 = AOMMIN(col_limit, rsi->horz_units);
  *rrow1 = AOMMIN(row_limit, rsi->vert_units);

  return *rcol0 < *rcol1 && *rrow0 < *rrow1;
}
1338
1339
// Extend to left and right
1340
// Pads `height` rows horizontally by replicating each row's first sample into
// the `extend` samples to its left and its last sample into the `extend`
// samples to its right. buf points at the first sample of the first row and
// advances by `stride` bytes per row. When use_highbitdepth is non-zero the
// rows are treated as uint16_t samples, otherwise as bytes.
static void extend_lines(uint8_t *buf, int width, int height, int stride,
                         int extend, int use_highbitdepth) {
  uint8_t *row = buf;

  if (use_highbitdepth) {
    for (int y = 0; y < height; ++y, row += stride) {
      uint16_t *const px = (uint16_t *)row;
      aom_memset16(px - extend, px[0], extend);
      aom_memset16(px + width, px[width - 1], extend);
    }
    return;
  }

  for (int y = 0; y < height; ++y, row += stride) {
    memset(row - extend, row[0], extend);
    memset(row + width, row[width - 1], extend);
  }
}
1354
1355
static void save_deblock_boundary_lines(
1356
    const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
1357
    int stripe, int use_highbd, int is_above,
1358
214k
    RestorationStripeBoundaries *boundaries) {
1359
214k
  const int is_uv = plane > 0;
1360
214k
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
1361
214k
  const int src_stride = frame->strides[is_uv] << use_highbd;
1362
214k
  const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride;
1363
1364
214k
  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
1365
214k
                               : boundaries->stripe_boundary_below;
1366
214k
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
1367
214k
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
1368
214k
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
1369
1370
  // There is a rare case in which a processing stripe can end 1px above the
1371
  // crop border. In this case, we do want to use deblocked pixels from below
1372
  // the stripe (hence why we ended up in this function), but instead of
1373
  // fetching 2 "below" rows we need to fetch one and duplicate it.
1374
  // This is equivalent to clamping the sample locations against the crop border
1375
214k
  const int lines_to_save =
1376
214k
      AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row);
1377
214k
  assert(lines_to_save == 1 || lines_to_save == 2);
1378
1379
214k
  int upscaled_width;
1380
214k
  int line_bytes;
1381
214k
  if (av1_superres_scaled(cm)) {
1382
202k
    const int ss_x = is_uv && cm->seq_params->subsampling_x;
1383
202k
    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
1384
202k
    line_bytes = upscaled_width << use_highbd;
1385
202k
    if (use_highbd)
1386
161k
      av1_upscale_normative_rows(
1387
161k
          cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv],
1388
161k
          CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride,
1389
161k
          plane, lines_to_save);
1390
40.9k
    else
1391
40.9k
      av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows,
1392
40.9k
                                 boundaries->stripe_boundary_stride, plane,
1393
40.9k
                                 lines_to_save);
1394
202k
  } else {
1395
12.3k
    upscaled_width = frame->crop_widths[is_uv];
1396
12.3k
    line_bytes = upscaled_width << use_highbd;
1397
36.9k
    for (int i = 0; i < lines_to_save; i++) {
1398
24.5k
      memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride,
1399
24.5k
             line_bytes);
1400
24.5k
    }
1401
12.3k
  }
1402
  // If we only saved one line, then copy it into the second line buffer
1403
214k
  if (lines_to_save == 1)
1404
39
    memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
1405
1406
214k
  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
1407
214k
               RESTORATION_EXTRA_HORZ, use_highbd);
1408
214k
}
1409
1410
static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1411
                                     const AV1_COMMON *cm, int plane, int row,
1412
                                     int stripe, int use_highbd, int is_above,
1413
43.2k
                                     RestorationStripeBoundaries *boundaries) {
1414
43.2k
  const int is_uv = plane > 0;
1415
43.2k
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
1416
43.2k
  const int src_stride = frame->strides[is_uv] << use_highbd;
1417
43.2k
  const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride;
1418
1419
43.2k
  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
1420
43.2k
                               : boundaries->stripe_boundary_below;
1421
43.2k
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
1422
43.2k
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
1423
43.2k
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
1424
43.2k
  const int src_width = frame->crop_widths[is_uv];
1425
1426
  // At the point where this function is called, we've already applied
1427
  // superres. So we don't need to extend the lines here, we can just
1428
  // pull directly from the topmost row of the upscaled frame.
1429
43.2k
  const int ss_x = is_uv && cm->seq_params->subsampling_x;
1430
43.2k
  const int upscaled_width = av1_superres_scaled(cm)
1431
43.2k
                                 ? (cm->superres_upscaled_width + ss_x) >> ss_x
1432
43.2k
                                 : src_width;
1433
43.2k
  const int line_bytes = upscaled_width << use_highbd;
1434
129k
  for (int i = 0; i < RESTORATION_CTX_VERT; i++) {
1435
    // Copy the line at 'src_rows' into both context lines
1436
86.4k
    memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
1437
86.4k
  }
1438
43.2k
  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
1439
43.2k
               RESTORATION_EXTRA_HORZ, use_highbd);
1440
43.2k
}
1441
1442
// Walk the restoration stripes of one plane and save their context rows.
//
// This is meant to run in two passes over the same plane:
//   after_cdef == 0: at every internal stripe boundary, save rows of 'frame'
//                    via save_deblock_boundary_lines().
//   after_cdef != 0: at the top and bottom of the frame, save rows of 'frame'
//                    via save_cdef_boundary_lines().
// Between them the two passes cover the above and below context of every
// stripe.
static void save_boundary_lines(const YV12_BUFFER_CONFIG *frame, int use_highbd,
                                int plane, AV1_COMMON *cm, int after_cdef) {
  const int is_uv = plane > 0;
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
  const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
  const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;

  int plane_w, plane_h;
  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);

  RestorationStripeBoundaries *const boundaries =
      &cm->rst_info[plane].boundaries;

  const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);

  for (int stripe_idx = 0;; ++stripe_idx) {
    // Stripes are shifted up by 'stripe_off' rows; the first one is clamped
    // to start at row 0.
    const int y0 = AOMMAX(0, stripe_idx * stripe_height - stripe_off);
    if (y0 >= plane_h) break;

    const int y1 =
        AOMMIN((stripe_idx + 1) * stripe_height - stripe_off, plane_h);

    // CDEF pixels provide the context at the top and bottom of the frame,
    // deblocked pixels provide it at internal stripe boundaries.
    const int at_frame_top = (stripe_idx == 0);
    const int at_frame_bottom = (y1 >= plane_height);

    if (after_cdef) {
      // Save CDEF context at frame boundaries.
      if (at_frame_top) {
        save_cdef_boundary_lines(frame, cm, plane, y0, stripe_idx, use_highbd,
                                 1, boundaries);
      }
      if (at_frame_bottom) {
        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, stripe_idx,
                                 use_highbd, 0, boundaries);
      }
    } else {
      // Save deblocked context at internal stripe boundaries.
      if (!at_frame_top) {
        save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
                                    stripe_idx, use_highbd, 1, boundaries);
      }
      if (!at_frame_bottom) {
        save_deblock_boundary_lines(frame, cm, plane, y1, stripe_idx,
                                    use_highbd, 0, boundaries);
      }
    }
  }
}
1493
1494
// For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
1495
// lines to be used as boundary in the loop restoration process. The
1496
// lines are saved in rst_internal.stripe_boundary_lines
1497
void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1498
14.5k
                                              AV1_COMMON *cm, int after_cdef) {
1499
14.5k
  const int num_planes = av1_num_planes(cm);
1500
14.5k
  const int use_highbd = cm->seq_params->use_highbitdepth;
1501
57.7k
  for (int p = 0; p < num_planes; ++p) {
1502
43.2k
    save_boundary_lines(frame, use_highbd, p, cm, after_cdef);
1503
43.2k
  }
1504
14.5k
}