Coverage Report

Created: 2026-02-14 07:09

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/av1/common/convolve.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <assert.h>
13
#include <string.h>
14
15
#include "config/aom_dsp_rtcd.h"
16
#include "config/av1_rtcd.h"
17
18
#include "av1/common/av1_common_int.h"
19
#include "av1/common/blockd.h"
20
#include "av1/common/convolve.h"
21
#include "av1/common/filter.h"
22
#include "av1/common/resize.h"
23
#include "aom_dsp/aom_dsp_common.h"
24
#include "aom_ports/mem.h"
25
26
void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
27
                             int dst_stride, int w, int h,
28
                             const int16_t *x_filters, int x0_qn,
29
15.9k
                             int x_step_qn) {
30
15.9k
  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
31
556k
  for (int y = 0; y < h; ++y) {
32
540k
    int x_qn = x0_qn;
33
138M
    for (int x = 0; x < w; ++x) {
34
138M
      const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
35
138M
      const int x_filter_idx =
36
138M
          (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
37
138M
      assert(x_filter_idx <= RS_SUBPEL_MASK);
38
138M
      const int16_t *const x_filter =
39
138M
          &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
40
138M
      int sum = 0;
41
1.24G
      for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
42
1.10G
        sum += src_x[k] * x_filter[k];
43
138M
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
44
138M
      x_qn += x_step_qn;
45
138M
    }
46
540k
    src += src_stride;
47
540k
    dst += dst_stride;
48
540k
  }
49
15.9k
}
50
51
#if CONFIG_AV1_HIGHBITDEPTH
52
void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
53
                                    uint16_t *dst, int dst_stride, int w, int h,
54
                                    const int16_t *x_filters, int x0_qn,
55
53.6k
                                    int x_step_qn, int bd) {
56
53.6k
  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
57
1.97M
  for (int y = 0; y < h; ++y) {
58
1.91M
    int x_qn = x0_qn;
59
96.4M
    for (int x = 0; x < w; ++x) {
60
94.5M
      const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
61
94.5M
      const int x_filter_idx =
62
94.5M
          (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
63
94.5M
      assert(x_filter_idx <= RS_SUBPEL_MASK);
64
94.5M
      const int16_t *const x_filter =
65
94.5M
          &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
66
94.5M
      int sum = 0;
67
850M
      for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
68
756M
        sum += src_x[k] * x_filter[k];
69
94.5M
      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
70
94.5M
      x_qn += x_step_qn;
71
94.5M
    }
72
1.91M
    src += src_stride;
73
1.91M
    dst += dst_stride;
74
1.91M
  }
75
53.6k
}
76
#endif  // CONFIG_AV1_HIGHBITDEPTH
77
78
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
79
                          int dst_stride, int w, int h,
80
                          const InterpFilterParams *filter_params_x,
81
                          const InterpFilterParams *filter_params_y,
82
                          const int subpel_x_qn, const int subpel_y_qn,
83
5.78k
                          ConvolveParams *conv_params) {
84
5.78k
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
85
5.78k
  int im_h = h + filter_params_y->taps - 1;
86
5.78k
  int im_stride = w;
87
5.78k
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
88
5.78k
  const int fo_vert = filter_params_y->taps / 2 - 1;
89
5.78k
  const int fo_horiz = filter_params_x->taps / 2 - 1;
90
5.78k
  const int bd = 8;
91
5.78k
  const int bits =
92
5.78k
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
93
94
  // horizontal filter
95
5.78k
  const uint8_t *src_horiz = src - fo_vert * src_stride;
96
5.78k
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
97
5.78k
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
98
80.6k
  for (int y = 0; y < im_h; ++y) {
99
485k
    for (int x = 0; x < w; ++x) {
100
411k
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
101
3.69M
      for (int k = 0; k < filter_params_x->taps; ++k) {
102
3.28M
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
103
3.28M
      }
104
105
      // TODO(aomedia:3393): for 12-tap filter, in extreme cases, the result can
106
      // be beyond the following range. For better prediction, a clamping can be
107
      // added for 12 tap filter to ensure the horizontal filtering result is
108
      // within 16 bit. The same applies to the vertical filtering.
109
411k
      assert(filter_params_x->taps > 8 ||
110
411k
             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
111
411k
      im_block[y * im_stride + x] =
112
411k
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
113
411k
    }
114
74.9k
  }
115
116
  // vertical filter
117
5.78k
  int16_t *src_vert = im_block + fo_vert * im_stride;
118
5.78k
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
119
5.78k
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
120
5.78k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
121
40.1k
  for (int y = 0; y < h; ++y) {
122
232k
    for (int x = 0; x < w; ++x) {
123
198k
      int32_t sum = 1 << offset_bits;
124
1.78M
      for (int k = 0; k < filter_params_y->taps; ++k) {
125
1.58M
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
126
1.58M
      }
127
198k
      assert(filter_params_y->taps > 8 ||
128
198k
             (0 <= sum && sum < (1 << (offset_bits + 2))));
129
198k
      int16_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
130
198k
                    ((1 << (offset_bits - conv_params->round_1)) +
131
198k
                     (1 << (offset_bits - conv_params->round_1 - 1)));
132
198k
      dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
133
198k
    }
134
34.3k
  }
135
5.78k
}
136
137
void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
138
                         int dst_stride, int w, int h,
139
                         const InterpFilterParams *filter_params_y,
140
3.92k
                         const int subpel_y_qn) {
141
3.92k
  const int fo_vert = filter_params_y->taps / 2 - 1;
142
143
  // vertical filter
144
3.92k
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
145
3.92k
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
146
26.9k
  for (int y = 0; y < h; ++y) {
147
168k
    for (int x = 0; x < w; ++x) {
148
145k
      int32_t res = 0;
149
1.30M
      for (int k = 0; k < filter_params_y->taps; ++k) {
150
1.16M
        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
151
1.16M
      }
152
145k
      dst[y * dst_stride + x] =
153
145k
          clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS));
154
145k
    }
155
23.0k
  }
156
3.92k
}
157
158
void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
159
                         int dst_stride, int w, int h,
160
                         const InterpFilterParams *filter_params_x,
161
5.66k
                         const int subpel_x_qn, ConvolveParams *conv_params) {
162
5.66k
  const int fo_horiz = filter_params_x->taps / 2 - 1;
163
5.66k
  const int bits = FILTER_BITS - conv_params->round_0;
164
165
5.66k
  assert(bits >= 0);
166
5.66k
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
167
5.66k
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
168
169
  // horizontal filter
170
5.66k
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
171
5.66k
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
172
173
37.6k
  for (int y = 0; y < h; ++y) {
174
228k
    for (int x = 0; x < w; ++x) {
175
196k
      int32_t res = 0;
176
1.77M
      for (int k = 0; k < filter_params_x->taps; ++k) {
177
1.57M
        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
178
1.57M
      }
179
196k
      res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
180
196k
      dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
181
196k
    }
182
32.0k
  }
183
5.66k
}
184
185
// This function is exactly the same as av1_convolve_2d_sr_c, and is an
186
// optimized version for intrabc. Use the following 2-tap filter:
187
// DECLARE_ALIGNED(256, static const int16_t,
188
//                 av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = {
189
//   128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
190
//   64,  64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
191
// };
192
void av1_convolve_2d_sr_intrabc_c(const uint8_t *src, int src_stride,
193
                                  uint8_t *dst, int dst_stride, int w, int h,
194
                                  const InterpFilterParams *filter_params_x,
195
                                  const InterpFilterParams *filter_params_y,
196
                                  const int subpel_x_qn, const int subpel_y_qn,
197
1.78k
                                  ConvolveParams *conv_params) {
198
1.78k
  assert(subpel_x_qn == 8);
199
1.78k
  assert(subpel_y_qn == 8);
200
1.78k
  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
201
1.78k
  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
202
1.78k
  (void)filter_params_x;
203
1.78k
  (void)subpel_x_qn;
204
1.78k
  (void)filter_params_y;
205
1.78k
  (void)subpel_y_qn;
206
1.78k
  (void)conv_params;
207
208
1.78k
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
209
1.78k
  int im_h = h + 1;
210
1.78k
  int im_stride = w;
211
1.78k
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
212
1.78k
  const int bd = 8;
213
214
  // horizontal filter
215
  // explicitly operate for subpel_x_qn = 8.
216
1.78k
  int16_t *im = im_block;
217
15.6k
  for (int y = 0; y < im_h; ++y) {
218
142k
    for (int x = 0; x < w; ++x) {
219
128k
      const int32_t sum = (1 << bd) + src[x] + src[x + 1];
220
128k
      assert(0 <= sum && sum < (1 << (bd + 2)));
221
128k
      im[x] = sum;
222
128k
    }
223
13.9k
    src += src_stride;
224
13.9k
    im += im_stride;
225
13.9k
  }
226
227
  // vertical filter
228
  // explicitly operate for subpel_y_qn = 8.
229
1.78k
  int16_t *src_vert = im_block;
230
13.9k
  for (int y = 0; y < h; ++y) {
231
127k
    for (int x = 0; x < w; ++x) {
232
114k
      const int32_t sum =
233
114k
          (1 << (bd + 2)) + src_vert[x] + src_vert[im_stride + x];
234
114k
      assert(0 <= sum && sum < (1 << (bd + 4)));
235
114k
      const int16_t res =
236
114k
          ROUND_POWER_OF_TWO(sum, 2) - ((1 << bd) + (1 << (bd - 1)));
237
114k
      dst[x] = clip_pixel(res);
238
114k
    }
239
12.1k
    src_vert += im_stride;
240
12.1k
    dst += dst_stride;
241
12.1k
  }
242
1.78k
}
243
244
// This function is exactly the same as av1_convolve_y_sr_c, and is an
245
// optimized version for intrabc.
246
void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride,
247
                                 uint8_t *dst, int dst_stride, int w, int h,
248
                                 const InterpFilterParams *filter_params_y,
249
1.97k
                                 const int subpel_y_qn) {
250
1.97k
  assert(subpel_y_qn == 8);
251
1.97k
  assert(filter_params_y->taps == 2);
252
1.97k
  (void)filter_params_y;
253
1.97k
  (void)subpel_y_qn;
254
255
  // vertical filter
256
  // explicitly operate for subpel_y_qn = 8.
257
14.7k
  for (int y = 0; y < h; ++y) {
258
161k
    for (int x = 0; x < w; ++x) {
259
148k
      const int32_t res = src[x] + src[src_stride + x];
260
148k
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1));
261
148k
    }
262
12.7k
    src += src_stride;
263
12.7k
    dst += dst_stride;
264
12.7k
  }
265
1.97k
}
266
267
// This function is exactly the same as av1_convolve_x_sr_c, and is an
268
// optimized version for intrabc.
269
void av1_convolve_x_sr_intrabc_c(const uint8_t *src, int src_stride,
270
                                 uint8_t *dst, int dst_stride, int w, int h,
271
                                 const InterpFilterParams *filter_params_x,
272
                                 const int subpel_x_qn,
273
2.25k
                                 ConvolveParams *conv_params) {
274
2.25k
  assert(subpel_x_qn == 8);
275
2.25k
  assert(filter_params_x->taps == 2);
276
2.25k
  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
277
2.25k
  (void)filter_params_x;
278
2.25k
  (void)subpel_x_qn;
279
2.25k
  (void)conv_params;
280
281
  // horizontal filter
282
  // explicitly operate for subpel_x_qn = 8.
283
18.5k
  for (int y = 0; y < h; ++y) {
284
211k
    for (int x = 0; x < w; ++x) {
285
195k
      const int32_t res = src[x] + src[x + 1];
286
195k
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1));
287
195k
    }
288
16.3k
    src += src_stride;
289
16.3k
    dst += dst_stride;
290
16.3k
  }
291
2.25k
}
292
293
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
294
                                uint8_t *dst, int dst_stride, int w, int h,
295
                                const InterpFilterParams *filter_params_x,
296
                                const InterpFilterParams *filter_params_y,
297
                                const int subpel_x_qn, const int subpel_y_qn,
298
1.98k
                                ConvolveParams *conv_params) {
299
1.98k
  CONV_BUF_TYPE *dst16 = conv_params->dst;
300
1.98k
  int dst16_stride = conv_params->dst_stride;
301
1.98k
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
302
1.98k
  int im_h = h + filter_params_y->taps - 1;
303
1.98k
  int im_stride = w;
304
1.98k
  const int fo_vert = filter_params_y->taps / 2 - 1;
305
1.98k
  const int fo_horiz = filter_params_x->taps / 2 - 1;
306
1.98k
  const int bd = 8;
307
1.98k
  const int round_bits =
308
1.98k
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
309
310
  // horizontal filter
311
1.98k
  const uint8_t *src_horiz = src - fo_vert * src_stride;
312
1.98k
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
313
1.98k
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
314
32.1k
  for (int y = 0; y < im_h; ++y) {
315
263k
    for (int x = 0; x < w; ++x) {
316
233k
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
317
2.09M
      for (int k = 0; k < filter_params_x->taps; ++k) {
318
1.86M
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
319
1.86M
      }
320
233k
      assert(filter_params_x->taps > 8 ||
321
233k
             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
322
233k
      im_block[y * im_stride + x] =
323
233k
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
324
233k
    }
325
30.1k
  }
326
327
  // vertical filter
328
1.98k
  int16_t *src_vert = im_block + fo_vert * im_stride;
329
1.98k
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
330
1.98k
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
331
1.98k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
332
18.2k
  for (int y = 0; y < h; ++y) {
333
142k
    for (int x = 0; x < w; ++x) {
334
126k
      int32_t sum = 1 << offset_bits;
335
1.14M
      for (int k = 0; k < filter_params_y->taps; ++k) {
336
1.01M
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
337
1.01M
      }
338
126k
      assert(filter_params_y->taps > 8 ||
339
126k
             (0 <= sum && sum < (1 << (offset_bits + 2))));
340
126k
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
341
126k
      if (conv_params->do_average) {
342
84.8k
        int32_t tmp = dst16[y * dst16_stride + x];
343
84.8k
        if (conv_params->use_dist_wtd_comp_avg) {
344
39.4k
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
345
39.4k
          tmp = tmp >> DIST_PRECISION_BITS;
346
45.4k
        } else {
347
45.4k
          tmp += res;
348
45.4k
          tmp = tmp >> 1;
349
45.4k
        }
350
84.8k
        tmp -= (1 << (offset_bits - conv_params->round_1)) +
351
84.8k
               (1 << (offset_bits - conv_params->round_1 - 1));
352
84.8k
        dst[y * dst_stride + x] =
353
84.8k
            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
354
84.8k
      } else {
355
41.8k
        dst16[y * dst16_stride + x] = res;
356
41.8k
      }
357
126k
    }
358
16.2k
  }
359
1.98k
}
360
361
void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst,
362
                               int dst_stride, int w, int h,
363
                               const InterpFilterParams *filter_params_y,
364
                               const int subpel_y_qn,
365
2.18k
                               ConvolveParams *conv_params) {
366
2.18k
  CONV_BUF_TYPE *dst16 = conv_params->dst;
367
2.18k
  int dst16_stride = conv_params->dst_stride;
368
2.18k
  const int fo_vert = filter_params_y->taps / 2 - 1;
369
2.18k
  const int bits = FILTER_BITS - conv_params->round_0;
370
2.18k
  const int bd = 8;
371
2.18k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
372
2.18k
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
373
2.18k
                           (1 << (offset_bits - conv_params->round_1 - 1));
374
2.18k
  const int round_bits =
375
2.18k
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
376
377
  // vertical filter
378
2.18k
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
379
2.18k
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
380
21.8k
  for (int y = 0; y < h; ++y) {
381
174k
    for (int x = 0; x < w; ++x) {
382
154k
      int32_t res = 0;
383
1.39M
      for (int k = 0; k < filter_params_y->taps; ++k) {
384
1.23M
        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
385
1.23M
      }
386
154k
      res *= (1 << bits);
387
154k
      res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
388
389
154k
      if (conv_params->do_average) {
390
61.6k
        int32_t tmp = dst16[y * dst16_stride + x];
391
61.6k
        if (conv_params->use_dist_wtd_comp_avg) {
392
3.90k
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
393
3.90k
          tmp = tmp >> DIST_PRECISION_BITS;
394
57.7k
        } else {
395
57.7k
          tmp += res;
396
57.7k
          tmp = tmp >> 1;
397
57.7k
        }
398
61.6k
        tmp -= round_offset;
399
61.6k
        dst[y * dst_stride + x] =
400
61.6k
            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
401
93.1k
      } else {
402
93.1k
        dst16[y * dst16_stride + x] = res;
403
93.1k
      }
404
154k
    }
405
19.6k
  }
406
2.18k
}
407
408
void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst,
409
                               int dst_stride, int w, int h,
410
                               const InterpFilterParams *filter_params_x,
411
                               const int subpel_x_qn,
412
1.00k
                               ConvolveParams *conv_params) {
413
1.00k
  CONV_BUF_TYPE *dst16 = conv_params->dst;
414
1.00k
  int dst16_stride = conv_params->dst_stride;
415
1.00k
  const int fo_horiz = filter_params_x->taps / 2 - 1;
416
1.00k
  const int bits = FILTER_BITS - conv_params->round_1;
417
1.00k
  const int bd = 8;
418
1.00k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
419
1.00k
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
420
1.00k
                           (1 << (offset_bits - conv_params->round_1 - 1));
421
1.00k
  const int round_bits =
422
1.00k
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
423
424
  // horizontal filter
425
1.00k
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
426
1.00k
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
427
9.72k
  for (int y = 0; y < h; ++y) {
428
96.6k
    for (int x = 0; x < w; ++x) {
429
87.9k
      int32_t res = 0;
430
791k
      for (int k = 0; k < filter_params_x->taps; ++k) {
431
703k
        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
432
703k
      }
433
87.9k
      res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
434
87.9k
      res += round_offset;
435
436
87.9k
      if (conv_params->do_average) {
437
16.8k
        int32_t tmp = dst16[y * dst16_stride + x];
438
16.8k
        if (conv_params->use_dist_wtd_comp_avg) {
439
7.48k
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
440
7.48k
          tmp = tmp >> DIST_PRECISION_BITS;
441
9.40k
        } else {
442
9.40k
          tmp += res;
443
9.40k
          tmp = tmp >> 1;
444
9.40k
        }
445
16.8k
        tmp -= round_offset;
446
16.8k
        dst[y * dst_stride + x] =
447
16.8k
            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
448
71.0k
      } else {
449
71.0k
        dst16[y * dst16_stride + x] = res;
450
71.0k
      }
451
87.9k
    }
452
8.72k
  }
453
1.00k
}
454
455
void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride,
456
                                     uint8_t *dst, int dst_stride, int w, int h,
457
4.16k
                                     ConvolveParams *conv_params) {
458
4.16k
  CONV_BUF_TYPE *dst16 = conv_params->dst;
459
4.16k
  int dst16_stride = conv_params->dst_stride;
460
4.16k
  const int bits =
461
4.16k
      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
462
4.16k
  const int bd = 8;
463
4.16k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
464
4.16k
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
465
4.16k
                           (1 << (offset_bits - conv_params->round_1 - 1));
466
467
42.8k
  for (int y = 0; y < h; ++y) {
468
440k
    for (int x = 0; x < w; ++x) {
469
401k
      CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
470
401k
      res += round_offset;
471
472
401k
      if (conv_params->do_average) {
473
81.5k
        int32_t tmp = dst16[y * dst16_stride + x];
474
81.5k
        if (conv_params->use_dist_wtd_comp_avg) {
475
8.89k
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
476
8.89k
          tmp = tmp >> DIST_PRECISION_BITS;
477
72.6k
        } else {
478
72.6k
          tmp += res;
479
72.6k
          tmp = tmp >> 1;
480
72.6k
        }
481
81.5k
        tmp -= round_offset;
482
81.5k
        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
483
320k
      } else {
484
320k
        dst16[y * dst16_stride + x] = res;
485
320k
      }
486
401k
    }
487
38.6k
  }
488
4.16k
}
489
490
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst,
491
                             int dst_stride, int w, int h,
492
                             const InterpFilterParams *filter_params_x,
493
                             const InterpFilterParams *filter_params_y,
494
                             const int subpel_x_qn, const int x_step_qn,
495
                             const int subpel_y_qn, const int y_step_qn,
496
8.61k
                             ConvolveParams *conv_params) {
497
8.61k
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
498
8.61k
  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
499
8.61k
             filter_params_y->taps;
500
8.61k
  CONV_BUF_TYPE *dst16 = conv_params->dst;
501
8.61k
  const int dst16_stride = conv_params->dst_stride;
502
8.61k
  const int bits =
503
8.61k
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
504
8.61k
  assert(bits >= 0);
505
8.61k
  int im_stride = w;
506
8.61k
  const int fo_vert = filter_params_y->taps / 2 - 1;
507
8.61k
  const int fo_horiz = filter_params_x->taps / 2 - 1;
508
8.61k
  const int bd = 8;
509
510
  // horizontal filter
511
8.61k
  const uint8_t *src_horiz = src - fo_vert * src_stride;
512
122k
  for (int y = 0; y < im_h; ++y) {
513
113k
    int x_qn = subpel_x_qn;
514
1.03M
    for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
515
921k
      const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
516
921k
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
517
921k
      assert(x_filter_idx < SUBPEL_SHIFTS);
518
921k
      const int16_t *x_filter =
519
921k
          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
520
921k
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
521
8.29M
      for (int k = 0; k < filter_params_x->taps; ++k) {
522
7.37M
        sum += x_filter[k] * src_x[k - fo_horiz];
523
7.37M
      }
524
921k
      assert(filter_params_x->taps > 8 ||
525
921k
             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
526
921k
      im_block[y * im_stride + x] =
527
921k
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
528
921k
    }
529
113k
    src_horiz += src_stride;
530
113k
  }
531
532
  // vertical filter
533
8.61k
  int16_t *src_vert = im_block + fo_vert * im_stride;
534
8.61k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
535
57.7k
  for (int x = 0; x < w; ++x) {
536
49.1k
    int y_qn = subpel_y_qn;
537
635k
    for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
538
586k
      const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
539
586k
      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
540
586k
      assert(y_filter_idx < SUBPEL_SHIFTS);
541
586k
      const int16_t *y_filter =
542
586k
          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
543
586k
      int32_t sum = 1 << offset_bits;
544
5.27M
      for (int k = 0; k < filter_params_y->taps; ++k) {
545
4.69M
        sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
546
4.69M
      }
547
586k
      assert(filter_params_y->taps > 8 ||
548
586k
             (0 <= sum && sum < (1 << (offset_bits + 2))));
549
586k
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
550
586k
      if (conv_params->is_compound) {
551
169k
        if (conv_params->do_average) {
552
138k
          int32_t tmp = dst16[y * dst16_stride + x];
553
138k
          if (conv_params->use_dist_wtd_comp_avg) {
554
98.5k
            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
555
98.5k
            tmp = tmp >> DIST_PRECISION_BITS;
556
98.5k
          } else {
557
40.0k
            tmp += res;
558
40.0k
            tmp = tmp >> 1;
559
40.0k
          }
560
          /* Subtract round offset and convolve round */
561
138k
          tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
562
138k
                       (1 << (offset_bits - conv_params->round_1 - 1)));
563
138k
          dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
564
138k
        } else {
565
30.6k
          dst16[y * dst16_stride + x] = res;
566
30.6k
        }
567
417k
      } else {
568
        /* Subtract round offset and convolve round */
569
417k
        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
570
417k
                             (1 << (offset_bits - conv_params->round_1 - 1)));
571
417k
        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
572
417k
      }
573
586k
    }
574
49.1k
    src_vert++;
575
49.1k
  }
576
8.61k
}
577
578
static void convolve_2d_scale_wrapper(
579
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
580
    int h, const InterpFilterParams *filter_params_x,
581
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
582
    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
583
8.61k
    ConvolveParams *conv_params) {
584
8.61k
  if (conv_params->is_compound) {
585
1.54k
    assert(conv_params->dst != NULL);
586
1.54k
  }
587
8.61k
  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
588
8.61k
                        filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
589
8.61k
                        y_step_qn, conv_params);
590
8.61k
}
591
592
static void convolve_2d_facade_compound(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  // Picks the compound convolve routine according to which sub-pixel offsets
  // are non-zero: both -> 2-D, only x -> horizontal, only y -> vertical,
  // neither -> copy.
  const bool filter_x = subpel_x_qn != 0;
  const bool filter_y = subpel_y_qn != 0;

  if (filter_x && filter_y) {
    av1_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
                             filter_params_x, filter_params_y, subpel_x_qn,
                             subpel_y_qn, conv_params);
    return;
  }
  if (filter_x) {
    av1_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
                            filter_params_x, subpel_x_qn, conv_params);
    return;
  }
  if (filter_y) {
    av1_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
                            filter_params_y, subpel_y_qn, conv_params);
    return;
  }
  av1_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
                                conv_params);
}
615
616
static void convolve_2d_facade_single(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  // Picks the single-reference convolve routine according to which sub-pixel
  // offsets are non-zero: both -> 2-D, only x -> horizontal, only y ->
  // vertical, neither -> plain copy.
  const bool filter_x = subpel_x_qn != 0;
  const bool filter_y = subpel_y_qn != 0;

  if (filter_x && filter_y) {
    av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                       filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
    return;
  }
  if (filter_x) {
    av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                      subpel_x_qn, conv_params);
    return;
  }
  if (filter_y) {
    av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y,
                      subpel_y_qn);
    return;
  }
  aom_convolve_copy(src, src_stride, dst, dst_stride, w, h);
}
637
638
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
639
                            int dst_stride, int w, int h,
640
                            const InterpFilterParams *interp_filters[2],
641
                            const int subpel_x_qn, int x_step_q4,
642
                            const int subpel_y_qn, int y_step_q4, int scaled,
643
179k
                            ConvolveParams *conv_params) {
644
179k
  (void)x_step_q4;
645
179k
  (void)y_step_q4;
646
179k
  (void)dst;
647
179k
  (void)dst_stride;
648
649
179k
  const InterpFilterParams *filter_params_x = interp_filters[0];
650
179k
  const InterpFilterParams *filter_params_y = interp_filters[1];
651
652
  // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case.
653
  // 2-tap filter indicates that it is for IntraBC.
654
179k
  if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
655
131k
    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
656
131k
    assert(!scaled);
657
131k
    if (subpel_x_qn && subpel_y_qn) {
658
1.78k
      av1_convolve_2d_sr_intrabc(src, src_stride, dst, dst_stride, w, h,
659
1.78k
                                 filter_params_x, filter_params_y, subpel_x_qn,
660
1.78k
                                 subpel_y_qn, conv_params);
661
1.78k
      return;
662
130k
    } else if (subpel_x_qn) {
663
2.25k
      av1_convolve_x_sr_intrabc(src, src_stride, dst, dst_stride, w, h,
664
2.25k
                                filter_params_x, subpel_x_qn, conv_params);
665
2.25k
      return;
666
127k
    } else if (subpel_y_qn) {
667
1.97k
      av1_convolve_y_sr_intrabc(src, src_stride, dst, dst_stride, w, h,
668
1.97k
                                filter_params_y, subpel_y_qn);
669
1.97k
      return;
670
1.97k
    }
671
131k
  }
672
673
172k
  if (scaled) {
674
8.61k
    convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
675
8.61k
                              filter_params_x, filter_params_y, subpel_x_qn,
676
8.61k
                              x_step_q4, subpel_y_qn, y_step_q4, conv_params);
677
164k
  } else if (conv_params->is_compound) {
678
9.34k
    convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h,
679
9.34k
                                filter_params_x, filter_params_y, subpel_x_qn,
680
9.34k
                                subpel_y_qn, conv_params);
681
155k
  } else {
682
155k
    convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
683
155k
                              filter_params_x, filter_params_y, subpel_x_qn,
684
155k
                              subpel_y_qn, conv_params);
685
155k
  }
686
172k
}
687
688
#if CONFIG_AV1_HIGHBITDEPTH
689
void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
690
                                uint16_t *dst, int dst_stride, int w, int h,
691
                                const InterpFilterParams *filter_params_x,
692
                                const int subpel_x_qn,
693
5.66k
                                ConvolveParams *conv_params, int bd) {
694
5.66k
  const int fo_horiz = filter_params_x->taps / 2 - 1;
695
5.66k
  const int bits = FILTER_BITS - conv_params->round_0;
696
697
5.66k
  assert(bits >= 0);
698
5.66k
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
699
5.66k
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
700
701
  // horizontal filter
702
5.66k
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
703
5.66k
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
704
40.7k
  for (int y = 0; y < h; ++y) {
705
256k
    for (int x = 0; x < w; ++x) {
706
221k
      int32_t res = 0;
707
1.99M
      for (int k = 0; k < filter_params_x->taps; ++k) {
708
1.77M
        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
709
1.77M
      }
710
221k
      res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
711
221k
      dst[y * dst_stride + x] =
712
221k
          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
713
221k
    }
714
35.1k
  }
715
5.66k
}
716
717
void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
718
                                uint16_t *dst, int dst_stride, int w, int h,
719
                                const InterpFilterParams *filter_params_y,
720
4.00k
                                const int subpel_y_qn, int bd) {
721
4.00k
  const int fo_vert = filter_params_y->taps / 2 - 1;
722
  // vertical filter
723
4.00k
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
724
4.00k
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
725
31.6k
  for (int y = 0; y < h; ++y) {
726
220k
    for (int x = 0; x < w; ++x) {
727
192k
      int32_t res = 0;
728
1.73M
      for (int k = 0; k < filter_params_y->taps; ++k) {
729
1.54M
        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
730
1.54M
      }
731
192k
      dst[y * dst_stride + x] =
732
192k
          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd);
733
192k
    }
734
27.6k
  }
735
4.00k
}
736
737
void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
738
                                 uint16_t *dst, int dst_stride, int w, int h,
739
                                 const InterpFilterParams *filter_params_x,
740
                                 const InterpFilterParams *filter_params_y,
741
                                 const int subpel_x_qn, const int subpel_y_qn,
742
6.26k
                                 ConvolveParams *conv_params, int bd) {
743
6.26k
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
744
6.26k
  int im_h = h + filter_params_y->taps - 1;
745
6.26k
  int im_stride = w;
746
6.26k
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
747
6.26k
  const int fo_vert = filter_params_y->taps / 2 - 1;
748
6.26k
  const int fo_horiz = filter_params_x->taps / 2 - 1;
749
6.26k
  const int bits =
750
6.26k
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
751
6.26k
  assert(bits >= 0);
752
753
  // horizontal filter
754
6.26k
  const uint16_t *src_horiz = src - fo_vert * src_stride;
755
6.26k
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
756
6.26k
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
757
88.5k
  for (int y = 0; y < im_h; ++y) {
758
545k
    for (int x = 0; x < w; ++x) {
759
463k
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
760
4.17M
      for (int k = 0; k < filter_params_x->taps; ++k) {
761
3.70M
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
762
3.70M
      }
763
463k
      assert(filter_params_x->taps > 8 ||
764
463k
             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
765
463k
      im_block[y * im_stride + x] =
766
463k
          ROUND_POWER_OF_TWO(sum, conv_params->round_0);
767
463k
    }
768
82.3k
  }
769
770
  // vertical filter
771
6.26k
  int16_t *src_vert = im_block + fo_vert * im_stride;
772
6.26k
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
773
6.26k
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
774
6.26k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
775
44.7k
  for (int y = 0; y < h; ++y) {
776
265k
    for (int x = 0; x < w; ++x) {
777
226k
      int32_t sum = 1 << offset_bits;
778
2.04M
      for (int k = 0; k < filter_params_y->taps; ++k) {
779
1.81M
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
780
1.81M
      }
781
226k
      assert(filter_params_y->taps > 8 ||
782
226k
             (0 <= sum && sum < (1 << (offset_bits + 2))));
783
226k
      int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
784
226k
                    ((1 << (offset_bits - conv_params->round_1)) +
785
226k
                     (1 << (offset_bits - conv_params->round_1 - 1)));
786
226k
      dst[y * dst_stride + x] =
787
226k
          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
788
226k
    }
789
38.4k
  }
790
6.26k
}
791
792
// This function is exactly the same as av1_highbd_convolve_2d_sr_c, and is an
793
// optimized version for intrabc. Use the following 2-tap filter:
794
// DECLARE_ALIGNED(256, static const int16_t,
795
//                 av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = {
796
//   128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
797
//   64,  64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
798
// };
799
void av1_highbd_convolve_2d_sr_intrabc_c(
800
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
801
    int h, const InterpFilterParams *filter_params_x,
802
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
803
11.8k
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
804
11.8k
  const int bits =
805
11.8k
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
806
11.8k
  assert(bits >= 0);
807
11.8k
  assert(subpel_x_qn == 8);
808
11.8k
  assert(subpel_y_qn == 8);
809
11.8k
  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
810
11.8k
  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
811
11.8k
  (void)filter_params_x;
812
11.8k
  (void)subpel_x_qn;
813
11.8k
  (void)filter_params_y;
814
11.8k
  (void)subpel_y_qn;
815
11.8k
  (void)conv_params;
816
817
11.8k
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
818
11.8k
  int im_h = h + 1;
819
11.8k
  int im_stride = w;
820
11.8k
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
821
822
  // horizontal filter
823
  // explicitly operate for subpel_x_qn = 8.
824
11.8k
  int16_t *im = im_block;
825
99.5k
  for (int y = 0; y < im_h; ++y) {
826
882k
    for (int x = 0; x < w; ++x) {
827
795k
      int32_t sum = (1 << (bd + FILTER_BITS - 1)) + 64 * (src[x] + src[x + 1]);
828
795k
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
829
795k
      sum = ROUND_POWER_OF_TWO(sum, conv_params->round_0);
830
795k
      im[x] = sum;
831
795k
    }
832
87.7k
    src += src_stride;
833
87.7k
    im += im_stride;
834
87.7k
  }
835
836
  // vertical filter
837
  // explicitly operate for subpel_y_qn = 8.
838
11.8k
  int16_t *src_vert = im_block;
839
11.8k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
840
87.7k
  for (int y = 0; y < h; ++y) {
841
781k
    for (int x = 0; x < w; ++x) {
842
705k
      const int32_t sum =
843
705k
          (1 << offset_bits) + 64 * (src_vert[x] + src_vert[im_stride + x]);
844
705k
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
845
705k
      const int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
846
705k
                          ((1 << (offset_bits - conv_params->round_1)) +
847
705k
                           (1 << (offset_bits - conv_params->round_1 - 1)));
848
849
705k
      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
850
705k
    }
851
75.8k
    src_vert += im_stride;
852
75.8k
    dst += dst_stride;
853
75.8k
  }
854
11.8k
}
855
856
// This function is exactly the same as av1_highbd_convolve_y_sr_c, and is an
857
// optimized version for intrabc.
858
void av1_highbd_convolve_y_sr_intrabc_c(
859
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
860
    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
861
10.8k
    int bd) {
862
10.8k
  assert(subpel_y_qn == 8);
863
10.8k
  assert(filter_params_y->taps == 2);
864
10.8k
  (void)filter_params_y;
865
10.8k
  (void)subpel_y_qn;
866
867
  // vertical filter
868
  // explicitly operate for subpel_y_qn = 8.
869
79.8k
  for (int y = 0; y < h; ++y) {
870
690k
    for (int x = 0; x < w; ++x) {
871
621k
      const int32_t res = src[x] + src[src_stride + x];
872
621k
      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, 1), bd);
873
621k
    }
874
69.0k
    src += src_stride;
875
69.0k
    dst += dst_stride;
876
69.0k
  }
877
10.8k
}
878
879
// This function is exactly the same as av1_highbd_convolve_x_sr_c, and is an
880
// optimized version for intrabc.
881
void av1_highbd_convolve_x_sr_intrabc_c(
882
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
883
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
884
10.9k
    ConvolveParams *conv_params, int bd) {
885
10.9k
  const int bits = FILTER_BITS - conv_params->round_0;
886
10.9k
  assert(bits >= 0);
887
10.9k
  assert(subpel_x_qn == 8);
888
10.9k
  assert(filter_params_x->taps == 2);
889
10.9k
  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
890
10.9k
  (void)filter_params_x;
891
10.9k
  (void)subpel_x_qn;
892
893
  // horizontal filter
894
  // explicitly operate for subpel_x_qn = 8.
895
83.8k
  for (int y = 0; y < h; ++y) {
896
875k
    for (int x = 0; x < w; ++x) {
897
802k
      int32_t res = 64 * (src[x] + src[x + 1]);
898
802k
      res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
899
802k
      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
900
802k
    }
901
72.9k
    src += src_stride;
902
72.9k
    dst += dst_stride;
903
72.9k
  }
904
10.9k
}
905
906
// High-bitdepth separable 2D convolution for compound prediction
// (C reference).
//
// The two filter passes match the single-reference 2D kernel, but the result
// after the round_1 shift keeps its offset and is handled per
// conv_params->do_average:
//   - 0: stored in the intermediate buffer conv_params->dst;
//   - non-zero: combined with the value already in conv_params->dst (weighted
//     by fwd_offset / bck_offset when use_dist_wtd_comp_avg is set, a plain
//     average otherwise), then offset-corrected, rounded, clipped to `bd`
//     bits and written to `dst`.
void av1_highbd_dist_wtd_convolve_2d_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  CONV_BUF_TYPE *const accum = conv_params->dst;
  const int accum_stride = conv_params->dst_stride;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int im_rows = h + taps_y - 1;
  const int rows_above = taps_y / 2 - 1;
  const int cols_left = taps_x / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);

  // Pass 1: horizontal filter into the intermediate block, starting
  // `rows_above` rows above the block.
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int32_t horiz_bias = 1 << (bd + FILTER_BITS - 1);
  for (int row = 0; row < im_rows; ++row) {
    for (int col = 0; col < w; ++col) {
      const uint16_t *const window =
          &src[(row - rows_above) * src_stride + col - cols_left];
      int32_t acc = horiz_bias;
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel_x[tap] * window[tap];
      }
      assert(taps_x > 8 || (0 <= acc && acc < (1 << (bd + FILTER_BITS + 1))));
      im_block[row * w + col] = (int16_t)ROUND_POWER_OF_TWO(acc, round_0);
    }
  }

  // Pass 2: vertical filter, then store or average.
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int32_t vert_bias = 1 << offset_bits;
  const int32_t round_offset = (1 << (offset_bits - round_1)) +
                               (1 << (offset_bits - round_1 - 1));
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t acc = vert_bias;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel_y[tap] * im_block[(row + tap) * w + col];
      }
      assert(taps_y > 8 || (0 <= acc && acc < (1 << (offset_bits + 2))));
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, round_1);

      if (!conv_params->do_average) {
        // First prediction: keep the offset value for the later average.
        accum[row * accum_stride + col] = res;
        continue;
      }

      int32_t avg = accum[row * accum_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        avg = avg * conv_params->fwd_offset + res * conv_params->bck_offset;
        avg = avg >> DIST_PRECISION_BITS;
      } else {
        avg += res;
        avg = avg >> 1;
      }
      avg -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(avg, round_bits), bd);
    }
  }
}
974
975
void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride,
976
                                      uint16_t *dst, int dst_stride, int w,
977
                                      int h,
978
                                      const InterpFilterParams *filter_params_x,
979
                                      const int subpel_x_qn,
980
2.77k
                                      ConvolveParams *conv_params, int bd) {
981
2.77k
  CONV_BUF_TYPE *dst16 = conv_params->dst;
982
2.77k
  int dst16_stride = conv_params->dst_stride;
983
2.77k
  const int fo_horiz = filter_params_x->taps / 2 - 1;
984
2.77k
  const int bits = FILTER_BITS - conv_params->round_1;
985
2.77k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
986
2.77k
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
987
2.77k
                           (1 << (offset_bits - conv_params->round_1 - 1));
988
2.77k
  const int round_bits =
989
2.77k
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
990
2.77k
  assert(round_bits >= 0);
991
2.77k
  assert(bits >= 0);
992
  // horizontal filter
993
2.77k
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
994
2.77k
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
995
34.0k
  for (int y = 0; y < h; ++y) {
996
345k
    for (int x = 0; x < w; ++x) {
997
314k
      int32_t res = 0;
998
2.82M
      for (int k = 0; k < filter_params_x->taps; ++k) {
999
2.51M
        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
1000
2.51M
      }
1001
314k
      res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
1002
314k
      res += round_offset;
1003
1004
314k
      if (conv_params->do_average) {
1005
217k
        int32_t tmp = dst16[y * dst16_stride + x];
1006
217k
        if (conv_params->use_dist_wtd_comp_avg) {
1007
5.12k
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
1008
5.12k
          tmp = tmp >> DIST_PRECISION_BITS;
1009
212k
        } else {
1010
212k
          tmp += res;
1011
212k
          tmp = tmp >> 1;
1012
212k
        }
1013
217k
        tmp -= round_offset;
1014
217k
        dst[y * dst_stride + x] =
1015
217k
            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
1016
217k
      } else {
1017
96.5k
        dst16[y * dst16_stride + x] = res;
1018
96.5k
      }
1019
314k
    }
1020
31.3k
  }
1021
2.77k
}
1022
1023
void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride,
1024
                                      uint16_t *dst, int dst_stride, int w,
1025
                                      int h,
1026
                                      const InterpFilterParams *filter_params_y,
1027
                                      const int subpel_y_qn,
1028
2.39k
                                      ConvolveParams *conv_params, int bd) {
1029
2.39k
  CONV_BUF_TYPE *dst16 = conv_params->dst;
1030
2.39k
  int dst16_stride = conv_params->dst_stride;
1031
2.39k
  const int fo_vert = filter_params_y->taps / 2 - 1;
1032
2.39k
  const int bits = FILTER_BITS - conv_params->round_0;
1033
2.39k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
1034
2.39k
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
1035
2.39k
                           (1 << (offset_bits - conv_params->round_1 - 1));
1036
2.39k
  const int round_bits =
1037
2.39k
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
1038
2.39k
  assert(round_bits >= 0);
1039
2.39k
  assert(bits >= 0);
1040
  // vertical filter
1041
2.39k
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
1042
2.39k
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
1043
28.2k
  for (int y = 0; y < h; ++y) {
1044
304k
    for (int x = 0; x < w; ++x) {
1045
279k
      int32_t res = 0;
1046
2.51M
      for (int k = 0; k < filter_params_y->taps; ++k) {
1047
2.23M
        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
1048
2.23M
      }
1049
279k
      res *= (1 << bits);
1050
279k
      res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
1051
1052
279k
      if (conv_params->do_average) {
1053
79.7k
        int32_t tmp = dst16[y * dst16_stride + x];
1054
79.7k
        if (conv_params->use_dist_wtd_comp_avg) {
1055
9.08k
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
1056
9.08k
          tmp = tmp >> DIST_PRECISION_BITS;
1057
70.6k
        } else {
1058
70.6k
          tmp += res;
1059
70.6k
          tmp = tmp >> 1;
1060
70.6k
        }
1061
79.7k
        tmp -= round_offset;
1062
79.7k
        dst[y * dst_stride + x] =
1063
79.7k
            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
1064
199k
      } else {
1065
199k
        dst16[y * dst16_stride + x] = res;
1066
199k
      }
1067
279k
    }
1068
25.8k
  }
1069
2.39k
}
1070
1071
void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride,
1072
                                            uint16_t *dst, int dst_stride,
1073
                                            int w, int h,
1074
                                            ConvolveParams *conv_params,
1075
10.9k
                                            int bd) {
1076
10.9k
  CONV_BUF_TYPE *dst16 = conv_params->dst;
1077
10.9k
  int dst16_stride = conv_params->dst_stride;
1078
10.9k
  const int bits =
1079
10.9k
      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
1080
10.9k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
1081
10.9k
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
1082
10.9k
                           (1 << (offset_bits - conv_params->round_1 - 1));
1083
10.9k
  assert(bits >= 0);
1084
1085
109k
  for (int y = 0; y < h; ++y) {
1086
1.02M
    for (int x = 0; x < w; ++x) {
1087
929k
      CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
1088
929k
      res += round_offset;
1089
929k
      if (conv_params->do_average) {
1090
268k
        int32_t tmp = dst16[y * dst16_stride + x];
1091
268k
        if (conv_params->use_dist_wtd_comp_avg) {
1092
187k
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
1093
187k
          tmp = tmp >> DIST_PRECISION_BITS;
1094
187k
        } else {
1095
81.1k
          tmp += res;
1096
81.1k
          tmp = tmp >> 1;
1097
81.1k
        }
1098
268k
        tmp -= round_offset;
1099
268k
        dst[y * dst_stride + x] =
1100
268k
            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
1101
661k
      } else {
1102
661k
        dst16[y * dst16_stride + x] = res;
1103
661k
      }
1104
929k
    }
1105
98.5k
  }
1106
10.9k
}
1107
1108
void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
1109
                                    uint16_t *dst, int dst_stride, int w, int h,
1110
                                    const InterpFilterParams *filter_params_x,
1111
                                    const InterpFilterParams *filter_params_y,
1112
                                    const int subpel_x_qn, const int x_step_qn,
1113
                                    const int subpel_y_qn, const int y_step_qn,
1114
5.94k
                                    ConvolveParams *conv_params, int bd) {
1115
5.94k
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
1116
5.94k
  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
1117
5.94k
             filter_params_y->taps;
1118
5.94k
  int im_stride = w;
1119
5.94k
  const int fo_vert = filter_params_y->taps / 2 - 1;
1120
5.94k
  const int fo_horiz = filter_params_x->taps / 2 - 1;
1121
5.94k
  CONV_BUF_TYPE *dst16 = conv_params->dst;
1122
5.94k
  const int dst16_stride = conv_params->dst_stride;
1123
5.94k
  const int bits =
1124
5.94k
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
1125
5.94k
  assert(bits >= 0);
1126
  // horizontal filter
1127
5.94k
  const uint16_t *src_horiz = src - fo_vert * src_stride;
1128
93.7k
  for (int y = 0; y < im_h; ++y) {
1129
87.7k
    int x_qn = subpel_x_qn;
1130
1.29M
    for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
1131
1.20M
      const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
1132
1.20M
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
1133
1.20M
      assert(x_filter_idx < SUBPEL_SHIFTS);
1134
1.20M
      const int16_t *x_filter =
1135
1.20M
          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
1136
1.20M
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
1137
10.8M
      for (int k = 0; k < filter_params_x->taps; ++k) {
1138
9.62M
        sum += x_filter[k] * src_x[k - fo_horiz];
1139
9.62M
      }
1140
1.20M
      assert(filter_params_x->taps > 8 ||
1141
1.20M
             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
1142
1.20M
      im_block[y * im_stride + x] =
1143
1.20M
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
1144
1.20M
    }
1145
87.7k
    src_horiz += src_stride;
1146
87.7k
  }
1147
1148
  // vertical filter
1149
5.94k
  int16_t *src_vert = im_block + fo_vert * im_stride;
1150
5.94k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
1151
53.2k
  for (int x = 0; x < w; ++x) {
1152
47.3k
    int y_qn = subpel_y_qn;
1153
916k
    for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
1154
869k
      const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
1155
869k
      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
1156
869k
      assert(y_filter_idx < SUBPEL_SHIFTS);
1157
869k
      const int16_t *y_filter =
1158
869k
          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
1159
869k
      int32_t sum = 1 << offset_bits;
1160
7.82M
      for (int k = 0; k < filter_params_y->taps; ++k) {
1161
6.95M
        sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
1162
6.95M
      }
1163
869k
      assert(filter_params_y->taps > 8 ||
1164
869k
             (0 <= sum && sum < (1 << (offset_bits + 2))));
1165
869k
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
1166
869k
      if (conv_params->is_compound) {
1167
176k
        if (conv_params->do_average) {
1168
63.6k
          int32_t tmp = dst16[y * dst16_stride + x];
1169
63.6k
          if (conv_params->use_dist_wtd_comp_avg) {
1170
36.8k
            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
1171
36.8k
            tmp = tmp >> DIST_PRECISION_BITS;
1172
36.8k
          } else {
1173
26.7k
            tmp += res;
1174
26.7k
            tmp = tmp >> 1;
1175
26.7k
          }
1176
          /* Subtract round offset and convolve round */
1177
63.6k
          tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
1178
63.6k
                       (1 << (offset_bits - conv_params->round_1 - 1)));
1179
63.6k
          dst[y * dst_stride + x] =
1180
63.6k
              clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
1181
112k
        } else {
1182
112k
          dst16[y * dst16_stride + x] = res;
1183
112k
        }
1184
693k
      } else {
1185
        /* Subtract round offset and convolve round */
1186
693k
        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
1187
693k
                             (1 << (offset_bits - conv_params->round_1 - 1)));
1188
693k
        dst[y * dst_stride + x] =
1189
693k
            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
1190
693k
      }
1191
869k
    }
1192
47.3k
    src_vert++;
1193
47.3k
  }
1194
5.94k
}
1195
1196
// Dispatcher for the high-bitdepth compound, unscaled path. A zero subpel
// offset along an axis means that axis needs no filtering, so the narrowest
// dist_wtd kernel that covers the non-zero offsets is selected: 2D,
// horizontal-only, vertical-only, or copy.
static void highbd_convolve_2d_facade_compound(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    const int w, const int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  const bool filter_horiz = subpel_x_qn != 0;
  const bool filter_vert = subpel_y_qn != 0;

  if (filter_horiz && filter_vert) {
    av1_highbd_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
                                    filter_params_x, filter_params_y,
                                    subpel_x_qn, subpel_y_qn, conv_params, bd);
    return;
  }
  if (filter_horiz) {
    av1_highbd_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_x, subpel_x_qn, conv_params,
                                   bd);
    return;
  }
  if (filter_vert) {
    av1_highbd_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_y, subpel_y_qn, conv_params,
                                   bd);
    return;
  }
  // Both offsets are zero.
  av1_highbd_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
                                       conv_params, bd);
}
1221
1222
// Dispatcher for the high-bitdepth non-compound, unscaled path. A zero subpel
// offset along an axis means that axis needs no filtering, so the narrowest
// kernel that covers the non-zero offsets is selected: 2D, horizontal-only,
// vertical-only, or a plain copy.
static void highbd_convolve_2d_facade_single(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    const int w, const int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  const bool filter_horiz = subpel_x_qn != 0;
  const bool filter_vert = subpel_y_qn != 0;

  if (filter_horiz && filter_vert) {
    av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, filter_params_y, subpel_x_qn,
                              subpel_y_qn, conv_params, bd);
    return;
  }
  if (filter_horiz) {
    av1_highbd_convolve_x_sr(src, src_stride, dst, dst_stride, w, h,
                             filter_params_x, subpel_x_qn, conv_params, bd);
    return;
  }
  if (filter_vert) {
    av1_highbd_convolve_y_sr(src, src_stride, dst, dst_stride, w, h,
                             filter_params_y, subpel_y_qn, bd);
    return;
  }
  // Both offsets are zero: the prediction is a straight copy.
  aom_highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h);
}
1245
1246
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
1247
                                   uint8_t *dst8, int dst_stride, int w, int h,
1248
                                   const InterpFilterParams *interp_filters[2],
1249
                                   const int subpel_x_qn, int x_step_q4,
1250
                                   const int subpel_y_qn, int y_step_q4,
1251
                                   int scaled, ConvolveParams *conv_params,
1252
129k
                                   int bd) {
1253
129k
  (void)x_step_q4;
1254
129k
  (void)y_step_q4;
1255
129k
  (void)dst_stride;
1256
129k
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1257
1258
129k
  const InterpFilterParams *filter_params_x = interp_filters[0];
1259
129k
  const InterpFilterParams *filter_params_y = interp_filters[1];
1260
1261
129k
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1262
  // 2-tap filter indicates that it is for IntraBC.
1263
129k
  if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
1264
75.0k
    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
1265
75.0k
    assert(!scaled);
1266
75.0k
    if (subpel_x_qn && subpel_y_qn) {
1267
11.8k
      av1_highbd_convolve_2d_sr_intrabc_c(
1268
11.8k
          src, src_stride, dst, dst_stride, w, h, filter_params_x,
1269
11.8k
          filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
1270
11.8k
      return;
1271
63.1k
    } else if (subpel_x_qn) {
1272
10.9k
      av1_highbd_convolve_x_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
1273
10.9k
                                         filter_params_x, subpel_x_qn,
1274
10.9k
                                         conv_params, bd);
1275
10.9k
      return;
1276
52.1k
    } else if (subpel_y_qn) {
1277
10.8k
      av1_highbd_convolve_y_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
1278
10.8k
                                         filter_params_y, subpel_y_qn, bd);
1279
10.8k
      return;
1280
10.8k
    }
1281
75.0k
  }
1282
1283
95.8k
  if (scaled) {
1284
5.94k
    if (conv_params->is_compound) {
1285
989
      assert(conv_params->dst != NULL);
1286
989
    }
1287
5.94k
    av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
1288
5.94k
                                 filter_params_x, filter_params_y, subpel_x_qn,
1289
5.94k
                                 x_step_q4, subpel_y_qn, y_step_q4, conv_params,
1290
5.94k
                                 bd);
1291
89.8k
  } else if (conv_params->is_compound) {
1292
19.8k
    highbd_convolve_2d_facade_compound(
1293
19.8k
        src, src_stride, dst, dst_stride, w, h, filter_params_x,
1294
19.8k
        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
1295
70.0k
  } else {
1296
70.0k
    highbd_convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
1297
70.0k
                                     filter_params_x, filter_params_y,
1298
70.0k
                                     subpel_x_qn, subpel_y_qn, conv_params, bd);
1299
70.0k
  }
1300
95.8k
}
1301
#endif  // CONFIG_AV1_HIGHBITDEPTH
1302
1303
// Note: Fixed size intermediate buffers, place limits on parameters
1304
// of some functions. 2d filtering proceeds in 2 steps:
1305
//   (1) Interpolate horizontally into an intermediate buffer, temp.
1306
//   (2) Interpolate temp vertically to derive the sub-pixel result.
1307
// Deriving the maximum number of rows in the temp buffer (263):
1308
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
1309
// --Largest block size is 128x128 pixels.
1310
// --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
1311
//   original frame (in 1/16th pixel units).
1312
// --Must round-up because block may be located at sub-pixel position.
1313
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
1314
// --((128 - 1) * 32 + 15) >> 4 + 8 = 263.
1315
#define WIENER_MAX_EXT_SIZE 263
1316
1317
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1318
6.92M
static inline int horz_scalar_product(const uint8_t *a, const int16_t *b) {
1319
6.92M
  int sum = 0;
1320
62.3M
  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1321
6.92M
  return sum;
1322
6.92M
}
1323
1324
#if CONFIG_AV1_HIGHBITDEPTH
1325
static inline int highbd_horz_scalar_product(const uint16_t *a,
1326
21.4M
                                             const int16_t *b) {
1327
21.4M
  int sum = 0;
1328
193M
  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1329
21.4M
  return sum;
1330
21.4M
}
1331
#endif
1332
1333
static inline int highbd_vert_scalar_product(const uint16_t *a,
1334
                                             ptrdiff_t a_stride,
1335
26.7M
                                             const int16_t *b) {
1336
26.7M
  int sum = 0;
1337
230M
  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
1338
26.7M
  return sum;
1339
26.7M
}
1340
1341
28.2k
// Returns `filter` rounded down to the enclosing 256-byte boundary,
// reinterpreted as a pointer to the start of an InterpKernel table.
static const InterpKernel *get_filter_base(const int16_t *filter) {
  // NOTE: This assumes that the filter table is 256-byte aligned.
  // TODO(agrange) Modify to make independent of table alignment.
  const intptr_t align_mask = ~((intptr_t)0xFF);
  const intptr_t filter_addr = (intptr_t)filter;
  return (const InterpKernel *)(filter_addr & align_mask);
}
1346
1347
28.2k
// Returns the index, in whole InterpKernel entries, of kernel `f` relative
// to the table start `base`.
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  const InterpKernel *const kernel = (const InterpKernel *)(intptr_t)f;
  return (int)(kernel - base);
}
1350
1351
static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
1352
                                       uint16_t *dst, ptrdiff_t dst_stride,
1353
                                       const InterpKernel *x_filters, int x0_q4,
1354
                                       int x_step_q4, int w, int h,
1355
3.93k
                                       int round0_bits) {
1356
3.93k
  const int bd = 8;
1357
3.93k
  src -= SUBPEL_TAPS / 2 - 1;
1358
153k
  for (int y = 0; y < h; ++y) {
1359
149k
    int x_q4 = x0_q4;
1360
7.07M
    for (int x = 0; x < w; ++x) {
1361
6.92M
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1362
6.92M
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1363
6.92M
      const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1364
6.92M
                           (1 << (bd + FILTER_BITS - 1));
1365
6.92M
      const int sum = horz_scalar_product(src_x, x_filter) + rounding;
1366
6.92M
      dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1367
6.92M
                               WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
1368
6.92M
      x_q4 += x_step_q4;
1369
6.92M
    }
1370
149k
    src += src_stride;
1371
149k
    dst += dst_stride;
1372
149k
  }
1373
3.93k
}
1374
1375
static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
1376
                                      uint8_t *dst, ptrdiff_t dst_stride,
1377
                                      const InterpKernel *y_filters, int y0_q4,
1378
                                      int y_step_q4, int w, int h,
1379
3.94k
                                      int round1_bits) {
1380
3.94k
  const int bd = 8;
1381
3.94k
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1382
1383
164k
  for (int x = 0; x < w; ++x) {
1384
160k
    int y_q4 = y0_q4;
1385
7.30M
    for (int y = 0; y < h; ++y) {
1386
7.14M
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1387
7.14M
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1388
7.14M
      const int rounding =
1389
7.14M
          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1390
7.14M
          (1 << (bd + round1_bits - 1));
1391
7.14M
      const int sum =
1392
7.14M
          highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1393
7.14M
      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
1394
7.14M
      y_q4 += y_step_q4;
1395
7.14M
    }
1396
160k
    ++src;
1397
160k
    ++dst;
1398
160k
  }
1399
3.94k
}
1400
1401
void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
1402
                                   uint8_t *dst, ptrdiff_t dst_stride,
1403
                                   const int16_t *filter_x, int x_step_q4,
1404
                                   const int16_t *filter_y, int y_step_q4,
1405
                                   int w, int h,
1406
3.94k
                                   const WienerConvolveParams *conv_params) {
1407
3.94k
  const InterpKernel *const filters_x = get_filter_base(filter_x);
1408
3.94k
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
1409
1410
3.94k
  const InterpKernel *const filters_y = get_filter_base(filter_y);
1411
3.94k
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
1412
1413
3.94k
  uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1414
3.94k
  const int intermediate_height =
1415
3.94k
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
1416
3.94k
  memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
1417
1418
3.94k
  assert(w <= MAX_SB_SIZE);
1419
3.94k
  assert(h <= MAX_SB_SIZE);
1420
3.94k
  assert(y_step_q4 <= 32);
1421
3.94k
  assert(x_step_q4 <= 32);
1422
1423
3.94k
  convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1424
3.94k
                             src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4,
1425
3.94k
                             x_step_q4, w, intermediate_height,
1426
3.94k
                             conv_params->round_0);
1427
3.94k
  convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
1428
3.94k
                            MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4,
1429
3.94k
                            y_step_q4, w, h, conv_params->round_1);
1430
3.94k
}
1431
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1432
1433
#if CONFIG_AV1_HIGHBITDEPTH
1434
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1435
static void highbd_convolve_add_src_horiz_hip(
1436
    const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
1437
    ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
1438
10.1k
    int x_step_q4, int w, int h, int round0_bits, int bd) {
1439
10.1k
  const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
1440
10.1k
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1441
10.1k
  src -= SUBPEL_TAPS / 2 - 1;
1442
499k
  for (int y = 0; y < h; ++y) {
1443
489k
    int x_q4 = x0_q4;
1444
21.9M
    for (int x = 0; x < w; ++x) {
1445
21.5M
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1446
21.5M
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1447
21.5M
      const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1448
21.5M
                           (1 << (bd + FILTER_BITS - 1));
1449
21.5M
      const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
1450
21.5M
      dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1451
21.5M
                               extraprec_clamp_limit - 1);
1452
21.5M
      x_q4 += x_step_q4;
1453
21.5M
    }
1454
489k
    src += src_stride;
1455
489k
    dst += dst_stride;
1456
489k
  }
1457
10.1k
}
1458
1459
static void highbd_convolve_add_src_vert_hip(
1460
    const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
1461
    ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
1462
10.1k
    int y_step_q4, int w, int h, int round1_bits, int bd) {
1463
10.1k
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1464
10.1k
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1465
430k
  for (int x = 0; x < w; ++x) {
1466
420k
    int y_q4 = y0_q4;
1467
20.8M
    for (int y = 0; y < h; ++y) {
1468
20.4M
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1469
20.4M
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1470
20.4M
      const int rounding =
1471
20.4M
          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1472
20.4M
          (1 << (bd + round1_bits - 1));
1473
20.4M
      const int sum =
1474
20.4M
          highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1475
20.4M
      dst[y * dst_stride] =
1476
20.4M
          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
1477
20.4M
      y_q4 += y_step_q4;
1478
20.4M
    }
1479
420k
    ++src;
1480
420k
    ++dst;
1481
420k
  }
1482
10.1k
}
1483
1484
void av1_highbd_wiener_convolve_add_src_c(
1485
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1486
    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
1487
    const int16_t *filter_y, int y_step_q4, int w, int h,
1488
10.1k
    const WienerConvolveParams *conv_params, int bd) {
1489
10.1k
  const InterpKernel *const filters_x = get_filter_base(filter_x);
1490
10.1k
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
1491
1492
10.1k
  const InterpKernel *const filters_y = get_filter_base(filter_y);
1493
10.1k
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
1494
1495
10.1k
  uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1496
10.1k
  const int intermediate_height =
1497
10.1k
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1498
1499
10.1k
  assert(w <= MAX_SB_SIZE);
1500
10.1k
  assert(h <= MAX_SB_SIZE);
1501
10.1k
  assert(y_step_q4 <= 32);
1502
10.1k
  assert(x_step_q4 <= 32);
1503
10.1k
  assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
1504
1505
10.1k
  highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1506
10.1k
                                    src_stride, temp, MAX_SB_SIZE, filters_x,
1507
10.1k
                                    x0_q4, x_step_q4, w, intermediate_height,
1508
10.1k
                                    conv_params->round_0, bd);
1509
10.1k
  highbd_convolve_add_src_vert_hip(
1510
10.1k
      temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
1511
10.1k
      filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
1512
10.1k
}
1513
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1514
#endif  // CONFIG_AV1_HIGHBITDEPTH