Coverage Report

Created: 2023-06-07 06:31

/src/aom/av1/common/convolve.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <assert.h>
13
#include <string.h>
14
15
#include "config/aom_dsp_rtcd.h"
16
#include "config/av1_rtcd.h"
17
18
#include "av1/common/av1_common_int.h"
19
#include "av1/common/blockd.h"
20
#include "av1/common/convolve.h"
21
#include "av1/common/filter.h"
22
#include "av1/common/resize.h"
23
#include "aom_dsp/aom_dsp_common.h"
24
#include "aom_ports/mem.h"
25
26
void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
27
                             int dst_stride, int w, int h,
28
                             const int16_t *x_filters, int x0_qn,
29
0
                             int x_step_qn) {
30
0
  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
31
0
  for (int y = 0; y < h; ++y) {
32
0
    int x_qn = x0_qn;
33
0
    for (int x = 0; x < w; ++x) {
34
0
      const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
35
0
      const int x_filter_idx =
36
0
          (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
37
0
      assert(x_filter_idx <= RS_SUBPEL_MASK);
38
0
      const int16_t *const x_filter =
39
0
          &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
40
0
      int sum = 0;
41
0
      for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
42
0
        sum += src_x[k] * x_filter[k];
43
0
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
44
0
      x_qn += x_step_qn;
45
0
    }
46
0
    src += src_stride;
47
0
    dst += dst_stride;
48
0
  }
49
0
}
50
51
void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
52
                                    uint16_t *dst, int dst_stride, int w, int h,
53
                                    const int16_t *x_filters, int x0_qn,
54
0
                                    int x_step_qn, int bd) {
55
0
  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
56
0
  for (int y = 0; y < h; ++y) {
57
0
    int x_qn = x0_qn;
58
0
    for (int x = 0; x < w; ++x) {
59
0
      const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
60
0
      const int x_filter_idx =
61
0
          (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
62
0
      assert(x_filter_idx <= RS_SUBPEL_MASK);
63
0
      const int16_t *const x_filter =
64
0
          &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
65
0
      int sum = 0;
66
0
      for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
67
0
        sum += src_x[k] * x_filter[k];
68
0
      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
69
0
      x_qn += x_step_qn;
70
0
    }
71
0
    src += src_stride;
72
0
    dst += dst_stride;
73
0
  }
74
0
}
75
76
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
77
                          int dst_stride, int w, int h,
78
                          const InterpFilterParams *filter_params_x,
79
                          const InterpFilterParams *filter_params_y,
80
                          const int subpel_x_qn, const int subpel_y_qn,
81
0
                          ConvolveParams *conv_params) {
82
0
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
83
0
  int im_h = h + filter_params_y->taps - 1;
84
0
  int im_stride = w;
85
0
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
86
0
  const int fo_vert = filter_params_y->taps / 2 - 1;
87
0
  const int fo_horiz = filter_params_x->taps / 2 - 1;
88
0
  const int bd = 8;
89
0
  const int bits =
90
0
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
91
92
  // horizontal filter
93
0
  const uint8_t *src_horiz = src - fo_vert * src_stride;
94
0
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
95
0
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
96
0
  for (int y = 0; y < im_h; ++y) {
97
0
    for (int x = 0; x < w; ++x) {
98
0
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
99
0
      for (int k = 0; k < filter_params_x->taps; ++k) {
100
0
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
101
0
      }
102
103
      // TODO(aomedia:3393): for 12-tap filter, in extreme cases, the result can
104
      // be beyond the following range. For better prediction, a clamping can be
105
      // added for 12 tap filter to ensure the horizontal filtering result is
106
      // within 16 bit. The same applies to the vertical filtering.
107
0
      assert(filter_params_x->taps > 8 ||
108
0
             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
109
0
      im_block[y * im_stride + x] =
110
0
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
111
0
    }
112
0
  }
113
114
  // vertical filter
115
0
  int16_t *src_vert = im_block + fo_vert * im_stride;
116
0
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
117
0
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
118
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
119
0
  for (int y = 0; y < h; ++y) {
120
0
    for (int x = 0; x < w; ++x) {
121
0
      int32_t sum = 1 << offset_bits;
122
0
      for (int k = 0; k < filter_params_y->taps; ++k) {
123
0
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
124
0
      }
125
0
      assert(filter_params_y->taps > 8 ||
126
0
             (0 <= sum && sum < (1 << (offset_bits + 2))));
127
0
      int16_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
128
0
                    ((1 << (offset_bits - conv_params->round_1)) +
129
0
                     (1 << (offset_bits - conv_params->round_1 - 1)));
130
0
      dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
131
0
    }
132
0
  }
133
0
}
134
135
void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
136
                         int dst_stride, int w, int h,
137
                         const InterpFilterParams *filter_params_y,
138
0
                         const int subpel_y_qn) {
139
0
  const int fo_vert = filter_params_y->taps / 2 - 1;
140
141
  // vertical filter
142
0
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
143
0
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
144
0
  for (int y = 0; y < h; ++y) {
145
0
    for (int x = 0; x < w; ++x) {
146
0
      int32_t res = 0;
147
0
      for (int k = 0; k < filter_params_y->taps; ++k) {
148
0
        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
149
0
      }
150
0
      dst[y * dst_stride + x] =
151
0
          clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS));
152
0
    }
153
0
  }
154
0
}
155
156
void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
157
                         int dst_stride, int w, int h,
158
                         const InterpFilterParams *filter_params_x,
159
0
                         const int subpel_x_qn, ConvolveParams *conv_params) {
160
0
  const int fo_horiz = filter_params_x->taps / 2 - 1;
161
0
  const int bits = FILTER_BITS - conv_params->round_0;
162
163
0
  assert(bits >= 0);
164
0
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
165
0
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
166
167
  // horizontal filter
168
0
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
169
0
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
170
171
0
  for (int y = 0; y < h; ++y) {
172
0
    for (int x = 0; x < w; ++x) {
173
0
      int32_t res = 0;
174
0
      for (int k = 0; k < filter_params_x->taps; ++k) {
175
0
        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
176
0
      }
177
0
      res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
178
0
      dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
179
0
    }
180
0
  }
181
0
}
182
183
// This function is exactly the same as av1_convolve_2d_sr_c, and is an
184
// optimized version for intrabc. Use the following 2-tap filter:
185
// DECLARE_ALIGNED(256, static const int16_t,
186
//                 av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = {
187
//   128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
188
//   64,  64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189
// };
190
void av1_convolve_2d_sr_intrabc_c(const uint8_t *src, int src_stride,
191
                                  uint8_t *dst, int dst_stride, int w, int h,
192
                                  const InterpFilterParams *filter_params_x,
193
                                  const InterpFilterParams *filter_params_y,
194
                                  const int subpel_x_qn, const int subpel_y_qn,
195
3.28k
                                  ConvolveParams *conv_params) {
196
3.28k
  assert(subpel_x_qn == 8);
197
0
  assert(subpel_y_qn == 8);
198
0
  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
199
0
  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
200
0
  (void)filter_params_x;
201
3.28k
  (void)subpel_x_qn;
202
3.28k
  (void)filter_params_y;
203
3.28k
  (void)subpel_y_qn;
204
3.28k
  (void)conv_params;
205
206
3.28k
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
207
3.28k
  int im_h = h + 1;
208
3.28k
  int im_stride = w;
209
3.28k
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
210
0
  const int bd = 8;
211
212
  // horizontal filter
213
  // explicitly operate for subpel_x_qn = 8.
214
3.28k
  int16_t *im = im_block;
215
33.8k
  for (int y = 0; y < im_h; ++y) {
216
557k
    for (int x = 0; x < w; ++x) {
217
526k
      const int32_t sum = (1 << bd) + src[x] + src[x + 1];
218
526k
      assert(0 <= sum && sum < (1 << (bd + 2)));
219
0
      im[x] = sum;
220
526k
    }
221
30.5k
    src += src_stride;
222
30.5k
    im += im_stride;
223
30.5k
  }
224
225
  // vertical filter
226
  // explicitly operate for subpel_y_qn = 8.
227
3.28k
  int16_t *src_vert = im_block;
228
30.5k
  for (int y = 0; y < h; ++y) {
229
524k
    for (int x = 0; x < w; ++x) {
230
497k
      const int32_t sum =
231
497k
          (1 << (bd + 2)) + src_vert[x] + src_vert[im_stride + x];
232
497k
      assert(0 <= sum && sum < (1 << (bd + 4)));
233
0
      const int16_t res =
234
497k
          ROUND_POWER_OF_TWO(sum, 2) - ((1 << bd) + (1 << (bd - 1)));
235
497k
      dst[x] = clip_pixel(res);
236
497k
    }
237
27.2k
    src_vert += im_stride;
238
27.2k
    dst += dst_stride;
239
27.2k
  }
240
3.28k
}
241
242
// This function is exactly the same as av1_convolve_y_sr_c, and is an
243
// optimized version for intrabc.
244
void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride,
245
                                 uint8_t *dst, int dst_stride, int w, int h,
246
                                 const InterpFilterParams *filter_params_y,
247
2.97k
                                 const int subpel_y_qn) {
248
2.97k
  assert(subpel_y_qn == 8);
249
0
  assert(filter_params_y->taps == 2);
250
0
  (void)filter_params_y;
251
2.97k
  (void)subpel_y_qn;
252
253
  // vertical filter
254
  // explicitly operate for subpel_y_qn = 8.
255
27.3k
  for (int y = 0; y < h; ++y) {
256
353k
    for (int x = 0; x < w; ++x) {
257
329k
      const int32_t res = src[x] + src[src_stride + x];
258
329k
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1));
259
329k
    }
260
24.4k
    src += src_stride;
261
24.4k
    dst += dst_stride;
262
24.4k
  }
263
2.97k
}
264
265
// This function is exactly the same as av1_convolve_x_sr_c, and is an
266
// optimized version for intrabc.
267
void av1_convolve_x_sr_intrabc_c(const uint8_t *src, int src_stride,
268
                                 uint8_t *dst, int dst_stride, int w, int h,
269
                                 const InterpFilterParams *filter_params_x,
270
                                 const int subpel_x_qn,
271
3.40k
                                 ConvolveParams *conv_params) {
272
3.40k
  assert(subpel_x_qn == 8);
273
0
  assert(filter_params_x->taps == 2);
274
0
  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
275
0
  (void)filter_params_x;
276
3.40k
  (void)subpel_x_qn;
277
3.40k
  (void)conv_params;
278
279
  // horizontal filter
280
  // explicitly operate for subpel_x_qn = 8.
281
35.2k
  for (int y = 0; y < h; ++y) {
282
467k
    for (int x = 0; x < w; ++x) {
283
435k
      const int32_t res = src[x] + src[x + 1];
284
435k
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1));
285
435k
    }
286
31.8k
    src += src_stride;
287
31.8k
    dst += dst_stride;
288
31.8k
  }
289
3.40k
}
290
291
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
292
                                uint8_t *dst, int dst_stride, int w, int h,
293
                                const InterpFilterParams *filter_params_x,
294
                                const InterpFilterParams *filter_params_y,
295
                                const int subpel_x_qn, const int subpel_y_qn,
296
0
                                ConvolveParams *conv_params) {
297
0
  CONV_BUF_TYPE *dst16 = conv_params->dst;
298
0
  int dst16_stride = conv_params->dst_stride;
299
0
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
300
0
  int im_h = h + filter_params_y->taps - 1;
301
0
  int im_stride = w;
302
0
  const int fo_vert = filter_params_y->taps / 2 - 1;
303
0
  const int fo_horiz = filter_params_x->taps / 2 - 1;
304
0
  const int bd = 8;
305
0
  const int round_bits =
306
0
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
307
308
  // horizontal filter
309
0
  const uint8_t *src_horiz = src - fo_vert * src_stride;
310
0
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
311
0
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
312
0
  for (int y = 0; y < im_h; ++y) {
313
0
    for (int x = 0; x < w; ++x) {
314
0
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
315
0
      for (int k = 0; k < filter_params_x->taps; ++k) {
316
0
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
317
0
      }
318
0
      assert(filter_params_x->taps > 8 ||
319
0
             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
320
0
      im_block[y * im_stride + x] =
321
0
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
322
0
    }
323
0
  }
324
325
  // vertical filter
326
0
  int16_t *src_vert = im_block + fo_vert * im_stride;
327
0
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
328
0
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
329
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
330
0
  for (int y = 0; y < h; ++y) {
331
0
    for (int x = 0; x < w; ++x) {
332
0
      int32_t sum = 1 << offset_bits;
333
0
      for (int k = 0; k < filter_params_y->taps; ++k) {
334
0
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
335
0
      }
336
0
      assert(filter_params_y->taps > 8 ||
337
0
             (0 <= sum && sum < (1 << (offset_bits + 2))));
338
0
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
339
0
      if (conv_params->do_average) {
340
0
        int32_t tmp = dst16[y * dst16_stride + x];
341
0
        if (conv_params->use_dist_wtd_comp_avg) {
342
0
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
343
0
          tmp = tmp >> DIST_PRECISION_BITS;
344
0
        } else {
345
0
          tmp += res;
346
0
          tmp = tmp >> 1;
347
0
        }
348
0
        tmp -= (1 << (offset_bits - conv_params->round_1)) +
349
0
               (1 << (offset_bits - conv_params->round_1 - 1));
350
0
        dst[y * dst_stride + x] =
351
0
            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
352
0
      } else {
353
0
        dst16[y * dst16_stride + x] = res;
354
0
      }
355
0
    }
356
0
  }
357
0
}
358
359
void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst,
360
                               int dst_stride, int w, int h,
361
                               const InterpFilterParams *filter_params_y,
362
                               const int subpel_y_qn,
363
0
                               ConvolveParams *conv_params) {
364
0
  CONV_BUF_TYPE *dst16 = conv_params->dst;
365
0
  int dst16_stride = conv_params->dst_stride;
366
0
  const int fo_vert = filter_params_y->taps / 2 - 1;
367
0
  const int bits = FILTER_BITS - conv_params->round_0;
368
0
  const int bd = 8;
369
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
370
0
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
371
0
                           (1 << (offset_bits - conv_params->round_1 - 1));
372
0
  const int round_bits =
373
0
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
374
375
  // vertical filter
376
0
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
377
0
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
378
0
  for (int y = 0; y < h; ++y) {
379
0
    for (int x = 0; x < w; ++x) {
380
0
      int32_t res = 0;
381
0
      for (int k = 0; k < filter_params_y->taps; ++k) {
382
0
        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
383
0
      }
384
0
      res *= (1 << bits);
385
0
      res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
386
387
0
      if (conv_params->do_average) {
388
0
        int32_t tmp = dst16[y * dst16_stride + x];
389
0
        if (conv_params->use_dist_wtd_comp_avg) {
390
0
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
391
0
          tmp = tmp >> DIST_PRECISION_BITS;
392
0
        } else {
393
0
          tmp += res;
394
0
          tmp = tmp >> 1;
395
0
        }
396
0
        tmp -= round_offset;
397
0
        dst[y * dst_stride + x] =
398
0
            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
399
0
      } else {
400
0
        dst16[y * dst16_stride + x] = res;
401
0
      }
402
0
    }
403
0
  }
404
0
}
405
406
void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst,
407
                               int dst_stride, int w, int h,
408
                               const InterpFilterParams *filter_params_x,
409
                               const int subpel_x_qn,
410
0
                               ConvolveParams *conv_params) {
411
0
  CONV_BUF_TYPE *dst16 = conv_params->dst;
412
0
  int dst16_stride = conv_params->dst_stride;
413
0
  const int fo_horiz = filter_params_x->taps / 2 - 1;
414
0
  const int bits = FILTER_BITS - conv_params->round_1;
415
0
  const int bd = 8;
416
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
417
0
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
418
0
                           (1 << (offset_bits - conv_params->round_1 - 1));
419
0
  const int round_bits =
420
0
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
421
422
  // horizontal filter
423
0
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
424
0
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
425
0
  for (int y = 0; y < h; ++y) {
426
0
    for (int x = 0; x < w; ++x) {
427
0
      int32_t res = 0;
428
0
      for (int k = 0; k < filter_params_x->taps; ++k) {
429
0
        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
430
0
      }
431
0
      res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
432
0
      res += round_offset;
433
434
0
      if (conv_params->do_average) {
435
0
        int32_t tmp = dst16[y * dst16_stride + x];
436
0
        if (conv_params->use_dist_wtd_comp_avg) {
437
0
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
438
0
          tmp = tmp >> DIST_PRECISION_BITS;
439
0
        } else {
440
0
          tmp += res;
441
0
          tmp = tmp >> 1;
442
0
        }
443
0
        tmp -= round_offset;
444
0
        dst[y * dst_stride + x] =
445
0
            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
446
0
      } else {
447
0
        dst16[y * dst16_stride + x] = res;
448
0
      }
449
0
    }
450
0
  }
451
0
}
452
453
void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride,
454
                                     uint8_t *dst, int dst_stride, int w, int h,
455
0
                                     ConvolveParams *conv_params) {
456
0
  CONV_BUF_TYPE *dst16 = conv_params->dst;
457
0
  int dst16_stride = conv_params->dst_stride;
458
0
  const int bits =
459
0
      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
460
0
  const int bd = 8;
461
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
462
0
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
463
0
                           (1 << (offset_bits - conv_params->round_1 - 1));
464
465
0
  for (int y = 0; y < h; ++y) {
466
0
    for (int x = 0; x < w; ++x) {
467
0
      CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
468
0
      res += round_offset;
469
470
0
      if (conv_params->do_average) {
471
0
        int32_t tmp = dst16[y * dst16_stride + x];
472
0
        if (conv_params->use_dist_wtd_comp_avg) {
473
0
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
474
0
          tmp = tmp >> DIST_PRECISION_BITS;
475
0
        } else {
476
0
          tmp += res;
477
0
          tmp = tmp >> 1;
478
0
        }
479
0
        tmp -= round_offset;
480
0
        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
481
0
      } else {
482
0
        dst16[y * dst16_stride + x] = res;
483
0
      }
484
0
    }
485
0
  }
486
0
}
487
488
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst,
489
                             int dst_stride, int w, int h,
490
                             const InterpFilterParams *filter_params_x,
491
                             const InterpFilterParams *filter_params_y,
492
                             const int subpel_x_qn, const int x_step_qn,
493
                             const int subpel_y_qn, const int y_step_qn,
494
0
                             ConvolveParams *conv_params) {
495
0
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
496
0
  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
497
0
             filter_params_y->taps;
498
0
  CONV_BUF_TYPE *dst16 = conv_params->dst;
499
0
  const int dst16_stride = conv_params->dst_stride;
500
0
  const int bits =
501
0
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
502
0
  assert(bits >= 0);
503
0
  int im_stride = w;
504
0
  const int fo_vert = filter_params_y->taps / 2 - 1;
505
0
  const int fo_horiz = filter_params_x->taps / 2 - 1;
506
0
  const int bd = 8;
507
508
  // horizontal filter
509
0
  const uint8_t *src_horiz = src - fo_vert * src_stride;
510
0
  for (int y = 0; y < im_h; ++y) {
511
0
    int x_qn = subpel_x_qn;
512
0
    for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
513
0
      const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
514
0
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
515
0
      assert(x_filter_idx < SUBPEL_SHIFTS);
516
0
      const int16_t *x_filter =
517
0
          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
518
0
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
519
0
      for (int k = 0; k < filter_params_x->taps; ++k) {
520
0
        sum += x_filter[k] * src_x[k - fo_horiz];
521
0
      }
522
0
      assert(filter_params_x->taps > 8 ||
523
0
             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
524
0
      im_block[y * im_stride + x] =
525
0
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
526
0
    }
527
0
    src_horiz += src_stride;
528
0
  }
529
530
  // vertical filter
531
0
  int16_t *src_vert = im_block + fo_vert * im_stride;
532
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
533
0
  for (int x = 0; x < w; ++x) {
534
0
    int y_qn = subpel_y_qn;
535
0
    for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
536
0
      const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
537
0
      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
538
0
      assert(y_filter_idx < SUBPEL_SHIFTS);
539
0
      const int16_t *y_filter =
540
0
          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
541
0
      int32_t sum = 1 << offset_bits;
542
0
      for (int k = 0; k < filter_params_y->taps; ++k) {
543
0
        sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
544
0
      }
545
0
      assert(filter_params_y->taps > 8 ||
546
0
             (0 <= sum && sum < (1 << (offset_bits + 2))));
547
0
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
548
0
      if (conv_params->is_compound) {
549
0
        if (conv_params->do_average) {
550
0
          int32_t tmp = dst16[y * dst16_stride + x];
551
0
          if (conv_params->use_dist_wtd_comp_avg) {
552
0
            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
553
0
            tmp = tmp >> DIST_PRECISION_BITS;
554
0
          } else {
555
0
            tmp += res;
556
0
            tmp = tmp >> 1;
557
0
          }
558
          /* Subtract round offset and convolve round */
559
0
          tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
560
0
                       (1 << (offset_bits - conv_params->round_1 - 1)));
561
0
          dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
562
0
        } else {
563
0
          dst16[y * dst16_stride + x] = res;
564
0
        }
565
0
      } else {
566
        /* Subtract round offset and convolve round */
567
0
        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
568
0
                             (1 << (offset_bits - conv_params->round_1 - 1)));
569
0
        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
570
0
      }
571
0
    }
572
0
    src_vert++;
573
0
  }
574
0
}
575
576
static void convolve_2d_scale_wrapper(
577
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
578
    int h, const InterpFilterParams *filter_params_x,
579
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
580
    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
581
6.29M
    ConvolveParams *conv_params) {
582
6.29M
  if (conv_params->is_compound) {
583
235k
    assert(conv_params->dst != NULL);
584
235k
  }
585
0
  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
586
6.29M
                        filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
587
6.29M
                        y_step_qn, conv_params);
588
6.29M
}
589
590
// Dispatches an unscaled compound convolution to one of four routines,
// depending on which of the two subpel offsets is non-zero:
//   neither -> av1_dist_wtd_convolve_2d_copy
//   x only  -> av1_dist_wtd_convolve_x
//   y only  -> av1_dist_wtd_convolve_y
//   both    -> av1_dist_wtd_convolve_2d
static void convolve_2d_facade_compound(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  const bool need_x = subpel_x_qn != 0;
  const bool need_y = subpel_y_qn != 0;

  if (need_x && need_y) {
    av1_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
                             filter_params_x, filter_params_y, subpel_x_qn,
                             subpel_y_qn, conv_params);
    return;
  }
  if (need_x) {
    av1_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
                            filter_params_x, subpel_x_qn, conv_params);
    return;
  }
  if (need_y) {
    av1_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
                            filter_params_y, subpel_y_qn, conv_params);
    return;
  }
  av1_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
                                conv_params);
}
613
614
// Dispatches an unscaled single-reference convolution to one of four
// routines, depending on which of the two subpel offsets is non-zero:
//   neither -> aom_convolve_copy
//   x only  -> av1_convolve_x_sr
//   y only  -> av1_convolve_y_sr
//   both    -> av1_convolve_2d_sr
static void convolve_2d_facade_single(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  const bool need_x = subpel_x_qn != 0;
  const bool need_y = subpel_y_qn != 0;

  if (need_x && need_y) {
    av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                       filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
    return;
  }
  if (need_x) {
    av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                      subpel_x_qn, conv_params);
    return;
  }
  if (need_y) {
    av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y,
                      subpel_y_qn);
    return;
  }
  aom_convolve_copy(src, src_stride, dst, dst_stride, w, h);
}
635
636
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
637
                            int dst_stride, int w, int h,
638
                            const InterpFilterParams *interp_filters[2],
639
                            const int subpel_x_qn, int x_step_q4,
640
                            const int subpel_y_qn, int y_step_q4, int scaled,
641
18.2M
                            ConvolveParams *conv_params) {
642
18.2M
  (void)x_step_q4;
643
18.2M
  (void)y_step_q4;
644
18.2M
  (void)dst;
645
18.2M
  (void)dst_stride;
646
647
18.2M
  const InterpFilterParams *filter_params_x = interp_filters[0];
648
18.2M
  const InterpFilterParams *filter_params_y = interp_filters[1];
649
650
  // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case.
651
  // 2-tap filter indicates that it is for IntraBC.
652
18.2M
  if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
653
62.5k
    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
654
0
    assert(!scaled);
655
62.5k
    if (subpel_x_qn && subpel_y_qn) {
656
3.28k
      av1_convolve_2d_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
657
3.28k
                                   filter_params_x, filter_params_y,
658
3.28k
                                   subpel_x_qn, subpel_y_qn, conv_params);
659
3.28k
      return;
660
59.2k
    } else if (subpel_x_qn) {
661
3.40k
      av1_convolve_x_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
662
3.40k
                                  filter_params_x, subpel_x_qn, conv_params);
663
3.40k
      return;
664
55.8k
    } else if (subpel_y_qn) {
665
2.97k
      av1_convolve_y_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
666
2.97k
                                  filter_params_y, subpel_y_qn);
667
2.97k
      return;
668
2.97k
    }
669
62.5k
  }
670
671
18.2M
  if (scaled) {
672
6.29M
    convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
673
6.29M
                              filter_params_x, filter_params_y, subpel_x_qn,
674
6.29M
                              x_step_q4, subpel_y_qn, y_step_q4, conv_params);
675
11.9M
  } else if (conv_params->is_compound) {
676
2.52M
    convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h,
677
2.52M
                                filter_params_x, filter_params_y, subpel_x_qn,
678
2.52M
                                subpel_y_qn, conv_params);
679
9.41M
  } else {
680
9.41M
    convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
681
9.41M
                              filter_params_x, filter_params_y, subpel_x_qn,
682
9.41M
                              subpel_y_qn, conv_params);
683
9.41M
  }
684
18.2M
}
685
686
#if CONFIG_AV1_HIGHBITDEPTH
687
void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
688
                                uint16_t *dst, int dst_stride, int w, int h,
689
                                const InterpFilterParams *filter_params_x,
690
                                const int subpel_x_qn,
691
0
                                ConvolveParams *conv_params, int bd) {
692
0
  const int fo_horiz = filter_params_x->taps / 2 - 1;
693
0
  const int bits = FILTER_BITS - conv_params->round_0;
694
695
0
  assert(bits >= 0);
696
0
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
697
0
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
698
699
  // horizontal filter
700
0
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
701
0
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
702
0
  for (int y = 0; y < h; ++y) {
703
0
    for (int x = 0; x < w; ++x) {
704
0
      int32_t res = 0;
705
0
      for (int k = 0; k < filter_params_x->taps; ++k) {
706
0
        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
707
0
      }
708
0
      res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
709
0
      dst[y * dst_stride + x] =
710
0
          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
711
0
    }
712
0
  }
713
0
}
714
715
void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
716
                                uint16_t *dst, int dst_stride, int w, int h,
717
                                const InterpFilterParams *filter_params_y,
718
0
                                const int subpel_y_qn, int bd) {
719
0
  const int fo_vert = filter_params_y->taps / 2 - 1;
720
  // vertical filter
721
0
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
722
0
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
723
0
  for (int y = 0; y < h; ++y) {
724
0
    for (int x = 0; x < w; ++x) {
725
0
      int32_t res = 0;
726
0
      for (int k = 0; k < filter_params_y->taps; ++k) {
727
0
        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
728
0
      }
729
0
      dst[y * dst_stride + x] =
730
0
          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd);
731
0
    }
732
0
  }
733
0
}
734
735
void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
736
                                 uint16_t *dst, int dst_stride, int w, int h,
737
                                 const InterpFilterParams *filter_params_x,
738
                                 const InterpFilterParams *filter_params_y,
739
                                 const int subpel_x_qn, const int subpel_y_qn,
740
0
                                 ConvolveParams *conv_params, int bd) {
741
0
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
742
0
  int im_h = h + filter_params_y->taps - 1;
743
0
  int im_stride = w;
744
0
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
745
0
  const int fo_vert = filter_params_y->taps / 2 - 1;
746
0
  const int fo_horiz = filter_params_x->taps / 2 - 1;
747
0
  const int bits =
748
0
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
749
0
  assert(bits >= 0);
750
751
  // horizontal filter
752
0
  const uint16_t *src_horiz = src - fo_vert * src_stride;
753
0
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
754
0
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
755
0
  for (int y = 0; y < im_h; ++y) {
756
0
    for (int x = 0; x < w; ++x) {
757
0
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
758
0
      for (int k = 0; k < filter_params_x->taps; ++k) {
759
0
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
760
0
      }
761
0
      assert(filter_params_x->taps > 8 ||
762
0
             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
763
0
      im_block[y * im_stride + x] =
764
0
          ROUND_POWER_OF_TWO(sum, conv_params->round_0);
765
0
    }
766
0
  }
767
768
  // vertical filter
769
0
  int16_t *src_vert = im_block + fo_vert * im_stride;
770
0
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
771
0
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
772
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
773
0
  for (int y = 0; y < h; ++y) {
774
0
    for (int x = 0; x < w; ++x) {
775
0
      int32_t sum = 1 << offset_bits;
776
0
      for (int k = 0; k < filter_params_y->taps; ++k) {
777
0
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
778
0
      }
779
0
      assert(filter_params_y->taps > 8 ||
780
0
             (0 <= sum && sum < (1 << (offset_bits + 2))));
781
0
      int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
782
0
                    ((1 << (offset_bits - conv_params->round_1)) +
783
0
                     (1 << (offset_bits - conv_params->round_1 - 1)));
784
0
      dst[y * dst_stride + x] =
785
0
          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
786
0
    }
787
0
  }
788
0
}
789
790
// Separable 2D high bit-depth convolution for compound prediction.
// Pass 1 filters horizontally into im_block; pass 2 filters vertically.
// When conv_params->do_average is clear, the rounded (still offset) result is
// stored in conv_params->dst. Otherwise it is combined with the value already
// in conv_params->dst (weighted by fwd_offset/bck_offset when
// use_dist_wtd_comp_avg is set, plain average otherwise), the offset is
// removed, and the rounded, clipped pixel is written to dst.
void av1_highbd_dist_wtd_convolve_2d_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int cols_left = taps_x / 2 - 1;
  const int rows_above = taps_y / 2 - 1;
  const int im_rows = h + taps_y - 1;
  const int im_stride = w;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);

  // Pass 1: horizontal filter over the block plus its vertical context rows.
  const uint16_t *const src_top = src - rows_above * src_stride;
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int row = 0; row < im_rows; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel_x[tap] *
               src_top[row * src_stride + col - cols_left + tap];
      }
      assert(taps_x > 8 || (0 <= acc && acc < (1 << (bd + FILTER_BITS + 1))));
      im_block[row * im_stride + col] =
          (int16_t)ROUND_POWER_OF_TWO(acc, round_0);
    }
  }

  // Pass 2: vertical filter followed by the compound store/average step.
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int32_t round_offset = (1 << (offset_bits - round_1)) +
                               (1 << (offset_bits - round_1 - 1));
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t acc = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel_y[tap] * im_block[(row + tap) * im_stride + col];
      }
      assert(taps_y > 8 || (0 <= acc && acc < (1 << (offset_bits + 2))));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, round_1);
      CONV_BUF_TYPE *const comp_px = &comp_buf[row * comp_stride + col];

      if (!conv_params->do_average) {
        // First prediction of the pair: just stash the intermediate value.
        *comp_px = res;
        continue;
      }

      int32_t blend = *comp_px;
      if (conv_params->use_dist_wtd_comp_avg) {
        blend = blend * conv_params->fwd_offset + res * conv_params->bck_offset;
        blend >>= DIST_PRECISION_BITS;
      } else {
        blend = (blend + res) >> 1;
      }
      blend -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blend, round_bits), bd);
    }
  }
}
858
859
void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride,
860
                                      uint16_t *dst, int dst_stride, int w,
861
                                      int h,
862
                                      const InterpFilterParams *filter_params_x,
863
                                      const int subpel_x_qn,
864
0
                                      ConvolveParams *conv_params, int bd) {
865
0
  CONV_BUF_TYPE *dst16 = conv_params->dst;
866
0
  int dst16_stride = conv_params->dst_stride;
867
0
  const int fo_horiz = filter_params_x->taps / 2 - 1;
868
0
  const int bits = FILTER_BITS - conv_params->round_1;
869
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
870
0
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
871
0
                           (1 << (offset_bits - conv_params->round_1 - 1));
872
0
  const int round_bits =
873
0
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
874
0
  assert(round_bits >= 0);
875
0
  assert(bits >= 0);
876
  // horizontal filter
877
0
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
878
0
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
879
0
  for (int y = 0; y < h; ++y) {
880
0
    for (int x = 0; x < w; ++x) {
881
0
      int32_t res = 0;
882
0
      for (int k = 0; k < filter_params_x->taps; ++k) {
883
0
        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
884
0
      }
885
0
      res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
886
0
      res += round_offset;
887
888
0
      if (conv_params->do_average) {
889
0
        int32_t tmp = dst16[y * dst16_stride + x];
890
0
        if (conv_params->use_dist_wtd_comp_avg) {
891
0
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
892
0
          tmp = tmp >> DIST_PRECISION_BITS;
893
0
        } else {
894
0
          tmp += res;
895
0
          tmp = tmp >> 1;
896
0
        }
897
0
        tmp -= round_offset;
898
0
        dst[y * dst_stride + x] =
899
0
            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
900
0
      } else {
901
0
        dst16[y * dst16_stride + x] = res;
902
0
      }
903
0
    }
904
0
  }
905
0
}
906
907
void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride,
908
                                      uint16_t *dst, int dst_stride, int w,
909
                                      int h,
910
                                      const InterpFilterParams *filter_params_y,
911
                                      const int subpel_y_qn,
912
0
                                      ConvolveParams *conv_params, int bd) {
913
0
  CONV_BUF_TYPE *dst16 = conv_params->dst;
914
0
  int dst16_stride = conv_params->dst_stride;
915
0
  const int fo_vert = filter_params_y->taps / 2 - 1;
916
0
  const int bits = FILTER_BITS - conv_params->round_0;
917
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
918
0
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
919
0
                           (1 << (offset_bits - conv_params->round_1 - 1));
920
0
  const int round_bits =
921
0
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
922
0
  assert(round_bits >= 0);
923
0
  assert(bits >= 0);
924
  // vertical filter
925
0
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
926
0
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
927
0
  for (int y = 0; y < h; ++y) {
928
0
    for (int x = 0; x < w; ++x) {
929
0
      int32_t res = 0;
930
0
      for (int k = 0; k < filter_params_y->taps; ++k) {
931
0
        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
932
0
      }
933
0
      res *= (1 << bits);
934
0
      res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
935
936
0
      if (conv_params->do_average) {
937
0
        int32_t tmp = dst16[y * dst16_stride + x];
938
0
        if (conv_params->use_dist_wtd_comp_avg) {
939
0
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
940
0
          tmp = tmp >> DIST_PRECISION_BITS;
941
0
        } else {
942
0
          tmp += res;
943
0
          tmp = tmp >> 1;
944
0
        }
945
0
        tmp -= round_offset;
946
0
        dst[y * dst_stride + x] =
947
0
            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
948
0
      } else {
949
0
        dst16[y * dst16_stride + x] = res;
950
0
      }
951
0
    }
952
0
  }
953
0
}
954
955
void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride,
956
                                            uint16_t *dst, int dst_stride,
957
                                            int w, int h,
958
                                            ConvolveParams *conv_params,
959
0
                                            int bd) {
960
0
  CONV_BUF_TYPE *dst16 = conv_params->dst;
961
0
  int dst16_stride = conv_params->dst_stride;
962
0
  const int bits =
963
0
      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
964
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
965
0
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
966
0
                           (1 << (offset_bits - conv_params->round_1 - 1));
967
0
  assert(bits >= 0);
968
969
0
  for (int y = 0; y < h; ++y) {
970
0
    for (int x = 0; x < w; ++x) {
971
0
      CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
972
0
      res += round_offset;
973
0
      if (conv_params->do_average) {
974
0
        int32_t tmp = dst16[y * dst16_stride + x];
975
0
        if (conv_params->use_dist_wtd_comp_avg) {
976
0
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
977
0
          tmp = tmp >> DIST_PRECISION_BITS;
978
0
        } else {
979
0
          tmp += res;
980
0
          tmp = tmp >> 1;
981
0
        }
982
0
        tmp -= round_offset;
983
0
        dst[y * dst_stride + x] =
984
0
            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
985
0
      } else {
986
0
        dst16[y * dst16_stride + x] = res;
987
0
      }
988
0
    }
989
0
  }
990
0
}
991
992
void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
993
                                    uint16_t *dst, int dst_stride, int w, int h,
994
                                    const InterpFilterParams *filter_params_x,
995
                                    const InterpFilterParams *filter_params_y,
996
                                    const int subpel_x_qn, const int x_step_qn,
997
                                    const int subpel_y_qn, const int y_step_qn,
998
0
                                    ConvolveParams *conv_params, int bd) {
999
0
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
1000
0
  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
1001
0
             filter_params_y->taps;
1002
0
  int im_stride = w;
1003
0
  const int fo_vert = filter_params_y->taps / 2 - 1;
1004
0
  const int fo_horiz = filter_params_x->taps / 2 - 1;
1005
0
  CONV_BUF_TYPE *dst16 = conv_params->dst;
1006
0
  const int dst16_stride = conv_params->dst_stride;
1007
0
  const int bits =
1008
0
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
1009
0
  assert(bits >= 0);
1010
  // horizontal filter
1011
0
  const uint16_t *src_horiz = src - fo_vert * src_stride;
1012
0
  for (int y = 0; y < im_h; ++y) {
1013
0
    int x_qn = subpel_x_qn;
1014
0
    for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
1015
0
      const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
1016
0
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
1017
0
      assert(x_filter_idx < SUBPEL_SHIFTS);
1018
0
      const int16_t *x_filter =
1019
0
          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
1020
0
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
1021
0
      for (int k = 0; k < filter_params_x->taps; ++k) {
1022
0
        sum += x_filter[k] * src_x[k - fo_horiz];
1023
0
      }
1024
0
      assert(filter_params_x->taps > 8 ||
1025
0
             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
1026
0
      im_block[y * im_stride + x] =
1027
0
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
1028
0
    }
1029
0
    src_horiz += src_stride;
1030
0
  }
1031
1032
  // vertical filter
1033
0
  int16_t *src_vert = im_block + fo_vert * im_stride;
1034
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
1035
0
  for (int x = 0; x < w; ++x) {
1036
0
    int y_qn = subpel_y_qn;
1037
0
    for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
1038
0
      const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
1039
0
      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
1040
0
      assert(y_filter_idx < SUBPEL_SHIFTS);
1041
0
      const int16_t *y_filter =
1042
0
          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
1043
0
      int32_t sum = 1 << offset_bits;
1044
0
      for (int k = 0; k < filter_params_y->taps; ++k) {
1045
0
        sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
1046
0
      }
1047
0
      assert(filter_params_y->taps > 8 ||
1048
0
             (0 <= sum && sum < (1 << (offset_bits + 2))));
1049
0
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
1050
0
      if (conv_params->is_compound) {
1051
0
        if (conv_params->do_average) {
1052
0
          int32_t tmp = dst16[y * dst16_stride + x];
1053
0
          if (conv_params->use_dist_wtd_comp_avg) {
1054
0
            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
1055
0
            tmp = tmp >> DIST_PRECISION_BITS;
1056
0
          } else {
1057
0
            tmp += res;
1058
0
            tmp = tmp >> 1;
1059
0
          }
1060
          /* Subtract round offset and convolve round */
1061
0
          tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
1062
0
                       (1 << (offset_bits - conv_params->round_1 - 1)));
1063
0
          dst[y * dst_stride + x] =
1064
0
              clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
1065
0
        } else {
1066
0
          dst16[y * dst16_stride + x] = res;
1067
0
        }
1068
0
      } else {
1069
        /* Subtract round offset and convolve round */
1070
0
        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
1071
0
                             (1 << (offset_bits - conv_params->round_1 - 1)));
1072
0
        dst[y * dst_stride + x] =
1073
0
            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
1074
0
      }
1075
0
    }
1076
0
    src_vert++;
1077
0
  }
1078
0
}
1079
1080
// Dispatches an unscaled compound high bit-depth prediction to the 2D,
// x-only, y-only or copy kernel, depending on which sub-pixel offsets are
// non-zero.
static void highbd_convolve_2d_facade_compound(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    const int w, const int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  const int filter_x = (subpel_x_qn != 0);
  const int filter_y = (subpel_y_qn != 0);

  if (filter_x && filter_y) {
    av1_highbd_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
                                    filter_params_x, filter_params_y,
                                    subpel_x_qn, subpel_y_qn, conv_params, bd);
  } else if (filter_x) {
    av1_highbd_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_x, subpel_x_qn, conv_params,
                                   bd);
  } else if (filter_y) {
    av1_highbd_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_y, subpel_y_qn, conv_params,
                                   bd);
  } else {
    av1_highbd_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
                                         conv_params, bd);
  }
}
1105
1106
// Dispatches an unscaled single-reference high bit-depth prediction to the
// 2D, x-only, y-only or plain copy kernel, depending on which sub-pixel
// offsets are non-zero.
static void highbd_convolve_2d_facade_single(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    const int w, const int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  const int filter_x = (subpel_x_qn != 0);
  const int filter_y = (subpel_y_qn != 0);

  if (filter_x && filter_y) {
    av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, filter_params_y, subpel_x_qn,
                              subpel_y_qn, conv_params, bd);
  } else if (filter_x) {
    av1_highbd_convolve_x_sr(src, src_stride, dst, dst_stride, w, h,
                             filter_params_x, subpel_x_qn, conv_params, bd);
  } else if (filter_y) {
    av1_highbd_convolve_y_sr(src, src_stride, dst, dst_stride, w, h,
                             filter_params_y, subpel_y_qn, bd);
  } else {
    aom_highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h);
  }
}
1129
1130
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
1131
                                   uint8_t *dst8, int dst_stride, int w, int h,
1132
                                   const InterpFilterParams *interp_filters[2],
1133
                                   const int subpel_x_qn, int x_step_q4,
1134
                                   const int subpel_y_qn, int y_step_q4,
1135
                                   int scaled, ConvolveParams *conv_params,
1136
5.71M
                                   int bd) {
1137
5.71M
  (void)x_step_q4;
1138
5.71M
  (void)y_step_q4;
1139
5.71M
  (void)dst_stride;
1140
5.71M
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1141
1142
5.71M
  const int need_filter_params_x = (subpel_x_qn != 0) | scaled;
1143
5.71M
  const int need_filter_params_y = (subpel_y_qn != 0) | scaled;
1144
5.71M
  const InterpFilterParams *filter_params_x =
1145
5.71M
      need_filter_params_x ? interp_filters[0] : NULL;
1146
5.71M
  const InterpFilterParams *filter_params_y =
1147
5.71M
      need_filter_params_y ? interp_filters[1] : NULL;
1148
1149
5.71M
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1150
5.71M
  if (scaled) {
1151
898k
    if (conv_params->is_compound) {
1152
240k
      assert(conv_params->dst != NULL);
1153
240k
    }
1154
0
    av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
1155
898k
                                 filter_params_x, filter_params_y, subpel_x_qn,
1156
898k
                                 x_step_q4, subpel_y_qn, y_step_q4, conv_params,
1157
898k
                                 bd);
1158
4.81M
  } else if (conv_params->is_compound) {
1159
632k
    highbd_convolve_2d_facade_compound(
1160
632k
        src, src_stride, dst, dst_stride, w, h, filter_params_x,
1161
632k
        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
1162
4.18M
  } else {
1163
4.18M
    highbd_convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
1164
4.18M
                                     filter_params_x, filter_params_y,
1165
4.18M
                                     subpel_x_qn, subpel_y_qn, conv_params, bd);
1166
4.18M
  }
1167
5.71M
}
1168
#endif  // CONFIG_AV1_HIGHBITDEPTH
1169
1170
// Note: Fixed size intermediate buffers, place limits on parameters
1171
// of some functions. 2d filtering proceeds in 2 steps:
1172
//   (1) Interpolate horizontally into an intermediate buffer, temp.
1173
//   (2) Interpolate temp vertically to derive the sub-pixel result.
1174
// Deriving the maximum number of rows in the temp buffer (263):
1175
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
1176
// --Largest block size is 128x128 pixels.
1177
// --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
1178
//   original frame (in 1/16th pixel units).
1179
// --Must round-up because block may be located at sub-pixel position.
1180
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
1181
// --((128 - 1) * 32 + 15) >> 4 + 8 = 263.
1182
#define WIENER_MAX_EXT_SIZE 263
1183
1184
0
static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
1185
0
  int sum = 0;
1186
0
  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1187
0
  return sum;
1188
0
}
1189
1190
#if CONFIG_AV1_HIGHBITDEPTH
1191
static INLINE int highbd_horz_scalar_product(const uint16_t *a,
1192
0
                                             const int16_t *b) {
1193
0
  int sum = 0;
1194
0
  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1195
0
  return sum;
1196
0
}
1197
#endif
1198
1199
static INLINE int highbd_vert_scalar_product(const uint16_t *a,
1200
                                             ptrdiff_t a_stride,
1201
0
                                             const int16_t *b) {
1202
0
  int sum = 0;
1203
0
  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
1204
0
  return sum;
1205
0
}
1206
1207
0
// Recovers the start of the kernel table containing `filter` by clearing the
// low 8 bits of its address.
static const InterpKernel *get_filter_base(const int16_t *filter) {
  // NOTE: This assumes that the filter table is 256-byte aligned.
  // TODO(agrange) Modify to make independent of table alignment.
  const intptr_t addr = (intptr_t)filter;
  const intptr_t table_addr = addr & ~((intptr_t)0xFF);
  return (const InterpKernel *)table_addr;
}
1212
1213
0
// Returns the index (in whole kernels) of `f` relative to the table `base`.
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  const InterpKernel *const kernel = (const InterpKernel *)(intptr_t)f;
  return (int)(kernel - base);
}
1216
1217
static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
1218
                                       uint16_t *dst, ptrdiff_t dst_stride,
1219
                                       const InterpKernel *x_filters, int x0_q4,
1220
                                       int x_step_q4, int w, int h,
1221
0
                                       int round0_bits) {
1222
0
  const int bd = 8;
1223
0
  src -= SUBPEL_TAPS / 2 - 1;
1224
0
  for (int y = 0; y < h; ++y) {
1225
0
    int x_q4 = x0_q4;
1226
0
    for (int x = 0; x < w; ++x) {
1227
0
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1228
0
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1229
0
      const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1230
0
                           (1 << (bd + FILTER_BITS - 1));
1231
0
      const int sum = horz_scalar_product(src_x, x_filter) + rounding;
1232
0
      dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1233
0
                               WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
1234
0
      x_q4 += x_step_q4;
1235
0
    }
1236
0
    src += src_stride;
1237
0
    dst += dst_stride;
1238
0
  }
1239
0
}
1240
1241
static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
1242
                                      uint8_t *dst, ptrdiff_t dst_stride,
1243
                                      const InterpKernel *y_filters, int y0_q4,
1244
                                      int y_step_q4, int w, int h,
1245
0
                                      int round1_bits) {
1246
0
  const int bd = 8;
1247
0
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1248
1249
0
  for (int x = 0; x < w; ++x) {
1250
0
    int y_q4 = y0_q4;
1251
0
    for (int y = 0; y < h; ++y) {
1252
0
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1253
0
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1254
0
      const int rounding =
1255
0
          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1256
0
          (1 << (bd + round1_bits - 1));
1257
0
      const int sum =
1258
0
          highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1259
0
      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
1260
0
      y_q4 += y_step_q4;
1261
0
    }
1262
0
    ++src;
1263
0
    ++dst;
1264
0
  }
1265
0
}
1266
1267
void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
1268
                                   uint8_t *dst, ptrdiff_t dst_stride,
1269
                                   const int16_t *filter_x, int x_step_q4,
1270
                                   const int16_t *filter_y, int y_step_q4,
1271
                                   int w, int h,
1272
0
                                   const ConvolveParams *conv_params) {
1273
0
  const InterpKernel *const filters_x = get_filter_base(filter_x);
1274
0
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
1275
1276
0
  const InterpKernel *const filters_y = get_filter_base(filter_y);
1277
0
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
1278
1279
0
  uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1280
0
  const int intermediate_height =
1281
0
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
1282
0
  memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
1283
1284
0
  assert(w <= MAX_SB_SIZE);
1285
0
  assert(h <= MAX_SB_SIZE);
1286
0
  assert(y_step_q4 <= 32);
1287
0
  assert(x_step_q4 <= 32);
1288
1289
0
  convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1290
0
                             src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4,
1291
0
                             x_step_q4, w, intermediate_height,
1292
0
                             conv_params->round_0);
1293
0
  convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
1294
0
                            MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4,
1295
0
                            y_step_q4, w, h, conv_params->round_1);
1296
0
}
1297
1298
#if CONFIG_AV1_HIGHBITDEPTH
1299
static void highbd_convolve_add_src_horiz_hip(
1300
    const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
1301
    ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
1302
0
    int x_step_q4, int w, int h, int round0_bits, int bd) {
1303
0
  const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
1304
0
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1305
0
  src -= SUBPEL_TAPS / 2 - 1;
1306
0
  for (int y = 0; y < h; ++y) {
1307
0
    int x_q4 = x0_q4;
1308
0
    for (int x = 0; x < w; ++x) {
1309
0
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1310
0
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1311
0
      const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1312
0
                           (1 << (bd + FILTER_BITS - 1));
1313
0
      const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
1314
0
      dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1315
0
                               extraprec_clamp_limit - 1);
1316
0
      x_q4 += x_step_q4;
1317
0
    }
1318
0
    src += src_stride;
1319
0
    dst += dst_stride;
1320
0
  }
1321
0
}
1322
1323
static void highbd_convolve_add_src_vert_hip(
1324
    const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
1325
    ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
1326
0
    int y_step_q4, int w, int h, int round1_bits, int bd) {
1327
0
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1328
0
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1329
0
  for (int x = 0; x < w; ++x) {
1330
0
    int y_q4 = y0_q4;
1331
0
    for (int y = 0; y < h; ++y) {
1332
0
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1333
0
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1334
0
      const int rounding =
1335
0
          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1336
0
          (1 << (bd + round1_bits - 1));
1337
0
      const int sum =
1338
0
          highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1339
0
      dst[y * dst_stride] =
1340
0
          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
1341
0
      y_q4 += y_step_q4;
1342
0
    }
1343
0
    ++src;
1344
0
    ++dst;
1345
0
  }
1346
0
}
1347
1348
void av1_highbd_wiener_convolve_add_src_c(
1349
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1350
    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
1351
    const int16_t *filter_y, int y_step_q4, int w, int h,
1352
0
    const ConvolveParams *conv_params, int bd) {
1353
0
  const InterpKernel *const filters_x = get_filter_base(filter_x);
1354
0
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
1355
1356
0
  const InterpKernel *const filters_y = get_filter_base(filter_y);
1357
0
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
1358
1359
0
  uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1360
0
  const int intermediate_height =
1361
0
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1362
1363
0
  assert(w <= MAX_SB_SIZE);
1364
0
  assert(h <= MAX_SB_SIZE);
1365
0
  assert(y_step_q4 <= 32);
1366
0
  assert(x_step_q4 <= 32);
1367
0
  assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
1368
1369
0
  highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1370
0
                                    src_stride, temp, MAX_SB_SIZE, filters_x,
1371
0
                                    x0_q4, x_step_q4, w, intermediate_height,
1372
0
                                    conv_params->round_0, bd);
1373
0
  highbd_convolve_add_src_vert_hip(
1374
0
      temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
1375
0
      filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
1376
0
}
1377
#endif  // CONFIG_AV1_HIGHBITDEPTH