Coverage Report

Created: 2025-11-16 07:22

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/av1/common/convolve.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <assert.h>
13
#include <string.h>
14
15
#include "config/aom_dsp_rtcd.h"
16
#include "config/av1_rtcd.h"
17
18
#include "av1/common/av1_common_int.h"
19
#include "av1/common/blockd.h"
20
#include "av1/common/convolve.h"
21
#include "av1/common/filter.h"
22
#include "av1/common/resize.h"
23
#include "aom_dsp/aom_dsp_common.h"
24
#include "aom_ports/mem.h"
25
26
void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
27
                             int dst_stride, int w, int h,
28
                             const int16_t *x_filters, int x0_qn,
29
29.0k
                             int x_step_qn) {
30
29.0k
  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
31
1.14M
  for (int y = 0; y < h; ++y) {
32
1.11M
    int x_qn = x0_qn;
33
192M
    for (int x = 0; x < w; ++x) {
34
191M
      const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
35
191M
      const int x_filter_idx =
36
191M
          (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
37
191M
      assert(x_filter_idx <= RS_SUBPEL_MASK);
38
191M
      const int16_t *const x_filter =
39
191M
          &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
40
191M
      int sum = 0;
41
1.72G
      for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
42
1.53G
        sum += src_x[k] * x_filter[k];
43
191M
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
44
191M
      x_qn += x_step_qn;
45
191M
    }
46
1.11M
    src += src_stride;
47
1.11M
    dst += dst_stride;
48
1.11M
  }
49
29.0k
}
50
51
void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
52
                                    uint16_t *dst, int dst_stride, int w, int h,
53
                                    const int16_t *x_filters, int x0_qn,
54
130k
                                    int x_step_qn, int bd) {
55
130k
  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
56
4.90M
  for (int y = 0; y < h; ++y) {
57
4.77M
    int x_qn = x0_qn;
58
301M
    for (int x = 0; x < w; ++x) {
59
296M
      const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
60
296M
      const int x_filter_idx =
61
296M
          (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
62
296M
      assert(x_filter_idx <= RS_SUBPEL_MASK);
63
296M
      const int16_t *const x_filter =
64
296M
          &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
65
296M
      int sum = 0;
66
2.67G
      for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
67
2.37G
        sum += src_x[k] * x_filter[k];
68
296M
      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
69
296M
      x_qn += x_step_qn;
70
296M
    }
71
4.77M
    src += src_stride;
72
4.77M
    dst += dst_stride;
73
4.77M
  }
74
130k
}
75
76
void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst,
77
                               int dst_stride, int w, int h, int dir,
78
0
                               double norm) {
79
0
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
80
0
  DECLARE_ALIGNED(256, static const int16_t, sobel_a[3]) = { 1, 0, -1 };
81
0
  DECLARE_ALIGNED(256, static const int16_t, sobel_b[3]) = { 1, 2, 1 };
82
0
  const int taps = 3;
83
0
  int im_h = h + taps - 1;
84
0
  int im_stride = w;
85
0
  const int fo_vert = 1;
86
0
  const int fo_horiz = 1;
87
88
  // horizontal filter
89
0
  const uint8_t *src_horiz = src - fo_vert * src_stride;
90
0
  const int16_t *x_filter = dir ? sobel_a : sobel_b;
91
0
  for (int y = 0; y < im_h; ++y) {
92
0
    for (int x = 0; x < w; ++x) {
93
0
      int16_t sum = 0;
94
0
      for (int k = 0; k < taps; ++k) {
95
0
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
96
0
      }
97
0
      im_block[y * im_stride + x] = sum;
98
0
    }
99
0
  }
100
101
  // vertical filter
102
0
  int16_t *src_vert = im_block + fo_vert * im_stride;
103
0
  const int16_t *y_filter = dir ? sobel_b : sobel_a;
104
0
  for (int y = 0; y < h; ++y) {
105
0
    for (int x = 0; x < w; ++x) {
106
0
      int16_t sum = 0;
107
0
      for (int k = 0; k < taps; ++k) {
108
0
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
109
0
      }
110
0
      dst[y * dst_stride + x] = sum * norm;
111
0
    }
112
0
  }
113
0
}
114
115
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
116
                          int dst_stride, int w, int h,
117
                          const InterpFilterParams *filter_params_x,
118
                          const InterpFilterParams *filter_params_y,
119
                          const int subpel_x_qn, const int subpel_y_qn,
120
13.2k
                          ConvolveParams *conv_params) {
121
13.2k
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
122
13.2k
  int im_h = h + filter_params_y->taps - 1;
123
13.2k
  int im_stride = w;
124
13.2k
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
125
13.2k
  const int fo_vert = filter_params_y->taps / 2 - 1;
126
13.2k
  const int fo_horiz = filter_params_x->taps / 2 - 1;
127
13.2k
  const int bd = 8;
128
13.2k
  const int bits =
129
13.2k
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
130
131
  // horizontal filter
132
13.2k
  const uint8_t *src_horiz = src - fo_vert * src_stride;
133
13.2k
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
134
13.2k
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
135
157k
  for (int y = 0; y < im_h; ++y) {
136
1.47M
    for (int x = 0; x < w; ++x) {
137
1.32M
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
138
6.49M
      for (int k = 0; k < filter_params_x->taps; ++k) {
139
5.17M
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
140
5.17M
      }
141
1.32M
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
142
1.32M
      im_block[y * im_stride + x] =
143
1.32M
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
144
1.32M
    }
145
144k
  }
146
147
  // vertical filter
148
13.2k
  int16_t *src_vert = im_block + fo_vert * im_stride;
149
13.2k
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
150
13.2k
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
151
13.2k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
152
105k
  for (int y = 0; y < h; ++y) {
153
1.12M
    for (int x = 0; x < w; ++x) {
154
1.03M
      int32_t sum = 1 << offset_bits;
155
4.26M
      for (int k = 0; k < filter_params_y->taps; ++k) {
156
3.23M
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
157
3.23M
      }
158
1.03M
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
159
1.03M
      int16_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
160
1.03M
                    ((1 << (offset_bits - conv_params->round_1)) +
161
1.03M
                     (1 << (offset_bits - conv_params->round_1 - 1)));
162
1.03M
      dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
163
1.03M
    }
164
91.9k
  }
165
13.2k
}
166
167
void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
168
                         int dst_stride, int w, int h,
169
                         const InterpFilterParams *filter_params_y,
170
11.0k
                         const int subpel_y_qn) {
171
11.0k
  const int fo_vert = filter_params_y->taps / 2 - 1;
172
173
  // vertical filter
174
11.0k
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
175
11.0k
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
176
90.3k
  for (int y = 0; y < h; ++y) {
177
1.02M
    for (int x = 0; x < w; ++x) {
178
950k
      int32_t res = 0;
179
3.77M
      for (int k = 0; k < filter_params_y->taps; ++k) {
180
2.82M
        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
181
2.82M
      }
182
950k
      dst[y * dst_stride + x] =
183
950k
          clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS));
184
950k
    }
185
79.2k
  }
186
11.0k
}
187
188
void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
189
                         int dst_stride, int w, int h,
190
                         const InterpFilterParams *filter_params_x,
191
9.28k
                         const int subpel_x_qn, ConvolveParams *conv_params) {
192
9.28k
  const int fo_horiz = filter_params_x->taps / 2 - 1;
193
9.28k
  const int bits = FILTER_BITS - conv_params->round_0;
194
195
9.28k
  assert(bits >= 0);
196
9.28k
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
197
9.28k
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
198
199
  // horizontal filter
200
9.28k
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
201
9.28k
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
202
203
88.8k
  for (int y = 0; y < h; ++y) {
204
1.34M
    for (int x = 0; x < w; ++x) {
205
1.26M
      int32_t res = 0;
206
4.51M
      for (int k = 0; k < filter_params_x->taps; ++k) {
207
3.24M
        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
208
3.24M
      }
209
1.26M
      res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
210
1.26M
      dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
211
1.26M
    }
212
79.5k
  }
213
9.28k
}
214
215
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
216
                                uint8_t *dst, int dst_stride, int w, int h,
217
                                const InterpFilterParams *filter_params_x,
218
                                const InterpFilterParams *filter_params_y,
219
                                const int subpel_x_qn, const int subpel_y_qn,
220
5.97k
                                ConvolveParams *conv_params) {
221
5.97k
  CONV_BUF_TYPE *dst16 = conv_params->dst;
222
5.97k
  int dst16_stride = conv_params->dst_stride;
223
5.97k
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
224
5.97k
  int im_h = h + filter_params_y->taps - 1;
225
5.97k
  int im_stride = w;
226
5.97k
  const int fo_vert = filter_params_y->taps / 2 - 1;
227
5.97k
  const int fo_horiz = filter_params_x->taps / 2 - 1;
228
5.97k
  const int bd = 8;
229
5.97k
  const int round_bits =
230
5.97k
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
231
232
  // horizontal filter
233
5.97k
  const uint8_t *src_horiz = src - fo_vert * src_stride;
234
5.97k
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
235
5.97k
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
236
98.1k
  for (int y = 0; y < im_h; ++y) {
237
804k
    for (int x = 0; x < w; ++x) {
238
711k
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
239
6.40M
      for (int k = 0; k < filter_params_x->taps; ++k) {
240
5.69M
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
241
5.69M
      }
242
711k
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
243
711k
      im_block[y * im_stride + x] =
244
711k
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
245
711k
    }
246
92.1k
  }
247
248
  // vertical filter
249
5.97k
  int16_t *src_vert = im_block + fo_vert * im_stride;
250
5.97k
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
251
5.97k
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
252
5.97k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
253
56.2k
  for (int y = 0; y < h; ++y) {
254
444k
    for (int x = 0; x < w; ++x) {
255
394k
      int32_t sum = 1 << offset_bits;
256
3.54M
      for (int k = 0; k < filter_params_y->taps; ++k) {
257
3.15M
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
258
3.15M
      }
259
394k
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
260
394k
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
261
394k
      if (conv_params->do_average) {
262
174k
        int32_t tmp = dst16[y * dst16_stride + x];
263
174k
        if (conv_params->use_dist_wtd_comp_avg) {
264
66.4k
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
265
66.4k
          tmp = tmp >> DIST_PRECISION_BITS;
266
108k
        } else {
267
108k
          tmp += res;
268
108k
          tmp = tmp >> 1;
269
108k
        }
270
174k
        tmp -= (1 << (offset_bits - conv_params->round_1)) +
271
174k
               (1 << (offset_bits - conv_params->round_1 - 1));
272
174k
        dst[y * dst_stride + x] =
273
174k
            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
274
219k
      } else {
275
219k
        dst16[y * dst16_stride + x] = res;
276
219k
      }
277
394k
    }
278
50.3k
  }
279
5.97k
}
280
281
void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst,
282
                               int dst_stride, int w, int h,
283
                               const InterpFilterParams *filter_params_y,
284
                               const int subpel_y_qn,
285
5.58k
                               ConvolveParams *conv_params) {
286
5.58k
  CONV_BUF_TYPE *dst16 = conv_params->dst;
287
5.58k
  int dst16_stride = conv_params->dst_stride;
288
5.58k
  const int fo_vert = filter_params_y->taps / 2 - 1;
289
5.58k
  const int bits = FILTER_BITS - conv_params->round_0;
290
5.58k
  const int bd = 8;
291
5.58k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
292
5.58k
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
293
5.58k
                           (1 << (offset_bits - conv_params->round_1 - 1));
294
5.58k
  const int round_bits =
295
5.58k
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
296
297
  // vertical filter
298
5.58k
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
299
5.58k
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
300
55.2k
  for (int y = 0; y < h; ++y) {
301
439k
    for (int x = 0; x < w; ++x) {
302
390k
      int32_t res = 0;
303
3.51M
      for (int k = 0; k < filter_params_y->taps; ++k) {
304
3.12M
        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
305
3.12M
      }
306
390k
      res *= (1 << bits);
307
390k
      res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
308
309
390k
      if (conv_params->do_average) {
310
99.1k
        int32_t tmp = dst16[y * dst16_stride + x];
311
99.1k
        if (conv_params->use_dist_wtd_comp_avg) {
312
25.7k
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
313
25.7k
          tmp = tmp >> DIST_PRECISION_BITS;
314
73.3k
        } else {
315
73.3k
          tmp += res;
316
73.3k
          tmp = tmp >> 1;
317
73.3k
        }
318
99.1k
        tmp -= round_offset;
319
99.1k
        dst[y * dst_stride + x] =
320
99.1k
            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
321
291k
      } else {
322
291k
        dst16[y * dst16_stride + x] = res;
323
291k
      }
324
390k
    }
325
49.7k
  }
326
5.58k
}
327
328
void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst,
329
                               int dst_stride, int w, int h,
330
                               const InterpFilterParams *filter_params_x,
331
                               const int subpel_x_qn,
332
2.29k
                               ConvolveParams *conv_params) {
333
2.29k
  CONV_BUF_TYPE *dst16 = conv_params->dst;
334
2.29k
  int dst16_stride = conv_params->dst_stride;
335
2.29k
  const int fo_horiz = filter_params_x->taps / 2 - 1;
336
2.29k
  const int bits = FILTER_BITS - conv_params->round_1;
337
2.29k
  const int bd = 8;
338
2.29k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
339
2.29k
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
340
2.29k
                           (1 << (offset_bits - conv_params->round_1 - 1));
341
2.29k
  const int round_bits =
342
2.29k
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
343
344
  // horizontal filter
345
2.29k
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
346
2.29k
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
347
25.3k
  for (int y = 0; y < h; ++y) {
348
211k
    for (int x = 0; x < w; ++x) {
349
188k
      int32_t res = 0;
350
1.69M
      for (int k = 0; k < filter_params_x->taps; ++k) {
351
1.51M
        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
352
1.51M
      }
353
188k
      res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
354
188k
      res += round_offset;
355
356
188k
      if (conv_params->do_average) {
357
57.7k
        int32_t tmp = dst16[y * dst16_stride + x];
358
57.7k
        if (conv_params->use_dist_wtd_comp_avg) {
359
27.3k
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
360
27.3k
          tmp = tmp >> DIST_PRECISION_BITS;
361
30.4k
        } else {
362
30.4k
          tmp += res;
363
30.4k
          tmp = tmp >> 1;
364
30.4k
        }
365
57.7k
        tmp -= round_offset;
366
57.7k
        dst[y * dst_stride + x] =
367
57.7k
            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
368
131k
      } else {
369
131k
        dst16[y * dst16_stride + x] = res;
370
131k
      }
371
188k
    }
372
23.0k
  }
373
2.29k
}
374
375
void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride,
376
                                     uint8_t *dst, int dst_stride, int w, int h,
377
6.22k
                                     ConvolveParams *conv_params) {
378
6.22k
  CONV_BUF_TYPE *dst16 = conv_params->dst;
379
6.22k
  int dst16_stride = conv_params->dst_stride;
380
6.22k
  const int bits =
381
6.22k
      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
382
6.22k
  const int bd = 8;
383
6.22k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
384
6.22k
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
385
6.22k
                           (1 << (offset_bits - conv_params->round_1 - 1));
386
387
59.6k
  for (int y = 0; y < h; ++y) {
388
484k
    for (int x = 0; x < w; ++x) {
389
430k
      CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
390
430k
      res += round_offset;
391
392
430k
      if (conv_params->do_average) {
393
142k
        int32_t tmp = dst16[y * dst16_stride + x];
394
142k
        if (conv_params->use_dist_wtd_comp_avg) {
395
32.0k
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
396
32.0k
          tmp = tmp >> DIST_PRECISION_BITS;
397
110k
        } else {
398
110k
          tmp += res;
399
110k
          tmp = tmp >> 1;
400
110k
        }
401
142k
        tmp -= round_offset;
402
142k
        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
403
288k
      } else {
404
288k
        dst16[y * dst16_stride + x] = res;
405
288k
      }
406
430k
    }
407
53.4k
  }
408
6.22k
}
409
410
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst,
411
                             int dst_stride, int w, int h,
412
                             const InterpFilterParams *filter_params_x,
413
                             const InterpFilterParams *filter_params_y,
414
                             const int subpel_x_qn, const int x_step_qn,
415
                             const int subpel_y_qn, const int y_step_qn,
416
7.22k
                             ConvolveParams *conv_params) {
417
7.22k
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
418
7.22k
  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
419
7.22k
             filter_params_y->taps;
420
7.22k
  CONV_BUF_TYPE *dst16 = conv_params->dst;
421
7.22k
  const int dst16_stride = conv_params->dst_stride;
422
7.22k
  const int bits =
423
7.22k
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
424
7.22k
  assert(bits >= 0);
425
7.22k
  int im_stride = w;
426
7.22k
  const int fo_vert = filter_params_y->taps / 2 - 1;
427
7.22k
  const int fo_horiz = filter_params_x->taps / 2 - 1;
428
7.22k
  const int bd = 8;
429
430
  // horizontal filter
431
7.22k
  const uint8_t *src_horiz = src - fo_vert * src_stride;
432
107k
  for (int y = 0; y < im_h; ++y) {
433
100k
    int x_qn = subpel_x_qn;
434
824k
    for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
435
724k
      const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
436
724k
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
437
724k
      assert(x_filter_idx < SUBPEL_SHIFTS);
438
724k
      const int16_t *x_filter =
439
724k
          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
440
724k
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
441
6.51M
      for (int k = 0; k < filter_params_x->taps; ++k) {
442
5.79M
        sum += x_filter[k] * src_x[k - fo_horiz];
443
5.79M
      }
444
724k
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
445
724k
      im_block[y * im_stride + x] =
446
724k
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
447
724k
    }
448
100k
    src_horiz += src_stride;
449
100k
  }
450
451
  // vertical filter
452
7.22k
  int16_t *src_vert = im_block + fo_vert * im_stride;
453
7.22k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
454
54.8k
  for (int x = 0; x < w; ++x) {
455
47.6k
    int y_qn = subpel_y_qn;
456
428k
    for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
457
381k
      const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
458
381k
      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
459
381k
      assert(y_filter_idx < SUBPEL_SHIFTS);
460
381k
      const int16_t *y_filter =
461
381k
          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
462
381k
      int32_t sum = 1 << offset_bits;
463
3.42M
      for (int k = 0; k < filter_params_y->taps; ++k) {
464
3.04M
        sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
465
3.04M
      }
466
381k
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
467
381k
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
468
381k
      if (conv_params->is_compound) {
469
119k
        if (conv_params->do_average) {
470
47.2k
          int32_t tmp = dst16[y * dst16_stride + x];
471
47.2k
          if (conv_params->use_dist_wtd_comp_avg) {
472
11.9k
            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
473
11.9k
            tmp = tmp >> DIST_PRECISION_BITS;
474
35.3k
          } else {
475
35.3k
            tmp += res;
476
35.3k
            tmp = tmp >> 1;
477
35.3k
          }
478
          /* Subtract round offset and convolve round */
479
47.2k
          tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
480
47.2k
                       (1 << (offset_bits - conv_params->round_1 - 1)));
481
47.2k
          dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
482
72.7k
        } else {
483
72.7k
          dst16[y * dst16_stride + x] = res;
484
72.7k
        }
485
261k
      } else {
486
        /* Subtract round offset and convolve round */
487
261k
        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
488
261k
                             (1 << (offset_bits - conv_params->round_1 - 1)));
489
261k
        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
490
261k
      }
491
381k
    }
492
47.6k
    src_vert++;
493
47.6k
  }
494
7.22k
}
495
496
static void convolve_2d_scale_wrapper(
497
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
498
    int h, const InterpFilterParams *filter_params_x,
499
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
500
    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
501
7.22k
    ConvolveParams *conv_params) {
502
7.22k
  if (conv_params->is_compound) {
503
1.47k
    assert(conv_params->dst != NULL);
504
1.47k
  }
505
7.22k
  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
506
7.22k
                        filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
507
7.22k
                        y_step_qn, conv_params);
508
7.22k
}
509
510
/*
 * Picks the compound (dist_wtd) convolve routine that matches which
 * directions have a non-zero sub-pixel offset: both, x only, y only, or
 * neither (plain copy into the compound buffer).
 */
static void convolve_2d_facade_compound(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  const bool filter_x = subpel_x_qn != 0;
  const bool filter_y = subpel_y_qn != 0;

  if (filter_x && filter_y) {
    av1_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
                             filter_params_x, filter_params_y, subpel_x_qn,
                             subpel_y_qn, conv_params);
    return;
  }
  if (filter_x) {
    av1_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
                            filter_params_x, subpel_x_qn, conv_params);
    return;
  }
  if (filter_y) {
    av1_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
                            filter_params_y, subpel_y_qn, conv_params);
    return;
  }
  // No sub-pixel offset in either direction.
  av1_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
                                conv_params);
}
533
534
/*
 * Picks the single-reference convolve routine that matches which directions
 * have a non-zero sub-pixel offset: both, x only, y only, or neither (plain
 * block copy).
 */
static void convolve_2d_facade_single(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  const bool filter_x = subpel_x_qn != 0;
  const bool filter_y = subpel_y_qn != 0;

  if (filter_x && filter_y) {
    av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                       filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
    return;
  }
  if (filter_x) {
    av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                      subpel_x_qn, conv_params);
    return;
  }
  if (filter_y) {
    av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y,
                      subpel_y_qn);
    return;
  }
  // No sub-pixel offset in either direction.
  aom_convolve_copy(src, src_stride, dst, dst_stride, w, h);
}
555
556
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
557
                            int dst_stride, int w, int h,
558
                            const InterpFilterParams *interp_filters[2],
559
                            const int subpel_x_qn, int x_step_q4,
560
                            const int subpel_y_qn, int y_step_q4, int scaled,
561
110k
                            ConvolveParams *conv_params) {
562
110k
  (void)x_step_q4;
563
110k
  (void)y_step_q4;
564
110k
  (void)dst;
565
110k
  (void)dst_stride;
566
567
110k
  const InterpFilterParams *filter_params_x = interp_filters[0];
568
110k
  const InterpFilterParams *filter_params_y = interp_filters[1];
569
570
  // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case.
571
  // Do we have SIMD support to 4-tap case?
572
  // 2-tap filter indicates that it is for IntraBC.
573
110k
  if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
574
55.2k
    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
575
55.2k
    assert(!scaled);
576
55.2k
    if (subpel_x_qn && subpel_y_qn) {
577
6.73k
      av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
578
6.73k
                           filter_params_x, filter_params_y, subpel_x_qn,
579
6.73k
                           subpel_y_qn, conv_params);
580
6.73k
      return;
581
48.4k
    } else if (subpel_x_qn) {
582
6.87k
      av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
583
6.87k
                          filter_params_x, subpel_x_qn, conv_params);
584
6.87k
      return;
585
41.6k
    } else if (subpel_y_qn) {
586
6.58k
      av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
587
6.58k
                          filter_params_y, subpel_y_qn);
588
6.58k
      return;
589
6.58k
    }
590
55.2k
  }
591
592
90.5k
  if (scaled) {
593
7.22k
    convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
594
7.22k
                              filter_params_x, filter_params_y, subpel_x_qn,
595
7.22k
                              x_step_q4, subpel_y_qn, y_step_q4, conv_params);
596
83.3k
  } else if (conv_params->is_compound) {
597
20.0k
    convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h,
598
20.0k
                                filter_params_x, filter_params_y, subpel_x_qn,
599
20.0k
                                subpel_y_qn, conv_params);
600
63.2k
  } else {
601
63.2k
    convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
602
63.2k
                              filter_params_x, filter_params_y, subpel_x_qn,
603
63.2k
                              subpel_y_qn, conv_params);
604
63.2k
  }
605
90.5k
}
606
607
#if CONFIG_AV1_HIGHBITDEPTH
608
void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
609
                                uint16_t *dst, int dst_stride, int w, int h,
610
                                const InterpFilterParams *filter_params_x,
611
                                const int subpel_x_qn,
612
34.0k
                                ConvolveParams *conv_params, int bd) {
613
34.0k
  const int fo_horiz = filter_params_x->taps / 2 - 1;
614
34.0k
  const int bits = FILTER_BITS - conv_params->round_0;
615
616
34.0k
  assert(bits >= 0);
617
34.0k
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
618
34.0k
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
619
620
  // horizontal filter
621
34.0k
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
622
34.0k
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
623
271k
  for (int y = 0; y < h; ++y) {
624
2.75M
    for (int x = 0; x < w; ++x) {
625
2.51M
      int32_t res = 0;
626
10.4M
      for (int k = 0; k < filter_params_x->taps; ++k) {
627
7.92M
        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
628
7.92M
      }
629
2.51M
      res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
630
2.51M
      dst[y * dst_stride + x] =
631
2.51M
          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
632
2.51M
    }
633
237k
  }
634
34.0k
}
635
636
void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
637
                                uint16_t *dst, int dst_stride, int w, int h,
638
                                const InterpFilterParams *filter_params_y,
639
27.1k
                                const int subpel_y_qn, int bd) {
640
27.1k
  const int fo_vert = filter_params_y->taps / 2 - 1;
641
  // vertical filter
642
27.1k
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
643
27.1k
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
644
200k
  for (int y = 0; y < h; ++y) {
645
1.65M
    for (int x = 0; x < w; ++x) {
646
1.47M
      int32_t res = 0;
647
5.43M
      for (int k = 0; k < filter_params_y->taps; ++k) {
648
3.95M
        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
649
3.95M
      }
650
1.47M
      dst[y * dst_stride + x] =
651
1.47M
          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd);
652
1.47M
    }
653
173k
  }
654
27.1k
}
655
656
void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
657
                                 uint16_t *dst, int dst_stride, int w, int h,
658
                                 const InterpFilterParams *filter_params_x,
659
                                 const InterpFilterParams *filter_params_y,
660
                                 const int subpel_x_qn, const int subpel_y_qn,
661
31.1k
                                 ConvolveParams *conv_params, int bd) {
662
31.1k
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
663
31.1k
  int im_h = h + filter_params_y->taps - 1;
664
31.1k
  int im_stride = w;
665
31.1k
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
666
31.1k
  const int fo_vert = filter_params_y->taps / 2 - 1;
667
31.1k
  const int fo_horiz = filter_params_x->taps / 2 - 1;
668
31.1k
  const int bits =
669
31.1k
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
670
31.1k
  assert(bits >= 0);
671
672
  // horizontal filter
673
31.1k
  const uint16_t *src_horiz = src - fo_vert * src_stride;
674
31.1k
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
675
31.1k
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
676
297k
  for (int y = 0; y < im_h; ++y) {
677
2.52M
    for (int x = 0; x < w; ++x) {
678
2.26M
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
679
9.51M
      for (int k = 0; k < filter_params_x->taps; ++k) {
680
7.24M
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
681
7.24M
      }
682
2.26M
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
683
2.26M
      im_block[y * im_stride + x] =
684
2.26M
          ROUND_POWER_OF_TWO(sum, conv_params->round_0);
685
2.26M
    }
686
265k
  }
687
688
  // vertical filter
689
31.1k
  int16_t *src_vert = im_block + fo_vert * im_stride;
690
31.1k
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
691
31.1k
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
692
31.1k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
693
234k
  for (int y = 0; y < h; ++y) {
694
2.05M
    for (int x = 0; x < w; ++x) {
695
1.84M
      int32_t sum = 1 << offset_bits;
696
6.93M
      for (int k = 0; k < filter_params_y->taps; ++k) {
697
5.08M
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
698
5.08M
      }
699
1.84M
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
700
1.84M
      int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
701
1.84M
                    ((1 << (offset_bits - conv_params->round_1)) +
702
1.84M
                     (1 << (offset_bits - conv_params->round_1 - 1)));
703
1.84M
      dst[y * dst_stride + x] =
704
1.84M
          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
705
1.84M
    }
706
203k
  }
707
31.1k
}
708
709
// Two-pass high bit-depth convolution that targets the conv_params->dst
// buffer (C reference implementation).
// The filtered value, still carrying its rounding offset, is either stored
// into conv_params->dst (do_average == 0) or blended with the value already
// stored there (do_average != 0): a weighted blend using fwd_offset and
// bck_offset when use_dist_wtd_comp_avg is set, a plain average otherwise.
// In the blending case the offset is removed and the clipped pixel is
// written to dst.
void av1_highbd_dist_wtd_convolve_2d_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int left_offset = taps_x / 2 - 1;
  const int top_offset = taps_y / 2 - 1;
  const int mid_rows = h + taps_y - 1;
  const int mid_stride = w;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);

  // Pass 1: horizontal filter into im_block.
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int32_t horiz_bias = 1 << (bd + FILTER_BITS - 1);
  for (int row = 0; row < mid_rows; ++row) {
    const uint16_t *const in_row =
        src + (row - top_offset) * src_stride - left_offset;
    int16_t *const mid_row = im_block + row * mid_stride;
    for (int col = 0; col < w; ++col) {
      int32_t acc = horiz_bias;
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel_x[tap] * in_row[col + tap];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      mid_row[col] = (int16_t)ROUND_POWER_OF_TWO(acc, round_0);
    }
  }

  // Pass 2: vertical filter, then store or blend.
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int32_t offset_removal =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      const int16_t *const column = im_block + row * mid_stride + col;
      int32_t acc = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel_y[tap] * column[tap * mid_stride];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, round_1);

      if (!conv_params->do_average) {
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t blended = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended = (blended * conv_params->fwd_offset +
                   res * conv_params->bck_offset) >>
                  DIST_PRECISION_BITS;
      } else {
        blended = (blended + res) >> 1;
      }
      blended -= offset_removal;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, round_bits), bd);
    }
  }
}
775
776
void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride,
777
                                      uint16_t *dst, int dst_stride, int w,
778
                                      int h,
779
                                      const InterpFilterParams *filter_params_x,
780
                                      const int subpel_x_qn,
781
5.62k
                                      ConvolveParams *conv_params, int bd) {
782
5.62k
  CONV_BUF_TYPE *dst16 = conv_params->dst;
783
5.62k
  int dst16_stride = conv_params->dst_stride;
784
5.62k
  const int fo_horiz = filter_params_x->taps / 2 - 1;
785
5.62k
  const int bits = FILTER_BITS - conv_params->round_1;
786
5.62k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
787
5.62k
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
788
5.62k
                           (1 << (offset_bits - conv_params->round_1 - 1));
789
5.62k
  const int round_bits =
790
5.62k
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
791
5.62k
  assert(round_bits >= 0);
792
5.62k
  assert(bits >= 0);
793
  // horizontal filter
794
5.62k
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
795
5.62k
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
796
60.3k
  for (int y = 0; y < h; ++y) {
797
481k
    for (int x = 0; x < w; ++x) {
798
426k
      int32_t res = 0;
799
3.84M
      for (int k = 0; k < filter_params_x->taps; ++k) {
800
3.41M
        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
801
3.41M
      }
802
426k
      res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
803
426k
      res += round_offset;
804
805
426k
      if (conv_params->do_average) {
806
249k
        int32_t tmp = dst16[y * dst16_stride + x];
807
249k
        if (conv_params->use_dist_wtd_comp_avg) {
808
36.2k
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
809
36.2k
          tmp = tmp >> DIST_PRECISION_BITS;
810
213k
        } else {
811
213k
          tmp += res;
812
213k
          tmp = tmp >> 1;
813
213k
        }
814
249k
        tmp -= round_offset;
815
249k
        dst[y * dst_stride + x] =
816
249k
            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
817
249k
      } else {
818
176k
        dst16[y * dst16_stride + x] = res;
819
176k
      }
820
426k
    }
821
54.6k
  }
822
5.62k
}
823
824
void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride,
825
                                      uint16_t *dst, int dst_stride, int w,
826
                                      int h,
827
                                      const InterpFilterParams *filter_params_y,
828
                                      const int subpel_y_qn,
829
1.26k
                                      ConvolveParams *conv_params, int bd) {
830
1.26k
  CONV_BUF_TYPE *dst16 = conv_params->dst;
831
1.26k
  int dst16_stride = conv_params->dst_stride;
832
1.26k
  const int fo_vert = filter_params_y->taps / 2 - 1;
833
1.26k
  const int bits = FILTER_BITS - conv_params->round_0;
834
1.26k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
835
1.26k
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
836
1.26k
                           (1 << (offset_bits - conv_params->round_1 - 1));
837
1.26k
  const int round_bits =
838
1.26k
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
839
1.26k
  assert(round_bits >= 0);
840
1.26k
  assert(bits >= 0);
841
  // vertical filter
842
1.26k
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
843
1.26k
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
844
13.3k
  for (int y = 0; y < h; ++y) {
845
128k
    for (int x = 0; x < w; ++x) {
846
116k
      int32_t res = 0;
847
1.04M
      for (int k = 0; k < filter_params_y->taps; ++k) {
848
929k
        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
849
929k
      }
850
116k
      res *= (1 << bits);
851
116k
      res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
852
853
116k
      if (conv_params->do_average) {
854
63.4k
        int32_t tmp = dst16[y * dst16_stride + x];
855
63.4k
        if (conv_params->use_dist_wtd_comp_avg) {
856
20.0k
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
857
20.0k
          tmp = tmp >> DIST_PRECISION_BITS;
858
43.4k
        } else {
859
43.4k
          tmp += res;
860
43.4k
          tmp = tmp >> 1;
861
43.4k
        }
862
63.4k
        tmp -= round_offset;
863
63.4k
        dst[y * dst_stride + x] =
864
63.4k
            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
865
63.4k
      } else {
866
52.7k
        dst16[y * dst16_stride + x] = res;
867
52.7k
      }
868
116k
    }
869
12.0k
  }
870
1.26k
}
871
872
void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride,
873
                                            uint16_t *dst, int dst_stride,
874
                                            int w, int h,
875
                                            ConvolveParams *conv_params,
876
23.5k
                                            int bd) {
877
23.5k
  CONV_BUF_TYPE *dst16 = conv_params->dst;
878
23.5k
  int dst16_stride = conv_params->dst_stride;
879
23.5k
  const int bits =
880
23.5k
      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
881
23.5k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
882
23.5k
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
883
23.5k
                           (1 << (offset_bits - conv_params->round_1 - 1));
884
23.5k
  assert(bits >= 0);
885
886
229k
  for (int y = 0; y < h; ++y) {
887
1.93M
    for (int x = 0; x < w; ++x) {
888
1.72M
      CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
889
1.72M
      res += round_offset;
890
1.72M
      if (conv_params->do_average) {
891
636k
        int32_t tmp = dst16[y * dst16_stride + x];
892
636k
        if (conv_params->use_dist_wtd_comp_avg) {
893
251k
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
894
251k
          tmp = tmp >> DIST_PRECISION_BITS;
895
384k
        } else {
896
384k
          tmp += res;
897
384k
          tmp = tmp >> 1;
898
384k
        }
899
636k
        tmp -= round_offset;
900
636k
        dst[y * dst_stride + x] =
901
636k
            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
902
1.08M
      } else {
903
1.08M
        dst16[y * dst16_stride + x] = res;
904
1.08M
      }
905
1.72M
    }
906
206k
  }
907
23.5k
}
908
909
void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
910
                                    uint16_t *dst, int dst_stride, int w, int h,
911
                                    const InterpFilterParams *filter_params_x,
912
                                    const InterpFilterParams *filter_params_y,
913
                                    const int subpel_x_qn, const int x_step_qn,
914
                                    const int subpel_y_qn, const int y_step_qn,
915
6.76k
                                    ConvolveParams *conv_params, int bd) {
916
6.76k
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
917
6.76k
  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
918
6.76k
             filter_params_y->taps;
919
6.76k
  int im_stride = w;
920
6.76k
  const int fo_vert = filter_params_y->taps / 2 - 1;
921
6.76k
  const int fo_horiz = filter_params_x->taps / 2 - 1;
922
6.76k
  CONV_BUF_TYPE *dst16 = conv_params->dst;
923
6.76k
  const int dst16_stride = conv_params->dst_stride;
924
6.76k
  const int bits =
925
6.76k
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
926
6.76k
  assert(bits >= 0);
927
  // horizontal filter
928
6.76k
  const uint16_t *src_horiz = src - fo_vert * src_stride;
929
106k
  for (int y = 0; y < im_h; ++y) {
930
100k
    int x_qn = subpel_x_qn;
931
930k
    for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
932
830k
      const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
933
830k
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
934
830k
      assert(x_filter_idx < SUBPEL_SHIFTS);
935
830k
      const int16_t *x_filter =
936
830k
          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
937
830k
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
938
7.47M
      for (int k = 0; k < filter_params_x->taps; ++k) {
939
6.64M
        sum += x_filter[k] * src_x[k - fo_horiz];
940
6.64M
      }
941
830k
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
942
830k
      im_block[y * im_stride + x] =
943
830k
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
944
830k
    }
945
100k
    src_horiz += src_stride;
946
100k
  }
947
948
  // vertical filter
949
6.76k
  int16_t *src_vert = im_block + fo_vert * im_stride;
950
6.76k
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
951
57.7k
  for (int x = 0; x < w; ++x) {
952
51.0k
    int y_qn = subpel_y_qn;
953
493k
    for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
954
442k
      const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
955
442k
      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
956
442k
      assert(y_filter_idx < SUBPEL_SHIFTS);
957
442k
      const int16_t *y_filter =
958
442k
          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
959
442k
      int32_t sum = 1 << offset_bits;
960
3.98M
      for (int k = 0; k < filter_params_y->taps; ++k) {
961
3.54M
        sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
962
3.54M
      }
963
442k
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
964
442k
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
965
442k
      if (conv_params->is_compound) {
966
72.3k
        if (conv_params->do_average) {
967
29.6k
          int32_t tmp = dst16[y * dst16_stride + x];
968
29.6k
          if (conv_params->use_dist_wtd_comp_avg) {
969
11.1k
            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
970
11.1k
            tmp = tmp >> DIST_PRECISION_BITS;
971
18.4k
          } else {
972
18.4k
            tmp += res;
973
18.4k
            tmp = tmp >> 1;
974
18.4k
          }
975
          /* Subtract round offset and convolve round */
976
29.6k
          tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
977
29.6k
                       (1 << (offset_bits - conv_params->round_1 - 1)));
978
29.6k
          dst[y * dst_stride + x] =
979
29.6k
              clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
980
42.7k
        } else {
981
42.7k
          dst16[y * dst16_stride + x] = res;
982
42.7k
        }
983
370k
      } else {
984
        /* Subtract round offset and convolve round */
985
370k
        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
986
370k
                             (1 << (offset_bits - conv_params->round_1 - 1)));
987
370k
        dst[y * dst_stride + x] =
988
370k
            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
989
370k
      }
990
442k
    }
991
51.0k
    src_vert++;
992
51.0k
  }
993
6.76k
}
994
995
// Dispatches an unscaled convolution that targets conv_params->dst to the
// copy, x-only, y-only or 2-D routine, depending on which of subpel_x_qn and
// subpel_y_qn are non-zero.
static void highbd_convolve_2d_facade_compound(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    const int w, const int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  const bool filter_x = subpel_x_qn != 0;
  const bool filter_y = subpel_y_qn != 0;

  if (filter_x && filter_y) {
    av1_highbd_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
                                    filter_params_x, filter_params_y,
                                    subpel_x_qn, subpel_y_qn, conv_params, bd);
    return;
  }
  if (filter_x) {
    av1_highbd_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_x, subpel_x_qn, conv_params,
                                   bd);
    return;
  }
  if (filter_y) {
    av1_highbd_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_y, subpel_y_qn, conv_params,
                                   bd);
    return;
  }
  // Neither direction has a sub-pixel offset.
  av1_highbd_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
                                       conv_params, bd);
}
1020
1021
// Dispatches an unscaled convolution that writes straight to dst to the
// copy, x-only, y-only or 2-D routine, depending on which of subpel_x_qn and
// subpel_y_qn are non-zero.
static void highbd_convolve_2d_facade_single(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    const int w, const int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  const bool filter_x = subpel_x_qn != 0;
  const bool filter_y = subpel_y_qn != 0;

  if (filter_x && filter_y) {
    av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, filter_params_y, subpel_x_qn,
                              subpel_y_qn, conv_params, bd);
    return;
  }
  if (filter_x) {
    av1_highbd_convolve_x_sr(src, src_stride, dst, dst_stride, w, h,
                             filter_params_x, subpel_x_qn, conv_params, bd);
    return;
  }
  if (filter_y) {
    av1_highbd_convolve_y_sr(src, src_stride, dst, dst_stride, w, h,
                             filter_params_y, subpel_y_qn, bd);
    return;
  }
  // Neither direction has a sub-pixel offset: plain block copy.
  aom_highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h);
}
1044
1045
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
1046
                                   uint8_t *dst8, int dst_stride, int w, int h,
1047
                                   const InterpFilterParams *interp_filters[2],
1048
                                   const int subpel_x_qn, int x_step_q4,
1049
                                   const int subpel_y_qn, int y_step_q4,
1050
                                   int scaled, ConvolveParams *conv_params,
1051
283k
                                   int bd) {
1052
283k
  (void)x_step_q4;
1053
283k
  (void)y_step_q4;
1054
283k
  (void)dst_stride;
1055
283k
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1056
1057
283k
  const int need_filter_params_x = (subpel_x_qn != 0) | scaled;
1058
283k
  const int need_filter_params_y = (subpel_y_qn != 0) | scaled;
1059
283k
  const InterpFilterParams *filter_params_x =
1060
283k
      need_filter_params_x ? interp_filters[0] : NULL;
1061
283k
  const InterpFilterParams *filter_params_y =
1062
283k
      need_filter_params_y ? interp_filters[1] : NULL;
1063
1064
283k
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1065
283k
  if (scaled) {
1066
6.76k
    if (conv_params->is_compound) {
1067
642
      assert(conv_params->dst != NULL);
1068
642
    }
1069
6.76k
    av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
1070
6.76k
                                 filter_params_x, filter_params_y, subpel_x_qn,
1071
6.76k
                                 x_step_q4, subpel_y_qn, y_step_q4, conv_params,
1072
6.76k
                                 bd);
1073
276k
  } else if (conv_params->is_compound) {
1074
32.8k
    highbd_convolve_2d_facade_compound(
1075
32.8k
        src, src_stride, dst, dst_stride, w, h, filter_params_x,
1076
32.8k
        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
1077
244k
  } else {
1078
244k
    highbd_convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
1079
244k
                                     filter_params_x, filter_params_y,
1080
244k
                                     subpel_x_qn, subpel_y_qn, conv_params, bd);
1081
244k
  }
1082
283k
}
1083
#endif  // CONFIG_AV1_HIGHBITDEPTH
1084
1085
// Note: Fixed size intermediate buffers, place limits on parameters
1086
// of some functions. 2d filtering proceeds in 2 steps:
1087
//   (1) Interpolate horizontally into an intermediate buffer, temp.
1088
//   (2) Interpolate temp vertically to derive the sub-pixel result.
1089
// Deriving the maximum number of rows in the temp buffer (263):
1090
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
1091
// --Largest block size is 128x128 pixels.
1092
// --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
1093
//   original frame (in 1/16th pixel units).
1094
// --Must round-up because block may be located at sub-pixel position.
1095
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
1096
// --((128 - 1) * 32 + 15) >> 4 + 8 = 263.
1097
#define WIENER_MAX_EXT_SIZE 263
1098
1099
52.2M
static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
1100
52.2M
  int sum = 0;
1101
470M
  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1102
52.2M
  return sum;
1103
52.2M
}
1104
1105
#if CONFIG_AV1_HIGHBITDEPTH
1106
static INLINE int highbd_horz_scalar_product(const uint16_t *a,
1107
100M
                                             const int16_t *b) {
1108
100M
  int sum = 0;
1109
906M
  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1110
100M
  return sum;
1111
100M
}
1112
#endif
1113
1114
static INLINE int highbd_vert_scalar_product(const uint16_t *a,
1115
                                             ptrdiff_t a_stride,
1116
136M
                                             const int16_t *b) {
1117
136M
  int sum = 0;
1118
1.22G
  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
1119
136M
  return sum;
1120
136M
}
1121
1122
107k
// Returns the address of filter rounded down to a multiple of 256 bytes,
// typed as a pointer to the first kernel of the table.
static const InterpKernel *get_filter_base(const int16_t *filter) {
  // NOTE: This assumes that the filter table is 256-byte aligned.
  // TODO(agrange) Modify to make independent of table alignment.
  const intptr_t low_byte_mask = (intptr_t)0xFF;
  const intptr_t table_addr = ((intptr_t)filter) & ~low_byte_mask;
  return (const InterpKernel *)table_addr;
}
1127
1128
107k
// Returns how many InterpKernel entries lie between base and f.
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  const InterpKernel *const kernel = (const InterpKernel *)(intptr_t)f;
  return (int)(kernel - base);
}
1131
1132
static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
1133
                                       uint16_t *dst, ptrdiff_t dst_stride,
1134
                                       const InterpKernel *x_filters, int x0_q4,
1135
                                       int x_step_q4, int w, int h,
1136
18.9k
                                       int round0_bits) {
1137
18.9k
  const int bd = 8;
1138
18.9k
  src -= SUBPEL_TAPS / 2 - 1;
1139
1.09M
  for (int y = 0; y < h; ++y) {
1140
1.07M
    int x_q4 = x0_q4;
1141
53.3M
    for (int x = 0; x < w; ++x) {
1142
52.2M
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1143
52.2M
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1144
52.2M
      const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1145
52.2M
                           (1 << (bd + FILTER_BITS - 1));
1146
52.2M
      const int sum = horz_scalar_product(src_x, x_filter) + rounding;
1147
52.2M
      dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1148
52.2M
                               WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
1149
52.2M
      x_q4 += x_step_q4;
1150
52.2M
    }
1151
1.07M
    src += src_stride;
1152
1.07M
    dst += dst_stride;
1153
1.07M
  }
1154
18.9k
}
1155
1156
static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
1157
                                      uint8_t *dst, ptrdiff_t dst_stride,
1158
                                      const InterpKernel *y_filters, int y0_q4,
1159
                                      int y_step_q4, int w, int h,
1160
18.9k
                                      int round1_bits) {
1161
18.9k
  const int bd = 8;
1162
18.9k
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1163
1164
898k
  for (int x = 0; x < w; ++x) {
1165
879k
    int y_q4 = y0_q4;
1166
47.8M
    for (int y = 0; y < h; ++y) {
1167
47.0M
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1168
47.0M
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1169
47.0M
      const int rounding =
1170
47.0M
          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1171
47.0M
          (1 << (bd + round1_bits - 1));
1172
47.0M
      const int sum =
1173
47.0M
          highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1174
47.0M
      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
1175
47.0M
      y_q4 += y_step_q4;
1176
47.0M
    }
1177
879k
    ++src;
1178
879k
    ++dst;
1179
879k
  }
1180
18.9k
}
1181
1182
void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
1183
                                   uint8_t *dst, ptrdiff_t dst_stride,
1184
                                   const int16_t *filter_x, int x_step_q4,
1185
                                   const int16_t *filter_y, int y_step_q4,
1186
                                   int w, int h,
1187
18.9k
                                   const ConvolveParams *conv_params) {
1188
18.9k
  const InterpKernel *const filters_x = get_filter_base(filter_x);
1189
18.9k
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
1190
1191
18.9k
  const InterpKernel *const filters_y = get_filter_base(filter_y);
1192
18.9k
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
1193
1194
18.9k
  uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1195
18.9k
  const int intermediate_height =
1196
18.9k
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
1197
18.9k
  memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
1198
1199
18.9k
  assert(w <= MAX_SB_SIZE);
1200
18.9k
  assert(h <= MAX_SB_SIZE);
1201
18.9k
  assert(y_step_q4 <= 32);
1202
18.9k
  assert(x_step_q4 <= 32);
1203
1204
18.9k
  convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1205
18.9k
                             src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4,
1206
18.9k
                             x_step_q4, w, intermediate_height,
1207
18.9k
                             conv_params->round_0);
1208
18.9k
  convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
1209
18.9k
                            MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4,
1210
18.9k
                            y_step_q4, w, h, conv_params->round_1);
1211
18.9k
}
1212
1213
#if CONFIG_AV1_HIGHBITDEPTH
1214
static void highbd_convolve_add_src_horiz_hip(
1215
    const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
1216
    ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
1217
34.6k
    int x_step_q4, int w, int h, int round0_bits, int bd) {
1218
34.6k
  const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
1219
34.6k
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1220
34.6k
  src -= SUBPEL_TAPS / 2 - 1;
1221
2.13M
  for (int y = 0; y < h; ++y) {
1222
2.09M
    int x_q4 = x0_q4;
1223
102M
    for (int x = 0; x < w; ++x) {
1224
100M
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1225
100M
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1226
100M
      const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1227
100M
                           (1 << (bd + FILTER_BITS - 1));
1228
100M
      const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
1229
100M
      dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1230
100M
                               extraprec_clamp_limit - 1);
1231
100M
      x_q4 += x_step_q4;
1232
100M
    }
1233
2.09M
    src += src_stride;
1234
2.09M
    dst += dst_stride;
1235
2.09M
  }
1236
34.6k
}
1237
1238
static void highbd_convolve_add_src_vert_hip(
1239
    const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
1240
    ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
1241
34.6k
    int y_step_q4, int w, int h, int round1_bits, int bd) {
1242
34.6k
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1243
34.6k
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1244
1.69M
  for (int x = 0; x < w; ++x) {
1245
1.66M
    int y_q4 = y0_q4;
1246
90.7M
    for (int y = 0; y < h; ++y) {
1247
89.1M
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1248
89.1M
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1249
89.1M
      const int rounding =
1250
89.1M
          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1251
89.1M
          (1 << (bd + round1_bits - 1));
1252
89.1M
      const int sum =
1253
89.1M
          highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1254
89.1M
      dst[y * dst_stride] =
1255
89.1M
          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
1256
89.1M
      y_q4 += y_step_q4;
1257
89.1M
    }
1258
1.66M
    ++src;
1259
1.66M
    ++dst;
1260
1.66M
  }
1261
34.6k
}
1262
1263
void av1_highbd_wiener_convolve_add_src_c(
1264
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1265
    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
1266
    const int16_t *filter_y, int y_step_q4, int w, int h,
1267
34.6k
    const ConvolveParams *conv_params, int bd) {
1268
34.6k
  const InterpKernel *const filters_x = get_filter_base(filter_x);
1269
34.6k
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
1270
1271
34.6k
  const InterpKernel *const filters_y = get_filter_base(filter_y);
1272
34.6k
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
1273
1274
34.6k
  uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1275
34.6k
  const int intermediate_height =
1276
34.6k
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1277
1278
34.6k
  assert(w <= MAX_SB_SIZE);
1279
34.6k
  assert(h <= MAX_SB_SIZE);
1280
34.6k
  assert(y_step_q4 <= 32);
1281
34.6k
  assert(x_step_q4 <= 32);
1282
34.6k
  assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
1283
1284
34.6k
  highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1285
34.6k
                                    src_stride, temp, MAX_SB_SIZE, filters_x,
1286
34.6k
                                    x0_q4, x_step_q4, w, intermediate_height,
1287
34.6k
                                    conv_params->round_0, bd);
1288
34.6k
  highbd_convolve_add_src_vert_hip(
1289
34.6k
      temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
1290
34.6k
      filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
1291
34.6k
}
1292
#endif  // CONFIG_AV1_HIGHBITDEPTH