Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/third_party/aom/av1/common/convolve.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <assert.h>
13
#include <string.h>
14
15
#include "config/aom_dsp_rtcd.h"
16
#include "config/av1_rtcd.h"
17
18
#include "av1/common/blockd.h"
19
#include "av1/common/convolve.h"
20
#include "av1/common/filter.h"
21
#include "av1/common/onyxc_int.h"
22
#include "av1/common/resize.h"
23
#include "aom_dsp/aom_dsp_common.h"
24
#include "aom_ports/mem.h"
25
26
void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
27
                             int dst_stride, int w, int h,
28
                             const int16_t *x_filters, int x0_qn,
29
0
                             int x_step_qn) {
30
0
  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
31
0
  for (int y = 0; y < h; ++y) {
32
0
    int x_qn = x0_qn;
33
0
    for (int x = 0; x < w; ++x) {
34
0
      const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
35
0
      const int x_filter_idx =
36
0
          (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
37
0
      assert(x_filter_idx <= RS_SUBPEL_MASK);
38
0
      const int16_t *const x_filter =
39
0
          &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
40
0
      int sum = 0;
41
0
      for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
42
0
        sum += src_x[k] * x_filter[k];
43
0
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
44
0
      x_qn += x_step_qn;
45
0
    }
46
0
    src += src_stride;
47
0
    dst += dst_stride;
48
0
  }
49
0
}
50
51
void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
52
                                    uint16_t *dst, int dst_stride, int w, int h,
53
                                    const int16_t *x_filters, int x0_qn,
54
0
                                    int x_step_qn, int bd) {
55
0
  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
56
0
  for (int y = 0; y < h; ++y) {
57
0
    int x_qn = x0_qn;
58
0
    for (int x = 0; x < w; ++x) {
59
0
      const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
60
0
      const int x_filter_idx =
61
0
          (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
62
0
      assert(x_filter_idx <= RS_SUBPEL_MASK);
63
0
      const int16_t *const x_filter =
64
0
          &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
65
0
      int sum = 0;
66
0
      for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
67
0
        sum += src_x[k] * x_filter[k];
68
0
      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
69
0
      x_qn += x_step_qn;
70
0
    }
71
0
    src += src_stride;
72
0
    dst += dst_stride;
73
0
  }
74
0
}
75
76
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
77
                          int dst_stride, int w, int h,
78
                          const InterpFilterParams *filter_params_x,
79
                          const InterpFilterParams *filter_params_y,
80
                          const int subpel_x_q4, const int subpel_y_q4,
81
0
                          ConvolveParams *conv_params) {
82
0
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
83
0
  int im_h = h + filter_params_y->taps - 1;
84
0
  int im_stride = w;
85
0
  const int fo_vert = filter_params_y->taps / 2 - 1;
86
0
  const int fo_horiz = filter_params_x->taps / 2 - 1;
87
0
  const int bd = 8;
88
0
  const int bits =
89
0
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
90
0
91
0
  // horizontal filter
92
0
  const uint8_t *src_horiz = src - fo_vert * src_stride;
93
0
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
94
0
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
95
0
  for (int y = 0; y < im_h; ++y) {
96
0
    for (int x = 0; x < w; ++x) {
97
0
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
98
0
      for (int k = 0; k < filter_params_x->taps; ++k) {
99
0
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
100
0
      }
101
0
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
102
0
      im_block[y * im_stride + x] =
103
0
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
104
0
    }
105
0
  }
106
0
107
0
  // vertical filter
108
0
  int16_t *src_vert = im_block + fo_vert * im_stride;
109
0
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
110
0
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
111
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
112
0
  for (int y = 0; y < h; ++y) {
113
0
    for (int x = 0; x < w; ++x) {
114
0
      int32_t sum = 1 << offset_bits;
115
0
      for (int k = 0; k < filter_params_y->taps; ++k) {
116
0
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
117
0
      }
118
0
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
119
0
      int16_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
120
0
                    ((1 << (offset_bits - conv_params->round_1)) +
121
0
                     (1 << (offset_bits - conv_params->round_1 - 1)));
122
0
      dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
123
0
    }
124
0
  }
125
0
}
126
127
void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
128
                         int dst_stride, int w, int h,
129
                         const InterpFilterParams *filter_params_x,
130
                         const InterpFilterParams *filter_params_y,
131
                         const int subpel_x_q4, const int subpel_y_q4,
132
0
                         ConvolveParams *conv_params) {
133
0
  const int fo_vert = filter_params_y->taps / 2 - 1;
134
0
  (void)filter_params_x;
135
0
  (void)subpel_x_q4;
136
0
  (void)conv_params;
137
0
138
0
  assert(conv_params->round_0 <= FILTER_BITS);
139
0
  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
140
0
         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
141
0
142
0
  // vertical filter
143
0
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
144
0
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
145
0
  for (int y = 0; y < h; ++y) {
146
0
    for (int x = 0; x < w; ++x) {
147
0
      int32_t res = 0;
148
0
      for (int k = 0; k < filter_params_y->taps; ++k) {
149
0
        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
150
0
      }
151
0
      dst[y * dst_stride + x] =
152
0
          clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS));
153
0
    }
154
0
  }
155
0
}
156
157
void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
158
                         int dst_stride, int w, int h,
159
                         const InterpFilterParams *filter_params_x,
160
                         const InterpFilterParams *filter_params_y,
161
                         const int subpel_x_q4, const int subpel_y_q4,
162
0
                         ConvolveParams *conv_params) {
163
0
  const int fo_horiz = filter_params_x->taps / 2 - 1;
164
0
  const int bits = FILTER_BITS - conv_params->round_0;
165
0
  (void)filter_params_y;
166
0
  (void)subpel_y_q4;
167
0
  (void)conv_params;
168
0
169
0
  assert(bits >= 0);
170
0
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
171
0
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
172
0
173
0
  // horizontal filter
174
0
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
175
0
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
176
0
177
0
  for (int y = 0; y < h; ++y) {
178
0
    for (int x = 0; x < w; ++x) {
179
0
      int32_t res = 0;
180
0
      for (int k = 0; k < filter_params_x->taps; ++k) {
181
0
        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
182
0
      }
183
0
      res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
184
0
      dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
185
0
    }
186
0
  }
187
0
}
188
189
void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
190
                               int dst_stride, int w, int h,
191
                               const InterpFilterParams *filter_params_x,
192
                               const InterpFilterParams *filter_params_y,
193
                               const int subpel_x_q4, const int subpel_y_q4,
194
0
                               ConvolveParams *conv_params) {
195
0
  (void)filter_params_x;
196
0
  (void)filter_params_y;
197
0
  (void)subpel_x_q4;
198
0
  (void)subpel_y_q4;
199
0
  (void)conv_params;
200
0
201
0
  for (int y = 0; y < h; ++y) {
202
0
    memcpy(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
203
0
  }
204
0
}
205
206
void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
207
                           int dst8_stride, int w, int h,
208
                           const InterpFilterParams *filter_params_x,
209
                           const InterpFilterParams *filter_params_y,
210
                           const int subpel_x_q4, const int subpel_y_q4,
211
0
                           ConvolveParams *conv_params) {
212
0
  CONV_BUF_TYPE *dst = conv_params->dst;
213
0
  int dst_stride = conv_params->dst_stride;
214
0
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
215
0
  int im_h = h + filter_params_y->taps - 1;
216
0
  int im_stride = w;
217
0
  const int fo_vert = filter_params_y->taps / 2 - 1;
218
0
  const int fo_horiz = filter_params_x->taps / 2 - 1;
219
0
  const int bd = 8;
220
0
  const int round_bits =
221
0
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
222
0
223
0
  // horizontal filter
224
0
  const uint8_t *src_horiz = src - fo_vert * src_stride;
225
0
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
226
0
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
227
0
  for (int y = 0; y < im_h; ++y) {
228
0
    for (int x = 0; x < w; ++x) {
229
0
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
230
0
      for (int k = 0; k < filter_params_x->taps; ++k) {
231
0
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
232
0
      }
233
0
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
234
0
      im_block[y * im_stride + x] =
235
0
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
236
0
    }
237
0
  }
238
0
239
0
  // vertical filter
240
0
  int16_t *src_vert = im_block + fo_vert * im_stride;
241
0
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
242
0
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
243
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
244
0
  for (int y = 0; y < h; ++y) {
245
0
    for (int x = 0; x < w; ++x) {
246
0
      int32_t sum = 1 << offset_bits;
247
0
      for (int k = 0; k < filter_params_y->taps; ++k) {
248
0
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
249
0
      }
250
0
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
251
0
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
252
0
      if (conv_params->do_average) {
253
0
        int32_t tmp = dst[y * dst_stride + x];
254
0
        if (conv_params->use_jnt_comp_avg) {
255
0
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
256
0
          tmp = tmp >> DIST_PRECISION_BITS;
257
0
        } else {
258
0
          tmp += res;
259
0
          tmp = tmp >> 1;
260
0
        }
261
0
        tmp -= (1 << (offset_bits - conv_params->round_1)) +
262
0
               (1 << (offset_bits - conv_params->round_1 - 1));
263
0
        dst8[y * dst8_stride + x] =
264
0
            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
265
0
      } else {
266
0
        dst[y * dst_stride + x] = res;
267
0
      }
268
0
    }
269
0
  }
270
0
}
271
272
void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
273
                          int dst8_stride, int w, int h,
274
                          const InterpFilterParams *filter_params_x,
275
                          const InterpFilterParams *filter_params_y,
276
                          const int subpel_x_q4, const int subpel_y_q4,
277
0
                          ConvolveParams *conv_params) {
278
0
  CONV_BUF_TYPE *dst = conv_params->dst;
279
0
  int dst_stride = conv_params->dst_stride;
280
0
  const int fo_vert = filter_params_y->taps / 2 - 1;
281
0
  const int bits = FILTER_BITS - conv_params->round_0;
282
0
  const int bd = 8;
283
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
284
0
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
285
0
                           (1 << (offset_bits - conv_params->round_1 - 1));
286
0
  const int round_bits =
287
0
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
288
0
  (void)filter_params_x;
289
0
  (void)subpel_x_q4;
290
0
291
0
  // vertical filter
292
0
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
293
0
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
294
0
  for (int y = 0; y < h; ++y) {
295
0
    for (int x = 0; x < w; ++x) {
296
0
      int32_t res = 0;
297
0
      for (int k = 0; k < filter_params_y->taps; ++k) {
298
0
        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
299
0
      }
300
0
      res *= (1 << bits);
301
0
      res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
302
0
303
0
      if (conv_params->do_average) {
304
0
        int32_t tmp = dst[y * dst_stride + x];
305
0
        if (conv_params->use_jnt_comp_avg) {
306
0
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
307
0
          tmp = tmp >> DIST_PRECISION_BITS;
308
0
        } else {
309
0
          tmp += res;
310
0
          tmp = tmp >> 1;
311
0
        }
312
0
        tmp -= round_offset;
313
0
        dst8[y * dst8_stride + x] =
314
0
            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
315
0
      } else {
316
0
        dst[y * dst_stride + x] = res;
317
0
      }
318
0
    }
319
0
  }
320
0
}
321
322
void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
323
                          int dst8_stride, int w, int h,
324
                          const InterpFilterParams *filter_params_x,
325
                          const InterpFilterParams *filter_params_y,
326
                          const int subpel_x_q4, const int subpel_y_q4,
327
0
                          ConvolveParams *conv_params) {
328
0
  CONV_BUF_TYPE *dst = conv_params->dst;
329
0
  int dst_stride = conv_params->dst_stride;
330
0
  const int fo_horiz = filter_params_x->taps / 2 - 1;
331
0
  const int bits = FILTER_BITS - conv_params->round_1;
332
0
  const int bd = 8;
333
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
334
0
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
335
0
                           (1 << (offset_bits - conv_params->round_1 - 1));
336
0
  const int round_bits =
337
0
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
338
0
  (void)filter_params_y;
339
0
  (void)subpel_y_q4;
340
0
341
0
  // horizontal filter
342
0
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
343
0
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
344
0
  for (int y = 0; y < h; ++y) {
345
0
    for (int x = 0; x < w; ++x) {
346
0
      int32_t res = 0;
347
0
      for (int k = 0; k < filter_params_x->taps; ++k) {
348
0
        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
349
0
      }
350
0
      res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
351
0
      res += round_offset;
352
0
353
0
      if (conv_params->do_average) {
354
0
        int32_t tmp = dst[y * dst_stride + x];
355
0
        if (conv_params->use_jnt_comp_avg) {
356
0
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
357
0
          tmp = tmp >> DIST_PRECISION_BITS;
358
0
        } else {
359
0
          tmp += res;
360
0
          tmp = tmp >> 1;
361
0
        }
362
0
        tmp -= round_offset;
363
0
        dst8[y * dst8_stride + x] =
364
0
            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
365
0
      } else {
366
0
        dst[y * dst_stride + x] = res;
367
0
      }
368
0
    }
369
0
  }
370
0
}
371
372
void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride,
373
                                uint8_t *dst8, int dst8_stride, int w, int h,
374
                                const InterpFilterParams *filter_params_x,
375
                                const InterpFilterParams *filter_params_y,
376
                                const int subpel_x_q4, const int subpel_y_q4,
377
0
                                ConvolveParams *conv_params) {
378
0
  CONV_BUF_TYPE *dst = conv_params->dst;
379
0
  int dst_stride = conv_params->dst_stride;
380
0
  const int bits =
381
0
      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
382
0
  const int bd = 8;
383
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
384
0
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
385
0
                           (1 << (offset_bits - conv_params->round_1 - 1));
386
0
  (void)filter_params_x;
387
0
  (void)filter_params_y;
388
0
  (void)subpel_x_q4;
389
0
  (void)subpel_y_q4;
390
0
391
0
  for (int y = 0; y < h; ++y) {
392
0
    for (int x = 0; x < w; ++x) {
393
0
      CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
394
0
      res += round_offset;
395
0
396
0
      if (conv_params->do_average) {
397
0
        int32_t tmp = dst[y * dst_stride + x];
398
0
        if (conv_params->use_jnt_comp_avg) {
399
0
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
400
0
          tmp = tmp >> DIST_PRECISION_BITS;
401
0
        } else {
402
0
          tmp += res;
403
0
          tmp = tmp >> 1;
404
0
        }
405
0
        tmp -= round_offset;
406
0
        dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
407
0
      } else {
408
0
        dst[y * dst_stride + x] = res;
409
0
      }
410
0
    }
411
0
  }
412
0
}
413
414
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
415
                             int dst8_stride, int w, int h,
416
                             const InterpFilterParams *filter_params_x,
417
                             const InterpFilterParams *filter_params_y,
418
                             const int subpel_x_qn, const int x_step_qn,
419
                             const int subpel_y_qn, const int y_step_qn,
420
0
                             ConvolveParams *conv_params) {
421
0
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
422
0
  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
423
0
             filter_params_y->taps;
424
0
  CONV_BUF_TYPE *dst16 = conv_params->dst;
425
0
  const int dst16_stride = conv_params->dst_stride;
426
0
  const int bits =
427
0
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
428
0
  assert(bits >= 0);
429
0
  int im_stride = w;
430
0
  const int fo_vert = filter_params_y->taps / 2 - 1;
431
0
  const int fo_horiz = filter_params_x->taps / 2 - 1;
432
0
  const int bd = 8;
433
0
434
0
  // horizontal filter
435
0
  const uint8_t *src_horiz = src - fo_vert * src_stride;
436
0
  for (int y = 0; y < im_h; ++y) {
437
0
    int x_qn = subpel_x_qn;
438
0
    for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
439
0
      const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
440
0
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
441
0
      assert(x_filter_idx < SUBPEL_SHIFTS);
442
0
      const int16_t *x_filter =
443
0
          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
444
0
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
445
0
      for (int k = 0; k < filter_params_x->taps; ++k) {
446
0
        sum += x_filter[k] * src_x[k - fo_horiz];
447
0
      }
448
0
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
449
0
      im_block[y * im_stride + x] =
450
0
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
451
0
    }
452
0
    src_horiz += src_stride;
453
0
  }
454
0
455
0
  // vertical filter
456
0
  int16_t *src_vert = im_block + fo_vert * im_stride;
457
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
458
0
  for (int x = 0; x < w; ++x) {
459
0
    int y_qn = subpel_y_qn;
460
0
    for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
461
0
      const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
462
0
      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
463
0
      assert(y_filter_idx < SUBPEL_SHIFTS);
464
0
      const int16_t *y_filter =
465
0
          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
466
0
      int32_t sum = 1 << offset_bits;
467
0
      for (int k = 0; k < filter_params_y->taps; ++k) {
468
0
        sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
469
0
      }
470
0
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
471
0
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
472
0
      if (conv_params->is_compound) {
473
0
        if (conv_params->do_average) {
474
0
          int32_t tmp = dst16[y * dst16_stride + x];
475
0
          if (conv_params->use_jnt_comp_avg) {
476
0
            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
477
0
            tmp = tmp >> DIST_PRECISION_BITS;
478
0
          } else {
479
0
            tmp += res;
480
0
            tmp = tmp >> 1;
481
0
          }
482
0
          /* Subtract round offset and convolve round */
483
0
          tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
484
0
                       (1 << (offset_bits - conv_params->round_1 - 1)));
485
0
          dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
486
0
        } else {
487
0
          dst16[y * dst16_stride + x] = res;
488
0
        }
489
0
      } else {
490
0
        /* Subtract round offset and convolve round */
491
0
        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
492
0
                             (1 << (offset_bits - conv_params->round_1 - 1)));
493
0
        dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
494
0
      }
495
0
    }
496
0
    src_vert++;
497
0
  }
498
0
}
499
500
static void convolve_2d_scale_wrapper(
501
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
502
    int h, const InterpFilterParams *filter_params_x,
503
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
504
    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
505
0
    ConvolveParams *conv_params) {
506
0
  if (conv_params->is_compound) {
507
0
    assert(conv_params->dst != NULL);
508
0
  }
509
0
  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
510
0
                        filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
511
0
                        y_step_qn, conv_params);
512
0
}
513
514
// TODO(huisu@google.com): bilinear filtering only needs 2 taps in general. So
515
// we may create optimized code to do 2-tap filtering for all bilinear filtering
516
// usages, not just IntraBC.
517
static void convolve_2d_for_intrabc(const uint8_t *src, int src_stride,
518
                                    uint8_t *dst, int dst_stride, int w, int h,
519
                                    int subpel_x_q4, int subpel_y_q4,
520
0
                                    ConvolveParams *conv_params) {
521
0
  const InterpFilterParams *filter_params_x =
522
0
      subpel_x_q4 ? &av1_intrabc_filter_params : NULL;
523
0
  const InterpFilterParams *filter_params_y =
524
0
      subpel_y_q4 ? &av1_intrabc_filter_params : NULL;
525
0
  if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
526
0
    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
527
0
                         filter_params_x, filter_params_y, 0, 0, conv_params);
528
0
  } else if (subpel_x_q4 != 0) {
529
0
    av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
530
0
                        filter_params_y, 0, 0, conv_params);
531
0
  } else {
532
0
    av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
533
0
                        filter_params_y, 0, 0, conv_params);
534
0
  }
535
0
}
536
537
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
538
                            int dst_stride, int w, int h,
539
                            InterpFilters interp_filters, const int subpel_x_q4,
540
                            int x_step_q4, const int subpel_y_q4, int y_step_q4,
541
                            int scaled, ConvolveParams *conv_params,
542
0
                            const struct scale_factors *sf, int is_intrabc) {
543
0
  assert(IMPLIES(is_intrabc, !scaled));
544
0
  (void)x_step_q4;
545
0
  (void)y_step_q4;
546
0
  (void)dst;
547
0
  (void)dst_stride;
548
0
549
0
  if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) {
550
0
    convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, subpel_x_q4,
551
0
                            subpel_y_q4, conv_params);
552
0
    return;
553
0
  }
554
0
555
0
  InterpFilter filter_x = 0;
556
0
  InterpFilter filter_y = 0;
557
0
  const int need_filter_params_x = (subpel_x_q4 != 0) | scaled;
558
0
  const int need_filter_params_y = (subpel_y_q4 != 0) | scaled;
559
0
  if (need_filter_params_x)
560
0
    filter_x = av1_extract_interp_filter(interp_filters, 1);
561
0
  if (need_filter_params_y)
562
0
    filter_y = av1_extract_interp_filter(interp_filters, 0);
563
0
  const InterpFilterParams *filter_params_x =
564
0
      need_filter_params_x
565
0
          ? av1_get_interp_filter_params_with_block_size(filter_x, w)
566
0
          : NULL;
567
0
  const InterpFilterParams *filter_params_y =
568
0
      need_filter_params_y
569
0
          ? av1_get_interp_filter_params_with_block_size(filter_y, h)
570
0
          : NULL;
571
0
572
0
  if (scaled) {
573
0
    convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
574
0
                              filter_params_x, filter_params_y, subpel_x_q4,
575
0
                              x_step_q4, subpel_y_q4, y_step_q4, conv_params);
576
0
  } else {
577
0
    sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound](
578
0
        src, src_stride, dst, dst_stride, w, h, filter_params_x,
579
0
        filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
580
0
  }
581
0
}
582
583
// Full-pel "convolution" for single-reference high bit depth prediction:
// copies a w x h block of 16-bit samples from src to dst row by row. All
// filter, rounding and bit-depth arguments are ignored.
void av1_highbd_convolve_2d_copy_sr_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  (void)bd;
  (void)conv_params;
  (void)subpel_y_q4;
  (void)subpel_x_q4;
  (void)filter_params_y;
  (void)filter_params_x;

  int row = 0;
  while (row < h) {
    const uint16_t *const from = src + row * src_stride;
    uint16_t *const to = dst + row * dst_stride;
    memcpy(to, from, w * sizeof(src[0]));
    ++row;
  }
}
599
600
void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
601
                                uint16_t *dst, int dst_stride, int w, int h,
602
                                const InterpFilterParams *filter_params_x,
603
                                const InterpFilterParams *filter_params_y,
604
                                const int subpel_x_q4, const int subpel_y_q4,
605
0
                                ConvolveParams *conv_params, int bd) {
606
0
  const int fo_horiz = filter_params_x->taps / 2 - 1;
607
0
  const int bits = FILTER_BITS - conv_params->round_0;
608
0
  (void)filter_params_y;
609
0
  (void)subpel_y_q4;
610
0
611
0
  assert(bits >= 0);
612
0
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
613
0
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
614
0
615
0
  // horizontal filter
616
0
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
617
0
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
618
0
  for (int y = 0; y < h; ++y) {
619
0
    for (int x = 0; x < w; ++x) {
620
0
      int32_t res = 0;
621
0
      for (int k = 0; k < filter_params_x->taps; ++k) {
622
0
        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
623
0
      }
624
0
      res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
625
0
      dst[y * dst_stride + x] =
626
0
          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
627
0
    }
628
0
  }
629
0
}
630
631
void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
632
                                uint16_t *dst, int dst_stride, int w, int h,
633
                                const InterpFilterParams *filter_params_x,
634
                                const InterpFilterParams *filter_params_y,
635
                                const int subpel_x_q4, const int subpel_y_q4,
636
0
                                ConvolveParams *conv_params, int bd) {
637
0
  const int fo_vert = filter_params_y->taps / 2 - 1;
638
0
  (void)filter_params_x;
639
0
  (void)subpel_x_q4;
640
0
  (void)conv_params;
641
0
642
0
  assert(conv_params->round_0 <= FILTER_BITS);
643
0
  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
644
0
         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
645
0
  // vertical filter
646
0
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
647
0
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
648
0
  for (int y = 0; y < h; ++y) {
649
0
    for (int x = 0; x < w; ++x) {
650
0
      int32_t res = 0;
651
0
      for (int k = 0; k < filter_params_y->taps; ++k) {
652
0
        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
653
0
      }
654
0
      dst[y * dst_stride + x] =
655
0
          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd);
656
0
    }
657
0
  }
658
0
}
659
660
void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
661
                                 uint16_t *dst, int dst_stride, int w, int h,
662
                                 const InterpFilterParams *filter_params_x,
663
                                 const InterpFilterParams *filter_params_y,
664
                                 const int subpel_x_q4, const int subpel_y_q4,
665
0
                                 ConvolveParams *conv_params, int bd) {
666
0
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
667
0
  int im_h = h + filter_params_y->taps - 1;
668
0
  int im_stride = w;
669
0
  const int fo_vert = filter_params_y->taps / 2 - 1;
670
0
  const int fo_horiz = filter_params_x->taps / 2 - 1;
671
0
  const int bits =
672
0
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
673
0
  assert(bits >= 0);
674
0
675
0
  // horizontal filter
676
0
  const uint16_t *src_horiz = src - fo_vert * src_stride;
677
0
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
678
0
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
679
0
  for (int y = 0; y < im_h; ++y) {
680
0
    for (int x = 0; x < w; ++x) {
681
0
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
682
0
      for (int k = 0; k < filter_params_x->taps; ++k) {
683
0
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
684
0
      }
685
0
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
686
0
      im_block[y * im_stride + x] =
687
0
          ROUND_POWER_OF_TWO(sum, conv_params->round_0);
688
0
    }
689
0
  }
690
0
691
0
  // vertical filter
692
0
  int16_t *src_vert = im_block + fo_vert * im_stride;
693
0
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
694
0
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
695
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
696
0
  for (int y = 0; y < h; ++y) {
697
0
    for (int x = 0; x < w; ++x) {
698
0
      int32_t sum = 1 << offset_bits;
699
0
      for (int k = 0; k < filter_params_y->taps; ++k) {
700
0
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
701
0
      }
702
0
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
703
0
      int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
704
0
                    ((1 << (offset_bits - conv_params->round_1)) +
705
0
                     (1 << (offset_bits - conv_params->round_1 - 1)));
706
0
      dst[y * dst_stride + x] =
707
0
          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
708
0
    }
709
0
  }
710
0
}
711
712
void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
713
                                  uint16_t *dst16, int dst16_stride, int w,
714
                                  int h,
715
                                  const InterpFilterParams *filter_params_x,
716
                                  const InterpFilterParams *filter_params_y,
717
                                  const int subpel_x_q4, const int subpel_y_q4,
718
0
                                  ConvolveParams *conv_params, int bd) {
719
0
  int x, y, k;
720
0
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
721
0
  CONV_BUF_TYPE *dst = conv_params->dst;
722
0
  int dst_stride = conv_params->dst_stride;
723
0
  int im_h = h + filter_params_y->taps - 1;
724
0
  int im_stride = w;
725
0
  const int fo_vert = filter_params_y->taps / 2 - 1;
726
0
  const int fo_horiz = filter_params_x->taps / 2 - 1;
727
0
  const int round_bits =
728
0
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
729
0
  assert(round_bits >= 0);
730
0
731
0
  // horizontal filter
732
0
  const uint16_t *src_horiz = src - fo_vert * src_stride;
733
0
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
734
0
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
735
0
  for (y = 0; y < im_h; ++y) {
736
0
    for (x = 0; x < w; ++x) {
737
0
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
738
0
      for (k = 0; k < filter_params_x->taps; ++k) {
739
0
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
740
0
      }
741
0
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
742
0
      (void)bd;
743
0
      im_block[y * im_stride + x] =
744
0
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
745
0
    }
746
0
  }
747
0
748
0
  // vertical filter
749
0
  int16_t *src_vert = im_block + fo_vert * im_stride;
750
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
751
0
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
752
0
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
753
0
  for (y = 0; y < h; ++y) {
754
0
    for (x = 0; x < w; ++x) {
755
0
      int32_t sum = 1 << offset_bits;
756
0
      for (k = 0; k < filter_params_y->taps; ++k) {
757
0
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
758
0
      }
759
0
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
760
0
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
761
0
      if (conv_params->do_average) {
762
0
        int32_t tmp = dst[y * dst_stride + x];
763
0
        if (conv_params->use_jnt_comp_avg) {
764
0
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
765
0
          tmp = tmp >> DIST_PRECISION_BITS;
766
0
        } else {
767
0
          tmp += res;
768
0
          tmp = tmp >> 1;
769
0
        }
770
0
        tmp -= (1 << (offset_bits - conv_params->round_1)) +
771
0
               (1 << (offset_bits - conv_params->round_1 - 1));
772
0
        dst16[y * dst16_stride + x] =
773
0
            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
774
0
      } else {
775
0
        dst[y * dst_stride + x] = res;
776
0
      }
777
0
    }
778
0
  }
779
0
}
780
781
void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
782
                                 uint16_t *dst16, int dst16_stride, int w,
783
                                 int h,
784
                                 const InterpFilterParams *filter_params_x,
785
                                 const InterpFilterParams *filter_params_y,
786
                                 const int subpel_x_q4, const int subpel_y_q4,
787
0
                                 ConvolveParams *conv_params, int bd) {
788
0
  CONV_BUF_TYPE *dst = conv_params->dst;
789
0
  int dst_stride = conv_params->dst_stride;
790
0
  const int fo_horiz = filter_params_x->taps / 2 - 1;
791
0
  const int bits = FILTER_BITS - conv_params->round_1;
792
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
793
0
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
794
0
                           (1 << (offset_bits - conv_params->round_1 - 1));
795
0
  const int round_bits =
796
0
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
797
0
  assert(round_bits >= 0);
798
0
  (void)filter_params_y;
799
0
  (void)subpel_y_q4;
800
0
  assert(bits >= 0);
801
0
  // horizontal filter
802
0
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
803
0
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
804
0
  for (int y = 0; y < h; ++y) {
805
0
    for (int x = 0; x < w; ++x) {
806
0
      int32_t res = 0;
807
0
      for (int k = 0; k < filter_params_x->taps; ++k) {
808
0
        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
809
0
      }
810
0
      res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
811
0
      res += round_offset;
812
0
813
0
      if (conv_params->do_average) {
814
0
        int32_t tmp = dst[y * dst_stride + x];
815
0
        if (conv_params->use_jnt_comp_avg) {
816
0
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
817
0
          tmp = tmp >> DIST_PRECISION_BITS;
818
0
        } else {
819
0
          tmp += res;
820
0
          tmp = tmp >> 1;
821
0
        }
822
0
        tmp -= round_offset;
823
0
        dst16[y * dst16_stride + x] =
824
0
            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
825
0
      } else {
826
0
        dst[y * dst_stride + x] = res;
827
0
      }
828
0
    }
829
0
  }
830
0
}
831
832
void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
833
                                 uint16_t *dst16, int dst16_stride, int w,
834
                                 int h,
835
                                 const InterpFilterParams *filter_params_x,
836
                                 const InterpFilterParams *filter_params_y,
837
                                 const int subpel_x_q4, const int subpel_y_q4,
838
0
                                 ConvolveParams *conv_params, int bd) {
839
0
  CONV_BUF_TYPE *dst = conv_params->dst;
840
0
  int dst_stride = conv_params->dst_stride;
841
0
  const int fo_vert = filter_params_y->taps / 2 - 1;
842
0
  const int bits = FILTER_BITS - conv_params->round_0;
843
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
844
0
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
845
0
                           (1 << (offset_bits - conv_params->round_1 - 1));
846
0
  const int round_bits =
847
0
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
848
0
  assert(round_bits >= 0);
849
0
  (void)filter_params_x;
850
0
  (void)subpel_x_q4;
851
0
  assert(bits >= 0);
852
0
  // vertical filter
853
0
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
854
0
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
855
0
  for (int y = 0; y < h; ++y) {
856
0
    for (int x = 0; x < w; ++x) {
857
0
      int32_t res = 0;
858
0
      for (int k = 0; k < filter_params_y->taps; ++k) {
859
0
        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
860
0
      }
861
0
      res *= (1 << bits);
862
0
      res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
863
0
864
0
      if (conv_params->do_average) {
865
0
        int32_t tmp = dst[y * dst_stride + x];
866
0
        if (conv_params->use_jnt_comp_avg) {
867
0
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
868
0
          tmp = tmp >> DIST_PRECISION_BITS;
869
0
        } else {
870
0
          tmp += res;
871
0
          tmp = tmp >> 1;
872
0
        }
873
0
        tmp -= round_offset;
874
0
        dst16[y * dst16_stride + x] =
875
0
            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
876
0
      } else {
877
0
        dst[y * dst_stride + x] = res;
878
0
      }
879
0
    }
880
0
  }
881
0
}
882
883
// High bit-depth "copy" variant: no filtering is applied. Each source sample
// is shifted left by `bits` and biased by round_offset. With do_average clear
// the value goes to conv_params->dst; otherwise it is combined with the
// stored value (weighted when use_jnt_comp_avg is set, plain average
// otherwise), the bias is removed and the clipped pixel is written to dst16.
// All filter and sub-pel arguments are unused.
void av1_highbd_jnt_convolve_2d_copy_c(
    const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
    int w, int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_q4;
  (void)subpel_y_q4;

  CONV_BUF_TYPE *const dst = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_1 - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  assert(bits >= 0);

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      CONV_BUF_TYPE res = src[row * src_stride + col] << bits;
      res += round_offset;

      if (!conv_params->do_average) {
        dst[row * dst_stride + col] = res;
        continue;
      }

      int32_t tmp = dst[row * dst_stride + col];
      if (conv_params->use_jnt_comp_avg) {
        tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
        tmp = tmp >> DIST_PRECISION_BITS;
      } else {
        tmp = (tmp + res) >> 1;
      }
      tmp -= round_offset;
      dst16[row * dst16_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
    }
  }
}
923
924
void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
925
                                    uint16_t *dst, int dst_stride, int w, int h,
926
                                    const InterpFilterParams *filter_params_x,
927
                                    const InterpFilterParams *filter_params_y,
928
                                    const int subpel_x_qn, const int x_step_qn,
929
                                    const int subpel_y_qn, const int y_step_qn,
930
0
                                    ConvolveParams *conv_params, int bd) {
931
0
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
932
0
  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
933
0
             filter_params_y->taps;
934
0
  int im_stride = w;
935
0
  const int fo_vert = filter_params_y->taps / 2 - 1;
936
0
  const int fo_horiz = filter_params_x->taps / 2 - 1;
937
0
  CONV_BUF_TYPE *dst16 = conv_params->dst;
938
0
  const int dst16_stride = conv_params->dst_stride;
939
0
  const int bits =
940
0
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
941
0
  assert(bits >= 0);
942
0
  // horizontal filter
943
0
  const uint16_t *src_horiz = src - fo_vert * src_stride;
944
0
  for (int y = 0; y < im_h; ++y) {
945
0
    int x_qn = subpel_x_qn;
946
0
    for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
947
0
      const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
948
0
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
949
0
      assert(x_filter_idx < SUBPEL_SHIFTS);
950
0
      const int16_t *x_filter =
951
0
          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
952
0
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
953
0
      for (int k = 0; k < filter_params_x->taps; ++k) {
954
0
        sum += x_filter[k] * src_x[k - fo_horiz];
955
0
      }
956
0
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
957
0
      im_block[y * im_stride + x] =
958
0
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
959
0
    }
960
0
    src_horiz += src_stride;
961
0
  }
962
0
963
0
  // vertical filter
964
0
  int16_t *src_vert = im_block + fo_vert * im_stride;
965
0
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
966
0
  for (int x = 0; x < w; ++x) {
967
0
    int y_qn = subpel_y_qn;
968
0
    for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
969
0
      const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
970
0
      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
971
0
      assert(y_filter_idx < SUBPEL_SHIFTS);
972
0
      const int16_t *y_filter =
973
0
          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
974
0
      int32_t sum = 1 << offset_bits;
975
0
      for (int k = 0; k < filter_params_y->taps; ++k) {
976
0
        sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
977
0
      }
978
0
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
979
0
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
980
0
      if (conv_params->is_compound) {
981
0
        if (conv_params->do_average) {
982
0
          int32_t tmp = dst16[y * dst16_stride + x];
983
0
          if (conv_params->use_jnt_comp_avg) {
984
0
            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
985
0
            tmp = tmp >> DIST_PRECISION_BITS;
986
0
          } else {
987
0
            tmp += res;
988
0
            tmp = tmp >> 1;
989
0
          }
990
0
          /* Subtract round offset and convolve round */
991
0
          tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
992
0
                       (1 << (offset_bits - conv_params->round_1 - 1)));
993
0
          dst[y * dst_stride + x] =
994
0
              clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
995
0
        } else {
996
0
          dst16[y * dst16_stride + x] = res;
997
0
        }
998
0
      } else {
999
0
        /* Subtract round offset and convolve round */
1000
0
        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
1001
0
                             (1 << (offset_bits - conv_params->round_1 - 1)));
1002
0
        dst[y * dst_stride + x] =
1003
0
            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
1004
0
      }
1005
0
    }
1006
0
    src_vert++;
1007
0
  }
1008
0
}
1009
1010
static void highbd_convolve_2d_for_intrabc(const uint16_t *src, int src_stride,
1011
                                           uint16_t *dst, int dst_stride, int w,
1012
                                           int h, int subpel_x_q4,
1013
                                           int subpel_y_q4,
1014
                                           ConvolveParams *conv_params,
1015
0
                                           int bd) {
1016
0
  const InterpFilterParams *filter_params_x =
1017
0
      subpel_x_q4 ? &av1_intrabc_filter_params : NULL;
1018
0
  const InterpFilterParams *filter_params_y =
1019
0
      subpel_y_q4 ? &av1_intrabc_filter_params : NULL;
1020
0
  if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
1021
0
    av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
1022
0
                                filter_params_x, filter_params_y, 0, 0,
1023
0
                                conv_params, bd);
1024
0
  } else if (subpel_x_q4 != 0) {
1025
0
    av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
1026
0
                               filter_params_x, filter_params_y, 0, 0,
1027
0
                               conv_params, bd);
1028
0
  } else {
1029
0
    av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
1030
0
                               filter_params_x, filter_params_y, 0, 0,
1031
0
                               conv_params, bd);
1032
0
  }
1033
0
}
1034
1035
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
1036
                                   uint8_t *dst8, int dst_stride, int w, int h,
1037
                                   InterpFilters interp_filters,
1038
                                   const int subpel_x_q4, int x_step_q4,
1039
                                   const int subpel_y_q4, int y_step_q4,
1040
                                   int scaled, ConvolveParams *conv_params,
1041
                                   const struct scale_factors *sf,
1042
0
                                   int is_intrabc, int bd) {
1043
0
  assert(IMPLIES(is_intrabc, !scaled));
1044
0
  (void)x_step_q4;
1045
0
  (void)y_step_q4;
1046
0
  (void)dst_stride;
1047
0
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1048
0
1049
0
  if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) {
1050
0
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1051
0
    highbd_convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h,
1052
0
                                   subpel_x_q4, subpel_y_q4, conv_params, bd);
1053
0
    return;
1054
0
  }
1055
0
1056
0
  InterpFilter filter_x = 0;
1057
0
  InterpFilter filter_y = 0;
1058
0
  const int need_filter_params_x = (subpel_x_q4 != 0) | scaled;
1059
0
  const int need_filter_params_y = (subpel_y_q4 != 0) | scaled;
1060
0
  if (need_filter_params_x)
1061
0
    filter_x = av1_extract_interp_filter(interp_filters, 1);
1062
0
  if (need_filter_params_y)
1063
0
    filter_y = av1_extract_interp_filter(interp_filters, 0);
1064
0
  const InterpFilterParams *filter_params_x =
1065
0
      need_filter_params_x
1066
0
          ? av1_get_interp_filter_params_with_block_size(filter_x, w)
1067
0
          : NULL;
1068
0
  const InterpFilterParams *filter_params_y =
1069
0
      need_filter_params_y
1070
0
          ? av1_get_interp_filter_params_with_block_size(filter_y, h)
1071
0
          : NULL;
1072
0
1073
0
  if (scaled) {
1074
0
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1075
0
    if (conv_params->is_compound) {
1076
0
      assert(conv_params->dst != NULL);
1077
0
    }
1078
0
    av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
1079
0
                                 filter_params_x, filter_params_y, subpel_x_q4,
1080
0
                                 x_step_q4, subpel_y_q4, y_step_q4, conv_params,
1081
0
                                 bd);
1082
0
  } else {
1083
0
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1084
0
1085
0
    sf->highbd_convolve[subpel_x_q4 != 0][subpel_y_q4 !=
1086
0
                                          0][conv_params->is_compound](
1087
0
        src, src_stride, dst, dst_stride, w, h, filter_params_x,
1088
0
        filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd);
1089
0
  }
1090
0
}
1091
1092
// Note: Fixed size intermediate buffers, place limits on parameters
1093
// of some functions. 2d filtering proceeds in 2 steps:
1094
//   (1) Interpolate horizontally into an intermediate buffer, temp.
1095
//   (2) Interpolate temp vertically to derive the sub-pixel result.
1096
// Deriving the maximum number of rows in the temp buffer (263):
1097
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
1098
// --Largest block size is 128x128 pixels.
1099
// --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
1100
//   original frame (in 1/16th pixel units).
1101
// --Must round-up because block may be located at sub-pixel position.
1102
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
1103
// --((128 - 1) * 32 + 15) >> 4 + 8 = 263.
1104
#define WIENER_MAX_EXT_SIZE 263
1105
1106
0
static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
1107
0
  int sum = 0;
1108
0
  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1109
0
  return sum;
1110
0
}
1111
1112
static INLINE int highbd_horz_scalar_product(const uint16_t *a,
1113
0
                                             const int16_t *b) {
1114
0
  int sum = 0;
1115
0
  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1116
0
  return sum;
1117
0
}
1118
1119
static INLINE int highbd_vert_scalar_product(const uint16_t *a,
1120
                                             ptrdiff_t a_stride,
1121
0
                                             const int16_t *b) {
1122
0
  int sum = 0;
1123
0
  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
1124
0
  return sum;
1125
0
}
1126
1127
0
// Returns the start of the kernel table containing `filter` by clearing the
// low 8 bits of its address.
static const InterpKernel *get_filter_base(const int16_t *filter) {
  // NOTE: This assumes that the filter table is 256-byte aligned.
  // TODO(agrange) Modify to make independent of table alignment.
  const intptr_t low_bits_cleared = ~((intptr_t)0xFF);
  const intptr_t address = (intptr_t)filter;
  return (const InterpKernel *)(address & low_bits_cleared);
}
1132
1133
0
// Returns the index of kernel `f` within the table that starts at `base`,
// measured in whole InterpKernel entries.
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  const InterpKernel *const kernel = (const InterpKernel *)(intptr_t)f;
  return (int)(kernel - base);
}
1136
1137
static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
1138
                                       uint16_t *dst, ptrdiff_t dst_stride,
1139
                                       const InterpKernel *x_filters, int x0_q4,
1140
                                       int x_step_q4, int w, int h,
1141
0
                                       int round0_bits) {
1142
0
  const int bd = 8;
1143
0
  src -= SUBPEL_TAPS / 2 - 1;
1144
0
  for (int y = 0; y < h; ++y) {
1145
0
    int x_q4 = x0_q4;
1146
0
    for (int x = 0; x < w; ++x) {
1147
0
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1148
0
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1149
0
      const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1150
0
                           (1 << (bd + FILTER_BITS - 1));
1151
0
      const int sum = horz_scalar_product(src_x, x_filter) + rounding;
1152
0
      dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1153
0
                               WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
1154
0
      x_q4 += x_step_q4;
1155
0
    }
1156
0
    src += src_stride;
1157
0
    dst += dst_stride;
1158
0
  }
1159
0
}
1160
1161
static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
1162
                                      uint8_t *dst, ptrdiff_t dst_stride,
1163
                                      const InterpKernel *y_filters, int y0_q4,
1164
                                      int y_step_q4, int w, int h,
1165
0
                                      int round1_bits) {
1166
0
  const int bd = 8;
1167
0
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1168
0
1169
0
  for (int x = 0; x < w; ++x) {
1170
0
    int y_q4 = y0_q4;
1171
0
    for (int y = 0; y < h; ++y) {
1172
0
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1173
0
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1174
0
      const int rounding =
1175
0
          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1176
0
          (1 << (bd + round1_bits - 1));
1177
0
      const int sum =
1178
0
          highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1179
0
      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
1180
0
      y_q4 += y_step_q4;
1181
0
    }
1182
0
    ++src;
1183
0
    ++dst;
1184
0
  }
1185
0
}
1186
1187
void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
1188
                                   uint8_t *dst, ptrdiff_t dst_stride,
1189
                                   const int16_t *filter_x, int x_step_q4,
1190
                                   const int16_t *filter_y, int y_step_q4,
1191
                                   int w, int h,
1192
0
                                   const ConvolveParams *conv_params) {
1193
0
  const InterpKernel *const filters_x = get_filter_base(filter_x);
1194
0
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
1195
0
1196
0
  const InterpKernel *const filters_y = get_filter_base(filter_y);
1197
0
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
1198
0
1199
0
  uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1200
0
  const int intermediate_height =
1201
0
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
1202
0
  memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
1203
0
1204
0
  assert(w <= MAX_SB_SIZE);
1205
0
  assert(h <= MAX_SB_SIZE);
1206
0
  assert(y_step_q4 <= 32);
1207
0
  assert(x_step_q4 <= 32);
1208
0
1209
0
  convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1210
0
                             src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4,
1211
0
                             x_step_q4, w, intermediate_height,
1212
0
                             conv_params->round_0);
1213
0
  convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
1214
0
                            MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4,
1215
0
                            y_step_q4, w, h, conv_params->round_1);
1216
0
}
1217
1218
static void highbd_convolve_add_src_horiz_hip(
1219
    const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
1220
    ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
1221
0
    int x_step_q4, int w, int h, int round0_bits, int bd) {
1222
0
  const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
1223
0
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1224
0
  src -= SUBPEL_TAPS / 2 - 1;
1225
0
  for (int y = 0; y < h; ++y) {
1226
0
    int x_q4 = x0_q4;
1227
0
    for (int x = 0; x < w; ++x) {
1228
0
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1229
0
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1230
0
      const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1231
0
                           (1 << (bd + FILTER_BITS - 1));
1232
0
      const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
1233
0
      dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1234
0
                               extraprec_clamp_limit - 1);
1235
0
      x_q4 += x_step_q4;
1236
0
    }
1237
0
    src += src_stride;
1238
0
    dst += dst_stride;
1239
0
  }
1240
0
}
1241
1242
static void highbd_convolve_add_src_vert_hip(
1243
    const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
1244
    ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
1245
0
    int y_step_q4, int w, int h, int round1_bits, int bd) {
1246
0
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1247
0
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1248
0
  for (int x = 0; x < w; ++x) {
1249
0
    int y_q4 = y0_q4;
1250
0
    for (int y = 0; y < h; ++y) {
1251
0
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1252
0
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1253
0
      const int rounding =
1254
0
          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1255
0
          (1 << (bd + round1_bits - 1));
1256
0
      const int sum =
1257
0
          highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1258
0
      dst[y * dst_stride] =
1259
0
          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
1260
0
      y_q4 += y_step_q4;
1261
0
    }
1262
0
    ++src;
1263
0
    ++dst;
1264
0
  }
1265
0
}
1266
1267
void av1_highbd_wiener_convolve_add_src_c(
1268
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1269
    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
1270
    const int16_t *filter_y, int y_step_q4, int w, int h,
1271
0
    const ConvolveParams *conv_params, int bd) {
1272
0
  const InterpKernel *const filters_x = get_filter_base(filter_x);
1273
0
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
1274
0
1275
0
  const InterpKernel *const filters_y = get_filter_base(filter_y);
1276
0
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
1277
0
1278
0
  uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1279
0
  const int intermediate_height =
1280
0
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1281
0
1282
0
  assert(w <= MAX_SB_SIZE);
1283
0
  assert(h <= MAX_SB_SIZE);
1284
0
  assert(y_step_q4 <= 32);
1285
0
  assert(x_step_q4 <= 32);
1286
0
  assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
1287
0
1288
0
  highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1289
0
                                    src_stride, temp, MAX_SB_SIZE, filters_x,
1290
0
                                    x0_q4, x_step_q4, w, intermediate_height,
1291
0
                                    conv_params->round_0, bd);
1292
0
  highbd_convolve_add_src_vert_hip(
1293
0
      temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
1294
0
      filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
1295
0
}