Coverage Report

Created: 2025-06-13 07:07

/src/aom/aom_dsp/x86/highbd_convolve_avx2.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
#include <immintrin.h>
12
#include <string.h>
13
14
#include "config/av1_rtcd.h"
15
16
#include "aom_dsp/x86/convolve.h"
17
#include "aom_dsp/x86/convolve_avx2.h"
18
#include "aom_dsp/x86/synonyms.h"
19
20
// -----------------------------------------------------------------------------
21
// Copy and average
22
23
// In-lane byte-shuffle mask that pairs adjacent 16-bit samples
// ({0,1},{1,2},{2,3},{3,4} as byte indices), identical in both 128-bit
// lanes; used with the filter-tap pair 2/3 in the 4-tap horizontal path.
static const uint8_t ip_shuffle_f2f3[32] = {
  0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9,  // lane 0
  0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9   // lane 1 (same pattern)
};
26
// Companion mask to ip_shuffle_f2f3, shifted two samples (4 bytes) right;
// pairs adjacent 16-bit samples for the filter-tap pair 4/5. Both 128-bit
// lanes carry the same pattern.
static const uint8_t ip_shuffle_f4f5[32] = {
  4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13,  // lane 0
  4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13   // lane 1
};
30
31
void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
32
                                    uint16_t *dst, int dst_stride, int w, int h,
33
                                    const InterpFilterParams *filter_params_x,
34
                                    const int subpel_x_qn,
35
                                    ConvolveParams *conv_params, int bd);
36
void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
37
                                    uint16_t *dst, int dst_stride, int w, int h,
38
                                    const InterpFilterParams *filter_params_y,
39
                                    const int subpel_y_qn, int bd);
40
41
// Vertical-only single-reference high-bitdepth convolution (AVX2).
// Works on 8-pixel-wide columns, producing two output rows per inner-loop
// iteration. A sliding window of row-pair interleavings is kept in s[]:
// s[0..3] hold the low halves and s[4..7] the high halves of consecutive
// row pairs, so convolve() can madd each pair against one tap pair.
// 12-tap filters are not supported here and are delegated to SSSE3.
void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_y,
                                   const int subpel_y_qn, int bd) {
  if (filter_params_y->taps == 12) {
    // This kernel only handles up to 8 taps; use the SSSE3 12-tap path.
    av1_highbd_convolve_y_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_y, subpel_y_qn, bd);
    return;
  }
  int i, j;
  // Rows above the output row covered by the filter window.
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_vert * src_stride;

  __m256i s[8], coeffs_y[4];

  const int bits = FILTER_BITS;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  // Saturation limit for the output bit depth (8, 10, or 12 bits).
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);

  for (j = 0; j < w; j += 8) {
    const uint16_t *data = &src_ptr[j];
    /* Vertical filter */
    {
      // Prime the window: load rows 0..6 and build the (row k, row k+1)
      // pairs with row k in lane 0 and row k+1 in lane 1.
      __m256i src6;
      __m256i s01 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
          0x20);
      __m256i s12 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
          0x20);
      __m256i s23 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
          0x20);
      __m256i s34 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
          0x20);
      __m256i s45 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
          0x20);
      // Row 6 is kept separately so it can be reused next iteration.
      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
      __m256i s56 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
          src6, 0x20);

      // Interleave each row pair: low 4 columns in s[0..2], high 4 in
      // s[4..6]; s[3]/s[7] are filled with the newest pair in the loop.
      s[0] = _mm256_unpacklo_epi16(s01, s12);
      s[1] = _mm256_unpacklo_epi16(s23, s34);
      s[2] = _mm256_unpacklo_epi16(s45, s56);

      s[4] = _mm256_unpackhi_epi16(s01, s12);
      s[5] = _mm256_unpackhi_epi16(s23, s34);
      s[6] = _mm256_unpackhi_epi16(s45, s56);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];

        // Bring rows 7 and 8 into the window.
        const __m256i s67 = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));

        const __m256i s78 = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            src6, 0x20);

        s[3] = _mm256_unpacklo_epi16(s67, s78);
        s[7] = _mm256_unpackhi_epi16(s67, s78);

        // Filter the low 4 columns of both output rows.
        const __m256i res_a = convolve(s, coeffs_y);

        __m256i res_a_round = _mm256_sra_epi32(
            _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);

        if (w - j > 4) {
          // Full 8-wide column: also filter the high 4 columns, then pack,
          // clamp to [0, clip_pixel] and store 8 pixels per row.
          const __m256i res_b = convolve(s + 4, coeffs_y);
          __m256i res_b_round = _mm256_sra_epi32(
              _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);

          __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
          res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
          res_16bit = _mm256_max_epi16(res_16bit, zero);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
                           _mm256_castsi256_si128(res_16bit));
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           _mm256_extracti128_si256(res_16bit, 1));
        } else if (w == 4) {
          // 4-wide block: store 4 pixels (64 bits) per row.
          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
          res_a_round = _mm256_max_epi16(res_a_round, zero);

          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
                           _mm256_castsi256_si128(res_a_round));
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           _mm256_extracti128_si256(res_a_round, 1));
        } else {
          // 2-wide block: store 2 pixels (32 bits) per row.
          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
          res_a_round = _mm256_max_epi16(res_a_round, zero);

          xx_storel_32(&dst[i * dst_stride + j],
                       _mm256_castsi256_si128(res_a_round));
          xx_storel_32(&dst[i * dst_stride + j + dst_stride],
                       _mm256_extracti128_si256(res_a_round, 1));
        }

        // Slide the row window down by two rows.
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}
185
186
// Horizontal-only single-reference high-bitdepth convolution (AVX2).
// Two input rows are loaded per iteration; even- and odd-indexed output
// pixels are convolved separately (via byte-aligned shifts of the row
// data) and interleaved back before clamping and storing. 12-tap filters
// are delegated to the SSSE3 implementation.
void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const int subpel_x_qn,
                                   ConvolveParams *conv_params, int bd) {
  if (filter_params_x->taps == 12) {
    av1_highbd_convolve_x_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_x, subpel_x_qn, conv_params,
                                   bd);
    return;
  }
  int i, j;
  // Columns left of the output pixel covered by the filter window.
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_horiz;

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  __m256i s[4], coeffs_x[4];

  // First rounding stage (conv_params->round_0 bits) ...
  const __m256i round_const_x =
      _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);

  // ... then the remaining FILTER_BITS - round_0 bits.
  const int bits = FILTER_BITS - conv_params->round_0;
  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  // Saturation limit for the output bit depth (8, 10, or 12 bits).
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);

  for (j = 0; j < w; j += 8) {
    /* Horizontal filter */
    for (i = 0; i < h; i += 2) {
      const __m256i row0 =
          _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
      __m256i row1 =
          _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);

      // r0: first 8 samples of each row; r1: next 8 samples of each row.
      const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
      const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);

      // even pixels
      s[0] = _mm256_alignr_epi8(r1, r0, 0);
      s[1] = _mm256_alignr_epi8(r1, r0, 4);
      s[2] = _mm256_alignr_epi8(r1, r0, 8);
      s[3] = _mm256_alignr_epi8(r1, r0, 12);

      __m256i res_even = convolve(s, coeffs_x);
      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
                                  round_shift_x);

      // odd pixels
      s[0] = _mm256_alignr_epi8(r1, r0, 2);
      s[1] = _mm256_alignr_epi8(r1, r0, 6);
      s[2] = _mm256_alignr_epi8(r1, r0, 10);
      s[3] = _mm256_alignr_epi8(r1, r0, 14);

      __m256i res_odd = convolve(s, coeffs_x);
      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
                                 round_shift_x);

      // Second rounding stage for both phases.
      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits),
                                  round_shift_bits);
      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits),
                                 round_shift_bits);

      __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
      __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);

      // Re-interleave even/odd results into pixel order, then clamp.
      __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
      res = _mm256_min_epi16(res, clip_pixel);
      res = _mm256_max_epi16(res, zero);

      if (w - j > 4) {
        // 8 pixels per row: lane 0 is row i, lane 1 is row i+1.
        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
                         _mm256_castsi256_si128(res));
        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                         _mm256_extracti128_si256(res, 1));
      } else if (w == 4) {
        // 4 pixels (64 bits) per row.
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
                         _mm256_castsi256_si128(res));
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                         _mm256_extracti128_si256(res, 1));
      } else {
        // 2 pixels (32 bits) per row.
        xx_storel_32(&dst[i * dst_stride + j], _mm256_castsi256_si128(res));
        xx_storel_32(&dst[i * dst_stride + j + dst_stride],
                     _mm256_extracti128_si256(res, 1));
      }
    }
  }
}
285
286
0
#define CONV8_ROUNDING_BITS (7)
287
288
// -----------------------------------------------------------------------------
289
// Horizontal and vertical filtering
290
291
// In-lane byte shuffle pairing adjacent 16-bit samples starting at sample
// 0 ({0,1},{1,2},{2,3},{3,4} as sample indices); identical in both lanes.
static const uint8_t signal_pattern_0[32] = {
  0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9,  // lane 0
  0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9   // lane 1 (same pattern)
};
294
295
// Same pairing as signal_pattern_0, shifted two samples (4 bytes) right.
static const uint8_t signal_pattern_1[32] = {
  4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13,  // lane 0
  4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13   // lane 1
};
299
300
// Same pairing shifted three samples (6 bytes) right; used by the 2-tap
// packing helpers.
static const uint8_t signal_pattern_2[32] = {
  6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15,  // lane 0
  6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15   // lane 1
};
304
305
// vpermd index vector selecting 32-bit elements 2..5 into both halves of
// a 256-bit register (makes higher sample offsets reachable by in-lane
// byte shuffles).
static const uint32_t signal_index[8] = {
  2, 3, 4, 5,  // low half
  2, 3, 4, 5   // high half
};
306
307
// -----------------------------------------------------------------------------
308
// Horizontal Filtering
309
310
0
// Repack one 16-sample row into four registers of adjacent-sample pairs,
// one per filter tap pair, ready for _mm256_madd_epi16.
static inline void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
  const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
  // Rotate 32-bit elements 2..5 into both lanes so the larger sample
  // offsets are reachable by the in-lane byte shuffles below.
  const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);

  p[0] = _mm256_shuffle_epi8(*s, sf0);  // x0x6
  p[1] = _mm256_shuffle_epi8(*s, sf1);  // x1x7
  p[2] = _mm256_shuffle_epi8(c, sf0);   // x2x4
  p[3] = _mm256_shuffle_epi8(c, sf1);   // x3x5
}
321
322
// Note:
323
//  Shared by 8x2 and 16x1 block
324
// Note:
//  Shared by 8x2 and 16x1 block
// Packs two 16-sample inputs into the eight registers filter_8x1_pixels
// expects: x[0..3] drive the first result, x[4..7] the second. x[2]/x[3]
// are reused as x[4]/x[5] because those tap-pair packings are shared.
static inline void pack_16_pixels(const __m256i *s0, const __m256i *s1,
                                  __m256i *x /*x[8]*/) {
  __m256i pp[8];
  pack_pixels(s0, pp);
  pack_pixels(s1, &pp[4]);
  // Merge the low lanes of both inputs ...
  x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
  x[4] = x[2];
  x[5] = x[3];
  // ... and the high lanes.
  x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
  x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
}
338
339
0
// Pack a single 8-wide row for the 8-tap filter (odd-height tail case).
static inline void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
  __m256i pp[8];
  __m256i s0;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  pack_pixels(&s0, pp);
  // Cross-combine lanes of the packed pairs into the x[0..3] layout
  // filter_8x1_pixels expects for one row.
  x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
}
349
350
// Load two consecutive rows of 16-bit samples and repack them via the
// layout shared with the 16x1 variant (see pack_16_pixels).
static inline void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
                                   __m256i *x) {
  const __m256i row_top = _mm256_loadu_si256((const __m256i *)src);
  const __m256i row_bot = _mm256_loadu_si256((const __m256i *)(src + stride));
  pack_16_pixels(&row_top, &row_bot, x);
}
357
358
0
// Load one 16-wide row as two overlapping 16-sample windows (offset by 8)
// and repack them via the layout shared with the 8x2 variant.
static inline void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
  const __m256i first_half = _mm256_loadu_si256((const __m256i *)src);
  const __m256i second_half = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_pixels(&first_half, &second_half, x);
}
364
365
// Note:
366
//  Shared by horizontal and vertical filtering
367
0
// Note:
//  Shared by horizontal and vertical filtering
// Broadcast the 8 16-bit filter taps into four registers, each holding
// one tap pair (0/1, 2/3, 4/5, 6/7) replicated across every 32-bit slot.
static inline void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
  // Each pattern selects one 4-byte tap pair and splats it.
  const __m256i p0 = _mm256_set1_epi32(0x03020100);
  const __m256i p1 = _mm256_set1_epi32(0x07060504);
  const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
  const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
  f[0] = _mm256_shuffle_epi8(hh, p0);
  f[1] = _mm256_shuffle_epi8(hh, p1);
  f[2] = _mm256_shuffle_epi8(hh, p2);
  f[3] = _mm256_shuffle_epi8(hh, p3);
}
379
380
// Broadcast the middle tap pairs (2/3 and 4/5) of an 8-tap filter array
// for the 4-tap kernels, replicated across every 32-bit slot.
static inline void pack_filters_4tap(const int16_t *filter,
                                     __m256i *f /*f[4]*/) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m256i coeff = _mm256_broadcastsi128_si256(h);

  // coeffs 2 3 2 3 2 3 2 3
  f[0] = _mm256_shuffle_epi32(coeff, 0x55);
  // coeffs 4 5 4 5 4 5 4 5
  f[1] = _mm256_shuffle_epi32(coeff, 0xaa);
}
390
391
// Apply the 8-tap filter (four tap pairs in fil[]) to the packed sample
// pairs in sig[], then round and shift down by CONV8_ROUNDING_BITS.
static inline void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
                                     const __m256i *fil /*fil[4]*/,
                                     __m256i *y) {
  __m256i a, a0, a1;

  // Outer tap pairs first.
  a0 = _mm256_madd_epi16(fil[0], sig[0]);
  a1 = _mm256_madd_epi16(fil[3], sig[3]);
  a = _mm256_add_epi32(a0, a1);

  // Inner (largest-magnitude) tap pairs.
  a0 = _mm256_madd_epi16(fil[1], sig[1]);
  a1 = _mm256_madd_epi16(fil[2], sig[2]);

  // NOTE(review): the two inner products are accumulated via min then max,
  // which totals a0 + a1 either way — presumably ordered smallest-first to
  // keep 32-bit partial sums in range. Confirm before reordering.
  {
    const __m256i min = _mm256_min_epi32(a0, a1);
    a = _mm256_add_epi32(a, min);
  }
  {
    const __m256i max = _mm256_max_epi32(a0, a1);
    a = _mm256_add_epi32(a, max);
  }
  {
    // Round to nearest, then drop CONV8_ROUNDING_BITS of precision.
    const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
    a = _mm256_add_epi32(a, rounding);
    *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
  }
}
417
418
// Pack one filtered row from 32-bit to unsigned 16-bit, clamp to the
// bit-depth mask, and store 8 pixels.
static inline void store_8x1_pixels(const __m256i *y, const __m256i *mask,
                                    uint16_t *dst) {
  const __m128i lo_half = _mm256_castsi256_si128(*y);
  const __m128i hi_half = _mm256_extractf128_si256(*y, 1);
  __m128i pixels = _mm_packus_epi32(lo_half, hi_half);
  pixels = _mm_min_epi16(pixels, _mm256_castsi256_si128(*mask));
  _mm_storeu_si128((__m128i *)dst, pixels);
}
426
427
// Pack two filtered rows from 32-bit to unsigned 16-bit, clamp to the
// bit-depth mask, and store 8 pixels to each of two consecutive rows.
static inline void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
                                    const __m256i *mask, uint16_t *dst,
                                    ptrdiff_t pitch) {
  __m256i pixels = _mm256_packus_epi32(*y0, *y1);
  pixels = _mm256_min_epi16(pixels, *mask);
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(pixels));
  _mm_storeu_si128((__m128i *)(dst + pitch),
                   _mm256_extractf128_si256(pixels, 1));
}
435
436
// Pack two filtered 8-pixel halves into one 16-pixel row, clamp to the
// bit-depth mask, and store.
static inline void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst) {
  __m256i pixels = _mm256_packus_epi32(*y0, *y1);
  pixels = _mm256_min_epi16(pixels, *mask);
  _mm256_storeu_si256((__m256i *)dst, pixels);
}
442
443
// 8-tap horizontal filter, 8-pixel-wide column, two rows per iteration,
// with a single-row tail when height is odd.
static void aom_highbd_filter_block1d8_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  // Clamp mask: max pixel value for this bit depth.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  // Back up taps/2 - 1 = 3 samples so the window is centered.
  src_ptr -= 3;
  do {
    pack_8x2_pixels(src_ptr, src_pitch, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  // Odd height: one remaining row.
  if (height > 0) {
    pack_8x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}
469
470
// 8-tap horizontal filter, 16-pixel-wide column, one row per iteration.
static void aom_highbd_filter_block1d16_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  // Clamp mask: max pixel value for this bit depth.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  // Back up taps/2 - 1 = 3 samples so the window is centered.
  src_ptr -= 3;
  do {
    pack_16x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}
490
491
// 4-tap horizontal filter, 4-pixel-wide column, two rows per iteration
// plus a single-row tail. The 4 active taps sit at positions 2..5 of the
// 8-tap filter array (see pack_filters_4tap), so loads at src_ptr[-3 + 2]
// center the 4-tap window on the output pixel.
static void aom_highbd_filter_block1d4_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i ff[2], s[2];
  uint32_t i;
  // Saturation limit for the output bit depth (8, 10, or 12 bits).
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  // Pairs adjacent 16-bit samples within each 128-bit lane.
  static const uint8_t shuffle_mask[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                            7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                            4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };

  __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask);
  __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3);
  __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5);

  pack_filters_4tap(filter, ff);
  src_ptr -= 3;
  for (i = 0; i <= (height - 2); i += 2) {
    // Row i in lane 0, row i+1 in lane 1.
    __m256i row0 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2]));
    __m256i row1 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src_ptr[(i + 1) * src_pitch + 2]));

    s[0] = _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1);
    // Shift two samples right for the second tap pair.
    s[1] = _mm256_alignr_epi8(s[0], s[0], 4);

    s[0] = _mm256_shuffle_epi8(s[0], mask);
    s[1] = _mm256_shuffle_epi8(s[1], mask);

    __m256i res = convolve_4tap(s, ff);
    res =
        _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);

    // Pack to 16-bit and clamp to [0, clip_pixel].
    res = _mm256_packs_epi32(res, res);
    res = _mm256_min_epi16(res, clip_pixel);
    res = _mm256_max_epi16(res, zero);

    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
                     _mm256_castsi256_si128(res));
    _mm_storel_epi64((__m128i *)&dst_ptr[(i + 1) * dst_pitch],
                     _mm256_extracti128_si256(res, 1));
  }
  // Odd height: filter the final row alone using the precomputed
  // f2f3/f4f5 pairing masks.
  if (height % 2 != 0) {
    i = height - 1;
    const __m256i row0_0 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2]));
    const __m256i row0_1 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 6]));

    const __m256i r0 =
        _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1);

    s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3);
    s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5);

    __m256i res = convolve_4tap(s, ff);
    res =
        _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);

    res = _mm256_packs_epi32(res, res);
    res = _mm256_min_epi16(res, clip_pixel);
    res = _mm256_max_epi16(res, zero);

    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
                     _mm256_castsi256_si128(res));
  }
}
561
562
// 4-tap horizontal filter, 8-pixel-wide column, two rows per iteration
// plus a single-row tail. Even and odd output pixels are convolved
// separately and re-interleaved via shuffle_mask. The 4 active taps sit
// at positions 2..5 of the 8-tap array (see pack_filters_4tap).
static void aom_highbd_filter_block1d8_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i ff[2], s[2];
  uint32_t i = 0;
  // Saturation limit for the output bit depth (8, 10, or 12 bits).
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  // Interleaves the packed even/odd results back into pixel order.
  static const uint8_t shuffle_mask[32] = { 0, 1, 8,  9,  2, 3, 10, 11,
                                            4, 5, 12, 13, 6, 7, 14, 15,
                                            0, 1, 8,  9,  2, 3, 10, 11,
                                            4, 5, 12, 13, 6, 7, 14, 15 };

  __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask);
  __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3);
  __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5);

  pack_filters_4tap(filter, ff);
  src_ptr -= 3;

  /* Horizontal filter */

  for (i = 0; i <= (height - 2); i += 2) {
    const __m256i row0 =
        _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]);
    __m256i row1 =
        _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_pitch + 2]);

    // r0: first 8 samples of each row; r1: next 8 samples of each row.
    const __m256i r0 =
        _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1);
    const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);

    // even pixels
    s[0] = r0;
    s[1] = _mm256_alignr_epi8(r1, r0, 4);

    __m256i res_even = convolve_4tap(s, ff);
    res_even = _mm256_srai_epi32(_mm256_add_epi32(res_even, rounding),
                                 CONV8_ROUNDING_BITS);

    // odd pixels
    s[0] = _mm256_alignr_epi8(r1, r0, 2);
    s[1] = _mm256_alignr_epi8(r1, r0, 6);

    __m256i res_odd = convolve_4tap(s, ff);
    res_odd = _mm256_srai_epi32(_mm256_add_epi32(res_odd, rounding),
                                CONV8_ROUNDING_BITS);

    // Pack to 16-bit, restore pixel order, clamp to [0, clip_pixel].
    __m256i res = _mm256_packs_epi32(res_even, res_odd);
    res = _mm256_shuffle_epi8(res, mask);

    res = _mm256_min_epi16(res, clip_pixel);
    res = _mm256_max_epi16(res, zero);

    _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch],
                     _mm256_castsi256_si128(res));
    _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
                     _mm256_extracti128_si256(res, 1));
  }

  // Odd height: filter the final row alone; pixels 0..3 come from lane 0,
  // pixels 4..7 from lane 1.
  if (height % 2 != 0) {
    i = height - 1;
    const __m256i row0_0 =
        _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]);
    const __m256i row0_1 =
        _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 6]);

    const __m256i r0 =
        _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1);

    s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3);
    s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5);

    __m256i res = convolve_4tap(s, ff);
    res =
        _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);

    res = _mm256_packs_epi32(res, res);
    res = _mm256_min_epi16(res, clip_pixel);
    res = _mm256_max_epi16(res, zero);

    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
                     _mm256_castsi256_si128(res));
    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + 4],
                     _mm256_extracti128_si256(res, 1));
  }
}
651
652
// 4-tap horizontal filter, 16-pixel-wide column: processed as two
// independent 8-wide halves.
static void aom_highbd_filter_block1d16_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  aom_highbd_filter_block1d8_h4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch,
                                     height, filter, bd);
  aom_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8,
                                     dst_pitch, height, filter, bd);
}
660
661
// -----------------------------------------------------------------------------
662
// 2-tap horizontal filtering
663
664
0
// Broadcast the middle tap pair (taps 3/4, bytes 6..9 of the 8-tap
// array) into every 32-bit slot for the 2-tap kernels.
static inline void pack_2t_filter(const int16_t *filter, __m256i *f) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
  const __m256i p = _mm256_set1_epi32(0x09080706);
  f[0] = _mm256_shuffle_epi8(hh, p);
}
670
671
// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels()
672
// the difference is s0/s1 specifies first and second rows or,
673
// first 16 samples and 8-sample shifted 16 samples
674
// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels()
// the difference is s0/s1 specifies first and second rows or,
// first 16 samples and 8-sample shifted 16 samples
// Packs adjacent-sample pairs (offset 3, matching the 2-tap filter
// position) for both inputs into sig[0]/sig[1].
static inline void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
                                     __m256i *sig) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
  __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
  // Rotate elements 2..5 into both lanes so the remaining sample offsets
  // are reachable by the in-lane shuffle.
  __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
  __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  r1 = _mm256_shuffle_epi8(r1, sf2);
  sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
  sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
}
687
688
// Load two consecutive rows and repack them for the 2-tap filter.
static inline void pack_8x2_2t_pixels(const uint16_t *src,
                                      const ptrdiff_t pitch, __m256i *sig) {
  const __m256i row_top = _mm256_loadu_si256((const __m256i *)src);
  const __m256i row_bot = _mm256_loadu_si256((const __m256i *)(src + pitch));
  pack_16_2t_pixels(&row_top, &row_bot, sig);
}
694
695
// Pack one 16-pixel row for the 2-tap horizontal filter: the second input
// is the same row shifted by 8 samples (see pack_16_2t_pixels() comment).
static inline void pack_16x1_2t_pixels(const uint16_t *src,
                                       __m256i *sig /*sig[2]*/) {
  const __m256i first16 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i shifted = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_2t_pixels(&first16, &shifted, sig);
}
701
702
// Pack a single 8-pixel row for the 2-tap horizontal filter.  Same shuffle
// scheme as pack_16_2t_pixels(), applied to one source register; only
// sig[0] is produced (filter_8x1_2t_pixels() reads just sig[0]).
static inline void pack_8x1_2t_pixels(const uint16_t *src,
                                      __m256i *sig /*sig[2]*/) {
  // Shared shuffle tables declared earlier in this file.
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  // Pair adjacent samples within each 128-bit lane.
  __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
  // Cross-lane reorder (pattern from signal_index), then pair again.
  r0 = _mm256_permutevar8x32_epi32(r0, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  // Merge low 128 bits of both partial packings into one register.
  sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
}
712
713
// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels()
714
static inline void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
715
0
                                       __m256i *y0, __m256i *y1) {
716
0
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
717
0
  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
718
0
  __m256i x1 = _mm256_madd_epi16(sig[1], *f);
719
0
  x0 = _mm256_add_epi32(x0, rounding);
720
0
  x1 = _mm256_add_epi32(x1, rounding);
721
0
  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
722
0
  *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
723
0
}
724
725
static inline void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
726
0
                                        __m256i *y0) {
727
0
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
728
0
  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
729
0
  x0 = _mm256_add_epi32(x0, rounding);
730
0
  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
731
0
}
732
733
// 2-tap horizontal filter for an 8-wide high bit-depth block.
// Processes two rows per iteration; a trailing single row (odd height) is
// handled after the loop.
static void aom_highbd_filter_block1d8_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  // Saturation limit for bd-bit pixels.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  // The 2-tap kernel occupies positions 3/4 of the 8-tap filter array
  // (see pack_2t_filter); rewinding the source by 3 keeps the samples
  // selected by the pack_*_2t_pixels shuffles aligned with those taps.
  src_ptr -= 3;
  do {
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  // Odd height: filter and store the final row on its own.
  if (height > 0) {
    pack_8x1_2t_pixels(src_ptr, signal);
    filter_8x1_2t_pixels(signal, &ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}
758
759
// 2-tap horizontal filter for a 16-wide high bit-depth block, one row per
// iteration.
static void aom_highbd_filter_block1d16_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  // Saturation limit for bd-bit pixels.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  // Compensate for the 2-tap kernel sitting at positions 3/4 of the 8-tap
  // filter array (matches the pack_16x1_2t_pixels shuffle layout).
  src_ptr -= 3;
  do {
    pack_16x1_2t_pixels(src_ptr, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}
778
779
// -----------------------------------------------------------------------------
780
// Vertical Filtering
781
782
0
// Prime the 8-wide vertical-filter pipeline: load rows 0..6, stack each
// pair of adjacent rows into one 256-bit register (row i in the low lane,
// row i+1 in the high lane), then interleave the 16-bit pixels of
// consecutive row pairs so _mm256_madd_epi16 can form 2-tap partial sums.
// sig[0..2]/sig[4..6] hold the lo/hi interleavings; sig[8] caches row 6
// for reuse by pack_8x9_pixels().
static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
  __m256i s1 =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
  __m256i s2 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
  __m256i s3 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
  __m256i s4 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
  __m256i s5 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
  __m256i s6 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));

  // After these inserts, sN holds rows {N, N+1} in its {low, high} lanes.
  s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
  s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
  s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
  s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
  s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);

  // Interleave pixels of row pairs; lo halves in sig[0..2], hi in sig[4..6].
  sig[0] = _mm256_unpacklo_epi16(s0, s1);
  sig[4] = _mm256_unpackhi_epi16(s0, s1);
  sig[1] = _mm256_unpacklo_epi16(s2, s3);
  sig[5] = _mm256_unpackhi_epi16(s2, s3);
  sig[2] = _mm256_unpacklo_epi16(s4, s5);
  sig[6] = _mm256_unpackhi_epi16(s4, s5);
  sig[8] = s6;
}
812
813
// Extend the rolling 8x9 window with two new rows (base + 7 and base + 8),
// producing the final interleavings sig[3]/sig[7]; sig[8] is updated to the
// newest row so the next call continues the rotation.
static inline void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                                   __m256i *sig) {
  // base + 7th row
  __m256i s0 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
  // base + 8th row
  __m256i s1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
  // s2 = rows {6, 7}, s3 = rows {7, 8} in {low, high} lanes.
  __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
  __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  sig[3] = _mm256_unpacklo_epi16(s2, s3);
  sig[7] = _mm256_unpackhi_epi16(s2, s3);
  sig[8] = s1;
}
827
828
// Run the 8-tap vertical filter over the low (sig[0..3]) and high
// (sig[4..7]) interleavings of the packed 8x9 window.
static inline void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
                                     __m256i *y0, __m256i *y1) {
  filter_8x1_pixels(sig + 0, f, y0);
  filter_8x1_pixels(sig + 4, f, y1);
}
833
834
0
// Shift the packed signal window down by one slot in both the lo (0..3)
// and hi (4..7) groups, making room for the next row pair.
static inline void update_pixels(__m256i *sig) {
  sig[0] = sig[1];
  sig[1] = sig[2];
  sig[2] = sig[3];
  sig[4] = sig[5];
  sig[5] = sig[6];
  sig[6] = sig[7];
}
841
842
// 8-tap vertical filter for an 8-wide high bit-depth block.  Maintains a
// 9-row sliding window in `signal`; each iteration emits two output rows
// and shifts the window down by two.  NOTE(review): height is decremented
// by 2 per iteration, so it is presumably always even here -- confirm
// against callers.
static void aom_highbd_filter_block1d8_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[9], res0, res1;
  // Saturation limit for bd-bit pixels.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  // Duplicate the 8 taps into 4 madd-ready registers (helper defined
  // earlier in this file).
  __m256i ff[4];
  pack_filters(filter, ff);

  // Preload and interleave rows 0..6.
  pack_8x9_init(src_ptr, src_pitch, signal);

  do {
    // Bring in rows 7 and 8 of the current window.
    pack_8x9_pixels(src_ptr, src_pitch, signal);

    filter_8x9_pixels(signal, ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}
865
866
0
// Prime the 16-wide vertical-filter pipeline.  Loads rows 0..6; for each
// pair of adjacent rows, _mm256_permute2x128_si256 with 0x20/0x31 stacks
// the low/high 128-bit halves of rows {i, i+1}, and unpacklo/unpackhi then
// interleave their 16-bit pixels for _mm256_madd_epi16.  Layout: sig[0..7]
// cover the low 8 pixels (lo/hi interleavings), sig[8..15] the high 8
// pixels; sig[16] caches row 6 for pack_16x9_pixels().
static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i u0, u1, u2, u3;
  // load 0-6 rows
  const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
  const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
  const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
  const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
  const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));

  u0 = _mm256_permute2x128_si256(s0, s1, 0x20);  // 0, 1 low
  u1 = _mm256_permute2x128_si256(s0, s1, 0x31);  // 0, 1 high

  u2 = _mm256_permute2x128_si256(s1, s2, 0x20);  // 1, 2 low
  u3 = _mm256_permute2x128_si256(s1, s2, 0x31);  // 1, 2 high

  sig[0] = _mm256_unpacklo_epi16(u0, u2);
  sig[4] = _mm256_unpackhi_epi16(u0, u2);

  sig[8] = _mm256_unpacklo_epi16(u1, u3);
  sig[12] = _mm256_unpackhi_epi16(u1, u3);

  // rows 2/3 and 3/4
  u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
  u1 = _mm256_permute2x128_si256(s2, s3, 0x31);

  u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
  u3 = _mm256_permute2x128_si256(s3, s4, 0x31);

  sig[1] = _mm256_unpacklo_epi16(u0, u2);
  sig[5] = _mm256_unpackhi_epi16(u0, u2);

  sig[9] = _mm256_unpacklo_epi16(u1, u3);
  sig[13] = _mm256_unpackhi_epi16(u1, u3);

  // rows 4/5 and 5/6
  u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
  u1 = _mm256_permute2x128_si256(s4, s5, 0x31);

  u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
  u3 = _mm256_permute2x128_si256(s5, s6, 0x31);

  sig[2] = _mm256_unpacklo_epi16(u0, u2);
  sig[6] = _mm256_unpackhi_epi16(u0, u2);

  sig[10] = _mm256_unpacklo_epi16(u1, u3);
  sig[14] = _mm256_unpackhi_epi16(u1, u3);

  sig[16] = s6;
}
915
916
// Extend the rolling 16x9 window with rows 7 and 8, filling the last
// interleavings (sig[3]/sig[7] for the low pixels, sig[11]/sig[15] for the
// high pixels); sig[16] is updated to row 8 for the next call.
static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                             __m256i *sig) {
  // base + 7th row
  const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
  // base + 8th row
  const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));

  // u0/u1 = halves of rows {6, 7}; u2/u3 = halves of rows {7, 8}.
  __m256i u0, u1, u2, u3;
  u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
  u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);

  u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
  u3 = _mm256_permute2x128_si256(s7, s8, 0x31);

  sig[3] = _mm256_unpacklo_epi16(u0, u2);
  sig[7] = _mm256_unpackhi_epi16(u0, u2);

  sig[11] = _mm256_unpacklo_epi16(u1, u3);
  sig[15] = _mm256_unpackhi_epi16(u1, u3);

  sig[16] = s8;
}
938
939
// Apply the 8-tap filter to all four signal groups (low/high pixel halves,
// lo/hi interleavings) and repack the 32-bit partial results into two
// 16-bit output rows.
static inline void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
                                      __m256i *y0, __m256i *y1) {
  __m256i res[4];
  for (int i = 0; i < 4; ++i) {
    filter_8x1_pixels(&sig[i * 4], f, &res[i]);
  }

  // Saturating-pack to u16, then split row 0 / row 1 across lane pairs.
  const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
  const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
  *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
  *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
}
954
955
// Clamp two 16-pixel result rows to *mask (the bd-bit maximum) and store
// them to consecutive rows of dst.
static inline void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst,
                                     ptrdiff_t pitch) {
  const __m256i row0 = _mm256_min_epi16(*y0, *mask);
  const __m256i row1 = _mm256_min_epi16(*y1, *mask);
  _mm256_storeu_si256((__m256i *)dst, row0);
  _mm256_storeu_si256((__m256i *)(dst + pitch), row1);
}
963
964
0
// Rotate both halves of the 16-wide signal window down by one row pair.
static void update_16x9_pixels(__m256i *sig) {
  update_pixels(sig);
  update_pixels(sig + 8);
}
968
969
// 8-tap vertical filter for a 16-wide high bit-depth block.  Same rolling
// 9-row window scheme as the 8-wide version, with twice the signal state.
// NOTE(review): height is stepped by 2, so it is presumably always even
// here -- confirm against callers.
static void aom_highbd_filter_block1d16_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[17], res0, res1;
  // Saturation limit for bd-bit pixels.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  // 8 taps duplicated into 4 madd-ready registers (helper defined earlier).
  __m256i ff[4];
  pack_filters(filter, ff);

  pack_16x9_init(src_ptr, src_pitch, signal);

  do {
    pack_16x9_pixels(src_ptr, src_pitch, signal);
    filter_16x9_pixels(signal, ff, &res0, &res1);
    store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_16x9_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}
991
992
// 4-tap vertical filter for a 4-wide high bit-depth block, two rows per
// iteration.  Only source rows 2..6 (relative to src_ptr) are read --
// presumably the 4 active taps sit at positions 2..5 of the 8-tap filter
// array (see pack_filters_4tap, defined elsewhere) -- confirm.
static void aom_highbd_filter_block1d4_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const int bits = FILTER_BITS;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  // Clamp limits for the supported bit depths (8/10/12).
  const __m256i clip_pixel =
      _mm256_set1_epi32(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();
  uint32_t i;
  __m256i s[2], ff[2];

  pack_filters_4tap(filter, ff);

  const uint16_t *data = src_ptr;
  /* Vertical filter */
  {
    // Preload rows 2..4 and interleave consecutive row pairs: s[0] holds
    // pairs {2,3} (low lane) and {3,4} (high lane).
    __m128i s2 = _mm_loadl_epi64((__m128i *)(data + 2 * src_pitch));
    __m128i s3 = _mm_loadl_epi64((__m128i *)(data + 3 * src_pitch));

    __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1);

    __m128i s4 = _mm_loadl_epi64((__m128i *)(data + 4 * src_pitch));

    __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1);

    s[0] = _mm256_unpacklo_epi16(s23, s34);

    for (i = 0; i < height; i += 2) {
      data = &src_ptr[i * src_pitch];

      // Two new rows per iteration; s[1] holds pairs {4,5} and {5,6}.
      __m128i s5 = _mm_loadl_epi64((__m128i *)(data + 5 * src_pitch));
      __m128i s6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_pitch));

      __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1);
      __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1);

      s[1] = _mm256_unpacklo_epi16(s45, s56);

      const __m256i res_a = convolve_4tap(s, ff);

      // Round and shift back to pixel range.
      __m256i res_a_round = _mm256_sra_epi32(
          _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);

      // Clamp in 32-bit, then narrow; each 128-bit lane carries one row.
      __m256i res_16bit = _mm256_min_epi32(res_a_round, clip_pixel);
      res_16bit = _mm256_max_epi32(res_16bit, zero);
      res_16bit = _mm256_packs_epi32(res_16bit, res_16bit);

      _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
                       _mm256_castsi256_si128(res_16bit));
      _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
                       _mm256_extracti128_si256(res_16bit, 1));

      // Slide the window: old {4,5}/{5,6} become the next {2,3}/{3,4}.
      s[0] = s[1];
      s4 = s6;
    }
  }
}
1051
1052
// 4-tap vertical filter for an 8-wide high bit-depth block, two rows per
// iteration.  Same windowing as the 4-wide version but keeps both the
// unpacklo and unpackhi interleavings (8 pixels per row).  Only source
// rows 2..6 relative to src_ptr are read -- presumably the active taps are
// positions 2..5 of the 8-tap array (see pack_filters_4tap) -- confirm.
static void aom_highbd_filter_block1d8_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const int bits = FILTER_BITS;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  // Clamp limits for the supported bit depths (8/10/12).
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();
  __m256i s[4], ff[2];
  uint32_t i;
  pack_filters_4tap(filter, ff);

  const uint16_t *data = src_ptr;
  /* Vertical filter */
  {
    // Preload rows 2..4; s[0]/s[2] hold the lo/hi interleavings of row
    // pairs {2,3} (low lane) and {3,4} (high lane).
    __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_pitch));
    __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_pitch));

    __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1);

    __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_pitch));

    __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1);

    s[0] = _mm256_unpacklo_epi16(s23, s34);
    s[2] = _mm256_unpackhi_epi16(s23, s34);

    for (i = 0; i < height; i += 2) {
      data = &src_ptr[i * src_pitch];

      __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_pitch));
      __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_pitch));

      __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1);
      __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1);

      s[1] = _mm256_unpacklo_epi16(s45, s56);
      s[3] = _mm256_unpackhi_epi16(s45, s56);

      // Low 4 pixels of both rows.
      const __m256i res_a = convolve_4tap(s, ff);

      __m256i res_a_round = _mm256_sra_epi32(
          _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);

      // High 4 pixels of both rows.
      const __m256i res_b = convolve_4tap(s + 2, ff);
      __m256i res_b_round = _mm256_sra_epi32(
          _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);

      // Narrow to 16-bit, then clamp; each 128-bit lane is one output row.
      __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
      res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
      res_16bit = _mm256_max_epi16(res_16bit, zero);

      _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch],
                       _mm256_castsi256_si128(res_16bit));
      _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
                       _mm256_extracti128_si256(res_16bit, 1));

      // Slide the two-row window forward.
      s[0] = s[1];
      s[2] = s[3];
      s4 = s6;
    }
  }
}
1117
1118
// 4-tap vertical filter for a 16-wide block: run the 8-wide kernel over
// the two independent 8-pixel halves.
static void aom_highbd_filter_block1d16_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  for (int half = 0; half < 2; ++half) {
    const int offset = half * 8;
    aom_highbd_filter_block1d8_v4_avx2(src_ptr + offset, src_pitch,
                                       dst_ptr + offset, dst_pitch, height,
                                       filter, bd);
  }
}
1127
1128
// -----------------------------------------------------------------------------
1129
// 2-tap vertical filtering
1130
1131
0
// Prime the 16-wide vertical 2-tap pipeline: cache the first row in sig[2].
static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
  sig[2] = _mm256_loadu_si256((const __m256i *)src);
}
1134
1135
// Interleave the cached previous row (sig[2]) with the next source row for
// the vertical 2-tap madd, then cache the new row for the next call.
static inline void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
                                       __m256i *sig) {
  const __m256i next = _mm256_loadu_si256((const __m256i *)(src + pitch));
  const __m256i prev = sig[2];
  sig[0] = _mm256_unpacklo_epi16(prev, next);
  sig[1] = _mm256_unpackhi_epi16(prev, next);
  sig[2] = next;
}
1143
1144
// Vertical 2-tap filtering shares the exact math of the horizontal path;
// this wrapper only exists for naming symmetry.
static inline void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
                                         __m256i *y0, __m256i *y1) {
  filter_16_2t_pixels(sig, f, y0, y1);
}
1148
1149
// 2-tap vertical filter for a 16-wide high bit-depth block: each output
// row is the filtered combination of two consecutive source rows.
static void aom_highbd_filter_block1d16_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i pixel_max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i sig[3];
  __m256i taps;

  pack_2t_filter(filter, &taps);
  pack_16x2_init(src_ptr, sig);

  do {
    __m256i lo, hi;
    pack_16x2_2t_pixels(src_ptr, src_pitch, sig);
    filter_16x2_2t_pixels(sig, &taps, &lo, &hi);
    store_16x1_pixels(&lo, &hi, &pixel_max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height > 0);
}
1169
1170
0
// 128-bit variant of pack_2t_filter(): duplicate the 16-bit tap pair at
// bytes 6..9 (filter[3]:filter[4]) across every 32-bit lane.
static inline void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  f[0] = _mm_shuffle_epi8(taps, _mm_set1_epi32(0x09080706));
}
1175
1176
0
// Prime the 8-wide vertical 2-tap pipeline: cache the first row in sig[2].
static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
  sig[2] = _mm_loadu_si128((const __m128i *)src);
}
1179
1180
// Interleave the cached previous row (sig[2]) with the next source row for
// the vertical 2-tap madd; cache the new row for the next call.
static inline void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
                                          __m128i *sig) {
  const __m128i next = _mm_loadu_si128((const __m128i *)(src + pitch));
  const __m128i prev = sig[2];
  sig[0] = _mm_unpacklo_epi16(prev, next);
  sig[1] = _mm_unpackhi_epi16(prev, next);
  sig[2] = next;
}
1188
1189
static inline void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
1190
0
                                      __m128i *y0, __m128i *y1) {
1191
0
  const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
1192
0
  __m128i x0 = _mm_madd_epi16(sig[0], *f);
1193
0
  __m128i x1 = _mm_madd_epi16(sig[1], *f);
1194
0
  x0 = _mm_add_epi32(x0, rounding);
1195
0
  x1 = _mm_add_epi32(x1, rounding);
1196
0
  *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
1197
0
  *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
1198
0
}
1199
1200
// Saturating-pack the two 32-bit result halves to u16, clamp to *mask (the
// bd-bit maximum) and store one 8-pixel row.
static inline void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
                                           const __m128i *mask, uint16_t *dst) {
  const __m128i row = _mm_min_epi16(_mm_packus_epi32(*y0, *y1), *mask);
  _mm_storeu_si128((__m128i *)dst, row);
}
1206
1207
// 2-tap vertical filter for an 8-wide high bit-depth block (SSE registers
// suffice for 8 pixels per row).
static void aom_highbd_filter_block1d8_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);
  __m128i sig[3];
  __m128i taps;

  pack_8x1_2t_filter(filter, &taps);
  pack_8x2_init(src_ptr, sig);

  do {
    __m128i lo, hi;
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, sig);
    filter_8_2t_pixels(sig, &taps, &lo, &hi);
    store_8x1_2t_pixels_ver(&lo, &hi, &pixel_max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height > 0);
}
1227
1228
// Forward declarations of the SSE2 4-wide filter kernels (implemented
// elsewhere); the #defines below alias the 4-wide "avx2" names to them, so
// 4-pixel blocks reuse the SSE2 implementations.
void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
void aom_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
void aom_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
#define aom_highbd_filter_block1d4_h8_avx2 aom_highbd_filter_block1d4_h8_sse2
#define aom_highbd_filter_block1d4_h2_avx2 aom_highbd_filter_block1d4_h2_sse2
#define aom_highbd_filter_block1d4_v8_avx2 aom_highbd_filter_block1d4_v8_sse2
#define aom_highbd_filter_block1d4_v2_avx2 aom_highbd_filter_block1d4_v2_sse2
1244
1245
// Instantiate the public high bit-depth 1-D convolve entry points from the
// HIGH_FUN_CONV_1D template macro (presumably provided by one of the
// included convolve headers -- see aom_dsp/x86/convolve.h); they dispatch
// to the block1d kernels above.
HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2)

#undef HIGHBD_FUNC