Coverage Report

Created: 2026-04-01 07:42

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
Line
Count
Source
1
/*
2
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5
 *  that can be found in the LICENSE file in the root of the source
6
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS.  All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
9
 */
10
11
#include <immintrin.h>
12
#include "./vpx_dsp_rtcd.h"
13
#include "vpx_dsp/x86/convolve.h"
14
#include "vpx_dsp/x86/convolve_avx2.h"
15
16
// -----------------------------------------------------------------------------
17
// Copy and average
18
19
// Plain high-bitdepth block copy (no filtering). The filter/step arguments
// exist only so the signature matches the generic convolve prototype; they
// are explicitly ignored below. Copies a w x h block of 16-bit pixels,
// choosing the widest available load/store per width class (w in
// {4, 8, 16, 32, 64}, as the branch comments note).
void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *filter, int x0_q4,
                                   int x_step_q4, int y0_q4, int y_step_q4,
                                   int w, int h, int bd) {
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;
  (void)bd;

  assert(w % 4 == 0);
  if (w > 32) {  // w = 64
    // One row per iteration: 4 x 16 pixels = 64 pixels (128 bytes).
    do {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
      const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
      _mm256_storeu_si256((__m256i *)(dst + 32), p2);
      _mm256_storeu_si256((__m256i *)(dst + 48), p3);
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (w > 16) {  // w = 32
    // One row per iteration: 2 x 16 pixels.
    do {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (w > 8) {  // w = 16
    // Two rows per iteration (one 256-bit register each).
    // NOTE(review): h -= 2 assumes h is even — presumably guaranteed by the
    // block sizes used by callers; confirm.
    __m256i p0, p1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      src += src_stride;
      p1 = _mm256_loadu_si256((const __m256i *)src);
      src += src_stride;

      _mm256_storeu_si256((__m256i *)dst, p0);
      dst += dst_stride;
      _mm256_storeu_si256((__m256i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w > 4) {  // w = 8
    // Two rows per iteration (one 128-bit register each).
    __m128i p0, p1;
    do {
      p0 = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;
      p1 = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;

      _mm_storeu_si128((__m128i *)dst, p0);
      dst += dst_stride;
      _mm_storeu_si128((__m128i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else {  // w = 4
    // Two rows per iteration, 64-bit (4-pixel) loads/stores.
    __m128i p0, p1;
    do {
      p0 = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      p1 = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;

      _mm_storel_epi64((__m128i *)dst, p0);
      dst += dst_stride;
      _mm_storel_epi64((__m128i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  }
}
100
101
// High-bitdepth "copy with averaging": dst = round((dst + src) / 2) per
// pixel, via the rounding-average intrinsics _mm256_avg_epu16/_mm_avg_epu16.
// The filter/step arguments are ignored (signature compatibility only).
// Width classes mirror vpx_highbd_convolve_copy_avx2.
void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                  uint16_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int x0_q4,
                                  int x_step_q4, int y0_q4, int y_step_q4,
                                  int w, int h, int bd) {
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;
  (void)bd;

  assert(w % 4 == 0);
  if (w > 32) {  // w = 64
    __m256i p0, p1, p2, p3, u0, u1, u2, u3;
    do {
      // Load one 64-pixel row from src (p*) and dst (u*), store the average.
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
      p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
      src += src_stride;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
      u2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
      u3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
      _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2));
      _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3));
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (w > 16) {  // w = 32
    __m256i p0, p1, u0, u1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      src += src_stride;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (w > 8) {  // w = 16
    // Two rows per iteration. NOTE(review): assumes even h — confirm callers.
    __m256i p0, p1, u0, u1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride));

      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + dst_stride),
                          _mm256_avg_epu16(p1, u1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  } else if (w > 4) {  // w = 8
    __m128i p0, p1, u0, u1;
    do {
      p0 = _mm_loadu_si128((const __m128i *)src);
      p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm_loadu_si128((const __m128i *)dst);
      u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));

      _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
      _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  } else {  // w = 4
    __m128i p0, p1, u0, u1;
    do {
      p0 = _mm_loadl_epi64((const __m128i *)src);
      p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm_loadl_epi64((const __m128i *)dst);
      u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));

      // avg is commutative, so the (u, p) operand order matches the other
      // branches' (p, u) order in effect.
      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0));
      _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  }
}
191
192
#if HAVE_X86_ASM
193
// -----------------------------------------------------------------------------
194
// Horizontal and vertical filtering
195
196
// Per-128-bit-lane byte shuffle mask for _mm256_shuffle_epi8: gathers the
// overlapping 16-bit pixel pairs (0,1)(1,2)(2,3)(3,4) of each lane, i.e.
// byte pairs {0,1,2,3}, {2,3,4,5}, ... — the layout _mm256_madd_epi16
// needs to multiply adjacent pixels by adjacent filter taps.
static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                              7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                              4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
199
200
// Like signal_pattern_0 but shifted by one pixel pair: selects the
// overlapping pixel pairs (2,3)(3,4)(4,5)(5,6) of each 128-bit lane.
static const uint8_t signal_pattern_1[32] = { 4, 5, 6,  7,  6,  7,  8,  9,
                                              8, 9, 10, 11, 10, 11, 12, 13,
                                              4, 5, 6,  7,  6,  7,  8,  9,
                                              8, 9, 10, 11, 10, 11, 12, 13 };
204
205
// Shuffle mask used by the 2-tap paths: selects the overlapping pixel pairs
// (3,4)(4,5)(5,6)(6,7) of each 128-bit lane.
static const uint8_t signal_pattern_2[32] = { 6,  7,  8,  9,  8,  9,  10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15,
                                              6,  7,  8,  9,  8,  9,  10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15 };
209
210
static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };
211
212
347M
#define CONV8_ROUNDING_BITS (7)
213
0
#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1))
214
215
// -----------------------------------------------------------------------------
216
// Horizontal Filtering
217
218
72.7M
// Shuffle one register of 16 input pixels (*s) into 4 vectors of
// adjacent-pixel pairs, ready for _mm256_madd_epi16 against the packed
// tap-pair vectors from pack_filters(). The xAxB markers give the first
// pixel of the pair sequence in the low/high 128-bit lane respectively.
static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
  const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
  // c: pixels 4..11 copied into both lanes, so the in-lane shuffles below
  // can form pairs that cross the 128-bit boundary of *s.
  const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);

  p[0] = _mm256_shuffle_epi8(*s, sf0);  // x0x6
  p[1] = _mm256_shuffle_epi8(*s, sf1);  // x1x7
  p[2] = _mm256_shuffle_epi8(c, sf0);   // x2x4
  p[3] = _mm256_shuffle_epi8(c, sf1);   // x3x5
}
229
230
// Note:
231
//  Shared by 8x2 and 16x1 block
232
// Note:
//  Shared by 8x2 and 16x1 block
// Packs two 16-pixel registers into the 8 shuffled vectors expected by the
// 8-tap filter kernel: x[0..3] feed the first filter_8x1_pixels() call,
// x[4..7] the second. The permute2x128 calls recombine the per-source
// shuffles so that each output vector holds matching pair groups from s0
// (low lane) and s1 (high lane).
static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
                                  __m256i *x /*x[8]*/) {
  __m256i pp[8];
  pack_pixels(s0, pp);
  pack_pixels(s1, &pp[4]);
  x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
  // x[4]/x[5] reuse x[2]/x[3]: the same pair groups serve as different tap
  // positions in the second half of the filter window.
  x[4] = x[2];
  x[5] = x[3];
  x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
  x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
}
246
247
870k
// Packs a single 8-pixel row (loaded as 16 pixels; only the first 11 are
// consumed by an 8-tap filter over 8 outputs) into the 4 shuffled vectors
// for one filter_8x1_pixels() call. Used for the odd trailing row of an
// odd-height block.
static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
  __m256i pp[8];
  __m256i s0;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  pack_pixels(&s0, pp);
  x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
}
257
258
// Load two consecutive 8-pixel rows and expand them into the 8 shuffled
// vectors consumed by back-to-back filter_8x1_pixels() calls.
static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
                                   __m256i *x) {
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(src + stride));
  pack_16_pixels(&row0, &row1, x);
}
265
266
29.0M
// Expand one 16-pixel row (loaded as two overlapping 16-pixel registers,
// the second offset by 8 pixels) into the 8 shuffled filter-input vectors.
static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
  const __m256i lo_half = _mm256_loadu_si256((const __m256i *)src);
  const __m256i hi_half = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_pixels(&lo_half, &hi_half, x);
}
272
273
// Note:
274
//  Shared by horizontal and vertical filtering
275
4.03M
// Note:
//  Shared by horizontal and vertical filtering
// Broadcasts the 8 16-bit filter taps into 4 vectors, each holding one
// 32-bit tap PAIR replicated across all lanes: f[0]={t0,t1}, f[1]={t2,t3},
// f[2]={t4,t5}, f[3]={t6,t7}. Paired with the adjacent-pixel-pair signal
// vectors, _mm256_madd_epi16 then accumulates two taps per instruction.
static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
  const __m256i p0 = _mm256_set1_epi32(0x03020100);  // bytes of taps 0,1
  const __m256i p1 = _mm256_set1_epi32(0x07060504);  // bytes of taps 2,3
  const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);  // bytes of taps 4,5
  const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);  // bytes of taps 6,7
  f[0] = _mm256_shuffle_epi8(hh, p0);
  f[1] = _mm256_shuffle_epi8(hh, p1);
  f[2] = _mm256_shuffle_epi8(hh, p2);
  f[3] = _mm256_shuffle_epi8(hh, p3);
}
287
288
// Core 8-tap accumulation for 8 output pixels: four madd products (two taps
// each) are summed, rounded, and shifted down by CONV8_ROUNDING_BITS.
// NOTE(review): the min/max split when adding a0 and a1 still yields exactly
// a + a0 + a1; it appears intended to order the partial sums (smaller term
// first) — confirm intent against upstream history before simplifying.
static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
                                     const __m256i *fil /*fil[4]*/,
                                     __m256i *y) {
  __m256i a, a0, a1;

  // Outer tap pairs: (t0,t1) and (t6,t7).
  a0 = _mm256_madd_epi16(fil[0], sig[0]);
  a1 = _mm256_madd_epi16(fil[3], sig[3]);
  a = _mm256_add_epi32(a0, a1);

  // Inner tap pairs: (t2,t3) and (t4,t5).
  a0 = _mm256_madd_epi16(fil[1], sig[1]);
  a1 = _mm256_madd_epi16(fil[2], sig[2]);

  {
    const __m256i min = _mm256_min_epi32(a0, a1);
    a = _mm256_add_epi32(a, min);
  }
  {
    const __m256i max = _mm256_max_epi32(a0, a1);
    a = _mm256_add_epi32(a, max);
  }
  {
    // Round to nearest before discarding CONV8_ROUNDING_BITS fraction bits.
    const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
    a = _mm256_add_epi32(a, rounding);
    *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
  }
}
314
315
// Packs 8 32-bit filter results down to 16 bits with unsigned saturation,
// clamps to *mask (the (1 << bd) - 1 pixel maximum), and stores 8 pixels.
static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,
                                    uint16_t *dst) {
  const __m128i a0 = _mm256_castsi256_si128(*y);
  const __m128i a1 = _mm256_extractf128_si256(*y, 1);
  __m128i res = _mm_packus_epi32(a0, a1);
  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
  _mm_storeu_si128((__m128i *)dst, res);
}
323
324
// Pack two rows of 8 32-bit results to 16 bits with unsigned saturation,
// clamp to *mask ((1 << bd) - 1), and store one 8-pixel row per dst line.
static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
                                    const __m256i *mask, uint16_t *dst,
                                    ptrdiff_t pitch) {
  const __m256i packed = _mm256_packus_epi32(*y0, *y1);
  const __m256i clamped = _mm256_min_epi16(packed, *mask);
  const __m128i row0 = _mm256_castsi256_si128(clamped);
  const __m128i row1 = _mm256_extractf128_si256(clamped, 1);
  _mm_storeu_si128((__m128i *)dst, row0);
  _mm_storeu_si128((__m128i *)(dst + pitch), row1);
}
332
333
// Pack 16 32-bit results (two half-registers) to 16 bits with unsigned
// saturation, clamp to *mask, and store one 16-pixel row.
static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst) {
  const __m256i packed = _mm256_packus_epi32(*y0, *y1);
  _mm256_storeu_si256((__m256i *)dst, _mm256_min_epi16(packed, *mask));
}
339
340
// 8-tap horizontal filter for 8-pixel-wide high-bitdepth blocks: processes
// two rows per iteration, with a single-row tail for odd heights.
static void vpx_highbd_filter_block1d8_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);  // pixel clamp

  __m256i ff[4];
  pack_filters(filter, ff);

  // Back up 3 pixels so the 8-tap window is centered on the output pixel.
  src_ptr -= 3;
  do {
    pack_8x2_pixels(src_ptr, src_pitch, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  // Odd height: one trailing row.
  if (height > 0) {
    pack_8x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}
366
367
// 8-tap horizontal filter for 16-pixel-wide high-bitdepth blocks: one full
// 16-pixel row per iteration (two filter_8x1_pixels halves).
static void vpx_highbd_filter_block1d16_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);  // pixel clamp

  __m256i ff[4];
  pack_filters(filter, ff);

  // Back up 3 pixels so the 8-tap window is centered on the output pixel.
  src_ptr -= 3;
  do {
    pack_16x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}
387
388
// -----------------------------------------------------------------------------
389
// 2-tap horizontal filtering
390
391
993k
// Broadcasts taps 3 and 4 of an 8-tap-format kernel (the only nonzero taps
// of a bilinear/2-tap filter stored in 8-tap form) into every 32-bit lane.
// 0x09080706 selects bytes 6..9, i.e. 16-bit elements 3 and 4.
static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
  const __m256i p = _mm256_set1_epi32(0x09080706);
  f[0] = _mm256_shuffle_epi8(hh, p);
}
397
398
// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels()
399
// the difference is s0/s1 specifies first and second rows or,
400
// first 16 samples and 8-sample shifted 16 samples
401
// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels()
// the difference is s0/s1 specifies first and second rows or,
// first 16 samples and 8-sample shifted 16 samples
// Produces two vectors of adjacent-pixel pairs (via signal_pattern_2, and
// the lane-crossing permute for the pairs that straddle a 128-bit lane)
// for _mm256_madd_epi16 against the packed 2-tap filter.
static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
                                     __m256i *sig) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
  __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
  __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
  __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  r1 = _mm256_shuffle_epi8(r1, sf2);
  sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
  sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
}
414
415
// Load two consecutive 8-pixel rows for the 2-tap path; pack_16_2t_pixels()
// turns them into madd-ready adjacent-pixel pairs.
static INLINE void pack_8x2_2t_pixels(const uint16_t *src,
                                      const ptrdiff_t pitch, __m256i *sig) {
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  pack_16_2t_pixels(&row0, &row1, sig);
}
421
422
// Load one 16-pixel row for the 2-tap path as two overlapping registers
// (second offset by 8 pixels) and pack into adjacent-pixel pairs.
static INLINE void pack_16x1_2t_pixels(const uint16_t *src,
                                       __m256i *sig /*sig[2]*/) {
  const __m256i lo_half = _mm256_loadu_si256((const __m256i *)src);
  const __m256i hi_half = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_2t_pixels(&lo_half, &hi_half, sig);
}
428
429
// Packs a single 8-pixel row for the 2-tap path (odd-height tail): same
// shuffle/permute scheme as pack_16_2t_pixels but for one source register,
// producing only sig[0].
static INLINE void pack_8x1_2t_pixels(const uint16_t *src,
                                      __m256i *sig /*sig[2]*/) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
  r0 = _mm256_permutevar8x32_epi32(r0, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
}
439
440
// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels()
441
// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels()
// Applies the broadcast 2-tap kernel to two packed-pair vectors, rounds,
// and shifts down by CONV8_ROUNDING_BITS; two result vectors out.
static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
                                       __m256i *y0, __m256i *y1) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
  __m256i x1 = _mm256_madd_epi16(sig[1], *f);
  x0 = _mm256_add_epi32(x0, rounding);
  x1 = _mm256_add_epi32(x1, rounding);
  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
  *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
}
451
452
static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
453
198k
                                        __m256i *y0) {
454
198k
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
455
198k
  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
456
198k
  x0 = _mm256_add_epi32(x0, rounding);
457
198k
  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
458
198k
}
459
460
// 2-tap (bilinear) horizontal filter for 8-pixel-wide blocks: two rows per
// iteration plus a single-row tail for odd heights.
static void vpx_highbd_filter_block1d8_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);  // pixel clamp

  __m256i ff;
  pack_2t_filter(filter, &ff);

  // Back up 3 pixels: the kernel is stored in 8-tap form, and the nonzero
  // taps sit at positions 3 and 4 (matching the 8-tap paths' alignment).
  src_ptr -= 3;
  do {
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  // Odd height: one trailing row.
  if (height > 0) {
    pack_8x1_2t_pixels(src_ptr, signal);
    filter_8x1_2t_pixels(signal, &ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}
485
486
// 2-tap (bilinear) horizontal filter for 16-pixel-wide blocks: one full
// row per iteration.
static void vpx_highbd_filter_block1d16_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);  // pixel clamp

  __m256i ff;
  pack_2t_filter(filter, &ff);

  // 8-tap-format kernel with nonzero taps at positions 3/4 — see h2 8-wide.
  src_ptr -= 3;
  do {
    pack_16x1_2t_pixels(src_ptr, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}
505
506
// -----------------------------------------------------------------------------
507
// Vertical Filtering
508
509
1.24M
// Vertical 8-tap, 8-wide: prime the sliding window with rows 0..6.
// Each sN register pairs row N (low lane) with row N+1 (high lane); the
// unpacklo/unpackhi interleaves then form vertical pixel pairs for
// _mm256_madd_epi16. sig[8] caches row 6 for pack_8x9_pixels().
static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
  __m256i s1 =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
  __m256i s2 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
  __m256i s3 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
  __m256i s4 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
  __m256i s5 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
  __m256i s6 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));

  // Place the following row in the upper 128-bit lane of each register.
  s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
  s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
  s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
  s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
  s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);

  // sig[0..2]/sig[4..6]: low/high halves of the interleaved row pairs;
  // sig[3]/sig[7] are filled per-iteration by pack_8x9_pixels().
  sig[0] = _mm256_unpacklo_epi16(s0, s1);
  sig[4] = _mm256_unpackhi_epi16(s0, s1);
  sig[1] = _mm256_unpacklo_epi16(s2, s3);
  sig[5] = _mm256_unpackhi_epi16(s2, s3);
  sig[2] = _mm256_unpacklo_epi16(s4, s5);
  sig[6] = _mm256_unpackhi_epi16(s4, s5);
  sig[8] = s6;
}
539
540
// Per-iteration step of the 8-wide vertical filter: pull in rows 7 and 8 of
// the current window, interleave them with the cached row 6 (sig[8]) into
// sig[3]/sig[7], and cache row 8 for the next iteration.
static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                                   __m256i *sig) {
  // base + 7th row
  __m256i s0 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
  // base + 8th row
  __m256i s1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
  __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
  __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  sig[3] = _mm256_unpacklo_epi16(s2, s3);
  sig[7] = _mm256_unpackhi_epi16(s2, s3);
  sig[8] = s1;
}
554
555
// Run the 8-tap kernel over both halves of the vertical window; the two
// halves are independent (4 packed-signal vectors each), so order is free.
static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
                                     __m256i *y0, __m256i *y1) {
  filter_8x1_pixels(sig + 4, f, y1);
  filter_8x1_pixels(sig, f, y0);
}
560
561
32.7M
// Slide the vertical-filter window by one interleaved row pair: entries
// 1..3 move down to 0..2 in both the low (sig[0..3]) and high (sig[4..7])
// groups; slots 3 and 7 are refilled by the next pack_*_pixels() call.
static INLINE void update_pixels(__m256i *sig) {
  sig[0] = sig[1];
  sig[1] = sig[2];
  sig[2] = sig[3];

  sig[4] = sig[5];
  sig[5] = sig[6];
  sig[6] = sig[7];
}
568
569
// 8-tap vertical filter for 8-pixel-wide blocks: primes a 9-row sliding
// window, then produces two output rows per iteration.
// NOTE(review): `height -= 2` with `while (height > 0)` assumes even height
// (height is uint32_t, so an odd value would wrap) — presumably guaranteed
// by callers; confirm.
static void vpx_highbd_filter_block1d8_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[9], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);  // pixel clamp

  __m256i ff[4];
  pack_filters(filter, ff);

  pack_8x9_init(src_ptr, src_pitch, signal);

  do {
    pack_8x9_pixels(src_ptr, src_pitch, signal);

    filter_8x9_pixels(signal, ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}
592
593
905k
// Vertical 8-tap, 16-wide: prime the sliding window with rows 0..6.
// Lane-crossing permutes pair consecutive rows (low 8 pixels in u0/u2,
// high 8 pixels in u1/u3), then unpacklo/unpackhi interleave them into
// vertical pixel pairs for _mm256_madd_epi16. sig[0..7] cover the low
// 8 output pixels, sig[8..15] the high 8; sig[16] caches row 6.
static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i u0, u1, u2, u3;
  // load 0-6 rows
  const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
  const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
  const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
  const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
  const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));

  u0 = _mm256_permute2x128_si256(s0, s1, 0x20);  // 0, 1 low
  u1 = _mm256_permute2x128_si256(s0, s1, 0x31);  // 0, 1 high

  u2 = _mm256_permute2x128_si256(s1, s2, 0x20);  // 1, 2 low
  u3 = _mm256_permute2x128_si256(s1, s2, 0x31);  // 1, 2 high

  sig[0] = _mm256_unpacklo_epi16(u0, u2);
  sig[4] = _mm256_unpackhi_epi16(u0, u2);

  sig[8] = _mm256_unpacklo_epi16(u1, u3);
  sig[12] = _mm256_unpackhi_epi16(u1, u3);

  // Rows 2/3 with 3/4.
  u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
  u1 = _mm256_permute2x128_si256(s2, s3, 0x31);

  u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
  u3 = _mm256_permute2x128_si256(s3, s4, 0x31);

  sig[1] = _mm256_unpacklo_epi16(u0, u2);
  sig[5] = _mm256_unpackhi_epi16(u0, u2);

  sig[9] = _mm256_unpacklo_epi16(u1, u3);
  sig[13] = _mm256_unpackhi_epi16(u1, u3);

  // Rows 4/5 with 5/6.
  u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
  u1 = _mm256_permute2x128_si256(s4, s5, 0x31);

  u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
  u3 = _mm256_permute2x128_si256(s5, s6, 0x31);

  sig[2] = _mm256_unpacklo_epi16(u0, u2);
  sig[6] = _mm256_unpackhi_epi16(u0, u2);

  sig[10] = _mm256_unpacklo_epi16(u1, u3);
  sig[14] = _mm256_unpackhi_epi16(u1, u3);

  // sig[3]/sig[7]/sig[11]/sig[15] are filled by pack_16x9_pixels().
  sig[16] = s6;
}
642
643
// Per-iteration step of the 16-wide vertical filter: pull in rows 7 and 8,
// interleave them with the cached row 6 (sig[16]) into the last slot of
// each quarter of the window (sig[3], sig[7], sig[11], sig[15]), and cache
// row 8 for the next iteration.
static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                             __m256i *sig) {
  // base + 7th row
  const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
  // base + 8th row
  const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));

  __m256i u0, u1, u2, u3;
  u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
  u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);

  u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
  u3 = _mm256_permute2x128_si256(s7, s8, 0x31);

  sig[3] = _mm256_unpacklo_epi16(u0, u2);
  sig[7] = _mm256_unpackhi_epi16(u0, u2);

  sig[11] = _mm256_unpacklo_epi16(u1, u3);
  sig[15] = _mm256_unpackhi_epi16(u1, u3);

  sig[16] = s8;
}
665
666
// Apply the 8-tap kernel to all four quarters of the 16-wide vertical
// window, then pack and re-permute the four partial results into two
// complete 16-pixel output rows (*y0, *y1).
static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
                                      __m256i *y0, __m256i *y1) {
  __m256i res[4];
  int i;
  for (i = 0; i < 4; ++i) {
    filter_8x1_pixels(&sig[i << 2], f, &res[i]);
  }

  {
    const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
    const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
    // Recombine low/high halves into row-major order.
    *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
    *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
  }
}
681
682
// Clamp two already-packed 16-pixel result rows to *mask ((1 << bd) - 1)
// and store them on consecutive dst lines.
static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst,
                                     ptrdiff_t pitch) {
  const __m256i row0 = _mm256_min_epi16(*y0, *mask);
  const __m256i row1 = _mm256_min_epi16(*y1, *mask);
  _mm256_storeu_si256((__m256i *)dst, row0);
  _mm256_storeu_si256((__m256i *)(dst + pitch), row1);
}
690
691
13.9M
static void update_16x9_pixels(__m256i *sig) {
692
13.9M
  update_pixels(&sig[0]);
693
13.9M
  update_pixels(&sig[8]);
694
13.9M
}
695
696
// 8-tap vertical filter for 16-pixel-wide blocks: primes a 9-row window,
// then produces two output rows per iteration.
// NOTE(review): `height -= 2` with `while (height > 0)` assumes even height
// (uint32_t would wrap on an odd value) — presumably guaranteed by callers.
static void vpx_highbd_filter_block1d16_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[17], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);  // pixel clamp

  __m256i ff[4];
  pack_filters(filter, ff);

  pack_16x9_init(src_ptr, src_pitch, signal);

  do {
    pack_16x9_pixels(src_ptr, src_pitch, signal);
    filter_16x9_pixels(signal, ff, &res0, &res1);
    store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_16x9_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}
718
719
// -----------------------------------------------------------------------------
720
// 2-tap vertical filtering
721
722
388k
// Seed the 2-tap vertical window with the first 16-pixel source row; the
// interleaved row pairs in sig[0]/sig[1] are produced each iteration by
// pack_16x2_2t_pixels().
static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
  sig[2] = _mm256_loadu_si256((const __m256i *)src);
}
725
726
static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
                                       __m256i *sig) {
  // Load the row below the one cached in sig[2] and interleave the pair so
  // a single madd can apply both bilinear taps; the new row becomes the
  // cached row for the next iteration.
  const __m256i next_row = _mm256_loadu_si256((const __m256i *)(src + pitch));
  sig[0] = _mm256_unpacklo_epi16(sig[2], next_row);
  sig[1] = _mm256_unpackhi_epi16(sig[2], next_row);
  sig[2] = next_row;
}
734
735
// Thin wrapper: apply the packed 2-tap filter to one 16-wide interleaved
// row pair (kept for naming symmetry with the packing helpers).
static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
                                         __m256i *y0, __m256i *y1) {
  filter_16_2t_pixels(sig, f, y0, y1);
}
739
740
// 2-tap (bilinear) vertical filter for a 16-pixel-wide block, one output
// row per iteration. Only a single extra source row is needed per step, so
// sig[2] carries the previous row between iterations.
static void vpx_highbd_filter_block1d16_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[3], res0, res1;
  // Saturation limit for the bit depth.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff;

  pack_2t_filter(filter, &ff);
  pack_16x2_init(src_ptr, signal);

  do {
    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}
760
761
251k
static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
  // Broadcast the two active bilinear taps — bytes 6..9 of the 8-tap kernel,
  // i.e. the 16-bit words filter[3] and filter[4] — into every 32-bit lane
  // so they can be applied with madd.
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  const __m128i broadcast = _mm_set1_epi32(0x09080706);
  f[0] = _mm_shuffle_epi8(taps, broadcast);
}
766
767
251k
// Seed the 8-wide 2-tap vertical window with the first source row; the
// interleaved pairs in sig[0]/sig[1] are filled per iteration by
// pack_8x2_2t_pixels_ver().
static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
  sig[2] = _mm_loadu_si128((const __m128i *)src);
}
770
771
static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
                                          __m128i *sig) {
  // Load the row below the cached one and interleave the two rows so both
  // bilinear taps can be applied by one madd; cache the new row.
  const __m128i next_row = _mm_loadu_si128((const __m128i *)(src + pitch));
  sig[0] = _mm_unpacklo_epi16(sig[2], next_row);
  sig[1] = _mm_unpackhi_epi16(sig[2], next_row);
  sig[2] = next_row;
}
779
780
static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
781
2.68M
                                      __m128i *y0, __m128i *y1) {
782
2.68M
  const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
783
2.68M
  __m128i x0 = _mm_madd_epi16(sig[0], *f);
784
2.68M
  __m128i x1 = _mm_madd_epi16(sig[1], *f);
785
2.68M
  x0 = _mm_add_epi32(x0, rounding);
786
2.68M
  x1 = _mm_add_epi32(x1, rounding);
787
2.68M
  *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
788
2.68M
  *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
789
2.68M
}
790
791
static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
                                           const __m128i *mask, uint16_t *dst) {
  // Narrow the two 32-bit result halves to 16-bit with unsigned saturation,
  // clamp to the valid pixel range, and write the 8 pixels out.
  const __m128i packed = _mm_packus_epi32(*y0, *y1);
  _mm_storeu_si128((__m128i *)dst, _mm_min_epi16(packed, *mask));
}
797
798
// 2-tap (bilinear) vertical filter for an 8-pixel-wide block, one output
// row per iteration, using 128-bit registers throughout.
static void vpx_highbd_filter_block1d8_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m128i signal[3], res0, res1;
  // Saturation limit for the bit depth.
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
  __m128i ff;

  pack_8x1_2t_filter(filter, &ff);
  pack_8x2_init(src_ptr, signal);

  do {
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
    filter_8_2t_pixels(signal, &ff, &res0, &res1);
    store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}
818
819
// Calculation with averaging the input pixels
820
821
static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask,
822
0
                                        uint16_t *dst) {
823
0
  const __m128i a0 = _mm256_castsi256_si128(*y0);
824
0
  const __m128i a1 = _mm256_extractf128_si256(*y0, 1);
825
0
  __m128i res = _mm_packus_epi32(a0, a1);
826
0
  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
827
0
  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
828
0
  res = _mm_avg_epu16(res, pix);
829
0
  _mm_storeu_si128((__m128i *)dst, res);
830
0
}
831
832
static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1,
                                        const __m256i *mask, uint16_t *dst,
                                        ptrdiff_t pitch) {
  // Gather the two existing 8-pixel destination rows into one 256-bit
  // register, then pack, clamp and average the new results against them
  // before writing both rows back.
  const __m128i old0 = _mm_loadu_si128((const __m128i *)dst);
  const __m128i old1 = _mm_loadu_si128((const __m128i *)(dst + pitch));
  const __m256i old =
      _mm256_insertf128_si256(_mm256_castsi128_si256(old0), old1, 1);
  __m256i out = _mm256_packus_epi32(*y0, *y1);
  out = _mm256_min_epi16(out, *mask);
  out = _mm256_avg_epu16(out, old);
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(out));
  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(out, 1));
}
845
846
static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1,
                                         const __m256i *mask, uint16_t *dst) {
  // Pack the two result halves to 16-bit, clamp to the bit-depth range,
  // then average with the existing destination row before storing.
  const __m256i existing = _mm256_loadu_si256((const __m256i *)dst);
  __m256i out = _mm256_packus_epi32(*y0, *y1);
  out = _mm256_min_epi16(out, *mask);
  out = _mm256_avg_epu16(out, existing);
  _mm256_storeu_si256((__m256i *)dst, out);
}
854
855
static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1,
                                         const __m256i *mask, uint16_t *dst,
                                         ptrdiff_t pitch) {
  // Clamp each filtered row, average it with the row already in dst, and
  // write both 16-pixel rows back.
  const __m256i old0 = _mm256_loadu_si256((const __m256i *)dst);
  const __m256i old1 = _mm256_loadu_si256((const __m256i *)(dst + pitch));
  const __m256i row0 = _mm256_avg_epu16(_mm256_min_epi16(*y0, *mask), old0);
  const __m256i row1 = _mm256_avg_epu16(_mm256_min_epi16(*y1, *mask), old1);
  _mm256_storeu_si256((__m256i *)dst, row0);
  _mm256_storeu_si256((__m256i *)(dst + pitch), row1);
}
868
869
static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0,
870
                                               const __m128i *y1,
871
                                               const __m128i *mask,
872
897k
                                               uint16_t *dst) {
873
897k
  __m128i res = _mm_packus_epi32(*y0, *y1);
874
897k
  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
875
897k
  res = _mm_min_epi16(res, *mask);
876
897k
  res = _mm_avg_epu16(res, pix);
877
897k
  _mm_storeu_si128((__m128i *)dst, res);
878
897k
}
879
880
// 8-tap horizontal filter for an 8-pixel-wide block, averaging the result
// with the existing destination (compound prediction). Processes two rows
// per iteration with a single-row tail for odd heights.
static void vpx_highbd_filter_block1d8_h8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  // Saturation limit for the bit depth.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  // Back up 3 samples: the 8-tap kernel needs taps/2 - 1 pixels of context
  // to the left of the output position.
  src_ptr -= 3;
  do {
    pack_8x2_pixels(src_ptr, src_pitch, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  // Odd height: one remaining row.
  if (height > 0) {
    pack_8x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    store_8x1_avg_pixels(&res0, &max, dst_ptr);
  }
}
906
907
// 8-tap horizontal filter for a 16-pixel-wide block, averaging with the
// existing destination pixels. One row per iteration; the 16 outputs are
// produced as two 8-wide filter passes over the packed signal.
static void vpx_highbd_filter_block1d16_h8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  // Saturation limit for the bit depth.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  // Back up 3 samples of left context for the 8-tap kernel.
  src_ptr -= 3;
  do {
    pack_16x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}
927
928
// 4-tap horizontal filter for a 4-pixel-wide block.
static void vpx_highbd_filter_block1d4_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We extract the middle four elements of the kernel into two registers in
  // the form
  // ... k[3] k[2] k[3] k[2]
  // ... k[5] k[4] k[5] k[4]
  // Then we shuffle the source into
  // ... s[1] s[0] s[0] s[-1]
  // ... s[3] s[2] s[2] s[1]
  // Calling multiply and add gives us half of the sum. Calling add on the two
  // halves gives us the output. Since avx2 allows us to use 256-bit buffer, we
  // can do this two rows at a time.

  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
  __m256i res_reg;
  // Byte-shuffle masks pairing each pixel with its neighbor for madd.
  __m256i idx_shift_0 =
      _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
                       3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
  __m256i idx_shift_2 =
      _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
                       5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);

  __m128i kernel_reg_128;  // Kernel
  __m256i kernel_reg, kernel_reg_23,
      kernel_reg_45;  // Segments of the kernel used
  const __m256i reg_round =
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
  const ptrdiff_t unrolled_src_stride = src_stride << 1;
  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
  int h;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);

  for (h = height; h >= 2; h -= 2) {
    // Load the source
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);

    // Get the output
    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                   &kernel_reg_23, &kernel_reg_45);

    // Round the result
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);

    // Finally combine to get the final dst
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
    res_reg = _mm256_min_epi16(res_reg, reg_max);
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
                        &res_reg);

    src_ptr += unrolled_src_stride;
    dst_ptr += unrolled_dst_stride;
  }

  // Repeat for the last row if needed
  if (h > 0) {
    // Load the source; the second (offset +4) load only fills the unused
    // high lane — only 4 pixels are stored below.
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);

    // Get the output
    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                   &kernel_reg_23, &kernel_reg_45);

    // Round the result
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);

    // Finally combine to get the final dst
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
    res_reg = _mm256_min_epi16(res_reg, reg_max);
    _mm_storel_epi64((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg));
  }
}
1013
1014
// 4-tap horizontal filter for an 8-pixel-wide block.
static void vpx_highbd_filter_block1d8_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will extract the middle four elements of the kernel into two registers
  // in the form
  // ... k[3] k[2] k[3] k[2]
  // ... k[5] k[4] k[5] k[4]
  // Then we shuffle the source into
  // ... s[1] s[0] s[0] s[-1]
  // ... s[3] s[2] s[2] s[1]
  // Calling multiply and add gives us half of the sum of the first half.
  // Calling add gives us first half of the output. Repeat again to get the
  // whole output. Since avx2 allows us to use 256-bit buffer, we can do this
  // two rows at a time.

  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
  __m256i res_reg, res_first, res_last;
  // Byte-shuffle masks pairing each pixel with its neighbor for madd.
  __m256i idx_shift_0 =
      _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
                       3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
  __m256i idx_shift_2 =
      _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
                       5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);

  __m128i kernel_reg_128;  // Kernel
  __m256i kernel_reg, kernel_reg_23,
      kernel_reg_45;  // Segments of the kernel used
  const __m256i reg_round =
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
  const ptrdiff_t unrolled_src_stride = src_stride << 1;
  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
  int h;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);

  for (h = height; h >= 2; h -= 2) {
    // Load the source
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);

    // Result for first half
    res_first = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                     &kernel_reg_23, &kernel_reg_45);

    // Do again to get the second half of dst
    // Load the source
    src_reg = mm256_loadu2_si128(src_ptr + 4, src_ptr + src_stride + 4);
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);

    // Result for second half
    res_last = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                    &kernel_reg_23, &kernel_reg_45);

    // Round each result
    res_first = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS);
    res_last = mm256_round_epi32(&res_last, &reg_round, CONV8_ROUNDING_BITS);

    // Finally combine to get the final dst
    res_reg = _mm256_packus_epi32(res_first, res_last);
    res_reg = _mm256_min_epi16(res_reg, reg_max);
    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
                       &res_reg);

    src_ptr += unrolled_src_stride;
    dst_ptr += unrolled_dst_stride;
  }

  // Repeat for the last row if needed
  if (h > 0) {
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);

    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                   &kernel_reg_23, &kernel_reg_45);

    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);

    res_reg = _mm256_packus_epi32(res_reg, res_reg);
    res_reg = _mm256_min_epi16(res_reg, reg_max);

    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg);
  }
}
1108
1109
// 4-tap horizontal filter for a 16-pixel-wide block, implemented as two
// independent 8-wide passes over the left and right halves.
static void vpx_highbd_filter_block1d16_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  vpx_highbd_filter_block1d8_h4_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
                                     height, kernel, bd);
  vpx_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_stride, dst_ptr + 8,
                                     dst_stride, height, kernel, bd);
}
1117
1118
// 8-tap vertical filter for an 8-pixel-wide block, averaging with the
// existing destination pixels. Two output rows per iteration; `signal`
// holds the sliding window of interleaved source rows.
static void vpx_highbd_filter_block1d8_v8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[9], res0, res1;
  // Saturation limit for the bit depth.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  // Prime the sliding window with the initial rows of vertical context.
  pack_8x9_init(src_ptr, src_pitch, signal);

  do {
    pack_8x9_pixels(src_ptr, src_pitch, signal);

    filter_8x9_pixels(signal, ff, &res0, &res1);
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}
1141
1142
// 8-tap vertical filter for a 16-pixel-wide block, averaging with the
// existing destination pixels (compound prediction). Mirrors
// vpx_highbd_filter_block1d16_v8_avx2 but stores via the _avg path.
static void vpx_highbd_filter_block1d16_v8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[17], res0, res1;
  // Saturation limit for the bit depth.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  // Prime the sliding window with the initial rows of vertical context.
  pack_16x9_init(src_ptr, src_pitch, signal);

  do {
    pack_16x9_pixels(src_ptr, src_pitch, signal);
    filter_16x9_pixels(signal, ff, &res0, &res1);
    store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_16x9_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}
1164
1165
// 2-tap (bilinear) horizontal filter for an 8-pixel-wide block, averaging
// with the existing destination. Two rows per iteration plus a single-row
// tail for odd heights.
static void vpx_highbd_filter_block1d8_h2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  // Saturation limit for the bit depth.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  // Back up 3 samples so the active taps (stored at positions 3/4 of the
  // 8-tap kernel layout) line up with the output position.
  src_ptr -= 3;
  do {
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  // Odd height: one remaining row.
  if (height > 0) {
    pack_8x1_2t_pixels(src_ptr, signal);
    filter_8x1_2t_pixels(signal, &ff, &res0);
    store_8x1_avg_pixels(&res0, &max, dst_ptr);
  }
}
1190
1191
// 2-tap (bilinear) horizontal filter for a 16-pixel-wide block, averaging
// with the existing destination. One row per iteration.
static void vpx_highbd_filter_block1d16_h2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  // Saturation limit for the bit depth.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  // Back up 3 samples so the active taps (positions 3/4 of the 8-tap
  // kernel layout) line up with the output position.
  src_ptr -= 3;
  do {
    pack_16x1_2t_pixels(src_ptr, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}
1210
1211
// 2-tap (bilinear) vertical filter for a 16-pixel-wide block, averaging
// with the existing destination. One row per iteration; sig[2] carries the
// previous source row between iterations.
static void vpx_highbd_filter_block1d16_v2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[3], res0, res1;
  // Saturation limit for the bit depth.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff;

  pack_2t_filter(filter, &ff);
  pack_16x2_init(src_ptr, signal);

  do {
    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}
1231
1232
// 2-tap (bilinear) vertical filter for an 8-pixel-wide block, averaging
// with the existing destination. 128-bit registers; one row per iteration.
static void vpx_highbd_filter_block1d8_v2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m128i signal[3], res0, res1;
  // Saturation limit for the bit depth.
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
  __m128i ff;

  pack_8x1_2t_filter(filter, &ff);
  pack_8x2_init(src_ptr, signal);

  do {
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
    filter_8_2t_pixels(signal, &ff, &res0, &res1);
    store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}
1252
1253
// 4-tap vertical filter for a 4-pixel-wide block, two output rows per
// iteration. Interleaved row pairs are carried across iterations in
// registers so each source row is loaded only once.
// NOTE(review): assumes height is even — there is no single-row tail.
static void vpx_highbd_filter_block1d4_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load two rows of pixels and rearrange them into the form
  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
  // so that we can call multiply and add with the kernel partial output. Then
  // we can call add with another row to get the output.

  // Register for source s[-1:3, :]
  __m256i src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
  __m256i src_reg_m1001, src_reg_1223;

  // Result after multiply and add
  __m256i res_reg;

  __m128i kernel_reg_128;                            // Kernel
  __m256i kernel_reg, kernel_reg_23, kernel_reg_45;  // Segments of kernel used

  const __m256i reg_round =
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);

  // Row -1 to row 0
  src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr,
                                   (const __m128i *)(src_ptr + src_stride));

  // Row 0 to row 1
  src_reg_1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);

  // First three rows
  src_reg_m1001 = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));

    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
                                         _mm256_castsi256_si128(src_reg_2), 1);

    src_reg_3 = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));

    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
                                         _mm256_castsi256_si128(src_reg_3), 1);

    // Last three rows
    src_reg_1223 = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);

    // Output
    res_reg = mm256_madd_add_epi32(&src_reg_m1001, &src_reg_1223,
                                   &kernel_reg_23, &kernel_reg_45);

    // Round the words
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);

    // Combine to get the result
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
    res_reg = _mm256_min_epi16(res_reg, reg_max);

    // Save the result
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
                        &res_reg);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m1001 = src_reg_1223;
    src_reg_1 = src_reg_3;
  }
}
1337
1338
// 4-tap vertical filter for an 8-pixel-wide block, two output rows per
// iteration. Both the low and high interleaved halves of the row pairs are
// carried across iterations in registers.
// NOTE(review): assumes height is even — there is no single-row tail.
static void vpx_highbd_filter_block1d8_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load two rows of pixels and rearrange them into the form
  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
  // so that we can call multiply and add with the kernel partial output. Then
  // we can call add with another row to get the output.

  // Register for source s[-1:3, :]
  __m256i src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
  __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi;

  __m128i kernel_reg_128;                            // Kernel
  __m256i kernel_reg, kernel_reg_23, kernel_reg_45;  // Segments of kernel

  // Result after multiply and add
  __m256i res_reg, res_reg_lo, res_reg_hi;

  const __m256i reg_round =
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);

  // Row -1 to row 0
  src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
                                   (const __m128i *)(src_ptr + src_stride));

  // Row 0 to row 1
  src_reg_1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);

  // First three rows
  src_reg_m1001_lo = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
  src_reg_m1001_hi = _mm256_unpackhi_epi16(src_reg_m10, src_reg_01);

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)));

    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
                                         _mm256_castsi256_si128(src_reg_2), 1);

    src_reg_3 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)));

    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
                                         _mm256_castsi256_si128(src_reg_3), 1);

    // Last three rows
    src_reg_1223_lo = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
    src_reg_1223_hi = _mm256_unpackhi_epi16(src_reg_12, src_reg_23);

    // Output from first half
    res_reg_lo = mm256_madd_add_epi32(&src_reg_m1001_lo, &src_reg_1223_lo,
                                      &kernel_reg_23, &kernel_reg_45);

    // Output from second half
    res_reg_hi = mm256_madd_add_epi32(&src_reg_m1001_hi, &src_reg_1223_hi,
                                      &kernel_reg_23, &kernel_reg_45);

    // Round the words
    res_reg_lo =
        mm256_round_epi32(&res_reg_lo, &reg_round, CONV8_ROUNDING_BITS);
    res_reg_hi =
        mm256_round_epi32(&res_reg_hi, &reg_round, CONV8_ROUNDING_BITS);

    // Combine to get the result
    res_reg = _mm256_packus_epi32(res_reg_lo, res_reg_hi);
    res_reg = _mm256_min_epi16(res_reg, reg_max);

    // Save the result
    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
                       &res_reg);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m1001_lo = src_reg_1223_lo;
    src_reg_m1001_hi = src_reg_1223_hi;
    src_reg_1 = src_reg_3;
  }
}
1432
1433
// 4-tap vertical filter for a 16-pixel-wide block, implemented as two
// independent 8-wide passes over the left and right halves.
static void vpx_highbd_filter_block1d16_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  vpx_highbd_filter_block1d8_v4_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
                                     height, kernel, bd);
  vpx_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_stride, dst_ptr + 8,
                                     dst_stride, height, kernel, bd);
}
1441
1442
// ---------------------------------------------------------------------------
// Kernel table plumbing for the high-bitdepth AVX2 convolve entry points.
// The 4-wide variants have no AVX2 implementation, so they are aliased to the
// SSE2 assembly kernels declared below; the 4-tap averaging variants are
// aliased to the 8-tap ones (see comment further down).
// ---------------------------------------------------------------------------

// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;

// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;

// Route the 4-wide AVX2 names to the SSE2 assembly implementations.
#define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2
#define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2
#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2
#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2

// Use the [vh]8 version because there is no [vh]4 implementation.
#define vpx_highbd_filter_block1d16_v4_avg_avx2 \
  vpx_highbd_filter_block1d16_v8_avg_avx2
#define vpx_highbd_filter_block1d16_h4_avg_avx2 \
  vpx_highbd_filter_block1d16_h8_avg_avx2
#define vpx_highbd_filter_block1d8_v4_avg_avx2 \
  vpx_highbd_filter_block1d8_v8_avg_avx2
#define vpx_highbd_filter_block1d8_h4_avg_avx2 \
  vpx_highbd_filter_block1d8_h8_avg_avx2
#define vpx_highbd_filter_block1d4_v4_avg_avx2 \
  vpx_highbd_filter_block1d4_v8_avg_avx2
#define vpx_highbd_filter_block1d4_h4_avg_avx2 \
  vpx_highbd_filter_block1d4_h8_avg_avx2

// Instantiate the non-averaging convolve entry points from the per-width
// kernels named above (macros defined in vpx_dsp/x86/convolve.h; presumably
// they expand to the vpx_highbd_convolve8_* wrappers — confirm there).
HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0)
HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
                 src - src_stride * (num_taps / 2 - 1), , avx2, 0)
HIGH_FUN_CONV_2D(, avx2, 0)

// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;

// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;

// Route the 4-wide averaging AVX2 names to the SSE2 assembly implementations
// (also the targets of the [vh]4_avg aliases above, via macro chaining).
#define vpx_highbd_filter_block1d4_h8_avg_avx2 \
  vpx_highbd_filter_block1d4_h8_avg_sse2
#define vpx_highbd_filter_block1d4_h2_avg_avx2 \
  vpx_highbd_filter_block1d4_h2_avg_sse2
#define vpx_highbd_filter_block1d4_v8_avg_avx2 \
  vpx_highbd_filter_block1d4_v8_avg_sse2
#define vpx_highbd_filter_block1d4_v2_avg_avx2 \
  vpx_highbd_filter_block1d4_v2_avg_sse2

// Instantiate the averaging convolve entry points.
HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1)
HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
                 src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1)
HIGH_FUN_CONV_2D(avg_, avx2, 1)

#undef HIGHBD_FUNC
#endif  // HAVE_X86_ASM