Coverage Report

Created: 2026-05-23 07:06

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
Line
Count
Source
1
/*
2
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5
 *  that can be found in the LICENSE file in the root of the source
6
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS.  All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
9
 */
10
11
#include <immintrin.h>
12
#include "./vpx_dsp_rtcd.h"
13
#include "vpx_dsp/x86/convolve.h"
14
#include "vpx_dsp/x86/convolve_avx2.h"
15
16
// -----------------------------------------------------------------------------
17
// Copy and average
18
19
void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
20
                                   uint16_t *dst, ptrdiff_t dst_stride,
21
                                   const InterpKernel *filter, int x0_q4,
22
                                   int x_step_q4, int y0_q4, int y_step_q4,
23
1.74M
                                   int w, int h, int bd) {
24
1.74M
  (void)filter;
25
1.74M
  (void)x0_q4;
26
1.74M
  (void)x_step_q4;
27
1.74M
  (void)y0_q4;
28
1.74M
  (void)y_step_q4;
29
1.74M
  (void)bd;
30
31
1.74M
  assert(w % 4 == 0);
32
1.74M
  if (w > 32) {  // w = 64
33
7.59M
    do {
34
7.59M
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
35
7.59M
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
36
7.59M
      const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
37
7.59M
      const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
38
7.59M
      src += src_stride;
39
7.59M
      _mm256_storeu_si256((__m256i *)dst, p0);
40
7.59M
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
41
7.59M
      _mm256_storeu_si256((__m256i *)(dst + 32), p2);
42
7.59M
      _mm256_storeu_si256((__m256i *)(dst + 48), p3);
43
7.59M
      dst += dst_stride;
44
7.59M
      h--;
45
7.59M
    } while (h > 0);
46
1.58M
  } else if (w > 16) {  // w = 32
47
9.80M
    do {
48
9.80M
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
49
9.80M
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
50
9.80M
      src += src_stride;
51
9.80M
      _mm256_storeu_si256((__m256i *)dst, p0);
52
9.80M
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
53
9.80M
      dst += dst_stride;
54
9.80M
      h--;
55
9.80M
    } while (h > 0);
56
1.19M
  } else if (w > 8) {  // w = 16
57
237k
    __m256i p0, p1;
58
2.00M
    do {
59
2.00M
      p0 = _mm256_loadu_si256((const __m256i *)src);
60
2.00M
      src += src_stride;
61
2.00M
      p1 = _mm256_loadu_si256((const __m256i *)src);
62
2.00M
      src += src_stride;
63
64
2.00M
      _mm256_storeu_si256((__m256i *)dst, p0);
65
2.00M
      dst += dst_stride;
66
2.00M
      _mm256_storeu_si256((__m256i *)dst, p1);
67
2.00M
      dst += dst_stride;
68
2.00M
      h -= 2;
69
2.00M
    } while (h > 0);
70
959k
  } else if (w > 4) {  // w = 8
71
476k
    __m128i p0, p1;
72
1.96M
    do {
73
1.96M
      p0 = _mm_loadu_si128((const __m128i *)src);
74
1.96M
      src += src_stride;
75
1.96M
      p1 = _mm_loadu_si128((const __m128i *)src);
76
1.96M
      src += src_stride;
77
78
1.96M
      _mm_storeu_si128((__m128i *)dst, p0);
79
1.96M
      dst += dst_stride;
80
1.96M
      _mm_storeu_si128((__m128i *)dst, p1);
81
1.96M
      dst += dst_stride;
82
1.96M
      h -= 2;
83
1.96M
    } while (h > 0);
84
482k
  } else {  // w = 4
85
482k
    __m128i p0, p1;
86
1.03M
    do {
87
1.03M
      p0 = _mm_loadl_epi64((const __m128i *)src);
88
1.03M
      src += src_stride;
89
1.03M
      p1 = _mm_loadl_epi64((const __m128i *)src);
90
1.03M
      src += src_stride;
91
92
1.03M
      _mm_storel_epi64((__m128i *)dst, p0);
93
1.03M
      dst += dst_stride;
94
1.03M
      _mm_storel_epi64((__m128i *)dst, p1);
95
1.03M
      dst += dst_stride;
96
1.03M
      h -= 2;
97
1.03M
    } while (h > 0);
98
482k
  }
99
1.74M
}
100
101
void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
102
                                  uint16_t *dst, ptrdiff_t dst_stride,
103
                                  const InterpKernel *filter, int x0_q4,
104
                                  int x_step_q4, int y0_q4, int y_step_q4,
105
490k
                                  int w, int h, int bd) {
106
490k
  (void)filter;
107
490k
  (void)x0_q4;
108
490k
  (void)x_step_q4;
109
490k
  (void)y0_q4;
110
490k
  (void)y_step_q4;
111
490k
  (void)bd;
112
113
490k
  assert(w % 4 == 0);
114
490k
  if (w > 32) {  // w = 64
115
6.67k
    __m256i p0, p1, p2, p3, u0, u1, u2, u3;
116
411k
    do {
117
411k
      p0 = _mm256_loadu_si256((const __m256i *)src);
118
411k
      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
119
411k
      p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
120
411k
      p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
121
411k
      src += src_stride;
122
411k
      u0 = _mm256_loadu_si256((const __m256i *)dst);
123
411k
      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
124
411k
      u2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
125
411k
      u3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
126
411k
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
127
411k
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
128
411k
      _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2));
129
411k
      _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3));
130
411k
      dst += dst_stride;
131
411k
      h--;
132
411k
    } while (h > 0);
133
483k
  } else if (w > 16) {  // w = 32
134
40.9k
    __m256i p0, p1, u0, u1;
135
1.38M
    do {
136
1.38M
      p0 = _mm256_loadu_si256((const __m256i *)src);
137
1.38M
      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
138
1.38M
      src += src_stride;
139
1.38M
      u0 = _mm256_loadu_si256((const __m256i *)dst);
140
1.38M
      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
141
1.38M
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
142
1.38M
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
143
1.38M
      dst += dst_stride;
144
1.38M
      h--;
145
1.38M
    } while (h > 0);
146
442k
  } else if (w > 8) {  // w = 16
147
88.0k
    __m256i p0, p1, u0, u1;
148
775k
    do {
149
775k
      p0 = _mm256_loadu_si256((const __m256i *)src);
150
775k
      p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride));
151
775k
      src += src_stride << 1;
152
775k
      u0 = _mm256_loadu_si256((const __m256i *)dst);
153
775k
      u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride));
154
155
775k
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
156
775k
      _mm256_storeu_si256((__m256i *)(dst + dst_stride),
157
775k
                          _mm256_avg_epu16(p1, u1));
158
775k
      dst += dst_stride << 1;
159
775k
      h -= 2;
160
775k
    } while (h > 0);
161
354k
  } else if (w > 4) {  // w = 8
162
171k
    __m128i p0, p1, u0, u1;
163
691k
    do {
164
691k
      p0 = _mm_loadu_si128((const __m128i *)src);
165
691k
      p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
166
691k
      src += src_stride << 1;
167
691k
      u0 = _mm_loadu_si128((const __m128i *)dst);
168
691k
      u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));
169
170
691k
      _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
171
691k
      _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
172
691k
      dst += dst_stride << 1;
173
691k
      h -= 2;
174
691k
    } while (h > 0);
175
183k
  } else {  // w = 4
176
183k
    __m128i p0, p1, u0, u1;
177
402k
    do {
178
402k
      p0 = _mm_loadl_epi64((const __m128i *)src);
179
402k
      p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
180
402k
      src += src_stride << 1;
181
402k
      u0 = _mm_loadl_epi64((const __m128i *)dst);
182
402k
      u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));
183
184
402k
      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0));
185
402k
      _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1));
186
402k
      dst += dst_stride << 1;
187
402k
      h -= 2;
188
402k
    } while (h > 0);
189
183k
  }
190
490k
}
191
192
#if HAVE_X86_ASM
193
// -----------------------------------------------------------------------------
194
// Horizontal and vertical filtering
195
196
// Byte-shuffle masks for _mm256_shuffle_epi8. Each 128-bit half holds the
// same 16 byte indices; every group of four bytes selects two adjacent 16-bit
// samples, and consecutive groups advance by one sample.

// Sample pairs (0,1) (1,2) (2,3) (3,4) of each 128-bit lane.
static const uint8_t signal_pattern_0[32] = {
  0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9,
  0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
};

// Sample pairs (2,3) (3,4) (4,5) (5,6) of each 128-bit lane.
static const uint8_t signal_pattern_1[32] = {
  4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13,
  4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
};

// Sample pairs (3,4) (4,5) (5,6) (6,7) of each 128-bit lane.
static const uint8_t signal_pattern_2[32] = {
  6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15,
  6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
};

// Index vector for _mm256_permutevar8x32_epi32: copies 32-bit elements 2..5
// of the source (16-bit samples 4..11) into both 128-bit halves.
static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };

// Filter sums are scaled down by 2^CONV8_ROUNDING_BITS; CONV8_ROUNDING_NUM is
// the half-unit added before the shift so the result rounds to nearest.
#define CONV8_ROUNDING_BITS (7)
#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1))
214
215
// -----------------------------------------------------------------------------
216
// Horizontal Filtering
217
218
62.5M
static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
219
62.5M
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
220
62.5M
  const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
221
62.5M
  const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
222
62.5M
  const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);
223
224
62.5M
  p[0] = _mm256_shuffle_epi8(*s, sf0);  // x0x6
225
62.5M
  p[1] = _mm256_shuffle_epi8(*s, sf1);  // x1x7
226
62.5M
  p[2] = _mm256_shuffle_epi8(c, sf0);   // x2x4
227
62.5M
  p[3] = _mm256_shuffle_epi8(c, sf1);   // x3x5
228
62.5M
}
229
230
// Note:
231
//  Shared by 8x2 and 16x1 block
232
// Builds the eight signal vectors for two 8-output filter passes from two
// 16-sample inputs. s0/s1 are either two rows (8x2 block) or the same row at
// offsets 0 and 8 (16x1 block).
//
// x[0..3] feed outputs 0..3 of s0 (low lane) and of s1 (high lane);
// x[4..7] feed outputs 4..7 of s0 (low lane) and of s1 (high lane).
static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
                                  __m256i *x /*x[8]*/) {
  __m256i packed[8];
  int i;

  pack_pixels(s0, &packed[0]);
  pack_pixels(s1, &packed[4]);

  // Low lanes of the s0 and s1 packings, side by side.
  for (i = 0; i < 4; ++i) {
    x[i] = _mm256_permute2x128_si256(packed[i], packed[i + 4], 0x20);
  }
  // Outputs 4..7 start four samples later, so their first two tap pairs are
  // the vectors already built for tap pairs 2 and 3.
  x[4] = x[2];
  x[5] = x[3];
  // The last two tap pairs come from the high lanes (samples 8 and up).
  x[6] = _mm256_permute2x128_si256(packed[0], packed[4], 0x31);
  x[7] = _mm256_permute2x128_si256(packed[1], packed[5], 0x31);
}
246
247
784k
// Loads 16 samples from src and builds the four signal vectors for a single
// row of eight outputs: outputs 0..3 in the low lane, outputs 4..7 in the
// high lane. Only x[0..3] are written.
static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
  __m256i packed[4];
  const __m256i row = _mm256_loadu_si256((const __m256i *)src);

  pack_pixels(&row, packed);
  // Selector 0x30 keeps the low lane of the first operand and the high lane
  // of the second.
  x[0] = _mm256_permute2x128_si256(packed[0], packed[2], 0x30);
  x[1] = _mm256_permute2x128_si256(packed[1], packed[3], 0x30);
  x[2] = _mm256_permute2x128_si256(packed[2], packed[0], 0x30);
  x[3] = _mm256_permute2x128_si256(packed[3], packed[1], 0x30);
}
257
258
// Loads 16 samples from each of two consecutive rows (src and src + stride)
// and packs them into x[0..7] for two 8-output filter passes.
static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
                                   __m256i *x) {
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(src + stride));
  pack_16_pixels(&row0, &row1, x);
}
265
266
24.7M
// Loads two overlapping 16-sample windows of one row (at src and src + 8)
// and packs them into x[0..7] for two 8-output filter passes.
static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
  const __m256i left = _mm256_loadu_si256((const __m256i *)src);
  const __m256i right = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_pixels(&left, &right, x);
}
272
273
// Note:
274
//  Shared by horizontal and vertical filtering
275
3.63M
// Loads the eight 16-bit taps at filter and expands them into four vectors:
// f[k] holds the tap pair (2k, 2k+1) repeated in every 32-bit element.
static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  // Same eight taps in both 128-bit lanes.
  const __m256i taps2 =
      _mm256_insertf128_si256(_mm256_castsi128_si256(taps), taps, 1);
  // Each mask picks four consecutive bytes (one tap pair) of its lane.
  const __m256i pick_01 = _mm256_set1_epi32(0x03020100);
  const __m256i pick_23 = _mm256_set1_epi32(0x07060504);
  const __m256i pick_45 = _mm256_set1_epi32(0x0b0a0908);
  const __m256i pick_67 = _mm256_set1_epi32(0x0f0e0d0c);

  f[0] = _mm256_shuffle_epi8(taps2, pick_01);
  f[1] = _mm256_shuffle_epi8(taps2, pick_23);
  f[2] = _mm256_shuffle_epi8(taps2, pick_45);
  f[3] = _mm256_shuffle_epi8(taps2, pick_67);
}
287
288
static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
289
                                     const __m256i *fil /*fil[4]*/,
290
121M
                                     __m256i *y) {
291
121M
  __m256i a, a0, a1;
292
293
121M
  a0 = _mm256_madd_epi16(fil[0], sig[0]);
294
121M
  a1 = _mm256_madd_epi16(fil[3], sig[3]);
295
121M
  a = _mm256_add_epi32(a0, a1);
296
297
121M
  a0 = _mm256_madd_epi16(fil[1], sig[1]);
298
121M
  a1 = _mm256_madd_epi16(fil[2], sig[2]);
299
300
121M
  {
301
121M
    const __m256i min = _mm256_min_epi32(a0, a1);
302
121M
    a = _mm256_add_epi32(a, min);
303
121M
  }
304
121M
  {
305
121M
    const __m256i max = _mm256_max_epi32(a0, a1);
306
121M
    a = _mm256_add_epi32(a, max);
307
121M
  }
308
121M
  {
309
121M
    const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
310
121M
    a = _mm256_add_epi32(a, rounding);
311
121M
    *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
312
121M
  }
313
121M
}
314
315
// Packs the eight 32-bit results in *y to 16 bits with unsigned saturation,
// limits them with a signed 16-bit min against the low lane of *mask, and
// writes eight samples to dst.
static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,
                                    uint16_t *dst) {
  const __m128i lo = _mm256_castsi256_si128(*y);
  const __m128i hi = _mm256_extractf128_si256(*y, 1);
  const __m128i packed = _mm_packus_epi32(lo, hi);
  const __m128i clamped =
      _mm_min_epi16(packed, _mm256_castsi256_si128(*mask));
  _mm_storeu_si128((__m128i *)dst, clamped);
}
323
324
// Packs *y0 and *y1 to 16 bits with unsigned saturation and limits them with
// a signed 16-bit min against *mask. The low lane of the result is written to
// dst and the high lane to dst + pitch, eight samples each.
static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
                                    const __m256i *mask, uint16_t *dst,
                                    ptrdiff_t pitch) {
  const __m256i packed = _mm256_packus_epi32(*y0, *y1);
  const __m256i clamped = _mm256_min_epi16(packed, *mask);
  uint16_t *const second_row = dst + pitch;

  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(clamped));
  _mm_storeu_si128((__m128i *)second_row,
                   _mm256_extractf128_si256(clamped, 1));
}
332
333
// Packs *y0 and *y1 to 16 bits with unsigned saturation, limits them with a
// signed 16-bit min against *mask, and writes all 16 samples to dst.
static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst) {
  const __m256i packed = _mm256_packus_epi32(*y0, *y1);
  _mm256_storeu_si256((__m256i *)dst, _mm256_min_epi16(packed, *mask));
}
339
340
// Horizontal 8-tap filter over a block that is 8 samples wide and `height`
// rows tall.
//
//   src_ptr/src_pitch: source block; taps start 3 samples to the left of
//                      src_ptr, and each row is read as 16 samples from there.
//   dst_ptr/dst_pitch: destination block, 8 samples written per row.
//   filter:            eight 16-bit taps.
//   bd:                bit depth; results are limited to (1 << bd) - 1.
//
// Rows are handled in pairs, with a single-row tail for odd heights. The loop
// is pre-tested so that a height of 0 or 1 neither touches rows outside the
// block nor wraps the unsigned row counter.
static void vpx_highbd_filter_block1d8_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  src_ptr -= 3;
  while (height > 1) {
    pack_8x2_pixels(src_ptr, src_pitch, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch * 2;
    dst_ptr += dst_pitch * 2;
  }

  if (height > 0) {
    // Odd height: one remaining row.
    pack_8x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}
366
367
// Horizontal 8-tap filter over a block that is 16 samples wide and `height`
// rows tall, one row per iteration. Taps start 3 samples to the left of
// src_ptr; results are limited to (1 << bd) - 1. The loop body runs before
// the counter is tested, so height must be at least 1.
static void vpx_highbd_filter_block1d16_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i pixel_max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i taps[4];
  __m256i sig[8];
  __m256i left_half, right_half;

  pack_filters(filter, taps);

  src_ptr -= 3;
  do {
    pack_16x1_pixels(src_ptr, sig);
    filter_8x1_pixels(&sig[0], taps, &left_half);
    filter_8x1_pixels(&sig[4], taps, &right_half);
    store_16x1_pixels(&left_half, &right_half, &pixel_max, dst_ptr);
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    --height;
  } while (height > 0);
}
387
388
// -----------------------------------------------------------------------------
389
// 2-tap horizontal filtering
390
391
1.30M
// Loads the eight 16-bit taps at filter and writes to f[0] a vector with
// taps 3 and 4 (bytes 6..9 of the kernel) repeated in every 32-bit element.
static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  const __m256i taps2 =
      _mm256_insertf128_si256(_mm256_castsi128_si256(taps), taps, 1);
  const __m256i pick_34 = _mm256_set1_epi32(0x09080706);
  f[0] = _mm256_shuffle_epi8(taps2, pick_34);
}
397
398
// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels()
399
// the difference is s0/s1 specifies first and second rows or,
400
// first 16 samples and 8-sample shifted 16 samples
401
static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
402
14.2M
                                     __m256i *sig) {
403
14.2M
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
404
14.2M
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
405
14.2M
  __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
406
14.2M
  __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
407
14.2M
  __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
408
14.2M
  __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
409
14.2M
  r0 = _mm256_shuffle_epi8(r0, sf2);
410
14.2M
  r1 = _mm256_shuffle_epi8(r1, sf2);
411
14.2M
  sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
412
14.2M
  sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
413
14.2M
}
414
415
// Loads 16 samples from each of two consecutive rows (src and src + pitch)
// and packs them into sig[0..1] for a 2-tap pass.
static INLINE void pack_8x2_2t_pixels(const uint16_t *src,
                                      const ptrdiff_t pitch, __m256i *sig) {
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  pack_16_2t_pixels(&row0, &row1, sig);
}
421
422
// Loads two overlapping 16-sample windows of one row (at src and src + 8)
// and packs them into sig[0..1] for a 2-tap pass.
static INLINE void pack_16x1_2t_pixels(const uint16_t *src,
                                       __m256i *sig /*sig[2]*/) {
  const __m256i left = _mm256_loadu_si256((const __m256i *)src);
  const __m256i right = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_2t_pixels(&left, &right, sig);
}
428
429
static INLINE void pack_8x1_2t_pixels(const uint16_t *src,
430
264k
                                      __m256i *sig /*sig[2]*/) {
431
264k
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
432
264k
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
433
264k
  __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
434
264k
  __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
435
264k
  r0 = _mm256_permutevar8x32_epi32(r0, idx);
436
264k
  r0 = _mm256_shuffle_epi8(r0, sf2);
437
264k
  sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
438
264k
}
439
440
// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels()
441
static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
442
27.2M
                                       __m256i *y0, __m256i *y1) {
443
27.2M
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
444
27.2M
  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
445
27.2M
  __m256i x1 = _mm256_madd_epi16(sig[1], *f);
446
27.2M
  x0 = _mm256_add_epi32(x0, rounding);
447
27.2M
  x1 = _mm256_add_epi32(x1, rounding);
448
27.2M
  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
449
27.2M
  *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
450
27.2M
}
451
452
static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
453
264k
                                        __m256i *y0) {
454
264k
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
455
264k
  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
456
264k
  x0 = _mm256_add_epi32(x0, rounding);
457
264k
  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
458
264k
}
459
460
// Horizontal 2-tap filter over a block that is 8 samples wide and `height`
// rows tall.
//
//   src_ptr/src_pitch: source block; each row is read as 16 samples starting
//                      3 samples to the left of src_ptr, and the sample pairs
//                      used begin at offset 3 of that window.
//   dst_ptr/dst_pitch: destination block, 8 samples written per row.
//   filter:            eight 16-bit taps, of which taps 3 and 4 are applied.
//   bd:                bit depth; results are limited to (1 << bd) - 1.
//
// Rows are handled in pairs, with a single-row tail for odd heights. The loop
// is pre-tested so that a height of 0 or 1 neither touches rows outside the
// block nor wraps the unsigned row counter.
static void vpx_highbd_filter_block1d8_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  src_ptr -= 3;
  while (height > 1) {
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch * 2;
    dst_ptr += dst_pitch * 2;
  }

  if (height > 0) {
    // Odd height: one remaining row.
    pack_8x1_2t_pixels(src_ptr, signal);
    filter_8x1_2t_pixels(signal, &ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}
485
486
// Horizontal 2-tap filter over a block that is 16 samples wide and `height`
// rows tall, one row per iteration. Each row is read starting 3 samples to
// the left of src_ptr; results are limited to (1 << bd) - 1. The loop body
// runs before the counter is tested, so height must be at least 1.
static void vpx_highbd_filter_block1d16_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i pixel_max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i tap_pair;
  __m256i sig[2];
  __m256i left_half, right_half;

  pack_2t_filter(filter, &tap_pair);

  src_ptr -= 3;
  do {
    pack_16x1_2t_pixels(src_ptr, sig);
    filter_16_2t_pixels(sig, &tap_pair, &left_half, &right_half);
    store_16x1_pixels(&left_half, &right_half, &pixel_max, dst_ptr);
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    --height;
  } while (height > 0);
}
505
506
// -----------------------------------------------------------------------------
507
// Vertical Filtering
508
509
1.18M
// Primes the vertical 8-tap state for an 8-wide column from rows 0..6.
//
// Row i is placed in the low lane and row i + 1 in the high lane, so one
// vector covers two consecutive output rows. Adjacent row pairs are then
// interleaved sample by sample:
//   sig[0..2]: low halves  (columns 0..3) for row pairs (0,1) (2,3) (4,5)
//   sig[4..6]: high halves (columns 4..7) for the same row pairs
//   sig[8]:    row 6, kept for the next pack_8x9_pixels() call
// sig[3] and sig[7] are not written here.
static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i row[7];
  int i;

  for (i = 0; i < 7; ++i) {
    row[i] = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src + i * pitch)));
  }

  // Put row i + 1 in the upper lane of row i (row 6 keeps only its low lane).
  for (i = 0; i < 6; ++i) {
    row[i] =
        _mm256_inserti128_si256(row[i], _mm256_castsi256_si128(row[i + 1]), 1);
  }

  for (i = 0; i < 3; ++i) {
    sig[i] = _mm256_unpacklo_epi16(row[2 * i], row[2 * i + 1]);
    sig[i + 4] = _mm256_unpackhi_epi16(row[2 * i], row[2 * i + 1]);
  }
  sig[8] = row[6];
}
539
540
// Completes the vertical 8-tap state for the current pair of output rows.
// Reads rows 7 and 8 relative to src, combines them with the row saved in
// sig[8], writes the last tap-pair vectors sig[3] (columns 0..3) and sig[7]
// (columns 4..7), and saves row 8 in sig[8] for the next call.
static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                                   __m256i *sig) {
  const __m256i row7 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
  const __m256i row8 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
  // Low lane: saved row and row 7. High lane: row 7 and row 8.
  const __m256i upper =
      _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(row7), 1);
  const __m256i lower =
      _mm256_inserti128_si256(row7, _mm256_castsi256_si128(row8), 1);

  sig[3] = _mm256_unpacklo_epi16(upper, lower);
  sig[7] = _mm256_unpackhi_epi16(upper, lower);
  sig[8] = row8;
}
554
555
// Runs the 8-tap filter on both halves of the vertical state: sig[0..3]
// produce *y0 and sig[4..7] produce *y1.
static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
                                     __m256i *y0, __m256i *y1) {
  const __m256i *const low_half = sig;
  const __m256i *const high_half = sig + 4;
  filter_8x1_pixels(low_half, f, y0);
  filter_8x1_pixels(high_half, f, y1);
}
560
561
29.5M
// Slides the vertical filter state down by one slot: sig[0..2] take the
// values of sig[1..3] and sig[4..6] take the values of sig[5..7]. sig[3] and
// sig[7] are left for the next pack call to overwrite.
static INLINE void update_pixels(__m256i *sig) {
  sig[0] = sig[1];
  sig[4] = sig[5];
  sig[1] = sig[2];
  sig[5] = sig[6];
  sig[2] = sig[3];
  sig[6] = sig[7];
}
568
569
// Vertical 8-tap filter over a block that is 8 samples wide and `height`
// rows tall, two output rows per iteration. The first output row is computed
// from source rows 0..7 starting at src_ptr; results are limited to
// (1 << bd) - 1.
//
// The counter is unsigned and decremented by 2 after each pass, so height
// must be even and non-zero; any other value wraps the counter.
static void vpx_highbd_filter_block1d8_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i pixel_max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i taps[4];
  __m256i state[9];
  __m256i cols_lo, cols_hi;

  pack_filters(filter, taps);
  pack_8x9_init(src_ptr, src_pitch, state);

  do {
    pack_8x9_pixels(src_ptr, src_pitch, state);

    filter_8x9_pixels(state, taps, &cols_lo, &cols_hi);
    store_8x2_pixels(&cols_lo, &cols_hi, &pixel_max, dst_ptr, dst_pitch);
    update_pixels(state);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}
592
593
797k
// Primes the vertical 8-tap state for a 16-wide column from rows 0..6.
//
// For each step k = 0..2, rows 2k, 2k+1 and 2k+2 are regrouped so that the
// low lane pairs rows (2k, 2k+1) and the high lane pairs rows (2k+1, 2k+2),
// then interleaved sample by sample:
//   sig[k]      columns 0..3      sig[k + 4]   columns 4..7
//   sig[k + 8]  columns 8..11     sig[k + 12]  columns 12..15
// sig[16] keeps row 6 for the next pack_16x9_pixels() call. Slots 3, 7, 11
// and 15 are not written here.
static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i row[7];
  int k;

  // load 0-6 rows
  for (k = 0; k < 7; ++k) {
    row[k] = _mm256_loadu_si256((const __m256i *)(src + k * pitch));
  }

  for (k = 0; k < 3; ++k) {
    const __m256i a = row[2 * k];
    const __m256i b = row[2 * k + 1];
    const __m256i c = row[2 * k + 2];
    const __m256i ab_lo = _mm256_permute2x128_si256(a, b, 0x20);
    const __m256i ab_hi = _mm256_permute2x128_si256(a, b, 0x31);
    const __m256i bc_lo = _mm256_permute2x128_si256(b, c, 0x20);
    const __m256i bc_hi = _mm256_permute2x128_si256(b, c, 0x31);

    sig[k] = _mm256_unpacklo_epi16(ab_lo, bc_lo);
    sig[k + 4] = _mm256_unpackhi_epi16(ab_lo, bc_lo);
    sig[k + 8] = _mm256_unpacklo_epi16(ab_hi, bc_hi);
    sig[k + 12] = _mm256_unpackhi_epi16(ab_hi, bc_hi);
  }

  sig[16] = row[6];
}
642
643
// Completes the 16-wide vertical 8-tap state for the current pair of output
// rows. Reads rows 7 and 8 relative to src, combines them with the row saved
// in sig[16], writes the last tap-pair vectors sig[3], sig[7], sig[11] and
// sig[15], and saves row 8 in sig[16] for the next call.
static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                             __m256i *sig) {
  const __m256i row7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
  const __m256i row8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));

  // Left halves (columns 0..7) and right halves (columns 8..15) of the row
  // pairs (saved row, row 7) and (row 7, row 8).
  const __m256i prev7_left = _mm256_permute2x128_si256(sig[16], row7, 0x20);
  const __m256i prev7_right = _mm256_permute2x128_si256(sig[16], row7, 0x31);
  const __m256i r78_left = _mm256_permute2x128_si256(row7, row8, 0x20);
  const __m256i r78_right = _mm256_permute2x128_si256(row7, row8, 0x31);

  sig[3] = _mm256_unpacklo_epi16(prev7_left, r78_left);
  sig[7] = _mm256_unpackhi_epi16(prev7_left, r78_left);

  sig[11] = _mm256_unpacklo_epi16(prev7_right, r78_right);
  sig[15] = _mm256_unpackhi_epi16(prev7_right, r78_right);

  sig[16] = row8;
}
665
666
// Runs the 8-tap filter on the four column groups of the 16-wide vertical
// state (sig[0..3], sig[4..7], sig[8..11], sig[12..15]), packs the 32-bit
// results to 16 bits with unsigned saturation, and regroups them so that *y0
// holds the 16 samples of the first output row and *y1 those of the second.
static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
                                      __m256i *y0, __m256i *y1) {
  __m256i cols_0_3, cols_4_7, cols_8_11, cols_12_15;
  __m256i left, right;

  filter_8x1_pixels(sig + 0, f, &cols_0_3);
  filter_8x1_pixels(sig + 4, f, &cols_4_7);
  filter_8x1_pixels(sig + 8, f, &cols_8_11);
  filter_8x1_pixels(sig + 12, f, &cols_12_15);

  // Each packed vector: low lane = first row, high lane = second row.
  left = _mm256_packus_epi32(cols_0_3, cols_4_7);
  right = _mm256_packus_epi32(cols_8_11, cols_12_15);
  *y0 = _mm256_permute2x128_si256(left, right, 0x20);
  *y1 = _mm256_permute2x128_si256(left, right, 0x31);
}
681
682
// Limits the already-packed 16-bit rows *y0 and *y1 with a signed 16-bit min
// against *mask and writes them to dst and dst + pitch, 16 samples each.
static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst,
                                     ptrdiff_t pitch) {
  const __m256i first = _mm256_min_epi16(*y0, *mask);
  _mm256_storeu_si256((__m256i *)dst, first);
  {
    const __m256i second = _mm256_min_epi16(*y1, *mask);
    _mm256_storeu_si256((__m256i *)(dst + pitch), second);
  }
}
690
691
12.5M
// Slides both halves of the 16-wide vertical state down by one slot: the
// left-column group at sig[0..7] and the right-column group at sig[8..15].
static void update_16x9_pixels(__m256i *sig) {
  __m256i *const left_cols = sig;
  __m256i *const right_cols = sig + 8;
  update_pixels(left_cols);
  update_pixels(right_cols);
}
695
696
// Vertical 8-tap filter over a block that is 16 samples wide and `height`
// rows tall, two output rows per iteration. The first output row is computed
// from source rows 0..7 starting at src_ptr; results are limited to
// (1 << bd) - 1.
//
// The counter is unsigned and decremented by 2 after each pass, so height
// must be even and non-zero; any other value wraps the counter.
static void vpx_highbd_filter_block1d16_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i pixel_max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i taps[4];
  __m256i state[17];
  __m256i row_a, row_b;

  pack_filters(filter, taps);
  pack_16x9_init(src_ptr, src_pitch, state);

  do {
    pack_16x9_pixels(src_ptr, src_pitch, state);
    filter_16x9_pixels(state, taps, &row_a, &row_b);
    store_16x2_pixels(&row_a, &row_b, &pixel_max, dst_ptr, dst_pitch);
    update_16x9_pixels(state);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}
718
719
// -----------------------------------------------------------------------------
720
// 2-tap vertical filtering
721
722
513k
static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
723
513k
  sig[2] = _mm256_loadu_si256((const __m256i *)src);
724
513k
}
725
726
static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
727
12.9M
                                       __m256i *sig) {
728
  // load the next row
729
12.9M
  const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
730
12.9M
  sig[0] = _mm256_unpacklo_epi16(sig[2], u);
731
12.9M
  sig[1] = _mm256_unpackhi_epi16(sig[2], u);
732
12.9M
  sig[2] = u;
733
12.9M
}
734
735
static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
736
12.9M
                                         __m256i *y0, __m256i *y1) {
737
12.9M
  filter_16_2t_pixels(sig, f, y0, y1);
738
12.9M
}
739
740
static void vpx_highbd_filter_block1d16_v2_avx2(
741
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
742
336k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
743
336k
  __m256i signal[3], res0, res1;
744
336k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
745
336k
  __m256i ff;
746
747
336k
  pack_2t_filter(filter, &ff);
748
336k
  pack_16x2_init(src_ptr, signal);
749
750
8.49M
  do {
751
8.49M
    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
752
8.49M
    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
753
8.49M
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
754
755
8.49M
    src_ptr += src_pitch;
756
8.49M
    dst_ptr += dst_pitch;
757
8.49M
    height -= 1;
758
8.49M
  } while (height > 0);
759
336k
}
760
761
332k
static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
762
332k
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
763
332k
  const __m128i p = _mm_set1_epi32(0x09080706);
764
332k
  f[0] = _mm_shuffle_epi8(h, p);
765
332k
}
766
767
332k
static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
768
332k
  sig[2] = _mm_loadu_si128((const __m128i *)src);
769
332k
}
770
771
static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
772
3.59M
                                          __m128i *sig) {
773
  // load the next row
774
3.59M
  const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
775
3.59M
  sig[0] = _mm_unpacklo_epi16(sig[2], u);
776
3.59M
  sig[1] = _mm_unpackhi_epi16(sig[2], u);
777
3.59M
  sig[2] = u;
778
3.59M
}
779
780
static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
781
3.59M
                                      __m128i *y0, __m128i *y1) {
782
3.59M
  const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
783
3.59M
  __m128i x0 = _mm_madd_epi16(sig[0], *f);
784
3.59M
  __m128i x1 = _mm_madd_epi16(sig[1], *f);
785
3.59M
  x0 = _mm_add_epi32(x0, rounding);
786
3.59M
  x1 = _mm_add_epi32(x1, rounding);
787
3.59M
  *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
788
3.59M
  *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
789
3.59M
}
790
791
static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
792
2.43M
                                           const __m128i *mask, uint16_t *dst) {
793
2.43M
  __m128i res = _mm_packus_epi32(*y0, *y1);
794
2.43M
  res = _mm_min_epi16(res, *mask);
795
2.43M
  _mm_storeu_si128((__m128i *)dst, res);
796
2.43M
}
797
798
static void vpx_highbd_filter_block1d8_v2_avx2(
799
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
800
222k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
801
222k
  __m128i signal[3], res0, res1;
802
222k
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
803
222k
  __m128i ff;
804
805
222k
  pack_8x1_2t_filter(filter, &ff);
806
222k
  pack_8x2_init(src_ptr, signal);
807
808
2.43M
  do {
809
2.43M
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
810
2.43M
    filter_8_2t_pixels(signal, &ff, &res0, &res1);
811
2.43M
    store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);
812
813
2.43M
    src_ptr += src_pitch;
814
2.43M
    dst_ptr += dst_pitch;
815
2.43M
    height -= 1;
816
2.43M
  } while (height > 0);
817
222k
}
818
819
// Calculation with averaging the input pixels
820
821
static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask,
822
0
                                        uint16_t *dst) {
823
0
  const __m128i a0 = _mm256_castsi256_si128(*y0);
824
0
  const __m128i a1 = _mm256_extractf128_si256(*y0, 1);
825
0
  __m128i res = _mm_packus_epi32(a0, a1);
826
0
  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
827
0
  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
828
0
  res = _mm_avg_epu16(res, pix);
829
0
  _mm_storeu_si128((__m128i *)dst, res);
830
0
}
831
832
static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1,
833
                                        const __m256i *mask, uint16_t *dst,
834
1.72M
                                        ptrdiff_t pitch) {
835
1.72M
  __m256i a = _mm256_packus_epi32(*y0, *y1);
836
1.72M
  const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst);
837
1.72M
  const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch));
838
1.72M
  const __m256i pix =
839
1.72M
      _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
840
1.72M
  a = _mm256_min_epi16(a, *mask);
841
1.72M
  a = _mm256_avg_epu16(a, pix);
842
1.72M
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
843
1.72M
  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
844
1.72M
}
845
846
static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1,
847
6.59M
                                         const __m256i *mask, uint16_t *dst) {
848
6.59M
  __m256i a = _mm256_packus_epi32(*y0, *y1);
849
6.59M
  const __m256i pix = _mm256_loadu_si256((const __m256i *)dst);
850
6.59M
  a = _mm256_min_epi16(a, *mask);
851
6.59M
  a = _mm256_avg_epu16(a, pix);
852
6.59M
  _mm256_storeu_si256((__m256i *)dst, a);
853
6.59M
}
854
855
static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1,
856
                                         const __m256i *mask, uint16_t *dst,
857
4.01M
                                         ptrdiff_t pitch) {
858
4.01M
  const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst);
859
4.01M
  const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch));
860
4.01M
  __m256i p = _mm256_min_epi16(*y0, *mask);
861
4.01M
  p = _mm256_avg_epu16(p, pix0);
862
4.01M
  _mm256_storeu_si256((__m256i *)dst, p);
863
864
4.01M
  p = _mm256_min_epi16(*y1, *mask);
865
4.01M
  p = _mm256_avg_epu16(p, pix1);
866
4.01M
  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
867
4.01M
}
868
869
static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0,
870
                                               const __m128i *y1,
871
                                               const __m128i *mask,
872
1.16M
                                               uint16_t *dst) {
873
1.16M
  __m128i res = _mm_packus_epi32(*y0, *y1);
874
1.16M
  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
875
1.16M
  res = _mm_min_epi16(res, *mask);
876
1.16M
  res = _mm_avg_epu16(res, pix);
877
1.16M
  _mm_storeu_si128((__m128i *)dst, res);
878
1.16M
}
879
880
static void vpx_highbd_filter_block1d8_h8_avg_avx2(
881
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
882
56.9k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
883
56.9k
  __m256i signal[8], res0, res1;
884
56.9k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
885
886
56.9k
  __m256i ff[4];
887
56.9k
  pack_filters(filter, ff);
888
889
56.9k
  src_ptr -= 3;
890
228k
  do {
891
228k
    pack_8x2_pixels(src_ptr, src_pitch, signal);
892
228k
    filter_8x1_pixels(signal, ff, &res0);
893
228k
    filter_8x1_pixels(&signal[4], ff, &res1);
894
228k
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
895
228k
    height -= 2;
896
228k
    src_ptr += src_pitch << 1;
897
228k
    dst_ptr += dst_pitch << 1;
898
228k
  } while (height > 1);
899
900
56.9k
  if (height > 0) {
901
0
    pack_8x1_pixels(src_ptr, signal);
902
0
    filter_8x1_pixels(signal, ff, &res0);
903
0
    store_8x1_avg_pixels(&res0, &max, dst_ptr);
904
0
  }
905
56.9k
}
906
907
static void vpx_highbd_filter_block1d16_h8_avg_avx2(
908
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
909
40.2k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
910
40.2k
  __m256i signal[8], res0, res1;
911
40.2k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
912
913
40.2k
  __m256i ff[4];
914
40.2k
  pack_filters(filter, ff);
915
916
40.2k
  src_ptr -= 3;
917
1.32M
  do {
918
1.32M
    pack_16x1_pixels(src_ptr, signal);
919
1.32M
    filter_8x1_pixels(signal, ff, &res0);
920
1.32M
    filter_8x1_pixels(&signal[4], ff, &res1);
921
1.32M
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
922
1.32M
    height -= 1;
923
1.32M
    src_ptr += src_pitch;
924
1.32M
    dst_ptr += dst_pitch;
925
1.32M
  } while (height > 0);
926
40.2k
}
927
928
static void vpx_highbd_filter_block1d4_h4_avx2(
929
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
930
0
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
931
  // We extract the middle four elements of the kernel into two registers in
932
  // the form
933
  // ... k[3] k[2] k[3] k[2]
934
  // ... k[5] k[4] k[5] k[4]
935
  // Then we shuffle the source into
936
  // ... s[1] s[0] s[0] s[-1]
937
  // ... s[3] s[2] s[2] s[1]
938
  // Calling multiply and add gives us half of the sum. Calling add on the two
939
  // halves gives us the output. Since avx2 allows us to use 256-bit buffer, we
940
  // can do this two rows at a time.
941
942
0
  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
943
0
  __m256i res_reg;
944
0
  __m256i idx_shift_0 =
945
0
      _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
946
0
                       3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
947
0
  __m256i idx_shift_2 =
948
0
      _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
949
0
                       5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
950
951
0
  __m128i kernel_reg_128;  // Kernel
952
0
  __m256i kernel_reg, kernel_reg_23,
953
0
      kernel_reg_45;  // Segments of the kernel used
954
0
  const __m256i reg_round =
955
0
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
956
0
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
957
0
  const ptrdiff_t unrolled_src_stride = src_stride << 1;
958
0
  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
959
0
  int h;
960
961
  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
962
0
  src_ptr -= 1;
963
964
  // Load Kernel
965
0
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
966
0
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
967
0
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
968
0
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
969
970
0
  for (h = height; h >= 2; h -= 2) {
971
    // Load the source
972
0
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
973
0
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
974
0
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
975
976
    // Get the output
977
0
    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
978
0
                                   &kernel_reg_23, &kernel_reg_45);
979
980
    // Round the result
981
0
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
982
983
    // Finally combine to get the final dst
984
0
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
985
0
    res_reg = _mm256_min_epi16(res_reg, reg_max);
986
0
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
987
0
                        &res_reg);
988
989
0
    src_ptr += unrolled_src_stride;
990
0
    dst_ptr += unrolled_dst_stride;
991
0
  }
992
993
  // Repeat for the last row if needed
994
0
  if (h > 0) {
995
    // Load the source
996
0
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
997
0
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
998
0
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
999
1000
    // Get the output
1001
0
    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
1002
0
                                   &kernel_reg_23, &kernel_reg_45);
1003
1004
    // Round the result
1005
0
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
1006
1007
    // Finally combine to get the final dst
1008
0
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
1009
0
    res_reg = _mm256_min_epi16(res_reg, reg_max);
1010
0
    _mm_storel_epi64((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg));
1011
0
  }
1012
0
}
1013
1014
static void vpx_highbd_filter_block1d8_h4_avx2(
1015
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
1016
0
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
1017
  // We will extract the middle four elements of the kernel into two registers
1018
  // in the form
1019
  // ... k[3] k[2] k[3] k[2]
1020
  // ... k[5] k[4] k[5] k[4]
1021
  // Then we shuffle the source into
1022
  // ... s[1] s[0] s[0] s[-1]
1023
  // ... s[3] s[2] s[2] s[1]
1024
  // Calling multiply and add gives us half of the sum of the first half.
1025
  // Calling add gives us the first half of the output. Repeat to get the
1026
  // whole output. Since AVX2 gives us 256-bit registers, we can do this two
1027
  // rows at a time.
1028
1029
0
  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
1030
0
  __m256i res_reg, res_first, res_last;
1031
0
  __m256i idx_shift_0 =
1032
0
      _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
1033
0
                       3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
1034
0
  __m256i idx_shift_2 =
1035
0
      _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
1036
0
                       5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
1037
1038
0
  __m128i kernel_reg_128;  // Kernel
1039
0
  __m256i kernel_reg, kernel_reg_23,
1040
0
      kernel_reg_45;  // Segments of the kernel used
1041
0
  const __m256i reg_round =
1042
0
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
1043
0
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
1044
0
  const ptrdiff_t unrolled_src_stride = src_stride << 1;
1045
0
  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
1046
0
  int h;
1047
1048
  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
1049
0
  src_ptr -= 1;
1050
1051
  // Load Kernel
1052
0
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
1053
0
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
1054
0
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
1055
0
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
1056
1057
0
  for (h = height; h >= 2; h -= 2) {
1058
    // Load the source
1059
0
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
1060
0
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
1061
0
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
1062
1063
    // Result for first half
1064
0
    res_first = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
1065
0
                                     &kernel_reg_23, &kernel_reg_45);
1066
1067
    // Do again to get the second half of dst
1068
    // Load the source
1069
0
    src_reg = mm256_loadu2_si128(src_ptr + 4, src_ptr + src_stride + 4);
1070
0
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
1071
0
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
1072
1073
    // Result for second half
1074
0
    res_last = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
1075
0
                                    &kernel_reg_23, &kernel_reg_45);
1076
1077
    // Round each result
1078
0
    res_first = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS);
1079
0
    res_last = mm256_round_epi32(&res_last, &reg_round, CONV8_ROUNDING_BITS);
1080
1081
    // Finally combine to get the final dst
1082
0
    res_reg = _mm256_packus_epi32(res_first, res_last);
1083
0
    res_reg = _mm256_min_epi16(res_reg, reg_max);
1084
0
    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
1085
0
                       &res_reg);
1086
1087
0
    src_ptr += unrolled_src_stride;
1088
0
    dst_ptr += unrolled_dst_stride;
1089
0
  }
1090
1091
  // Repeat for the last row if needed
1092
0
  if (h > 0) {
1093
0
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
1094
0
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
1095
0
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
1096
1097
0
    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
1098
0
                                   &kernel_reg_23, &kernel_reg_45);
1099
1100
0
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
1101
1102
0
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
1103
0
    res_reg = _mm256_min_epi16(res_reg, reg_max);
1104
1105
0
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg);
1106
0
  }
1107
0
}
1108
1109
static void vpx_highbd_filter_block1d16_h4_avx2(
1110
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
1111
0
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
1112
0
  vpx_highbd_filter_block1d8_h4_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
1113
0
                                     height, kernel, bd);
1114
0
  vpx_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_stride, dst_ptr + 8,
1115
0
                                     dst_stride, height, kernel, bd);
1116
0
}
1117
1118
static void vpx_highbd_filter_block1d8_v8_avg_avx2(
1119
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1120
388k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1121
388k
  __m256i signal[9], res0, res1;
1122
388k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
1123
1124
388k
  __m256i ff[4];
1125
388k
  pack_filters(filter, ff);
1126
1127
388k
  pack_8x9_init(src_ptr, src_pitch, signal);
1128
1129
1.42M
  do {
1130
1.42M
    pack_8x9_pixels(src_ptr, src_pitch, signal);
1131
1132
1.42M
    filter_8x9_pixels(signal, ff, &res0, &res1);
1133
1.42M
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
1134
1.42M
    update_pixels(signal);
1135
1136
1.42M
    src_ptr += src_pitch << 1;
1137
1.42M
    dst_ptr += dst_pitch << 1;
1138
1.42M
    height -= 2;
1139
1.42M
  } while (height > 0);
1140
388k
}
1141
1142
static void vpx_highbd_filter_block1d16_v8_avg_avx2(
1143
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1144
251k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1145
251k
  __m256i signal[17], res0, res1;
1146
251k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
1147
1148
251k
  __m256i ff[4];
1149
251k
  pack_filters(filter, ff);
1150
1151
251k
  pack_16x9_init(src_ptr, src_pitch, signal);
1152
1153
4.01M
  do {
1154
4.01M
    pack_16x9_pixels(src_ptr, src_pitch, signal);
1155
4.01M
    filter_16x9_pixels(signal, ff, &res0, &res1);
1156
4.01M
    store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
1157
4.01M
    update_16x9_pixels(signal);
1158
1159
4.01M
    src_ptr += src_pitch << 1;
1160
4.01M
    dst_ptr += dst_pitch << 1;
1161
4.01M
    height -= 2;
1162
4.01M
  } while (height > 0);
1163
251k
}
1164
1165
static void vpx_highbd_filter_block1d8_h2_avg_avx2(
1166
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1167
14.9k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1168
14.9k
  __m256i signal[2], res0, res1;
1169
14.9k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
1170
1171
14.9k
  __m256i ff;
1172
14.9k
  pack_2t_filter(filter, &ff);
1173
1174
14.9k
  src_ptr -= 3;
1175
72.6k
  do {
1176
72.6k
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
1177
72.6k
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
1178
72.6k
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
1179
72.6k
    height -= 2;
1180
72.6k
    src_ptr += src_pitch << 1;
1181
72.6k
    dst_ptr += dst_pitch << 1;
1182
72.6k
  } while (height > 1);
1183
1184
14.9k
  if (height > 0) {
1185
0
    pack_8x1_2t_pixels(src_ptr, signal);
1186
0
    filter_8x1_2t_pixels(signal, &ff, &res0);
1187
0
    store_8x1_avg_pixels(&res0, &max, dst_ptr);
1188
0
  }
1189
14.9k
}
1190
1191
static void vpx_highbd_filter_block1d16_h2_avg_avx2(
1192
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1193
31.5k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1194
31.5k
  __m256i signal[2], res0, res1;
1195
31.5k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
1196
1197
31.5k
  __m256i ff;
1198
31.5k
  pack_2t_filter(filter, &ff);
1199
1200
31.5k
  src_ptr -= 3;
1201
814k
  do {
1202
814k
    pack_16x1_2t_pixels(src_ptr, signal);
1203
814k
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
1204
814k
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
1205
814k
    height -= 1;
1206
814k
    src_ptr += src_pitch;
1207
814k
    dst_ptr += dst_pitch;
1208
814k
  } while (height > 0);
1209
31.5k
}
1210
1211
static void vpx_highbd_filter_block1d16_v2_avg_avx2(
1212
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1213
176k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1214
176k
  __m256i signal[3], res0, res1;
1215
176k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
1216
176k
  __m256i ff;
1217
1218
176k
  pack_2t_filter(filter, &ff);
1219
176k
  pack_16x2_init(src_ptr, signal);
1220
1221
4.44M
  do {
1222
4.44M
    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
1223
4.44M
    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
1224
4.44M
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
1225
1226
4.44M
    src_ptr += src_pitch;
1227
4.44M
    dst_ptr += dst_pitch;
1228
4.44M
    height -= 1;
1229
4.44M
  } while (height > 0);
1230
176k
}
1231
1232
static void vpx_highbd_filter_block1d8_v2_avg_avx2(
1233
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1234
109k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1235
109k
  __m128i signal[3], res0, res1;
1236
109k
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
1237
109k
  __m128i ff;
1238
1239
109k
  pack_8x1_2t_filter(filter, &ff);
1240
109k
  pack_8x2_init(src_ptr, signal);
1241
1242
1.16M
  do {
1243
1.16M
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
1244
1.16M
    filter_8_2t_pixels(signal, &ff, &res0, &res1);
1245
1.16M
    store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr);
1246
1247
1.16M
    src_ptr += src_pitch;
1248
1.16M
    dst_ptr += dst_pitch;
1249
1.16M
    height -= 1;
1250
1.16M
  } while (height > 0);
1251
109k
}
1252
1253
static void vpx_highbd_filter_block1d4_v4_avx2(
1254
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
1255
0
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
1256
  // We will load two rows of pixels and rearrange them into the form
1257
  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
1258
  // so that multiply-and-add with the kernel yields a partial output. Adding
1259
  // the partial output computed from the following rows gives the output.
1260
1261
  // Register for source s[-1:3, :]
1262
0
  __m256i src_reg_1, src_reg_2, src_reg_3;
1263
  // Interleaved rows of the source. lo is first half, hi second
1264
0
  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
1265
0
  __m256i src_reg_m1001, src_reg_1223;
1266
1267
  // Result after multiply and add
1268
0
  __m256i res_reg;
1269
1270
0
  __m128i kernel_reg_128;                            // Kernel
1271
0
  __m256i kernel_reg, kernel_reg_23, kernel_reg_45;  // Segments of kernel used
1272
1273
0
  const __m256i reg_round =
1274
0
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
1275
0
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
1276
0
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
1277
0
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
1278
0
  int h;
1279
1280
  // Load Kernel
1281
0
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
1282
0
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
1283
0
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
1284
0
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
1285
1286
  // Row -1 to row 0
1287
0
  src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr,
1288
0
                                   (const __m128i *)(src_ptr + src_stride));
1289
1290
  // Row 0 to row 1
1291
0
  src_reg_1 = _mm256_castsi128_si256(
1292
0
      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
1293
0
  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
1294
1295
  // First three rows
1296
0
  src_reg_m1001 = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
1297
1298
0
  for (h = height; h > 1; h -= 2) {
1299
0
    src_reg_2 = _mm256_castsi128_si256(
1300
0
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));
1301
1302
0
    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
1303
0
                                         _mm256_castsi256_si128(src_reg_2), 1);
1304
1305
0
    src_reg_3 = _mm256_castsi128_si256(
1306
0
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));
1307
1308
0
    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
1309
0
                                         _mm256_castsi256_si128(src_reg_3), 1);
1310
1311
    // Last three rows
1312
0
    src_reg_1223 = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
1313
1314
    // Output
1315
0
    res_reg = mm256_madd_add_epi32(&src_reg_m1001, &src_reg_1223,
1316
0
                                   &kernel_reg_23, &kernel_reg_45);
1317
1318
    // Round the words
1319
0
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
1320
1321
    // Combine to get the result
1322
0
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
1323
0
    res_reg = _mm256_min_epi16(res_reg, reg_max);
1324
1325
    // Save the result
1326
0
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
1327
0
                        &res_reg);
1328
1329
    // Update the source by two rows
1330
0
    src_ptr += src_stride_unrolled;
1331
0
    dst_ptr += dst_stride_unrolled;
1332
1333
0
    src_reg_m1001 = src_reg_1223;
1334
0
    src_reg_1 = src_reg_3;
1335
0
  }
1336
0
}
1337
1338
static void vpx_highbd_filter_block1d8_v4_avx2(
1339
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
1340
0
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
1341
  // We will load two rows of pixels and rearrange them into the form
1342
  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
1343
  // so that multiply-and-add with the kernel yields a partial output. Adding
1344
  // the partial output computed from the following rows gives the output.
1345
1346
  // Register for source s[-1:3, :]
1347
0
  __m256i src_reg_1, src_reg_2, src_reg_3;
1348
  // Interleaved rows of the source. lo is first half, hi second
1349
0
  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
1350
0
  __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi;
1351
1352
0
  __m128i kernel_reg_128;                            // Kernel
1353
0
  __m256i kernel_reg, kernel_reg_23, kernel_reg_45;  // Segments of kernel
1354
1355
  // Result after multiply and add
1356
0
  __m256i res_reg, res_reg_lo, res_reg_hi;
1357
1358
0
  const __m256i reg_round =
1359
0
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
1360
0
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
1361
0
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
1362
0
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
1363
0
  int h;
1364
1365
  // Load Kernel
1366
0
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
1367
0
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
1368
0
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
1369
0
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
1370
1371
  // Row -1 to row 0
1372
0
  src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
1373
0
                                   (const __m128i *)(src_ptr + src_stride));
1374
1375
  // Row 0 to row 1
1376
0
  src_reg_1 = _mm256_castsi128_si256(
1377
0
      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
1378
0
  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
1379
1380
  // First three rows
1381
0
  src_reg_m1001_lo = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
1382
0
  src_reg_m1001_hi = _mm256_unpackhi_epi16(src_reg_m10, src_reg_01);
1383
1384
0
  for (h = height; h > 1; h -= 2) {
1385
0
    src_reg_2 = _mm256_castsi128_si256(
1386
0
        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)));
1387
1388
0
    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
1389
0
                                         _mm256_castsi256_si128(src_reg_2), 1);
1390
1391
0
    src_reg_3 = _mm256_castsi128_si256(
1392
0
        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)));
1393
1394
0
    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
1395
0
                                         _mm256_castsi256_si128(src_reg_3), 1);
1396
1397
    // Last three rows
1398
0
    src_reg_1223_lo = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
1399
0
    src_reg_1223_hi = _mm256_unpackhi_epi16(src_reg_12, src_reg_23);
1400
1401
    // Output from first half
1402
0
    res_reg_lo = mm256_madd_add_epi32(&src_reg_m1001_lo, &src_reg_1223_lo,
1403
0
                                      &kernel_reg_23, &kernel_reg_45);
1404
1405
    // Output from second half
1406
0
    res_reg_hi = mm256_madd_add_epi32(&src_reg_m1001_hi, &src_reg_1223_hi,
1407
0
                                      &kernel_reg_23, &kernel_reg_45);
1408
1409
    // Round the words
1410
0
    res_reg_lo =
1411
0
        mm256_round_epi32(&res_reg_lo, &reg_round, CONV8_ROUNDING_BITS);
1412
0
    res_reg_hi =
1413
0
        mm256_round_epi32(&res_reg_hi, &reg_round, CONV8_ROUNDING_BITS);
1414
1415
    // Combine to get the result
1416
0
    res_reg = _mm256_packus_epi32(res_reg_lo, res_reg_hi);
1417
0
    res_reg = _mm256_min_epi16(res_reg, reg_max);
1418
1419
    // Save the result
1420
0
    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
1421
0
                       &res_reg);
1422
1423
    // Update the source by two rows
1424
0
    src_ptr += src_stride_unrolled;
1425
0
    dst_ptr += dst_stride_unrolled;
1426
1427
0
    src_reg_m1001_lo = src_reg_1223_lo;
1428
0
    src_reg_m1001_hi = src_reg_1223_hi;
1429
0
    src_reg_1 = src_reg_3;
1430
0
  }
1431
0
}
1432
1433
static void vpx_highbd_filter_block1d16_v4_avx2(
1434
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
1435
0
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
1436
0
  vpx_highbd_filter_block1d8_v4_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
1437
0
                                     height, kernel, bd);
1438
0
  vpx_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_stride, dst_ptr + 8,
1439
0
                                     dst_stride, height, kernel, bd);
1440
0
}
1441
1442
// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
// (Declarations only; no bodies for these routines appear at this point.)
1443
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
1444
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;
1445
1446
// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
// (Declarations only; no bodies for these routines appear at this point.)
1447
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
1448
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
1449
// The block1d4 h8/h2/v8/v2 names with the _avx2 suffix are object-like macro
// aliases for the SSE2 routines declared above, so any later use of an _avx2
// name in this group expands to the corresponding _sse2 symbol.
1450
1.37M
#define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2
1451
491k
#define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2
1452
1.08M
#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2
1453
378k
#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2
1454
1455
// Use the [vh]8 averaging version because there is no [vh]4 averaging
// implementation: each name of the form ..._[vh]4_avg_avx2 below (block widths
// 16, 8 and 4) is a macro that expands to the matching ..._[vh]8_avg_avx2 name.
1456
#define vpx_highbd_filter_block1d16_v4_avg_avx2 \
1457
0
  vpx_highbd_filter_block1d16_v8_avg_avx2
1458
#define vpx_highbd_filter_block1d16_h4_avg_avx2 \
1459
0
  vpx_highbd_filter_block1d16_h8_avg_avx2
1460
#define vpx_highbd_filter_block1d8_v4_avg_avx2 \
1461
0
  vpx_highbd_filter_block1d8_v8_avg_avx2
1462
#define vpx_highbd_filter_block1d8_h4_avg_avx2 \
1463
0
  vpx_highbd_filter_block1d8_h8_avg_avx2
1464
#define vpx_highbd_filter_block1d4_v4_avg_avx2 \
1465
0
  vpx_highbd_filter_block1d4_v8_avg_avx2
1466
#define vpx_highbd_filter_block1d4_h4_avg_avx2 \
1467
0
  vpx_highbd_filter_block1d4_h8_avg_avx2
1468
1469
// Non-averaging instantiations: the prefix argument is left empty and the
// final argument is 0.
// NOTE(review): HIGH_FUN_CONV_1D / HIGH_FUN_CONV_2D are not defined in this
// part of the file (presumably they come from vpx_dsp/x86/convolve.h); confirm
// there which functions these expand to and what the final argument selects.
// The vertical instantiation passes a source expression that steps back
// (num_taps / 2 - 1) rows; num_taps is not defined at this call site, so it is
// presumably supplied inside the macro expansion -- TODO confirm.
HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0)
1470
HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
1471
                 src - src_stride * (num_taps / 2 - 1), , avx2, 0)
1472
2.33M
HIGH_FUN_CONV_2D(, avx2, 0)
1473
1474
// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
// (Declarations only; no bodies for these routines appear at this point.)
1475
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
1476
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
1477
1478
// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
// (Declarations only; no bodies for these routines appear at this point.)
1479
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
1480
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
1481
// The block1d4 h8/h2/v8/v2 averaging names with the _avx2 suffix are macro
// aliases for the SSE2 averaging routines declared above.
1482
#define vpx_highbd_filter_block1d4_h8_avg_avx2 \
1483
75.2k
  vpx_highbd_filter_block1d4_h8_avg_sse2
1484
#define vpx_highbd_filter_block1d4_h2_avg_avx2 \
1485
22.6k
  vpx_highbd_filter_block1d4_h2_avg_sse2
1486
#define vpx_highbd_filter_block1d4_v8_avg_avx2 \
1487
510k
  vpx_highbd_filter_block1d4_v8_avg_sse2
1488
#define vpx_highbd_filter_block1d4_v2_avg_avx2 \
1489
162k
  vpx_highbd_filter_block1d4_v2_avg_sse2
1490
1491
// Averaging instantiations: the prefix argument is avg_ and the final argument
// is 1.
// NOTE(review): HIGH_FUN_CONV_1D / HIGH_FUN_CONV_2D are not defined in this
// part of the file (presumably they come from vpx_dsp/x86/convolve.h); confirm
// there which functions these expand to and what the final argument selects.
// The vertical instantiation passes a source expression that steps back
// (num_taps / 2 - 1) rows; num_taps is not defined at this call site, so it is
// presumably supplied inside the macro expansion -- TODO confirm.
HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1)
1492
HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
1493
                 src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1)
1494
HIGH_FUN_CONV_2D(avg_, avx2, 1)
1495
1496
#undef HIGHBD_FUNC
1497
#endif  // HAVE_X86_ASM