Coverage Report

Created: 2026-05-16 07:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
Line
Count
Source
1
/*
2
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5
 *  that can be found in the LICENSE file in the root of the source
6
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS.  All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
9
 */
10
11
#include <immintrin.h>
12
#include "./vpx_dsp_rtcd.h"
13
#include "vpx_dsp/x86/convolve.h"
14
#include "vpx_dsp/x86/convolve_avx2.h"
15
16
// -----------------------------------------------------------------------------
17
// Copy and average
18
19
void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
20
                                   uint16_t *dst, ptrdiff_t dst_stride,
21
                                   const InterpKernel *filter, int x0_q4,
22
                                   int x_step_q4, int y0_q4, int y_step_q4,
23
1.81M
                                   int w, int h, int bd) {
24
1.81M
  (void)filter;
25
1.81M
  (void)x0_q4;
26
1.81M
  (void)x_step_q4;
27
1.81M
  (void)y0_q4;
28
1.81M
  (void)y_step_q4;
29
1.81M
  (void)bd;
30
31
1.81M
  assert(w % 4 == 0);
32
1.81M
  if (w > 32) {  // w = 64
33
7.91M
    do {
34
7.91M
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
35
7.91M
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
36
7.91M
      const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
37
7.91M
      const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
38
7.91M
      src += src_stride;
39
7.91M
      _mm256_storeu_si256((__m256i *)dst, p0);
40
7.91M
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
41
7.91M
      _mm256_storeu_si256((__m256i *)(dst + 32), p2);
42
7.91M
      _mm256_storeu_si256((__m256i *)(dst + 48), p3);
43
7.91M
      dst += dst_stride;
44
7.91M
      h--;
45
7.91M
    } while (h > 0);
46
1.64M
  } else if (w > 16) {  // w = 32
47
10.1M
    do {
48
10.1M
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
49
10.1M
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
50
10.1M
      src += src_stride;
51
10.1M
      _mm256_storeu_si256((__m256i *)dst, p0);
52
10.1M
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
53
10.1M
      dst += dst_stride;
54
10.1M
      h--;
55
10.1M
    } while (h > 0);
56
1.25M
  } else if (w > 8) {  // w = 16
57
251k
    __m256i p0, p1;
58
2.12M
    do {
59
2.12M
      p0 = _mm256_loadu_si256((const __m256i *)src);
60
2.12M
      src += src_stride;
61
2.12M
      p1 = _mm256_loadu_si256((const __m256i *)src);
62
2.12M
      src += src_stride;
63
64
2.12M
      _mm256_storeu_si256((__m256i *)dst, p0);
65
2.12M
      dst += dst_stride;
66
2.12M
      _mm256_storeu_si256((__m256i *)dst, p1);
67
2.12M
      dst += dst_stride;
68
2.12M
      h -= 2;
69
2.12M
    } while (h > 0);
70
1.00M
  } else if (w > 4) {  // w = 8
71
491k
    __m128i p0, p1;
72
2.03M
    do {
73
2.03M
      p0 = _mm_loadu_si128((const __m128i *)src);
74
2.03M
      src += src_stride;
75
2.03M
      p1 = _mm_loadu_si128((const __m128i *)src);
76
2.03M
      src += src_stride;
77
78
2.03M
      _mm_storeu_si128((__m128i *)dst, p0);
79
2.03M
      dst += dst_stride;
80
2.03M
      _mm_storeu_si128((__m128i *)dst, p1);
81
2.03M
      dst += dst_stride;
82
2.03M
      h -= 2;
83
2.03M
    } while (h > 0);
84
509k
  } else {  // w = 4
85
509k
    __m128i p0, p1;
86
1.09M
    do {
87
1.09M
      p0 = _mm_loadl_epi64((const __m128i *)src);
88
1.09M
      src += src_stride;
89
1.09M
      p1 = _mm_loadl_epi64((const __m128i *)src);
90
1.09M
      src += src_stride;
91
92
1.09M
      _mm_storel_epi64((__m128i *)dst, p0);
93
1.09M
      dst += dst_stride;
94
1.09M
      _mm_storel_epi64((__m128i *)dst, p1);
95
1.09M
      dst += dst_stride;
96
1.09M
      h -= 2;
97
1.09M
    } while (h > 0);
98
509k
  }
99
1.81M
}
100
101
void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
102
                                  uint16_t *dst, ptrdiff_t dst_stride,
103
                                  const InterpKernel *filter, int x0_q4,
104
                                  int x_step_q4, int y0_q4, int y_step_q4,
105
497k
                                  int w, int h, int bd) {
106
497k
  (void)filter;
107
497k
  (void)x0_q4;
108
497k
  (void)x_step_q4;
109
497k
  (void)y0_q4;
110
497k
  (void)y_step_q4;
111
497k
  (void)bd;
112
113
497k
  assert(w % 4 == 0);
114
497k
  if (w > 32) {  // w = 64
115
8.02k
    __m256i p0, p1, p2, p3, u0, u1, u2, u3;
116
495k
    do {
117
495k
      p0 = _mm256_loadu_si256((const __m256i *)src);
118
495k
      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
119
495k
      p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
120
495k
      p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
121
495k
      src += src_stride;
122
495k
      u0 = _mm256_loadu_si256((const __m256i *)dst);
123
495k
      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
124
495k
      u2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
125
495k
      u3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
126
495k
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
127
495k
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
128
495k
      _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2));
129
495k
      _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3));
130
495k
      dst += dst_stride;
131
495k
      h--;
132
495k
    } while (h > 0);
133
489k
  } else if (w > 16) {  // w = 32
134
43.4k
    __m256i p0, p1, u0, u1;
135
1.46M
    do {
136
1.46M
      p0 = _mm256_loadu_si256((const __m256i *)src);
137
1.46M
      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
138
1.46M
      src += src_stride;
139
1.46M
      u0 = _mm256_loadu_si256((const __m256i *)dst);
140
1.46M
      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
141
1.46M
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
142
1.46M
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
143
1.46M
      dst += dst_stride;
144
1.46M
      h--;
145
1.46M
    } while (h > 0);
146
445k
  } else if (w > 8) {  // w = 16
147
91.3k
    __m256i p0, p1, u0, u1;
148
800k
    do {
149
800k
      p0 = _mm256_loadu_si256((const __m256i *)src);
150
800k
      p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride));
151
800k
      src += src_stride << 1;
152
800k
      u0 = _mm256_loadu_si256((const __m256i *)dst);
153
800k
      u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride));
154
155
800k
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
156
800k
      _mm256_storeu_si256((__m256i *)(dst + dst_stride),
157
800k
                          _mm256_avg_epu16(p1, u1));
158
800k
      dst += dst_stride << 1;
159
800k
      h -= 2;
160
800k
    } while (h > 0);
161
354k
  } else if (w > 4) {  // w = 8
162
169k
    __m128i p0, p1, u0, u1;
163
690k
    do {
164
690k
      p0 = _mm_loadu_si128((const __m128i *)src);
165
690k
      p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
166
690k
      src += src_stride << 1;
167
690k
      u0 = _mm_loadu_si128((const __m128i *)dst);
168
690k
      u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));
169
170
690k
      _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
171
690k
      _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
172
690k
      dst += dst_stride << 1;
173
690k
      h -= 2;
174
690k
    } while (h > 0);
175
184k
  } else {  // w = 4
176
184k
    __m128i p0, p1, u0, u1;
177
407k
    do {
178
407k
      p0 = _mm_loadl_epi64((const __m128i *)src);
179
407k
      p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
180
407k
      src += src_stride << 1;
181
407k
      u0 = _mm_loadl_epi64((const __m128i *)dst);
182
407k
      u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));
183
184
407k
      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0));
185
407k
      _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1));
186
407k
      dst += dst_stride << 1;
187
407k
      h -= 2;
188
407k
    } while (h > 0);
189
184k
  }
190
497k
}
191
192
#if HAVE_X86_ASM
193
// -----------------------------------------------------------------------------
194
// Horizontal and vertical filtering
195
196
static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
197
                                              7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
198
                                              4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
199
200
static const uint8_t signal_pattern_1[32] = { 4, 5, 6,  7,  6,  7,  8,  9,
201
                                              8, 9, 10, 11, 10, 11, 12, 13,
202
                                              4, 5, 6,  7,  6,  7,  8,  9,
203
                                              8, 9, 10, 11, 10, 11, 12, 13 };
204
205
static const uint8_t signal_pattern_2[32] = { 6,  7,  8,  9,  8,  9,  10, 11,
206
                                              10, 11, 12, 13, 12, 13, 14, 15,
207
                                              6,  7,  8,  9,  8,  9,  10, 11,
208
                                              10, 11, 12, 13, 12, 13, 14, 15 };
209
210
static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };
211
212
375M
#define CONV8_ROUNDING_BITS (7)
213
0
#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1))
214
215
// -----------------------------------------------------------------------------
216
// Horizontal Filtering
217
218
73.7M
static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
219
73.7M
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
220
73.7M
  const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
221
73.7M
  const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
222
73.7M
  const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);
223
224
73.7M
  p[0] = _mm256_shuffle_epi8(*s, sf0);  // x0x6
225
73.7M
  p[1] = _mm256_shuffle_epi8(*s, sf1);  // x1x7
226
73.7M
  p[2] = _mm256_shuffle_epi8(c, sf0);   // x2x4
227
73.7M
  p[3] = _mm256_shuffle_epi8(c, sf1);   // x3x5
228
73.7M
}
229
230
// Note:
231
//  Shared by 8x2 and 16x1 block
232
static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
233
36.4M
                                  __m256i *x /*x[8]*/) {
234
36.4M
  __m256i pp[8];
235
36.4M
  pack_pixels(s0, pp);
236
36.4M
  pack_pixels(s1, &pp[4]);
237
36.4M
  x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
238
36.4M
  x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
239
36.4M
  x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
240
36.4M
  x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
241
36.4M
  x[4] = x[2];
242
36.4M
  x[5] = x[3];
243
36.4M
  x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
244
36.4M
  x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
245
36.4M
}
246
247
859k
static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
248
859k
  __m256i pp[8];
249
859k
  __m256i s0;
250
859k
  s0 = _mm256_loadu_si256((const __m256i *)src);
251
859k
  pack_pixels(&s0, pp);
252
859k
  x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
253
859k
  x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
254
859k
  x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
255
859k
  x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
256
859k
}
257
258
static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
259
6.74M
                                   __m256i *x) {
260
6.74M
  __m256i s0, s1;
261
6.74M
  s0 = _mm256_loadu_si256((const __m256i *)src);
262
6.74M
  s1 = _mm256_loadu_si256((const __m256i *)(src + stride));
263
6.74M
  pack_16_pixels(&s0, &s1, x);
264
6.74M
}
265
266
29.6M
static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
267
29.6M
  __m256i s0, s1;
268
29.6M
  s0 = _mm256_loadu_si256((const __m256i *)src);
269
29.6M
  s1 = _mm256_loadu_si256((const __m256i *)(src + 8));
270
29.6M
  pack_16_pixels(&s0, &s1, x);
271
29.6M
}
272
273
// Note:
274
//  Shared by horizontal and vertical filtering
275
4.06M
static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
276
4.06M
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
277
4.06M
  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
278
4.06M
  const __m256i p0 = _mm256_set1_epi32(0x03020100);
279
4.06M
  const __m256i p1 = _mm256_set1_epi32(0x07060504);
280
4.06M
  const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
281
4.06M
  const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
282
4.06M
  f[0] = _mm256_shuffle_epi8(hh, p0);
283
4.06M
  f[1] = _mm256_shuffle_epi8(hh, p1);
284
4.06M
  f[2] = _mm256_shuffle_epi8(hh, p2);
285
4.06M
  f[3] = _mm256_shuffle_epi8(hh, p3);
286
4.06M
}
287
288
static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
289
                                     const __m256i *fil /*fil[4]*/,
290
141M
                                     __m256i *y) {
291
141M
  __m256i a, a0, a1;
292
293
141M
  a0 = _mm256_madd_epi16(fil[0], sig[0]);
294
141M
  a1 = _mm256_madd_epi16(fil[3], sig[3]);
295
141M
  a = _mm256_add_epi32(a0, a1);
296
297
141M
  a0 = _mm256_madd_epi16(fil[1], sig[1]);
298
141M
  a1 = _mm256_madd_epi16(fil[2], sig[2]);
299
300
141M
  {
301
141M
    const __m256i min = _mm256_min_epi32(a0, a1);
302
141M
    a = _mm256_add_epi32(a, min);
303
141M
  }
304
141M
  {
305
141M
    const __m256i max = _mm256_max_epi32(a0, a1);
306
141M
    a = _mm256_add_epi32(a, max);
307
141M
  }
308
141M
  {
309
141M
    const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
310
141M
    a = _mm256_add_epi32(a, rounding);
311
141M
    *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
312
141M
  }
313
141M
}
314
315
static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,
316
1.12M
                                    uint16_t *dst) {
317
1.12M
  const __m128i a0 = _mm256_castsi256_si128(*y);
318
1.12M
  const __m128i a1 = _mm256_extractf128_si256(*y, 1);
319
1.12M
  __m128i res = _mm_packus_epi32(a0, a1);
320
1.12M
  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
321
1.12M
  _mm_storeu_si128((__m128i *)dst, res);
322
1.12M
}
323
324
static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
325
                                    const __m256i *mask, uint16_t *dst,
326
11.4M
                                    ptrdiff_t pitch) {
327
11.4M
  __m256i a = _mm256_packus_epi32(*y0, *y1);
328
11.4M
  a = _mm256_min_epi16(a, *mask);
329
11.4M
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
330
11.4M
  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
331
11.4M
}
332
333
static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
334
48.1M
                                     const __m256i *mask, uint16_t *dst) {
335
48.1M
  __m256i a = _mm256_packus_epi32(*y0, *y1);
336
48.1M
  a = _mm256_min_epi16(a, *mask);
337
48.1M
  _mm256_storeu_si256((__m256i *)dst, a);
338
48.1M
}
339
340
static void vpx_highbd_filter_block1d8_h8_avx2(
341
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
342
1.01M
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
343
1.01M
  __m256i signal[8], res0, res1;
344
1.01M
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
345
346
1.01M
  __m256i ff[4];
347
1.01M
  pack_filters(filter, ff);
348
349
1.01M
  src_ptr -= 3;
350
6.50M
  do {
351
6.50M
    pack_8x2_pixels(src_ptr, src_pitch, signal);
352
6.50M
    filter_8x1_pixels(signal, ff, &res0);
353
6.50M
    filter_8x1_pixels(&signal[4], ff, &res1);
354
6.50M
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
355
6.50M
    height -= 2;
356
6.50M
    src_ptr += src_pitch << 1;
357
6.50M
    dst_ptr += dst_pitch << 1;
358
6.50M
  } while (height > 1);
359
360
1.01M
  if (height > 0) {
361
859k
    pack_8x1_pixels(src_ptr, signal);
362
859k
    filter_8x1_pixels(signal, ff, &res0);
363
859k
    store_8x1_pixels(&res0, &max, dst_ptr);
364
859k
  }
365
1.01M
}
366
367
static void vpx_highbd_filter_block1d16_h8_avx2(
368
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
369
762k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
370
762k
  __m256i signal[8], res0, res1;
371
762k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
372
373
762k
  __m256i ff[4];
374
762k
  pack_filters(filter, ff);
375
376
762k
  src_ptr -= 3;
377
28.0M
  do {
378
28.0M
    pack_16x1_pixels(src_ptr, signal);
379
28.0M
    filter_8x1_pixels(signal, ff, &res0);
380
28.0M
    filter_8x1_pixels(&signal[4], ff, &res1);
381
28.0M
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
382
28.0M
    height -= 1;
383
28.0M
    src_ptr += src_pitch;
384
28.0M
    dst_ptr += dst_pitch;
385
28.0M
  } while (height > 0);
386
762k
}
387
388
// -----------------------------------------------------------------------------
389
// 2-tap horizontal filtering
390
391
1.29M
static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
392
1.29M
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
393
1.29M
  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
394
1.29M
  const __m256i p = _mm256_set1_epi32(0x09080706);
395
1.29M
  f[0] = _mm256_shuffle_epi8(hh, p);
396
1.29M
}
397
398
// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels()
399
// the difference is s0/s1 specifies first and second rows or,
400
// first 16 samples and 8-sample shifted 16 samples
401
static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
402
14.1M
                                     __m256i *sig) {
403
14.1M
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
404
14.1M
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
405
14.1M
  __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
406
14.1M
  __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
407
14.1M
  __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
408
14.1M
  __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
409
14.1M
  r0 = _mm256_shuffle_epi8(r0, sf2);
410
14.1M
  r1 = _mm256_shuffle_epi8(r1, sf2);
411
14.1M
  sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
412
14.1M
  sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
413
14.1M
}
414
415
static INLINE void pack_8x2_2t_pixels(const uint16_t *src,
416
1.67M
                                      const ptrdiff_t pitch, __m256i *sig) {
417
1.67M
  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
418
1.67M
  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
419
1.67M
  pack_16_2t_pixels(&r0, &r1, sig);
420
1.67M
}
421
422
static INLINE void pack_16x1_2t_pixels(const uint16_t *src,
423
12.4M
                                       __m256i *sig /*sig[2]*/) {
424
12.4M
  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
425
12.4M
  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
426
12.4M
  pack_16_2t_pixels(&r0, &r1, sig);
427
12.4M
}
428
429
static INLINE void pack_8x1_2t_pixels(const uint16_t *src,
430
261k
                                      __m256i *sig /*sig[2]*/) {
431
261k
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
432
261k
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
433
261k
  __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
434
261k
  __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
435
261k
  r0 = _mm256_permutevar8x32_epi32(r0, idx);
436
261k
  r0 = _mm256_shuffle_epi8(r0, sf2);
437
261k
  sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
438
261k
}
439
440
// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels()
441
static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
442
26.9M
                                       __m256i *y0, __m256i *y1) {
443
26.9M
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
444
26.9M
  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
445
26.9M
  __m256i x1 = _mm256_madd_epi16(sig[1], *f);
446
26.9M
  x0 = _mm256_add_epi32(x0, rounding);
447
26.9M
  x1 = _mm256_add_epi32(x1, rounding);
448
26.9M
  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
449
26.9M
  *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
450
26.9M
}
451
452
static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
453
261k
                                        __m256i *y0) {
454
261k
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
455
261k
  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
456
261k
  x0 = _mm256_add_epi32(x0, rounding);
457
261k
  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
458
261k
}
459
460
static void vpx_highbd_filter_block1d8_h2_avx2(
461
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
462
295k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
463
295k
  __m256i signal[2], res0, res1;
464
295k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
465
466
295k
  __m256i ff;
467
295k
  pack_2t_filter(filter, &ff);
468
469
295k
  src_ptr -= 3;
470
1.60M
  do {
471
1.60M
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
472
1.60M
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
473
1.60M
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
474
1.60M
    height -= 2;
475
1.60M
    src_ptr += src_pitch << 1;
476
1.60M
    dst_ptr += dst_pitch << 1;
477
1.60M
  } while (height > 1);
478
479
295k
  if (height > 0) {
480
261k
    pack_8x1_2t_pixels(src_ptr, signal);
481
261k
    filter_8x1_2t_pixels(signal, &ff, &res0);
482
261k
    store_8x1_pixels(&res0, &max, dst_ptr);
483
261k
  }
484
295k
}
485
486
static void vpx_highbd_filter_block1d16_h2_avx2(
487
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
488
445k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
489
445k
  __m256i signal[2], res0, res1;
490
445k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
491
492
445k
  __m256i ff;
493
445k
  pack_2t_filter(filter, &ff);
494
495
445k
  src_ptr -= 3;
496
11.6M
  do {
497
11.6M
    pack_16x1_2t_pixels(src_ptr, signal);
498
11.6M
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
499
11.6M
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
500
11.6M
    height -= 1;
501
11.6M
    src_ptr += src_pitch;
502
11.6M
    dst_ptr += dst_pitch;
503
11.6M
  } while (height > 0);
504
445k
}
505
506
// -----------------------------------------------------------------------------
507
// Vertical Filtering
508
509
1.25M
static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
510
1.25M
  __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
511
1.25M
  __m256i s1 =
512
1.25M
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
513
1.25M
  __m256i s2 = _mm256_castsi128_si256(
514
1.25M
      _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
515
1.25M
  __m256i s3 = _mm256_castsi128_si256(
516
1.25M
      _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
517
1.25M
  __m256i s4 = _mm256_castsi128_si256(
518
1.25M
      _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
519
1.25M
  __m256i s5 = _mm256_castsi128_si256(
520
1.25M
      _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
521
1.25M
  __m256i s6 = _mm256_castsi128_si256(
522
1.25M
      _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));
523
524
1.25M
  s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
525
1.25M
  s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
526
1.25M
  s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
527
1.25M
  s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
528
1.25M
  s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
529
1.25M
  s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);
530
531
1.25M
  sig[0] = _mm256_unpacklo_epi16(s0, s1);
532
1.25M
  sig[4] = _mm256_unpackhi_epi16(s0, s1);
533
1.25M
  sig[1] = _mm256_unpacklo_epi16(s2, s3);
534
1.25M
  sig[5] = _mm256_unpackhi_epi16(s2, s3);
535
1.25M
  sig[2] = _mm256_unpacklo_epi16(s4, s5);
536
1.25M
  sig[6] = _mm256_unpackhi_epi16(s4, s5);
537
1.25M
  sig[8] = s6;
538
1.25M
}
539
540
static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
541
4.79M
                                   __m256i *sig) {
542
  // base + 7th row
543
4.79M
  __m256i s0 = _mm256_castsi128_si256(
544
4.79M
      _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
545
  // base + 8th row
546
4.79M
  __m256i s1 = _mm256_castsi128_si256(
547
4.79M
      _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
548
4.79M
  __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
549
4.79M
  __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
550
4.79M
  sig[3] = _mm256_unpacklo_epi16(s2, s3);
551
4.79M
  sig[7] = _mm256_unpackhi_epi16(s2, s3);
552
4.79M
  sig[8] = s1;
553
4.79M
}
554
555
static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
556
4.79M
                                     __m256i *y0, __m256i *y1) {
557
4.79M
  filter_8x1_pixels(sig, f, y0);
558
4.79M
  filter_8x1_pixels(&sig[4], f, y1);
559
4.79M
}
560
561
34.0M
static INLINE void update_pixels(__m256i *sig) {
562
34.0M
  int i;
563
136M
  for (i = 0; i < 3; ++i) {
564
102M
    sig[i] = sig[i + 1];
565
102M
    sig[i + 4] = sig[i + 5];
566
102M
  }
567
34.0M
}
568
569
static void vpx_highbd_filter_block1d8_v8_avx2(
570
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
571
863k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
572
863k
  __m256i signal[9], res0, res1;
573
863k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
574
575
863k
  __m256i ff[4];
576
863k
  pack_filters(filter, ff);
577
578
863k
  pack_8x9_init(src_ptr, src_pitch, signal);
579
580
3.31M
  do {
581
3.31M
    pack_8x9_pixels(src_ptr, src_pitch, signal);
582
583
3.31M
    filter_8x9_pixels(signal, ff, &res0, &res1);
584
3.31M
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
585
3.31M
    update_pixels(signal);
586
587
3.31M
    src_ptr += src_pitch << 1;
588
3.31M
    dst_ptr += dst_pitch << 1;
589
3.31M
    height -= 2;
590
3.31M
  } while (height > 0);
591
863k
}
592
593
926k
static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
594
926k
  __m256i u0, u1, u2, u3;
595
  // load 0-6 rows
596
926k
  const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
597
926k
  const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
598
926k
  const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
599
926k
  const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
600
926k
  const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
601
926k
  const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
602
926k
  const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));
603
604
926k
  u0 = _mm256_permute2x128_si256(s0, s1, 0x20);  // 0, 1 low
605
926k
  u1 = _mm256_permute2x128_si256(s0, s1, 0x31);  // 0, 1 high
606
607
926k
  u2 = _mm256_permute2x128_si256(s1, s2, 0x20);  // 1, 2 low
608
926k
  u3 = _mm256_permute2x128_si256(s1, s2, 0x31);  // 1, 2 high
609
610
926k
  sig[0] = _mm256_unpacklo_epi16(u0, u2);
611
926k
  sig[4] = _mm256_unpackhi_epi16(u0, u2);
612
613
926k
  sig[8] = _mm256_unpacklo_epi16(u1, u3);
614
926k
  sig[12] = _mm256_unpackhi_epi16(u1, u3);
615
616
926k
  u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
617
926k
  u1 = _mm256_permute2x128_si256(s2, s3, 0x31);
618
619
926k
  u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
620
926k
  u3 = _mm256_permute2x128_si256(s3, s4, 0x31);
621
622
926k
  sig[1] = _mm256_unpacklo_epi16(u0, u2);
623
926k
  sig[5] = _mm256_unpackhi_epi16(u0, u2);
624
625
926k
  sig[9] = _mm256_unpacklo_epi16(u1, u3);
626
926k
  sig[13] = _mm256_unpackhi_epi16(u1, u3);
627
628
926k
  u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
629
926k
  u1 = _mm256_permute2x128_si256(s4, s5, 0x31);
630
631
926k
  u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
632
926k
  u3 = _mm256_permute2x128_si256(s5, s6, 0x31);
633
634
926k
  sig[2] = _mm256_unpacklo_epi16(u0, u2);
635
926k
  sig[6] = _mm256_unpackhi_epi16(u0, u2);
636
637
926k
  sig[10] = _mm256_unpacklo_epi16(u1, u3);
638
926k
  sig[14] = _mm256_unpackhi_epi16(u1, u3);
639
640
926k
  sig[16] = s6;
641
926k
}
642
643
static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
644
14.6M
                             __m256i *sig) {
645
  // base + 7th row
646
14.6M
  const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
647
  // base + 8th row
648
14.6M
  const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));
649
650
14.6M
  __m256i u0, u1, u2, u3;
651
14.6M
  u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
652
14.6M
  u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);
653
654
14.6M
  u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
655
14.6M
  u3 = _mm256_permute2x128_si256(s7, s8, 0x31);
656
657
14.6M
  sig[3] = _mm256_unpacklo_epi16(u0, u2);
658
14.6M
  sig[7] = _mm256_unpackhi_epi16(u0, u2);
659
660
14.6M
  sig[11] = _mm256_unpacklo_epi16(u1, u3);
661
14.6M
  sig[15] = _mm256_unpackhi_epi16(u1, u3);
662
663
14.6M
  sig[16] = s8;
664
14.6M
}
665
666
static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
667
14.6M
                                      __m256i *y0, __m256i *y1) {
668
14.6M
  __m256i res[4];
669
14.6M
  int i;
670
73.1M
  for (i = 0; i < 4; ++i) {
671
58.5M
    filter_8x1_pixels(&sig[i << 2], f, &res[i]);
672
58.5M
  }
673
674
14.6M
  {
675
14.6M
    const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
676
14.6M
    const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
677
14.6M
    *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
678
14.6M
    *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
679
14.6M
  }
680
14.6M
}
681
682
static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
683
                                     const __m256i *mask, uint16_t *dst,
684
10.0M
                                     ptrdiff_t pitch) {
685
10.0M
  __m256i p = _mm256_min_epi16(*y0, *mask);
686
10.0M
  _mm256_storeu_si256((__m256i *)dst, p);
687
10.0M
  p = _mm256_min_epi16(*y1, *mask);
688
10.0M
  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
689
10.0M
}
690
691
14.6M
static void update_16x9_pixels(__m256i *sig) {
692
14.6M
  update_pixels(&sig[0]);
693
14.6M
  update_pixels(&sig[8]);
694
14.6M
}
695
696
static void vpx_highbd_filter_block1d16_v8_avx2(
697
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
698
642k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
699
642k
  __m256i signal[17], res0, res1;
700
642k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
701
702
642k
  __m256i ff[4];
703
642k
  pack_filters(filter, ff);
704
705
642k
  pack_16x9_init(src_ptr, src_pitch, signal);
706
707
10.0M
  do {
708
10.0M
    pack_16x9_pixels(src_ptr, src_pitch, signal);
709
10.0M
    filter_16x9_pixels(signal, ff, &res0, &res1);
710
10.0M
    store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
711
10.0M
    update_16x9_pixels(signal);
712
713
10.0M
    src_ptr += src_pitch << 1;
714
10.0M
    dst_ptr += dst_pitch << 1;
715
10.0M
    height -= 2;
716
10.0M
  } while (height > 0);
717
642k
}
718
719
// -----------------------------------------------------------------------------
720
// 2-tap vertical filtering
721
722
508k
// Primes the 16-wide 2-tap vertical pipeline by loading the 16 pixels at src
// (unaligned) into sig[2].
static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
  const __m256i first_row = _mm256_loadu_si256((const __m256i *)src);
  sig[2] = first_row;
}
725
726
// Interleaves the row kept in sig[2] with the row at src + pitch:
//   sig[0] <- low-half 16-bit interleave of (previous row, next row)
//   sig[1] <- high-half 16-bit interleave of (previous row, next row)
// The newly loaded row then replaces sig[2] for the following call.
static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
                                       __m256i *sig) {
  const __m256i prev_row = sig[2];
  const __m256i next_row = _mm256_loadu_si256((const __m256i *)(src + pitch));

  sig[0] = _mm256_unpacklo_epi16(prev_row, next_row);
  sig[1] = _mm256_unpackhi_epi16(prev_row, next_row);
  sig[2] = next_row;
}
734
735
// Vertical-path name for the 16-wide 2-tap filter step; it forwards all of
// its arguments unchanged to filter_16_2t_pixels().
static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
                                         __m256i *y0, __m256i *y1) {
  filter_16_2t_pixels(sig, f, y0, y1);
}
739
740
// 2-tap vertical filter for a 16-pixel-wide block, one output row per pass.
// (1 << bd) - 1 is handed to store_16x1_pixels() as the pixel limit.
//
// The loop body always runs at least once; it ends when height, decremented
// once per row, reaches zero.
static void vpx_highbd_filter_block1d16_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i pixel_max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i rows[3];
  __m256i taps;
  __m256i out0, out1;

  pack_2t_filter(filter, &taps);
  pack_16x2_init(src_ptr, rows);

  do {
    pack_16x2_2t_pixels(src_ptr, src_pitch, rows);
    filter_16x2_2t_pixels(rows, &taps, &out0, &out1);
    store_16x1_pixels(&out0, &out1, &pixel_max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height != 0);
}
760
761
328k
// Builds the 128-bit 2-tap coefficient register. Sixteen bytes (eight int16
// taps) are loaded from filter, and bytes 6..9 -- i.e. filter[3] and
// filter[4] -- are replicated into every 32-bit lane of *f.
static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
  const __m128i select_taps_3_4 = _mm_set1_epi32(0x09080706);
  const __m128i kernel = _mm_loadu_si128((const __m128i *)filter);

  *f = _mm_shuffle_epi8(kernel, select_taps_3_4);
}
766
767
328k
// Primes the 8-wide 2-tap vertical pipeline by loading the 8 pixels at src
// (unaligned) into sig[2].
static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
  const __m128i first_row = _mm_loadu_si128((const __m128i *)src);
  sig[2] = first_row;
}
770
771
// Interleaves the row kept in sig[2] with the row at src + pitch:
//   sig[0] <- low-half 16-bit interleave of (previous row, next row)
//   sig[1] <- high-half 16-bit interleave of (previous row, next row)
// The newly loaded row then replaces sig[2] for the following call.
static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
                                          __m128i *sig) {
  const __m128i prev_row = sig[2];
  const __m128i next_row = _mm_loadu_si128((const __m128i *)(src + pitch));

  sig[0] = _mm_unpacklo_epi16(prev_row, next_row);
  sig[1] = _mm_unpackhi_epi16(prev_row, next_row);
  sig[2] = next_row;
}
779
780
static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
781
3.56M
                                      __m128i *y0, __m128i *y1) {
782
3.56M
  const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
783
3.56M
  __m128i x0 = _mm_madd_epi16(sig[0], *f);
784
3.56M
  __m128i x1 = _mm_madd_epi16(sig[1], *f);
785
3.56M
  x0 = _mm_add_epi32(x0, rounding);
786
3.56M
  x1 = _mm_add_epi32(x1, rounding);
787
3.56M
  *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
788
3.56M
  *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
789
3.56M
}
790
791
// Narrows the eight 32-bit results in *y0 / *y1 to 16 bits with unsigned
// saturation, limits them with a signed 16-bit minimum against *mask and
// stores the row of 8 pixels to dst (unaligned).
static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
                                           const __m128i *mask, uint16_t *dst) {
  const __m128i packed = _mm_packus_epi32(*y0, *y1);
  const __m128i clamped = _mm_min_epi16(packed, *mask);

  _mm_storeu_si128((__m128i *)dst, clamped);
}
797
798
// 2-tap vertical filter for an 8-pixel-wide block using 128-bit registers,
// one output row per pass. Results are limited to (1 << bd) - 1 by
// store_8x1_2t_pixels_ver().
//
// The loop body always runs at least once; it ends when height, decremented
// once per row, reaches zero.
static void vpx_highbd_filter_block1d8_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);
  __m128i rows[3];
  __m128i taps;
  __m128i out0, out1;

  pack_8x1_2t_filter(filter, &taps);
  pack_8x2_init(src_ptr, rows);

  do {
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, rows);
    filter_8_2t_pixels(rows, &taps, &out0, &out1);
    store_8x1_2t_pixels_ver(&out0, &out1, &pixel_max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height != 0);
}
818
819
// Filtering with averaging: results are averaged with the pixels already in dst
820
821
// Single-row averaging store for 8 pixels. The eight 32-bit results held in
// the two halves of *y0 are narrowed to 16 bits with unsigned saturation,
// limited against the low 128 bits of *mask, rounded-averaged with the 8
// pixels currently at dst, and written back to dst.
static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask,
                                        uint16_t *dst) {
  const __m128i lo = _mm256_castsi256_si128(*y0);
  const __m128i hi = _mm256_extractf128_si256(*y0, 1);
  const __m128i packed = _mm_packus_epi32(lo, hi);
  const __m128i existing = _mm_loadu_si128((const __m128i *)dst);
  const __m128i limit = _mm256_castsi256_si128(*mask);
  const __m128i clamped = _mm_min_epi16(packed, limit);

  _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(clamped, existing));
}
831
832
// Two-row averaging store for 8-pixel-wide rows. *y0 / *y1 are narrowed to
// 16 bits with unsigned saturation, limited against *mask and
// rounded-averaged with the pixels already at dst (low 128 bits) and
// dst + pitch (high 128 bits). Both destination rows are read before either
// is written.
static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1,
                                        const __m256i *mask, uint16_t *dst,
                                        ptrdiff_t pitch) {
  uint16_t *const dst1 = dst + pitch;
  const __m256i packed = _mm256_packus_epi32(*y0, *y1);
  const __m128i existing0 = _mm_loadu_si128((const __m128i *)dst);
  const __m128i existing1 = _mm_loadu_si128((const __m128i *)dst1);
  const __m256i existing = _mm256_insertf128_si256(
      _mm256_castsi128_si256(existing0), existing1, 1);
  const __m256i clamped = _mm256_min_epi16(packed, *mask);
  const __m256i averaged = _mm256_avg_epu16(clamped, existing);

  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(averaged));
  _mm_storeu_si128((__m128i *)dst1, _mm256_extractf128_si256(averaged, 1));
}
845
846
// Single-row averaging store for 16 pixels. *y0 / *y1 are narrowed to 16 bits
// with unsigned saturation, limited against *mask, rounded-averaged with the
// 16 pixels currently at dst and written back to dst.
static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1,
                                         const __m256i *mask, uint16_t *dst) {
  const __m256i packed = _mm256_packus_epi32(*y0, *y1);
  const __m256i existing = _mm256_loadu_si256((const __m256i *)dst);
  const __m256i clamped = _mm256_min_epi16(packed, *mask);

  _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(clamped, existing));
}
854
855
// Two-row averaging store for 16-pixel-wide rows. *y0 and *y1 already hold
// 16-bit pixels; each is limited against *mask and rounded-averaged with the
// existing destination row (dst for *y0, dst + pitch for *y1). Both
// destination rows are read before either is written.
static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1,
                                         const __m256i *mask, uint16_t *dst,
                                         ptrdiff_t pitch) {
  uint16_t *const dst1 = dst + pitch;
  const __m256i existing0 = _mm256_loadu_si256((const __m256i *)dst);
  const __m256i existing1 = _mm256_loadu_si256((const __m256i *)dst1);
  __m256i row;

  row = _mm256_avg_epu16(_mm256_min_epi16(*y0, *mask), existing0);
  _mm256_storeu_si256((__m256i *)dst, row);

  row = _mm256_avg_epu16(_mm256_min_epi16(*y1, *mask), existing1);
  _mm256_storeu_si256((__m256i *)dst1, row);
}
868
869
// Averaging counterpart of the 8-wide single-row store: *y0 / *y1 are
// narrowed to 16 bits with unsigned saturation, limited against *mask,
// rounded-averaged with the 8 pixels currently at dst and written back.
static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0,
                                               const __m128i *y1,
                                               const __m128i *mask,
                                               uint16_t *dst) {
  const __m128i packed = _mm_packus_epi32(*y0, *y1);
  const __m128i existing = _mm_loadu_si128((const __m128i *)dst);
  const __m128i clamped = _mm_min_epi16(packed, *mask);

  _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(clamped, existing));
}
879
880
// 8-tap horizontal filter for an 8-pixel-wide block whose output is averaged
// with the pixels already present in dst.
//
//   src_ptr / src_pitch : source pixels and row stride (in uint16_t units)
//   dst_ptr / dst_pitch : destination pixels and row stride
//   height              : number of rows to produce
//   filter              : filter taps, consumed by pack_filters()
//   bd                  : bit depth; outputs are limited to (1 << bd) - 1
//
// Rows are handled in pairs, with a single-row tail for odd heights. The pair
// loop is tested before each pass, so a height of 0 or 1 never touches a
// second row and never wraps the unsigned row counter.
static void vpx_highbd_filter_block1d8_h8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  // The 8-tap window starts three samples to the left of the output pixel.
  src_ptr -= 3;
  while (height > 1) {
    pack_8x2_pixels(src_ptr, src_pitch, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  }

  // Odd height: one row is left over.
  if (height > 0) {
    pack_8x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    store_8x1_avg_pixels(&res0, &max, dst_ptr);
  }
}
906
907
// 8-tap horizontal filter for a 16-pixel-wide block, one row per pass, with
// the result averaged into dst by store_16x1_avg_pixels(). Outputs are
// limited to (1 << bd) - 1.
//
// The loop body always runs at least once; it ends when height, decremented
// once per row, reaches zero.
static void vpx_highbd_filter_block1d16_h8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i pixel_max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i window[8];
  __m256i taps[4];
  __m256i out_lo, out_hi;

  pack_filters(filter, taps);

  // The 8-tap window starts three samples to the left of the output pixel.
  src_ptr -= 3;
  do {
    pack_16x1_pixels(src_ptr, window);
    filter_8x1_pixels(window, taps, &out_lo);
    filter_8x1_pixels(window + 4, taps, &out_hi);
    store_16x1_avg_pixels(&out_lo, &out_hi, &pixel_max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height != 0);
}
927
928
static void vpx_highbd_filter_block1d4_h4_avx2(
929
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
930
0
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
931
  // We extract the middle four elements of the kernel into two registers in
932
  // the form
933
  // ... k[3] k[2] k[3] k[2]
934
  // ... k[5] k[4] k[5] k[4]
935
  // Then we shuffle the source into
936
  // ... s[1] s[0] s[0] s[-1]
937
  // ... s[3] s[2] s[2] s[1]
938
  // Calling multiply and add gives us half of the sum. Calling add on the two
939
  // halves gives us the output. Since avx2 allows us to use 256-bit buffer, we
940
  // can do this two rows at a time.
941
942
0
  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
943
0
  __m256i res_reg;
944
0
  __m256i idx_shift_0 =
945
0
      _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
946
0
                       3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
947
0
  __m256i idx_shift_2 =
948
0
      _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
949
0
                       5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
950
951
0
  __m128i kernel_reg_128;  // Kernel
952
0
  __m256i kernel_reg, kernel_reg_23,
953
0
      kernel_reg_45;  // Segments of the kernel used
954
0
  const __m256i reg_round =
955
0
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
956
0
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
957
0
  const ptrdiff_t unrolled_src_stride = src_stride << 1;
958
0
  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
959
0
  int h;
960
961
  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
962
0
  src_ptr -= 1;
963
964
  // Load Kernel
965
0
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
966
0
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
967
0
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
968
0
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
969
970
0
  for (h = height; h >= 2; h -= 2) {
971
    // Load the source
972
0
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
973
0
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
974
0
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
975
976
    // Get the output
977
0
    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
978
0
                                   &kernel_reg_23, &kernel_reg_45);
979
980
    // Round the result
981
0
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
982
983
    // Finally combine to get the final dst
984
0
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
985
0
    res_reg = _mm256_min_epi16(res_reg, reg_max);
986
0
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
987
0
                        &res_reg);
988
989
0
    src_ptr += unrolled_src_stride;
990
0
    dst_ptr += unrolled_dst_stride;
991
0
  }
992
993
  // Repeat for the last row if needed
994
0
  if (h > 0) {
995
    // Load the source
996
0
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
997
0
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
998
0
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
999
1000
    // Get the output
1001
0
    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
1002
0
                                   &kernel_reg_23, &kernel_reg_45);
1003
1004
    // Round the result
1005
0
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
1006
1007
    // Finally combine to get the final dst
1008
0
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
1009
0
    res_reg = _mm256_min_epi16(res_reg, reg_max);
1010
0
    _mm_storel_epi64((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg));
1011
0
  }
1012
0
}
1013
1014
static void vpx_highbd_filter_block1d8_h4_avx2(
1015
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
1016
0
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
1017
  // We will extract the middle four elements of the kernel into two registers
1018
  // in the form
1019
  // ... k[3] k[2] k[3] k[2]
1020
  // ... k[5] k[4] k[5] k[4]
1021
  // Then we shuffle the source into
1022
  // ... s[1] s[0] s[0] s[-1]
1023
  // ... s[3] s[2] s[2] s[1]
1024
  // Calling multiply and add gives us half of the sum of the first half.
1025
  // Calling add gives us first half of the output. Repat again to get the whole
1026
  // output. Since avx2 allows us to use 256-bit buffer, we can do this two rows
1027
  // at a time.
1028
1029
0
  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
1030
0
  __m256i res_reg, res_first, res_last;
1031
0
  __m256i idx_shift_0 =
1032
0
      _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
1033
0
                       3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
1034
0
  __m256i idx_shift_2 =
1035
0
      _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
1036
0
                       5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
1037
1038
0
  __m128i kernel_reg_128;  // Kernel
1039
0
  __m256i kernel_reg, kernel_reg_23,
1040
0
      kernel_reg_45;  // Segments of the kernel used
1041
0
  const __m256i reg_round =
1042
0
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
1043
0
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
1044
0
  const ptrdiff_t unrolled_src_stride = src_stride << 1;
1045
0
  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
1046
0
  int h;
1047
1048
  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
1049
0
  src_ptr -= 1;
1050
1051
  // Load Kernel
1052
0
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
1053
0
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
1054
0
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
1055
0
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
1056
1057
0
  for (h = height; h >= 2; h -= 2) {
1058
    // Load the source
1059
0
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
1060
0
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
1061
0
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
1062
1063
    // Result for first half
1064
0
    res_first = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
1065
0
                                     &kernel_reg_23, &kernel_reg_45);
1066
1067
    // Do again to get the second half of dst
1068
    // Load the source
1069
0
    src_reg = mm256_loadu2_si128(src_ptr + 4, src_ptr + src_stride + 4);
1070
0
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
1071
0
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
1072
1073
    // Result for second half
1074
0
    res_last = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
1075
0
                                    &kernel_reg_23, &kernel_reg_45);
1076
1077
    // Round each result
1078
0
    res_first = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS);
1079
0
    res_last = mm256_round_epi32(&res_last, &reg_round, CONV8_ROUNDING_BITS);
1080
1081
    // Finally combine to get the final dst
1082
0
    res_reg = _mm256_packus_epi32(res_first, res_last);
1083
0
    res_reg = _mm256_min_epi16(res_reg, reg_max);
1084
0
    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
1085
0
                       &res_reg);
1086
1087
0
    src_ptr += unrolled_src_stride;
1088
0
    dst_ptr += unrolled_dst_stride;
1089
0
  }
1090
1091
  // Repeat for the last row if needed
1092
0
  if (h > 0) {
1093
0
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
1094
0
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
1095
0
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
1096
1097
0
    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
1098
0
                                   &kernel_reg_23, &kernel_reg_45);
1099
1100
0
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
1101
1102
0
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
1103
0
    res_reg = _mm256_min_epi16(res_reg, reg_max);
1104
1105
0
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg);
1106
0
  }
1107
0
}
1108
1109
// 4-tap horizontal filter for a 16-pixel-wide block, built from two runs of
// the 8-wide version: first columns 0..7, then columns 8..15.
static void vpx_highbd_filter_block1d16_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  int col;

  for (col = 0; col < 16; col += 8) {
    vpx_highbd_filter_block1d8_h4_avx2(src_ptr + col, src_stride,
                                       dst_ptr + col, dst_stride, height,
                                       kernel, bd);
  }
}
1117
1118
// 8-tap vertical filter for an 8-pixel-wide block, two output rows per pass,
// with the result averaged into dst by store_8x2_avg_pixels(). Outputs are
// limited to (1 << bd) - 1.
//
// The loop body always runs at least once and removes two rows from height
// each pass; it stops only when height reaches exactly zero.
static void vpx_highbd_filter_block1d8_v8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i pixel_max = _mm256_set1_epi16((1 << bd) - 1);
  const ptrdiff_t src_step = src_pitch << 1;  // two source rows
  const ptrdiff_t dst_step = dst_pitch << 1;  // two destination rows
  __m256i rows[9];
  __m256i taps[4];
  __m256i out0, out1;

  pack_filters(filter, taps);
  pack_8x9_init(src_ptr, src_pitch, rows);

  do {
    pack_8x9_pixels(src_ptr, src_pitch, rows);

    filter_8x9_pixels(rows, taps, &out0, &out1);
    store_8x2_avg_pixels(&out0, &out1, &pixel_max, dst_ptr, dst_pitch);
    update_pixels(rows);

    src_ptr += src_step;
    dst_ptr += dst_step;
    height -= 2;
  } while (height != 0);
}
1141
1142
// 8-tap vertical filter for a 16-pixel-wide block, two output rows per pass,
// with the result averaged into dst by store_16x2_avg_pixels(). Outputs are
// limited to (1 << bd) - 1.
//
// The loop body always runs at least once and removes two rows from height
// each pass; it stops only when height reaches exactly zero.
static void vpx_highbd_filter_block1d16_v8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i pixel_max = _mm256_set1_epi16((1 << bd) - 1);
  const ptrdiff_t src_step = src_pitch << 1;  // two source rows
  const ptrdiff_t dst_step = dst_pitch << 1;  // two destination rows
  __m256i rows[17];
  __m256i taps[4];
  __m256i out0, out1;

  pack_filters(filter, taps);
  pack_16x9_init(src_ptr, src_pitch, rows);

  do {
    pack_16x9_pixels(src_ptr, src_pitch, rows);
    filter_16x9_pixels(rows, taps, &out0, &out1);
    store_16x2_avg_pixels(&out0, &out1, &pixel_max, dst_ptr, dst_pitch);
    update_16x9_pixels(rows);

    src_ptr += src_step;
    dst_ptr += dst_step;
    height -= 2;
  } while (height != 0);
}
1164
1165
// 2-tap horizontal filter for an 8-pixel-wide block whose output is averaged
// with the pixels already present in dst.
//
//   src_ptr / src_pitch : source pixels and row stride (in uint16_t units)
//   dst_ptr / dst_pitch : destination pixels and row stride
//   height              : number of rows to produce
//   filter              : filter taps, consumed by pack_2t_filter()
//   bd                  : bit depth; outputs are limited to (1 << bd) - 1
//
// Rows are handled in pairs, with a single-row tail for odd heights. The pair
// loop is tested before each pass, so a height of 0 or 1 never touches a
// second row and never wraps the unsigned row counter.
static void vpx_highbd_filter_block1d8_h2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  // Same three-sample back-off as the 8-tap path; the 2-tap pack helpers are
  // presumably written against that offset -- confirm in their definitions.
  src_ptr -= 3;
  while (height > 1) {
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  }

  // Odd height: one row is left over.
  if (height > 0) {
    pack_8x1_2t_pixels(src_ptr, signal);
    filter_8x1_2t_pixels(signal, &ff, &res0);
    store_8x1_avg_pixels(&res0, &max, dst_ptr);
  }
}
1190
1191
// 2-tap horizontal filter for a 16-pixel-wide block, one row per pass, with
// the result averaged into dst by store_16x1_avg_pixels(). Outputs are
// limited to (1 << bd) - 1.
//
// The loop body always runs at least once; it ends when height, decremented
// once per row, reaches zero.
static void vpx_highbd_filter_block1d16_h2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i pixel_max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i window[2];
  __m256i taps;
  __m256i out0, out1;

  pack_2t_filter(filter, &taps);

  src_ptr -= 3;
  do {
    pack_16x1_2t_pixels(src_ptr, window);
    filter_16_2t_pixels(window, &taps, &out0, &out1);
    store_16x1_avg_pixels(&out0, &out1, &pixel_max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height != 0);
}
1210
1211
// 2-tap vertical filter for a 16-pixel-wide block, one output row per pass,
// with the result averaged into dst by store_16x1_avg_pixels(). Outputs are
// limited to (1 << bd) - 1.
//
// The loop body always runs at least once; it ends when height, decremented
// once per row, reaches zero.
static void vpx_highbd_filter_block1d16_v2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i pixel_max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i rows[3];
  __m256i taps;
  __m256i out0, out1;

  pack_2t_filter(filter, &taps);
  pack_16x2_init(src_ptr, rows);

  do {
    pack_16x2_2t_pixels(src_ptr, src_pitch, rows);
    filter_16x2_2t_pixels(rows, &taps, &out0, &out1);
    store_16x1_avg_pixels(&out0, &out1, &pixel_max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height != 0);
}
1231
1232
// 2-tap vertical filter for an 8-pixel-wide block using 128-bit registers,
// one output row per pass, with the result averaged into dst by
// store_8x1_2t_avg_pixels_ver(). Outputs are limited to (1 << bd) - 1.
//
// The loop body always runs at least once; it ends when height, decremented
// once per row, reaches zero.
static void vpx_highbd_filter_block1d8_v2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);
  __m128i rows[3];
  __m128i taps;
  __m128i out0, out1;

  pack_8x1_2t_filter(filter, &taps);
  pack_8x2_init(src_ptr, rows);

  do {
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, rows);
    filter_8_2t_pixels(rows, &taps, &out0, &out1);
    store_8x1_2t_avg_pixels_ver(&out0, &out1, &pixel_max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height != 0);
}
1252
1253
static void vpx_highbd_filter_block1d4_v4_avx2(
1254
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
1255
0
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
1256
  // We will load two rows of pixels and rearrange them into the form
1257
  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
1258
  // so that we can call multiply and add with the kernel partial output. Then
1259
  // we can call add with another row to get the output.
1260
1261
  // Register for source s[-1:3, :]
1262
0
  __m256i src_reg_1, src_reg_2, src_reg_3;
1263
  // Interleaved rows of the source. lo is first half, hi second
1264
0
  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
1265
0
  __m256i src_reg_m1001, src_reg_1223;
1266
1267
  // Result after multiply and add
1268
0
  __m256i res_reg;
1269
1270
0
  __m128i kernel_reg_128;                            // Kernel
1271
0
  __m256i kernel_reg, kernel_reg_23, kernel_reg_45;  // Segments of kernel used
1272
1273
0
  const __m256i reg_round =
1274
0
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
1275
0
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
1276
0
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
1277
0
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
1278
0
  int h;
1279
1280
  // Load Kernel
1281
0
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
1282
0
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
1283
0
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
1284
0
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
1285
1286
  // Row -1 to row 0
1287
0
  src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr,
1288
0
                                   (const __m128i *)(src_ptr + src_stride));
1289
1290
  // Row 0 to row 1
1291
0
  src_reg_1 = _mm256_castsi128_si256(
1292
0
      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
1293
0
  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
1294
1295
  // First three rows
1296
0
  src_reg_m1001 = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
1297
1298
0
  for (h = height; h > 1; h -= 2) {
1299
0
    src_reg_2 = _mm256_castsi128_si256(
1300
0
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));
1301
1302
0
    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
1303
0
                                         _mm256_castsi256_si128(src_reg_2), 1);
1304
1305
0
    src_reg_3 = _mm256_castsi128_si256(
1306
0
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));
1307
1308
0
    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
1309
0
                                         _mm256_castsi256_si128(src_reg_3), 1);
1310
1311
    // Last three rows
1312
0
    src_reg_1223 = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
1313
1314
    // Output
1315
0
    res_reg = mm256_madd_add_epi32(&src_reg_m1001, &src_reg_1223,
1316
0
                                   &kernel_reg_23, &kernel_reg_45);
1317
1318
    // Round the words
1319
0
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
1320
1321
    // Combine to get the result
1322
0
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
1323
0
    res_reg = _mm256_min_epi16(res_reg, reg_max);
1324
1325
    // Save the result
1326
0
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
1327
0
                        &res_reg);
1328
1329
    // Update the source by two rows
1330
0
    src_ptr += src_stride_unrolled;
1331
0
    dst_ptr += dst_stride_unrolled;
1332
1333
0
    src_reg_m1001 = src_reg_1223;
1334
0
    src_reg_1 = src_reg_3;
1335
0
  }
1336
0
}
1337
1338
static void vpx_highbd_filter_block1d8_v4_avx2(
1339
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
1340
0
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
1341
  // We will load two rows of pixels and rearrange them into the form
1342
  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
1343
  // so that we can call multiply and add with the kernel partial output. Then
1344
  // we can call add with another row to get the output.
1345
1346
  // Register for source s[-1:3, :]
1347
0
  __m256i src_reg_1, src_reg_2, src_reg_3;
1348
  // Interleaved rows of the source. lo is first half, hi second
1349
0
  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
1350
0
  __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi;
1351
1352
0
  __m128i kernel_reg_128;                            // Kernel
1353
0
  __m256i kernel_reg, kernel_reg_23, kernel_reg_45;  // Segments of kernel
1354
1355
  // Result after multiply and add
1356
0
  __m256i res_reg, res_reg_lo, res_reg_hi;
1357
1358
0
  const __m256i reg_round =
1359
0
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
1360
0
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
1361
0
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
1362
0
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
1363
0
  int h;
1364
1365
  // Load Kernel
1366
0
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
1367
0
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
1368
0
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
1369
0
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
1370
1371
  // Row -1 to row 0
1372
0
  src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
1373
0
                                   (const __m128i *)(src_ptr + src_stride));
1374
1375
  // Row 0 to row 1
1376
0
  src_reg_1 = _mm256_castsi128_si256(
1377
0
      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
1378
0
  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
1379
1380
  // First three rows
1381
0
  src_reg_m1001_lo = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
1382
0
  src_reg_m1001_hi = _mm256_unpackhi_epi16(src_reg_m10, src_reg_01);
1383
1384
0
  for (h = height; h > 1; h -= 2) {
1385
0
    src_reg_2 = _mm256_castsi128_si256(
1386
0
        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)));
1387
1388
0
    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
1389
0
                                         _mm256_castsi256_si128(src_reg_2), 1);
1390
1391
0
    src_reg_3 = _mm256_castsi128_si256(
1392
0
        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)));
1393
1394
0
    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
1395
0
                                         _mm256_castsi256_si128(src_reg_3), 1);
1396
1397
    // Last three rows
1398
0
    src_reg_1223_lo = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
1399
0
    src_reg_1223_hi = _mm256_unpackhi_epi16(src_reg_12, src_reg_23);
1400
1401
    // Output from first half
1402
0
    res_reg_lo = mm256_madd_add_epi32(&src_reg_m1001_lo, &src_reg_1223_lo,
1403
0
                                      &kernel_reg_23, &kernel_reg_45);
1404
1405
    // Output from second half
1406
0
    res_reg_hi = mm256_madd_add_epi32(&src_reg_m1001_hi, &src_reg_1223_hi,
1407
0
                                      &kernel_reg_23, &kernel_reg_45);
1408
1409
    // Round the words
1410
0
    res_reg_lo =
1411
0
        mm256_round_epi32(&res_reg_lo, &reg_round, CONV8_ROUNDING_BITS);
1412
0
    res_reg_hi =
1413
0
        mm256_round_epi32(&res_reg_hi, &reg_round, CONV8_ROUNDING_BITS);
1414
1415
    // Combine to get the result
1416
0
    res_reg = _mm256_packus_epi32(res_reg_lo, res_reg_hi);
1417
0
    res_reg = _mm256_min_epi16(res_reg, reg_max);
1418
1419
    // Save the result
1420
0
    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
1421
0
                       &res_reg);
1422
1423
    // Update the source by two rows
1424
0
    src_ptr += src_stride_unrolled;
1425
0
    dst_ptr += dst_stride_unrolled;
1426
1427
0
    src_reg_m1001_lo = src_reg_1223_lo;
1428
0
    src_reg_m1001_hi = src_reg_1223_hi;
1429
0
    src_reg_1 = src_reg_3;
1430
0
  }
1431
0
}
1432
1433
// 4-tap vertical filter for a 16-pixel-wide block, built from two runs of the
// 8-wide version: first columns 0..7, then columns 8..15.
static void vpx_highbd_filter_block1d16_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  int col;

  for (col = 0; col < 16; col += 8) {
    vpx_highbd_filter_block1d8_v4_avx2(src_ptr + col, src_stride,
                                       dst_ptr + col, dst_stride, height,
                                       kernel, bd);
  }
}
1441
1442
// Declarations of the 4-wide h8/v8 SSE2 routines.
// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
1443
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
1444
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;
1445
1446
// Declarations of the 4-wide h2/v2 SSE2 routines.
// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
1447
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
1448
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
1449
// Alias the 4-wide *_avx2 names to the SSE2 routines declared above, so any
// reference to vpx_highbd_filter_block1d4_{h8,h2,v8,v2}_avx2 resolves to the
// corresponding *_sse2 symbol.
// NOTE(review): presumably these names are consumed by the HIGH_FUN_CONV_*
// expansions further down -- confirm against the macro definitions.
1450
1.52M
#define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2
1451
488k
#define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2
1452
1.21M
#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2
1453
376k
#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2
1454
1455
// Alias every [vh]4 avg name (16-, 8- and 4-wide) to the matching [vh]8 avg
// version because there is no [vh]4 avg implementation. Only the averaging
// variants are aliased here.
1456
#define vpx_highbd_filter_block1d16_v4_avg_avx2 \
1457
0
  vpx_highbd_filter_block1d16_v8_avg_avx2
1458
#define vpx_highbd_filter_block1d16_h4_avg_avx2 \
1459
0
  vpx_highbd_filter_block1d16_h8_avg_avx2
1460
#define vpx_highbd_filter_block1d8_v4_avg_avx2 \
1461
0
  vpx_highbd_filter_block1d8_v8_avg_avx2
1462
#define vpx_highbd_filter_block1d8_h4_avg_avx2 \
1463
0
  vpx_highbd_filter_block1d8_h8_avg_avx2
1464
#define vpx_highbd_filter_block1d4_v4_avg_avx2 \
1465
0
  vpx_highbd_filter_block1d4_v8_avg_avx2
1466
#define vpx_highbd_filter_block1d4_h4_avg_avx2 \
1467
0
  vpx_highbd_filter_block1d4_h8_avg_avx2
1468
// Non-averaging instantiations: empty name infix, last argument 0.
// The vert variant passes a source pointer moved back by
// src_stride * (num_taps / 2 - 1) elements; the horiz variant passes src as is.
// NOTE(review): HIGH_FUN_CONV_1D / HIGH_FUN_CONV_2D are defined outside this
// chunk (presumably vpx_dsp/x86/convolve.h) -- confirm what they expand to.
1469
HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0)
1470
HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
1471
                 src - src_stride * (num_taps / 2 - 1), , avx2, 0)
1472
2.55M
HIGH_FUN_CONV_2D(, avx2, 0)
1473
1474
// Declarations of the 4-wide h8/v8 averaging SSE2 routines.
// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
1475
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
1476
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
1477
1478
// Declarations of the 4-wide h2/v2 averaging SSE2 routines.
// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
1479
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
1480
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
1481
// Alias the 4-wide *_avg_avx2 names to the averaging SSE2 routines declared
// above, so vpx_highbd_filter_block1d4_{h8,h2,v8,v2}_avg_avx2 resolve to the
// corresponding *_avg_sse2 symbols.
1482
#define vpx_highbd_filter_block1d4_h8_avg_avx2 \
1483
78.8k
  vpx_highbd_filter_block1d4_h8_avg_sse2
1484
#define vpx_highbd_filter_block1d4_h2_avg_avx2 \
1485
22.2k
  vpx_highbd_filter_block1d4_h2_avg_sse2
1486
#define vpx_highbd_filter_block1d4_v8_avg_avx2 \
1487
535k
  vpx_highbd_filter_block1d4_v8_avg_sse2
1488
#define vpx_highbd_filter_block1d4_v2_avg_avx2 \
1489
159k
  vpx_highbd_filter_block1d4_v2_avg_sse2
1490
// Averaging instantiations: name infix avg_, last argument 1.
// The avg_vert variant passes a source pointer moved back by
// src_stride * (num_taps / 2 - 1) elements; avg_horiz passes src as is.
// NOTE(review): HIGH_FUN_CONV_1D / HIGH_FUN_CONV_2D are defined outside this
// chunk (presumably vpx_dsp/x86/convolve.h) -- confirm what they expand to.
1491
HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1)
1492
HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
1493
                 src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1)
1494
HIGH_FUN_CONV_2D(avg_, avx2, 1)
1495
1496
#undef HIGHBD_FUNC
1497
#endif  // HAVE_X86_ASM