Coverage Report

Created: 2026-02-14 06:59

/src/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/convolve.h"
#include "vpx_dsp/x86/convolve_avx2.h"

// -----------------------------------------------------------------------------
// Copy and average

void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *filter, int x0_q4,
                                   int x_step_q4, int y0_q4, int y_step_q4,
                                   int w, int h, int bd) {
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;
  (void)bd;

  assert(w % 4 == 0);
  if (w > 32) {  // w = 64
    do {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
      const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
      _mm256_storeu_si256((__m256i *)(dst + 32), p2);
      _mm256_storeu_si256((__m256i *)(dst + 48), p3);
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (w > 16) {  // w = 32
    do {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (w > 8) {  // w = 16
    __m256i p0, p1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      src += src_stride;
      p1 = _mm256_loadu_si256((const __m256i *)src);
      src += src_stride;

      _mm256_storeu_si256((__m256i *)dst, p0);
      dst += dst_stride;
      _mm256_storeu_si256((__m256i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w > 4) {  // w = 8
    __m128i p0, p1;
    do {
      p0 = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;
      p1 = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;

      _mm_storeu_si128((__m128i *)dst, p0);
      dst += dst_stride;
      _mm_storeu_si128((__m128i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else {  // w = 4
    __m128i p0, p1;
    do {
      p0 = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      p1 = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;

      _mm_storel_epi64((__m128i *)dst, p0);
      dst += dst_stride;
      _mm_storel_epi64((__m128i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  }
}
void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                  uint16_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int x0_q4,
                                  int x_step_q4, int y0_q4, int y_step_q4,
                                  int w, int h, int bd) {
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;
  (void)bd;

  assert(w % 4 == 0);
  if (w > 32) {  // w = 64
    __m256i p0, p1, p2, p3, u0, u1, u2, u3;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
      p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
      src += src_stride;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
      u2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
      u3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
      _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2));
      _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3));
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (w > 16) {  // w = 32
    __m256i p0, p1, u0, u1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      src += src_stride;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (w > 8) {  // w = 16
    __m256i p0, p1, u0, u1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride));

      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + dst_stride),
                          _mm256_avg_epu16(p1, u1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  } else if (w > 4) {  // w = 8
    __m128i p0, p1, u0, u1;
    do {
      p0 = _mm_loadu_si128((const __m128i *)src);
      p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm_loadu_si128((const __m128i *)dst);
      u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));

      _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
      _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  } else {  // w = 4
    __m128i p0, p1, u0, u1;
    do {
      p0 = _mm_loadl_epi64((const __m128i *)src);
      p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm_loadl_epi64((const __m128i *)dst);
      u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));

      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0));
      _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  }
}
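
// [Editor's sketch, not part of the original file.] _mm256_avg_epu16() /
// _mm_avg_epu16() compute the rounding average (a + b + 1) >> 1 per lane, so
// the function above is the vector form of this scalar reference:
#if 0
static void highbd_convolve_avg_ref(const uint16_t *src, ptrdiff_t src_stride,
                                    uint16_t *dst, ptrdiff_t dst_stride, int w,
                                    int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) dst[x] = (uint16_t)((dst[x] + src[x] + 1) >> 1);
    src += src_stride;
    dst += dst_stride;
  }
}
#endif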

#if HAVE_X86_ASM
// -----------------------------------------------------------------------------
// Horizontal and vertical filtering

static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                              7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                              4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };

static const uint8_t signal_pattern_1[32] = { 4, 5, 6,  7,  6,  7,  8,  9,
                                              8, 9, 10, 11, 10, 11, 12, 13,
                                              4, 5, 6,  7,  6,  7,  8,  9,
                                              8, 9, 10, 11, 10, 11, 12, 13 };

static const uint8_t signal_pattern_2[32] = { 6,  7,  8,  9,  8,  9,  10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15,
                                              6,  7,  8,  9,  8,  9,  10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15 };

static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };

#define CONV8_ROUNDING_BITS (7)
#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1))
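
// [Editor's sketch, not part of the original file.] Every filtering path below
// computes, per output pixel, a dot product with the kernel taps, adds
// CONV8_ROUNDING_NUM, shifts down by CONV8_ROUNDING_BITS, and clamps to
// [0, (1 << bd) - 1]. A plain-C reference of the 8-tap case, with src pointing
// at the first of the eight input samples:
#if 0
static uint16_t highbd_filter8_ref(const uint16_t *src, const int16_t *filter,
                                   int bd) {
  const int max = (1 << bd) - 1;
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += (int)src[k] * (int)filter[k];
  sum = (sum + CONV8_ROUNDING_NUM) >> CONV8_ROUNDING_BITS;
  if (sum < 0) sum = 0;
  if (sum > max) sum = max;
  return (uint16_t)sum;
}
#endif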

// -----------------------------------------------------------------------------
// Horizontal Filtering

static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
  const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
  const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);

  p[0] = _mm256_shuffle_epi8(*s, sf0);  // x0x6
  p[1] = _mm256_shuffle_epi8(*s, sf1);  // x1x7
  p[2] = _mm256_shuffle_epi8(c, sf0);   // x2x4
  p[3] = _mm256_shuffle_epi8(c, sf1);   // x3x5
}

// Note:
//  Shared by 8x2 and 16x1 block
static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
                                  __m256i *x /*x[8]*/) {
  __m256i pp[8];
  pack_pixels(s0, pp);
  pack_pixels(s1, &pp[4]);
  x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
  x[4] = x[2];
  x[5] = x[3];
  x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
  x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
}

static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
  __m256i pp[8];
  __m256i s0;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  pack_pixels(&s0, pp);
  x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
}

static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
                                   __m256i *x) {
  __m256i s0, s1;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  s1 = _mm256_loadu_si256((const __m256i *)(src + stride));
  pack_16_pixels(&s0, &s1, x);
}

static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
  __m256i s0, s1;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  s1 = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_pixels(&s0, &s1, x);
}

// Note:
//  Shared by horizontal and vertical filtering
static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
  const __m256i p0 = _mm256_set1_epi32(0x03020100);
  const __m256i p1 = _mm256_set1_epi32(0x07060504);
  const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
  const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
  f[0] = _mm256_shuffle_epi8(hh, p0);
  f[1] = _mm256_shuffle_epi8(hh, p1);
  f[2] = _mm256_shuffle_epi8(hh, p2);
  f[3] = _mm256_shuffle_epi8(hh, p3);
}
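
// [Editor's note, a sketch not in the original file.] pack_filters()
// broadcasts the tap pairs (filter[0], filter[1]) .. (filter[6], filter[7])
// into f[0]..f[3], and pack_pixels() lines up the matching sample pairs, so
// each 32-bit lane of _mm256_madd_epi16(f[k], sig[k]) holds one partial
// pair-sum. The scalar decomposition being exploited:
#if 0
static int filter8_by_pairs(const uint16_t *s, const int16_t *f) {
  int k, sum = 0;
  for (k = 0; k < 4; ++k) {  // four madd pairs (2k, 2k + 1)
    sum += (int)s[2 * k] * f[2 * k] + (int)s[2 * k + 1] * f[2 * k + 1];
  }
  return sum;  // equal to the full 8-tap dot product
}
#endif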

static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
                                     const __m256i *fil /*fil[4]*/,
                                     __m256i *y) {
  __m256i a, a0, a1;

  a0 = _mm256_madd_epi16(fil[0], sig[0]);
  a1 = _mm256_madd_epi16(fil[3], sig[3]);
  a = _mm256_add_epi32(a0, a1);

  a0 = _mm256_madd_epi16(fil[1], sig[1]);
  a1 = _mm256_madd_epi16(fil[2], sig[2]);

  {
    const __m256i min = _mm256_min_epi32(a0, a1);
    a = _mm256_add_epi32(a, min);
  }
  {
    const __m256i max = _mm256_max_epi32(a0, a1);
    a = _mm256_add_epi32(a, max);
  }
  {
    const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
    a = _mm256_add_epi32(a, rounding);
    *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
  }
}
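
// [Editor's note, an assumption not stated in the original file.] The min/max
// pair above is arithmetically a plain addition of the two middle partial
// sums, since min(a0, a1) + max(a0, a1) == a0 + a1; the split ordering appears
// intended to control the growth of the intermediate 32-bit sums contributed
// by the large center taps, not to change the result.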

static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,
                                    uint16_t *dst) {
  const __m128i a0 = _mm256_castsi256_si128(*y);
  const __m128i a1 = _mm256_extractf128_si256(*y, 1);
  __m128i res = _mm_packus_epi32(a0, a1);
  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
  _mm_storeu_si128((__m128i *)dst, res);
}

static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
                                    const __m256i *mask, uint16_t *dst,
                                    ptrdiff_t pitch) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  a = _mm256_min_epi16(a, *mask);
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
}

static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  a = _mm256_min_epi16(a, *mask);
  _mm256_storeu_si256((__m256i *)dst, a);
}

static void vpx_highbd_filter_block1d8_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  src_ptr -= 3;
  do {
    pack_8x2_pixels(src_ptr, src_pitch, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  if (height > 0) {
    pack_8x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}

static void vpx_highbd_filter_block1d16_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  src_ptr -= 3;
  do {
    pack_16x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}

// -----------------------------------------------------------------------------
// 2-tap horizontal filtering

static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
  const __m256i p = _mm256_set1_epi32(0x09080706);
  f[0] = _mm256_shuffle_epi8(hh, p);
}
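
// [Editor's sketch, not part of the original file.] The 2-tap paths assume a
// bilinear kernel where only filter[3] and filter[4] are nonzero; the shuffle
// constant 0x09080706 selects exactly those two int16 taps into every 32-bit
// lane. Scalar reference, with src pointing at the sample under the output
// position:
#if 0
static uint16_t highbd_filter2_ref(const uint16_t *src, const int16_t *filter,
                                   int bd) {
  const int max = (1 << bd) - 1;
  int sum = (int)src[0] * filter[3] + (int)src[1] * filter[4];
  sum = (sum + CONV8_ROUNDING_NUM) >> CONV8_ROUNDING_BITS;
  if (sum < 0) sum = 0;
  if (sum > max) sum = max;
  return (uint16_t)sum;
}
#endif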

// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels()
// the difference is that s0/s1 hold either the first and second rows, or the
// first 16 samples and the same 16 samples shifted by 8
static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
                                     __m256i *sig) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
  __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
  __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
  __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  r1 = _mm256_shuffle_epi8(r1, sf2);
  sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
  sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
}

static INLINE void pack_8x2_2t_pixels(const uint16_t *src,
                                      const ptrdiff_t pitch, __m256i *sig) {
  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  pack_16_2t_pixels(&r0, &r1, sig);
}

static INLINE void pack_16x1_2t_pixels(const uint16_t *src,
                                       __m256i *sig /*sig[2]*/) {
  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_2t_pixels(&r0, &r1, sig);
}

static INLINE void pack_8x1_2t_pixels(const uint16_t *src,
                                      __m256i *sig /*sig[2]*/) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
  r0 = _mm256_permutevar8x32_epi32(r0, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
}

// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels()
static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
                                       __m256i *y0, __m256i *y1) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
  __m256i x1 = _mm256_madd_epi16(sig[1], *f);
  x0 = _mm256_add_epi32(x0, rounding);
  x1 = _mm256_add_epi32(x1, rounding);
  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
  *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
}

static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
                                        __m256i *y0) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
  x0 = _mm256_add_epi32(x0, rounding);
  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
}

static void vpx_highbd_filter_block1d8_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  src_ptr -= 3;
  do {
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  if (height > 0) {
    pack_8x1_2t_pixels(src_ptr, signal);
    filter_8x1_2t_pixels(signal, &ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}

static void vpx_highbd_filter_block1d16_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  src_ptr -= 3;
  do {
    pack_16x1_2t_pixels(src_ptr, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}

// -----------------------------------------------------------------------------
// Vertical Filtering

static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
  __m256i s1 =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
  __m256i s2 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
  __m256i s3 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
  __m256i s4 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
  __m256i s5 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
  __m256i s6 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));

  s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
  s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
  s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
  s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
  s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);

  sig[0] = _mm256_unpacklo_epi16(s0, s1);
  sig[4] = _mm256_unpackhi_epi16(s0, s1);
  sig[1] = _mm256_unpacklo_epi16(s2, s3);
  sig[5] = _mm256_unpackhi_epi16(s2, s3);
  sig[2] = _mm256_unpacklo_epi16(s4, s5);
  sig[6] = _mm256_unpackhi_epi16(s4, s5);
  sig[8] = s6;
}
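
// [Editor's note, describing the layout established above.] pack_8x9_init()
// leaves sig[0..2] holding columns 0-3 and sig[4..6] columns 4-7 of the
// interleaved row pairs (0,1), (2,3), (4,5); the upper 128-bit lane holds the
// pairs one row later, (1,2), (3,4), (5,6), so a single filter pass produces
// two output rows. sig[8] carries row 6 forward for pack_8x9_pixels() to
// complete the pairs (6,7) and (7,8).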

static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                                   __m256i *sig) {
  // base + 7th row
  __m256i s0 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
  // base + 8th row
  __m256i s1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
  __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
  __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  sig[3] = _mm256_unpacklo_epi16(s2, s3);
  sig[7] = _mm256_unpackhi_epi16(s2, s3);
  sig[8] = s1;
}

static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
                                     __m256i *y0, __m256i *y1) {
  filter_8x1_pixels(sig, f, y0);
  filter_8x1_pixels(&sig[4], f, y1);
}

static INLINE void update_pixels(__m256i *sig) {
  int i;
  for (i = 0; i < 3; ++i) {
    sig[i] = sig[i + 1];
    sig[i + 4] = sig[i + 5];
  }
}
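
// [Editor's note.] update_pixels() slides the vertical window down two source
// rows: interleaved row-pair k replaces pair k - 1 in both the low-half
// (sig[0..3]) and high-half (sig[4..7]) groups, so each loop iteration only
// needs pack_8x9_pixels() to append the two rows that just entered the window.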

static void vpx_highbd_filter_block1d8_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[9], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  pack_8x9_init(src_ptr, src_pitch, signal);

  do {
    pack_8x9_pixels(src_ptr, src_pitch, signal);

    filter_8x9_pixels(signal, ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}

static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i u0, u1, u2, u3;
  // load 0-6 rows
  const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
  const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
  const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
  const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
  const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));

  u0 = _mm256_permute2x128_si256(s0, s1, 0x20);  // 0, 1 low
  u1 = _mm256_permute2x128_si256(s0, s1, 0x31);  // 0, 1 high

  u2 = _mm256_permute2x128_si256(s1, s2, 0x20);  // 1, 2 low
  u3 = _mm256_permute2x128_si256(s1, s2, 0x31);  // 1, 2 high

  sig[0] = _mm256_unpacklo_epi16(u0, u2);
  sig[4] = _mm256_unpackhi_epi16(u0, u2);

  sig[8] = _mm256_unpacklo_epi16(u1, u3);
  sig[12] = _mm256_unpackhi_epi16(u1, u3);

  u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
  u1 = _mm256_permute2x128_si256(s2, s3, 0x31);

  u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
  u3 = _mm256_permute2x128_si256(s3, s4, 0x31);

  sig[1] = _mm256_unpacklo_epi16(u0, u2);
  sig[5] = _mm256_unpackhi_epi16(u0, u2);

  sig[9] = _mm256_unpacklo_epi16(u1, u3);
  sig[13] = _mm256_unpackhi_epi16(u1, u3);

  u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
  u1 = _mm256_permute2x128_si256(s4, s5, 0x31);

  u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
  u3 = _mm256_permute2x128_si256(s5, s6, 0x31);

  sig[2] = _mm256_unpacklo_epi16(u0, u2);
  sig[6] = _mm256_unpackhi_epi16(u0, u2);

  sig[10] = _mm256_unpacklo_epi16(u1, u3);
  sig[14] = _mm256_unpackhi_epi16(u1, u3);

  sig[16] = s6;
}
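
// [Editor's note, describing the layout established above.] The 16-wide init
// keeps two independent 8-column windows laid out like the 8-wide case:
// sig[0..7] cover columns 0-7 and sig[8..15] columns 8-15, while sig[16]
// carries the newest full row between calls to pack_16x9_pixels().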

static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                             __m256i *sig) {
  // base + 7th row
  const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
  // base + 8th row
  const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));

  __m256i u0, u1, u2, u3;
  u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
  u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);

  u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
  u3 = _mm256_permute2x128_si256(s7, s8, 0x31);

  sig[3] = _mm256_unpacklo_epi16(u0, u2);
  sig[7] = _mm256_unpackhi_epi16(u0, u2);

  sig[11] = _mm256_unpacklo_epi16(u1, u3);
  sig[15] = _mm256_unpackhi_epi16(u1, u3);

  sig[16] = s8;
}

static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
                                      __m256i *y0, __m256i *y1) {
  __m256i res[4];
  int i;
  for (i = 0; i < 4; ++i) {
    filter_8x1_pixels(&sig[i << 2], f, &res[i]);
  }

  {
    const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
    const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
    *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
    *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
  }
}

static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst,
                                     ptrdiff_t pitch) {
  __m256i p = _mm256_min_epi16(*y0, *mask);
  _mm256_storeu_si256((__m256i *)dst, p);
  p = _mm256_min_epi16(*y1, *mask);
  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
}

static void update_16x9_pixels(__m256i *sig) {
  update_pixels(&sig[0]);
  update_pixels(&sig[8]);
}

static void vpx_highbd_filter_block1d16_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[17], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  pack_16x9_init(src_ptr, src_pitch, signal);

  do {
    pack_16x9_pixels(src_ptr, src_pitch, signal);
    filter_16x9_pixels(signal, ff, &res0, &res1);
    store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_16x9_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}

// -----------------------------------------------------------------------------
// 2-tap vertical filtering

static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
  sig[2] = _mm256_loadu_si256((const __m256i *)src);
}

static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
                                       __m256i *sig) {
  // load the next row
  const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
  sig[0] = _mm256_unpacklo_epi16(sig[2], u);
  sig[1] = _mm256_unpackhi_epi16(sig[2], u);
  sig[2] = u;
}

static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
                                         __m256i *y0, __m256i *y1) {
  filter_16_2t_pixels(sig, f, y0, y1);
}

static void vpx_highbd_filter_block1d16_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[3], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff;

  pack_2t_filter(filter, &ff);
  pack_16x2_init(src_ptr, signal);

  do {
    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}

static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m128i p = _mm_set1_epi32(0x09080706);
  f[0] = _mm_shuffle_epi8(h, p);
}

static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
  sig[2] = _mm_loadu_si128((const __m128i *)src);
}

static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
                                          __m128i *sig) {
  // load the next row
  const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
  sig[0] = _mm_unpacklo_epi16(sig[2], u);
  sig[1] = _mm_unpackhi_epi16(sig[2], u);
  sig[2] = u;
}

static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
                                      __m128i *y0, __m128i *y1) {
  const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m128i x0 = _mm_madd_epi16(sig[0], *f);
  __m128i x1 = _mm_madd_epi16(sig[1], *f);
  x0 = _mm_add_epi32(x0, rounding);
  x1 = _mm_add_epi32(x1, rounding);
  *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
  *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
}

static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
                                           const __m128i *mask, uint16_t *dst) {
  __m128i res = _mm_packus_epi32(*y0, *y1);
  res = _mm_min_epi16(res, *mask);
  _mm_storeu_si128((__m128i *)dst, res);
}

static void vpx_highbd_filter_block1d8_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m128i signal[3], res0, res1;
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
  __m128i ff;

  pack_8x1_2t_filter(filter, &ff);
  pack_8x2_init(src_ptr, signal);

  do {
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
    filter_8_2t_pixels(signal, &ff, &res0, &res1);
    store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}

// Calculation with averaging the input pixels
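
// [Editor's note.] The _avg variants below run the same filter kernels as
// above but, instead of overwriting dst, combine the clamped filter output
// with the pixels already present via the rounding average
// dst[i] = (dst[i] + res[i] + 1) >> 1, mirroring vpx_highbd_convolve_avg_avx2().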

static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask,
                                        uint16_t *dst) {
  const __m128i a0 = _mm256_castsi256_si128(*y0);
  const __m128i a1 = _mm256_extractf128_si256(*y0, 1);
  __m128i res = _mm_packus_epi32(a0, a1);
  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
  res = _mm_avg_epu16(res, pix);
  _mm_storeu_si128((__m128i *)dst, res);
}

static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1,
                                        const __m256i *mask, uint16_t *dst,
                                        ptrdiff_t pitch) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst);
  const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch));
  const __m256i pix =
      _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
  a = _mm256_min_epi16(a, *mask);
  a = _mm256_avg_epu16(a, pix);
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
}

static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1,
                                         const __m256i *mask, uint16_t *dst) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  const __m256i pix = _mm256_loadu_si256((const __m256i *)dst);
  a = _mm256_min_epi16(a, *mask);
  a = _mm256_avg_epu16(a, pix);
  _mm256_storeu_si256((__m256i *)dst, a);
}

static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1,
                                         const __m256i *mask, uint16_t *dst,
                                         ptrdiff_t pitch) {
  const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst);
  const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch));
  __m256i p = _mm256_min_epi16(*y0, *mask);
  p = _mm256_avg_epu16(p, pix0);
  _mm256_storeu_si256((__m256i *)dst, p);

  p = _mm256_min_epi16(*y1, *mask);
  p = _mm256_avg_epu16(p, pix1);
  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
}

static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0,
                                               const __m128i *y1,
                                               const __m128i *mask,
                                               uint16_t *dst) {
  __m128i res = _mm_packus_epi32(*y0, *y1);
  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
  res = _mm_min_epi16(res, *mask);
  res = _mm_avg_epu16(res, pix);
  _mm_storeu_si128((__m128i *)dst, res);
}

static void vpx_highbd_filter_block1d8_h8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  src_ptr -= 3;
  do {
    pack_8x2_pixels(src_ptr, src_pitch, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  if (height > 0) {
    pack_8x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    store_8x1_avg_pixels(&res0, &max, dst_ptr);
  }
}

static void vpx_highbd_filter_block1d16_h8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  src_ptr -= 3;
  do {
    pack_16x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}

static void vpx_highbd_filter_block1d4_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We extract the middle four elements of the kernel into two registers in
  // the form
  // ... k[3] k[2] k[3] k[2]
  // ... k[5] k[4] k[5] k[4]
  // Then we shuffle the source into
  // ... s[1] s[0] s[0] s[-1]
  // ... s[3] s[2] s[2] s[1]
  // Calling multiply and add gives us half of the sum. Calling add on the two
  // halves gives us the output. Since avx2 allows us to use a 256-bit buffer,
  // we can do this two rows at a time.

  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
  __m256i res_reg;
  __m256i idx_shift_0 =
      _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
                       3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
  __m256i idx_shift_2 =
      _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
                       5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);

  __m128i kernel_reg_128;  // Kernel
  __m256i kernel_reg, kernel_reg_23,
      kernel_reg_45;  // Segments of the kernel used
  const __m256i reg_round =
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
  const ptrdiff_t unrolled_src_stride = src_stride << 1;
  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
  int h;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);

  for (h = height; h >= 2; h -= 2) {
    // Load the source
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);

    // Get the output
    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                   &kernel_reg_23, &kernel_reg_45);

    // Round the result
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);

    // Finally combine to get the final dst
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
    res_reg = _mm256_min_epi16(res_reg, reg_max);
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
                        &res_reg);

    src_ptr += unrolled_src_stride;
    dst_ptr += unrolled_dst_stride;
  }

  // Repeat for the last row if needed
  if (h > 0) {
    // Load the source
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);

    // Get the output
    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                   &kernel_reg_23, &kernel_reg_45);

    // Round the result
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);

    // Finally combine to get the final dst
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
    res_reg = _mm256_min_epi16(res_reg, reg_max);
    _mm_storel_epi64((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg));
  }
}
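
// [Editor's sketch, not part of the original file; note these 4-tap paths
// record no executions in this report.] The 4-tap kernels use only
// kernel[2..5] (the outer taps are assumed zero), starting one sample before
// the output position (src_ptr -= 1 above). Scalar reference under those
// assumptions:
#if 0
static uint16_t highbd_filter4_ref(const uint16_t *src, const int16_t *kernel,
                                   int bd) {
  const int max = (1 << bd) - 1;
  int k, sum = 0;
  // src points at s[-1], one sample before the output position.
  for (k = 0; k < 4; ++k) sum += (int)src[k] * (int)kernel[k + 2];
  sum = (sum + CONV8_ROUNDING_NUM) >> CONV8_ROUNDING_BITS;
  if (sum < 0) sum = 0;
  if (sum > max) sum = max;
  return (uint16_t)sum;
}
#endif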
1013
1014
static void vpx_highbd_filter_block1d8_h4_avx2(
1015
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
1016
0
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
1017
  // We will extract the middle four elements of the kernel into two registers
1018
  // in the form
1019
  // ... k[3] k[2] k[3] k[2]
1020
  // ... k[5] k[4] k[5] k[4]
1021
  // Then we shuffle the source into
1022
  // ... s[1] s[0] s[0] s[-1]
1023
  // ... s[3] s[2] s[2] s[1]
1024
  // Calling multiply and add gives us half of the sum of the first half.
1025
  // Calling add gives us first half of the output. Repat again to get the whole
1026
  // output. Since avx2 allows us to use 256-bit buffer, we can do this two rows
1027
  // at a time.
1028
1029
0
  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
1030
0
  __m256i res_reg, res_first, res_last;
1031
0
  __m256i idx_shift_0 =
1032
0
      _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
1033
0
                       3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
1034
0
  __m256i idx_shift_2 =
1035
0
      _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
1036
0
                       5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
1037
1038
0
  __m128i kernel_reg_128;  // Kernel
1039
0
  __m256i kernel_reg, kernel_reg_23,
1040
0
      kernel_reg_45;  // Segments of the kernel used
1041
0
  const __m256i reg_round =
1042
0
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
1043
0
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
1044
0
  const ptrdiff_t unrolled_src_stride = src_stride << 1;
1045
0
  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
1046
0
  int h;
1047
1048
  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
1049
0
  src_ptr -= 1;
1050
1051
  // Load Kernel
1052
0
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
1053
0
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
1054
0
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
1055
0
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
1056
1057
0
  for (h = height; h >= 2; h -= 2) {
1058
    // Load the source
1059
0
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
1060
0
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
1061
0
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
1062
1063
    // Result for first half
1064
0
    res_first = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
1065
0
                                     &kernel_reg_23, &kernel_reg_45);
1066
1067
    // Do again to get the second half of dst
1068
    // Load the source
1069
0
    src_reg = mm256_loadu2_si128(src_ptr + 4, src_ptr + src_stride + 4);
1070
0
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
1071
0
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
1072
1073
    // Result for second half
1074
0
    res_last = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
1075
0
                                    &kernel_reg_23, &kernel_reg_45);
1076
1077
    // Round each result
1078
0
    res_first = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS);
1079
0
    res_last = mm256_round_epi32(&res_last, &reg_round, CONV8_ROUNDING_BITS);
1080
1081
    // Finally combine to get the final dst
1082
0
    res_reg = _mm256_packus_epi32(res_first, res_last);
1083
0
    res_reg = _mm256_min_epi16(res_reg, reg_max);
1084
0
    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
1085
0
                       &res_reg);
1086
1087
0
    src_ptr += unrolled_src_stride;
1088
0
    dst_ptr += unrolled_dst_stride;
1089
0
  }
1090
1091
  // Repeat for the last row if needed
1092
0
  if (h > 0) {
1093
0
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
1094
0
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
1095
0
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
1096
1097
0
    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
1098
0
                                   &kernel_reg_23, &kernel_reg_45);
1099
1100
0
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
1101
1102
0
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
1103
0
    res_reg = _mm256_min_epi16(res_reg, reg_max);
1104
1105
0
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg);
1106
0
  }
1107
0
}
1108
1109
static void vpx_highbd_filter_block1d16_h4_avx2(
1110
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
1111
0
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
1112
0
  vpx_highbd_filter_block1d8_h4_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
1113
0
                                     height, kernel, bd);
1114
0
  vpx_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_stride, dst_ptr + 8,
1115
0
                                     dst_stride, height, kernel, bd);
1116
0
}
1117
1118
static void vpx_highbd_filter_block1d8_v8_avg_avx2(
1119
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1120
284k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1121
284k
  __m256i signal[9], res0, res1;
1122
284k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
1123
1124
284k
  __m256i ff[4];
1125
284k
  pack_filters(filter, ff);
1126
1127
284k
  pack_8x9_init(src_ptr, src_pitch, signal);
1128
1129
1.10M
  do {
1130
1.10M
    pack_8x9_pixels(src_ptr, src_pitch, signal);
1131
1132
1.10M
    filter_8x9_pixels(signal, ff, &res0, &res1);
1133
1.10M
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
1134
1.10M
    update_pixels(signal);
1135
1136
1.10M
    src_ptr += src_pitch << 1;
1137
1.10M
    dst_ptr += dst_pitch << 1;
1138
1.10M
    height -= 2;
1139
1.10M
  } while (height > 0);
1140
284k
}
1141
1142
static void vpx_highbd_filter_block1d16_v8_avg_avx2(
1143
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1144
220k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1145
220k
  __m256i signal[17], res0, res1;
1146
220k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
1147
1148
220k
  __m256i ff[4];
1149
220k
  pack_filters(filter, ff);
1150
1151
220k
  pack_16x9_init(src_ptr, src_pitch, signal);
1152
1153
3.46M
  do {
1154
3.46M
    pack_16x9_pixels(src_ptr, src_pitch, signal);
1155
3.46M
    filter_16x9_pixels(signal, ff, &res0, &res1);
1156
3.46M
    store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
1157
3.46M
    update_16x9_pixels(signal);
1158
1159
3.46M
    src_ptr += src_pitch << 1;
1160
3.46M
    dst_ptr += dst_pitch << 1;
1161
3.46M
    height -= 2;
1162
3.46M
  } while (height > 0);
1163
220k
}
1164
1165
static void vpx_highbd_filter_block1d8_h2_avg_avx2(
1166
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1167
12.0k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1168
12.0k
  __m256i signal[2], res0, res1;
1169
12.0k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
1170
1171
12.0k
  __m256i ff;
1172
12.0k
  pack_2t_filter(filter, &ff);
1173
1174
12.0k
  src_ptr -= 3;
1175
57.4k
  do {
1176
57.4k
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
1177
57.4k
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
1178
57.4k
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
1179
57.4k
    height -= 2;
1180
57.4k
    src_ptr += src_pitch << 1;
1181
57.4k
    dst_ptr += dst_pitch << 1;
1182
57.4k
  } while (height > 1);
1183
1184
12.0k
  if (height > 0) {
1185
0
    pack_8x1_2t_pixels(src_ptr, signal);
1186
0
    filter_8x1_2t_pixels(signal, &ff, &res0);
1187
0
    store_8x1_avg_pixels(&res0, &max, dst_ptr);
1188
0
  }
1189
12.0k
}
1190
1191
static void vpx_highbd_filter_block1d16_h2_avg_avx2(
1192
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1193
24.6k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1194
24.6k
  __m256i signal[2], res0, res1;
1195
24.6k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
1196
1197
24.6k
  __m256i ff;
1198
24.6k
  pack_2t_filter(filter, &ff);
1199
1200
24.6k
  src_ptr -= 3;
1201
645k
  do {
1202
645k
    pack_16x1_2t_pixels(src_ptr, signal);
1203
645k
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
1204
645k
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
1205
645k
    height -= 1;
1206
645k
    src_ptr += src_pitch;
1207
645k
    dst_ptr += dst_pitch;
1208
645k
  } while (height > 0);
1209
24.6k
}
1210
1211
static void vpx_highbd_filter_block1d16_v2_avg_avx2(
1212
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1213
146k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1214
146k
  __m256i signal[3], res0, res1;
1215
146k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
1216
146k
  __m256i ff;
1217
1218
146k
  pack_2t_filter(filter, &ff);
1219
146k
  pack_16x2_init(src_ptr, signal);
1220
1221
3.68M
  do {
1222
3.68M
    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
1223
3.68M
    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
1224
3.68M
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
1225
1226
3.68M
    src_ptr += src_pitch;
1227
3.68M
    dst_ptr += dst_pitch;
1228
3.68M
    height -= 1;
1229
3.68M
  } while (height > 0);
1230
146k
}
1231
1232
static void vpx_highbd_filter_block1d8_v2_avg_avx2(
1233
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1234
88.7k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1235
88.7k
  __m128i signal[3], res0, res1;
1236
88.7k
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
1237
88.7k
  __m128i ff;
1238
1239
88.7k
  pack_8x1_2t_filter(filter, &ff);
1240
88.7k
  pack_8x2_init(src_ptr, signal);
1241
1242
932k
  do {
1243
932k
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
1244
932k
    filter_8_2t_pixels(signal, &ff, &res0, &res1);
1245
932k
    store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr);
1246
1247
932k
    src_ptr += src_pitch;
1248
932k
    dst_ptr += dst_pitch;
1249
932k
    height -= 1;
1250
932k
  } while (height > 0);
1251
88.7k
}
1252
1253
static void vpx_highbd_filter_block1d4_v4_avx2(
1254
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
1255
0
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
1256
  // We will load two rows of pixels and rearrange them into the form
1257
  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
1258
  // so that we can call multiply and add with the kernel partial output. Then
1259
  // we can call add with another row to get the output.
1260
1261
  // Register for source s[-1:3, :]
1262
0
  __m256i src_reg_1, src_reg_2, src_reg_3;
1263
  // Interleaved rows of the source. lo is first half, hi second
1264
0
  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
1265
0
  __m256i src_reg_m1001, src_reg_1223;
1266
1267
  // Result after multiply and add
1268
0
  __m256i res_reg;
1269
1270
0
  __m128i kernel_reg_128;                            // Kernel
1271
0
  __m256i kernel_reg, kernel_reg_23, kernel_reg_45;  // Segments of kernel used
1272
1273
0
  const __m256i reg_round =
1274
0
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
1275
0
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
1276
0
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
1277
0
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
1278
0
  int h;
1279
1280
  // Load Kernel
1281
0
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
1282
0
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
1283
0
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
1284
0
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
1285
1286
  // Row -1 to row 0
1287
0
  src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr,
1288
0
                                   (const __m128i *)(src_ptr + src_stride));
1289
1290
  // Row 0 to row 1
1291
0
  src_reg_1 = _mm256_castsi128_si256(
1292
0
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2)));
1293
0
  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
1294
1295
  // First three rows
1296
0
  src_reg_m1001 = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
1297
1298
0
  for (h = height; h > 1; h -= 2) {
1299
0
    src_reg_2 = _mm256_castsi128_si256(
1300
0
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));
1301
1302
0
    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
1303
0
                                         _mm256_castsi256_si128(src_reg_2), 1);
1304
1305
0
    src_reg_3 = _mm256_castsi128_si256(
1306
0
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));
1307
1308
0
    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
1309
0
                                         _mm256_castsi256_si128(src_reg_3), 1);
1310
1311
    // Last three rows
1312
0
    src_reg_1223 = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
1313
1314
    // Output
1315
0
    res_reg = mm256_madd_add_epi32(&src_reg_m1001, &src_reg_1223,
1316
0
                                   &kernel_reg_23, &kernel_reg_45);
1317
1318
    // Round the words
1319
0
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
1320
1321
    // Combine to get the result
1322
0
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
1323
0
    res_reg = _mm256_min_epi16(res_reg, reg_max);
1324
1325
    // Save the result
1326
0
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
1327
0
                        &res_reg);
1328
1329
    // Update the source by two rows
1330
0
    src_ptr += src_stride_unrolled;
1331
0
    dst_ptr += dst_stride_unrolled;
1332
1333
0
    src_reg_m1001 = src_reg_1223;
1334
0
    src_reg_1 = src_reg_3;
1335
0
  }
1336
0
}
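
The rearrangement described in the comment at the top of this function is the core of all three v4 kernels: unpacking two adjacent rows puts one pixel from each row in every 32-bit lane, so a single madd against a duplicated tap pair (kernel_reg_23 or kernel_reg_45) produces per-pixel partial sums. A reduced 128-bit illustration with hypothetical names:

#include <immintrin.h>
#include <stdint.h>

// Partial sums of taps (f2, f3) over two adjacent 4-pixel rows. Each 32-bit
// lane of the madd result is r0[i] * f2 + r1[i] * f3.
static __m128i madd_two_rows(const uint16_t *r0, const uint16_t *r1,
                             int16_t f2, int16_t f3) {
  const __m128i row0 = _mm_loadl_epi64((const __m128i *)r0);  // 4 pixels
  const __m128i row1 = _mm_loadl_epi64((const __m128i *)r1);  // 4 pixels
  // Interleave so each 32-bit lane holds (r1 pixel << 16) | r0 pixel.
  const __m128i pairs = _mm_unpacklo_epi16(row0, row1);
  // Duplicate the tap pair into every lane, mirroring the pixel layout.
  const __m128i taps = _mm_set1_epi32(
      (int32_t)(((uint32_t)(uint16_t)f3 << 16) | (uint16_t)f2));
  return _mm_madd_epi16(pairs, taps);
}
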
1337
1338
static void vpx_highbd_filter_block1d8_v4_avx2(
1339
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
1340
0
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
1341
  // We will load two rows of pixels and rearrange them into the form
1342
  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
1343
  // so that one multiply-and-add against a pair of kernel taps yields a
1344
  // partial sum, and adding the partial sums of both row pairs gives the output.
1345
1346
  // Register for source s[-1:3, :]
1347
0
  __m256i src_reg_1, src_reg_2, src_reg_3;
1348
  // Interleaved rows of the source. lo is first half, hi second
1349
0
  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
1350
0
  __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi;
1351
1352
0
  __m128i kernel_reg_128;                            // Kernel
1353
0
  __m256i kernel_reg, kernel_reg_23, kernel_reg_45;  // Segments of kernel
1354
1355
  // Result after multiply and add
1356
0
  __m256i res_reg, res_reg_lo, res_reg_hi;
1357
1358
0
  const __m256i reg_round =
1359
0
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
1360
0
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
1361
0
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
1362
0
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
1363
0
  int h;
1364
1365
  // Load Kernel
1366
0
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
1367
0
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
1368
0
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
1369
0
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
1370
1371
  // Row -1 to row 0
1372
0
  src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
1373
0
                                   (const __m128i *)(src_ptr + src_stride));
1374
1375
  // Row 0 to row 1
1376
0
  src_reg_1 = _mm256_castsi128_si256(
1377
0
      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
1378
0
  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
1379
1380
  // First three rows
1381
0
  src_reg_m1001_lo = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
1382
0
  src_reg_m1001_hi = _mm256_unpackhi_epi16(src_reg_m10, src_reg_01);
1383
1384
0
  for (h = height; h > 1; h -= 2) {
1385
0
    src_reg_2 = _mm256_castsi128_si256(
1386
0
        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)));
1387
1388
0
    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
1389
0
                                         _mm256_castsi256_si128(src_reg_2), 1);
1390
1391
0
    src_reg_3 = _mm256_castsi128_si256(
1392
0
        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)));
1393
1394
0
    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
1395
0
                                         _mm256_castsi256_si128(src_reg_3), 1);
1396
1397
    // Last three rows
1398
0
    src_reg_1223_lo = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
1399
0
    src_reg_1223_hi = _mm256_unpackhi_epi16(src_reg_12, src_reg_23);
1400
1401
    // Output from first half
1402
0
    res_reg_lo = mm256_madd_add_epi32(&src_reg_m1001_lo, &src_reg_1223_lo,
1403
0
                                      &kernel_reg_23, &kernel_reg_45);
1404
1405
    // Output from second half
1406
0
    res_reg_hi = mm256_madd_add_epi32(&src_reg_m1001_hi, &src_reg_1223_hi,
1407
0
                                      &kernel_reg_23, &kernel_reg_45);
1408
1409
    // Round the words
1410
0
    res_reg_lo =
1411
0
        mm256_round_epi32(&res_reg_lo, &reg_round, CONV8_ROUNDING_BITS);
1412
0
    res_reg_hi =
1413
0
        mm256_round_epi32(&res_reg_hi, &reg_round, CONV8_ROUNDING_BITS);
1414
1415
    // Combine to get the result
1416
0
    res_reg = _mm256_packus_epi32(res_reg_lo, res_reg_hi);
1417
0
    res_reg = _mm256_min_epi16(res_reg, reg_max);
1418
1419
    // Save the result
1420
0
    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
1421
0
                       &res_reg);
1422
1423
    // Update the source by two rows
1424
0
    src_ptr += src_stride_unrolled;
1425
0
    dst_ptr += dst_stride_unrolled;
1426
1427
0
    src_reg_m1001_lo = src_reg_1223_lo;
1428
0
    src_reg_m1001_hi = src_reg_1223_hi;
1429
0
    src_reg_1 = src_reg_3;
1430
0
  }
1431
0
}
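
The lo/hi pair exists because _mm256_madd_epi16 widens its products to 32-bit sums; both halves are rounded in 32-bit lanes and only then re-narrowed by _mm256_packus_epi32. A sketch of what the rounding helper is expected to compute, assuming CONV8_ROUNDING_BITS is the usual filter precision of 7 (so the bias CONV8_ROUNDING_NUM is 64):

#include <immintrin.h>

// Round each 32-bit partial sum to pixel precision: (x + 64) >> 7 per lane.
static __m256i round_epi32_sketch(__m256i x) {
  const __m256i bias = _mm256_set1_epi32(1 << (7 - 1));
  return _mm256_srai_epi32(_mm256_add_epi32(x, bias), 7);
}
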
1432
1433
static void vpx_highbd_filter_block1d16_v4_avx2(
1434
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
1435
0
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
1436
0
  vpx_highbd_filter_block1d8_v4_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
1437
0
                                     height, kernel, bd);
1438
0
  vpx_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_stride, dst_ptr + 8,
1439
0
                                     dst_stride, height, kernel, bd);
1440
0
}
1441
1442
// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
1443
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
1444
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;
1445
1446
// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
1447
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
1448
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
1449
1450
1.37M
#define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2
1451
394k
#define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2
1452
1.09M
#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2
1453
304k
#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2
1454
1455
// Use the [vh]8 version because there is no [vh]4 implementation.
1456
#define vpx_highbd_filter_block1d16_v4_avg_avx2 \
1457
0
  vpx_highbd_filter_block1d16_v8_avg_avx2
1458
#define vpx_highbd_filter_block1d16_h4_avg_avx2 \
1459
0
  vpx_highbd_filter_block1d16_h8_avg_avx2
1460
#define vpx_highbd_filter_block1d8_v4_avg_avx2 \
1461
0
  vpx_highbd_filter_block1d8_v8_avg_avx2
1462
#define vpx_highbd_filter_block1d8_h4_avg_avx2 \
1463
0
  vpx_highbd_filter_block1d8_h8_avg_avx2
1464
#define vpx_highbd_filter_block1d4_v4_avg_avx2 \
1465
0
  vpx_highbd_filter_block1d4_v8_avg_avx2
1466
#define vpx_highbd_filter_block1d4_h4_avg_avx2 \
1467
0
  vpx_highbd_filter_block1d4_h8_avg_avx2
1468
1469
HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0)
1470
HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
1471
                 src - src_stride * (num_taps / 2 - 1), , avx2, 0)
1472
2.23M
HIGH_FUN_CONV_2D(, avx2, 0)
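
These invocations stamp out the public vpx_highbd_convolve8_{horiz,vert}_avx2 and vpx_highbd_convolve8_avx2 entry points from the block1d kernels above; the 2D form chains the two 1D passes through a scratch buffer tall enough for the 8-tap vertical support. A hedged sketch of the flow the macro generates (the real expansion lives in vpx_dsp/x86/convolve.h; the pitch, margins, and function name here are illustrative):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_filter.h"

// Illustrative shape of the function HIGH_FUN_CONV_2D(, avx2, 0) emits.
static void highbd_convolve8_2d_sketch(
    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4,
    int y0_q4, int y_step_q4, int w, int h, int bd) {
  // An 8-tap vertical pass needs 3 rows above and 4 below each output row,
  // so the horizontal pass filters h + 7 rows into a fixed-pitch buffer.
  uint16_t temp[64 * (64 + 7)];
  assert(w <= 64 && h <= 64);
  vpx_highbd_convolve8_horiz_avx2(src - src_stride * 3, src_stride, temp, 64,
                                  filter, x0_q4, x_step_q4, y0_q4, y_step_q4,
                                  w, h + 7, bd);
  // The vertical pass then reads the scratch buffer, offset to output row 0.
  vpx_highbd_convolve8_vert_avx2(temp + 64 * 3, 64, dst, dst_stride, filter,
                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
}
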
1473
1474
// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
1475
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
1476
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
1477
1478
// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
1479
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
1480
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
1481
1482
#define vpx_highbd_filter_block1d4_h8_avg_avx2 \
1483
63.4k
  vpx_highbd_filter_block1d4_h8_avg_sse2
1484
#define vpx_highbd_filter_block1d4_h2_avg_avx2 \
1485
19.7k
  vpx_highbd_filter_block1d4_h2_avg_sse2
1486
#define vpx_highbd_filter_block1d4_v8_avg_avx2 \
1487
446k
  vpx_highbd_filter_block1d4_v8_avg_sse2
1488
#define vpx_highbd_filter_block1d4_v2_avg_avx2 \
1489
131k
  vpx_highbd_filter_block1d4_v2_avg_sse2
1490
1491
HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1)
1492
HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
1493
                 src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1)
1494
HIGH_FUN_CONV_2D(avg_, avx2, 1)
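
The avg_ entry points differ from the plain ones only at the store: dst is read as the first prediction and the filtered result is round-averaged into it, which is how compound prediction is blended. A scalar model of that final step (a sketch of the behavior, not the SIMD body of store_16x1_avg_pixels):

#include <stdint.h>

// Clamp the filtered value to bd bits, then round-average with the pixel
// already in dst: ROUND_POWER_OF_TWO(dst + res, 1).
static uint16_t avg_store_sketch(uint16_t dst_pixel, int filtered, int bd) {
  const int max = (1 << bd) - 1;
  if (filtered < 0) filtered = 0;
  if (filtered > max) filtered = max;
  return (uint16_t)((dst_pixel + filtered + 1) >> 1);
}
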
1495