Coverage Report

Created: 2026-04-29 06:43

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
Line
Count
Source
1
/*
2
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5
 *  that can be found in the LICENSE file in the root of the source
6
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS.  All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
9
 */
10
11
#include <immintrin.h>
12
#include "./vpx_dsp_rtcd.h"
13
#include "vpx_dsp/x86/convolve.h"
14
#include "vpx_dsp/x86/convolve_avx2.h"
15
16
// -----------------------------------------------------------------------------
17
// Copy and average
18
19
void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
20
                                   uint16_t *dst, ptrdiff_t dst_stride,
21
                                   const InterpKernel *filter, int x0_q4,
22
                                   int x_step_q4, int y0_q4, int y_step_q4,
23
1.61M
                                   int w, int h, int bd) {
24
1.61M
  (void)filter;
25
1.61M
  (void)x0_q4;
26
1.61M
  (void)x_step_q4;
27
1.61M
  (void)y0_q4;
28
1.61M
  (void)y_step_q4;
29
1.61M
  (void)bd;
30
31
1.61M
  assert(w % 4 == 0);
32
1.61M
  if (w > 32) {  // w = 64
33
35.7M
    do {
34
35.7M
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
35
35.7M
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
36
35.7M
      const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
37
35.7M
      const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
38
35.7M
      src += src_stride;
39
35.7M
      _mm256_storeu_si256((__m256i *)dst, p0);
40
35.7M
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
41
35.7M
      _mm256_storeu_si256((__m256i *)(dst + 32), p2);
42
35.7M
      _mm256_storeu_si256((__m256i *)(dst + 48), p3);
43
35.7M
      dst += dst_stride;
44
35.7M
      h--;
45
35.7M
    } while (h > 0);
46
851k
  } else if (w > 16) {  // w = 32
47
6.31M
    do {
48
6.31M
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
49
6.31M
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
50
6.31M
      src += src_stride;
51
6.31M
      _mm256_storeu_si256((__m256i *)dst, p0);
52
6.31M
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
53
6.31M
      dst += dst_stride;
54
6.31M
      h--;
55
6.31M
    } while (h > 0);
56
655k
  } else if (w > 8) {  // w = 16
57
143k
    __m256i p0, p1;
58
1.13M
    do {
59
1.13M
      p0 = _mm256_loadu_si256((const __m256i *)src);
60
1.13M
      src += src_stride;
61
1.13M
      p1 = _mm256_loadu_si256((const __m256i *)src);
62
1.13M
      src += src_stride;
63
64
1.13M
      _mm256_storeu_si256((__m256i *)dst, p0);
65
1.13M
      dst += dst_stride;
66
1.13M
      _mm256_storeu_si256((__m256i *)dst, p1);
67
1.13M
      dst += dst_stride;
68
1.13M
      h -= 2;
69
1.13M
    } while (h > 0);
70
512k
  } else if (w > 4) {  // w = 8
71
206k
    __m128i p0, p1;
72
869k
    do {
73
869k
      p0 = _mm_loadu_si128((const __m128i *)src);
74
869k
      src += src_stride;
75
869k
      p1 = _mm_loadu_si128((const __m128i *)src);
76
869k
      src += src_stride;
77
78
869k
      _mm_storeu_si128((__m128i *)dst, p0);
79
869k
      dst += dst_stride;
80
869k
      _mm_storeu_si128((__m128i *)dst, p1);
81
869k
      dst += dst_stride;
82
869k
      h -= 2;
83
869k
    } while (h > 0);
84
305k
  } else {  // w = 4
85
305k
    __m128i p0, p1;
86
672k
    do {
87
672k
      p0 = _mm_loadl_epi64((const __m128i *)src);
88
672k
      src += src_stride;
89
672k
      p1 = _mm_loadl_epi64((const __m128i *)src);
90
672k
      src += src_stride;
91
92
672k
      _mm_storel_epi64((__m128i *)dst, p0);
93
672k
      dst += dst_stride;
94
672k
      _mm_storel_epi64((__m128i *)dst, p1);
95
672k
      dst += dst_stride;
96
672k
      h -= 2;
97
672k
    } while (h > 0);
98
305k
  }
99
1.61M
}
100
101
void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
102
                                  uint16_t *dst, ptrdiff_t dst_stride,
103
                                  const InterpKernel *filter, int x0_q4,
104
                                  int x_step_q4, int y0_q4, int y_step_q4,
105
286k
                                  int w, int h, int bd) {
106
286k
  (void)filter;
107
286k
  (void)x0_q4;
108
286k
  (void)x_step_q4;
109
286k
  (void)y0_q4;
110
286k
  (void)y_step_q4;
111
286k
  (void)bd;
112
113
286k
  assert(w % 4 == 0);
114
286k
  if (w > 32) {  // w = 64
115
12.8k
    __m256i p0, p1, p2, p3, u0, u1, u2, u3;
116
767k
    do {
117
767k
      p0 = _mm256_loadu_si256((const __m256i *)src);
118
767k
      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
119
767k
      p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
120
767k
      p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
121
767k
      src += src_stride;
122
767k
      u0 = _mm256_loadu_si256((const __m256i *)dst);
123
767k
      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
124
767k
      u2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
125
767k
      u3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
126
767k
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
127
767k
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
128
767k
      _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2));
129
767k
      _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3));
130
767k
      dst += dst_stride;
131
767k
      h--;
132
767k
    } while (h > 0);
133
273k
  } else if (w > 16) {  // w = 32
134
29.9k
    __m256i p0, p1, u0, u1;
135
921k
    do {
136
921k
      p0 = _mm256_loadu_si256((const __m256i *)src);
137
921k
      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
138
921k
      src += src_stride;
139
921k
      u0 = _mm256_loadu_si256((const __m256i *)dst);
140
921k
      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
141
921k
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
142
921k
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
143
921k
      dst += dst_stride;
144
921k
      h--;
145
921k
    } while (h > 0);
146
243k
  } else if (w > 8) {  // w = 16
147
45.4k
    __m256i p0, p1, u0, u1;
148
352k
    do {
149
352k
      p0 = _mm256_loadu_si256((const __m256i *)src);
150
352k
      p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride));
151
352k
      src += src_stride << 1;
152
352k
      u0 = _mm256_loadu_si256((const __m256i *)dst);
153
352k
      u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride));
154
155
352k
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
156
352k
      _mm256_storeu_si256((__m256i *)(dst + dst_stride),
157
352k
                          _mm256_avg_epu16(p1, u1));
158
352k
      dst += dst_stride << 1;
159
352k
      h -= 2;
160
352k
    } while (h > 0);
161
198k
  } else if (w > 4) {  // w = 8
162
76.1k
    __m128i p0, p1, u0, u1;
163
315k
    do {
164
315k
      p0 = _mm_loadu_si128((const __m128i *)src);
165
315k
      p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
166
315k
      src += src_stride << 1;
167
315k
      u0 = _mm_loadu_si128((const __m128i *)dst);
168
315k
      u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));
169
170
315k
      _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
171
315k
      _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
172
315k
      dst += dst_stride << 1;
173
315k
      h -= 2;
174
315k
    } while (h > 0);
175
122k
  } else {  // w = 4
176
122k
    __m128i p0, p1, u0, u1;
177
259k
    do {
178
259k
      p0 = _mm_loadl_epi64((const __m128i *)src);
179
259k
      p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
180
259k
      src += src_stride << 1;
181
259k
      u0 = _mm_loadl_epi64((const __m128i *)dst);
182
259k
      u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));
183
184
259k
      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0));
185
259k
      _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1));
186
259k
      dst += dst_stride << 1;
187
259k
      h -= 2;
188
259k
    } while (h > 0);
189
122k
  }
190
286k
}
191
192
#if HAVE_X86_ASM
193
// -----------------------------------------------------------------------------
194
// Horizontal and vertical filtering
195
196
static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
197
                                              7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
198
                                              4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
199
200
static const uint8_t signal_pattern_1[32] = { 4, 5, 6,  7,  6,  7,  8,  9,
201
                                              8, 9, 10, 11, 10, 11, 12, 13,
202
                                              4, 5, 6,  7,  6,  7,  8,  9,
203
                                              8, 9, 10, 11, 10, 11, 12, 13 };
204
205
static const uint8_t signal_pattern_2[32] = { 6,  7,  8,  9,  8,  9,  10, 11,
206
                                              10, 11, 12, 13, 12, 13, 14, 15,
207
                                              6,  7,  8,  9,  8,  9,  10, 11,
208
                                              10, 11, 12, 13, 12, 13, 14, 15 };
209
210
static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };
211
212
195M
#define CONV8_ROUNDING_BITS (7)
213
0
#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1))
214
215
// -----------------------------------------------------------------------------
216
// Horizontal Filtering
217
218
43.2M
static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
219
43.2M
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
220
43.2M
  const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
221
43.2M
  const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
222
43.2M
  const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);
223
224
43.2M
  p[0] = _mm256_shuffle_epi8(*s, sf0);  // x0x6
225
43.2M
  p[1] = _mm256_shuffle_epi8(*s, sf1);  // x1x7
226
43.2M
  p[2] = _mm256_shuffle_epi8(c, sf0);   // x2x4
227
43.2M
  p[3] = _mm256_shuffle_epi8(c, sf1);   // x3x5
228
43.2M
}
229
230
// Note:
231
//  Shared by 8x2 and 16x1 block
232
static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
233
21.4M
                                  __m256i *x /*x[8]*/) {
234
21.4M
  __m256i pp[8];
235
21.4M
  pack_pixels(s0, pp);
236
21.4M
  pack_pixels(s1, &pp[4]);
237
21.4M
  x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
238
21.4M
  x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
239
21.4M
  x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
240
21.4M
  x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
241
21.4M
  x[4] = x[2];
242
21.4M
  x[5] = x[3];
243
21.4M
  x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
244
21.4M
  x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
245
21.4M
}
246
247
375k
static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
248
375k
  __m256i pp[8];
249
375k
  __m256i s0;
250
375k
  s0 = _mm256_loadu_si256((const __m256i *)src);
251
375k
  pack_pixels(&s0, pp);
252
375k
  x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
253
375k
  x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
254
375k
  x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
255
375k
  x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
256
375k
}
257
258
static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
259
3.00M
                                   __m256i *x) {
260
3.00M
  __m256i s0, s1;
261
3.00M
  s0 = _mm256_loadu_si256((const __m256i *)src);
262
3.00M
  s1 = _mm256_loadu_si256((const __m256i *)(src + stride));
263
3.00M
  pack_16_pixels(&s0, &s1, x);
264
3.00M
}
265
266
18.8M
static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
267
18.8M
  __m256i s0, s1;
268
18.8M
  s0 = _mm256_loadu_si256((const __m256i *)src);
269
18.8M
  s1 = _mm256_loadu_si256((const __m256i *)(src + 8));
270
18.8M
  pack_16_pixels(&s0, &s1, x);
271
18.8M
}
272
273
// Note:
274
//  Shared by horizontal and vertical filtering
275
2.11M
static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
276
2.11M
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
277
2.11M
  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
278
2.11M
  const __m256i p0 = _mm256_set1_epi32(0x03020100);
279
2.11M
  const __m256i p1 = _mm256_set1_epi32(0x07060504);
280
2.11M
  const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
281
2.11M
  const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
282
2.11M
  f[0] = _mm256_shuffle_epi8(hh, p0);
283
2.11M
  f[1] = _mm256_shuffle_epi8(hh, p1);
284
2.11M
  f[2] = _mm256_shuffle_epi8(hh, p2);
285
2.11M
  f[3] = _mm256_shuffle_epi8(hh, p3);
286
2.11M
}
287
288
static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
289
                                     const __m256i *fil /*fil[4]*/,
290
82.6M
                                     __m256i *y) {
291
82.6M
  __m256i a, a0, a1;
292
293
82.6M
  a0 = _mm256_madd_epi16(fil[0], sig[0]);
294
82.6M
  a1 = _mm256_madd_epi16(fil[3], sig[3]);
295
82.6M
  a = _mm256_add_epi32(a0, a1);
296
297
82.6M
  a0 = _mm256_madd_epi16(fil[1], sig[1]);
298
82.6M
  a1 = _mm256_madd_epi16(fil[2], sig[2]);
299
300
82.6M
  {
301
82.6M
    const __m256i min = _mm256_min_epi32(a0, a1);
302
82.6M
    a = _mm256_add_epi32(a, min);
303
82.6M
  }
304
82.6M
  {
305
82.6M
    const __m256i max = _mm256_max_epi32(a0, a1);
306
82.6M
    a = _mm256_add_epi32(a, max);
307
82.6M
  }
308
82.6M
  {
309
82.6M
    const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
310
82.6M
    a = _mm256_add_epi32(a, rounding);
311
82.6M
    *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
312
82.6M
  }
313
82.6M
}
314
315
static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,
316
424k
                                    uint16_t *dst) {
317
424k
  const __m128i a0 = _mm256_castsi256_si128(*y);
318
424k
  const __m128i a1 = _mm256_extractf128_si256(*y, 1);
319
424k
  __m128i res = _mm_packus_epi32(a0, a1);
320
424k
  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
321
424k
  _mm_storeu_si128((__m128i *)dst, res);
322
424k
}
323
324
static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
325
                                    const __m256i *mask, uint16_t *dst,
326
4.69M
                                    ptrdiff_t pitch) {
327
4.69M
  __m256i a = _mm256_packus_epi32(*y0, *y1);
328
4.69M
  a = _mm256_min_epi16(a, *mask);
329
4.69M
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
330
4.69M
  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
331
4.69M
}
332
333
static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
334
24.8M
                                     const __m256i *mask, uint16_t *dst) {
335
24.8M
  __m256i a = _mm256_packus_epi32(*y0, *y1);
336
24.8M
  a = _mm256_min_epi16(a, *mask);
337
24.8M
  _mm256_storeu_si256((__m256i *)dst, a);
338
24.8M
}
339
340
static void vpx_highbd_filter_block1d8_h8_avx2(
341
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
342
442k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
343
442k
  __m256i signal[8], res0, res1;
344
442k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
345
346
442k
  __m256i ff[4];
347
442k
  pack_filters(filter, ff);
348
349
442k
  src_ptr -= 3;
350
2.91M
  do {
351
2.91M
    pack_8x2_pixels(src_ptr, src_pitch, signal);
352
2.91M
    filter_8x1_pixels(signal, ff, &res0);
353
2.91M
    filter_8x1_pixels(&signal[4], ff, &res1);
354
2.91M
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
355
2.91M
    height -= 2;
356
2.91M
    src_ptr += src_pitch << 1;
357
2.91M
    dst_ptr += dst_pitch << 1;
358
2.91M
  } while (height > 1);
359
360
442k
  if (height > 0) {
361
377k
    pack_8x1_pixels(src_ptr, signal);
362
377k
    filter_8x1_pixels(signal, ff, &res0);
363
377k
    store_8x1_pixels(&res0, &max, dst_ptr);
364
377k
  }
365
442k
}
366
367
static void vpx_highbd_filter_block1d16_h8_avx2(
368
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
369
571k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
370
571k
  __m256i signal[8], res0, res1;
371
571k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
372
373
571k
  __m256i ff[4];
374
571k
  pack_filters(filter, ff);
375
376
571k
  src_ptr -= 3;
377
18.6M
  do {
378
18.6M
    pack_16x1_pixels(src_ptr, signal);
379
18.6M
    filter_8x1_pixels(signal, ff, &res0);
380
18.6M
    filter_8x1_pixels(&signal[4], ff, &res1);
381
18.6M
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
382
18.6M
    height -= 1;
383
18.6M
    src_ptr += src_pitch;
384
18.6M
    dst_ptr += dst_pitch;
385
18.6M
  } while (height > 0);
386
571k
}
387
388
// -----------------------------------------------------------------------------
389
// 2-tap horizontal filtering
390
391
325k
static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
392
325k
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
393
325k
  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
394
325k
  const __m256i p = _mm256_set1_epi32(0x09080706);
395
325k
  f[0] = _mm256_shuffle_epi8(hh, p);
396
325k
}
397
398
// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels()
399
// the difference is s0/s1 specifies first and second rows or,
400
// first 16 samples and 8-sample shifted 16 samples
401
static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
402
5.08M
                                     __m256i *sig) {
403
5.08M
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
404
5.08M
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
405
5.08M
  __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
406
5.08M
  __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
407
5.08M
  __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
408
5.08M
  __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
409
5.08M
  r0 = _mm256_shuffle_epi8(r0, sf2);
410
5.08M
  r1 = _mm256_shuffle_epi8(r1, sf2);
411
5.08M
  sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
412
5.08M
  sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
413
5.08M
}
414
415
static INLINE void pack_8x2_2t_pixels(const uint16_t *src,
416
279k
                                      const ptrdiff_t pitch, __m256i *sig) {
417
279k
  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
418
279k
  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
419
279k
  pack_16_2t_pixels(&r0, &r1, sig);
420
279k
}
421
422
static INLINE void pack_16x1_2t_pixels(const uint16_t *src,
423
4.81M
                                       __m256i *sig /*sig[2]*/) {
424
4.81M
  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
425
4.81M
  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
426
4.81M
  pack_16_2t_pixels(&r0, &r1, sig);
427
4.81M
}
428
429
static INLINE void pack_8x1_2t_pixels(const uint16_t *src,
430
49.1k
                                      __m256i *sig /*sig[2]*/) {
431
49.1k
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
432
49.1k
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
433
49.1k
  __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
434
49.1k
  __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
435
49.1k
  r0 = _mm256_permutevar8x32_epi32(r0, idx);
436
49.1k
  r0 = _mm256_shuffle_epi8(r0, sf2);
437
49.1k
  sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
438
49.1k
}
439
440
// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels()
441
static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
442
9.40M
                                       __m256i *y0, __m256i *y1) {
443
9.40M
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
444
9.40M
  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
445
9.40M
  __m256i x1 = _mm256_madd_epi16(sig[1], *f);
446
9.40M
  x0 = _mm256_add_epi32(x0, rounding);
447
9.40M
  x1 = _mm256_add_epi32(x1, rounding);
448
9.40M
  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
449
9.40M
  *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
450
9.40M
}
451
452
static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
453
49.1k
                                        __m256i *y0) {
454
49.1k
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
455
49.1k
  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
456
49.1k
  x0 = _mm256_add_epi32(x0, rounding);
457
49.1k
  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
458
49.1k
}
459
460
static void vpx_highbd_filter_block1d8_h2_avx2(
461
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
462
59.6k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
463
59.6k
  __m256i signal[2], res0, res1;
464
59.6k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
465
466
59.6k
  __m256i ff;
467
59.6k
  pack_2t_filter(filter, &ff);
468
469
59.6k
  src_ptr -= 3;
470
254k
  do {
471
254k
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
472
254k
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
473
254k
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
474
254k
    height -= 2;
475
254k
    src_ptr += src_pitch << 1;
476
254k
    dst_ptr += dst_pitch << 1;
477
254k
  } while (height > 1);
478
479
59.6k
  if (height > 0) {
480
49.1k
    pack_8x1_2t_pixels(src_ptr, signal);
481
49.1k
    filter_8x1_2t_pixels(signal, &ff, &res0);
482
49.1k
    store_8x1_pixels(&res0, &max, dst_ptr);
483
49.1k
  }
484
59.6k
}
485
486
static void vpx_highbd_filter_block1d16_h2_avx2(
487
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
488
117k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
489
117k
  __m256i signal[2], res0, res1;
490
117k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
491
492
117k
  __m256i ff;
493
117k
  pack_2t_filter(filter, &ff);
494
495
117k
  src_ptr -= 3;
496
4.12M
  do {
497
4.12M
    pack_16x1_2t_pixels(src_ptr, signal);
498
4.12M
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
499
4.12M
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
500
4.12M
    height -= 1;
501
4.12M
    src_ptr += src_pitch;
502
4.12M
    dst_ptr += dst_pitch;
503
4.12M
  } while (height > 0);
504
117k
}
505
506
// -----------------------------------------------------------------------------
507
// Vertical Filtering
508
509
547k
static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
510
547k
  __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
511
547k
  __m256i s1 =
512
547k
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
513
547k
  __m256i s2 = _mm256_castsi128_si256(
514
547k
      _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
515
547k
  __m256i s3 = _mm256_castsi128_si256(
516
547k
      _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
517
547k
  __m256i s4 = _mm256_castsi128_si256(
518
547k
      _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
519
547k
  __m256i s5 = _mm256_castsi128_si256(
520
547k
      _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
521
547k
  __m256i s6 = _mm256_castsi128_si256(
522
547k
      _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));
523
524
547k
  s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
525
547k
  s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
526
547k
  s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
527
547k
  s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
528
547k
  s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
529
547k
  s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);
530
531
547k
  sig[0] = _mm256_unpacklo_epi16(s0, s1);
532
547k
  sig[4] = _mm256_unpackhi_epi16(s0, s1);
533
547k
  sig[1] = _mm256_unpacklo_epi16(s2, s3);
534
547k
  sig[5] = _mm256_unpackhi_epi16(s2, s3);
535
547k
  sig[2] = _mm256_unpacklo_epi16(s4, s5);
536
547k
  sig[6] = _mm256_unpackhi_epi16(s4, s5);
537
547k
  sig[8] = s6;
538
547k
}
539
540
static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
541
2.22M
                                   __m256i *sig) {
542
  // base + 7th row
543
2.22M
  __m256i s0 = _mm256_castsi128_si256(
544
2.22M
      _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
545
  // base + 8th row
546
2.22M
  __m256i s1 = _mm256_castsi128_si256(
547
2.22M
      _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
548
2.22M
  __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
549
2.22M
  __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
550
2.22M
  sig[3] = _mm256_unpacklo_epi16(s2, s3);
551
2.22M
  sig[7] = _mm256_unpackhi_epi16(s2, s3);
552
2.22M
  sig[8] = s1;
553
2.22M
}
554
555
static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
556
2.22M
                                     __m256i *y0, __m256i *y1) {
557
2.22M
  filter_8x1_pixels(sig, f, y0);
558
2.22M
  filter_8x1_pixels(&sig[4], f, y1);
559
2.22M
}
560
561
22.3M
static INLINE void update_pixels(__m256i *sig) {
562
22.3M
  int i;
563
89.4M
  for (i = 0; i < 3; ++i) {
564
67.1M
    sig[i] = sig[i + 1];
565
67.1M
    sig[i + 4] = sig[i + 5];
566
67.1M
  }
567
22.3M
}
568
569
static void vpx_highbd_filter_block1d8_v8_avx2(
570
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
571
384k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
572
384k
  __m256i signal[9], res0, res1;
573
384k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
574
575
384k
  __m256i ff[4];
576
384k
  pack_filters(filter, ff);
577
578
384k
  pack_8x9_init(src_ptr, src_pitch, signal);
579
580
1.56M
  do {
581
1.56M
    pack_8x9_pixels(src_ptr, src_pitch, signal);
582
583
1.56M
    filter_8x9_pixels(signal, ff, &res0, &res1);
584
1.56M
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
585
1.56M
    update_pixels(signal);
586
587
1.56M
    src_ptr += src_pitch << 1;
588
1.56M
    dst_ptr += dst_pitch << 1;
589
1.56M
    height -= 2;
590
1.56M
  } while (height > 0);
591
384k
}
592
593
685k
static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
594
685k
  __m256i u0, u1, u2, u3;
595
  // load 0-6 rows
596
685k
  const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
597
685k
  const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
598
685k
  const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
599
685k
  const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
600
685k
  const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
601
685k
  const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
602
685k
  const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));
603
604
685k
  u0 = _mm256_permute2x128_si256(s0, s1, 0x20);  // 0, 1 low
605
685k
  u1 = _mm256_permute2x128_si256(s0, s1, 0x31);  // 0, 1 high
606
607
685k
  u2 = _mm256_permute2x128_si256(s1, s2, 0x20);  // 1, 2 low
608
685k
  u3 = _mm256_permute2x128_si256(s1, s2, 0x31);  // 1, 2 high
609
610
685k
  sig[0] = _mm256_unpacklo_epi16(u0, u2);
611
685k
  sig[4] = _mm256_unpackhi_epi16(u0, u2);
612
613
685k
  sig[8] = _mm256_unpacklo_epi16(u1, u3);
614
685k
  sig[12] = _mm256_unpackhi_epi16(u1, u3);
615
616
685k
  u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
617
685k
  u1 = _mm256_permute2x128_si256(s2, s3, 0x31);
618
619
685k
  u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
620
685k
  u3 = _mm256_permute2x128_si256(s3, s4, 0x31);
621
622
685k
  sig[1] = _mm256_unpacklo_epi16(u0, u2);
623
685k
  sig[5] = _mm256_unpackhi_epi16(u0, u2);
624
625
685k
  sig[9] = _mm256_unpacklo_epi16(u1, u3);
626
685k
  sig[13] = _mm256_unpackhi_epi16(u1, u3);
627
628
685k
  u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
629
685k
  u1 = _mm256_permute2x128_si256(s4, s5, 0x31);
630
631
685k
  u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
632
685k
  u3 = _mm256_permute2x128_si256(s5, s6, 0x31);
633
634
685k
  sig[2] = _mm256_unpacklo_epi16(u0, u2);
635
685k
  sig[6] = _mm256_unpackhi_epi16(u0, u2);
636
637
685k
  sig[10] = _mm256_unpacklo_epi16(u1, u3);
638
685k
  sig[14] = _mm256_unpackhi_epi16(u1, u3);
639
640
685k
  sig[16] = s6;
641
685k
}
642
643
static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
644
9.96M
                             __m256i *sig) {
645
  // base + 7th row
646
9.96M
  const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
647
  // base + 8th row
648
9.96M
  const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));
649
650
9.96M
  __m256i u0, u1, u2, u3;
651
9.96M
  u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
652
9.96M
  u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);
653
654
9.96M
  u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
655
9.96M
  u3 = _mm256_permute2x128_si256(s7, s8, 0x31);
656
657
9.96M
  sig[3] = _mm256_unpacklo_epi16(u0, u2);
658
9.96M
  sig[7] = _mm256_unpackhi_epi16(u0, u2);
659
660
9.96M
  sig[11] = _mm256_unpacklo_epi16(u1, u3);
661
9.96M
  sig[15] = _mm256_unpackhi_epi16(u1, u3);
662
663
9.96M
  sig[16] = s8;
664
9.96M
}
665
666
static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
667
10.1M
                                      __m256i *y0, __m256i *y1) {
668
10.1M
  __m256i res[4];
669
10.1M
  int i;
670
50.7M
  for (i = 0; i < 4; ++i) {
671
40.5M
    filter_8x1_pixels(&sig[i << 2], f, &res[i]);
672
40.5M
  }
673
674
10.1M
  {
675
10.1M
    const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
676
10.1M
    const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
677
10.1M
    *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
678
10.1M
    *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
679
10.1M
  }
680
10.1M
}
681
682
static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
683
                                     const __m256i *mask, uint16_t *dst,
684
7.64M
                                     ptrdiff_t pitch) {
685
7.64M
  __m256i p = _mm256_min_epi16(*y0, *mask);
686
7.64M
  _mm256_storeu_si256((__m256i *)dst, p);
687
7.64M
  p = _mm256_min_epi16(*y1, *mask);
688
7.64M
  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
689
7.64M
}
690
691
9.94M
static void update_16x9_pixels(__m256i *sig) {
692
9.94M
  update_pixels(&sig[0]);
693
9.94M
  update_pixels(&sig[8]);
694
9.94M
}
695
696
static void vpx_highbd_filter_block1d16_v8_avx2(
697
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
698
585k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
699
585k
  __m256i signal[17], res0, res1;
700
585k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
701
702
585k
  __m256i ff[4];
703
585k
  pack_filters(filter, ff);
704
705
585k
  pack_16x9_init(src_ptr, src_pitch, signal);
706
707
8.03M
  do {
708
8.03M
    pack_16x9_pixels(src_ptr, src_pitch, signal);
709
8.03M
    filter_16x9_pixels(signal, ff, &res0, &res1);
710
8.03M
    store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
711
8.03M
    update_16x9_pixels(signal);
712
713
8.03M
    src_ptr += src_pitch << 1;
714
8.03M
    dst_ptr += dst_pitch << 1;
715
8.03M
    height -= 2;
716
8.03M
  } while (height > 0);
717
585k
}
718
719
// -----------------------------------------------------------------------------
720
// 2-tap vertical filtering
721
722
126k
static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
723
126k
  sig[2] = _mm256_loadu_si256((const __m256i *)src);
724
126k
}
725
726
static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
727
4.32M
                                       __m256i *sig) {
728
  // load the next row
729
4.32M
  const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
730
4.32M
  sig[0] = _mm256_unpacklo_epi16(sig[2], u);
731
4.32M
  sig[1] = _mm256_unpackhi_epi16(sig[2], u);
732
4.32M
  sig[2] = u;
733
4.32M
}
734
735
static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
736
4.32M
                                         __m256i *y0, __m256i *y1) {
737
4.32M
  filter_16_2t_pixels(sig, f, y0, y1);
738
4.32M
}
739
740
static void vpx_highbd_filter_block1d16_v2_avx2(
741
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
742
82.8k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
743
82.8k
  __m256i signal[3], res0, res1;
744
82.8k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
745
82.8k
  __m256i ff;
746
747
82.8k
  pack_2t_filter(filter, &ff);
748
82.8k
  pack_16x2_init(src_ptr, signal);
749
750
2.81M
  do {
751
2.81M
    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
752
2.81M
    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
753
2.81M
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
754
755
2.81M
    src_ptr += src_pitch;
756
2.81M
    dst_ptr += dst_pitch;
757
2.81M
    height -= 1;
758
2.81M
  } while (height > 0);
759
82.8k
}
760
761
64.3k
static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
762
64.3k
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
763
64.3k
  const __m128i p = _mm_set1_epi32(0x09080706);
764
64.3k
  f[0] = _mm_shuffle_epi8(h, p);
765
64.3k
}
766
767
64.3k
static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
768
64.3k
  sig[2] = _mm_loadu_si128((const __m128i *)src);
769
64.3k
}
770
771
static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
772
552k
                                          __m128i *sig) {
773
  // load the next row
774
552k
  const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
775
552k
  sig[0] = _mm_unpacklo_epi16(sig[2], u);
776
552k
  sig[1] = _mm_unpackhi_epi16(sig[2], u);
777
552k
  sig[2] = u;
778
552k
}
779
780
static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
781
552k
                                      __m128i *y0, __m128i *y1) {
782
552k
  const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
783
552k
  __m128i x0 = _mm_madd_epi16(sig[0], *f);
784
552k
  __m128i x1 = _mm_madd_epi16(sig[1], *f);
785
552k
  x0 = _mm_add_epi32(x0, rounding);
786
552k
  x1 = _mm_add_epi32(x1, rounding);
787
552k
  *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
788
552k
  *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
789
552k
}
790
791
static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
792
380k
                                           const __m128i *mask, uint16_t *dst) {
793
380k
  __m128i res = _mm_packus_epi32(*y0, *y1);
794
380k
  res = _mm_min_epi16(res, *mask);
795
380k
  _mm_storeu_si128((__m128i *)dst, res);
796
380k
}
797
798
static void vpx_highbd_filter_block1d8_v2_avx2(
799
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
800
44.6k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
801
44.6k
  __m128i signal[3], res0, res1;
802
44.6k
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
803
44.6k
  __m128i ff;
804
805
44.6k
  pack_8x1_2t_filter(filter, &ff);
806
44.6k
  pack_8x2_init(src_ptr, signal);
807
808
380k
  do {
809
380k
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
810
380k
    filter_8_2t_pixels(signal, &ff, &res0, &res1);
811
380k
    store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);
812
813
380k
    src_ptr += src_pitch;
814
380k
    dst_ptr += dst_pitch;
815
380k
    height -= 1;
816
380k
  } while (height > 0);
817
44.6k
}
818
819
// Calculation with averaging the input pixels
820
821
static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask,
822
0
                                        uint16_t *dst) {
823
0
  const __m128i a0 = _mm256_castsi256_si128(*y0);
824
0
  const __m128i a1 = _mm256_extractf128_si256(*y0, 1);
825
0
  __m128i res = _mm_packus_epi32(a0, a1);
826
0
  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
827
0
  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
828
0
  res = _mm_avg_epu16(res, pix);
829
0
  _mm_storeu_si128((__m128i *)dst, res);
830
0
}
831
832
static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1,
833
                                        const __m256i *mask, uint16_t *dst,
834
807k
                                        ptrdiff_t pitch) {
835
807k
  __m256i a = _mm256_packus_epi32(*y0, *y1);
836
807k
  const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst);
837
807k
  const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch));
838
807k
  const __m256i pix =
839
807k
      _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
840
807k
  a = _mm256_min_epi16(a, *mask);
841
807k
  a = _mm256_avg_epu16(a, pix);
842
807k
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
843
807k
  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
844
807k
}
845
846
static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1,
847
3.08M
                                         const __m256i *mask, uint16_t *dst) {
848
3.08M
  __m256i a = _mm256_packus_epi32(*y0, *y1);
849
3.08M
  const __m256i pix = _mm256_loadu_si256((const __m256i *)dst);
850
3.08M
  a = _mm256_min_epi16(a, *mask);
851
3.08M
  a = _mm256_avg_epu16(a, pix);
852
3.08M
  _mm256_storeu_si256((__m256i *)dst, a);
853
3.08M
}
854
855
static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1,
856
                                         const __m256i *mask, uint16_t *dst,
857
2.37M
                                         ptrdiff_t pitch) {
858
2.37M
  const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst);
859
2.37M
  const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch));
860
2.37M
  __m256i p = _mm256_min_epi16(*y0, *mask);
861
2.37M
  p = _mm256_avg_epu16(p, pix0);
862
2.37M
  _mm256_storeu_si256((__m256i *)dst, p);
863
864
2.37M
  p = _mm256_min_epi16(*y1, *mask);
865
2.37M
  p = _mm256_avg_epu16(p, pix1);
866
2.37M
  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
867
2.37M
}
868
869
static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0,
870
                                               const __m128i *y1,
871
                                               const __m128i *mask,
872
172k
                                               uint16_t *dst) {
873
172k
  __m128i res = _mm_packus_epi32(*y0, *y1);
874
172k
  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
875
172k
  res = _mm_min_epi16(res, *mask);
876
172k
  res = _mm_avg_epu16(res, pix);
877
172k
  _mm_storeu_si128((__m128i *)dst, res);
878
172k
}
879
880
static void vpx_highbd_filter_block1d8_h8_avg_avx2(
881
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
882
25.8k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
883
25.8k
  __m256i signal[8], res0, res1;
884
25.8k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
885
886
25.8k
  __m256i ff[4];
887
25.8k
  pack_filters(filter, ff);
888
889
25.8k
  src_ptr -= 3;
890
108k
  do {
891
108k
    pack_8x2_pixels(src_ptr, src_pitch, signal);
892
108k
    filter_8x1_pixels(signal, ff, &res0);
893
108k
    filter_8x1_pixels(&signal[4], ff, &res1);
894
108k
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
895
108k
    height -= 2;
896
108k
    src_ptr += src_pitch << 1;
897
108k
    dst_ptr += dst_pitch << 1;
898
108k
  } while (height > 1);
899
900
25.8k
  if (height > 0) {
901
0
    pack_8x1_pixels(src_ptr, signal);
902
0
    filter_8x1_pixels(signal, ff, &res0);
903
0
    store_8x1_avg_pixels(&res0, &max, dst_ptr);
904
0
  }
905
25.8k
}
906
907
static void vpx_highbd_filter_block1d16_h8_avg_avx2(
908
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
909
28.6k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
910
28.6k
  __m256i signal[8], res0, res1;
911
28.6k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
912
913
28.6k
  __m256i ff[4];
914
28.6k
  pack_filters(filter, ff);
915
916
28.6k
  src_ptr -= 3;
917
880k
  do {
918
880k
    pack_16x1_pixels(src_ptr, signal);
919
880k
    filter_8x1_pixels(signal, ff, &res0);
920
880k
    filter_8x1_pixels(&signal[4], ff, &res1);
921
880k
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
922
880k
    height -= 1;
923
880k
    src_ptr += src_pitch;
924
880k
    dst_ptr += dst_pitch;
925
880k
  } while (height > 0);
926
28.6k
}
927
928
static void vpx_highbd_filter_block1d4_h4_avx2(
929
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
930
0
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
931
  // We extract the middle four elements of the kernel into two registers in
932
  // the form
933
  // ... k[3] k[2] k[3] k[2]
934
  // ... k[5] k[4] k[5] k[4]
935
  // Then we shuffle the source into
936
  // ... s[1] s[0] s[0] s[-1]
937
  // ... s[3] s[2] s[2] s[1]
938
  // Calling multiply and add gives us half of the sum. Calling add on the two
939
  // halves gives us the output. Since avx2 allows us to use 256-bit buffer, we
940
  // can do this two rows at a time.
941
942
0
  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
943
0
  __m256i res_reg;
944
0
  __m256i idx_shift_0 =
945
0
      _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
946
0
                       3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
947
0
  __m256i idx_shift_2 =
948
0
      _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
949
0
                       5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
950
951
0
  __m128i kernel_reg_128;  // Kernel
952
0
  __m256i kernel_reg, kernel_reg_23,
953
0
      kernel_reg_45;  // Segments of the kernel used
954
0
  const __m256i reg_round =
955
0
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
956
0
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
957
0
  const ptrdiff_t unrolled_src_stride = src_stride << 1;
958
0
  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
959
0
  int h;
960
961
  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
962
0
  src_ptr -= 1;
963
964
  // Load Kernel
965
0
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
966
0
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
967
0
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
968
0
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
969
970
0
  for (h = height; h >= 2; h -= 2) {
971
    // Load the source
972
0
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
973
0
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
974
0
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
975
976
    // Get the output
977
0
    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
978
0
                                   &kernel_reg_23, &kernel_reg_45);
979
980
    // Round the result
981
0
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
982
983
    // Finally combine to get the final dst
984
0
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
985
0
    res_reg = _mm256_min_epi16(res_reg, reg_max);
986
0
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
987
0
                        &res_reg);
988
989
0
    src_ptr += unrolled_src_stride;
990
0
    dst_ptr += unrolled_dst_stride;
991
0
  }
992
993
  // Repeat for the last row if needed
994
0
  if (h > 0) {
995
    // Load the source
996
0
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
997
0
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
998
0
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
999
1000
    // Get the output
1001
0
    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
1002
0
                                   &kernel_reg_23, &kernel_reg_45);
1003
1004
    // Round the result
1005
0
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
1006
1007
    // Finally combine to get the final dst
1008
0
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
1009
0
    res_reg = _mm256_min_epi16(res_reg, reg_max);
1010
0
    _mm_storel_epi64((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg));
1011
0
  }
1012
0
}
1013
1014
static void vpx_highbd_filter_block1d8_h4_avx2(
1015
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
1016
0
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
1017
  // We will extract the middle four elements of the kernel into two registers
1018
  // in the form
1019
  // ... k[3] k[2] k[3] k[2]
1020
  // ... k[5] k[4] k[5] k[4]
1021
  // Then we shuffle the source into
1022
  // ... s[1] s[0] s[0] s[-1]
1023
  // ... s[3] s[2] s[2] s[1]
1024
  // Calling multiply and add gives us half of the sum of the first half.
1025
  // Calling add gives us first half of the output. Repat again to get the whole
1026
  // output. Since avx2 allows us to use 256-bit buffer, we can do this two rows
1027
  // at a time.
1028
1029
0
  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
1030
0
  __m256i res_reg, res_first, res_last;
1031
0
  __m256i idx_shift_0 =
1032
0
      _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
1033
0
                       3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
1034
0
  __m256i idx_shift_2 =
1035
0
      _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
1036
0
                       5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
1037
1038
0
  __m128i kernel_reg_128;  // Kernel
1039
0
  __m256i kernel_reg, kernel_reg_23,
1040
0
      kernel_reg_45;  // Segments of the kernel used
1041
0
  const __m256i reg_round =
1042
0
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
1043
0
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
1044
0
  const ptrdiff_t unrolled_src_stride = src_stride << 1;
1045
0
  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
1046
0
  int h;
1047
1048
  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
1049
0
  src_ptr -= 1;
1050
1051
  // Load Kernel
1052
0
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
1053
0
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
1054
0
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
1055
0
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
1056
1057
0
  for (h = height; h >= 2; h -= 2) {
1058
    // Load the source
1059
0
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
1060
0
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
1061
0
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
1062
1063
    // Result for first half
1064
0
    res_first = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
1065
0
                                     &kernel_reg_23, &kernel_reg_45);
1066
1067
    // Do again to get the second half of dst
1068
    // Load the source
1069
0
    src_reg = mm256_loadu2_si128(src_ptr + 4, src_ptr + src_stride + 4);
1070
0
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
1071
0
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
1072
1073
    // Result for second half
1074
0
    res_last = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
1075
0
                                    &kernel_reg_23, &kernel_reg_45);
1076
1077
    // Round each result
1078
0
    res_first = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS);
1079
0
    res_last = mm256_round_epi32(&res_last, &reg_round, CONV8_ROUNDING_BITS);
1080
1081
    // Finally combine to get the final dst
1082
0
    res_reg = _mm256_packus_epi32(res_first, res_last);
1083
0
    res_reg = _mm256_min_epi16(res_reg, reg_max);
1084
0
    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
1085
0
                       &res_reg);
1086
1087
0
    src_ptr += unrolled_src_stride;
1088
0
    dst_ptr += unrolled_dst_stride;
1089
0
  }
1090
1091
  // Repeat for the last row if needed
1092
0
  if (h > 0) {
1093
0
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
1094
0
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
1095
0
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
1096
1097
0
    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
1098
0
                                   &kernel_reg_23, &kernel_reg_45);
1099
1100
0
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
1101
1102
0
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
1103
0
    res_reg = _mm256_min_epi16(res_reg, reg_max);
1104
1105
0
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg);
1106
0
  }
1107
0
}
1108
1109
static void vpx_highbd_filter_block1d16_h4_avx2(
1110
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
1111
0
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
1112
0
  vpx_highbd_filter_block1d8_h4_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
1113
0
                                     height, kernel, bd);
1114
0
  vpx_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_stride, dst_ptr + 8,
1115
0
                                     dst_stride, height, kernel, bd);
1116
0
}
1117
1118
static void vpx_highbd_filter_block1d8_v8_avg_avx2(
1119
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1120
166k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1121
166k
  __m256i signal[9], res0, res1;
1122
166k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
1123
1124
166k
  __m256i ff[4];
1125
166k
  pack_filters(filter, ff);
1126
1127
166k
  pack_8x9_init(src_ptr, src_pitch, signal);
1128
1129
674k
  do {
1130
674k
    pack_8x9_pixels(src_ptr, src_pitch, signal);
1131
1132
674k
    filter_8x9_pixels(signal, ff, &res0, &res1);
1133
674k
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
1134
674k
    update_pixels(signal);
1135
1136
674k
    src_ptr += src_pitch << 1;
1137
674k
    dst_ptr += dst_pitch << 1;
1138
674k
    height -= 2;
1139
674k
  } while (height > 0);
1140
166k
}
1141
1142
static void vpx_highbd_filter_block1d16_v8_avg_avx2(
1143
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1144
170k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1145
170k
  __m256i signal[17], res0, res1;
1146
170k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
1147
1148
170k
  __m256i ff[4];
1149
170k
  pack_filters(filter, ff);
1150
1151
170k
  pack_16x9_init(src_ptr, src_pitch, signal);
1152
1153
2.38M
  do {
1154
2.38M
    pack_16x9_pixels(src_ptr, src_pitch, signal);
1155
2.38M
    filter_16x9_pixels(signal, ff, &res0, &res1);
1156
2.38M
    store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
1157
2.38M
    update_16x9_pixels(signal);
1158
1159
2.38M
    src_ptr += src_pitch << 1;
1160
2.38M
    dst_ptr += dst_pitch << 1;
1161
2.38M
    height -= 2;
1162
2.38M
  } while (height > 0);
1163
170k
}
1164
1165
static void vpx_highbd_filter_block1d8_h2_avg_avx2(
1166
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1167
5.74k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1168
5.74k
  __m256i signal[2], res0, res1;
1169
5.74k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
1170
1171
5.74k
  __m256i ff;
1172
5.74k
  pack_2t_filter(filter, &ff);
1173
1174
5.74k
  src_ptr -= 3;
1175
25.4k
  do {
1176
25.4k
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
1177
25.4k
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
1178
25.4k
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
1179
25.4k
    height -= 2;
1180
25.4k
    src_ptr += src_pitch << 1;
1181
25.4k
    dst_ptr += dst_pitch << 1;
1182
25.4k
  } while (height > 1);
1183
1184
5.74k
  if (height > 0) {
1185
0
    pack_8x1_2t_pixels(src_ptr, signal);
1186
0
    filter_8x1_2t_pixels(signal, &ff, &res0);
1187
0
    store_8x1_avg_pixels(&res0, &max, dst_ptr);
1188
0
  }
1189
5.74k
}
1190
1191
static void vpx_highbd_filter_block1d16_h2_avg_avx2(
1192
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1193
16.3k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1194
16.3k
  __m256i signal[2], res0, res1;
1195
16.3k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
1196
1197
16.3k
  __m256i ff;
1198
16.3k
  pack_2t_filter(filter, &ff);
1199
1200
16.3k
  src_ptr -= 3;
1201
688k
  do {
1202
688k
    pack_16x1_2t_pixels(src_ptr, signal);
1203
688k
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
1204
688k
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
1205
688k
    height -= 1;
1206
688k
    src_ptr += src_pitch;
1207
688k
    dst_ptr += dst_pitch;
1208
688k
  } while (height > 0);
1209
16.3k
}
1210
1211
static void vpx_highbd_filter_block1d16_v2_avg_avx2(
1212
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1213
44.1k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1214
44.1k
  __m256i signal[3], res0, res1;
1215
44.1k
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
1216
44.1k
  __m256i ff;
1217
1218
44.1k
  pack_2t_filter(filter, &ff);
1219
44.1k
  pack_16x2_init(src_ptr, signal);
1220
1221
1.51M
  do {
1222
1.51M
    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
1223
1.51M
    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
1224
1.51M
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
1225
1226
1.51M
    src_ptr += src_pitch;
1227
1.51M
    dst_ptr += dst_pitch;
1228
1.51M
    height -= 1;
1229
1.51M
  } while (height > 0);
1230
44.1k
}
1231
1232
static void vpx_highbd_filter_block1d8_v2_avg_avx2(
1233
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1234
19.6k
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1235
19.6k
  __m128i signal[3], res0, res1;
1236
19.6k
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
1237
19.6k
  __m128i ff;
1238
1239
19.6k
  pack_8x1_2t_filter(filter, &ff);
1240
19.6k
  pack_8x2_init(src_ptr, signal);
1241
1242
172k
  do {
1243
172k
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
1244
172k
    filter_8_2t_pixels(signal, &ff, &res0, &res1);
1245
172k
    store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr);
1246
1247
172k
    src_ptr += src_pitch;
1248
172k
    dst_ptr += dst_pitch;
1249
172k
    height -= 1;
1250
172k
  } while (height > 0);
1251
19.6k
}
1252
1253
static void vpx_highbd_filter_block1d4_v4_avx2(
1254
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
1255
0
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
1256
  // We will load two rows of pixels and rearrange them into the form
1257
  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
1258
  // so that we can call multiply and add with the kernel partial output. Then
1259
  // we can call add with another row to get the output.
1260
1261
  // Register for source s[-1:3, :]
1262
0
  __m256i src_reg_1, src_reg_2, src_reg_3;
1263
  // Interleaved rows of the source. lo is first half, hi second
1264
0
  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
1265
0
  __m256i src_reg_m1001, src_reg_1223;
1266
1267
  // Result after multiply and add
1268
0
  __m256i res_reg;
1269
1270
0
  __m128i kernel_reg_128;                            // Kernel
1271
0
  __m256i kernel_reg, kernel_reg_23, kernel_reg_45;  // Segments of kernel used
1272
1273
0
  const __m256i reg_round =
1274
0
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
1275
0
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
1276
0
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
1277
0
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
1278
0
  int h;
1279
1280
  // Load Kernel
1281
0
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
1282
0
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
1283
0
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
1284
0
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
1285
1286
  // Row -1 to row 0
1287
0
  src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr,
1288
0
                                   (const __m128i *)(src_ptr + src_stride));
1289
1290
  // Row 0 to row 1
1291
0
  src_reg_1 = _mm256_castsi128_si256(
1292
0
      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
1293
0
  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
1294
1295
  // First three rows
1296
0
  src_reg_m1001 = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
1297
1298
0
  for (h = height; h > 1; h -= 2) {
1299
0
    src_reg_2 = _mm256_castsi128_si256(
1300
0
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));
1301
1302
0
    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
1303
0
                                         _mm256_castsi256_si128(src_reg_2), 1);
1304
1305
0
    src_reg_3 = _mm256_castsi128_si256(
1306
0
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));
1307
1308
0
    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
1309
0
                                         _mm256_castsi256_si128(src_reg_3), 1);
1310
1311
    // Last three rows
1312
0
    src_reg_1223 = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
1313
1314
    // Output
1315
0
    res_reg = mm256_madd_add_epi32(&src_reg_m1001, &src_reg_1223,
1316
0
                                   &kernel_reg_23, &kernel_reg_45);
1317
1318
    // Round the words
1319
0
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
1320
1321
    // Combine to get the result
1322
0
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
1323
0
    res_reg = _mm256_min_epi16(res_reg, reg_max);
1324
1325
    // Save the result
1326
0
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
1327
0
                        &res_reg);
1328
1329
    // Update the source by two rows
1330
0
    src_ptr += src_stride_unrolled;
1331
0
    dst_ptr += dst_stride_unrolled;
1332
1333
0
    src_reg_m1001 = src_reg_1223;
1334
0
    src_reg_1 = src_reg_3;
1335
0
  }
1336
0
}
1337
1338
static void vpx_highbd_filter_block1d8_v4_avx2(
1339
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
1340
0
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
1341
  // We will load two rows of pixels and rearrange them into the form
1342
  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
1343
  // so that we can call multiply and add with the kernel partial output. Then
1344
  // we can call add with another row to get the output.
1345
1346
  // Register for source s[-1:3, :]
1347
0
  __m256i src_reg_1, src_reg_2, src_reg_3;
1348
  // Interleaved rows of the source. lo is first half, hi second
1349
0
  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
1350
0
  __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi;
1351
1352
0
  __m128i kernel_reg_128;                            // Kernel
1353
0
  __m256i kernel_reg, kernel_reg_23, kernel_reg_45;  // Segments of kernel
1354
1355
  // Result after multiply and add
1356
0
  __m256i res_reg, res_reg_lo, res_reg_hi;
1357
1358
0
  const __m256i reg_round =
1359
0
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
1360
0
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
1361
0
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
1362
0
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
1363
0
  int h;
1364
1365
  // Load Kernel
1366
0
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
1367
0
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
1368
0
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
1369
0
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
1370
1371
  // Row -1 to row 0
1372
0
  src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
1373
0
                                   (const __m128i *)(src_ptr + src_stride));
1374
1375
  // Row 0 to row 1
1376
0
  src_reg_1 = _mm256_castsi128_si256(
1377
0
      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
1378
0
  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
1379
1380
  // First three rows
1381
0
  src_reg_m1001_lo = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
1382
0
  src_reg_m1001_hi = _mm256_unpackhi_epi16(src_reg_m10, src_reg_01);
1383
1384
0
  for (h = height; h > 1; h -= 2) {
1385
0
    src_reg_2 = _mm256_castsi128_si256(
1386
0
        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)));
1387
1388
0
    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
1389
0
                                         _mm256_castsi256_si128(src_reg_2), 1);
1390
1391
0
    src_reg_3 = _mm256_castsi128_si256(
1392
0
        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)));
1393
1394
0
    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
1395
0
                                         _mm256_castsi256_si128(src_reg_3), 1);
1396
1397
    // Last three rows
1398
0
    src_reg_1223_lo = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
1399
0
    src_reg_1223_hi = _mm256_unpackhi_epi16(src_reg_12, src_reg_23);
1400
1401
    // Output from first half
1402
0
    res_reg_lo = mm256_madd_add_epi32(&src_reg_m1001_lo, &src_reg_1223_lo,
1403
0
                                      &kernel_reg_23, &kernel_reg_45);
1404
1405
    // Output from second half
1406
0
    res_reg_hi = mm256_madd_add_epi32(&src_reg_m1001_hi, &src_reg_1223_hi,
1407
0
                                      &kernel_reg_23, &kernel_reg_45);
1408
1409
    // Round the words
1410
0
    res_reg_lo =
1411
0
        mm256_round_epi32(&res_reg_lo, &reg_round, CONV8_ROUNDING_BITS);
1412
0
    res_reg_hi =
1413
0
        mm256_round_epi32(&res_reg_hi, &reg_round, CONV8_ROUNDING_BITS);
1414
1415
    // Combine to get the result
1416
0
    res_reg = _mm256_packus_epi32(res_reg_lo, res_reg_hi);
1417
0
    res_reg = _mm256_min_epi16(res_reg, reg_max);
1418
1419
    // Save the result
1420
0
    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
1421
0
                       &res_reg);
1422
1423
    // Update the source by two rows
1424
0
    src_ptr += src_stride_unrolled;
1425
0
    dst_ptr += dst_stride_unrolled;
1426
1427
0
    src_reg_m1001_lo = src_reg_1223_lo;
1428
0
    src_reg_m1001_hi = src_reg_1223_hi;
1429
0
    src_reg_1 = src_reg_3;
1430
0
  }
1431
0
}
1432
1433
static void vpx_highbd_filter_block1d16_v4_avx2(
1434
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
1435
0
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
1436
0
  vpx_highbd_filter_block1d8_v4_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
1437
0
                                     height, kernel, bd);
1438
0
  vpx_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_stride, dst_ptr + 8,
1439
0
                                     dst_stride, height, kernel, bd);
1440
0
}
1441
1442
// AVX2 has no 4-wide kernels; alias the 4-wide entry points to the SSE2
// assembly implementations so the function tables below are fully populated.

// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;

// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;

#define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2
#define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2
#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2
#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2

// Use the [vh]8 version because there is no [vh]4 implementation.
// (These aliases cover the averaging variants only; the non-avg 4-tap
// kernels are implemented above in this file.)
#define vpx_highbd_filter_block1d16_v4_avg_avx2 \
  vpx_highbd_filter_block1d16_v8_avg_avx2
#define vpx_highbd_filter_block1d16_h4_avg_avx2 \
  vpx_highbd_filter_block1d16_h8_avg_avx2
#define vpx_highbd_filter_block1d8_v4_avg_avx2 \
  vpx_highbd_filter_block1d8_v8_avg_avx2
#define vpx_highbd_filter_block1d8_h4_avg_avx2 \
  vpx_highbd_filter_block1d8_h8_avg_avx2
#define vpx_highbd_filter_block1d4_v4_avg_avx2 \
  vpx_highbd_filter_block1d4_v8_avg_avx2
#define vpx_highbd_filter_block1d4_h4_avg_avx2 \
  vpx_highbd_filter_block1d4_h8_avg_avx2

// Instantiate the public non-averaging convolve entry points
// (vpx_highbd_convolve8_horiz_avx2 / _vert_avx2 / vpx_highbd_convolve8_avx2)
// from the per-width kernels named above.  The vertical variant starts
// num_taps/2 - 1 rows above the block to center the filter taps.
HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0)
HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
                 src - src_stride * (num_taps / 2 - 1), , avx2, 0)
HIGH_FUN_CONV_2D(, avx2, 0)

// Averaging (avg_) variants: same SSE2 fallbacks for the 4-wide kernels.

// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;

// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;

#define vpx_highbd_filter_block1d4_h8_avg_avx2 \
  vpx_highbd_filter_block1d4_h8_avg_sse2
#define vpx_highbd_filter_block1d4_h2_avg_avx2 \
  vpx_highbd_filter_block1d4_h2_avg_sse2
#define vpx_highbd_filter_block1d4_v8_avg_avx2 \
  vpx_highbd_filter_block1d4_v8_avg_sse2
#define vpx_highbd_filter_block1d4_v2_avg_avx2 \
  vpx_highbd_filter_block1d4_v2_avg_sse2

// Instantiate the averaging convolve entry points
// (vpx_highbd_convolve8_avg_horiz_avx2 / _avg_vert_avx2 /
// vpx_highbd_convolve8_avg_avx2).
HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1)
HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
                 src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1)
HIGH_FUN_CONV_2D(avg_, avx2, 1)

#undef HIGHBD_FUNC
#endif  // HAVE_X86_ASM