/src/aom/av1/common/x86/cfl_avx2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2017, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | #include <immintrin.h> |
12 | | |
13 | | #include "config/av1_rtcd.h" |
14 | | |
15 | | #include "av1/common/cfl.h" |
16 | | |
17 | | #include "av1/common/x86/cfl_simd.h" |
18 | | |
/**
 * Instantiates the AVX2 luma subsampling entry points for one chroma format
 * (`sub`: 420, 422 or 444) and one bit depth (`bd`: lbd or hbd).
 *
 * The macro expands to:
 *  - the three width-32 wrappers (32x32, 32x16, 32x8) via CFL_SUBSAMPLE, and
 *  - cfl_get_luma_subsampling_<sub>_<bd>_avx2(), which maps a TX_SIZE to the
 *    subsampling function to use for that size.
 *
 * Only the width-32 entries of the table use AVX2; every narrower size is
 * routed to the SSSE3 implementation. Entries for sizes that are not valid
 * for CfL are NULL.
 */
#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd)                            \
  CFL_SUBSAMPLE(avx2, sub, bd, 32, 32)                                      \
  CFL_SUBSAMPLE(avx2, sub, bd, 32, 16)                                      \
  CFL_SUBSAMPLE(avx2, sub, bd, 32, 8)                                       \
  cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2(     \
      TX_SIZE tx_size) {                                                    \
    static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = {      \
      cfl_subsample_##bd##_##sub##_4x4_ssse3,   /* 4x4 */                   \
      cfl_subsample_##bd##_##sub##_8x8_ssse3,   /* 8x8 */                   \
      cfl_subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */                 \
      cfl_subsample_##bd##_##sub##_32x32_avx2,  /* 32x32 */                 \
      NULL,                                     /* 64x64 (invalid CFL size) */ \
      cfl_subsample_##bd##_##sub##_4x8_ssse3,   /* 4x8 */                   \
      cfl_subsample_##bd##_##sub##_8x4_ssse3,   /* 8x4 */                   \
      cfl_subsample_##bd##_##sub##_8x16_ssse3,  /* 8x16 */                  \
      cfl_subsample_##bd##_##sub##_16x8_ssse3,  /* 16x8 */                  \
      cfl_subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */                 \
      cfl_subsample_##bd##_##sub##_32x16_avx2,  /* 32x16 */                 \
      NULL,                                     /* 32x64 (invalid CFL size) */ \
      NULL,                                     /* 64x32 (invalid CFL size) */ \
      cfl_subsample_##bd##_##sub##_4x16_ssse3,  /* 4x16 */                  \
      cfl_subsample_##bd##_##sub##_16x4_ssse3,  /* 16x4 */                  \
      cfl_subsample_##bd##_##sub##_8x32_ssse3,  /* 8x32 */                  \
      cfl_subsample_##bd##_##sub##_32x8_avx2,   /* 32x8 */                  \
      NULL,                                     /* 16x64 (invalid CFL size) */ \
      NULL,                                     /* 64x16 (invalid CFL size) */ \
    };                                                                      \
    /* Modulo TX_SIZES_ALL so that an out-of-range tx_size can never index  \
     * the function pointer table out of bounds (same guard as the other    \
     * dispatch getters in this file). */                                   \
    return subfn_##sub[tx_size % TX_SIZES_ALL];                             \
  }
48 | | |
49 | | /** |
50 | | * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more |
51 | | * precise version of a box filter 4:2:0 pixel subsampling in Q3. |
52 | | * |
53 | | * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the |
54 | | * active area is specified using width and height. |
55 | | * |
56 | | * Note: We don't need to worry about going over the active area, as long as we |
57 | | * stay inside the CfL prediction buffer. |
58 | | * |
59 | | * Note: For 4:2:0 luma subsampling, the width will never be greater than 16. |
60 | | */ |
61 | | static void cfl_luma_subsampling_420_lbd_avx2(const uint8_t *input, |
62 | | int input_stride, |
63 | | uint16_t *pred_buf_q3, int width, |
64 | 46.6k | int height) { |
65 | 46.6k | (void)width; // Forever 32 |
66 | 46.6k | const __m256i twos = _mm256_set1_epi8(2); // Thirty two twos |
67 | 46.6k | const int luma_stride = input_stride << 1; |
68 | 46.6k | __m256i *row = (__m256i *)pred_buf_q3; |
69 | 46.6k | const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256; |
70 | 503k | do { |
71 | 503k | __m256i top = _mm256_loadu_si256((__m256i *)input); |
72 | 503k | __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride)); |
73 | | |
74 | 503k | __m256i top_16x16 = _mm256_maddubs_epi16(top, twos); |
75 | 503k | __m256i bot_16x16 = _mm256_maddubs_epi16(bot, twos); |
76 | 503k | __m256i sum_16x16 = _mm256_add_epi16(top_16x16, bot_16x16); |
77 | | |
78 | 503k | _mm256_storeu_si256(row, sum_16x16); |
79 | | |
80 | 503k | input += luma_stride; |
81 | 503k | } while ((row += CFL_BUF_LINE_I256) < row_end); |
82 | 46.6k | } |
83 | | |
84 | | CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, lbd) |
85 | | |
86 | | /** |
87 | | * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more |
88 | | * precise version of a box filter 4:2:2 pixel subsampling in Q3. |
89 | | * |
90 | | * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the |
91 | | * active area is specified using width and height. |
92 | | * |
93 | | * Note: We don't need to worry about going over the active area, as long as we |
94 | | * stay inside the CfL prediction buffer. |
95 | | */ |
96 | | static void cfl_luma_subsampling_422_lbd_avx2(const uint8_t *input, |
97 | | int input_stride, |
98 | | uint16_t *pred_buf_q3, int width, |
99 | 425 | int height) { |
100 | 425 | (void)width; // Forever 32 |
101 | 425 | const __m256i fours = _mm256_set1_epi8(4); // Thirty two fours |
102 | 425 | __m256i *row = (__m256i *)pred_buf_q3; |
103 | 425 | const __m256i *row_end = row + height * CFL_BUF_LINE_I256; |
104 | 7.00k | do { |
105 | 7.00k | __m256i top = _mm256_loadu_si256((__m256i *)input); |
106 | 7.00k | __m256i top_16x16 = _mm256_maddubs_epi16(top, fours); |
107 | 7.00k | _mm256_storeu_si256(row, top_16x16); |
108 | 7.00k | input += input_stride; |
109 | 7.00k | } while ((row += CFL_BUF_LINE_I256) < row_end); |
110 | 425 | } |
111 | | |
112 | | CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, lbd) |
113 | | |
114 | | /** |
115 | | * Multiplies the pixels by 8 (scaling in Q3). The AVX2 subsampling is only |
116 | | * performed on block of width 32. |
117 | | * |
118 | | * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the |
119 | | * active area is specified using width and height. |
120 | | * |
121 | | * Note: We don't need to worry about going over the active area, as long as we |
122 | | * stay inside the CfL prediction buffer. |
123 | | */ |
124 | | static void cfl_luma_subsampling_444_lbd_avx2(const uint8_t *input, |
125 | | int input_stride, |
126 | | uint16_t *pred_buf_q3, int width, |
127 | 43.2k | int height) { |
128 | 43.2k | (void)width; // Forever 32 |
129 | 43.2k | __m256i *row = (__m256i *)pred_buf_q3; |
130 | 43.2k | const __m256i *row_end = row + height * CFL_BUF_LINE_I256; |
131 | 43.2k | const __m256i zeros = _mm256_setzero_si256(); |
132 | 848k | do { |
133 | 848k | __m256i top = _mm256_loadu_si256((__m256i *)input); |
134 | 848k | top = _mm256_permute4x64_epi64(top, _MM_SHUFFLE(3, 1, 2, 0)); |
135 | | |
136 | 848k | __m256i row_lo = _mm256_unpacklo_epi8(top, zeros); |
137 | 848k | row_lo = _mm256_slli_epi16(row_lo, 3); |
138 | 848k | __m256i row_hi = _mm256_unpackhi_epi8(top, zeros); |
139 | 848k | row_hi = _mm256_slli_epi16(row_hi, 3); |
140 | | |
141 | 848k | _mm256_storeu_si256(row, row_lo); |
142 | 848k | _mm256_storeu_si256(row + 1, row_hi); |
143 | | |
144 | 848k | input += input_stride; |
145 | 848k | } while ((row += CFL_BUF_LINE_I256) < row_end); |
146 | 43.2k | } |
147 | | |
148 | | CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, lbd) |
149 | | |
150 | | #if CONFIG_AV1_HIGHBITDEPTH |
151 | | /** |
152 | | * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more |
153 | | * precise version of a box filter 4:2:0 pixel subsampling in Q3. |
154 | | * |
155 | | * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the |
156 | | * active area is specified using width and height. |
157 | | * |
158 | | * Note: We don't need to worry about going over the active area, as long as we |
159 | | * stay inside the CfL prediction buffer. |
160 | | * |
161 | | * Note: For 4:2:0 luma subsampling, the width will never be greater than 16. |
162 | | */ |
163 | | static void cfl_luma_subsampling_420_hbd_avx2(const uint16_t *input, |
164 | | int input_stride, |
165 | | uint16_t *pred_buf_q3, int width, |
166 | 27.4k | int height) { |
167 | 27.4k | (void)width; // Forever 32 |
168 | 27.4k | const int luma_stride = input_stride << 1; |
169 | 27.4k | __m256i *row = (__m256i *)pred_buf_q3; |
170 | 27.4k | const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256; |
171 | 291k | do { |
172 | 291k | __m256i top = _mm256_loadu_si256((__m256i *)input); |
173 | 291k | __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride)); |
174 | 291k | __m256i sum = _mm256_add_epi16(top, bot); |
175 | | |
176 | 291k | __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); |
177 | 291k | __m256i bot_1 = _mm256_loadu_si256((__m256i *)(input + 16 + input_stride)); |
178 | 291k | __m256i sum_1 = _mm256_add_epi16(top_1, bot_1); |
179 | | |
180 | 291k | __m256i hsum = _mm256_hadd_epi16(sum, sum_1); |
181 | 291k | hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0)); |
182 | 291k | hsum = _mm256_add_epi16(hsum, hsum); |
183 | | |
184 | 291k | _mm256_storeu_si256(row, hsum); |
185 | | |
186 | 291k | input += luma_stride; |
187 | 291k | } while ((row += CFL_BUF_LINE_I256) < row_end); |
188 | 27.4k | } |
189 | | |
190 | | CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, hbd) |
191 | | |
192 | | /** |
193 | | * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more |
194 | | * precise version of a box filter 4:2:2 pixel subsampling in Q3. |
195 | | * |
196 | | * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the |
197 | | * active area is specified using width and height. |
198 | | * |
199 | | * Note: We don't need to worry about going over the active area, as long as we |
200 | | * stay inside the CfL prediction buffer. |
201 | | * |
202 | | */ |
203 | | static void cfl_luma_subsampling_422_hbd_avx2(const uint16_t *input, |
204 | | int input_stride, |
205 | | uint16_t *pred_buf_q3, int width, |
206 | 119 | int height) { |
207 | 119 | (void)width; // Forever 32 |
208 | 119 | __m256i *row = (__m256i *)pred_buf_q3; |
209 | 119 | const __m256i *row_end = row + height * CFL_BUF_LINE_I256; |
210 | 1.78k | do { |
211 | 1.78k | __m256i top = _mm256_loadu_si256((__m256i *)input); |
212 | 1.78k | __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); |
213 | 1.78k | __m256i hsum = _mm256_hadd_epi16(top, top_1); |
214 | 1.78k | hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0)); |
215 | 1.78k | hsum = _mm256_slli_epi16(hsum, 2); |
216 | | |
217 | 1.78k | _mm256_storeu_si256(row, hsum); |
218 | | |
219 | 1.78k | input += input_stride; |
220 | 1.78k | } while ((row += CFL_BUF_LINE_I256) < row_end); |
221 | 119 | } |
222 | | |
223 | | CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, hbd) |
224 | | |
225 | | static void cfl_luma_subsampling_444_hbd_avx2(const uint16_t *input, |
226 | | int input_stride, |
227 | | uint16_t *pred_buf_q3, int width, |
228 | 61.2k | int height) { |
229 | 61.2k | (void)width; // Forever 32 |
230 | 61.2k | __m256i *row = (__m256i *)pred_buf_q3; |
231 | 61.2k | const __m256i *row_end = row + height * CFL_BUF_LINE_I256; |
232 | 1.03M | do { |
233 | 1.03M | __m256i top = _mm256_loadu_si256((__m256i *)input); |
234 | 1.03M | __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); |
235 | 1.03M | _mm256_storeu_si256(row, _mm256_slli_epi16(top, 3)); |
236 | 1.03M | _mm256_storeu_si256(row + 1, _mm256_slli_epi16(top_1, 3)); |
237 | 1.03M | input += input_stride; |
238 | 1.03M | } while ((row += CFL_BUF_LINE_I256) < row_end); |
239 | 61.2k | } |
240 | | |
241 | | CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, hbd) |
242 | | #endif // CONFIG_AV1_HIGHBITDEPTH |
243 | | |
/**
 * Computes the unclipped CfL prediction for 16 samples.
 *
 * Loads 16 signed 16-bit AC values (Q3) from `input`, scales their magnitude
 * by `alpha_q12` with a rounding high multiply, restores the sign as
 * sign(alpha) * sign(ac), and adds `dc_q0`.
 *
 * @param input      16 AC values in Q3 (unaligned load).
 * @param alpha_q12  Magnitude of alpha, replicated in every 16-bit lane.
 * @param alpha_sign A value carrying the sign of alpha in every 16-bit lane.
 * @param dc_q0      DC prediction added to every lane.
 * @return The 16 predicted values, not yet clamped to the pixel range.
 */
static inline __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12,
                                        __m256i alpha_sign, __m256i dc_q0) {
  const __m256i ac = _mm256_loadu_si256(input);
  // Negates alpha's sign carrier where ac is negative and zeroes it where ac
  // is zero, giving a per-lane carrier for sign(alpha) * sign(ac).
  const __m256i product_sign = _mm256_sign_epi16(alpha_sign, ac);
  // Scale on magnitudes only; mulhrs performs the rounding multiply.
  const __m256i ac_magnitude = _mm256_abs_epi16(ac);
  const __m256i scaled_magnitude = _mm256_mulhrs_epi16(ac_magnitude, alpha_q12);
  const __m256i scaled_signed =
      _mm256_sign_epi16(scaled_magnitude, product_sign);
  return _mm256_add_epi16(scaled_signed, dc_q0);
}
253 | | |
254 | | static inline void cfl_predict_lbd_avx2(const int16_t *pred_buf_q3, |
255 | | uint8_t *dst, int dst_stride, |
256 | 87.2k | int alpha_q3, int width, int height) { |
257 | 87.2k | (void)width; |
258 | 87.2k | const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3); |
259 | 87.2k | const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9); |
260 | 87.2k | const __m256i dc_q0 = _mm256_set1_epi16(*dst); |
261 | 87.2k | __m256i *row = (__m256i *)pred_buf_q3; |
262 | 87.2k | const __m256i *row_end = row + height * CFL_BUF_LINE_I256; |
263 | | |
264 | 1.71M | do { |
265 | 1.71M | __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); |
266 | 1.71M | __m256i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); |
267 | 1.71M | res = _mm256_packus_epi16(res, next); |
268 | 1.71M | res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0)); |
269 | 1.71M | _mm256_storeu_si256((__m256i *)dst, res); |
270 | 1.71M | dst += dst_stride; |
271 | 1.71M | } while ((row += CFL_BUF_LINE_I256) < row_end); |
272 | 87.2k | } |
273 | | |
274 | | CFL_PREDICT_X(avx2, 32, 8, lbd) |
275 | | CFL_PREDICT_X(avx2, 32, 16, lbd) |
276 | | CFL_PREDICT_X(avx2, 32, 32, lbd) |
277 | | |
278 | 1.40M | cfl_predict_lbd_fn cfl_get_predict_lbd_fn_avx2(TX_SIZE tx_size) { |
279 | 1.40M | static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = { |
280 | 1.40M | cfl_predict_lbd_4x4_ssse3, /* 4x4 */ |
281 | 1.40M | cfl_predict_lbd_8x8_ssse3, /* 8x8 */ |
282 | 1.40M | cfl_predict_lbd_16x16_ssse3, /* 16x16 */ |
283 | 1.40M | cfl_predict_lbd_32x32_avx2, /* 32x32 */ |
284 | 1.40M | NULL, /* 64x64 (invalid CFL size) */ |
285 | 1.40M | cfl_predict_lbd_4x8_ssse3, /* 4x8 */ |
286 | 1.40M | cfl_predict_lbd_8x4_ssse3, /* 8x4 */ |
287 | 1.40M | cfl_predict_lbd_8x16_ssse3, /* 8x16 */ |
288 | 1.40M | cfl_predict_lbd_16x8_ssse3, /* 16x8 */ |
289 | 1.40M | cfl_predict_lbd_16x32_ssse3, /* 16x32 */ |
290 | 1.40M | cfl_predict_lbd_32x16_avx2, /* 32x16 */ |
291 | 1.40M | NULL, /* 32x64 (invalid CFL size) */ |
292 | 1.40M | NULL, /* 64x32 (invalid CFL size) */ |
293 | 1.40M | cfl_predict_lbd_4x16_ssse3, /* 4x16 */ |
294 | 1.40M | cfl_predict_lbd_16x4_ssse3, /* 16x4 */ |
295 | 1.40M | cfl_predict_lbd_8x32_ssse3, /* 8x32 */ |
296 | 1.40M | cfl_predict_lbd_32x8_avx2, /* 32x8 */ |
297 | 1.40M | NULL, /* 16x64 (invalid CFL size) */ |
298 | 1.40M | NULL, /* 64x16 (invalid CFL size) */ |
299 | 1.40M | }; |
300 | | // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the |
301 | | // function pointer array out of bounds. |
302 | 1.40M | return pred[tx_size % TX_SIZES_ALL]; |
303 | 1.40M | } |
304 | | |
305 | | #if CONFIG_AV1_HIGHBITDEPTH |
/**
 * Returns a vector whose 16-bit lanes all hold the largest pixel value for
 * bit depth `bd`, i.e. (1 << bd) - 1.
 */
static __m256i highbd_max_epi16(int bd) {
  const __m256i all_ones = _mm256_set1_epi16(-1);
  // (1 << bd) - 1 == -1 ^ (-1 << bd): shifting the all-ones pattern left by
  // bd leaves ones only above bit bd - 1; XOR with all-ones flips that into
  // ones only in the low bd bits.
  const __m256i upper_bits = _mm256_slli_epi16(all_ones, bd);
  return _mm256_xor_si256(upper_bits, all_ones);
}
311 | | |
/**
 * Clamps every signed 16-bit lane of `u` to the range [zero, max]: the upper
 * bound is applied first, then the lower bound.
 */
static __m256i highbd_clamp_epi16(__m256i u, __m256i zero, __m256i max) {
  const __m256i capped = _mm256_min_epi16(u, max);
  return _mm256_max_epi16(capped, zero);
}
315 | | |
316 | | static inline void cfl_predict_hbd_avx2(const int16_t *pred_buf_q3, |
317 | | uint16_t *dst, int dst_stride, |
318 | | int alpha_q3, int bd, int width, |
319 | 538k | int height) { |
320 | | // Use SSSE3 version for smaller widths |
321 | 538k | assert(width == 16 || width == 32); |
322 | 538k | const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3); |
323 | 538k | const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9); |
324 | 538k | const __m256i dc_q0 = _mm256_loadu_si256((__m256i *)dst); |
325 | 538k | const __m256i max = highbd_max_epi16(bd); |
326 | | |
327 | 538k | __m256i *row = (__m256i *)pred_buf_q3; |
328 | 538k | const __m256i *row_end = row + height * CFL_BUF_LINE_I256; |
329 | 6.50M | do { |
330 | 6.50M | const __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); |
331 | 6.50M | _mm256_storeu_si256((__m256i *)dst, |
332 | 6.50M | highbd_clamp_epi16(res, _mm256_setzero_si256(), max)); |
333 | 6.50M | if (width == 32) { |
334 | 2.15M | const __m256i res_1 = |
335 | 2.15M | predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); |
336 | 2.15M | _mm256_storeu_si256( |
337 | 2.15M | (__m256i *)(dst + 16), |
338 | 2.15M | highbd_clamp_epi16(res_1, _mm256_setzero_si256(), max)); |
339 | 2.15M | } |
340 | 6.50M | dst += dst_stride; |
341 | 6.50M | } while ((row += CFL_BUF_LINE_I256) < row_end); |
342 | 538k | } |
343 | | |
344 | | CFL_PREDICT_X(avx2, 16, 4, hbd) |
345 | | CFL_PREDICT_X(avx2, 16, 8, hbd) |
346 | | CFL_PREDICT_X(avx2, 16, 16, hbd) |
347 | | CFL_PREDICT_X(avx2, 16, 32, hbd) |
348 | | CFL_PREDICT_X(avx2, 32, 8, hbd) |
349 | | CFL_PREDICT_X(avx2, 32, 16, hbd) |
350 | | CFL_PREDICT_X(avx2, 32, 32, hbd) |
351 | | |
352 | 1.57M | cfl_predict_hbd_fn cfl_get_predict_hbd_fn_avx2(TX_SIZE tx_size) { |
353 | 1.57M | static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = { |
354 | 1.57M | cfl_predict_hbd_4x4_ssse3, /* 4x4 */ |
355 | 1.57M | cfl_predict_hbd_8x8_ssse3, /* 8x8 */ |
356 | 1.57M | cfl_predict_hbd_16x16_avx2, /* 16x16 */ |
357 | 1.57M | cfl_predict_hbd_32x32_avx2, /* 32x32 */ |
358 | 1.57M | NULL, /* 64x64 (invalid CFL size) */ |
359 | 1.57M | cfl_predict_hbd_4x8_ssse3, /* 4x8 */ |
360 | 1.57M | cfl_predict_hbd_8x4_ssse3, /* 8x4 */ |
361 | 1.57M | cfl_predict_hbd_8x16_ssse3, /* 8x16 */ |
362 | 1.57M | cfl_predict_hbd_16x8_avx2, /* 16x8 */ |
363 | 1.57M | cfl_predict_hbd_16x32_avx2, /* 16x32 */ |
364 | 1.57M | cfl_predict_hbd_32x16_avx2, /* 32x16 */ |
365 | 1.57M | NULL, /* 32x64 (invalid CFL size) */ |
366 | 1.57M | NULL, /* 64x32 (invalid CFL size) */ |
367 | 1.57M | cfl_predict_hbd_4x16_ssse3, /* 4x16 */ |
368 | 1.57M | cfl_predict_hbd_16x4_avx2, /* 16x4 */ |
369 | 1.57M | cfl_predict_hbd_8x32_ssse3, /* 8x32 */ |
370 | 1.57M | cfl_predict_hbd_32x8_avx2, /* 32x8 */ |
371 | 1.57M | NULL, /* 16x64 (invalid CFL size) */ |
372 | 1.57M | NULL, /* 64x16 (invalid CFL size) */ |
373 | 1.57M | }; |
374 | | // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the |
375 | | // function pointer array out of bounds. |
376 | 1.57M | return pred[tx_size % TX_SIZES_ALL]; |
377 | 1.57M | } |
378 | | #endif // CONFIG_AV1_HIGHBITDEPTH |
379 | | |
// Horizontal reduction: returns a vector in which every 32-bit element equals
// the sum of all eight 32-bit elements of a.
static inline __m256i fill_sum_epi32(__m256i a) {
  // a        == [A, B, C, D | E, F, G, H]
  const __m256i pairs = _mm256_hadd_epi32(a, a);
  // pairs    == [A+B, C+D, A+B, C+D | E+F, G+H, E+F, G+H]
  const __m256i gathered =
      _mm256_permute4x64_epi64(pairs, _MM_SHUFFLE(3, 1, 2, 0));
  // gathered == [A+B, C+D, E+F, G+H | A+B, C+D, E+F, G+H]
  const __m256i halves = _mm256_hadd_epi32(gathered, gathered);
  // halves   == [A+B+C+D, E+F+G+H, ... repeated in every pair of elements]
  // One more horizontal add leaves the full total in every element.
  return _mm256_hadd_epi32(halves, halves);
}
396 | | |
// Widening pairwise add: zero-extends the sixteen 16-bit elements of a to
// 32 bits (by interleaving with zero) and adds the low-unpacked and
// high-unpacked halves, yielding eight 32-bit partial sums.
static inline __m256i _mm256_addl_epi16(__m256i a) {
  const __m256i zero = _mm256_setzero_si256();
  const __m256i widened_lo = _mm256_unpacklo_epi16(a, zero);
  const __m256i widened_hi = _mm256_unpackhi_epi16(a, zero);
  return _mm256_add_epi32(widened_lo, widened_hi);
}
401 | | |
402 | | static inline void subtract_average_avx2(const uint16_t *src_ptr, |
403 | | int16_t *dst_ptr, int width, |
404 | | int height, int round_offset, |
405 | 480k | int num_pel_log2) { |
406 | | // Use SSE2 version for smaller widths |
407 | 480k | assert(width == 16 || width == 32); |
408 | | |
409 | 480k | const __m256i *src = (__m256i *)src_ptr; |
410 | 480k | const __m256i *const end = src + height * CFL_BUF_LINE_I256; |
411 | | // To maximize usage of the AVX2 registers, we sum two rows per loop |
412 | | // iteration |
413 | 480k | const int step = 2 * CFL_BUF_LINE_I256; |
414 | | |
415 | 480k | __m256i sum = _mm256_setzero_si256(); |
416 | | // For width 32, we use a second sum accumulator to reduce accumulator |
417 | | // dependencies in the loop. |
418 | 480k | __m256i sum2; |
419 | 480k | if (width == 32) sum2 = _mm256_setzero_si256(); |
420 | | |
421 | 2.99M | do { |
422 | | // Add top row to the bottom row |
423 | 2.99M | __m256i l0 = _mm256_add_epi16(_mm256_loadu_si256(src), |
424 | 2.99M | _mm256_loadu_si256(src + CFL_BUF_LINE_I256)); |
425 | 2.99M | sum = _mm256_add_epi32(sum, _mm256_addl_epi16(l0)); |
426 | 2.99M | if (width == 32) { /* Don't worry, this if it gets optimized out. */ |
427 | | // Add the second part of the top row to the second part of the bottom row |
428 | 968k | __m256i l1 = |
429 | 968k | _mm256_add_epi16(_mm256_loadu_si256(src + 1), |
430 | 968k | _mm256_loadu_si256(src + 1 + CFL_BUF_LINE_I256)); |
431 | 968k | sum2 = _mm256_add_epi32(sum2, _mm256_addl_epi16(l1)); |
432 | 968k | } |
433 | 2.99M | src += step; |
434 | 2.99M | } while (src < end); |
435 | | // Combine both sum accumulators |
436 | 480k | if (width == 32) sum = _mm256_add_epi32(sum, sum2); |
437 | | |
438 | 480k | __m256i fill = fill_sum_epi32(sum); |
439 | | |
440 | 480k | __m256i avg_epi16 = _mm256_srli_epi32( |
441 | 480k | _mm256_add_epi32(fill, _mm256_set1_epi32(round_offset)), num_pel_log2); |
442 | 480k | avg_epi16 = _mm256_packs_epi32(avg_epi16, avg_epi16); |
443 | | |
444 | | // Store and subtract loop |
445 | 480k | src = (__m256i *)src_ptr; |
446 | 480k | __m256i *dst = (__m256i *)dst_ptr; |
447 | 5.99M | do { |
448 | 5.99M | _mm256_storeu_si256(dst, |
449 | 5.99M | _mm256_sub_epi16(_mm256_loadu_si256(src), avg_epi16)); |
450 | 5.99M | if (width == 32) { |
451 | 1.93M | _mm256_storeu_si256( |
452 | 1.93M | dst + 1, _mm256_sub_epi16(_mm256_loadu_si256(src + 1), avg_epi16)); |
453 | 1.93M | } |
454 | 5.99M | src += CFL_BUF_LINE_I256; |
455 | 5.99M | dst += CFL_BUF_LINE_I256; |
456 | 5.99M | } while (src < end); |
457 | 480k | } |
458 | | |
459 | | // Declare wrappers for AVX2 sizes |
460 | | CFL_SUB_AVG_X(avx2, 16, 4, 32, 6) |
461 | | CFL_SUB_AVG_X(avx2, 16, 8, 64, 7) |
462 | | CFL_SUB_AVG_X(avx2, 16, 16, 128, 8) |
463 | | CFL_SUB_AVG_X(avx2, 16, 32, 256, 9) |
464 | | CFL_SUB_AVG_X(avx2, 32, 8, 128, 8) |
465 | | CFL_SUB_AVG_X(avx2, 32, 16, 256, 9) |
466 | | CFL_SUB_AVG_X(avx2, 32, 32, 512, 10) |
467 | | |
468 | | // Based on the observation that for small blocks AVX2 does not outperform |
469 | | // SSE2, we call the SSE2 code for block widths 4 and 8. |
470 | 1.48M | cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size) { |
471 | 1.48M | static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { |
472 | 1.48M | cfl_subtract_average_4x4_sse2, /* 4x4 */ |
473 | 1.48M | cfl_subtract_average_8x8_sse2, /* 8x8 */ |
474 | 1.48M | cfl_subtract_average_16x16_avx2, /* 16x16 */ |
475 | 1.48M | cfl_subtract_average_32x32_avx2, /* 32x32 */ |
476 | 1.48M | NULL, /* 64x64 (invalid CFL size) */ |
477 | 1.48M | cfl_subtract_average_4x8_sse2, /* 4x8 */ |
478 | 1.48M | cfl_subtract_average_8x4_sse2, /* 8x4 */ |
479 | 1.48M | cfl_subtract_average_8x16_sse2, /* 8x16 */ |
480 | 1.48M | cfl_subtract_average_16x8_avx2, /* 16x8 */ |
481 | 1.48M | cfl_subtract_average_16x32_avx2, /* 16x32 */ |
482 | 1.48M | cfl_subtract_average_32x16_avx2, /* 32x16 */ |
483 | 1.48M | NULL, /* 32x64 (invalid CFL size) */ |
484 | 1.48M | NULL, /* 64x32 (invalid CFL size) */ |
485 | 1.48M | cfl_subtract_average_4x16_sse2, /* 4x16 */ |
486 | 1.48M | cfl_subtract_average_16x4_avx2, /* 16x4 */ |
487 | 1.48M | cfl_subtract_average_8x32_sse2, /* 8x32 */ |
488 | 1.48M | cfl_subtract_average_32x8_avx2, /* 32x8 */ |
489 | 1.48M | NULL, /* 16x64 (invalid CFL size) */ |
490 | 1.48M | NULL, /* 64x16 (invalid CFL size) */ |
491 | 1.48M | }; |
492 | | // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to |
493 | | // index the function pointer array out of bounds. |
494 | 1.48M | return sub_avg[tx_size % TX_SIZES_ALL]; |
495 | 1.48M | } |