/src/libvpx/vpx_dsp/x86/convolve_avx2.h
/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_
#define VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_

#include <immintrin.h>  // AVX2

#include "./vpx_config.h"

#if defined(__clang__)
#if (__clang_major__ > 0 && __clang_major__ < 3) ||            \
    (__clang_major__ == 3 && __clang_minor__ <= 3) ||          \
    (defined(__APPLE__) && defined(__apple_build_version__) && \
     ((__clang_major__ == 4 && __clang_minor__ <= 2) ||        \
      (__clang_major__ == 5 && __clang_minor__ == 0)))
#define MM256_BROADCASTSI128_SI256(x) \
  _mm_broadcastsi128_si256((__m128i const *)&(x))
#else  // clang > 3.3, and not 5.0 on macosx.
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // clang <= 3.3
#elif defined(__GNUC__)
#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
#define MM256_BROADCASTSI128_SI256(x) \
  _mm_broadcastsi128_si256((__m128i const *)&(x))
#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
#else  // gcc > 4.7
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // gcc <= 4.6
#else   // !(gcc || clang)
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // __clang__
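
// A minimal usage sketch (illustrative only, not part of the original
// header): broadcast one 128-bit row into both halves of a 256-bit register
// so the same data is available in each lane. `src` is a hypothetical
// pointer to readable memory.
//
//   __m128i row = _mm_loadu_si128((const __m128i *)src);
//   __m256i both = MM256_BROADCASTSI128_SI256(row);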

static INLINE void shuffle_filter_avx2(const int16_t *const filter,
                                       __m256i *const f) {
  const __m256i f_values =
      MM256_BROADCASTSI128_SI256(_mm_load_si128((const __m128i *)filter));
  // pack and duplicate the filter values
  f[0] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0200u));
  f[1] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0604u));
  f[2] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0a08u));
  f[3] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0e0cu));
}
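
// Layout note (an informal sketch, not authoritative): each 16-bit tap of an
// 8-tap subpel filter fits in a signed byte, so the shuffles above keep only
// the low byte of every tap and replicate adjacent tap pairs across the
// whole register:
//
//   f[0] = { filter[0], filter[1], filter[0], filter[1], ... }  // as bytes
//   f[1] = { filter[2], filter[3], ... }
//   f[2] = { filter[4], filter[5], ... }
//   f[3] = { filter[6], filter[7], ... }
//
// This is the operand layout _mm256_maddubs_epi16() consumes in
// convolve8_16_avx2() below.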

static INLINE __m256i convolve8_16_avx2(const __m256i *const s,
                                        const __m256i *const f) {
  // multiply 2 adjacent elements with the filter and add the result
  const __m256i k_64 = _mm256_set1_epi16(1 << 6);
  const __m256i x0 = _mm256_maddubs_epi16(s[0], f[0]);
  const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]);
  const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]);
  const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]);
  __m256i sum1, sum2;

  // sum the results together, saturating only on the final step;
  // adding x0 to x2 and x1 to x3 is the only order that avoids
  // overflow for all supported filters
  sum1 = _mm256_add_epi16(x0, x2);
  sum2 = _mm256_add_epi16(x1, x3);
  // add the rounding offset early to avoid another saturating add
  sum1 = _mm256_add_epi16(sum1, k_64);
  sum1 = _mm256_adds_epi16(sum1, sum2);
  // round and shift right by 7 bits in each 16-bit lane
  sum1 = _mm256_srai_epi16(sum1, 7);
  return sum1;
}
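
// Scalar reference for one output pixel (a sketch; the names below are
// illustrative, and FILTER_BITS == 7 in libvpx):
//
//   int sum = 0;
//   for (int k = 0; k < 8; ++k) sum += src[k] * filter[k];
//   int res = (sum + 64) >> 7;  // 64 == 1 << (7 - 1)
//
// The vector code reproduces this in 16-bit lanes: the two plain adds cannot
// overflow for valid filters, and the single saturating add clamps the worst
// case instead of wrapping.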

// 128-bit variant of convolve8_16_avx2(): the same arithmetic applied to the
// low lane of each input.
static INLINE __m128i convolve8_8_avx2(const __m256i *const s,
                                       const __m256i *const f) {
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i k_64 = _mm_set1_epi16(1 << 6);
  const __m128i x0 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[0]),
                                       _mm256_castsi256_si128(f[0]));
  const __m128i x1 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[1]),
                                       _mm256_castsi256_si128(f[1]));
  const __m128i x2 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[2]),
                                       _mm256_castsi256_si128(f[2]));
  const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]),
                                       _mm256_castsi256_si128(f[3]));
  __m128i sum1, sum2;

  // sum the results together, saturating only on the final step;
  // adding x0 to x2 and x1 to x3 is the only order that avoids
  // overflow for all supported filters
  sum1 = _mm_add_epi16(x0, x2);
  sum2 = _mm_add_epi16(x1, x3);
  // add the rounding offset early to avoid another saturating add
  sum1 = _mm_add_epi16(sum1, k_64);
  sum1 = _mm_adds_epi16(sum1, sum2);
  // round and shift right by 7 bits in each 16-bit lane
  sum1 = _mm_srai_epi16(sum1, 7);
  return sum1;
}

static INLINE __m256i mm256_loadu2_si128(const void *lo, const void *hi) {
  const __m256i tmp =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)lo));
  return _mm256_inserti128_si256(tmp, _mm_loadu_si128((const __m128i *)hi), 1);
}
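
// Usage sketch (illustrative; `src` and `src_stride` are hypothetical): load
// the same 16-byte window from two consecutive rows, one row per 128-bit
// lane, so both rows can be filtered by a single 256-bit operation:
//
//   const __m256i rows = mm256_loadu2_si128(src, src + src_stride);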

static INLINE __m256i mm256_loadu2_epi64(const void *lo, const void *hi) {
  const __m256i tmp =
      _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)lo));
  return _mm256_inserti128_si256(tmp, _mm_loadl_epi64((const __m128i *)hi), 1);
}
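
// Same two-row pattern as above, but each lane receives only 8 bytes;
// _mm_loadl_epi64() zeroes the upper 64 bits of each lane. Sketch with the
// same hypothetical names:
//
//   const __m256i rows8 = mm256_loadu2_epi64(src, src + src_stride);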

static INLINE void mm256_store2_si128(__m128i *const dst_ptr_1,
                                      __m128i *const dst_ptr_2,
                                      const __m256i *const src) {
  _mm_store_si128(dst_ptr_1, _mm256_castsi256_si128(*src));
  _mm_store_si128(dst_ptr_2, _mm256_extractf128_si256(*src, 1));
}
121 | | |
122 | | static INLINE void mm256_storeu2_epi64(__m128i *const dst_ptr_1, |
123 | | __m128i *const dst_ptr_2, |
124 | 142M | const __m256i *const src) { |
125 | 142M | _mm_storel_epi64(dst_ptr_1, _mm256_castsi256_si128(*src)); |
126 | 142M | _mm_storel_epi64(dst_ptr_2, _mm256_extractf128_si256(*src, 1)); |
127 | 142M | } vpx_subpixel_8t_intrin_avx2.c:mm256_storeu2_epi64 Line | Count | Source | 124 | 142M | const __m256i *const src) { | 125 | 142M | _mm_storel_epi64(dst_ptr_1, _mm256_castsi256_si128(*src)); | 126 | 142M | _mm_storel_epi64(dst_ptr_2, _mm256_extractf128_si256(*src, 1)); | 127 | 142M | } |
128 | | |
129 | | static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1, |
130 | | __m128i *const dst_ptr_2, |
131 | 340M | const __m256i *const src) { |
132 | 340M | *((int *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src)); |
133 | 340M | *((int *)(dst_ptr_2)) = _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1)); |
134 | 340M | } vpx_subpixel_8t_intrin_avx2.c:mm256_storeu2_epi32 Line | Count | Source | 131 | 340M | const __m256i *const src) { | 132 | 340M | *((int *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src)); | 133 | 340M | *((int *)(dst_ptr_2)) = _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1)); | 134 | 340M | } |
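
// Store-side counterpart of the two-row loads above: the kernels produce two
// output rows per 256-bit result, so each helper splits the register back
// into its 128-bit lanes, writing 16, 8, or 4 bytes per row respectively
// (note that mm256_store2_si128() requires 16-byte-aligned destinations).
// Sketch with hypothetical `dst`/`dst_stride`:
//
//   mm256_storeu2_epi64((__m128i *)dst, (__m128i *)(dst + dst_stride), &out);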

static INLINE __m256i mm256_round_epi32(const __m256i *const src,
                                        const __m256i *const half_depth,
                                        const int depth) {
  const __m256i nearest_src = _mm256_add_epi32(*src, *half_depth);
  return _mm256_srai_epi32(nearest_src, depth);
}
142 | | |
143 | | static INLINE __m256i mm256_round_epi16(const __m256i *const src, |
144 | | const __m256i *const half_depth, |
145 | 732M | const int depth) { |
146 | 732M | const __m256i nearest_src = _mm256_adds_epi16(*src, *half_depth); |
147 | 732M | return _mm256_srai_epi16(nearest_src, depth); |
148 | 732M | } vpx_subpixel_8t_intrin_avx2.c:mm256_round_epi16 Line | Count | Source | 145 | 732M | const int depth) { | 146 | 732M | const __m256i nearest_src = _mm256_adds_epi16(*src, *half_depth); | 147 | 732M | return _mm256_srai_epi16(nearest_src, depth); | 148 | 732M | } |
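
// Both rounding helpers compute round-to-nearest for a right shift:
// *half_depth must hold 1 << (depth - 1) in every lane, giving
// res = (src + half) >> depth. Worked example (illustrative): depth == 7,
// src == 1000 -> (1000 + 64) >> 7 == 8, i.e. round(1000 / 128). The epi16
// variant saturates the add to guard its narrower intermediate; the epi32
// variant has enough headroom for a plain add.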

static INLINE __m256i mm256_madd_add_epi32(const __m256i *const src_0,
                                           const __m256i *const src_1,
                                           const __m256i *const ker_0,
                                           const __m256i *const ker_1) {
  const __m256i tmp_0 = _mm256_madd_epi16(*src_0, *ker_0);
  const __m256i tmp_1 = _mm256_madd_epi16(*src_1, *ker_1);
  return _mm256_add_epi32(tmp_0, tmp_1);
}
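
// Written out per 32-bit lane i (a scalar sketch; the indexing below treats
// each __m256i as an array of 16-bit values):
//
//   out[i] = src_0[2 * i] * ker_0[2 * i] + src_0[2 * i + 1] * ker_0[2 * i + 1]
//          + src_1[2 * i] * ker_1[2 * i] + src_1[2 * i + 1] * ker_1[2 * i + 1];
//
// i.e. four filter taps accumulated at 32-bit precision, suited to the
// higher-precision paths (this header is also included by
// highbd_convolve_avx2.c).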

#undef MM256_BROADCASTSI128_SI256

#endif  // VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_