/src/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #ifndef VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_ |
12 | | #define VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_ |
13 | | |
14 | | #ifdef __cplusplus |
15 | | extern "C" { |
16 | | #endif |
17 | | |
// Builds a vector of four 32-bit lanes laid out, from lane 0 upwards, as
// a, b, a, b. Each argument is cast to int and is evaluated twice, so avoid
// passing expressions with side effects.
#define pair_set_epi32(a, b) \
  _mm_setr_epi32((int)(a), (int)(b), (int)(a), (int)(b))
20 | | |
// Multiplies the 32-bit lanes of |a| and |b| pairwise (as unsigned values, via
// _mm_mul_epu32) and sums each adjacent pair of 64-bit products:
//   result.lane64[0] = a[0] * b[0] + a[1] * b[1]
//   result.lane64[1] = a[2] * b[2] + a[3] * b[3]
static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
  // Products of the even-indexed 32-bit lanes (0 and 2).
  const __m128i even_products = _mm_mul_epu32(a, b);
  // Shifting each 64-bit lane right by 32 moves the odd-indexed 32-bit lanes
  // (1 and 3) into the positions _mm_mul_epu32 reads.
  const __m128i odd_products =
      _mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32));
  return _mm_add_epi64(even_products, odd_products);
}
29 | | |
// Keeps the low 32 bits of each 64-bit lane of |a| and |b| and packs them into
// one vector laid out as { a[0], a[2], b[0], b[2] } (32-bit lane indices).
// No saturation is applied; the upper halves are simply discarded.
static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
  // The shuffle places 32-bit lanes 0 and 2 in the low 64 bits; unpacklo then
  // concatenates the low 64 bits of both shuffled vectors.
  return _mm_unpacklo_epi64(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)),
                            _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)));
}
35 | | |
// Returns a non-zero byte mask if any 16-bit lane of *preg0 or *preg1 equals
// 0x7fff or 0x8000 (the two extreme int16 values), and 0 otherwise.
// Bit i of the result corresponds to byte i of the combined comparison mask.
static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
                                          const __m128i *preg1) {
  const __m128i int16_max = _mm_set1_epi16(0x7fff);
  const __m128i int16_min = _mm_set1_epi16((short)0x8000);
  // Lanes of either register sitting at the maximum value.
  const __m128i at_max = _mm_or_si128(_mm_cmpeq_epi16(*preg0, int16_max),
                                      _mm_cmpeq_epi16(*preg1, int16_max));
  // Lanes of either register sitting at the minimum value.
  const __m128i at_min = _mm_or_si128(_mm_cmpeq_epi16(*preg0, int16_min),
                                      _mm_cmpeq_epi16(*preg1, int16_min));
  return _mm_movemask_epi8(_mm_or_si128(at_max, at_min));
}
47 | | |
// Returns a non-zero byte mask if any 16-bit lane of the four registers equals
// 0x7fff or 0x8000 (the two extreme int16 values), and 0 otherwise.
// The per-register comparison masks are OR-ed together before the byte mask is
// extracted, so the result does not identify which register matched.
static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
                                          const __m128i *preg1,
                                          const __m128i *preg2,
                                          const __m128i *preg3) {
  const __m128i int16_max = _mm_set1_epi16(0x7fff);
  const __m128i int16_min = _mm_set1_epi16((short)0x8000);
  const __m128i *const regs[4] = { preg0, preg1, preg2, preg3 };
  __m128i hits = _mm_setzero_si128();
  int i;
  for (i = 0; i < 4; ++i) {
    const __m128i at_limit =
        _mm_or_si128(_mm_cmpeq_epi16(*regs[i], int16_max),
                     _mm_cmpeq_epi16(*regs[i], int16_min));
    hits = _mm_or_si128(hits, at_limit);
  }
  return _mm_movemask_epi8(hits);
}
65 | | |
// Runs check_epi16_overflow_x4 on registers 0-3 and on registers 4-7 and
// returns the sum of the two results; the sum is non-zero if either group
// reported a lane at an int16 extreme. Both groups are always checked.
static INLINE int check_epi16_overflow_x8(
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
    const __m128i *preg6, const __m128i *preg7) {
  const int lower_group = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  const int upper_group = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
  return lower_group + upper_group;
}
75 | | |
// Checks twelve registers in groups of four with check_epi16_overflow_x4 and
// returns a non-zero value if any group reported a lane at an int16 extreme.
// Registers 8-11 are only examined when registers 0-3 came back clean.
static INLINE int check_epi16_overflow_x12(
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) {
  const int group0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  const int group1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
  const int group0_or_2 =
      (group0 != 0) ? group0
                    : check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
  return group0_or_2 + group1;
}
87 | | |
// Checks sixteen registers in groups of four with check_epi16_overflow_x4 and
// returns a non-zero value if an examined group reported a lane at an int16
// extreme. Registers 8-15 are skipped entirely once registers 0-3 have
// reported a hit, and registers 12-15 are skipped once registers 4-7 have.
static INLINE int check_epi16_overflow_x16(
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
    const __m128i *preg15) {
  int even_groups = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  int odd_groups = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
  // A hit in the first group ends the scan early.
  if (even_groups != 0) return even_groups + odd_groups;

  even_groups = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
  if (odd_groups == 0) {
    odd_groups = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
  }
  return even_groups + odd_groups;
}
104 | | |
// Checks thirty-two registers in groups of four with check_epi16_overflow_x4
// and returns a non-zero value if an examined group reported a lane at an
// int16 extreme. Two accumulators are used alternately (groups 0, 2, 4, 6 and
// groups 1, 3, 5, 7); after the first two groups, the scan stops as soon as the
// accumulator about to be refreshed already holds a hit.
static INLINE int check_epi16_overflow_x32(
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
    const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
    const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
    const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
    const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
    const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
    const __m128i *preg30, const __m128i *preg31) {
  int even_groups = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  int odd_groups = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
  // Single-pass block: each `break` is an early exit to the common return.
  do {
    if (even_groups != 0) break;
    even_groups = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
    if (odd_groups != 0) break;
    odd_groups = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
    if (even_groups != 0) break;
    even_groups = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
    if (odd_groups != 0) break;
    odd_groups = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
    if (even_groups != 0) break;
    even_groups = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
    if (odd_groups != 0) break;
    odd_groups = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
  } while (0);
  return even_groups + odd_groups;
}
139 | | |
// Treats each of the four registers as two 64-bit lanes and reports whether
// any lane's bits 62..31 are neither all zero nor all one, i.e. the value
// would not survive truncation to a signed 32-bit integer.
// Returns 0 when every lane passes and a non-zero byte-mask sum otherwise.
// |zero| is compared against directly; presumably callers pass a pointer to an
// all-zero vector (confirm against callers).
static INLINE int k_check_epi32_overflow_4(const __m128i *preg0,
                                           const __m128i *preg1,
                                           const __m128i *preg2,
                                           const __m128i *preg3,
                                           const __m128i *zero) {
  const __m128i all_ones = _mm_set1_epi32(-1);
  // Shifting each 64-bit lane left by one pulls bit 31 into the upper dword,
  // so the upper dword then holds original bits 62..31. The shuffle gathers
  // the two upper dwords (32-bit lanes 1 and 3) into the low 64 bits.
  const __m128i upper0 =
      _mm_shuffle_epi32(_mm_slli_epi64(*preg0, 1), _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i upper1 =
      _mm_shuffle_epi32(_mm_slli_epi64(*preg1, 1), _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i upper2 =
      _mm_shuffle_epi32(_mm_slli_epi64(*preg2, 1), _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i upper3 =
      _mm_shuffle_epi32(_mm_slli_epi64(*preg3, 1), _MM_SHUFFLE(0, 0, 3, 1));
  // Pack the gathered dwords of two registers into one vector each.
  const __m128i upper_01 = _mm_unpacklo_epi64(upper0, upper1);
  const __m128i upper_23 = _mm_unpacklo_epi64(upper2, upper3);
  // A lane is fine if its gathered dword equals *zero or equals -1.
  const __m128i fits_nonneg_01 = _mm_cmpeq_epi32(upper_01, *zero);
  const __m128i fits_neg_01 = _mm_cmpeq_epi32(upper_01, all_ones);
  const __m128i fits_nonneg_23 = _mm_cmpeq_epi32(upper_23, *zero);
  const __m128i fits_neg_23 = _mm_cmpeq_epi32(upper_23, all_ones);
  // The two "fits" masks compare equal only where both comparisons produced
  // the same result; with *zero all-zero that means neither matched.
  const int bad_01 =
      _mm_movemask_epi8(_mm_cmpeq_epi32(fits_nonneg_01, fits_neg_01));
  const int bad_23 =
      _mm_movemask_epi8(_mm_cmpeq_epi32(fits_nonneg_23, fits_neg_23));
  return bad_01 + bad_23;
}
171 | | |
// Applies k_check_epi32_overflow_4 to registers 0-3 and, only if that reports
// nothing, to registers 4-7. Returns the first non-zero result, or 0 when both
// groups pass.
static INLINE int k_check_epi32_overflow_8(
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
    const __m128i *preg6, const __m128i *preg7, const __m128i *zero) {
  const int first_group =
      k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
  if (first_group != 0) return first_group;
  return k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
}
182 | | |
// Applies k_check_epi32_overflow_4 to sixteen registers, four at a time and in
// order, stopping at the first group that reports a problem. Returns that
// group's non-zero result, or 0 when all four groups pass.
static INLINE int k_check_epi32_overflow_16(
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
    const __m128i *preg15, const __m128i *zero) {
  int status = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
  if (status != 0) return status;

  status = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
  if (status != 0) return status;

  status = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
  if (status != 0) return status;

  return k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
}
203 | | |
// Applies k_check_epi32_overflow_4 to thirty-two registers, four at a time and
// in order, stopping at the first group that reports a problem. Returns that
// group's non-zero result, or 0 when all eight groups pass.
static INLINE int k_check_epi32_overflow_32(
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
    const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
    const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
    const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
    const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
    const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
    const __m128i *preg30, const __m128i *preg31, const __m128i *zero) {
  int status = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
  if (status != 0) return status;

  status = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
  if (status != 0) return status;

  status = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
  if (status != 0) return status;

  status = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
  if (status != 0) return status;

  status = k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, zero);
  if (status != 0) return status;

  status = k_check_epi32_overflow_4(preg20, preg21, preg22, preg23, zero);
  if (status != 0) return status;

  status = k_check_epi32_overflow_4(preg24, preg25, preg26, preg27, zero);
  if (status != 0) return status;

  return k_check_epi32_overflow_4(preg28, preg29, preg30, preg31, zero);
}
245 | | |
246 | 1.07G | static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { |
247 | 1.07G | #if CONFIG_VP9_HIGHBITDEPTH |
248 | 1.07G | const __m128i zero = _mm_setzero_si128(); |
249 | 1.07G | const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); |
250 | 1.07G | __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); |
251 | 1.07G | __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); |
252 | 1.07G | _mm_store_si128((__m128i *)(dst_ptr), out0); |
253 | 1.07G | _mm_store_si128((__m128i *)(dst_ptr + 4), out1); |
254 | | #else |
255 | | _mm_store_si128((__m128i *)(dst_ptr), *poutput); |
256 | | #endif // CONFIG_VP9_HIGHBITDEPTH |
257 | 1.07G | } fwd_txfm_sse2.c:store_output Line | Count | Source | 246 | 348M | static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { | 247 | 348M | #if CONFIG_VP9_HIGHBITDEPTH | 248 | 348M | const __m128i zero = _mm_setzero_si128(); | 249 | 348M | const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); | 250 | 348M | __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); | 251 | 348M | __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); | 252 | 348M | _mm_store_si128((__m128i *)(dst_ptr), out0); | 253 | 348M | _mm_store_si128((__m128i *)(dst_ptr + 4), out1); | 254 | | #else | 255 | | _mm_store_si128((__m128i *)(dst_ptr), *poutput); | 256 | | #endif // CONFIG_VP9_HIGHBITDEPTH | 257 | 348M | } |
vp9_dct_intrin_sse2.c:store_output Line | Count | Source | 246 | 722M | static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { | 247 | 722M | #if CONFIG_VP9_HIGHBITDEPTH | 248 | 722M | const __m128i zero = _mm_setzero_si128(); | 249 | 722M | const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); | 250 | 722M | __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); | 251 | 722M | __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); | 252 | 722M | _mm_store_si128((__m128i *)(dst_ptr), out0); | 253 | 722M | _mm_store_si128((__m128i *)(dst_ptr + 4), out1); | 254 | | #else | 255 | | _mm_store_si128((__m128i *)(dst_ptr), *poutput); | 256 | | #endif // CONFIG_VP9_HIGHBITDEPTH | 257 | 722M | } |
|
258 | | |
259 | 1.30G | static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { |
260 | 1.30G | #if CONFIG_VP9_HIGHBITDEPTH |
261 | 1.30G | const __m128i zero = _mm_setzero_si128(); |
262 | 1.30G | const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); |
263 | 1.30G | __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); |
264 | 1.30G | __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); |
265 | 1.30G | _mm_storeu_si128((__m128i *)(dst_ptr), out0); |
266 | 1.30G | _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); |
267 | | #else |
268 | | _mm_storeu_si128((__m128i *)(dst_ptr), *poutput); |
269 | | #endif // CONFIG_VP9_HIGHBITDEPTH |
270 | 1.30G | } fwd_txfm_sse2.c:storeu_output Line | Count | Source | 259 | 1.30G | static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { | 260 | 1.30G | #if CONFIG_VP9_HIGHBITDEPTH | 261 | 1.30G | const __m128i zero = _mm_setzero_si128(); | 262 | 1.30G | const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); | 263 | 1.30G | __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); | 264 | 1.30G | __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); | 265 | 1.30G | _mm_storeu_si128((__m128i *)(dst_ptr), out0); | 266 | 1.30G | _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); | 267 | | #else | 268 | | _mm_storeu_si128((__m128i *)(dst_ptr), *poutput); | 269 | | #endif // CONFIG_VP9_HIGHBITDEPTH | 270 | 1.30G | } |
Unexecuted instantiation: vp9_dct_intrin_sse2.c:storeu_output |
271 | | |
// For each of the two inputs, multiplies adjacent 16-bit pairs by the matching
// pairs in *pmultiplier and adds them (_mm_madd_epi16), adds *prounding to the
// 32-bit sums, and arithmetic-shifts right by |shift|. The eight 32-bit results
// are packed to 16 bits with signed saturation: the low four lanes come from
// *pin0 and the high four from *pin1.
static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
                                       const __m128i *pmultiplier,
                                       const __m128i *prounding,
                                       const int shift) {
  const __m128i rounded0 =
      _mm_add_epi32(_mm_madd_epi16(*pin0, *pmultiplier), *prounding);
  const __m128i rounded1 =
      _mm_add_epi32(_mm_madd_epi16(*pin1, *pmultiplier), *prounding);
  return _mm_packs_epi32(_mm_srai_epi32(rounded0, shift),
                         _mm_srai_epi32(rounded1, shift));
}
284 | | |
285 | | static INLINE void transpose_and_output8x8( |
286 | | const __m128i *pin00, const __m128i *pin01, const __m128i *pin02, |
287 | | const __m128i *pin03, const __m128i *pin04, const __m128i *pin05, |
288 | | const __m128i *pin06, const __m128i *pin07, const int pass, |
289 | 68.7M | int16_t *out0_ptr, tran_low_t *out1_ptr) { |
290 | | // 00 01 02 03 04 05 06 07 |
291 | | // 10 11 12 13 14 15 16 17 |
292 | | // 20 21 22 23 24 25 26 27 |
293 | | // 30 31 32 33 34 35 36 37 |
294 | | // 40 41 42 43 44 45 46 47 |
295 | | // 50 51 52 53 54 55 56 57 |
296 | | // 60 61 62 63 64 65 66 67 |
297 | | // 70 71 72 73 74 75 76 77 |
298 | 68.7M | const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01); |
299 | 68.7M | const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03); |
300 | 68.7M | const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01); |
301 | 68.7M | const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03); |
302 | 68.7M | const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05); |
303 | 68.7M | const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07); |
304 | 68.7M | const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05); |
305 | 68.7M | const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07); |
306 | | // 00 10 01 11 02 12 03 13 |
307 | | // 20 30 21 31 22 32 23 33 |
308 | | // 04 14 05 15 06 16 07 17 |
309 | | // 24 34 25 35 26 36 27 37 |
310 | | // 40 50 41 51 42 52 43 53 |
311 | | // 60 70 61 71 62 72 63 73 |
312 | | // 54 54 55 55 56 56 57 57 |
313 | | // 64 74 65 75 66 76 67 77 |
314 | 68.7M | const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); |
315 | 68.7M | const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); |
316 | 68.7M | const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); |
317 | 68.7M | const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); |
318 | 68.7M | const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); |
319 | 68.7M | const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); |
320 | 68.7M | const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); |
321 | 68.7M | const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); |
322 | | // 00 10 20 30 01 11 21 31 |
323 | | // 40 50 60 70 41 51 61 71 |
324 | | // 02 12 22 32 03 13 23 33 |
325 | | // 42 52 62 72 43 53 63 73 |
326 | | // 04 14 24 34 05 15 21 36 |
327 | | // 44 54 64 74 45 55 61 76 |
328 | | // 06 16 26 36 07 17 27 37 |
329 | | // 46 56 66 76 47 57 67 77 |
330 | 68.7M | const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); |
331 | 68.7M | const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); |
332 | 68.7M | const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); |
333 | 68.7M | const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); |
334 | 68.7M | const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); |
335 | 68.7M | const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); |
336 | 68.7M | const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); |
337 | 68.7M | const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); |
338 | | // 00 10 20 30 40 50 60 70 |
339 | | // 01 11 21 31 41 51 61 71 |
340 | | // 02 12 22 32 42 52 62 72 |
341 | | // 03 13 23 33 43 53 63 73 |
342 | | // 04 14 24 34 44 54 64 74 |
343 | | // 05 15 25 35 45 55 65 75 |
344 | | // 06 16 26 36 46 56 66 76 |
345 | | // 07 17 27 37 47 57 67 77 |
346 | 68.7M | if (pass == 0) { |
347 | 34.3M | _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0); |
348 | 34.3M | _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1); |
349 | 34.3M | _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2); |
350 | 34.3M | _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3); |
351 | 34.3M | _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4); |
352 | 34.3M | _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5); |
353 | 34.3M | _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6); |
354 | 34.3M | _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7); |
355 | 34.3M | } else { |
356 | 34.3M | storeu_output(&tr2_0, (out1_ptr + 0 * 16)); |
357 | 34.3M | storeu_output(&tr2_1, (out1_ptr + 1 * 16)); |
358 | 34.3M | storeu_output(&tr2_2, (out1_ptr + 2 * 16)); |
359 | 34.3M | storeu_output(&tr2_3, (out1_ptr + 3 * 16)); |
360 | 34.3M | storeu_output(&tr2_4, (out1_ptr + 4 * 16)); |
361 | 34.3M | storeu_output(&tr2_5, (out1_ptr + 5 * 16)); |
362 | 34.3M | storeu_output(&tr2_6, (out1_ptr + 6 * 16)); |
363 | 34.3M | storeu_output(&tr2_7, (out1_ptr + 7 * 16)); |
364 | 34.3M | } |
365 | 68.7M | } fwd_txfm_sse2.c:transpose_and_output8x8 Line | Count | Source | 289 | 68.7M | int16_t *out0_ptr, tran_low_t *out1_ptr) { | 290 | | // 00 01 02 03 04 05 06 07 | 291 | | // 10 11 12 13 14 15 16 17 | 292 | | // 20 21 22 23 24 25 26 27 | 293 | | // 30 31 32 33 34 35 36 37 | 294 | | // 40 41 42 43 44 45 46 47 | 295 | | // 50 51 52 53 54 55 56 57 | 296 | | // 60 61 62 63 64 65 66 67 | 297 | | // 70 71 72 73 74 75 76 77 | 298 | 68.7M | const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01); | 299 | 68.7M | const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03); | 300 | 68.7M | const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01); | 301 | 68.7M | const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03); | 302 | 68.7M | const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05); | 303 | 68.7M | const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07); | 304 | 68.7M | const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05); | 305 | 68.7M | const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07); | 306 | | // 00 10 01 11 02 12 03 13 | 307 | | // 20 30 21 31 22 32 23 33 | 308 | | // 04 14 05 15 06 16 07 17 | 309 | | // 24 34 25 35 26 36 27 37 | 310 | | // 40 50 41 51 42 52 43 53 | 311 | | // 60 70 61 71 62 72 63 73 | 312 | | // 54 54 55 55 56 56 57 57 | 313 | | // 64 74 65 75 66 76 67 77 | 314 | 68.7M | const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); | 315 | 68.7M | const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); | 316 | 68.7M | const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); | 317 | 68.7M | const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); | 318 | 68.7M | const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); | 319 | 68.7M | const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); | 320 | 68.7M | const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); | 321 | 68.7M | const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); | 322 | | // 00 10 20 30 01 11 21 31 | 323 | | // 40 50 60 70 41 51 61 71 | 324 | | // 02 12 22 32 03 
13 23 33 | 325 | | // 42 52 62 72 43 53 63 73 | 326 | | // 04 14 24 34 05 15 21 36 | 327 | | // 44 54 64 74 45 55 61 76 | 328 | | // 06 16 26 36 07 17 27 37 | 329 | | // 46 56 66 76 47 57 67 77 | 330 | 68.7M | const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); | 331 | 68.7M | const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); | 332 | 68.7M | const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); | 333 | 68.7M | const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); | 334 | 68.7M | const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); | 335 | 68.7M | const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); | 336 | 68.7M | const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); | 337 | 68.7M | const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); | 338 | | // 00 10 20 30 40 50 60 70 | 339 | | // 01 11 21 31 41 51 61 71 | 340 | | // 02 12 22 32 42 52 62 72 | 341 | | // 03 13 23 33 43 53 63 73 | 342 | | // 04 14 24 34 44 54 64 74 | 343 | | // 05 15 25 35 45 55 65 75 | 344 | | // 06 16 26 36 46 56 66 76 | 345 | | // 07 17 27 37 47 57 67 77 | 346 | 68.7M | if (pass == 0) { | 347 | 34.3M | _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0); | 348 | 34.3M | _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1); | 349 | 34.3M | _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2); | 350 | 34.3M | _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3); | 351 | 34.3M | _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4); | 352 | 34.3M | _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5); | 353 | 34.3M | _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6); | 354 | 34.3M | _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7); | 355 | 34.3M | } else { | 356 | 34.3M | storeu_output(&tr2_0, (out1_ptr + 0 * 16)); | 357 | 34.3M | storeu_output(&tr2_1, (out1_ptr + 1 * 16)); | 358 | 34.3M | storeu_output(&tr2_2, (out1_ptr + 2 * 16)); | 359 | 34.3M | storeu_output(&tr2_3, (out1_ptr + 3 * 16)); | 360 | 34.3M | storeu_output(&tr2_4, (out1_ptr + 4 
* 16)); | 361 | 34.3M | storeu_output(&tr2_5, (out1_ptr + 5 * 16)); | 362 | 34.3M | storeu_output(&tr2_6, (out1_ptr + 6 * 16)); | 363 | 34.3M | storeu_output(&tr2_7, (out1_ptr + 7 * 16)); | 364 | 34.3M | } | 365 | 68.7M | } |
Unexecuted instantiation: vp9_dct_intrin_sse2.c:transpose_and_output8x8 |
366 | | |
367 | | #ifdef __cplusplus |
368 | | } // extern "C" |
369 | | #endif |
370 | | |
371 | | #endif // VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_ |