/src/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #include <emmintrin.h> // SSE2 |
12 | | |
13 | | #include "./vpx_config.h" |
14 | | #include "./vpx_dsp_rtcd.h" |
15 | | #include "vpx_dsp/vpx_dsp_common.h" |
16 | | #include "vpx_dsp/x86/fwd_txfm_sse2.h" |
17 | | |
18 | 0 | void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) { |
19 | 0 | __m128i in0, in1; |
20 | 0 | __m128i tmp; |
21 | 0 | const __m128i zero = _mm_setzero_si128(); |
22 | 0 | in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); |
23 | 0 | in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); |
24 | 0 | in1 = _mm_unpacklo_epi64( |
25 | 0 | in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); |
26 | 0 | in0 = _mm_unpacklo_epi64( |
27 | 0 | in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); |
28 | |
29 | 0 | tmp = _mm_add_epi16(in0, in1); |
30 | 0 | in0 = _mm_unpacklo_epi16(zero, tmp); |
31 | 0 | in1 = _mm_unpackhi_epi16(zero, tmp); |
32 | 0 | in0 = _mm_srai_epi32(in0, 16); |
33 | 0 | in1 = _mm_srai_epi32(in1, 16); |
34 | |
35 | 0 | tmp = _mm_add_epi32(in0, in1); |
36 | 0 | in0 = _mm_unpacklo_epi32(tmp, zero); |
37 | 0 | in1 = _mm_unpackhi_epi32(tmp, zero); |
38 | |
39 | 0 | tmp = _mm_add_epi32(in0, in1); |
40 | 0 | in0 = _mm_srli_si128(tmp, 8); |
41 | |
42 | 0 | in1 = _mm_add_epi32(tmp, in0); |
43 | 0 | in0 = _mm_slli_epi32(in1, 1); |
44 | 0 | output[0] = (tran_low_t)_mm_cvtsi128_si32(in0); |
45 | 0 | } |
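For reference, a plain-C sketch of the value the DC-only 4x4 kernel above produces, written against the intrinsics rather than the upstream vpx_fdct4x4_1_c reference (the function name below is made up, and the 16-bit intermediate arithmetic of the SIMD path is ignored):

#include <stdint.h>

/* Illustrative only: sum the 16 residuals and double the sum, matching
   the final _mm_slli_epi32(..., 1) above; only the DC coefficient is
   written. */
static void fdct4x4_1_sketch(const int16_t *input, int32_t *output,
                             int stride) {
  int32_t sum = 0;
  for (int r = 0; r < 4; ++r)
    for (int c = 0; c < 4; ++c) sum += input[r * stride + c];
  output[0] = sum * 2;
}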
46 | | |
47 | 50.1k | void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) { |
48 | 50.1k | __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); |
49 | 50.1k | __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); |
50 | 50.1k | __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); |
51 | 50.1k | __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); |
52 | 50.1k | __m128i u0, u1, sum; |
53 | | |
54 | 50.1k | u0 = _mm_add_epi16(in0, in1); |
55 | 50.1k | u1 = _mm_add_epi16(in2, in3); |
56 | | |
57 | 50.1k | in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); |
58 | 50.1k | in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); |
59 | 50.1k | in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); |
60 | 50.1k | in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); |
61 | | |
62 | 50.1k | sum = _mm_add_epi16(u0, u1); |
63 | | |
64 | 50.1k | in0 = _mm_add_epi16(in0, in1); |
65 | 50.1k | in2 = _mm_add_epi16(in2, in3); |
66 | 50.1k | sum = _mm_add_epi16(sum, in0); |
67 | | |
68 | 50.1k | u0 = _mm_setzero_si128(); |
69 | 50.1k | sum = _mm_add_epi16(sum, in2); |
70 | | |
71 | 50.1k | in0 = _mm_unpacklo_epi16(u0, sum); |
72 | 50.1k | in1 = _mm_unpackhi_epi16(u0, sum); |
73 | 50.1k | in0 = _mm_srai_epi32(in0, 16); |
74 | 50.1k | in1 = _mm_srai_epi32(in1, 16); |
75 | | |
76 | 50.1k | sum = _mm_add_epi32(in0, in1); |
77 | 50.1k | in0 = _mm_unpacklo_epi32(sum, u0); |
78 | 50.1k | in1 = _mm_unpackhi_epi32(sum, u0); |
79 | | |
80 | 50.1k | sum = _mm_add_epi32(in0, in1); |
81 | 50.1k | in0 = _mm_srli_si128(sum, 8); |
82 | | |
83 | 50.1k | in1 = _mm_add_epi32(sum, in0); |
84 | 50.1k | output[0] = (tran_low_t)_mm_cvtsi128_si32(in1); |
85 | 50.1k | } |
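The unpack-against-zero followed by an arithmetic right shift by 16, used in every horizontal reduction in this file, is a standard SSE2 idiom for sign-extending 16-bit lanes to 32 bits before the final 32-bit adds. A standalone demonstration with made-up values (not libvpx code):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  const __m128i zero = _mm_setzero_si128();
  /* Eight signed 16-bit words. */
  __m128i v = _mm_setr_epi16(-3, 7, -20000, 5, 9, -1, 0, 300);
  /* Interleaving with zero puts each word in the high half of a 32-bit
     lane; the arithmetic shift then brings it back down with its sign. */
  __m128i lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, v), 16);
  __m128i hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, v), 16);
  int out[8];
  _mm_storeu_si128((__m128i *)out, lo);
  _mm_storeu_si128((__m128i *)(out + 4), hi);
  for (int i = 0; i < 8; ++i) printf("%d ", out[i]); /* -3 7 -20000 5 9 -1 0 300 */
  printf("\n");
  return 0;
}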
86 | | |
87 | | void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, |
88 | 9.27k | int stride) { |
89 | 9.27k | __m128i in0, in1, in2, in3; |
90 | 9.27k | __m128i u0, u1; |
91 | 9.27k | __m128i sum = _mm_setzero_si128(); |
92 | 9.27k | int i; |
93 | | |
94 | 27.8k | for (i = 0; i < 2; ++i) { |
95 | 18.5k | in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0)); |
96 | 18.5k | in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8)); |
97 | 18.5k | in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0)); |
98 | 18.5k | in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8)); |
99 | | |
100 | 18.5k | u0 = _mm_add_epi16(in0, in1); |
101 | 18.5k | u1 = _mm_add_epi16(in2, in3); |
102 | 18.5k | sum = _mm_add_epi16(sum, u0); |
103 | | |
104 | 18.5k | in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0)); |
105 | 18.5k | in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8)); |
106 | 18.5k | in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0)); |
107 | 18.5k | in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8)); |
108 | | |
109 | 18.5k | sum = _mm_add_epi16(sum, u1); |
110 | 18.5k | u0 = _mm_add_epi16(in0, in1); |
111 | 18.5k | u1 = _mm_add_epi16(in2, in3); |
112 | 18.5k | sum = _mm_add_epi16(sum, u0); |
113 | | |
114 | 18.5k | in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0)); |
115 | 18.5k | in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8)); |
116 | 18.5k | in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0)); |
117 | 18.5k | in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8)); |
118 | | |
119 | 18.5k | sum = _mm_add_epi16(sum, u1); |
120 | 18.5k | u0 = _mm_add_epi16(in0, in1); |
121 | 18.5k | u1 = _mm_add_epi16(in2, in3); |
122 | 18.5k | sum = _mm_add_epi16(sum, u0); |
123 | | |
124 | 18.5k | in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0)); |
125 | 18.5k | in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8)); |
126 | 18.5k | in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0)); |
127 | 18.5k | in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8)); |
128 | | |
129 | 18.5k | sum = _mm_add_epi16(sum, u1); |
130 | 18.5k | u0 = _mm_add_epi16(in0, in1); |
131 | 18.5k | u1 = _mm_add_epi16(in2, in3); |
132 | 18.5k | sum = _mm_add_epi16(sum, u0); |
133 | | |
134 | 18.5k | sum = _mm_add_epi16(sum, u1); |
135 | 18.5k | input += 8 * stride; |
136 | 18.5k | } |
137 | | |
138 | 9.27k | u0 = _mm_setzero_si128(); |
139 | 9.27k | in0 = _mm_unpacklo_epi16(u0, sum); |
140 | 9.27k | in1 = _mm_unpackhi_epi16(u0, sum); |
141 | 9.27k | in0 = _mm_srai_epi32(in0, 16); |
142 | 9.27k | in1 = _mm_srai_epi32(in1, 16); |
143 | | |
144 | 9.27k | sum = _mm_add_epi32(in0, in1); |
145 | 9.27k | in0 = _mm_unpacklo_epi32(sum, u0); |
146 | 9.27k | in1 = _mm_unpackhi_epi32(sum, u0); |
147 | | |
148 | 9.27k | sum = _mm_add_epi32(in0, in1); |
149 | 9.27k | in0 = _mm_srli_si128(sum, 8); |
150 | | |
151 | 9.27k | in1 = _mm_add_epi32(sum, in0); |
152 | 9.27k | in1 = _mm_srai_epi32(in1, 1); |
153 | 9.27k | output[0] = (tran_low_t)_mm_cvtsi128_si32(in1); |
154 | 9.27k | } |
155 | | |
156 | | void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, |
157 | 2.09k | int stride) { |
158 | 2.09k | __m128i in0, in1, in2, in3; |
159 | 2.09k | __m128i u0, u1; |
160 | 2.09k | __m128i sum = _mm_setzero_si128(); |
161 | 2.09k | int i; |
162 | | |
163 | 18.8k | for (i = 0; i < 8; ++i) { |
164 | 16.7k | in0 = _mm_load_si128((const __m128i *)(input + 0)); |
165 | 16.7k | in1 = _mm_load_si128((const __m128i *)(input + 8)); |
166 | 16.7k | in2 = _mm_load_si128((const __m128i *)(input + 16)); |
167 | 16.7k | in3 = _mm_load_si128((const __m128i *)(input + 24)); |
168 | | |
169 | 16.7k | input += stride; |
170 | 16.7k | u0 = _mm_add_epi16(in0, in1); |
171 | 16.7k | u1 = _mm_add_epi16(in2, in3); |
172 | 16.7k | sum = _mm_add_epi16(sum, u0); |
173 | | |
174 | 16.7k | in0 = _mm_load_si128((const __m128i *)(input + 0)); |
175 | 16.7k | in1 = _mm_load_si128((const __m128i *)(input + 8)); |
176 | 16.7k | in2 = _mm_load_si128((const __m128i *)(input + 16)); |
177 | 16.7k | in3 = _mm_load_si128((const __m128i *)(input + 24)); |
178 | | |
179 | 16.7k | input += stride; |
180 | 16.7k | sum = _mm_add_epi16(sum, u1); |
181 | 16.7k | u0 = _mm_add_epi16(in0, in1); |
182 | 16.7k | u1 = _mm_add_epi16(in2, in3); |
183 | 16.7k | sum = _mm_add_epi16(sum, u0); |
184 | | |
185 | 16.7k | in0 = _mm_load_si128((const __m128i *)(input + 0)); |
186 | 16.7k | in1 = _mm_load_si128((const __m128i *)(input + 8)); |
187 | 16.7k | in2 = _mm_load_si128((const __m128i *)(input + 16)); |
188 | 16.7k | in3 = _mm_load_si128((const __m128i *)(input + 24)); |
189 | | |
190 | 16.7k | input += stride; |
191 | 16.7k | sum = _mm_add_epi16(sum, u1); |
192 | 16.7k | u0 = _mm_add_epi16(in0, in1); |
193 | 16.7k | u1 = _mm_add_epi16(in2, in3); |
194 | 16.7k | sum = _mm_add_epi16(sum, u0); |
195 | | |
196 | 16.7k | in0 = _mm_load_si128((const __m128i *)(input + 0)); |
197 | 16.7k | in1 = _mm_load_si128((const __m128i *)(input + 8)); |
198 | 16.7k | in2 = _mm_load_si128((const __m128i *)(input + 16)); |
199 | 16.7k | in3 = _mm_load_si128((const __m128i *)(input + 24)); |
200 | | |
201 | 16.7k | input += stride; |
202 | 16.7k | sum = _mm_add_epi16(sum, u1); |
203 | 16.7k | u0 = _mm_add_epi16(in0, in1); |
204 | 16.7k | u1 = _mm_add_epi16(in2, in3); |
205 | 16.7k | sum = _mm_add_epi16(sum, u0); |
206 | | |
207 | 16.7k | sum = _mm_add_epi16(sum, u1); |
208 | 16.7k | } |
209 | | |
210 | 2.09k | u0 = _mm_setzero_si128(); |
211 | 2.09k | in0 = _mm_unpacklo_epi16(u0, sum); |
212 | 2.09k | in1 = _mm_unpackhi_epi16(u0, sum); |
213 | 2.09k | in0 = _mm_srai_epi32(in0, 16); |
214 | 2.09k | in1 = _mm_srai_epi32(in1, 16); |
215 | | |
216 | 2.09k | sum = _mm_add_epi32(in0, in1); |
217 | 2.09k | in0 = _mm_unpacklo_epi32(sum, u0); |
218 | 2.09k | in1 = _mm_unpackhi_epi32(sum, u0); |
219 | | |
220 | 2.09k | sum = _mm_add_epi32(in0, in1); |
221 | 2.09k | in0 = _mm_srli_si128(sum, 8); |
222 | | |
223 | 2.09k | in1 = _mm_add_epi32(sum, in0); |
224 | 2.09k | in1 = _mm_srai_epi32(in1, 3); |
225 | 2.09k | output[0] = (tran_low_t)_mm_cvtsi128_si32(in1); |
226 | 2.09k | } |
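Taken together, the four kernels above compute only the DC coefficient: the whole block is summed and the sum is scaled per block size, matching the final shift in each function (doubled for 4x4, unscaled for 8x8, arithmetically shifted right by 1 for 16x16 and by 3 for 32x32). A hedged plain-C sketch of that shared pattern (illustrative helper, not one of the upstream C reference implementations; the 16-bit intermediate arithmetic of the SIMD paths is ignored):

#include <stdint.h>

/* Illustrative only: DC term of the forward transform for an N x N
   block, N in {4, 8, 16, 32}. */
static int32_t fdct_dc_only_sketch(const int16_t *input, int stride, int n) {
  int32_t sum = 0;
  for (int r = 0; r < n; ++r)
    for (int c = 0; c < n; ++c) sum += input[r * stride + c];
  switch (n) {
    case 4: return sum * 2;   /* _mm_slli_epi32(sum, 1) */
    case 8: return sum;       /* no scaling */
    case 16: return sum >> 1; /* _mm_srai_epi32(sum, 1) */
    default: return sum >> 3; /* 32x32: _mm_srai_epi32(sum, 3) */
  }
}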
227 | | |
228 | | #define DCT_HIGH_BIT_DEPTH 0 |
229 | | #define FDCT4x4_2D vpx_fdct4x4_sse2 |
230 | | #define FDCT8x8_2D vpx_fdct8x8_sse2 |
231 | | #define FDCT16x16_2D vpx_fdct16x16_sse2 |
232 | | #include "vpx_dsp/x86/fwd_txfm_impl_sse2.h" |
233 | | #undef FDCT4x4_2D |
234 | | #undef FDCT8x8_2D |
235 | | #undef FDCT16x16_2D |
236 | | |
237 | | #define FDCT32x32_2D vpx_fdct32x32_rd_sse2 |
238 | | #define FDCT32x32_HIGH_PRECISION 0 |
239 | | #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" |
240 | | #undef FDCT32x32_2D |
241 | | #undef FDCT32x32_HIGH_PRECISION |
242 | | |
243 | | #define FDCT32x32_2D vpx_fdct32x32_sse2 |
244 | | #define FDCT32x32_HIGH_PRECISION 1 |
245 | | #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT |
246 | | #undef FDCT32x32_2D |
247 | | #undef FDCT32x32_HIGH_PRECISION |
248 | | #undef DCT_HIGH_BIT_DEPTH |
249 | | |
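The rest of the file defines no function bodies directly: the full 2-D transforms are instantiated by setting name and precision macros and including the implementation headers once per configuration, and the same pattern is repeated below for the high-bitdepth build. A minimal sketch of that include-as-template pattern, with hypothetical file and macro names:

/* ---- scale_impl.h (hypothetical template header) ----
   Defines one function whose name and rounding behaviour come from
   macros set at the inclusion site, mirroring FDCT32x32_2D and
   FDCT32x32_HIGH_PRECISION above. */
int SCALE_FN(int x) {
#if SCALE_HIGH_PRECISION
  return (x + 2) >> 2; /* rounded */
#else
  return x >> 2;       /* truncated */
#endif
}

/* ---- instantiating source file ---- */
#define SCALE_FN scale_rd
#define SCALE_HIGH_PRECISION 0
#include "scale_impl.h"
#undef SCALE_FN
#undef SCALE_HIGH_PRECISION

#define SCALE_FN scale_high_precision
#define SCALE_HIGH_PRECISION 1
#include "scale_impl.h"
#undef SCALE_FN
#undef SCALE_HIGH_PRECISION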
250 | | #if CONFIG_VP9_HIGHBITDEPTH |
251 | | #define DCT_HIGH_BIT_DEPTH 1 |
252 | | #define FDCT4x4_2D vpx_highbd_fdct4x4_sse2 |
253 | | #define FDCT8x8_2D vpx_highbd_fdct8x8_sse2 |
254 | | #define FDCT16x16_2D vpx_highbd_fdct16x16_sse2 |
255 | | #include "vpx_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT |
256 | | #undef FDCT4x4_2D |
257 | | #undef FDCT8x8_2D |
258 | | #undef FDCT16x16_2D |
259 | | |
260 | | #define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2 |
261 | | #define FDCT32x32_HIGH_PRECISION 0 |
262 | | #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT |
263 | | #undef FDCT32x32_2D |
264 | | #undef FDCT32x32_HIGH_PRECISION |
265 | | |
266 | | #define FDCT32x32_2D vpx_highbd_fdct32x32_sse2 |
267 | | #define FDCT32x32_HIGH_PRECISION 1 |
268 | | #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT |
269 | | #undef FDCT32x32_2D |
270 | | #undef FDCT32x32_HIGH_PRECISION |
271 | | #undef DCT_HIGH_BIT_DEPTH |
272 | | #endif // CONFIG_VP9_HIGHBITDEPTH |