/src/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c
/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/x86/fwd_txfm_sse2.h"

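// The *_1 kernels below compute only the DC (0,0) coefficient of each
// forward transform: they sum every input sample and apply the per-size
// normalization (x2 for 4x4, x1 for 8x8, /2 for 16x16, /8 for 32x32).
// The full 2-D transforms are instantiated from the template headers at
// the bottom of this file.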
void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  __m128i in0, in1;
  __m128i tmp;
  const __m128i zero = _mm_setzero_si128();

  // Pack the four 4-sample rows into two registers of eight 16-bit values.
  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in1 = _mm_unpacklo_epi64(
      in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
  in0 = _mm_unpacklo_epi64(
      in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));

  tmp = _mm_add_epi16(in0, in1);
  // Sign-extend the eight 16-bit partial sums to 32 bits: unpack each one
  // into the high half of a dword, then arithmetic-shift right by 16.
  in0 = _mm_unpacklo_epi16(zero, tmp);
  in1 = _mm_unpackhi_epi16(zero, tmp);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  // Fold the 32-bit lanes down to a single sum: 8 -> 4 -> 2 -> 1.
  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(tmp, zero);
  in1 = _mm_unpackhi_epi32(tmp, zero);

  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(tmp, 8);

  in1 = _mm_add_epi32(tmp, in0);
  // The 4x4 DC term is the sample sum scaled by 2.
  in0 = _mm_slli_epi32(in1, 1);
  output[0] = (tran_low_t)_mm_cvtsi128_si32(in0);
}

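// For reference, a scalar sketch of the routine above. This helper is
// illustrative only (not part of the build); the library's actual C
// fallback is vpx_fdct4x4_1_c.
//
//   static void fdct4x4_1_sketch(const int16_t *input, tran_low_t *output,
//                                int stride) {
//     int r, c, sum = 0;
//     for (r = 0; r < 4; ++r)
//       for (c = 0; c < 4; ++c) sum += input[r * stride + c];
//     output[0] = (tran_low_t)(sum * 2);  // same x2 scaling as the SSE2 path
//   }
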
void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
  __m128i u0, u1, sum;

  // Accumulate the eight 8-sample rows into eight 16-bit column sums.
  u0 = _mm_add_epi16(in0, in1);
  u1 = _mm_add_epi16(in2, in3);

  in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
  in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
  in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
  in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));

  sum = _mm_add_epi16(u0, u1);

  in0 = _mm_add_epi16(in0, in1);
  in2 = _mm_add_epi16(in2, in3);
  sum = _mm_add_epi16(sum, in0);

  u0 = _mm_setzero_si128();
  sum = _mm_add_epi16(sum, in2);

  // Widen to 32 bits and reduce across lanes; the 8x8 DC term is the raw
  // sample sum, with no extra scaling.
  in0 = _mm_unpacklo_epi16(u0, sum);
  in1 = _mm_unpackhi_epi16(u0, sum);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(sum, u0);
  in1 = _mm_unpackhi_epi32(sum, u0);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(sum, 8);

  in1 = _mm_add_epi32(sum, in0);
  output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
}

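// Note that the 8x8 and larger kernels accumulate row sums in eight 16-bit
// lanes and widen only once at the end. Per lane, the unpack-with-zero plus
// arithmetic-shift idiom is simply a sign extension; as a scalar sketch:
//
//   int32_t lane = (int32_t)(uint16_t)v << 16;  // 16-bit value in high half
//   lane >>= 16;                                // arithmetic shift: lane == v
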
void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
                          int stride) {
  __m128i in0, in1, in2, in3;
  __m128i u0, u1;
  __m128i sum = _mm_setzero_si128();
  int i;

  // Each iteration accumulates eight 16-sample rows (two 8-lane halves per
  // row), then advances to the lower half of the block.
  for (i = 0; i < 2; ++i) {
    in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
    in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8));

    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
    in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8));

    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0));
    in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8));

    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0));
    in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8));

    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    sum = _mm_add_epi16(sum, u1);
    input += 8 * stride;
  }

  // Widen to 32 bits and reduce across lanes; the 16x16 DC term is the
  // sample sum divided by 2.
  u0 = _mm_setzero_si128();
  in0 = _mm_unpacklo_epi16(u0, sum);
  in1 = _mm_unpackhi_epi16(u0, sum);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(sum, u0);
  in1 = _mm_unpackhi_epi32(sum, u0);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(sum, 8);

  in1 = _mm_add_epi32(sum, in0);
  in1 = _mm_srai_epi32(in1, 1);
  output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
}

void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
                          int stride) {
  __m128i in0, in1, in2, in3;
  __m128i u0, u1;
  __m128i sum = _mm_setzero_si128();
  int i;

  // Each iteration accumulates four 32-sample rows (four 8-lane loads per
  // row), advancing the input pointer by one row after each set of loads.
  for (i = 0; i < 8; ++i) {
    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    sum = _mm_add_epi16(sum, u1);
  }

  // Widen to 32 bits and reduce across lanes; the 32x32 DC term is the
  // sample sum divided by 8.
  u0 = _mm_setzero_si128();
  in0 = _mm_unpacklo_epi16(u0, sum);
  in1 = _mm_unpackhi_epi16(u0, sum);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(sum, u0);
  in1 = _mm_unpackhi_epi32(sum, u0);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(sum, 8);

  in1 = _mm_add_epi32(sum, in0);
  in1 = _mm_srai_epi32(in1, 3);
  output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
}

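// The full 2-D transforms below are generated by including shared template
// headers with the output symbol names bound through macros. Schematically
// (a sketch of the pattern, not the headers' exact contents):
//
//   /* fwd_txfm_impl_sse2.h */
//   void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
//     /* ... expands to vpx_fdct4x4_sse2, vpx_highbd_fdct4x4_sse2, ... */
//   }
//
// FDCT32x32_HIGH_PRECISION selects between the full-precision 32x32
// transform and the cheaper _rd variant, and DCT_HIGH_BIT_DEPTH toggles
// overflow handling for high-bit-depth input.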
#define DCT_HIGH_BIT_DEPTH 0
#define FDCT4x4_2D vpx_fdct4x4_sse2
#define FDCT8x8_2D vpx_fdct8x8_sse2
#define FDCT16x16_2D vpx_fdct16x16_sse2
#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"
#undef FDCT4x4_2D
#undef FDCT8x8_2D
#undef FDCT16x16_2D

#define FDCT32x32_2D vpx_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION

#define FDCT32x32_2D vpx_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH

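// For high-bit-depth builds, the same template headers are included a second
// time with DCT_HIGH_BIT_DEPTH set to 1, so that (as the templates are
// written) the generated transforms check the 16-bit intermediate stages for
// overflow and fall back to the corresponding C implementation when the
// range is exceeded.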
#if CONFIG_VP9_HIGHBITDEPTH
#define DCT_HIGH_BIT_DEPTH 1
#define FDCT4x4_2D vpx_highbd_fdct4x4_sse2
#define FDCT8x8_2D vpx_highbd_fdct8x8_sse2
#define FDCT16x16_2D vpx_highbd_fdct16x16_sse2
#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"  // NOLINT
#undef FDCT4x4_2D
#undef FDCT8x8_2D
#undef FDCT16x16_2D

#define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION

#define FDCT32x32_2D vpx_highbd_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH
#endif  // CONFIG_VP9_HIGHBITDEPTH