/src/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2017 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #include <tmmintrin.h> |
12 | | |
13 | | #include "./vpx_dsp_rtcd.h" |
14 | | #include "vpx_dsp/x86/inv_txfm_sse2.h" |
15 | | #include "vpx_dsp/x86/inv_txfm_ssse3.h" |
16 | | #include "vpx_dsp/x86/transpose_sse2.h" |
17 | | #include "vpx_dsp/x86/txfm_common_sse2.h" |
18 | | // Scales every 16-bit lane of in by c0 into *out0 and by c1 into *out1, using a rounding multiply-high.
19 | | static INLINE void partial_butterfly_ssse3(const __m128i in, const int c0,
20 | | const int c1, __m128i *const out0,
21 | 719k | __m128i *const out1) {
22 | 719k | const __m128i cst0 = _mm_set1_epi16(2 * c0);  // constant is doubled, so the >> 15 below is a net >> 14 on in * c0
23 | 719k | const __m128i cst1 = _mm_set1_epi16(2 * c1);  // NOTE(review): assumes 2 * c fits in 16 bits -- confirm
24 | 719k | *out0 = _mm_mulhrs_epi16(in, cst0);  // per lane: (in * cst0 + (1 << 14)) >> 15
25 | 719k | *out1 = _mm_mulhrs_epi16(in, cst1);  // same rounding multiply with the second constant
26 | 719k | }
27 | | // Single-constant form of the rounding multiply: returns in scaled by cospi_16_64 in every 16-bit lane.
28 | 102k | static INLINE __m128i partial_butterfly_cospi16_ssse3(const __m128i in) {
29 | 102k | const __m128i coef_pair = _mm_set1_epi16(2 * cospi_16_64);  // one value broadcast to all lanes, despite the name
30 | 102k | return _mm_mulhrs_epi16(in, coef_pair);  // per lane: (in * coef + (1 << 14)) >> 15
31 | 102k | }
32 | | // Loads the first four rows of input, runs the 8x8 "12" kernel on them and passes the result to write_buffer_8x8().
33 | | void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
34 | 695k | int stride) {
35 | 695k | __m128i io[8];  // only io[0..3] are filled here; presumably the kernel produces all eight -- confirm
36 | | 
37 | 695k | io[0] = load_input_data4(input + 0 * 8);  // row r starts at input + r * 8
38 | 695k | io[1] = load_input_data4(input + 1 * 8);
39 | 695k | io[2] = load_input_data4(input + 2 * 8);
40 | 695k | io[3] = load_input_data4(input + 3 * 8);
41 | | 
42 | 695k | idct8x8_12_add_kernel_ssse3(io);  // defined elsewhere; its result is left in io
43 | 695k | write_buffer_8x8(io, dest, stride);  // defined elsewhere; function name suggests an add into dest -- confirm
44 | 695k | }
45 | | |
46 | | // The coefficient calculation is split into smaller functions to prevent stack
47 | | // spillover in the 32x32 idct optimizations. Each helper covers these outputs:
48 | | // quarter_1: 0-7
49 | | // quarter_2: 8-15
50 | | // quarter_3_4: 16-23, 24-31
51 | | |
52 | | // Quarter 1 of the idct32 "34" path, for an 8x32 block __m128i in[32].
53 | | // Reads only in[0] and in[4].
54 | | // Writes out[0..7] (output pixels 0-7).
55 | | static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/,
56 | 102k | __m128i *const out /*out[8]*/) {
57 | 102k | __m128i step1[8], step2[8];
58 | | 
59 | | // stage 3: in[4] scaled by cospi_28_64 and by cospi_4_64
60 | 102k | partial_butterfly_ssse3(in[4], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
61 | | 
62 | | // stage 4: in[0] scaled by cospi_16_64; nodes 4..7 are plain copies of the stage 3 values
63 | 102k | step2[0] = partial_butterfly_cospi16_ssse3(in[0]);
64 | 102k | step2[4] = step1[4];
65 | 102k | step2[5] = step1[4];
66 | 102k | step2[6] = step1[7];
67 | 102k | step2[7] = step1[7];
68 | | 
69 | | // stage 5: step2[0] fans out to step1[0..3]; only nodes 5 and 6 go through butterfly()
70 | 102k | step1[0] = step2[0];
71 | 102k | step1[1] = step2[0];
72 | 102k | step1[2] = step2[0];
73 | 102k | step1[3] = step2[0];
74 | 102k | step1[4] = step2[4];
75 | 102k | butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);  // butterfly() is defined elsewhere
76 | 102k | step1[7] = step2[7];
77 | | 
78 | | // stage 6: out[k] = step1[k] + step1[7 - k] and out[7 - k] = step1[k] - step1[7 - k], for k = 0..3
79 | 102k | out[0] = _mm_add_epi16(step1[0], step1[7]);
80 | 102k | out[1] = _mm_add_epi16(step1[1], step1[6]);
81 | 102k | out[2] = _mm_add_epi16(step1[2], step1[5]);
82 | 102k | out[3] = _mm_add_epi16(step1[3], step1[4]);
83 | 102k | out[4] = _mm_sub_epi16(step1[3], step1[4]);
84 | 102k | out[5] = _mm_sub_epi16(step1[2], step1[5]);
85 | 102k | out[6] = _mm_sub_epi16(step1[1], step1[6]);
86 | 102k | out[7] = _mm_sub_epi16(step1[0], step1[7]);
87 | 102k | }
88 | | |
89 | | // Quarter 2 of the idct32 "34" path, for an 8x32 block __m128i in[32].
90 | | // Reads only in[2] and in[6].
91 | | // Output pixels: 8-15 in __m128i out[32], written by idct32_8x32_quarter_2_stage_4_to_6().
92 | | static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/,
93 | 102k | __m128i *const out /*out[16]*/) {
94 | 102k | __m128i step1[16], step2[16];  // only indices 8..15 are used in this function
95 | | 
96 | | // stage 2: in[2] and in[6] each feed one partial butterfly
97 | 102k | partial_butterfly_ssse3(in[2], cospi_30_64, cospi_2_64, &step2[8],
98 | 102k | &step2[15]);
99 | 102k | partial_butterfly_ssse3(in[6], -cospi_26_64, cospi_6_64, &step2[11],
100 | 102k | &step2[12]);
101 | | 
102 | | // stage 3: each stage 2 value is copied into two adjacent step1 slots
103 | 102k | step1[8] = step2[8];
104 | 102k | step1[9] = step2[8];
105 | 102k | step1[14] = step2[15];
106 | 102k | step1[15] = step2[15];
107 | 102k | step1[10] = step2[11];
108 | 102k | step1[11] = step2[11];
109 | 102k | step1[12] = step2[12];
110 | 102k | step1[13] = step2[12];
111 | | // stages 4-6 are done by a helper defined elsewhere, which takes step1 and writes through out
112 | 102k | idct32_8x32_quarter_2_stage_4_to_6(step1, out);
113 | 102k | }
114 | | // Runs quarters 1 and 2 of the "34" path into one local 16-entry buffer, then combines them into out (stage 7).
115 | | static INLINE void idct32_34_8x32_quarter_1_2(
116 | 102k | const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
117 | 102k | __m128i temp[16];
118 | 102k | idct32_34_8x32_quarter_1(in, temp);  // fills temp[0..7]
119 | 102k | idct32_34_8x32_quarter_2(in, temp);  // presumably fills temp[8..15] through its stage 4-6 helper -- confirm
120 | | // stage 7: add_sub_butterfly() is defined elsewhere; it is given temp, out and the size 16
121 | 102k | add_sub_butterfly(temp, out, 16);
122 | 102k | }
123 | | |
124 | | // Quarters 3 and 4 of the idct32 "34" path, for an 8x32 block __m128i in[32].
125 | | // Reads only the odd-indexed inputs in[1], in[3], in[5], in[7].
126 | | // Output pixels: 16-23, 24-31 in __m128i out[32]
127 | | static INLINE void idct32_34_8x32_quarter_3_4(
128 | 102k | const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
129 | 102k | __m128i step1[32];  // only indices 16..31 are used in this function
130 | | 
131 | | // stage 1: one partial butterfly per odd input, filling nodes 16, 19, 20, 23, 24, 27, 28, 31
132 | 102k | partial_butterfly_ssse3(in[1], cospi_31_64, cospi_1_64, &step1[16],
133 | 102k | &step1[31]);
134 | 102k | partial_butterfly_ssse3(in[7], -cospi_25_64, cospi_7_64, &step1[19],
135 | 102k | &step1[28]);
136 | 102k | partial_butterfly_ssse3(in[5], cospi_27_64, cospi_5_64, &step1[20],
137 | 102k | &step1[27]);
138 | 102k | partial_butterfly_ssse3(in[3], -cospi_29_64, cospi_3_64, &step1[23],
139 | 102k | &step1[24]);
140 | | 
141 | | // stage 3: takes the stage 1 values directly (this function has no separate stage 2) and fills the remaining nodes
142 | 102k | butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17],
143 | 102k | &step1[30]);
144 | 102k | butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18],
145 | 102k | &step1[29]);
146 | 102k | butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21],
147 | 102k | &step1[26]);
148 | 102k | butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22],
149 | 102k | &step1[25]);
150 | | // stages 4-7 are done by a helper defined elsewhere, which takes step1 and writes through out
151 | 102k | idct32_8x32_quarter_3_4_stage_4_to_7(step1, out);
152 | 102k | }
153 | | // Full 8x32 "34" idct: the quarter helpers write into a local buffer, and out is written only by the final stage.
154 | | void idct32_34_8x32_ssse3(const __m128i *const in /*in[32]*/,
155 | 102k | __m128i *const out /*out[32]*/) {
156 | 102k | __m128i temp[32];
157 | | 
158 | 102k | idct32_34_8x32_quarter_1_2(in, temp);
159 | 102k | idct32_34_8x32_quarter_3_4(in, temp);
160 | | // final stage: in is not read past this point (vpx_idct32x32_34_add_ssse3 passes one array as both in and out)
161 | 102k | add_sub_butterfly(temp, out, 32);
162 | 102k | }
163 | | |
164 | | // 32x32 inverse transform and output for the case where only the upper-left 8x8 has non-zero coeff.
165 | | void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest,
166 | 20.5k | int stride) {
167 | 20.5k | __m128i io[32], col[32];
168 | 20.5k | int i;
169 | | 
170 | | // Load input data. Only need to load the top left 8x8 block.
171 | 20.5k | load_transpose_16bit_8x8(input, 32, io);  // the 32 is presumably the row pitch of input -- confirm
172 | 20.5k | idct32_34_8x32_ssse3(io, col);  // first pass: result kept in col for the second pass
173 | | // second pass: one group of 8 vectors from col per iteration
174 | 102k | for (i = 0; i < 32; i += 8) {
175 | 82.2k | int j;
176 | 82.2k | transpose_16bit_8x8(col + i, io);
177 | 82.2k | idct32_34_8x32_ssse3(io, io);  // in place: input and output are the same array
178 | | // hand each of the 32 result vectors to write_buffer_8x1() together with its own dest row
179 | 2.71M | for (j = 0; j < 32; ++j) {
180 | 2.63M | write_buffer_8x1(dest + j * stride, io[j]);
181 | 2.63M | }
182 | | 
183 | 82.2k | dest += 8;  // dest is uint8_t *, so the next iteration starts 8 bytes further along each row
184 | 82.2k | }
185 | 20.5k | }
186 | | |
187 | | // Quarter 1 of the idct32 "135" path, for an 8x32 block __m128i in[32].
188 | | // Reads only in[0], in[4], in[8], in[12].
189 | | // Writes out[0..7] (output pixels 0-7).
190 | | static INLINE void idct32_135_8x32_quarter_1(const __m128i *const in /*in[32]*/,
191 | 0 | __m128i *const out /*out[8]*/) {
192 | 0 | __m128i step1[8], step2[8];
193 | | 
194 | | // stage 3: in[4] and in[12] each feed one partial butterfly
195 | 0 | partial_butterfly_ssse3(in[4], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
196 | 0 | partial_butterfly_ssse3(in[12], -cospi_20_64, cospi_12_64, &step1[5],
197 | 0 | &step1[6]);
198 | | 
199 | | // stage 4: in[0] and in[8] enter here; nodes 4..7 are add/sub pairs of the stage 3 values
200 | 0 | step2[0] = partial_butterfly_cospi16_ssse3(in[0]);
201 | 0 | partial_butterfly_ssse3(in[8], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
202 | 0 | step2[4] = _mm_add_epi16(step1[4], step1[5]);
203 | 0 | step2[5] = _mm_sub_epi16(step1[4], step1[5]);
204 | 0 | step2[6] = _mm_sub_epi16(step1[7], step1[6]);
205 | 0 | step2[7] = _mm_add_epi16(step1[7], step1[6]);
206 | | 
207 | | // stage 5: step2[0] is the left operand of all four sums/differences; step2[1] is never written or read
208 | 0 | step1[0] = _mm_add_epi16(step2[0], step2[3]);
209 | 0 | step1[1] = _mm_add_epi16(step2[0], step2[2]);
210 | 0 | step1[2] = _mm_sub_epi16(step2[0], step2[2]);
211 | 0 | step1[3] = _mm_sub_epi16(step2[0], step2[3]);
212 | 0 | step1[4] = step2[4];
213 | 0 | butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);  // butterfly() is defined elsewhere
214 | 0 | step1[7] = step2[7];
215 | | 
216 | | // stage 6: out[k] = step1[k] + step1[7 - k] and out[7 - k] = step1[k] - step1[7 - k], for k = 0..3
217 | 0 | out[0] = _mm_add_epi16(step1[0], step1[7]);
218 | 0 | out[1] = _mm_add_epi16(step1[1], step1[6]);
219 | 0 | out[2] = _mm_add_epi16(step1[2], step1[5]);
220 | 0 | out[3] = _mm_add_epi16(step1[3], step1[4]);
221 | 0 | out[4] = _mm_sub_epi16(step1[3], step1[4]);
222 | 0 | out[5] = _mm_sub_epi16(step1[2], step1[5]);
223 | 0 | out[6] = _mm_sub_epi16(step1[1], step1[6]);
224 | 0 | out[7] = _mm_sub_epi16(step1[0], step1[7]);
225 | 0 | }
226 | | |
227 | | // Quarter 2 of the idct32 "135" path, for an 8x32 block __m128i in[32].
228 | | // Reads only in[2], in[6], in[10], in[14].
229 | | // Output pixels: 8-15 in __m128i out[32], written by idct32_8x32_quarter_2_stage_4_to_6().
230 | | static INLINE void idct32_135_8x32_quarter_2(const __m128i *const in /*in[32]*/,
231 | 0 | __m128i *const out /*out[16]*/) {
232 | 0 | __m128i step1[16], step2[16];  // only indices 8..15 are used in this function
233 | | 
234 | | // stage 2: one partial butterfly per input, filling step2[8..15]
235 | 0 | partial_butterfly_ssse3(in[2], cospi_30_64, cospi_2_64, &step2[8],
236 | 0 | &step2[15]);
237 | 0 | partial_butterfly_ssse3(in[14], -cospi_18_64, cospi_14_64, &step2[9],
238 | 0 | &step2[14]);
239 | 0 | partial_butterfly_ssse3(in[10], cospi_22_64, cospi_10_64, &step2[10],
240 | 0 | &step2[13]);
241 | 0 | partial_butterfly_ssse3(in[6], -cospi_26_64, cospi_6_64, &step2[11],
242 | 0 | &step2[12]);
243 | | 
244 | | // stage 3: add/sub within the pairs (8,9), (10,11), (12,13), (14,15)
245 | 0 | step1[8] = _mm_add_epi16(step2[8], step2[9]);
246 | 0 | step1[9] = _mm_sub_epi16(step2[8], step2[9]);
247 | 0 | step1[10] = _mm_sub_epi16(step2[11], step2[10]);
248 | 0 | step1[11] = _mm_add_epi16(step2[11], step2[10]);
249 | 0 | step1[12] = _mm_add_epi16(step2[12], step2[13]);
250 | 0 | step1[13] = _mm_sub_epi16(step2[12], step2[13]);
251 | 0 | step1[14] = _mm_sub_epi16(step2[15], step2[14]);
252 | 0 | step1[15] = _mm_add_epi16(step2[15], step2[14]);
253 |

254 | 0 | idct32_8x32_quarter_2_stage_4_to_6(step1, out);  // stages 4-6: helper defined elsewhere, writes through out
255 | 0 | }
256 | | // Runs quarters 1 and 2 of the "135" path into one local 16-entry buffer, then combines them into out (stage 7).
257 | | static INLINE void idct32_135_8x32_quarter_1_2(
258 | 0 | const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
259 | 0 | __m128i temp[16];
260 | 0 | idct32_135_8x32_quarter_1(in, temp);  // fills temp[0..7]
261 | 0 | idct32_135_8x32_quarter_2(in, temp);  // presumably fills temp[8..15] through its stage 4-6 helper -- confirm
262 | | // stage 7: add_sub_butterfly() is defined elsewhere; it is given temp, out and the size 16
263 | 0 | add_sub_butterfly(temp, out, 16);
264 | 0 | }
265 | | |
266 | | // Quarters 3 and 4 of the idct32 "135" path, for an 8x32 block __m128i in[32].
267 | | // Reads only the odd-indexed inputs:
268 | | // 1, 3, 5, 7, 9, 11, 13, 15
269 | | // Output pixels: 16-23, 24-31 in __m128i out[32]
270 | | static INLINE void idct32_135_8x32_quarter_3_4(
271 | 0 | const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
272 | 0 | __m128i step1[32], step2[32];  // only indices 16..31 are used in this function
273 | | 
274 | | // stage 1: one partial butterfly per odd input, filling step1[16..31]
275 | 0 | partial_butterfly_ssse3(in[1], cospi_31_64, cospi_1_64, &step1[16],
276 | 0 | &step1[31]);
277 | 0 | partial_butterfly_ssse3(in[15], -cospi_17_64, cospi_15_64, &step1[17],
278 | 0 | &step1[30]);
279 | 0 | partial_butterfly_ssse3(in[9], cospi_23_64, cospi_9_64, &step1[18],
280 | 0 | &step1[29]);
281 | 0 | partial_butterfly_ssse3(in[7], -cospi_25_64, cospi_7_64, &step1[19],
282 | 0 | &step1[28]);
283 |

284 | 0 | partial_butterfly_ssse3(in[5], cospi_27_64, cospi_5_64, &step1[20],
285 | 0 | &step1[27]);
286 | 0 | partial_butterfly_ssse3(in[11], -cospi_21_64, cospi_11_64, &step1[21],
287 | 0 | &step1[26]);
288 |

289 | 0 | partial_butterfly_ssse3(in[13], cospi_19_64, cospi_13_64, &step1[22],
290 | 0 | &step1[25]);
291 | 0 | partial_butterfly_ssse3(in[3], -cospi_29_64, cospi_3_64, &step1[23],
292 | 0 | &step1[24]);
293 | | 
294 | | // stage 2: add/sub within the adjacent pairs (16,17), (18,19), ..., (30,31)
295 | 0 | step2[16] = _mm_add_epi16(step1[16], step1[17]);
296 | 0 | step2[17] = _mm_sub_epi16(step1[16], step1[17]);
297 | 0 | step2[18] = _mm_sub_epi16(step1[19], step1[18]);
298 | 0 | step2[19] = _mm_add_epi16(step1[19], step1[18]);
299 | 0 | step2[20] = _mm_add_epi16(step1[20], step1[21]);
300 | 0 | step2[21] = _mm_sub_epi16(step1[20], step1[21]);
301 | 0 | step2[22] = _mm_sub_epi16(step1[23], step1[22]);
302 | 0 | step2[23] = _mm_add_epi16(step1[23], step1[22]);
303 |

304 | 0 | step2[24] = _mm_add_epi16(step1[24], step1[25]);
305 | 0 | step2[25] = _mm_sub_epi16(step1[24], step1[25]);
306 | 0 | step2[26] = _mm_sub_epi16(step1[27], step1[26]);
307 | 0 | step2[27] = _mm_add_epi16(step1[27], step1[26]);
308 | 0 | step2[28] = _mm_add_epi16(step1[28], step1[29]);
309 | 0 | step2[29] = _mm_sub_epi16(step1[28], step1[29]);
310 | 0 | step2[30] = _mm_sub_epi16(step1[31], step1[30]);
311 | 0 | step2[31] = _mm_add_epi16(step1[31], step1[30]);
312 | | 
313 | | // stage 3: nodes 16, 19, 20, 23, 24, 27, 28, 31 are copied through; the other eight go through butterfly()
314 | 0 | step1[16] = step2[16];
315 | 0 | step1[31] = step2[31];
316 | 0 | butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17],
317 | 0 | &step1[30]);
318 | 0 | butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18],
319 | 0 | &step1[29]);
320 | 0 | step1[19] = step2[19];
321 | 0 | step1[20] = step2[20];
322 | 0 | butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21],
323 | 0 | &step1[26]);
324 | 0 | butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22],
325 | 0 | &step1[25]);
326 | 0 | step1[23] = step2[23];
327 | 0 | step1[24] = step2[24];
328 | 0 | step1[27] = step2[27];
329 | 0 | step1[28] = step2[28];
330 |

331 | 0 | idct32_8x32_quarter_3_4_stage_4_to_7(step1, out);  // stages 4-7: helper defined elsewhere, writes through out
332 | 0 | }
333 | | // Full 8x32 "135" idct: the quarter helpers write into a local buffer, and out is written only by the final stage.
334 | | void idct32_135_8x32_ssse3(const __m128i *const in /*in[32]*/,
335 | 0 | __m128i *const out /*out[32]*/) {
336 | 0 | __m128i temp[32];
337 | 0 | idct32_135_8x32_quarter_1_2(in, temp);
338 | 0 | idct32_135_8x32_quarter_3_4(in, temp);
339 | | // final stage: in is not read past this point (vpx_idct32x32_135_add_ssse3 passes one array as both in and out)
340 | 0 | add_sub_butterfly(temp, out, 32);
341 | 0 | }
342 | | // 32x32 inverse transform and output for the "135" case: a row pass into col[], then a column pass to dest.
343 | | void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest,
344 | 0 | int stride) {
345 | 0 | __m128i col[2][32], io[32];
346 | 0 | int i;
347 | | 
348 | | // rows: two iterations, each loading at input offsets 0 and 8 into io[0..] and io[8..]
349 | 0 | for (i = 0; i < 2; i++) {
350 | 0 | load_transpose_16bit_8x8(&input[0], 32, &io[0]);  // presumably an 8x8 tile with row pitch 32 -- confirm
351 | 0 | load_transpose_16bit_8x8(&input[8], 32, &io[8]);
352 | 0 | idct32_135_8x32_ssse3(io, col[i]);
353 | 0 | input += 32 << 3;  // advance by 8 * 32 coefficients
354 | 0 | }
355 | | 
356 | | // columns: four iterations, each transposing 8 vectors from col[0] and from col[1] into io[0..] and io[8..]
357 | 0 | for (i = 0; i < 32; i += 8) {
358 | 0 | transpose_16bit_8x8(col[0] + i, io);
359 | 0 | transpose_16bit_8x8(col[1] + i, io + 8);
360 | 0 | idct32_135_8x32_ssse3(io, io);  // in place: input and output are the same array
361 | 0 | store_buffer_8x32(io, dest, stride);  // defined elsewhere
362 | 0 | dest += 8;  // dest is uint8_t *, so the next iteration starts 8 bytes further along
363 | 0 | }
364 | 0 | }