/src/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2017 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #ifndef VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_ |
12 | | #define VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_ |
13 | | |
14 | | #include <tmmintrin.h> |
15 | | |
16 | | #include "./vpx_dsp_rtcd.h" |
17 | | #include "vpx_dsp/x86/inv_txfm_sse2.h" |
18 | | #include "vpx_dsp/x86/transpose_sse2.h" |
19 | | #include "vpx_dsp/x86/txfm_common_sse2.h" |
20 | | |
21 | 695k | static INLINE void idct8x8_12_add_kernel_ssse3(__m128i *const io /* io[8] */) { |
22 | 695k | const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64); |
23 | 695k | const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64); |
24 | 695k | const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64); |
25 | 695k | const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64); |
26 | 695k | const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
27 | 695k | const __m128i cospi_16_64d = _mm_set1_epi16((int16_t)(2 * cospi_16_64)); |
28 | 695k | const __m128i cospi_28_64d = _mm_set1_epi16((int16_t)(2 * cospi_28_64)); |
29 | 695k | const __m128i cospi_4_64d = _mm_set1_epi16((int16_t)(2 * cospi_4_64)); |
30 | 695k | const __m128i cospi_n20_64d = _mm_set1_epi16((int16_t)(-2 * cospi_20_64)); |
31 | 695k | const __m128i cospi_12_64d = _mm_set1_epi16((int16_t)(2 * cospi_12_64)); |
32 | 695k | const __m128i cospi_24_64d = _mm_set1_epi16((int16_t)(2 * cospi_24_64)); |
33 | 695k | const __m128i cospi_8_64d = _mm_set1_epi16((int16_t)(2 * cospi_8_64)); |
34 | 695k | __m128i step1[8], step2[8], tmp[4]; |
35 | | |
36 | | // pass 1 |
37 | | |
38 | 695k | transpose_16bit_4x4(io, io); |
39 | | // io[0]: 00 10 20 30 01 11 21 31 |
40 | | // io[1]: 02 12 22 32 03 13 23 33 |
41 | | |
42 | | // stage 1 |
43 | 695k | tmp[0] = _mm_unpacklo_epi64(io[0], io[0]); |
44 | 695k | tmp[1] = _mm_unpackhi_epi64(io[0], io[0]); |
45 | 695k | tmp[2] = _mm_unpacklo_epi64(io[1], io[1]); |
46 | 695k | tmp[3] = _mm_unpackhi_epi64(io[1], io[1]); |
47 | 695k | step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d); // step1 4&7 |
48 | 695k | step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d); // step1 5&6 |
49 | | |
50 | | // stage 2 |
51 | 695k | step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d); // step2 0&1 |
52 | 695k | step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d); // step2 3&2 |
53 | 695k | step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7 |
54 | 695k | step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6 |
55 | 695k | step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]); // step2 6 |
56 | | |
57 | | // stage 3 |
58 | 695k | tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]); |
59 | 695k | step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]); // step1 5&6 |
60 | 695k | tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1 |
61 | 695k | tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2 |
62 | 695k | step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1 |
63 | 695k | step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0 |
64 | | |
65 | | // stage 4 |
66 | 695k | tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0 |
67 | 695k | tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1 |
68 | 695k | tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7 |
69 | 695k | tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6 |
70 | | |
71 | | // pass 2 |
72 | | |
73 | 695k | idct8x8_12_transpose_16bit_4x8(tmp, io); |
74 | | |
75 | | // stage 1 |
76 | 695k | step1[4] = _mm_mulhrs_epi16(io[1], cospi_28_64d); |
77 | 695k | step1[7] = _mm_mulhrs_epi16(io[1], cospi_4_64d); |
78 | 695k | step1[5] = _mm_mulhrs_epi16(io[3], cospi_n20_64d); |
79 | 695k | step1[6] = _mm_mulhrs_epi16(io[3], cospi_12_64d); |
80 | | |
81 | | // stage 2 |
82 | 695k | step2[0] = _mm_mulhrs_epi16(io[0], cospi_16_64d); // step2[1] = step2[0] |
83 | 695k | step2[2] = _mm_mulhrs_epi16(io[2], cospi_24_64d); |
84 | 695k | step2[3] = _mm_mulhrs_epi16(io[2], cospi_8_64d); |
85 | 695k | step2[4] = _mm_add_epi16(step1[4], step1[5]); |
86 | 695k | step2[5] = _mm_sub_epi16(step1[4], step1[5]); |
87 | 695k | step2[6] = _mm_sub_epi16(step1[7], step1[6]); |
88 | 695k | step2[7] = _mm_add_epi16(step1[7], step1[6]); |
89 | | |
90 | | // stage 3 |
91 | 695k | step1[0] = _mm_add_epi16(step2[0], step2[3]); |
92 | 695k | step1[1] = _mm_add_epi16(step2[0], step2[2]); |
93 | 695k | step1[2] = _mm_sub_epi16(step2[0], step2[2]); |
94 | 695k | step1[3] = _mm_sub_epi16(step2[0], step2[3]); |
95 | 695k | butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); |
96 | | |
97 | | // stage 4 |
98 | 695k | io[0] = _mm_add_epi16(step1[0], step2[7]); |
99 | 695k | io[1] = _mm_add_epi16(step1[1], step1[6]); |
100 | 695k | io[2] = _mm_add_epi16(step1[2], step1[5]); |
101 | 695k | io[3] = _mm_add_epi16(step1[3], step2[4]); |
102 | 695k | io[4] = _mm_sub_epi16(step1[3], step2[4]); |
103 | 695k | io[5] = _mm_sub_epi16(step1[2], step1[5]); |
104 | 695k | io[6] = _mm_sub_epi16(step1[1], step1[6]); |
105 | 695k | io[7] = _mm_sub_epi16(step1[0], step2[7]); |
106 | 695k | } inv_txfm_ssse3.c:idct8x8_12_add_kernel_ssse3 Line | Count | Source | 21 | 695k | static INLINE void idct8x8_12_add_kernel_ssse3(__m128i *const io /* io[8] */) { | 22 | 695k | const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64); | 23 | 695k | const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64); | 24 | 695k | const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64); | 25 | 695k | const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64); | 26 | 695k | const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 27 | 695k | const __m128i cospi_16_64d = _mm_set1_epi16((int16_t)(2 * cospi_16_64)); | 28 | 695k | const __m128i cospi_28_64d = _mm_set1_epi16((int16_t)(2 * cospi_28_64)); | 29 | 695k | const __m128i cospi_4_64d = _mm_set1_epi16((int16_t)(2 * cospi_4_64)); | 30 | 695k | const __m128i cospi_n20_64d = _mm_set1_epi16((int16_t)(-2 * cospi_20_64)); | 31 | 695k | const __m128i cospi_12_64d = _mm_set1_epi16((int16_t)(2 * cospi_12_64)); | 32 | 695k | const __m128i cospi_24_64d = _mm_set1_epi16((int16_t)(2 * cospi_24_64)); | 33 | 695k | const __m128i cospi_8_64d = _mm_set1_epi16((int16_t)(2 * cospi_8_64)); | 34 | 695k | __m128i step1[8], step2[8], tmp[4]; | 35 | | | 36 | | // pass 1 | 37 | | | 38 | 695k | transpose_16bit_4x4(io, io); | 39 | | // io[0]: 00 10 20 30 01 11 21 31 | 40 | | // io[1]: 02 12 22 32 03 13 23 33 | 41 | | | 42 | | // stage 1 | 43 | 695k | tmp[0] = _mm_unpacklo_epi64(io[0], io[0]); | 44 | 695k | tmp[1] = _mm_unpackhi_epi64(io[0], io[0]); | 45 | 695k | tmp[2] = _mm_unpacklo_epi64(io[1], io[1]); | 46 | 695k | tmp[3] = _mm_unpackhi_epi64(io[1], io[1]); | 47 | 695k | step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d); // step1 4&7 | 48 | 695k | step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d); // step1 5&6 | 49 | | | 50 | | // stage 2 | 51 | 695k | step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d); // step2 0&1 | 52 | 695k | step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d); 
// step2 3&2 | 53 | 695k | step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7 | 54 | 695k | step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6 | 55 | 695k | step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]); // step2 6 | 56 | | | 57 | | // stage 3 | 58 | 695k | tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]); | 59 | 695k | step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]); // step1 5&6 | 60 | 695k | tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1 | 61 | 695k | tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2 | 62 | 695k | step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1 | 63 | 695k | step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0 | 64 | | | 65 | | // stage 4 | 66 | 695k | tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0 | 67 | 695k | tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1 | 68 | 695k | tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7 | 69 | 695k | tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6 | 70 | | | 71 | | // pass 2 | 72 | | | 73 | 695k | idct8x8_12_transpose_16bit_4x8(tmp, io); | 74 | | | 75 | | // stage 1 | 76 | 695k | step1[4] = _mm_mulhrs_epi16(io[1], cospi_28_64d); | 77 | 695k | step1[7] = _mm_mulhrs_epi16(io[1], cospi_4_64d); | 78 | 695k | step1[5] = _mm_mulhrs_epi16(io[3], cospi_n20_64d); | 79 | 695k | step1[6] = _mm_mulhrs_epi16(io[3], cospi_12_64d); | 80 | | | 81 | | // stage 2 | 82 | 695k | step2[0] = _mm_mulhrs_epi16(io[0], cospi_16_64d); // step2[1] = step2[0] | 83 | 695k | step2[2] = _mm_mulhrs_epi16(io[2], cospi_24_64d); | 84 | 695k | step2[3] = _mm_mulhrs_epi16(io[2], cospi_8_64d); | 85 | 695k | step2[4] = _mm_add_epi16(step1[4], step1[5]); | 86 | 695k | step2[5] = _mm_sub_epi16(step1[4], step1[5]); | 87 | 695k | step2[6] = _mm_sub_epi16(step1[7], step1[6]); | 88 | 695k | step2[7] = _mm_add_epi16(step1[7], step1[6]); | 89 | | | 90 | | // stage 3 | 91 | 695k | step1[0] = _mm_add_epi16(step2[0], step2[3]); | 92 | 695k | step1[1] = 
_mm_add_epi16(step2[0], step2[2]); | 93 | 695k | step1[2] = _mm_sub_epi16(step2[0], step2[2]); | 94 | 695k | step1[3] = _mm_sub_epi16(step2[0], step2[3]); | 95 | 695k | butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); | 96 | | | 97 | | // stage 4 | 98 | 695k | io[0] = _mm_add_epi16(step1[0], step2[7]); | 99 | 695k | io[1] = _mm_add_epi16(step1[1], step1[6]); | 100 | 695k | io[2] = _mm_add_epi16(step1[2], step1[5]); | 101 | 695k | io[3] = _mm_add_epi16(step1[3], step2[4]); | 102 | 695k | io[4] = _mm_sub_epi16(step1[3], step2[4]); | 103 | 695k | io[5] = _mm_sub_epi16(step1[2], step1[5]); | 104 | 695k | io[6] = _mm_sub_epi16(step1[1], step1[6]); | 105 | 695k | io[7] = _mm_sub_epi16(step1[0], step2[7]); | 106 | 695k | } |
Unexecuted instantiation: highbd_idct8x8_add_sse4.c:idct8x8_12_add_kernel_ssse3 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:idct8x8_12_add_kernel_ssse3 |
107 | | |
// Declaration only; the definition is not part of this header. Takes a
// pointer to read-only __m128i data (in) and a pointer to writable __m128i
// data (out).
// NOTE(review): the name suggests a 32-point inverse DCT over an 8x32 strip
// for blocks with at most 135 non-zero coefficients -- confirm against the
// definition; the required lengths of in and out are not stated here.
108 | | void idct32_135_8x32_ssse3(const __m128i *const in, __m128i *const out); |
109 | | |
110 | | #endif // VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_ |