/src/aom/av1/common/x86/av1_txfm_sse2.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | #ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ |
12 | | #define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ |
13 | | |
14 | | #include <emmintrin.h> // SSE2 |
15 | | |
16 | | #include "config/aom_config.h" |
17 | | #include "config/av1_rtcd.h" |
18 | | |
19 | | #include "aom/aom_integer.h" |
20 | | #include "aom_dsp/x86/transpose_sse2.h" |
21 | | #include "aom_dsp/x86/txfm_common_sse2.h" |
22 | | #include "av1/common/av1_txfm.h" |
23 | | |
24 | | #ifdef __cplusplus |
25 | | extern "C" { |
26 | | #endif |
27 | | |
// Butterfly over the low four 16-bit lanes of *in0 and *in1.
//
// The low lanes are interleaved as (in0[i], in1[i]) pairs and each pair is
// multiplied-and-summed against the matching 16-bit pair of *w0 and of *w1
// (_mm_madd_epi16), giving four 32-bit sums per weight vector. __rounding is
// added to every sum, the result is shifted right arithmetically by cos_bit,
// and the 32-bit values are packed back to 16 bits with signed saturation.
//
//   w0, w1      weight vectors, read through the pointers.
//   __rounding  value added to each 32-bit sum before the shift
//               (presumably 1 << (cos_bit - 1) per lane; verify at callers).
//   cos_bit     right-shift amount.
//   out0        low four lanes = w0 results (high four lanes repeat them).
//   out1        low four lanes = w1 results; the high four lanes carry the
//               w0 results, because the final pack uses (d0, c0).
//
// NOTE(review): the sibling macro btf_16_4p_sse2 packs (d0, d0) for out1.
// Only the high four lanes differ; presumably callers of this "w4" helper
// consume just the low four lanes. Confirm before relying on the high half.
//
// Coverage: every row has count 0; the function was never executed in the
// three translation units listed on the closing-brace row.
28 | | static inline void btf_16_w4_sse2( |
29 | | const __m128i *const w0, const __m128i *const w1, const __m128i __rounding, |
30 | | const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1, |
31 | 0 | __m128i *const out0, __m128i *const out1) { |
32 | 0 | const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1); |
33 | 0 | const __m128i u0 = _mm_madd_epi16(t0, *w0); |
34 | 0 | const __m128i v0 = _mm_madd_epi16(t0, *w1); |
35 | 0 | const __m128i a0 = _mm_add_epi32(u0, __rounding); |
36 | 0 | const __m128i b0 = _mm_add_epi32(v0, __rounding); |
37 | 0 | const __m128i c0 = _mm_srai_epi32(a0, cos_bit); |
38 | 0 | const __m128i d0 = _mm_srai_epi32(b0, cos_bit); |
39 | 0 |
|
40 | 0 | *out0 = _mm_packs_epi32(c0, c0); |
41 | 0 | *out1 = _mm_packs_epi32(d0, c0); |
42 | 0 | } Unexecuted instantiation: av1_inv_txfm_ssse3.c:btf_16_w4_sse2 Unexecuted instantiation: highbd_inv_txfm_sse4.c:btf_16_w4_sse2 Unexecuted instantiation: av1_inv_txfm_avx2.c:btf_16_w4_sse2 |
43 | | |
// Macro form of the four-lane butterfly: interleaves the low four 16-bit
// lanes of in0/in1, multiply-adds the pairs against w0 and w1, adds
// __rounding, shifts right arithmetically by cos_bit and packs to 16 bits with
// signed saturation. out0 receives the w0 results and out1 the w1 results,
// each duplicated into both 64-bit halves (the packs use (c0, c0) / (d0, d0)).
//
// The expansion refers to `__rounding` and `cos_bit`, which are not macro
// parameters: both names must be in scope wherever the macro is used.
// out0 and out1 are assigned to, so they must be modifiable lvalues.
// The body declares locals t0, u0, v0, a0, b0, c0, d0 inside a do/while(0)
// block; an argument expression that uses one of those names would be
// captured by the local declaration.
44 | | #define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \ |
45 | 16.0M | do { \ |
46 | 16.0M | __m128i t0 = _mm_unpacklo_epi16(in0, in1); \ |
47 | 16.0M | __m128i u0 = _mm_madd_epi16(t0, w0); \ |
48 | 16.0M | __m128i v0 = _mm_madd_epi16(t0, w1); \ |
49 | 16.0M | \ |
50 | 16.0M | __m128i a0 = _mm_add_epi32(u0, __rounding); \ |
51 | 16.0M | __m128i b0 = _mm_add_epi32(v0, __rounding); \ |
52 | 16.0M | \ |
53 | 16.0M | __m128i c0 = _mm_srai_epi32(a0, cos_bit); \ |
54 | 16.0M | __m128i d0 = _mm_srai_epi32(b0, cos_bit); \ |
55 | 16.0M | \ |
56 | 16.0M | out0 = _mm_packs_epi32(c0, c0); \ |
57 | 16.0M | out1 = _mm_packs_epi32(d0, d0); \ |
58 | 16.0M | } while (0) |
59 | | |
// Macro form of the full eight-lane butterfly. The low and high halves of
// in0/in1 are interleaved separately (t0 / t1), each half is multiply-added
// against w0 and against w1, __rounding is added, the sums are shifted right
// arithmetically by cos_bit, and the low/high results are packed back into
// eight 16-bit lanes with signed saturation.
//   out0 = pack(w0 results for lanes 0..3, w0 results for lanes 4..7)
//   out1 = pack(w1 results for lanes 0..3, w1 results for lanes 4..7)
//
// The expansion refers to `__rounding` and `cos_bit`, which are not macro
// parameters: both names must be in scope wherever the macro is used.
// in0, in1, w0 and w1 each appear twice in the expansion, so argument
// expressions with side effects would be evaluated twice.
// The body declares locals t0/t1, u0/u1, v0/v1, a0/a1, b0/b1, c0/c1, d0/d1;
// an argument expression that uses one of those names would be captured.
60 | | #define btf_16_sse2(w0, w1, in0, in1, out0, out1) \ |
61 | 30.0M | do { \ |
62 | 30.0M | __m128i t0 = _mm_unpacklo_epi16(in0, in1); \ |
63 | 30.0M | __m128i t1 = _mm_unpackhi_epi16(in0, in1); \ |
64 | 30.0M | __m128i u0 = _mm_madd_epi16(t0, w0); \ |
65 | 30.0M | __m128i u1 = _mm_madd_epi16(t1, w0); \ |
66 | 30.0M | __m128i v0 = _mm_madd_epi16(t0, w1); \ |
67 | 30.0M | __m128i v1 = _mm_madd_epi16(t1, w1); \ |
68 | 30.0M | \ |
69 | 30.0M | __m128i a0 = _mm_add_epi32(u0, __rounding); \ |
70 | 30.0M | __m128i a1 = _mm_add_epi32(u1, __rounding); \ |
71 | 30.0M | __m128i b0 = _mm_add_epi32(v0, __rounding); \ |
72 | 30.0M | __m128i b1 = _mm_add_epi32(v1, __rounding); \ |
73 | 30.0M | \ |
74 | 30.0M | __m128i c0 = _mm_srai_epi32(a0, cos_bit); \ |
75 | 30.0M | __m128i c1 = _mm_srai_epi32(a1, cos_bit); \ |
76 | 30.0M | __m128i d0 = _mm_srai_epi32(b0, cos_bit); \ |
77 | 30.0M | __m128i d1 = _mm_srai_epi32(b1, cos_bit); \ |
78 | 30.0M | \ |
79 | 30.0M | out0 = _mm_packs_epi32(c0, c1); \ |
80 | 30.0M | out1 = _mm_packs_epi32(d0, d1); \ |
81 | 30.0M | } while (0) |
82 | | |
// Loads eight int16_t values starting at `a` into one 128-bit register.
// Uses the aligned load (_mm_load_si128), so `a` must be 16-byte aligned.
83 | 0 | static inline __m128i load_16bit_to_16bit(const int16_t *a) { |
84 | 0 | return _mm_load_si128((const __m128i *)a); |
85 | 0 | } Unexecuted instantiation: av1_inv_txfm_ssse3.c:load_16bit_to_16bit Unexecuted instantiation: highbd_inv_txfm_sse4.c:load_16bit_to_16bit Unexecuted instantiation: av1_inv_txfm_avx2.c:load_16bit_to_16bit |
86 | | |
// Loads eight int32_t values a[0..7] and narrows them to eight int16_t lanes
// with signed saturation (_mm_packs_epi32).
// a[0..3] is read with an aligned load and a[4..7] by dereferencing an
// __m128i pointer, so `a` must be 16-byte aligned.
// Returns lanes 0..3 from a[0..3] and lanes 4..7 from a[4..7].
87 | 17.2M | static inline __m128i load_32bit_to_16bit(const int32_t *a) { |
88 | 17.2M | const __m128i a_low = _mm_load_si128((const __m128i *)a); |
89 | 17.2M | return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4)); |
90 | 17.2M | } av1_inv_txfm_ssse3.c:load_32bit_to_16bit Line | Count | Source | 87 | 17.2M | static inline __m128i load_32bit_to_16bit(const int32_t *a) { | 88 | 17.2M | const __m128i a_low = _mm_load_si128((const __m128i *)a); | 89 | 17.2M | return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4)); | 90 | 17.2M | } |
Unexecuted instantiation: highbd_inv_txfm_sse4.c:load_32bit_to_16bit Unexecuted instantiation: av1_inv_txfm_avx2.c:load_32bit_to_16bit |
91 | | |
// Loads four int32_t values a[0..3] (aligned load, so `a` must be 16-byte
// aligned) and narrows them to int16_t with signed saturation.
// The same source register is packed with itself, so the four results appear
// in lanes 0..3 and are repeated in lanes 4..7.
92 | 14.9M | static inline __m128i load_32bit_to_16bit_w4(const int32_t *a) { |
93 | 14.9M | const __m128i a_low = _mm_load_si128((const __m128i *)a); |
94 | 14.9M | return _mm_packs_epi32(a_low, a_low); |
95 | 14.9M | } av1_inv_txfm_ssse3.c:load_32bit_to_16bit_w4 Line | Count | Source | 92 | 14.9M | static inline __m128i load_32bit_to_16bit_w4(const int32_t *a) { | 93 | 14.9M | const __m128i a_low = _mm_load_si128((const __m128i *)a); | 94 | 14.9M | return _mm_packs_epi32(a_low, a_low); | 95 | 14.9M | } |
Unexecuted instantiation: highbd_inv_txfm_sse4.c:load_32bit_to_16bit_w4 Unexecuted instantiation: av1_inv_txfm_avx2.c:load_32bit_to_16bit_w4 |
96 | | |
97 | | // Store 4 16 bit values. Sign extend the values. |
// Writes the low four 16-bit lanes of `a` to b[0..3] as sign-extended int32_t.
// Sign extension trick: unpacking `a` with itself places each 16-bit value in
// both halves of a 32-bit lane; an arithmetic right shift by 16 then leaves
// the value in the low half with its sign replicated through the high half.
// The store is aligned (_mm_store_si128), so `b` must be 16-byte aligned.
98 | 0 | static inline void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) { |
99 | 0 | const __m128i a_lo = _mm_unpacklo_epi16(a, a); |
100 | 0 | const __m128i a_1 = _mm_srai_epi32(a_lo, 16); |
101 | 0 | _mm_store_si128((__m128i *)b, a_1); |
102 | 0 | } Unexecuted instantiation: av1_inv_txfm_ssse3.c:store_16bit_to_32bit_w4 Unexecuted instantiation: highbd_inv_txfm_sse4.c:store_16bit_to_32bit_w4 Unexecuted instantiation: av1_inv_txfm_avx2.c:store_16bit_to_32bit_w4 |
103 | | |
104 | | // Store 8 16 bit values. Sign extend the values. |
// Writes all eight 16-bit lanes of `a` to b[0..7] as sign-extended int32_t.
// Each half of `a` is unpacked with itself and shifted right arithmetically
// by 16, which sign-extends every 16-bit value to 32 bits.
// Lanes 0..3 go to b[0..3] and lanes 4..7 to b[4..7]; both stores are aligned
// (_mm_store_si128), so `b` must be 16-byte aligned.
105 | 0 | static inline void store_16bit_to_32bit(__m128i a, int32_t *b) { |
106 | 0 | const __m128i a_lo = _mm_unpacklo_epi16(a, a); |
107 | 0 | const __m128i a_hi = _mm_unpackhi_epi16(a, a); |
108 | 0 | const __m128i a_1 = _mm_srai_epi32(a_lo, 16); |
109 | 0 | const __m128i a_2 = _mm_srai_epi32(a_hi, 16); |
110 | 0 | _mm_store_si128((__m128i *)b, a_1); |
111 | 0 | _mm_store_si128((__m128i *)(b + 4), a_2); |
112 | 0 | } Unexecuted instantiation: av1_inv_txfm_ssse3.c:store_16bit_to_32bit Unexecuted instantiation: highbd_inv_txfm_sse4.c:store_16bit_to_32bit Unexecuted instantiation: av1_inv_txfm_avx2.c:store_16bit_to_32bit |
113 | | |
// Multiply-adds the 16-bit pairs of `a` against a constant pair built from
// `scale` and the rounding term 1 << (NewSqrt2Bits - 1), then shifts each
// 32-bit sum right arithmetically by NewSqrt2Bits. Returns four 32-bit lanes.
// pair_set_epi16 and NewSqrt2Bits are defined outside this header.
// NOTE(review): presumably pair_set_epi16(x, y) puts x in the even and y in
// the odd 16-bit lane of every pair, so with input pairs of (value, 1) each
// result is (value * scale + rounding) >> NewSqrt2Bits. Confirm against the
// helper's definition.
114 | 0 | static inline __m128i scale_round_sse2(const __m128i a, const int scale) { |
115 | 0 | const __m128i scale_rounding = pair_set_epi16(scale, 1 << (NewSqrt2Bits - 1)); |
116 | 0 | const __m128i b = _mm_madd_epi16(a, scale_rounding); |
117 | 0 | return _mm_srai_epi32(b, NewSqrt2Bits); |
118 | 0 | } Unexecuted instantiation: av1_inv_txfm_ssse3.c:scale_round_sse2 Unexecuted instantiation: highbd_inv_txfm_sse4.c:scale_round_sse2 Unexecuted instantiation: av1_inv_txfm_avx2.c:scale_round_sse2 |
119 | | |
// Scales the low four 16-bit lanes of `a` by NewSqrt2 via scale_round_sse2
// and stores the four 32-bit results to b[0..3].
// Each value is interleaved with the constant 1 so that the multiply-add in
// scale_round_sse2 can fold the rounding term into the same instruction.
// The store is aligned (_mm_store_si128), so `b` must be 16-byte aligned.
// NOTE(review): NewSqrt2 is defined elsewhere; presumably a fixed-point
// sqrt(2) with NewSqrt2Bits fractional bits. Confirm.
120 | | static inline void store_rect_16bit_to_32bit_w4(const __m128i a, |
121 | 0 | int32_t *const b) { |
122 | 0 | const __m128i one = _mm_set1_epi16(1); |
123 | 0 | const __m128i a_lo = _mm_unpacklo_epi16(a, one); |
124 | 0 | const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2); |
125 | 0 | _mm_store_si128((__m128i *)b, b_lo); |
126 | 0 | } Unexecuted instantiation: av1_inv_txfm_ssse3.c:store_rect_16bit_to_32bit_w4 Unexecuted instantiation: highbd_inv_txfm_sse4.c:store_rect_16bit_to_32bit_w4 Unexecuted instantiation: av1_inv_txfm_avx2.c:store_rect_16bit_to_32bit_w4 |
127 | | |
// Scales all eight 16-bit lanes of `a` by NewSqrt2 via scale_round_sse2 and
// stores the eight 32-bit results to b[0..7].
// Low and high halves are each interleaved with the constant 1 so that the
// multiply-add in scale_round_sse2 can fold the rounding term in.
// Lanes 0..3 go to b[0..3] and lanes 4..7 to b[4..7]; both stores are aligned
// (_mm_store_si128), so `b` must be 16-byte aligned.
// NOTE(review): NewSqrt2 is defined elsewhere; presumably a fixed-point
// sqrt(2) with NewSqrt2Bits fractional bits. Confirm.
128 | | static inline void store_rect_16bit_to_32bit(const __m128i a, |
129 | 0 | int32_t *const b) { |
130 | 0 | const __m128i one = _mm_set1_epi16(1); |
131 | 0 | const __m128i a_lo = _mm_unpacklo_epi16(a, one); |
132 | 0 | const __m128i a_hi = _mm_unpackhi_epi16(a, one); |
133 | 0 | const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2); |
134 | 0 | const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2); |
135 | 0 | _mm_store_si128((__m128i *)b, b_lo); |
136 | 0 | _mm_store_si128((__m128i *)(b + 4), b_hi); |
137 | 0 | } Unexecuted instantiation: av1_inv_txfm_ssse3.c:store_rect_16bit_to_32bit Unexecuted instantiation: highbd_inv_txfm_sse4.c:store_rect_16bit_to_32bit Unexecuted instantiation: av1_inv_txfm_avx2.c:store_rect_16bit_to_32bit |
138 | | |
// Loads out_size rows of four int16_t values each.
// Row i is read from in + i * stride (stride counted in int16_t elements)
// with a 64-bit load (_mm_loadl_epi64): the four values land in the low half
// of out[i] and the high half is zeroed.
// A non-positive out_size makes the loop a no-op.
139 | | static inline void load_buffer_16bit_to_16bit_w4(const int16_t *const in, |
140 | | const int stride, |
141 | | __m128i *const out, |
142 | 0 | const int out_size) { |
143 | 0 | for (int i = 0; i < out_size; ++i) { |
144 | 0 | out[i] = _mm_loadl_epi64((const __m128i *)(in + i * stride)); |
145 | 0 | } |
146 | 0 | } Unexecuted instantiation: av1_inv_txfm_ssse3.c:load_buffer_16bit_to_16bit_w4 Unexecuted instantiation: highbd_inv_txfm_sse4.c:load_buffer_16bit_to_16bit_w4 Unexecuted instantiation: av1_inv_txfm_avx2.c:load_buffer_16bit_to_16bit_w4 |
147 | | |
// Loads out_size rows of four int16_t values each, storing them in reverse
// row order: input row i (at in + i * stride, stride in int16_t elements) is
// written to out[out_size - 1 - i].
// Each row is a 64-bit load (_mm_loadl_epi64): the values occupy the low half
// of the register and the high half is zeroed.
148 | | static inline void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in, |
149 | | const int stride, |
150 | | __m128i *const out, |
151 | 0 | const int out_size) { |
152 | 0 | for (int i = 0; i < out_size; ++i) { |
153 | 0 | out[out_size - i - 1] = _mm_loadl_epi64((const __m128i *)(in + i * stride)); |
154 | 0 | } |
155 | 0 | } Unexecuted instantiation: av1_inv_txfm_ssse3.c:load_buffer_16bit_to_16bit_w4_flip Unexecuted instantiation: highbd_inv_txfm_sse4.c:load_buffer_16bit_to_16bit_w4_flip Unexecuted instantiation: av1_inv_txfm_avx2.c:load_buffer_16bit_to_16bit_w4_flip |
156 | | |
// Loads out_size rows of eight int16_t values each into out[0..out_size-1].
// Row i is read from in + i * stride (stride counted in int16_t elements)
// through load_16bit_to_16bit, which performs an aligned 128-bit load, so
// every row address must be 16-byte aligned.
157 | | static inline void load_buffer_16bit_to_16bit(const int16_t *in, int stride, |
158 | 0 | __m128i *out, int out_size) { |
159 | 0 | for (int i = 0; i < out_size; ++i) { |
160 | 0 | out[i] = load_16bit_to_16bit(in + i * stride); |
161 | 0 | } |
162 | 0 | } Unexecuted instantiation: av1_inv_txfm_ssse3.c:load_buffer_16bit_to_16bit Unexecuted instantiation: highbd_inv_txfm_sse4.c:load_buffer_16bit_to_16bit Unexecuted instantiation: av1_inv_txfm_avx2.c:load_buffer_16bit_to_16bit |
163 | | |
// Loads out_size rows of eight int16_t values each, storing them in reverse
// row order: input row i (at in + i * stride, stride in int16_t elements) is
// written to out[out_size - 1 - i].
// Rows are read through load_16bit_to_16bit (aligned 128-bit load), so every
// row address must be 16-byte aligned.
164 | | static inline void load_buffer_16bit_to_16bit_flip(const int16_t *in, |
165 | | int stride, __m128i *out, |
166 | 0 | int out_size) { |
167 | 0 | for (int i = 0; i < out_size; ++i) { |
168 | 0 | out[out_size - i - 1] = load_16bit_to_16bit(in + i * stride); |
169 | 0 | } |
170 | 0 | } Unexecuted instantiation: av1_inv_txfm_ssse3.c:load_buffer_16bit_to_16bit_flip Unexecuted instantiation: highbd_inv_txfm_sse4.c:load_buffer_16bit_to_16bit_flip Unexecuted instantiation: av1_inv_txfm_avx2.c:load_buffer_16bit_to_16bit_flip |
171 | | |
// Loads out_size rows of eight int32_t values each and narrows every row to
// eight int16_t lanes (signed saturation) via load_32bit_to_16bit.
// Row i is read from in + i * stride (stride counted in int32_t elements);
// the helper uses aligned 128-bit accesses, so every row address must be
// 16-byte aligned.
172 | | static inline void load_buffer_32bit_to_16bit(const int32_t *in, int stride, |
173 | 2.12M | __m128i *out, int out_size) { |
174 | 16.5M | for (int i = 0; i < out_size; ++i) { |
175 | 14.3M | out[i] = load_32bit_to_16bit(in + i * stride); |
176 | 14.3M | } |
177 | 2.12M | } av1_inv_txfm_ssse3.c:load_buffer_32bit_to_16bit Line | Count | Source | 173 | 2.12M | __m128i *out, int out_size) { | 174 | 16.5M | for (int i = 0; i < out_size; ++i) { | 175 | 14.3M | out[i] = load_32bit_to_16bit(in + i * stride); | 176 | 14.3M | } | 177 | 2.12M | } |
Unexecuted instantiation: highbd_inv_txfm_sse4.c:load_buffer_32bit_to_16bit Unexecuted instantiation: av1_inv_txfm_avx2.c:load_buffer_32bit_to_16bit |
178 | | |
// Loads out_size rows of four int32_t values each and narrows every row to
// int16_t (signed saturation) via load_32bit_to_16bit_w4, which leaves the
// four results in lanes 0..3 and repeats them in lanes 4..7.
// Row i is read from in + i * stride (stride counted in int32_t elements);
// the helper uses an aligned 128-bit load, so every row address must be
// 16-byte aligned.
179 | | static inline void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride, |
180 | 1.64M | __m128i *out, int out_size) { |
181 | 16.5M | for (int i = 0; i < out_size; ++i) { |
182 | 14.9M | out[i] = load_32bit_to_16bit_w4(in + i * stride); |
183 | 14.9M | } |
184 | 1.64M | } av1_inv_txfm_ssse3.c:load_buffer_32bit_to_16bit_w4 Line | Count | Source | 180 | 1.64M | __m128i *out, int out_size) { | 181 | 16.5M | for (int i = 0; i < out_size; ++i) { | 182 | 14.9M | out[i] = load_32bit_to_16bit_w4(in + i * stride); | 183 | 14.9M | } | 184 | 1.64M | } |
Unexecuted instantiation: highbd_inv_txfm_sse4.c:load_buffer_32bit_to_16bit_w4 Unexecuted instantiation: av1_inv_txfm_avx2.c:load_buffer_32bit_to_16bit_w4 |
185 | | |
// Loads out_size rows of eight int32_t values each, narrows every row to
// eight int16_t lanes (signed saturation) via load_32bit_to_16bit, and stores
// the rows in reverse order: input row i (at in + i * stride, stride in
// int32_t elements) is written to out[out_size - 1 - i].
// The helper uses aligned 128-bit accesses, so every row address must be
// 16-byte aligned.
186 | | static inline void load_buffer_32bit_to_16bit_flip(const int32_t *in, |
187 | | int stride, __m128i *out, |
188 | 0 | int out_size) { |
189 | 0 | for (int i = 0; i < out_size; ++i) { |
190 | 0 | out[out_size - i - 1] = load_32bit_to_16bit(in + i * stride); |
191 | 0 | } |
192 | 0 | } Unexecuted instantiation: av1_inv_txfm_ssse3.c:load_buffer_32bit_to_16bit_flip Unexecuted instantiation: highbd_inv_txfm_sse4.c:load_buffer_32bit_to_16bit_flip Unexecuted instantiation: av1_inv_txfm_avx2.c:load_buffer_32bit_to_16bit_flip |
193 | | |
// Stores out_size rows, four values per row: the low four 16-bit lanes of
// in[i] are sign-extended to int32_t by store_16bit_to_32bit_w4 and written
// at out + i * stride (stride counted in int32_t elements).
// The helper uses an aligned 128-bit store, so every destination row must be
// 16-byte aligned.
194 | | static inline void store_buffer_16bit_to_32bit_w4(const __m128i *const in, |
195 | | int32_t *const out, |
196 | | const int stride, |
197 | 0 | const int out_size) { |
198 | 0 | for (int i = 0; i < out_size; ++i) { |
199 | 0 | store_16bit_to_32bit_w4(in[i], out + i * stride); |
200 | 0 | } |
201 | 0 | } Unexecuted instantiation: av1_inv_txfm_ssse3.c:store_buffer_16bit_to_32bit_w4 Unexecuted instantiation: highbd_inv_txfm_sse4.c:store_buffer_16bit_to_32bit_w4 Unexecuted instantiation: av1_inv_txfm_avx2.c:store_buffer_16bit_to_32bit_w4 |
202 | | |
// Stores out_size rows, eight values per row: all eight 16-bit lanes of in[i]
// are sign-extended to int32_t by store_16bit_to_32bit and written at
// out + i * stride (stride counted in int32_t elements).
// The helper uses aligned 128-bit stores, so every destination row must be
// 16-byte aligned.
203 | | static inline void store_buffer_16bit_to_32bit_w8(const __m128i *const in, |
204 | | int32_t *const out, |
205 | | const int stride, |
206 | 0 | const int out_size) { |
207 | 0 | for (int i = 0; i < out_size; ++i) { |
208 | 0 | store_16bit_to_32bit(in[i], out + i * stride); |
209 | 0 | } |
210 | 0 | } Unexecuted instantiation: av1_inv_txfm_ssse3.c:store_buffer_16bit_to_32bit_w8 Unexecuted instantiation: highbd_inv_txfm_sse4.c:store_buffer_16bit_to_32bit_w8 Unexecuted instantiation: av1_inv_txfm_avx2.c:store_buffer_16bit_to_32bit_w8 |
211 | | |
// Stores out_size rows, four values per row: the low four 16-bit lanes of
// in[i] are scaled and rounded by store_rect_16bit_to_32bit_w4 (NewSqrt2
// scaling) and written as int32_t at out + i * stride (stride counted in
// int32_t elements).
// The helper uses an aligned 128-bit store, so every destination row must be
// 16-byte aligned.
212 | | static inline void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in, |
213 | | int32_t *const out, |
214 | | const int stride, |
215 | 0 | const int out_size) { |
216 | 0 | for (int i = 0; i < out_size; ++i) { |
217 | 0 | store_rect_16bit_to_32bit_w4(in[i], out + i * stride); |
218 | 0 | } |
219 | 0 | } Unexecuted instantiation: av1_inv_txfm_ssse3.c:store_rect_buffer_16bit_to_32bit_w4 Unexecuted instantiation: highbd_inv_txfm_sse4.c:store_rect_buffer_16bit_to_32bit_w4 Unexecuted instantiation: av1_inv_txfm_avx2.c:store_rect_buffer_16bit_to_32bit_w4 |
220 | | |
// Stores out_size rows, eight values per row: all eight 16-bit lanes of in[i]
// are scaled and rounded by store_rect_16bit_to_32bit (NewSqrt2 scaling) and
// written as int32_t at out + i * stride (stride counted in int32_t
// elements).
// The helper uses aligned 128-bit stores, so every destination row must be
// 16-byte aligned.
221 | | static inline void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in, |
222 | | int32_t *const out, |
223 | | const int stride, |
224 | 0 | const int out_size) { |
225 | 0 | for (int i = 0; i < out_size; ++i) { |
226 | 0 | store_rect_16bit_to_32bit(in[i], out + i * stride); |
227 | 0 | } |
228 | 0 | } Unexecuted instantiation: av1_inv_txfm_ssse3.c:store_rect_buffer_16bit_to_32bit_w8 Unexecuted instantiation: highbd_inv_txfm_sse4.c:store_rect_buffer_16bit_to_32bit_w8 Unexecuted instantiation: av1_inv_txfm_avx2.c:store_rect_buffer_16bit_to_32bit_w8 |
229 | | |
// Stores exactly eight registers in[0..7] as eight rows of eight 16-bit
// values. Row i is written at out + i * stride (stride counted in uint16_t
// elements) with an aligned 128-bit store, so every destination row must be
// 16-byte aligned. The register contents are written unchanged.
230 | | static inline void store_buffer_16bit_to_16bit_8x8(const __m128i *in, |
231 | | uint16_t *out, |
232 | 0 | const int stride) { |
233 | 0 | for (int i = 0; i < 8; ++i) { |
234 | 0 | _mm_store_si128((__m128i *)(out + i * stride), in[i]); |
235 | 0 | } |
236 | 0 | } Unexecuted instantiation: av1_inv_txfm_ssse3.c:store_buffer_16bit_to_16bit_8x8 Unexecuted instantiation: highbd_inv_txfm_sse4.c:store_buffer_16bit_to_16bit_8x8 Unexecuted instantiation: av1_inv_txfm_avx2.c:store_buffer_16bit_to_16bit_8x8 |
237 | | |
// Shifts every 16-bit lane of in[0..size-1] in place; the sign of `bit`
// selects the direction:
//   bit < 0   rounding right shift by -bit: 1 << (-bit - 1) is added with
//             signed saturation (_mm_adds_epi16), then the lane is shifted
//             right arithmetically.
//   bit > 0   plain left shift by bit (_mm_slli_epi16); bits shifted out of
//             the 16-bit lane are lost.
//   bit == 0  the buffer is left untouched.
238 | 0 | static inline void round_shift_16bit(__m128i *in, int size, int bit) { |
239 | 0 | if (bit < 0) { |
240 | 0 | bit = -bit; |
241 | 0 | __m128i rounding = _mm_set1_epi16(1 << (bit - 1)); |
242 | 0 | for (int i = 0; i < size; ++i) { |
243 | 0 | in[i] = _mm_adds_epi16(in[i], rounding); |
244 | 0 | in[i] = _mm_srai_epi16(in[i], bit); |
245 | 0 | } |
246 | 0 | } else if (bit > 0) { |
247 | 0 | for (int i = 0; i < size; ++i) { |
248 | 0 | in[i] = _mm_slli_epi16(in[i], bit); |
249 | 0 | } |
250 | 0 | } |
251 | 0 | } Unexecuted instantiation: av1_inv_txfm_ssse3.c:round_shift_16bit Unexecuted instantiation: highbd_inv_txfm_sse4.c:round_shift_16bit Unexecuted instantiation: av1_inv_txfm_avx2.c:round_shift_16bit |
252 | | |
// Copies `size` registers from `in` to `out` in reverse order:
// out[size - 1 - i] = in[i].
// Not an in-place reversal: if `in` and `out` refer to the same array, later
// iterations read elements that earlier iterations have already overwritten.
253 | 363k | static inline void flip_buf_sse2(__m128i *in, __m128i *out, int size) { |
254 | 3.37M | for (int i = 0; i < size; ++i) { |
255 | 3.01M | out[size - i - 1] = in[i]; |
256 | 3.01M | } |
257 | 363k | } av1_inv_txfm_ssse3.c:flip_buf_sse2 Line | Count | Source | 253 | 222k | static inline void flip_buf_sse2(__m128i *in, __m128i *out, int size) { | 254 | 1.72M | for (int i = 0; i < size; ++i) { | 255 | 1.50M | out[size - i - 1] = in[i]; | 256 | 1.50M | } | 257 | 222k | } |
highbd_inv_txfm_sse4.c:flip_buf_sse2 Line | Count | Source | 253 | 141k | static inline void flip_buf_sse2(__m128i *in, __m128i *out, int size) { | 254 | 1.65M | for (int i = 0; i < size; ++i) { | 255 | 1.51M | out[size - i - 1] = in[i]; | 256 | 1.51M | } | 257 | 141k | } |
Unexecuted instantiation: av1_inv_txfm_avx2.c:flip_buf_sse2 |
258 | | |
// Declarations of the av1_lowbd_fwd_txfm2d_<W>x<H>_sse2 entry points. Only the
// prototypes live in this header; the definitions are in another translation
// unit. All sixteen share one signature: int16_t input, int32_t output, an
// int stride, a TX_TYPE selector and an int `bd`.
// NOTE(review): presumably <W>x<H> is the transform block size, `stride` is
// the input row pitch in int16_t elements and `bd` is the bit depth. None of
// that is visible from the declarations; confirm against the definitions.
259 | | void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output, |
260 | | int stride, TX_TYPE tx_type, int bd); |
261 | | |
262 | | void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output, |
263 | | int stride, TX_TYPE tx_type, int bd); |
264 | | |
265 | | void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output, |
266 | | int stride, TX_TYPE tx_type, int bd); |
267 | | |
268 | | void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output, |
269 | | int stride, TX_TYPE tx_type, int bd); |
270 | | |
271 | | void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output, |
272 | | int stride, TX_TYPE tx_type, int bd); |
273 | | |
274 | | void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output, |
275 | | int stride, TX_TYPE tx_type, int bd); |
276 | | |
277 | | void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output, |
278 | | int stride, TX_TYPE tx_type, int bd); |
279 | | |
280 | | void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output, |
281 | | int stride, TX_TYPE tx_type, int bd); |
282 | | |
283 | | void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output, |
284 | | int stride, TX_TYPE tx_type, int bd); |
285 | | |
286 | | void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output, |
287 | | int stride, TX_TYPE tx_type, int bd); |
288 | | |
289 | | void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output, |
290 | | int stride, TX_TYPE tx_type, int bd); |
291 | | |
292 | | void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output, |
293 | | int stride, TX_TYPE tx_type, int bd); |
294 | | |
295 | | void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output, |
296 | | int stride, TX_TYPE tx_type, int bd); |
297 | | |
298 | | void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output, |
299 | | int stride, TX_TYPE tx_type, int bd); |
300 | | |
301 | | void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output, |
302 | | int stride, TX_TYPE tx_type, int bd); |
303 | | |
304 | | void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output, |
305 | | int stride, TX_TYPE tx_type, int bd); |
306 | | |
// Pointer type for a 1-D transform kernel: reads an array of __m128i, writes
// an array of __m128i and takes a cos_bit argument.
307 | | typedef void (*transform_1d_sse2)(const __m128i *input, __m128i *output, |
308 | | int8_t cos_bit); |
309 | | |
// 8-point kernels declared here and defined elsewhere. They take no cos_bit
// parameter, so their signature differs from transform_1d_sse2 and they
// cannot be assigned to that pointer type without a cast.
310 | | void av1_iadst8_sse2(const __m128i *input, __m128i *output); |
311 | | |
312 | | void av1_idct8_sse2(const __m128i *input, __m128i *output); |
313 | | |
// Pair of 1-D kernels describing a 2-D transform: `col` for the vertical pass
// and `row` for the horizontal pass (per the field comment below).
314 | | typedef struct { |
315 | | transform_1d_sse2 col, row; // vertical and horizontal |
316 | | } transform_2d_sse2; |
317 | | |
318 | | #ifdef __cplusplus |
319 | | } |
320 | | #endif // __cplusplus |
321 | | #endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ |