Coverage Report

Created: 2025-06-13 07:07

/src/aom/av1/common/x86/av1_txfm_sse2.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
12
#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
13
14
#include <emmintrin.h>  // SSE2
15
16
#include "config/aom_config.h"
17
#include "config/av1_rtcd.h"
18
19
#include "aom/aom_integer.h"
20
#include "aom_dsp/x86/transpose_sse2.h"
21
#include "aom_dsp/x86/txfm_common_sse2.h"
22
#include "av1/common/av1_txfm.h"
23
24
#ifdef __cplusplus
25
extern "C" {
26
#endif
27
28
// Four-lane 16-bit butterfly.
//
// The low four 16-bit lanes of *in0 and *in1 are interleaved into
// (in0[i], in1[i]) pairs. Each pair is multiply-accumulated against the 16-bit
// weight pairs in *w0 (giving *out0) and *w1 (giving *out1); __rounding is
// added to every 32-bit sum, which is then shifted right arithmetically by
// cos_bit and packed back to 16 bits with signed saturation.
//
// Each output holds its four results in the low 64 bits and repeats them in
// the high 64 bits. Both results are computed before either output is written,
// so an output may alias an input.
static inline void btf_16_w4_sse2(
    const __m128i *const w0, const __m128i *const w1, const __m128i __rounding,
    const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1,
    __m128i *const out0, __m128i *const out1) {
  const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1);
  const __m128i u0 = _mm_madd_epi16(t0, *w0);
  const __m128i v0 = _mm_madd_epi16(t0, *w1);
  const __m128i a0 = _mm_add_epi32(u0, __rounding);
  const __m128i b0 = _mm_add_epi32(v0, __rounding);
  const __m128i c0 = _mm_srai_epi32(a0, cos_bit);
  const __m128i d0 = _mm_srai_epi32(b0, cos_bit);

  *out0 = _mm_packs_epi32(c0, c0);
  // Pack d0 with itself (previously d0 with c0, which put *out0's values into
  // the upper half of *out1). This matches the btf_16_4p_sse2 macro; the low
  // four lanes are unaffected.
  *out1 = _mm_packs_epi32(d0, d0);
}
Unexecuted instantiation: av1_inv_txfm_ssse3.c:btf_16_w4_sse2
Unexecuted instantiation: highbd_inv_txfm_sse4.c:btf_16_w4_sse2
Unexecuted instantiation: av1_inv_txfm_avx2.c:btf_16_w4_sse2
43
44
// Four-lane 16-bit butterfly (statement macro).
//
// Interleaves the low four 16-bit lanes of in0 and in1 into (in0[i], in1[i])
// pairs, multiply-accumulates each pair against the 16-bit weight pairs in w0
// and w1, adds __rounding, shifts right arithmetically by cos_bit and packs
// with signed saturation. out0 receives the w0 results and out1 the w1
// results; each holds its four values in the low 64 bits, repeated in the
// high 64 bits.
//
// The expansion reads `__rounding` and `cos_bit` from the scope of the call
// site; they are not macro parameters. out0 and out1 must be assignable. Both
// results are computed before either output is written, so an output may name
// the same object as an input.
#define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1)                        \
  do {                                                                      \
    const __m128i btf_pairs_ = _mm_unpacklo_epi16((in0), (in1));            \
    const __m128i btf_sum0_ =                                               \
        _mm_add_epi32(_mm_madd_epi16(btf_pairs_, (w0)), __rounding);        \
    const __m128i btf_sum1_ =                                               \
        _mm_add_epi32(_mm_madd_epi16(btf_pairs_, (w1)), __rounding);        \
    const __m128i btf_res0_ = _mm_srai_epi32(btf_sum0_, cos_bit);           \
    const __m128i btf_res1_ = _mm_srai_epi32(btf_sum1_, cos_bit);           \
                                                                            \
    (out0) = _mm_packs_epi32(btf_res0_, btf_res0_);                         \
    (out1) = _mm_packs_epi32(btf_res1_, btf_res1_);                         \
  } while (0)
59
60
// Eight-lane 16-bit butterfly (statement macro).
//
// Interleaves all eight 16-bit lanes of in0 and in1 into (in0[i], in1[i])
// pairs (low half and high half separately), multiply-accumulates each pair
// against the 16-bit weight pairs in w0 and w1, adds __rounding, shifts right
// arithmetically by cos_bit and packs the eight 32-bit results back to 16 bits
// with signed saturation. out0 receives the w0 results, out1 the w1 results,
// both in the original lane order.
//
// The expansion reads `__rounding` and `cos_bit` from the scope of the call
// site; they are not macro parameters. out0 and out1 must be assignable. Both
// results are computed before either output is written, so an output may name
// the same object as an input.
#define btf_16_sse2(w0, w1, in0, in1, out0, out1)                           \
  do {                                                                      \
    const __m128i btf_lo_ = _mm_unpacklo_epi16((in0), (in1));               \
    const __m128i btf_hi_ = _mm_unpackhi_epi16((in0), (in1));               \
    const __m128i btf_p0_lo_ = _mm_madd_epi16(btf_lo_, (w0));               \
    const __m128i btf_p0_hi_ = _mm_madd_epi16(btf_hi_, (w0));               \
    const __m128i btf_p1_lo_ = _mm_madd_epi16(btf_lo_, (w1));               \
    const __m128i btf_p1_hi_ = _mm_madd_epi16(btf_hi_, (w1));               \
                                                                            \
    const __m128i btf_r0_lo_ =                                              \
        _mm_srai_epi32(_mm_add_epi32(btf_p0_lo_, __rounding), cos_bit);     \
    const __m128i btf_r0_hi_ =                                              \
        _mm_srai_epi32(_mm_add_epi32(btf_p0_hi_, __rounding), cos_bit);     \
    const __m128i btf_r1_lo_ =                                              \
        _mm_srai_epi32(_mm_add_epi32(btf_p1_lo_, __rounding), cos_bit);     \
    const __m128i btf_r1_hi_ =                                              \
        _mm_srai_epi32(_mm_add_epi32(btf_p1_hi_, __rounding), cos_bit);     \
                                                                            \
    (out0) = _mm_packs_epi32(btf_r0_lo_, btf_r0_hi_);                       \
    (out1) = _mm_packs_epi32(btf_r1_lo_, btf_r1_hi_);                       \
  } while (0)
82
83
0
// Loads eight consecutive 16-bit values starting at `a` into one register.
// Uses an aligned load, so `a` must be 16-byte aligned.
static inline __m128i load_16bit_to_16bit(const int16_t *a) {
  const __m128i *const src = (const __m128i *)a;
  return _mm_load_si128(src);
}
Unexecuted instantiation: av1_inv_txfm_ssse3.c:load_16bit_to_16bit
Unexecuted instantiation: highbd_inv_txfm_sse4.c:load_16bit_to_16bit
Unexecuted instantiation: av1_inv_txfm_avx2.c:load_16bit_to_16bit
86
87
17.2M
// Loads eight consecutive 32-bit values starting at `a` and narrows them to
// eight 16-bit lanes with signed saturation, preserving their order.
// Both 128-bit halves are read with aligned accesses, so `a` must be 16-byte
// aligned.
static inline __m128i load_32bit_to_16bit(const int32_t *a) {
  const __m128i *const src = (const __m128i *)a;
  const __m128i first4 = _mm_load_si128(src);
  const __m128i last4 = _mm_load_si128(src + 1);
  return _mm_packs_epi32(first4, last4);
}
av1_inv_txfm_ssse3.c:load_32bit_to_16bit
Line
Count
Source
87
17.2M
static inline __m128i load_32bit_to_16bit(const int32_t *a) {
88
17.2M
  const __m128i a_low = _mm_load_si128((const __m128i *)a);
89
17.2M
  return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
90
17.2M
}
Unexecuted instantiation: highbd_inv_txfm_sse4.c:load_32bit_to_16bit
Unexecuted instantiation: av1_inv_txfm_avx2.c:load_32bit_to_16bit
91
92
14.9M
// Loads four consecutive 32-bit values starting at `a` and narrows them to
// 16 bits with signed saturation. The four results occupy the low 64 bits of
// the return value and are repeated in the high 64 bits.
// Uses an aligned load, so `a` must be 16-byte aligned.
static inline __m128i load_32bit_to_16bit_w4(const int32_t *a) {
  const __m128i quad = _mm_load_si128((const __m128i *)a);
  return _mm_packs_epi32(quad, quad);
}
av1_inv_txfm_ssse3.c:load_32bit_to_16bit_w4
Line
Count
Source
92
14.9M
static inline __m128i load_32bit_to_16bit_w4(const int32_t *a) {
93
14.9M
  const __m128i a_low = _mm_load_si128((const __m128i *)a);
94
14.9M
  return _mm_packs_epi32(a_low, a_low);
95
14.9M
}
Unexecuted instantiation: highbd_inv_txfm_sse4.c:load_32bit_to_16bit_w4
Unexecuted instantiation: av1_inv_txfm_avx2.c:load_32bit_to_16bit_w4
96
97
// Sign-extends the low four 16-bit lanes of `a` to 32 bits and writes them to
// b[0..3]. Uses an aligned store, so `b` must be 16-byte aligned.
static inline void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) {
  // Duplicating each lane places the value in the upper 16 bits of a 32-bit
  // lane; the arithmetic shift then brings it down with its sign.
  const __m128i doubled = _mm_unpacklo_epi16(a, a);
  _mm_store_si128((__m128i *)b, _mm_srai_epi32(doubled, 16));
}
Unexecuted instantiation: av1_inv_txfm_ssse3.c:store_16bit_to_32bit_w4
Unexecuted instantiation: highbd_inv_txfm_sse4.c:store_16bit_to_32bit_w4
Unexecuted instantiation: av1_inv_txfm_avx2.c:store_16bit_to_32bit_w4
103
104
// Sign-extends all eight 16-bit lanes of `a` to 32 bits and writes them, in
// order, to b[0..7]. Uses aligned stores, so `b` must be 16-byte aligned.
static inline void store_16bit_to_32bit(__m128i a, int32_t *b) {
  // Duplicating each lane places the value in the upper 16 bits of a 32-bit
  // lane; the arithmetic shift then brings it down with its sign.
  const __m128i first4 = _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
  const __m128i last4 = _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16);
  __m128i *const dst = (__m128i *)b;
  _mm_store_si128(dst, first4);
  _mm_store_si128(dst + 1, last4);
}
Unexecuted instantiation: av1_inv_txfm_ssse3.c:store_16bit_to_32bit
Unexecuted instantiation: highbd_inv_txfm_sse4.c:store_16bit_to_32bit
Unexecuted instantiation: av1_inv_txfm_avx2.c:store_16bit_to_32bit
113
114
0
static inline __m128i scale_round_sse2(const __m128i a, const int scale) {
115
0
  const __m128i scale_rounding = pair_set_epi16(scale, 1 << (NewSqrt2Bits - 1));
116
0
  const __m128i b = _mm_madd_epi16(a, scale_rounding);
117
0
  return _mm_srai_epi32(b, NewSqrt2Bits);
118
0
}
Unexecuted instantiation: av1_inv_txfm_ssse3.c:scale_round_sse2
Unexecuted instantiation: highbd_inv_txfm_sse4.c:scale_round_sse2
Unexecuted instantiation: av1_inv_txfm_avx2.c:scale_round_sse2
119
120
static inline void store_rect_16bit_to_32bit_w4(const __m128i a,
121
0
                                                int32_t *const b) {
122
0
  const __m128i one = _mm_set1_epi16(1);
123
0
  const __m128i a_lo = _mm_unpacklo_epi16(a, one);
124
0
  const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
125
0
  _mm_store_si128((__m128i *)b, b_lo);
126
0
}
Unexecuted instantiation: av1_inv_txfm_ssse3.c:store_rect_16bit_to_32bit_w4
Unexecuted instantiation: highbd_inv_txfm_sse4.c:store_rect_16bit_to_32bit_w4
Unexecuted instantiation: av1_inv_txfm_avx2.c:store_rect_16bit_to_32bit_w4
127
128
static inline void store_rect_16bit_to_32bit(const __m128i a,
129
0
                                             int32_t *const b) {
130
0
  const __m128i one = _mm_set1_epi16(1);
131
0
  const __m128i a_lo = _mm_unpacklo_epi16(a, one);
132
0
  const __m128i a_hi = _mm_unpackhi_epi16(a, one);
133
0
  const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
134
0
  const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2);
135
0
  _mm_store_si128((__m128i *)b, b_lo);
136
0
  _mm_store_si128((__m128i *)(b + 4), b_hi);
137
0
}
Unexecuted instantiation: av1_inv_txfm_ssse3.c:store_rect_16bit_to_32bit
Unexecuted instantiation: highbd_inv_txfm_sse4.c:store_rect_16bit_to_32bit
Unexecuted instantiation: av1_inv_txfm_avx2.c:store_rect_16bit_to_32bit
138
139
// Loads `out_size` rows of four 16-bit values. Row i is read from
// in + i * stride into the low 64 bits of out[i]; the high 64 bits are zeroed.
static inline void load_buffer_16bit_to_16bit_w4(const int16_t *const in,
                                                 const int stride,
                                                 __m128i *const out,
                                                 const int out_size) {
  int row = 0;
  while (row < out_size) {
    const int16_t *const src = in + row * stride;
    out[row] = _mm_loadl_epi64((const __m128i *)src);
    ++row;
  }
}
Unexecuted instantiation: av1_inv_txfm_ssse3.c:load_buffer_16bit_to_16bit_w4
Unexecuted instantiation: highbd_inv_txfm_sse4.c:load_buffer_16bit_to_16bit_w4
Unexecuted instantiation: av1_inv_txfm_avx2.c:load_buffer_16bit_to_16bit_w4
147
148
// Loads `out_size` rows of four 16-bit values in reversed row order: the row
// at in + i * stride lands in out[out_size - 1 - i]. Each row fills the low
// 64 bits of its register; the high 64 bits are zeroed.
static inline void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in,
                                                      const int stride,
                                                      __m128i *const out,
                                                      const int out_size) {
  for (int row = 0, dst = out_size - 1; row < out_size; ++row, --dst) {
    const int16_t *const src = in + row * stride;
    out[dst] = _mm_loadl_epi64((const __m128i *)src);
  }
}
Unexecuted instantiation: av1_inv_txfm_ssse3.c:load_buffer_16bit_to_16bit_w4_flip
Unexecuted instantiation: highbd_inv_txfm_sse4.c:load_buffer_16bit_to_16bit_w4_flip
Unexecuted instantiation: av1_inv_txfm_avx2.c:load_buffer_16bit_to_16bit_w4_flip
156
157
static inline void load_buffer_16bit_to_16bit(const int16_t *in, int stride,
158
0
                                              __m128i *out, int out_size) {
159
0
  for (int i = 0; i < out_size; ++i) {
160
0
    out[i] = load_16bit_to_16bit(in + i * stride);
161
0
  }
162
0
}
Unexecuted instantiation: av1_inv_txfm_ssse3.c:load_buffer_16bit_to_16bit
Unexecuted instantiation: highbd_inv_txfm_sse4.c:load_buffer_16bit_to_16bit
Unexecuted instantiation: av1_inv_txfm_avx2.c:load_buffer_16bit_to_16bit
163
164
static inline void load_buffer_16bit_to_16bit_flip(const int16_t *in,
165
                                                   int stride, __m128i *out,
166
0
                                                   int out_size) {
167
0
  for (int i = 0; i < out_size; ++i) {
168
0
    out[out_size - i - 1] = load_16bit_to_16bit(in + i * stride);
169
0
  }
170
0
}
Unexecuted instantiation: av1_inv_txfm_ssse3.c:load_buffer_16bit_to_16bit_flip
Unexecuted instantiation: highbd_inv_txfm_sse4.c:load_buffer_16bit_to_16bit_flip
Unexecuted instantiation: av1_inv_txfm_avx2.c:load_buffer_16bit_to_16bit_flip
171
172
static inline void load_buffer_32bit_to_16bit(const int32_t *in, int stride,
173
2.12M
                                              __m128i *out, int out_size) {
174
16.5M
  for (int i = 0; i < out_size; ++i) {
175
14.3M
    out[i] = load_32bit_to_16bit(in + i * stride);
176
14.3M
  }
177
2.12M
}
av1_inv_txfm_ssse3.c:load_buffer_32bit_to_16bit
Line
Count
Source
173
2.12M
                                              __m128i *out, int out_size) {
174
16.5M
  for (int i = 0; i < out_size; ++i) {
175
14.3M
    out[i] = load_32bit_to_16bit(in + i * stride);
176
14.3M
  }
177
2.12M
}
Unexecuted instantiation: highbd_inv_txfm_sse4.c:load_buffer_32bit_to_16bit
Unexecuted instantiation: av1_inv_txfm_avx2.c:load_buffer_32bit_to_16bit
178
179
static inline void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride,
180
1.64M
                                                 __m128i *out, int out_size) {
181
16.5M
  for (int i = 0; i < out_size; ++i) {
182
14.9M
    out[i] = load_32bit_to_16bit_w4(in + i * stride);
183
14.9M
  }
184
1.64M
}
av1_inv_txfm_ssse3.c:load_buffer_32bit_to_16bit_w4
Line
Count
Source
180
1.64M
                                                 __m128i *out, int out_size) {
181
16.5M
  for (int i = 0; i < out_size; ++i) {
182
14.9M
    out[i] = load_32bit_to_16bit_w4(in + i * stride);
183
14.9M
  }
184
1.64M
}
Unexecuted instantiation: highbd_inv_txfm_sse4.c:load_buffer_32bit_to_16bit_w4
Unexecuted instantiation: av1_inv_txfm_avx2.c:load_buffer_32bit_to_16bit_w4
185
186
static inline void load_buffer_32bit_to_16bit_flip(const int32_t *in,
187
                                                   int stride, __m128i *out,
188
0
                                                   int out_size) {
189
0
  for (int i = 0; i < out_size; ++i) {
190
0
    out[out_size - i - 1] = load_32bit_to_16bit(in + i * stride);
191
0
  }
192
0
}
Unexecuted instantiation: av1_inv_txfm_ssse3.c:load_buffer_32bit_to_16bit_flip
Unexecuted instantiation: highbd_inv_txfm_sse4.c:load_buffer_32bit_to_16bit_flip
Unexecuted instantiation: av1_inv_txfm_avx2.c:load_buffer_32bit_to_16bit_flip
193
194
// Writes `out_size` rows of four sign-extended values: in[i] is passed to
// store_16bit_to_32bit_w4() with destination out + i * stride.
static inline void store_buffer_16bit_to_32bit_w4(const __m128i *const in,
                                                  int32_t *const out,
                                                  const int stride,
                                                  const int out_size) {
  int row = 0;
  while (row < out_size) {
    int32_t *const dst = out + row * stride;
    store_16bit_to_32bit_w4(in[row], dst);
    ++row;
  }
}
Unexecuted instantiation: av1_inv_txfm_ssse3.c:store_buffer_16bit_to_32bit_w4
Unexecuted instantiation: highbd_inv_txfm_sse4.c:store_buffer_16bit_to_32bit_w4
Unexecuted instantiation: av1_inv_txfm_avx2.c:store_buffer_16bit_to_32bit_w4
202
203
// Writes `out_size` rows of eight sign-extended values: in[i] is passed to
// store_16bit_to_32bit() with destination out + i * stride.
static inline void store_buffer_16bit_to_32bit_w8(const __m128i *const in,
                                                  int32_t *const out,
                                                  const int stride,
                                                  const int out_size) {
  int row = 0;
  while (row < out_size) {
    int32_t *const dst = out + row * stride;
    store_16bit_to_32bit(in[row], dst);
    ++row;
  }
}
Unexecuted instantiation: av1_inv_txfm_ssse3.c:store_buffer_16bit_to_32bit_w8
Unexecuted instantiation: highbd_inv_txfm_sse4.c:store_buffer_16bit_to_32bit_w8
Unexecuted instantiation: av1_inv_txfm_avx2.c:store_buffer_16bit_to_32bit_w8
211
212
// Writes `out_size` rows of four scaled values: in[i] is passed to
// store_rect_16bit_to_32bit_w4() with destination out + i * stride.
static inline void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in,
                                                       int32_t *const out,
                                                       const int stride,
                                                       const int out_size) {
  int row = 0;
  while (row < out_size) {
    int32_t *const dst = out + row * stride;
    store_rect_16bit_to_32bit_w4(in[row], dst);
    ++row;
  }
}
Unexecuted instantiation: av1_inv_txfm_ssse3.c:store_rect_buffer_16bit_to_32bit_w4
Unexecuted instantiation: highbd_inv_txfm_sse4.c:store_rect_buffer_16bit_to_32bit_w4
Unexecuted instantiation: av1_inv_txfm_avx2.c:store_rect_buffer_16bit_to_32bit_w4
220
221
// Writes `out_size` rows of eight scaled values: in[i] is passed to
// store_rect_16bit_to_32bit() with destination out + i * stride.
static inline void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in,
                                                       int32_t *const out,
                                                       const int stride,
                                                       const int out_size) {
  int row = 0;
  while (row < out_size) {
    int32_t *const dst = out + row * stride;
    store_rect_16bit_to_32bit(in[row], dst);
    ++row;
  }
}
Unexecuted instantiation: av1_inv_txfm_ssse3.c:store_rect_buffer_16bit_to_32bit_w8
Unexecuted instantiation: highbd_inv_txfm_sse4.c:store_rect_buffer_16bit_to_32bit_w8
Unexecuted instantiation: av1_inv_txfm_avx2.c:store_rect_buffer_16bit_to_32bit_w8
229
230
// Writes eight registers as eight rows of eight 16-bit values: in[i] is stored
// at out + i * stride. Uses aligned stores, so every row address must be
// 16-byte aligned.
static inline void store_buffer_16bit_to_16bit_8x8(const __m128i *in,
                                                   uint16_t *out,
                                                   const int stride) {
  int row = 0;
  while (row < 8) {
    uint16_t *const dst = out + row * stride;
    _mm_store_si128((__m128i *)dst, in[row]);
    ++row;
  }
}
Unexecuted instantiation: av1_inv_txfm_ssse3.c:store_buffer_16bit_to_16bit_8x8
Unexecuted instantiation: highbd_inv_txfm_sse4.c:store_buffer_16bit_to_16bit_8x8
Unexecuted instantiation: av1_inv_txfm_avx2.c:store_buffer_16bit_to_16bit_8x8
237
238
0
// Shifts every 16-bit lane of in[0..size-1] in place.
//   bit > 0: logical left shift by `bit`.
//   bit < 0: adds 1 << (-bit - 1) with signed saturation, then shifts right
//            arithmetically by -bit (rounding right shift).
//   bit == 0: leaves the data untouched.
static inline void round_shift_16bit(__m128i *in, int size, int bit) {
  if (bit == 0) return;

  if (bit > 0) {
    for (int i = 0; i < size; ++i) {
      in[i] = _mm_slli_epi16(in[i], bit);
    }
    return;
  }

  const int shift = -bit;
  const __m128i half = _mm_set1_epi16(1 << (shift - 1));
  for (int i = 0; i < size; ++i) {
    in[i] = _mm_srai_epi16(_mm_adds_epi16(in[i], half), shift);
  }
}
Unexecuted instantiation: av1_inv_txfm_ssse3.c:round_shift_16bit
Unexecuted instantiation: highbd_inv_txfm_sse4.c:round_shift_16bit
Unexecuted instantiation: av1_inv_txfm_avx2.c:round_shift_16bit
252
253
363k
// Copies in[0..size-1] to `out` in reverse order: in[i] goes to
// out[size - 1 - i]. The copy is not in-place safe; `in` and `out` should be
// distinct buffers.
static inline void flip_buf_sse2(__m128i *in, __m128i *out, int size) {
  for (int src = 0, dst = size - 1; src < size; ++src, --dst) {
    out[dst] = in[src];
  }
}
av1_inv_txfm_ssse3.c:flip_buf_sse2
Line
Count
Source
253
222k
static inline void flip_buf_sse2(__m128i *in, __m128i *out, int size) {
254
1.72M
  for (int i = 0; i < size; ++i) {
255
1.50M
    out[size - i - 1] = in[i];
256
1.50M
  }
257
222k
}
highbd_inv_txfm_sse4.c:flip_buf_sse2
Line
Count
Source
253
141k
static inline void flip_buf_sse2(__m128i *in, __m128i *out, int size) {
254
1.65M
  for (int i = 0; i < size; ++i) {
255
1.51M
    out[size - i - 1] = in[i];
256
1.51M
  }
257
141k
}
Unexecuted instantiation: av1_inv_txfm_avx2.c:flip_buf_sse2
258
259
// Prototypes for the av1_lowbd_fwd_txfm2d_*_sse2 functions; only the
// declarations are visible here, the definitions live elsewhere. All share one
// signature: a 16-bit `input` buffer with a `stride`, a 32-bit `output`
// buffer, a `tx_type` selector and `bd`.
// NOTE(review): the NxM suffix presumably names the transform block size
// (width x height) and `bd` the bit depth; confirm against the definitions.
void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
260
                                   int stride, TX_TYPE tx_type, int bd);
261
262
void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
263
                                   int stride, TX_TYPE tx_type, int bd);
264
265
void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
266
                                    int stride, TX_TYPE tx_type, int bd);
267
268
void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
269
                                   int stride, TX_TYPE tx_type, int bd);
270
271
void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
272
                                   int stride, TX_TYPE tx_type, int bd);
273
274
void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
275
                                    int stride, TX_TYPE tx_type, int bd);
276
277
void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
278
                                    int stride, TX_TYPE tx_type, int bd);
279
280
void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
281
                                    int stride, TX_TYPE tx_type, int bd);
282
283
void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
284
                                    int stride, TX_TYPE tx_type, int bd);
285
286
void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
287
                                     int stride, TX_TYPE tx_type, int bd);
288
289
void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
290
                                     int stride, TX_TYPE tx_type, int bd);
291
292
void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
293
                                    int stride, TX_TYPE tx_type, int bd);
294
295
void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
296
                                     int stride, TX_TYPE tx_type, int bd);
297
298
void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
299
                                     int stride, TX_TYPE tx_type, int bd);
300
301
void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
302
                                     int stride, TX_TYPE tx_type, int bd);
303
304
void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
305
                                     int stride, TX_TYPE tx_type, int bd);
306
307
// Pointer to a 1-D transform kernel: reads an array of registers from `input`,
// writes an array of registers to `output`, and takes a `cos_bit` argument.
typedef void (*transform_1d_sse2)(const __m128i *input, __m128i *output,
308
                                  int8_t cos_bit);
309
310
// NOTE(review): the two prototypes below take (input, output) only, with no
// cos_bit parameter, so they are not directly assignable to transform_1d_sse2.
void av1_iadst8_sse2(const __m128i *input, __m128i *output);
311
312
void av1_idct8_sse2(const __m128i *input, __m128i *output);
313
314
// A pair of 1-D kernels making up a 2-D transform: `col` for the vertical
// pass and `row` for the horizontal pass.
typedef struct {
315
  transform_1d_sse2 col, row;  // vertical and horizontal
316
} transform_2d_sse2;
317
318
#ifdef __cplusplus
319
}
320
#endif  // __cplusplus
321
#endif  // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_