Coverage Report

Created: 2024-09-06 07:53

/src/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5
 *  that can be found in the LICENSE file in the root of the source
6
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS.  All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
9
 */
10
11
#include <emmintrin.h>  // SSE2
12
13
#include "./vpx_config.h"
14
#include "./vpx_dsp_rtcd.h"
15
#include "vpx_dsp/vpx_dsp_common.h"
16
#include "vpx_dsp/x86/fwd_txfm_sse2.h"
17
18
0
// Sums the 16 samples of the 4x4 block at |input| (rows are |stride|
// int16_t elements apart), doubles the sum and stores it in output[0].
// No other element of |output| is written.
//
// Rows are first added pairwise in 16-bit lanes, so that first addition
// wraps modulo 2^16; everything after it is done in 32-bit lanes.
void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  const __m128i zeros = _mm_setzero_si128();

  // Each row is four int16_t values, i.e. the low 64 bits of a register.
  const __m128i row0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  const __m128i row1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  const __m128i row2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
  const __m128i row3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));

  // Pack two rows per register, then add: eight lanes, two samples each.
  const __m128i rows12 = _mm_unpacklo_epi64(row1, row2);
  const __m128i rows03 = _mm_unpacklo_epi64(row0, row3);
  const __m128i sum16 = _mm_add_epi16(rows03, rows12);

  // Sign-extend to 32 bits: place each 16-bit lane in the upper half of a
  // 32-bit lane, then shift right arithmetically by 16.
  const __m128i lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(zeros, sum16), 16);
  const __m128i hi32 = _mm_srai_epi32(_mm_unpackhi_epi16(zeros, sum16), 16);
  const __m128i sum32 = _mm_add_epi32(lo32, hi32);  // [s0, s1, s2, s3]

  // Horizontal add of the four 32-bit lanes into lane 0.
  const __m128i lo_pair = _mm_unpacklo_epi32(sum32, zeros);  // [s0, 0, s1, 0]
  const __m128i hi_pair = _mm_unpackhi_epi32(sum32, zeros);  // [s2, 0, s3, 0]
  const __m128i pairs = _mm_add_epi32(lo_pair, hi_pair);
  const __m128i total = _mm_add_epi32(pairs, _mm_srli_si128(pairs, 8));

  // The 4x4 variant scales the sum by 2 before storing it.
  output[0] = (tran_low_t)_mm_cvtsi128_si32(_mm_slli_epi32(total, 1));
}
46
47
50.1k
// Sums the 64 samples of the 8x8 block at |input| (rows are |stride|
// int16_t elements apart) and stores the unscaled sum in output[0].
// No other element of |output| is written.
//
// Every row is read with _mm_load_si128 (an aligned 128-bit load). The
// eight rows are accumulated in 16-bit lanes (eight samples per lane,
// wrapping modulo 2^16) before being widened to 32 bits.
void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  const __m128i zeros = _mm_setzero_si128();
  const int16_t *row = input;
  __m128i acc16 = _mm_load_si128((const __m128i *)row);
  __m128i lo32, hi32, sum32, pairs, total;
  int r;

  // Rows 1..7: one full register (eight int16_t) per row.
  for (r = 1; r < 8; ++r) {
    row += stride;
    acc16 = _mm_add_epi16(acc16, _mm_load_si128((const __m128i *)row));
  }

  // Sign-extend the eight 16-bit partial sums to 32 bits and fold 8 -> 4.
  lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(zeros, acc16), 16);
  hi32 = _mm_srai_epi32(_mm_unpackhi_epi16(zeros, acc16), 16);
  sum32 = _mm_add_epi32(lo32, hi32);

  // Horizontal add of the four 32-bit lanes into lane 0.
  pairs = _mm_add_epi32(_mm_unpacklo_epi32(sum32, zeros),
                        _mm_unpackhi_epi32(sum32, zeros));
  total = _mm_add_epi32(pairs, _mm_srli_si128(pairs, 8));

  output[0] = (tran_low_t)_mm_cvtsi128_si32(total);
}
86
87
// Sums the 256 samples of the 16x16 block at |input| (rows are |stride|
// int16_t elements apart), shifts the sum right arithmetically by 1 and
// stores it in output[0]. No other element of |output| is written.
//
// Each row is read as two aligned 128-bit loads (_mm_load_si128). All rows
// are accumulated in 16-bit lanes (32 samples per lane, wrapping modulo
// 2^16) before being widened to 32 bits.
void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
                          int stride) {
  const __m128i zeros = _mm_setzero_si128();
  __m128i acc16 = _mm_setzero_si128();
  __m128i lo32, hi32, sum32, pairs, total;
  int r;

  for (r = 0; r < 16; ++r) {
    // Left and right halves (eight int16_t each) of the current row.
    const __m128i left = _mm_load_si128((const __m128i *)(input + 0));
    const __m128i right = _mm_load_si128((const __m128i *)(input + 8));
    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(left, right));
    input += stride;
  }

  // Sign-extend the eight 16-bit partial sums to 32 bits and fold 8 -> 4.
  lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(zeros, acc16), 16);
  hi32 = _mm_srai_epi32(_mm_unpackhi_epi16(zeros, acc16), 16);
  sum32 = _mm_add_epi32(lo32, hi32);

  // Horizontal add of the four 32-bit lanes into lane 0.
  pairs = _mm_add_epi32(_mm_unpacklo_epi32(sum32, zeros),
                        _mm_unpackhi_epi32(sum32, zeros));
  total = _mm_add_epi32(pairs, _mm_srli_si128(pairs, 8));

  // The 16x16 variant halves the sum (arithmetic shift) before storing it.
  total = _mm_srai_epi32(total, 1);
  output[0] = (tran_low_t)_mm_cvtsi128_si32(total);
}
155
156
// Sums the 1024 samples of the 32x32 block at |input| (rows are |stride|
// int16_t elements apart), shifts the sum right arithmetically by 3 and
// stores it in output[0]. No other element of |output| is written.
//
// Each row is read as four aligned 128-bit loads (_mm_load_si128). All rows
// are accumulated in 16-bit lanes (128 samples per lane, wrapping modulo
// 2^16) before being widened to 32 bits.
void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
                          int stride) {
  const __m128i zeros = _mm_setzero_si128();
  __m128i acc16 = _mm_setzero_si128();
  __m128i lo32, hi32, sum32, pairs, total;
  int r;

  for (r = 0; r < 32; ++r) {
    // Four consecutive groups of eight int16_t make up one 32-wide row.
    const __m128i q0 = _mm_load_si128((const __m128i *)(input + 0));
    const __m128i q1 = _mm_load_si128((const __m128i *)(input + 8));
    const __m128i q2 = _mm_load_si128((const __m128i *)(input + 16));
    const __m128i q3 = _mm_load_si128((const __m128i *)(input + 24));

    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(q0, q1));
    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(q2, q3));
    input += stride;
  }

  // Sign-extend the eight 16-bit partial sums to 32 bits and fold 8 -> 4.
  lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(zeros, acc16), 16);
  hi32 = _mm_srai_epi32(_mm_unpackhi_epi16(zeros, acc16), 16);
  sum32 = _mm_add_epi32(lo32, hi32);

  // Horizontal add of the four 32-bit lanes into lane 0.
  pairs = _mm_add_epi32(_mm_unpacklo_epi32(sum32, zeros),
                        _mm_unpackhi_epi32(sum32, zeros));
  total = _mm_add_epi32(pairs, _mm_srli_si128(pairs, 8));

  // The 32x32 variant divides the sum by 8 (arithmetic shift) before
  // storing it.
  total = _mm_srai_epi32(total, 3);
  output[0] = (tran_low_t)_mm_cvtsi128_si32(total);
}
227
228
// Template-style instantiation with DCT_HIGH_BIT_DEPTH set to 0. Each group
// below defines name macros, includes an implementation header, and then
// undefines the macros again so the next group can redefine them.
// NOTE(review): presumably the included headers use the FDCT*_2D macros as
// the names of the functions they define -- confirm against the headers.
#define DCT_HIGH_BIT_DEPTH 0
229
#define FDCT4x4_2D vpx_fdct4x4_sse2
230
#define FDCT8x8_2D vpx_fdct8x8_sse2
231
#define FDCT16x16_2D vpx_fdct16x16_sse2
232
#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"
233
#undef FDCT4x4_2D
234
#undef FDCT8x8_2D
235
#undef FDCT16x16_2D
236
237
// First inclusion of the 32x32 header: name vpx_fdct32x32_rd_sse2, with
// FDCT32x32_HIGH_PRECISION set to 0.
#define FDCT32x32_2D vpx_fdct32x32_rd_sse2
238
#define FDCT32x32_HIGH_PRECISION 0
239
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"
240
#undef FDCT32x32_2D
241
#undef FDCT32x32_HIGH_PRECISION
242
243
// Second inclusion of the same header: name vpx_fdct32x32_sse2, with
// FDCT32x32_HIGH_PRECISION set to 1.
#define FDCT32x32_2D vpx_fdct32x32_sse2
244
#define FDCT32x32_HIGH_PRECISION 1
245
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
246
#undef FDCT32x32_2D
247
#undef FDCT32x32_HIGH_PRECISION
248
#undef DCT_HIGH_BIT_DEPTH
249
250
// Only compiled when CONFIG_VP9_HIGHBITDEPTH is non-zero: the same two
// implementation headers are included again with DCT_HIGH_BIT_DEPTH set to 1
// and vpx_highbd_* names in the FDCT*_2D macros.
// NOTE(review): presumably the included headers use the FDCT*_2D macros as
// the names of the functions they define -- confirm against the headers.
#if CONFIG_VP9_HIGHBITDEPTH
251
#define DCT_HIGH_BIT_DEPTH 1
252
#define FDCT4x4_2D vpx_highbd_fdct4x4_sse2
253
#define FDCT8x8_2D vpx_highbd_fdct8x8_sse2
254
#define FDCT16x16_2D vpx_highbd_fdct16x16_sse2
255
#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"  // NOLINT
256
#undef FDCT4x4_2D
257
#undef FDCT8x8_2D
258
#undef FDCT16x16_2D
259
260
// 32x32 header with name vpx_highbd_fdct32x32_rd_sse2 and
// FDCT32x32_HIGH_PRECISION set to 0.
#define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2
261
#define FDCT32x32_HIGH_PRECISION 0
262
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
263
#undef FDCT32x32_2D
264
#undef FDCT32x32_HIGH_PRECISION
265
266
// Same header again with name vpx_highbd_fdct32x32_sse2 and
// FDCT32x32_HIGH_PRECISION set to 1.
#define FDCT32x32_2D vpx_highbd_fdct32x32_sse2
267
#define FDCT32x32_HIGH_PRECISION 1
268
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
269
#undef FDCT32x32_2D
270
#undef FDCT32x32_HIGH_PRECISION
271
#undef DCT_HIGH_BIT_DEPTH
272
#endif  // CONFIG_VP9_HIGHBITDEPTH