Coverage Report

Created: 2026-04-01 07:42

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c
Line
Count
Source
1
/*
2
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5
 *  that can be found in the LICENSE file in the root of the source
6
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS.  All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
9
 */
10
11
#include <emmintrin.h>  // SSE2
12
13
#include "./vpx_config.h"
14
#include "./vpx_dsp_rtcd.h"
15
#include "vpx_dsp/vpx_dsp_common.h"
16
#include "vpx_dsp/x86/fwd_txfm_sse2.h"
17
18
0
// Computes only the DC coefficient of the 4x4 forward transform.
//
// The four rows of four int16 samples are added lane-wise in 16-bit
// arithmetic (which wraps on overflow), the lane sums are sign-extended to
// 32 bits and folded into a single total, and twice that total is stored to
// output[0]. No other element of output is written.
//
// input:  top-left sample of the block; consecutive rows are stride elements
//         apart.
// output: receives the result in output[0].
void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  const __m128i kZero = _mm_setzero_si128();
  // Each row is 64 bits wide, so two rows share one 128-bit register.
  const __m128i row0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  const __m128i row1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  const __m128i row2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
  const __m128i row3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
  const __m128i rows03 = _mm_unpacklo_epi64(row0, row3);
  const __m128i rows12 = _mm_unpacklo_epi64(row1, row2);
  const __m128i sum16 = _mm_add_epi16(rows03, rows12);
  // Placing each 16-bit lane in the upper half of a 32-bit lane and shifting
  // right arithmetically by 16 sign-extends it.
  const __m128i lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(kZero, sum16), 16);
  const __m128i hi32 = _mm_srai_epi32(_mm_unpackhi_epi16(kZero, sum16), 16);
  const __m128i sum32 = _mm_add_epi32(lo32, hi32);
  // Fold the four 32-bit partial sums down into lane 0.
  const __m128i lower = _mm_unpacklo_epi32(sum32, kZero);
  const __m128i upper = _mm_unpackhi_epi32(sum32, kZero);
  const __m128i pairs = _mm_add_epi32(lower, upper);
  const __m128i total = _mm_add_epi32(pairs, _mm_srli_si128(pairs, 8));

  output[0] = (tran_low_t)_mm_cvtsi128_si32(_mm_slli_epi32(total, 1));
}
46
47
62.0k
// Computes only the DC coefficient of the 8x8 forward transform.
//
// The eight rows of eight int16 samples are added lane-wise in 16-bit
// arithmetic (which wraps on overflow), the eight lane sums are sign-extended
// to 32 bits and folded into a single total, which is stored unscaled to
// output[0]. No other element of output is written.
//
// Rows are fetched with aligned 128-bit loads, so input + k * stride must be
// 16-byte aligned for every row k in [0, 8).
void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  const __m128i kZero = _mm_setzero_si128();
  __m128i acc16 = _mm_load_si128((const __m128i *)input);
  __m128i lo32, hi32, acc32, lower, upper, total;
  int row;

  for (row = 1; row < 8; ++row) {
    const __m128i line =
        _mm_load_si128((const __m128i *)(input + row * stride));
    acc16 = _mm_add_epi16(acc16, line);
  }

  // Placing each 16-bit lane in the upper half of a 32-bit lane and shifting
  // right arithmetically by 16 sign-extends it.
  lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(kZero, acc16), 16);
  hi32 = _mm_srai_epi32(_mm_unpackhi_epi16(kZero, acc16), 16);
  acc32 = _mm_add_epi32(lo32, hi32);

  // Fold the four 32-bit partial sums down into lane 0.
  lower = _mm_unpacklo_epi32(acc32, kZero);
  upper = _mm_unpackhi_epi32(acc32, kZero);
  acc32 = _mm_add_epi32(lower, upper);
  total = _mm_add_epi32(acc32, _mm_srli_si128(acc32, 8));

  output[0] = (tran_low_t)_mm_cvtsi128_si32(total);
}
86
87
// Computes only the DC coefficient of the 16x16 forward transform.
//
// All 16 rows of 16 int16 samples are accumulated lane-wise in 16-bit
// arithmetic (which wraps on overflow), the eight lane sums are sign-extended
// to 32 bits and folded into a single total, and that total arithmetically
// shifted right by one is stored to output[0]. No other element of output is
// written.
//
// Rows are fetched as two aligned 128-bit loads each, so input + k * stride
// must be 16-byte aligned for every row k in [0, 16).
void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
                          int stride) {
  const __m128i kZero = _mm_setzero_si128();
  __m128i acc16 = kZero;
  __m128i lo32, hi32, acc32, lower, upper, total;
  int row;

  for (row = 0; row < 16; ++row) {
    const int16_t *const src = input + row * stride;
    const __m128i left = _mm_load_si128((const __m128i *)(src + 0));
    const __m128i right = _mm_load_si128((const __m128i *)(src + 8));
    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(left, right));
  }

  // Placing each 16-bit lane in the upper half of a 32-bit lane and shifting
  // right arithmetically by 16 sign-extends it.
  lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(kZero, acc16), 16);
  hi32 = _mm_srai_epi32(_mm_unpackhi_epi16(kZero, acc16), 16);
  acc32 = _mm_add_epi32(lo32, hi32);

  // Fold the four 32-bit partial sums down into lane 0.
  lower = _mm_unpacklo_epi32(acc32, kZero);
  upper = _mm_unpackhi_epi32(acc32, kZero);
  acc32 = _mm_add_epi32(lower, upper);
  total = _mm_add_epi32(acc32, _mm_srli_si128(acc32, 8));

  total = _mm_srai_epi32(total, 1);
  output[0] = (tran_low_t)_mm_cvtsi128_si32(total);
}
155
156
// Computes only the DC coefficient of the 32x32 forward transform.
//
// All 32 rows of 32 int16 samples are accumulated lane-wise in 16-bit
// arithmetic (which wraps on overflow), the eight lane sums are sign-extended
// to 32 bits and folded into a single total, and that total arithmetically
// shifted right by three is stored to output[0]. No other element of output
// is written.
//
// Rows are fetched as four aligned 128-bit loads each, so input + k * stride
// must be 16-byte aligned for every row k in [0, 32).
void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
                          int stride) {
  const __m128i kZero = _mm_setzero_si128();
  __m128i acc16 = kZero;
  __m128i lo32, hi32, acc32, lower, upper, total;
  int row, col;

  for (row = 0; row < 32; ++row) {
    // Eight int16 samples fit in one 128-bit register.
    for (col = 0; col < 32; col += 8) {
      const __m128i chunk = _mm_load_si128((const __m128i *)(input + col));
      acc16 = _mm_add_epi16(acc16, chunk);
    }
    input += stride;
  }

  // Placing each 16-bit lane in the upper half of a 32-bit lane and shifting
  // right arithmetically by 16 sign-extends it.
  lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(kZero, acc16), 16);
  hi32 = _mm_srai_epi32(_mm_unpackhi_epi16(kZero, acc16), 16);
  acc32 = _mm_add_epi32(lo32, hi32);

  // Fold the four 32-bit partial sums down into lane 0.
  lower = _mm_unpacklo_epi32(acc32, kZero);
  upper = _mm_unpackhi_epi32(acc32, kZero);
  acc32 = _mm_add_epi32(lower, upper);
  total = _mm_add_epi32(acc32, _mm_srli_si128(acc32, 8));

  total = _mm_srai_epi32(total, 3);
  output[0] = (tran_low_t)_mm_cvtsi128_si32(total);
}
227
228
// Everything down to the matching #undef is processed with
// DCT_HIGH_BIT_DEPTH set to 0.
#define DCT_HIGH_BIT_DEPTH 0
229
// The shared implementation header is included with FDCT4x4_2D, FDCT8x8_2D
// and FDCT16x16_2D set to the names below; presumably the header uses them
// as the names of the functions it defines -- confirm against the header.
#define FDCT4x4_2D vpx_fdct4x4_sse2
230
#define FDCT8x8_2D vpx_fdct8x8_sse2
231
#define FDCT16x16_2D vpx_fdct16x16_sse2
232
#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"
233
#undef FDCT4x4_2D
234
#undef FDCT8x8_2D
235
#undef FDCT16x16_2D
236
237
// First inclusion of the 32x32 implementation header: FDCT32x32_2D is
// vpx_fdct32x32_rd_sse2 and FDCT32x32_HIGH_PRECISION is 0.
#define FDCT32x32_2D vpx_fdct32x32_rd_sse2
238
#define FDCT32x32_HIGH_PRECISION 0
239
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"
240
#undef FDCT32x32_2D
241
#undef FDCT32x32_HIGH_PRECISION
242
243
// Second inclusion of the same header: FDCT32x32_2D is vpx_fdct32x32_sse2
// and FDCT32x32_HIGH_PRECISION is 1.
#define FDCT32x32_2D vpx_fdct32x32_sse2
244
#define FDCT32x32_HIGH_PRECISION 1
245
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
246
#undef FDCT32x32_2D
247
#undef FDCT32x32_HIGH_PRECISION
248
#undef DCT_HIGH_BIT_DEPTH
249
250
// When CONFIG_VP9_HIGHBITDEPTH is enabled, the same implementation headers
// are included again with DCT_HIGH_BIT_DEPTH set to 1 and the FDCT*_2D
// macros set to vpx_highbd_* names.
#if CONFIG_VP9_HIGHBITDEPTH
251
#define DCT_HIGH_BIT_DEPTH 1
252
#define FDCT4x4_2D vpx_highbd_fdct4x4_sse2
253
#define FDCT8x8_2D vpx_highbd_fdct8x8_sse2
254
#define FDCT16x16_2D vpx_highbd_fdct16x16_sse2
255
#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"  // NOLINT
256
#undef FDCT4x4_2D
257
#undef FDCT8x8_2D
258
#undef FDCT16x16_2D
259
260
// 32x32 header with FDCT32x32_HIGH_PRECISION set to 0.
#define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2
261
#define FDCT32x32_HIGH_PRECISION 0
262
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
263
#undef FDCT32x32_2D
264
#undef FDCT32x32_HIGH_PRECISION
265
266
// 32x32 header with FDCT32x32_HIGH_PRECISION set to 1.
#define FDCT32x32_2D vpx_highbd_fdct32x32_sse2
267
#define FDCT32x32_HIGH_PRECISION 1
268
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
269
#undef FDCT32x32_2D
270
#undef FDCT32x32_HIGH_PRECISION
271
#undef DCT_HIGH_BIT_DEPTH
272
#endif  // CONFIG_VP9_HIGHBITDEPTH