Coverage Report

Created: 2026-02-14 06:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c
Line
Count
Source
1
/*
2
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5
 *  that can be found in the LICENSE file in the root of the source
6
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS.  All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
9
 */
10
11
#include <emmintrin.h>  // SSE2
12
13
#include "./vpx_config.h"
14
#include "./vpx_dsp_rtcd.h"
15
#include "vpx_dsp/vpx_dsp_common.h"
16
#include "vpx_dsp/x86/fwd_txfm_sse2.h"
17
18
0
/*
 * DC-only (output[0]) variant of the 4x4 forward transform: writes the sum
 * of the 16 input samples, doubled.  Nothing else is written to output.
 */
void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  const __m128i zero = _mm_setzero_si128();
  /* Load the four 4-sample rows and pack them into two registers:
   * rows 0+3 in one, rows 1+2 in the other. */
  __m128i r0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  __m128i r1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  const __m128i r2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
  const __m128i r3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
  __m128i lo, hi, sum;

  r1 = _mm_unpacklo_epi64(r1, r2);
  r0 = _mm_unpacklo_epi64(r0, r3);

  /* Eight 16-bit partial sums covering all 16 samples. */
  sum = _mm_add_epi16(r0, r1);

  /* Sign-extend the 16-bit lanes to 32 bits: unpack each value into the
   * high half of a 32-bit lane, then arithmetic-shift right by 16. */
  lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, sum), 16);
  hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, sum), 16);

  /* Horizontal reduction of eight 32-bit lanes down to lane 0. */
  sum = _mm_add_epi32(lo, hi);
  lo = _mm_unpacklo_epi32(sum, zero);
  hi = _mm_unpackhi_epi32(sum, zero);
  sum = _mm_add_epi32(lo, hi);
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));

  /* The DC coefficient is the total sum scaled by 2. */
  sum = _mm_slli_epi32(sum, 1);
  output[0] = (tran_low_t)_mm_cvtsi128_si32(sum);
}
46
47
0
/*
 * DC-only (output[0]) variant of the 8x8 forward transform: writes the plain
 * sum of the 64 input samples (no scaling).  Nothing else is written.
 * Rows must be 16-byte aligned (aligned loads are used).
 */
void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i lo, hi;
  int r;

  /* Accumulate all eight 8-sample rows into one vector of column sums.
   * 16-bit wraparound addition is associative, so the grouping order
   * does not affect the result. */
  __m128i total = _mm_load_si128((const __m128i *)(input + 0 * stride));
  for (r = 1; r < 8; ++r) {
    const __m128i row = _mm_load_si128((const __m128i *)(input + r * stride));
    total = _mm_add_epi16(total, row);
  }

  /* Sign-extend the eight 16-bit column sums to 32 bits. */
  lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, total), 16);
  hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, total), 16);

  /* Horizontal reduction of eight 32-bit lanes down to lane 0. */
  total = _mm_add_epi32(lo, hi);
  lo = _mm_unpacklo_epi32(total, zero);
  hi = _mm_unpackhi_epi32(total, zero);
  total = _mm_add_epi32(lo, hi);
  total = _mm_add_epi32(total, _mm_srli_si128(total, 8));

  output[0] = (tran_low_t)_mm_cvtsi128_si32(total);
}
86
87
/*
 * DC-only (output[0]) variant of the 16x16 forward transform: writes the sum
 * of the 256 input samples, arithmetically shifted right by 1.  Nothing else
 * is written.  Rows must be 16-byte aligned (aligned loads are used).
 */
void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
                          int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i total = _mm_setzero_si128();
  __m128i lo, hi;
  int r;

  /* Accumulate all 16 rows; each row is two aligned 8-sample loads.
   * 16-bit wraparound addition is associative, so folding every row into
   * one accumulator matches the original unrolled grouping exactly. */
  for (r = 0; r < 16; ++r) {
    const __m128i a = _mm_load_si128((const __m128i *)(input + 0));
    const __m128i b = _mm_load_si128((const __m128i *)(input + 8));
    total = _mm_add_epi16(total, _mm_add_epi16(a, b));
    input += stride;
  }

  /* Sign-extend the eight 16-bit column sums to 32 bits. */
  lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, total), 16);
  hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, total), 16);

  /* Horizontal reduction of eight 32-bit lanes down to lane 0. */
  total = _mm_add_epi32(lo, hi);
  lo = _mm_unpacklo_epi32(total, zero);
  hi = _mm_unpackhi_epi32(total, zero);
  total = _mm_add_epi32(lo, hi);
  total = _mm_add_epi32(total, _mm_srli_si128(total, 8));

  /* The DC coefficient is the total sum halved (arithmetic shift). */
  total = _mm_srai_epi32(total, 1);
  output[0] = (tran_low_t)_mm_cvtsi128_si32(total);
}
155
156
/*
 * DC-only (output[0]) variant of the 32x32 forward transform: writes the sum
 * of the 1024 input samples, arithmetically shifted right by 3.  Nothing else
 * is written.  Rows must be 16-byte aligned (aligned loads are used).
 */
void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
                          int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i total = _mm_setzero_si128();
  __m128i lo, hi;
  int r;

  /* Accumulate all 32 rows; each row is four aligned 8-sample loads.
   * 16-bit wraparound addition is associative, so folding every row into
   * one accumulator matches the original unrolled grouping exactly. */
  for (r = 0; r < 32; ++r) {
    const __m128i a = _mm_load_si128((const __m128i *)(input + 0));
    const __m128i b = _mm_load_si128((const __m128i *)(input + 8));
    const __m128i c = _mm_load_si128((const __m128i *)(input + 16));
    const __m128i d = _mm_load_si128((const __m128i *)(input + 24));
    const __m128i row =
        _mm_add_epi16(_mm_add_epi16(a, b), _mm_add_epi16(c, d));
    total = _mm_add_epi16(total, row);
    input += stride;
  }

  /* Sign-extend the eight 16-bit column sums to 32 bits. */
  lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, total), 16);
  hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, total), 16);

  /* Horizontal reduction of eight 32-bit lanes down to lane 0. */
  total = _mm_add_epi32(lo, hi);
  lo = _mm_unpacklo_epi32(total, zero);
  hi = _mm_unpackhi_epi32(total, zero);
  total = _mm_add_epi32(lo, hi);
  total = _mm_add_epi32(total, _mm_srli_si128(total, 8));

  /* The DC coefficient is the total sum divided by 8 (arithmetic shift). */
  total = _mm_srai_epi32(total, 3);
  output[0] = (tran_low_t)_mm_cvtsi128_si32(total);
}
227
228
/* Instantiate the full 2-D forward transforms from the shared macro-template
 * headers.  Each include expands its implementation under the function name
 * bound to FDCT*_2D; the defines are undone after every expansion so each
 * instantiation stands alone.  DCT_HIGH_BIT_DEPTH selects the 8-bit (0) or
 * high-bit-depth (1) code paths inside the included headers. */
#define DCT_HIGH_BIT_DEPTH 0
#define FDCT4x4_2D vpx_fdct4x4_sse2
#define FDCT8x8_2D vpx_fdct8x8_sse2
#define FDCT16x16_2D vpx_fdct16x16_sse2
#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"
#undef FDCT4x4_2D
#undef FDCT8x8_2D
#undef FDCT16x16_2D

/* 32x32: the template is expanded twice, once per precision mode selected
 * by FDCT32x32_HIGH_PRECISION (0 = "_rd" variant, 1 = full precision). */
#define FDCT32x32_2D vpx_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION

#define FDCT32x32_2D vpx_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH

#if CONFIG_VP9_HIGHBITDEPTH
/* Same set of instantiations again for the high-bit-depth build, under the
 * vpx_highbd_* names and with the high-bit-depth code paths enabled. */
#define DCT_HIGH_BIT_DEPTH 1
#define FDCT4x4_2D vpx_highbd_fdct4x4_sse2
#define FDCT8x8_2D vpx_highbd_fdct8x8_sse2
#define FDCT16x16_2D vpx_highbd_fdct16x16_sse2
#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"  // NOLINT
#undef FDCT4x4_2D
#undef FDCT8x8_2D
#undef FDCT16x16_2D

#define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION

#define FDCT32x32_2D vpx_highbd_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH
#endif  // CONFIG_VP9_HIGHBITDEPTH