Coverage Report

Created: 2026-05-16 06:35

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/gstreamer/subprojects/gst-plugins-base/gst-libs/gst/audio/audio-resampler-x86-sse2.c
Line
Count
Source
1
/* GStreamer
2
 * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
3
 *
4
 * This library is free software; you can redistribute it and/or
5
 * modify it under the terms of the GNU Library General Public
6
 * License as published by the Free Software Foundation; either
7
 * version 2 of the License, or (at your option) any later version.
8
 *
9
 * This library is distributed in the hope that it will be useful,
10
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12
 * Library General Public License for more details.
13
 *
14
 * You should have received a copy of the GNU Library General Public
15
 * License along with this library; if not, write to the
16
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
17
 * Boston, MA 02110-1301, USA.
18
 */
19
20
#ifdef HAVE_CONFIG_H
21
#  include "config.h"
22
#endif
23
24
#include "audio-resampler-x86-sse2.h"
25
26
#include <immintrin.h>
27
28
static inline void
29
inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a,
30
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
31
0
{
32
0
  gint i;
33
0
  __m128i sum, t;
34
35
0
  sum = _mm_setzero_si128 ();
36
37
0
  for (i = 0; i < len; i += 16) {
38
0
    t = _mm_loadu_si128 ((__m128i *) (a + i));
39
0
    sum =
40
0
        _mm_add_epi32 (sum, _mm_madd_epi16 (t,
41
0
            _mm_load_si128 ((__m128i *) (b + i + 0))));
42
43
0
    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
44
0
    sum =
45
0
        _mm_add_epi32 (sum, _mm_madd_epi16 (t,
46
0
            _mm_load_si128 ((__m128i *) (b + i + 8))));
47
0
  }
48
0
  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3)));
49
0
  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1)));
50
51
0
  sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
52
0
  sum = _mm_srai_epi32 (sum, PRECISION_S16);
53
0
  sum = _mm_packs_epi32 (sum, sum);
54
0
  *o = _mm_extract_epi16 (sum, 0);
55
0
}
56
57
static inline void
58
inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a,
59
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
60
0
{
61
0
  gint i = 0;
62
0
  __m128i sum[2], t;
63
0
  __m128i f = _mm_set_epi64x (0, *((gint64 *) icoeff));
64
0
  const gint16 *c[2] = { (gint16 *) ((gint8 *) b + 0 * bstride),
65
0
    (gint16 *) ((gint8 *) b + 1 * bstride)
66
0
  };
67
68
0
  sum[0] = sum[1] = _mm_setzero_si128 ();
69
0
  f = _mm_unpacklo_epi16 (f, sum[0]);
70
71
0
  for (; i < len; i += 16) {
72
0
    t = _mm_loadu_si128 ((__m128i *) (a + i + 0));
73
0
    sum[0] =
74
0
        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
75
0
            _mm_load_si128 ((__m128i *) (c[0] + i + 0))));
76
0
    sum[1] =
77
0
        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
78
0
            _mm_load_si128 ((__m128i *) (c[1] + i + 0))));
79
80
0
    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
81
0
    sum[0] =
82
0
        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
83
0
            _mm_load_si128 ((__m128i *) (c[0] + i + 8))));
84
0
    sum[1] =
85
0
        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
86
0
            _mm_load_si128 ((__m128i *) (c[1] + i + 8))));
87
0
  }
88
0
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
89
0
  sum[1] = _mm_srai_epi32 (sum[1], PRECISION_S16);
90
91
0
  sum[0] =
92
0
      _mm_madd_epi16 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
93
0
  sum[1] =
94
0
      _mm_madd_epi16 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
95
0
  sum[0] = _mm_add_epi32 (sum[0], sum[1]);
96
97
0
  sum[0] =
98
0
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
99
0
              3)));
100
0
  sum[0] =
101
0
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
102
0
              1)));
103
104
0
  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
105
0
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
106
0
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
107
0
  *o = _mm_extract_epi16 (sum[0], 0);
108
0
}
109
110
static inline void
111
inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
112
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
113
0
{
114
0
  gint i = 0;
115
0
  __m128i sum[4], t[4];
116
0
  __m128i f = _mm_set_epi64x (0, *((long long *) icoeff));
117
0
  const gint16 *c[4] = { (gint16 *) ((gint8 *) b + 0 * bstride),
118
0
    (gint16 *) ((gint8 *) b + 1 * bstride),
119
0
    (gint16 *) ((gint8 *) b + 2 * bstride),
120
0
    (gint16 *) ((gint8 *) b + 3 * bstride)
121
0
  };
122
123
0
  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
124
0
  f = _mm_unpacklo_epi16 (f, sum[0]);
125
126
0
  for (; i < len; i += 8) {
127
0
    t[0] = _mm_loadu_si128 ((__m128i *) (a + i));
128
0
    sum[0] =
129
0
        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t[0],
130
0
            _mm_load_si128 ((__m128i *) (c[0] + i))));
131
0
    sum[1] =
132
0
        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t[0],
133
0
            _mm_load_si128 ((__m128i *) (c[1] + i))));
134
0
    sum[2] =
135
0
        _mm_add_epi32 (sum[2], _mm_madd_epi16 (t[0],
136
0
            _mm_load_si128 ((__m128i *) (c[2] + i))));
137
0
    sum[3] =
138
0
        _mm_add_epi32 (sum[3], _mm_madd_epi16 (t[0],
139
0
            _mm_load_si128 ((__m128i *) (c[3] + i))));
140
0
  }
141
0
  t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]);
142
0
  t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]);
143
0
  t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]);
144
0
  t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]);
145
146
0
  sum[0] =
147
0
      _mm_add_epi32 (_mm_unpacklo_epi64 (t[0], t[1]), _mm_unpackhi_epi64 (t[0],
148
0
          t[1]));
149
0
  sum[2] =
150
0
      _mm_add_epi32 (_mm_unpacklo_epi64 (t[2], t[3]), _mm_unpackhi_epi64 (t[2],
151
0
          t[3]));
152
0
  sum[0] = _mm_add_epi32 (sum[0], sum[2]);
153
154
0
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
155
0
  sum[0] = _mm_madd_epi16 (sum[0], f);
156
157
0
  sum[0] =
158
0
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
159
0
              3)));
160
0
  sum[0] =
161
0
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
162
0
              1)));
163
164
0
  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
165
0
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
166
0
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
167
0
  *o = _mm_extract_epi16 (sum[0], 0);
168
0
}
169
170
static inline void
171
inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a,
172
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
173
0
{
174
0
  gint i = 0;
175
0
  __m128d sum = _mm_setzero_pd ();
176
177
0
  for (; i < len; i += 8) {
178
0
    sum =
179
0
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0),
180
0
            _mm_load_pd (b + i + 0)));
181
0
    sum =
182
0
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2),
183
0
            _mm_load_pd (b + i + 2)));
184
0
    sum =
185
0
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4),
186
0
            _mm_load_pd (b + i + 4)));
187
0
    sum =
188
0
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6),
189
0
            _mm_load_pd (b + i + 6)));
190
0
  }
191
0
  sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum));
192
0
  _mm_store_sd (o, sum);
193
0
}
194
195
static inline void
196
inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a,
197
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
198
0
{
199
0
  gint i = 0;
200
0
  __m128d sum[2], t;
201
0
  const gdouble *c[2] = { (gdouble *) ((gint8 *) b + 0 * bstride),
202
0
    (gdouble *) ((gint8 *) b + 1 * bstride)
203
0
  };
204
205
0
  sum[0] = sum[1] = _mm_setzero_pd ();
206
207
0
  for (; i < len; i += 4) {
208
0
    t = _mm_loadu_pd (a + i + 0);
209
0
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 0)));
210
0
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 0)));
211
0
    t = _mm_loadu_pd (a + i + 2);
212
0
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 2)));
213
0
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 2)));
214
0
  }
215
0
  sum[0] = _mm_mul_pd (_mm_sub_pd (sum[0], sum[1]), _mm_load1_pd (icoeff));
216
0
  sum[0] = _mm_add_pd (sum[0], sum[1]);
217
0
  sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
218
0
  _mm_store_sd (o, sum[0]);
219
0
}
220
221
static inline void
222
inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a,
223
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
224
0
{
225
0
  gint i;
226
0
  __m128d f[2], sum[4], t;
227
0
  const gdouble *c[4] = { (gdouble *) ((gint8 *) b + 0 * bstride),
228
0
    (gdouble *) ((gint8 *) b + 1 * bstride),
229
0
    (gdouble *) ((gint8 *) b + 2 * bstride),
230
0
    (gdouble *) ((gint8 *) b + 3 * bstride)
231
0
  };
232
233
0
  f[0] = _mm_loadu_pd (icoeff + 0);
234
0
  f[1] = _mm_loadu_pd (icoeff + 2);
235
0
  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_pd ();
236
237
0
  for (i = 0; i < len; i += 2) {
238
0
    t = _mm_loadu_pd (a + i + 0);
239
0
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i)));
240
0
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i)));
241
0
    sum[2] = _mm_add_pd (sum[2], _mm_mul_pd (t, _mm_load_pd (c[2] + i)));
242
0
    sum[3] = _mm_add_pd (sum[3], _mm_mul_pd (t, _mm_load_pd (c[3] + i)));
243
0
  }
244
0
  sum[0] =
245
0
      _mm_mul_pd (sum[0], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (0, 0)));
246
0
  sum[1] =
247
0
      _mm_mul_pd (sum[1], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (1, 1)));
248
0
  sum[2] =
249
0
      _mm_mul_pd (sum[2], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (0, 0)));
250
0
  sum[3] =
251
0
      _mm_mul_pd (sum[3], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (1, 1)));
252
0
  sum[0] = _mm_add_pd (sum[0], sum[1]);
253
0
  sum[2] = _mm_add_pd (sum[2], sum[3]);
254
0
  sum[0] = _mm_add_pd (sum[0], sum[2]);
255
0
  sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
256
0
  _mm_store_sd (o, sum[0]);
257
0
}
258
259
/* Expand MAKE_RESAMPLE_FUNC once per sample type (gint16, gdouble) and
 * interpolation mode (full, linear, cubic), each time with the argument 1
 * (matching the _1_ in the helper names) and the sse2 suffix.  The macro is
 * defined outside this file; it presumably builds the resample entry points
 * on top of the matching inner_product_<type>_<mode>_1_sse2 helpers defined
 * above -- confirm against the macro definition. */
MAKE_RESAMPLE_FUNC (gint16, full, 1, sse2);
260
MAKE_RESAMPLE_FUNC (gint16, linear, 1, sse2);
261
MAKE_RESAMPLE_FUNC (gint16, cubic, 1, sse2);
262
263
MAKE_RESAMPLE_FUNC (gdouble, full, 1, sse2);
264
MAKE_RESAMPLE_FUNC (gdouble, linear, 1, sse2);
265
MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, sse2);
266
267
void
268
interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap,
269
    gint len, const gpointer icp, gint astride)
270
0
{
271
0
  gint i = 0;
272
0
  gint16 *o = op, *a = ap, *ic = icp;
273
0
  __m128i ta, tb, t1, t2;
274
0
  __m128i f = _mm_set_epi64x (0, *((gint64 *) ic));
275
0
  const gint16 *c[2] = { (gint16 *) ((gint8 *) a + 0 * astride),
276
0
    (gint16 *) ((gint8 *) a + 1 * astride)
277
0
  };
278
279
0
  f = _mm_unpacklo_epi32 (f, f);
280
0
  f = _mm_unpacklo_epi64 (f, f);
281
282
0
  for (; i < len; i += 8) {
283
0
    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
284
0
    tb = _mm_load_si128 ((__m128i *) (c[1] + i));
285
286
0
    t1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f);
287
0
    t2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f);
288
289
0
    t1 = _mm_add_epi32 (t1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
290
0
    t2 = _mm_add_epi32 (t2, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
291
292
0
    t1 = _mm_srai_epi32 (t1, PRECISION_S16);
293
0
    t2 = _mm_srai_epi32 (t2, PRECISION_S16);
294
295
0
    t1 = _mm_packs_epi32 (t1, t2);
296
0
    _mm_store_si128 ((__m128i *) (o + i), t1);
297
0
  }
298
0
}
299
300
void
301
interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap,
302
    gint len, const gpointer icp, gint astride)
303
0
{
304
0
  gint i = 0;
305
0
  gint16 *o = op, *a = ap, *ic = icp;
306
0
  __m128i ta, tb, tl1, tl2, th1, th2;
307
0
  __m128i f[2];
308
0
  const gint16 *c[4] = { (gint16 *) ((gint8 *) a + 0 * astride),
309
0
    (gint16 *) ((gint8 *) a + 1 * astride),
310
0
    (gint16 *) ((gint8 *) a + 2 * astride),
311
0
    (gint16 *) ((gint8 *) a + 3 * astride)
312
0
  };
313
314
0
  f[0] = _mm_set_epi16 (ic[1], ic[0], ic[1], ic[0], ic[1], ic[0], ic[1], ic[0]);
315
0
  f[1] = _mm_set_epi16 (ic[3], ic[2], ic[3], ic[2], ic[3], ic[2], ic[3], ic[2]);
316
317
0
  for (; i < len; i += 8) {
318
0
    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
319
0
    tb = _mm_load_si128 ((__m128i *) (c[1] + i));
320
321
0
    tl1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[0]);
322
0
    th1 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[0]);
323
324
0
    ta = _mm_load_si128 ((__m128i *) (c[2] + i));
325
0
    tb = _mm_load_si128 ((__m128i *) (c[3] + i));
326
327
0
    tl2 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[1]);
328
0
    th2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[1]);
329
330
0
    tl1 = _mm_add_epi32 (tl1, tl2);
331
0
    th1 = _mm_add_epi32 (th1, th2);
332
333
0
    tl1 = _mm_add_epi32 (tl1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
334
0
    th1 = _mm_add_epi32 (th1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
335
336
0
    tl1 = _mm_srai_epi32 (tl1, PRECISION_S16);
337
0
    th1 = _mm_srai_epi32 (th1, PRECISION_S16);
338
339
0
    tl1 = _mm_packs_epi32 (tl1, th1);
340
0
    _mm_store_si128 ((__m128i *) (o + i), tl1);
341
0
  }
342
0
}
343
344
void
345
interpolate_gdouble_linear_sse2 (gpointer op, const gpointer ap,
346
    gint len, const gpointer icp, gint astride)
347
0
{
348
0
  gint i;
349
0
  gdouble *o = op, *a = ap, *ic = icp;
350
0
  __m128d f[2], t1, t2;
351
0
  const gdouble *c[2] = { (gdouble *) ((gint8 *) a + 0 * astride),
352
0
    (gdouble *) ((gint8 *) a + 1 * astride)
353
0
  };
354
355
0
  f[0] = _mm_load1_pd (ic + 0);
356
0
  f[1] = _mm_load1_pd (ic + 1);
357
358
0
  for (i = 0; i < len; i += 4) {
359
0
    t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]);
360
0
    t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]);
361
0
    _mm_store_pd (o + i + 0, _mm_add_pd (t1, t2));
362
363
0
    t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 2), f[0]);
364
0
    t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 2), f[1]);
365
0
    _mm_store_pd (o + i + 2, _mm_add_pd (t1, t2));
366
0
  }
367
0
}
368
369
void
370
interpolate_gdouble_cubic_sse2 (gpointer op, const gpointer ap,
371
    gint len, const gpointer icp, gint astride)
372
0
{
373
0
  gint i;
374
0
  gdouble *o = op, *a = ap, *ic = icp;
375
0
  __m128d f[4], t[4];
376
0
  const gdouble *c[4] = { (gdouble *) ((gint8 *) a + 0 * astride),
377
0
    (gdouble *) ((gint8 *) a + 1 * astride),
378
0
    (gdouble *) ((gint8 *) a + 2 * astride),
379
0
    (gdouble *) ((gint8 *) a + 3 * astride)
380
0
  };
381
382
0
  f[0] = _mm_load1_pd (ic + 0);
383
0
  f[1] = _mm_load1_pd (ic + 1);
384
0
  f[2] = _mm_load1_pd (ic + 2);
385
0
  f[3] = _mm_load1_pd (ic + 3);
386
387
0
  for (i = 0; i < len; i += 2) {
388
0
    t[0] = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]);
389
0
    t[1] = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]);
390
0
    t[2] = _mm_mul_pd (_mm_load_pd (c[2] + i + 0), f[2]);
391
0
    t[3] = _mm_mul_pd (_mm_load_pd (c[3] + i + 0), f[3]);
392
0
    t[0] = _mm_add_pd (t[0], t[1]);
393
0
    t[2] = _mm_add_pd (t[2], t[3]);
394
0
    _mm_store_pd (o + i + 0, _mm_add_pd (t[0], t[2]));
395
0
  }
396
0
}