/src/gstreamer/subprojects/gst-plugins-base/gst-libs/gst/audio/audio-resampler-x86-sse2.c

Source
/* GStreamer
 * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif

#include "audio-resampler-x86-sse2.h"

#include <immintrin.h>

/* Computes one 16-bit output sample as the dot product of the samples in
 * `a` with the filter taps in `b`.
 *
 * Products are accumulated in four 32-bit lanes, summed horizontally,
 * rounded by adding 1 << (PRECISION_S16 - 1), shifted right by
 * PRECISION_S16 and packed with signed saturation; the low 16-bit lane is
 * then written to `o`.
 *
 * `a` is read with unaligned loads and `b` with aligned loads, so `b` has
 * to be 16-byte aligned. Every iteration consumes 16 samples; presumably
 * the caller pads `len` accordingly -- TODO confirm against the caller.
 * `icoeff` and `bstride` are not used by this variant. */
static inline void
inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  __m128i acc = _mm_setzero_si128 ();
  gint pos = 0;

  while (pos < len) {
    const __m128i in_lo = _mm_loadu_si128 ((__m128i *) (a + pos));
    const __m128i in_hi = _mm_loadu_si128 ((__m128i *) (a + pos + 8));
    const __m128i taps_lo = _mm_load_si128 ((__m128i *) (b + pos + 0));
    const __m128i taps_hi = _mm_load_si128 ((__m128i *) (b + pos + 8));

    acc = _mm_add_epi32 (acc, _mm_madd_epi16 (in_lo, taps_lo));
    acc = _mm_add_epi32 (acc, _mm_madd_epi16 (in_hi, taps_hi));

    pos += 16;
  }

  /* Horizontal add: fold lanes 3 and 2 onto lanes 0 and 1, then lane 1
   * onto lane 0, which ends up holding the total. */
  acc = _mm_add_epi32 (acc, _mm_shuffle_epi32 (acc, _MM_SHUFFLE (2, 3, 2, 3)));
  acc = _mm_add_epi32 (acc, _mm_shuffle_epi32 (acc, _MM_SHUFFLE (1, 1, 1, 1)));

  /* Round, scale back down and saturate to 16 bits. */
  acc = _mm_add_epi32 (acc, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  acc = _mm_srai_epi32 (acc, PRECISION_S16);
  acc = _mm_packs_epi32 (acc, acc);

  *o = _mm_extract_epi16 (acc, 0);
}

/* Computes one 16-bit output sample from two sets of filter taps whose
 * dot products with `a` are weighted by icoeff[0] and icoeff[1].
 *
 * The two tap sets start at `b` and at `b` + `bstride` bytes. Both are
 * read with aligned loads, so both addresses must be 16-byte aligned; `a`
 * is read with unaligned loads. Every iteration consumes 16 samples;
 * presumably the caller pads `len` accordingly -- TODO confirm.
 *
 * 64 bits are loaded from `icoeff` (i.e. icoeff[0..3] must be readable)
 * although only the first two coefficients are used here. */
static inline void
inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[2], t;
  /* Load the low 64 bits with the upper half zeroed. Unlike dereferencing
   * a casted gint64 pointer, this has no alignment requirement and does not
   * violate strict aliasing. */
  __m128i f = _mm_loadl_epi64 ((const __m128i *) icoeff);
  const gint16 *c[2] = { (const gint16 *) ((const gint8 *) b + 0 * bstride),
    (const gint16 *) ((const gint8 *) b + 1 * bstride)
  };

  sum[0] = sum[1] = _mm_setzero_si128 ();
  /* Widen each coefficient to a 32-bit lane whose upper 16 bits are zero. */
  f = _mm_unpacklo_epi16 (f, sum[0]);

  for (; i < len; i += 16) {
    t = _mm_loadu_si128 ((const __m128i *) (a + i + 0));
    sum[0] =
        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
            _mm_load_si128 ((const __m128i *) (c[0] + i + 0))));
    sum[1] =
        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
            _mm_load_si128 ((const __m128i *) (c[1] + i + 0))));

    t = _mm_loadu_si128 ((const __m128i *) (a + i + 8));
    sum[0] =
        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
            _mm_load_si128 ((const __m128i *) (c[0] + i + 8))));
    sum[1] =
        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
            _mm_load_si128 ((const __m128i *) (c[1] + i + 8))));
  }
  /* Scale the partial sums down (no rounding at this stage). */
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[1] = _mm_srai_epi32 (sum[1], PRECISION_S16);

  /* Weight each tap set by its coefficient. Because the odd 16-bit lanes
   * of f are zero, only the low 16 bits of every 32-bit partial sum take
   * part in the multiplication. */
  sum[0] =
      _mm_madd_epi16 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
  sum[1] =
      _mm_madd_epi16 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
  sum[0] = _mm_add_epi32 (sum[0], sum[1]);

  /* Horizontal add of the four lanes into lane 0. */
  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
              3)));
  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
              1)));

  /* Round, scale down and saturate to 16 bits. */
  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}

/* Computes one 16-bit output sample from four sets of filter taps whose
 * dot products with `a` are weighted by icoeff[0..3].
 *
 * Tap set k starts at `b` + k * `bstride` bytes and is read with aligned
 * loads, so each of those addresses must be 16-byte aligned; `a` is read
 * with unaligned loads. Every iteration consumes 8 samples; presumably
 * the caller pads `len` accordingly -- TODO confirm. */
static inline void
inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[4], t[4];
  /* Load icoeff[0..3] into the low 64 bits with the upper half zeroed.
   * Unlike dereferencing a casted 64-bit integer pointer, this has no
   * alignment requirement and does not violate strict aliasing. */
  __m128i f = _mm_loadl_epi64 ((const __m128i *) icoeff);
  const gint16 *c[4] = { (const gint16 *) ((const gint8 *) b + 0 * bstride),
    (const gint16 *) ((const gint8 *) b + 1 * bstride),
    (const gint16 *) ((const gint8 *) b + 2 * bstride),
    (const gint16 *) ((const gint8 *) b + 3 * bstride)
  };

  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
  /* Widen each coefficient to a 32-bit lane whose upper 16 bits are zero,
   * so that lane k of f holds icoeff[k]. */
  f = _mm_unpacklo_epi16 (f, sum[0]);

  for (; i < len; i += 8) {
    t[0] = _mm_loadu_si128 ((const __m128i *) (a + i));
    sum[0] =
        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((const __m128i *) (c[0] + i))));
    sum[1] =
        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((const __m128i *) (c[1] + i))));
    sum[2] =
        _mm_add_epi32 (sum[2], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((const __m128i *) (c[2] + i))));
    sum[3] =
        _mm_add_epi32 (sum[3], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((const __m128i *) (c[3] + i))));
  }
  /* Transpose the 4x4 matrix of partial sums and add its rows, so that
   * lane k of sum[0] ends up holding the horizontal total of tap set k. */
  t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]);
  t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]);
  t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]);
  t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]);

  sum[0] =
      _mm_add_epi32 (_mm_unpacklo_epi64 (t[0], t[1]), _mm_unpackhi_epi64 (t[0],
          t[1]));
  sum[2] =
      _mm_add_epi32 (_mm_unpacklo_epi64 (t[2], t[3]), _mm_unpackhi_epi64 (t[2],
          t[3]));
  sum[0] = _mm_add_epi32 (sum[0], sum[2]);

  /* Scale down (no rounding here) and weight total k by icoeff[k]. Because
   * the odd 16-bit lanes of f are zero, only the low 16 bits of every
   * 32-bit total take part in the multiplication. */
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_madd_epi16 (sum[0], f);

  /* Horizontal add of the four weighted totals into lane 0. */
  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
              3)));
  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
              1)));

  /* Round, scale down and saturate to 16 bits. */
  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}

/* Computes one double-precision output sample as the dot product of the
 * samples in `a` with the filter taps in `b`.
 *
 * `a` is read with unaligned loads and `b` with aligned loads, so `b` has
 * to be 16-byte aligned. Every iteration consumes 8 samples; presumably
 * the caller pads `len` accordingly -- TODO confirm against the caller.
 * `icoeff` and `bstride` are not used by this variant. */
static inline void
inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  __m128d acc = _mm_setzero_pd ();
  gint pos;

  for (pos = 0; pos < len; pos += 8) {
    const gdouble *in = a + pos;
    const gdouble *taps = b + pos;

    /* Four pairs per iteration, accumulated strictly in this order. */
    acc = _mm_add_pd (acc,
        _mm_mul_pd (_mm_loadu_pd (in + 0), _mm_load_pd (taps + 0)));
    acc = _mm_add_pd (acc,
        _mm_mul_pd (_mm_loadu_pd (in + 2), _mm_load_pd (taps + 2)));
    acc = _mm_add_pd (acc,
        _mm_mul_pd (_mm_loadu_pd (in + 4), _mm_load_pd (taps + 4)));
    acc = _mm_add_pd (acc,
        _mm_mul_pd (_mm_loadu_pd (in + 6), _mm_load_pd (taps + 6)));
  }

  /* Add the high lane onto the low lane and store the low lane. */
  acc = _mm_add_sd (acc, _mm_unpackhi_pd (acc, acc));
  _mm_store_sd (o, acc);
}

/* Computes one double-precision output sample from two sets of filter
 * taps, blended as (s0 - s1) * icoeff[0] + s1, where s0 and s1 are the dot
 * products of `a` with the tap sets at `b` and at `b` + `bstride` bytes.
 *
 * Both tap sets are read with aligned loads (16-byte alignment required),
 * `a` with unaligned loads. Every iteration consumes 4 samples; presumably
 * the caller pads `len` accordingly -- TODO confirm against the caller. */
static inline void
inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  const gdouble *taps0 = (gdouble *) ((gint8 *) b + 0 * bstride);
  const gdouble *taps1 = (gdouble *) ((gint8 *) b + 1 * bstride);
  __m128d acc0 = _mm_setzero_pd ();
  __m128d acc1 = _mm_setzero_pd ();
  __m128d in, blended;
  gint pos = 0;

  while (pos < len) {
    in = _mm_loadu_pd (a + pos + 0);
    acc0 = _mm_add_pd (acc0, _mm_mul_pd (in, _mm_load_pd (taps0 + pos + 0)));
    acc1 = _mm_add_pd (acc1, _mm_mul_pd (in, _mm_load_pd (taps1 + pos + 0)));

    in = _mm_loadu_pd (a + pos + 2);
    acc0 = _mm_add_pd (acc0, _mm_mul_pd (in, _mm_load_pd (taps0 + pos + 2)));
    acc1 = _mm_add_pd (acc1, _mm_mul_pd (in, _mm_load_pd (taps1 + pos + 2)));

    pos += 4;
  }

  /* Blend the two accumulators with the broadcast coefficient. */
  blended = _mm_mul_pd (_mm_sub_pd (acc0, acc1), _mm_load1_pd (icoeff));
  blended = _mm_add_pd (blended, acc1);

  /* Add the high lane onto the low lane and store the low lane. */
  blended = _mm_add_sd (blended, _mm_unpackhi_pd (blended, blended));
  _mm_store_sd (o, blended);
}

/* Computes one double-precision output sample from four sets of filter
 * taps: the dot product of `a` with tap set k (located at
 * `b` + k * `bstride` bytes) is weighted by icoeff[k] and the four
 * weighted results are summed.
 *
 * The tap sets are read with aligned loads (16-byte alignment required);
 * `a` and `icoeff` are read with unaligned loads. Every iteration consumes
 * 2 samples; presumably `len` is even -- TODO confirm against the caller. */
static inline void
inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  const gdouble *taps0 = (gdouble *) ((gint8 *) b + 0 * bstride);
  const gdouble *taps1 = (gdouble *) ((gint8 *) b + 1 * bstride);
  const gdouble *taps2 = (gdouble *) ((gint8 *) b + 2 * bstride);
  const gdouble *taps3 = (gdouble *) ((gint8 *) b + 3 * bstride);
  const __m128d coef01 = _mm_loadu_pd (icoeff + 0);
  const __m128d coef23 = _mm_loadu_pd (icoeff + 2);
  __m128d acc0, acc1, acc2, acc3, in;
  gint pos = 0;

  acc0 = acc1 = acc2 = acc3 = _mm_setzero_pd ();

  while (pos < len) {
    in = _mm_loadu_pd (a + pos + 0);
    acc0 = _mm_add_pd (acc0, _mm_mul_pd (in, _mm_load_pd (taps0 + pos)));
    acc1 = _mm_add_pd (acc1, _mm_mul_pd (in, _mm_load_pd (taps1 + pos)));
    acc2 = _mm_add_pd (acc2, _mm_mul_pd (in, _mm_load_pd (taps2 + pos)));
    acc3 = _mm_add_pd (acc3, _mm_mul_pd (in, _mm_load_pd (taps3 + pos)));
    pos += 2;
  }

  /* Weight accumulator k by icoeff[k], broadcast to both lanes. */
  acc0 = _mm_mul_pd (acc0, _mm_unpacklo_pd (coef01, coef01));
  acc1 = _mm_mul_pd (acc1, _mm_unpackhi_pd (coef01, coef01));
  acc2 = _mm_mul_pd (acc2, _mm_unpacklo_pd (coef23, coef23));
  acc3 = _mm_mul_pd (acc3, _mm_unpackhi_pd (coef23, coef23));

  /* Pairwise reduction: (acc0 + acc1) + (acc2 + acc3). */
  acc0 = _mm_add_pd (acc0, acc1);
  acc2 = _mm_add_pd (acc2, acc3);
  acc0 = _mm_add_pd (acc0, acc2);

  /* Add the high lane onto the low lane and store the low lane. */
  acc0 = _mm_add_sd (acc0, _mm_unpackhi_pd (acc0, acc0));
  _mm_store_sd (o, acc0);
}

/* Expand MAKE_RESAMPLE_FUNC once per sample type (gint16, gdouble) and
 * filter kind (full, linear, cubic), each with the arguments `1` and
 * `sse2`. The macro is defined outside this file; presumably it generates
 * the resample entry points built on the matching inner_product_*_1_sse2
 * kernels above -- verify against the macro definition. */
MAKE_RESAMPLE_FUNC (gint16, full, 1, sse2);
MAKE_RESAMPLE_FUNC (gint16, linear, 1, sse2);
MAKE_RESAMPLE_FUNC (gint16, cubic, 1, sse2);

MAKE_RESAMPLE_FUNC (gdouble, full, 1, sse2);
MAKE_RESAMPLE_FUNC (gdouble, linear, 1, sse2);
MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, sse2);

/* Blends two rows of 16-bit values into the output buffer:
 *
 *   o[i] = sat16 ((r0[i] * ic[0] + r1[i] * ic[1] + rounding) >> PRECISION_S16)
 *
 * where r0 starts at `ap`, r1 at `ap` + `astride` bytes and rounding is
 * 1 << (PRECISION_S16 - 1).
 *
 * op:      output buffer; written with aligned stores (16-byte alignment
 *          required)
 * ap:      first input row; both rows are read with aligned loads
 * len:     number of values; 8 are processed per iteration, so presumably
 *          the buffers are padded to a multiple of 8 -- TODO confirm
 * icp:     gint16 coefficients; only icp[0] and icp[1] are read
 * astride: distance in bytes between the two input rows */
void
interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i = 0;
  gint16 *o = op, *a = ap, *ic = icp;
  __m128i ta, tb, t1, t2;
  __m128i f;
  const gint16 *c[2] = { (gint16 *) ((gint8 *) a + 0 * astride),
    (gint16 *) ((gint8 *) a + 1 * astride)
  };

  /* Repeat the (ic[0], ic[1]) pair in every 32-bit lane. Building the
   * vector from the individual elements (as interpolate_gint16_cubic_sse2
   * does) avoids reading the gint16 array through a gint64 pointer and
   * touches only the two coefficients that are actually used. */
  f = _mm_set_epi16 (ic[1], ic[0], ic[1], ic[0], ic[1], ic[0], ic[1], ic[0]);

  for (; i < len; i += 8) {
    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
    tb = _mm_load_si128 ((__m128i *) (c[1] + i));

    /* Interleave the rows so that madd yields r0 * ic[0] + r1 * ic[1]. */
    t1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f);
    t2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f);

    t1 = _mm_add_epi32 (t1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    t2 = _mm_add_epi32 (t2, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));

    t1 = _mm_srai_epi32 (t1, PRECISION_S16);
    t2 = _mm_srai_epi32 (t2, PRECISION_S16);

    /* Saturate back to 16 bits; elements 0-3 come from t1, 4-7 from t2. */
    t1 = _mm_packs_epi32 (t1, t2);
    _mm_store_si128 ((__m128i *) (o + i), t1);
  }
}

/* Blends four rows of 16-bit values into the output buffer:
 *
 *   o[i] = sat16 ((r0[i] * ic[0] + r1[i] * ic[1] + r2[i] * ic[2]
 *                  + r3[i] * ic[3] + rounding) >> PRECISION_S16)
 *
 * where row k starts at `ap` + k * `astride` bytes and rounding is
 * 1 << (PRECISION_S16 - 1).
 *
 * The rows are read with aligned loads and the output is written with
 * aligned stores, so all of them must be 16-byte aligned. 8 values are
 * processed per iteration; presumably the buffers are padded to a multiple
 * of 8 -- TODO confirm against the caller. */
void
interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint16 *dst = op, *src = ap, *coeff = icp;
  const gint16 *row0 = (gint16 *) ((gint8 *) src + 0 * astride);
  const gint16 *row1 = (gint16 *) ((gint8 *) src + 1 * astride);
  const gint16 *row2 = (gint16 *) ((gint8 *) src + 2 * astride);
  const gint16 *row3 = (gint16 *) ((gint8 *) src + 3 * astride);
  /* (coeff[0], coeff[1]) and (coeff[2], coeff[3]) repeated in every
   * 32-bit lane. */
  const __m128i w01 = _mm_set_epi16 (coeff[1], coeff[0], coeff[1], coeff[0],
      coeff[1], coeff[0], coeff[1], coeff[0]);
  const __m128i w23 = _mm_set_epi16 (coeff[3], coeff[2], coeff[3], coeff[2],
      coeff[3], coeff[2], coeff[3], coeff[2]);
  gint pos = 0;

  while (pos < len) {
    __m128i ra, rb, lo, hi, lo23, hi23;

    /* Rows 0 and 1: interleave so madd yields r0 * c0 + r1 * c1. */
    ra = _mm_load_si128 ((__m128i *) (row0 + pos));
    rb = _mm_load_si128 ((__m128i *) (row1 + pos));
    lo = _mm_madd_epi16 (_mm_unpacklo_epi16 (ra, rb), w01);
    hi = _mm_madd_epi16 (_mm_unpackhi_epi16 (ra, rb), w01);

    /* Rows 2 and 3, same scheme with the second coefficient pair. */
    ra = _mm_load_si128 ((__m128i *) (row2 + pos));
    rb = _mm_load_si128 ((__m128i *) (row3 + pos));
    lo23 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ra, rb), w23);
    hi23 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ra, rb), w23);

    lo = _mm_add_epi32 (lo, lo23);
    hi = _mm_add_epi32 (hi, hi23);

    /* Round and scale back down. */
    lo = _mm_add_epi32 (lo, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    hi = _mm_add_epi32 (hi, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    lo = _mm_srai_epi32 (lo, PRECISION_S16);
    hi = _mm_srai_epi32 (hi, PRECISION_S16);

    /* Saturate to 16 bits and write 8 results. */
    _mm_store_si128 ((__m128i *) (dst + pos), _mm_packs_epi32 (lo, hi));

    pos += 8;
  }
}

/* Blends two rows of doubles into the output buffer:
 *
 *   o[i] = r0[i] * ic[0] + r1[i] * ic[1]
 *
 * where r0 starts at `ap` and r1 at `ap` + `astride` bytes.
 *
 * The rows are read with aligned loads and the output is written with
 * aligned stores, so all of them must be 16-byte aligned. 4 values are
 * processed per outer iteration; presumably the buffers are padded to a
 * multiple of 4 -- TODO confirm against the caller. */
void
interpolate_gdouble_linear_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gdouble *dst = op, *src = ap, *coeff = icp;
  const gdouble *row0 = (gdouble *) ((gint8 *) src + 0 * astride);
  const gdouble *row1 = (gdouble *) ((gint8 *) src + 1 * astride);
  const __m128d w0 = _mm_load1_pd (coeff + 0);
  const __m128d w1 = _mm_load1_pd (coeff + 1);
  gint pos = 0;

  while (pos < len) {
    gint k;

    /* Two vectors of two doubles each per outer iteration. */
    for (k = 0; k < 4; k += 2) {
      const __m128d p0 = _mm_mul_pd (_mm_load_pd (row0 + pos + k), w0);
      const __m128d p1 = _mm_mul_pd (_mm_load_pd (row1 + pos + k), w1);

      _mm_store_pd (dst + pos + k, _mm_add_pd (p0, p1));
    }

    pos += 4;
  }
}

/* Blends four rows of doubles into the output buffer:
 *
 *   o[i] = (r0[i] * ic[0] + r1[i] * ic[1]) + (r2[i] * ic[2] + r3[i] * ic[3])
 *
 * where row k starts at `ap` + k * `astride` bytes.
 *
 * The rows are read with aligned loads and the output is written with
 * aligned stores, so all of them must be 16-byte aligned. 2 values are
 * processed per iteration; presumably `len` is even -- TODO confirm
 * against the caller. */
void
interpolate_gdouble_cubic_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gdouble *dst = op, *src = ap, *coeff = icp;
  const gdouble *row0 = (gdouble *) ((gint8 *) src + 0 * astride);
  const gdouble *row1 = (gdouble *) ((gint8 *) src + 1 * astride);
  const gdouble *row2 = (gdouble *) ((gint8 *) src + 2 * astride);
  const gdouble *row3 = (gdouble *) ((gint8 *) src + 3 * astride);
  /* Each coefficient broadcast to both lanes. */
  const __m128d w0 = _mm_load1_pd (coeff + 0);
  const __m128d w1 = _mm_load1_pd (coeff + 1);
  const __m128d w2 = _mm_load1_pd (coeff + 2);
  const __m128d w3 = _mm_load1_pd (coeff + 3);
  gint pos = 0;

  while (pos < len) {
    const __m128d p0 = _mm_mul_pd (_mm_load_pd (row0 + pos + 0), w0);
    const __m128d p1 = _mm_mul_pd (_mm_load_pd (row1 + pos + 0), w1);
    const __m128d p2 = _mm_mul_pd (_mm_load_pd (row2 + pos + 0), w2);
    const __m128d p3 = _mm_mul_pd (_mm_load_pd (row3 + pos + 0), w3);
    const __m128d s01 = _mm_add_pd (p0, p1);
    const __m128d s23 = _mm_add_pd (p2, p3);

    _mm_store_pd (dst + pos + 0, _mm_add_pd (s01, s23));

    pos += 2;
  }
}

Coverage Report

Created: 2026-05-16 06:35

Line	Count	Source
1		/* GStreamer
2		* Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
3		*
4		* This library is free software; you can redistribute it and/or
5		* modify it under the terms of the GNU Library General Public
6		* License as published by the Free Software Foundation; either
7		* version 2 of the License, or (at your option) any later version.
8		*
9		* This library is distributed in the hope that it will be useful,
10		* but WITHOUT ANY WARRANTY; without even the implied warranty of
11		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12		* Library General Public License for more details.
13		*
14		* You should have received a copy of the GNU Library General Public
15		* License along with this library; if not, write to the
16		* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
17		* Boston, MA 02110-1301, USA.
18		*/
19
20		#ifdef HAVE_CONFIG_H
21		# include "config.h"
22		#endif
23
24		#include "audio-resampler-x86-sse2.h"
25
26		#include <immintrin.h>
27
28		static inline void
29		inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a,
30		const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
31	0	{
32	0	gint i;
33	0	__m128i sum, t;
34
35	0	sum = _mm_setzero_si128 ();
36
37	0	for (i = 0; i < len; i += 16) {
38	0	t = _mm_loadu_si128 ((__m128i *) (a + i));
39	0	sum =
40	0	_mm_add_epi32 (sum, _mm_madd_epi16 (t,
41	0	_mm_load_si128 ((__m128i *) (b + i + 0))));
42
43	0	t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
44	0	sum =
45	0	_mm_add_epi32 (sum, _mm_madd_epi16 (t,
46	0	_mm_load_si128 ((__m128i *) (b + i + 8))));
47	0	}
48	0	sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3)));
49	0	sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1)));
50
51	0	sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
52	0	sum = _mm_srai_epi32 (sum, PRECISION_S16);
53	0	sum = _mm_packs_epi32 (sum, sum);
54	0	*o = _mm_extract_epi16 (sum, 0);
55	0	}
56
57		static inline void
58		inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a,
59		const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
60	0	{
61	0	gint i = 0;
62	0	__m128i sum[2], t;
63	0	__m128i f = _mm_set_epi64x (0, *((gint64 *) icoeff));
64	0	const gint16 *c[2] = { (gint16 *) ((gint8 *) b + 0 * bstride),
65	0	(gint16 *) ((gint8 *) b + 1 * bstride)
66	0	};
67
68	0	sum[0] = sum[1] = _mm_setzero_si128 ();
69	0	f = _mm_unpacklo_epi16 (f, sum[0]);
70
71	0	for (; i < len; i += 16) {
72	0	t = _mm_loadu_si128 ((__m128i *) (a + i + 0));
73	0	sum[0] =
74	0	_mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
75	0	_mm_load_si128 ((__m128i *) (c[0] + i + 0))));
76	0	sum[1] =
77	0	_mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
78	0	_mm_load_si128 ((__m128i *) (c[1] + i + 0))));
79
80	0	t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
81	0	sum[0] =
82	0	_mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
83	0	_mm_load_si128 ((__m128i *) (c[0] + i + 8))));
84	0	sum[1] =
85	0	_mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
86	0	_mm_load_si128 ((__m128i *) (c[1] + i + 8))));
87	0	}
88	0	sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
89	0	sum[1] = _mm_srai_epi32 (sum[1], PRECISION_S16);
90
91	0	sum[0] =
92	0	_mm_madd_epi16 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
93	0	sum[1] =
94	0	_mm_madd_epi16 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
95	0	sum[0] = _mm_add_epi32 (sum[0], sum[1]);
96
97	0	sum[0] =
98	0	_mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
99	0	3)));
100	0	sum[0] =
101	0	_mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
102	0	1)));
103
104	0	sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
105	0	sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
106	0	sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
107	0	*o = _mm_extract_epi16 (sum[0], 0);
108	0	}
109
110		static inline void
111		inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
112		const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
113	0	{
114	0	gint i = 0;
115	0	__m128i sum[4], t[4];
116	0	__m128i f = _mm_set_epi64x (0, *((long long *) icoeff));
117	0	const gint16 *c[4] = { (gint16 *) ((gint8 *) b + 0 * bstride),
118	0	(gint16 *) ((gint8 *) b + 1 * bstride),
119	0	(gint16 *) ((gint8 *) b + 2 * bstride),
120	0	(gint16 *) ((gint8 *) b + 3 * bstride)
121	0	};
122
123	0	sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
124	0	f = _mm_unpacklo_epi16 (f, sum[0]);
125
126	0	for (; i < len; i += 8) {
127	0	t[0] = _mm_loadu_si128 ((__m128i *) (a + i));
128	0	sum[0] =
129	0	_mm_add_epi32 (sum[0], _mm_madd_epi16 (t[0],
130	0	_mm_load_si128 ((__m128i *) (c[0] + i))));
131	0	sum[1] =
132	0	_mm_add_epi32 (sum[1], _mm_madd_epi16 (t[0],
133	0	_mm_load_si128 ((__m128i *) (c[1] + i))));
134	0	sum[2] =
135	0	_mm_add_epi32 (sum[2], _mm_madd_epi16 (t[0],
136	0	_mm_load_si128 ((__m128i *) (c[2] + i))));
137	0	sum[3] =
138	0	_mm_add_epi32 (sum[3], _mm_madd_epi16 (t[0],
139	0	_mm_load_si128 ((__m128i *) (c[3] + i))));
140	0	}
141	0	t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]);
142	0	t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]);
143	0	t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]);
144	0	t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]);
145
146	0	sum[0] =
147	0	_mm_add_epi32 (_mm_unpacklo_epi64 (t[0], t[1]), _mm_unpackhi_epi64 (t[0],
148	0	t[1]));
149	0	sum[2] =
150	0	_mm_add_epi32 (_mm_unpacklo_epi64 (t[2], t[3]), _mm_unpackhi_epi64 (t[2],
151	0	t[3]));
152	0	sum[0] = _mm_add_epi32 (sum[0], sum[2]);
153
154	0	sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
155	0	sum[0] = _mm_madd_epi16 (sum[0], f);
156
157	0	sum[0] =
158	0	_mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
159	0	3)));
160	0	sum[0] =
161	0	_mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
162	0	1)));
163
164	0	sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
165	0	sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
166	0	sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
167	0	*o = _mm_extract_epi16 (sum[0], 0);
168	0	}
169
170		static inline void
171		inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a,
172		const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
173	0	{
174	0	gint i = 0;
175	0	__m128d sum = _mm_setzero_pd ();
176
177	0	for (; i < len; i += 8) {
178	0	sum =
179	0	_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0),
180	0	_mm_load_pd (b + i + 0)));
181	0	sum =
182	0	_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2),
183	0	_mm_load_pd (b + i + 2)));
184	0	sum =
185	0	_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4),
186	0	_mm_load_pd (b + i + 4)));
187	0	sum =
188	0	_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6),
189	0	_mm_load_pd (b + i + 6)));
190	0	}
191	0	sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum));
192	0	_mm_store_sd (o, sum);
193	0	}
194
195		static inline void
196		inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a,
197		const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
198	0	{
199	0	gint i = 0;
200	0	__m128d sum[2], t;
201	0	const gdouble *c[2] = { (gdouble *) ((gint8 *) b + 0 * bstride),
202	0	(gdouble *) ((gint8 *) b + 1 * bstride)
203	0	};
204
205	0	sum[0] = sum[1] = _mm_setzero_pd ();
206
207	0	for (; i < len; i += 4) {
208	0	t = _mm_loadu_pd (a + i + 0);
209	0	sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 0)));
210	0	sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 0)));
211	0	t = _mm_loadu_pd (a + i + 2);
212	0	sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 2)));
213	0	sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 2)));
214	0	}
215	0	sum[0] = _mm_mul_pd (_mm_sub_pd (sum[0], sum[1]), _mm_load1_pd (icoeff));
216	0	sum[0] = _mm_add_pd (sum[0], sum[1]);
217	0	sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
218	0	_mm_store_sd (o, sum[0]);
219	0	}
220
221		static inline void
222		inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a,
223		const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
224	0	{
225	0	gint i;
226	0	__m128d f[2], sum[4], t;
227	0	const gdouble *c[4] = { (gdouble *) ((gint8 *) b + 0 * bstride),
228	0	(gdouble *) ((gint8 *) b + 1 * bstride),
229	0	(gdouble *) ((gint8 *) b + 2 * bstride),
230	0	(gdouble *) ((gint8 *) b + 3 * bstride)
231	0	};
232
233	0	f[0] = _mm_loadu_pd (icoeff + 0);
234	0	f[1] = _mm_loadu_pd (icoeff + 2);
235	0	sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_pd ();
236
237	0	for (i = 0; i < len; i += 2) {
238	0	t = _mm_loadu_pd (a + i + 0);
239	0	sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i)));
240	0	sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i)));
241	0	sum[2] = _mm_add_pd (sum[2], _mm_mul_pd (t, _mm_load_pd (c[2] + i)));
242	0	sum[3] = _mm_add_pd (sum[3], _mm_mul_pd (t, _mm_load_pd (c[3] + i)));
243	0	}
244	0	sum[0] =
245	0	_mm_mul_pd (sum[0], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (0, 0)));
246	0	sum[1] =
247	0	_mm_mul_pd (sum[1], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (1, 1)));
248	0	sum[2] =
249	0	_mm_mul_pd (sum[2], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (0, 0)));
250	0	sum[3] =
251	0	_mm_mul_pd (sum[3], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (1, 1)));
252	0	sum[0] = _mm_add_pd (sum[0], sum[1]);
253	0	sum[2] = _mm_add_pd (sum[2], sum[3]);
254	0	sum[0] = _mm_add_pd (sum[0], sum[2]);
255	0	sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
256	0	_mm_store_sd (o, sum[0]);
257	0	}
258
259		MAKE_RESAMPLE_FUNC (gint16, full, 1, sse2);
260		MAKE_RESAMPLE_FUNC (gint16, linear, 1, sse2);
261		MAKE_RESAMPLE_FUNC (gint16, cubic, 1, sse2);
262
263		MAKE_RESAMPLE_FUNC (gdouble, full, 1, sse2);
264		MAKE_RESAMPLE_FUNC (gdouble, linear, 1, sse2);
265		MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, sse2);
266
267		void
268		interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap,
269		gint len, const gpointer icp, gint astride)
270	0	{
271	0	gint i = 0;
272	0	gint16 *o = op, *a = ap, *ic = icp;
273	0	__m128i ta, tb, t1, t2;
274	0	__m128i f = _mm_set_epi64x (0, *((gint64 *) ic));
275	0	const gint16 *c[2] = { (gint16 *) ((gint8 *) a + 0 * astride),
276	0	(gint16 *) ((gint8 *) a + 1 * astride)
277	0	};
278
279	0	f = _mm_unpacklo_epi32 (f, f);
280	0	f = _mm_unpacklo_epi64 (f, f);
281
282	0	for (; i < len; i += 8) {
283	0	ta = _mm_load_si128 ((__m128i *) (c[0] + i));
284	0	tb = _mm_load_si128 ((__m128i *) (c[1] + i));
285
286	0	t1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f);
287	0	t2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f);
288
289	0	t1 = _mm_add_epi32 (t1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
290	0	t2 = _mm_add_epi32 (t2, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
291
292	0	t1 = _mm_srai_epi32 (t1, PRECISION_S16);
293	0	t2 = _mm_srai_epi32 (t2, PRECISION_S16);
294
295	0	t1 = _mm_packs_epi32 (t1, t2);
296	0	_mm_store_si128 ((__m128i *) (o + i), t1);
297	0	}
298	0	}
299
300		void
301		interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap,
302		gint len, const gpointer icp, gint astride)
303	0	{
304	0	gint i = 0;
305	0	gint16 *o = op, *a = ap, *ic = icp;
306	0	__m128i ta, tb, tl1, tl2, th1, th2;
307	0	__m128i f[2];
308	0	const gint16 *c[4] = { (gint16 *) ((gint8 *) a + 0 * astride),
309	0	(gint16 *) ((gint8 *) a + 1 * astride),
310	0	(gint16 *) ((gint8 *) a + 2 * astride),
311	0	(gint16 *) ((gint8 *) a + 3 * astride)
312	0	};
313
314	0	f[0] = _mm_set_epi16 (ic[1], ic[0], ic[1], ic[0], ic[1], ic[0], ic[1], ic[0]);
315	0	f[1] = _mm_set_epi16 (ic[3], ic[2], ic[3], ic[2], ic[3], ic[2], ic[3], ic[2]);
316
317	0	for (; i < len; i += 8) {
318	0	ta = _mm_load_si128 ((__m128i *) (c[0] + i));
319	0	tb = _mm_load_si128 ((__m128i *) (c[1] + i));
320
321	0	tl1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[0]);
322	0	th1 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[0]);
323
324	0	ta = _mm_load_si128 ((__m128i *) (c[2] + i));
325	0	tb = _mm_load_si128 ((__m128i *) (c[3] + i));
326
327	0	tl2 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[1]);
328	0	th2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[1]);
329
330	0	tl1 = _mm_add_epi32 (tl1, tl2);
331	0	th1 = _mm_add_epi32 (th1, th2);
332
333	0	tl1 = _mm_add_epi32 (tl1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
334	0	th1 = _mm_add_epi32 (th1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
335
336	0	tl1 = _mm_srai_epi32 (tl1, PRECISION_S16);
337	0	th1 = _mm_srai_epi32 (th1, PRECISION_S16);
338
339	0	tl1 = _mm_packs_epi32 (tl1, th1);
340	0	_mm_store_si128 ((__m128i *) (o + i), tl1);
341	0	}
342	0	}
343
344		void
345		interpolate_gdouble_linear_sse2 (gpointer op, const gpointer ap,
346		gint len, const gpointer icp, gint astride)
347	0	{
348	0	gint i;
349	0	gdouble *o = op, *a = ap, *ic = icp;
350	0	__m128d f[2], t1, t2;
351	0	const gdouble *c[2] = { (gdouble *) ((gint8 *) a + 0 * astride),
352	0	(gdouble *) ((gint8 *) a + 1 * astride)
353	0	};
354
355	0	f[0] = _mm_load1_pd (ic + 0);
356	0	f[1] = _mm_load1_pd (ic + 1);
357
358	0	for (i = 0; i < len; i += 4) {
359	0	t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]);
360	0	t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]);
361	0	_mm_store_pd (o + i + 0, _mm_add_pd (t1, t2));
362
363	0	t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 2), f[0]);
364	0	t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 2), f[1]);
365	0	_mm_store_pd (o + i + 2, _mm_add_pd (t1, t2));
366	0	}
367	0	}
368
369		void
370		interpolate_gdouble_cubic_sse2 (gpointer op, const gpointer ap,
371		gint len, const gpointer icp, gint astride)
372	0	{
373	0	gint i;
374	0	gdouble *o = op, *a = ap, *ic = icp;
375	0	__m128d f[4], t[4];
376	0	const gdouble *c[4] = { (gdouble *) ((gint8 *) a + 0 * astride),
377	0	(gdouble *) ((gint8 *) a + 1 * astride),
378	0	(gdouble *) ((gint8 *) a + 2 * astride),
379	0	(gdouble *) ((gint8 *) a + 3 * astride)
380	0	};
381
382	0	f[0] = _mm_load1_pd (ic + 0);
383	0	f[1] = _mm_load1_pd (ic + 1);
384	0	f[2] = _mm_load1_pd (ic + 2);
385	0	f[3] = _mm_load1_pd (ic + 3);
386
387	0	for (i = 0; i < len; i += 2) {
388	0	t[0] = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]);
389	0	t[1] = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]);
390	0	t[2] = _mm_mul_pd (_mm_load_pd (c[2] + i + 0), f[2]);
391	0	t[3] = _mm_mul_pd (_mm_load_pd (c[3] + i + 0), f[3]);
392	0	t[0] = _mm_add_pd (t[0], t[1]);
393	0	t[2] = _mm_add_pd (t[2], t[3]);
394	0	_mm_store_pd (o + i + 0, _mm_add_pd (t[0], t[2]));
395	0	}
396	0	}