/src/gstreamer/subprojects/gst-plugins-base/gst-libs/gst/audio/audio-resampler-x86-sse2.c
Line | Count | Source |
1 | | /* GStreamer |
2 | | * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com> |
3 | | * |
4 | | * This library is free software; you can redistribute it and/or |
5 | | * modify it under the terms of the GNU Library General Public |
6 | | * License as published by the Free Software Foundation; either |
7 | | * version 2 of the License, or (at your option) any later version. |
8 | | * |
9 | | * This library is distributed in the hope that it will be useful, |
10 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | | * Library General Public License for more details. |
13 | | * |
14 | | * You should have received a copy of the GNU Library General Public |
15 | | * License along with this library; if not, write to the |
16 | | * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, |
17 | | * Boston, MA 02110-1301, USA. |
18 | | */ |
19 | | |
20 | | #ifdef HAVE_CONFIG_H |
21 | | # include "config.h" |
22 | | #endif |
23 | | |
24 | | #include "audio-resampler-x86-sse2.h" |
25 | | |
26 | | #include <immintrin.h> |
27 | | |
28 | | static inline void |
29 | | inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a, |
30 | | const gint16 * b, gint len, const gint16 * icoeff, gint bstride) |
31 | 0 | { |
32 | 0 | gint i; |
33 | 0 | __m128i sum, t; |
34 | |
|
35 | 0 | sum = _mm_setzero_si128 (); |
36 | |
|
37 | 0 | for (i = 0; i < len; i += 16) { |
38 | 0 | t = _mm_loadu_si128 ((__m128i *) (a + i)); |
39 | 0 | sum = |
40 | 0 | _mm_add_epi32 (sum, _mm_madd_epi16 (t, |
41 | 0 | _mm_load_si128 ((__m128i *) (b + i + 0)))); |
42 | |
|
43 | 0 | t = _mm_loadu_si128 ((__m128i *) (a + i + 8)); |
44 | 0 | sum = |
45 | 0 | _mm_add_epi32 (sum, _mm_madd_epi16 (t, |
46 | 0 | _mm_load_si128 ((__m128i *) (b + i + 8)))); |
47 | 0 | } |
48 | 0 | sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3))); |
49 | 0 | sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1))); |
50 | |
|
51 | 0 | sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); |
52 | 0 | sum = _mm_srai_epi32 (sum, PRECISION_S16); |
53 | 0 | sum = _mm_packs_epi32 (sum, sum); |
54 | 0 | *o = _mm_extract_epi16 (sum, 0); |
55 | 0 | } |
56 | | |
57 | | static inline void |
58 | | inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a, |
59 | | const gint16 * b, gint len, const gint16 * icoeff, gint bstride) |
60 | 0 | { |
61 | 0 | gint i = 0; |
62 | 0 | __m128i sum[2], t; |
63 | 0 | __m128i f = _mm_set_epi64x (0, *((gint64 *) icoeff)); |
64 | 0 | const gint16 *c[2] = { (gint16 *) ((gint8 *) b + 0 * bstride), |
65 | 0 | (gint16 *) ((gint8 *) b + 1 * bstride) |
66 | 0 | }; |
67 | |
|
68 | 0 | sum[0] = sum[1] = _mm_setzero_si128 (); |
69 | 0 | f = _mm_unpacklo_epi16 (f, sum[0]); |
70 | |
|
71 | 0 | for (; i < len; i += 16) { |
72 | 0 | t = _mm_loadu_si128 ((__m128i *) (a + i + 0)); |
73 | 0 | sum[0] = |
74 | 0 | _mm_add_epi32 (sum[0], _mm_madd_epi16 (t, |
75 | 0 | _mm_load_si128 ((__m128i *) (c[0] + i + 0)))); |
76 | 0 | sum[1] = |
77 | 0 | _mm_add_epi32 (sum[1], _mm_madd_epi16 (t, |
78 | 0 | _mm_load_si128 ((__m128i *) (c[1] + i + 0)))); |
79 | |
|
80 | 0 | t = _mm_loadu_si128 ((__m128i *) (a + i + 8)); |
81 | 0 | sum[0] = |
82 | 0 | _mm_add_epi32 (sum[0], _mm_madd_epi16 (t, |
83 | 0 | _mm_load_si128 ((__m128i *) (c[0] + i + 8)))); |
84 | 0 | sum[1] = |
85 | 0 | _mm_add_epi32 (sum[1], _mm_madd_epi16 (t, |
86 | 0 | _mm_load_si128 ((__m128i *) (c[1] + i + 8)))); |
87 | 0 | } |
88 | 0 | sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16); |
89 | 0 | sum[1] = _mm_srai_epi32 (sum[1], PRECISION_S16); |
90 | |
|
91 | 0 | sum[0] = |
92 | 0 | _mm_madd_epi16 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0))); |
93 | 0 | sum[1] = |
94 | 0 | _mm_madd_epi16 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1))); |
95 | 0 | sum[0] = _mm_add_epi32 (sum[0], sum[1]); |
96 | |
|
97 | 0 | sum[0] = |
98 | 0 | _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, |
99 | 0 | 3))); |
100 | 0 | sum[0] = |
101 | 0 | _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, |
102 | 0 | 1))); |
103 | |
|
104 | 0 | sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); |
105 | 0 | sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16); |
106 | 0 | sum[0] = _mm_packs_epi32 (sum[0], sum[0]); |
107 | 0 | *o = _mm_extract_epi16 (sum[0], 0); |
108 | 0 | } |
109 | | |
110 | | static inline void |
111 | | inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a, |
112 | | const gint16 * b, gint len, const gint16 * icoeff, gint bstride) |
113 | 0 | { |
114 | 0 | gint i = 0; |
115 | 0 | __m128i sum[4], t[4]; |
116 | 0 | __m128i f = _mm_set_epi64x (0, *((long long *) icoeff)); |
117 | 0 | const gint16 *c[4] = { (gint16 *) ((gint8 *) b + 0 * bstride), |
118 | 0 | (gint16 *) ((gint8 *) b + 1 * bstride), |
119 | 0 | (gint16 *) ((gint8 *) b + 2 * bstride), |
120 | 0 | (gint16 *) ((gint8 *) b + 3 * bstride) |
121 | 0 | }; |
122 | |
|
123 | 0 | sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 (); |
124 | 0 | f = _mm_unpacklo_epi16 (f, sum[0]); |
125 | |
|
126 | 0 | for (; i < len; i += 8) { |
127 | 0 | t[0] = _mm_loadu_si128 ((__m128i *) (a + i)); |
128 | 0 | sum[0] = |
129 | 0 | _mm_add_epi32 (sum[0], _mm_madd_epi16 (t[0], |
130 | 0 | _mm_load_si128 ((__m128i *) (c[0] + i)))); |
131 | 0 | sum[1] = |
132 | 0 | _mm_add_epi32 (sum[1], _mm_madd_epi16 (t[0], |
133 | 0 | _mm_load_si128 ((__m128i *) (c[1] + i)))); |
134 | 0 | sum[2] = |
135 | 0 | _mm_add_epi32 (sum[2], _mm_madd_epi16 (t[0], |
136 | 0 | _mm_load_si128 ((__m128i *) (c[2] + i)))); |
137 | 0 | sum[3] = |
138 | 0 | _mm_add_epi32 (sum[3], _mm_madd_epi16 (t[0], |
139 | 0 | _mm_load_si128 ((__m128i *) (c[3] + i)))); |
140 | 0 | } |
141 | 0 | t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]); |
142 | 0 | t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]); |
143 | 0 | t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]); |
144 | 0 | t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]); |
145 | |
|
146 | 0 | sum[0] = |
147 | 0 | _mm_add_epi32 (_mm_unpacklo_epi64 (t[0], t[1]), _mm_unpackhi_epi64 (t[0], |
148 | 0 | t[1])); |
149 | 0 | sum[2] = |
150 | 0 | _mm_add_epi32 (_mm_unpacklo_epi64 (t[2], t[3]), _mm_unpackhi_epi64 (t[2], |
151 | 0 | t[3])); |
152 | 0 | sum[0] = _mm_add_epi32 (sum[0], sum[2]); |
153 | |
|
154 | 0 | sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16); |
155 | 0 | sum[0] = _mm_madd_epi16 (sum[0], f); |
156 | |
|
157 | 0 | sum[0] = |
158 | 0 | _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, |
159 | 0 | 3))); |
160 | 0 | sum[0] = |
161 | 0 | _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, |
162 | 0 | 1))); |
163 | |
|
164 | 0 | sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); |
165 | 0 | sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16); |
166 | 0 | sum[0] = _mm_packs_epi32 (sum[0], sum[0]); |
167 | 0 | *o = _mm_extract_epi16 (sum[0], 0); |
168 | 0 | } |
169 | | |
170 | | static inline void |
171 | | inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a, |
172 | | const gdouble * b, gint len, const gdouble * icoeff, gint bstride) |
173 | 0 | { |
174 | 0 | gint i = 0; |
175 | 0 | __m128d sum = _mm_setzero_pd (); |
176 | |
|
177 | 0 | for (; i < len; i += 8) { |
178 | 0 | sum = |
179 | 0 | _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0), |
180 | 0 | _mm_load_pd (b + i + 0))); |
181 | 0 | sum = |
182 | 0 | _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2), |
183 | 0 | _mm_load_pd (b + i + 2))); |
184 | 0 | sum = |
185 | 0 | _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4), |
186 | 0 | _mm_load_pd (b + i + 4))); |
187 | 0 | sum = |
188 | 0 | _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6), |
189 | 0 | _mm_load_pd (b + i + 6))); |
190 | 0 | } |
191 | 0 | sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum)); |
192 | 0 | _mm_store_sd (o, sum); |
193 | 0 | } |
194 | | |
195 | | static inline void |
196 | | inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a, |
197 | | const gdouble * b, gint len, const gdouble * icoeff, gint bstride) |
198 | 0 | { |
199 | 0 | gint i = 0; |
200 | 0 | __m128d sum[2], t; |
201 | 0 | const gdouble *c[2] = { (gdouble *) ((gint8 *) b + 0 * bstride), |
202 | 0 | (gdouble *) ((gint8 *) b + 1 * bstride) |
203 | 0 | }; |
204 | |
|
205 | 0 | sum[0] = sum[1] = _mm_setzero_pd (); |
206 | |
|
207 | 0 | for (; i < len; i += 4) { |
208 | 0 | t = _mm_loadu_pd (a + i + 0); |
209 | 0 | sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 0))); |
210 | 0 | sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 0))); |
211 | 0 | t = _mm_loadu_pd (a + i + 2); |
212 | 0 | sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 2))); |
213 | 0 | sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 2))); |
214 | 0 | } |
215 | 0 | sum[0] = _mm_mul_pd (_mm_sub_pd (sum[0], sum[1]), _mm_load1_pd (icoeff)); |
216 | 0 | sum[0] = _mm_add_pd (sum[0], sum[1]); |
217 | 0 | sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0])); |
218 | 0 | _mm_store_sd (o, sum[0]); |
219 | 0 | } |
220 | | |
221 | | static inline void |
222 | | inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a, |
223 | | const gdouble * b, gint len, const gdouble * icoeff, gint bstride) |
224 | 0 | { |
225 | 0 | gint i; |
226 | 0 | __m128d f[2], sum[4], t; |
227 | 0 | const gdouble *c[4] = { (gdouble *) ((gint8 *) b + 0 * bstride), |
228 | 0 | (gdouble *) ((gint8 *) b + 1 * bstride), |
229 | 0 | (gdouble *) ((gint8 *) b + 2 * bstride), |
230 | 0 | (gdouble *) ((gint8 *) b + 3 * bstride) |
231 | 0 | }; |
232 | |
|
233 | 0 | f[0] = _mm_loadu_pd (icoeff + 0); |
234 | 0 | f[1] = _mm_loadu_pd (icoeff + 2); |
235 | 0 | sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_pd (); |
236 | |
|
237 | 0 | for (i = 0; i < len; i += 2) { |
238 | 0 | t = _mm_loadu_pd (a + i + 0); |
239 | 0 | sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i))); |
240 | 0 | sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i))); |
241 | 0 | sum[2] = _mm_add_pd (sum[2], _mm_mul_pd (t, _mm_load_pd (c[2] + i))); |
242 | 0 | sum[3] = _mm_add_pd (sum[3], _mm_mul_pd (t, _mm_load_pd (c[3] + i))); |
243 | 0 | } |
244 | 0 | sum[0] = |
245 | 0 | _mm_mul_pd (sum[0], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (0, 0))); |
246 | 0 | sum[1] = |
247 | 0 | _mm_mul_pd (sum[1], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (1, 1))); |
248 | 0 | sum[2] = |
249 | 0 | _mm_mul_pd (sum[2], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (0, 0))); |
250 | 0 | sum[3] = |
251 | 0 | _mm_mul_pd (sum[3], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (1, 1))); |
252 | 0 | sum[0] = _mm_add_pd (sum[0], sum[1]); |
253 | 0 | sum[2] = _mm_add_pd (sum[2], sum[3]); |
254 | 0 | sum[0] = _mm_add_pd (sum[0], sum[2]); |
255 | 0 | sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0])); |
256 | 0 | _mm_store_sd (o, sum[0]); |
257 | 0 | } |
258 | | |
259 | | MAKE_RESAMPLE_FUNC (gint16, full, 1, sse2); |
260 | | MAKE_RESAMPLE_FUNC (gint16, linear, 1, sse2); |
261 | | MAKE_RESAMPLE_FUNC (gint16, cubic, 1, sse2); |
262 | | |
263 | | MAKE_RESAMPLE_FUNC (gdouble, full, 1, sse2); |
264 | | MAKE_RESAMPLE_FUNC (gdouble, linear, 1, sse2); |
265 | | MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, sse2); |
266 | | |
267 | | void |
268 | | interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap, |
269 | | gint len, const gpointer icp, gint astride) |
270 | 0 | { |
271 | 0 | gint i = 0; |
272 | 0 | gint16 *o = op, *a = ap, *ic = icp; |
273 | 0 | __m128i ta, tb, t1, t2; |
274 | 0 | __m128i f = _mm_set_epi64x (0, *((gint64 *) ic)); |
275 | 0 | const gint16 *c[2] = { (gint16 *) ((gint8 *) a + 0 * astride), |
276 | 0 | (gint16 *) ((gint8 *) a + 1 * astride) |
277 | 0 | }; |
278 | |
|
279 | 0 | f = _mm_unpacklo_epi32 (f, f); |
280 | 0 | f = _mm_unpacklo_epi64 (f, f); |
281 | |
|
282 | 0 | for (; i < len; i += 8) { |
283 | 0 | ta = _mm_load_si128 ((__m128i *) (c[0] + i)); |
284 | 0 | tb = _mm_load_si128 ((__m128i *) (c[1] + i)); |
285 | |
|
286 | 0 | t1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f); |
287 | 0 | t2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f); |
288 | |
|
289 | 0 | t1 = _mm_add_epi32 (t1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); |
290 | 0 | t2 = _mm_add_epi32 (t2, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); |
291 | |
|
292 | 0 | t1 = _mm_srai_epi32 (t1, PRECISION_S16); |
293 | 0 | t2 = _mm_srai_epi32 (t2, PRECISION_S16); |
294 | |
|
295 | 0 | t1 = _mm_packs_epi32 (t1, t2); |
296 | 0 | _mm_store_si128 ((__m128i *) (o + i), t1); |
297 | 0 | } |
298 | 0 | } |
299 | | |
300 | | void |
301 | | interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap, |
302 | | gint len, const gpointer icp, gint astride) |
303 | 0 | { |
304 | 0 | gint i = 0; |
305 | 0 | gint16 *o = op, *a = ap, *ic = icp; |
306 | 0 | __m128i ta, tb, tl1, tl2, th1, th2; |
307 | 0 | __m128i f[2]; |
308 | 0 | const gint16 *c[4] = { (gint16 *) ((gint8 *) a + 0 * astride), |
309 | 0 | (gint16 *) ((gint8 *) a + 1 * astride), |
310 | 0 | (gint16 *) ((gint8 *) a + 2 * astride), |
311 | 0 | (gint16 *) ((gint8 *) a + 3 * astride) |
312 | 0 | }; |
313 | |
|
314 | 0 | f[0] = _mm_set_epi16 (ic[1], ic[0], ic[1], ic[0], ic[1], ic[0], ic[1], ic[0]); |
315 | 0 | f[1] = _mm_set_epi16 (ic[3], ic[2], ic[3], ic[2], ic[3], ic[2], ic[3], ic[2]); |
316 | |
|
317 | 0 | for (; i < len; i += 8) { |
318 | 0 | ta = _mm_load_si128 ((__m128i *) (c[0] + i)); |
319 | 0 | tb = _mm_load_si128 ((__m128i *) (c[1] + i)); |
320 | |
|
321 | 0 | tl1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[0]); |
322 | 0 | th1 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[0]); |
323 | |
|
324 | 0 | ta = _mm_load_si128 ((__m128i *) (c[2] + i)); |
325 | 0 | tb = _mm_load_si128 ((__m128i *) (c[3] + i)); |
326 | |
|
327 | 0 | tl2 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[1]); |
328 | 0 | th2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[1]); |
329 | |
|
330 | 0 | tl1 = _mm_add_epi32 (tl1, tl2); |
331 | 0 | th1 = _mm_add_epi32 (th1, th2); |
332 | |
|
333 | 0 | tl1 = _mm_add_epi32 (tl1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); |
334 | 0 | th1 = _mm_add_epi32 (th1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); |
335 | |
|
336 | 0 | tl1 = _mm_srai_epi32 (tl1, PRECISION_S16); |
337 | 0 | th1 = _mm_srai_epi32 (th1, PRECISION_S16); |
338 | |
|
339 | 0 | tl1 = _mm_packs_epi32 (tl1, th1); |
340 | 0 | _mm_store_si128 ((__m128i *) (o + i), tl1); |
341 | 0 | } |
342 | 0 | } |
343 | | |
344 | | void |
345 | | interpolate_gdouble_linear_sse2 (gpointer op, const gpointer ap, |
346 | | gint len, const gpointer icp, gint astride) |
347 | 0 | { |
348 | 0 | gint i; |
349 | 0 | gdouble *o = op, *a = ap, *ic = icp; |
350 | 0 | __m128d f[2], t1, t2; |
351 | 0 | const gdouble *c[2] = { (gdouble *) ((gint8 *) a + 0 * astride), |
352 | 0 | (gdouble *) ((gint8 *) a + 1 * astride) |
353 | 0 | }; |
354 | |
|
355 | 0 | f[0] = _mm_load1_pd (ic + 0); |
356 | 0 | f[1] = _mm_load1_pd (ic + 1); |
357 | |
|
358 | 0 | for (i = 0; i < len; i += 4) { |
359 | 0 | t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]); |
360 | 0 | t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]); |
361 | 0 | _mm_store_pd (o + i + 0, _mm_add_pd (t1, t2)); |
362 | |
|
363 | 0 | t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 2), f[0]); |
364 | 0 | t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 2), f[1]); |
365 | 0 | _mm_store_pd (o + i + 2, _mm_add_pd (t1, t2)); |
366 | 0 | } |
367 | 0 | } |
368 | | |
369 | | void |
370 | | interpolate_gdouble_cubic_sse2 (gpointer op, const gpointer ap, |
371 | | gint len, const gpointer icp, gint astride) |
372 | 0 | { |
373 | 0 | gint i; |
374 | 0 | gdouble *o = op, *a = ap, *ic = icp; |
375 | 0 | __m128d f[4], t[4]; |
376 | 0 | const gdouble *c[4] = { (gdouble *) ((gint8 *) a + 0 * astride), |
377 | 0 | (gdouble *) ((gint8 *) a + 1 * astride), |
378 | 0 | (gdouble *) ((gint8 *) a + 2 * astride), |
379 | 0 | (gdouble *) ((gint8 *) a + 3 * astride) |
380 | 0 | }; |
381 | |
|
382 | 0 | f[0] = _mm_load1_pd (ic + 0); |
383 | 0 | f[1] = _mm_load1_pd (ic + 1); |
384 | 0 | f[2] = _mm_load1_pd (ic + 2); |
385 | 0 | f[3] = _mm_load1_pd (ic + 3); |
386 | |
|
387 | 0 | for (i = 0; i < len; i += 2) { |
388 | 0 | t[0] = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]); |
389 | 0 | t[1] = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]); |
390 | 0 | t[2] = _mm_mul_pd (_mm_load_pd (c[2] + i + 0), f[2]); |
391 | 0 | t[3] = _mm_mul_pd (_mm_load_pd (c[3] + i + 0), f[3]); |
392 | 0 | t[0] = _mm_add_pd (t[0], t[1]); |
393 | 0 | t[2] = _mm_add_pd (t[2], t[3]); |
394 | 0 | _mm_store_pd (o + i + 0, _mm_add_pd (t[0], t[2])); |
395 | 0 | } |
396 | 0 | } |