Coverage Report

Created: 2023-06-07 07:17

/src/pffft/pffft.c
Line
Count
Source
1
/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
2
3
   Based on original fortran 77 code from FFTPACKv4 from NETLIB
4
   (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
5
   of NCAR, in 1985.
6
7
   As confirmed by the NCAR fftpack software curators, the following
8
   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
9
   released under the same terms.
10
11
   FFTPACK license:
12
13
   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
14
15
   Copyright (c) 2004 the University Corporation for Atmospheric
16
   Research ("UCAR"). All rights reserved. Developed by NCAR's
17
   Computational and Information Systems Laboratory, UCAR,
18
   www.cisl.ucar.edu.
19
20
   Redistribution and use of the Software in source and binary forms,
21
   with or without modification, is permitted provided that the
22
   following conditions are met:
23
24
   - Neither the names of NCAR's Computational and Information Systems
25
   Laboratory, the University Corporation for Atmospheric Research,
26
   nor the names of its sponsors or contributors may be used to
27
   endorse or promote products derived from this Software without
28
   specific prior written permission.  
29
30
   - Redistributions of source code must retain the above copyright
31
   notices, this list of conditions, and the disclaimer below.
32
33
   - Redistributions in binary form must reproduce the above copyright
34
   notice, this list of conditions, and the disclaimer below in the
35
   documentation and/or other materials provided with the
36
   distribution.
37
38
   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
39
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
40
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
41
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
42
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
43
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
44
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
45
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
46
   SOFTWARE.
47
48
49
   PFFFT : a Pretty Fast FFT.
50
51
   This file is largely based on the original FFTPACK implementation, modified in
52
   order to take advantage of SIMD instructions of modern CPUs.
53
*/
54
55
/*
56
  ChangeLog: 
57
  - 2011/10/02, version 1: This is the very first release of this file.
58
*/
59
60
#include "pffft.h"
61
#include <stdlib.h>
62
#include <stdio.h>
63
#include <math.h>
64
#include <assert.h>
65
66
/* detect compiler flavour */
67
#if defined(_MSC_VER)
68
#  define COMPILER_MSVC
69
#elif defined(__GNUC__)
70
#  define COMPILER_GCC
71
#endif
72
73
#if defined(COMPILER_GCC)
74
#  define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
75
#  define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
76
#  define RESTRICT __restrict
77
514
#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
78
#elif defined(COMPILER_MSVC)
79
#  define ALWAYS_INLINE(return_type) __forceinline return_type
80
#  define NEVER_INLINE(return_type) __declspec(noinline) return_type
81
#  define RESTRICT __restrict
82
#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
83
#endif
84
85
86
/* 
87
   vector support macros: the rest of the code is independent of
88
   SSE/Altivec/NEON -- adding support for other platforms with 4-element
89
   vectors should be limited to these macros 
90
*/
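Editorial sketch, not part of pffft.c: the snippet below only illustrates, using GCC's generic vector extensions, the arithmetic half of the macro contract a new 4-wide platform would have to satisfy. The EX_-prefixed names and the v4sf_example type are invented for the illustration; a real port would also have to supply LD_PS1, INTERLEAVE2, UNINTERLEAVE2, VTRANSPOSE4, VSWAPHL and VALIGNED for its ISA.

typedef float v4sf_example __attribute__((vector_size(16))); /* 4 packed floats */
#define EX_VZERO()      ((v4sf_example){0.f, 0.f, 0.f, 0.f})
#define EX_VADD(a,b)    ((a) + (b))        /* element-wise add      */
#define EX_VSUB(a,b)    ((a) - (b))        /* element-wise subtract */
#define EX_VMUL(a,b)    ((a) * (b))        /* element-wise multiply */
#define EX_VMADD(a,b,c) ((a) * (b) + (c))  /* multiply-add          */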
91
92
93
// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
94
//#define PFFFT_SIMD_DISABLE
95
96
/*
97
   Altivec support macros 
98
*/
99
#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__))
100
#include <altivec.h>
101
typedef vector float v4sf;
102
#  define SIMD_SZ 4
103
#  define VZERO() ((vector float) vec_splat_u8(0))
104
#  define VMUL(a,b) vec_madd(a,b, VZERO())
105
#  define VADD(a,b) vec_add(a,b)
106
#  define VMADD(a,b,c) vec_madd(a,b,c)
107
#  define VSUB(a,b) vec_sub(a,b)
108
inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
109
#  define LD_PS1(p) ld_ps1(&p)
110
#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
111
#  define UNINTERLEAVE2(in1, in2, out1, out2) {                           \
112
    vector unsigned char vperm1 =  (vector unsigned char){0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27}; \
113
    vector unsigned char vperm2 =  (vector unsigned char){4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31}; \
114
    v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
115
  }
116
#  define VTRANSPOSE4(x0,x1,x2,x3) {              \
117
    v4sf y0 = vec_mergeh(x0, x2);               \
118
    v4sf y1 = vec_mergel(x0, x2);               \
119
    v4sf y2 = vec_mergeh(x1, x3);               \
120
    v4sf y3 = vec_mergel(x1, x3);               \
121
    x0 = vec_mergeh(y0, y2);                    \
122
    x1 = vec_mergel(y0, y2);                    \
123
    x2 = vec_mergeh(y1, y3);                    \
124
    x3 = vec_mergel(y1, y3);                    \
125
  }
126
#  define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15})
127
#  define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
128
129
/*
130
  SSE1 support macros
131
*/
132
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(i386) || defined(_M_IX86))
133
134
#include <xmmintrin.h>
135
typedef __m128 v4sf;
136
35.0M
#  define SIMD_SZ 4 // 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors.
137
135
#  define VZERO() _mm_setzero_ps()
138
119M
#  define VMUL(a,b) _mm_mul_ps(a,b)
139
139M
#  define VADD(a,b) _mm_add_ps(a,b)
140
6.70M
#  define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
141
118M
#  define VSUB(a,b) _mm_sub_ps(a,b)
142
14.6M
#  define LD_PS1(p) _mm_set1_ps(p)
143
1.46M
#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
144
4.39M
#  define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
145
2.92M
#  define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
146
1.45M
#  define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
147
#  define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
148
149
/*
150
  ARM NEON support macros
151
*/
152
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__))
153
#  include <arm_neon.h>
154
typedef float32x4_t v4sf;
155
#  define SIMD_SZ 4
156
#  define VZERO() vdupq_n_f32(0)
157
#  define VMUL(a,b) vmulq_f32(a,b)
158
#  define VADD(a,b) vaddq_f32(a,b)
159
#  define VMADD(a,b,c) vmlaq_f32(c,a,b)
160
#  define VSUB(a,b) vsubq_f32(a,b)
161
#  define LD_PS1(p) vld1q_dup_f32(&(p))
162
#  define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
163
#  define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
164
#  define VTRANSPOSE4(x0,x1,x2,x3) {                                    \
165
    float32x4x2_t t0_ = vzipq_f32(x0, x2);                              \
166
    float32x4x2_t t1_ = vzipq_f32(x1, x3);                              \
167
    float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]);              \
168
    float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]);              \
169
    x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
170
  }
171
// marginally faster version
172
//#  define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
173
#  define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
174
#  define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
175
#else
176
#  if !defined(PFFFT_SIMD_DISABLE)
177
#    warning "building with simd disabled !\n";
178
#    define PFFFT_SIMD_DISABLE // fallback to scalar code
179
#  endif
180
#endif
181
182
// fallback mode for situations where SSE/Altivec are not available, use scalar mode instead
183
#ifdef PFFFT_SIMD_DISABLE
184
typedef float v4sf;
185
#  define SIMD_SZ 1
186
#  define VZERO() 0.f
187
#  define VMUL(a,b) ((a)*(b))
188
#  define VADD(a,b) ((a)+(b))
189
#  define VMADD(a,b,c) ((a)*(b)+(c))
190
#  define VSUB(a,b) ((a)-(b))
191
#  define LD_PS1(p) (p)
192
#  define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
193
#endif
194
195
// shortcuts for complex multiplications
196
26.0M
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
197
9.01M
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
198
#ifndef SVMUL
199
// multiply a scalar with a vector
200
5.32M
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
201
#endif
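Editorial sketch, not in the original file: per SIMD lane, VCPLXMUL performs an in-place complex multiply and VCPLXMULCONJ multiplies by the conjugate of the second operand. A scalar rendering of the same arithmetic:

/* (ar + i*ai) *= (br + i*bi); VCPLXMULCONJ would use (br - i*bi) instead. */
static inline void cplx_mul_scalar(float *ar, float *ai, float br, float bi) {
  float tmp = *ar * bi;        /* cross term, saved before ar is overwritten */
  *ar = *ar * br - *ai * bi;   /* real part                                  */
  *ai = *ai * br + tmp;        /* imaginary part                             */
}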
202
203
#if !defined(PFFFT_SIMD_DISABLE)
204
typedef union v4sf_union {
205
  v4sf  v;
206
  float f[4];
207
} v4sf_union;
208
209
#include <string.h>
210
211
0
#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
212
213
/* detect bugs with the vector support macros */
214
0
void validate_pffft_simd(void) {
215
0
  float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
216
0
  v4sf_union a0, a1, a2, a3, t, u; 
217
0
  memcpy(a0.f, f, 4*sizeof(float));
218
0
  memcpy(a1.f, f+4, 4*sizeof(float));
219
0
  memcpy(a2.f, f+8, 4*sizeof(float));
220
0
  memcpy(a3.f, f+12, 4*sizeof(float));
221
222
0
  t = a0; u = a1; t.v = VZERO();
223
0
  printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
224
0
  t.v = VADD(a1.v, a2.v);
225
0
  printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
226
0
  t.v = VMUL(a1.v, a2.v);
227
0
  printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
228
0
  t.v = VMADD(a1.v, a2.v,a0.v);
229
0
  printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);
230
231
0
  INTERLEAVE2(a1.v,a2.v,t.v,u.v);
232
0
  printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
233
0
  assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
234
0
  UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
235
0
  printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
236
0
  assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);
237
238
0
  t.v=LD_PS1(f[15]);
239
0
  printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
240
0
  assertv4(t, 15, 15, 15, 15);
241
0
  t.v = VSWAPHL(a1.v, a2.v);
242
0
  printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
243
0
  assertv4(t, 8, 9, 6, 7);
244
0
  VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
245
0
  printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", 
246
0
         a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3], 
247
0
         a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]); 
248
0
  assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
249
0
}
250
#else
251
void validate_pffft_simd() {} // allow test_pffft.c to call this function even when simd is not available..
252
#endif //!PFFFT_SIMD_DISABLE
253
254
/* SSE and co like 16-bytes aligned pointers */
255
2.31k
#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
256
771
void *pffft_aligned_malloc(size_t nb_bytes) {
257
771
  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
258
771
  if (!p0) return (void *) 0;
259
771
  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
260
771
  *((void **) p - 1) = p0;
261
771
  return p;
262
771
}
263
264
771
void pffft_aligned_free(void *p) {
265
771
  if (p) free(*((void **) p - 1));
266
771
}
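Editorial usage sketch (not from the original file): the allocator over-allocates by MALLOC_V4SF_ALIGNMENT bytes, rounds the pointer up to a 64-byte boundary, and stores the original malloc() pointer in the word just before the returned block so pffft_aligned_free() can recover it. The helper name below is made up.

static void aligned_alloc_example(void) {
  float *buf = (float*)pffft_aligned_malloc(1024 * sizeof(float));
  if (buf) {
    /* ((size_t)buf % MALLOC_V4SF_ALIGNMENT) == 0 here */
    pffft_aligned_free(buf);   /* frees via the pointer stashed just before buf */
  }
}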
267
268
0
int pffft_simd_size() { return SIMD_SZ; }
269
270
/*
271
  passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
272
*/
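Editorial note, not in the original source: fsign only flips the sign of the twiddle's imaginary part, so the loop below multiplies by wr + i*fsign*wi = cos(theta) + i*fsign*sin(theta) = e^{i*fsign*theta}; the same body therefore serves the forward pass (fsign = -1, e^{-i*theta}) and the backward pass (fsign = +1, e^{+i*theta}).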
273
164
static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) {
274
164
  int k, i;
275
164
  int l1ido = l1*ido;
276
164
  if (ido <= 2) {
277
0
    for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) {
278
0
      ch[0]         = VADD(cc[0], cc[ido+0]);
279
0
      ch[l1ido]     = VSUB(cc[0], cc[ido+0]);
280
0
      ch[1]         = VADD(cc[1], cc[ido+1]);
281
0
      ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]);
282
0
    }
283
164
  } else {
284
328
    for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
285
1.26M
      for (i=0; i<ido-1; i+=2) {
286
1.26M
        v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
287
1.26M
        v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]);
288
1.26M
        v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1]));
289
1.26M
        ch[i]   = VADD(cc[i+0], cc[i+ido+0]);
290
1.26M
        ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
291
1.26M
        VCPLXMUL(tr2, ti2, wr, wi);
292
1.26M
        ch[i+l1ido]   = tr2;
293
1.26M
        ch[i+l1ido+1] = ti2;
294
1.26M
      }
295
164
    }
296
164
  }
297
164
}
298
299
/*
300
  passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
301
*/
302
static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
303
308
                                    const float *wa1, const float *wa2, float fsign) {
304
308
  static const float taur = -0.5f;
305
308
  float taui = 0.866025403784439f*fsign;
306
308
  int i, k;
307
308
  v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3;
308
308
  int l1ido = l1*ido;
309
308
  float wr1, wi1, wr2, wi2;
310
308
  assert(ido > 2);
311
39.4k
  for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) {
312
891k
    for (i=0; i<ido-1; i+=2) {
313
852k
      tr2 = VADD(cc[i+ido], cc[i+2*ido]);
314
852k
      cr2 = VADD(cc[i], SVMUL(taur,tr2));
315
852k
      ch[i]    = VADD(cc[i], tr2);
316
852k
      ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]);
317
852k
      ci2 = VADD(cc[i    +1], SVMUL(taur,ti2));
318
852k
      ch[i+1]  = VADD(cc[i+1], ti2);
319
852k
      cr3 = SVMUL(taui, VSUB(cc[i+ido], cc[i+2*ido]));
320
852k
      ci3 = SVMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1]));
321
852k
      dr2 = VSUB(cr2, ci3);
322
852k
      dr3 = VADD(cr2, ci3);
323
852k
      di2 = VADD(ci2, cr3);
324
852k
      di3 = VSUB(ci2, cr3);
325
852k
      wr1=wa1[i]; wi1=fsign*wa1[i+1]; wr2=wa2[i]; wi2=fsign*wa2[i+1];
326
852k
      VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
327
852k
      ch[i+l1ido] = dr2; 
328
852k
      ch[i+l1ido + 1] = di2;
329
852k
      VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
330
852k
      ch[i+2*l1ido] = dr3;
331
852k
      ch[i+2*l1ido+1] = di3;
332
852k
    }
333
39.1k
  }
334
308
} /* passf3 */
335
336
static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
337
902
                                    const float *wa1, const float *wa2, const float *wa3, float fsign) {
338
  /* isign == -1 for forward transform and +1 for backward transform */
339
340
902
  int i, k;
341
902
  v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
342
902
  int l1ido = l1*ido;
343
902
  if (ido == 2) {
344
733k
    for (k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) {
345
733k
      tr1 = VSUB(cc[0], cc[2*ido + 0]);
346
733k
      tr2 = VADD(cc[0], cc[2*ido + 0]);
347
733k
      ti1 = VSUB(cc[1], cc[2*ido + 1]);
348
733k
      ti2 = VADD(cc[1], cc[2*ido + 1]);
349
733k
      ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign));
350
733k
      tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign));
351
733k
      tr3 = VADD(cc[ido + 0], cc[3*ido + 0]);
352
733k
      ti3 = VADD(cc[ido + 1], cc[3*ido + 1]);
353
354
733k
      ch[0*l1ido + 0] = VADD(tr2, tr3);
355
733k
      ch[0*l1ido + 1] = VADD(ti2, ti3);
356
733k
      ch[1*l1ido + 0] = VADD(tr1, tr4);
357
733k
      ch[1*l1ido + 1] = VADD(ti1, ti4);
358
733k
      ch[2*l1ido + 0] = VSUB(tr2, tr3);
359
733k
      ch[2*l1ido + 1] = VSUB(ti2, ti3);        
360
733k
      ch[3*l1ido + 0] = VSUB(tr1, tr4);
361
733k
      ch[3*l1ido + 1] = VSUB(ti1, ti4);
362
733k
    }
363
658
  } else {
364
211k
    for (k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) {
365
3.51M
      for (i=0; i<ido-1; i+=2) {
366
3.30M
        float wr1, wi1, wr2, wi2, wr3, wi3;
367
3.30M
        tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]);
368
3.30M
        tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]);
369
3.30M
        ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]);
370
3.30M
        ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]);
371
3.30M
        tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign));
372
3.30M
        ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign));
373
3.30M
        tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]);
374
3.30M
        ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]);
375
376
3.30M
        ch[i] = VADD(tr2, tr3);
377
3.30M
        cr3    = VSUB(tr2, tr3);
378
3.30M
        ch[i + 1] = VADD(ti2, ti3);
379
3.30M
        ci3 = VSUB(ti2, ti3);
380
381
3.30M
        cr2 = VADD(tr1, tr4);
382
3.30M
        cr4 = VSUB(tr1, tr4);
383
3.30M
        ci2 = VADD(ti1, ti4);
384
3.30M
        ci4 = VSUB(ti1, ti4);
385
3.30M
        wr1=wa1[i]; wi1=fsign*wa1[i+1];
386
3.30M
        VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
387
3.30M
        wr2=wa2[i]; wi2=fsign*wa2[i+1];
388
3.30M
        ch[i + l1ido] = cr2;
389
3.30M
        ch[i + l1ido + 1] = ci2;
390
391
3.30M
        VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
392
3.30M
        wr3=wa3[i]; wi3=fsign*wa3[i+1];
393
3.30M
        ch[i + 2*l1ido] = cr3;
394
3.30M
        ch[i + 2*l1ido + 1] = ci3;
395
396
3.30M
        VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
397
3.30M
        ch[i + 3*l1ido] = cr4;
398
3.30M
        ch[i + 3*l1ido + 1] = ci4;
399
3.30M
      }
400
210k
    }
401
658
  }
402
902
} /* passf4 */
403
404
/*
405
  passf5 and passb5 have been merged here, fsign = -1 for passf5, +1 for passb5
406
*/
407
static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
408
                                    const float *wa1, const float *wa2, 
409
172
                                    const float *wa3, const float *wa4, float fsign) {  
410
172
  static const float tr11 = .309016994374947f;
411
172
  const float ti11 = .951056516295154f*fsign;
412
172
  static const float tr12 = -.809016994374947f;
413
172
  const float ti12 = .587785252292473f*fsign;
414
415
  /* Local variables */
416
172
  int i, k;
417
172
  v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
418
172
    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
419
420
172
  float wr1, wi1, wr2, wi2, wr3, wi3, wr4, wi4;
421
422
172
#define cc_ref(a_1,a_2) cc[(a_2-1)*ido + a_1 + 1]
423
3.04M
#define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + a_1 + 1]
424
425
172
  assert(ido > 2);
426
5.77k
  for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) {
427
310k
    for (i = 0; i < ido-1; i += 2) {
428
304k
      ti5 = VSUB(cc_ref(i  , 2), cc_ref(i  , 5));
429
304k
      ti2 = VADD(cc_ref(i  , 2), cc_ref(i  , 5));
430
304k
      ti4 = VSUB(cc_ref(i  , 3), cc_ref(i  , 4));
431
304k
      ti3 = VADD(cc_ref(i  , 3), cc_ref(i  , 4));
432
304k
      tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5));
433
304k
      tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5));
434
304k
      tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4));
435
304k
      tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4));
436
304k
      ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3));
437
304k
      ch_ref(i  , 1) = VADD(cc_ref(i  , 1), VADD(ti2, ti3));
438
304k
      cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3)));
439
304k
      ci2 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3)));
440
304k
      cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3)));
441
304k
      ci3 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3)));
442
304k
      cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
443
304k
      ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
444
304k
      cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
445
304k
      ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
446
304k
      dr3 = VSUB(cr3, ci4);
447
304k
      dr4 = VADD(cr3, ci4);
448
304k
      di3 = VADD(ci3, cr4);
449
304k
      di4 = VSUB(ci3, cr4);
450
304k
      dr5 = VADD(cr2, ci5);
451
304k
      dr2 = VSUB(cr2, ci5);
452
304k
      di5 = VSUB(ci2, cr5);
453
304k
      di2 = VADD(ci2, cr5);
454
304k
      wr1=wa1[i]; wi1=fsign*wa1[i+1]; wr2=wa2[i]; wi2=fsign*wa2[i+1];
455
304k
      wr3=wa3[i]; wi3=fsign*wa3[i+1]; wr4=wa4[i]; wi4=fsign*wa4[i+1];
456
304k
      VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
457
304k
      ch_ref(i - 1, 2) = dr2;
458
304k
      ch_ref(i, 2)     = di2;
459
304k
      VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
460
304k
      ch_ref(i - 1, 3) = dr3;
461
304k
      ch_ref(i, 3)     = di3;
462
304k
      VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
463
304k
      ch_ref(i - 1, 4) = dr4;
464
304k
      ch_ref(i, 4)     = di4;
465
304k
      VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
466
304k
      ch_ref(i - 1, 5) = dr5;
467
304k
      ch_ref(i, 5)     = di5;
468
304k
    }
469
5.60k
  }
470
172
#undef ch_ref
471
172
#undef cc_ref
472
172
}
473
474
62
static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) {
475
62
  static const float minus_one = -1.f;
476
62
  int i, k, l1ido = l1*ido;
477
124
  for (k=0; k < l1ido; k += ido) {
478
62
    v4sf a = cc[k], b = cc[k + l1ido];
479
62
    ch[2*k] = VADD(a, b);
480
62
    ch[2*(k+ido)-1] = VSUB(a, b);
481
62
  }
482
62
  if (ido < 2) return;
483
62
  if (ido != 2) {
484
124
    for (k=0; k < l1ido; k += ido) {
485
150k
      for (i=2; i<ido; i+=2) {
486
150k
        v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
487
150k
        v4sf br = cc[i - 1 + k], bi = cc[i + k];
488
150k
        VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1])); 
489
150k
        ch[i + 2*k] = VADD(bi, ti2);
490
150k
        ch[2*(k+ido) - i] = VSUB(ti2, bi);
491
150k
        ch[i - 1 + 2*k] = VADD(br, tr2);
492
150k
        ch[2*(k+ido) - i -1] = VSUB(br, tr2);
493
150k
      }
494
62
    }
495
62
    if (ido % 2 == 1) return;
496
62
  }
497
124
  for (k=0; k < l1ido; k += ido) {
498
62
    ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]);
499
62
    ch[2*k + ido-1] = cc[k + ido-1];
500
62
  }
501
62
} /* radf2 */
502
503
504
62
static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1) {
505
62
  static const float minus_two=-2;
506
62
  int i, k, l1ido = l1*ido;
507
62
  v4sf a,b,c,d, tr2, ti2;
508
124
  for (k=0; k < l1ido; k += ido) {
509
62
    a = cc[2*k]; b = cc[2*(k+ido) - 1];
510
62
    ch[k] = VADD(a, b);
511
62
    ch[k + l1ido] =VSUB(a, b);
512
62
  }
513
62
  if (ido < 2) return;
514
62
  if (ido != 2) {
515
124
    for (k = 0; k < l1ido; k += ido) {
516
150k
      for (i = 2; i < ido; i += 2) {
517
150k
        a = cc[i-1 + 2*k]; b = cc[2*(k + ido) - i - 1];
518
150k
        c = cc[i+0 + 2*k]; d = cc[2*(k + ido) - i + 0];
519
150k
        ch[i-1 + k] = VADD(a, b);
520
150k
        tr2 = VSUB(a, b);
521
150k
        ch[i+0 + k] = VSUB(c, d);
522
150k
        ti2 = VADD(c, d);
523
150k
        VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
524
150k
        ch[i-1 + k + l1ido] = tr2;
525
150k
        ch[i+0 + k + l1ido] = ti2;
526
150k
      }
527
62
    }
528
62
    if (ido % 2 == 1) return;
529
62
  }
530
124
  for (k = 0; k < l1ido; k += ido) {
531
62
    a = cc[2*k + ido-1]; b = cc[2*k + ido];
532
62
    ch[k + ido-1] = VADD(a,a);
533
62
    ch[k + ido-1 + l1ido] = SVMUL(minus_two, b);
534
62
  }
535
62
} /* radb2 */
536
537
static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
538
181
                     const float *wa1, const float *wa2) {
539
181
  static const float taur = -0.5f;
540
181
  static const float taui = 0.866025403784439f;
541
181
  int i, k, ic;
542
181
  v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, wi2;
543
151k
  for (k=0; k<l1; k++) {
544
151k
    cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]);
545
151k
    ch[3*k*ido] = VADD(cc[k*ido], cr2);
546
151k
    ch[(3*k+2)*ido] = SVMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
547
151k
    ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], SVMUL(taur, cr2));
548
151k
  }
549
181
  if (ido == 1) return;
550
54.5k
  for (k=0; k<l1; k++) {
551
407k
    for (i=2; i<ido; i+=2) {
552
353k
      ic = ido - i;
553
353k
      wr1 = LD_PS1(wa1[i - 2]); wi1 = LD_PS1(wa1[i - 1]);
554
353k
      dr2 = cc[i - 1 + (k + l1)*ido]; di2 = cc[i + (k + l1)*ido];
555
353k
      VCPLXMULCONJ(dr2, di2, wr1, wi1);
556
557
353k
      wr2 = LD_PS1(wa2[i - 2]); wi2 = LD_PS1(wa2[i - 1]);
558
353k
      dr3 = cc[i - 1 + (k + l1*2)*ido]; di3 = cc[i + (k + l1*2)*ido];
559
353k
      VCPLXMULCONJ(dr3, di3, wr2, wi2);
560
        
561
353k
      cr2 = VADD(dr2, dr3);
562
353k
      ci2 = VADD(di2, di3);
563
353k
      ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
564
353k
      ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2);
565
353k
      tr2 = VADD(cc[i - 1 + k*ido], SVMUL(taur, cr2));
566
353k
      ti2 = VADD(cc[i + k*ido], SVMUL(taur, ci2));
567
353k
      tr3 = SVMUL(taui, VSUB(di2, di3));
568
353k
      ti3 = SVMUL(taui, VSUB(dr3, dr2));
569
353k
      ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3);
570
353k
      ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3);
571
353k
      ch[i + (3*k + 2)*ido] = VADD(ti2, ti3);
572
353k
      ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2);
573
353k
    }
574
54.3k
  }
575
145
} /* radf3 */
576
577
578
static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
579
                     const float *wa1, const float *wa2)
580
181
{
581
181
  static const float taur = -0.5f;
582
181
  static const float taui = 0.866025403784439f;
583
181
  static const float taui_2 = 0.866025403784439f*2;
584
181
  int i, k, ic;
585
181
  v4sf ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
586
151k
  for (k=0; k<l1; k++) {
587
151k
    tr2 = cc[ido-1 + (3*k + 1)*ido]; tr2 = VADD(tr2,tr2);
588
151k
    cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]);
589
151k
    ch[k*ido] = VADD(cc[3*k*ido], tr2);
590
151k
    ci3 = SVMUL(taui_2, cc[(3*k + 2)*ido]);
591
151k
    ch[(k + l1)*ido] = VSUB(cr2, ci3);
592
151k
    ch[(k + 2*l1)*ido] = VADD(cr2, ci3);
593
151k
  }
594
181
  if (ido == 1) return;
595
54.5k
  for (k=0; k<l1; k++) {
596
407k
    for (i=2; i<ido; i+=2) {
597
353k
      ic = ido - i;
598
353k
      tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]);
599
353k
      cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]);
600
353k
      ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2);
601
353k
      ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]);
602
353k
      ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]);
603
353k
      ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2);
604
353k
      cr3 = SVMUL(taui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]));
605
353k
      ci3 = SVMUL(taui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]));
606
353k
      dr2 = VSUB(cr2, ci3);
607
353k
      dr3 = VADD(cr2, ci3);
608
353k
      di2 = VADD(ci2, cr3);
609
353k
      di3 = VSUB(ci2, cr3);
610
353k
      VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
611
353k
      ch[i - 1 + (k + l1)*ido] = dr2;
612
353k
      ch[i + (k + l1)*ido] = di2;
613
353k
      VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
614
353k
      ch[i - 1 + (k + 2*l1)*ido] = dr3;
615
353k
      ch[i + (k + 2*l1)*ido] = di3;
616
353k
    }
617
54.3k
  }
618
145
} /* radb3 */
619
620
static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch,
621
                                   const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3)
622
529
{
623
529
  static const float minus_hsqt2 = (float)-0.7071067811865475;
624
529
  int i, k, l1ido = l1*ido;
625
529
  {
626
529
    const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido; 
627
529
    v4sf * RESTRICT ch_ = ch;
628
635k
    while (cc < cc_end) {
629
      // this loop represents between 25% and 40% of total radf4_ps cost !
630
634k
      v4sf a0 = cc[0], a1 = cc[l1ido];
631
634k
      v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido];
632
634k
      v4sf tr1 = VADD(a1, a3);
633
634k
      v4sf tr2 = VADD(a0, a2);
634
634k
      ch[2*ido-1] = VSUB(a0, a2);
635
634k
      ch[2*ido  ] = VSUB(a3, a1);
636
634k
      ch[0      ] = VADD(tr1, tr2);
637
634k
      ch[4*ido-1] = VSUB(tr2, tr1);
638
634k
      cc += ido; ch += 4*ido;
639
634k
    }
640
529
    cc = cc_; ch = ch_;
641
529
  }
642
529
  if (ido < 2) return;
643
489
  if (ido != 2) {
644
176k
    for (k = 0; k < l1ido; k += ido) {
645
175k
      const v4sf * RESTRICT pc = (v4sf*)(cc + 1 + k);
646
2.05M
      for (i=2; i<ido; i += 2, pc += 2) {
647
1.88M
        int ic = ido - i;
648
1.88M
        v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4;
649
1.88M
        v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4;
650
651
1.88M
        cr2 = pc[1*l1ido+0];
652
1.88M
        ci2 = pc[1*l1ido+1];
653
1.88M
        wr=LD_PS1(wa1[i - 2]);
654
1.88M
        wi=LD_PS1(wa1[i - 1]);
655
1.88M
        VCPLXMULCONJ(cr2,ci2,wr,wi);
656
657
1.88M
        cr3 = pc[2*l1ido+0];
658
1.88M
        ci3 = pc[2*l1ido+1];
659
1.88M
        wr = LD_PS1(wa2[i-2]); 
660
1.88M
        wi = LD_PS1(wa2[i-1]);
661
1.88M
        VCPLXMULCONJ(cr3, ci3, wr, wi);
662
663
1.88M
        cr4 = pc[3*l1ido];
664
1.88M
        ci4 = pc[3*l1ido+1];
665
1.88M
        wr = LD_PS1(wa3[i-2]); 
666
1.88M
        wi = LD_PS1(wa3[i-1]);
667
1.88M
        VCPLXMULCONJ(cr4, ci4, wr, wi);
668
669
        /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
670
671
1.88M
        tr1 = VADD(cr2,cr4);
672
1.88M
        tr4 = VSUB(cr4,cr2); 
673
1.88M
        tr2 = VADD(pc[0],cr3);
674
1.88M
        tr3 = VSUB(pc[0],cr3);
675
1.88M
        ch[i - 1 + 4*k] = VADD(tr1,tr2);
676
1.88M
        ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); // at this point tr1 and tr2 can be disposed
677
1.88M
        ti1 = VADD(ci2,ci4);
678
1.88M
        ti4 = VSUB(ci2,ci4);
679
1.88M
        ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3);
680
1.88M
        ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); // dispose tr3, ti4
681
1.88M
        ti2 = VADD(pc[1],ci3);
682
1.88M
        ti3 = VSUB(pc[1],ci3);
683
1.88M
        ch[i + 4*k] = VADD(ti1, ti2);
684
1.88M
        ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2);
685
1.88M
        ch[i + 4*k + 2*ido] = VADD(tr4, ti3);
686
1.88M
        ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3);
687
1.88M
      }
688
175k
    }
689
489
    if (ido % 2 == 1) return;
690
489
  }
691
159k
  for (k=0; k<l1ido; k += ido) {
692
158k
    v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido];
693
158k
    v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido];
694
158k
    v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b));
695
158k
    v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a));
696
158k
    ch[ido-1 + 4*k] = VADD(tr1, c);
697
158k
    ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
698
158k
    ch[4*k + 1*ido] = VSUB(ti1, d); 
699
158k
    ch[4*k + 3*ido] = VADD(ti1, d); 
700
158k
  }
701
394
} /* radf4 */
702
703
704
static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
705
                                   const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
706
529
{
707
529
  static const float minus_sqrt2 = (float)-1.414213562373095;
708
529
  static const float two = 2.f;
709
529
  int i, k, l1ido = l1*ido;
710
529
  v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
711
529
  {
712
529
    const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido; 
713
529
    v4sf *ch_ = ch;
714
635k
    while (ch < ch_end) {
715
634k
      v4sf a = cc[0], b = cc[4*ido-1];
716
634k
      v4sf c = cc[2*ido], d = cc[2*ido-1];
717
634k
      tr3 = SVMUL(two,d);
718
634k
      tr2 = VADD(a,b);
719
634k
      tr1 = VSUB(a,b);
720
634k
      tr4 = SVMUL(two,c);
721
634k
      ch[0*l1ido] = VADD(tr2, tr3);
722
634k
      ch[2*l1ido] = VSUB(tr2, tr3);
723
634k
      ch[1*l1ido] = VSUB(tr1, tr4);
724
634k
      ch[3*l1ido] = VADD(tr1, tr4);
725
      
726
634k
      cc += 4*ido; ch += ido;
727
634k
    }
728
529
    cc = cc_; ch = ch_;
729
529
  }
730
529
  if (ido < 2) return;
731
489
  if (ido != 2) {
732
176k
    for (k = 0; k < l1ido; k += ido) {
733
175k
      const v4sf * RESTRICT pc = (v4sf*)(cc - 1 + 4*k);
734
175k
      v4sf * RESTRICT ph = (v4sf*)(ch + k + 1);
735
2.05M
      for (i = 2; i < ido; i += 2) {
736
737
1.88M
        tr1 = VSUB(pc[i], pc[4*ido - i]);
738
1.88M
        tr2 = VADD(pc[i], pc[4*ido - i]);
739
1.88M
        ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]);
740
1.88M
        tr3 = VADD(pc[2*ido + i], pc[2*ido - i]);
741
1.88M
        ph[0] = VADD(tr2, tr3);
742
1.88M
        cr3 = VSUB(tr2, tr3);
743
744
1.88M
        ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]);
745
1.88M
        tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]);
746
1.88M
        cr2 = VSUB(tr1, tr4);
747
1.88M
        cr4 = VADD(tr1, tr4);
748
749
1.88M
        ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]);
750
1.88M
        ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]);
751
752
1.88M
        ph[1] = VADD(ti2, ti3); ph += l1ido;
753
1.88M
        ci3 = VSUB(ti2, ti3);
754
1.88M
        ci2 = VADD(ti1, ti4);
755
1.88M
        ci4 = VSUB(ti1, ti4);
756
1.88M
        VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
757
1.88M
        ph[0] = cr2;
758
1.88M
        ph[1] = ci2; ph += l1ido;
759
1.88M
        VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
760
1.88M
        ph[0] = cr3;
761
1.88M
        ph[1] = ci3; ph += l1ido;
762
1.88M
        VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
763
1.88M
        ph[0] = cr4;
764
1.88M
        ph[1] = ci4; ph = ph - 3*l1ido + 2;
765
1.88M
      }
766
175k
    }
767
489
    if (ido % 2 == 1) return;
768
489
  }
769
159k
  for (k=0; k < l1ido; k+=ido) {
770
158k
    int i0 = 4*k + ido;
771
158k
    v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1];
772
158k
    v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0];
773
158k
    tr1 = VSUB(c,d);
774
158k
    tr2 = VADD(c,d);
775
158k
    ti1 = VADD(b,a);
776
158k
    ti2 = VSUB(b,a);
777
158k
    ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2);
778
158k
    ch[ido-1 + k + 1*l1ido] = SVMUL(minus_sqrt2, VSUB(ti1, tr1));
779
158k
    ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2);
780
158k
    ch[ido-1 + k + 3*l1ido] = SVMUL(minus_sqrt2, VADD(ti1, tr1));
781
158k
  }
782
394
} /* radb4 */
783
784
static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, 
785
                     const float *wa1, const float *wa2, const float *wa3, const float *wa4)
786
102
{
787
102
  static const float tr11 = .309016994374947f;
788
102
  static const float ti11 = .951056516295154f;
789
102
  static const float tr12 = -.809016994374947f;
790
102
  static const float ti12 = .587785252292473f;
791
792
  /* System generated locals */
793
102
  int cc_offset, ch_offset;
794
795
  /* Local variables */
796
102
  int i, k, ic;
797
102
  v4sf ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5,
798
102
    cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
799
102
  int idp2;
800
801
802
102
#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
803
1.66M
#define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1]
804
805
  /* Parameter adjustments */
806
102
  ch_offset = 1 + ido * 6;
807
102
  ch -= ch_offset;
808
102
  cc_offset = 1 + ido * (1 + l1);
809
102
  cc -= cc_offset;
810
811
  /* Function Body */
812
176k
  for (k = 1; k <= l1; ++k) {
813
176k
    cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2));
814
176k
    ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2));
815
176k
    cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3));
816
176k
    ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3));
817
176k
    ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3));
818
176k
    ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
819
176k
    ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
820
176k
    ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
821
176k
    ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
822
    //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4);
823
176k
  }
824
102
  if (ido == 1) {
825
59
    return;
826
59
  }
827
43
  idp2 = ido + 2;
828
19.1k
  for (k = 1; k <= l1; ++k) {
829
97.4k
    for (i = 3; i <= ido; i += 2) {
830
78.3k
      ic = idp2 - i;
831
78.3k
      dr2 = LD_PS1(wa1[i-3]); di2 = LD_PS1(wa1[i-2]);
832
78.3k
      dr3 = LD_PS1(wa2[i-3]); di3 = LD_PS1(wa2[i-2]);
833
78.3k
      dr4 = LD_PS1(wa3[i-3]); di4 = LD_PS1(wa3[i-2]);
834
78.3k
      dr5 = LD_PS1(wa4[i-3]); di5 = LD_PS1(wa4[i-2]);
835
78.3k
      VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
836
78.3k
      VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
837
78.3k
      VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
838
78.3k
      VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
839
78.3k
      cr2 = VADD(dr2, dr5);
840
78.3k
      ci5 = VSUB(dr5, dr2);
841
78.3k
      cr5 = VSUB(di2, di5);
842
78.3k
      ci2 = VADD(di2, di5);
843
78.3k
      cr3 = VADD(dr3, dr4);
844
78.3k
      ci4 = VSUB(dr4, dr3);
845
78.3k
      cr4 = VSUB(di3, di4);
846
78.3k
      ci3 = VADD(di3, di4);
847
78.3k
      ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3));
848
78.3k
      ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));//
849
78.3k
      tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
850
78.3k
      ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));//
851
78.3k
      tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
852
78.3k
      ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));//
853
78.3k
      tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4));
854
78.3k
      ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
855
78.3k
      tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4));
856
78.3k
      ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
857
78.3k
      ch_ref(i - 1, 3, k) = VSUB(tr2, tr5);
858
78.3k
      ch_ref(ic - 1, 2, k) = VADD(tr2, tr5);
859
78.3k
      ch_ref(i, 3, k) = VADD(ti2, ti5);
860
78.3k
      ch_ref(ic, 2, k) = VSUB(ti5, ti2);
861
78.3k
      ch_ref(i - 1, 5, k) = VSUB(tr3, tr4);
862
78.3k
      ch_ref(ic - 1, 4, k) = VADD(tr3, tr4);
863
78.3k
      ch_ref(i, 5, k) = VADD(ti3, ti4);
864
78.3k
      ch_ref(ic, 4, k) = VSUB(ti4, ti3);
865
78.3k
    }
866
19.0k
  }
867
43
#undef cc_ref
868
43
#undef ch_ref
869
43
} /* radf5 */
870
871
static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, 
872
                  const float *wa1, const float *wa2, const float *wa3, const float *wa4)
873
102
{
874
102
  static const float tr11 = .309016994374947f;
875
102
  static const float ti11 = .951056516295154f;
876
102
  static const float tr12 = -.809016994374947f;
877
102
  static const float ti12 = .587785252292473f;
878
879
102
  int cc_offset, ch_offset;
880
881
  /* Local variables */
882
102
  int i, k, ic;
883
102
  v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
884
102
    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
885
102
  int idp2;
886
887
102
#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1]
888
1.66M
#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
889
890
  /* Parameter adjustments */
891
102
  ch_offset = 1 + ido * (1 + l1);
892
102
  ch -= ch_offset;
893
102
  cc_offset = 1 + ido * 6;
894
102
  cc -= cc_offset;
895
896
  /* Function Body */
897
176k
  for (k = 1; k <= l1; ++k) {
898
176k
    ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k));
899
176k
    ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k));
900
176k
    tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k));
901
176k
    tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k));
902
176k
    ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3));
903
176k
    cr2 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
904
176k
    cr3 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
905
176k
    ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
906
176k
    ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
907
176k
    ch_ref(1, k, 2) = VSUB(cr2, ci5);
908
176k
    ch_ref(1, k, 3) = VSUB(cr3, ci4);
909
176k
    ch_ref(1, k, 4) = VADD(cr3, ci4);
910
176k
    ch_ref(1, k, 5) = VADD(cr2, ci5);
911
176k
  }
912
102
  if (ido == 1) {
913
59
    return;
914
59
  }
915
43
  idp2 = ido + 2;
916
19.1k
  for (k = 1; k <= l1; ++k) {
917
97.4k
    for (i = 3; i <= ido; i += 2) {
918
78.3k
      ic = idp2 - i;
919
78.3k
      ti5 = VADD(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
920
78.3k
      ti2 = VSUB(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
921
78.3k
      ti4 = VADD(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
922
78.3k
      ti3 = VSUB(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
923
78.3k
      tr5 = VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
924
78.3k
      tr2 = VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
925
78.3k
      tr4 = VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
926
78.3k
      tr3 = VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
927
78.3k
      ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3));
928
78.3k
      ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3));
929
78.3k
      cr2 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
930
78.3k
      ci2 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3)));
931
78.3k
      cr3 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
932
78.3k
      ci3 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3)));
933
78.3k
      cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
934
78.3k
      ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
935
78.3k
      cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
936
78.3k
      ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
937
78.3k
      dr3 = VSUB(cr3, ci4);
938
78.3k
      dr4 = VADD(cr3, ci4);
939
78.3k
      di3 = VADD(ci3, cr4);
940
78.3k
      di4 = VSUB(ci3, cr4);
941
78.3k
      dr5 = VADD(cr2, ci5);
942
78.3k
      dr2 = VSUB(cr2, ci5);
943
78.3k
      di5 = VSUB(ci2, cr5);
944
78.3k
      di2 = VADD(ci2, cr5);
945
78.3k
      VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
946
78.3k
      VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
947
78.3k
      VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
948
78.3k
      VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));
949
950
78.3k
      ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
951
78.3k
      ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
952
78.3k
      ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4;
953
78.3k
      ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5;
954
78.3k
    }
955
19.0k
  }
956
43
#undef cc_ref
957
43
#undef ch_ref
958
43
} /* radb5 */
959
960
static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, 
961
135
                                      const float *wa, const int *ifac) {  
962
135
  v4sf *in  = (v4sf*)input_readonly;
963
135
  v4sf *out = (in == work2 ? work1 : work2);
964
135
  int nf = ifac[1], k1;
965
135
  int l2 = n;
966
135
  int iw = n-1;
967
135
  assert(in != out && work1 != work2);
968
1.00k
  for (k1 = 1; k1 <= nf; ++k1) {
969
874
    int kh = nf - k1;
970
874
    int ip = ifac[kh + 2];
971
874
    int l1 = l2 / ip;
972
874
    int ido = n / l2;
973
874
    iw -= (ip - 1)*ido;
974
874
    switch (ip) {
975
102
      case 5: {
976
102
        int ix2 = iw + ido;
977
102
        int ix3 = ix2 + ido;
978
102
        int ix4 = ix3 + ido;
979
102
        radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
980
102
      } break;
981
529
      case 4: {
982
529
        int ix2 = iw + ido;
983
529
        int ix3 = ix2 + ido;
984
529
        radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
985
529
      } break;
986
181
      case 3: {
987
181
        int ix2 = iw + ido;
988
181
        radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
989
181
      } break;
990
62
      case 2:
991
62
        radf2_ps(ido, l1, in, out, &wa[iw]);
992
62
        break;
993
0
      default:
994
0
        assert(0);
995
0
        break;
996
874
    }
997
874
    l2 = l1;
998
874
    if (out == work2) {
999
462
      out = work1; in = work2;
1000
462
    } else {
1001
412
      out = work2; in = work1;
1002
412
    }
1003
874
  }
1004
135
  return in; /* this is in fact the output .. */
1005
135
} /* rfftf1 */
1006
1007
static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, 
1008
135
                                      const float *wa, const int *ifac) {  
1009
135
  v4sf *in  = (v4sf*)input_readonly;
1010
135
  v4sf *out = (in == work2 ? work1 : work2);
1011
135
  int nf = ifac[1], k1;
1012
135
  int l1 = 1;
1013
135
  int iw = 0;
1014
135
  assert(in != out);
1015
1.00k
  for (k1=1; k1<=nf; k1++) {
1016
874
    int ip = ifac[k1 + 1];
1017
874
    int l2 = ip*l1;
1018
874
    int ido = n / l2;
1019
874
    switch (ip) {
1020
102
      case 5: {
1021
102
        int ix2 = iw + ido;
1022
102
        int ix3 = ix2 + ido;
1023
102
        int ix4 = ix3 + ido;
1024
102
        radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
1025
102
      } break;
1026
529
      case 4: {
1027
529
        int ix2 = iw + ido;
1028
529
        int ix3 = ix2 + ido;
1029
529
        radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
1030
529
      } break;
1031
181
      case 3: {
1032
181
        int ix2 = iw + ido;
1033
181
        radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
1034
181
      } break;
1035
62
      case 2:
1036
62
        radb2_ps(ido, l1, in, out, &wa[iw]);
1037
62
        break;
1038
0
      default:
1039
0
        assert(0);
1040
0
        break;
1041
874
    }
1042
874
    l1 = l2;
1043
874
    iw += (ip - 1)*ido;
1044
1045
874
    if (out == work2) {
1046
412
      out = work1; in = work2;
1047
462
    } else {
1048
462
      out = work2; in = work1;
1049
462
    }
1050
874
  }
1051
135
  return in; /* this is in fact the output .. */
1052
135
}
1053
1054
257
static int decompose(int n, int *ifac, const int *ntryh) {
1055
257
  int nl = n, nf = 0, i, j = 0;
1056
1.28k
  for (j=0; ntryh[j]; ++j) {
1057
1.02k
    int ntry = ntryh[j];
1058
2.67k
    while (nl != 1) {
1059
2.22k
      int nq = nl / ntry;
1060
2.22k
      int nr = nl - ntry * nq;
1061
2.22k
      if (nr == 0) {
1062
1.64k
        ifac[2+nf++] = ntry;
1063
1.64k
        nl = nq;
1064
1.64k
        if (ntry == 2 && nf != 1) {
1065
764
          for (i = 2; i <= nf; ++i) {
1066
620
            int ib = nf - i + 2;
1067
620
            ifac[ib + 1] = ifac[ib];
1068
620
          }
1069
144
          ifac[2] = 2;
1070
144
        }
1071
1.64k
      } else break;
1072
2.22k
    }
1073
1.02k
  }
1074
257
  ifac[0] = n;
1075
257
  ifac[1] = nf;  
1076
257
  return nf;
1077
257
}
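Editorial worked example (not in the original source), assuming the real-FFT radix table { 4,2,3,5,0 } used below:

/* decompose(240, ifac, ntryh) finds 240 = 4*4*3*5 in that order, returns
   nf = 4, and leaves ifac = { 240, 4,  4, 4, 3, 5, ... }: ifac[0] = n,
   ifac[1] = the number of factors, then the factors from ifac[2] on.
   A factor of 2 discovered after other factors is moved to the front of the
   list by the "ntry == 2 && nf != 1" branch above. */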
1078
1079
1080
1081
static void rffti1_ps(int n, float *wa, int *ifac)
1082
135
{
1083
135
  static const int ntryh[] = { 4,2,3,5,0 };
1084
135
  int k1, j, ii;
1085
1086
135
  int nf = decompose(n,ifac,ntryh);
1087
135
  float argh = (2*M_PI) / n;
1088
135
  int is = 0;
1089
135
  int nfm1 = nf - 1;
1090
135
  int l1 = 1;
1091
874
  for (k1 = 1; k1 <= nfm1; k1++) {
1092
739
    int ip = ifac[k1 + 1];
1093
739
    int ld = 0;
1094
739
    int l2 = l1*ip;
1095
739
    int ido = n / l2;
1096
739
    int ipm = ip - 1;
1097
2.73k
    for (j = 1; j <= ipm; ++j) {
1098
1.99k
      float argld;
1099
1.99k
      int i = is, fi=0;
1100
1.99k
      ld += l1;
1101
1.99k
      argld = ld*argh;
1102
1.45M
      for (ii = 3; ii <= ido; ii += 2) {
1103
1.45M
        i += 2;
1104
1.45M
        fi += 1;
1105
1.45M
        wa[i - 2] = cos(fi*argld);
1106
1.45M
        wa[i - 1] = sin(fi*argld);
1107
1.45M
      }
1108
1.99k
      is += ido;
1109
1.99k
    }
1110
739
    l1 = l2;
1111
739
  }
1112
135
} /* rffti1 */
1113
1114
void cffti1_ps(int n, float *wa, int *ifac)
1115
122
{
1116
122
  static const int ntryh[] = { 5,3,4,2,0 };
1117
122
  int k1, j, ii;
1118
1119
122
  int nf = decompose(n,ifac,ntryh);
1120
122
  float argh = (2*M_PI)/(float)n;
1121
122
  int i = 1;
1122
122
  int l1 = 1;
1123
895
  for (k1=1; k1<=nf; k1++) {
1124
773
    int ip = ifac[k1+1];
1125
773
    int ld = 0;
1126
773
    int l2 = l1*ip;
1127
773
    int ido = n / l2;
1128
773
    int idot = ido + ido + 2;
1129
773
    int ipm = ip - 1;
1130
2.86k
    for (j=1; j<=ipm; j++) {
1131
2.08k
      float argld;
1132
2.08k
      int i1 = i, fi = 0;
1133
2.08k
      wa[i-1] = 1;
1134
2.08k
      wa[i] = 0;
1135
2.08k
      ld += l1;
1136
2.08k
      argld = ld*argh;
1137
1.46M
      for (ii = 4; ii <= idot; ii += 2) {
1138
1.46M
        i += 2;
1139
1.46M
        fi += 1;
1140
1.46M
        wa[i-1] = cos(fi*argld);
1141
1.46M
        wa[i] = sin(fi*argld);
1142
1.46M
      }
1143
2.08k
      if (ip > 5) {
1144
0
        wa[i1-1] = wa[i-1];
1145
0
        wa[i1] = wa[i];
1146
0
      }
1147
2.08k
    }
1148
773
    l1 = l2;
1149
773
  }
1150
122
} /* cffti1 */
1151
1152
1153
244
v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) {
1154
244
  v4sf *in  = (v4sf*)input_readonly;
1155
244
  v4sf *out = (in == work2 ? work1 : work2); 
1156
244
  int nf = ifac[1], k1;
1157
244
  int l1 = 1;
1158
244
  int iw = 0;
1159
244
  assert(in != out && work1 != work2);
1160
1.79k
  for (k1=2; k1<=nf+1; k1++) {
1161
1.54k
    int ip = ifac[k1];
1162
1.54k
    int l2 = ip*l1;
1163
1.54k
    int ido = n / l2;
1164
1.54k
    int idot = ido + ido;
1165
1.54k
    switch (ip) {
1166
172
      case 5: {
1167
172
        int ix2 = iw + idot;
1168
172
        int ix3 = ix2 + idot;
1169
172
        int ix4 = ix3 + idot;
1170
172
        passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
1171
172
      } break;
1172
902
      case 4: {
1173
902
        int ix2 = iw + idot;
1174
902
        int ix3 = ix2 + idot;
1175
902
        passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign);
1176
902
      } break;
1177
164
      case 2: {
1178
164
        passf2_ps(idot, l1, in, out, &wa[iw], isign);
1179
164
      } break;
1180
308
      case 3: {
1181
308
        int ix2 = iw + idot;
1182
308
        passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign);
1183
308
      } break;
1184
0
      default:
1185
0
        assert(0);
1186
1.54k
    }
1187
1.54k
    l1 = l2;
1188
1.54k
    iw += (ip - 1)*idot;
1189
1.54k
    if (out == work2) {
1190
726
      out = work1; in = work2;
1191
820
    } else {
1192
820
      out = work2; in = work1;
1193
820
    }
1194
1.54k
  }
1195
1196
244
  return in; /* this is in fact the output .. */
1197
244
}
1198
1199
1200
struct PFFFT_Setup {
1201
  int     N;
1202
  int     Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
1203
  int ifac[15];
1204
  pffft_transform_t transform;
1205
  v4sf *data; // allocated room for twiddle coefs
1206
  float *e;    // points into 'data' , N/4*3 elements
1207
  float *twiddle; // points into 'data', N/4 elements
1208
};
1209
1210
257
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) {
1211
257
  PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
1212
257
  int k, m;
1213
  /* unfortunately, the fft size must be a multiple of 16 for complex FFTs 
1214
     and 32 for real FFTs -- a lot of stuff would need to be rewritten to
1215
     handle other cases (or maybe just switch to a scalar fft, I don't know..) */
1216
257
  if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); }
1217
257
  if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); }
1218
  //assert((N % 32) == 0);
1219
0
  s->N = N;
1220
257
  s->transform = transform;  
1221
  /* nb of complex simd vectors */
1222
257
  s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
1223
257
  s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf));
1224
257
  s->e = (float*)s->data;
1225
257
  s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);  
1226
1227
257
  if (transform == PFFFT_REAL) {
1228
1.45M
    for (k=0; k < s->Ncvec; ++k) {
1229
1.45M
      int i = k/SIMD_SZ;
1230
1.45M
      int j = k%SIMD_SZ;
1231
5.82M
      for (m=0; m < SIMD_SZ-1; ++m) {
1232
4.37M
        float A = -2*M_PI*(m+1)*k / N;
1233
4.37M
        s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cos(A);
1234
4.37M
        s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sin(A);
1235
4.37M
      }
1236
1.45M
    }
1237
135
    rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
1238
135
  } else {
1239
1.46M
    for (k=0; k < s->Ncvec; ++k) {
1240
1.46M
      int i = k/SIMD_SZ;
1241
1.46M
      int j = k%SIMD_SZ;
1242
5.86M
      for (m=0; m < SIMD_SZ-1; ++m) {
1243
4.40M
        float A = -2*M_PI*(m+1)*k / N;
1244
4.40M
        s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = cos(A);
1245
4.40M
        s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = sin(A);
1246
4.40M
      }
1247
1.46M
    }
1248
122
    cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
1249
122
  }
1250
1251
  /* check that N is decomposable with allowed prime factors */
1252
1.90k
  for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; }
1253
257
  if (m != N/SIMD_SZ) {
1254
0
    pffft_destroy_setup(s); s = 0;
1255
0
  }
1256
1257
257
  return s;
1258
257
}
1259
1260
1261
257
void pffft_destroy_setup(PFFFT_Setup *s) {
1262
257
  pffft_aligned_free(s->data);
1263
257
  free(s);
1264
257
}
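Editorial usage sketch (not part of the original file): creating and destroying a setup. The size constraint follows the asserts in pffft_new_setup(): with 4-wide SIMD, N must be a multiple of 32 for PFFFT_REAL and of 16 for PFFFT_COMPLEX; the transform entry points themselves are declared in pffft.h. The helper name below is hypothetical.

static void setup_example(void) {
  PFFFT_Setup *s = pffft_new_setup(512, PFFFT_REAL);   /* 512 = 16*32 */
  if (s) {
    /* ... run forward/backward transforms via the pffft.h API ... */
    pffft_destroy_setup(s);
  }
}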
1265
1266
#if !defined(PFFFT_SIMD_DISABLE)
1267
1268
/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
1269
0
static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) {
1270
0
  v4sf g0, g1;
1271
0
  int k;
1272
0
  INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride;
1273
  
1274
0
  *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
1275
0
  for (k=1; k < N; ++k) {
1276
0
    v4sf h0, h1;
1277
0
    INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride;
1278
0
    *--out = VSWAPHL(g1, h0);
1279
0
    *--out = VSWAPHL(h0, h1);
1280
0
    g1 = h1;
1281
0
  }
1282
0
  *--out = VSWAPHL(g1, g0);
1283
0
}
1284
1285
270
static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) {
1286
270
  v4sf g0, g1, h0, h1;
1287
270
  int k;
1288
270
  g0 = g1 = in[0]; ++in;
1289
728k
  for (k=1; k < N; ++k) {
1290
728k
    h0 = *in++; h1 = *in++;
1291
728k
    g1 = VSWAPHL(g1, h0);
1292
728k
    h0 = VSWAPHL(h0, h1);
1293
728k
    UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride;
1294
728k
    g1 = h1;
1295
728k
  }
1296
270
  h0 = *in++; h1 = g0;
1297
270
  g1 = VSWAPHL(g1, h0);
1298
270
  h0 = VSWAPHL(h0, h1);
1299
270
  UNINTERLEAVE2(h0, g1, out[0], out[1]);
1300
270
}
1301
1302
257
void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
1303
257
  int k, N = setup->N, Ncvec = setup->Ncvec;
1304
257
  const v4sf *vin = (const v4sf*)in;
1305
257
  v4sf *vout = (v4sf*)out;
1306
257
  assert(in != out);
1307
257
  if (setup->transform == PFFFT_REAL) {
1308
135
    int dk = N/32;
1309
135
    if (direction == PFFFT_FORWARD) {
1310
0
      for (k=0; k < dk; ++k) {
1311
0
        INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
1312
0
        INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
1313
0
      }
1314
0
      reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2));
1315
0
      reversed_copy(dk, vin+6, 8, (v4sf*)(out + N));
1316
135
    } else {
1317
364k
      for (k=0; k < dk; ++k) {
1318
364k
        UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
1319
364k
        UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
1320
364k
      }
1321
135
      unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8);
1322
135
      unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8);
1323
135
    }
1324
135
  } else {
1325
122
    if (direction == PFFFT_FORWARD) {
1326
0
      for (k=0; k < Ncvec; ++k) { 
1327
0
        int kk = (k/4) + (k%4)*(Ncvec/4);
1328
0
        INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
1329
0
      }
1330
122
    } else {
1331
1.46M
      for (k=0; k < Ncvec; ++k) { 
1332
1.46M
        int kk = (k/4) + (k%4)*(Ncvec/4);
1333
1.46M
        UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
1334
1.46M
      }
1335
122
    }
1336
122
  }
1337
257
}
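/* EDITOR'S SKETCH (not part of pffft.c): pffft_zreorder() converts the
   internal z-domain layout written by an unordered forward transform into
   the canonical interleaved order (and back with PFFFT_BACKWARD), which is
   what pffft_transform_internal() does itself when 'ordered' is set.
   Assumes "pffft.h"; names are illustrative, and all buffers must be
   SIMD-aligned (e.g. from pffft_aligned_malloc()) and distinct, since the
   function asserts in != out. */
static void forward_then_reorder_sketch(PFFFT_Setup *s, const float *in,
                                        float *freq_internal,
                                        float *freq_canonical, float *work) {
  /* internal order: fastest, and the layout pffft_zconvolve_accumulate() expects */
  pffft_transform(s, in, freq_internal, work, PFFFT_FORWARD);
  /* canonical order: equivalent to calling pffft_transform_ordered() directly */
  pffft_zreorder(s, freq_internal, freq_canonical, PFFFT_FORWARD);
}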
1338
1339
122
void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1340
122
  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1341
122
  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1342
122
  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1343
122
  assert(in != out);
1344
366k
  for (k=0; k < dk; ++k) {    
1345
366k
    r0 = in[8*k+0]; i0 = in[8*k+1];
1346
366k
    r1 = in[8*k+2]; i1 = in[8*k+3];
1347
366k
    r2 = in[8*k+4]; i2 = in[8*k+5];
1348
366k
    r3 = in[8*k+6]; i3 = in[8*k+7];
1349
366k
    VTRANSPOSE4(r0,r1,r2,r3);
1350
366k
    VTRANSPOSE4(i0,i1,i2,i3);
1351
366k
    VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
1352
366k
    VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
1353
366k
    VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);
1354
1355
366k
    sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
1356
366k
    sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
1357
366k
    si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
1358
366k
    si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
1359
1360
    /*
1361
      transformation for each column is:
1362
      
1363
      [1   1   1   1   0   0   0   0]   [r0]
1364
      [1   0  -1   0   0  -1   0   1]   [r1]
1365
      [1  -1   1  -1   0   0   0   0]   [r2]
1366
      [1   0  -1   0   0   1   0  -1]   [r3]
1367
      [0   0   0   0   1   1   1   1] * [i0]
1368
      [0   1   0  -1   1   0  -1   0]   [i1]
1369
      [0   0   0   0   1  -1   1  -1]   [i2]
1370
      [0  -1   0   1   1   0  -1   0]   [i3]    
1371
    */
1372
    
1373
366k
    r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
1374
366k
    r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1);
1375
366k
    r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
1376
366k
    r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1);
1377
  
1378
366k
    *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1379
366k
    *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1380
366k
  }
1381
122
}
1382
1383
122
void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1384
122
  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1385
122
  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1386
122
  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1387
122
  assert(in != out);
1388
366k
  for (k=0; k < dk; ++k) {    
1389
366k
    r0 = in[8*k+0]; i0 = in[8*k+1];
1390
366k
    r1 = in[8*k+2]; i1 = in[8*k+3];
1391
366k
    r2 = in[8*k+4]; i2 = in[8*k+5];
1392
366k
    r3 = in[8*k+6]; i3 = in[8*k+7];
1393
1394
366k
    sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
1395
366k
    sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
1396
366k
    si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
1397
366k
    si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
1398
1399
366k
    r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
1400
366k
    r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1);
1401
366k
    r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
1402
366k
    r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1);
1403
1404
366k
    VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]);
1405
366k
    VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]);
1406
366k
    VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]);
1407
1408
366k
    VTRANSPOSE4(r0,r1,r2,r3);
1409
366k
    VTRANSPOSE4(i0,i1,i2,i3);
1410
1411
366k
    *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1412
366k
    *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1413
366k
  }
1414
122
}
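/* EDITOR'S NOTE (not part of pffft.c): pffft_cplx_preprocess() mirrors
   pffft_cplx_finalize() above: the radix-4 butterfly comes first, the
   twiddle factors are applied conjugated (VCPLXMULCONJ instead of
   VCPLXMUL), and the 4x4 transposes come last. pffft_transform_internal()
   below uses the finalize step after the forward complex pass and the
   preprocess step before the backward one. */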
1415
1416
1417
static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
1418
364k
                            const v4sf *e, v4sf *out) {
1419
364k
  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1420
364k
  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1421
364k
  r0 = *in0; i0 = *in1;
1422
364k
  r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++;
1423
364k
  VTRANSPOSE4(r0,r1,r2,r3);
1424
364k
  VTRANSPOSE4(i0,i1,i2,i3);
1425
 
1426
  /*
1427
    transformation for each column is:
1428
1429
    [1   1   1   1   0   0   0   0]   [r0]
1430
    [1   0  -1   0   0  -1   0   1]   [r1]
1431
    [1   0  -1   0   0   1   0  -1]   [r2]
1432
    [1  -1   1  -1   0   0   0   0]   [r3]
1433
    [0   0   0   0   1   1   1   1] * [i0]
1434
    [0  -1   0   1  -1   0   1   0]   [i1]
1435
    [0  -1   0   1   1   0  -1   0]   [i2]
1436
    [0   0   0   0  -1   1  -1   1]   [i3]    
1437
  */
1438
  
1439
  //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1440
  //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1441
1442
364k
  VCPLXMUL(r1,i1,e[0],e[1]);
1443
364k
  VCPLXMUL(r2,i2,e[2],e[3]);
1444
364k
  VCPLXMUL(r3,i3,e[4],e[5]);
1445
1446
  //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1447
  //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1448
1449
364k
  sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2); 
1450
364k
  sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1);
1451
364k
  si0 = VADD(i0,i2); di0 = VSUB(i0,i2); 
1452
364k
  si1 = VADD(i1,i3); di1 = VSUB(i3,i1);
1453
1454
364k
  r0 = VADD(sr0, sr1);
1455
364k
  r3 = VSUB(sr0, sr1);
1456
364k
  i0 = VADD(si0, si1);
1457
364k
  i3 = VSUB(si1, si0);
1458
364k
  r1 = VADD(dr0, di1);
1459
364k
  r2 = VSUB(dr0, di1);
1460
364k
  i1 = VSUB(dr1, di0);
1461
364k
  i2 = VADD(dr1, di0);
1462
1463
364k
  *out++ = r0;
1464
364k
  *out++ = i0;
1465
364k
  *out++ = r1;
1466
364k
  *out++ = i1;
1467
364k
  *out++ = r2;
1468
364k
  *out++ = i2;
1469
364k
  *out++ = r3;
1470
364k
  *out++ = i3;
1471
1472
364k
}
1473
1474
135
static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1475
135
  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1476
  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1477
1478
135
  v4sf_union cr, ci, *uout = (v4sf_union*)out;
1479
135
  v4sf save = in[7], zero=VZERO();
1480
135
  float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3;
1481
135
  static const float s = M_SQRT2/2;
1482
1483
135
  cr.v = in[0]; ci.v = in[Ncvec*2-1];
1484
135
  assert(in != out);
1485
0
  pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);
1486
1487
  /*
1488
    [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
1489
1490
    [Xr(1)]  ] [1   1   1   1   0   0   0   0]
1491
    [Xr(N/4) ] [0   0   0   0   1   s   0  -s]
1492
    [Xr(N/2) ] [1   0  -1   0   0   0   0   0]
1493
    [Xr(3N/4)] [0   0   0   0   1  -s   0   s]
1494
    [Xi(1)   ] [1  -1   1  -1   0   0   0   0]
1495
    [Xi(N/4) ] [0   0   0   0   0  -s  -1  -s]
1496
    [Xi(N/2) ] [0  -1   0   1   0   0   0   0]
1497
    [Xi(3N/4)] [0   0   0   0   0  -s   1  -s]
1498
  */
1499
1500
135
  xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0;
1501
135
  xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0;
1502
135
  xr2=(cr.f[0]-cr.f[2]);                     uout[4].f[0] = xr2;
1503
135
  xi2=(cr.f[3]-cr.f[1]);                     uout[5].f[0] = xi2;
1504
135
  xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]);        uout[2].f[0] = xr1;
1505
135
  xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[3].f[0] = xi1;
1506
135
  xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]);        uout[6].f[0] = xr3;
1507
135
  xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[7].f[0] = xi3; 
1508
1509
364k
  for (k=1; k < dk; ++k) {
1510
364k
    v4sf save_next = in[8*k+7];
1511
364k
    pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1,
1512
364k
                           e + k*6, out + k*8);
1513
364k
    save = save_next;
1514
364k
  }
1515
1516
135
}
1517
1518
static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, 
1519
364k
                                             const v4sf *e, v4sf *out, int first) {
1520
364k
  v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
1521
  /*
1522
    transformation for each column is:
1523
1524
    [1   1   1   1   0   0   0   0]   [r0]
1525
    [1   0   0  -1   0  -1  -1   0]   [r1]
1526
    [1  -1  -1   1   0   0   0   0]   [r2]
1527
    [1   0   0  -1   0   1   1   0]   [r3]
1528
    [0   0   0   0   1  -1   1  -1] * [i0]
1529
    [0  -1   1   0   1   0   0   1]   [i1]
1530
    [0   0   0   0   1   1  -1  -1]   [i2]
1531
    [0   1  -1   0   1   0   0   1]   [i3]    
1532
  */
1533
1534
364k
  v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3); 
1535
364k
  v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
1536
364k
  v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3); 
1537
364k
  v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);
1538
1539
364k
  r0 = VADD(sr0, sr1);
1540
364k
  r2 = VSUB(sr0, sr1);
1541
364k
  r1 = VSUB(dr0, si1);
1542
364k
  r3 = VADD(dr0, si1);
1543
364k
  i0 = VSUB(di0, di1);
1544
364k
  i2 = VADD(di0, di1);
1545
364k
  i1 = VSUB(si0, dr1);
1546
364k
  i3 = VADD(si0, dr1);
1547
1548
364k
  VCPLXMULCONJ(r1,i1,e[0],e[1]);
1549
364k
  VCPLXMULCONJ(r2,i2,e[2],e[3]);
1550
364k
  VCPLXMULCONJ(r3,i3,e[4],e[5]);
1551
1552
364k
  VTRANSPOSE4(r0,r1,r2,r3);
1553
364k
  VTRANSPOSE4(i0,i1,i2,i3);
1554
1555
364k
  if (!first) {
1556
364k
    *out++ = r0;
1557
364k
    *out++ = i0;
1558
364k
  }
1559
364k
  *out++ = r1;
1560
364k
  *out++ = i1;
1561
364k
  *out++ = r2;
1562
364k
  *out++ = i2;
1563
364k
  *out++ = r3;
1564
364k
  *out++ = i3;
1565
364k
}
1566
1567
135
static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1568
135
  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1569
  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1570
1571
135
  v4sf_union Xr, Xi, *uout = (v4sf_union*)out;
1572
135
  float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3;
1573
135
  static const float s = M_SQRT2;
1574
135
  assert(in != out);
1575
675
  for (k=0; k < 4; ++k) {
1576
540
    Xr.f[k] = ((float*)in)[8*k];
1577
540
    Xi.f[k] = ((float*)in)[8*k+4];
1578
540
  }
1579
1580
135
  pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values
1581
1582
  /*
1583
    [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
1584
1585
    [cr0] [1   0   2   0   1   0   0   0]
1586
    [cr1] [1   0   0   0  -1   0  -2   0]
1587
    [cr2] [1   0  -2   0   1   0   0   0]
1588
    [cr3] [1   0   0   0  -1   0   2   0]
1589
    [ci0] [0   2   0   2   0   0   0   0]
1590
    [ci1] [0   s   0  -s   0  -s   0  -s]
1591
    [ci2] [0   0   0   0   0  -2   0   2]
1592
    [ci3] [0  -s   0   s   0  -s   0  -s]
1593
  */
1594
364k
  for (k=1; k < dk; ++k) {    
1595
364k
    pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
1596
364k
  }
1597
1598
135
  cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0;
1599
135
  cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1;
1600
135
  cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2;
1601
135
  cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3;
1602
135
  ci0= 2*(Xr.f[1]+Xr.f[3]);                       uout[2*Ncvec-1].f[0] = ci0;
1603
135
  ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1;
1604
135
  ci2= 2*(Xi.f[3]-Xi.f[1]);                       uout[2*Ncvec-1].f[2] = ci2;
1605
135
  ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3;
1606
135
}
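/* EDITOR'S NOTE (not part of pffft.c): pffft_real_preprocess() is the
   counterpart of pffft_real_finalize() above, preparing the frequency data
   for the backward real transform; the scalar computations outside the
   k-loop (cr0..ci3 here, xr0..xi3 in the finalize) handle the few special
   bins, described by the small matrices in the comments, that do not fit
   the regular 4x4 SIMD block pattern. */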
1607
1608
1609
void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
1610
514
                             pffft_direction_t direction, int ordered) {
1611
514
  int k, Ncvec   = setup->Ncvec;
1612
514
  int nf_odd = (setup->ifac[1] & 1);
1613
1614
  // temporary buffer is allocated on the stack if the scratch pointer is NULL
1615
514
  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
1616
514
  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
1617
1618
514
  const v4sf *vinput = (const v4sf*)finput;
1619
514
  v4sf *voutput      = (v4sf*)foutput;
1620
514
  v4sf *buff[2]      = { voutput, scratch ? scratch : scratch_on_stack };
1621
514
  int ib = (nf_odd ^ ordered ? 1 : 0);
1622
1623
514
  assert(VALIGNED(finput) && VALIGNED(foutput));
1624
1625
  //assert(finput != foutput);
1626
514
  if (direction == PFFFT_FORWARD) {
1627
257
    ib = !ib;
1628
257
    if (setup->transform == PFFFT_REAL) { 
1629
135
      ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
1630
135
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);      
1631
135
      pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
1632
135
    } else {
1633
122
      v4sf *tmp = buff[ib];
1634
1.46M
      for (k=0; k < Ncvec; ++k) {
1635
1.46M
        UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
1636
1.46M
      }
1637
122
      ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], 
1638
122
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
1639
122
      pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
1640
122
    }
1641
257
    if (ordered) {
1642
0
      pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);       
1643
257
    } else ib = !ib;
1644
257
  } else {
1645
257
    if (vinput == buff[ib]) { 
1646
0
      ib = !ib; // may happen when finput == foutput
1647
0
    }
1648
257
    if (ordered) {
1649
257
      pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD); 
1650
257
      vinput = buff[ib]; ib = !ib;
1651
257
    }
1652
257
    if (setup->transform == PFFFT_REAL) {
1653
135
      pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
1654
135
      ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], 
1655
135
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1656
135
    } else {
1657
122
      pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
1658
122
      ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], 
1659
122
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
1660
1.46M
      for (k=0; k < Ncvec; ++k) {
1661
1.46M
        INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
1662
1.46M
      }
1663
122
    }
1664
257
  }
1665
  
1666
514
  if (buff[ib] != voutput) {
1667
    /* extra copy required -- this situation should only happen when finput == foutput */
1668
0
    assert(finput==foutput);
1669
0
    for (k=0; k < Ncvec; ++k) {
1670
0
      v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
1671
0
      voutput[2*k] = a; voutput[2*k+1] = b;
1672
0
    }
1673
0
    ib = !ib;
1674
0
  }
1675
0
  assert(buff[ib] == voutput);
1676
514
}
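/* EDITOR'S SKETCH (not part of pffft.c): when the scratch pointer above is
   NULL, the temporary buffer is a VLA / _alloca on the stack holding
   Ncvec*2 SIMD vectors, i.e. N floats for a PFFFT_REAL transform and 2*N
   floats for a PFFFT_COMPLEX one. For large N it can be preferable to pass
   a heap-allocated, properly aligned work area to pffft_transform() /
   pffft_transform_ordered() instead. Assumes "pffft.h"; the helper name is
   illustrative only. */
static float *alloc_work_sketch(int N, pffft_transform_t transform) {
  size_t nfloats = (transform == PFFFT_COMPLEX ? 2u * (size_t)N : (size_t)N);
  return (float*)pffft_aligned_malloc(nfloats * sizeof(float));
  /* release with pffft_aligned_free() once all transforms are done */
}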
1677
1678
257
void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) {
1679
257
  int Ncvec = s->Ncvec;
1680
257
  const v4sf * RESTRICT va = (const v4sf*)a;
1681
257
  const v4sf * RESTRICT vb = (const v4sf*)b;
1682
257
  v4sf * RESTRICT vab = (v4sf*)ab;
1683
1684
#ifdef __arm__
1685
  __builtin_prefetch(va);
1686
  __builtin_prefetch(vb);
1687
  __builtin_prefetch(vab);
1688
  __builtin_prefetch(va+2);
1689
  __builtin_prefetch(vb+2);
1690
  __builtin_prefetch(vab+2);
1691
  __builtin_prefetch(va+4);
1692
  __builtin_prefetch(vb+4);
1693
  __builtin_prefetch(vab+4);
1694
  __builtin_prefetch(va+6);
1695
  __builtin_prefetch(vb+6);
1696
  __builtin_prefetch(vab+6);
1697
# ifndef __clang__
1698
#   define ZCONVOLVE_USING_INLINE_NEON_ASM
1699
# endif
1700
#endif
1701
1702
257
  float ar, ai, br, bi, abr, abi;
1703
257
#ifndef ZCONVOLVE_USING_INLINE_ASM
1704
257
  v4sf vscal = LD_PS1(scaling);
1705
257
  int i;
1706
257
#endif
1707
1708
257
  assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
1709
0
  ar = ((v4sf_union*)va)[0].f[0];
1710
257
  ai = ((v4sf_union*)va)[1].f[0];
1711
257
  br = ((v4sf_union*)vb)[0].f[0];
1712
257
  bi = ((v4sf_union*)vb)[1].f[0];
1713
257
  abr = ((v4sf_union*)vab)[0].f[0];
1714
257
  abi = ((v4sf_union*)vab)[1].f[0];
1715
 
1716
#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version; unfortunately miscompiled by clang 3.2 (at least on Ubuntu), so it is restricted to gcc
1717
  const float *a_ = a, *b_ = b; float *ab_ = ab;
1718
  int N = Ncvec;
1719
  asm volatile("mov         r8, %2                  \n"
1720
               "vdup.f32    q15, %4                 \n"
1721
               "1:                                  \n"
1722
               "pld         [%0,#64]                \n"
1723
               "pld         [%1,#64]                \n"
1724
               "pld         [%2,#64]                \n"
1725
               "pld         [%0,#96]                \n"
1726
               "pld         [%1,#96]                \n"
1727
               "pld         [%2,#96]                \n"
1728
               "vld1.f32    {q0,q1},   [%0,:128]!         \n"
1729
               "vld1.f32    {q4,q5},   [%1,:128]!         \n"
1730
               "vld1.f32    {q2,q3},   [%0,:128]!         \n"
1731
               "vld1.f32    {q6,q7},   [%1,:128]!         \n"
1732
               "vld1.f32    {q8,q9},   [r8,:128]!          \n"
1733
               
1734
               "vmul.f32    q10, q0, q4             \n"
1735
               "vmul.f32    q11, q0, q5             \n"
1736
               "vmul.f32    q12, q2, q6             \n" 
1737
               "vmul.f32    q13, q2, q7             \n"                 
1738
               "vmls.f32    q10, q1, q5             \n"
1739
               "vmla.f32    q11, q1, q4             \n"
1740
               "vld1.f32    {q0,q1}, [r8,:128]!     \n"
1741
               "vmls.f32    q12, q3, q7             \n"
1742
               "vmla.f32    q13, q3, q6             \n"
1743
               "vmla.f32    q8, q10, q15            \n"
1744
               "vmla.f32    q9, q11, q15            \n"
1745
               "vmla.f32    q0, q12, q15            \n"
1746
               "vmla.f32    q1, q13, q15            \n"
1747
               "vst1.f32    {q8,q9},[%2,:128]!    \n"
1748
               "vst1.f32    {q0,q1},[%2,:128]!    \n"
1749
               "subs        %3, #2                  \n"
1750
               "bne         1b                      \n"
1751
               : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
1752
#else // default routine, works fine for non-arm cpus with current compilers
1753
1.46M
  for (i=0; i < Ncvec; i += 2) {
1754
1.46M
    v4sf ar, ai, br, bi;
1755
1.46M
    ar = va[2*i+0]; ai = va[2*i+1];
1756
1.46M
    br = vb[2*i+0]; bi = vb[2*i+1];
1757
1.46M
    VCPLXMUL(ar, ai, br, bi);
1758
1.46M
    vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]);
1759
1.46M
    vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]);
1760
1.46M
    ar = va[2*i+2]; ai = va[2*i+3];
1761
1.46M
    br = vb[2*i+2]; bi = vb[2*i+3];
1762
1.46M
    VCPLXMUL(ar, ai, br, bi);
1763
1.46M
    vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]);
1764
1.46M
    vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]);
1765
1.46M
  }
1766
257
#endif
1767
257
  if (s->transform == PFFFT_REAL) {
1768
135
    ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling;
1769
135
    ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling;
1770
135
  }
1771
257
}
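/* EDITOR'S SKETCH (not part of pffft.c): typical FFT convolution built on
   pffft_zconvolve_accumulate(). Both inputs are transformed with the
   unordered pffft_transform(), whose internal z-domain layout is what this
   routine consumes, the scaled products are accumulated, and the
   accumulator is transformed back. Because the transform pair is unscaled,
   passing scaling = 1.0f/N yields the circular convolution of a and b.
   Assumes "pffft.h" and a PFFFT_REAL setup of size N; every buffer holds N
   floats and comes from pffft_aligned_malloc(); names are illustrative. */
static void convolve_sketch(PFFFT_Setup *s, int N,
                            const float *a, const float *b, float *ab,
                            float *A, float *B, float *AB, float *work) {
  int k;
  pffft_transform(s, a, A, work, PFFFT_FORWARD);
  pffft_transform(s, b, B, work, PFFFT_FORWARD);
  for (k = 0; k < N; ++k) AB[k] = 0;   /* zero the frequency-domain accumulator */
  pffft_zconvolve_accumulate(s, A, B, AB, 1.0f/(float)N);
  pffft_transform(s, AB, ab, work, PFFFT_BACKWARD);
}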
1772
1773
1774
#else // defined(PFFFT_SIMD_DISABLE)
1775
1776
// standard routine using scalar floats, without SIMD stuff.
1777
1778
#define pffft_zreorder_nosimd pffft_zreorder
1779
void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
1780
  int k, N = setup->N;
1781
  if (setup->transform == PFFFT_COMPLEX) {
1782
    for (k=0; k < 2*N; ++k) out[k] = in[k];
1783
    return;
1784
  }
1785
  else if (direction == PFFFT_FORWARD) {
1786
    float x_N = in[N-1];
1787
    for (k=N-1; k > 1; --k) out[k] = in[k-1]; 
1788
    out[0] = in[0];
1789
    out[1] = x_N;
1790
  } else {
1791
    float x_N = in[1];
1792
    for (k=1; k < N-1; ++k) out[k] = in[k+1]; 
1793
    out[0] = in[0];
1794
    out[N-1] = x_N;
1795
  }
1796
}
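/* EDITOR'S NOTE (not part of pffft.c): for a real transform of size N = 8,
   the scalar reordering above maps the fftpack layout to the canonical
   layout as follows (Xk = k-th frequency bin):

     fftpack  : [X0.re, X1.re, X1.im, X2.re, X2.im, X3.re, X3.im, X4.re]
     canonical: [X0.re, X4.re, X1.re, X1.im, X2.re, X2.im, X3.re, X3.im]

   i.e. the purely real DC and Nyquist bins are packed into the first two
   slots, and every other bin is stored as an interleaved (re, im) pair. */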
1797
1798
#define pffft_transform_internal_nosimd pffft_transform_internal
1799
void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
1800
                                    pffft_direction_t direction, int ordered) {
1801
  int Ncvec   = setup->Ncvec;
1802
  int nf_odd = (setup->ifac[1] & 1);
1803
1804
  // temporary buffer is allocated on the stack if the scratch pointer is NULL
1805
  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
1806
  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
1807
  float *buff[2];
1808
  int ib;
1809
  if (scratch == 0) scratch = scratch_on_stack;
1810
  buff[0] = output; buff[1] = scratch;
1811
1812
  if (setup->transform == PFFFT_COMPLEX) ordered = 0; // it is always ordered.
1813
  ib = (nf_odd ^ ordered ? 1 : 0);
1814
1815
  if (direction == PFFFT_FORWARD) {
1816
    if (setup->transform == PFFFT_REAL) { 
1817
      ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
1818
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);      
1819
    } else {
1820
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], 
1821
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
1822
    }
1823
    if (ordered) {
1824
      pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib;
1825
    }
1826
  } else {    
1827
    if (input == buff[ib]) { 
1828
      ib = !ib; // may happen when finput == foutput
1829
    }
1830
    if (ordered) {
1831
      pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD); 
1832
      input = buff[!ib];
1833
    }
1834
    if (setup->transform == PFFFT_REAL) {
1835
      ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], 
1836
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1837
    } else {
1838
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], 
1839
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
1840
    }
1841
  }
1842
  if (buff[ib] != output) {
1843
    int k;
1844
    // extra copy required -- this situation should only happen when finput == foutput
1845
    assert(input==output);
1846
    for (k=0; k < Ncvec; ++k) {
1847
      float a = buff[ib][2*k], b = buff[ib][2*k+1];
1848
      output[2*k] = a; output[2*k+1] = b;
1849
    }
1850
    ib = !ib;
1851
  }
1852
  assert(buff[ib] == output);
1853
}
1854
1855
#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
1856
void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
1857
                                       float *ab, float scaling) {
1858
  int i, Ncvec = s->Ncvec;
1859
1860
  if (s->transform == PFFFT_REAL) {
1861
    // take care of the fftpack ordering
1862
    ab[0] += a[0]*b[0]*scaling;
1863
    ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
1864
    ++ab; ++a; ++b; --Ncvec;
1865
  }
1866
  for (i=0; i < Ncvec; ++i) {
1867
    float ar, ai, br, bi;
1868
    ar = a[2*i+0]; ai = a[2*i+1];
1869
    br = b[2*i+0]; bi = b[2*i+1];
1870
    VCPLXMUL(ar, ai, br, bi);
1871
    ab[2*i+0] += ar*scaling;
1872
    ab[2*i+1] += ai*scaling;
1873
  }
1874
}
1875
1876
#endif // defined(PFFFT_SIMD_DISABLE)
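/* EDITOR'S NOTE (not part of pffft.c): the scalar fallback between the
   "#else // defined(PFFFT_SIMD_DISABLE)" and this "#endif" is selected at
   compile time by defining PFFFT_SIMD_DISABLE, e.g.
       cc -c -DPFFFT_SIMD_DISABLE pffft.c
   otherwise the SIMD implementations above are compiled. */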
1877
1878
257
void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
1879
257
  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0);
1880
257
}
1881
1882
257
void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
1883
257
  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1);
1884
257
}
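/* EDITOR'S SKETCH (not part of pffft.c): a minimal end-to-end round trip
   using only the public API from pffft.h. The transforms are unscaled, so
   BACKWARD(FORWARD(x)) yields N*x and the result is divided by N below.
   The function name and the test signal are illustrative only. */
#include "pffft.h"

static int roundtrip_sketch(int N) { /* N must be a length accepted by pffft_new_setup() */
  PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);
  float *x    = (float*)pffft_aligned_malloc((size_t)N * sizeof(float));
  float *freq = (float*)pffft_aligned_malloc((size_t)N * sizeof(float));
  float *y    = (float*)pffft_aligned_malloc((size_t)N * sizeof(float));
  int k, ok = (s != 0 && x && freq && y);
  if (ok) {
    for (k = 0; k < N; ++k) x[k] = (float)k;                 /* arbitrary test signal */
    pffft_transform_ordered(s, x, freq, 0, PFFFT_FORWARD);   /* NULL work => stack scratch */
    pffft_transform_ordered(s, freq, y, 0, PFFFT_BACKWARD);
    for (k = 0; k < N; ++k) y[k] /= (float)N;                /* undo the unscaled pair */
  }
  if (s)    pffft_destroy_setup(s);
  if (x)    pffft_aligned_free(x);
  if (freq) pffft_aligned_free(freq);
  if (y)    pffft_aligned_free(y);
  return ok;
}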