Coverage Report

Created: 2024-09-08 06:43

/src/fftw3/reodft/reodft010e-r2hc.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2003, 2007-14 Matteo Frigo
3
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4
 *
5
 * This program is free software; you can redistribute it and/or modify
6
 * it under the terms of the GNU General Public License as published by
7
 * the Free Software Foundation; either version 2 of the License, or
8
 * (at your option) any later version.
9
 *
10
 * This program is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
 * GNU General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU General Public License
16
 * along with this program; if not, write to the Free Software
17
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18
 *
19
 */
20
21
22
/* Do an R{E,O}DFT{01,10} problem via an R2HC problem, with some
23
   pre/post-processing ala FFTPACK. */
24
25
#include "reodft/reodft.h"
26
27
typedef struct {
28
     solver super;
29
} S;
30
31
typedef struct {
32
     plan_rdft super;
33
     plan *cld;
34
     twid *td;
35
     INT is, os;
36
     INT n;
37
     INT vl;
38
     INT ivs, ovs;
39
     rdft_kind kind;
40
} P;
41
42
/* A real-even-01 DFT operates logically on a size-4N array:
43
                   I 0 -r(I*) -I 0 r(I*),
44
   where r denotes reversal and * denotes deletion of the 0th element.
45
   To compute the transform of this, we imagine performing a radix-4
46
   (real-input) DIF step, which turns the size-4N DFT into 4 size-N
47
   (contiguous) DFTs, two of which are zero and two of which are
48
   conjugates.  The non-redundant size-N DFT has halfcomplex input, so
49
   we can do it with a size-N hc2r transform.  (In order to share
50
   plans with the re10 (inverse) transform, however, we use the DHT
51
   trick to re-express the hc2r problem as r2hc.  This has little cost
52
   since we are already pre- and post-processing the data in {i,n-i}
53
   order.)  Finally, we have to write out the data in the correct
54
   order...the two size-N redundant (conjugate) hc2r DFTs correspond
55
   to the even and odd outputs in O (i.e. the usual interleaved output
56
   of DIF transforms); since this data has even symmetry, we only
57
   write the first half of it.
58
59
   The real-even-10 DFT is just the reverse of these steps, i.e. a
60
   radix-4 DIT transform.  There, however, we just use the r2hc
61
   transform naturally without resorting to the DHT trick.
62
63
   A real-odd-01 DFT is very similar, except that the input is
64
   0 I (rI)* 0 -I -(rI)*.  This format, however, can be transformed
65
   into precisely the real-even-01 format above by sending I -> rI
66
   and shifting the array by N.  The former swap is just another
67
   transformation on the input during preprocessing; the latter
68
   multiplies the even/odd outputs by i/-i, which combines with
69
   the factor of -i (to take the imaginary part) to simply flip
70
   the sign of the odd outputs.  Vice-versa for real-odd-10.
71
72
   The FFTPACK source code was very helpful in working this out.
73
   (They do unnecessary passes over the array, though.)  The same
74
   algorithm is also described in:
75
76
      John Makhoul, "A fast cosine transform in one and two dimensions,"
77
      IEEE Trans. on Acoust. Speech and Sig. Proc., ASSP-28 (1), 27--34 (1980).
78
79
   Note that Numerical Recipes suggests a different algorithm that
80
   requires more operations and uses trig. functions for both the pre-
81
   and post-processing passes.
82
*/
83
84
static void apply_re01(const plan *ego_, R *I, R *O)
85
0
{
86
0
     const P *ego = (const P *) ego_;
87
0
     INT is = ego->is, os = ego->os;
88
0
     INT i, n = ego->n;
89
0
     INT iv, vl = ego->vl;
90
0
     INT ivs = ego->ivs, ovs = ego->ovs;
91
0
     R *W = ego->td->W;
92
0
     R *buf;
93
94
0
     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
95
96
0
     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
97
0
    buf[0] = I[0];
98
0
    for (i = 1; i < n - i; ++i) {
99
0
         E a, b, apb, amb, wa, wb;
100
0
         a = I[is * i];
101
0
         b = I[is * (n - i)];
102
0
         apb = a + b;
103
0
         amb = a - b;
104
0
         wa = W[2*i];
105
0
         wb = W[2*i + 1];
106
0
         buf[i] = wa * amb + wb * apb; 
107
0
         buf[n - i] = wa * apb - wb * amb; 
108
0
    }
109
0
    if (i == n - i) {
110
0
         buf[i] = K(2.0) * I[is * i] * W[2*i];
111
0
    }
112
    
113
0
    {
114
0
         plan_rdft *cld = (plan_rdft *) ego->cld;
115
0
         cld->apply((plan *) cld, buf, buf);
116
0
    }
117
    
118
0
    O[0] = buf[0];
119
0
    for (i = 1; i < n - i; ++i) {
120
0
         E a, b;
121
0
         INT k;
122
0
         a = buf[i];
123
0
         b = buf[n - i];
124
0
         k = i + i;
125
0
         O[os * (k - 1)] = a - b;
126
0
         O[os * k] = a + b;
127
0
    }
128
0
    if (i == n - i) {
129
0
         O[os * (n - 1)] = buf[i];
130
0
    }
131
0
     }
132
133
0
     X(ifree)(buf);
134
0
}
135
136
/* ro01 is same as re01, but with i <-> n - 1 - i in the input and
137
   the sign of the odd output elements flipped. */
138
static void apply_ro01(const plan *ego_, R *I, R *O)
139
0
{
140
0
     const P *ego = (const P *) ego_;
141
0
     INT is = ego->is, os = ego->os;
142
0
     INT i, n = ego->n;
143
0
     INT iv, vl = ego->vl;
144
0
     INT ivs = ego->ivs, ovs = ego->ovs;
145
0
     R *W = ego->td->W;
146
0
     R *buf;
147
148
0
     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
149
150
0
     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
151
0
    buf[0] = I[is * (n - 1)];
152
0
    for (i = 1; i < n - i; ++i) {
153
0
         E a, b, apb, amb, wa, wb;
154
0
         a = I[is * (n - 1 - i)];
155
0
         b = I[is * (i - 1)];
156
0
         apb = a + b;
157
0
         amb = a - b;
158
0
         wa = W[2*i];
159
0
         wb = W[2*i+1];
160
0
         buf[i] = wa * amb + wb * apb; 
161
0
         buf[n - i] = wa * apb - wb * amb; 
162
0
    }
163
0
    if (i == n - i) {
164
0
         buf[i] = K(2.0) * I[is * (i - 1)] * W[2*i];
165
0
    }
166
    
167
0
    {
168
0
         plan_rdft *cld = (plan_rdft *) ego->cld;
169
0
         cld->apply((plan *) cld, buf, buf);
170
0
    }
171
    
172
0
    O[0] = buf[0];
173
0
    for (i = 1; i < n - i; ++i) {
174
0
         E a, b;
175
0
         INT k;
176
0
         a = buf[i];
177
0
         b = buf[n - i];
178
0
         k = i + i;
179
0
         O[os * (k - 1)] = b - a;
180
0
         O[os * k] = a + b;
181
0
    }
182
0
    if (i == n - i) {
183
0
         O[os * (n - 1)] = -buf[i];
184
0
    }
185
0
     }
186
187
0
     X(ifree)(buf);
188
0
}
189
190
static void apply_re10(const plan *ego_, R *I, R *O)
191
0
{
192
0
     const P *ego = (const P *) ego_;
193
0
     INT is = ego->is, os = ego->os;
194
0
     INT i, n = ego->n;
195
0
     INT iv, vl = ego->vl;
196
0
     INT ivs = ego->ivs, ovs = ego->ovs;
197
0
     R *W = ego->td->W;
198
0
     R *buf;
199
200
0
     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
201
202
0
     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
203
0
    buf[0] = I[0];
204
0
    for (i = 1; i < n - i; ++i) {
205
0
         E u, v;
206
0
         INT k = i + i;
207
0
         u = I[is * (k - 1)];
208
0
         v = I[is * k];
209
0
         buf[n - i] = u;
210
0
         buf[i] = v;
211
0
    }
212
0
    if (i == n - i) {
213
0
         buf[i] = I[is * (n - 1)];
214
0
    }
215
    
216
0
    {
217
0
         plan_rdft *cld = (plan_rdft *) ego->cld;
218
0
         cld->apply((plan *) cld, buf, buf);
219
0
    }
220
    
221
0
    O[0] = K(2.0) * buf[0];
222
0
    for (i = 1; i < n - i; ++i) {
223
0
         E a, b, wa, wb;
224
0
         a = K(2.0) * buf[i];
225
0
         b = K(2.0) * buf[n - i];
226
0
         wa = W[2*i];
227
0
         wb = W[2*i + 1];
228
0
         O[os * i] = wa * a + wb * b;
229
0
         O[os * (n - i)] = wb * a - wa * b;
230
0
    }
231
0
    if (i == n - i) {
232
0
         O[os * i] = K(2.0) * buf[i] * W[2*i];
233
0
    }
234
0
     }
235
236
0
     X(ifree)(buf);
237
0
}
238
239
/* ro10 is same as re10, but with i <-> n - 1 - i in the output and
240
   the sign of the odd input elements flipped. */
241
static void apply_ro10(const plan *ego_, R *I, R *O)
242
0
{
243
0
     const P *ego = (const P *) ego_;
244
0
     INT is = ego->is, os = ego->os;
245
0
     INT i, n = ego->n;
246
0
     INT iv, vl = ego->vl;
247
0
     INT ivs = ego->ivs, ovs = ego->ovs;
248
0
     R *W = ego->td->W;
249
0
     R *buf;
250
251
0
     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
252
253
0
     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
254
0
    buf[0] = I[0];
255
0
    for (i = 1; i < n - i; ++i) {
256
0
         E u, v;
257
0
         INT k = i + i;
258
0
         u = -I[is * (k - 1)];
259
0
         v = I[is * k];
260
0
         buf[n - i] = u;
261
0
         buf[i] = v;
262
0
    }
263
0
    if (i == n - i) {
264
0
         buf[i] = -I[is * (n - 1)];
265
0
    }
266
    
267
0
    {
268
0
         plan_rdft *cld = (plan_rdft *) ego->cld;
269
0
         cld->apply((plan *) cld, buf, buf);
270
0
    }
271
    
272
0
    O[os * (n - 1)] = K(2.0) * buf[0];
273
0
    for (i = 1; i < n - i; ++i) {
274
0
         E a, b, wa, wb;
275
0
         a = K(2.0) * buf[i];
276
0
         b = K(2.0) * buf[n - i];
277
0
         wa = W[2*i];
278
0
         wb = W[2*i + 1];
279
0
         O[os * (n - 1 - i)] = wa * a + wb * b;
280
0
         O[os * (i - 1)] = wb * a - wa * b;
281
0
    }
282
0
    if (i == n - i) {
283
0
         O[os * (i - 1)] = K(2.0) * buf[i] * W[2*i];
284
0
    }
285
0
     }
286
287
0
     X(ifree)(buf);
288
0
}
289
290
static void awake(plan *ego_, enum wakefulness wakefulness)
291
0
{
292
0
     P *ego = (P *) ego_;
293
0
     static const tw_instr reodft010e_tw[] = {
294
0
          { TW_COS, 0, 1 },
295
0
          { TW_SIN, 0, 1 },
296
0
          { TW_NEXT, 1, 0 }
297
0
     };
298
299
0
     X(plan_awake)(ego->cld, wakefulness);
300
301
0
     X(twiddle_awake)(wakefulness, &ego->td, reodft010e_tw, 
302
0
          4*ego->n, 1, ego->n/2+1);
303
0
}
304
305
static void destroy(plan *ego_)
306
0
{
307
0
     P *ego = (P *) ego_;
308
0
     X(plan_destroy_internal)(ego->cld);
309
0
}
310
311
static void print(const plan *ego_, printer *p)
312
0
{
313
0
     const P *ego = (const P *) ego_;
314
0
     p->print(p, "(%se-r2hc-%D%v%(%p%))",
315
0
        X(rdft_kind_str)(ego->kind), ego->n, ego->vl, ego->cld);
316
0
}
317
318
static int applicable0(const solver *ego_, const problem *p_)
319
0
{
320
0
     const problem_rdft *p = (const problem_rdft *) p_;
321
0
     UNUSED(ego_);
322
323
0
     return (1
324
0
       && p->sz->rnk == 1
325
0
       && p->vecsz->rnk <= 1
326
0
       && (p->kind[0] == REDFT01 || p->kind[0] == REDFT10
327
0
     || p->kind[0] == RODFT01 || p->kind[0] == RODFT10)
328
0
    );
329
0
}
330
331
static int applicable(const solver *ego, const problem *p, const planner *plnr)
332
332
{
333
332
     return (!NO_SLOWP(plnr) && applicable0(ego, p));
334
332
}
335
336
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
337
332
{
338
332
     P *pln;
339
332
     const problem_rdft *p;
340
332
     plan *cld;
341
332
     R *buf;
342
332
     INT n;
343
332
     opcnt ops;
344
345
332
     static const plan_adt padt = {
346
332
    X(rdft_solve), awake, print, destroy
347
332
     };
348
349
332
     if (!applicable(ego_, p_, plnr))
350
332
          return (plan *)0;
351
352
0
     p = (const problem_rdft *) p_;
353
354
0
     n = p->sz->dims[0].n;
355
0
     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
356
357
0
     cld = X(mkplan_d)(plnr, X(mkproblem_rdft_1_d)(X(mktensor_1d)(n, 1, 1),
358
0
                                                   X(mktensor_0d)(),
359
0
                                                   buf, buf, R2HC));
360
0
     X(ifree)(buf);
361
0
     if (!cld)
362
0
          return (plan *)0;
363
364
0
     switch (p->kind[0]) {
365
0
   case REDFT01: pln = MKPLAN_RDFT(P, &padt, apply_re01); break;
366
0
   case REDFT10: pln = MKPLAN_RDFT(P, &padt, apply_re10); break;
367
0
   case RODFT01: pln = MKPLAN_RDFT(P, &padt, apply_ro01); break;
368
0
   case RODFT10: pln = MKPLAN_RDFT(P, &padt, apply_ro10); break;
369
0
   default: A(0); return (plan*)0;
370
0
     }
371
372
0
     pln->n = n;
373
0
     pln->is = p->sz->dims[0].is;
374
0
     pln->os = p->sz->dims[0].os;
375
0
     pln->cld = cld;
376
0
     pln->td = 0;
377
0
     pln->kind = p->kind[0];
378
     
379
0
     X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
380
     
381
0
     X(ops_zero)(&ops);
382
0
     ops.other = 4 + (n-1)/2 * 10 + (1 - n % 2) * 5;
383
0
     if (p->kind[0] == REDFT01 || p->kind[0] == RODFT01) {
384
0
    ops.add = (n-1)/2 * 6;
385
0
    ops.mul = (n-1)/2 * 4 + (1 - n % 2) * 2;
386
0
     }
387
0
     else { /* 10 transforms */
388
0
    ops.add = (n-1)/2 * 2;
389
0
    ops.mul = 1 + (n-1)/2 * 6 + (1 - n % 2) * 2;
390
0
     }
391
     
392
0
     X(ops_zero)(&pln->super.super.ops);
393
0
     X(ops_madd2)(pln->vl, &ops, &pln->super.super.ops);
394
0
     X(ops_madd2)(pln->vl, &cld->ops, &pln->super.super.ops);
395
396
0
     return &(pln->super.super);
397
0
}
398
399
/* constructor */
400
static solver *mksolver(void)
401
1
{
402
1
     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
403
1
     S *slv = MKSOLVER(S, &sadt);
404
1
     return &(slv->super);
405
1
}
406
407
void X(reodft010e_r2hc_register)(planner *p)
408
1
{
409
1
     REGISTER_SOLVER(p, mksolver());
410
1
}