/src/fftw3/reodft/reodft11e-radix2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2003, 2007-14 Matteo Frigo |
3 | | * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology |
4 | | * |
5 | | * This program is free software; you can redistribute it and/or modify |
6 | | * it under the terms of the GNU General Public License as published by |
7 | | * the Free Software Foundation; either version 2 of the License, or |
8 | | * (at your option) any later version. |
9 | | * |
10 | | * This program is distributed in the hope that it will be useful, |
11 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | * GNU General Public License for more details. |
14 | | * |
15 | | * You should have received a copy of the GNU General Public License |
16 | | * along with this program; if not, write to the Free Software |
17 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
18 | | * |
19 | | */ |
20 | | |
21 | | |
22 | | /* Do an R{E,O}DFT11 problem of *even* size by a pair of R2HC problems |
23 | | of half the size, plus some pre/post-processing. Use a trick from: |
24 | | |
25 | | Zhongde Wang, "On computing the discrete Fourier and cosine transforms," |
26 | | IEEE Trans. Acoust. Speech Sig. Proc. ASSP-33 (4), 1341--1344 (1985). |
27 | | |
28 | | to re-express as a pair of half-size REDFT01 (DCT-III) problems. Our |
29 | | implementation looks quite a bit different from the algorithm described |
30 | | in the paper because we combined the paper's pre/post-processing with |
31 | | the pre/post-processing used to turn REDFT01 into R2HC. (Also, the |
32 | | paper uses a DCT/DST pair, but we turn the DST into a DCT via the |
33 | | usual reordering/sign-flip trick. We additionally combined a couple |
34 | | of the matrices/transformations of the paper into a single pass.) |
35 | | |
36 | | NOTE: We originally used a simpler method by S. C. Chan and K. L. Ho |
37 | | that turned out to have numerical problems; see reodft11e-r2hc.c. |
38 | | |
39 | | (For odd sizes, see reodft11e-r2hc-odd.c.) |
40 | | */ |
41 | | |
42 | | #include "reodft/reodft.h" |
43 | | |
44 | | typedef struct { |
45 | | solver super; |
46 | | } S; |
47 | | |
48 | | typedef struct { |
49 | | plan_rdft super; |
50 | | plan *cld; |
51 | | twid *td, *td2; |
52 | | INT is, os; |
53 | | INT n; |
54 | | INT vl; |
55 | | INT ivs, ovs; |
56 | | rdft_kind kind; |
57 | | } P; |
58 | | |
59 | | static void apply_re11(const plan *ego_, R *I, R *O) |
60 | 0 | { |
61 | 0 | const P *ego = (const P *) ego_; |
62 | 0 | INT is = ego->is, os = ego->os; |
63 | 0 | INT i, n = ego->n, n2 = n/2; |
64 | 0 | INT iv, vl = ego->vl; |
65 | 0 | INT ivs = ego->ivs, ovs = ego->ovs; |
66 | 0 | R *W = ego->td->W; |
67 | 0 | R *W2; |
68 | 0 | R *buf; |
69 | |
|
70 | 0 | buf = (R *) MALLOC(sizeof(R) * n, BUFFERS); |
71 | |
|
72 | 0 | for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) { |
73 | 0 | buf[0] = K(2.0) * I[0]; |
74 | 0 | buf[n2] = K(2.0) * I[is * (n - 1)]; |
75 | 0 | for (i = 1; i + i < n2; ++i) { |
76 | 0 | INT k = i + i; |
77 | 0 | E a, b, a2, b2; |
78 | 0 | { |
79 | 0 | E u, v; |
80 | 0 | u = I[is * (k - 1)]; |
81 | 0 | v = I[is * k]; |
82 | 0 | a = u + v; |
83 | 0 | b2 = u - v; |
84 | 0 | } |
85 | 0 | { |
86 | 0 | E u, v; |
87 | 0 | u = I[is * (n - k - 1)]; |
88 | 0 | v = I[is * (n - k)]; |
89 | 0 | b = u + v; |
90 | 0 | a2 = u - v; |
91 | 0 | } |
92 | 0 | { |
93 | 0 | E wa, wb; |
94 | 0 | wa = W[2*i]; |
95 | 0 | wb = W[2*i + 1]; |
96 | 0 | { |
97 | 0 | E apb, amb; |
98 | 0 | apb = a + b; |
99 | 0 | amb = a - b; |
100 | 0 | buf[i] = wa * amb + wb * apb; |
101 | 0 | buf[n2 - i] = wa * apb - wb * amb; |
102 | 0 | } |
103 | 0 | { |
104 | 0 | E apb, amb; |
105 | 0 | apb = a2 + b2; |
106 | 0 | amb = a2 - b2; |
107 | 0 | buf[n2 + i] = wa * amb + wb * apb; |
108 | 0 | buf[n - i] = wa * apb - wb * amb; |
109 | 0 | } |
110 | 0 | } |
111 | 0 | } |
112 | 0 | if (i + i == n2) { |
113 | 0 | E u, v; |
114 | 0 | u = I[is * (n2 - 1)]; |
115 | 0 | v = I[is * n2]; |
116 | 0 | buf[i] = (u + v) * (W[2*i] * K(2.0)); |
117 | 0 | buf[n - i] = (u - v) * (W[2*i] * K(2.0)); |
118 | 0 | } |
119 | | |
120 | | |
121 | | /* child plan: two r2hc's of size n/2 */ |
122 | 0 | { |
123 | 0 | plan_rdft *cld = (plan_rdft *) ego->cld; |
124 | 0 | cld->apply((plan *) cld, buf, buf); |
125 | 0 | } |
126 | | |
127 | 0 | W2 = ego->td2->W; |
128 | 0 | { /* i == 0 case */ |
129 | 0 | E wa, wb; |
130 | 0 | E a, b; |
131 | 0 | wa = W2[0]; /* cos */ |
132 | 0 | wb = W2[1]; /* sin */ |
133 | 0 | a = buf[0]; |
134 | 0 | b = buf[n2]; |
135 | 0 | O[0] = wa * a + wb * b; |
136 | 0 | O[os * (n - 1)] = wb * a - wa * b; |
137 | 0 | } |
138 | 0 | W2 += 2; |
139 | 0 | for (i = 1; i + i < n2; ++i, W2 += 2) { |
140 | 0 | INT k; |
141 | 0 | E u, v, u2, v2; |
142 | 0 | u = buf[i]; |
143 | 0 | v = buf[n2 - i]; |
144 | 0 | u2 = buf[n2 + i]; |
145 | 0 | v2 = buf[n - i]; |
146 | 0 | k = (i + i) - 1; |
147 | 0 | { |
148 | 0 | E wa, wb; |
149 | 0 | E a, b; |
150 | 0 | wa = W2[0]; /* cos */ |
151 | 0 | wb = W2[1]; /* sin */ |
152 | 0 | a = u - v; |
153 | 0 | b = v2 - u2; |
154 | 0 | O[os * k] = wa * a + wb * b; |
155 | 0 | O[os * (n - 1 - k)] = wb * a - wa * b; |
156 | 0 | } |
157 | 0 | ++k; |
158 | 0 | W2 += 2; |
159 | 0 | { |
160 | 0 | E wa, wb; |
161 | 0 | E a, b; |
162 | 0 | wa = W2[0]; /* cos */ |
163 | 0 | wb = W2[1]; /* sin */ |
164 | 0 | a = u + v; |
165 | 0 | b = u2 + v2; |
166 | 0 | O[os * k] = wa * a + wb * b; |
167 | 0 | O[os * (n - 1 - k)] = wb * a - wa * b; |
168 | 0 | } |
169 | 0 | } |
170 | 0 | if (i + i == n2) { |
171 | 0 | INT k = (i + i) - 1; |
172 | 0 | E wa, wb; |
173 | 0 | E a, b; |
174 | 0 | wa = W2[0]; /* cos */ |
175 | 0 | wb = W2[1]; /* sin */ |
176 | 0 | a = buf[i]; |
177 | 0 | b = buf[n2 + i]; |
178 | 0 | O[os * k] = wa * a - wb * b; |
179 | 0 | O[os * (n - 1 - k)] = wb * a + wa * b; |
180 | 0 | } |
181 | 0 | } |
182 | |
|
183 | 0 | X(ifree)(buf); |
184 | 0 | } |
185 | | |
186 | | #if 0 |
187 | | |
188 | | /* This version of apply_re11 uses REDFT01 child plans, more similar |
189 | | to the original paper by Z. Wang. We keep it around for reference |
190 | | (it is simpler) and because it may become more efficient if we |
191 | | ever implement REDFT01 codelets. */ |
192 | | |
193 | | static void apply_re11(const plan *ego_, R *I, R *O) |
194 | | { |
195 | | const P *ego = (const P *) ego_; |
196 | | INT is = ego->is, os = ego->os; |
197 | | INT i, n = ego->n; |
198 | | INT iv, vl = ego->vl; |
199 | | INT ivs = ego->ivs, ovs = ego->ovs; |
200 | | R *W; |
201 | | R *buf; |
202 | | |
203 | | buf = (R *) MALLOC(sizeof(R) * n, BUFFERS); |
204 | | |
205 | | for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) { |
206 | | buf[0] = K(2.0) * I[0]; |
207 | | buf[n/2] = K(2.0) * I[is * (n - 1)]; |
208 | | for (i = 1; i + i < n; ++i) { |
209 | | INT k = i + i; |
210 | | E a, b; |
211 | | a = I[is * (k - 1)]; |
212 | | b = I[is * k]; |
213 | | buf[i] = a + b; |
214 | | buf[n - i] = a - b; |
215 | | } |
216 | | |
217 | | /* child plan: two redft01's (DCT-III) */ |
218 | | { |
219 | | plan_rdft *cld = (plan_rdft *) ego->cld; |
220 | | cld->apply((plan *) cld, buf, buf); |
221 | | } |
222 | | |
223 | | W = ego->td2->W; |
224 | | for (i = 0; i + 1 < n/2; ++i, W += 2) { |
225 | | { |
226 | | E wa, wb; |
227 | | E a, b; |
228 | | wa = W[0]; /* cos */ |
229 | | wb = W[1]; /* sin */ |
230 | | a = buf[i]; |
231 | | b = buf[n/2 + i]; |
232 | | O[os * i] = wa * a + wb * b; |
233 | | O[os * (n - 1 - i)] = wb * a - wa * b; |
234 | | } |
235 | | ++i; |
236 | | W += 2; |
237 | | { |
238 | | E wa, wb; |
239 | | E a, b; |
240 | | wa = W[0]; /* cos */ |
241 | | wb = W[1]; /* sin */ |
242 | | a = buf[i]; |
243 | | b = buf[n/2 + i]; |
244 | | O[os * i] = wa * a - wb * b; |
245 | | O[os * (n - 1 - i)] = wb * a + wa * b; |
246 | | } |
247 | | } |
248 | | if (i < n/2) { |
249 | | E wa, wb; |
250 | | E a, b; |
251 | | wa = W[0]; /* cos */ |
252 | | wb = W[1]; /* sin */ |
253 | | a = buf[i]; |
254 | | b = buf[n/2 + i]; |
255 | | O[os * i] = wa * a + wb * b; |
256 | | O[os * (n - 1 - i)] = wb * a - wa * b; |
257 | | } |
258 | | } |
259 | | |
260 | | X(ifree)(buf); |
261 | | } |
262 | | |
263 | | #endif /* 0 */ |
264 | | |
265 | | /* like for rodft01, rodft11 is obtained from redft11 by |
266 | | reversing the input and flipping the sign of every other output. */ |
267 | | static void apply_ro11(const plan *ego_, R *I, R *O) |
268 | 0 | { |
269 | 0 | const P *ego = (const P *) ego_; |
270 | 0 | INT is = ego->is, os = ego->os; |
271 | 0 | INT i, n = ego->n, n2 = n/2; |
272 | 0 | INT iv, vl = ego->vl; |
273 | 0 | INT ivs = ego->ivs, ovs = ego->ovs; |
274 | 0 | R *W = ego->td->W; |
275 | 0 | R *W2; |
276 | 0 | R *buf; |
277 | |
|
278 | 0 | buf = (R *) MALLOC(sizeof(R) * n, BUFFERS); |
279 | |
|
280 | 0 | for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) { |
281 | 0 | buf[0] = K(2.0) * I[is * (n - 1)]; |
282 | 0 | buf[n2] = K(2.0) * I[0]; |
283 | 0 | for (i = 1; i + i < n2; ++i) { |
284 | 0 | INT k = i + i; |
285 | 0 | E a, b, a2, b2; |
286 | 0 | { |
287 | 0 | E u, v; |
288 | 0 | u = I[is * (n - k)]; |
289 | 0 | v = I[is * (n - 1 - k)]; |
290 | 0 | a = u + v; |
291 | 0 | b2 = u - v; |
292 | 0 | } |
293 | 0 | { |
294 | 0 | E u, v; |
295 | 0 | u = I[is * (k)]; |
296 | 0 | v = I[is * (k - 1)]; |
297 | 0 | b = u + v; |
298 | 0 | a2 = u - v; |
299 | 0 | } |
300 | 0 | { |
301 | 0 | E wa, wb; |
302 | 0 | wa = W[2*i]; |
303 | 0 | wb = W[2*i + 1]; |
304 | 0 | { |
305 | 0 | E apb, amb; |
306 | 0 | apb = a + b; |
307 | 0 | amb = a - b; |
308 | 0 | buf[i] = wa * amb + wb * apb; |
309 | 0 | buf[n2 - i] = wa * apb - wb * amb; |
310 | 0 | } |
311 | 0 | { |
312 | 0 | E apb, amb; |
313 | 0 | apb = a2 + b2; |
314 | 0 | amb = a2 - b2; |
315 | 0 | buf[n2 + i] = wa * amb + wb * apb; |
316 | 0 | buf[n - i] = wa * apb - wb * amb; |
317 | 0 | } |
318 | 0 | } |
319 | 0 | } |
320 | 0 | if (i + i == n2) { |
321 | 0 | E u, v; |
322 | 0 | u = I[is * n2]; |
323 | 0 | v = I[is * (n2 - 1)]; |
324 | 0 | buf[i] = (u + v) * (W[2*i] * K(2.0)); |
325 | 0 | buf[n - i] = (u - v) * (W[2*i] * K(2.0)); |
326 | 0 | } |
327 | | |
328 | | |
329 | | /* child plan: two r2hc's of size n/2 */ |
330 | 0 | { |
331 | 0 | plan_rdft *cld = (plan_rdft *) ego->cld; |
332 | 0 | cld->apply((plan *) cld, buf, buf); |
333 | 0 | } |
334 | | |
335 | 0 | W2 = ego->td2->W; |
336 | 0 | { /* i == 0 case */ |
337 | 0 | E wa, wb; |
338 | 0 | E a, b; |
339 | 0 | wa = W2[0]; /* cos */ |
340 | 0 | wb = W2[1]; /* sin */ |
341 | 0 | a = buf[0]; |
342 | 0 | b = buf[n2]; |
343 | 0 | O[0] = wa * a + wb * b; |
344 | 0 | O[os * (n - 1)] = wa * b - wb * a; |
345 | 0 | } |
346 | 0 | W2 += 2; |
347 | 0 | for (i = 1; i + i < n2; ++i, W2 += 2) { |
348 | 0 | INT k; |
349 | 0 | E u, v, u2, v2; |
350 | 0 | u = buf[i]; |
351 | 0 | v = buf[n2 - i]; |
352 | 0 | u2 = buf[n2 + i]; |
353 | 0 | v2 = buf[n - i]; |
354 | 0 | k = (i + i) - 1; |
355 | 0 | { |
356 | 0 | E wa, wb; |
357 | 0 | E a, b; |
358 | 0 | wa = W2[0]; /* cos */ |
359 | 0 | wb = W2[1]; /* sin */ |
360 | 0 | a = v - u; |
361 | 0 | b = u2 - v2; |
362 | 0 | O[os * k] = wa * a + wb * b; |
363 | 0 | O[os * (n - 1 - k)] = wa * b - wb * a; |
364 | 0 | } |
365 | 0 | ++k; |
366 | 0 | W2 += 2; |
367 | 0 | { |
368 | 0 | E wa, wb; |
369 | 0 | E a, b; |
370 | 0 | wa = W2[0]; /* cos */ |
371 | 0 | wb = W2[1]; /* sin */ |
372 | 0 | a = u + v; |
373 | 0 | b = u2 + v2; |
374 | 0 | O[os * k] = wa * a + wb * b; |
375 | 0 | O[os * (n - 1 - k)] = wa * b - wb * a; |
376 | 0 | } |
377 | 0 | } |
378 | 0 | if (i + i == n2) { |
379 | 0 | INT k = (i + i) - 1; |
380 | 0 | E wa, wb; |
381 | 0 | E a, b; |
382 | 0 | wa = W2[0]; /* cos */ |
383 | 0 | wb = W2[1]; /* sin */ |
384 | 0 | a = buf[i]; |
385 | 0 | b = buf[n2 + i]; |
386 | 0 | O[os * k] = wb * b - wa * a; |
387 | 0 | O[os * (n - 1 - k)] = wa * b + wb * a; |
388 | 0 | } |
389 | 0 | } |
390 | |
|
391 | 0 | X(ifree)(buf); |
392 | 0 | } |
393 | | |
394 | | static void awake(plan *ego_, enum wakefulness wakefulness) |
395 | 0 | { |
396 | 0 | P *ego = (P *) ego_; |
397 | 0 | static const tw_instr reodft010e_tw[] = { |
398 | 0 | { TW_COS, 0, 1 }, |
399 | 0 | { TW_SIN, 0, 1 }, |
400 | 0 | { TW_NEXT, 1, 0 } |
401 | 0 | }; |
402 | 0 | static const tw_instr reodft11e_tw[] = { |
403 | 0 | { TW_COS, 1, 1 }, |
404 | 0 | { TW_SIN, 1, 1 }, |
405 | 0 | { TW_NEXT, 2, 0 } |
406 | 0 | }; |
407 | |
|
408 | 0 | X(plan_awake)(ego->cld, wakefulness); |
409 | |
|
410 | 0 | X(twiddle_awake)(wakefulness, &ego->td, reodft010e_tw, |
411 | 0 | 2*ego->n, 1, ego->n/4+1); |
412 | 0 | X(twiddle_awake)(wakefulness, &ego->td2, reodft11e_tw, |
413 | 0 | 8*ego->n, 1, ego->n); |
414 | 0 | } |
415 | | |
416 | | static void destroy(plan *ego_) |
417 | 0 | { |
418 | 0 | P *ego = (P *) ego_; |
419 | 0 | X(plan_destroy_internal)(ego->cld); |
420 | 0 | } |
421 | | |
422 | | static void print(const plan *ego_, printer *p) |
423 | 0 | { |
424 | 0 | const P *ego = (const P *) ego_; |
425 | 0 | p->print(p, "(%se-radix2-r2hc-%D%v%(%p%))", |
426 | 0 | X(rdft_kind_str)(ego->kind), ego->n, ego->vl, ego->cld); |
427 | 0 | } |
428 | | |
429 | | static int applicable0(const solver *ego_, const problem *p_) |
430 | 0 | { |
431 | 0 | const problem_rdft *p = (const problem_rdft *) p_; |
432 | 0 | UNUSED(ego_); |
433 | |
|
434 | 0 | return (1 |
435 | 0 | && p->sz->rnk == 1 |
436 | 0 | && p->vecsz->rnk <= 1 |
437 | 0 | && p->sz->dims[0].n % 2 == 0 |
438 | 0 | && (p->kind[0] == REDFT11 || p->kind[0] == RODFT11) |
439 | 0 | ); |
440 | 0 | } |
441 | | |
442 | | static int applicable(const solver *ego, const problem *p, const planner *plnr) |
443 | 320 | { |
444 | 320 | return (!NO_SLOWP(plnr) && applicable0(ego, p)); |
445 | 320 | } |
446 | | |
447 | | static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr) |
448 | 320 | { |
449 | 320 | P *pln; |
450 | 320 | const problem_rdft *p; |
451 | 320 | plan *cld; |
452 | 320 | R *buf; |
453 | 320 | INT n; |
454 | 320 | opcnt ops; |
455 | | |
456 | 320 | static const plan_adt padt = { |
457 | 320 | X(rdft_solve), awake, print, destroy |
458 | 320 | }; |
459 | | |
460 | 320 | if (!applicable(ego_, p_, plnr)) |
461 | 320 | return (plan *)0; |
462 | | |
463 | 0 | p = (const problem_rdft *) p_; |
464 | |
|
465 | 0 | n = p->sz->dims[0].n; |
466 | 0 | buf = (R *) MALLOC(sizeof(R) * n, BUFFERS); |
467 | |
|
468 | 0 | cld = X(mkplan_d)(plnr, X(mkproblem_rdft_1_d)(X(mktensor_1d)(n/2, 1, 1), |
469 | 0 | X(mktensor_1d)(2, n/2, n/2), |
470 | 0 | buf, buf, R2HC)); |
471 | 0 | X(ifree)(buf); |
472 | 0 | if (!cld) |
473 | 0 | return (plan *)0; |
474 | | |
475 | 0 | pln = MKPLAN_RDFT(P, &padt, p->kind[0]==REDFT11 ? apply_re11:apply_ro11); |
476 | 0 | pln->n = n; |
477 | 0 | pln->is = p->sz->dims[0].is; |
478 | 0 | pln->os = p->sz->dims[0].os; |
479 | 0 | pln->cld = cld; |
480 | 0 | pln->td = pln->td2 = 0; |
481 | 0 | pln->kind = p->kind[0]; |
482 | | |
483 | 0 | X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs); |
484 | | |
485 | 0 | X(ops_zero)(&ops); |
486 | 0 | ops.add = 2 + (n/2 - 1)/2 * 20; |
487 | 0 | ops.mul = 6 + (n/2 - 1)/2 * 16; |
488 | 0 | ops.other = 4*n + 2 + (n/2 - 1)/2 * 6; |
489 | 0 | if ((n/2) % 2 == 0) { |
490 | 0 | ops.add += 4; |
491 | 0 | ops.mul += 8; |
492 | 0 | ops.other += 4; |
493 | 0 | } |
494 | |
|
495 | 0 | X(ops_zero)(&pln->super.super.ops); |
496 | 0 | X(ops_madd2)(pln->vl, &ops, &pln->super.super.ops); |
497 | 0 | X(ops_madd2)(pln->vl, &cld->ops, &pln->super.super.ops); |
498 | |
|
499 | 0 | return &(pln->super.super); |
500 | 0 | } |
501 | | |
502 | | /* constructor */ |
503 | | static solver *mksolver(void) |
504 | 1 | { |
505 | 1 | static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 }; |
506 | 1 | S *slv = MKSOLVER(S, &sadt); |
507 | 1 | return &(slv->super); |
508 | 1 | } |
509 | | |
510 | | void X(reodft11e_radix2_r2hc_register)(planner *p) |
511 | 1 | { |
512 | 1 | REGISTER_SOLVER(p, mksolver()); |
513 | 1 | } |