/src/fftw3/reodft/reodft11e-radix2.c

Source
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */


/* Do an R{E,O}DFT11 problem of *even* size by a pair of R2HC problems
   of half the size, plus some pre/post-processing.  Use a trick from:

   Zhongde Wang, "On computing the discrete Fourier and cosine transforms,"
   IEEE Trans. Acoust. Speech Sig. Proc. ASSP-33 (4), 1341--1344 (1985).

   to re-express as a pair of half-size REDFT01 (DCT-III) problems.  Our
   implementation looks quite a bit different from the algorithm described
   in the paper because we combined the paper's pre/post-processing with
   the pre/post-processing used to turn REDFT01 into R2HC.  (Also, the
   paper uses a DCT/DST pair, but we turn the DST into a DCT via the
   usual reordering/sign-flip trick.  We additionally combined a couple
   of the matrices/transformations of the paper into a single pass.)

   NOTE: We originally used a simpler method by S. C. Chan and K. L. Ho
   that turned out to have numerical problems; see reodft11e-r2hc.c.

   (For odd sizes, see reodft11e-r2hc-odd.c.)
*/

#include "reodft/reodft.h"

typedef struct {
     solver super;
} S;

typedef struct {
     plan_rdft super;
     plan *cld;
     twid *td, *td2;
     INT is, os;
     INT n;
     INT vl;
     INT ivs, ovs;
     rdft_kind kind;
} P;

static void apply_re11(const plan *ego_, R *I, R *O)
{
     const P *ego = (const P *) ego_;
     INT is = ego->is, os = ego->os;
     INT i, n = ego->n, n2 = n/2;
     INT iv, vl = ego->vl;
     INT ivs = ego->ivs, ovs = ego->ovs;
     R *W = ego->td->W;
     R *W2;
     R *buf;

     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);

     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
    buf[0] = K(2.0) * I[0];
    buf[n2] = K(2.0) * I[is * (n - 1)];
    for (i = 1; i + i < n2; ++i) {
         INT k = i + i;
         E a, b, a2, b2;
         {
        E u, v;
        u = I[is * (k - 1)];
        v = I[is * k];
        a = u + v;
        b2 = u - v;
         }
         {
        E u, v;
        u = I[is * (n - k - 1)];
        v = I[is * (n - k)];
        b = u + v;
        a2 = u - v;
         }
         {
        E wa, wb;
        wa = W[2*i];
        wb = W[2*i + 1];
        {
       E apb, amb;
       apb = a + b;
       amb = a - b;
       buf[i] = wa * amb + wb * apb; 
       buf[n2 - i] = wa * apb - wb * amb; 
        }
        {
       E apb, amb;
       apb = a2 + b2;
       amb = a2 - b2;
       buf[n2 + i] = wa * amb + wb * apb; 
       buf[n - i] = wa * apb - wb * amb; 
        }
         }
    }
    if (i + i == n2) {
         E u, v;
         u = I[is * (n2 - 1)];
         v = I[is * n2];
         buf[i] = (u + v) * (W[2*i] * K(2.0));
         buf[n - i] = (u - v) * (W[2*i] * K(2.0));
    }


    /* child plan: two r2hc's of size n/2 */
    {
         plan_rdft *cld = (plan_rdft *) ego->cld;
         cld->apply((plan *) cld, buf, buf);
    }
    
    W2 = ego->td2->W;
    { /* i == 0 case */
         E wa, wb;
         E a, b;
         wa = W2[0]; /* cos */
         wb = W2[1]; /* sin */
         a = buf[0];
         b = buf[n2];
         O[0] = wa * a + wb * b;
         O[os * (n - 1)] = wb * a - wa * b;
    }
    W2 += 2;
    for (i = 1; i + i < n2; ++i, W2 += 2) {
         INT k;
         E u, v, u2, v2;
         u = buf[i];
         v = buf[n2 - i];
         u2 = buf[n2 + i];
         v2 = buf[n - i];
         k = (i + i) - 1;
         {
                    E wa, wb;
                    E a, b;
                    wa = W2[0]; /* cos */
                    wb = W2[1]; /* sin */
                    a = u - v;
                    b = v2 - u2;
                    O[os * k] = wa * a + wb * b;
                    O[os * (n - 1 - k)] = wb * a - wa * b;
               }
         ++k;
         W2 += 2;
         {
        E wa, wb;
        E a, b;
        wa = W2[0]; /* cos */
        wb = W2[1]; /* sin */
        a = u + v;
        b = u2 + v2;
        O[os * k] = wa * a + wb * b;
        O[os * (n - 1 - k)] = wb * a - wa * b;
         }
    }
    if (i + i == n2) {
         INT k = (i + i) - 1;
         E wa, wb;
         E a, b;
         wa = W2[0]; /* cos */
         wb = W2[1]; /* sin */
         a = buf[i];
         b = buf[n2 + i];
         O[os * k] = wa * a - wb * b;
         O[os * (n - 1 - k)] = wb * a + wa * b;
    }
     }

     X(ifree)(buf);
}

#if 0

/* This version of apply_re11 uses REDFT01 child plans, more similar
   to the original paper by Z. Wang.  We keep it around for reference
   (it is simpler) and because it may become more efficient if we
   ever implement REDFT01 codelets. */

static void apply_re11(const plan *ego_, R *I, R *O)
{
     const P *ego = (const P *) ego_;
     INT is = ego->is, os = ego->os;
     INT i, n = ego->n;
     INT iv, vl = ego->vl;
     INT ivs = ego->ivs, ovs = ego->ovs;
     R *W;
     R *buf;

     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);

     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
    buf[0] = K(2.0) * I[0];
    buf[n/2] = K(2.0) * I[is * (n - 1)];
    for (i = 1; i + i < n; ++i) {
         INT k = i + i;
         E a, b;
         a = I[is * (k - 1)];
         b = I[is * k];
         buf[i] = a + b;
         buf[n - i] = a - b;
    }

    /* child plan: two redft01's (DCT-III) */
    {
         plan_rdft *cld = (plan_rdft *) ego->cld;
         cld->apply((plan *) cld, buf, buf);
    }
    
    W = ego->td2->W;
    for (i = 0; i + 1 < n/2; ++i, W += 2) {
         {
        E wa, wb;
        E a, b;
        wa = W[0]; /* cos */
        wb = W[1]; /* sin */
        a = buf[i];
        b = buf[n/2 + i];
        O[os * i] = wa * a + wb * b;
        O[os * (n - 1 - i)] = wb * a - wa * b;
         }
         ++i;
         W += 2;
         {
                    E wa, wb;
                    E a, b;
                    wa = W[0]; /* cos */
                    wb = W[1]; /* sin */
                    a = buf[i];
                    b = buf[n/2 + i];
                    O[os * i] = wa * a - wb * b;
                    O[os * (n - 1 - i)] = wb * a + wa * b;
               }
    }
    if (i < n/2) {
         E wa, wb;
         E a, b;
         wa = W[0]; /* cos */
         wb = W[1]; /* sin */
         a = buf[i];
         b = buf[n/2 + i];
         O[os * i] = wa * a + wb * b;
         O[os * (n - 1 - i)] = wb * a - wa * b;
    }
     }

     X(ifree)(buf);
}

#endif /* 0 */

/* like for rodft01, rodft11 is obtained from redft11 by
   reversing the input and flipping the sign of every other output. */
static void apply_ro11(const plan *ego_, R *I, R *O)
{
     const P *ego = (const P *) ego_;
     INT is = ego->is, os = ego->os;
     INT i, n = ego->n, n2 = n/2;
     INT iv, vl = ego->vl;
     INT ivs = ego->ivs, ovs = ego->ovs;
     R *W = ego->td->W;
     R *W2;
     R *buf;

     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);

     for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
    buf[0] = K(2.0) * I[is * (n - 1)];
    buf[n2] = K(2.0) * I[0];
    for (i = 1; i + i < n2; ++i) {
         INT k = i + i;
         E a, b, a2, b2;
         {
        E u, v;
        u = I[is * (n - k)];
        v = I[is * (n - 1 - k)];
        a = u + v;
        b2 = u - v;
         }
         {
        E u, v;
        u = I[is * (k)];
        v = I[is * (k - 1)];
        b = u + v;
        a2 = u - v;
         }
         {
        E wa, wb;
        wa = W[2*i];
        wb = W[2*i + 1];
        {
       E apb, amb;
       apb = a + b;
       amb = a - b;
       buf[i] = wa * amb + wb * apb; 
       buf[n2 - i] = wa * apb - wb * amb; 
        }
        {
       E apb, amb;
       apb = a2 + b2;
       amb = a2 - b2;
       buf[n2 + i] = wa * amb + wb * apb; 
       buf[n - i] = wa * apb - wb * amb; 
        }
         }
    }
    if (i + i == n2) {
         E u, v;
         u = I[is * n2];
         v = I[is * (n2 - 1)];
         buf[i] = (u + v) * (W[2*i] * K(2.0));
         buf[n - i] = (u - v) * (W[2*i] * K(2.0));
    }


    /* child plan: two r2hc's of size n/2 */
    {
         plan_rdft *cld = (plan_rdft *) ego->cld;
         cld->apply((plan *) cld, buf, buf);
    }
    
    W2 = ego->td2->W;
    { /* i == 0 case */
         E wa, wb;
         E a, b;
         wa = W2[0]; /* cos */
         wb = W2[1]; /* sin */
         a = buf[0];
         b = buf[n2];
         O[0] = wa * a + wb * b;
         O[os * (n - 1)] = wa * b - wb * a;
    }
    W2 += 2;
    for (i = 1; i + i < n2; ++i, W2 += 2) {
         INT k;
         E u, v, u2, v2;
         u = buf[i];
         v = buf[n2 - i];
         u2 = buf[n2 + i];
         v2 = buf[n - i];
         k = (i + i) - 1;
         {
                    E wa, wb;
                    E a, b;
                    wa = W2[0]; /* cos */
                    wb = W2[1]; /* sin */
                    a = v - u;
                    b = u2 - v2;
                    O[os * k] = wa * a + wb * b;
                    O[os * (n - 1 - k)] = wa * b - wb * a;
               }
         ++k;
         W2 += 2;
         {
        E wa, wb;
        E a, b;
        wa = W2[0]; /* cos */
        wb = W2[1]; /* sin */
        a = u + v;
        b = u2 + v2;
        O[os * k] = wa * a + wb * b;
        O[os * (n - 1 - k)] = wa * b - wb * a;
         }
    }
    if (i + i == n2) {
         INT k = (i + i) - 1;
         E wa, wb;
         E a, b;
         wa = W2[0]; /* cos */
         wb = W2[1]; /* sin */
         a = buf[i];
         b = buf[n2 + i];
         O[os * k] = wb * b - wa * a;
         O[os * (n - 1 - k)] = wa * b + wb * a;
    }
     }

     X(ifree)(buf);
}

static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *ego = (P *) ego_;
     static const tw_instr reodft010e_tw[] = {
          { TW_COS, 0, 1 },
          { TW_SIN, 0, 1 },
          { TW_NEXT, 1, 0 }
     };
     static const tw_instr reodft11e_tw[] = {
          { TW_COS, 1, 1 },
          { TW_SIN, 1, 1 },
          { TW_NEXT, 2, 0 }
     };

     X(plan_awake)(ego->cld, wakefulness);

     X(twiddle_awake)(wakefulness, &ego->td, reodft010e_tw, 
          2*ego->n, 1, ego->n/4+1);
     X(twiddle_awake)(wakefulness, &ego->td2, reodft11e_tw, 
          8*ego->n, 1, ego->n);
}

static void destroy(plan *ego_)
{
     P *ego = (P *) ego_;
     X(plan_destroy_internal)(ego->cld);
}

static void print(const plan *ego_, printer *p)
{
     const P *ego = (const P *) ego_;
     p->print(p, "(%se-radix2-r2hc-%D%v%(%p%))",
        X(rdft_kind_str)(ego->kind), ego->n, ego->vl, ego->cld);
}

static int applicable0(const solver *ego_, const problem *p_)
{
     const problem_rdft *p = (const problem_rdft *) p_;
     UNUSED(ego_);

     return (1
       && p->sz->rnk == 1
       && p->vecsz->rnk <= 1
       && p->sz->dims[0].n % 2 == 0
       && (p->kind[0] == REDFT11 || p->kind[0] == RODFT11)
    );
}

static int applicable(const solver *ego, const problem *p, const planner *plnr)
{
     return (!NO_SLOWP(plnr) && applicable0(ego, p));
}

static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     P *pln;
     const problem_rdft *p;
     plan *cld;
     R *buf;
     INT n;
     opcnt ops;

     static const plan_adt padt = {
    X(rdft_solve), awake, print, destroy
     };

     if (!applicable(ego_, p_, plnr))
          return (plan *)0;

     p = (const problem_rdft *) p_;

     n = p->sz->dims[0].n;
     buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);

     cld = X(mkplan_d)(plnr, X(mkproblem_rdft_1_d)(X(mktensor_1d)(n/2, 1, 1),
                                                   X(mktensor_1d)(2, n/2, n/2),
                                                   buf, buf, R2HC));
     X(ifree)(buf);
     if (!cld)
          return (plan *)0;

     pln = MKPLAN_RDFT(P, &padt, p->kind[0]==REDFT11 ? apply_re11:apply_ro11);
     pln->n = n;
     pln->is = p->sz->dims[0].is;
     pln->os = p->sz->dims[0].os;
     pln->cld = cld;
     pln->td = pln->td2 = 0;
     pln->kind = p->kind[0];
     
     X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
     
     X(ops_zero)(&ops);
     ops.add = 2 + (n/2 - 1)/2 * 20;
     ops.mul = 6 + (n/2 - 1)/2 * 16;
     ops.other = 4*n + 2 + (n/2 - 1)/2 * 6;
     if ((n/2) % 2 == 0) {
    ops.add += 4;
    ops.mul += 8;
    ops.other += 4;
     }

     X(ops_zero)(&pln->super.super.ops);
     X(ops_madd2)(pln->vl, &ops, &pln->super.super.ops);
     X(ops_madd2)(pln->vl, &cld->ops, &pln->super.super.ops);

     return &(pln->super.super);
}

/* constructor */
static solver *mksolver(void)
{
     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
     S *slv = MKSOLVER(S, &sadt);
     return &(slv->super);
}

void X(reodft11e_radix2_r2hc_register)(planner *p)
{
     REGISTER_SOLVER(p, mksolver());
}

Coverage Report

Created: 2025-10-10 07:00

Line	Count	Source
1		/*
2		* Copyright (c) 2003, 2007-14 Matteo Frigo
3		* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4		*
5		* This program is free software; you can redistribute it and/or modify
6		* it under the terms of the GNU General Public License as published by
7		* the Free Software Foundation; either version 2 of the License, or
8		* (at your option) any later version.
9		*
10		* This program is distributed in the hope that it will be useful,
11		* but WITHOUT ANY WARRANTY; without even the implied warranty of
12		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13		* GNU General Public License for more details.
14		*
15		* You should have received a copy of the GNU General Public License
16		* along with this program; if not, write to the Free Software
17		* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18		*
19		*/
20
21
22		/* Do an R{E,O}DFT11 problem of even size by a pair of R2HC problems
23		of half the size, plus some pre/post-processing. Use a trick from:
24
25		Zhongde Wang, "On computing the discrete Fourier and cosine transforms,"
26		IEEE Trans. Acoust. Speech Sig. Proc. ASSP-33 (4), 1341--1344 (1985).
27
28		to re-express as a pair of half-size REDFT01 (DCT-III) problems. Our
29		implementation looks quite a bit different from the algorithm described
30		in the paper because we combined the paper's pre/post-processing with
31		the pre/post-processing used to turn REDFT01 into R2HC. (Also, the
32		paper uses a DCT/DST pair, but we turn the DST into a DCT via the
33		usual reordering/sign-flip trick. We additionally combined a couple
34		of the matrices/transformations of the paper into a single pass.)
35
36		NOTE: We originally used a simpler method by S. C. Chan and K. L. Ho
37		that turned out to have numerical problems; see reodft11e-r2hc.c.
38
39		(For odd sizes, see reodft11e-r2hc-odd.c.)
40		*/
41
42		#include "reodft/reodft.h"
43
44		typedef struct {
45		solver super;
46		} S;
47
48		typedef struct {
49		plan_rdft super;
50		plan *cld;
51		twid td, td2;
52		INT is, os;
53		INT n;
54		INT vl;
55		INT ivs, ovs;
56		rdft_kind kind;
57		} P;
58
59		static void apply_re11(const plan ego_, R I, R *O)
60	0	{
61	0	const P ego = (const P ) ego_;
62	0	INT is = ego->is, os = ego->os;
63	0	INT i, n = ego->n, n2 = n/2;
64	0	INT iv, vl = ego->vl;
65	0	INT ivs = ego->ivs, ovs = ego->ovs;
66	0	R *W = ego->td->W;
67	0	R *W2;
68	0	R *buf;
69
70	0	buf = (R ) MALLOC(sizeof(R) n, BUFFERS);
71
72	0	for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
73	0	buf[0] = K(2.0) * I[0];
74	0	buf[n2] = K(2.0) * I[is * (n - 1)];
75	0	for (i = 1; i + i < n2; ++i) {
76	0	INT k = i + i;
77	0	E a, b, a2, b2;
78	0	{
79	0	E u, v;
80	0	u = I[is * (k - 1)];
81	0	v = I[is * k];
82	0	a = u + v;
83	0	b2 = u - v;
84	0	}
85	0	{
86	0	E u, v;
87	0	u = I[is * (n - k - 1)];
88	0	v = I[is * (n - k)];
89	0	b = u + v;
90	0	a2 = u - v;
91	0	}
92	0	{
93	0	E wa, wb;
94	0	wa = W[2*i];
95	0	wb = W[2*i + 1];
96	0	{
97	0	E apb, amb;
98	0	apb = a + b;
99	0	amb = a - b;
100	0	buf[i] = wa * amb + wb * apb;
101	0	buf[n2 - i] = wa * apb - wb * amb;
102	0	}
103	0	{
104	0	E apb, amb;
105	0	apb = a2 + b2;
106	0	amb = a2 - b2;
107	0	buf[n2 + i] = wa * amb + wb * apb;
108	0	buf[n - i] = wa * apb - wb * amb;
109	0	}
110	0	}
111	0	}
112	0	if (i + i == n2) {
113	0	E u, v;
114	0	u = I[is * (n2 - 1)];
115	0	v = I[is * n2];
116	0	buf[i] = (u + v) * (W[2i] K(2.0));
117	0	buf[n - i] = (u - v) * (W[2i] K(2.0));
118	0	}
119
120
121		/* child plan: two r2hc's of size n/2 */
122	0	{
123	0	plan_rdft cld = (plan_rdft ) ego->cld;
124	0	cld->apply((plan *) cld, buf, buf);
125	0	}
126
127	0	W2 = ego->td2->W;
128	0	{ /* i == 0 case */
129	0	E wa, wb;
130	0	E a, b;
131	0	wa = W2[0]; /* cos */
132	0	wb = W2[1]; /* sin */
133	0	a = buf[0];
134	0	b = buf[n2];
135	0	O[0] = wa * a + wb * b;
136	0	O[os * (n - 1)] = wb * a - wa * b;
137	0	}
138	0	W2 += 2;
139	0	for (i = 1; i + i < n2; ++i, W2 += 2) {
140	0	INT k;
141	0	E u, v, u2, v2;
142	0	u = buf[i];
143	0	v = buf[n2 - i];
144	0	u2 = buf[n2 + i];
145	0	v2 = buf[n - i];
146	0	k = (i + i) - 1;
147	0	{
148	0	E wa, wb;
149	0	E a, b;
150	0	wa = W2[0]; /* cos */
151	0	wb = W2[1]; /* sin */
152	0	a = u - v;
153	0	b = v2 - u2;
154	0	O[os * k] = wa * a + wb * b;
155	0	O[os * (n - 1 - k)] = wb * a - wa * b;
156	0	}
157	0	++k;
158	0	W2 += 2;
159	0	{
160	0	E wa, wb;
161	0	E a, b;
162	0	wa = W2[0]; /* cos */
163	0	wb = W2[1]; /* sin */
164	0	a = u + v;
165	0	b = u2 + v2;
166	0	O[os * k] = wa * a + wb * b;
167	0	O[os * (n - 1 - k)] = wb * a - wa * b;
168	0	}
169	0	}
170	0	if (i + i == n2) {
171	0	INT k = (i + i) - 1;
172	0	E wa, wb;
173	0	E a, b;
174	0	wa = W2[0]; /* cos */
175	0	wb = W2[1]; /* sin */
176	0	a = buf[i];
177	0	b = buf[n2 + i];
178	0	O[os * k] = wa * a - wb * b;
179	0	O[os * (n - 1 - k)] = wb * a + wa * b;
180	0	}
181	0	}
182
183	0	X(ifree)(buf);
184	0	}
185
186		#if 0
187
188		/* This version of apply_re11 uses REDFT01 child plans, more similar
189		to the original paper by Z. Wang. We keep it around for reference
190		(it is simpler) and because it may become more efficient if we
191		ever implement REDFT01 codelets. */
192
193		static void apply_re11(const plan ego_, R I, R *O)
194		{
195		const P ego = (const P ) ego_;
196		INT is = ego->is, os = ego->os;
197		INT i, n = ego->n;
198		INT iv, vl = ego->vl;
199		INT ivs = ego->ivs, ovs = ego->ovs;
200		R *W;
201		R *buf;
202
203		buf = (R ) MALLOC(sizeof(R) n, BUFFERS);
204
205		for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
206		buf[0] = K(2.0) * I[0];
207		buf[n/2] = K(2.0) * I[is * (n - 1)];
208		for (i = 1; i + i < n; ++i) {
209		INT k = i + i;
210		E a, b;
211		a = I[is * (k - 1)];
212		b = I[is * k];
213		buf[i] = a + b;
214		buf[n - i] = a - b;
215		}
216
217		/* child plan: two redft01's (DCT-III) */
218		{
219		plan_rdft cld = (plan_rdft ) ego->cld;
220		cld->apply((plan *) cld, buf, buf);
221		}
222
223		W = ego->td2->W;
224		for (i = 0; i + 1 < n/2; ++i, W += 2) {
225		{
226		E wa, wb;
227		E a, b;
228		wa = W[0]; /* cos */
229		wb = W[1]; /* sin */
230		a = buf[i];
231		b = buf[n/2 + i];
232		O[os * i] = wa * a + wb * b;
233		O[os * (n - 1 - i)] = wb * a - wa * b;
234		}
235		++i;
236		W += 2;
237		{
238		E wa, wb;
239		E a, b;
240		wa = W[0]; /* cos */
241		wb = W[1]; /* sin */
242		a = buf[i];
243		b = buf[n/2 + i];
244		O[os * i] = wa * a - wb * b;
245		O[os * (n - 1 - i)] = wb * a + wa * b;
246		}
247		}
248		if (i < n/2) {
249		E wa, wb;
250		E a, b;
251		wa = W[0]; /* cos */
252		wb = W[1]; /* sin */
253		a = buf[i];
254		b = buf[n/2 + i];
255		O[os * i] = wa * a + wb * b;
256		O[os * (n - 1 - i)] = wb * a - wa * b;
257		}
258		}
259
260		X(ifree)(buf);
261		}
262
263		#endif /* 0 */
264
265		/* like for rodft01, rodft11 is obtained from redft11 by
266		reversing the input and flipping the sign of every other output. */
267		static void apply_ro11(const plan ego_, R I, R *O)
268	0	{
269	0	const P ego = (const P ) ego_;
270	0	INT is = ego->is, os = ego->os;
271	0	INT i, n = ego->n, n2 = n/2;
272	0	INT iv, vl = ego->vl;
273	0	INT ivs = ego->ivs, ovs = ego->ovs;
274	0	R *W = ego->td->W;
275	0	R *W2;
276	0	R *buf;
277
278	0	buf = (R ) MALLOC(sizeof(R) n, BUFFERS);
279
280	0	for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
281	0	buf[0] = K(2.0) * I[is * (n - 1)];
282	0	buf[n2] = K(2.0) * I[0];
283	0	for (i = 1; i + i < n2; ++i) {
284	0	INT k = i + i;
285	0	E a, b, a2, b2;
286	0	{
287	0	E u, v;
288	0	u = I[is * (n - k)];
289	0	v = I[is * (n - 1 - k)];
290	0	a = u + v;
291	0	b2 = u - v;
292	0	}
293	0	{
294	0	E u, v;
295	0	u = I[is * (k)];
296	0	v = I[is * (k - 1)];
297	0	b = u + v;
298	0	a2 = u - v;
299	0	}
300	0	{
301	0	E wa, wb;
302	0	wa = W[2*i];
303	0	wb = W[2*i + 1];
304	0	{
305	0	E apb, amb;
306	0	apb = a + b;
307	0	amb = a - b;
308	0	buf[i] = wa * amb + wb * apb;
309	0	buf[n2 - i] = wa * apb - wb * amb;
310	0	}
311	0	{
312	0	E apb, amb;
313	0	apb = a2 + b2;
314	0	amb = a2 - b2;
315	0	buf[n2 + i] = wa * amb + wb * apb;
316	0	buf[n - i] = wa * apb - wb * amb;
317	0	}
318	0	}
319	0	}
320	0	if (i + i == n2) {
321	0	E u, v;
322	0	u = I[is * n2];
323	0	v = I[is * (n2 - 1)];
324	0	buf[i] = (u + v) * (W[2i] K(2.0));
325	0	buf[n - i] = (u - v) * (W[2i] K(2.0));
326	0	}
327
328
329		/* child plan: two r2hc's of size n/2 */
330	0	{
331	0	plan_rdft cld = (plan_rdft ) ego->cld;
332	0	cld->apply((plan *) cld, buf, buf);
333	0	}
334
335	0	W2 = ego->td2->W;
336	0	{ /* i == 0 case */
337	0	E wa, wb;
338	0	E a, b;
339	0	wa = W2[0]; /* cos */
340	0	wb = W2[1]; /* sin */
341	0	a = buf[0];
342	0	b = buf[n2];
343	0	O[0] = wa * a + wb * b;
344	0	O[os * (n - 1)] = wa * b - wb * a;
345	0	}
346	0	W2 += 2;
347	0	for (i = 1; i + i < n2; ++i, W2 += 2) {
348	0	INT k;
349	0	E u, v, u2, v2;
350	0	u = buf[i];
351	0	v = buf[n2 - i];
352	0	u2 = buf[n2 + i];
353	0	v2 = buf[n - i];
354	0	k = (i + i) - 1;
355	0	{
356	0	E wa, wb;
357	0	E a, b;
358	0	wa = W2[0]; /* cos */
359	0	wb = W2[1]; /* sin */
360	0	a = v - u;
361	0	b = u2 - v2;
362	0	O[os * k] = wa * a + wb * b;
363	0	O[os * (n - 1 - k)] = wa * b - wb * a;
364	0	}
365	0	++k;
366	0	W2 += 2;
367	0	{
368	0	E wa, wb;
369	0	E a, b;
370	0	wa = W2[0]; /* cos */
371	0	wb = W2[1]; /* sin */
372	0	a = u + v;
373	0	b = u2 + v2;
374	0	O[os * k] = wa * a + wb * b;
375	0	O[os * (n - 1 - k)] = wa * b - wb * a;
376	0	}
377	0	}
378	0	if (i + i == n2) {
379	0	INT k = (i + i) - 1;
380	0	E wa, wb;
381	0	E a, b;
382	0	wa = W2[0]; /* cos */
383	0	wb = W2[1]; /* sin */
384	0	a = buf[i];
385	0	b = buf[n2 + i];
386	0	O[os * k] = wb * b - wa * a;
387	0	O[os * (n - 1 - k)] = wa * b + wb * a;
388	0	}
389	0	}
390
391	0	X(ifree)(buf);
392	0	}
393
394		static void awake(plan *ego_, enum wakefulness wakefulness)
395	0	{
396	0	P ego = (P ) ego_;
397	0	static const tw_instr reodft010e_tw[] = {
398	0	{ TW_COS, 0, 1 },
399	0	{ TW_SIN, 0, 1 },
400	0	{ TW_NEXT, 1, 0 }
401	0	};
402	0	static const tw_instr reodft11e_tw[] = {
403	0	{ TW_COS, 1, 1 },
404	0	{ TW_SIN, 1, 1 },
405	0	{ TW_NEXT, 2, 0 }
406	0	};
407
408	0	X(plan_awake)(ego->cld, wakefulness);
409
410	0	X(twiddle_awake)(wakefulness, &ego->td, reodft010e_tw,
411	0	2*ego->n, 1, ego->n/4+1);
412	0	X(twiddle_awake)(wakefulness, &ego->td2, reodft11e_tw,
413	0	8*ego->n, 1, ego->n);
414	0	}
415
416		static void destroy(plan *ego_)
417	0	{
418	0	P ego = (P ) ego_;
419	0	X(plan_destroy_internal)(ego->cld);
420	0	}
421
422		static void print(const plan ego_, printer p)
423	0	{
424	0	const P ego = (const P ) ego_;
425	0	p->print(p, "(%se-radix2-r2hc-%D%v%(%p%))",
426	0	X(rdft_kind_str)(ego->kind), ego->n, ego->vl, ego->cld);
427	0	}
428
429		static int applicable0(const solver ego_, const problem p_)
430	0	{
431	0	const problem_rdft p = (const problem_rdft ) p_;
432	0	UNUSED(ego_);
433
434	0	return (1
435	0	&& p->sz->rnk == 1
436	0	&& p->vecsz->rnk <= 1
437	0	&& p->sz->dims[0].n % 2 == 0
438	0	&& (p->kind[0] == REDFT11 \|\| p->kind[0] == RODFT11)
439	0	);
440	0	}
441
442		static int applicable(const solver ego, const problem p, const planner *plnr)
443	320	{
444	320	return (!NO_SLOWP(plnr) && applicable0(ego, p));
445	320	}
446
447		static plan mkplan(const solver ego_, const problem p_, planner plnr)
448	320	{
449	320	P *pln;
450	320	const problem_rdft *p;
451	320	plan *cld;
452	320	R *buf;
453	320	INT n;
454	320	opcnt ops;
455
456	320	static const plan_adt padt = {
457	320	X(rdft_solve), awake, print, destroy
458	320	};
459
460	320	if (!applicable(ego_, p_, plnr))
461	320	return (plan *)0;
462
463	0	p = (const problem_rdft *) p_;
464
465	0	n = p->sz->dims[0].n;
466	0	buf = (R ) MALLOC(sizeof(R) n, BUFFERS);
467
468	0	cld = X(mkplan_d)(plnr, X(mkproblem_rdft_1_d)(X(mktensor_1d)(n/2, 1, 1),
469	0	X(mktensor_1d)(2, n/2, n/2),
470	0	buf, buf, R2HC));
471	0	X(ifree)(buf);
472	0	if (!cld)
473	0	return (plan *)0;
474
475	0	pln = MKPLAN_RDFT(P, &padt, p->kind[0]==REDFT11 ? apply_re11:apply_ro11);
476	0	pln->n = n;
477	0	pln->is = p->sz->dims[0].is;
478	0	pln->os = p->sz->dims[0].os;
479	0	pln->cld = cld;
480	0	pln->td = pln->td2 = 0;
481	0	pln->kind = p->kind[0];
482
483	0	X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
484
485	0	X(ops_zero)(&ops);
486	0	ops.add = 2 + (n/2 - 1)/2 * 20;
487	0	ops.mul = 6 + (n/2 - 1)/2 * 16;
488	0	ops.other = 4n + 2 + (n/2 - 1)/2 6;
489	0	if ((n/2) % 2 == 0) {
490	0	ops.add += 4;
491	0	ops.mul += 8;
492	0	ops.other += 4;
493	0	}
494
495	0	X(ops_zero)(&pln->super.super.ops);
496	0	X(ops_madd2)(pln->vl, &ops, &pln->super.super.ops);
497	0	X(ops_madd2)(pln->vl, &cld->ops, &pln->super.super.ops);
498
499	0	return &(pln->super.super);
500	0	}
501
502		/* constructor */
503		static solver *mksolver(void)
504	1	{
505	1	static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
506	1	S *slv = MKSOLVER(S, &sadt);
507	1	return &(slv->super);
508	1	}
509
510		void X(reodft11e_radix2_r2hc_register)(planner *p)
511	1	{
512	1	REGISTER_SOLVER(p, mksolver());
513	1	}