/src/fftw3/dft/buffered.c

Source
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */


#include "dft/dft.h"

typedef struct {
     solver super;
     size_t maxnbuf_ndx;
} S;

static const INT maxnbufs[] = { 8, 256 };

typedef struct {
     plan_dft super;

     plan *cld, *cldcpy, *cldrest;
     INT n, vl, nbuf, bufdist;
     INT ivs_by_nbuf, ovs_by_nbuf;
     INT roffset, ioffset;
} P;

/* transform a vector input with the help of bufs */
static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *ego = (const P *) ego_;
     INT nbuf = ego->nbuf;
     R *bufs = (R *)MALLOC(sizeof(R) * nbuf * ego->bufdist * 2, BUFFERS);

     plan_dft *cld = (plan_dft *) ego->cld;
     plan_dft *cldcpy = (plan_dft *) ego->cldcpy;
     plan_dft *cldrest;
     INT i, vl = ego->vl;
     INT ivs_by_nbuf = ego->ivs_by_nbuf, ovs_by_nbuf = ego->ovs_by_nbuf;
     INT roffset = ego->roffset, ioffset = ego->ioffset;

     for (i = nbuf; i <= vl; i += nbuf) {
          /* transform to bufs: */
          cld->apply((plan *) cld, ri, ii, bufs + roffset, bufs + ioffset);
    ri += ivs_by_nbuf; ii += ivs_by_nbuf;

          /* copy back */
          cldcpy->apply((plan *) cldcpy, bufs+roffset, bufs+ioffset, ro, io);
    ro += ovs_by_nbuf; io += ovs_by_nbuf;
     }

     X(ifree)(bufs);

     /* Do the remaining transforms, if any: */
     cldrest = (plan_dft *) ego->cldrest;
     cldrest->apply((plan *) cldrest, ri, ii, ro, io);
}


static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *ego = (P *) ego_;

     X(plan_awake)(ego->cld, wakefulness);
     X(plan_awake)(ego->cldcpy, wakefulness);
     X(plan_awake)(ego->cldrest, wakefulness);
}

static void destroy(plan *ego_)
{
     P *ego = (P *) ego_;
     X(plan_destroy_internal)(ego->cldrest);
     X(plan_destroy_internal)(ego->cldcpy);
     X(plan_destroy_internal)(ego->cld);
}

static void print(const plan *ego_, printer *p)
{
     const P *ego = (const P *) ego_;
     p->print(p, "(dft-buffered-%D%v/%D-%D%(%p%)%(%p%)%(%p%))",
              ego->n, ego->nbuf,
              ego->vl, ego->bufdist % ego->n,
              ego->cld, ego->cldcpy, ego->cldrest);
}

static int applicable0(const S *ego, const problem *p_, const planner *plnr)
{
     const problem_dft *p = (const problem_dft *) p_;
     const iodim *d = p->sz->dims;

     if (1
   && p->vecsz->rnk <= 1
   && p->sz->rnk == 1
    ) {
    INT vl, ivs, ovs;
    X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);

    if (X(toobig)(p->sz->dims[0].n) && CONSERVE_MEMORYP(plnr))
         return 0;

    /* if this solver is redundant, in the sense that a solver
       of lower index generates the same plan, then prune this
       solver */
    if (X(nbuf_redundant)(d[0].n, vl, 
        ego->maxnbuf_ndx,
        maxnbufs, NELEM(maxnbufs)))
         return 0;

    /*
      In principle, the buffered transforms might be useful
      when working out of place.  However, in order to
      prevent infinite loops in the planner, we require
      that the output stride of the buffered transforms be
      greater than 2.
    */
    if (p->ri != p->ro)
         return (d[0].os > 2);

    /*
     * If the problem is in place, the input/output strides must
     * be the same or the whole thing must fit in the buffer.
     */
    if (X(tensor_inplace_strides2)(p->sz, p->vecsz))
         return 1;

    if (/* fits into buffer: */
         ((p->vecsz->rnk == 0)
    ||
    (X(nbuf)(d[0].n, p->vecsz->dims[0].n, 
       maxnbufs[ego->maxnbuf_ndx]) 
     == p->vecsz->dims[0].n)))
         return 1;
     }

     return 0;
}

static int applicable(const S *ego, const problem *p_, const planner *plnr)
{
     if (NO_BUFFERINGP(plnr)) return 0;
     if (!applicable0(ego, p_, plnr)) return 0;

     if (NO_UGLYP(plnr)) {
    const problem_dft *p = (const problem_dft *) p_;
    if (p->ri != p->ro) return 0;
    if (X(toobig)(p->sz->dims[0].n)) return 0;
     }
     return 1;
}

static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     P *pln;
     const S *ego = (const S *)ego_;
     plan *cld = (plan *) 0;
     plan *cldcpy = (plan *) 0;
     plan *cldrest = (plan *) 0;
     const problem_dft *p = (const problem_dft *) p_;
     R *bufs = (R *) 0;
     INT nbuf = 0, bufdist, n, vl;
     INT ivs, ovs, roffset, ioffset;

     static const plan_adt padt = {
    X(dft_solve), awake, print, destroy
     };

     if (!applicable(ego, p_, plnr))
          goto nada;

     n = X(tensor_sz)(p->sz);

     X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);

     nbuf = X(nbuf)(n, vl, maxnbufs[ego->maxnbuf_ndx]);
     bufdist = X(bufdist)(n, vl);
     A(nbuf > 0);

     /* attempt to keep real and imaginary part in the same order,
  so as to allow optimizations in the the copy plan */
     roffset = (p->ri - p->ii > 0) ? (INT)1 : (INT)0;
     ioffset = 1 - roffset;

     /* initial allocation for the purpose of planning */
     bufs = (R *) MALLOC(sizeof(R) * nbuf * bufdist * 2, BUFFERS);

     /* allow destruction of input if problem is in place */
     cld = X(mkplan_f_d)(plnr,
       X(mkproblem_dft_d)(
            X(mktensor_1d)(n, p->sz->dims[0].is, 2),
            X(mktensor_1d)(nbuf, ivs, bufdist * 2),
            TAINT(p->ri, ivs * nbuf),
            TAINT(p->ii, ivs * nbuf),
            bufs + roffset, 
            bufs + ioffset),
       0, 0, (p->ri == p->ro) ? NO_DESTROY_INPUT : 0);
     if (!cld)
          goto nada;

     /* copying back from the buffer is a rank-0 transform: */
     cldcpy = X(mkplan_d)(plnr,
        X(mkproblem_dft_d)(
             X(mktensor_0d)(),
             X(mktensor_2d)(nbuf, bufdist * 2, ovs,
                n, 2, p->sz->dims[0].os),
             bufs + roffset, 
             bufs + ioffset, 
             TAINT(p->ro, ovs * nbuf), 
             TAINT(p->io, ovs * nbuf)));
     if (!cldcpy)
          goto nada;

     /* deallocate buffers, let apply() allocate them for real */
     X(ifree)(bufs);
     bufs = 0;

     /* plan the leftover transforms (cldrest): */
     {
    INT id = ivs * (nbuf * (vl / nbuf));
    INT od = ovs * (nbuf * (vl / nbuf));
    cldrest = X(mkplan_d)(plnr, 
        X(mkproblem_dft_d)(
             X(tensor_copy)(p->sz),
             X(mktensor_1d)(vl % nbuf, ivs, ovs),
             p->ri+id, p->ii+id, p->ro+od, p->io+od));
     }
     if (!cldrest)
          goto nada;

     pln = MKPLAN_DFT(P, &padt, apply);
     pln->cld = cld;
     pln->cldcpy = cldcpy;
     pln->cldrest = cldrest;
     pln->n = n;
     pln->vl = vl;
     pln->ivs_by_nbuf = ivs * nbuf;
     pln->ovs_by_nbuf = ovs * nbuf;
     pln->roffset = roffset;
     pln->ioffset = ioffset;

     pln->nbuf = nbuf;
     pln->bufdist = bufdist;

     {
    opcnt t;
    X(ops_add)(&cld->ops, &cldcpy->ops, &t);
    X(ops_madd)(vl / nbuf, &t, &cldrest->ops, &pln->super.super.ops);
     }

     return &(pln->super.super);

 nada:
     X(ifree0)(bufs);
     X(plan_destroy_internal)(cldrest);
     X(plan_destroy_internal)(cldcpy);
     X(plan_destroy_internal)(cld);
     return (plan *) 0;
}

static solver *mksolver(size_t maxnbuf_ndx)
{
     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
     S *slv = MKSOLVER(S, &sadt);
     slv->maxnbuf_ndx = maxnbuf_ndx;
     return &(slv->super);
}

void X(dft_buffered_register)(planner *p)
{
     size_t i;
     for (i = 0; i < NELEM(maxnbufs); ++i)
    REGISTER_SOLVER(p, mksolver(i));
}

Coverage Report

Created: 2026-05-30 06:15

Line	Count	Source
1		/*
2		* Copyright (c) 2003, 2007-14 Matteo Frigo
3		* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4		*
5		* This program is free software; you can redistribute it and/or modify
6		* it under the terms of the GNU General Public License as published by
7		* the Free Software Foundation; either version 2 of the License, or
8		* (at your option) any later version.
9		*
10		* This program is distributed in the hope that it will be useful,
11		* but WITHOUT ANY WARRANTY; without even the implied warranty of
12		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13		* GNU General Public License for more details.
14		*
15		* You should have received a copy of the GNU General Public License
16		* along with this program; if not, write to the Free Software
17		* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18		*
19		*/
20
21
22		#include "dft/dft.h"
23
24		typedef struct {
25		solver super;
26		size_t maxnbuf_ndx;
27		} S;
28
29		static const INT maxnbufs[] = { 8, 256 };
30
31		typedef struct {
32		plan_dft super;
33
34		plan cld, cldcpy, *cldrest;
35		INT n, vl, nbuf, bufdist;
36		INT ivs_by_nbuf, ovs_by_nbuf;
37		INT roffset, ioffset;
38		} P;
39
40		/* transform a vector input with the help of bufs */
41		static void apply(const plan ego_, R ri, R ii, R ro, R *io)
42	65	{
43	65	const P ego = (const P ) ego_;
44	65	INT nbuf = ego->nbuf;
45	65	R bufs = (R )MALLOC(sizeof(R) * nbuf * ego->bufdist * 2, BUFFERS);
46
47	65	plan_dft cld = (plan_dft ) ego->cld;
48	65	plan_dft cldcpy = (plan_dft ) ego->cldcpy;
49	65	plan_dft *cldrest;
50	65	INT i, vl = ego->vl;
51	65	INT ivs_by_nbuf = ego->ivs_by_nbuf, ovs_by_nbuf = ego->ovs_by_nbuf;
52	65	INT roffset = ego->roffset, ioffset = ego->ioffset;
53
54	130	for (i = nbuf; i <= vl; i += nbuf) {
55		/* transform to bufs: */
56	65	cld->apply((plan *) cld, ri, ii, bufs + roffset, bufs + ioffset);
57	65	ri += ivs_by_nbuf; ii += ivs_by_nbuf;
58
59		/* copy back */
60	65	cldcpy->apply((plan *) cldcpy, bufs+roffset, bufs+ioffset, ro, io);
61	65	ro += ovs_by_nbuf; io += ovs_by_nbuf;
62	65	}
63
64	65	X(ifree)(bufs);
65
66		/* Do the remaining transforms, if any: */
67	65	cldrest = (plan_dft *) ego->cldrest;
68	65	cldrest->apply((plan *) cldrest, ri, ii, ro, io);
69	65	}
70
71
72		static void awake(plan *ego_, enum wakefulness wakefulness)
73	50	{
74	50	P ego = (P ) ego_;
75
76	50	X(plan_awake)(ego->cld, wakefulness);
77	50	X(plan_awake)(ego->cldcpy, wakefulness);
78	50	X(plan_awake)(ego->cldrest, wakefulness);
79	50	}
80
81		static void destroy(plan *ego_)
82	200	{
83	200	P ego = (P ) ego_;
84	200	X(plan_destroy_internal)(ego->cldrest);
85	200	X(plan_destroy_internal)(ego->cldcpy);
86	200	X(plan_destroy_internal)(ego->cld);
87	200	}
88
89		static void print(const plan ego_, printer p)
90	0	{
91	0	const P ego = (const P ) ego_;
92	0	p->print(p, "(dft-buffered-%D%v/%D-%D%(%p%)%(%p%)%(%p%))",
93	0	ego->n, ego->nbuf,
94	0	ego->vl, ego->bufdist % ego->n,
95	0	ego->cld, ego->cldcpy, ego->cldrest);
96	0	}
97
98		static int applicable0(const S ego, const problem p_, const planner *plnr)
99	2.26k	{
100	2.26k	const problem_dft p = (const problem_dft ) p_;
101	2.26k	const iodim *d = p->sz->dims;
102
103	2.26k	if (1
104	2.26k	&& p->vecsz->rnk <= 1
105	1.23k	&& p->sz->rnk == 1
106	2.26k	) {
107	1.19k	INT vl, ivs, ovs;
108	1.19k	X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
109
110	1.19k	if (X(toobig)(p->sz->dims[0].n) && CONSERVE_MEMORYP(plnr))
111	0	return 0;
112
113		/* if this solver is redundant, in the sense that a solver
114		of lower index generates the same plan, then prune this
115		solver */
116	1.19k	if (X(nbuf_redundant)(d[0].n, vl,
117	1.19k	ego->maxnbuf_ndx,
118	1.19k	maxnbufs, NELEM(maxnbufs)))
119	439	return 0;
120
121		/*
122		In principle, the buffered transforms might be useful
123		when working out of place. However, in order to
124		prevent infinite loops in the planner, we require
125		that the output stride of the buffered transforms be
126		greater than 2.
127		*/
128	756	if (p->ri != p->ro)
129	448	return (d[0].os > 2);
130
131		/*
132		* If the problem is in place, the input/output strides must
133		* be the same or the whole thing must fit in the buffer.
134		*/
135	308	if (X(tensor_inplace_strides2)(p->sz, p->vecsz))
136	0	return 1;
137
138	308	if (/* fits into buffer: */
139	308	((p->vecsz->rnk == 0)
140	308	\|\|
141	308	(X(nbuf)(d[0].n, p->vecsz->dims[0].n,
142	308	maxnbufs[ego->maxnbuf_ndx])
143	308	== p->vecsz->dims[0].n)))
144	200	return 1;
145	308	}
146
147	1.18k	return 0;
148	2.26k	}
149
150		static int applicable(const S ego, const problem p_, const planner *plnr)
151	2.26k	{
152	2.26k	if (NO_BUFFERINGP(plnr)) return 0;
153	2.26k	if (!applicable0(ego, p_, plnr)) return 0;
154
155	200	if (NO_UGLYP(plnr)) {
156	200	const problem_dft p = (const problem_dft ) p_;
157	200	if (p->ri != p->ro) return 0;
158	200	if (X(toobig)(p->sz->dims[0].n)) return 0;
159	200	}
160	200	return 1;
161	200	}
162
163		static plan mkplan(const solver ego_, const problem p_, planner plnr)
164	2.26k	{
165	2.26k	P *pln;
166	2.26k	const S ego = (const S )ego_;
167	2.26k	plan cld = (plan ) 0;
168	2.26k	plan cldcpy = (plan ) 0;
169	2.26k	plan cldrest = (plan ) 0;
170	2.26k	const problem_dft p = (const problem_dft ) p_;
171	2.26k	R bufs = (R ) 0;
172	2.26k	INT nbuf = 0, bufdist, n, vl;
173	2.26k	INT ivs, ovs, roffset, ioffset;
174
175	2.26k	static const plan_adt padt = {
176	2.26k	X(dft_solve), awake, print, destroy
177	2.26k	};
178
179	2.26k	if (!applicable(ego, p_, plnr))
180	2.06k	goto nada;
181
182	200	n = X(tensor_sz)(p->sz);
183
184	200	X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
185
186	200	nbuf = X(nbuf)(n, vl, maxnbufs[ego->maxnbuf_ndx]);
187	200	bufdist = X(bufdist)(n, vl);
188	200	A(nbuf > 0);
189
190		/* attempt to keep real and imaginary part in the same order,
191		so as to allow optimizations in the the copy plan */
192	200	roffset = (p->ri - p->ii > 0) ? (INT)1 : (INT)0;
193	200	ioffset = 1 - roffset;
194
195		/* initial allocation for the purpose of planning */
196	200	bufs = (R ) MALLOC(sizeof(R) nbuf * bufdist * 2, BUFFERS);
197
198		/* allow destruction of input if problem is in place */
199	200	cld = X(mkplan_f_d)(plnr,
200	200	X(mkproblem_dft_d)(
201	200	X(mktensor_1d)(n, p->sz->dims[0].is, 2),
202	200	X(mktensor_1d)(nbuf, ivs, bufdist * 2),
203	200	TAINT(p->ri, ivs * nbuf),
204	200	TAINT(p->ii, ivs * nbuf),
205	200	bufs + roffset,
206	200	bufs + ioffset),
207	200	0, 0, (p->ri == p->ro) ? NO_DESTROY_INPUT : 0);
208	200	if (!cld)
209	0	goto nada;
210
211		/* copying back from the buffer is a rank-0 transform: */
212	200	cldcpy = X(mkplan_d)(plnr,
213	200	X(mkproblem_dft_d)(
214	200	X(mktensor_0d)(),
215	200	X(mktensor_2d)(nbuf, bufdist * 2, ovs,
216	200	n, 2, p->sz->dims[0].os),
217	200	bufs + roffset,
218	200	bufs + ioffset,
219	200	TAINT(p->ro, ovs * nbuf),
220	200	TAINT(p->io, ovs * nbuf)));
221	200	if (!cldcpy)
222	0	goto nada;
223
224		/* deallocate buffers, let apply() allocate them for real */
225	200	X(ifree)(bufs);
226	200	bufs = 0;
227
228		/* plan the leftover transforms (cldrest): */
229	200	{
230	200	INT id = ivs * (nbuf * (vl / nbuf));
231	200	INT od = ovs * (nbuf * (vl / nbuf));
232	200	cldrest = X(mkplan_d)(plnr,
233	200	X(mkproblem_dft_d)(
234	200	X(tensor_copy)(p->sz),
235	200	X(mktensor_1d)(vl % nbuf, ivs, ovs),
236	200	p->ri+id, p->ii+id, p->ro+od, p->io+od));
237	200	}
238	200	if (!cldrest)
239	0	goto nada;
240
241	200	pln = MKPLAN_DFT(P, &padt, apply);
242	200	pln->cld = cld;
243	200	pln->cldcpy = cldcpy;
244	200	pln->cldrest = cldrest;
245	200	pln->n = n;
246	200	pln->vl = vl;
247	200	pln->ivs_by_nbuf = ivs * nbuf;
248	200	pln->ovs_by_nbuf = ovs * nbuf;
249	200	pln->roffset = roffset;
250	200	pln->ioffset = ioffset;
251
252	200	pln->nbuf = nbuf;
253	200	pln->bufdist = bufdist;
254
255	200	{
256	200	opcnt t;
257	200	X(ops_add)(&cld->ops, &cldcpy->ops, &t);
258	200	X(ops_madd)(vl / nbuf, &t, &cldrest->ops, &pln->super.super.ops);
259	200	}
260
261	200	return &(pln->super.super);
262
263	2.06k	nada:
264	2.06k	X(ifree0)(bufs);
265	2.06k	X(plan_destroy_internal)(cldrest);
266	2.06k	X(plan_destroy_internal)(cldcpy);
267	2.06k	X(plan_destroy_internal)(cld);
268	2.06k	return (plan *) 0;
269	200	}
270
271		static solver *mksolver(size_t maxnbuf_ndx)
272	4	{
273	4	static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
274	4	S *slv = MKSOLVER(S, &sadt);
275	4	slv->maxnbuf_ndx = maxnbuf_ndx;
276	4	return &(slv->super);
277	4	}
278
279		void X(dft_buffered_register)(planner *p)
280	1	{
281	1	size_t i;
282	3	for (i = 0; i < NELEM(maxnbufs); ++i)
283	2	REGISTER_SOLVER(p, mksolver(i));
284	1	}