/src/fftw3/dft/buffered.c

Source
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */


#include "dft/dft.h"

typedef struct {
     solver super;
     size_t maxnbuf_ndx;
} S;

static const INT maxnbufs[] = { 8, 256 };

typedef struct {
     plan_dft super;

     plan *cld, *cldcpy, *cldrest;
     INT n, vl, nbuf, bufdist;
     INT ivs_by_nbuf, ovs_by_nbuf;
     INT roffset, ioffset;
} P;

/* transform a vector input with the help of bufs */
static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *ego = (const P *) ego_;
     INT nbuf = ego->nbuf;
     R *bufs = (R *)MALLOC(sizeof(R) * nbuf * ego->bufdist * 2, BUFFERS);

     plan_dft *cld = (plan_dft *) ego->cld;
     plan_dft *cldcpy = (plan_dft *) ego->cldcpy;
     plan_dft *cldrest;
     INT i, vl = ego->vl;
     INT ivs_by_nbuf = ego->ivs_by_nbuf, ovs_by_nbuf = ego->ovs_by_nbuf;
     INT roffset = ego->roffset, ioffset = ego->ioffset;

     for (i = nbuf; i <= vl; i += nbuf) {
          /* transform to bufs: */
          cld->apply((plan *) cld, ri, ii, bufs + roffset, bufs + ioffset);
    ri += ivs_by_nbuf; ii += ivs_by_nbuf;

          /* copy back */
          cldcpy->apply((plan *) cldcpy, bufs+roffset, bufs+ioffset, ro, io);
    ro += ovs_by_nbuf; io += ovs_by_nbuf;
     }

     X(ifree)(bufs);

     /* Do the remaining transforms, if any: */
     cldrest = (plan_dft *) ego->cldrest;
     cldrest->apply((plan *) cldrest, ri, ii, ro, io);
}


static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *ego = (P *) ego_;

     X(plan_awake)(ego->cld, wakefulness);
     X(plan_awake)(ego->cldcpy, wakefulness);
     X(plan_awake)(ego->cldrest, wakefulness);
}

static void destroy(plan *ego_)
{
     P *ego = (P *) ego_;
     X(plan_destroy_internal)(ego->cldrest);
     X(plan_destroy_internal)(ego->cldcpy);
     X(plan_destroy_internal)(ego->cld);
}

static void print(const plan *ego_, printer *p)
{
     const P *ego = (const P *) ego_;
     p->print(p, "(dft-buffered-%D%v/%D-%D%(%p%)%(%p%)%(%p%))",
              ego->n, ego->nbuf,
              ego->vl, ego->bufdist % ego->n,
              ego->cld, ego->cldcpy, ego->cldrest);
}

static int applicable0(const S *ego, const problem *p_, const planner *plnr)
{
     const problem_dft *p = (const problem_dft *) p_;
     const iodim *d = p->sz->dims;

     if (1
   && p->vecsz->rnk <= 1
   && p->sz->rnk == 1
    ) {
    INT vl, ivs, ovs;
    X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);

    if (X(toobig)(p->sz->dims[0].n) && CONSERVE_MEMORYP(plnr))
         return 0;

    /* if this solver is redundant, in the sense that a solver
       of lower index generates the same plan, then prune this
       solver */
    if (X(nbuf_redundant)(d[0].n, vl, 
        ego->maxnbuf_ndx,
        maxnbufs, NELEM(maxnbufs)))
         return 0;

    /*
      In principle, the buffered transforms might be useful
      when working out of place.  However, in order to
      prevent infinite loops in the planner, we require
      that the output stride of the buffered transforms be
      greater than 2.
    */
    if (p->ri != p->ro)
         return (d[0].os > 2);

    /*
     * If the problem is in place, the input/output strides must
     * be the same or the whole thing must fit in the buffer.
     */
    if (X(tensor_inplace_strides2)(p->sz, p->vecsz))
         return 1;

    if (/* fits into buffer: */
         ((p->vecsz->rnk == 0)
    ||
    (X(nbuf)(d[0].n, p->vecsz->dims[0].n, 
       maxnbufs[ego->maxnbuf_ndx]) 
     == p->vecsz->dims[0].n)))
         return 1;
     }

     return 0;
}

static int applicable(const S *ego, const problem *p_, const planner *plnr)
{
     if (NO_BUFFERINGP(plnr)) return 0;
     if (!applicable0(ego, p_, plnr)) return 0;

     if (NO_UGLYP(plnr)) {
    const problem_dft *p = (const problem_dft *) p_;
    if (p->ri != p->ro) return 0;
    if (X(toobig)(p->sz->dims[0].n)) return 0;
     }
     return 1;
}

static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     P *pln;
     const S *ego = (const S *)ego_;
     plan *cld = (plan *) 0;
     plan *cldcpy = (plan *) 0;
     plan *cldrest = (plan *) 0;
     const problem_dft *p = (const problem_dft *) p_;
     R *bufs = (R *) 0;
     INT nbuf = 0, bufdist, n, vl;
     INT ivs, ovs, roffset, ioffset;

     static const plan_adt padt = {
    X(dft_solve), awake, print, destroy
     };

     if (!applicable(ego, p_, plnr))
          goto nada;

     n = X(tensor_sz)(p->sz);

     X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);

     nbuf = X(nbuf)(n, vl, maxnbufs[ego->maxnbuf_ndx]);
     bufdist = X(bufdist)(n, vl);
     A(nbuf > 0);

     /* attempt to keep real and imaginary part in the same order,
  so as to allow optimizations in the the copy plan */
     roffset = (p->ri - p->ii > 0) ? (INT)1 : (INT)0;
     ioffset = 1 - roffset;

     /* initial allocation for the purpose of planning */
     bufs = (R *) MALLOC(sizeof(R) * nbuf * bufdist * 2, BUFFERS);

     /* allow destruction of input if problem is in place */
     cld = X(mkplan_f_d)(plnr,
       X(mkproblem_dft_d)(
            X(mktensor_1d)(n, p->sz->dims[0].is, 2),
            X(mktensor_1d)(nbuf, ivs, bufdist * 2),
            TAINT(p->ri, ivs * nbuf),
            TAINT(p->ii, ivs * nbuf),
            bufs + roffset, 
            bufs + ioffset),
       0, 0, (p->ri == p->ro) ? NO_DESTROY_INPUT : 0);
     if (!cld)
          goto nada;

     /* copying back from the buffer is a rank-0 transform: */
     cldcpy = X(mkplan_d)(plnr,
        X(mkproblem_dft_d)(
             X(mktensor_0d)(),
             X(mktensor_2d)(nbuf, bufdist * 2, ovs,
                n, 2, p->sz->dims[0].os),
             bufs + roffset, 
             bufs + ioffset, 
             TAINT(p->ro, ovs * nbuf), 
             TAINT(p->io, ovs * nbuf)));
     if (!cldcpy)
          goto nada;

     /* deallocate buffers, let apply() allocate them for real */
     X(ifree)(bufs);
     bufs = 0;

     /* plan the leftover transforms (cldrest): */
     {
    INT id = ivs * (nbuf * (vl / nbuf));
    INT od = ovs * (nbuf * (vl / nbuf));
    cldrest = X(mkplan_d)(plnr, 
        X(mkproblem_dft_d)(
             X(tensor_copy)(p->sz),
             X(mktensor_1d)(vl % nbuf, ivs, ovs),
             p->ri+id, p->ii+id, p->ro+od, p->io+od));
     }
     if (!cldrest)
          goto nada;

     pln = MKPLAN_DFT(P, &padt, apply);
     pln->cld = cld;
     pln->cldcpy = cldcpy;
     pln->cldrest = cldrest;
     pln->n = n;
     pln->vl = vl;
     pln->ivs_by_nbuf = ivs * nbuf;
     pln->ovs_by_nbuf = ovs * nbuf;
     pln->roffset = roffset;
     pln->ioffset = ioffset;

     pln->nbuf = nbuf;
     pln->bufdist = bufdist;

     {
    opcnt t;
    X(ops_add)(&cld->ops, &cldcpy->ops, &t);
    X(ops_madd)(vl / nbuf, &t, &cldrest->ops, &pln->super.super.ops);
     }

     return &(pln->super.super);

 nada:
     X(ifree0)(bufs);
     X(plan_destroy_internal)(cldrest);
     X(plan_destroy_internal)(cldcpy);
     X(plan_destroy_internal)(cld);
     return (plan *) 0;
}

static solver *mksolver(size_t maxnbuf_ndx)
{
     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
     S *slv = MKSOLVER(S, &sadt);
     slv->maxnbuf_ndx = maxnbuf_ndx;
     return &(slv->super);
}

void X(dft_buffered_register)(planner *p)
{
     size_t i;
     for (i = 0; i < NELEM(maxnbufs); ++i)
    REGISTER_SOLVER(p, mksolver(i));
}

Coverage Report

Created: 2025-11-24 06:40

Line	Count	Source
1		/*
2		* Copyright (c) 2003, 2007-14 Matteo Frigo
3		* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4		*
5		* This program is free software; you can redistribute it and/or modify
6		* it under the terms of the GNU General Public License as published by
7		* the Free Software Foundation; either version 2 of the License, or
8		* (at your option) any later version.
9		*
10		* This program is distributed in the hope that it will be useful,
11		* but WITHOUT ANY WARRANTY; without even the implied warranty of
12		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13		* GNU General Public License for more details.
14		*
15		* You should have received a copy of the GNU General Public License
16		* along with this program; if not, write to the Free Software
17		* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18		*
19		*/
20
21
22		#include "dft/dft.h"
23
24		typedef struct {
25		solver super;
26		size_t maxnbuf_ndx;
27		} S;
28
29		static const INT maxnbufs[] = { 8, 256 };
30
31		typedef struct {
32		plan_dft super;
33
34		plan cld, cldcpy, *cldrest;
35		INT n, vl, nbuf, bufdist;
36		INT ivs_by_nbuf, ovs_by_nbuf;
37		INT roffset, ioffset;
38		} P;
39
40		/* transform a vector input with the help of bufs */
41		static void apply(const plan ego_, R ri, R ii, R ro, R *io)
42	70	{
43	70	const P ego = (const P ) ego_;
44	70	INT nbuf = ego->nbuf;
45	70	R bufs = (R )MALLOC(sizeof(R) * nbuf * ego->bufdist * 2, BUFFERS);
46
47	70	plan_dft cld = (plan_dft ) ego->cld;
48	70	plan_dft cldcpy = (plan_dft ) ego->cldcpy;
49	70	plan_dft *cldrest;
50	70	INT i, vl = ego->vl;
51	70	INT ivs_by_nbuf = ego->ivs_by_nbuf, ovs_by_nbuf = ego->ovs_by_nbuf;
52	70	INT roffset = ego->roffset, ioffset = ego->ioffset;
53
54	140	for (i = nbuf; i <= vl; i += nbuf) {
55		/* transform to bufs: */
56	70	cld->apply((plan *) cld, ri, ii, bufs + roffset, bufs + ioffset);
57	70	ri += ivs_by_nbuf; ii += ivs_by_nbuf;
58
59		/* copy back */
60	70	cldcpy->apply((plan *) cldcpy, bufs+roffset, bufs+ioffset, ro, io);
61	70	ro += ovs_by_nbuf; io += ovs_by_nbuf;
62	70	}
63
64	70	X(ifree)(bufs);
65
66		/* Do the remaining transforms, if any: */
67	70	cldrest = (plan_dft *) ego->cldrest;
68	70	cldrest->apply((plan *) cldrest, ri, ii, ro, io);
69	70	}
70
71
72		static void awake(plan *ego_, enum wakefulness wakefulness)
73	56	{
74	56	P ego = (P ) ego_;
75
76	56	X(plan_awake)(ego->cld, wakefulness);
77	56	X(plan_awake)(ego->cldcpy, wakefulness);
78	56	X(plan_awake)(ego->cldrest, wakefulness);
79	56	}
80
81		static void destroy(plan *ego_)
82	202	{
83	202	P ego = (P ) ego_;
84	202	X(plan_destroy_internal)(ego->cldrest);
85	202	X(plan_destroy_internal)(ego->cldcpy);
86	202	X(plan_destroy_internal)(ego->cld);
87	202	}
88
89		static void print(const plan ego_, printer p)
90	0	{
91	0	const P ego = (const P ) ego_;
92	0	p->print(p, "(dft-buffered-%D%v/%D-%D%(%p%)%(%p%)%(%p%))",
93	0	ego->n, ego->nbuf,
94	0	ego->vl, ego->bufdist % ego->n,
95	0	ego->cld, ego->cldcpy, ego->cldrest);
96	0	}
97
98		static int applicable0(const S ego, const problem p_, const planner *plnr)
99	2.23k	{
100	2.23k	const problem_dft p = (const problem_dft ) p_;
101	2.23k	const iodim *d = p->sz->dims;
102
103	2.23k	if (1
104	2.23k	&& p->vecsz->rnk <= 1
105	1.21k	&& p->sz->rnk == 1
106	2.23k	) {
107	1.17k	INT vl, ivs, ovs;
108	1.17k	X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
109
110	1.17k	if (X(toobig)(p->sz->dims[0].n) && CONSERVE_MEMORYP(plnr))
111	0	return 0;
112
113		/* if this solver is redundant, in the sense that a solver
114		of lower index generates the same plan, then prune this
115		solver */
116	1.17k	if (X(nbuf_redundant)(d[0].n, vl,
117	1.17k	ego->maxnbuf_ndx,
118	1.17k	maxnbufs, NELEM(maxnbufs)))
119	432	return 0;
120
121		/*
122		In principle, the buffered transforms might be useful
123		when working out of place. However, in order to
124		prevent infinite loops in the planner, we require
125		that the output stride of the buffered transforms be
126		greater than 2.
127		*/
128	744	if (p->ri != p->ro)
129	438	return (d[0].os > 2);
130
131		/*
132		* If the problem is in place, the input/output strides must
133		* be the same or the whole thing must fit in the buffer.
134		*/
135	306	if (X(tensor_inplace_strides2)(p->sz, p->vecsz))
136	0	return 1;
137
138	306	if (/* fits into buffer: */
139	306	((p->vecsz->rnk == 0)
140	306	\|\|
141	306	(X(nbuf)(d[0].n, p->vecsz->dims[0].n,
142	306	maxnbufs[ego->maxnbuf_ndx])
143	306	== p->vecsz->dims[0].n)))
144	202	return 1;
145	306	}
146
147	1.15k	return 0;
148	2.23k	}
149
150		static int applicable(const S ego, const problem p_, const planner *plnr)
151	2.23k	{
152	2.23k	if (NO_BUFFERINGP(plnr)) return 0;
153	2.23k	if (!applicable0(ego, p_, plnr)) return 0;
154
155	202	if (NO_UGLYP(plnr)) {
156	202	const problem_dft p = (const problem_dft ) p_;
157	202	if (p->ri != p->ro) return 0;
158	202	if (X(toobig)(p->sz->dims[0].n)) return 0;
159	202	}
160	202	return 1;
161	202	}
162
163		static plan mkplan(const solver ego_, const problem p_, planner plnr)
164	2.23k	{
165	2.23k	P *pln;
166	2.23k	const S ego = (const S )ego_;
167	2.23k	plan cld = (plan ) 0;
168	2.23k	plan cldcpy = (plan ) 0;
169	2.23k	plan cldrest = (plan ) 0;
170	2.23k	const problem_dft p = (const problem_dft ) p_;
171	2.23k	R bufs = (R ) 0;
172	2.23k	INT nbuf = 0, bufdist, n, vl;
173	2.23k	INT ivs, ovs, roffset, ioffset;
174
175	2.23k	static const plan_adt padt = {
176	2.23k	X(dft_solve), awake, print, destroy
177	2.23k	};
178
179	2.23k	if (!applicable(ego, p_, plnr))
180	2.02k	goto nada;
181
182	202	n = X(tensor_sz)(p->sz);
183
184	202	X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
185
186	202	nbuf = X(nbuf)(n, vl, maxnbufs[ego->maxnbuf_ndx]);
187	202	bufdist = X(bufdist)(n, vl);
188	202	A(nbuf > 0);
189
190		/* attempt to keep real and imaginary part in the same order,
191		so as to allow optimizations in the the copy plan */
192	202	roffset = (p->ri - p->ii > 0) ? (INT)1 : (INT)0;
193	202	ioffset = 1 - roffset;
194
195		/* initial allocation for the purpose of planning */
196	202	bufs = (R ) MALLOC(sizeof(R) nbuf * bufdist * 2, BUFFERS);
197
198		/* allow destruction of input if problem is in place */
199	202	cld = X(mkplan_f_d)(plnr,
200	202	X(mkproblem_dft_d)(
201	202	X(mktensor_1d)(n, p->sz->dims[0].is, 2),
202	202	X(mktensor_1d)(nbuf, ivs, bufdist * 2),
203	202	TAINT(p->ri, ivs * nbuf),
204	202	TAINT(p->ii, ivs * nbuf),
205	202	bufs + roffset,
206	202	bufs + ioffset),
207	202	0, 0, (p->ri == p->ro) ? NO_DESTROY_INPUT : 0);
208	202	if (!cld)
209	0	goto nada;
210
211		/* copying back from the buffer is a rank-0 transform: */
212	202	cldcpy = X(mkplan_d)(plnr,
213	202	X(mkproblem_dft_d)(
214	202	X(mktensor_0d)(),
215	202	X(mktensor_2d)(nbuf, bufdist * 2, ovs,
216	202	n, 2, p->sz->dims[0].os),
217	202	bufs + roffset,
218	202	bufs + ioffset,
219	202	TAINT(p->ro, ovs * nbuf),
220	202	TAINT(p->io, ovs * nbuf)));
221	202	if (!cldcpy)
222	0	goto nada;
223
224		/* deallocate buffers, let apply() allocate them for real */
225	202	X(ifree)(bufs);
226	202	bufs = 0;
227
228		/* plan the leftover transforms (cldrest): */
229	202	{
230	202	INT id = ivs * (nbuf * (vl / nbuf));
231	202	INT od = ovs * (nbuf * (vl / nbuf));
232	202	cldrest = X(mkplan_d)(plnr,
233	202	X(mkproblem_dft_d)(
234	202	X(tensor_copy)(p->sz),
235	202	X(mktensor_1d)(vl % nbuf, ivs, ovs),
236	202	p->ri+id, p->ii+id, p->ro+od, p->io+od));
237	202	}
238	202	if (!cldrest)
239	0	goto nada;
240
241	202	pln = MKPLAN_DFT(P, &padt, apply);
242	202	pln->cld = cld;
243	202	pln->cldcpy = cldcpy;
244	202	pln->cldrest = cldrest;
245	202	pln->n = n;
246	202	pln->vl = vl;
247	202	pln->ivs_by_nbuf = ivs * nbuf;
248	202	pln->ovs_by_nbuf = ovs * nbuf;
249	202	pln->roffset = roffset;
250	202	pln->ioffset = ioffset;
251
252	202	pln->nbuf = nbuf;
253	202	pln->bufdist = bufdist;
254
255	202	{
256	202	opcnt t;
257	202	X(ops_add)(&cld->ops, &cldcpy->ops, &t);
258	202	X(ops_madd)(vl / nbuf, &t, &cldrest->ops, &pln->super.super.ops);
259	202	}
260
261	202	return &(pln->super.super);
262
263	2.02k	nada:
264	2.02k	X(ifree0)(bufs);
265	2.02k	X(plan_destroy_internal)(cldrest);
266	2.02k	X(plan_destroy_internal)(cldcpy);
267	2.02k	X(plan_destroy_internal)(cld);
268	2.02k	return (plan *) 0;
269	202	}
270
271		static solver *mksolver(size_t maxnbuf_ndx)
272	4	{
273	4	static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
274	4	S *slv = MKSOLVER(S, &sadt);
275	4	slv->maxnbuf_ndx = maxnbuf_ndx;
276	4	return &(slv->super);
277	4	}
278
279		void X(dft_buffered_register)(planner *p)
280	1	{
281	1	size_t i;
282	3	for (i = 0; i < NELEM(maxnbufs); ++i)
283	2	REGISTER_SOLVER(p, mksolver(i));
284	1	}