/src/fftw3/dft/buffered.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2003, 2007-14 Matteo Frigo |
3 | | * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology |
4 | | * |
5 | | * This program is free software; you can redistribute it and/or modify |
6 | | * it under the terms of the GNU General Public License as published by |
7 | | * the Free Software Foundation; either version 2 of the License, or |
8 | | * (at your option) any later version. |
9 | | * |
10 | | * This program is distributed in the hope that it will be useful, |
11 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | * GNU General Public License for more details. |
14 | | * |
15 | | * You should have received a copy of the GNU General Public License |
16 | | * along with this program; if not, write to the Free Software |
17 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
18 | | * |
19 | | */ |
20 | | |
21 | | |
22 | | #include "dft/dft.h" |
23 | | |
24 | | typedef struct { |
25 | | solver super; |
26 | | size_t maxnbuf_ndx; |
27 | | } S; |
28 | | |
29 | | static const INT maxnbufs[] = { 8, 256 }; |
30 | | |
31 | | typedef struct { |
32 | | plan_dft super; |
33 | | |
34 | | plan *cld, *cldcpy, *cldrest; |
35 | | INT n, vl, nbuf, bufdist; |
36 | | INT ivs_by_nbuf, ovs_by_nbuf; |
37 | | INT roffset, ioffset; |
38 | | } P; |
39 | | |
40 | | /* transform a vector input with the help of bufs */ |
41 | | static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io) |
42 | 61 | { |
43 | 61 | const P *ego = (const P *) ego_; |
44 | 61 | INT nbuf = ego->nbuf; |
45 | 61 | R *bufs = (R *)MALLOC(sizeof(R) * nbuf * ego->bufdist * 2, BUFFERS); |
46 | | |
47 | 61 | plan_dft *cld = (plan_dft *) ego->cld; |
48 | 61 | plan_dft *cldcpy = (plan_dft *) ego->cldcpy; |
49 | 61 | plan_dft *cldrest; |
50 | 61 | INT i, vl = ego->vl; |
51 | 61 | INT ivs_by_nbuf = ego->ivs_by_nbuf, ovs_by_nbuf = ego->ovs_by_nbuf; |
52 | 61 | INT roffset = ego->roffset, ioffset = ego->ioffset; |
53 | | |
54 | 122 | for (i = nbuf; i <= vl; i += nbuf) { |
55 | | /* transform to bufs: */ |
56 | 61 | cld->apply((plan *) cld, ri, ii, bufs + roffset, bufs + ioffset); |
57 | 61 | ri += ivs_by_nbuf; ii += ivs_by_nbuf; |
58 | | |
59 | | /* copy back */ |
60 | 61 | cldcpy->apply((plan *) cldcpy, bufs+roffset, bufs+ioffset, ro, io); |
61 | 61 | ro += ovs_by_nbuf; io += ovs_by_nbuf; |
62 | 61 | } |
63 | | |
64 | 61 | X(ifree)(bufs); |
65 | | |
66 | | /* Do the remaining transforms, if any: */ |
67 | 61 | cldrest = (plan_dft *) ego->cldrest; |
68 | 61 | cldrest->apply((plan *) cldrest, ri, ii, ro, io); |
69 | 61 | } |
70 | | |
71 | | |
72 | | static void awake(plan *ego_, enum wakefulness wakefulness) |
73 | 50 | { |
74 | 50 | P *ego = (P *) ego_; |
75 | | |
76 | 50 | X(plan_awake)(ego->cld, wakefulness); |
77 | 50 | X(plan_awake)(ego->cldcpy, wakefulness); |
78 | 50 | X(plan_awake)(ego->cldrest, wakefulness); |
79 | 50 | } |
80 | | |
81 | | static void destroy(plan *ego_) |
82 | 196 | { |
83 | 196 | P *ego = (P *) ego_; |
84 | 196 | X(plan_destroy_internal)(ego->cldrest); |
85 | 196 | X(plan_destroy_internal)(ego->cldcpy); |
86 | 196 | X(plan_destroy_internal)(ego->cld); |
87 | 196 | } |
88 | | |
89 | | static void print(const plan *ego_, printer *p) |
90 | 0 | { |
91 | 0 | const P *ego = (const P *) ego_; |
92 | 0 | p->print(p, "(dft-buffered-%D%v/%D-%D%(%p%)%(%p%)%(%p%))", |
93 | 0 | ego->n, ego->nbuf, |
94 | 0 | ego->vl, ego->bufdist % ego->n, |
95 | 0 | ego->cld, ego->cldcpy, ego->cldrest); |
96 | 0 | } |
97 | | |
98 | | static int applicable0(const S *ego, const problem *p_, const planner *plnr) |
99 | 2.21k | { |
100 | 2.21k | const problem_dft *p = (const problem_dft *) p_; |
101 | 2.21k | const iodim *d = p->sz->dims; |
102 | | |
103 | 2.21k | if (1 |
104 | 2.21k | && p->vecsz->rnk <= 1 |
105 | 2.21k | && p->sz->rnk == 1 |
106 | 2.21k | ) { |
107 | 1.16k | INT vl, ivs, ovs; |
108 | 1.16k | X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs); |
109 | | |
110 | 1.16k | if (X(toobig)(p->sz->dims[0].n) && CONSERVE_MEMORYP(plnr)) |
111 | 0 | return 0; |
112 | | |
113 | | /* if this solver is redundant, in the sense that a solver |
114 | | of lower index generates the same plan, then prune this |
115 | | solver */ |
116 | 1.16k | if (X(nbuf_redundant)(d[0].n, vl, |
117 | 1.16k | ego->maxnbuf_ndx, |
118 | 1.16k | maxnbufs, NELEM(maxnbufs))) |
119 | 430 | return 0; |
120 | | |
121 | | /* |
122 | | In principle, the buffered transforms might be useful |
123 | | when working out of place. However, in order to |
124 | | prevent infinite loops in the planner, we require |
125 | | that the output stride of the buffered transforms be |
126 | | greater than 2. |
127 | | */ |
128 | 735 | if (p->ri != p->ro) |
129 | 435 | return (d[0].os > 2); |
130 | | |
131 | | /* |
132 | | * If the problem is in place, the input/output strides must |
133 | | * be the same or the whole thing must fit in the buffer. |
134 | | */ |
135 | 300 | if (X(tensor_inplace_strides2)(p->sz, p->vecsz)) |
136 | 0 | return 1; |
137 | | |
138 | 300 | if (/* fits into buffer: */ |
139 | 300 | ((p->vecsz->rnk == 0) |
140 | 300 | || |
141 | 300 | (X(nbuf)(d[0].n, p->vecsz->dims[0].n, |
142 | 300 | maxnbufs[ego->maxnbuf_ndx]) |
143 | 300 | == p->vecsz->dims[0].n))) |
144 | 196 | return 1; |
145 | 300 | } |
146 | | |
147 | 1.15k | return 0; |
148 | 2.21k | } |
149 | | |
150 | | static int applicable(const S *ego, const problem *p_, const planner *plnr) |
151 | 2.21k | { |
152 | 2.21k | if (NO_BUFFERINGP(plnr)) return 0; |
153 | 2.21k | if (!applicable0(ego, p_, plnr)) return 0; |
154 | | |
155 | 196 | if (NO_UGLYP(plnr)) { |
156 | 196 | const problem_dft *p = (const problem_dft *) p_; |
157 | 196 | if (p->ri != p->ro) return 0; |
158 | 196 | if (X(toobig)(p->sz->dims[0].n)) return 0; |
159 | 196 | } |
160 | 196 | return 1; |
161 | 196 | } |
162 | | |
163 | | static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr) |
164 | 2.21k | { |
165 | 2.21k | P *pln; |
166 | 2.21k | const S *ego = (const S *)ego_; |
167 | 2.21k | plan *cld = (plan *) 0; |
168 | 2.21k | plan *cldcpy = (plan *) 0; |
169 | 2.21k | plan *cldrest = (plan *) 0; |
170 | 2.21k | const problem_dft *p = (const problem_dft *) p_; |
171 | 2.21k | R *bufs = (R *) 0; |
172 | 2.21k | INT nbuf = 0, bufdist, n, vl; |
173 | 2.21k | INT ivs, ovs, roffset, ioffset; |
174 | | |
175 | 2.21k | static const plan_adt padt = { |
176 | 2.21k | X(dft_solve), awake, print, destroy |
177 | 2.21k | }; |
178 | | |
179 | 2.21k | if (!applicable(ego, p_, plnr)) |
180 | 2.01k | goto nada; |
181 | | |
182 | 196 | n = X(tensor_sz)(p->sz); |
183 | | |
184 | 196 | X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs); |
185 | | |
186 | 196 | nbuf = X(nbuf)(n, vl, maxnbufs[ego->maxnbuf_ndx]); |
187 | 196 | bufdist = X(bufdist)(n, vl); |
188 | 196 | A(nbuf > 0); |
189 | | |
190 | | /* attempt to keep real and imaginary part in the same order, |
191 | | so as to allow optimizations in the the copy plan */ |
192 | 196 | roffset = (p->ri - p->ii > 0) ? (INT)1 : (INT)0; |
193 | 196 | ioffset = 1 - roffset; |
194 | | |
195 | | /* initial allocation for the purpose of planning */ |
196 | 196 | bufs = (R *) MALLOC(sizeof(R) * nbuf * bufdist * 2, BUFFERS); |
197 | | |
198 | | /* allow destruction of input if problem is in place */ |
199 | 196 | cld = X(mkplan_f_d)(plnr, |
200 | 196 | X(mkproblem_dft_d)( |
201 | 196 | X(mktensor_1d)(n, p->sz->dims[0].is, 2), |
202 | 196 | X(mktensor_1d)(nbuf, ivs, bufdist * 2), |
203 | 196 | TAINT(p->ri, ivs * nbuf), |
204 | 196 | TAINT(p->ii, ivs * nbuf), |
205 | 196 | bufs + roffset, |
206 | 196 | bufs + ioffset), |
207 | 196 | 0, 0, (p->ri == p->ro) ? NO_DESTROY_INPUT : 0); |
208 | 196 | if (!cld) |
209 | 0 | goto nada; |
210 | | |
211 | | /* copying back from the buffer is a rank-0 transform: */ |
212 | 196 | cldcpy = X(mkplan_d)(plnr, |
213 | 196 | X(mkproblem_dft_d)( |
214 | 196 | X(mktensor_0d)(), |
215 | 196 | X(mktensor_2d)(nbuf, bufdist * 2, ovs, |
216 | 196 | n, 2, p->sz->dims[0].os), |
217 | 196 | bufs + roffset, |
218 | 196 | bufs + ioffset, |
219 | 196 | TAINT(p->ro, ovs * nbuf), |
220 | 196 | TAINT(p->io, ovs * nbuf))); |
221 | 196 | if (!cldcpy) |
222 | 0 | goto nada; |
223 | | |
224 | | /* deallocate buffers, let apply() allocate them for real */ |
225 | 196 | X(ifree)(bufs); |
226 | 196 | bufs = 0; |
227 | | |
228 | | /* plan the leftover transforms (cldrest): */ |
229 | 196 | { |
230 | 196 | INT id = ivs * (nbuf * (vl / nbuf)); |
231 | 196 | INT od = ovs * (nbuf * (vl / nbuf)); |
232 | 196 | cldrest = X(mkplan_d)(plnr, |
233 | 196 | X(mkproblem_dft_d)( |
234 | 196 | X(tensor_copy)(p->sz), |
235 | 196 | X(mktensor_1d)(vl % nbuf, ivs, ovs), |
236 | 196 | p->ri+id, p->ii+id, p->ro+od, p->io+od)); |
237 | 196 | } |
238 | 196 | if (!cldrest) |
239 | 0 | goto nada; |
240 | | |
241 | 196 | pln = MKPLAN_DFT(P, &padt, apply); |
242 | 196 | pln->cld = cld; |
243 | 196 | pln->cldcpy = cldcpy; |
244 | 196 | pln->cldrest = cldrest; |
245 | 196 | pln->n = n; |
246 | 196 | pln->vl = vl; |
247 | 196 | pln->ivs_by_nbuf = ivs * nbuf; |
248 | 196 | pln->ovs_by_nbuf = ovs * nbuf; |
249 | 196 | pln->roffset = roffset; |
250 | 196 | pln->ioffset = ioffset; |
251 | | |
252 | 196 | pln->nbuf = nbuf; |
253 | 196 | pln->bufdist = bufdist; |
254 | | |
255 | 196 | { |
256 | 196 | opcnt t; |
257 | 196 | X(ops_add)(&cld->ops, &cldcpy->ops, &t); |
258 | 196 | X(ops_madd)(vl / nbuf, &t, &cldrest->ops, &pln->super.super.ops); |
259 | 196 | } |
260 | | |
261 | 196 | return &(pln->super.super); |
262 | | |
263 | 2.01k | nada: |
264 | 2.01k | X(ifree0)(bufs); |
265 | 2.01k | X(plan_destroy_internal)(cldrest); |
266 | 2.01k | X(plan_destroy_internal)(cldcpy); |
267 | 2.01k | X(plan_destroy_internal)(cld); |
268 | 2.01k | return (plan *) 0; |
269 | 196 | } |
270 | | |
271 | | static solver *mksolver(size_t maxnbuf_ndx) |
272 | 4 | { |
273 | 4 | static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 }; |
274 | 4 | S *slv = MKSOLVER(S, &sadt); |
275 | 4 | slv->maxnbuf_ndx = maxnbuf_ndx; |
276 | 4 | return &(slv->super); |
277 | 4 | } |
278 | | |
279 | | void X(dft_buffered_register)(planner *p) |
280 | 1 | { |
281 | 1 | size_t i; |
282 | 3 | for (i = 0; i < NELEM(maxnbufs); ++i) |
283 | 2 | REGISTER_SOLVER(p, mksolver(i)); |
284 | 1 | } |