/src/fftw3/rdft/buffered2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2003, 2007-14 Matteo Frigo |
3 | | * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology |
4 | | * |
5 | | * This program is free software; you can redistribute it and/or modify |
6 | | * it under the terms of the GNU General Public License as published by |
7 | | * the Free Software Foundation; either version 2 of the License, or |
8 | | * (at your option) any later version. |
9 | | * |
10 | | * This program is distributed in the hope that it will be useful, |
11 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | * GNU General Public License for more details. |
14 | | * |
15 | | * You should have received a copy of the GNU General Public License |
16 | | * along with this program; if not, write to the Free Software |
17 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
18 | | * |
19 | | */ |
20 | | |
21 | | |
22 | | /* buffering of rdft2. We always buffer the complex array */ |
23 | | |
24 | | #include "rdft/rdft.h" |
25 | | #include "dft/dft.h" |
26 | | |
27 | | typedef struct { |
28 | | solver super; |
29 | | size_t maxnbuf_ndx; |
30 | | } S; |
31 | | |
32 | | static const INT maxnbufs[] = { 8, 256 }; |
33 | | |
34 | | typedef struct { |
35 | | plan_rdft2 super; |
36 | | |
37 | | plan *cld, *cldcpy, *cldrest; |
38 | | INT n, vl, nbuf, bufdist; |
39 | | INT ivs_by_nbuf, ovs_by_nbuf; |
40 | | INT ioffset, roffset; |
41 | | } P; |
42 | | |
43 | | /* transform a vector input with the help of bufs */ |
44 | | static void apply_r2hc(const plan *ego_, R *r0, R *r1, R *cr, R *ci) |
45 | 0 | { |
46 | 0 | const P *ego = (const P *) ego_; |
47 | 0 | plan_rdft2 *cld = (plan_rdft2 *) ego->cld; |
48 | 0 | plan_dft *cldcpy = (plan_dft *) ego->cldcpy; |
49 | 0 | INT i, vl = ego->vl, nbuf = ego->nbuf; |
50 | 0 | INT ivs_by_nbuf = ego->ivs_by_nbuf, ovs_by_nbuf = ego->ovs_by_nbuf; |
51 | 0 | R *bufs = (R *)MALLOC(sizeof(R) * nbuf * ego->bufdist, BUFFERS); |
52 | 0 | R *bufr = bufs + ego->roffset; |
53 | 0 | R *bufi = bufs + ego->ioffset; |
54 | 0 | plan_rdft2 *cldrest; |
55 | |
|
56 | 0 | for (i = nbuf; i <= vl; i += nbuf) { |
57 | | /* transform to bufs: */ |
58 | 0 | cld->apply((plan *) cld, r0, r1, bufr, bufi); |
59 | 0 | r0 += ivs_by_nbuf; r1 += ivs_by_nbuf; |
60 | | |
61 | | /* copy back */ |
62 | 0 | cldcpy->apply((plan *) cldcpy, bufr, bufi, cr, ci); |
63 | 0 | cr += ovs_by_nbuf; ci += ovs_by_nbuf; |
64 | 0 | } |
65 | |
|
66 | 0 | X(ifree)(bufs); |
67 | | |
68 | | /* Do the remaining transforms, if any: */ |
69 | 0 | cldrest = (plan_rdft2 *) ego->cldrest; |
70 | 0 | cldrest->apply((plan *) cldrest, r0, r1, cr, ci); |
71 | 0 | } |
72 | | |
73 | | /* for hc2r problems, copy the input into buffer, and then |
74 | | transform buffer->output, which allows for destruction of the |
75 | | buffer */ |
76 | | static void apply_hc2r(const plan *ego_, R *r0, R *r1, R *cr, R *ci) |
77 | 0 | { |
78 | 0 | const P *ego = (const P *) ego_; |
79 | 0 | plan_rdft2 *cld = (plan_rdft2 *) ego->cld; |
80 | 0 | plan_dft *cldcpy = (plan_dft *) ego->cldcpy; |
81 | 0 | INT i, vl = ego->vl, nbuf = ego->nbuf; |
82 | 0 | INT ivs_by_nbuf = ego->ivs_by_nbuf, ovs_by_nbuf = ego->ovs_by_nbuf; |
83 | 0 | R *bufs = (R *)MALLOC(sizeof(R) * nbuf * ego->bufdist, BUFFERS); |
84 | 0 | R *bufr = bufs + ego->roffset; |
85 | 0 | R *bufi = bufs + ego->ioffset; |
86 | 0 | plan_rdft2 *cldrest; |
87 | |
|
88 | 0 | for (i = nbuf; i <= vl; i += nbuf) { |
89 | | /* copy input into bufs: */ |
90 | 0 | cldcpy->apply((plan *) cldcpy, cr, ci, bufr, bufi); |
91 | 0 | cr += ivs_by_nbuf; ci += ivs_by_nbuf; |
92 | | |
93 | | /* transform to output */ |
94 | 0 | cld->apply((plan *) cld, r0, r1, bufr, bufi); |
95 | 0 | r0 += ovs_by_nbuf; r1 += ovs_by_nbuf; |
96 | 0 | } |
97 | |
|
98 | 0 | X(ifree)(bufs); |
99 | | |
100 | | /* Do the remaining transforms, if any: */ |
101 | 0 | cldrest = (plan_rdft2 *) ego->cldrest; |
102 | 0 | cldrest->apply((plan *) cldrest, r0, r1, cr, ci); |
103 | 0 | } |
104 | | |
105 | | |
106 | | static void awake(plan *ego_, enum wakefulness wakefulness) |
107 | 0 | { |
108 | 0 | P *ego = (P *) ego_; |
109 | |
|
110 | 0 | X(plan_awake)(ego->cld, wakefulness); |
111 | 0 | X(plan_awake)(ego->cldcpy, wakefulness); |
112 | 0 | X(plan_awake)(ego->cldrest, wakefulness); |
113 | 0 | } |
114 | | |
115 | | static void destroy(plan *ego_) |
116 | 0 | { |
117 | 0 | P *ego = (P *) ego_; |
118 | 0 | X(plan_destroy_internal)(ego->cldrest); |
119 | 0 | X(plan_destroy_internal)(ego->cldcpy); |
120 | 0 | X(plan_destroy_internal)(ego->cld); |
121 | 0 | } |
122 | | |
123 | | static void print(const plan *ego_, printer *p) |
124 | 0 | { |
125 | 0 | const P *ego = (const P *) ego_; |
126 | 0 | p->print(p, "(rdft2-buffered-%D%v/%D-%D%(%p%)%(%p%)%(%p%))", |
127 | 0 | ego->n, ego->nbuf, |
128 | 0 | ego->vl, ego->bufdist % ego->n, |
129 | 0 | ego->cld, ego->cldcpy, ego->cldrest); |
130 | 0 | } |
131 | | |
132 | | static int applicable0(const S *ego, const problem *p_, const planner *plnr) |
133 | 0 | { |
134 | 0 | const problem_rdft2 *p = (const problem_rdft2 *) p_; |
135 | 0 | iodim *d = p->sz->dims; |
136 | |
|
137 | 0 | if (1 |
138 | 0 | && p->vecsz->rnk <= 1 |
139 | 0 | && p->sz->rnk == 1 |
140 | | |
141 | | /* we assume even n throughout */ |
142 | 0 | && (d[0].n % 2) == 0 |
143 | | |
144 | | /* and we only consider these two cases */ |
145 | 0 | && (p->kind == R2HC || p->kind == HC2R) |
146 | |
|
147 | 0 | ) { |
148 | 0 | INT vl, ivs, ovs; |
149 | 0 | X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs); |
150 | |
|
151 | 0 | if (X(toobig)(d[0].n) && CONSERVE_MEMORYP(plnr)) |
152 | 0 | return 0; |
153 | | |
154 | | /* if this solver is redundant, in the sense that a solver |
155 | | of lower index generates the same plan, then prune this |
156 | | solver */ |
157 | 0 | if (X(nbuf_redundant)(d[0].n, vl, |
158 | 0 | ego->maxnbuf_ndx, |
159 | 0 | maxnbufs, NELEM(maxnbufs))) |
160 | 0 | return 0; |
161 | | |
162 | 0 | if (p->r0 != p->cr) { |
163 | 0 | if (p->kind == HC2R) { |
164 | | /* Allow HC2R problems only if the input is to be |
165 | | preserved. This solver sets NO_DESTROY_INPUT, |
166 | | which prevents infinite loops */ |
167 | 0 | return (NO_DESTROY_INPUTP(plnr)); |
168 | 0 | } else { |
169 | | /* |
170 | | In principle, the buffered transforms might be useful |
171 | | when working out of place. However, in order to |
172 | | prevent infinite loops in the planner, we require |
173 | | that the output stride of the buffered transforms be |
174 | | greater than 2. |
175 | | */ |
176 | 0 | return (d[0].os > 2); |
177 | 0 | } |
178 | 0 | } |
179 | | |
180 | | /* |
181 | | * If the problem is in place, the input/output strides must |
182 | | * be the same or the whole thing must fit in the buffer. |
183 | | */ |
184 | 0 | if (X(rdft2_inplace_strides(p, RNK_MINFTY))) |
185 | 0 | return 1; |
186 | | |
187 | 0 | if (/* fits into buffer: */ |
188 | 0 | ((p->vecsz->rnk == 0) |
189 | 0 | || |
190 | 0 | (X(nbuf)(d[0].n, p->vecsz->dims[0].n, |
191 | 0 | maxnbufs[ego->maxnbuf_ndx]) |
192 | 0 | == p->vecsz->dims[0].n))) |
193 | 0 | return 1; |
194 | 0 | } |
195 | | |
196 | 0 | return 0; |
197 | 0 | } |
198 | | |
199 | | static int applicable(const S *ego, const problem *p_, const planner *plnr) |
200 | 0 | { |
201 | 0 | const problem_rdft2 *p; |
202 | |
|
203 | 0 | if (NO_BUFFERINGP(plnr)) return 0; |
204 | | |
205 | 0 | if (!applicable0(ego, p_, plnr)) return 0; |
206 | | |
207 | 0 | p = (const problem_rdft2 *) p_; |
208 | 0 | if (p->kind == HC2R) { |
209 | 0 | if (NO_UGLYP(plnr)) { |
210 | | /* UGLY if in-place and too big, since the problem |
211 | | could be solved via transpositions */ |
212 | 0 | if (p->r0 == p->cr && X(toobig)(p->sz->dims[0].n)) |
213 | 0 | return 0; |
214 | 0 | } |
215 | 0 | } else { |
216 | 0 | if (NO_UGLYP(plnr)) { |
217 | 0 | if (p->r0 != p->cr || X(toobig)(p->sz->dims[0].n)) |
218 | 0 | return 0; |
219 | 0 | } |
220 | 0 | } |
221 | 0 | return 1; |
222 | 0 | } |
223 | | |
224 | | static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr) |
225 | 0 | { |
226 | 0 | P *pln; |
227 | 0 | const S *ego = (const S *)ego_; |
228 | 0 | plan *cld = (plan *) 0; |
229 | 0 | plan *cldcpy = (plan *) 0; |
230 | 0 | plan *cldrest = (plan *) 0; |
231 | 0 | const problem_rdft2 *p = (const problem_rdft2 *) p_; |
232 | 0 | R *bufs = (R *) 0; |
233 | 0 | INT nbuf = 0, bufdist, n, vl; |
234 | 0 | INT ivs, ovs, ioffset, roffset, id, od; |
235 | |
|
236 | 0 | static const plan_adt padt = { |
237 | 0 | X(rdft2_solve), awake, print, destroy |
238 | 0 | }; |
239 | |
|
240 | 0 | if (!applicable(ego, p_, plnr)) |
241 | 0 | goto nada; |
242 | | |
243 | 0 | n = X(tensor_sz)(p->sz); |
244 | 0 | X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs); |
245 | |
|
246 | 0 | nbuf = X(nbuf)(n, vl, maxnbufs[ego->maxnbuf_ndx]); |
247 | 0 | bufdist = X(bufdist)(n + 2, vl); /* complex-side rdft2 stores N+2 |
248 | | real numbers */ |
249 | 0 | A(nbuf > 0); |
250 | | |
251 | | /* attempt to keep real and imaginary part in the same order, |
252 | | so as to allow optimizations in the the copy plan */ |
253 | 0 | roffset = (p->cr - p->ci > 0) ? (INT)1 : (INT)0; |
254 | 0 | ioffset = 1 - roffset; |
255 | | |
256 | | /* initial allocation for the purpose of planning */ |
257 | 0 | bufs = (R *) MALLOC(sizeof(R) * nbuf * bufdist, BUFFERS); |
258 | |
|
259 | 0 | id = ivs * (nbuf * (vl / nbuf)); |
260 | 0 | od = ovs * (nbuf * (vl / nbuf)); |
261 | |
|
262 | 0 | if (p->kind == R2HC) { |
263 | | /* allow destruction of input if problem is in place */ |
264 | 0 | cld = X(mkplan_f_d)( |
265 | 0 | plnr, |
266 | 0 | X(mkproblem_rdft2_d)( |
267 | 0 | X(mktensor_1d)(n, p->sz->dims[0].is, 2), |
268 | 0 | X(mktensor_1d)(nbuf, ivs, bufdist), |
269 | 0 | TAINT(p->r0, ivs * nbuf), TAINT(p->r1, ivs * nbuf), |
270 | 0 | bufs + roffset, bufs + ioffset, p->kind), |
271 | 0 | 0, 0, (p->r0 == p->cr) ? NO_DESTROY_INPUT : 0); |
272 | 0 | if (!cld) goto nada; |
273 | | |
274 | | /* copying back from the buffer is a rank-0 DFT: */ |
275 | 0 | cldcpy = X(mkplan_d)( |
276 | 0 | plnr, |
277 | 0 | X(mkproblem_dft_d)( |
278 | 0 | X(mktensor_0d)(), |
279 | 0 | X(mktensor_2d)(nbuf, bufdist, ovs, |
280 | 0 | n/2+1, 2, p->sz->dims[0].os), |
281 | 0 | bufs + roffset, bufs + ioffset, |
282 | 0 | TAINT(p->cr, ovs * nbuf), TAINT(p->ci, ovs * nbuf) )); |
283 | 0 | if (!cldcpy) goto nada; |
284 | | |
285 | 0 | X(ifree)(bufs); bufs = 0; |
286 | |
|
287 | 0 | cldrest = X(mkplan_d)(plnr, |
288 | 0 | X(mkproblem_rdft2_d)( |
289 | 0 | X(tensor_copy)(p->sz), |
290 | 0 | X(mktensor_1d)(vl % nbuf, ivs, ovs), |
291 | 0 | p->r0 + id, p->r1 + id, |
292 | 0 | p->cr + od, p->ci + od, |
293 | 0 | p->kind)); |
294 | 0 | if (!cldrest) goto nada; |
295 | 0 | pln = MKPLAN_RDFT2(P, &padt, apply_r2hc); |
296 | 0 | } else { |
297 | | /* allow destruction of buffer */ |
298 | 0 | cld = X(mkplan_f_d)( |
299 | 0 | plnr, |
300 | 0 | X(mkproblem_rdft2_d)( |
301 | 0 | X(mktensor_1d)(n, 2, p->sz->dims[0].os), |
302 | 0 | X(mktensor_1d)(nbuf, bufdist, ovs), |
303 | 0 | TAINT(p->r0, ovs * nbuf), TAINT(p->r1, ovs * nbuf), |
304 | 0 | bufs + roffset, bufs + ioffset, p->kind), |
305 | 0 | 0, 0, NO_DESTROY_INPUT); |
306 | 0 | if (!cld) goto nada; |
307 | | |
308 | | /* copying input into buffer is a rank-0 DFT: */ |
309 | 0 | cldcpy = X(mkplan_d)( |
310 | 0 | plnr, |
311 | 0 | X(mkproblem_dft_d)( |
312 | 0 | X(mktensor_0d)(), |
313 | 0 | X(mktensor_2d)(nbuf, ivs, bufdist, |
314 | 0 | n/2+1, p->sz->dims[0].is, 2), |
315 | 0 | TAINT(p->cr, ivs * nbuf), TAINT(p->ci, ivs * nbuf), |
316 | 0 | bufs + roffset, bufs + ioffset)); |
317 | 0 | if (!cldcpy) goto nada; |
318 | | |
319 | 0 | X(ifree)(bufs); bufs = 0; |
320 | |
|
321 | 0 | cldrest = X(mkplan_d)(plnr, |
322 | 0 | X(mkproblem_rdft2_d)( |
323 | 0 | X(tensor_copy)(p->sz), |
324 | 0 | X(mktensor_1d)(vl % nbuf, ivs, ovs), |
325 | 0 | p->r0 + od, p->r1 + od, |
326 | 0 | p->cr + id, p->ci + id, |
327 | 0 | p->kind)); |
328 | 0 | if (!cldrest) goto nada; |
329 | | |
330 | 0 | pln = MKPLAN_RDFT2(P, &padt, apply_hc2r); |
331 | 0 | } |
332 | | |
333 | 0 | pln->cld = cld; |
334 | 0 | pln->cldcpy = cldcpy; |
335 | 0 | pln->cldrest = cldrest; |
336 | 0 | pln->n = n; |
337 | 0 | pln->vl = vl; |
338 | 0 | pln->ivs_by_nbuf = ivs * nbuf; |
339 | 0 | pln->ovs_by_nbuf = ovs * nbuf; |
340 | 0 | pln->roffset = roffset; |
341 | 0 | pln->ioffset = ioffset; |
342 | |
|
343 | 0 | pln->nbuf = nbuf; |
344 | 0 | pln->bufdist = bufdist; |
345 | |
|
346 | 0 | { |
347 | 0 | opcnt t; |
348 | 0 | X(ops_add)(&cld->ops, &cldcpy->ops, &t); |
349 | 0 | X(ops_madd)(vl / nbuf, &t, &cldrest->ops, &pln->super.super.ops); |
350 | 0 | } |
351 | |
|
352 | 0 | return &(pln->super.super); |
353 | | |
354 | 0 | nada: |
355 | 0 | X(ifree0)(bufs); |
356 | 0 | X(plan_destroy_internal)(cldrest); |
357 | 0 | X(plan_destroy_internal)(cldcpy); |
358 | 0 | X(plan_destroy_internal)(cld); |
359 | 0 | return (plan *) 0; |
360 | 0 | } |
361 | | |
362 | | static solver *mksolver(size_t maxnbuf_ndx) |
363 | 2 | { |
364 | 2 | static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 }; |
365 | 2 | S *slv = MKSOLVER(S, &sadt); |
366 | 2 | slv->maxnbuf_ndx = maxnbuf_ndx; |
367 | 2 | return &(slv->super); |
368 | 2 | } |
369 | | |
370 | | void X(rdft2_buffered_register)(planner *p) |
371 | 1 | { |
372 | 1 | size_t i; |
373 | 3 | for (i = 0; i < NELEM(maxnbufs); ++i) |
374 | 2 | REGISTER_SOLVER(p, mksolver(i)); |
375 | 1 | } |