/src/fftw3/rdft/vrank3-transpose.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2003, 2007-14 Matteo Frigo |
3 | | * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology |
4 | | * |
5 | | * This program is free software; you can redistribute it and/or modify |
6 | | * it under the terms of the GNU General Public License as published by |
7 | | * the Free Software Foundation; either version 2 of the License, or |
8 | | * (at your option) any later version. |
9 | | * |
10 | | * This program is distributed in the hope that it will be useful, |
11 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | * GNU General Public License for more details. |
14 | | * |
15 | | * You should have received a copy of the GNU General Public License |
16 | | * along with this program; if not, write to the Free Software |
17 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
18 | | * |
19 | | */ |
20 | | |
21 | | |
22 | | /* rank-0, vector-rank-3, non-square in-place transposition |
23 | | (see rank0.c for square transposition) */ |
24 | | |
25 | | #include "rdft/rdft.h" |
26 | | |
27 | | #ifdef HAVE_STRING_H |
28 | | #include <string.h> /* for memcpy() */ |
29 | | #endif |
30 | | |
31 | | struct P_s; |
32 | | |
33 | | typedef struct { |
34 | | rdftapply apply; |
35 | | int (*applicable)(const problem_rdft *p, planner *plnr, |
36 | | int dim0, int dim1, int dim2, INT *nbuf); |
37 | | int (*mkcldrn)(const problem_rdft *p, planner *plnr, struct P_s *ego); |
38 | | const char *nam; |
39 | | } transpose_adt; |
40 | | |
41 | | typedef struct { |
42 | | solver super; |
43 | | const transpose_adt *adt; |
44 | | } S; |
45 | | |
46 | | typedef struct P_s { |
47 | | plan_rdft super; |
48 | | INT n, m, vl; /* transpose n x m matrix of vl-tuples */ |
49 | | INT nbuf; /* buffer size */ |
50 | | INT nd, md, d; /* transpose-gcd params */ |
51 | | INT nc, mc; /* transpose-cut params */ |
52 | | plan *cld1, *cld2, *cld3; /* children, null if unused */ |
53 | | const S *slv; |
54 | | } P; |
55 | | |
56 | | |
57 | | /*************************************************************************/ |
58 | | /* some utilities for the solvers */ |
59 | | |
60 | | static INT gcd(INT a, INT b) |
61 | 26 | { |
62 | 26 | INT r; |
63 | 26 | do { |
64 | 26 | r = a % b; |
65 | 26 | a = b; |
66 | 26 | b = r; |
67 | 26 | } while (r != 0); |
68 | | |
69 | 26 | return a; |
70 | 26 | } |
71 | | |
72 | | /* whether we can transpose with one of our routines expecting |
73 | | contiguous Ntuples */ |
74 | | static int Ntuple_transposable(const iodim *a, const iodim *b, INT vl, INT vs) |
75 | 393 | { |
76 | 393 | return (vs == 1 && b->is == vl && a->os == vl && |
77 | 393 | ((a->n == b->n && a->is == b->os |
78 | 393 | && a->is >= b->n && a->is % vl == 0) |
79 | 393 | || (a->is == b->n * vl && b->os == a->n * vl))); |
80 | 393 | } |
81 | | |
82 | | /* check whether a and b correspond to the first and second dimensions |
83 | | of a transpose of tuples with vector length = vl, stride = vs. */ |
84 | | static int transposable(const iodim *a, const iodim *b, INT vl, INT vs) |
85 | 471 | { |
86 | 471 | return ((a->n == b->n && a->os == b->is && a->is == b->os) |
87 | 471 | || Ntuple_transposable(a, b, vl, vs)); |
88 | 471 | } |
89 | | |
90 | | static int pickdim(const tensor *s, int *pdim0, int *pdim1, int *pdim2) |
91 | 471 | { |
92 | 471 | int dim0, dim1; |
93 | | |
94 | 471 | for (dim0 = 0; dim0 < s->rnk; ++dim0) |
95 | 942 | for (dim1 = 0; dim1 < s->rnk; ++dim1) { |
96 | 942 | int dim2 = 3 - dim0 - dim1; |
97 | 942 | if (dim0 == dim1) continue; |
98 | 471 | if ((s->rnk == 2 || s->dims[dim2].is == s->dims[dim2].os) |
99 | 471 | && transposable(s->dims + dim0, s->dims + dim1, |
100 | 471 | s->rnk == 2 ? (INT)1 : s->dims[dim2].n, |
101 | 471 | s->rnk == 2 ? (INT)1 : s->dims[dim2].is)) { |
102 | 471 | *pdim0 = dim0; |
103 | 471 | *pdim1 = dim1; |
104 | 471 | *pdim2 = dim2; |
105 | 471 | return 1; |
106 | 471 | } |
107 | 471 | } |
108 | 0 | return 0; |
109 | 471 | } |
110 | | |
111 | 0 | #define MINBUFDIV 9 /* min factor by which buffer is smaller than data */ |
112 | 0 | #define MAXBUF 65536 /* maximum non-ugly buffer */ |
113 | | |
114 | | /* generic applicability function */ |
115 | | static int applicable(const solver *ego_, const problem *p_, planner *plnr, |
116 | | int *dim0, int *dim1, int *dim2, INT *nbuf) |
117 | 981 | { |
118 | 981 | const S *ego = (const S *) ego_; |
119 | 981 | const problem_rdft *p = (const problem_rdft *) p_; |
120 | | |
121 | 981 | return (1 |
122 | 981 | && p->I == p->O |
123 | 981 | && p->sz->rnk == 0 |
124 | 981 | && (p->vecsz->rnk == 2 || p->vecsz->rnk == 3) |
125 | | |
126 | 981 | && pickdim(p->vecsz, dim0, dim1, dim2) |
127 | | |
128 | | /* UGLY if vecloop in wrong order for locality */ |
129 | 981 | && (!NO_UGLYP(plnr) || |
130 | 471 | p->vecsz->rnk == 2 || |
131 | 471 | X(iabs)(p->vecsz->dims[*dim2].is) |
132 | 471 | < X(imax)(X(iabs)(p->vecsz->dims[*dim0].is), |
133 | 471 | X(iabs)(p->vecsz->dims[*dim0].os))) |
134 | | |
135 | | /* SLOW if non-square */ |
136 | 981 | && (!NO_SLOWP(plnr) |
137 | 471 | || p->vecsz->dims[*dim0].n == p->vecsz->dims[*dim1].n) |
138 | | |
139 | 981 | && ego->adt->applicable(p, plnr, *dim0,*dim1,*dim2,nbuf) |
140 | | |
141 | | /* buffers too big are UGLY */ |
142 | 981 | && ((!NO_UGLYP(plnr) && !CONSERVE_MEMORYP(plnr)) |
143 | 0 | || *nbuf <= MAXBUF |
144 | 0 | || *nbuf * MINBUFDIV <= X(tensor_sz)(p->vecsz)) |
145 | 981 | ); |
146 | 981 | } |
147 | | |
148 | | static void get_transpose_vec(const problem_rdft *p, int dim2, INT *vl,INT *vs) |
149 | 78 | { |
150 | 78 | if (p->vecsz->rnk == 2) { |
151 | 0 | *vl = 1; *vs = 1; |
152 | 0 | } |
153 | 78 | else { |
154 | 78 | *vl = p->vecsz->dims[dim2].n; |
155 | 78 | *vs = p->vecsz->dims[dim2].is; /* == os */ |
156 | 78 | } |
157 | 78 | } |
158 | | |
159 | | /*************************************************************************/ |
160 | | /* Cache-oblivious in-place transpose of non-square matrices, based |
161 | | on transposes of blocks given by the gcd of the dimensions. |
162 | | |
163 | | This algorithm is related to algorithm V5 from Murray Dow, |
164 | | "Transposing a matrix on a vector computer," Parallel Computing 21 |
165 | | (12), 1997-2005 (1995), with the modification that we use |
166 | | cache-oblivious recursive transpose subroutines (and we derived |
167 | | it independently). |
168 | | |
169 | | For a p x q matrix, this requires scratch space equal to the size |
170 | | of the matrix divided by gcd(p,q). Alternatively, see also the |
171 | | "cut" algorithm below, if |p-q| * gcd(p,q) < max(p,q). */ |
172 | | |
173 | | static void apply_gcd(const plan *ego_, R *I, R *O) |
174 | 0 | { |
175 | 0 | const P *ego = (const P *) ego_; |
176 | 0 | INT n = ego->nd, m = ego->md, d = ego->d; |
177 | 0 | INT vl = ego->vl; |
178 | 0 | R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS); |
179 | 0 | INT i, num_el = n*m*d*vl; |
180 | |
|
181 | 0 | A(ego->n == n * d && ego->m == m * d); |
182 | 0 | UNUSED(O); |
183 | | |
184 | | /* Transpose the matrix I in-place, where I is an (n*d) x (m*d) matrix |
185 | | of vl-tuples and buf contains n*m*d*vl elements. |
186 | | |
187 | | In general, to transpose a p x q matrix, you should call this |
188 | | routine with d = gcd(p, q), n = p/d, and m = q/d. */ |
189 | |
|
190 | 0 | A(n > 0 && m > 0 && vl > 0); |
191 | 0 | A(d > 1); |
192 | | |
193 | | /* treat as (d x n) x (d' x m) matrix. (d' = d) */ |
194 | | |
195 | | /* First, transpose d x (n x d') x m to d x (d' x n) x m, |
196 | | using the buf matrix. This consists of d transposes |
197 | | of contiguous n x d' matrices of m-tuples. */ |
198 | 0 | if (n > 1) { |
199 | 0 | rdftapply cldapply = ((plan_rdft *) ego->cld1)->apply; |
200 | 0 | for (i = 0; i < d; ++i) { |
201 | 0 | cldapply(ego->cld1, I + i*num_el, buf); |
202 | 0 | memcpy(I + i*num_el, buf, num_el*sizeof(R)); |
203 | 0 | } |
204 | 0 | } |
205 | | |
206 | | /* Now, transpose (d x d') x (n x m) to (d' x d) x (n x m), which |
207 | | is a square in-place transpose of n*m-tuples: */ |
208 | 0 | { |
209 | 0 | rdftapply cldapply = ((plan_rdft *) ego->cld2)->apply; |
210 | 0 | cldapply(ego->cld2, I, I); |
211 | 0 | } |
212 | | |
213 | | /* Finally, transpose d' x ((d x n) x m) to d' x (m x (d x n)), |
214 | | using the buf matrix. This consists of d' transposes |
215 | | of contiguous d*n x m matrices. */ |
216 | 0 | if (m > 1) { |
217 | 0 | rdftapply cldapply = ((plan_rdft *) ego->cld3)->apply; |
218 | 0 | for (i = 0; i < d; ++i) { |
219 | 0 | cldapply(ego->cld3, I + i*num_el, buf); |
220 | 0 | memcpy(I + i*num_el, buf, num_el*sizeof(R)); |
221 | 0 | } |
222 | 0 | } |
223 | |
|
224 | 0 | X(ifree)(buf); |
225 | 0 | } |
226 | | |
227 | | static int applicable_gcd(const problem_rdft *p, planner *plnr, |
228 | | int dim0, int dim1, int dim2, INT *nbuf) |
229 | 26 | { |
230 | 26 | INT n = p->vecsz->dims[dim0].n; |
231 | 26 | INT m = p->vecsz->dims[dim1].n; |
232 | 26 | INT d, vl, vs; |
233 | 26 | get_transpose_vec(p, dim2, &vl, &vs); |
234 | 26 | d = gcd(n, m); |
235 | 26 | *nbuf = n * (m / d) * vl; |
236 | 26 | return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts */ |
237 | 26 | && n != m |
238 | 26 | && d > 1 |
239 | 26 | && Ntuple_transposable(p->vecsz->dims + dim0, |
240 | 0 | p->vecsz->dims + dim1, |
241 | 0 | vl, vs)); |
242 | 26 | } |
243 | | |
244 | | static int mkcldrn_gcd(const problem_rdft *p, planner *plnr, P *ego) |
245 | 0 | { |
246 | 0 | INT n = ego->nd, m = ego->md, d = ego->d; |
247 | 0 | INT vl = ego->vl; |
248 | 0 | R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS); |
249 | 0 | INT num_el = n*m*d*vl; |
250 | |
|
251 | 0 | if (n > 1) { |
252 | 0 | ego->cld1 = X(mkplan_d)(plnr, |
253 | 0 | X(mkproblem_rdft_0_d)( |
254 | 0 | X(mktensor_3d)(n, d*m*vl, m*vl, |
255 | 0 | d, m*vl, n*m*vl, |
256 | 0 | m*vl, 1, 1), |
257 | 0 | TAINT(p->I, num_el), buf)); |
258 | 0 | if (!ego->cld1) |
259 | 0 | goto nada; |
260 | 0 | X(ops_madd)(d, &ego->cld1->ops, &ego->super.super.ops, |
261 | 0 | &ego->super.super.ops); |
262 | 0 | ego->super.super.ops.other += num_el * d * 2; |
263 | 0 | } |
264 | | |
265 | 0 | ego->cld2 = X(mkplan_d)(plnr, |
266 | 0 | X(mkproblem_rdft_0_d)( |
267 | 0 | X(mktensor_3d)(d, d*n*m*vl, n*m*vl, |
268 | 0 | d, n*m*vl, d*n*m*vl, |
269 | 0 | n*m*vl, 1, 1), |
270 | 0 | p->I, p->I)); |
271 | 0 | if (!ego->cld2) |
272 | 0 | goto nada; |
273 | 0 | X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops); |
274 | |
|
275 | 0 | if (m > 1) { |
276 | 0 | ego->cld3 = X(mkplan_d)(plnr, |
277 | 0 | X(mkproblem_rdft_0_d)( |
278 | 0 | X(mktensor_3d)(d*n, m*vl, vl, |
279 | 0 | m, vl, d*n*vl, |
280 | 0 | vl, 1, 1), |
281 | 0 | TAINT(p->I, num_el), buf)); |
282 | 0 | if (!ego->cld3) |
283 | 0 | goto nada; |
284 | 0 | X(ops_madd2)(d, &ego->cld3->ops, &ego->super.super.ops); |
285 | 0 | ego->super.super.ops.other += num_el * d * 2; |
286 | 0 | } |
287 | | |
288 | 0 | X(ifree)(buf); |
289 | 0 | return 1; |
290 | | |
291 | 0 | nada: |
292 | 0 | X(ifree)(buf); |
293 | 0 | return 0; |
294 | 0 | } |
295 | | |
296 | | static const transpose_adt adt_gcd = |
297 | | { |
298 | | apply_gcd, applicable_gcd, mkcldrn_gcd, |
299 | | "rdft-transpose-gcd" |
300 | | }; |
301 | | |
302 | | /*************************************************************************/ |
303 | | /* Cache-oblivious in-place transpose of non-square n x m matrices, |
304 | | based on transposing a sub-matrix first and then transposing the |
305 | | remainder(s) with the help of a buffer. See also transpose-gcd, |
306 | | above, if gcd(n,m) is large. |
307 | | |
308 | | This algorithm is related to algorithm V3 from Murray Dow, |
309 | | "Transposing a matrix on a vector computer," Parallel Computing 21 |
310 | | (12), 1997-2005 (1995), with the modifications that we use |
311 | | cache-oblivious recursive transpose subroutines and we have the |
312 | | generalization for large |n-m| below. |
313 | | |
314 | | The best case, and the one described by Dow, is for |n-m| small, in |
315 | | which case we transpose a square sub-matrix of size min(n,m), |
316 | | handling the remainder via a buffer. This requires scratch space |
317 | | equal to the size of the matrix times |n-m| / max(n,m). |
318 | | |
319 | | As a generalization when |n-m| is not small, we also support cutting |
320 | | *both* dimensions to an nc x mc matrix which is *not* necessarily |
321 | | square, but has a large gcd (and can therefore use transpose-gcd). |
322 | | */ |
323 | | |
324 | | static void apply_cut(const plan *ego_, R *I, R *O) |
325 | 0 | { |
326 | 0 | const P *ego = (const P *) ego_; |
327 | 0 | INT n = ego->n, m = ego->m, nc = ego->nc, mc = ego->mc, vl = ego->vl; |
328 | 0 | INT i; |
329 | 0 | R *buf1 = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS); |
330 | 0 | UNUSED(O); |
331 | |
|
332 | 0 | if (m > mc) { |
333 | 0 | ((plan_rdft *) ego->cld1)->apply(ego->cld1, I + mc*vl, buf1); |
334 | 0 | for (i = 0; i < nc; ++i) |
335 | 0 | memmove(I + (mc*vl) * i, I + (m*vl) * i, sizeof(R) * (mc*vl)); |
336 | 0 | } |
337 | |
|
338 | 0 | ((plan_rdft *) ego->cld2)->apply(ego->cld2, I, I); /* nc x mc transpose */ |
339 | | |
340 | 0 | if (n > nc) { |
341 | 0 | R *buf2 = buf1 + (m-mc)*(nc*vl); /* FIXME: force better alignment? */ |
342 | 0 | memcpy(buf2, I + nc*(m*vl), (n-nc)*(m*vl)*sizeof(R)); |
343 | 0 | for (i = mc-1; i >= 0; --i) |
344 | 0 | memmove(I + (n*vl) * i, I + (nc*vl) * i, sizeof(R) * (n*vl)); |
345 | 0 | ((plan_rdft *) ego->cld3)->apply(ego->cld3, buf2, I + nc*vl); |
346 | 0 | } |
347 | |
|
348 | 0 | if (m > mc) { |
349 | 0 | if (n > nc) |
350 | 0 | for (i = mc; i < m; ++i) |
351 | 0 | memcpy(I + i*(n*vl), buf1 + (i-mc)*(nc*vl), |
352 | 0 | (nc*vl)*sizeof(R)); |
353 | 0 | else |
354 | 0 | memcpy(I + mc*(n*vl), buf1, (m-mc)*(n*vl)*sizeof(R)); |
355 | 0 | } |
356 | |
|
357 | 0 | X(ifree)(buf1); |
358 | 0 | } |
359 | | |
360 | | /* only cut one dimension if the resulting buffer is small enough */ |
361 | | static int cut1(INT n, INT m, INT vl) |
362 | 0 | { |
363 | 0 | return (X(imax)(n,m) >= X(iabs)(n-m) * MINBUFDIV |
364 | 0 | || X(imin)(n,m) * X(iabs)(n-m) * vl <= MAXBUF); |
365 | 0 | } |
366 | | |
367 | 0 | #define CUT_NSRCH 32 /* range of sizes to search for possible cuts */ |
368 | | |
369 | | static int applicable_cut(const problem_rdft *p, planner *plnr, |
370 | | int dim0, int dim1, int dim2, INT *nbuf) |
371 | 26 | { |
372 | 26 | INT n = p->vecsz->dims[dim0].n; |
373 | 26 | INT m = p->vecsz->dims[dim1].n; |
374 | 26 | INT vl, vs; |
375 | 26 | get_transpose_vec(p, dim2, &vl, &vs); |
376 | 26 | *nbuf = 0; /* always small enough to be non-UGLY (?) */ |
377 | 26 | A(MINBUFDIV <= CUT_NSRCH); /* assumed to avoid inf. loops below */ |
378 | 26 | return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts? */ |
379 | 26 | && n != m |
380 | | |
381 | | /* Don't call transpose-cut recursively (avoid inf. loops): |
382 | | the non-square sub-transpose produced when !cut1 |
383 | | should always have gcd(n,m) >= min(CUT_NSRCH,n,m), |
384 | | for which transpose-gcd is applicable */ |
385 | 26 | && (cut1(n, m, vl) |
386 | 0 | || gcd(n, m) < X(imin)(MINBUFDIV, X(imin)(n,m))) |
387 | | |
388 | 26 | && Ntuple_transposable(p->vecsz->dims + dim0, |
389 | 0 | p->vecsz->dims + dim1, |
390 | 0 | vl, vs)); |
391 | 26 | } |
392 | | |
393 | | static int mkcldrn_cut(const problem_rdft *p, planner *plnr, P *ego) |
394 | 0 | { |
395 | 0 | INT n = ego->n, m = ego->m, nc, mc; |
396 | 0 | INT vl = ego->vl; |
397 | 0 | R *buf; |
398 | | |
399 | | /* pick the "best" cut */ |
400 | 0 | if (cut1(n, m, vl)) { |
401 | 0 | nc = mc = X(imin)(n,m); |
402 | 0 | } |
403 | 0 | else { |
404 | 0 | INT dc, ns, ms; |
405 | 0 | dc = gcd(m, n); nc = n; mc = m; |
406 | | /* search for cut with largest gcd |
407 | | (TODO: different optimality criteria? different search range?) */ |
408 | 0 | for (ms = m; ms > 0 && ms > m - CUT_NSRCH; --ms) { |
409 | 0 | for (ns = n; ns > 0 && ns > n - CUT_NSRCH; --ns) { |
410 | 0 | INT ds = gcd(ms, ns); |
411 | 0 | if (ds > dc) { |
412 | 0 | dc = ds; nc = ns; mc = ms; |
413 | 0 | if (dc == X(imin)(ns, ms)) |
414 | 0 | break; /* cannot get larger than this */ |
415 | 0 | } |
416 | 0 | } |
417 | 0 | if (dc == X(imin)(n, ms)) |
418 | 0 | break; /* cannot get larger than this */ |
419 | 0 | } |
420 | 0 | A(dc >= X(imin)(CUT_NSRCH, X(imin)(n, m))); |
421 | 0 | } |
422 | 0 | ego->nc = nc; |
423 | 0 | ego->mc = mc; |
424 | 0 | ego->nbuf = (m-mc)*(nc*vl) + (n-nc)*(m*vl); |
425 | |
|
426 | 0 | buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS); |
427 | |
|
428 | 0 | if (m > mc) { |
429 | 0 | ego->cld1 = X(mkplan_d)(plnr, |
430 | 0 | X(mkproblem_rdft_0_d)( |
431 | 0 | X(mktensor_3d)(nc, m*vl, vl, |
432 | 0 | m-mc, vl, nc*vl, |
433 | 0 | vl, 1, 1), |
434 | 0 | p->I + mc*vl, buf)); |
435 | 0 | if (!ego->cld1) |
436 | 0 | goto nada; |
437 | 0 | X(ops_add2)(&ego->cld1->ops, &ego->super.super.ops); |
438 | 0 | } |
439 | | |
440 | 0 | ego->cld2 = X(mkplan_d)(plnr, |
441 | 0 | X(mkproblem_rdft_0_d)( |
442 | 0 | X(mktensor_3d)(nc, mc*vl, vl, |
443 | 0 | mc, vl, nc*vl, |
444 | 0 | vl, 1, 1), |
445 | 0 | p->I, p->I)); |
446 | 0 | if (!ego->cld2) |
447 | 0 | goto nada; |
448 | 0 | X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops); |
449 | |
|
450 | 0 | if (n > nc) { |
451 | 0 | ego->cld3 = X(mkplan_d)(plnr, |
452 | 0 | X(mkproblem_rdft_0_d)( |
453 | 0 | X(mktensor_3d)(n-nc, m*vl, vl, |
454 | 0 | m, vl, n*vl, |
455 | 0 | vl, 1, 1), |
456 | 0 | buf + (m-mc)*(nc*vl), p->I + nc*vl)); |
457 | 0 | if (!ego->cld3) |
458 | 0 | goto nada; |
459 | 0 | X(ops_add2)(&ego->cld3->ops, &ego->super.super.ops); |
460 | 0 | } |
461 | | |
462 | | /* memcpy/memmove operations */ |
463 | 0 | ego->super.super.ops.other += 2 * vl * (nc*mc * ((m > mc) + (n > nc)) |
464 | 0 | + (n-nc)*m + (m-mc)*nc); |
465 | |
|
466 | 0 | X(ifree)(buf); |
467 | 0 | return 1; |
468 | | |
469 | 0 | nada: |
470 | 0 | X(ifree)(buf); |
471 | 0 | return 0; |
472 | 0 | } |
473 | | |
474 | | static const transpose_adt adt_cut = |
475 | | { |
476 | | apply_cut, applicable_cut, mkcldrn_cut, |
477 | | "rdft-transpose-cut" |
478 | | }; |
479 | | |
480 | | /*************************************************************************/ |
481 | | /* In-place transpose routine from TOMS, which follows the cycles of |
482 | | the permutation so that it writes to each location only once. |
483 | | Because of cache-line and other issues, however, this routine is |
484 | | typically much slower than transpose-gcd or transpose-cut, even |
485 | | though the latter do some extra writes. On the other hand, if the |
486 | | vector length is large then the TOMS routine is best. |
487 | | |
488 | | The TOMS routine also has the advantage of requiring less buffer |
489 | | space for the case of gcd(nx,ny) small. However, in this case it |
490 | | has been superseded by the combination of the generalized |
491 | | transpose-cut method with the transpose-gcd method, which can |
492 | | always transpose with buffers a small fraction of the array size |
493 | | regardless of gcd(nx,ny). */ |
494 | | |
495 | | /* |
496 | | * TOMS Transpose. Algorithm 513 (Revised version of algorithm 380). |
497 | | * |
498 | | * These routines do in-place transposes of arrays. |
499 | | * |
500 | | * [ Cate, E.G. and Twigg, D.W., ACM Transactions on Mathematical Software, |
501 | | * vol. 3, no. 1, 104-110 (1977) ] |
502 | | * |
503 | | * C version by Steven G. Johnson (February 1997). |
504 | | */ |
505 | | |
506 | | /* |
507 | | * "a" is a 1D array of length ny*nx*N which constains the nx x ny |
508 | | * matrix of N-tuples to be transposed. "a" is stored in row-major |
509 | | * order (last index varies fastest). move is a 1D array of length |
510 | | * move_size used to store information to speed up the process. The |
511 | | * value move_size=(ny+nx)/2 is recommended. buf should be an array |
512 | | * of length 2*N. |
513 | | * |
514 | | */ |
515 | | |
516 | | static void transpose_toms513(R *a, INT nx, INT ny, INT N, |
517 | | char *move, INT move_size, R *buf) |
518 | 0 | { |
519 | 0 | INT i, im, mn; |
520 | 0 | R *b, *c, *d; |
521 | 0 | INT ncount; |
522 | 0 | INT k; |
523 | | |
524 | | /* check arguments and initialize: */ |
525 | 0 | A(ny > 0 && nx > 0 && N > 0 && move_size > 0); |
526 | | |
527 | 0 | b = buf; |
528 | | |
529 | | /* Cate & Twigg have a special case for nx == ny, but we don't |
530 | | bother, since we already have special code for this case elsewhere. */ |
531 | |
|
532 | 0 | c = buf + N; |
533 | 0 | ncount = 2; /* always at least 2 fixed points */ |
534 | 0 | k = (mn = ny * nx) - 1; |
535 | | |
536 | 0 | for (i = 0; i < move_size; ++i) |
537 | 0 | move[i] = 0; |
538 | | |
539 | 0 | if (ny >= 3 && nx >= 3) |
540 | 0 | ncount += gcd(ny - 1, nx - 1) - 1; /* # fixed points */ |
541 | | |
542 | 0 | i = 1; |
543 | 0 | im = ny; |
544 | | |
545 | 0 | while (1) { |
546 | 0 | INT i1, i2, i1c, i2c; |
547 | 0 | INT kmi; |
548 | | |
549 | | /** Rearrange the elements of a loop |
550 | | and its companion loop: **/ |
551 | | |
552 | 0 | i1 = i; |
553 | 0 | kmi = k - i; |
554 | 0 | i1c = kmi; |
555 | 0 | switch (N) { |
556 | 0 | case 1: |
557 | 0 | b[0] = a[i1]; |
558 | 0 | c[0] = a[i1c]; |
559 | 0 | break; |
560 | 0 | case 2: |
561 | 0 | b[0] = a[2*i1]; |
562 | 0 | b[1] = a[2*i1+1]; |
563 | 0 | c[0] = a[2*i1c]; |
564 | 0 | c[1] = a[2*i1c+1]; |
565 | 0 | break; |
566 | 0 | default: |
567 | 0 | memcpy(b, &a[N * i1], N * sizeof(R)); |
568 | 0 | memcpy(c, &a[N * i1c], N * sizeof(R)); |
569 | 0 | } |
570 | 0 | while (1) { |
571 | 0 | i2 = ny * i1 - k * (i1 / nx); |
572 | 0 | i2c = k - i2; |
573 | 0 | if (i1 < move_size) |
574 | 0 | move[i1] = 1; |
575 | 0 | if (i1c < move_size) |
576 | 0 | move[i1c] = 1; |
577 | 0 | ncount += 2; |
578 | 0 | if (i2 == i) |
579 | 0 | break; |
580 | 0 | if (i2 == kmi) { |
581 | 0 | d = b; |
582 | 0 | b = c; |
583 | 0 | c = d; |
584 | 0 | break; |
585 | 0 | } |
586 | 0 | switch (N) { |
587 | 0 | case 1: |
588 | 0 | a[i1] = a[i2]; |
589 | 0 | a[i1c] = a[i2c]; |
590 | 0 | break; |
591 | 0 | case 2: |
592 | 0 | a[2*i1] = a[2*i2]; |
593 | 0 | a[2*i1+1] = a[2*i2+1]; |
594 | 0 | a[2*i1c] = a[2*i2c]; |
595 | 0 | a[2*i1c+1] = a[2*i2c+1]; |
596 | 0 | break; |
597 | 0 | default: |
598 | 0 | memcpy(&a[N * i1], &a[N * i2], |
599 | 0 | N * sizeof(R)); |
600 | 0 | memcpy(&a[N * i1c], &a[N * i2c], |
601 | 0 | N * sizeof(R)); |
602 | 0 | } |
603 | 0 | i1 = i2; |
604 | 0 | i1c = i2c; |
605 | 0 | } |
606 | 0 | switch (N) { |
607 | 0 | case 1: |
608 | 0 | a[i1] = b[0]; |
609 | 0 | a[i1c] = c[0]; |
610 | 0 | break; |
611 | 0 | case 2: |
612 | 0 | a[2*i1] = b[0]; |
613 | 0 | a[2*i1+1] = b[1]; |
614 | 0 | a[2*i1c] = c[0]; |
615 | 0 | a[2*i1c+1] = c[1]; |
616 | 0 | break; |
617 | 0 | default: |
618 | 0 | memcpy(&a[N * i1], b, N * sizeof(R)); |
619 | 0 | memcpy(&a[N * i1c], c, N * sizeof(R)); |
620 | 0 | } |
621 | 0 | if (ncount >= mn) |
622 | 0 | break; /* we've moved all elements */ |
623 | | |
624 | | /** Search for loops to rearrange: **/ |
625 | | |
626 | 0 | while (1) { |
627 | 0 | INT max = k - i; |
628 | 0 | ++i; |
629 | 0 | A(i <= max); |
630 | 0 | im += ny; |
631 | 0 | if (im > k) |
632 | 0 | im -= k; |
633 | 0 | i2 = im; |
634 | 0 | if (i == i2) |
635 | 0 | continue; |
636 | 0 | if (i >= move_size) { |
637 | 0 | while (i2 > i && i2 < max) { |
638 | 0 | i1 = i2; |
639 | 0 | i2 = ny * i1 - k * (i1 / nx); |
640 | 0 | } |
641 | 0 | if (i2 == i) |
642 | 0 | break; |
643 | 0 | } else if (!move[i]) |
644 | 0 | break; |
645 | 0 | } |
646 | 0 | } |
647 | 0 | } |
648 | | |
649 | | static void apply_toms513(const plan *ego_, R *I, R *O) |
650 | 0 | { |
651 | 0 | const P *ego = (const P *) ego_; |
652 | 0 | INT n = ego->n, m = ego->m; |
653 | 0 | INT vl = ego->vl; |
654 | 0 | R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS); |
655 | 0 | UNUSED(O); |
656 | 0 | transpose_toms513(I, n, m, vl, (char *) (buf + 2*vl), (n+m)/2, buf); |
657 | 0 | X(ifree)(buf); |
658 | 0 | } |
659 | | |
660 | | static int applicable_toms513(const problem_rdft *p, planner *plnr, |
661 | | int dim0, int dim1, int dim2, INT *nbuf) |
662 | 26 | { |
663 | 26 | INT n = p->vecsz->dims[dim0].n; |
664 | 26 | INT m = p->vecsz->dims[dim1].n; |
665 | 26 | INT vl, vs; |
666 | 26 | get_transpose_vec(p, dim2, &vl, &vs); |
667 | 26 | *nbuf = 2*vl |
668 | 26 | + ((n + m) / 2 * sizeof(char) + sizeof(R) - 1) / sizeof(R); |
669 | 26 | return (!NO_SLOWP(plnr) |
670 | 26 | && (vl > 8 || !NO_UGLYP(plnr)) /* UGLY for small vl */ |
671 | 26 | && n != m |
672 | 26 | && Ntuple_transposable(p->vecsz->dims + dim0, |
673 | 0 | p->vecsz->dims + dim1, |
674 | 0 | vl, vs)); |
675 | 26 | } |
676 | | |
677 | | static int mkcldrn_toms513(const problem_rdft *p, planner *plnr, P *ego) |
678 | 0 | { |
679 | 0 | UNUSED(p); UNUSED(plnr); |
680 | | /* heuristic so that TOMS algorithm is last resort for small vl */ |
681 | 0 | ego->super.super.ops.other += ego->n * ego->m * 2 * (ego->vl + 30); |
682 | 0 | return 1; |
683 | 0 | } |
684 | | |
685 | | static const transpose_adt adt_toms513 = |
686 | | { |
687 | | apply_toms513, applicable_toms513, mkcldrn_toms513, |
688 | | "rdft-transpose-toms513" |
689 | | }; |
690 | | |
691 | | /*-----------------------------------------------------------------------*/ |
692 | | /*-----------------------------------------------------------------------*/ |
693 | | /* generic stuff: */ |
694 | | |
695 | | static void awake(plan *ego_, enum wakefulness wakefulness) |
696 | 0 | { |
697 | 0 | P *ego = (P *) ego_; |
698 | 0 | X(plan_awake)(ego->cld1, wakefulness); |
699 | 0 | X(plan_awake)(ego->cld2, wakefulness); |
700 | 0 | X(plan_awake)(ego->cld3, wakefulness); |
701 | 0 | } |
702 | | |
703 | | static void print(const plan *ego_, printer *p) |
704 | 0 | { |
705 | 0 | const P *ego = (const P *) ego_; |
706 | 0 | p->print(p, "(%s-%Dx%D%v", ego->slv->adt->nam, |
707 | 0 | ego->n, ego->m, ego->vl); |
708 | 0 | if (ego->cld1) p->print(p, "%(%p%)", ego->cld1); |
709 | 0 | if (ego->cld2) p->print(p, "%(%p%)", ego->cld2); |
710 | 0 | if (ego->cld3) p->print(p, "%(%p%)", ego->cld3); |
711 | 0 | p->print(p, ")"); |
712 | 0 | } |
713 | | |
714 | | static void destroy(plan *ego_) |
715 | 0 | { |
716 | 0 | P *ego = (P *) ego_; |
717 | 0 | X(plan_destroy_internal)(ego->cld3); |
718 | 0 | X(plan_destroy_internal)(ego->cld2); |
719 | 0 | X(plan_destroy_internal)(ego->cld1); |
720 | 0 | } |
721 | | |
722 | | static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr) |
723 | 981 | { |
724 | 981 | const S *ego = (const S *) ego_; |
725 | 981 | const problem_rdft *p; |
726 | 981 | int dim0, dim1, dim2; |
727 | 981 | INT nbuf, vs; |
728 | 981 | P *pln; |
729 | | |
730 | 981 | static const plan_adt padt = { |
731 | 981 | X(rdft_solve), awake, print, destroy |
732 | 981 | }; |
733 | | |
734 | 981 | if (!applicable(ego_, p_, plnr, &dim0, &dim1, &dim2, &nbuf)) |
735 | 981 | return (plan *) 0; |
736 | | |
737 | 0 | p = (const problem_rdft *) p_; |
738 | 0 | pln = MKPLAN_RDFT(P, &padt, ego->adt->apply); |
739 | |
|
740 | 0 | pln->n = p->vecsz->dims[dim0].n; |
741 | 0 | pln->m = p->vecsz->dims[dim1].n; |
742 | 0 | get_transpose_vec(p, dim2, &pln->vl, &vs); |
743 | 0 | pln->nbuf = nbuf; |
744 | 0 | pln->d = gcd(pln->n, pln->m); |
745 | 0 | pln->nd = pln->n / pln->d; |
746 | 0 | pln->md = pln->m / pln->d; |
747 | 0 | pln->slv = ego; |
748 | |
|
749 | 0 | X(ops_zero)(&pln->super.super.ops); /* mkcldrn is responsible for ops */ |
750 | |
|
751 | 0 | pln->cld1 = pln->cld2 = pln->cld3 = 0; |
752 | 0 | if (!ego->adt->mkcldrn(p, plnr, pln)) { |
753 | 0 | X(plan_destroy_internal)(&(pln->super.super)); |
754 | 0 | return 0; |
755 | 0 | } |
756 | | |
757 | 0 | return &(pln->super.super); |
758 | 0 | } |
759 | | |
760 | | static solver *mksolver(const transpose_adt *adt) |
761 | 3 | { |
762 | 3 | static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 }; |
763 | 3 | S *slv = MKSOLVER(S, &sadt); |
764 | 3 | slv->adt = adt; |
765 | 3 | return &(slv->super); |
766 | 3 | } |
767 | | |
768 | | void X(rdft_vrank3_transpose_register)(planner *p) |
769 | 1 | { |
770 | 1 | unsigned i; |
771 | 1 | static const transpose_adt *const adts[] = { |
772 | 1 | &adt_gcd, &adt_cut, |
773 | 1 | &adt_toms513 |
774 | 1 | }; |
775 | 4 | for (i = 0; i < sizeof(adts) / sizeof(adts[0]); ++i) |
776 | 3 | REGISTER_SOLVER(p, mksolver(adts[i])); |
777 | 1 | } |