Coverage Report

Created: 2025-06-22 06:45

/src/fftw3/rdft/vrank3-transpose.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2003, 2007-14 Matteo Frigo
3
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4
 *
5
 * This program is free software; you can redistribute it and/or modify
6
 * it under the terms of the GNU General Public License as published by
7
 * the Free Software Foundation; either version 2 of the License, or
8
 * (at your option) any later version.
9
 *
10
 * This program is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
 * GNU General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU General Public License
16
 * along with this program; if not, write to the Free Software
17
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18
 *
19
 */
20
21
22
/* rank-0, vector-rank-3, non-square in-place transposition
23
   (see rank0.c for square transposition)  */
24
25
#include "rdft/rdft.h"
26
27
#ifdef HAVE_STRING_H
28
#include <string.h>   /* for memcpy() */
29
#endif
30
31
struct P_s;
32
33
typedef struct {
34
     rdftapply apply;
35
     int (*applicable)(const problem_rdft *p, planner *plnr,
36
           int dim0, int dim1, int dim2, INT *nbuf);
37
     int (*mkcldrn)(const problem_rdft *p, planner *plnr, struct P_s *ego);
38
     const char *nam;
39
} transpose_adt;
40
41
typedef struct {
42
     solver super;
43
     const transpose_adt *adt;
44
} S;
45
46
typedef struct P_s {
47
     plan_rdft super;
48
     INT n, m, vl; /* transpose n x m matrix of vl-tuples */
49
     INT nbuf; /* buffer size */
50
     INT nd, md, d; /* transpose-gcd params */
51
     INT nc, mc; /* transpose-cut params */
52
     plan *cld1, *cld2, *cld3; /* children, null if unused */
53
     const S *slv;
54
} P;
55
56
57
/*************************************************************************/
58
/* some utilities for the solvers */
59
60
static INT gcd(INT a, INT b)
61
26
{
62
26
     INT r;
63
26
     do {
64
26
    r = a % b;
65
26
    a = b;
66
26
    b = r;
67
26
     } while (r != 0);
68
     
69
26
     return a;
70
26
}
71
72
/* whether we can transpose with one of our routines expecting
73
   contiguous Ntuples */
74
static int Ntuple_transposable(const iodim *a, const iodim *b, INT vl, INT vs)
75
393
{
76
393
     return (vs == 1 && b->is == vl && a->os == vl &&
77
393
       ((a->n == b->n && a->is == b->os
78
393
         && a->is >= b->n && a->is % vl == 0)
79
393
        || (a->is == b->n * vl && b->os == a->n * vl)));
80
393
}
81
82
/* check whether a and b correspond to the first and second dimensions
83
   of a transpose of tuples with vector length = vl, stride = vs. */
84
static int transposable(const iodim *a, const iodim *b, INT vl, INT vs)
85
471
{
86
471
     return ((a->n == b->n && a->os == b->is && a->is == b->os)
87
471
             || Ntuple_transposable(a, b, vl, vs));
88
471
}
89
90
static int pickdim(const tensor *s, int *pdim0, int *pdim1, int *pdim2)
91
471
{
92
471
     int dim0, dim1;
93
94
471
     for (dim0 = 0; dim0 < s->rnk; ++dim0)
95
942
          for (dim1 = 0; dim1 < s->rnk; ++dim1) {
96
942
         int dim2 = 3 - dim0 - dim1;
97
942
         if (dim0 == dim1) continue;
98
471
               if ((s->rnk == 2 || s->dims[dim2].is == s->dims[dim2].os)
99
471
       && transposable(s->dims + dim0, s->dims + dim1, 
100
471
           s->rnk == 2 ? (INT)1 : s->dims[dim2].n,
101
471
           s->rnk == 2 ? (INT)1 : s->dims[dim2].is)) {
102
471
                    *pdim0 = dim0;
103
471
                    *pdim1 = dim1;
104
471
        *pdim2 = dim2;
105
471
                    return 1;
106
471
               }
107
471
    }
108
0
     return 0;
109
471
}
110
111
0
#define MINBUFDIV 9 /* min factor by which buffer is smaller than data */
112
0
#define MAXBUF 65536 /* maximum non-ugly buffer */
113
114
/* generic applicability function */
115
static int applicable(const solver *ego_, const problem *p_, planner *plnr,
116
          int *dim0, int *dim1, int *dim2, INT *nbuf)
117
981
{
118
981
     const S *ego = (const S *) ego_;
119
981
     const problem_rdft *p = (const problem_rdft *) p_;
120
121
981
     return (1
122
981
       && p->I == p->O
123
981
       && p->sz->rnk == 0
124
981
       && (p->vecsz->rnk == 2 || p->vecsz->rnk == 3)
125
126
981
       && pickdim(p->vecsz, dim0, dim1, dim2)
127
128
       /* UGLY if vecloop in wrong order for locality */
129
981
       && (!NO_UGLYP(plnr) ||
130
471
     p->vecsz->rnk == 2 ||
131
471
     X(iabs)(p->vecsz->dims[*dim2].is)
132
471
     < X(imax)(X(iabs)(p->vecsz->dims[*dim0].is),
133
471
         X(iabs)(p->vecsz->dims[*dim0].os)))
134
135
       /* SLOW if non-square */
136
981
       && (!NO_SLOWP(plnr)
137
471
     || p->vecsz->dims[*dim0].n == p->vecsz->dims[*dim1].n)
138
          
139
981
       && ego->adt->applicable(p, plnr, *dim0,*dim1,*dim2,nbuf)
140
141
       /* buffers too big are UGLY */
142
981
       && ((!NO_UGLYP(plnr) && !CONSERVE_MEMORYP(plnr))
143
0
     || *nbuf <= MAXBUF
144
0
     || *nbuf * MINBUFDIV <= X(tensor_sz)(p->vecsz))
145
981
    );
146
981
}
147
148
static void get_transpose_vec(const problem_rdft *p, int dim2, INT *vl,INT *vs)
149
78
{
150
78
     if (p->vecsz->rnk == 2) {
151
0
    *vl = 1; *vs = 1;
152
0
     }
153
78
     else {
154
78
    *vl = p->vecsz->dims[dim2].n;
155
78
    *vs = p->vecsz->dims[dim2].is; /* == os */
156
78
     }  
157
78
}
158
159
/*************************************************************************/
160
/* Cache-oblivious in-place transpose of non-square matrices, based 
161
   on transposes of blocks given by the gcd of the dimensions.
162
163
   This algorithm is related to algorithm V5 from Murray Dow,
164
   "Transposing a matrix on a vector computer," Parallel Computing 21
165
   (12), 1997-2005 (1995), with the modification that we use
166
   cache-oblivious recursive transpose subroutines (and we derived
167
   it independently).
168
   
169
   For a p x q matrix, this requires scratch space equal to the size
170
   of the matrix divided by gcd(p,q).  Alternatively, see also the
171
   "cut" algorithm below, if |p-q| * gcd(p,q) < max(p,q). */
172
173
static void apply_gcd(const plan *ego_, R *I, R *O)
174
0
{
175
0
     const P *ego = (const P *) ego_;
176
0
     INT n = ego->nd, m = ego->md, d = ego->d;
177
0
     INT vl = ego->vl;
178
0
     R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
179
0
     INT i, num_el = n*m*d*vl;
180
181
0
     A(ego->n == n * d && ego->m == m * d);
182
0
     UNUSED(O);
183
184
     /* Transpose the matrix I in-place, where I is an (n*d) x (m*d) matrix
185
  of vl-tuples and buf contains n*m*d*vl elements.  
186
  
187
  In general, to transpose a p x q matrix, you should call this
188
  routine with d = gcd(p, q), n = p/d, and m = q/d.  */
189
190
0
     A(n > 0 && m > 0 && vl > 0);
191
0
     A(d > 1);
192
193
     /* treat as (d x n) x (d' x m) matrix.  (d' = d) */
194
     
195
     /* First, transpose d x (n x d') x m to d x (d' x n) x m,
196
  using the buf matrix.  This consists of d transposes
197
  of contiguous n x d' matrices of m-tuples. */
198
0
     if (n > 1) {
199
0
    rdftapply cldapply = ((plan_rdft *) ego->cld1)->apply;
200
0
    for (i = 0; i < d; ++i) {
201
0
         cldapply(ego->cld1, I + i*num_el, buf);
202
0
         memcpy(I + i*num_el, buf, num_el*sizeof(R));
203
0
    }
204
0
     }
205
     
206
     /* Now, transpose (d x d') x (n x m) to (d' x d) x (n x m), which
207
  is a square in-place transpose of n*m-tuples: */
208
0
     {
209
0
    rdftapply cldapply = ((plan_rdft *) ego->cld2)->apply;
210
0
    cldapply(ego->cld2, I, I);
211
0
     }
212
     
213
     /* Finally, transpose d' x ((d x n) x m) to d' x (m x (d x n)),
214
  using the buf matrix.  This consists of d' transposes
215
  of contiguous d*n x m matrices. */
216
0
     if (m > 1) {
217
0
    rdftapply cldapply = ((plan_rdft *) ego->cld3)->apply;
218
0
    for (i = 0; i < d; ++i) {
219
0
         cldapply(ego->cld3, I + i*num_el, buf);
220
0
         memcpy(I + i*num_el, buf, num_el*sizeof(R));
221
0
    }
222
0
     }
223
224
0
     X(ifree)(buf);
225
0
}
226
227
static int applicable_gcd(const problem_rdft *p, planner *plnr,
228
        int dim0, int dim1, int dim2, INT *nbuf)
229
26
{
230
26
     INT n = p->vecsz->dims[dim0].n;
231
26
     INT m = p->vecsz->dims[dim1].n;
232
26
     INT d, vl, vs;
233
26
     get_transpose_vec(p, dim2, &vl, &vs);
234
26
     d = gcd(n, m);
235
26
     *nbuf = n * (m / d) * vl;
236
26
     return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts */
237
26
       && n != m
238
26
       && d > 1
239
26
       && Ntuple_transposable(p->vecsz->dims + dim0,
240
0
            p->vecsz->dims + dim1,
241
0
            vl, vs));
242
26
}
243
244
static int mkcldrn_gcd(const problem_rdft *p, planner *plnr, P *ego)
245
0
{
246
0
     INT n = ego->nd, m = ego->md, d = ego->d;
247
0
     INT vl = ego->vl;
248
0
     R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
249
0
     INT num_el = n*m*d*vl;
250
251
0
     if (n > 1) {
252
0
    ego->cld1 = X(mkplan_d)(plnr,
253
0
          X(mkproblem_rdft_0_d)(
254
0
               X(mktensor_3d)(n, d*m*vl, m*vl,
255
0
                  d, m*vl, n*m*vl,
256
0
                  m*vl, 1, 1),
257
0
               TAINT(p->I, num_el), buf));
258
0
    if (!ego->cld1)
259
0
         goto nada;
260
0
    X(ops_madd)(d, &ego->cld1->ops, &ego->super.super.ops,
261
0
          &ego->super.super.ops);
262
0
    ego->super.super.ops.other += num_el * d * 2;
263
0
     }
264
265
0
     ego->cld2 = X(mkplan_d)(plnr,
266
0
           X(mkproblem_rdft_0_d)(
267
0
          X(mktensor_3d)(d, d*n*m*vl, n*m*vl,
268
0
             d, n*m*vl, d*n*m*vl,
269
0
             n*m*vl, 1, 1),
270
0
          p->I, p->I));
271
0
     if (!ego->cld2)
272
0
    goto nada;
273
0
     X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops);
274
275
0
     if (m > 1) {
276
0
    ego->cld3 = X(mkplan_d)(plnr,
277
0
          X(mkproblem_rdft_0_d)(
278
0
               X(mktensor_3d)(d*n, m*vl, vl,
279
0
                  m, vl, d*n*vl,
280
0
                  vl, 1, 1),
281
0
               TAINT(p->I, num_el), buf));
282
0
    if (!ego->cld3)
283
0
         goto nada;
284
0
    X(ops_madd2)(d, &ego->cld3->ops, &ego->super.super.ops);
285
0
    ego->super.super.ops.other += num_el * d * 2;
286
0
     }
287
288
0
     X(ifree)(buf);
289
0
     return 1;
290
291
0
 nada:
292
0
     X(ifree)(buf);
293
0
     return 0;
294
0
}
295
296
static const transpose_adt adt_gcd =
297
{
298
     apply_gcd, applicable_gcd, mkcldrn_gcd,
299
     "rdft-transpose-gcd"
300
};
301
302
/*************************************************************************/
303
/* Cache-oblivious in-place transpose of non-square n x m matrices,
304
   based on transposing a sub-matrix first and then transposing the
305
   remainder(s) with the help of a buffer.  See also transpose-gcd,
306
   above, if gcd(n,m) is large.
307
308
   This algorithm is related to algorithm V3 from Murray Dow,
309
   "Transposing a matrix on a vector computer," Parallel Computing 21
310
   (12), 1997-2005 (1995), with the modifications that we use
311
   cache-oblivious recursive transpose subroutines and we have the
312
   generalization for large |n-m| below.
313
314
   The best case, and the one described by Dow, is for |n-m| small, in
315
   which case we transpose a square sub-matrix of size min(n,m),
316
   handling the remainder via a buffer.  This requires scratch space
317
   equal to the size of the matrix times |n-m| / max(n,m).
318
319
   As a generalization when |n-m| is not small, we also support cutting
320
   *both* dimensions to an nc x mc matrix which is *not* necessarily
321
   square, but has a large gcd (and can therefore use transpose-gcd).
322
*/
323
324
static void apply_cut(const plan *ego_, R *I, R *O)
325
0
{
326
0
     const P *ego = (const P *) ego_;
327
0
     INT n = ego->n, m = ego->m, nc = ego->nc, mc = ego->mc, vl = ego->vl;
328
0
     INT i;
329
0
     R *buf1 = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
330
0
     UNUSED(O);
331
332
0
     if (m > mc) {
333
0
    ((plan_rdft *) ego->cld1)->apply(ego->cld1, I + mc*vl, buf1);
334
0
    for (i = 0; i < nc; ++i)
335
0
         memmove(I + (mc*vl) * i, I + (m*vl) * i, sizeof(R) * (mc*vl));
336
0
     }
337
338
0
     ((plan_rdft *) ego->cld2)->apply(ego->cld2, I, I); /* nc x mc transpose */
339
     
340
0
     if (n > nc) {
341
0
    R *buf2 = buf1 + (m-mc)*(nc*vl); /* FIXME: force better alignment? */
342
0
    memcpy(buf2, I + nc*(m*vl), (n-nc)*(m*vl)*sizeof(R));
343
0
    for (i = mc-1; i >= 0; --i)
344
0
         memmove(I + (n*vl) * i, I + (nc*vl) * i, sizeof(R) * (n*vl));
345
0
    ((plan_rdft *) ego->cld3)->apply(ego->cld3, buf2, I + nc*vl);
346
0
     }
347
348
0
     if (m > mc) {
349
0
    if (n > nc)
350
0
         for (i = mc; i < m; ++i)
351
0
        memcpy(I + i*(n*vl), buf1 + (i-mc)*(nc*vl),
352
0
         (nc*vl)*sizeof(R));
353
0
    else
354
0
         memcpy(I + mc*(n*vl), buf1, (m-mc)*(n*vl)*sizeof(R));
355
0
     }
356
357
0
     X(ifree)(buf1);
358
0
}
359
360
/* only cut one dimension if the resulting buffer is small enough */
361
static int cut1(INT n, INT m, INT vl)
362
0
{
363
0
     return (X(imax)(n,m) >= X(iabs)(n-m) * MINBUFDIV
364
0
       || X(imin)(n,m) * X(iabs)(n-m) * vl <= MAXBUF);
365
0
}
366
367
0
#define CUT_NSRCH 32 /* range of sizes to search for possible cuts */
368
369
static int applicable_cut(const problem_rdft *p, planner *plnr,
370
        int dim0, int dim1, int dim2, INT *nbuf)
371
26
{
372
26
     INT n = p->vecsz->dims[dim0].n;
373
26
     INT m = p->vecsz->dims[dim1].n;
374
26
     INT vl, vs;
375
26
     get_transpose_vec(p, dim2, &vl, &vs);
376
26
     *nbuf = 0; /* always small enough to be non-UGLY (?) */
377
26
     A(MINBUFDIV <= CUT_NSRCH); /* assumed to avoid inf. loops below */
378
26
     return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts? */
379
26
       && n != m
380
       
381
       /* Don't call transpose-cut recursively (avoid inf. loops):
382
          the non-square sub-transpose produced when !cut1
383
          should always have gcd(n,m) >= min(CUT_NSRCH,n,m),
384
          for which transpose-gcd is applicable */
385
26
       && (cut1(n, m, vl)
386
0
     || gcd(n, m) < X(imin)(MINBUFDIV, X(imin)(n,m)))
387
388
26
       && Ntuple_transposable(p->vecsz->dims + dim0,
389
0
            p->vecsz->dims + dim1,
390
0
            vl, vs));
391
26
}
392
393
static int mkcldrn_cut(const problem_rdft *p, planner *plnr, P *ego)
394
0
{
395
0
     INT n = ego->n, m = ego->m, nc, mc;
396
0
     INT vl = ego->vl;
397
0
     R *buf;
398
399
     /* pick the "best" cut */
400
0
     if (cut1(n, m, vl)) {
401
0
    nc = mc = X(imin)(n,m);
402
0
     }
403
0
     else {
404
0
    INT dc, ns, ms;
405
0
    dc = gcd(m, n); nc = n; mc = m;
406
    /* search for cut with largest gcd
407
       (TODO: different optimality criteria? different search range?) */
408
0
    for (ms = m; ms > 0 && ms > m - CUT_NSRCH; --ms) {
409
0
         for (ns = n; ns > 0 && ns > n - CUT_NSRCH; --ns) {
410
0
        INT ds = gcd(ms, ns);
411
0
        if (ds > dc) {
412
0
       dc = ds; nc = ns; mc = ms;
413
0
       if (dc == X(imin)(ns, ms))
414
0
            break; /* cannot get larger than this */
415
0
        }
416
0
         }
417
0
         if (dc == X(imin)(n, ms))
418
0
        break; /* cannot get larger than this */
419
0
    }
420
0
    A(dc >= X(imin)(CUT_NSRCH, X(imin)(n, m)));
421
0
     }
422
0
     ego->nc = nc;
423
0
     ego->mc = mc;
424
0
     ego->nbuf = (m-mc)*(nc*vl) + (n-nc)*(m*vl);
425
426
0
     buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
427
428
0
     if (m > mc) {
429
0
    ego->cld1 = X(mkplan_d)(plnr,
430
0
          X(mkproblem_rdft_0_d)(
431
0
               X(mktensor_3d)(nc, m*vl, vl,
432
0
                  m-mc, vl, nc*vl,
433
0
                  vl, 1, 1),
434
0
               p->I + mc*vl, buf));
435
0
    if (!ego->cld1)
436
0
         goto nada;
437
0
    X(ops_add2)(&ego->cld1->ops, &ego->super.super.ops);
438
0
     }
439
440
0
     ego->cld2 = X(mkplan_d)(plnr,
441
0
           X(mkproblem_rdft_0_d)(
442
0
          X(mktensor_3d)(nc, mc*vl, vl,
443
0
             mc, vl, nc*vl,
444
0
             vl, 1, 1),
445
0
          p->I, p->I));
446
0
     if (!ego->cld2)
447
0
    goto nada;
448
0
     X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops);
449
450
0
     if (n > nc) {
451
0
    ego->cld3 = X(mkplan_d)(plnr,
452
0
          X(mkproblem_rdft_0_d)(
453
0
               X(mktensor_3d)(n-nc, m*vl, vl,
454
0
                  m, vl, n*vl,
455
0
                  vl, 1, 1),
456
0
               buf + (m-mc)*(nc*vl), p->I + nc*vl));
457
0
    if (!ego->cld3)
458
0
         goto nada;
459
0
    X(ops_add2)(&ego->cld3->ops, &ego->super.super.ops);
460
0
     }
461
462
     /* memcpy/memmove operations */
463
0
     ego->super.super.ops.other += 2 * vl * (nc*mc * ((m > mc) + (n > nc))
464
0
               + (n-nc)*m + (m-mc)*nc);
465
466
0
     X(ifree)(buf);
467
0
     return 1;
468
469
0
 nada:
470
0
     X(ifree)(buf);
471
0
     return 0;
472
0
}
473
474
static const transpose_adt adt_cut =
475
{
476
     apply_cut, applicable_cut, mkcldrn_cut,
477
     "rdft-transpose-cut"
478
};
479
480
/*************************************************************************/
481
/* In-place transpose routine from TOMS, which follows the cycles of
482
   the permutation so that it writes to each location only once.
483
   Because of cache-line and other issues, however, this routine is
484
   typically much slower than transpose-gcd or transpose-cut, even
485
   though the latter do some extra writes.  On the other hand, if the
486
   vector length is large then the TOMS routine is best.
487
488
   The TOMS routine also has the advantage of requiring less buffer
489
   space for the case of gcd(nx,ny) small.  However, in this case it
490
   has been superseded by the combination of the generalized
491
   transpose-cut method with the transpose-gcd method, which can
492
   always transpose with buffers a small fraction of the array size
493
   regardless of gcd(nx,ny). */
494
495
/*
496
 * TOMS Transpose.  Algorithm 513 (Revised version of algorithm 380).
497
 * 
498
 * These routines do in-place transposes of arrays.
499
 * 
500
 * [ Cate, E.G. and Twigg, D.W., ACM Transactions on Mathematical Software, 
501
 *   vol. 3, no. 1, 104-110 (1977) ]
502
 * 
503
 * C version by Steven G. Johnson (February 1997).
504
 */
505
506
/*
507
 * "a" is a 1D array of length ny*nx*N which constains the nx x ny
508
 * matrix of N-tuples to be transposed.  "a" is stored in row-major
509
 * order (last index varies fastest).  move is a 1D array of length
510
 * move_size used to store information to speed up the process.  The
511
 * value move_size=(ny+nx)/2 is recommended.  buf should be an array
512
 * of length 2*N.
513
 * 
514
 */
515
516
static void transpose_toms513(R *a, INT nx, INT ny, INT N,
517
                              char *move, INT move_size, R *buf)
518
0
{
519
0
     INT i, im, mn;
520
0
     R *b, *c, *d;
521
0
     INT ncount;
522
0
     INT k;
523
     
524
     /* check arguments and initialize: */
525
0
     A(ny > 0 && nx > 0 && N > 0 && move_size > 0);
526
     
527
0
     b = buf;
528
     
529
     /* Cate & Twigg have a special case for nx == ny, but we don't
530
  bother, since we already have special code for this case elsewhere. */
531
532
0
     c = buf + N;
533
0
     ncount = 2;    /* always at least 2 fixed points */
534
0
     k = (mn = ny * nx) - 1;
535
     
536
0
     for (i = 0; i < move_size; ++i)
537
0
    move[i] = 0;
538
     
539
0
     if (ny >= 3 && nx >= 3)
540
0
    ncount += gcd(ny - 1, nx - 1) - 1; /* # fixed points */
541
     
542
0
     i = 1;
543
0
     im = ny;
544
     
545
0
     while (1) {
546
0
    INT i1, i2, i1c, i2c;
547
0
    INT kmi;
548
    
549
    /** Rearrange the elements of a loop
550
        and its companion loop: **/
551
    
552
0
    i1 = i;
553
0
    kmi = k - i;
554
0
    i1c = kmi;
555
0
    switch (N) {
556
0
        case 1:
557
0
       b[0] = a[i1];
558
0
       c[0] = a[i1c];
559
0
       break;
560
0
        case 2:
561
0
       b[0] = a[2*i1];
562
0
       b[1] = a[2*i1+1];
563
0
       c[0] = a[2*i1c];
564
0
       c[1] = a[2*i1c+1];
565
0
       break;
566
0
        default:
567
0
       memcpy(b, &a[N * i1], N * sizeof(R));
568
0
       memcpy(c, &a[N * i1c], N * sizeof(R));
569
0
    }
570
0
    while (1) {
571
0
         i2 = ny * i1 - k * (i1 / nx);
572
0
         i2c = k - i2;
573
0
         if (i1 < move_size)
574
0
        move[i1] = 1;
575
0
         if (i1c < move_size)
576
0
        move[i1c] = 1;
577
0
         ncount += 2;
578
0
         if (i2 == i)
579
0
        break;
580
0
         if (i2 == kmi) {
581
0
        d = b;
582
0
        b = c;
583
0
        c = d;
584
0
        break;
585
0
         }
586
0
         switch (N) {
587
0
       case 1:
588
0
      a[i1] = a[i2];
589
0
      a[i1c] = a[i2c];
590
0
      break;
591
0
       case 2:
592
0
      a[2*i1] = a[2*i2];
593
0
      a[2*i1+1] = a[2*i2+1];
594
0
      a[2*i1c] = a[2*i2c];
595
0
      a[2*i1c+1] = a[2*i2c+1];
596
0
      break;
597
0
       default:
598
0
      memcpy(&a[N * i1], &a[N * i2], 
599
0
             N * sizeof(R));
600
0
      memcpy(&a[N * i1c], &a[N * i2c], 
601
0
             N * sizeof(R));
602
0
         }
603
0
         i1 = i2;
604
0
         i1c = i2c;
605
0
    }
606
0
    switch (N) {
607
0
        case 1:
608
0
       a[i1] = b[0];
609
0
       a[i1c] = c[0];
610
0
       break;
611
0
        case 2:
612
0
       a[2*i1] = b[0];
613
0
       a[2*i1+1] = b[1];
614
0
       a[2*i1c] = c[0];
615
0
       a[2*i1c+1] = c[1];
616
0
       break;
617
0
        default:
618
0
       memcpy(&a[N * i1], b, N * sizeof(R));
619
0
       memcpy(&a[N * i1c], c, N * sizeof(R));
620
0
    }
621
0
    if (ncount >= mn)
622
0
         break; /* we've moved all elements */
623
    
624
    /** Search for loops to rearrange: **/
625
    
626
0
    while (1) {
627
0
         INT max = k - i;
628
0
         ++i;
629
0
         A(i <= max);
630
0
         im += ny;
631
0
         if (im > k)
632
0
        im -= k;
633
0
         i2 = im;
634
0
         if (i == i2)
635
0
        continue;
636
0
         if (i >= move_size) {
637
0
        while (i2 > i && i2 < max) {
638
0
       i1 = i2;
639
0
       i2 = ny * i1 - k * (i1 / nx);
640
0
        }
641
0
        if (i2 == i)
642
0
       break;
643
0
         } else if (!move[i])
644
0
        break;
645
0
    }
646
0
     }
647
0
}
648
649
static void apply_toms513(const plan *ego_, R *I, R *O)
650
0
{
651
0
     const P *ego = (const P *) ego_;
652
0
     INT n = ego->n, m = ego->m;
653
0
     INT vl = ego->vl;
654
0
     R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
655
0
     UNUSED(O);
656
0
     transpose_toms513(I, n, m, vl, (char *) (buf + 2*vl), (n+m)/2, buf);
657
0
     X(ifree)(buf);
658
0
}
659
660
static int applicable_toms513(const problem_rdft *p, planner *plnr,
661
         int dim0, int dim1, int dim2, INT *nbuf)
662
26
{
663
26
     INT n = p->vecsz->dims[dim0].n;
664
26
     INT m = p->vecsz->dims[dim1].n;
665
26
     INT vl, vs;
666
26
     get_transpose_vec(p, dim2, &vl, &vs);
667
26
     *nbuf = 2*vl 
668
26
    + ((n + m) / 2 * sizeof(char) + sizeof(R) - 1) / sizeof(R);
669
26
     return (!NO_SLOWP(plnr)
670
26
       && (vl > 8 || !NO_UGLYP(plnr)) /* UGLY for small vl */
671
26
       && n != m
672
26
       && Ntuple_transposable(p->vecsz->dims + dim0,
673
0
            p->vecsz->dims + dim1,
674
0
            vl, vs));
675
26
}
676
677
static int mkcldrn_toms513(const problem_rdft *p, planner *plnr, P *ego)
678
0
{
679
0
     UNUSED(p); UNUSED(plnr);
680
     /* heuristic so that TOMS algorithm is last resort for small vl */
681
0
     ego->super.super.ops.other += ego->n * ego->m * 2 * (ego->vl + 30);
682
0
     return 1;
683
0
}
684
685
static const transpose_adt adt_toms513 =
686
{
687
     apply_toms513, applicable_toms513, mkcldrn_toms513,
688
     "rdft-transpose-toms513"
689
};
690
691
/*-----------------------------------------------------------------------*/
692
/*-----------------------------------------------------------------------*/
693
/* generic stuff: */
694
695
static void awake(plan *ego_, enum wakefulness wakefulness)
696
0
{
697
0
     P *ego = (P *) ego_;
698
0
     X(plan_awake)(ego->cld1, wakefulness);
699
0
     X(plan_awake)(ego->cld2, wakefulness);
700
0
     X(plan_awake)(ego->cld3, wakefulness);
701
0
}
702
703
static void print(const plan *ego_, printer *p)
704
0
{
705
0
     const P *ego = (const P *) ego_;
706
0
     p->print(p, "(%s-%Dx%D%v", ego->slv->adt->nam,
707
0
        ego->n, ego->m, ego->vl);
708
0
     if (ego->cld1) p->print(p, "%(%p%)", ego->cld1);
709
0
     if (ego->cld2) p->print(p, "%(%p%)", ego->cld2);
710
0
     if (ego->cld3) p->print(p, "%(%p%)", ego->cld3);
711
0
     p->print(p, ")");
712
0
}
713
714
static void destroy(plan *ego_)
715
0
{
716
0
     P *ego = (P *) ego_;
717
0
     X(plan_destroy_internal)(ego->cld3);
718
0
     X(plan_destroy_internal)(ego->cld2);
719
0
     X(plan_destroy_internal)(ego->cld1);
720
0
}
721
722
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
723
981
{
724
981
     const S *ego = (const S *) ego_;
725
981
     const problem_rdft *p;
726
981
     int dim0, dim1, dim2;
727
981
     INT nbuf, vs;
728
981
     P *pln;
729
730
981
     static const plan_adt padt = {
731
981
    X(rdft_solve), awake, print, destroy
732
981
     };
733
734
981
     if (!applicable(ego_, p_, plnr, &dim0, &dim1, &dim2, &nbuf))
735
981
          return (plan *) 0;
736
737
0
     p = (const problem_rdft *) p_;
738
0
     pln = MKPLAN_RDFT(P, &padt, ego->adt->apply);
739
740
0
     pln->n = p->vecsz->dims[dim0].n;
741
0
     pln->m = p->vecsz->dims[dim1].n;
742
0
     get_transpose_vec(p, dim2, &pln->vl, &vs);
743
0
     pln->nbuf = nbuf;
744
0
     pln->d = gcd(pln->n, pln->m);
745
0
     pln->nd = pln->n / pln->d;
746
0
     pln->md = pln->m / pln->d;
747
0
     pln->slv = ego;
748
749
0
     X(ops_zero)(&pln->super.super.ops); /* mkcldrn is responsible for ops */
750
751
0
     pln->cld1 = pln->cld2 = pln->cld3 = 0;
752
0
     if (!ego->adt->mkcldrn(p, plnr, pln)) {
753
0
    X(plan_destroy_internal)(&(pln->super.super));
754
0
    return 0;
755
0
     }
756
757
0
     return &(pln->super.super);
758
0
}
759
760
static solver *mksolver(const transpose_adt *adt)
761
3
{
762
3
     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
763
3
     S *slv = MKSOLVER(S, &sadt);
764
3
     slv->adt = adt;
765
3
     return &(slv->super);
766
3
}
767
768
void X(rdft_vrank3_transpose_register)(planner *p)
769
1
{
770
1
     unsigned i;
771
1
     static const transpose_adt *const adts[] = {
772
1
    &adt_gcd, &adt_cut,
773
1
    &adt_toms513
774
1
     };
775
4
     for (i = 0; i < sizeof(adts) / sizeof(adts[0]); ++i)
776
3
          REGISTER_SOLVER(p, mksolver(adts[i]));
777
1
}