Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2003, 2007-14 Matteo Frigo |
3 | | * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology |
4 | | * |
5 | | * This program is free software; you can redistribute it and/or modify |
6 | | * it under the terms of the GNU General Public License as published by |
7 | | * the Free Software Foundation; either version 2 of the License, or |
8 | | * (at your option) any later version. |
9 | | * |
10 | | * This program is distributed in the hope that it will be useful, |
11 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | * GNU General Public License for more details. |
14 | | * |
15 | | * You should have received a copy of the GNU General Public License |
16 | | * along with this program; if not, write to the Free Software |
17 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
18 | | * |
19 | | */ |
20 | | |
21 | | #include "dft/dft.h" |
22 | | |
23 | | typedef struct { |
24 | | solver super; |
25 | | } S; |
26 | | |
27 | | typedef struct { |
28 | | plan_dft super; |
29 | | twid *td; |
30 | | INT n, is, os; |
31 | | } P; |
32 | | |
33 | | |
34 | | static void cdot(INT n, const E *x, const R *w, |
35 | | R *or0, R *oi0, R *or1, R *oi1) |
36 | 3.66k | { |
37 | 3.66k | INT i; |
38 | | |
39 | 3.66k | E rr = x[0], ri = 0, ir = x[1], ii = 0; |
40 | 3.66k | x += 2; |
41 | 81.0k | for (i = 1; i + i < n; ++i) { |
42 | 77.3k | rr += x[0] * w[0]; |
43 | 77.3k | ir += x[1] * w[0]; |
44 | 77.3k | ri += x[2] * w[1]; |
45 | 77.3k | ii += x[3] * w[1]; |
46 | 77.3k | x += 4; w += 2; |
47 | 77.3k | } |
48 | 3.66k | *or0 = rr + ii; |
49 | 3.66k | *oi0 = ir - ri; |
50 | 3.66k | *or1 = rr - ii; |
51 | 3.66k | *oi1 = ir + ri; |
52 | 3.66k | } |
53 | | |
54 | | static void hartley(INT n, const R *xr, const R *xi, INT xs, E *o, |
55 | | R *pr, R *pi) |
56 | 254 | { |
57 | 254 | INT i; |
58 | 254 | E sr, si; |
59 | 254 | o[0] = sr = xr[0]; o[1] = si = xi[0]; o += 2; |
60 | 3.91k | for (i = 1; i + i < n; ++i) { |
61 | 3.66k | sr += (o[0] = xr[i * xs] + xr[(n - i) * xs]); |
62 | 3.66k | si += (o[1] = xi[i * xs] + xi[(n - i) * xs]); |
63 | 3.66k | o[2] = xr[i * xs] - xr[(n - i) * xs]; |
64 | 3.66k | o[3] = xi[i * xs] - xi[(n - i) * xs]; |
65 | 3.66k | o += 4; |
66 | 3.66k | } |
67 | 254 | *pr = sr; |
68 | 254 | *pi = si; |
69 | 254 | } |
70 | | |
71 | | static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io) |
72 | 254 | { |
73 | 254 | const P *ego = (const P *) ego_; |
74 | 254 | INT i; |
75 | 254 | INT n = ego->n, is = ego->is, os = ego->os; |
76 | 254 | const R *W = ego->td->W; |
77 | 254 | E *buf; |
78 | 254 | size_t bufsz = n * 2 * sizeof(E); |
79 | | |
80 | 254 | BUF_ALLOC(E *, buf, bufsz); |
81 | 254 | hartley(n, ri, ii, is, buf, ro, io); |
82 | | |
83 | 3.91k | for (i = 1; i + i < n; ++i) { |
84 | 3.66k | cdot(n, buf, W, |
85 | 3.66k | ro + i * os, io + i * os, |
86 | 3.66k | ro + (n - i) * os, io + (n - i) * os); |
87 | 3.66k | W += n - 1; |
88 | 3.66k | } |
89 | | |
90 | 254 | BUF_FREE(buf, bufsz); |
91 | 254 | } |
92 | | |
93 | | static void awake(plan *ego_, enum wakefulness wakefulness) |
94 | 138 | { |
95 | 138 | P *ego = (P *) ego_; |
96 | 138 | static const tw_instr half_tw[] = { |
97 | 138 | { TW_HALF, 1, 0 }, |
98 | 138 | { TW_NEXT, 1, 0 } |
99 | 138 | }; |
100 | | |
101 | 138 | X(twiddle_awake)(wakefulness, &ego->td, half_tw, ego->n, ego->n, |
102 | 138 | (ego->n - 1) / 2); |
103 | 138 | } |
104 | | |
105 | | static void print(const plan *ego_, printer *p) |
106 | 0 | { |
107 | 0 | const P *ego = (const P *) ego_; |
108 | |
|
109 | 0 | p->print(p, "(dft-generic-%D)", ego->n); |
110 | 0 | } |
111 | | |
112 | | static int applicable(const solver *ego, const problem *p_, |
113 | | const planner *plnr) |
114 | 1.18k | { |
115 | 1.18k | const problem_dft *p = (const problem_dft *) p_; |
116 | 1.18k | UNUSED(ego); |
117 | | |
118 | 1.18k | return (1 |
119 | 1.18k | && p->sz->rnk == 1 |
120 | 1.18k | && p->vecsz->rnk == 0 |
121 | 1.18k | && (p->sz->dims[0].n % 2) == 1 |
122 | 1.18k | && CIMPLIES(NO_LARGE_GENERICP(plnr), p->sz->dims[0].n < GENERIC_MIN_BAD) |
123 | 1.18k | && CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > GENERIC_MAX_SLOW) |
124 | 1.18k | && X(is_prime)(p->sz->dims[0].n) |
125 | 1.18k | ); |
126 | 1.18k | } |
127 | | |
128 | | static plan *mkplan(const solver *ego, const problem *p_, planner *plnr) |
129 | 1.18k | { |
130 | 1.18k | const problem_dft *p; |
131 | 1.18k | P *pln; |
132 | 1.18k | INT n; |
133 | | |
134 | 1.18k | static const plan_adt padt = { |
135 | 1.18k | X(dft_solve), awake, print, X(plan_null_destroy) |
136 | 1.18k | }; |
137 | | |
138 | 1.18k | if (!applicable(ego, p_, plnr)) |
139 | 1.00k | return (plan *)0; |
140 | | |
141 | 185 | pln = MKPLAN_DFT(P, &padt, apply); |
142 | | |
143 | 185 | p = (const problem_dft *) p_; |
144 | 185 | pln->n = n = p->sz->dims[0].n; |
145 | 185 | pln->is = p->sz->dims[0].is; |
146 | 185 | pln->os = p->sz->dims[0].os; |
147 | 185 | pln->td = 0; |
148 | | |
149 | 185 | pln->super.super.ops.add = (n-1) * 5; |
150 | 185 | pln->super.super.ops.mul = 0; |
151 | 185 | pln->super.super.ops.fma = (n-1) * (n-1) ; |
152 | | #if 0 /* these are nice pipelined sequential loads and should cost nothing */ |
153 | | pln->super.super.ops.other = (n-1)*(4 + 1 + 2 * (n-1)); /* approximate */ |
154 | | #endif |
155 | | |
156 | 185 | return &(pln->super.super); |
157 | 1.18k | } |
158 | | |
159 | | static solver *mksolver(void) |
160 | 3 | { |
161 | 3 | static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 }; |
162 | 3 | S *slv = MKSOLVER(S, &sadt); |
163 | 3 | return &(slv->super); |
164 | 3 | } |
165 | | |
166 | | void X(dft_generic_register)(planner *p) |
167 | 1 | { |
168 | 1 | REGISTER_SOLVER(p, mksolver()); |
169 | 1 | } |