Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2003, 2007-14 Matteo Frigo |
3 | | * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology |
4 | | * |
5 | | * This program is free software; you can redistribute it and/or modify |
6 | | * it under the terms of the GNU General Public License as published by |
7 | | * the Free Software Foundation; either version 2 of the License, or |
8 | | * (at your option) any later version. |
9 | | * |
10 | | * This program is distributed in the hope that it will be useful, |
11 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | * GNU General Public License for more details. |
14 | | * |
15 | | * You should have received a copy of the GNU General Public License |
16 | | * along with this program; if not, write to the Free Software |
17 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
18 | | * |
19 | | */ |
20 | | |
21 | | |
22 | | /* plans for rank-0 RDFTs (copy operations) */ |
23 | | |
24 | | #include "rdft/rdft.h" |
25 | | |
26 | | #ifdef HAVE_STRING_H |
27 | | #include <string.h> /* for memcpy() */ |
28 | | #endif |
29 | | |
30 | 5.22k | #define MAXRNK 32 /* FIXME: should malloc() */ |
31 | | |
32 | | typedef struct { |
33 | | plan_rdft super; |
34 | | INT vl; |
35 | | int rnk; |
36 | | iodim d[MAXRNK]; |
37 | | const char *nam; |
38 | | } P; |
39 | | |
40 | | typedef struct { |
41 | | solver super; |
42 | | rdftapply apply; |
43 | | int (*applicable)(const P *pln, const problem_rdft *p); |
44 | | const char *nam; |
45 | | } S; |
46 | | |
47 | | /* copy up to MAXRNK dimensions from problem into plan. If a |
48 | | contiguous dimension exists, save its length in pln->vl */ |
49 | | static int fill_iodim(P *pln, const problem_rdft *p) |
50 | 3.54k | { |
51 | 3.54k | int i; |
52 | 3.54k | const tensor *vecsz = p->vecsz; |
53 | | |
54 | 3.54k | pln->vl = 1; |
55 | 3.54k | pln->rnk = 0; |
56 | 12.3k | for (i = 0; i < vecsz->rnk; ++i) { |
57 | | /* extract contiguous dimensions */ |
58 | 8.76k | if (pln->vl == 1 && |
59 | 8.76k | vecsz->dims[i].is == 1 && vecsz->dims[i].os == 1) |
60 | 3.54k | pln->vl = vecsz->dims[i].n; |
61 | 5.22k | else if (pln->rnk == MAXRNK) |
62 | 0 | return 0; |
63 | 5.22k | else |
64 | 5.22k | pln->d[pln->rnk++] = vecsz->dims[i]; |
65 | 8.76k | } |
66 | | |
67 | 3.54k | return 1; |
68 | 3.54k | } |
69 | | |
70 | | /* generic higher-rank copy routine, calls cpy2d() to do the real work */ |
71 | | static void copy(const iodim *d, int rnk, INT vl, |
72 | | R *I, R *O, |
73 | | cpy2d_func cpy2d) |
74 | 0 | { |
75 | 0 | A(rnk >= 2); |
76 | 0 | if (rnk == 2) |
77 | 0 | cpy2d(I, O, d[0].n, d[0].is, d[0].os, d[1].n, d[1].is, d[1].os, vl); |
78 | 0 | else { |
79 | 0 | INT i; |
80 | 0 | for (i = 0; i < d[0].n; ++i, I += d[0].is, O += d[0].os) |
81 | 0 | copy(d + 1, rnk - 1, vl, I, O, cpy2d); |
82 | 0 | } |
83 | 0 | } |
84 | | |
85 | | /* FIXME: should be more general */ |
86 | | static int transposep(const P *pln) |
87 | 515 | { |
88 | 515 | int i; |
89 | | |
90 | 559 | for (i = 0; i < pln->rnk - 2; ++i) |
91 | 44 | if (pln->d[i].is != pln->d[i].os) |
92 | 0 | return 0; |
93 | | |
94 | 515 | return (pln->d[i].n == pln->d[i+1].n && |
95 | 515 | pln->d[i].is == pln->d[i+1].os && |
96 | 515 | pln->d[i].os == pln->d[i+1].is); |
97 | 515 | } |
98 | | |
99 | | /* generic higher-rank transpose routine, calls transpose2d() to do |
100 | | * the real work */ |
101 | | static void transpose(const iodim *d, int rnk, INT vl, |
102 | | R *I, |
103 | | transpose_func transpose2d) |
104 | 0 | { |
105 | 0 | A(rnk >= 2); |
106 | 0 | if (rnk == 2) |
107 | 0 | transpose2d(I, d[0].n, d[0].is, d[0].os, vl); |
108 | 0 | else { |
109 | 0 | INT i; |
110 | 0 | for (i = 0; i < d[0].n; ++i, I += d[0].is) |
111 | 0 | transpose(d + 1, rnk - 1, vl, I, transpose2d); |
112 | 0 | } |
113 | 0 | } |
114 | | |
115 | | /**************************************************************/ |
116 | | /* rank 0,1,2, out of place, iterative */ |
117 | | static void apply_iter(const plan *ego_, R *I, R *O) |
118 | 81 | { |
119 | 81 | const P *ego = (const P *) ego_; |
120 | | |
121 | 81 | switch (ego->rnk) { |
122 | 15 | case 0: |
123 | 15 | X(cpy1d)(I, O, ego->vl, 1, 1, 1); |
124 | 15 | break; |
125 | 66 | case 1: |
126 | 66 | X(cpy1d)(I, O, |
127 | 66 | ego->d[0].n, ego->d[0].is, ego->d[0].os, |
128 | 66 | ego->vl); |
129 | 66 | break; |
130 | 0 | default: |
131 | 0 | copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_ci)); |
132 | 0 | break; |
133 | 81 | } |
134 | 81 | } |
135 | | |
136 | | static int applicable_iter(const P *pln, const problem_rdft *p) |
137 | 396 | { |
138 | 396 | UNUSED(pln); |
139 | 396 | return (p->I != p->O); |
140 | 396 | } |
141 | | |
142 | | /**************************************************************/ |
143 | | /* out of place, write contiguous output */ |
144 | | static void apply_cpy2dco(const plan *ego_, R *I, R *O) |
145 | 0 | { |
146 | 0 | const P *ego = (const P *) ego_; |
147 | 0 | copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_co)); |
148 | 0 | } |
149 | | |
150 | | static int applicable_cpy2dco(const P *pln, const problem_rdft *p) |
151 | 326 | { |
152 | 326 | int rnk = pln->rnk; |
153 | 326 | return (1 |
154 | 326 | && p->I != p->O |
155 | 326 | && rnk >= 2 |
156 | | |
157 | | /* must not duplicate apply_iter */ |
158 | 326 | && (X(iabs)(pln->d[rnk - 2].is) <= X(iabs)(pln->d[rnk - 1].is) |
159 | 11 | || |
160 | 11 | X(iabs)(pln->d[rnk - 2].os) <= X(iabs)(pln->d[rnk - 1].os)) |
161 | 326 | ); |
162 | 326 | } |
163 | | |
164 | | /**************************************************************/ |
165 | | /* out of place, tiled, no buffering */ |
166 | | static void apply_tiled(const plan *ego_, R *I, R *O) |
167 | 0 | { |
168 | 0 | const P *ego = (const P *) ego_; |
169 | 0 | copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_tiled)); |
170 | 0 | } |
171 | | |
172 | | static int applicable_tiled(const P *pln, const problem_rdft *p) |
173 | 652 | { |
174 | 652 | return (1 |
175 | 652 | && p->I != p->O |
176 | 652 | && pln->rnk >= 2 |
177 | | |
178 | | /* somewhat arbitrary */ |
179 | 652 | && X(compute_tilesz)(pln->vl, 1) > 4 |
180 | 652 | ); |
181 | 652 | } |
182 | | |
183 | | /**************************************************************/ |
184 | | /* out of place, tiled, with buffer */ |
185 | | static void apply_tiledbuf(const plan *ego_, R *I, R *O) |
186 | 0 | { |
187 | 0 | const P *ego = (const P *) ego_; |
188 | 0 | copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_tiledbuf)); |
189 | 0 | } |
190 | | |
191 | 1 | #define applicable_tiledbuf applicable_tiled |
192 | | |
193 | | /**************************************************************/ |
194 | | /* rank 0, out of place, using memcpy */ |
195 | | static void apply_memcpy(const plan *ego_, R *I, R *O) |
196 | 0 | { |
197 | 0 | const P *ego = (const P *) ego_; |
198 | |
|
199 | 0 | A(ego->rnk == 0); |
200 | 0 | memcpy(O, I, ego->vl * sizeof(R)); |
201 | 0 | } |
202 | | |
203 | | static int applicable_memcpy(const P *pln, const problem_rdft *p) |
204 | 326 | { |
205 | 326 | return (1 |
206 | 326 | && p->I != p->O |
207 | 326 | && pln->rnk == 0 |
208 | 326 | && pln->vl > 2 /* do not bother memcpy-ing complex numbers */ |
209 | 326 | ); |
210 | 326 | } |
211 | | |
212 | | /**************************************************************/ |
213 | | /* rank > 0 vecloop, out of place, using memcpy (e.g. out-of-place |
214 | | transposes of vl-tuples ... for large vl it should be more |
215 | | efficient to use memcpy than the tiled stuff). */ |
216 | | |
217 | | static void memcpy_loop(size_t cpysz, int rnk, const iodim *d, R *I, R *O) |
218 | 0 | { |
219 | 0 | INT i, n = d->n, is = d->is, os = d->os; |
220 | 0 | if (rnk == 1) |
221 | 0 | for (i = 0; i < n; ++i, I += is, O += os) |
222 | 0 | memcpy(O, I, cpysz); |
223 | 0 | else { |
224 | 0 | --rnk; ++d; |
225 | 0 | for (i = 0; i < n; ++i, I += is, O += os) |
226 | 0 | memcpy_loop(cpysz, rnk, d, I, O); |
227 | 0 | } |
228 | 0 | } |
229 | | |
230 | | static void apply_memcpy_loop(const plan *ego_, R *I, R *O) |
231 | 0 | { |
232 | 0 | const P *ego = (const P *) ego_; |
233 | 0 | memcpy_loop(ego->vl * sizeof(R), ego->rnk, ego->d, I, O); |
234 | 0 | } |
235 | | |
236 | | static int applicable_memcpy_loop(const P *pln, const problem_rdft *p) |
237 | 326 | { |
238 | 326 | return (p->I != p->O |
239 | 326 | && pln->rnk > 0 |
240 | 326 | && pln->vl > 2 /* do not bother memcpy-ing complex numbers */); |
241 | 326 | } |
242 | | |
243 | | /**************************************************************/ |
244 | | /* rank 2, in place, square transpose, iterative */ |
245 | | static void apply_ip_sq(const plan *ego_, R *I, R *O) |
246 | 0 | { |
247 | 0 | const P *ego = (const P *) ego_; |
248 | 0 | UNUSED(O); |
249 | 0 | transpose(ego->d, ego->rnk, ego->vl, I, X(transpose)); |
250 | 0 | } |
251 | | |
252 | | |
253 | | static int applicable_ip_sq(const P *pln, const problem_rdft *p) |
254 | 989 | { |
255 | 989 | return (1 |
256 | 989 | && p->I == p->O |
257 | 989 | && pln->rnk >= 2 |
258 | 989 | && transposep(pln)); |
259 | 989 | } |
260 | | |
261 | | /**************************************************************/ |
262 | | /* rank 2, in place, square transpose, tiled */ |
263 | | static void apply_ip_sq_tiled(const plan *ego_, R *I, R *O) |
264 | 0 | { |
265 | 0 | const P *ego = (const P *) ego_; |
266 | 0 | UNUSED(O); |
267 | 0 | transpose(ego->d, ego->rnk, ego->vl, I, X(transpose_tiled)); |
268 | 0 | } |
269 | | |
270 | | static int applicable_ip_sq_tiled(const P *pln, const problem_rdft *p) |
271 | 663 | { |
272 | 663 | return (1 |
273 | 663 | && applicable_ip_sq(pln, p) |
274 | | |
275 | | /* somewhat arbitrary */ |
276 | 663 | && X(compute_tilesz)(pln->vl, 2) > 4 |
277 | 663 | ); |
278 | 663 | } |
279 | | |
280 | | /**************************************************************/ |
281 | | /* rank 2, in place, square transpose, tiled, buffered */ |
282 | | static void apply_ip_sq_tiledbuf(const plan *ego_, R *I, R *O) |
283 | 0 | { |
284 | 0 | const P *ego = (const P *) ego_; |
285 | 0 | UNUSED(O); |
286 | 0 | transpose(ego->d, ego->rnk, ego->vl, I, X(transpose_tiledbuf)); |
287 | 0 | } |
288 | | |
289 | 1 | #define applicable_ip_sq_tiledbuf applicable_ip_sq_tiled |
290 | | |
291 | | /**************************************************************/ |
292 | | static int applicable(const S *ego, const problem *p_) |
293 | 3.01k | { |
294 | 3.01k | const problem_rdft *p = (const problem_rdft *) p_; |
295 | 3.01k | P pln; |
296 | 3.01k | return (1 |
297 | 3.01k | && p->sz->rnk == 0 |
298 | 3.01k | && FINITE_RNK(p->vecsz->rnk) |
299 | 3.01k | && fill_iodim(&pln, p) |
300 | 3.01k | && ego->applicable(&pln, p) |
301 | 3.01k | ); |
302 | 3.01k | } |
303 | | |
304 | | static void print(const plan *ego_, printer *p) |
305 | 0 | { |
306 | 0 | const P *ego = (const P *) ego_; |
307 | 0 | int i; |
308 | 0 | p->print(p, "(%s/%D", ego->nam, ego->vl); |
309 | 0 | for (i = 0; i < ego->rnk; ++i) |
310 | 0 | p->print(p, "%v", ego->d[i].n); |
311 | 0 | p->print(p, ")"); |
312 | 0 | } |
313 | | |
314 | | static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr) |
315 | 3.01k | { |
316 | 3.01k | const problem_rdft *p; |
317 | 3.01k | const S *ego = (const S *) ego_; |
318 | 3.01k | P *pln; |
319 | 3.01k | int retval; |
320 | | |
321 | 3.01k | static const plan_adt padt = { |
322 | 3.01k | X(rdft_solve), X(null_awake), print, X(plan_null_destroy) |
323 | 3.01k | }; |
324 | | |
325 | 3.01k | UNUSED(plnr); |
326 | | |
327 | 3.01k | if (!applicable(ego, p_)) |
328 | 2.48k | return (plan *) 0; |
329 | | |
330 | 526 | p = (const problem_rdft *) p_; |
331 | 526 | pln = MKPLAN_RDFT(P, &padt, ego->apply); |
332 | | |
333 | 526 | retval = fill_iodim(pln, p); |
334 | 526 | (void)retval; /* UNUSED unless DEBUG */ |
335 | 526 | A(retval); |
336 | 526 | A(pln->vl > 0); /* because FINITE_RNK(p->vecsz->rnk) holds */ |
337 | 526 | pln->nam = ego->nam; |
338 | | |
339 | | /* X(tensor_sz)(p->vecsz) loads, X(tensor_sz)(p->vecsz) stores */ |
340 | 526 | X(ops_other)(2 * X(tensor_sz)(p->vecsz), &pln->super.super.ops); |
341 | 526 | return &(pln->super.super); |
342 | 3.01k | } |
343 | | |
344 | | |
345 | | void X(rdft_rank0_register)(planner *p) |
346 | 1 | { |
347 | 1 | unsigned i; |
348 | 1 | static struct { |
349 | 1 | rdftapply apply; |
350 | 1 | int (*applicable)(const P *, const problem_rdft *); |
351 | 1 | const char *nam; |
352 | 1 | } tab[] = { |
353 | 1 | { apply_memcpy, applicable_memcpy, "rdft-rank0-memcpy" }, |
354 | 1 | { apply_memcpy_loop, applicable_memcpy_loop, |
355 | 1 | "rdft-rank0-memcpy-loop" }, |
356 | 1 | { apply_iter, applicable_iter, "rdft-rank0-iter-ci" }, |
357 | 1 | { apply_cpy2dco, applicable_cpy2dco, "rdft-rank0-iter-co" }, |
358 | 1 | { apply_tiled, applicable_tiled, "rdft-rank0-tiled" }, |
359 | 1 | { apply_tiledbuf, applicable_tiledbuf, "rdft-rank0-tiledbuf" }, |
360 | 1 | { apply_ip_sq, applicable_ip_sq, "rdft-rank0-ip-sq" }, |
361 | 1 | { |
362 | 1 | apply_ip_sq_tiled, |
363 | 1 | applicable_ip_sq_tiled, |
364 | 1 | "rdft-rank0-ip-sq-tiled" |
365 | 1 | }, |
366 | 1 | { |
367 | 1 | apply_ip_sq_tiledbuf, |
368 | 1 | applicable_ip_sq_tiledbuf, |
369 | 1 | "rdft-rank0-ip-sq-tiledbuf" |
370 | 1 | }, |
371 | 1 | }; |
372 | | |
373 | 10 | for (i = 0; i < sizeof(tab) / sizeof(tab[0]); ++i) { |
374 | 9 | static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 }; |
375 | 9 | S *slv = MKSOLVER(S, &sadt); |
376 | 9 | slv->apply = tab[i].apply; |
377 | 9 | slv->applicable = tab[i].applicable; |
378 | 9 | slv->nam = tab[i].nam; |
379 | 9 | REGISTER_SOLVER(p, &(slv->super)); |
380 | 9 | } |
381 | 1 | } |