/src/fftw3/rdft/dht-rader.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2003, 2007-14 Matteo Frigo |
3 | | * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology |
4 | | * |
5 | | * This program is free software; you can redistribute it and/or modify |
6 | | * it under the terms of the GNU General Public License as published by |
7 | | * the Free Software Foundation; either version 2 of the License, or |
8 | | * (at your option) any later version. |
9 | | * |
10 | | * This program is distributed in the hope that it will be useful, |
11 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | * GNU General Public License for more details. |
14 | | * |
15 | | * You should have received a copy of the GNU General Public License |
16 | | * along with this program; if not, write to the Free Software |
17 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
18 | | * |
19 | | */ |
20 | | |
21 | | #include "rdft/rdft.h" |
22 | | |
23 | | /* |
24 | | * Compute DHTs of prime sizes using Rader's trick: turn them |
25 | | * into convolutions of size n - 1, which we then perform via a pair |
26 | | * of FFTs. (We can then do prime real FFTs via rdft-dht.c.) |
27 | | * |
28 | | * Optionally (determined by the "pad" field of the solver), we can |
29 | | * perform the (cyclic) convolution by zero-padding to a size |
30 | | * >= 2*(n-1) - 1. This is advantageous if n-1 has large prime factors. |
31 | | * |
32 | | */ |
33 | | |
34 | | typedef struct { |
35 | | solver super; |
36 | | int pad; |
37 | | } S; |
38 | | |
39 | | typedef struct { |
40 | | plan_rdft super; |
41 | | |
42 | | plan *cld1, *cld2; |
43 | | R *omega; |
44 | | INT n, npad, g, ginv; |
45 | | INT is, os; |
46 | | plan *cld_omega; |
47 | | } P; |
48 | | |
49 | | static rader_tl *omegas = 0; |
50 | | |
51 | | /***************************************************************************/ |
52 | | |
53 | | /* If R2HC_ONLY_CONV is 1, we use a trick to perform the convolution |
54 | | purely in terms of R2HC transforms, as opposed to R2HC followed by H2RC. |
55 | | This requires a few more operations, but allows us to share the same |
56 | | plan/codelets for both Rader children. */ |
57 | | #define R2HC_ONLY_CONV 1 |
58 | | |
59 | | static void apply(const plan *ego_, R *I, R *O) |
60 | 0 | { |
61 | 0 | const P *ego = (const P *) ego_; |
62 | 0 | INT n = ego->n; /* prime */ |
63 | 0 | INT npad = ego->npad; /* == n - 1 for unpadded Rader; always even */ |
64 | 0 | INT is = ego->is, os; |
65 | 0 | INT k, gpower, g; |
66 | 0 | R *buf, *omega; |
67 | 0 | R r0; |
68 | |
|
69 | 0 | buf = (R *) MALLOC(sizeof(R) * npad, BUFFERS); |
70 | | |
71 | | /* First, permute the input, storing in buf: */ |
72 | 0 | g = ego->g; |
73 | 0 | for (gpower = 1, k = 0; k < n - 1; ++k, gpower = MULMOD(gpower, g, n)) { |
74 | 0 | buf[k] = I[gpower * is]; |
75 | 0 | } |
76 | 0 | /* gpower == g^(n-1) mod n == 1 */; |
77 | |
|
78 | 0 | A(n - 1 <= npad); |
79 | 0 | for (k = n - 1; k < npad; ++k) /* optionally, zero-pad convolution */ |
80 | 0 | buf[k] = 0; |
81 | |
|
82 | 0 | os = ego->os; |
83 | | |
84 | | /* compute RDFT of buf, storing in buf (i.e., in-place): */ |
85 | 0 | { |
86 | 0 | plan_rdft *cld = (plan_rdft *) ego->cld1; |
87 | 0 | cld->apply((plan *) cld, buf, buf); |
88 | 0 | } |
89 | | |
90 | | /* set output DC component: */ |
91 | 0 | O[0] = (r0 = I[0]) + buf[0]; |
92 | | |
93 | | /* now, multiply by omega: */ |
94 | 0 | omega = ego->omega; |
95 | 0 | buf[0] *= omega[0]; |
96 | 0 | for (k = 1; k < npad/2; ++k) { |
97 | 0 | E rB, iB, rW, iW, a, b; |
98 | 0 | rW = omega[k]; |
99 | 0 | iW = omega[npad - k]; |
100 | 0 | rB = buf[k]; |
101 | 0 | iB = buf[npad - k]; |
102 | 0 | a = rW * rB - iW * iB; |
103 | 0 | b = rW * iB + iW * rB; |
104 | 0 | #if R2HC_ONLY_CONV |
105 | 0 | buf[k] = a + b; |
106 | 0 | buf[npad - k] = a - b; |
107 | | #else |
108 | | buf[k] = a; |
109 | | buf[npad - k] = b; |
110 | | #endif |
111 | 0 | } |
112 | | /* Nyquist component: */ |
113 | 0 | A(k + k == npad); /* since npad is even */ |
114 | 0 | buf[k] *= omega[k]; |
115 | | |
116 | | /* this will add input[0] to all of the outputs after the ifft */ |
117 | 0 | buf[0] += r0; |
118 | | |
119 | | /* inverse FFT: */ |
120 | 0 | { |
121 | 0 | plan_rdft *cld = (plan_rdft *) ego->cld2; |
122 | 0 | cld->apply((plan *) cld, buf, buf); |
123 | 0 | } |
124 | | |
125 | | /* do inverse permutation to unshuffle the output: */ |
126 | 0 | A(gpower == 1); |
127 | 0 | #if R2HC_ONLY_CONV |
128 | 0 | O[os] = buf[0]; |
129 | 0 | gpower = g = ego->ginv; |
130 | 0 | A(npad == n - 1 || npad/2 >= n - 1); |
131 | 0 | if (npad == n - 1) { |
132 | 0 | for (k = 1; k < npad/2; ++k, gpower = MULMOD(gpower, g, n)) { |
133 | 0 | O[gpower * os] = buf[k] + buf[npad - k]; |
134 | 0 | } |
135 | 0 | O[gpower * os] = buf[k]; |
136 | 0 | ++k, gpower = MULMOD(gpower, g, n); |
137 | 0 | for (; k < npad; ++k, gpower = MULMOD(gpower, g, n)) { |
138 | 0 | O[gpower * os] = buf[npad - k] - buf[k]; |
139 | 0 | } |
140 | 0 | } |
141 | 0 | else { |
142 | 0 | for (k = 1; k < n - 1; ++k, gpower = MULMOD(gpower, g, n)) { |
143 | 0 | O[gpower * os] = buf[k] + buf[npad - k]; |
144 | 0 | } |
145 | 0 | } |
146 | | #else |
147 | | g = ego->ginv; |
148 | | for (k = 0; k < n - 1; ++k, gpower = MULMOD(gpower, g, n)) { |
149 | | O[gpower * os] = buf[k]; |
150 | | } |
151 | | #endif |
152 | 0 | A(gpower == 1); |
153 | |
|
154 | 0 | X(ifree)(buf); |
155 | 0 | } |
156 | | |
157 | | static R *mkomega(enum wakefulness wakefulness, |
158 | | plan *p_, INT n, INT npad, INT ginv) |
159 | 0 | { |
160 | 0 | plan_rdft *p = (plan_rdft *) p_; |
161 | 0 | R *omega; |
162 | 0 | INT i, gpower; |
163 | 0 | trigreal scale; |
164 | 0 | triggen *t; |
165 | |
|
166 | 0 | if ((omega = X(rader_tl_find)(n, npad + 1, ginv, omegas))) |
167 | 0 | return omega; |
168 | | |
169 | 0 | omega = (R *)MALLOC(sizeof(R) * npad, TWIDDLES); |
170 | |
|
171 | 0 | scale = npad; /* normalization for convolution */ |
172 | |
|
173 | 0 | t = X(mktriggen)(wakefulness, n); |
174 | 0 | for (i = 0, gpower = 1; i < n-1; ++i, gpower = MULMOD(gpower, ginv, n)) { |
175 | 0 | trigreal w[2]; |
176 | 0 | t->cexpl(t, gpower, w); |
177 | 0 | omega[i] = (w[0] + w[1]) / scale; |
178 | 0 | } |
179 | 0 | X(triggen_destroy)(t); |
180 | 0 | A(gpower == 1); |
181 | |
|
182 | 0 | A(npad == n - 1 || npad >= 2*(n - 1) - 1); |
183 | |
|
184 | 0 | for (; i < npad; ++i) |
185 | 0 | omega[i] = K(0.0); |
186 | 0 | if (npad > n - 1) |
187 | 0 | for (i = 1; i < n-1; ++i) |
188 | 0 | omega[npad - i] = omega[n - 1 - i]; |
189 | |
|
190 | 0 | p->apply(p_, omega, omega); |
191 | |
|
192 | 0 | X(rader_tl_insert)(n, npad + 1, ginv, omega, &omegas); |
193 | 0 | return omega; |
194 | 0 | } |
195 | | |
196 | | static void free_omega(R *omega) |
197 | 0 | { |
198 | 0 | X(rader_tl_delete)(omega, &omegas); |
199 | 0 | } |
200 | | |
201 | | /***************************************************************************/ |
202 | | |
203 | | static void awake(plan *ego_, enum wakefulness wakefulness) |
204 | 0 | { |
205 | 0 | P *ego = (P *) ego_; |
206 | |
|
207 | 0 | X(plan_awake)(ego->cld1, wakefulness); |
208 | 0 | X(plan_awake)(ego->cld2, wakefulness); |
209 | 0 | X(plan_awake)(ego->cld_omega, wakefulness); |
210 | |
|
211 | 0 | switch (wakefulness) { |
212 | 0 | case SLEEPY: |
213 | 0 | free_omega(ego->omega); |
214 | 0 | ego->omega = 0; |
215 | 0 | break; |
216 | 0 | default: |
217 | 0 | ego->g = X(find_generator)(ego->n); |
218 | 0 | ego->ginv = X(power_mod)(ego->g, ego->n - 2, ego->n); |
219 | 0 | A(MULMOD(ego->g, ego->ginv, ego->n) == 1); |
220 | |
|
221 | 0 | A(!ego->omega); |
222 | 0 | ego->omega = mkomega(wakefulness, |
223 | 0 | ego->cld_omega,ego->n,ego->npad,ego->ginv); |
224 | 0 | break; |
225 | 0 | } |
226 | 0 | } |
227 | | |
228 | | static void destroy(plan *ego_) |
229 | 0 | { |
230 | 0 | P *ego = (P *) ego_; |
231 | 0 | X(plan_destroy_internal)(ego->cld_omega); |
232 | 0 | X(plan_destroy_internal)(ego->cld2); |
233 | 0 | X(plan_destroy_internal)(ego->cld1); |
234 | 0 | } |
235 | | |
236 | | static void print(const plan *ego_, printer *p) |
237 | 0 | { |
238 | 0 | const P *ego = (const P *) ego_; |
239 | |
|
240 | 0 | p->print(p, "(dht-rader-%D/%D%ois=%oos=%(%p%)", |
241 | 0 | ego->n, ego->npad, ego->is, ego->os, ego->cld1); |
242 | 0 | if (ego->cld2 != ego->cld1) |
243 | 0 | p->print(p, "%(%p%)", ego->cld2); |
244 | 0 | if (ego->cld_omega != ego->cld1 && ego->cld_omega != ego->cld2) |
245 | 0 | p->print(p, "%(%p%)", ego->cld_omega); |
246 | 0 | p->putchr(p, ')'); |
247 | 0 | } |
248 | | |
249 | | static int applicable(const solver *ego, const problem *p_, const planner *plnr) |
250 | 648 | { |
251 | 648 | const problem_rdft *p = (const problem_rdft *) p_; |
252 | 648 | UNUSED(ego); |
253 | 648 | return (1 |
254 | 648 | && p->sz->rnk == 1 |
255 | 648 | && p->vecsz->rnk == 0 |
256 | 648 | && p->kind[0] == DHT |
257 | 648 | && X(is_prime)(p->sz->dims[0].n) |
258 | 648 | && p->sz->dims[0].n > 2 |
259 | 648 | && CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > RADER_MAX_SLOW) |
260 | | /* proclaim the solver SLOW if p-1 is not easily |
261 | | factorizable. Unlike in the complex case where |
262 | | Bluestein can solve the problem, in the DHT case we |
263 | | may have no other choice */ |
264 | 648 | && CIMPLIES(NO_SLOWP(plnr), X(factors_into_small_primes)(p->sz->dims[0].n - 1)) |
265 | 648 | ); |
266 | 648 | } |
267 | | |
268 | | static INT choose_transform_size(INT minsz) |
269 | 0 | { |
270 | 0 | static const INT primes[] = { 2, 3, 5, 0 }; |
271 | 0 | while (!X(factors_into)(minsz, primes) || minsz % 2) |
272 | 0 | ++minsz; |
273 | 0 | return minsz; |
274 | 0 | } |
275 | | |
276 | | static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr) |
277 | 648 | { |
278 | 648 | const S *ego = (const S *) ego_; |
279 | 648 | const problem_rdft *p = (const problem_rdft *) p_; |
280 | 648 | P *pln; |
281 | 648 | INT n, npad; |
282 | 648 | INT is, os; |
283 | 648 | plan *cld1 = (plan *) 0; |
284 | 648 | plan *cld2 = (plan *) 0; |
285 | 648 | plan *cld_omega = (plan *) 0; |
286 | 648 | R *buf = (R *) 0; |
287 | 648 | problem *cldp; |
288 | | |
289 | 648 | static const plan_adt padt = { |
290 | 648 | X(rdft_solve), awake, print, destroy |
291 | 648 | }; |
292 | | |
293 | 648 | if (!applicable(ego_, p_, plnr)) |
294 | 648 | return (plan *) 0; |
295 | | |
296 | 0 | n = p->sz->dims[0].n; |
297 | 0 | is = p->sz->dims[0].is; |
298 | 0 | os = p->sz->dims[0].os; |
299 | |
|
300 | 0 | if (ego->pad) |
301 | 0 | npad = choose_transform_size(2 * (n - 1) - 1); |
302 | 0 | else |
303 | 0 | npad = n - 1; |
304 | | |
305 | | /* initial allocation for the purpose of planning */ |
306 | 0 | buf = (R *) MALLOC(sizeof(R) * npad, BUFFERS); |
307 | |
|
308 | 0 | cld1 = X(mkplan_f_d)(plnr, |
309 | 0 | X(mkproblem_rdft_1_d)(X(mktensor_1d)(npad, 1, 1), |
310 | 0 | X(mktensor_1d)(1, 0, 0), |
311 | 0 | buf, buf, |
312 | 0 | R2HC), |
313 | 0 | NO_SLOW, 0, 0); |
314 | 0 | if (!cld1) goto nada; |
315 | | |
316 | 0 | cldp = |
317 | 0 | X(mkproblem_rdft_1_d)( |
318 | 0 | X(mktensor_1d)(npad, 1, 1), |
319 | 0 | X(mktensor_1d)(1, 0, 0), |
320 | 0 | buf, buf, |
321 | 0 | #if R2HC_ONLY_CONV |
322 | 0 | R2HC |
323 | | #else |
324 | | HC2R |
325 | | #endif |
326 | 0 | ); |
327 | 0 | if (!(cld2 = X(mkplan_f_d)(plnr, cldp, NO_SLOW, 0, 0))) |
328 | 0 | goto nada; |
329 | | |
330 | | /* plan for omega */ |
331 | 0 | cld_omega = X(mkplan_f_d)(plnr, |
332 | 0 | X(mkproblem_rdft_1_d)( |
333 | 0 | X(mktensor_1d)(npad, 1, 1), |
334 | 0 | X(mktensor_1d)(1, 0, 0), |
335 | 0 | buf, buf, R2HC), |
336 | 0 | NO_SLOW, ESTIMATE, 0); |
337 | 0 | if (!cld_omega) goto nada; |
338 | | |
339 | | /* deallocate buffers; let awake() or apply() allocate them for real */ |
340 | 0 | X(ifree)(buf); |
341 | 0 | buf = 0; |
342 | |
|
343 | 0 | pln = MKPLAN_RDFT(P, &padt, apply); |
344 | 0 | pln->cld1 = cld1; |
345 | 0 | pln->cld2 = cld2; |
346 | 0 | pln->cld_omega = cld_omega; |
347 | 0 | pln->omega = 0; |
348 | 0 | pln->n = n; |
349 | 0 | pln->npad = npad; |
350 | 0 | pln->is = is; |
351 | 0 | pln->os = os; |
352 | |
|
353 | 0 | X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops); |
354 | 0 | pln->super.super.ops.other += (npad/2-1)*6 + npad + n + (n-1) * ego->pad; |
355 | 0 | pln->super.super.ops.add += (npad/2-1)*2 + 2 + (n-1) * ego->pad; |
356 | 0 | pln->super.super.ops.mul += (npad/2-1)*4 + 2 + ego->pad; |
357 | 0 | #if R2HC_ONLY_CONV |
358 | 0 | pln->super.super.ops.other += n-2 - ego->pad; |
359 | 0 | pln->super.super.ops.add += (npad/2-1)*2 + (n-2) - ego->pad; |
360 | 0 | #endif |
361 | |
|
362 | 0 | return &(pln->super.super); |
363 | | |
364 | 0 | nada: |
365 | 0 | X(ifree0)(buf); |
366 | 0 | X(plan_destroy_internal)(cld_omega); |
367 | 0 | X(plan_destroy_internal)(cld2); |
368 | 0 | X(plan_destroy_internal)(cld1); |
369 | 0 | return 0; |
370 | 0 | } |
371 | | |
372 | | /* constructors */ |
373 | | |
374 | | static solver *mksolver(int pad) |
375 | 2 | { |
376 | 2 | static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 }; |
377 | 2 | S *slv = MKSOLVER(S, &sadt); |
378 | 2 | slv->pad = pad; |
379 | 2 | return &(slv->super); |
380 | 2 | } |
381 | | |
382 | | void X(dht_rader_register)(planner *p) |
383 | 1 | { |
384 | 1 | REGISTER_SOLVER(p, mksolver(0)); |
385 | 1 | REGISTER_SOLVER(p, mksolver(1)); |
386 | 1 | } |