/src/fftw3/rdft/scalar/r2cf/hc2cfdft_12.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2003, 2007-14 Matteo Frigo |
3 | | * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology |
4 | | * |
5 | | * This program is free software; you can redistribute it and/or modify |
6 | | * it under the terms of the GNU General Public License as published by |
7 | | * the Free Software Foundation; either version 2 of the License, or |
8 | | * (at your option) any later version. |
9 | | * |
10 | | * This program is distributed in the hope that it will be useful, |
11 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | * GNU General Public License for more details. |
14 | | * |
15 | | * You should have received a copy of the GNU General Public License |
16 | | * along with this program; if not, write to the Free Software |
17 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
18 | | * |
19 | | */ |
20 | | |
21 | | /* This file was automatically generated --- DO NOT EDIT */ |
22 | | /* Generated on Sat Feb 14 07:05:21 UTC 2026 */ |
23 | | |
24 | | #include "rdft/codelet-rdft.h" |
25 | | |
26 | | #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) |
27 | | |
28 | | /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hc2cfdft_12 -include rdft/scalar/hc2cf.h */ |
29 | | |
30 | | /* |
31 | | * This function contains 142 FP additions, 92 FP multiplications, |
32 | | * (or, 96 additions, 46 multiplications, 46 fused multiply/add), |
33 | | * 65 stack variables, 2 constants, and 48 memory accesses |
34 | | */ |
35 | | #include "rdft/scalar/hc2cf.h" |
36 | | |
/*
 * hc2cfdft_12, FMA variant (compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined). The generator command line in the
 * comment above this function records "-n 12 -dit".
 *
 * Rp, Ip, Rm, Im: data arrays, updated in place. Each iteration reads
 *                 elements 0 and WS(rs, 1..5) of all four arrays (24 loads)
 *                 and writes 24 results back to the same locations.
 * W:              twiddle table; each iteration reads W[0] .. W[21].
 * rs:             stride handed to WS() to form the element offsets.
 * mb, me:         the loop runs for m = mb while m < me.
 * ms:             per-iteration step: Rp and Ip move by +ms, Rm and Im by -ms.
 *
 * Every value stored in this variant is multiplied by KP500000000.
 *
 * R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are not
 * defined in this listing; presumably they come from the included project
 * headers. NOTE(review): judging from the parallel non-FMA variant in this
 * file, FMA(a, b, c) looks like a * b + c and FNMS(a, b, c) like c - a * b;
 * confirm against the header definitions.
 */
37 | | static void hc2cfdft_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms) |
38 | | { |
39 | | DK(KP866025403, +0.866025403784438646763723170752936183471402627); |
40 | | DK(KP500000000, +0.500000000000000000000000000000000000000000000); |
41 | | { |
42 | | INT m; |
/* W starts at W + (mb - 1) * 22 and then advances 22 entries per iteration. */
43 | | for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) { |
/* Values that outlive the two load stages below and feed the output stages. */
44 | | E To, T1E, T1m, T2H, Ta, T1G, Tk, T1I, Tl, T1J, T1s, T2b, T1A, T2d, T1B; |
45 | | E T2I, T12, T18, T19, T24, T26, T2C, Tz, T1M, T1f, T2B, TJ, T1O, TT, T1Q; |
46 | | E TU, T1R; |
/*
 * Load stage 1: elements 0, 2 and 4 of each array. Sums and differences of
 * the Ip/Im and Rp/Rm pairs are combined with W[0..1], W[6..9] and W[14..17].
 */
47 | | { |
48 | | E Tm, Tn, T1u, T1x, T1y, T1z, T1v, T2c, Te, Tj, T1i, T1l, Tf, T1H, T4; |
49 | | E T1o, T9, T1r, T5, T1F, T1p, T2a, T1t, T1, T1n; |
50 | | Tm = Ip[0]; |
51 | | Tn = Im[0]; |
52 | | T1u = Tm + Tn; |
53 | | T1x = Rp[0]; |
54 | | T1y = Rm[0]; |
55 | | T1z = T1x - T1y; |
56 | | T1t = W[0]; |
57 | | T1v = T1t * T1u; |
58 | | T2c = T1t * T1z; |
59 | | { |
60 | | E Tc, Td, Th, Ti, Tb; |
61 | | Tc = Ip[WS(rs, 4)]; |
62 | | Td = Im[WS(rs, 4)]; |
63 | | Te = Tc - Td; |
64 | | Th = Rp[WS(rs, 4)]; |
65 | | Ti = Rm[WS(rs, 4)]; |
66 | | Tj = Th + Ti; |
67 | | T1i = Tc + Td; |
68 | | T1l = Th - Ti; |
69 | | Tb = W[14]; |
70 | | Tf = Tb * Te; |
71 | | T1H = Tb * Tj; |
72 | | } |
73 | | { |
74 | | E T2, T3, T7, T8; |
75 | | T2 = Ip[WS(rs, 2)]; |
76 | | T3 = Im[WS(rs, 2)]; |
77 | | T4 = T2 - T3; |
78 | | T1o = T2 + T3; |
79 | | T7 = Rp[WS(rs, 2)]; |
80 | | T8 = Rm[WS(rs, 2)]; |
81 | | T9 = T7 + T8; |
82 | | T1r = T7 - T8; |
83 | | } |
84 | | T1 = W[6]; |
85 | | T5 = T1 * T4; |
86 | | T1F = T1 * T9; |
87 | | T1n = W[8]; |
88 | | T1p = T1n * T1o; |
89 | | T2a = T1n * T1r; |
/* Element 0 contributes To and T1E directly, without a twiddle multiply. */
90 | | To = Tm - Tn; |
91 | | T1E = T1x + T1y; |
92 | | { |
93 | | E T1j, T2G, T1h, T1k; |
94 | | T1h = W[16]; |
95 | | T1j = T1h * T1i; |
96 | | T2G = T1h * T1l; |
97 | | T1k = W[17]; |
98 | | T1m = FNMS(T1k, T1l, T1j); |
99 | | T2H = FMA(T1k, T1i, T2G); |
100 | | } |
101 | | { |
102 | | E T6, Tg, T1q, T1w; |
103 | | T6 = W[7]; |
104 | | Ta = FNMS(T6, T9, T5); |
105 | | T1G = FMA(T6, T4, T1F); |
106 | | Tg = W[15]; |
107 | | Tk = FNMS(Tg, Tj, Tf); |
108 | | T1I = FMA(Tg, Te, T1H); |
109 | | Tl = Ta + Tk; |
110 | | T1J = T1G + T1I; |
111 | | T1q = W[9]; |
112 | | T1s = FNMS(T1q, T1r, T1p); |
113 | | T2b = FMA(T1q, T1o, T2a); |
114 | | T1w = W[1]; |
115 | | T1A = FNMS(T1w, T1z, T1v); |
116 | | T2d = FMA(T1w, T1u, T2c); |
117 | | T1B = T1s + T1A; |
118 | | T2I = T2b + T2d; |
119 | | } |
120 | | } |
/*
 * Load stage 2: elements 1, 3 and 5 of each array, combined with W[2..5],
 * W[10..13] and W[18..21]. Note that TY and T14 are formed as Rm - Rp,
 * the opposite order from the Rp - Rm differences used in stage 1.
 */
121 | | { |
122 | | E Tt, T11, Ty, T10, T23, TX, TZ, TN, TS, T1b, T1e, TO, T1P, TD, TI; |
123 | | E T17, T16, T25, T13, T15, TE, T1N, TF, TP; |
124 | | { |
125 | | E Tr, Ts, Tw, Tx, TY; |
126 | | Tr = Ip[WS(rs, 3)]; |
127 | | Ts = Im[WS(rs, 3)]; |
128 | | Tt = Tr - Ts; |
129 | | T11 = Tr + Ts; |
130 | | Tw = Rp[WS(rs, 3)]; |
131 | | Tx = Rm[WS(rs, 3)]; |
132 | | TY = Tx - Tw; |
133 | | Ty = Tw + Tx; |
134 | | T10 = W[12]; |
135 | | T23 = T10 * TY; |
136 | | TX = W[13]; |
137 | | TZ = TX * TY; |
138 | | } |
139 | | { |
140 | | E TL, TM, TQ, TR, TK; |
141 | | TL = Ip[WS(rs, 1)]; |
142 | | TM = Im[WS(rs, 1)]; |
143 | | TN = TL - TM; |
144 | | TQ = Rp[WS(rs, 1)]; |
145 | | TR = Rm[WS(rs, 1)]; |
146 | | TS = TQ + TR; |
147 | | T1b = TL + TM; |
148 | | T1e = TQ - TR; |
149 | | TK = W[2]; |
150 | | TO = TK * TN; |
151 | | T1P = TK * TS; |
152 | | } |
153 | | { |
154 | | E TB, TC, T14, TG, TH, TA; |
155 | | TB = Ip[WS(rs, 5)]; |
156 | | TC = Im[WS(rs, 5)]; |
157 | | TD = TB - TC; |
158 | | TG = Rp[WS(rs, 5)]; |
159 | | TH = Rm[WS(rs, 5)]; |
160 | | TI = TG + TH; |
161 | | T14 = TH - TG; |
162 | | T17 = TB + TC; |
163 | | T16 = W[20]; |
164 | | T25 = T16 * T14; |
165 | | T13 = W[21]; |
166 | | T15 = T13 * T14; |
167 | | TA = W[18]; |
168 | | TE = TA * TD; |
169 | | T1N = TA * TI; |
170 | | } |
171 | | T12 = FMA(T10, T11, TZ); |
172 | | T18 = FMA(T16, T17, T15); |
173 | | T19 = T12 + T18; |
174 | | T24 = FNMS(TX, T11, T23); |
175 | | T26 = FNMS(T13, T17, T25); |
176 | | T2C = T24 + T26; |
177 | | { |
178 | | E Tu, T1L, Tq, Tv; |
179 | | Tq = W[10]; |
180 | | Tu = Tq * Tt; |
181 | | T1L = Tq * Ty; |
182 | | Tv = W[11]; |
183 | | Tz = FNMS(Tv, Ty, Tu); |
184 | | T1M = FMA(Tv, Tt, T1L); |
185 | | } |
186 | | { |
187 | | E T1c, T2A, T1a, T1d; |
188 | | T1a = W[4]; |
189 | | T1c = T1a * T1b; |
190 | | T2A = T1a * T1e; |
191 | | T1d = W[5]; |
192 | | T1f = FNMS(T1d, T1e, T1c); |
193 | | T2B = FMA(T1d, T1b, T2A); |
194 | | } |
195 | | TF = W[19]; |
196 | | TJ = FNMS(TF, TI, TE); |
197 | | T1O = FMA(TF, TD, T1N); |
198 | | TP = W[3]; |
199 | | TT = FNMS(TP, TS, TO); |
200 | | T1Q = FMA(TP, TN, T1P); |
201 | | TU = TJ + TT; |
202 | | T1R = T1O + T1Q; |
203 | | } |
/*
 * Output stage 1: eight results built only from sums and differences of the
 * staged values (KP866025403 is not used here). Stores go to Rp/Ip elements
 * 0 and 3 and to Rm/Im elements 2 and 5.
 */
204 | | { |
205 | | E TW, T2V, T2Y, T30, T1D, T1U, T1T, T2Z; |
206 | | { |
207 | | E Tp, TV, T2W, T2X; |
208 | | Tp = Tl + To; |
209 | | TV = Tz + TU; |
210 | | TW = Tp - TV; |
211 | | T2V = TV + Tp; |
212 | | T2W = T2C - T2B; |
213 | | T2X = T2H + T2I; |
214 | | T2Y = T2W - T2X; |
215 | | T30 = T2W + T2X; |
216 | | } |
217 | | { |
218 | | E T1g, T1C, T1K, T1S; |
219 | | T1g = T19 + T1f; |
220 | | T1C = T1m + T1B; |
221 | | T1D = T1g - T1C; |
222 | | T1U = T1g + T1C; |
223 | | T1K = T1E + T1J; |
224 | | T1S = T1M + T1R; |
225 | | T1T = T1K + T1S; |
226 | | T2Z = T1K - T1S; |
227 | | } |
228 | | Ip[WS(rs, 3)] = KP500000000 * (TW + T1D); |
229 | | Rp[WS(rs, 3)] = KP500000000 * (T2Z - T30); |
230 | | Im[WS(rs, 2)] = KP500000000 * (T1D - TW); |
231 | | Rm[WS(rs, 2)] = KP500000000 * (T2Z + T30); |
232 | | Rm[WS(rs, 5)] = KP500000000 * (T1T - T1U); |
233 | | Im[WS(rs, 5)] = KP500000000 * (T2Y - T2V); |
234 | | Rp[0] = KP500000000 * (T1T + T1U); |
235 | | Ip[0] = KP500000000 * (T2V + T2Y); |
236 | | } |
/*
 * Output stage 2: the remaining sixteen results. Each intermediate pair is
 * formed as FNMS/FMA(KP866025403, difference, base), where base itself is
 * an FNMS/FMA with KP500000000.
 */
237 | | { |
238 | | E T1X, T2v, T2F, T2Q, T2L, T2R, T20, T2w, T28, T2t, T2j, T2p, T2m, T2q, T2f; |
239 | | E T2s; |
240 | | { |
241 | | E T1V, T1W, T2D, T2E; |
242 | | T1V = FNMS(KP500000000, T1J, T1E); |
243 | | T1W = Ta - Tk; |
244 | | T1X = FNMS(KP866025403, T1W, T1V); |
245 | | T2v = FMA(KP866025403, T1W, T1V); |
246 | | T2D = FMA(KP500000000, T2C, T2B); |
247 | | T2E = T18 - T12; |
248 | | T2F = FNMS(KP866025403, T2E, T2D); |
249 | | T2Q = FMA(KP866025403, T2E, T2D); |
250 | | } |
251 | | { |
252 | | E T2J, T2K, T1Y, T1Z; |
253 | | T2J = FNMS(KP500000000, T2I, T2H); |
254 | | T2K = T1s - T1A; |
255 | | T2L = FNMS(KP866025403, T2K, T2J); |
256 | | T2R = FMA(KP866025403, T2K, T2J); |
257 | | T1Y = FNMS(KP500000000, T1R, T1M); |
258 | | T1Z = TJ - TT; |
259 | | T20 = FNMS(KP866025403, T1Z, T1Y); |
260 | | T2w = FMA(KP866025403, T1Z, T1Y); |
261 | | } |
262 | | { |
263 | | E T22, T27, T2h, T2i; |
264 | | T22 = FNMS(KP500000000, T19, T1f); |
265 | | T27 = T24 - T26; |
266 | | T28 = FNMS(KP866025403, T27, T22); |
267 | | T2t = FMA(KP866025403, T27, T22); |
268 | | T2h = FNMS(KP500000000, Tl, To); |
269 | | T2i = T1I - T1G; |
270 | | T2j = FNMS(KP866025403, T2i, T2h); |
271 | | T2p = FMA(KP866025403, T2i, T2h); |
272 | | } |
273 | | { |
274 | | E T2k, T2l, T29, T2e; |
275 | | T2k = FNMS(KP500000000, TU, Tz); |
276 | | T2l = T1Q - T1O; |
277 | | T2m = FNMS(KP866025403, T2l, T2k); |
278 | | T2q = FMA(KP866025403, T2l, T2k); |
279 | | T29 = FNMS(KP500000000, T1B, T1m); |
280 | | T2e = T2b - T2d; |
281 | | T2f = FNMS(KP866025403, T2e, T29); |
282 | | T2s = FMA(KP866025403, T2e, T29); |
283 | | } |
/* Each of the four groups below stores two Rp/Rm and two Ip/Im results. */
284 | | { |
285 | | E T21, T2g, T2P, T2S; |
286 | | T21 = T1X + T20; |
287 | | T2g = T28 + T2f; |
288 | | Rp[WS(rs, 2)] = KP500000000 * (T21 - T2g); |
289 | | Rm[WS(rs, 3)] = KP500000000 * (T21 + T2g); |
290 | | T2P = T2m + T2j; |
291 | | T2S = T2Q + T2R; |
292 | | Ip[WS(rs, 2)] = KP500000000 * (T2P + T2S); |
293 | | Im[WS(rs, 3)] = KP500000000 * (T2S - T2P); |
294 | | } |
295 | | { |
296 | | E T2n, T2o, T2T, T2U; |
297 | | T2n = T2j - T2m; |
298 | | T2o = T2f - T28; |
299 | | Ip[WS(rs, 5)] = KP500000000 * (T2n + T2o); |
300 | | Im[0] = KP500000000 * (T2o - T2n); |
301 | | T2T = T1X - T20; |
302 | | T2U = T2R - T2Q; |
303 | | Rm[0] = KP500000000 * (T2T - T2U); |
304 | | Rp[WS(rs, 5)] = KP500000000 * (T2T + T2U); |
305 | | } |
306 | | { |
307 | | E T2r, T2u, T2N, T2O; |
308 | | T2r = T2p - T2q; |
309 | | T2u = T2s - T2t; |
310 | | Ip[WS(rs, 1)] = KP500000000 * (T2r + T2u); |
311 | | Im[WS(rs, 4)] = KP500000000 * (T2u - T2r); |
312 | | T2N = T2v - T2w; |
313 | | T2O = T2L - T2F; |
314 | | Rm[WS(rs, 4)] = KP500000000 * (T2N - T2O); |
315 | | Rp[WS(rs, 1)] = KP500000000 * (T2N + T2O); |
316 | | } |
317 | | { |
318 | | E T2x, T2y, T2z, T2M; |
319 | | T2x = T2v + T2w; |
320 | | T2y = T2t + T2s; |
321 | | Rm[WS(rs, 1)] = KP500000000 * (T2x - T2y); |
322 | | Rp[WS(rs, 4)] = KP500000000 * (T2x + T2y); |
323 | | T2z = T2q + T2p; |
324 | | T2M = T2F + T2L; |
325 | | Ip[WS(rs, 4)] = KP500000000 * (T2z - T2M); |
/* The only store in this variant with an explicit negation of the product. */
326 | | Im[WS(rs, 1)] = -(KP500000000 * (T2z + T2M)); |
327 | | } |
328 | | } |
329 | | } |
330 | | } |
331 | | } |
332 | | |
/*
 * Twiddle instruction table referenced by the codelet descriptor in this
 * variant: one TW_FULL entry with arguments (1, 12) followed by one TW_NEXT
 * entry with arguments (1, 0).
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are defined outside this
 * listing; TW_NEXT presumably ends the list - confirm.
 */
333 | | static const tw_instr twinstr[] = { |
334 | | { TW_FULL, 1, 12 }, |
335 | | { TW_NEXT, 1, 0 } |
336 | | }; |
337 | | |
/*
 * Descriptor for the FMA variant: the value 12, the name string
 * "hc2cfdft_12", the twiddle table, &GENUS, and four counts. The first three
 * counts (96, 46, 46) match the "96 additions, 46 multiplications, 46 fused
 * multiply/add" figures in the generator comment of this variant.
 * NOTE(review): hc2c_desc and GENUS are defined outside this listing; the
 * meaning of the fourth count (0) is not visible here.
 */
338 | | static const hc2c_desc desc = { 12, "hc2cfdft_12", twinstr, &GENUS, { 96, 46, 46, 0 } }; |
339 | | |
/*
 * Registration entry point for the FMA variant: passes planner p, the
 * hc2cfdft_12 kernel, its descriptor and the HC2C_VIA_DFT tag to
 * X(khc2c_register).
 * NOTE(review): X() is a project macro defined outside this listing,
 * presumably a symbol-name wrapper - confirm.
 */
340 | | void X(codelet_hc2cfdft_12) (planner *p) { |
341 | | X(khc2c_register) (p, hc2cfdft_12, &desc, HC2C_VIA_DFT); |
342 | | } |
343 | | #else |
344 | | |
345 | | /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hc2cfdft_12 -include rdft/scalar/hc2cf.h */ |
346 | | |
347 | | /* |
348 | | * This function contains 142 FP additions, 76 FP multiplications, |
349 | | * (or, 112 additions, 46 multiplications, 30 fused multiply/add), |
350 | | * 52 stack variables, 3 constants, and 48 memory accesses |
351 | | */ |
352 | | #include "rdft/scalar/hc2cf.h" |
353 | | |
/*
 * hc2cfdft_12, non-FMA variant (the #else branch, compiled when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined). The generator
 * command line in the comment above this function records "-n 12 -dit".
 *
 * Rp, Ip, Rm, Im: data arrays, updated in place. Each iteration reads
 *                 elements 0 and WS(rs, 1..5) of all four arrays (24 loads)
 *                 and writes 24 results back to the same locations.
 * W:              twiddle table; each iteration reads W[0] .. W[21].
 * rs:             stride handed to WS() to form the element offsets.
 * mb, me:         the loop runs for m = mb while m < me.
 * ms:             per-iteration step: Rp and Ip move by +ms, Rm and Im by -ms.
 *
 * Unlike the FMA variant, only the first eight stores carry an explicit
 * KP500000000 factor; the other sixteen are stored as plain sums and
 * differences of intermediates that were already scaled with KP250000000,
 * KP500000000 and KP433012701.
 *
 * R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are not
 * defined in this listing; presumably they come from the included project
 * headers. NOTE(review): FMA(a, b, c) is presumably a * b + c and
 * FNMS(a, b, c) presumably c - a * b - confirm against the header definitions.
 */
354 | | static void hc2cfdft_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms) |
355 | 0 | { |
356 | 0 | DK(KP250000000, +0.250000000000000000000000000000000000000000000); |
357 | 0 | DK(KP500000000, +0.500000000000000000000000000000000000000000000); |
358 | 0 | DK(KP433012701, +0.433012701892219323381861585376468091735701313); |
359 | 0 | { |
360 | 0 | INT m; |
/* W starts at W + (mb - 1) * 22 and then advances 22 entries per iteration. */
361 | 0 | for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) { |
/* Values that outlive the two load stages below and feed the output stages. */
362 | 0 | E Tm, T1t, T1d, T2j, Tj, T1Y, T1w, T1G, T1q, T2q, T1U, T2k, Tw, T1y, T17; |
363 | 0 | E T2g, TP, T21, T1B, T1J, T12, T2u, T1P, T2h; |
/*
 * Load stage 1: elements 0, 2 and 4 of each array, combined with W[0..1],
 * W[6..9] and W[14..17]. The KP433012701-scaled differences (T1Y, T1G, T2q,
 * T1U) are prepared here for output stage 2.
 */
364 | 0 | { |
365 | 0 | E Tk, Tl, T1k, T1m, T1n, T1o, T4, T1f, T8, T1h, Th, T1c, Td, T1a, T19; |
366 | 0 | E T1b; |
367 | 0 | { |
368 | 0 | E T2, T3, T6, T7; |
369 | 0 | Tk = Ip[0]; |
370 | 0 | Tl = Im[0]; |
371 | 0 | T1k = Tk + Tl; |
372 | 0 | T1m = Rp[0]; |
373 | 0 | T1n = Rm[0]; |
374 | 0 | T1o = T1m - T1n; |
375 | 0 | T2 = Ip[WS(rs, 2)]; |
376 | 0 | T3 = Im[WS(rs, 2)]; |
377 | 0 | T4 = T2 - T3; |
378 | 0 | T1f = T2 + T3; |
379 | 0 | T6 = Rp[WS(rs, 2)]; |
380 | 0 | T7 = Rm[WS(rs, 2)]; |
381 | 0 | T8 = T6 + T7; |
382 | 0 | T1h = T6 - T7; |
383 | 0 | { |
384 | 0 | E Tf, Tg, Tb, Tc; |
385 | 0 | Tf = Rp[WS(rs, 4)]; |
386 | 0 | Tg = Rm[WS(rs, 4)]; |
387 | 0 | Th = Tf + Tg; |
388 | 0 | T1c = Tf - Tg; |
389 | 0 | Tb = Ip[WS(rs, 4)]; |
390 | 0 | Tc = Im[WS(rs, 4)]; |
391 | 0 | Td = Tb - Tc; |
392 | 0 | T1a = Tb + Tc; |
393 | 0 | } |
394 | 0 | } |
/* Element 0 contributes Tm and T1t directly, without a twiddle multiply. */
395 | 0 | Tm = Tk - Tl; |
396 | 0 | T1t = T1m + T1n; |
397 | 0 | T19 = W[16]; |
398 | 0 | T1b = W[17]; |
399 | 0 | T1d = FNMS(T1b, T1c, T19 * T1a); |
400 | 0 | T2j = FMA(T19, T1c, T1b * T1a); |
401 | 0 | { |
402 | 0 | E T9, T1u, Ti, T1v; |
403 | 0 | { |
404 | 0 | E T1, T5, Ta, Te; |
405 | 0 | T1 = W[6]; |
406 | 0 | T5 = W[7]; |
407 | 0 | T9 = FNMS(T5, T8, T1 * T4); |
408 | 0 | T1u = FMA(T1, T8, T5 * T4); |
409 | 0 | Ta = W[14]; |
410 | 0 | Te = W[15]; |
411 | 0 | Ti = FNMS(Te, Th, Ta * Td); |
412 | 0 | T1v = FMA(Ta, Th, Te * Td); |
413 | 0 | } |
414 | 0 | Tj = T9 + Ti; |
415 | 0 | T1Y = KP433012701 * (T1v - T1u); |
416 | 0 | T1w = T1u + T1v; |
417 | 0 | T1G = KP433012701 * (T9 - Ti); |
418 | 0 | } |
419 | 0 | { |
420 | 0 | E T1i, T1S, T1p, T1T; |
421 | 0 | { |
422 | 0 | E T1e, T1g, T1j, T1l; |
423 | 0 | T1e = W[8]; |
424 | 0 | T1g = W[9]; |
425 | 0 | T1i = FNMS(T1g, T1h, T1e * T1f); |
426 | 0 | T1S = FMA(T1e, T1h, T1g * T1f); |
427 | 0 | T1j = W[0]; |
428 | 0 | T1l = W[1]; |
429 | 0 | T1p = FNMS(T1l, T1o, T1j * T1k); |
430 | 0 | T1T = FMA(T1j, T1o, T1l * T1k); |
431 | 0 | } |
432 | 0 | T1q = T1i + T1p; |
433 | 0 | T2q = KP433012701 * (T1i - T1p); |
434 | 0 | T1U = KP433012701 * (T1S - T1T); |
435 | 0 | T2k = T1S + T1T; |
436 | 0 | } |
437 | 0 | } |
/*
 * Load stage 2: elements 1, 3 and 5 of each array, combined with W[2..5],
 * W[10..13] and W[18..21]. T14 is formed as Rm - Rp (TM - TL), the opposite
 * order from the other Rp - Rm differences in this function.
 */
438 | 0 | { |
439 | 0 | E Tr, TT, Tv, TV, TA, TY, TE, T10, TN, T14, TJ, T16; |
440 | 0 | { |
441 | 0 | E Tp, Tq, TC, TD; |
442 | 0 | Tp = Ip[WS(rs, 3)]; |
443 | 0 | Tq = Im[WS(rs, 3)]; |
444 | 0 | Tr = Tp - Tq; |
445 | 0 | TT = Tp + Tq; |
446 | 0 | { |
447 | 0 | E Tt, Tu, Ty, Tz; |
448 | 0 | Tt = Rp[WS(rs, 3)]; |
449 | 0 | Tu = Rm[WS(rs, 3)]; |
450 | 0 | Tv = Tt + Tu; |
451 | 0 | TV = Tt - Tu; |
452 | 0 | Ty = Ip[WS(rs, 5)]; |
453 | 0 | Tz = Im[WS(rs, 5)]; |
454 | 0 | TA = Ty - Tz; |
455 | 0 | TY = Ty + Tz; |
456 | 0 | } |
457 | 0 | TC = Rp[WS(rs, 5)]; |
458 | 0 | TD = Rm[WS(rs, 5)]; |
459 | 0 | TE = TC + TD; |
460 | 0 | T10 = TC - TD; |
461 | 0 | { |
462 | 0 | E TL, TM, TH, TI; |
463 | 0 | TL = Rp[WS(rs, 1)]; |
464 | 0 | TM = Rm[WS(rs, 1)]; |
465 | 0 | TN = TL + TM; |
466 | 0 | T14 = TM - TL; |
467 | 0 | TH = Ip[WS(rs, 1)]; |
468 | 0 | TI = Im[WS(rs, 1)]; |
469 | 0 | TJ = TH - TI; |
470 | 0 | T16 = TH + TI; |
471 | 0 | } |
472 | 0 | } |
473 | 0 | { |
474 | 0 | E To, Ts, T13, T15; |
475 | 0 | To = W[10]; |
476 | 0 | Ts = W[11]; |
477 | 0 | Tw = FNMS(Ts, Tv, To * Tr); |
478 | 0 | T1y = FMA(To, Tv, Ts * Tr); |
/* Note the load order here: T13 takes W[5] and T15 takes W[4]. */
479 | 0 | T13 = W[5]; |
480 | 0 | T15 = W[4]; |
481 | 0 | T17 = FMA(T13, T14, T15 * T16); |
482 | 0 | T2g = FNMS(T13, T16, T15 * T14); |
483 | 0 | } |
484 | 0 | { |
485 | 0 | E TF, T1z, TO, T1A; |
486 | 0 | { |
487 | 0 | E Tx, TB, TG, TK; |
488 | 0 | Tx = W[18]; |
489 | 0 | TB = W[19]; |
490 | 0 | TF = FNMS(TB, TE, Tx * TA); |
491 | 0 | T1z = FMA(Tx, TE, TB * TA); |
492 | 0 | TG = W[2]; |
493 | 0 | TK = W[3]; |
494 | 0 | TO = FNMS(TK, TN, TG * TJ); |
495 | 0 | T1A = FMA(TG, TN, TK * TJ); |
496 | 0 | } |
497 | 0 | TP = TF + TO; |
498 | 0 | T21 = KP433012701 * (T1A - T1z); |
499 | 0 | T1B = T1z + T1A; |
500 | 0 | T1J = KP433012701 * (TF - TO); |
501 | 0 | } |
502 | 0 | { |
503 | 0 | E TW, T1O, T11, T1N; |
504 | 0 | { |
505 | 0 | E TS, TU, TX, TZ; |
506 | 0 | TS = W[12]; |
507 | 0 | TU = W[13]; |
508 | 0 | TW = FNMS(TU, TV, TS * TT); |
509 | 0 | T1O = FMA(TS, TV, TU * TT); |
510 | 0 | TX = W[20]; |
511 | 0 | TZ = W[21]; |
512 | 0 | T11 = FNMS(TZ, T10, TX * TY); |
513 | 0 | T1N = FMA(TX, T10, TZ * TY); |
514 | 0 | } |
515 | 0 | T12 = TW + T11; |
516 | 0 | T2u = KP433012701 * (T11 - TW); |
517 | 0 | T1P = KP433012701 * (T1N - T1O); |
518 | 0 | T2h = T1O + T1N; |
519 | 0 | } |
520 | 0 | } |
/*
 * Output stage 1: eight results built from plain sums and differences of the
 * staged values, each multiplied by KP500000000 at the store. Stores go to
 * Rp/Ip elements 0 and 3 and to Rm/Im elements 2 and 5.
 */
521 | 0 | { |
522 | 0 | E TR, T2f, T2m, T2o, T1s, T1E, T1D, T2n; |
523 | 0 | { |
524 | 0 | E Tn, TQ, T2i, T2l; |
525 | 0 | Tn = Tj + Tm; |
526 | 0 | TQ = Tw + TP; |
527 | 0 | TR = Tn - TQ; |
528 | 0 | T2f = TQ + Tn; |
529 | 0 | T2i = T2g - T2h; |
530 | 0 | T2l = T2j + T2k; |
531 | 0 | T2m = T2i - T2l; |
532 | 0 | T2o = T2i + T2l; |
533 | 0 | } |
534 | 0 | { |
535 | 0 | E T18, T1r, T1x, T1C; |
536 | 0 | T18 = T12 + T17; |
537 | 0 | T1r = T1d + T1q; |
538 | 0 | T1s = T18 - T1r; |
539 | 0 | T1E = T18 + T1r; |
540 | 0 | T1x = T1t + T1w; |
541 | 0 | T1C = T1y + T1B; |
542 | 0 | T1D = T1x + T1C; |
543 | 0 | T2n = T1x - T1C; |
544 | 0 | } |
545 | 0 | Ip[WS(rs, 3)] = KP500000000 * (TR + T1s); |
546 | 0 | Rp[WS(rs, 3)] = KP500000000 * (T2n - T2o); |
547 | 0 | Im[WS(rs, 2)] = KP500000000 * (T1s - TR); |
548 | 0 | Rm[WS(rs, 2)] = KP500000000 * (T2n + T2o); |
549 | 0 | Rm[WS(rs, 5)] = KP500000000 * (T1D - T1E); |
550 | 0 | Im[WS(rs, 5)] = KP500000000 * (T2m - T2f); |
551 | 0 | Rp[0] = KP500000000 * (T1D + T1E); |
552 | 0 | Ip[0] = KP500000000 * (T2f + T2m); |
553 | 0 | } |
/*
 * Output stage 2: the remaining sixteen results. Each base term is
 * KP500000000 * x combined with KP250000000 * y, then added to or subtracted
 * from a KP433012701-scaled difference computed during the load stages.
 * No multiply is applied at the stores.
 */
554 | 0 | { |
555 | 0 | E T1H, T2b, T2s, T2B, T2v, T2A, T1K, T2c, T1Q, T29, T1Z, T25, T22, T26, T1V; |
556 | 0 | E T28; |
557 | 0 | { |
558 | 0 | E T1F, T2r, T2t, T1I; |
559 | 0 | T1F = FNMS(KP250000000, T1w, KP500000000 * T1t); |
560 | 0 | T1H = T1F - T1G; |
561 | 0 | T2b = T1F + T1G; |
562 | 0 | T2r = FNMS(KP500000000, T2j, KP250000000 * T2k); |
563 | 0 | T2s = T2q - T2r; |
564 | 0 | T2B = T2q + T2r; |
565 | 0 | T2t = FMA(KP250000000, T2h, KP500000000 * T2g); |
566 | 0 | T2v = T2t - T2u; |
567 | 0 | T2A = T2u + T2t; |
568 | 0 | T1I = FNMS(KP250000000, T1B, KP500000000 * T1y); |
569 | 0 | T1K = T1I - T1J; |
570 | 0 | T2c = T1I + T1J; |
571 | 0 | } |
572 | 0 | { |
573 | 0 | E T1M, T1X, T20, T1R; |
574 | 0 | T1M = FNMS(KP250000000, T12, KP500000000 * T17); |
575 | 0 | T1Q = T1M - T1P; |
576 | 0 | T29 = T1P + T1M; |
577 | 0 | T1X = FNMS(KP250000000, Tj, KP500000000 * Tm); |
578 | 0 | T1Z = T1X - T1Y; |
579 | 0 | T25 = T1Y + T1X; |
580 | 0 | T20 = FNMS(KP250000000, TP, KP500000000 * Tw); |
581 | 0 | T22 = T20 - T21; |
582 | 0 | T26 = T21 + T20; |
583 | 0 | T1R = FNMS(KP250000000, T1q, KP500000000 * T1d); |
584 | 0 | T1V = T1R - T1U; |
585 | 0 | T28 = T1R + T1U; |
586 | 0 | } |
/* Each of the four groups below stores two Rp/Rm and two Ip/Im results. */
587 | 0 | { |
588 | 0 | E T1L, T1W, T2p, T2w; |
589 | 0 | T1L = T1H + T1K; |
590 | 0 | T1W = T1Q + T1V; |
591 | 0 | Rp[WS(rs, 2)] = T1L - T1W; |
592 | 0 | Rm[WS(rs, 3)] = T1L + T1W; |
593 | 0 | T2p = T22 + T1Z; |
594 | 0 | T2w = T2s - T2v; |
595 | 0 | Ip[WS(rs, 2)] = T2p + T2w; |
596 | 0 | Im[WS(rs, 3)] = T2w - T2p; |
597 | 0 | } |
598 | 0 | { |
599 | 0 | E T23, T24, T2x, T2y; |
600 | 0 | T23 = T1Z - T22; |
601 | 0 | T24 = T1V - T1Q; |
602 | 0 | Ip[WS(rs, 5)] = T23 + T24; |
603 | 0 | Im[0] = T24 - T23; |
604 | 0 | T2x = T1H - T1K; |
605 | 0 | T2y = T2v + T2s; |
606 | 0 | Rm[0] = T2x - T2y; |
607 | 0 | Rp[WS(rs, 5)] = T2x + T2y; |
608 | 0 | } |
609 | 0 | { |
610 | 0 | E T27, T2a, T2z, T2C; |
611 | 0 | T27 = T25 - T26; |
612 | 0 | T2a = T28 - T29; |
613 | 0 | Ip[WS(rs, 1)] = T27 + T2a; |
614 | 0 | Im[WS(rs, 4)] = T2a - T27; |
615 | 0 | T2z = T2b - T2c; |
616 | 0 | T2C = T2A - T2B; |
617 | 0 | Rm[WS(rs, 4)] = T2z - T2C; |
618 | 0 | Rp[WS(rs, 1)] = T2z + T2C; |
619 | 0 | } |
620 | 0 | { |
621 | 0 | E T2d, T2e, T2D, T2E; |
622 | 0 | T2d = T2b + T2c; |
623 | 0 | T2e = T29 + T28; |
624 | 0 | Rm[WS(rs, 1)] = T2d - T2e; |
625 | 0 | Rp[WS(rs, 4)] = T2d + T2e; |
626 | 0 | T2D = T26 + T25; |
627 | 0 | T2E = T2A + T2B; |
628 | 0 | Ip[WS(rs, 4)] = T2D + T2E; |
629 | 0 | Im[WS(rs, 1)] = T2E - T2D; |
630 | 0 | } |
631 | 0 | } |
632 | 0 | } |
633 | 0 | } |
634 | 0 | } |
635 | | |
/*
 * Twiddle instruction table referenced by the codelet descriptor in this
 * variant: one TW_FULL entry with arguments (1, 12) followed by one TW_NEXT
 * entry with arguments (1, 0).
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are defined outside this
 * listing; TW_NEXT presumably ends the list - confirm.
 */
636 | | static const tw_instr twinstr[] = { |
637 | | { TW_FULL, 1, 12 }, |
638 | | { TW_NEXT, 1, 0 } |
639 | | }; |
640 | | |
/*
 * Descriptor for the non-FMA variant: the value 12, the name string
 * "hc2cfdft_12", the twiddle table, &GENUS, and four counts. The first three
 * counts (112, 46, 30) match the "112 additions, 46 multiplications, 30 fused
 * multiply/add" figures in the generator comment of this variant.
 * NOTE(review): hc2c_desc and GENUS are defined outside this listing; the
 * meaning of the fourth count (0) is not visible here.
 */
641 | | static const hc2c_desc desc = { 12, "hc2cfdft_12", twinstr, &GENUS, { 112, 46, 30, 0 } }; |
642 | | |
/*
 * Registration entry point for the non-FMA variant: passes planner p, the
 * hc2cfdft_12 kernel, its descriptor and the HC2C_VIA_DFT tag to
 * X(khc2c_register).
 * NOTE(review): X() is a project macro defined outside this listing,
 * presumably a symbol-name wrapper - confirm.
 */
643 | 1 | void X(codelet_hc2cfdft_12) (planner *p) { |
644 | 1 | X(khc2c_register) (p, hc2cfdft_12, &desc, HC2C_VIA_DFT); |
645 | 1 | } |
646 | | #endif |