/src/fftw3/rdft/scalar/r2cf/hc2cf2_16.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2003, 2007-14 Matteo Frigo |
3 | | * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology |
4 | | * |
5 | | * This program is free software; you can redistribute it and/or modify |
6 | | * it under the terms of the GNU General Public License as published by |
7 | | * the Free Software Foundation; either version 2 of the License, or |
8 | | * (at your option) any later version. |
9 | | * |
10 | | * This program is distributed in the hope that it will be useful, |
11 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | * GNU General Public License for more details. |
14 | | * |
15 | | * You should have received a copy of the GNU General Public License |
16 | | * along with this program; if not, write to the Free Software |
17 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
18 | | * |
19 | | */ |
20 | | |
21 | | /* This file was automatically generated --- DO NOT EDIT */ |
22 | | /* Generated on Sun Sep 8 06:41:47 UTC 2024 */ |
23 | | |
24 | | #include "rdft/codelet-rdft.h" |
25 | | |
26 | | #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) |
27 | | |
28 | | /* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hc2cf2_16 -include rdft/scalar/hc2cf.h */ |
29 | | |
30 | | /* |
31 | | * This function contains 196 FP additions, 134 FP multiplications, |
32 | | * (or, 104 additions, 42 multiplications, 92 fused multiply/add), |
33 | | * 90 stack variables, 3 constants, and 64 memory accesses |
34 | | */ |
35 | | #include "rdft/scalar/hc2cf.h" |
36 | | |
/*
 * hc2cf2_16 (variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * Runs one loop over m in [mb, me).  Each iteration:
 *   - reads the 8 twiddle reals W[0..7] and derives further products from them;
 *   - reads the 32 values Rp/Ip/Rm/Im[WS(rs, k)] for k = 0..7;
 *   - overwrites those same 32 locations with the results.
 * Within an iteration every load precedes every store.
 *
 * Stepping: Rp and Ip advance by +ms, Rm and Im by -ms, and W by 8 per
 * iteration; before the first iteration W is offset by (mb - 1) * 8.
 *
 * Constants (by value): KP707106781 ~ sqrt(1/2), KP923879532 ~ cos(pi/8),
 * KP414213562 ~ tan(pi/8) = sqrt(2) - 1.
 *
 * NOTE(review): E, R, INT, stride, DK, WS, MAKE_VOLATILE_STRIDE and the
 * FMA/FNMS/FMS macros are defined outside this file.  Presumably
 * FMA(a,b,c) = a*b + c, FNMS(a,b,c) = c - a*b and FMS(a,b,c) = a*b - c;
 * confirm against the codelet headers before relying on this.
 */
37 | | static void hc2cf2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms) |
38 | | { |
39 | | DK(KP923879532, +0.923879532511286756128183189396788286822416626); |
40 | | DK(KP414213562, +0.414213562373095048801688724209698078569671875); |
41 | | DK(KP707106781, +0.707106781186547524400844362104849039284835938); |
42 | | { |
43 | | INT m; |
44 | | for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) { |
45 | | E T2, Tf, TM, TO, T3, T6, T5, Th, Tz, Ti, T7, TZ, TT, Tq, TW; |
46 | | E Tb, Tu, TP, TI, TF, TC, T1z, T1O, T1D, T1L, Tm, T1f, T1p, T1j, T1m; |
/* Twiddle setup: load W[0..7] and precompute the derived factors used as multipliers below. */
47 | | { |
48 | | E TN, TS, T4, Tp, Ta, Tt, Tl, Tg; |
49 | | T2 = W[0]; |
50 | | Tf = W[2]; |
51 | | Tg = T2 * Tf; |
52 | | TM = W[6]; |
53 | | TN = T2 * TM; |
54 | | TO = W[7]; |
55 | | TS = T2 * TO; |
56 | | T3 = W[4]; |
57 | | T4 = T2 * T3; |
58 | | Tp = Tf * T3; |
59 | | T6 = W[5]; |
60 | | Ta = T2 * T6; |
61 | | Tt = Tf * T6; |
62 | | T5 = W[1]; |
63 | | Th = W[3]; |
64 | | Tl = T2 * Th; |
65 | | Tz = FMA(T5, Th, Tg); |
66 | | Ti = FNMS(T5, Th, Tg); |
67 | | T7 = FMA(T5, T6, T4); |
68 | | TZ = FNMS(Th, T3, Tt); |
69 | | TT = FNMS(T5, TM, TS); |
70 | | Tq = FNMS(Th, T6, Tp); |
71 | | TW = FMA(Th, T6, Tp); |
72 | | Tb = FNMS(T5, T3, Ta); |
73 | | Tu = FMA(Th, T3, Tt); |
74 | | TP = FMA(T5, TO, TN); |
75 | | TI = FMA(T5, T3, Ta); |
76 | | TF = FNMS(T5, T6, T4); |
77 | | { |
78 | | E T1y, T1C, T1e, T1i; |
79 | | T1y = Tz * T3; |
80 | | T1C = Tz * T6; |
81 | | TC = FNMS(T5, Tf, Tl); |
82 | | T1z = FMA(TC, T6, T1y); |
83 | | T1O = FMA(TC, T3, T1C); |
84 | | T1D = FNMS(TC, T3, T1C); |
85 | | T1L = FNMS(TC, T6, T1y); |
86 | | T1e = Ti * T3; |
87 | | T1i = Ti * T6; |
88 | | Tm = FMA(T5, Tf, Tl); |
89 | | T1f = FMA(Tm, T6, T1e); |
90 | | T1p = FMA(Tm, T3, T1i); |
91 | | T1j = FNMS(Tm, T3, T1i); |
92 | | T1m = FNMS(Tm, T6, T1e); |
93 | | } |
94 | | } |
95 | | { |
96 | | E Te, T1U, T3A, T3L, T1G, T2D, T2B, T3h, T1R, T2w, T2I, T3i, Tx, T3M, T1Z; |
97 | | E T3w, TL, T26, T25, T37, T1d, T2o, T2l, T3c, T1s, T2m, T2t, T3d, T12, T28; |
98 | | E T2d, T38; |
/* Load phase: the next eight sub-blocks read all 32 inputs, combine them with the twiddle-derived factors, and keep sums and differences in the temporaries declared above. */
99 | | { |
100 | | E T1, T3z, T8, T9, Tc, T3x, Td, T3y; |
101 | | T1 = Rp[0]; |
102 | | T3z = Rm[0]; |
103 | | T8 = Rp[WS(rs, 4)]; |
104 | | T9 = T7 * T8; |
105 | | Tc = Rm[WS(rs, 4)]; |
106 | | T3x = T7 * Tc; |
107 | | Td = FMA(Tb, Tc, T9); |
108 | | Te = T1 + Td; |
109 | | T1U = T1 - Td; |
110 | | T3y = FNMS(Tb, T8, T3x); |
111 | | T3A = T3y + T3z; |
112 | | T3L = T3z - T3y; |
113 | | } |
114 | | { |
115 | | E T1u, T1v, T1w, T2x, T1A, T1B, T1E, T2z; |
116 | | T1u = Ip[WS(rs, 7)]; |
117 | | T1v = TM * T1u; |
118 | | T1w = Im[WS(rs, 7)]; |
119 | | T2x = TM * T1w; |
120 | | T1A = Ip[WS(rs, 3)]; |
121 | | T1B = T1z * T1A; |
122 | | T1E = Im[WS(rs, 3)]; |
123 | | T2z = T1z * T1E; |
124 | | { |
125 | | E T1x, T1F, T2y, T2A; |
126 | | T1x = FMA(TO, T1w, T1v); |
127 | | T1F = FMA(T1D, T1E, T1B); |
128 | | T1G = T1x + T1F; |
129 | | T2D = T1x - T1F; |
130 | | T2y = FNMS(TO, T1u, T2x); |
131 | | T2A = FNMS(T1D, T1A, T2z); |
132 | | T2B = T2y - T2A; |
133 | | T3h = T2y + T2A; |
134 | | } |
135 | | } |
136 | | { |
137 | | E T1H, T1I, T1J, T2E, T1M, T1N, T1P, T2G; |
138 | | T1H = Ip[WS(rs, 1)]; |
139 | | T1I = Tf * T1H; |
140 | | T1J = Im[WS(rs, 1)]; |
141 | | T2E = Tf * T1J; |
142 | | T1M = Ip[WS(rs, 5)]; |
143 | | T1N = T1L * T1M; |
144 | | T1P = Im[WS(rs, 5)]; |
145 | | T2G = T1L * T1P; |
146 | | { |
147 | | E T1K, T1Q, T2F, T2H; |
148 | | T1K = FMA(Th, T1J, T1I); |
149 | | T1Q = FMA(T1O, T1P, T1N); |
150 | | T1R = T1K + T1Q; |
151 | | T2w = T1Q - T1K; |
152 | | T2F = FNMS(Th, T1H, T2E); |
153 | | T2H = FNMS(T1O, T1M, T2G); |
154 | | T2I = T2F - T2H; |
155 | | T3i = T2F + T2H; |
156 | | } |
157 | | } |
158 | | { |
159 | | E Tj, Tk, Tn, T1V, Tr, Ts, Tv, T1X; |
160 | | Tj = Rp[WS(rs, 2)]; |
161 | | Tk = Ti * Tj; |
162 | | Tn = Rm[WS(rs, 2)]; |
163 | | T1V = Ti * Tn; |
164 | | Tr = Rp[WS(rs, 6)]; |
165 | | Ts = Tq * Tr; |
166 | | Tv = Rm[WS(rs, 6)]; |
167 | | T1X = Tq * Tv; |
168 | | { |
169 | | E To, Tw, T1W, T1Y; |
170 | | To = FMA(Tm, Tn, Tk); |
171 | | Tw = FMA(Tu, Tv, Ts); |
172 | | Tx = To + Tw; |
173 | | T3M = To - Tw; |
174 | | T1W = FNMS(Tm, Tj, T1V); |
175 | | T1Y = FNMS(Tu, Tr, T1X); |
176 | | T1Z = T1W - T1Y; |
177 | | T3w = T1W + T1Y; |
178 | | } |
179 | | } |
180 | | { |
181 | | E TA, TB, TD, T21, TG, TH, TJ, T23; |
182 | | TA = Rp[WS(rs, 1)]; |
183 | | TB = Tz * TA; |
184 | | TD = Rm[WS(rs, 1)]; |
185 | | T21 = Tz * TD; |
186 | | TG = Rp[WS(rs, 5)]; |
187 | | TH = TF * TG; |
188 | | TJ = Rm[WS(rs, 5)]; |
189 | | T23 = TF * TJ; |
190 | | { |
191 | | E TE, TK, T22, T24; |
192 | | TE = FMA(TC, TD, TB); |
193 | | TK = FMA(TI, TJ, TH); |
194 | | TL = TE + TK; |
195 | | T26 = TE - TK; |
196 | | T22 = FNMS(TC, TA, T21); |
197 | | T24 = FNMS(TI, TG, T23); |
198 | | T25 = T22 - T24; |
199 | | T37 = T22 + T24; |
200 | | } |
201 | | } |
202 | | { |
203 | | E T15, T16, T17, T2h, T19, T1a, T1b, T2j; |
204 | | T15 = Ip[0]; |
205 | | T16 = T2 * T15; |
206 | | T17 = Im[0]; |
207 | | T2h = T2 * T17; |
208 | | T19 = Ip[WS(rs, 4)]; |
209 | | T1a = T3 * T19; |
210 | | T1b = Im[WS(rs, 4)]; |
211 | | T2j = T3 * T1b; |
212 | | { |
213 | | E T18, T1c, T2i, T2k; |
214 | | T18 = FMA(T5, T17, T16); |
215 | | T1c = FMA(T6, T1b, T1a); |
216 | | T1d = T18 + T1c; |
217 | | T2o = T18 - T1c; |
218 | | T2i = FNMS(T5, T15, T2h); |
219 | | T2k = FNMS(T6, T19, T2j); |
220 | | T2l = T2i - T2k; |
221 | | T3c = T2i + T2k; |
222 | | } |
223 | | } |
224 | | { |
225 | | E T1g, T1h, T1k, T2p, T1n, T1o, T1q, T2r; |
226 | | T1g = Ip[WS(rs, 2)]; |
227 | | T1h = T1f * T1g; |
228 | | T1k = Im[WS(rs, 2)]; |
229 | | T2p = T1f * T1k; |
230 | | T1n = Ip[WS(rs, 6)]; |
231 | | T1o = T1m * T1n; |
232 | | T1q = Im[WS(rs, 6)]; |
233 | | T2r = T1m * T1q; |
234 | | { |
235 | | E T1l, T1r, T2q, T2s; |
236 | | T1l = FMA(T1j, T1k, T1h); |
237 | | T1r = FMA(T1p, T1q, T1o); |
238 | | T1s = T1l + T1r; |
239 | | T2m = T1l - T1r; |
240 | | T2q = FNMS(T1j, T1g, T2p); |
241 | | T2s = FNMS(T1p, T1n, T2r); |
242 | | T2t = T2q - T2s; |
243 | | T3d = T2q + T2s; |
244 | | } |
245 | | } |
246 | | { |
247 | | E TQ, TR, TU, T29, TX, TY, T10, T2b; |
248 | | TQ = Rp[WS(rs, 7)]; |
249 | | TR = TP * TQ; |
250 | | TU = Rm[WS(rs, 7)]; |
251 | | T29 = TP * TU; |
252 | | TX = Rp[WS(rs, 3)]; |
253 | | TY = TW * TX; |
254 | | T10 = Rm[WS(rs, 3)]; |
255 | | T2b = TW * T10; |
256 | | { |
257 | | E TV, T11, T2a, T2c; |
258 | | TV = FMA(TT, TU, TR); |
259 | | T11 = FMA(TZ, T10, TY); |
260 | | T12 = TV + T11; |
261 | | T28 = TV - T11; |
262 | | T2a = FNMS(TT, TQ, T29); |
263 | | T2c = FNMS(TZ, TX, T2b); |
264 | | T2d = T2a - T2c; |
265 | | T38 = T2a + T2c; |
266 | | } |
267 | | } |
/* Store group 1: Rp/Ip[0], Rp/Ip[4], Rm/Im[7], Rm/Im[3] -- plain sums and differences, no constant multiplies. */
268 | | { |
269 | | E T14, T3q, T3C, T3E, T1T, T3D, T3t, T3u; |
270 | | { |
271 | | E Ty, T13, T3v, T3B; |
272 | | Ty = Te + Tx; |
273 | | T13 = TL + T12; |
274 | | T14 = Ty + T13; |
275 | | T3q = Ty - T13; |
276 | | T3v = T37 + T38; |
277 | | T3B = T3w + T3A; |
278 | | T3C = T3v + T3B; |
279 | | T3E = T3B - T3v; |
280 | | } |
281 | | { |
282 | | E T1t, T1S, T3r, T3s; |
283 | | T1t = T1d + T1s; |
284 | | T1S = T1G + T1R; |
285 | | T1T = T1t + T1S; |
286 | | T3D = T1S - T1t; |
287 | | T3r = T3c + T3d; |
288 | | T3s = T3h + T3i; |
289 | | T3t = T3r - T3s; |
290 | | T3u = T3r + T3s; |
291 | | } |
292 | | Rm[WS(rs, 7)] = T14 - T1T; |
293 | | Im[WS(rs, 7)] = T3u - T3C; |
294 | | Rp[0] = T14 + T1T; |
295 | | Ip[0] = T3u + T3C; |
296 | | Rm[WS(rs, 3)] = T3q - T3t; |
297 | | Im[WS(rs, 3)] = T3D - T3E; |
298 | | Rp[WS(rs, 4)] = T3q + T3t; |
299 | | Ip[WS(rs, 4)] = T3D + T3E; |
300 | | } |
/* Store group 2: Rp/Ip[2], Rp/Ip[6], Rm/Im[5], Rm/Im[1] -- each combines two terms using KP707106781. */
301 | | { |
302 | | E T3a, T3m, T3H, T3J, T3f, T3n, T3k, T3o; |
303 | | { |
304 | | E T36, T39, T3F, T3G; |
305 | | T36 = Te - Tx; |
306 | | T39 = T37 - T38; |
307 | | T3a = T36 + T39; |
308 | | T3m = T36 - T39; |
309 | | T3F = T12 - TL; |
310 | | T3G = T3A - T3w; |
311 | | T3H = T3F + T3G; |
312 | | T3J = T3G - T3F; |
313 | | } |
314 | | { |
315 | | E T3b, T3e, T3g, T3j; |
316 | | T3b = T1d - T1s; |
317 | | T3e = T3c - T3d; |
318 | | T3f = T3b + T3e; |
319 | | T3n = T3e - T3b; |
320 | | T3g = T1G - T1R; |
321 | | T3j = T3h - T3i; |
322 | | T3k = T3g - T3j; |
323 | | T3o = T3g + T3j; |
324 | | } |
325 | | { |
326 | | E T3l, T3I, T3p, T3K; |
327 | | T3l = T3f + T3k; |
328 | | Rm[WS(rs, 5)] = FNMS(KP707106781, T3l, T3a); |
329 | | Rp[WS(rs, 2)] = FMA(KP707106781, T3l, T3a); |
330 | | T3I = T3n + T3o; |
331 | | Im[WS(rs, 5)] = FMS(KP707106781, T3I, T3H); |
332 | | Ip[WS(rs, 2)] = FMA(KP707106781, T3I, T3H); |
333 | | T3p = T3n - T3o; |
334 | | Rm[WS(rs, 1)] = FNMS(KP707106781, T3p, T3m); |
335 | | Rp[WS(rs, 6)] = FMA(KP707106781, T3p, T3m); |
336 | | T3K = T3k - T3f; |
337 | | Im[WS(rs, 1)] = FMS(KP707106781, T3K, T3J); |
338 | | Ip[WS(rs, 6)] = FMA(KP707106781, T3K, T3J); |
339 | | } |
340 | | } |
/* Store group 3: the remaining 16 outputs (Rp/Ip[1,3,5,7], Rm/Im[0,2,4,6]) -- built with KP414213562, KP707106781 and KP923879532. */
341 | | { |
342 | | E T20, T3N, T3T, T2Q, T2f, T3O, T30, T34, T2T, T3U, T2v, T2N, T2X, T33, T2K; |
343 | | E T2O; |
344 | | { |
345 | | E T27, T2e, T2n, T2u; |
346 | | T20 = T1U - T1Z; |
347 | | T3N = T3L - T3M; |
348 | | T3T = T3M + T3L; |
349 | | T2Q = T1U + T1Z; |
350 | | T27 = T25 - T26; |
351 | | T2e = T28 + T2d; |
352 | | T2f = T27 - T2e; |
353 | | T3O = T27 + T2e; |
354 | | { |
355 | | E T2Y, T2Z, T2R, T2S; |
356 | | T2Y = T2D + T2I; |
357 | | T2Z = T2B + T2w; |
358 | | T30 = FNMS(KP414213562, T2Z, T2Y); |
359 | | T34 = FMA(KP414213562, T2Y, T2Z); |
360 | | T2R = T26 + T25; |
361 | | T2S = T28 - T2d; |
362 | | T2T = T2R + T2S; |
363 | | T3U = T2S - T2R; |
364 | | } |
365 | | T2n = T2l + T2m; |
366 | | T2u = T2o - T2t; |
367 | | T2v = FMA(KP414213562, T2u, T2n); |
368 | | T2N = FNMS(KP414213562, T2n, T2u); |
369 | | { |
370 | | E T2V, T2W, T2C, T2J; |
371 | | T2V = T2o + T2t; |
372 | | T2W = T2l - T2m; |
373 | | T2X = FMA(KP414213562, T2W, T2V); |
374 | | T33 = FNMS(KP414213562, T2V, T2W); |
375 | | T2C = T2w - T2B; |
376 | | T2J = T2D - T2I; |
377 | | T2K = FMA(KP414213562, T2J, T2C); |
378 | | T2O = FNMS(KP414213562, T2C, T2J); |
379 | | } |
380 | | } |
381 | | { |
382 | | E T2g, T2L, T3V, T3W; |
383 | | T2g = FMA(KP707106781, T2f, T20); |
384 | | T2L = T2v + T2K; |
385 | | Rm[WS(rs, 4)] = FNMS(KP923879532, T2L, T2g); |
386 | | Rp[WS(rs, 3)] = FMA(KP923879532, T2L, T2g); |
387 | | T3V = FMA(KP707106781, T3U, T3T); |
388 | | T3W = T2O - T2N; |
389 | | Im[WS(rs, 4)] = FMS(KP923879532, T3W, T3V); |
390 | | Ip[WS(rs, 3)] = FMA(KP923879532, T3W, T3V); |
391 | | } |
392 | | { |
393 | | E T2M, T2P, T3X, T3Y; |
394 | | T2M = FNMS(KP707106781, T2f, T20); |
395 | | T2P = T2N + T2O; |
396 | | Rp[WS(rs, 7)] = FNMS(KP923879532, T2P, T2M); |
397 | | Rm[0] = FMA(KP923879532, T2P, T2M); |
398 | | T3X = FNMS(KP707106781, T3U, T3T); |
399 | | T3Y = T2K - T2v; |
400 | | Im[0] = FMS(KP923879532, T3Y, T3X); |
401 | | Ip[WS(rs, 7)] = FMA(KP923879532, T3Y, T3X); |
402 | | } |
403 | | { |
404 | | E T2U, T31, T3P, T3Q; |
405 | | T2U = FMA(KP707106781, T2T, T2Q); |
406 | | T31 = T2X + T30; |
407 | | Rm[WS(rs, 6)] = FNMS(KP923879532, T31, T2U); |
408 | | Rp[WS(rs, 1)] = FMA(KP923879532, T31, T2U); |
409 | | T3P = FMA(KP707106781, T3O, T3N); |
410 | | T3Q = T33 + T34; |
411 | | Im[WS(rs, 6)] = FMS(KP923879532, T3Q, T3P); |
412 | | Ip[WS(rs, 1)] = FMA(KP923879532, T3Q, T3P); |
413 | | } |
414 | | { |
415 | | E T32, T35, T3R, T3S; |
416 | | T32 = FNMS(KP707106781, T2T, T2Q); |
417 | | T35 = T33 - T34; |
418 | | Rm[WS(rs, 2)] = FNMS(KP923879532, T35, T32); |
419 | | Rp[WS(rs, 5)] = FMA(KP923879532, T35, T32); |
420 | | T3R = FNMS(KP707106781, T3O, T3N); |
421 | | T3S = T30 - T2X; |
422 | | Im[WS(rs, 2)] = FMS(KP923879532, T3S, T3R); |
423 | | Ip[WS(rs, 5)] = FMA(KP923879532, T3S, T3R); |
424 | | } |
425 | | } |
426 | | } |
427 | | } |
428 | | } |
429 | | } |
430 | | |
/*
 * Twiddle instruction table for the FMA variant: four TW_CEXP entries whose
 * third field is 1, 3, 9 and 15, followed by one TW_NEXT entry.
 * NOTE(review): presumably each TW_CEXP entry supplies one complex twiddle
 * (two reals), which would match the 8 reals W[0..7] that hc2cf2_16 reads per
 * iteration -- confirm against the tw_instr definition.
 */
431 | | static const tw_instr twinstr[] = { |
432 | | { TW_CEXP, 1, 1 }, |
433 | | { TW_CEXP, 1, 3 }, |
434 | | { TW_CEXP, 1, 9 }, |
435 | | { TW_CEXP, 1, 15 }, |
436 | | { TW_NEXT, 1, 0 } |
437 | | }; |
438 | | |
/*
 * Codelet descriptor for the FMA variant: 16, the name string "hc2cf2_16",
 * the twiddle table, &GENUS, and the counts { 104, 42, 92, 0 }.  The first
 * three counts equal the additions / multiplications / fused multiply-add
 * figures quoted in the generator comment for this variant.
 */
439 | | static const hc2c_desc desc = { 16, "hc2cf2_16", twinstr, &GENUS, { 104, 42, 92, 0 } }; |
440 | | |
/*
 * Registration entry point (FMA variant): passes planner p, the hc2cf2_16
 * function, its descriptor and HC2C_VIA_RDFT to X(khc2c_register).
 */
441 | | void X(codelet_hc2cf2_16) (planner *p) { |
442 | | X(khc2c_register) (p, hc2cf2_16, &desc, HC2C_VIA_RDFT); |
443 | | } |
444 | | #else |
445 | | |
446 | | /* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hc2cf2_16 -include rdft/scalar/hc2cf.h */ |
447 | | |
448 | | /* |
449 | | * This function contains 196 FP additions, 108 FP multiplications, |
450 | | * (or, 156 additions, 68 multiplications, 40 fused multiply/add), |
451 | | * 82 stack variables, 3 constants, and 64 memory accesses |
452 | | */ |
453 | | #include "rdft/scalar/hc2cf.h" |
454 | | |
/*
 * hc2cf2_16 (variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * Runs one loop over m in [mb, me).  Each iteration:
 *   - reads the 8 twiddle reals W[0..7] and derives further products from them;
 *   - reads the 32 values Rp/Ip/Rm/Im[WS(rs, k)] for k = 0..7;
 *   - overwrites those same 32 locations with the results.
 * Within an iteration every load precedes every store.
 *
 * Stepping: Rp and Ip advance by +ms, Rm and Im by -ms, and W by 8 per
 * iteration; before the first iteration W is offset by (mb - 1) * 8.
 *
 * Constants (by value): KP707106781 ~ sqrt(1/2), KP923879532 ~ cos(pi/8),
 * KP382683432 ~ sin(pi/8).
 *
 * Coverage: the count column is 0 on every row of this body, i.e. the
 * function was not executed in the run this listing was produced from.
 *
 * NOTE(review): E, R, INT, stride, DK, WS, MAKE_VOLATILE_STRIDE and the
 * FMA/FNMS macros are defined outside this file.  Presumably
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b; confirm against the
 * codelet headers before relying on this.
 */
455 | | static void hc2cf2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms) |
456 | 0 | { |
457 | 0 | DK(KP382683432, +0.382683432365089771728459984030398866761344562); |
458 | 0 | DK(KP923879532, +0.923879532511286756128183189396788286822416626); |
459 | 0 | DK(KP707106781, +0.707106781186547524400844362104849039284835938); |
460 | 0 | { |
461 | 0 | INT m; |
462 | 0 | for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) { |
463 | 0 | E T2, T5, Tg, Ti, Tk, To, TE, TC, T6, T3, T8, TW, TJ, Tt, TU; |
464 | 0 | E Tc, Tx, TH, TN, TO, TP, TR, T1f, T1k, T1b, T1i, T1y, T1H, T1u, T1F; |
/* Twiddle setup: load W[0..7] and precompute the derived factors used as multipliers below. */
465 | 0 | { |
466 | 0 | E T7, Tv, Ta, Ts, T4, Tw, Tb, Tr; |
467 | 0 | { |
468 | 0 | E Th, Tn, Tj, Tm; |
469 | 0 | T2 = W[0]; |
470 | 0 | T5 = W[1]; |
471 | 0 | Tg = W[2]; |
472 | 0 | Ti = W[3]; |
473 | 0 | Th = T2 * Tg; |
474 | 0 | Tn = T5 * Tg; |
475 | 0 | Tj = T5 * Ti; |
476 | 0 | Tm = T2 * Ti; |
477 | 0 | Tk = Th - Tj; |
478 | 0 | To = Tm + Tn; |
479 | 0 | TE = Tm - Tn; |
480 | 0 | TC = Th + Tj; |
481 | 0 | T6 = W[5]; |
482 | 0 | T7 = T5 * T6; |
483 | 0 | Tv = Tg * T6; |
484 | 0 | Ta = T2 * T6; |
485 | 0 | Ts = Ti * T6; |
486 | 0 | T3 = W[4]; |
487 | 0 | T4 = T2 * T3; |
488 | 0 | Tw = Ti * T3; |
489 | 0 | Tb = T5 * T3; |
490 | 0 | Tr = Tg * T3; |
491 | 0 | } |
492 | 0 | T8 = T4 + T7; |
493 | 0 | TW = Tv - Tw; |
494 | 0 | TJ = Ta + Tb; |
495 | 0 | Tt = Tr - Ts; |
496 | 0 | TU = Tr + Ts; |
497 | 0 | Tc = Ta - Tb; |
498 | 0 | Tx = Tv + Tw; |
499 | 0 | TH = T4 - T7; |
500 | 0 | TN = W[6]; |
501 | 0 | TO = W[7]; |
502 | 0 | TP = FMA(T2, TN, T5 * TO); |
503 | 0 | TR = FNMS(T5, TN, T2 * TO); |
504 | 0 | { |
505 | 0 | E T1d, T1e, T19, T1a; |
506 | 0 | T1d = Tk * T6; |
507 | 0 | T1e = To * T3; |
508 | 0 | T1f = T1d - T1e; |
509 | 0 | T1k = T1d + T1e; |
510 | 0 | T19 = Tk * T3; |
511 | 0 | T1a = To * T6; |
512 | 0 | T1b = T19 + T1a; |
513 | 0 | T1i = T19 - T1a; |
514 | 0 | } |
515 | 0 | { |
516 | 0 | E T1w, T1x, T1s, T1t; |
517 | 0 | T1w = TC * T6; |
518 | 0 | T1x = TE * T3; |
519 | 0 | T1y = T1w - T1x; |
520 | 0 | T1H = T1w + T1x; |
521 | 0 | T1s = TC * T3; |
522 | 0 | T1t = TE * T6; |
523 | 0 | T1u = T1s + T1t; |
524 | 0 | T1F = T1s - T1t; |
525 | 0 | } |
526 | 0 | } |
527 | 0 | { |
528 | 0 | E Tf, T3r, T1N, T3e, TA, T3s, T1Q, T3b, TM, T2M, T1W, T2w, TZ, T2N, T21; |
529 | 0 | E T2x, T1B, T1K, T2V, T2W, T2X, T2Y, T2j, T2D, T2o, T2E, T18, T1n, T2Q, T2R; |
530 | 0 | E T2S, T2T, T28, T2A, T2d, T2B; |
/* Load phase: the next six sub-blocks read all 32 inputs, combine them with the twiddle-derived factors, and keep sums and differences in the temporaries declared above. */
531 | 0 | { |
532 | 0 | E T1, T3d, Te, T3c, T9, Td; |
533 | 0 | T1 = Rp[0]; |
534 | 0 | T3d = Rm[0]; |
535 | 0 | T9 = Rp[WS(rs, 4)]; |
536 | 0 | Td = Rm[WS(rs, 4)]; |
537 | 0 | Te = FMA(T8, T9, Tc * Td); |
538 | 0 | T3c = FNMS(Tc, T9, T8 * Td); |
539 | 0 | Tf = T1 + Te; |
540 | 0 | T3r = T3d - T3c; |
541 | 0 | T1N = T1 - Te; |
542 | 0 | T3e = T3c + T3d; |
543 | 0 | } |
544 | 0 | { |
545 | 0 | E Tq, T1O, Tz, T1P; |
546 | 0 | { |
547 | 0 | E Tl, Tp, Tu, Ty; |
548 | 0 | Tl = Rp[WS(rs, 2)]; |
549 | 0 | Tp = Rm[WS(rs, 2)]; |
550 | 0 | Tq = FMA(Tk, Tl, To * Tp); |
551 | 0 | T1O = FNMS(To, Tl, Tk * Tp); |
552 | 0 | Tu = Rp[WS(rs, 6)]; |
553 | 0 | Ty = Rm[WS(rs, 6)]; |
554 | 0 | Tz = FMA(Tt, Tu, Tx * Ty); |
555 | 0 | T1P = FNMS(Tx, Tu, Tt * Ty); |
556 | 0 | } |
557 | 0 | TA = Tq + Tz; |
558 | 0 | T3s = Tq - Tz; |
559 | 0 | T1Q = T1O - T1P; |
560 | 0 | T3b = T1O + T1P; |
561 | 0 | } |
562 | 0 | { |
563 | 0 | E TG, T1S, TL, T1T, T1U, T1V; |
564 | 0 | { |
565 | 0 | E TD, TF, TI, TK; |
566 | 0 | TD = Rp[WS(rs, 1)]; |
567 | 0 | TF = Rm[WS(rs, 1)]; |
568 | 0 | TG = FMA(TC, TD, TE * TF); |
569 | 0 | T1S = FNMS(TE, TD, TC * TF); |
570 | 0 | TI = Rp[WS(rs, 5)]; |
571 | 0 | TK = Rm[WS(rs, 5)]; |
572 | 0 | TL = FMA(TH, TI, TJ * TK); |
573 | 0 | T1T = FNMS(TJ, TI, TH * TK); |
574 | 0 | } |
575 | 0 | TM = TG + TL; |
576 | 0 | T2M = T1S + T1T; |
577 | 0 | T1U = T1S - T1T; |
578 | 0 | T1V = TG - TL; |
579 | 0 | T1W = T1U - T1V; |
580 | 0 | T2w = T1V + T1U; |
581 | 0 | } |
582 | 0 | { |
583 | 0 | E TT, T1Y, TY, T1Z, T1X, T20; |
584 | 0 | { |
585 | 0 | E TQ, TS, TV, TX; |
586 | 0 | TQ = Rp[WS(rs, 7)]; |
587 | 0 | TS = Rm[WS(rs, 7)]; |
588 | 0 | TT = FMA(TP, TQ, TR * TS); |
589 | 0 | T1Y = FNMS(TR, TQ, TP * TS); |
590 | 0 | TV = Rp[WS(rs, 3)]; |
591 | 0 | TX = Rm[WS(rs, 3)]; |
592 | 0 | TY = FMA(TU, TV, TW * TX); |
593 | 0 | T1Z = FNMS(TW, TV, TU * TX); |
594 | 0 | } |
595 | 0 | TZ = TT + TY; |
596 | 0 | T2N = T1Y + T1Z; |
597 | 0 | T1X = TT - TY; |
598 | 0 | T20 = T1Y - T1Z; |
599 | 0 | T21 = T1X + T20; |
600 | 0 | T2x = T1X - T20; |
601 | 0 | } |
602 | 0 | { |
603 | 0 | E T1r, T2k, T1J, T2h, T1A, T2l, T1E, T2g; |
604 | 0 | { |
605 | 0 | E T1p, T1q, T1G, T1I; |
606 | 0 | T1p = Ip[WS(rs, 7)]; |
607 | 0 | T1q = Im[WS(rs, 7)]; |
608 | 0 | T1r = FMA(TN, T1p, TO * T1q); |
609 | 0 | T2k = FNMS(TO, T1p, TN * T1q); |
610 | 0 | T1G = Ip[WS(rs, 5)]; |
611 | 0 | T1I = Im[WS(rs, 5)]; |
612 | 0 | T1J = FMA(T1F, T1G, T1H * T1I); |
613 | 0 | T2h = FNMS(T1H, T1G, T1F * T1I); |
614 | 0 | } |
615 | 0 | { |
616 | 0 | E T1v, T1z, T1C, T1D; |
617 | 0 | T1v = Ip[WS(rs, 3)]; |
618 | 0 | T1z = Im[WS(rs, 3)]; |
619 | 0 | T1A = FMA(T1u, T1v, T1y * T1z); |
620 | 0 | T2l = FNMS(T1y, T1v, T1u * T1z); |
621 | 0 | T1C = Ip[WS(rs, 1)]; |
622 | 0 | T1D = Im[WS(rs, 1)]; |
623 | 0 | T1E = FMA(Tg, T1C, Ti * T1D); |
624 | 0 | T2g = FNMS(Ti, T1C, Tg * T1D); |
625 | 0 | } |
626 | 0 | T1B = T1r + T1A; |
627 | 0 | T1K = T1E + T1J; |
628 | 0 | T2V = T1B - T1K; |
629 | 0 | T2W = T2k + T2l; |
630 | 0 | T2X = T2g + T2h; |
631 | 0 | T2Y = T2W - T2X; |
632 | 0 | { |
633 | 0 | E T2f, T2i, T2m, T2n; |
634 | 0 | T2f = T1r - T1A; |
635 | 0 | T2i = T2g - T2h; |
636 | 0 | T2j = T2f - T2i; |
637 | 0 | T2D = T2f + T2i; |
638 | 0 | T2m = T2k - T2l; |
639 | 0 | T2n = T1E - T1J; |
640 | 0 | T2o = T2m + T2n; |
641 | 0 | T2E = T2m - T2n; |
642 | 0 | } |
643 | 0 | } |
644 | 0 | { |
645 | 0 | E T14, T24, T1m, T2b, T17, T25, T1h, T2a; |
646 | 0 | { |
647 | 0 | E T12, T13, T1j, T1l; |
648 | 0 | T12 = Ip[0]; |
649 | 0 | T13 = Im[0]; |
650 | 0 | T14 = FMA(T2, T12, T5 * T13); |
651 | 0 | T24 = FNMS(T5, T12, T2 * T13); |
652 | 0 | T1j = Ip[WS(rs, 6)]; |
653 | 0 | T1l = Im[WS(rs, 6)]; |
654 | 0 | T1m = FMA(T1i, T1j, T1k * T1l); |
655 | 0 | T2b = FNMS(T1k, T1j, T1i * T1l); |
656 | 0 | } |
657 | 0 | { |
658 | 0 | E T15, T16, T1c, T1g; |
659 | 0 | T15 = Ip[WS(rs, 4)]; |
660 | 0 | T16 = Im[WS(rs, 4)]; |
661 | 0 | T17 = FMA(T3, T15, T6 * T16); |
662 | 0 | T25 = FNMS(T6, T15, T3 * T16); |
663 | 0 | T1c = Ip[WS(rs, 2)]; |
664 | 0 | T1g = Im[WS(rs, 2)]; |
665 | 0 | T1h = FMA(T1b, T1c, T1f * T1g); |
666 | 0 | T2a = FNMS(T1f, T1c, T1b * T1g); |
667 | 0 | } |
668 | 0 | T18 = T14 + T17; |
669 | 0 | T1n = T1h + T1m; |
670 | 0 | T2Q = T18 - T1n; |
671 | 0 | T2R = T24 + T25; |
672 | 0 | T2S = T2a + T2b; |
673 | 0 | T2T = T2R - T2S; |
674 | 0 | { |
675 | 0 | E T26, T27, T29, T2c; |
676 | 0 | T26 = T24 - T25; |
677 | 0 | T27 = T1h - T1m; |
678 | 0 | T28 = T26 + T27; |
679 | 0 | T2A = T26 - T27; |
680 | 0 | T29 = T14 - T17; |
681 | 0 | T2c = T2a - T2b; |
682 | 0 | T2d = T29 - T2c; |
683 | 0 | T2B = T29 + T2c; |
684 | 0 | } |
685 | 0 | } |
/* Store group 1: Rp/Ip[3], Rp/Ip[7], Rm/Im[4], Rm/Im[0] -- uses all three constants. */
686 | 0 | { |
687 | 0 | E T23, T2r, T3A, T3C, T2q, T3B, T2u, T3x; |
688 | 0 | { |
689 | 0 | E T1R, T22, T3y, T3z; |
690 | 0 | T1R = T1N - T1Q; |
691 | 0 | T22 = KP707106781 * (T1W - T21); |
692 | 0 | T23 = T1R + T22; |
693 | 0 | T2r = T1R - T22; |
694 | 0 | T3y = KP707106781 * (T2x - T2w); |
695 | 0 | T3z = T3s + T3r; |
696 | 0 | T3A = T3y + T3z; |
697 | 0 | T3C = T3z - T3y; |
698 | 0 | } |
699 | 0 | { |
700 | 0 | E T2e, T2p, T2s, T2t; |
701 | 0 | T2e = FMA(KP923879532, T28, KP382683432 * T2d); |
702 | 0 | T2p = FNMS(KP923879532, T2o, KP382683432 * T2j); |
703 | 0 | T2q = T2e + T2p; |
704 | 0 | T3B = T2p - T2e; |
705 | 0 | T2s = FNMS(KP923879532, T2d, KP382683432 * T28); |
706 | 0 | T2t = FMA(KP382683432, T2o, KP923879532 * T2j); |
707 | 0 | T2u = T2s - T2t; |
708 | 0 | T3x = T2s + T2t; |
709 | 0 | } |
710 | 0 | Rm[WS(rs, 4)] = T23 - T2q; |
711 | 0 | Im[WS(rs, 4)] = T3x - T3A; |
712 | 0 | Rp[WS(rs, 3)] = T23 + T2q; |
713 | 0 | Ip[WS(rs, 3)] = T3x + T3A; |
714 | 0 | Rm[0] = T2r - T2u; |
715 | 0 | Im[0] = T3B - T3C; |
716 | 0 | Rp[WS(rs, 7)] = T2r + T2u; |
717 | 0 | Ip[WS(rs, 7)] = T3B + T3C; |
718 | 0 | } |
/* Store group 2: Rp/Ip[2], Rp/Ip[6], Rm/Im[5], Rm/Im[1] -- uses KP707106781 only. */
719 | 0 | { |
720 | 0 | E T2P, T31, T3m, T3o, T30, T3n, T34, T3j; |
721 | 0 | { |
722 | 0 | E T2L, T2O, T3k, T3l; |
723 | 0 | T2L = Tf - TA; |
724 | 0 | T2O = T2M - T2N; |
725 | 0 | T2P = T2L + T2O; |
726 | 0 | T31 = T2L - T2O; |
727 | 0 | T3k = TZ - TM; |
728 | 0 | T3l = T3e - T3b; |
729 | 0 | T3m = T3k + T3l; |
730 | 0 | T3o = T3l - T3k; |
731 | 0 | } |
732 | 0 | { |
733 | 0 | E T2U, T2Z, T32, T33; |
734 | 0 | T2U = T2Q + T2T; |
735 | 0 | T2Z = T2V - T2Y; |
736 | 0 | T30 = KP707106781 * (T2U + T2Z); |
737 | 0 | T3n = KP707106781 * (T2Z - T2U); |
738 | 0 | T32 = T2T - T2Q; |
739 | 0 | T33 = T2V + T2Y; |
740 | 0 | T34 = KP707106781 * (T32 - T33); |
741 | 0 | T3j = KP707106781 * (T32 + T33); |
742 | 0 | } |
743 | 0 | Rm[WS(rs, 5)] = T2P - T30; |
744 | 0 | Im[WS(rs, 5)] = T3j - T3m; |
745 | 0 | Rp[WS(rs, 2)] = T2P + T30; |
746 | 0 | Ip[WS(rs, 2)] = T3j + T3m; |
747 | 0 | Rm[WS(rs, 1)] = T31 - T34; |
748 | 0 | Im[WS(rs, 1)] = T3n - T3o; |
749 | 0 | Rp[WS(rs, 6)] = T31 + T34; |
750 | 0 | Ip[WS(rs, 6)] = T3n + T3o; |
751 | 0 | } |
/* Store group 3: Rp/Ip[1], Rp/Ip[5], Rm/Im[6], Rm/Im[2] -- uses all three constants. */
752 | 0 | { |
753 | 0 | E T2z, T2H, T3u, T3w, T2G, T3v, T2K, T3p; |
754 | 0 | { |
755 | 0 | E T2v, T2y, T3q, T3t; |
756 | 0 | T2v = T1N + T1Q; |
757 | 0 | T2y = KP707106781 * (T2w + T2x); |
758 | 0 | T2z = T2v + T2y; |
759 | 0 | T2H = T2v - T2y; |
760 | 0 | T3q = KP707106781 * (T1W + T21); |
761 | 0 | T3t = T3r - T3s; |
762 | 0 | T3u = T3q + T3t; |
763 | 0 | T3w = T3t - T3q; |
764 | 0 | } |
765 | 0 | { |
766 | 0 | E T2C, T2F, T2I, T2J; |
767 | 0 | T2C = FMA(KP382683432, T2A, KP923879532 * T2B); |
768 | 0 | T2F = FNMS(KP382683432, T2E, KP923879532 * T2D); |
769 | 0 | T2G = T2C + T2F; |
770 | 0 | T3v = T2F - T2C; |
771 | 0 | T2I = FNMS(KP382683432, T2B, KP923879532 * T2A); |
772 | 0 | T2J = FMA(KP923879532, T2E, KP382683432 * T2D); |
773 | 0 | T2K = T2I - T2J; |
774 | 0 | T3p = T2I + T2J; |
775 | 0 | } |
776 | 0 | Rm[WS(rs, 6)] = T2z - T2G; |
777 | 0 | Im[WS(rs, 6)] = T3p - T3u; |
778 | 0 | Rp[WS(rs, 1)] = T2z + T2G; |
779 | 0 | Ip[WS(rs, 1)] = T3p + T3u; |
780 | 0 | Rm[WS(rs, 2)] = T2H - T2K; |
781 | 0 | Im[WS(rs, 2)] = T3v - T3w; |
782 | 0 | Rp[WS(rs, 5)] = T2H + T2K; |
783 | 0 | Ip[WS(rs, 5)] = T3v + T3w; |
784 | 0 | } |
/* Store group 4: Rp/Ip[0], Rp/Ip[4], Rm/Im[7], Rm/Im[3] -- plain sums and differences, no constant multiplies. */
785 | 0 | { |
786 | 0 | E T11, T35, T3g, T3i, T1M, T3h, T38, T39; |
787 | 0 | { |
788 | 0 | E TB, T10, T3a, T3f; |
789 | 0 | TB = Tf + TA; |
790 | 0 | T10 = TM + TZ; |
791 | 0 | T11 = TB + T10; |
792 | 0 | T35 = TB - T10; |
793 | 0 | T3a = T2M + T2N; |
794 | 0 | T3f = T3b + T3e; |
795 | 0 | T3g = T3a + T3f; |
796 | 0 | T3i = T3f - T3a; |
797 | 0 | } |
798 | 0 | { |
799 | 0 | E T1o, T1L, T36, T37; |
800 | 0 | T1o = T18 + T1n; |
801 | 0 | T1L = T1B + T1K; |
802 | 0 | T1M = T1o + T1L; |
803 | 0 | T3h = T1L - T1o; |
804 | 0 | T36 = T2R + T2S; |
805 | 0 | T37 = T2W + T2X; |
806 | 0 | T38 = T36 - T37; |
807 | 0 | T39 = T36 + T37; |
808 | 0 | } |
809 | 0 | Rm[WS(rs, 7)] = T11 - T1M; |
810 | 0 | Im[WS(rs, 7)] = T39 - T3g; |
811 | 0 | Rp[0] = T11 + T1M; |
812 | 0 | Ip[0] = T39 + T3g; |
813 | 0 | Rm[WS(rs, 3)] = T35 - T38; |
814 | 0 | Im[WS(rs, 3)] = T3h - T3i; |
815 | 0 | Rp[WS(rs, 4)] = T35 + T38; |
816 | 0 | Ip[WS(rs, 4)] = T3h + T3i; |
817 | 0 | } |
818 | 0 | } |
819 | 0 | } |
820 | 0 | } |
821 | 0 | } |
822 | | |
/*
 * Twiddle instruction table for the non-FMA variant: four TW_CEXP entries
 * whose third field is 1, 3, 9 and 15, followed by one TW_NEXT entry.
 * NOTE(review): presumably each TW_CEXP entry supplies one complex twiddle
 * (two reals), which would match the 8 reals W[0..7] that hc2cf2_16 reads per
 * iteration -- confirm against the tw_instr definition.
 */
823 | | static const tw_instr twinstr[] = { |
824 | | { TW_CEXP, 1, 1 }, |
825 | | { TW_CEXP, 1, 3 }, |
826 | | { TW_CEXP, 1, 9 }, |
827 | | { TW_CEXP, 1, 15 }, |
828 | | { TW_NEXT, 1, 0 } |
829 | | }; |
830 | | |
/*
 * Codelet descriptor for the non-FMA variant: 16, the name string
 * "hc2cf2_16", the twiddle table, &GENUS, and the counts { 156, 68, 40, 0 }.
 * The first three counts equal the additions / multiplications / fused
 * multiply-add figures quoted in the generator comment for this variant.
 */
831 | | static const hc2c_desc desc = { 16, "hc2cf2_16", twinstr, &GENUS, { 156, 68, 40, 0 } }; |
832 | | |
/*
 * Registration entry point (non-FMA variant): passes planner p, the
 * hc2cf2_16 function, its descriptor and HC2C_VIA_RDFT to X(khc2c_register).
 * Coverage: the count column shows these rows executed once.
 */
833 | 1 | void X(codelet_hc2cf2_16) (planner *p) { |
834 | 1 | X(khc2c_register) (p, hc2cf2_16, &desc, HC2C_VIA_RDFT); |
835 | 1 | } |
836 | | #endif |