/src/fftw3/rdft/scalar/r2cf/r2cfII_20.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2003, 2007-14 Matteo Frigo |
3 | | * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology |
4 | | * |
5 | | * This program is free software; you can redistribute it and/or modify |
6 | | * it under the terms of the GNU General Public License as published by |
7 | | * the Free Software Foundation; either version 2 of the License, or |
8 | | * (at your option) any later version. |
9 | | * |
10 | | * This program is distributed in the hope that it will be useful, |
11 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | * GNU General Public License for more details. |
14 | | * |
15 | | * You should have received a copy of the GNU General Public License |
16 | | * along with this program; if not, write to the Free Software |
17 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
18 | | * |
19 | | */ |
20 | | |
21 | | /* This file was automatically generated --- DO NOT EDIT */ |
22 | | /* Generated on Fri Jul 11 06:53:36 UTC 2025 */ |
23 | | |
24 | | #include "rdft/codelet-rdft.h" |
25 | | |
26 | | #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) |
27 | | |
28 | | /* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 20 -name r2cfII_20 -dft-II -include rdft/scalar/r2cfII.h */ |
29 | | |
30 | | /* |
31 | | * This function contains 102 FP additions, 63 FP multiplications, |
32 | | * (or, 39 additions, 0 multiplications, 63 fused multiply/add), |
33 | | * 53 stack variables, 10 constants, and 40 memory accesses |
34 | | */ |
35 | | #include "rdft/scalar/r2cfII.h" |
36 | | |
37 | | static void r2cfII_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs) /* FMA variant (this branch is selected when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined). Each pass reads R0[0..9] and R1[0..9] through WS(rs, k) and writes Cr[0..9] and Ci[0..9] through WS(csr, k) / WS(csi, k); v passes are performed. NOTE(review): per the generator flag -dft-II this is presumably the size-20 real-to-complex type-II transform -- confirm against the FFTW docs. */ |
38 | | { |
39 | | DK(KP951056516, +0.951056516295153572116439333379382143405698634); /* numerically sin(2*pi/5); DK is a macro defined outside this file */ |
40 | | DK(KP559016994, +0.559016994374947424102293417182819058860154590); /* numerically sqrt(5)/4 */ |
41 | | DK(KP707106781, +0.707106781186547524400844362104849039284835938); /* numerically 1/sqrt(2) */ |
42 | | DK(KP690983005, +0.690983005625052575897706582817180941139845410); /* numerically (5 - sqrt(5))/4 */ |
43 | | DK(KP447213595, +0.447213595499957939281834733746255247088123672); /* numerically 1/sqrt(5) */ |
44 | | DK(KP552786404, +0.552786404500042060718165266253744752911876328); /* numerically 1 - 1/sqrt(5) */ |
45 | | DK(KP809016994, +0.809016994374947424102293417182819058860154590); /* numerically (1 + sqrt(5))/4 = cos(pi/5) */ |
46 | | DK(KP250000000, +0.250000000000000000000000000000000000000000000); /* 1/4 */ |
47 | | DK(KP381966011, +0.381966011250105151795413165634361882279690820); /* numerically (3 - sqrt(5))/2 */ |
48 | | DK(KP618033988, +0.618033988749894848204586834365638117720309180); /* numerically (sqrt(5) - 1)/2 */ |
49 | | { |
50 | | INT i; |
51 | | for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) { /* one pass per count of v: inputs R0/R1 advance by ivs, outputs Cr/Ci advance by ovs; MAKE_VOLATILE_STRIDE is a macro defined outside this file */ |
52 | | E Ti, T1d, T1f, T1e, Tg, T1p, TS, T1g, T1, T6, T7, T1r, T1k, T8, To; /* temporaries shared across the sub-blocks below */ |
53 | | E Tp, Tv, TX, Tr, TV, Tx, TF, TC, TD, T12, TG, TK, T10, Tc, Tf; |
54 | | Ti = R1[WS(rs, 2)]; |
55 | | T1d = R0[WS(rs, 5)]; |
56 | | { /* R0[9], R0[1], R0[3], R0[7]: pairwise differences (Tc, Tf) and sums (T1f, T1e) */ |
57 | | E Ta, Tb, Td, Te; |
58 | | Ta = R0[WS(rs, 9)]; |
59 | | Tb = R0[WS(rs, 1)]; |
60 | | Tc = Ta - Tb; |
61 | | T1f = Ta + Tb; |
62 | | Td = R0[WS(rs, 3)]; |
63 | | Te = R0[WS(rs, 7)]; |
64 | | Tf = Td - Te; |
65 | | T1e = Td + Te; |
66 | | } |
67 | | Tg = FNMS(KP618033988, Tf, Tc); /* FMA / FNMS / FMS are macros defined outside this file; NOTE(review): presumably a*b+c, c-a*b and a*b-c respectively -- confirm */ |
68 | | T1p = FMA(KP381966011, T1e, T1f); |
69 | | TS = FMA(KP618033988, Tc, Tf); |
70 | | T1g = FMA(KP381966011, T1f, T1e); |
71 | | { /* R0[0] and the even-indexed inputs R0[4], R0[6], R0[8], R0[2] */ |
72 | | E T2, T5, T3, T4, T1i, T1j; |
73 | | T1 = R0[0]; |
74 | | T2 = R0[WS(rs, 4)]; |
75 | | T5 = R0[WS(rs, 6)]; |
76 | | T3 = R0[WS(rs, 8)]; |
77 | | T4 = R0[WS(rs, 2)]; |
78 | | T1i = T2 + T5; |
79 | | T1j = T3 + T4; |
80 | | T6 = T2 + T3 - T4 - T5; |
81 | | T7 = FNMS(KP250000000, T6, T1); |
82 | | T1r = FNMS(KP618033988, T1i, T1j); |
83 | | T1k = FMA(KP618033988, T1j, T1i); |
84 | | T8 = (T3 + T5 - T2) - T4; |
85 | | } |
86 | | { /* R1[8], R1[6], R1[0], R1[4]; results are combined later with Ti = R1[2] */ |
87 | | E Tn, Tu, Tt, Tq, TU; |
88 | | { |
89 | | E Tj, Tk, Tl, Tm; |
90 | | Tj = R1[WS(rs, 8)]; |
91 | | To = R1[WS(rs, 6)]; |
92 | | Tk = R1[0]; |
93 | | Tl = R1[WS(rs, 4)]; |
94 | | Tm = Tk + Tl; |
95 | | Tn = Tj - Tm; |
96 | | Tu = Tk - Tl; |
97 | | Tp = Tj + Tm; |
98 | | Tt = To + Tj; |
99 | | } |
100 | | Tv = FNMS(KP618033988, Tu, Tt); |
101 | | TX = FMA(KP618033988, Tt, Tu); |
102 | | Tq = FMA(KP809016994, Tp, To); |
103 | | Tr = FNMS(KP552786404, Tq, Tn); |
104 | | TU = FMA(KP447213595, Tp, Tn); |
105 | | TV = FNMS(KP690983005, TU, To); |
106 | | } |
107 | | { /* R1[7], R1[1], R1[3], R1[5], R1[9] */ |
108 | | E TJ, TE, TI, TZ; |
109 | | Tx = R1[WS(rs, 7)]; |
110 | | { |
111 | | E Ty, Tz, TA, TB; |
112 | | Ty = R1[WS(rs, 1)]; |
113 | | TF = R1[WS(rs, 3)]; |
114 | | Tz = R1[WS(rs, 5)]; |
115 | | TA = R1[WS(rs, 9)]; |
116 | | TB = Tz + TA; |
117 | | TC = Ty + TB; |
118 | | TJ = Tz - TA; |
119 | | TE = Ty - TB; |
120 | | TI = TF + Ty; |
121 | | } |
122 | | TD = FMA(KP250000000, TC, Tx); |
123 | | T12 = FNMS(KP618033988, TI, TJ); |
124 | | TG = FNMS(KP552786404, TF, TE); |
125 | | TK = FMA(KP618033988, TJ, TI); |
126 | | TZ = FMA(KP447213595, TC, TE); |
127 | | T10 = FNMS(KP690983005, TZ, TF); |
128 | | } |
129 | | { /* outputs Cr[2], Ci[2], Cr[7], Ci[7] */ |
130 | | E T19, T1w, T1c, T1x, T1a, T1b; |
131 | | T19 = T1 + T6; |
132 | | T1w = T1f + T1d - T1e; |
133 | | T1a = Ti + To - Tp; |
134 | | T1b = TC - TF - Tx; |
135 | | T1c = T1a + T1b; |
136 | | T1x = T1a - T1b; |
137 | | Cr[WS(csr, 2)] = FNMS(KP707106781, T1c, T19); |
138 | | Ci[WS(csi, 2)] = FMS(KP707106781, T1x, T1w); |
139 | | Cr[WS(csr, 7)] = FMA(KP707106781, T1c, T19); |
140 | | Ci[WS(csi, 7)] = FMA(KP707106781, T1x, T1w); |
141 | | } |
142 | | { /* outputs Cr and Ci at indices 6, 3, 8, 1 */ |
143 | | E TT, T15, T1s, T1u, TY, T17, T13, T16; |
144 | | { /* each partial term is formed as a plus/minus pair scaled by KP951056516 */ |
145 | | E TR, T1q, TW, T11; |
146 | | TR = FMA(KP559016994, T8, T7); |
147 | | TT = FMA(KP951056516, TS, TR); |
148 | | T15 = FNMS(KP951056516, TS, TR); |
149 | | T1q = FNMS(KP809016994, T1p, T1d); |
150 | | T1s = FNMS(KP951056516, T1r, T1q); |
151 | | T1u = FMA(KP951056516, T1r, T1q); |
152 | | TW = FNMS(KP809016994, TV, Ti); |
153 | | TY = FMA(KP951056516, TX, TW); |
154 | | T17 = FNMS(KP951056516, TX, TW); |
155 | | T11 = FNMS(KP809016994, T10, Tx); |
156 | | T13 = FNMS(KP951056516, T12, T11); |
157 | | T16 = FMA(KP951056516, T12, T11); |
158 | | } |
159 | | { /* each combined term is scaled by KP707106781 and stored to a pair of outputs */ |
160 | | E T14, T1v, T18, T1t; |
161 | | T14 = TY - T13; |
162 | | Cr[WS(csr, 6)] = FNMS(KP707106781, T14, TT); |
163 | | Cr[WS(csr, 3)] = FMA(KP707106781, T14, TT); |
164 | | T1v = T17 + T16; |
165 | | Ci[WS(csi, 6)] = FMS(KP707106781, T1v, T1u); |
166 | | Ci[WS(csi, 3)] = FMA(KP707106781, T1v, T1u); |
167 | | T18 = T16 - T17; |
168 | | Cr[WS(csr, 8)] = FNMS(KP707106781, T18, T15); |
169 | | Cr[WS(csr, 1)] = FMA(KP707106781, T18, T15); |
170 | | T1t = TY + T13; |
171 | | Ci[WS(csi, 8)] = -(FMA(KP707106781, T1t, T1s)); |
172 | | Ci[WS(csi, 1)] = FNMS(KP707106781, T1t, T1s); |
173 | | } |
174 | | } |
175 | | { /* outputs Cr and Ci at indices 9, 0, 5, 4 */ |
176 | | E Th, TN, T1l, T1n, Tw, TO, TL, TP; |
177 | | { /* same plus/minus pairing as the previous output group, built from Tg, T1k, Tv, TK */ |
178 | | E T9, T1h, Ts, TH; |
179 | | T9 = FNMS(KP559016994, T8, T7); |
180 | | Th = FNMS(KP951056516, Tg, T9); |
181 | | TN = FMA(KP951056516, Tg, T9); |
182 | | T1h = FMA(KP809016994, T1g, T1d); |
183 | | T1l = FMA(KP951056516, T1k, T1h); |
184 | | T1n = FNMS(KP951056516, T1k, T1h); |
185 | | Ts = FNMS(KP559016994, Tr, Ti); |
186 | | Tw = FNMS(KP951056516, Tv, Ts); |
187 | | TO = FMA(KP951056516, Tv, Ts); |
188 | | TH = FNMS(KP559016994, TG, TD); |
189 | | TL = FNMS(KP951056516, TK, TH); |
190 | | TP = FMA(KP951056516, TK, TH); |
191 | | } |
192 | | { |
193 | | E TM, T1m, TQ, T1o; |
194 | | TM = Tw - TL; |
195 | | Cr[WS(csr, 9)] = FNMS(KP707106781, TM, Th); |
196 | | Cr[0] = FMA(KP707106781, TM, Th); |
197 | | T1m = TO + TP; |
198 | | Ci[0] = -(FMA(KP707106781, T1m, T1l)); |
199 | | Ci[WS(csi, 9)] = FNMS(KP707106781, T1m, T1l); |
200 | | TQ = TO - TP; |
201 | | Cr[WS(csr, 5)] = FNMS(KP707106781, TQ, TN); |
202 | | Cr[WS(csr, 4)] = FMA(KP707106781, TQ, TN); |
203 | | T1o = Tw + TL; |
204 | | Ci[WS(csi, 4)] = -(FMA(KP707106781, T1o, T1n)); |
205 | | Ci[WS(csi, 5)] = FNMS(KP707106781, T1o, T1n); |
206 | | } |
207 | | } |
208 | | } /* end of the per-pass loop */ |
209 | | } |
210 | | } |
211 | | |
212 | | static const kr2c_desc desc = { 20, "r2cfII_20", { 39, 0, 63, 0 }, &GENUS }; /* descriptor for the FMA variant: 20, the kernel name, an operation-count tuple, and &GENUS; the first three counts (39, 0, 63) equal the additions / multiplications / fused multiply-add figures quoted in this variant's generated header comment. NOTE(review): the meaning of the fourth count and of GENUS is defined outside this file -- confirm. */ |
213 | | |
214 | | void X(codelet_r2cfII_20) (planner *p) { X(kr2c_register) (p, r2cfII_20, &desc); /* registration entry point: hands the static kernel r2cfII_20 and its descriptor desc to X(kr2c_register) for planner p; X() is a name-wrapping macro defined outside this file */ |
215 | | } |
216 | | |
217 | | #else |
218 | | |
219 | | /* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 20 -name r2cfII_20 -dft-II -include rdft/scalar/r2cfII.h */ |
220 | | |
221 | | /* |
222 | | * This function contains 102 FP additions, 34 FP multiplications, |
223 | | * (or, 86 additions, 18 multiplications, 16 fused multiply/add), |
224 | | * 60 stack variables, 13 constants, and 40 memory accesses |
225 | | */ |
226 | | #include "rdft/scalar/r2cfII.h" |
227 | | |
228 | | static void r2cfII_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs) /* non-FMA variant (the #else branch, used when neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined). Each pass reads R0[0..9] and R1[0..9] through WS(rs, k) and writes Cr[0..9] and Ci[0..9] through WS(csr, k) / WS(csi, k); v passes are performed. NOTE(review): per the generator flag -dft-II this is presumably the size-20 real-to-complex type-II transform -- confirm against the FFTW docs. */ |
229 | 0 | { |
230 | 0 | DK(KP572061402, +0.572061402817684297600072783580302076536153377); /* numerically cos(pi/5)/sqrt(2); DK is a macro defined outside this file */ |
231 | 0 | DK(KP218508012, +0.218508012224410535399650602527877556893735408); /* numerically cos(2*pi/5)/sqrt(2) */ |
232 | 0 | DK(KP309016994, +0.309016994374947424102293417182819058860154590); /* numerically cos(2*pi/5) = (sqrt(5) - 1)/4 */ |
233 | 0 | DK(KP809016994, +0.809016994374947424102293417182819058860154590); /* numerically cos(pi/5) = (1 + sqrt(5))/4 */ |
234 | 0 | DK(KP559016994, +0.559016994374947424102293417182819058860154590); /* numerically sqrt(5)/4 */ |
235 | 0 | DK(KP951056516, +0.951056516295153572116439333379382143405698634); /* numerically sin(2*pi/5) */ |
236 | 0 | DK(KP587785252, +0.587785252292473129168705954639072768597652438); /* numerically sin(pi/5) */ |
237 | 0 | DK(KP250000000, +0.250000000000000000000000000000000000000000000); /* 1/4 */ |
238 | 0 | DK(KP176776695, +0.176776695296636881100211090526212259821208984); /* numerically sqrt(2)/8 */ |
239 | 0 | DK(KP395284707, +0.395284707521047416499861693054089816714944392); /* numerically sqrt(10)/8 */ |
240 | 0 | DK(KP672498511, +0.672498511963957326960058968885748755876783111); /* numerically sin(2*pi/5)/sqrt(2) */ |
241 | 0 | DK(KP415626937, +0.415626937777453428589967464113135184222253485); /* numerically sin(pi/5)/sqrt(2) */ |
242 | 0 | DK(KP707106781, +0.707106781186547524400844362104849039284835938); /* numerically 1/sqrt(2) */ |
243 | 0 | { |
244 | 0 | INT i; |
245 | 0 | for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) { /* one pass per count of v: inputs R0/R1 advance by ivs, outputs Cr/Ci advance by ovs; MAKE_VOLATILE_STRIDE is a macro defined outside this file */ |
246 | 0 | E T8, TD, Tm, TN, T9, TC, TY, TE, Te, TF, Tl, TK, T12, TL, Tk; /* temporaries shared across the sub-blocks below */ |
247 | 0 | E TM, T1, T6, Tq, T1l, T1c, Tp, T1f, T1e, T1d, Ty, TW, T1g, T1m, Tx; |
248 | 0 | E Tu; |
249 | 0 | T8 = R1[WS(rs, 2)]; |
250 | 0 | TD = KP707106781 * T8; /* R1[2] scaled by KP707106781 */ |
251 | 0 | Tm = R1[WS(rs, 7)]; |
252 | 0 | TN = KP707106781 * Tm; /* R1[7] scaled by KP707106781 */ |
253 | 0 | { /* R1[6], R1[8], R1[0], R1[4] */ |
254 | 0 | E Ta, TA, Td, TB, Tb, Tc; |
255 | 0 | T9 = R1[WS(rs, 6)]; |
256 | 0 | Ta = R1[WS(rs, 8)]; |
257 | 0 | TA = T9 + Ta; |
258 | 0 | Tb = R1[0]; |
259 | 0 | Tc = R1[WS(rs, 4)]; |
260 | 0 | Td = Tb + Tc; |
261 | 0 | TB = Tb - Tc; |
262 | 0 | TC = FMA(KP415626937, TA, KP672498511 * TB); /* FMA / FNMS are macros defined outside this file; NOTE(review): presumably a*b+c and c-a*b respectively -- confirm */ |
263 | 0 | TY = FNMS(KP415626937, TB, KP672498511 * TA); |
264 | 0 | TE = KP395284707 * (Ta - Td); |
265 | 0 | Te = Ta + Td; |
266 | 0 | TF = KP176776695 * Te; |
267 | 0 | } |
268 | 0 | { /* R1[1], R1[3], R1[5], R1[9] */ |
269 | 0 | E Tg, TJ, Tj, TI, Th, Ti; |
270 | 0 | Tg = R1[WS(rs, 1)]; |
271 | 0 | Tl = R1[WS(rs, 3)]; |
272 | 0 | TJ = Tg + Tl; |
273 | 0 | Th = R1[WS(rs, 5)]; |
274 | 0 | Ti = R1[WS(rs, 9)]; |
275 | 0 | Tj = Th + Ti; |
276 | 0 | TI = Th - Ti; |
277 | 0 | TK = FNMS(KP415626937, TJ, KP672498511 * TI); |
278 | 0 | T12 = FMA(KP415626937, TI, KP672498511 * TJ); |
279 | 0 | TL = KP395284707 * (Tg - Tj); |
280 | 0 | Tk = Tg + Tj; |
281 | 0 | TM = KP176776695 * Tk; |
282 | 0 | } |
283 | 0 | { /* R0[0] and the even-indexed inputs R0[6], R0[8], R0[2], R0[4] */ |
284 | 0 | E T2, T5, T3, T4, T1a, T1b; |
285 | 0 | T1 = R0[0]; |
286 | 0 | T2 = R0[WS(rs, 6)]; |
287 | 0 | T5 = R0[WS(rs, 8)]; |
288 | 0 | T3 = R0[WS(rs, 2)]; |
289 | 0 | T4 = R0[WS(rs, 4)]; |
290 | 0 | T1a = T4 + T2; |
291 | 0 | T1b = T5 + T3; |
292 | 0 | T6 = T2 + T3 - (T4 + T5); |
293 | 0 | Tq = FMA(KP250000000, T6, T1); |
294 | 0 | T1l = FNMS(KP951056516, T1b, KP587785252 * T1a); |
295 | 0 | T1c = FMA(KP951056516, T1a, KP587785252 * T1b); |
296 | 0 | Tp = KP559016994 * (T5 + T2 - (T4 + T3)); |
297 | 0 | } |
298 | 0 | T1f = R0[WS(rs, 5)]; |
299 | 0 | { /* R0[9], R0[1], R0[3], R0[7]: pairwise differences (Tx, Tu) and sums (T1e, T1d) */ |
300 | 0 | E Tv, Tw, Ts, Tt; |
301 | 0 | Tv = R0[WS(rs, 9)]; |
302 | 0 | Tw = R0[WS(rs, 1)]; |
303 | 0 | Tx = Tv - Tw; |
304 | 0 | T1e = Tv + Tw; |
305 | 0 | Ts = R0[WS(rs, 3)]; |
306 | 0 | Tt = R0[WS(rs, 7)]; |
307 | 0 | Tu = Ts - Tt; |
308 | 0 | T1d = Ts + Tt; |
309 | 0 | } |
310 | 0 | Ty = FMA(KP951056516, Tu, KP587785252 * Tx); |
311 | 0 | TW = FNMS(KP951056516, Tx, KP587785252 * Tu); |
312 | 0 | T1g = FMA(KP809016994, T1d, KP309016994 * T1e) + T1f; |
313 | 0 | T1m = FNMS(KP809016994, T1e, T1f) - (KP309016994 * T1d); |
314 | 0 | { /* outputs Cr[2], Ci[2], Cr[7], Ci[7] */ |
315 | 0 | E T7, T1r, To, T1q, Tf, Tn; |
316 | 0 | T7 = T1 - T6; |
317 | 0 | T1r = T1e + T1f - T1d; |
318 | 0 | Tf = T8 + (T9 - Te); |
319 | 0 | Tn = (Tk - Tl) - Tm; |
320 | 0 | To = KP707106781 * (Tf + Tn); |
321 | 0 | T1q = KP707106781 * (Tf - Tn); |
322 | 0 | Cr[WS(csr, 2)] = T7 - To; |
323 | 0 | Ci[WS(csi, 2)] = T1q - T1r; |
324 | 0 | Cr[WS(csr, 7)] = T7 + To; |
325 | 0 | Ci[WS(csi, 7)] = T1q + T1r; |
326 | 0 | } |
327 | 0 | { /* outputs Cr and Ci at indices 5, 4, 9, 0 */ |
328 | 0 | E T1h, T1j, TX, T15, T10, T16, T13, T17, TV, TZ, T11; |
329 | 0 | T1h = T1c - T1g; |
330 | 0 | T1j = T1c + T1g; |
331 | 0 | TV = Tq - Tp; |
332 | 0 | TX = TV - TW; |
333 | 0 | T15 = TV + TW; |
334 | 0 | TZ = FMA(KP218508012, T9, TD) + TF - TE; |
335 | 0 | T10 = TY + TZ; |
336 | 0 | T16 = TZ - TY; |
337 | 0 | T11 = FNMS(KP218508012, Tl, TL) - (TM + TN); |
338 | 0 | T13 = T11 - T12; |
339 | 0 | T17 = T11 + T12; |
340 | 0 | { /* each combined term is stored as a difference/sum pair of outputs */ |
341 | 0 | E T14, T19, T18, T1i; |
342 | 0 | T14 = T10 + T13; |
343 | 0 | Cr[WS(csr, 5)] = TX - T14; |
344 | 0 | Cr[WS(csr, 4)] = TX + T14; |
345 | 0 | T19 = T17 - T16; |
346 | 0 | Ci[WS(csi, 5)] = T19 - T1h; |
347 | 0 | Ci[WS(csi, 4)] = T19 + T1h; |
348 | 0 | T18 = T16 + T17; |
349 | 0 | Cr[WS(csr, 9)] = T15 - T18; |
350 | 0 | Cr[0] = T15 + T18; |
351 | 0 | T1i = T13 - T10; |
352 | 0 | Ci[0] = T1i - T1j; |
353 | 0 | Ci[WS(csi, 9)] = T1i + T1j; |
354 | 0 | } |
355 | 0 | } |
356 | 0 | { /* outputs Cr and Ci at indices 6, 3, 8, 1 */ |
357 | 0 | E T1n, T1p, Tz, TR, TH, TS, TP, TT, Tr, TG, TO; |
358 | 0 | T1n = T1l + T1m; |
359 | 0 | T1p = T1m - T1l; |
360 | 0 | Tr = Tp + Tq; |
361 | 0 | Tz = Tr + Ty; |
362 | 0 | TR = Tr - Ty; |
363 | 0 | TG = TD + TE + FNMS(KP572061402, T9, TF); |
364 | 0 | TH = TC + TG; |
365 | 0 | TS = TC - TG; |
366 | 0 | TO = TL + TM + FNMS(KP572061402, Tl, TN); |
367 | 0 | TP = TK - TO; |
368 | 0 | TT = TK + TO; |
369 | 0 | { /* each combined term is stored as a difference/sum pair of outputs */ |
370 | 0 | E TQ, T1o, TU, T1k; |
371 | 0 | TQ = TH + TP; |
372 | 0 | Cr[WS(csr, 6)] = Tz - TQ; |
373 | 0 | Cr[WS(csr, 3)] = Tz + TQ; |
374 | 0 | T1o = TT - TS; |
375 | 0 | Ci[WS(csi, 6)] = T1o - T1p; |
376 | 0 | Ci[WS(csi, 3)] = T1o + T1p; |
377 | 0 | TU = TS + TT; |
378 | 0 | Cr[WS(csr, 8)] = TR - TU; |
379 | 0 | Cr[WS(csr, 1)] = TR + TU; |
380 | 0 | T1k = TP - TH; |
381 | 0 | Ci[WS(csi, 8)] = T1k - T1n; |
382 | 0 | Ci[WS(csi, 1)] = T1k + T1n; |
383 | 0 | } |
384 | 0 | } |
385 | 0 | } /* end of the per-pass loop */ |
386 | 0 | } |
387 | 0 | } |
388 | | |
389 | | static const kr2c_desc desc = { 20, "r2cfII_20", { 86, 18, 16, 0 }, &GENUS }; /* descriptor for the non-FMA variant: 20, the kernel name, an operation-count tuple, and &GENUS; the first three counts (86, 18, 16) equal the additions / multiplications / fused multiply-add figures quoted in this variant's generated header comment. NOTE(review): the meaning of the fourth count and of GENUS is defined outside this file -- confirm. */ |
390 | | |
391 | 1 | void X(codelet_r2cfII_20) (planner *p) { X(kr2c_register) (p, r2cfII_20, &desc); /* registration entry point: hands the static kernel r2cfII_20 and its descriptor desc to X(kr2c_register) for planner p; X() is a name-wrapping macro defined outside this file */ |
392 | 1 | } |
393 | | |
394 | | #endif |