/src/fftw3/rdft/scalar/r2cb/r2cbIII_16.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2003, 2007-14 Matteo Frigo |
3 | | * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology |
4 | | * |
5 | | * This program is free software; you can redistribute it and/or modify |
6 | | * it under the terms of the GNU General Public License as published by |
7 | | * the Free Software Foundation; either version 2 of the License, or |
8 | | * (at your option) any later version. |
9 | | * |
10 | | * This program is distributed in the hope that it will be useful, |
11 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | * GNU General Public License for more details. |
14 | | * |
15 | | * You should have received a copy of the GNU General Public License |
16 | | * along with this program; if not, write to the Free Software |
17 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
18 | | * |
19 | | */ |
20 | | |
21 | | /* This file was automatically generated --- DO NOT EDIT */ |
22 | | /* Generated on Fri Oct 10 06:59:38 UTC 2025 */ |
23 | | |
24 | | #include "rdft/codelet-rdft.h" |
25 | | |
26 | | #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) |
27 | | |
28 | | /* Generated by: ../../../genfft/gen_r2cb.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -name r2cbIII_16 -dft-III -include rdft/scalar/r2cbIII.h */ |
29 | | |
30 | | /* |
31 | | * This function contains 66 FP additions, 36 FP multiplications, |
32 | | * (or, 46 additions, 16 multiplications, 20 fused multiply/add), |
33 | | * 40 stack variables, 9 constants, and 32 memory accesses |
34 | | */ |
35 | | #include "rdft/scalar/r2cbIII.h" |
36 | | |
/*
 * r2cbIII_16 -- FMA variant, compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Generated codelet: per the genfft command line quoted before this
 * function it is the output of gen_r2cb with -n 16, -dft-III and -sign 1.
 *
 * Each pass of the loop reads eight values from Cr (index 0 and
 * WS(csr, 1..7)) and eight from Ci (index 0 and WS(csi, 1..7)), then stores
 * eight results into R0 and eight into R1 (index 0 and WS(rs, 1..7)).
 *
 * Parameters:
 *   R0, R1 - output arrays (only written here), indexed through stride rs
 *   Cr, Ci - input arrays (only read here), indexed through csr and csi
 *   v      - number of loop passes; nothing is touched when v <= 0
 *   ivs    - offset added to Cr and Ci after every pass
 *   ovs    - offset added to R0 and R1 after every pass
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE come from headers that are not visible here. The
 * arithmetic below agrees with the non-FMA variant of this function if
 * FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b -- confirm against
 * the headers.
 */
37 | | static void r2cbIII_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs) |
38 | | { |
/*
 * Named constants. Numerically these are:
 *   KP198912367   = tan(pi/16)        KP1_961570560 = 2 cos(pi/16)
 *   KP668178637   = tan(3 pi/16)      KP1_662939224 = 2 cos(3 pi/16)
 *   KP707106781   = sqrt(2)/2         KP1_414213562 = sqrt(2)
 *   KP414213562   = sqrt(2) - 1       KP1_847759065 = 2 cos(pi/8)
 */
39 | | DK(KP198912367, +0.198912367379658006911597622644676228597850501); |
40 | | DK(KP1_961570560, +1.961570560806460898252364472268478073947867462); |
41 | | DK(KP668178637, +0.668178637919298919997757686523080761552472251); |
42 | | DK(KP1_662939224, +1.662939224605090474157576755235811513477121624); |
43 | | DK(KP707106781, +0.707106781186547524400844362104849039284835938); |
44 | | DK(KP1_414213562, +1.414213562373095048801688724209698078569671875); |
45 | | DK(KP414213562, +0.414213562373095048801688724209698078569671875); |
46 | | DK(KP1_847759065, +1.847759065022573512256366378793576573644833252); |
47 | | DK(KP2_000000000, +2.000000000000000000000000000000000000000000000); |
48 | | { |
49 | | INT i; |
/*
 * i counts down from v to 1; the increment clause advances all four
 * pointers. NOTE(review): the purpose of MAKE_VOLATILE_STRIDE is not
 * visible in this file.
 */
50 | | for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) { |
51 | | E T7, TW, T13, Tj, TA, TK, TP, TH, Te, TX, T12, To, Tt, TC, TS; |
52 | | E TB, TT, TY; |
/* Stage 1a: sums and differences of the input pairs (0, 7) and (4, 3). */
53 | | { |
54 | | E T3, Tf, Tz, TU, T6, Tw, Ti, TV; |
55 | | { |
56 | | E T1, T2, Tx, Ty; |
57 | | T1 = Cr[0]; |
58 | | T2 = Cr[WS(csr, 7)]; |
59 | | T3 = T1 + T2; |
60 | | Tf = T1 - T2; |
61 | | Tx = Ci[0]; |
62 | | Ty = Ci[WS(csi, 7)]; |
63 | | Tz = Tx + Ty; |
64 | | TU = Ty - Tx; |
65 | | } |
66 | | { |
67 | | E T4, T5, Tg, Th; |
68 | | T4 = Cr[WS(csr, 4)]; |
69 | | T5 = Cr[WS(csr, 3)]; |
70 | | T6 = T4 + T5; |
71 | | Tw = T4 - T5; |
72 | | Tg = Ci[WS(csi, 4)]; |
73 | | Th = Ci[WS(csi, 3)]; |
74 | | Ti = Tg + Th; |
75 | | TV = Th - Tg; |
76 | | } |
77 | | T7 = T3 + T6; |
78 | | TW = TU - TV; |
79 | | T13 = TV + TU; |
80 | | Tj = Tf - Ti; |
81 | | TA = Tw + Tz; |
82 | | TK = Tw - Tz; |
83 | | TP = T3 - T6; |
84 | | TH = Tf + Ti; |
85 | | } |
/* Stage 1b: sums and differences of the input pairs (2, 5) and (1, 6). */
86 | | { |
87 | | E Ta, Tk, Tn, TR, Td, Tp, Ts, TQ; |
88 | | { |
89 | | E T8, T9, Tl, Tm; |
90 | | T8 = Cr[WS(csr, 2)]; |
91 | | T9 = Cr[WS(csr, 5)]; |
92 | | Ta = T8 + T9; |
93 | | Tk = T8 - T9; |
94 | | Tl = Ci[WS(csi, 2)]; |
95 | | Tm = Ci[WS(csi, 5)]; |
96 | | Tn = Tl + Tm; |
97 | | TR = Tl - Tm; |
98 | | } |
99 | | { |
100 | | E Tb, Tc, Tq, Tr; |
101 | | Tb = Cr[WS(csr, 1)]; |
102 | | Tc = Cr[WS(csr, 6)]; |
103 | | Td = Tb + Tc; |
104 | | Tp = Tb - Tc; |
105 | | Tq = Ci[WS(csi, 1)]; |
106 | | Tr = Ci[WS(csi, 6)]; |
107 | | Ts = Tq + Tr; |
108 | | TQ = Tr - Tq; |
109 | | } |
110 | | Te = Ta + Td; |
111 | | TX = Ta - Td; |
112 | | T12 = TR + TQ; |
113 | | To = Tk - Tn; |
114 | | Tt = Tp - Ts; |
115 | | TC = Tk + Tn; |
116 | | TS = TQ - TR; |
117 | | TB = Tp + Ts; |
118 | | } |
/* Stage 2: the eight stores to R0, built from T7, Te, T12, T13, TP, TS, TW, TX. */
119 | | R0[0] = KP2_000000000 * (T7 + Te); |
120 | | R0[WS(rs, 4)] = KP2_000000000 * (T13 - T12); |
121 | | TT = TP + TS; |
122 | | TY = TW - TX; |
123 | | R0[WS(rs, 1)] = KP1_847759065 * (FMA(KP414213562, TY, TT)); |
124 | | R0[WS(rs, 5)] = KP1_847759065 * (FNMS(KP414213562, TT, TY)); |
125 | | { |
126 | | E T11, T14, TZ, T10; |
127 | | T11 = T7 - Te; |
128 | | T14 = T12 + T13; |
129 | | R0[WS(rs, 2)] = KP1_414213562 * (T11 + T14); |
130 | | R0[WS(rs, 6)] = KP1_414213562 * (T14 - T11); |
131 | | TZ = TX + TW; |
132 | | T10 = TP - TS; |
133 | | R0[WS(rs, 3)] = KP1_847759065 * (FMA(KP414213562, T10, TZ)); |
134 | | R0[WS(rs, 7)] = -(KP1_847759065 * (FNMS(KP414213562, TZ, T10))); |
135 | | } |
/* Stage 3a: stores to R1 at WS(rs, 1), 3, 5 and 7, built from TH, TK, TB, TC, To, Tt. */
136 | | { |
137 | | E TJ, TO, TM, TN, TI, TL; |
138 | | TI = TC + TB; |
139 | | TJ = FNMS(KP707106781, TI, TH); |
140 | | TO = FMA(KP707106781, TI, TH); |
141 | | TL = To - Tt; |
142 | | TM = FNMS(KP707106781, TL, TK); |
143 | | TN = FMA(KP707106781, TL, TK); |
144 | | R1[WS(rs, 1)] = KP1_662939224 * (FMA(KP668178637, TM, TJ)); |
145 | | R1[WS(rs, 7)] = -(KP1_961570560 * (FNMS(KP198912367, TN, TO))); |
146 | | R1[WS(rs, 5)] = KP1_662939224 * (FNMS(KP668178637, TJ, TM)); |
147 | | R1[WS(rs, 3)] = KP1_961570560 * (FMA(KP198912367, TO, TN)); |
148 | | } |
/* Stage 3b: stores to R1 at 0 and WS(rs, 2), 4 and 6, built from Tj, TA, TB, TC, To, Tt. */
149 | | { |
150 | | E Tv, TG, TE, TF, Tu, TD; |
151 | | Tu = To + Tt; |
152 | | Tv = FMA(KP707106781, Tu, Tj); |
153 | | TG = FNMS(KP707106781, Tu, Tj); |
154 | | TD = TB - TC; |
155 | | TE = FNMS(KP707106781, TD, TA); |
156 | | TF = FMA(KP707106781, TD, TA); |
157 | | R1[0] = KP1_961570560 * (FNMS(KP198912367, TE, Tv)); |
158 | | R1[WS(rs, 6)] = -(KP1_662939224 * (FMA(KP668178637, TF, TG))); |
159 | | R1[WS(rs, 4)] = -(KP1_961570560 * (FMA(KP198912367, Tv, TE))); |
160 | | R1[WS(rs, 2)] = -(KP1_662939224 * (FNMS(KP668178637, TG, TF))); |
161 | | } |
162 | | } |
163 | | } |
164 | | } |
165 | | |
/*
 * Descriptor handed to X(kr2c_register) together with r2cbIII_16 (FMA
 * variant). It holds the value 16, the codelet name, the counts
 * { 46, 16, 20, 0 } and the address of GENUS. The first three counts equal
 * the "46 additions, 16 multiplications, 20 fused multiply/add" figures
 * quoted in this variant's header comment.
 * NOTE(review): the field names of kr2c_desc and the definition of GENUS
 * live in headers not shown here; 16 presumably is the transform size
 * (it matches -n 16 on the generator command line) -- confirm.
 */
166 | | static const kr2c_desc desc = { 16, "r2cbIII_16", { 46, 16, 20, 0 }, &GENUS }; |
167 | | |
/*
 * Registration entry point for the FMA variant: the only definition in
 * this branch that is not declared static. It passes the planner p, the
 * r2cbIII_16 function and the address of desc to X(kr2c_register).
 * NOTE(review): X() is presumably the library's symbol-prefix macro;
 * its definition is not visible in this file.
 */
168 | | void X(codelet_r2cbIII_16) (planner *p) { X(kr2c_register) (p, r2cbIII_16, &desc); |
169 | | } |
170 | | |
171 | | #else |
172 | | |
173 | | /* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -name r2cbIII_16 -dft-III -include rdft/scalar/r2cbIII.h */ |
174 | | |
175 | | /* |
176 | | * This function contains 66 FP additions, 32 FP multiplications, |
177 | | * (or, 54 additions, 20 multiplications, 12 fused multiply/add), |
178 | | * 40 stack variables, 9 constants, and 32 memory accesses |
179 | | */ |
180 | | #include "rdft/scalar/r2cbIII.h" |
181 | | |
/*
 * r2cbIII_16 -- non-FMA variant, compiled when neither ARCH_PREFERS_FMA
 * nor ISA_EXTENSION_PREFERS_FMA is defined (the #else branch).
 *
 * Generated codelet: per the genfft command line quoted before this
 * function it is the output of gen_r2cb with -n 16, -dft-III and -sign 1
 * (no -fma flag).
 *
 * Each pass of the loop reads eight values from Cr (index 0 and
 * WS(csr, 1..7)) and eight from Ci (index 0 and WS(csi, 1..7)), then stores
 * eight results into R0 and eight into R1 (index 0 and WS(rs, 1..7)).
 *
 * Parameters:
 *   R0, R1 - output arrays (only written here), indexed through stride rs
 *   Cr, Ci - input arrays (only read here), indexed through csr and csi
 *   v      - number of loop passes; nothing is touched when v <= 0
 *   ivs    - offset added to Cr and Ci after every pass
 *   ovs    - offset added to R0 and R1 after every pass
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE come from headers that are not visible here. The
 * arithmetic below agrees with the FMA variant of this function if
 * FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b -- confirm against
 * the headers.
 */
182 | | static void r2cbIII_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs) |
183 | 0 | { |
/*
 * Named constants. Numerically these are:
 *   KP1_961570560 = 2 cos(pi/16)      KP390180644   = 2 sin(pi/16)
 *   KP1_662939224 = 2 cos(3 pi/16)    KP1_111140466 = 2 sin(3 pi/16)
 *   KP1_847759065 = 2 cos(pi/8)       KP765366864   = 2 sin(pi/8)
 *   KP707106781   = sqrt(2)/2         KP1_414213562 = sqrt(2)
 */
184 | 0 | DK(KP1_961570560, +1.961570560806460898252364472268478073947867462); |
185 | 0 | DK(KP390180644, +0.390180644032256535696569736954044481855383236); |
186 | 0 | DK(KP1_111140466, +1.111140466039204449485661627897065748749874382); |
187 | 0 | DK(KP1_662939224, +1.662939224605090474157576755235811513477121624); |
188 | 0 | DK(KP707106781, +0.707106781186547524400844362104849039284835938); |
189 | 0 | DK(KP1_414213562, +1.414213562373095048801688724209698078569671875); |
190 | 0 | DK(KP765366864, +0.765366864730179543456919968060797733522689125); |
191 | 0 | DK(KP1_847759065, +1.847759065022573512256366378793576573644833252); |
192 | 0 | DK(KP2_000000000, +2.000000000000000000000000000000000000000000000); |
193 | 0 | { |
194 | 0 | INT i; |
/*
 * i counts down from v to 1; the increment clause advances all four
 * pointers. NOTE(review): the purpose of MAKE_VOLATILE_STRIDE is not
 * visible in this file.
 */
195 | 0 | for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) { |
196 | 0 | E T7, TW, T13, Tj, TD, TK, TP, TH, Te, TX, T12, To, Tt, Tx, TS; |
197 | 0 | E Tw, TT, TY; |
/* Stage 1a: sums and differences of the input pairs (0, 7) and (4, 3). */
198 | 0 | { |
199 | 0 | E T3, Tf, TC, TV, T6, Tz, Ti, TU; |
200 | 0 | { |
201 | 0 | E T1, T2, TA, TB; |
202 | 0 | T1 = Cr[0]; |
203 | 0 | T2 = Cr[WS(csr, 7)]; |
204 | 0 | T3 = T1 + T2; |
205 | 0 | Tf = T1 - T2; |
206 | 0 | TA = Ci[0]; |
207 | 0 | TB = Ci[WS(csi, 7)]; |
208 | 0 | TC = TA + TB; |
209 | 0 | TV = TB - TA; |
210 | 0 | } |
211 | 0 | { |
212 | 0 | E T4, T5, Tg, Th; |
213 | 0 | T4 = Cr[WS(csr, 4)]; |
214 | 0 | T5 = Cr[WS(csr, 3)]; |
215 | 0 | T6 = T4 + T5; |
216 | 0 | Tz = T4 - T5; |
217 | 0 | Tg = Ci[WS(csi, 4)]; |
218 | 0 | Th = Ci[WS(csi, 3)]; |
219 | 0 | Ti = Tg + Th; |
220 | 0 | TU = Tg - Th; |
221 | 0 | } |
222 | 0 | T7 = T3 + T6; |
223 | 0 | TW = TU + TV; |
224 | 0 | T13 = TV - TU; |
225 | 0 | Tj = Tf - Ti; |
226 | 0 | TD = Tz + TC; |
227 | 0 | TK = Tz - TC; |
228 | 0 | TP = T3 - T6; |
229 | 0 | TH = Tf + Ti; |
230 | 0 | } |
/* Stage 1b: sums and differences of the input pairs (2, 5) and (1, 6). */
231 | 0 | { |
232 | 0 | E Ta, Tk, Tn, TR, Td, Tp, Ts, TQ; |
233 | 0 | { |
234 | 0 | E T8, T9, Tl, Tm; |
235 | 0 | T8 = Cr[WS(csr, 2)]; |
236 | 0 | T9 = Cr[WS(csr, 5)]; |
237 | 0 | Ta = T8 + T9; |
238 | 0 | Tk = T8 - T9; |
239 | 0 | Tl = Ci[WS(csi, 2)]; |
240 | 0 | Tm = Ci[WS(csi, 5)]; |
241 | 0 | Tn = Tl + Tm; |
242 | 0 | TR = Tl - Tm; |
243 | 0 | } |
244 | 0 | { |
245 | 0 | E Tb, Tc, Tq, Tr; |
246 | 0 | Tb = Cr[WS(csr, 1)]; |
247 | 0 | Tc = Cr[WS(csr, 6)]; |
248 | 0 | Td = Tb + Tc; |
249 | 0 | Tp = Tb - Tc; |
250 | 0 | Tq = Ci[WS(csi, 1)]; |
251 | 0 | Tr = Ci[WS(csi, 6)]; |
252 | 0 | Ts = Tq + Tr; |
253 | 0 | TQ = Tr - Tq; |
254 | 0 | } |
255 | 0 | Te = Ta + Td; |
256 | 0 | TX = Ta - Td; |
257 | 0 | T12 = TR + TQ; |
258 | 0 | To = Tk - Tn; |
259 | 0 | Tt = Tp - Ts; |
260 | 0 | Tx = Tp + Ts; |
261 | 0 | TS = TQ - TR; |
262 | 0 | Tw = Tk + Tn; |
263 | 0 | } |
/* Stage 2: the eight stores to R0, built from T7, Te, T12, T13, TP, TS, TW, TX. */
264 | 0 | R0[0] = KP2_000000000 * (T7 + Te); |
265 | 0 | R0[WS(rs, 4)] = KP2_000000000 * (T13 - T12); |
266 | 0 | TT = TP + TS; |
267 | 0 | TY = TW - TX; |
268 | 0 | R0[WS(rs, 1)] = FMA(KP1_847759065, TT, KP765366864 * TY); |
269 | 0 | R0[WS(rs, 5)] = FNMS(KP765366864, TT, KP1_847759065 * TY); |
270 | 0 | { |
271 | 0 | E T11, T14, TZ, T10; |
272 | 0 | T11 = T7 - Te; |
273 | 0 | T14 = T12 + T13; |
274 | 0 | R0[WS(rs, 2)] = KP1_414213562 * (T11 + T14); |
275 | 0 | R0[WS(rs, 6)] = KP1_414213562 * (T14 - T11); |
276 | 0 | TZ = TP - TS; |
277 | 0 | T10 = TX + TW; |
278 | 0 | R0[WS(rs, 3)] = FMA(KP765366864, TZ, KP1_847759065 * T10); |
279 | 0 | R0[WS(rs, 7)] = FNMS(KP1_847759065, TZ, KP765366864 * T10); |
280 | 0 | } |
/* Stage 3a: stores to R1 at WS(rs, 1), 3, 5 and 7, built from TH, TK, Tw, Tx, To, Tt. */
281 | 0 | { |
282 | 0 | E TJ, TN, TM, TO, TI, TL; |
283 | 0 | TI = KP707106781 * (Tw + Tx); |
284 | 0 | TJ = TH - TI; |
285 | 0 | TN = TH + TI; |
286 | 0 | TL = KP707106781 * (To - Tt); |
287 | 0 | TM = TK - TL; |
288 | 0 | TO = TL + TK; |
289 | 0 | R1[WS(rs, 1)] = FMA(KP1_662939224, TJ, KP1_111140466 * TM); |
290 | 0 | R1[WS(rs, 7)] = FNMS(KP1_961570560, TN, KP390180644 * TO); |
291 | 0 | R1[WS(rs, 5)] = FNMS(KP1_111140466, TJ, KP1_662939224 * TM); |
292 | 0 | R1[WS(rs, 3)] = FMA(KP390180644, TN, KP1_961570560 * TO); |
293 | 0 | } |
/* Stage 3b: stores to R1 at 0 and WS(rs, 2), 4 and 6, built from Tj, TD, Tw, Tx, To, Tt. */
294 | 0 | { |
295 | 0 | E Tv, TF, TE, TG, Tu, Ty; |
296 | 0 | Tu = KP707106781 * (To + Tt); |
297 | 0 | Tv = Tj + Tu; |
298 | 0 | TF = Tj - Tu; |
299 | 0 | Ty = KP707106781 * (Tw - Tx); |
300 | 0 | TE = Ty + TD; |
301 | 0 | TG = Ty - TD; |
302 | 0 | R1[0] = FNMS(KP390180644, TE, KP1_961570560 * Tv); |
303 | 0 | R1[WS(rs, 6)] = FNMS(KP1_662939224, TF, KP1_111140466 * TG); |
304 | 0 | R1[WS(rs, 4)] = -(FMA(KP390180644, Tv, KP1_961570560 * TE)); |
305 | 0 | R1[WS(rs, 2)] = FMA(KP1_111140466, TF, KP1_662939224 * TG); |
306 | 0 | } |
307 | 0 | } |
308 | 0 | } |
309 | 0 | } |
310 | | |
/*
 * Descriptor handed to X(kr2c_register) together with r2cbIII_16 (non-FMA
 * variant). It holds the value 16, the codelet name, the counts
 * { 54, 20, 12, 0 } and the address of GENUS. The first three counts equal
 * the "54 additions, 20 multiplications, 12 fused multiply/add" figures
 * quoted in this variant's header comment.
 * NOTE(review): the field names of kr2c_desc and the definition of GENUS
 * live in headers not shown here; 16 presumably is the transform size
 * (it matches -n 16 on the generator command line) -- confirm.
 */
311 | | static const kr2c_desc desc = { 16, "r2cbIII_16", { 54, 20, 12, 0 }, &GENUS }; |
312 | | |
/*
 * Registration entry point for the non-FMA variant: the only definition
 * in this branch that is not declared static. It passes the planner p,
 * the r2cbIII_16 function and the address of desc to X(kr2c_register).
 * NOTE(review): X() is presumably the library's symbol-prefix macro;
 * its definition is not visible in this file.
 */
313 | 1 | void X(codelet_r2cbIII_16) (planner *p) { X(kr2c_register) (p, r2cbIII_16, &desc); |
314 | 1 | } |
315 | | |
316 | | #endif |