/src/fftw3/rdft/scalar/r2cb/r2cbIII_15.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2003, 2007-14 Matteo Frigo |
3 | | * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology |
4 | | * |
5 | | * This program is free software; you can redistribute it and/or modify |
6 | | * it under the terms of the GNU General Public License as published by |
7 | | * the Free Software Foundation; either version 2 of the License, or |
8 | | * (at your option) any later version. |
9 | | * |
10 | | * This program is distributed in the hope that it will be useful, |
11 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | * GNU General Public License for more details. |
14 | | * |
15 | | * You should have received a copy of the GNU General Public License |
16 | | * along with this program; if not, write to the Free Software |
17 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
18 | | * |
19 | | */ |
20 | | |
21 | | /* This file was automatically generated --- DO NOT EDIT */ |
22 | | /* Generated on Sun Sep 8 06:42:26 UTC 2024 */ |
23 | | |
24 | | #include "rdft/codelet-rdft.h" |
25 | | |
26 | | #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) |
27 | | |
28 | | /* Generated by: ../../../genfft/gen_r2cb.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 15 -name r2cbIII_15 -dft-III -include rdft/scalar/r2cbIII.h */ |
29 | | |
30 | | /* |
31 | | * This function contains 64 FP additions, 43 FP multiplications, |
32 | | * (or, 21 additions, 0 multiplications, 43 fused multiply/add), |
33 | | * 42 stack variables, 9 constants, and 30 memory accesses |
34 | | */ |
35 | | #include "rdft/scalar/r2cbIII.h" |
36 | | |
/*
 * r2cbIII_15, FMA variant: this branch of the file is selected when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * The body was emitted by genfft's gen_r2cb with
 * "-fma -sign 1 -n 15 -dft-III" (see the generator comment above), so the
 * arithmetic should be regenerated rather than hand-edited.
 *
 * Each loop iteration reads 8 values from Cr (indices 0..7) and 7 values
 * from Ci (indices 0..6), and writes 8 values to R0 (indices 0..7) and
 * 7 values to R1 (indices 0..6): 15 loads and 15 stores.
 *
 *   R0, R1    output arrays, indexed with WS(rs, k)
 *   Cr, Ci    input arrays, indexed with WS(csr, k) and WS(csi, k)
 *   v         number of loop iterations
 *   ivs       added to Cr and Ci after every iteration
 *   ovs       added to R0 and R1 after every iteration
 *
 * NOTE(review): R, E, INT, stride, WS, DK, FMA, FNMS, FMS and
 * MAKE_VOLATILE_STRIDE are not defined in this listing; presumably they
 * come from rdft/codelet-rdft.h. FMA(a, b, c) is presumably a*b + c,
 * FNMS(a, b, c) presumably c - a*b and FMS(a, b, c) presumably a*b - c;
 * confirm against those headers.
 * NOTE(review): Cr/Ci presumably hold the real and imaginary parts of the
 * complex input and R0/R1 the even- and odd-indexed real outputs; confirm
 * against the FFTW r2cb codelet conventions.
 * NOTE(review): the Count column is empty for every row of this function,
 * presumably because this preprocessor branch was not compiled in the
 * build that produced the listing.
 */
37 | | static void r2cbIII_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs) |
38 | | { |
/* Constants: each KP name spells the leading decimal digits of its value. */
39 | | DK(KP951056516, +0.951056516295153572116439333379382143405698634); |
40 | | DK(KP559016994, +0.559016994374947424102293417182819058860154590); |
41 | | DK(KP1_902113032, +1.902113032590307144232878666758764286811397268); |
42 | | DK(KP1_732050807, +1.732050807568877293527446341505872366942805254); |
43 | | DK(KP250000000, +0.250000000000000000000000000000000000000000000); |
44 | | DK(KP1_118033988, +1.118033988749894848204586834365638117720309180); |
45 | | DK(KP500000000, +0.500000000000000000000000000000000000000000000); |
46 | | DK(KP2_000000000, +2.000000000000000000000000000000000000000000000); |
47 | | DK(KP618033988, +0.618033988749894848204586834365638117720309180); |
48 | | { |
49 | | INT i; |
/*
 * i counts down from v to 1; the increment clause advances the output
 * pointers by ovs and the input pointers by ivs.
 */
50 | | for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) { |
51 | | E Tk, TA, T5, Th, Tz, T6, Tn, TX, TR, Td, Tm, TI, Tv, TN, TD; |
52 | | E TL, TM, Ti, Tj, T12, Te, T11; |
/* Load Ci[4] and Ci[1] and form two 0.618-weighted combinations. */
53 | | Ti = Ci[WS(csi, 4)]; |
54 | | Tj = Ci[WS(csi, 1)]; |
55 | | Tk = FMA(KP618033988, Tj, Ti); |
56 | | TA = FNMS(KP618033988, Ti, Tj); |
/*
 * Load Cr[7], Cr[4], Cr[1]. T5 feeds R0[0] and T11; Tz and Th feed the
 * first and second six-output groups below.
 */
57 | | { |
58 | | E T1, T4, Tg, T2, T3, Tf; |
59 | | T1 = Cr[WS(csr, 7)]; |
60 | | T2 = Cr[WS(csr, 4)]; |
61 | | T3 = Cr[WS(csr, 1)]; |
62 | | T4 = T2 + T3; |
63 | | Tg = T2 - T3; |
64 | | T5 = FMA(KP2_000000000, T4, T1); |
65 | | Tf = FNMS(KP500000000, T4, T1); |
66 | | Th = FMA(KP1_118033988, Tg, Tf); |
67 | | Tz = FNMS(KP1_118033988, Tg, Tf); |
68 | | } |
/* Load Cr[2], Cr[3], Cr[6], Cr[0], Cr[5] and form their sums/differences. */
69 | | { |
70 | | E Tc, TP, T9, TQ; |
71 | | T6 = Cr[WS(csr, 2)]; |
72 | | { |
73 | | E Ta, Tb, T7, T8; |
74 | | Ta = Cr[WS(csr, 3)]; |
75 | | Tb = Cr[WS(csr, 6)]; |
76 | | Tc = Ta + Tb; |
77 | | TP = Ta - Tb; |
78 | | T7 = Cr[0]; |
79 | | T8 = Cr[WS(csr, 5)]; |
80 | | T9 = T7 + T8; |
81 | | TQ = T7 - T8; |
82 | | } |
83 | | Tn = T9 - Tc; |
84 | | TX = FMA(KP618033988, TP, TQ); |
85 | | TR = FNMS(KP618033988, TQ, TP); |
86 | | Td = T9 + Tc; |
87 | | Tm = FNMS(KP250000000, Td, T6); |
88 | | } |
/* Load Ci[2], Ci[3], Ci[6], Ci[0], Ci[5] and form their sums/differences. */
89 | | { |
90 | | E Tu, TK, Tr, TJ; |
91 | | TI = Ci[WS(csi, 2)]; |
92 | | { |
93 | | E Ts, Tt, Tp, Tq; |
94 | | Ts = Ci[WS(csi, 3)]; |
95 | | Tt = Ci[WS(csi, 6)]; |
96 | | Tu = Ts - Tt; |
97 | | TK = Ts + Tt; |
98 | | Tp = Ci[0]; |
99 | | Tq = Ci[WS(csi, 5)]; |
100 | | Tr = Tp + Tq; |
101 | | TJ = Tq - Tp; |
102 | | } |
103 | | Tv = FMA(KP618033988, Tu, Tr); |
104 | | TN = TJ + TK; |
105 | | TD = FNMS(KP618033988, Tr, Tu); |
106 | | TL = TJ - TK; |
107 | | TM = FNMS(KP250000000, TL, TI); |
108 | | } |
/* First three outputs: R0[0], R0[5], R1[2]. */
109 | | T12 = TL + TI; |
110 | | Te = T6 + Td; |
111 | | T11 = Te - T5; |
112 | | R0[0] = FMA(KP2_000000000, Te, T5); |
113 | | R0[WS(rs, 5)] = FMS(KP1_732050807, T12, T11); |
114 | | R1[WS(rs, 2)] = FMA(KP1_732050807, T12, T11); |
/* Six outputs built from Tz, TA, Tm, Tn, TD, TM, TN, TR: R0[6], R1[1], R0[1], R1[3], R0[4], R1[6]. */
115 | | { |
116 | | E TB, TF, TE, TG, TS, TU, TC, TO, TH, TT; |
117 | | TB = FNMS(KP1_902113032, TA, Tz); |
118 | | TF = FMA(KP1_902113032, TA, Tz); |
119 | | TC = FNMS(KP559016994, Tn, Tm); |
120 | | TE = FMA(KP951056516, TD, TC); |
121 | | TG = FNMS(KP951056516, TD, TC); |
122 | | TO = FNMS(KP559016994, TN, TM); |
123 | | TS = FMA(KP951056516, TR, TO); |
124 | | TU = FNMS(KP951056516, TR, TO); |
125 | | R0[WS(rs, 6)] = FMA(KP2_000000000, TE, TB); |
126 | | R1[WS(rs, 1)] = -(FMA(KP2_000000000, TG, TF)); |
127 | | TH = TB - TE; |
128 | | R0[WS(rs, 1)] = FNMS(KP1_732050807, TS, TH); |
129 | | R1[WS(rs, 3)] = -(FMA(KP1_732050807, TS, TH)); |
130 | | TT = TF - TG; |
131 | | R0[WS(rs, 4)] = FNMS(KP1_732050807, TU, TT); |
132 | | R1[WS(rs, 6)] = -(FMA(KP1_732050807, TU, TT)); |
133 | | } |
/* Six outputs built from Th, Tk, Tm, Tn, Tv, TM, TN, TX: R1[4], R0[3], R1[0], R1[5], R0[7], R0[2]. */
134 | | { |
135 | | E Tl, Tx, Tw, Ty, TY, T10, To, TW, TV, TZ; |
136 | | Tl = FNMS(KP1_902113032, Tk, Th); |
137 | | Tx = FMA(KP1_902113032, Tk, Th); |
138 | | To = FMA(KP559016994, Tn, Tm); |
139 | | Tw = FMA(KP951056516, Tv, To); |
140 | | Ty = FNMS(KP951056516, Tv, To); |
141 | | TW = FMA(KP559016994, TN, TM); |
142 | | TY = FNMS(KP951056516, TX, TW); |
143 | | T10 = FMA(KP951056516, TX, TW); |
144 | | R1[WS(rs, 4)] = -(FMA(KP2_000000000, Tw, Tl)); |
145 | | R0[WS(rs, 3)] = FMA(KP2_000000000, Ty, Tx); |
146 | | TV = Ty - Tx; |
147 | | R1[0] = FNMS(KP1_732050807, TY, TV); |
148 | | R1[WS(rs, 5)] = FMA(KP1_732050807, TY, TV); |
149 | | TZ = Tl - Tw; |
150 | | R0[WS(rs, 7)] = FNMS(KP1_732050807, T10, TZ); |
151 | | R0[WS(rs, 2)] = FMA(KP1_732050807, T10, TZ); |
152 | | } |
153 | | } |
154 | | } |
155 | | } |
156 | | |
/*
 * Descriptor for the FMA variant, passed to X(kr2c_register) below.
 * It carries 15 (matching "-n 15" on the generator command line), the
 * codelet name, and { 21, 0, 43, 0 }, whose first three values match the
 * "21 additions, 0 multiplications, 43 fused multiply/add" count in the
 * generator comment above.
 * NOTE(review): kr2c_desc and GENUS are declared outside this listing;
 * confirm the field meanings (including the trailing 0) against their
 * definitions.
 */
157 | | static const kr2c_desc desc = { 15, "r2cbIII_15", { 21, 0, 43, 0 }, &GENUS }; |
158 | | |
/*
 * Registration entry point for the FMA variant: passes planner p, the
 * r2cbIII_15 kernel and its descriptor to X(kr2c_register).
 * NOTE(review): X(...) is a macro defined outside this listing, presumably
 * FFTW's symbol-name prefixing macro; confirm in the project headers.
 */
159 | | void X(codelet_r2cbIII_15) (planner *p) { X(kr2c_register) (p, r2cbIII_15, &desc); |
160 | | } |
161 | | |
162 | | #else |
163 | | |
164 | | /* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 15 -name r2cbIII_15 -dft-III -include rdft/scalar/r2cbIII.h */ |
165 | | |
166 | | /* |
167 | | * This function contains 64 FP additions, 26 FP multiplications, |
168 | | * (or, 49 additions, 11 multiplications, 15 fused multiply/add), |
169 | | * 47 stack variables, 14 constants, and 30 memory accesses |
170 | | */ |
171 | | #include "rdft/scalar/r2cbIII.h" |
172 | | |
/*
 * r2cbIII_15, non-FMA variant: this branch of the file is selected when
 * neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * The body was emitted by genfft's gen_r2cb with "-sign 1 -n 15 -dft-III"
 * (no -fma; see the generator comment above), so the arithmetic should be
 * regenerated rather than hand-edited.
 *
 * Each loop iteration reads 8 values from Cr (indices 0..7) and 7 values
 * from Ci (indices 0..6), and writes 8 values to R0 (indices 0..7) and
 * 7 values to R1 (indices 0..6): 15 loads and 15 stores.
 *
 *   R0, R1    output arrays, indexed with WS(rs, k)
 *   Cr, Ci    input arrays, indexed with WS(csr, k) and WS(csi, k)
 *   v         number of loop iterations
 *   ivs       added to Cr and Ci after every iteration
 *   ovs       added to R0 and R1 after every iteration
 *
 * NOTE(review): R, E, INT, stride, WS, DK, FMA, FNMS, FMS and
 * MAKE_VOLATILE_STRIDE are not defined in this listing; presumably they
 * come from rdft/codelet-rdft.h. FMA(a, b, c) is presumably a*b + c,
 * FNMS(a, b, c) presumably c - a*b and FMS(a, b, c) presumably a*b - c;
 * confirm against those headers.
 * NOTE(review): Cr/Ci presumably hold the real and imaginary parts of the
 * complex input and R0/R1 the even- and odd-indexed real outputs; confirm
 * against the FFTW r2cb codelet conventions.
 * NOTE(review): the Count column is 0 on every row of this function while
 * the registration function at rows 300-301 shows 1, so the run behind
 * this listing registered the codelet but never executed it.
 */
173 | | static void r2cbIII_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs) |
174 | 0 | { |
/* Constants: each KP name spells the leading decimal digits of its value. */
175 | 0 | DK(KP1_732050807, +1.732050807568877293527446341505872366942805254); |
176 | 0 | DK(KP433012701, +0.433012701892219323381861585376468091735701313); |
177 | 0 | DK(KP968245836, +0.968245836551854221294816349945599902708230426); |
178 | 0 | DK(KP587785252, +0.587785252292473129168705954639072768597652438); |
179 | 0 | DK(KP951056516, +0.951056516295153572116439333379382143405698634); |
180 | 0 | DK(KP250000000, +0.250000000000000000000000000000000000000000000); |
181 | 0 | DK(KP1_647278207, +1.647278207092663851754840078556380006059321028); |
182 | 0 | DK(KP1_018073920, +1.018073920910254366901961726787815297021466329); |
183 | 0 | DK(KP559016994, +0.559016994374947424102293417182819058860154590); |
184 | 0 | DK(KP500000000, +0.500000000000000000000000000000000000000000000); |
185 | 0 | DK(KP2_000000000, +2.000000000000000000000000000000000000000000000); |
186 | 0 | DK(KP1_118033988, +1.118033988749894848204586834365638117720309180); |
187 | 0 | DK(KP1_175570504, +1.175570504584946258337411909278145537195304875); |
188 | 0 | DK(KP1_902113032, +1.902113032590307144232878666758764286811397268); |
189 | 0 | { |
190 | 0 | INT i; |
/*
 * i counts down from v to 1; the increment clause advances the output
 * pointers by ovs and the input pointers by ivs.
 */
191 | 0 | for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) { |
192 | 0 | E Tv, TD, T5, Ts, TC, T6, Tf, TW, TK, Td, Tg, TP, To, TN, TA; |
193 | 0 | E TO, TQ, Tt, Tu, T12, Te, T11; |
/* Load Ci[4] and Ci[1] and form two weighted combinations (1.902 / 1.176). */
194 | 0 | Tt = Ci[WS(csi, 4)]; |
195 | 0 | Tu = Ci[WS(csi, 1)]; |
196 | 0 | Tv = FMA(KP1_902113032, Tt, KP1_175570504 * Tu); |
197 | 0 | TD = FNMS(KP1_175570504, Tt, KP1_902113032 * Tu); |
/*
 * Load Cr[7], Cr[4], Cr[1]. T5 feeds R0[0] and T11; TC and Ts feed the
 * first and second six-output groups below.
 */
198 | 0 | { |
199 | 0 | E T1, T4, Tq, T2, T3, Tr; |
200 | 0 | T1 = Cr[WS(csr, 7)]; |
201 | 0 | T2 = Cr[WS(csr, 4)]; |
202 | 0 | T3 = Cr[WS(csr, 1)]; |
203 | 0 | T4 = T2 + T3; |
204 | 0 | Tq = KP1_118033988 * (T2 - T3); |
205 | 0 | T5 = FMA(KP2_000000000, T4, T1); |
206 | 0 | Tr = FNMS(KP500000000, T4, T1); |
207 | 0 | Ts = Tq + Tr; |
208 | 0 | TC = Tr - Tq; |
209 | 0 | } |
/* Load Cr[2], Cr[3], Cr[6], Cr[0], Cr[5] and form their sums/differences. */
210 | 0 | { |
211 | 0 | E Tc, TJ, T9, TI; |
212 | 0 | T6 = Cr[WS(csr, 2)]; |
213 | 0 | { |
214 | 0 | E Ta, Tb, T7, T8; |
215 | 0 | Ta = Cr[WS(csr, 3)]; |
216 | 0 | Tb = Cr[WS(csr, 6)]; |
217 | 0 | Tc = Ta + Tb; |
218 | 0 | TJ = Ta - Tb; |
219 | 0 | T7 = Cr[0]; |
220 | 0 | T8 = Cr[WS(csr, 5)]; |
221 | 0 | T9 = T7 + T8; |
222 | 0 | TI = T7 - T8; |
223 | 0 | } |
224 | 0 | Tf = KP559016994 * (T9 - Tc); |
225 | 0 | TW = FNMS(KP1_647278207, TJ, KP1_018073920 * TI); |
226 | 0 | TK = FMA(KP1_647278207, TI, KP1_018073920 * TJ); |
227 | 0 | Td = T9 + Tc; |
228 | 0 | Tg = FNMS(KP250000000, Td, T6); |
229 | 0 | } |
/* Load Ci[2], Ci[3], Ci[6], Ci[0], Ci[5] and form their sums/differences. */
230 | 0 | { |
231 | 0 | E Tn, TM, Tk, TL; |
232 | 0 | TP = Ci[WS(csi, 2)]; |
233 | 0 | { |
234 | 0 | E Tl, Tm, Ti, Tj; |
235 | 0 | Tl = Ci[WS(csi, 3)]; |
236 | 0 | Tm = Ci[WS(csi, 6)]; |
237 | 0 | Tn = Tl - Tm; |
238 | 0 | TM = Tl + Tm; |
239 | 0 | Ti = Ci[0]; |
240 | 0 | Tj = Ci[WS(csi, 5)]; |
241 | 0 | Tk = Ti + Tj; |
242 | 0 | TL = Ti - Tj; |
243 | 0 | } |
244 | 0 | To = FMA(KP951056516, Tk, KP587785252 * Tn); |
245 | 0 | TN = KP968245836 * (TL - TM); |
246 | 0 | TA = FNMS(KP587785252, Tk, KP951056516 * Tn); |
247 | 0 | TO = TL + TM; |
248 | 0 | TQ = FMA(KP433012701, TO, KP1_732050807 * TP); |
249 | 0 | } |
/* First three outputs: R0[0], R0[5], R1[2]. */
250 | 0 | T12 = KP1_732050807 * (TP - TO); |
251 | 0 | Te = T6 + Td; |
252 | 0 | T11 = Te - T5; |
253 | 0 | R0[0] = FMA(KP2_000000000, Te, T5); |
254 | 0 | R0[WS(rs, 5)] = T12 - T11; |
255 | 0 | R1[WS(rs, 2)] = T11 + T12; |
/* Six outputs built from TC, TD, Tg, Tf, TA, TN, TQ, TW: R0[6], R1[1], R0[1], R1[3], R0[4], R1[6]. */
256 | 0 | { |
257 | 0 | E TE, TG, TB, TF, TY, T10, Tz, TX, TV, TZ; |
258 | 0 | TE = TC - TD; |
259 | 0 | TG = TC + TD; |
260 | 0 | Tz = Tg - Tf; |
261 | 0 | TB = Tz + TA; |
262 | 0 | TF = TA - Tz; |
263 | 0 | TX = TN + TQ; |
264 | 0 | TY = TW - TX; |
265 | 0 | T10 = TW + TX; |
266 | 0 | R0[WS(rs, 6)] = FMA(KP2_000000000, TB, TE); |
267 | 0 | R1[WS(rs, 1)] = FMS(KP2_000000000, TF, TG); |
268 | 0 | TV = TE - TB; |
269 | 0 | R0[WS(rs, 1)] = TV + TY; |
270 | 0 | R1[WS(rs, 3)] = TY - TV; |
271 | 0 | TZ = TF + TG; |
272 | 0 | R0[WS(rs, 4)] = TZ - T10; |
273 | 0 | R1[WS(rs, 6)] = -(TZ + T10); |
274 | 0 | } |
/* Six outputs built from Ts, Tv, Tf, Tg, To, TN, TQ, TK: R1[4], R0[3], R1[5], R1[0], R0[2], R0[7]. */
275 | 0 | { |
276 | 0 | E Tw, Ty, Tp, Tx, TS, TU, Th, TR, TH, TT; |
277 | 0 | Tw = Ts - Tv; |
278 | 0 | Ty = Ts + Tv; |
279 | 0 | Th = Tf + Tg; |
280 | 0 | Tp = Th + To; |
281 | 0 | Tx = Th - To; |
282 | 0 | TR = TN - TQ; |
283 | 0 | TS = TK + TR; |
284 | 0 | TU = TR - TK; |
285 | 0 | R1[WS(rs, 4)] = -(FMA(KP2_000000000, Tp, Tw)); |
286 | 0 | R0[WS(rs, 3)] = FMA(KP2_000000000, Tx, Ty); |
287 | 0 | TH = Tx - Ty; |
288 | 0 | R1[WS(rs, 5)] = TH - TS; |
289 | 0 | R1[0] = TH + TS; |
290 | 0 | TT = Tw - Tp; |
291 | 0 | R0[WS(rs, 2)] = TT - TU; |
292 | 0 | R0[WS(rs, 7)] = TT + TU; |
293 | 0 | } |
294 | 0 | } |
295 | 0 | } |
296 | 0 | } |
297 | | |
/*
 * Descriptor for the non-FMA variant, passed to X(kr2c_register) below.
 * It carries 15 (matching "-n 15" on the generator command line), the
 * codelet name, and { 49, 11, 15, 0 }, whose first three values match the
 * "49 additions, 11 multiplications, 15 fused multiply/add" count in the
 * generator comment above.
 * NOTE(review): kr2c_desc and GENUS are declared outside this listing;
 * confirm the field meanings (including the trailing 0) against their
 * definitions.
 */
298 | | static const kr2c_desc desc = { 15, "r2cbIII_15", { 49, 11, 15, 0 }, &GENUS }; |
299 | | |
/*
 * Registration entry point for the non-FMA variant: passes planner p, the
 * r2cbIII_15 kernel and its descriptor to X(kr2c_register). The Count
 * column shows 1 on both rows, i.e. it ran once in the recorded run.
 * NOTE(review): X(...) is a macro defined outside this listing, presumably
 * FFTW's symbol-name prefixing macro; confirm in the project headers.
 */
300 | 1 | void X(codelet_r2cbIII_15) (planner *p) { X(kr2c_register) (p, r2cbIII_15, &desc); |
301 | 1 | } |
302 | | |
303 | | #endif |