/src/fftw3/rdft/scalar/r2cb/r2cb_32.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2003, 2007-14 Matteo Frigo |
3 | | * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology |
4 | | * |
5 | | * This program is free software; you can redistribute it and/or modify |
6 | | * it under the terms of the GNU General Public License as published by |
7 | | * the Free Software Foundation; either version 2 of the License, or |
8 | | * (at your option) any later version. |
9 | | * |
10 | | * This program is distributed in the hope that it will be useful, |
11 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | * GNU General Public License for more details. |
14 | | * |
15 | | * You should have received a copy of the GNU General Public License |
16 | | * along with this program; if not, write to the Free Software |
17 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
18 | | * |
19 | | */ |
20 | | |
21 | | /* This file was automatically generated --- DO NOT EDIT */ |
22 | | /* Generated on Wed Jul 23 07:02:25 UTC 2025 */ |
23 | | |
24 | | #include "rdft/codelet-rdft.h" |
25 | | |
26 | | #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) |
27 | | |
28 | | /* Generated by: ../../../genfft/gen_r2cb.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 32 -name r2cb_32 -include rdft/scalar/r2cb.h */ |
29 | | |
30 | | /* |
31 | | * This function contains 156 FP additions, 84 FP multiplications, |
32 | | * (or, 72 additions, 0 multiplications, 84 fused multiply/add), |
33 | | * 54 stack variables, 9 constants, and 64 memory accesses |
34 | | */ |
35 | | #include "rdft/scalar/r2cb.h" |
36 | | |
/*
 * r2cb_32, FMA-oriented variant (built when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * Straight-line kernel, repeated v times. Each iteration reads
 * Cr[WS(csr, k)] for k = 0..16 and Ci[WS(csi, k)] for k = 1..15, and writes
 * R0[WS(rs, k)] and R1[WS(rs, k)] for k = 0..15: 32 loads and 32 stores,
 * i.e. the "64 memory accesses" quoted in the generator summary.
 *
 * Parameters, as used by the code below:
 *   R0, R1   output pointers; both are advanced by ovs after each iteration.
 *   Cr, Ci   input pointers; both are advanced by ivs after each iteration.
 *   rs       stride object used to index R0/R1 through WS(rs, k).
 *   csr, csi stride objects used to index Cr and Ci through WS(...).
 *   v        iteration count; the loop body is skipped when v <= 0.
 *
 * NOTE(review): going by the name and the generator flags (-sign 1 -n 32),
 * this is presumably the size-32 backward real transform, with Cr/Ci the
 * real/imaginary halves of the input and R0/R1 the even/odd-indexed real
 * outputs -- confirm against rdft/codelet-rdft.h.
 * NOTE(review): FMA(a, b, c) / FNMS(a, b, c) are presumably a*b + c and
 * c - a*b; R, E, INT, stride, DK and WS are declared outside this file.
 */
37 | | static void r2cb_32(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs) |
38 | | { |
39 | | DK(KP1_662939224, +1.662939224605090474157576755235811513477121624); /* 2*cos(3*pi/16) */ |
40 | | DK(KP668178637, +0.668178637919298919997757686523080761552472251); /* tan(3*pi/16) */ |
41 | | DK(KP1_961570560, +1.961570560806460898252364472268478073947867462); /* 2*cos(pi/16) */ |
42 | | DK(KP198912367, +0.198912367379658006911597622644676228597850501); /* tan(pi/16) */ |
43 | | DK(KP707106781, +0.707106781186547524400844362104849039284835938); /* 1/sqrt(2) */ |
44 | | DK(KP1_847759065, +1.847759065022573512256366378793576573644833252); /* 2*cos(pi/8) */ |
45 | | DK(KP414213562, +0.414213562373095048801688724209698078569671875); /* tan(pi/8) = sqrt(2) - 1 */ |
46 | | DK(KP1_414213562, +1.414213562373095048801688724209698078569671875); /* sqrt(2) */ |
47 | | DK(KP2_000000000, +2.000000000000000000000000000000000000000000000); |
48 | | { |
49 | | INT i; |
50 | | for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(128, rs), MAKE_VOLATILE_STRIDE(128, csr), MAKE_VOLATILE_STRIDE(128, csi)) { /* one full set of 32 outputs per pass; MAKE_VOLATILE_STRIDE is defined elsewhere (purpose not visible here) */ |
51 | | E T5, T1R, Tz, T1t, T8, T1S, TE, T1u, Tg, T1X, T2m, TK, TP, T1x, T1U; |
52 | | E T1w, To, T28, T2p, TW, T1d, T1D, T20, T1A, Tv, T23, T2q, T25, T1g, T1B; |
53 | | E T17, T1E; |
54 | | { /* load stage: Cr[0], Cr[8], Cr[16], Ci[8] */ |
55 | | E T4, Ty, T3, Tx, T1, T2; |
56 | | T4 = Cr[WS(csr, 8)]; |
57 | | Ty = Ci[WS(csi, 8)]; |
58 | | T1 = Cr[0]; |
59 | | T2 = Cr[WS(csr, 16)]; |
60 | | T3 = T1 + T2; |
61 | | Tx = T1 - T2; |
62 | | T5 = FMA(KP2_000000000, T4, T3); |
63 | | T1R = FNMS(KP2_000000000, T4, T3); |
64 | | Tz = FNMS(KP2_000000000, Ty, Tx); |
65 | | T1t = FMA(KP2_000000000, Ty, Tx); |
66 | | } |
67 | | { /* load stage: indices 4 and 12 of Cr and Ci */ |
68 | | E T6, T7, TA, TB, TC, TD; |
69 | | T6 = Cr[WS(csr, 4)]; |
70 | | T7 = Cr[WS(csr, 12)]; |
71 | | TA = T6 - T7; |
72 | | TB = Ci[WS(csi, 4)]; |
73 | | TC = Ci[WS(csi, 12)]; |
74 | | TD = TB + TC; |
75 | | T8 = T6 + T7; |
76 | | T1S = TB - TC; |
77 | | TE = TA - TD; |
78 | | T1u = TA + TD; |
79 | | } |
80 | | { /* load stage: indices 2, 14, 10, 6 of Cr and Ci */ |
81 | | E Tc, TG, TO, T1V, Tf, TL, TJ, T1W; |
82 | | { |
83 | | E Ta, Tb, TM, TN; |
84 | | Ta = Cr[WS(csr, 2)]; |
85 | | Tb = Cr[WS(csr, 14)]; |
86 | | Tc = Ta + Tb; |
87 | | TG = Ta - Tb; |
88 | | TM = Ci[WS(csi, 2)]; |
89 | | TN = Ci[WS(csi, 14)]; |
90 | | TO = TM + TN; |
91 | | T1V = TM - TN; |
92 | | } |
93 | | { |
94 | | E Td, Te, TH, TI; |
95 | | Td = Cr[WS(csr, 10)]; |
96 | | Te = Cr[WS(csr, 6)]; |
97 | | Tf = Td + Te; |
98 | | TL = Td - Te; |
99 | | TH = Ci[WS(csi, 10)]; |
100 | | TI = Ci[WS(csi, 6)]; |
101 | | TJ = TH + TI; |
102 | | T1W = TH - TI; |
103 | | } |
104 | | Tg = Tc + Tf; |
105 | | T1X = T1V - T1W; |
106 | | T2m = T1W + T1V; |
107 | | TK = TG - TJ; |
108 | | TP = TL + TO; |
109 | | T1x = TG + TJ; |
110 | | T1U = Tc - Tf; |
111 | | T1w = TO - TL; |
112 | | } |
113 | | { /* load stage: indices 1, 15, 9, 7 of Cr and Ci */ |
114 | | E Tk, TS, T1c, T26, Tn, T19, TV, T27; |
115 | | { |
116 | | E Ti, Tj, T1a, T1b; |
117 | | Ti = Cr[WS(csr, 1)]; |
118 | | Tj = Cr[WS(csr, 15)]; |
119 | | Tk = Ti + Tj; |
120 | | TS = Ti - Tj; |
121 | | T1a = Ci[WS(csi, 1)]; |
122 | | T1b = Ci[WS(csi, 15)]; |
123 | | T1c = T1a + T1b; |
124 | | T26 = T1a - T1b; |
125 | | } |
126 | | { |
127 | | E Tl, Tm, TT, TU; |
128 | | Tl = Cr[WS(csr, 9)]; |
129 | | Tm = Cr[WS(csr, 7)]; |
130 | | Tn = Tl + Tm; |
131 | | T19 = Tl - Tm; |
132 | | TT = Ci[WS(csi, 9)]; |
133 | | TU = Ci[WS(csi, 7)]; |
134 | | TV = TT + TU; |
135 | | T27 = TT - TU; |
136 | | } |
137 | | To = Tk + Tn; |
138 | | T28 = T26 - T27; |
139 | | T2p = T27 + T26; |
140 | | TW = TS - TV; |
141 | | T1d = T19 + T1c; |
142 | | T1D = T1c - T19; |
143 | | T20 = Tk - Tn; |
144 | | T1A = TS + TV; |
145 | | } |
146 | | { /* load stage: indices 5, 11, 3, 13 of Cr and Ci */ |
147 | | E Tr, TX, T10, T22, Tu, T12, T15, T21; |
148 | | { |
149 | | E Tp, Tq, TY, TZ; |
150 | | Tp = Cr[WS(csr, 5)]; |
151 | | Tq = Cr[WS(csr, 11)]; |
152 | | Tr = Tp + Tq; |
153 | | TX = Tp - Tq; |
154 | | TY = Ci[WS(csi, 5)]; |
155 | | TZ = Ci[WS(csi, 11)]; |
156 | | T10 = TY + TZ; |
157 | | T22 = TY - TZ; |
158 | | } |
159 | | { |
160 | | E Ts, Tt, T13, T14; |
161 | | Ts = Cr[WS(csr, 3)]; |
162 | | Tt = Cr[WS(csr, 13)]; |
163 | | Tu = Ts + Tt; |
164 | | T12 = Ts - Tt; |
165 | | T13 = Ci[WS(csi, 3)]; |
166 | | T14 = Ci[WS(csi, 13)]; |
167 | | T15 = T13 + T14; |
168 | | T21 = T14 - T13; /* operand order is Ci[13] - Ci[3], unlike the other differences in this stage */ |
169 | | } |
170 | | Tv = Tr + Tu; |
171 | | T23 = T21 - T22; |
172 | | T2q = T22 + T21; |
173 | | T25 = Tr - Tu; |
174 | | { |
175 | | E T1e, T1f, T11, T16; |
176 | | T1e = TX + T10; |
177 | | T1f = T12 + T15; |
178 | | T1g = T1e - T1f; |
179 | | T1B = T1e + T1f; |
180 | | T11 = TX - T10; |
181 | | T16 = T12 - T15; |
182 | | T17 = T11 + T16; |
183 | | T1E = T16 - T11; |
184 | | } |
185 | | } |
186 | | { /* store stage: R0[0], R0[4], R0[8], R0[12] */ |
187 | | E Tw, T2w, Th, T2v, T9; |
188 | | Tw = To + Tv; |
189 | | T2w = T2q + T2p; |
190 | | T9 = FMA(KP2_000000000, T8, T5); |
191 | | Th = FMA(KP2_000000000, Tg, T9); |
192 | | T2v = FNMS(KP2_000000000, Tg, T9); |
193 | | R0[WS(rs, 8)] = FNMS(KP2_000000000, Tw, Th); |
194 | | R0[WS(rs, 12)] = FMA(KP2_000000000, T2w, T2v); |
195 | | R0[0] = FMA(KP2_000000000, Tw, Th); |
196 | | R0[WS(rs, 4)] = FNMS(KP2_000000000, T2w, T2v); |
197 | | } |
198 | | { /* store stage: R0[2], R0[6], R0[10], R0[14] */ |
199 | | E T2n, T2t, T2s, T2u, T2l, T2o, T2r; |
200 | | T2l = FNMS(KP2_000000000, T8, T5); |
201 | | T2n = FNMS(KP2_000000000, T2m, T2l); |
202 | | T2t = FMA(KP2_000000000, T2m, T2l); |
203 | | T2o = To - Tv; |
204 | | T2r = T2p - T2q; |
205 | | T2s = T2o - T2r; |
206 | | T2u = T2o + T2r; |
207 | | R0[WS(rs, 10)] = FNMS(KP1_414213562, T2s, T2n); |
208 | | R0[WS(rs, 14)] = FMA(KP1_414213562, T2u, T2t); |
209 | | R0[WS(rs, 2)] = FMA(KP1_414213562, T2s, T2n); |
210 | | R0[WS(rs, 6)] = FNMS(KP1_414213562, T2u, T2t); |
211 | | } |
212 | | { /* store stage: R1[0], R1[4], R1[8], R1[12] */ |
213 | | E TR, T1j, T1i, T1k; |
214 | | { |
215 | | E TF, TQ, T18, T1h; |
216 | | TF = FMA(KP1_414213562, TE, Tz); |
217 | | TQ = FNMS(KP414213562, TP, TK); |
218 | | TR = FMA(KP1_847759065, TQ, TF); |
219 | | T1j = FNMS(KP1_847759065, TQ, TF); |
220 | | T18 = FMA(KP707106781, T17, TW); |
221 | | T1h = FMA(KP707106781, T1g, T1d); |
222 | | T1i = FNMS(KP198912367, T1h, T18); |
223 | | T1k = FMA(KP198912367, T18, T1h); |
224 | | } |
225 | | R1[WS(rs, 8)] = FNMS(KP1_961570560, T1i, TR); |
226 | | R1[WS(rs, 12)] = FMA(KP1_961570560, T1k, T1j); |
227 | | R1[0] = FMA(KP1_961570560, T1i, TR); |
228 | | R1[WS(rs, 4)] = FNMS(KP1_961570560, T1k, T1j); |
229 | | } |
230 | | { /* store stage: R0[3], R0[7], R0[11], R0[15] */ |
231 | | E T2f, T2j, T2i, T2k; |
232 | | { |
233 | | E T2d, T2e, T2g, T2h; |
234 | | T2d = FMA(KP2_000000000, T1S, T1R); |
235 | | T2e = T1U + T1X; |
236 | | T2f = FNMS(KP1_414213562, T2e, T2d); |
237 | | T2j = FMA(KP1_414213562, T2e, T2d); |
238 | | T2g = T28 - T25; |
239 | | T2h = T20 - T23; |
240 | | T2i = FNMS(KP414213562, T2h, T2g); |
241 | | T2k = FMA(KP414213562, T2g, T2h); |
242 | | } |
243 | | R0[WS(rs, 3)] = FNMS(KP1_847759065, T2i, T2f); |
244 | | R0[WS(rs, 15)] = FMA(KP1_847759065, T2k, T2j); |
245 | | R0[WS(rs, 11)] = FMA(KP1_847759065, T2i, T2f); |
246 | | R0[WS(rs, 7)] = FNMS(KP1_847759065, T2k, T2j); |
247 | | } |
248 | | { /* store stage: R1[2], R1[6], R1[10], R1[14] */ |
249 | | E T1n, T1r, T1q, T1s; |
250 | | { |
251 | | E T1l, T1m, T1o, T1p; |
252 | | T1l = FNMS(KP1_414213562, TE, Tz); |
253 | | T1m = FMA(KP414213562, TK, TP); |
254 | | T1n = FNMS(KP1_847759065, T1m, T1l); |
255 | | T1r = FMA(KP1_847759065, T1m, T1l); |
256 | | T1o = FNMS(KP707106781, T1g, T1d); |
257 | | T1p = FNMS(KP707106781, T17, TW); |
258 | | T1q = FNMS(KP668178637, T1p, T1o); |
259 | | T1s = FMA(KP668178637, T1o, T1p); |
260 | | } |
261 | | R1[WS(rs, 2)] = FNMS(KP1_662939224, T1q, T1n); |
262 | | R1[WS(rs, 14)] = FMA(KP1_662939224, T1s, T1r); |
263 | | R1[WS(rs, 10)] = FMA(KP1_662939224, T1q, T1n); |
264 | | R1[WS(rs, 6)] = FNMS(KP1_662939224, T1s, T1r); |
265 | | } |
266 | | { /* store stage: R1[3], R1[7], R1[11], R1[15] */ |
267 | | E T1L, T1P, T1O, T1Q; |
268 | | { |
269 | | E T1J, T1K, T1M, T1N; |
270 | | T1J = FMA(KP1_414213562, T1u, T1t); |
271 | | T1K = FMA(KP414213562, T1w, T1x); |
272 | | T1L = FNMS(KP1_847759065, T1K, T1J); |
273 | | T1P = FMA(KP1_847759065, T1K, T1J); |
274 | | T1M = FMA(KP707106781, T1E, T1D); |
275 | | T1N = FMA(KP707106781, T1B, T1A); |
276 | | T1O = FNMS(KP198912367, T1N, T1M); |
277 | | T1Q = FMA(KP198912367, T1M, T1N); |
278 | | } |
279 | | R1[WS(rs, 3)] = FNMS(KP1_961570560, T1O, T1L); |
280 | | R1[WS(rs, 15)] = FMA(KP1_961570560, T1Q, T1P); |
281 | | R1[WS(rs, 11)] = FMA(KP1_961570560, T1O, T1L); |
282 | | R1[WS(rs, 7)] = FNMS(KP1_961570560, T1Q, T1P); |
283 | | } |
284 | | { /* store stage: R0[1], R0[5], R0[9], R0[13] */ |
285 | | E T1Z, T2b, T2a, T2c; |
286 | | { |
287 | | E T1T, T1Y, T24, T29; |
288 | | T1T = FNMS(KP2_000000000, T1S, T1R); |
289 | | T1Y = T1U - T1X; |
290 | | T1Z = FMA(KP1_414213562, T1Y, T1T); |
291 | | T2b = FNMS(KP1_414213562, T1Y, T1T); |
292 | | T24 = T20 + T23; |
293 | | T29 = T25 + T28; |
294 | | T2a = FNMS(KP414213562, T29, T24); |
295 | | T2c = FMA(KP414213562, T24, T29); |
296 | | } |
297 | | R0[WS(rs, 9)] = FNMS(KP1_847759065, T2a, T1Z); |
298 | | R0[WS(rs, 13)] = FMA(KP1_847759065, T2c, T2b); |
299 | | R0[WS(rs, 1)] = FMA(KP1_847759065, T2a, T1Z); |
300 | | R0[WS(rs, 5)] = FNMS(KP1_847759065, T2c, T2b); |
301 | | } |
302 | | { /* store stage: R1[1], R1[5], R1[9], R1[13] */ |
303 | | E T1z, T1H, T1G, T1I; |
304 | | { |
305 | | E T1v, T1y, T1C, T1F; |
306 | | T1v = FNMS(KP1_414213562, T1u, T1t); |
307 | | T1y = FNMS(KP414213562, T1x, T1w); |
308 | | T1z = FNMS(KP1_847759065, T1y, T1v); |
309 | | T1H = FMA(KP1_847759065, T1y, T1v); |
310 | | T1C = FNMS(KP707106781, T1B, T1A); |
311 | | T1F = FNMS(KP707106781, T1E, T1D); |
312 | | T1G = FNMS(KP668178637, T1F, T1C); |
313 | | T1I = FMA(KP668178637, T1C, T1F); |
314 | | } |
315 | | R1[WS(rs, 9)] = FNMS(KP1_662939224, T1G, T1z); |
316 | | R1[WS(rs, 13)] = FMA(KP1_662939224, T1I, T1H); |
317 | | R1[WS(rs, 1)] = FMA(KP1_662939224, T1G, T1z); |
318 | | R1[WS(rs, 5)] = FNMS(KP1_662939224, T1I, T1H); |
319 | | } |
320 | | } |
321 | | } |
322 | | } |
323 | | |
/*
 * Descriptor passed to the registration call for the FMA variant. It carries
 * 32, the name string "r2cb_32", and the counts { 72, 0, 84, 0 }; the first
 * three counts equal the "72 additions, 0 multiplications, 84 fused
 * multiply/add" figures in the generator summary for this variant.
 * NOTE(review): the meaning of the fourth count and of GENUS comes from
 * declarations outside this file -- confirm in rdft/codelet-rdft.h.
 */
324 | | static const kr2c_desc desc = { 32, "r2cb_32", { 72, 0, 84, 0 }, &GENUS }; |
325 | | |
/*
 * Registration entry point for the FMA variant: hands the static kernel
 * r2cb_32 and its descriptor to X(kr2c_register) for planner p.
 * NOTE(review): X(...) is a macro defined outside this file, presumably a
 * symbol-name prefix -- confirm.
 */
326 | | void X(codelet_r2cb_32) (planner *p) { X(kr2c_register) (p, r2cb_32, &desc); |
327 | | } |
328 | | |
329 | | #else |
330 | | |
331 | | /* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 32 -name r2cb_32 -include rdft/scalar/r2cb.h */ |
332 | | |
333 | | /* |
334 | | * This function contains 156 FP additions, 50 FP multiplications, |
335 | | * (or, 140 additions, 34 multiplications, 16 fused multiply/add), |
336 | | * 54 stack variables, 9 constants, and 64 memory accesses |
337 | | */ |
338 | | #include "rdft/scalar/r2cb.h" |
339 | | |
/*
 * r2cb_32, non-FMA variant (built when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * Straight-line kernel, repeated v times. Each iteration reads
 * Cr[WS(csr, k)] for k = 0..16 and Ci[WS(csi, k)] for k = 1..15, and writes
 * R0[WS(rs, k)] and R1[WS(rs, k)] for k = 0..15: 32 loads and 32 stores,
 * i.e. the "64 memory accesses" quoted in the generator summary. Compared
 * with the FMA variant, scaling by 2, sqrt(2) and 1/sqrt(2) is done with
 * explicit multiplications, and the final rotations use sine/cosine pairs.
 *
 * Parameters, as used by the code below:
 *   R0, R1   output pointers; both are advanced by ovs after each iteration.
 *   Cr, Ci   input pointers; both are advanced by ivs after each iteration.
 *   rs       stride object used to index R0/R1 through WS(rs, k).
 *   csr, csi stride objects used to index Cr and Ci through WS(...).
 *   v        iteration count; the loop body is skipped when v <= 0.
 *
 * NOTE(review): going by the name and the generator flags (-sign 1 -n 32),
 * this is presumably the size-32 backward real transform, with Cr/Ci the
 * real/imaginary halves of the input and R0/R1 the even/odd-indexed real
 * outputs -- confirm against rdft/codelet-rdft.h.
 * NOTE(review): FMA(a, b, c) / FNMS(a, b, c) are presumably a*b + c and
 * c - a*b; R, E, INT, stride, DK and WS are declared outside this file.
 */
340 | | static void r2cb_32(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs) |
341 | 0 | { |
342 | 0 | DK(KP1_662939224, +1.662939224605090474157576755235811513477121624); /* 2*cos(3*pi/16) */ |
343 | 0 | DK(KP1_111140466, +1.111140466039204449485661627897065748749874382); /* 2*sin(3*pi/16) */ |
344 | 0 | DK(KP1_961570560, +1.961570560806460898252364472268478073947867462); /* 2*cos(pi/16) */ |
345 | 0 | DK(KP390180644, +0.390180644032256535696569736954044481855383236); /* 2*sin(pi/16) */ |
346 | 0 | DK(KP765366864, +0.765366864730179543456919968060797733522689125); /* 2*sin(pi/8) */ |
347 | 0 | DK(KP1_847759065, +1.847759065022573512256366378793576573644833252); /* 2*cos(pi/8) */ |
348 | 0 | DK(KP707106781, +0.707106781186547524400844362104849039284835938); /* 1/sqrt(2) */ |
349 | 0 | DK(KP1_414213562, +1.414213562373095048801688724209698078569671875); /* sqrt(2) */ |
350 | 0 | DK(KP2_000000000, +2.000000000000000000000000000000000000000000000); |
351 | 0 | { |
352 | 0 | INT i; |
353 | 0 | for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(128, rs), MAKE_VOLATILE_STRIDE(128, csr), MAKE_VOLATILE_STRIDE(128, csi)) { /* one full set of 32 outputs per pass; MAKE_VOLATILE_STRIDE is defined elsewhere (purpose not visible here) */ |
354 | 0 | E T9, T2c, TB, T1y, T6, T2b, Ty, T1v, Th, T2e, T2f, TD, TK, T1C, T1F; |
355 | 0 | E T1h, Tp, T2i, T2m, TN, T13, T1K, T1Y, T1k, Tw, TU, T1l, TW, T1V, T2j; |
356 | 0 | E T1R, T2l; |
357 | 0 | { /* load stage: indices 4 and 12 of Cr and Ci */ |
358 | 0 | E T7, T8, T1w, Tz, TA, T1x; |
359 | 0 | T7 = Cr[WS(csr, 4)]; |
360 | 0 | T8 = Cr[WS(csr, 12)]; |
361 | 0 | T1w = T7 - T8; |
362 | 0 | Tz = Ci[WS(csi, 4)]; |
363 | 0 | TA = Ci[WS(csi, 12)]; |
364 | 0 | T1x = Tz + TA; |
365 | 0 | T9 = KP2_000000000 * (T7 + T8); |
366 | 0 | T2c = KP1_414213562 * (T1w + T1x); |
367 | 0 | TB = KP2_000000000 * (Tz - TA); |
368 | 0 | T1y = KP1_414213562 * (T1w - T1x); |
369 | 0 | } |
370 | 0 | { /* load stage: Cr[0], Cr[8], Cr[16], Ci[8] */ |
371 | 0 | E T5, T1u, T3, T1s; |
372 | 0 | { |
373 | 0 | E T4, T1t, T1, T2; |
374 | 0 | T4 = Cr[WS(csr, 8)]; |
375 | 0 | T5 = KP2_000000000 * T4; |
376 | 0 | T1t = Ci[WS(csi, 8)]; |
377 | 0 | T1u = KP2_000000000 * T1t; |
378 | 0 | T1 = Cr[0]; |
379 | 0 | T2 = Cr[WS(csr, 16)]; |
380 | 0 | T3 = T1 + T2; |
381 | 0 | T1s = T1 - T2; |
382 | 0 | } |
383 | 0 | T6 = T3 + T5; |
384 | 0 | T2b = T1s + T1u; |
385 | 0 | Ty = T3 - T5; |
386 | 0 | T1v = T1s - T1u; |
387 | 0 | } |
388 | 0 | { /* load stage: indices 2, 14, 10, 6 of Cr and Ci */ |
389 | 0 | E Td, T1A, TG, T1E, Tg, T1D, TJ, T1B; |
390 | 0 | { |
391 | 0 | E Tb, Tc, TE, TF; |
392 | 0 | Tb = Cr[WS(csr, 2)]; |
393 | 0 | Tc = Cr[WS(csr, 14)]; |
394 | 0 | Td = Tb + Tc; |
395 | 0 | T1A = Tb - Tc; |
396 | 0 | TE = Ci[WS(csi, 2)]; |
397 | 0 | TF = Ci[WS(csi, 14)]; |
398 | 0 | TG = TE - TF; |
399 | 0 | T1E = TE + TF; |
400 | 0 | } |
401 | 0 | { |
402 | 0 | E Te, Tf, TH, TI; |
403 | 0 | Te = Cr[WS(csr, 10)]; |
404 | 0 | Tf = Cr[WS(csr, 6)]; |
405 | 0 | Tg = Te + Tf; |
406 | 0 | T1D = Te - Tf; |
407 | 0 | TH = Ci[WS(csi, 10)]; |
408 | 0 | TI = Ci[WS(csi, 6)]; |
409 | 0 | TJ = TH - TI; |
410 | 0 | T1B = TH + TI; |
411 | 0 | } |
412 | 0 | Th = KP2_000000000 * (Td + Tg); |
413 | 0 | T2e = T1A + T1B; |
414 | 0 | T2f = T1E - T1D; |
415 | 0 | TD = Td - Tg; |
416 | 0 | TK = TG - TJ; |
417 | 0 | T1C = T1A - T1B; |
418 | 0 | T1F = T1D + T1E; |
419 | 0 | T1h = KP2_000000000 * (TJ + TG); |
420 | 0 | } |
421 | 0 | { /* load stage: indices 1, 15, 9, 7 of Cr and Ci */ |
422 | 0 | E Tl, T1I, TZ, T1X, To, T1W, T12, T1J; |
423 | 0 | { |
424 | 0 | E Tj, Tk, TX, TY; |
425 | 0 | Tj = Cr[WS(csr, 1)]; |
426 | 0 | Tk = Cr[WS(csr, 15)]; |
427 | 0 | Tl = Tj + Tk; |
428 | 0 | T1I = Tj - Tk; |
429 | 0 | TX = Ci[WS(csi, 1)]; |
430 | 0 | TY = Ci[WS(csi, 15)]; |
431 | 0 | TZ = TX - TY; |
432 | 0 | T1X = TX + TY; |
433 | 0 | } |
434 | 0 | { |
435 | 0 | E Tm, Tn, T10, T11; |
436 | 0 | Tm = Cr[WS(csr, 9)]; |
437 | 0 | Tn = Cr[WS(csr, 7)]; |
438 | 0 | To = Tm + Tn; |
439 | 0 | T1W = Tm - Tn; |
440 | 0 | T10 = Ci[WS(csi, 9)]; |
441 | 0 | T11 = Ci[WS(csi, 7)]; |
442 | 0 | T12 = T10 - T11; |
443 | 0 | T1J = T10 + T11; |
444 | 0 | } |
445 | 0 | Tp = Tl + To; |
446 | 0 | T2i = T1I + T1J; |
447 | 0 | T2m = T1X - T1W; |
448 | 0 | TN = Tl - To; |
449 | 0 | T13 = TZ - T12; |
450 | 0 | T1K = T1I - T1J; |
451 | 0 | T1Y = T1W + T1X; |
452 | 0 | T1k = T12 + TZ; |
453 | 0 | } |
454 | 0 | { /* load stage: indices 5, 11, 3, 13 of Cr and Ci */ |
455 | 0 | E Ts, T1L, TT, T1M, Tv, T1O, TQ, T1P; |
456 | 0 | { |
457 | 0 | E Tq, Tr, TR, TS; |
458 | 0 | Tq = Cr[WS(csr, 5)]; |
459 | 0 | Tr = Cr[WS(csr, 11)]; |
460 | 0 | Ts = Tq + Tr; |
461 | 0 | T1L = Tq - Tr; |
462 | 0 | TR = Ci[WS(csi, 5)]; |
463 | 0 | TS = Ci[WS(csi, 11)]; |
464 | 0 | TT = TR - TS; |
465 | 0 | T1M = TR + TS; |
466 | 0 | } |
467 | 0 | { |
468 | 0 | E Tt, Tu, TO, TP; |
469 | 0 | Tt = Cr[WS(csr, 3)]; |
470 | 0 | Tu = Cr[WS(csr, 13)]; |
471 | 0 | Tv = Tt + Tu; |
472 | 0 | T1O = Tt - Tu; |
473 | 0 | TO = Ci[WS(csi, 13)]; /* index 13 is loaded first here, so TQ below is Ci[13] - Ci[3] */ |
474 | 0 | TP = Ci[WS(csi, 3)]; |
475 | 0 | TQ = TO - TP; |
476 | 0 | T1P = TP + TO; |
477 | 0 | } |
478 | 0 | Tw = Ts + Tv; |
479 | 0 | TU = TQ - TT; |
480 | 0 | T1l = TT + TQ; |
481 | 0 | TW = Ts - Tv; |
482 | 0 | { |
483 | 0 | E T1T, T1U, T1N, T1Q; |
484 | 0 | T1T = T1L + T1M; |
485 | 0 | T1U = T1O + T1P; |
486 | 0 | T1V = KP707106781 * (T1T - T1U); |
487 | 0 | T2j = KP707106781 * (T1T + T1U); |
488 | 0 | T1N = T1L - T1M; |
489 | 0 | T1Q = T1O - T1P; |
490 | 0 | T1R = KP707106781 * (T1N + T1Q); |
491 | 0 | T2l = KP707106781 * (T1N - T1Q); |
492 | 0 | } |
493 | 0 | } |
494 | 0 | { /* store stage: R0[0], R0[4], R0[8], R0[12] */ |
495 | 0 | E Tx, T1r, Ti, T1q, Ta; |
496 | 0 | Tx = KP2_000000000 * (Tp + Tw); |
497 | 0 | T1r = KP2_000000000 * (T1l + T1k); |
498 | 0 | Ta = T6 + T9; |
499 | 0 | Ti = Ta + Th; |
500 | 0 | T1q = Ta - Th; |
501 | 0 | R0[WS(rs, 8)] = Ti - Tx; |
502 | 0 | R0[WS(rs, 12)] = T1q + T1r; |
503 | 0 | R0[0] = Ti + Tx; |
504 | 0 | R0[WS(rs, 4)] = T1q - T1r; |
505 | 0 | } |
506 | 0 | { /* store stage: R0[2], R0[6], R0[10], R0[14] */ |
507 | 0 | E T1i, T1o, T1n, T1p, T1g, T1j, T1m; |
508 | 0 | T1g = T6 - T9; |
509 | 0 | T1i = T1g - T1h; |
510 | 0 | T1o = T1g + T1h; |
511 | 0 | T1j = Tp - Tw; |
512 | 0 | T1m = T1k - T1l; |
513 | 0 | T1n = KP1_414213562 * (T1j - T1m); |
514 | 0 | T1p = KP1_414213562 * (T1j + T1m); |
515 | 0 | R0[WS(rs, 10)] = T1i - T1n; |
516 | 0 | R0[WS(rs, 14)] = T1o + T1p; |
517 | 0 | R0[WS(rs, 2)] = T1i + T1n; |
518 | 0 | R0[WS(rs, 6)] = T1o - T1p; |
519 | 0 | } |
520 | 0 | { /* store stage: R0[1], R0[5], R0[9], R0[13] */ |
521 | 0 | E TM, T16, T15, T17; |
522 | 0 | { |
523 | 0 | E TC, TL, TV, T14; |
524 | 0 | TC = Ty - TB; |
525 | 0 | TL = KP1_414213562 * (TD - TK); |
526 | 0 | TM = TC + TL; |
527 | 0 | T16 = TC - TL; |
528 | 0 | TV = TN + TU; |
529 | 0 | T14 = TW + T13; |
530 | 0 | T15 = FNMS(KP765366864, T14, KP1_847759065 * TV); |
531 | 0 | T17 = FMA(KP765366864, TV, KP1_847759065 * T14); |
532 | 0 | } |
533 | 0 | R0[WS(rs, 9)] = TM - T15; |
534 | 0 | R0[WS(rs, 13)] = T16 + T17; |
535 | 0 | R0[WS(rs, 1)] = TM + T15; |
536 | 0 | R0[WS(rs, 5)] = T16 - T17; |
537 | 0 | } |
538 | 0 | { /* store stage: R1[3], R1[7], R1[11], R1[15] */ |
539 | 0 | E T2t, T2x, T2w, T2y; |
540 | 0 | { |
541 | 0 | E T2r, T2s, T2u, T2v; |
542 | 0 | T2r = T2b + T2c; |
543 | 0 | T2s = FMA(KP1_847759065, T2e, KP765366864 * T2f); |
544 | 0 | T2t = T2r - T2s; |
545 | 0 | T2x = T2r + T2s; |
546 | 0 | T2u = T2i + T2j; |
547 | 0 | T2v = T2m - T2l; |
548 | 0 | T2w = FNMS(KP1_961570560, T2v, KP390180644 * T2u); |
549 | 0 | T2y = FMA(KP1_961570560, T2u, KP390180644 * T2v); |
550 | 0 | } |
551 | 0 | R1[WS(rs, 11)] = T2t - T2w; |
552 | 0 | R1[WS(rs, 15)] = T2x + T2y; |
553 | 0 | R1[WS(rs, 3)] = T2t + T2w; |
554 | 0 | R1[WS(rs, 7)] = T2x - T2y; |
555 | 0 | } |
556 | 0 | { /* store stage: R0[3], R0[7], R0[11], R0[15] */ |
557 | 0 | E T1a, T1e, T1d, T1f; |
558 | 0 | { |
559 | 0 | E T18, T19, T1b, T1c; |
560 | 0 | T18 = Ty + TB; |
561 | 0 | T19 = KP1_414213562 * (TD + TK); |
562 | 0 | T1a = T18 - T19; |
563 | 0 | T1e = T18 + T19; |
564 | 0 | T1b = TN - TU; |
565 | 0 | T1c = T13 - TW; |
566 | 0 | T1d = FNMS(KP1_847759065, T1c, KP765366864 * T1b); |
567 | 0 | T1f = FMA(KP1_847759065, T1b, KP765366864 * T1c); |
568 | 0 | } |
569 | 0 | R0[WS(rs, 11)] = T1a - T1d; |
570 | 0 | R0[WS(rs, 15)] = T1e + T1f; |
571 | 0 | R0[WS(rs, 3)] = T1a + T1d; |
572 | 0 | R0[WS(rs, 7)] = T1e - T1f; |
573 | 0 | } |
574 | 0 | { /* store stage: R1[2], R1[6], R1[10], R1[14] */ |
575 | 0 | E T25, T29, T28, T2a; |
576 | 0 | { |
577 | 0 | E T23, T24, T26, T27; |
578 | 0 | T23 = T1v - T1y; |
579 | 0 | T24 = FMA(KP765366864, T1C, KP1_847759065 * T1F); |
580 | 0 | T25 = T23 - T24; |
581 | 0 | T29 = T23 + T24; |
582 | 0 | T26 = T1K - T1R; |
583 | 0 | T27 = T1Y - T1V; |
584 | 0 | T28 = FNMS(KP1_662939224, T27, KP1_111140466 * T26); |
585 | 0 | T2a = FMA(KP1_662939224, T26, KP1_111140466 * T27); |
586 | 0 | } |
587 | 0 | R1[WS(rs, 10)] = T25 - T28; |
588 | 0 | R1[WS(rs, 14)] = T29 + T2a; |
589 | 0 | R1[WS(rs, 2)] = T25 + T28; |
590 | 0 | R1[WS(rs, 6)] = T29 - T2a; |
591 | 0 | } |
592 | 0 | { /* store stage: R1[1], R1[5], R1[9], R1[13] */ |
593 | 0 | E T2h, T2p, T2o, T2q; |
594 | 0 | { |
595 | 0 | E T2d, T2g, T2k, T2n; |
596 | 0 | T2d = T2b - T2c; |
597 | 0 | T2g = FNMS(KP1_847759065, T2f, KP765366864 * T2e); |
598 | 0 | T2h = T2d + T2g; |
599 | 0 | T2p = T2d - T2g; |
600 | 0 | T2k = T2i - T2j; |
601 | 0 | T2n = T2l + T2m; |
602 | 0 | T2o = FNMS(KP1_111140466, T2n, KP1_662939224 * T2k); |
603 | 0 | T2q = FMA(KP1_111140466, T2k, KP1_662939224 * T2n); |
604 | 0 | } |
605 | 0 | R1[WS(rs, 9)] = T2h - T2o; |
606 | 0 | R1[WS(rs, 13)] = T2p + T2q; |
607 | 0 | R1[WS(rs, 1)] = T2h + T2o; |
608 | 0 | R1[WS(rs, 5)] = T2p - T2q; |
609 | 0 | } |
610 | 0 | { /* store stage: R1[0], R1[4], R1[8], R1[12] */ |
611 | 0 | E T1H, T21, T20, T22; |
612 | 0 | { |
613 | 0 | E T1z, T1G, T1S, T1Z; |
614 | 0 | T1z = T1v + T1y; |
615 | 0 | T1G = FNMS(KP765366864, T1F, KP1_847759065 * T1C); |
616 | 0 | T1H = T1z + T1G; |
617 | 0 | T21 = T1z - T1G; |
618 | 0 | T1S = T1K + T1R; |
619 | 0 | T1Z = T1V + T1Y; |
620 | 0 | T20 = FNMS(KP390180644, T1Z, KP1_961570560 * T1S); |
621 | 0 | T22 = FMA(KP390180644, T1S, KP1_961570560 * T1Z); |
622 | 0 | } |
623 | 0 | R1[WS(rs, 8)] = T1H - T20; |
624 | 0 | R1[WS(rs, 12)] = T21 + T22; |
625 | 0 | R1[0] = T1H + T20; |
626 | 0 | R1[WS(rs, 4)] = T21 - T22; |
627 | 0 | } |
628 | 0 | } |
629 | 0 | } |
630 | 0 | } |
631 | | |
/*
 * Descriptor passed to the registration call for the non-FMA variant. It
 * carries 32, the name string "r2cb_32", and the counts { 140, 34, 16, 0 };
 * the first three counts equal the "140 additions, 34 multiplications,
 * 16 fused multiply/add" figures in the generator summary for this variant.
 * NOTE(review): the meaning of the fourth count and of GENUS comes from
 * declarations outside this file -- confirm in rdft/codelet-rdft.h.
 */
632 | | static const kr2c_desc desc = { 140, 34, 16, 0 }, &GENUS }; |
633 | | |
/*
 * Registration entry point for the non-FMA variant: hands the static kernel
 * r2cb_32 and its descriptor to X(kr2c_register) for planner p.
 * NOTE(review): X(...) is a macro defined outside this file, presumably a
 * symbol-name prefix -- confirm.
 */
634 | 1 | void X(codelet_r2cb_32) (planner *p) { X(kr2c_register) (p, r2cb_32, &desc); |
635 | 1 | } |
636 | | |
637 | | #endif |