/src/mpdecimal-4.0.0/libmpdec/fourstep.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2008-2024 Stefan Krah. All rights reserved. |
3 | | * |
4 | | * Redistribution and use in source and binary forms, with or without |
5 | | * modification, are permitted provided that the following conditions |
6 | | * are met: |
7 | | * |
8 | | * 1. Redistributions of source code must retain the above copyright |
9 | | * notice, this list of conditions and the following disclaimer. |
10 | | * 2. Redistributions in binary form must reproduce the above copyright |
11 | | * notice, this list of conditions and the following disclaimer in the |
12 | | * documentation and/or other materials provided with the distribution. |
13 | | * |
14 | | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
15 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
16 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
17 | | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
18 | | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
19 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
20 | | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
21 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
22 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
23 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
24 | | * SUCH DAMAGE. |
25 | | */ |
26 | | |
27 | | |
28 | | #include <assert.h> |
29 | | |
30 | | #include "constants.h" |
31 | | #include "fourstep.h" |
32 | | #include "mpdecimal.h" |
33 | | #include "numbertheory.h" |
34 | | #include "sixstep.h" |
35 | | #include "umodarith.h" |
36 | | |
37 | | |
38 | | /* Bignum: Cache efficient Matrix Fourier Transform for arrays of the |
39 | | form 3 * 2**n (See literature/matrix-transform.txt). */ |
40 | | |
41 | | |
42 | | #ifndef PPRO |
43 | | static inline void |
44 | | std_size3_ntt(mpd_uint_t *x1, mpd_uint_t *x2, mpd_uint_t *x3, |
45 | | mpd_uint_t w3table[3], mpd_uint_t umod) |
46 | 0 | { |
47 | 0 | mpd_uint_t r1, r2; |
48 | 0 | mpd_uint_t w; |
49 | 0 | mpd_uint_t s, tmp; |
50 | | |
51 | | |
52 | | /* k = 0 -> w = 1 */ |
53 | 0 | s = *x1; |
54 | 0 | s = addmod(s, *x2, umod); |
55 | 0 | s = addmod(s, *x3, umod); |
56 | |
|
57 | 0 | r1 = s; |
58 | | |
59 | | /* k = 1 */ |
60 | 0 | s = *x1; |
61 | |
|
62 | 0 | w = w3table[1]; |
63 | 0 | tmp = MULMOD(*x2, w); |
64 | 0 | s = addmod(s, tmp, umod); |
65 | |
|
66 | 0 | w = w3table[2]; |
67 | 0 | tmp = MULMOD(*x3, w); |
68 | 0 | s = addmod(s, tmp, umod); |
69 | |
|
70 | 0 | r2 = s; |
71 | | |
72 | | /* k = 2 */ |
73 | 0 | s = *x1; |
74 | |
|
75 | 0 | w = w3table[2]; |
76 | 0 | tmp = MULMOD(*x2, w); |
77 | 0 | s = addmod(s, tmp, umod); |
78 | |
|
79 | 0 | w = w3table[1]; |
80 | 0 | tmp = MULMOD(*x3, w); |
81 | 0 | s = addmod(s, tmp, umod); |
82 | |
|
83 | 0 | *x3 = s; |
84 | 0 | *x2 = r2; |
85 | 0 | *x1 = r1; |
86 | 0 | } |
87 | | #else /* PPRO */ |
88 | | static inline void |
89 | | ppro_size3_ntt(mpd_uint_t *x1, mpd_uint_t *x2, mpd_uint_t *x3, mpd_uint_t w3table[3], |
90 | | mpd_uint_t umod, double *dmod, uint32_t dinvmod[3]) |
91 | | { |
92 | | mpd_uint_t r1, r2; |
93 | | mpd_uint_t w; |
94 | | mpd_uint_t s, tmp; |
95 | | |
96 | | |
97 | | /* k = 0 -> w = 1 */ |
98 | | s = *x1; |
99 | | s = addmod(s, *x2, umod); |
100 | | s = addmod(s, *x3, umod); |
101 | | |
102 | | r1 = s; |
103 | | |
104 | | /* k = 1 */ |
105 | | s = *x1; |
106 | | |
107 | | w = w3table[1]; |
108 | | tmp = ppro_mulmod(*x2, w, dmod, dinvmod); |
109 | | s = addmod(s, tmp, umod); |
110 | | |
111 | | w = w3table[2]; |
112 | | tmp = ppro_mulmod(*x3, w, dmod, dinvmod); |
113 | | s = addmod(s, tmp, umod); |
114 | | |
115 | | r2 = s; |
116 | | |
117 | | /* k = 2 */ |
118 | | s = *x1; |
119 | | |
120 | | w = w3table[2]; |
121 | | tmp = ppro_mulmod(*x2, w, dmod, dinvmod); |
122 | | s = addmod(s, tmp, umod); |
123 | | |
124 | | w = w3table[1]; |
125 | | tmp = ppro_mulmod(*x3, w, dmod, dinvmod); |
126 | | s = addmod(s, tmp, umod); |
127 | | |
128 | | *x3 = s; |
129 | | *x2 = r2; |
130 | | *x1 = r1; |
131 | | } |
132 | | #endif |
133 | | |
134 | | |
135 | | /* forward transform, sign = -1; transform length = 3 * 2**n */ |
136 | | int |
137 | | four_step_fnt(mpd_uint_t *a, mpd_size_t n, int modnum) |
138 | 0 | { |
139 | 0 | mpd_size_t R = 3; /* number of rows */ |
140 | 0 | mpd_size_t C = n / 3; /* number of columns */ |
141 | 0 | mpd_uint_t w3table[3]; |
142 | 0 | mpd_uint_t kernel, w0, w1, wstep; |
143 | 0 | mpd_uint_t *s, *p0, *p1, *p2; |
144 | 0 | mpd_uint_t umod; |
145 | | #ifdef PPRO |
146 | | double dmod; |
147 | | uint32_t dinvmod[3]; |
148 | | #endif |
149 | 0 | mpd_size_t i, k; |
150 | | |
151 | |
|
152 | 0 | assert(n >= 48); |
153 | 0 | assert(n <= 3*MPD_MAXTRANSFORM_2N); |
154 | | |
155 | | |
156 | | /* Length R transform on the columns. */ |
157 | 0 | SETMODULUS(modnum); |
158 | 0 | _mpd_init_w3table(w3table, -1, modnum); |
159 | 0 | for (p0=a, p1=p0+C, p2=p0+2*C; p0<a+C; p0++,p1++,p2++) { |
160 | |
|
161 | 0 | SIZE3_NTT(p0, p1, p2, w3table); |
162 | 0 | } |
163 | | |
164 | | /* Multiply each matrix element (addressed by i*C+k) by r**(i*k). */ |
165 | 0 | kernel = _mpd_getkernel(n, -1, modnum); |
166 | 0 | for (i = 1; i < R; i++) { |
167 | 0 | w0 = 1; /* r**(i*0): initial value for k=0 */ |
168 | 0 | w1 = POWMOD(kernel, i); /* r**(i*1): initial value for k=1 */ |
169 | 0 | wstep = MULMOD(w1, w1); /* r**(2*i) */ |
170 | 0 | for (k = 0; k < C-1; k += 2) { |
171 | 0 | mpd_uint_t x0 = a[i*C+k]; |
172 | 0 | mpd_uint_t x1 = a[i*C+k+1]; |
173 | 0 | MULMOD2(&x0, w0, &x1, w1); |
174 | 0 | MULMOD2C(&w0, &w1, wstep); /* r**(i*(k+2)) = r**(i*k) * r**(2*i) */ |
175 | 0 | a[i*C+k] = x0; |
176 | 0 | a[i*C+k+1] = x1; |
177 | 0 | } |
178 | 0 | } |
179 | | |
180 | | /* Length C transform on the rows. */ |
181 | 0 | for (s = a; s < a+n; s += C) { |
182 | 0 | if (!six_step_fnt(s, C, modnum)) { |
183 | 0 | return 0; |
184 | 0 | } |
185 | 0 | } |
186 | | |
187 | 0 | return 1; |
188 | 0 | } |
189 | | |
190 | | /* backward transform, sign = 1; transform length = 3 * 2**n */ |
191 | | int |
192 | | inv_four_step_fnt(mpd_uint_t *a, mpd_size_t n, int modnum) |
193 | 0 | { |
194 | 0 | mpd_size_t R = 3; /* number of rows */ |
195 | 0 | mpd_size_t C = n / 3; /* number of columns */ |
196 | 0 | mpd_uint_t w3table[3]; |
197 | 0 | mpd_uint_t kernel, w0, w1, wstep; |
198 | 0 | mpd_uint_t *s, *p0, *p1, *p2; |
199 | 0 | mpd_uint_t umod; |
200 | | #ifdef PPRO |
201 | | double dmod; |
202 | | uint32_t dinvmod[3]; |
203 | | #endif |
204 | 0 | mpd_size_t i, k; |
205 | | |
206 | |
|
207 | 0 | assert(n >= 48); |
208 | 0 | assert(n <= 3*MPD_MAXTRANSFORM_2N); |
209 | | |
210 | | /* Length C transform on the rows. */ |
211 | 0 | for (s = a; s < a+n; s += C) { |
212 | 0 | if (!inv_six_step_fnt(s, C, modnum)) { |
213 | 0 | return 0; |
214 | 0 | } |
215 | 0 | } |
216 | | |
217 | | /* Multiply each matrix element (addressed by i*C+k) by r**(i*k). */ |
218 | 0 | SETMODULUS(modnum); |
219 | 0 | kernel = _mpd_getkernel(n, 1, modnum); |
220 | 0 | for (i = 1; i < R; i++) { |
221 | 0 | w0 = 1; |
222 | 0 | w1 = POWMOD(kernel, i); |
223 | 0 | wstep = MULMOD(w1, w1); |
224 | 0 | for (k = 0; k < C; k += 2) { |
225 | 0 | mpd_uint_t x0 = a[i*C+k]; |
226 | 0 | mpd_uint_t x1 = a[i*C+k+1]; |
227 | 0 | MULMOD2(&x0, w0, &x1, w1); |
228 | 0 | MULMOD2C(&w0, &w1, wstep); |
229 | 0 | a[i*C+k] = x0; |
230 | 0 | a[i*C+k+1] = x1; |
231 | 0 | } |
232 | 0 | } |
233 | | |
234 | | /* Length R transform on the columns. */ |
235 | 0 | _mpd_init_w3table(w3table, 1, modnum); |
236 | 0 | for (p0=a, p1=p0+C, p2=p0+2*C; p0<a+C; p0++,p1++,p2++) { |
237 | |
|
238 | 0 | SIZE3_NTT(p0, p1, p2, w3table); |
239 | 0 | } |
240 | |
|
241 | 0 | return 1; |
242 | 0 | } |