/src/mozilla-central/security/nss/lib/freebl/mpi/montmulf.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* This Source Code Form is subject to the terms of the Mozilla Public |
2 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
3 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
4 | | |
5 | | #ifdef SOLARIS |
6 | | #define RF_INLINE_MACROS 1 |
7 | | #endif |
8 | | |
9 | | static const double TwoTo16 = 65536.0; |
10 | | static const double TwoToMinus16 = 1.0 / 65536.0; |
11 | | static const double Zero = 0.0; |
12 | | static const double TwoTo32 = 65536.0 * 65536.0; |
13 | | static const double TwoToMinus32 = 1.0 / (65536.0 * 65536.0); |
14 | | |
15 | | #ifdef RF_INLINE_MACROS |
16 | | |
17 | | double upper32(double); |
18 | | double lower32(double, double); |
19 | | double mod(double, double, double); |
20 | | |
21 | | void i16_to_d16_and_d32x4(const double * /*1/(2^16)*/, |
22 | | const double * /* 2^16*/, |
23 | | const double * /* 0 */, |
24 | | double * /*result16*/, |
25 | | double * /* result32 */, |
26 | | float * /*source - should be unsigned int* converted to float* */); |
27 | | |
28 | | #else |
29 | | #ifdef MP_USE_FLOOR |
30 | | #include <math.h> |
31 | | #else |
32 | 0 | #define floor(d) ((double)((unsigned long long)(d))) |
33 | | #endif |
34 | | |
35 | | static double |
36 | | upper32(double x) |
37 | 0 | { |
38 | 0 | return floor(x * TwoToMinus32); |
39 | 0 | } |
40 | | |
41 | | static double |
42 | | lower32(double x, double y) |
43 | 0 | { |
44 | 0 | return x - TwoTo32 * floor(x * TwoToMinus32); |
45 | 0 | } |
46 | | |
47 | | static double |
48 | | mod(double x, double oneoverm, double m) |
49 | 0 | { |
50 | 0 | return x - m * floor(x * oneoverm); |
51 | 0 | } |
52 | | |
53 | | #endif |
54 | | |
55 | | static void |
56 | | cleanup(double *dt, int from, int tlen) |
57 | 0 | { |
58 | 0 | int i; |
59 | 0 | double tmp, tmp1, x, x1; |
60 | 0 |
|
61 | 0 | tmp = tmp1 = Zero; |
62 | 0 | /* original code ** |
63 | 0 | for(i=2*from;i<2*tlen-2;i++) |
64 | 0 | { |
65 | 0 | x=dt[i]; |
66 | 0 | dt[i]=lower32(x,Zero)+tmp1; |
67 | 0 | tmp1=tmp; |
68 | 0 | tmp=upper32(x); |
69 | 0 | } |
70 | 0 | dt[tlen-2]+=tmp1; |
71 | 0 | dt[tlen-1]+=tmp; |
72 | 0 | **end original code ***/ |
73 | 0 | /* new code ***/ |
74 | 0 | for (i = 2 * from; i < 2 * tlen; i += 2) { |
75 | 0 | x = dt[i]; |
76 | 0 | x1 = dt[i + 1]; |
77 | 0 | dt[i] = lower32(x, Zero) + tmp; |
78 | 0 | dt[i + 1] = lower32(x1, Zero) + tmp1; |
79 | 0 | tmp = upper32(x); |
80 | 0 | tmp1 = upper32(x1); |
81 | 0 | } |
82 | 0 | /** end new code **/ |
83 | 0 | } |
84 | | |
85 | | void |
86 | | conv_d16_to_i32(unsigned int *i32, double *d16, long long *tmp, int ilen) |
87 | 0 | { |
88 | 0 | int i; |
89 | 0 | long long t, t1, a, b, c, d; |
90 | 0 |
|
91 | 0 | t1 = 0; |
92 | 0 | a = (long long)d16[0]; |
93 | 0 | b = (long long)d16[1]; |
94 | 0 | for (i = 0; i < ilen - 1; i++) { |
95 | 0 | c = (long long)d16[2 * i + 2]; |
96 | 0 | t1 += (unsigned int)a; |
97 | 0 | t = (a >> 32); |
98 | 0 | d = (long long)d16[2 * i + 3]; |
99 | 0 | t1 += (b & 0xffff) << 16; |
100 | 0 | t += (b >> 16) + (t1 >> 32); |
101 | 0 | i32[i] = (unsigned int)t1; |
102 | 0 | t1 = t; |
103 | 0 | a = c; |
104 | 0 | b = d; |
105 | 0 | } |
106 | 0 | t1 += (unsigned int)a; |
107 | 0 | t = (a >> 32); |
108 | 0 | t1 += (b & 0xffff) << 16; |
109 | 0 | i32[i] = (unsigned int)t1; |
110 | 0 | } |
111 | | |
112 | | void |
113 | | conv_i32_to_d32(double *d32, unsigned int *i32, int len) |
114 | 0 | { |
115 | 0 | int i; |
116 | 0 |
|
117 | 0 | #pragma pipeloop(0) |
118 | 0 | for (i = 0; i < len; i++) |
119 | 0 | d32[i] = (double)(i32[i]); |
120 | 0 | } |
121 | | |
122 | | void |
123 | | conv_i32_to_d16(double *d16, unsigned int *i32, int len) |
124 | 0 | { |
125 | 0 | int i; |
126 | 0 | unsigned int a; |
127 | 0 |
|
128 | 0 | #pragma pipeloop(0) |
129 | 0 | for (i = 0; i < len; i++) { |
130 | 0 | a = i32[i]; |
131 | 0 | d16[2 * i] = (double)(a & 0xffff); |
132 | 0 | d16[2 * i + 1] = (double)(a >> 16); |
133 | 0 | } |
134 | 0 | } |
135 | | |
136 | | void |
137 | | conv_i32_to_d32_and_d16(double *d32, double *d16, |
138 | | unsigned int *i32, int len) |
139 | 0 | { |
140 | 0 | int i = 0; |
141 | 0 | unsigned int a; |
142 | 0 |
|
143 | 0 | #pragma pipeloop(0) |
144 | | #ifdef RF_INLINE_MACROS |
145 | | for (; i < len - 3; i += 4) { |
146 | | i16_to_d16_and_d32x4(&TwoToMinus16, &TwoTo16, &Zero, |
147 | | &(d16[2 * i]), &(d32[i]), (float *)(&(i32[i]))); |
148 | | } |
149 | | #endif |
150 | 0 | for (; i < len; i++) { |
151 | 0 | a = i32[i]; |
152 | 0 | d32[i] = (double)(i32[i]); |
153 | 0 | d16[2 * i] = (double)(a & 0xffff); |
154 | 0 | d16[2 * i + 1] = (double)(a >> 16); |
155 | 0 | } |
156 | 0 | } |
157 | | |
158 | | void |
159 | | adjust_montf_result(unsigned int *i32, unsigned int *nint, int len) |
160 | 0 | { |
161 | 0 | long long acc; |
162 | 0 | int i; |
163 | 0 |
|
164 | 0 | if (i32[len] > 0) |
165 | 0 | i = -1; |
166 | 0 | else { |
167 | 0 | for (i = len - 1; i >= 0; i--) { |
168 | 0 | if (i32[i] != nint[i]) |
169 | 0 | break; |
170 | 0 | } |
171 | 0 | } |
172 | 0 | if ((i < 0) || (i32[i] > nint[i])) { |
173 | 0 | acc = 0; |
174 | 0 | for (i = 0; i < len; i++) { |
175 | 0 | acc = acc + (unsigned long long)(i32[i]) - (unsigned long long)(nint[i]); |
176 | 0 | i32[i] = (unsigned int)acc; |
177 | 0 | acc = acc >> 32; |
178 | 0 | } |
179 | 0 | } |
180 | 0 | } |
181 | | |
182 | | /* |
183 | | ** the lengths of the input arrays should be at least the following: |
184 | | ** result[nlen+1], dm1[nlen], dm2[2*nlen+1], dt[4*nlen+2], dn[nlen], nint[nlen] |
185 | | ** all of them should be different from one another |
186 | | ** |
187 | | */ |
188 | | void |
189 | | mont_mulf_noconv(unsigned int *result, |
190 | | double *dm1, double *dm2, double *dt, |
191 | | double *dn, unsigned int *nint, |
192 | | int nlen, double dn0) |
193 | 0 | { |
194 | 0 | int i, j, jj; |
195 | 0 | int tmp; |
196 | 0 | double digit, m2j, nextm2j, a, b; |
197 | 0 | double *dptmp, *pdm1, *pdm2, *pdn, *pdtj, pdn_0, pdm1_0; |
198 | 0 |
|
199 | 0 | pdm1 = &(dm1[0]); |
200 | 0 | pdm2 = &(dm2[0]); |
201 | 0 | pdn = &(dn[0]); |
202 | 0 | pdm2[2 * nlen] = Zero; |
203 | 0 |
|
204 | 0 | if (nlen != 16) { |
205 | 0 | for (i = 0; i < 4 * nlen + 2; i++) |
206 | 0 | dt[i] = Zero; |
207 | 0 |
|
208 | 0 | a = dt[0] = pdm1[0] * pdm2[0]; |
209 | 0 | digit = mod(lower32(a, Zero) * dn0, TwoToMinus16, TwoTo16); |
210 | 0 |
|
211 | 0 | pdtj = &(dt[0]); |
212 | 0 | for (j = jj = 0; j < 2 * nlen; j++, jj++, pdtj++) { |
213 | 0 | m2j = pdm2[j]; |
214 | 0 | a = pdtj[0] + pdn[0] * digit; |
215 | 0 | b = pdtj[1] + pdm1[0] * pdm2[j + 1] + a * TwoToMinus16; |
216 | 0 | pdtj[1] = b; |
217 | 0 |
|
218 | 0 | #pragma pipeloop(0) |
219 | 0 | for (i = 1; i < nlen; i++) { |
220 | 0 | pdtj[2 * i] += pdm1[i] * m2j + pdn[i] * digit; |
221 | 0 | } |
222 | 0 | if ((jj == 30)) { |
223 | 0 | cleanup(dt, j / 2 + 1, 2 * nlen + 1); |
224 | 0 | jj = 0; |
225 | 0 | } |
226 | 0 |
|
227 | 0 | digit = mod(lower32(b, Zero) * dn0, TwoToMinus16, TwoTo16); |
228 | 0 | } |
229 | 0 | } else { |
230 | 0 | a = dt[0] = pdm1[0] * pdm2[0]; |
231 | 0 |
|
232 | 0 | dt[65] = dt[64] = dt[63] = dt[62] = dt[61] = dt[60] = |
233 | 0 | dt[59] = dt[58] = dt[57] = dt[56] = dt[55] = dt[54] = |
234 | 0 | dt[53] = dt[52] = dt[51] = dt[50] = dt[49] = dt[48] = |
235 | 0 | dt[47] = dt[46] = dt[45] = dt[44] = dt[43] = dt[42] = |
236 | 0 | dt[41] = dt[40] = dt[39] = dt[38] = dt[37] = dt[36] = |
237 | 0 | dt[35] = dt[34] = dt[33] = dt[32] = dt[31] = dt[30] = |
238 | 0 | dt[29] = dt[28] = dt[27] = dt[26] = dt[25] = dt[24] = |
239 | 0 | dt[23] = dt[22] = dt[21] = dt[20] = dt[19] = dt[18] = |
240 | 0 | dt[17] = dt[16] = dt[15] = dt[14] = dt[13] = dt[12] = |
241 | 0 | dt[11] = dt[10] = dt[9] = dt[8] = dt[7] = dt[6] = |
242 | 0 | dt[5] = dt[4] = dt[3] = dt[2] = dt[1] = Zero; |
243 | 0 |
|
244 | 0 | pdn_0 = pdn[0]; |
245 | 0 | pdm1_0 = pdm1[0]; |
246 | 0 |
|
247 | 0 | digit = mod(lower32(a, Zero) * dn0, TwoToMinus16, TwoTo16); |
248 | 0 | pdtj = &(dt[0]); |
249 | 0 |
|
250 | 0 | for (j = 0; j < 32; j++, pdtj++) { |
251 | 0 |
|
252 | 0 | m2j = pdm2[j]; |
253 | 0 | a = pdtj[0] + pdn_0 * digit; |
254 | 0 | b = pdtj[1] + pdm1_0 * pdm2[j + 1] + a * TwoToMinus16; |
255 | 0 | pdtj[1] = b; |
256 | 0 |
|
257 | 0 | /**** this loop will be fully unrolled: |
258 | 0 | for(i=1;i<16;i++) |
259 | 0 | { |
260 | 0 | pdtj[2*i]+=pdm1[i]*m2j+pdn[i]*digit; |
261 | 0 | } |
262 | 0 | *************************************/ |
263 | 0 | pdtj[2] += pdm1[1] * m2j + pdn[1] * digit; |
264 | 0 | pdtj[4] += pdm1[2] * m2j + pdn[2] * digit; |
265 | 0 | pdtj[6] += pdm1[3] * m2j + pdn[3] * digit; |
266 | 0 | pdtj[8] += pdm1[4] * m2j + pdn[4] * digit; |
267 | 0 | pdtj[10] += pdm1[5] * m2j + pdn[5] * digit; |
268 | 0 | pdtj[12] += pdm1[6] * m2j + pdn[6] * digit; |
269 | 0 | pdtj[14] += pdm1[7] * m2j + pdn[7] * digit; |
270 | 0 | pdtj[16] += pdm1[8] * m2j + pdn[8] * digit; |
271 | 0 | pdtj[18] += pdm1[9] * m2j + pdn[9] * digit; |
272 | 0 | pdtj[20] += pdm1[10] * m2j + pdn[10] * digit; |
273 | 0 | pdtj[22] += pdm1[11] * m2j + pdn[11] * digit; |
274 | 0 | pdtj[24] += pdm1[12] * m2j + pdn[12] * digit; |
275 | 0 | pdtj[26] += pdm1[13] * m2j + pdn[13] * digit; |
276 | 0 | pdtj[28] += pdm1[14] * m2j + pdn[14] * digit; |
277 | 0 | pdtj[30] += pdm1[15] * m2j + pdn[15] * digit; |
278 | 0 | /* no need for cleenup, cannot overflow */ |
279 | 0 | digit = mod(lower32(b, Zero) * dn0, TwoToMinus16, TwoTo16); |
280 | 0 | } |
281 | 0 | } |
282 | 0 |
|
283 | 0 | conv_d16_to_i32(result, dt + 2 * nlen, (long long *)dt, nlen + 1); |
284 | 0 |
|
285 | 0 | adjust_montf_result(result, nint, nlen); |
286 | 0 | } |