/src/openssl/crypto/bn/bn_asm.c

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright 1995-2023 The OpenSSL Project Authors. All Rights Reserved.
3		*
4		* Licensed under the Apache License 2.0 (the "License"). You may not use
5		* this file except in compliance with the License. You can obtain a copy
6		* in the file LICENSE in the source distribution or at
7		* https://www.openssl.org/source/license.html
8		*/
9
10		#include <assert.h>
11		#include <openssl/crypto.h>
12		#include "internal/cryptlib.h"
13		#include "bn_local.h"
14
15		#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
16
17		BN_ULONG bn_mul_add_words(BN_ULONG rp, const BN_ULONG ap, int num,
18		BN_ULONG w)
19		{
20		BN_ULONG c1 = 0;
21
22		assert(num >= 0);
23		if (num <= 0)
24		return c1;
25
26		# ifndef OPENSSL_SMALL_FOOTPRINT
27		while (num & ~3) {
28		mul_add(rp[0], ap[0], w, c1);
29		mul_add(rp[1], ap[1], w, c1);
30		mul_add(rp[2], ap[2], w, c1);
31		mul_add(rp[3], ap[3], w, c1);
32		ap += 4;
33		rp += 4;
34		num -= 4;
35		}
36		# endif
37		while (num) {
38		mul_add(rp[0], ap[0], w, c1);
39		ap++;
40		rp++;
41		num--;
42		}
43
44		return c1;
45		}
46
47		BN_ULONG bn_mul_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
48		{
49		BN_ULONG c1 = 0;
50
51		assert(num >= 0);
52		if (num <= 0)
53		return c1;
54
55		# ifndef OPENSSL_SMALL_FOOTPRINT
56		while (num & ~3) {
57		mul(rp[0], ap[0], w, c1);
58		mul(rp[1], ap[1], w, c1);
59		mul(rp[2], ap[2], w, c1);
60		mul(rp[3], ap[3], w, c1);
61		ap += 4;
62		rp += 4;
63		num -= 4;
64		}
65		# endif
66		while (num) {
67		mul(rp[0], ap[0], w, c1);
68		ap++;
69		rp++;
70		num--;
71		}
72		return c1;
73		}
74
75		void bn_sqr_words(BN_ULONG r, const BN_ULONG a, int n)
76		{
77		assert(n >= 0);
78		if (n <= 0)
79		return;
80
81		# ifndef OPENSSL_SMALL_FOOTPRINT
82		while (n & ~3) {
83		sqr(r[0], r[1], a[0]);
84		sqr(r[2], r[3], a[1]);
85		sqr(r[4], r[5], a[2]);
86		sqr(r[6], r[7], a[3]);
87		a += 4;
88		r += 8;
89		n -= 4;
90		}
91		# endif
92		while (n) {
93		sqr(r[0], r[1], a[0]);
94		a++;
95		r += 2;
96		n--;
97		}
98		}
99
100		#else /* !(defined(BN_LLONG) ||
101		* defined(BN_UMULT_HIGH)) */
102
103		BN_ULONG bn_mul_add_words(BN_ULONG rp, const BN_ULONG ap, int num,
104		BN_ULONG w)
105	818M	{
106	818M	BN_ULONG c = 0;
107	818M	BN_ULONG bl, bh;
108
109	818M	assert(num >= 0);
110	818M	if (num <= 0)
111	0	return (BN_ULONG)0;
112
113	818M	bl = LBITS(w);
114	818M	bh = HBITS(w);
115
116	818M	# ifndef OPENSSL_SMALL_FOOTPRINT
117	7.04G	while (num & ~3) {
118	6.22G	mul_add(rp[0], ap[0], bl, bh, c);
119	6.22G	mul_add(rp[1], ap[1], bl, bh, c);
120	6.22G	mul_add(rp[2], ap[2], bl, bh, c);
121	6.22G	mul_add(rp[3], ap[3], bl, bh, c);
122	6.22G	ap += 4;
123	6.22G	rp += 4;
124	6.22G	num -= 4;
125	6.22G	}
126	818M	# endif
127	943M	while (num) {
128	125M	mul_add(rp[0], ap[0], bl, bh, c);
129	125M	ap++;
130	125M	rp++;
131	125M	num--;
132	125M	}
133	818M	return c;
134	818M	}
135
136		BN_ULONG bn_mul_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
137	13.1M	{
138	13.1M	BN_ULONG carry = 0;
139	13.1M	BN_ULONG bl, bh;
140
141	13.1M	assert(num >= 0);
142	13.1M	if (num <= 0)
143	0	return (BN_ULONG)0;
144
145	13.1M	bl = LBITS(w);
146	13.1M	bh = HBITS(w);
147
148	13.1M	# ifndef OPENSSL_SMALL_FOOTPRINT
149	61.6M	while (num & ~3) {
150	48.4M	mul(rp[0], ap[0], bl, bh, carry);
151	48.4M	mul(rp[1], ap[1], bl, bh, carry);
152	48.4M	mul(rp[2], ap[2], bl, bh, carry);
153	48.4M	mul(rp[3], ap[3], bl, bh, carry);
154	48.4M	ap += 4;
155	48.4M	rp += 4;
156	48.4M	num -= 4;
157	48.4M	}
158	13.1M	# endif
159	33.1M	while (num) {
160	20.0M	mul(rp[0], ap[0], bl, bh, carry);
161	20.0M	ap++;
162	20.0M	rp++;
163	20.0M	num--;
164	20.0M	}
165	13.1M	return carry;
166	13.1M	}
167
168		void bn_sqr_words(BN_ULONG r, const BN_ULONG a, int n)
169	4.37M	{
170	4.37M	assert(n >= 0);
171	4.37M	if (n <= 0)
172	0	return;
173
174	4.37M	# ifndef OPENSSL_SMALL_FOOTPRINT
175	10.8M	while (n & ~3) {
176	6.45M	sqr64(r[0], r[1], a[0]);
177	6.45M	sqr64(r[2], r[3], a[1]);
178	6.45M	sqr64(r[4], r[5], a[2]);
179	6.45M	sqr64(r[6], r[7], a[3]);
180	6.45M	a += 4;
181	6.45M	r += 8;
182	6.45M	n -= 4;
183	6.45M	}
184	4.37M	# endif
185	15.9M	while (n) {
186	11.5M	sqr64(r[0], r[1], a[0]);
187	11.5M	a++;
188	11.5M	r += 2;
189	11.5M	n--;
190	11.5M	}
191	4.37M	}
192
193		#endif /* !(defined(BN_LLONG) ||
194		* defined(BN_UMULT_HIGH)) */
195
196		#if defined(BN_LLONG) && defined(BN_DIV2W)
197
198		BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
199		{
200		return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) \| l) / (BN_ULLONG) d));
201		}
202
203		#else
204
205		/* Divide h,l by d and return the result. */
206		/* I need to test this some more :-( */
207		BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
208	7.37M	{
209	7.37M	BN_ULONG dh, dl, q, ret = 0, th, tl, t;
210	7.37M	int i, count = 2;
211
212	7.37M	if (d == 0)
213	0	return BN_MASK2;
214
215	7.37M	i = BN_num_bits_word(d);
216	7.37M	assert((i == BN_BITS2) \|\| (h <= (BN_ULONG)1 << i));
217
218	7.37M	i = BN_BITS2 - i;
219	7.37M	if (h >= d)
220	0	h -= d;
221
222	7.37M	if (i) {
223	0	d <<= i;
224	0	h = (h << i) \| (l >> (BN_BITS2 - i));
225	0	l <<= i;
226	0	}
227	7.37M	dh = (d & BN_MASK2h) >> BN_BITS4;
228	7.37M	dl = (d & BN_MASK2l);
229	14.7M	for (;;) {
230	14.7M	if ((h >> BN_BITS4) == dh)
231	1.21k	q = BN_MASK2l;
232	14.7M	else
233	14.7M	q = h / dh;
234
235	14.7M	th = q * dh;
236	14.7M	tl = dl * q;
237	21.4M	for (;;) {
238	21.4M	t = h - th;
239	21.4M	if ((t & BN_MASK2h) \|\|
240	21.4M	((tl) <= ((t << BN_BITS4) \| ((l & BN_MASK2h) >> BN_BITS4))))
241	14.7M	break;
242	6.75M	q--;
243	6.75M	th -= dh;
244	6.75M	tl -= dl;
245	6.75M	}
246	14.7M	t = (tl >> BN_BITS4);
247	14.7M	tl = (tl << BN_BITS4) & BN_MASK2h;
248	14.7M	th += t;
249
250	14.7M	if (l < tl)
251	6.96M	th++;
252	14.7M	l -= tl;
253	14.7M	if (h < th) {
254	0	h += d;
255	0	q--;
256	0	}
257	14.7M	h -= th;
258
259	14.7M	if (--count == 0)
260	7.37M	break;
261
262	7.37M	ret = q << BN_BITS4;
263	7.37M	h = ((h << BN_BITS4) \| (l >> BN_BITS4)) & BN_MASK2;
264	7.37M	l = (l & BN_MASK2l) << BN_BITS4;
265	7.37M	}
266	7.37M	ret \|= q;
267	7.37M	return ret;
268	7.37M	}
269		#endif /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
270
271		#ifdef BN_LLONG
272		BN_ULONG bn_add_words(BN_ULONG r, const BN_ULONG a, const BN_ULONG *b,
273		int n)
274		{
275		BN_ULLONG ll = 0;
276
277		assert(n >= 0);
278		if (n <= 0)
279		return (BN_ULONG)0;
280
281		# ifndef OPENSSL_SMALL_FOOTPRINT
282		while (n & ~3) {
283		ll += (BN_ULLONG) a[0] + b[0];
284		r[0] = (BN_ULONG)ll & BN_MASK2;
285		ll >>= BN_BITS2;
286		ll += (BN_ULLONG) a[1] + b[1];
287		r[1] = (BN_ULONG)ll & BN_MASK2;
288		ll >>= BN_BITS2;
289		ll += (BN_ULLONG) a[2] + b[2];
290		r[2] = (BN_ULONG)ll & BN_MASK2;
291		ll >>= BN_BITS2;
292		ll += (BN_ULLONG) a[3] + b[3];
293		r[3] = (BN_ULONG)ll & BN_MASK2;
294		ll >>= BN_BITS2;
295		a += 4;
296		b += 4;
297		r += 4;
298		n -= 4;
299		}
300		# endif
301		while (n) {
302		ll += (BN_ULLONG) a[0] + b[0];
303		r[0] = (BN_ULONG)ll & BN_MASK2;
304		ll >>= BN_BITS2;
305		a++;
306		b++;
307		r++;
308		n--;
309		}
310		return (BN_ULONG)ll;
311		}
312		#else /* !BN_LLONG */
313		BN_ULONG bn_add_words(BN_ULONG r, const BN_ULONG a, const BN_ULONG *b,
314		int n)
315	164M	{
316	164M	BN_ULONG c, l, t;
317
318	164M	assert(n >= 0);
319	164M	if (n <= 0)
320	150k	return (BN_ULONG)0;
321
322	164M	c = 0;
323	164M	# ifndef OPENSSL_SMALL_FOOTPRINT
324	979M	while (n & ~3) {
325	815M	t = a[0];
326	815M	t = (t + c) & BN_MASK2;
327	815M	c = (t < c);
328	815M	l = (t + b[0]) & BN_MASK2;
329	815M	c += (l < t);
330	815M	r[0] = l;
331	815M	t = a[1];
332	815M	t = (t + c) & BN_MASK2;
333	815M	c = (t < c);
334	815M	l = (t + b[1]) & BN_MASK2;
335	815M	c += (l < t);
336	815M	r[1] = l;
337	815M	t = a[2];
338	815M	t = (t + c) & BN_MASK2;
339	815M	c = (t < c);
340	815M	l = (t + b[2]) & BN_MASK2;
341	815M	c += (l < t);
342	815M	r[2] = l;
343	815M	t = a[3];
344	815M	t = (t + c) & BN_MASK2;
345	815M	c = (t < c);
346	815M	l = (t + b[3]) & BN_MASK2;
347	815M	c += (l < t);
348	815M	r[3] = l;
349	815M	a += 4;
350	815M	b += 4;
351	815M	r += 4;
352	815M	n -= 4;
353	815M	}
354	164M	# endif
355	192M	while (n) {
356	27.7M	t = a[0];
357	27.7M	t = (t + c) & BN_MASK2;
358	27.7M	c = (t < c);
359	27.7M	l = (t + b[0]) & BN_MASK2;
360	27.7M	c += (l < t);
361	27.7M	r[0] = l;
362	27.7M	a++;
363	27.7M	b++;
364	27.7M	r++;
365	27.7M	n--;
366	27.7M	}
367	164M	return (BN_ULONG)c;
368	164M	}
369		#endif /* !BN_LLONG */
370
371		BN_ULONG bn_sub_words(BN_ULONG r, const BN_ULONG a, const BN_ULONG *b,
372		int n)
373	206M	{
374	206M	BN_ULONG t1, t2;
375	206M	int c = 0;
376
377	206M	assert(n >= 0);
378	206M	if (n <= 0)
379	78.2k	return (BN_ULONG)0;
380
381	206M	#ifndef OPENSSL_SMALL_FOOTPRINT
382	978M	while (n & ~3) {
383	771M	t1 = a[0];
384	771M	t2 = (t1 - c) & BN_MASK2;
385	771M	c = (t2 > t1);
386	771M	t1 = b[0];
387	771M	t1 = (t2 - t1) & BN_MASK2;
388	771M	r[0] = t1;
389	771M	c += (t1 > t2);
390	771M	t1 = a[1];
391	771M	t2 = (t1 - c) & BN_MASK2;
392	771M	c = (t2 > t1);
393	771M	t1 = b[1];
394	771M	t1 = (t2 - t1) & BN_MASK2;
395	771M	r[1] = t1;
396	771M	c += (t1 > t2);
397	771M	t1 = a[2];
398	771M	t2 = (t1 - c) & BN_MASK2;
399	771M	c = (t2 > t1);
400	771M	t1 = b[2];
401	771M	t1 = (t2 - t1) & BN_MASK2;
402	771M	r[2] = t1;
403	771M	c += (t1 > t2);
404	771M	t1 = a[3];
405	771M	t2 = (t1 - c) & BN_MASK2;
406	771M	c = (t2 > t1);
407	771M	t1 = b[3];
408	771M	t1 = (t2 - t1) & BN_MASK2;
409	771M	r[3] = t1;
410	771M	c += (t1 > t2);
411	771M	a += 4;
412	771M	b += 4;
413	771M	r += 4;
414	771M	n -= 4;
415	771M	}
416	206M	#endif
417	236M	while (n) {
418	29.4M	t1 = a[0];
419	29.4M	t2 = (t1 - c) & BN_MASK2;
420	29.4M	c = (t2 > t1);
421	29.4M	t1 = b[0];
422	29.4M	t1 = (t2 - t1) & BN_MASK2;
423	29.4M	r[0] = t1;
424	29.4M	c += (t1 > t2);
425	29.4M	a++;
426	29.4M	b++;
427	29.4M	r++;
428	29.4M	n--;
429	29.4M	}
430	206M	return c;
431	206M	}
432
433		#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
434
435		/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
436		/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
437		/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
438		/*
439		* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
440		* c=(c2,c1,c0)
441		*/
442
443		# ifdef BN_LLONG
/*
 * BN_LLONG flavour of the comba accumulator macros.  Each one adds a product
 * into the three-word accumulator (c2,c1,c0) using a double-width BN_ULLONG
 * intermediate.
 *
 * Adding c0 to the double-width product cannot overflow, because the high
 * half of a product is never all-ones.
 */
# define mul_add_c(a,b,c0,c1,c2) do { \
        BN_ULLONG prod_ = (BN_ULLONG)(a)*(b); \
        BN_ULONG top_; \
        prod_ += c0;                    /* no carry */ \
        c0 = (BN_ULONG)Lw(prod_); \
        top_ = (BN_ULONG)Hw(prod_); \
        c1 = (c1+top_)&BN_MASK2; \
        c2 += (c1<top_); \
        } while(0)

/* Same as mul_add_c, but the product is accumulated twice. */
# define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULLONG prod_ = (BN_ULLONG)(a)*(b); \
        BN_ULLONG first_ = prod_+c0;    /* no carry */ \
        BN_ULONG top_; \
        c0 = (BN_ULONG)Lw(first_); \
        top_ = (BN_ULONG)Hw(first_); \
        c1 = (c1+top_)&BN_MASK2; \
        c2 += (c1<top_); \
        prod_ += c0;                    /* no carry */ \
        c0 = (BN_ULONG)Lw(prod_); \
        top_ = (BN_ULONG)Hw(prod_); \
        c1 = (c1+top_)&BN_MASK2; \
        c2 += (c1<top_); \
        } while(0)

/* Accumulate the square of a[i]. */
# define sqr_add_c(a,i,c0,c1,c2) do { \
        BN_ULLONG prod_ = (BN_ULLONG)a[i]*a[i]; \
        BN_ULONG top_; \
        prod_ += c0;                    /* no carry */ \
        c0 = (BN_ULONG)Lw(prod_); \
        top_ = (BN_ULONG)Hw(prod_); \
        c1 = (c1+top_)&BN_MASK2; \
        c2 += (c1<top_); \
        } while(0)

/* Accumulate a[i]*a[j] twice. */
# define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
481
482		# elif defined(BN_UMULT_LOHI)
/*
 * BN_UMULT_LOHI flavour of the comba accumulator macros.  Each one obtains
 * the low and high product words from BN_UMULT_LOHI() and adds them into the
 * three-word accumulator (c2,c1,c0).
 *
 * Adding a carry bit to the high product word cannot overflow, because the
 * high word of a product is never all-ones.
 */
# define mul_add_c(a,b,c0,c1,c2) do { \
        BN_ULONG x_ = (a), y_ = (b); \
        BN_ULONG p_lo_, p_hi_; \
        BN_UMULT_LOHI(p_lo_,p_hi_,x_,y_); \
        c0 += p_lo_; \
        p_hi_ += (c0<p_lo_); \
        c1 += p_hi_; \
        c2 += (c1<p_hi_); \
        } while(0)

/* Same as mul_add_c, but the product is accumulated twice. */
# define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG x_ = (a), y_ = (b); \
        BN_ULONG p_lo_, p_hi_, hi2_; \
        BN_UMULT_LOHI(p_lo_,p_hi_,x_,y_); \
        c0 += p_lo_; \
        hi2_ = p_hi_ + (c0<p_lo_); \
        c1 += hi2_; \
        c2 += (c1<hi2_); \
        c0 += p_lo_; \
        p_hi_ += (c0<p_lo_); \
        c1 += p_hi_; \
        c2 += (c1<p_hi_); \
        } while(0)

/* Accumulate the square of a[i]. */
# define sqr_add_c(a,i,c0,c1,c2) do { \
        BN_ULONG x_ = (a)[i]; \
        BN_ULONG p_lo_, p_hi_; \
        BN_UMULT_LOHI(p_lo_,p_hi_,x_,x_); \
        c0 += p_lo_; \
        p_hi_ += (c0<p_lo_); \
        c1 += p_hi_; \
        c2 += (c1<p_hi_); \
        } while(0)

/* Accumulate a[i]*a[j] twice. */
# define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
515
516		# elif defined(BN_UMULT_HIGH)
/*
 * BN_UMULT_HIGH flavour of the comba accumulator macros.  The low product
 * word comes from a plain multiplication and the high word from
 * BN_UMULT_HIGH(); both are added into the three-word accumulator
 * (c2,c1,c0).
 *
 * Adding a carry bit to the high product word cannot overflow, because the
 * high word of a product is never all-ones.
 */
# define mul_add_c(a,b,c0,c1,c2) do { \
        BN_ULONG x_ = (a), y_ = (b); \
        BN_ULONG p_lo_ = x_ * y_; \
        BN_ULONG p_hi_ = BN_UMULT_HIGH(x_,y_); \
        c0 += p_lo_; \
        p_hi_ += (c0<p_lo_); \
        c1 += p_hi_; \
        c2 += (c1<p_hi_); \
        } while(0)

/* Same as mul_add_c, but the product is accumulated twice. */
# define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG x_ = (a), y_ = (b), hi2_; \
        BN_ULONG p_lo_ = x_ * y_; \
        BN_ULONG p_hi_ = BN_UMULT_HIGH(x_,y_); \
        c0 += p_lo_; \
        hi2_ = p_hi_ + (c0<p_lo_); \
        c1 += hi2_; \
        c2 += (c1<hi2_); \
        c0 += p_lo_; \
        p_hi_ += (c0<p_lo_); \
        c1 += p_hi_; \
        c2 += (c1<p_hi_); \
        } while(0)

/* Accumulate the square of a[i]. */
# define sqr_add_c(a,i,c0,c1,c2) do { \
        BN_ULONG x_ = (a)[i]; \
        BN_ULONG p_lo_ = x_ * x_; \
        BN_ULONG p_hi_ = BN_UMULT_HIGH(x_,x_); \
        c0 += p_lo_; \
        p_hi_ += (c0<p_lo_); \
        c1 += p_hi_; \
        c2 += (c1<p_hi_); \
        } while(0)

/* Accumulate a[i]*a[j] twice. */
# define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
549
550		# else /* !BN_LLONG */
/*
 * Fallback flavour of the comba accumulator macros, used when none of
 * BN_LLONG, BN_UMULT_LOHI or BN_UMULT_HIGH is defined.  The product words
 * come from the mul64()/sqr64() macros (defined outside this file), which
 * are fed LBITS()/HBITS() of the operands, and are then added into the
 * three-word accumulator (c2,c1,c0).
 *
 * Adding a carry bit to the high product word cannot overflow, because the
 * high word of a product is never all-ones.
 */
# define mul_add_c(a,b,c0,c1,c2) do { \
        BN_ULONG p_lo_ = LBITS(a), p_hi_ = HBITS(a); \
        BN_ULONG b_lo_ = LBITS(b), b_hi_ = HBITS(b); \
        mul64(p_lo_,p_hi_,b_lo_,b_hi_); \
        c0 = (c0+p_lo_)&BN_MASK2; \
        p_hi_ += (c0<p_lo_); \
        c1 = (c1+p_hi_)&BN_MASK2; \
        c2 += (c1<p_hi_); \
        } while(0)

/* Same as mul_add_c, but the product is accumulated twice. */
# define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG hi2_; \
        BN_ULONG p_lo_ = LBITS(a), p_hi_ = HBITS(a); \
        BN_ULONG b_lo_ = LBITS(b), b_hi_ = HBITS(b); \
        mul64(p_lo_,p_hi_,b_lo_,b_hi_); \
        hi2_ = p_hi_; \
        c0 = (c0+p_lo_)&BN_MASK2; \
        hi2_ += (c0<p_lo_); \
        c1 = (c1+hi2_)&BN_MASK2; \
        c2 += (c1<hi2_); \
        c0 = (c0+p_lo_)&BN_MASK2; \
        p_hi_ += (c0<p_lo_); \
        c1 = (c1+p_hi_)&BN_MASK2; \
        c2 += (c1<p_hi_); \
        } while(0)

/* Accumulate the square of a[i]. */
# define sqr_add_c(a,i,c0,c1,c2) do { \
        BN_ULONG p_lo_, p_hi_; \
        sqr64(p_lo_,p_hi_,(a)[i]); \
        c0 = (c0+p_lo_)&BN_MASK2; \
        p_hi_ += (c0<p_lo_); \
        c1 = (c1+p_hi_)&BN_MASK2; \
        c2 += (c1<p_hi_); \
        } while(0)

/* Accumulate a[i]*a[j] twice. */
# define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
584		# endif /* !BN_LLONG */
585
586		void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
587	30.1M	{
588	30.1M	BN_ULONG c1, c2, c3;
589
590	30.1M	c1 = 0;
591	30.1M	c2 = 0;
592	30.1M	c3 = 0;
593	30.1M	mul_add_c(a[0], b[0], c1, c2, c3);
594	30.1M	r[0] = c1;
595	30.1M	c1 = 0;
596	30.1M	mul_add_c(a[0], b[1], c2, c3, c1);
597	30.1M	mul_add_c(a[1], b[0], c2, c3, c1);
598	30.1M	r[1] = c2;
599	30.1M	c2 = 0;
600	30.1M	mul_add_c(a[2], b[0], c3, c1, c2);
601	30.1M	mul_add_c(a[1], b[1], c3, c1, c2);
602	30.1M	mul_add_c(a[0], b[2], c3, c1, c2);
603	30.1M	r[2] = c3;
604	30.1M	c3 = 0;
605	30.1M	mul_add_c(a[0], b[3], c1, c2, c3);
606	30.1M	mul_add_c(a[1], b[2], c1, c2, c3);
607	30.1M	mul_add_c(a[2], b[1], c1, c2, c3);
608	30.1M	mul_add_c(a[3], b[0], c1, c2, c3);
609	30.1M	r[3] = c1;
610	30.1M	c1 = 0;
611	30.1M	mul_add_c(a[4], b[0], c2, c3, c1);
612	30.1M	mul_add_c(a[3], b[1], c2, c3, c1);
613	30.1M	mul_add_c(a[2], b[2], c2, c3, c1);
614	30.1M	mul_add_c(a[1], b[3], c2, c3, c1);
615	30.1M	mul_add_c(a[0], b[4], c2, c3, c1);
616	30.1M	r[4] = c2;
617	30.1M	c2 = 0;
618	30.1M	mul_add_c(a[0], b[5], c3, c1, c2);
619	30.1M	mul_add_c(a[1], b[4], c3, c1, c2);
620	30.1M	mul_add_c(a[2], b[3], c3, c1, c2);
621	30.1M	mul_add_c(a[3], b[2], c3, c1, c2);
622	30.1M	mul_add_c(a[4], b[1], c3, c1, c2);
623	30.1M	mul_add_c(a[5], b[0], c3, c1, c2);
624	30.1M	r[5] = c3;
625	30.1M	c3 = 0;
626	30.1M	mul_add_c(a[6], b[0], c1, c2, c3);
627	30.1M	mul_add_c(a[5], b[1], c1, c2, c3);
628	30.1M	mul_add_c(a[4], b[2], c1, c2, c3);
629	30.1M	mul_add_c(a[3], b[3], c1, c2, c3);
630	30.1M	mul_add_c(a[2], b[4], c1, c2, c3);
631	30.1M	mul_add_c(a[1], b[5], c1, c2, c3);
632	30.1M	mul_add_c(a[0], b[6], c1, c2, c3);
633	30.1M	r[6] = c1;
634	30.1M	c1 = 0;
635	30.1M	mul_add_c(a[0], b[7], c2, c3, c1);
636	30.1M	mul_add_c(a[1], b[6], c2, c3, c1);
637	30.1M	mul_add_c(a[2], b[5], c2, c3, c1);
638	30.1M	mul_add_c(a[3], b[4], c2, c3, c1);
639	30.1M	mul_add_c(a[4], b[3], c2, c3, c1);
640	30.1M	mul_add_c(a[5], b[2], c2, c3, c1);
641	30.1M	mul_add_c(a[6], b[1], c2, c3, c1);
642	30.1M	mul_add_c(a[7], b[0], c2, c3, c1);
643	30.1M	r[7] = c2;
644	30.1M	c2 = 0;
645	30.1M	mul_add_c(a[7], b[1], c3, c1, c2);
646	30.1M	mul_add_c(a[6], b[2], c3, c1, c2);
647	30.1M	mul_add_c(a[5], b[3], c3, c1, c2);
648	30.1M	mul_add_c(a[4], b[4], c3, c1, c2);
649	30.1M	mul_add_c(a[3], b[5], c3, c1, c2);
650	30.1M	mul_add_c(a[2], b[6], c3, c1, c2);
651	30.1M	mul_add_c(a[1], b[7], c3, c1, c2);
652	30.1M	r[8] = c3;
653	30.1M	c3 = 0;
654	30.1M	mul_add_c(a[2], b[7], c1, c2, c3);
655	30.1M	mul_add_c(a[3], b[6], c1, c2, c3);
656	30.1M	mul_add_c(a[4], b[5], c1, c2, c3);
657	30.1M	mul_add_c(a[5], b[4], c1, c2, c3);
658	30.1M	mul_add_c(a[6], b[3], c1, c2, c3);
659	30.1M	mul_add_c(a[7], b[2], c1, c2, c3);
660	30.1M	r[9] = c1;
661	30.1M	c1 = 0;
662	30.1M	mul_add_c(a[7], b[3], c2, c3, c1);
663	30.1M	mul_add_c(a[6], b[4], c2, c3, c1);
664	30.1M	mul_add_c(a[5], b[5], c2, c3, c1);
665	30.1M	mul_add_c(a[4], b[6], c2, c3, c1);
666	30.1M	mul_add_c(a[3], b[7], c2, c3, c1);
667	30.1M	r[10] = c2;
668	30.1M	c2 = 0;
669	30.1M	mul_add_c(a[4], b[7], c3, c1, c2);
670	30.1M	mul_add_c(a[5], b[6], c3, c1, c2);
671	30.1M	mul_add_c(a[6], b[5], c3, c1, c2);
672	30.1M	mul_add_c(a[7], b[4], c3, c1, c2);
673	30.1M	r[11] = c3;
674	30.1M	c3 = 0;
675	30.1M	mul_add_c(a[7], b[5], c1, c2, c3);
676	30.1M	mul_add_c(a[6], b[6], c1, c2, c3);
677	30.1M	mul_add_c(a[5], b[7], c1, c2, c3);
678	30.1M	r[12] = c1;
679	30.1M	c1 = 0;
680	30.1M	mul_add_c(a[6], b[7], c2, c3, c1);
681	30.1M	mul_add_c(a[7], b[6], c2, c3, c1);
682	30.1M	r[13] = c2;
683	30.1M	c2 = 0;
684	30.1M	mul_add_c(a[7], b[7], c3, c1, c2);
685	30.1M	r[14] = c3;
686	30.1M	r[15] = c1;
687	30.1M	}
688
689		void bn_mul_comba4(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
690	0	{
691	0	BN_ULONG c1, c2, c3;
692
693	0	c1 = 0;
694	0	c2 = 0;
695	0	c3 = 0;
696	0	mul_add_c(a[0], b[0], c1, c2, c3);
697	0	r[0] = c1;
698	0	c1 = 0;
699	0	mul_add_c(a[0], b[1], c2, c3, c1);
700	0	mul_add_c(a[1], b[0], c2, c3, c1);
701	0	r[1] = c2;
702	0	c2 = 0;
703	0	mul_add_c(a[2], b[0], c3, c1, c2);
704	0	mul_add_c(a[1], b[1], c3, c1, c2);
705	0	mul_add_c(a[0], b[2], c3, c1, c2);
706	0	r[2] = c3;
707	0	c3 = 0;
708	0	mul_add_c(a[0], b[3], c1, c2, c3);
709	0	mul_add_c(a[1], b[2], c1, c2, c3);
710	0	mul_add_c(a[2], b[1], c1, c2, c3);
711	0	mul_add_c(a[3], b[0], c1, c2, c3);
712	0	r[3] = c1;
713	0	c1 = 0;
714	0	mul_add_c(a[3], b[1], c2, c3, c1);
715	0	mul_add_c(a[2], b[2], c2, c3, c1);
716	0	mul_add_c(a[1], b[3], c2, c3, c1);
717	0	r[4] = c2;
718	0	c2 = 0;
719	0	mul_add_c(a[2], b[3], c3, c1, c2);
720	0	mul_add_c(a[3], b[2], c3, c1, c2);
721	0	r[5] = c3;
722	0	c3 = 0;
723	0	mul_add_c(a[3], b[3], c1, c2, c3);
724	0	r[6] = c1;
725	0	r[7] = c2;
726	0	}
727
728		void bn_sqr_comba8(BN_ULONG r, const BN_ULONG a)
729	151M	{
730	151M	BN_ULONG c1, c2, c3;
731
732	151M	c1 = 0;
733	151M	c2 = 0;
734	151M	c3 = 0;
735	151M	sqr_add_c(a, 0, c1, c2, c3);
736	151M	r[0] = c1;
737	151M	c1 = 0;
738	151M	sqr_add_c2(a, 1, 0, c2, c3, c1);
739	151M	r[1] = c2;
740	151M	c2 = 0;
741	151M	sqr_add_c(a, 1, c3, c1, c2);
742	151M	sqr_add_c2(a, 2, 0, c3, c1, c2);
743	151M	r[2] = c3;
744	151M	c3 = 0;
745	151M	sqr_add_c2(a, 3, 0, c1, c2, c3);
746	151M	sqr_add_c2(a, 2, 1, c1, c2, c3);
747	151M	r[3] = c1;
748	151M	c1 = 0;
749	151M	sqr_add_c(a, 2, c2, c3, c1);
750	151M	sqr_add_c2(a, 3, 1, c2, c3, c1);
751	151M	sqr_add_c2(a, 4, 0, c2, c3, c1);
752	151M	r[4] = c2;
753	151M	c2 = 0;
754	151M	sqr_add_c2(a, 5, 0, c3, c1, c2);
755	151M	sqr_add_c2(a, 4, 1, c3, c1, c2);
756	151M	sqr_add_c2(a, 3, 2, c3, c1, c2);
757	151M	r[5] = c3;
758	151M	c3 = 0;
759	151M	sqr_add_c(a, 3, c1, c2, c3);
760	151M	sqr_add_c2(a, 4, 2, c1, c2, c3);
761	151M	sqr_add_c2(a, 5, 1, c1, c2, c3);
762	151M	sqr_add_c2(a, 6, 0, c1, c2, c3);
763	151M	r[6] = c1;
764	151M	c1 = 0;
765	151M	sqr_add_c2(a, 7, 0, c2, c3, c1);
766	151M	sqr_add_c2(a, 6, 1, c2, c3, c1);
767	151M	sqr_add_c2(a, 5, 2, c2, c3, c1);
768	151M	sqr_add_c2(a, 4, 3, c2, c3, c1);
769	151M	r[7] = c2;
770	151M	c2 = 0;
771	151M	sqr_add_c(a, 4, c3, c1, c2);
772	151M	sqr_add_c2(a, 5, 3, c3, c1, c2);
773	151M	sqr_add_c2(a, 6, 2, c3, c1, c2);
774	151M	sqr_add_c2(a, 7, 1, c3, c1, c2);
775	151M	r[8] = c3;
776	151M	c3 = 0;
777	151M	sqr_add_c2(a, 7, 2, c1, c2, c3);
778	151M	sqr_add_c2(a, 6, 3, c1, c2, c3);
779	151M	sqr_add_c2(a, 5, 4, c1, c2, c3);
780	151M	r[9] = c1;
781	151M	c1 = 0;
782	151M	sqr_add_c(a, 5, c2, c3, c1);
783	151M	sqr_add_c2(a, 6, 4, c2, c3, c1);
784	151M	sqr_add_c2(a, 7, 3, c2, c3, c1);
785	151M	r[10] = c2;
786	151M	c2 = 0;
787	151M	sqr_add_c2(a, 7, 4, c3, c1, c2);
788	151M	sqr_add_c2(a, 6, 5, c3, c1, c2);
789	151M	r[11] = c3;
790	151M	c3 = 0;
791	151M	sqr_add_c(a, 6, c1, c2, c3);
792	151M	sqr_add_c2(a, 7, 5, c1, c2, c3);
793	151M	r[12] = c1;
794	151M	c1 = 0;
795	151M	sqr_add_c2(a, 7, 6, c2, c3, c1);
796	151M	r[13] = c2;
797	151M	c2 = 0;
798	151M	sqr_add_c(a, 7, c3, c1, c2);
799	151M	r[14] = c3;
800	151M	r[15] = c1;
801	151M	}
802
803		void bn_sqr_comba4(BN_ULONG r, const BN_ULONG a)
804	510k	{
805	510k	BN_ULONG c1, c2, c3;
806
807	510k	c1 = 0;
808	510k	c2 = 0;
809	510k	c3 = 0;
810	510k	sqr_add_c(a, 0, c1, c2, c3);
811	510k	r[0] = c1;
812	510k	c1 = 0;
813	510k	sqr_add_c2(a, 1, 0, c2, c3, c1);
814	510k	r[1] = c2;
815	510k	c2 = 0;
816	510k	sqr_add_c(a, 1, c3, c1, c2);
817	510k	sqr_add_c2(a, 2, 0, c3, c1, c2);
818	510k	r[2] = c3;
819	510k	c3 = 0;
820	510k	sqr_add_c2(a, 3, 0, c1, c2, c3);
821	510k	sqr_add_c2(a, 2, 1, c1, c2, c3);
822	510k	r[3] = c1;
823	510k	c1 = 0;
824	510k	sqr_add_c(a, 2, c2, c3, c1);
825	510k	sqr_add_c2(a, 3, 1, c2, c3, c1);
826	510k	r[4] = c2;
827	510k	c2 = 0;
828	510k	sqr_add_c2(a, 3, 2, c3, c1, c2);
829	510k	r[5] = c3;
830	510k	c3 = 0;
831	510k	sqr_add_c(a, 3, c1, c2, c3);
832	510k	r[6] = c1;
833	510k	r[7] = c2;
834	510k	}
835
836		# ifdef OPENSSL_NO_ASM
837		# ifdef OPENSSL_BN_ASM_MONT
838		# include <alloca.h>
839		/*
840		* This is essentially reference implementation, which may or may not
841		* result in performance improvement. E.g. on IA-32 this routine was
842		* observed to give 40% faster rsa1024 private key operations and 10%
843		* faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
844		* by 10% and worsens rsa4096 sign by 15%. Once again, it's a
845		* reference implementation, one to be used as starting point for
846		* platform-specific assembler. Mentioned numbers apply to compiler
847		* generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
848		* can vary not only from platform to platform, but even for compiler
849		* versions. Assembler vs. assembler improvement coefficients can
850		* [and are known to] differ and are to be documented elsewhere.
851		*/
852		int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
853		const BN_ULONG np, const BN_ULONG n0p, int num)
854		{
855		BN_ULONG c0, c1, ml, *tp, n0;
856		# ifdef mul64
857		BN_ULONG mh;
858		# endif
859		volatile BN_ULONG *vp;
860		int i = 0, j;
861
862		# if 0 /* template for platform-specific
863		* implementation */
864		if (ap == bp)
865		return bn_sqr_mont(rp, ap, np, n0p, num);
866		# endif
867		vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
868
869		n0 = *n0p;
870
871		c0 = 0;
872		ml = bp[0];
873		# ifdef mul64
874		mh = HBITS(ml);
875		ml = LBITS(ml);
876		for (j = 0; j < num; ++j)
877		mul(tp[j], ap[j], ml, mh, c0);
878		# else
879		for (j = 0; j < num; ++j)
880		mul(tp[j], ap[j], ml, c0);
881		# endif
882
883		tp[num] = c0;
884		tp[num + 1] = 0;
885		goto enter;
886
887		for (i = 0; i < num; i++) {
888		c0 = 0;
889		ml = bp[i];
890		# ifdef mul64
891		mh = HBITS(ml);
892		ml = LBITS(ml);
893		for (j = 0; j < num; ++j)
894		mul_add(tp[j], ap[j], ml, mh, c0);
895		# else
896		for (j = 0; j < num; ++j)
897		mul_add(tp[j], ap[j], ml, c0);
898		# endif
899		c1 = (tp[num] + c0) & BN_MASK2;
900		tp[num] = c1;
901		tp[num + 1] = (c1 < c0 ? 1 : 0);
902		enter:
903		c1 = tp[0];
904		ml = (c1 * n0) & BN_MASK2;
905		c0 = 0;
906		# ifdef mul64
907		mh = HBITS(ml);
908		ml = LBITS(ml);
909		mul_add(c1, np[0], ml, mh, c0);
910		# else
911		mul_add(c1, ml, np[0], c0);
912		# endif
913		for (j = 1; j < num; j++) {
914		c1 = tp[j];
915		# ifdef mul64
916		mul_add(c1, np[j], ml, mh, c0);
917		# else
918		mul_add(c1, ml, np[j], c0);
919		# endif
920		tp[j - 1] = c1 & BN_MASK2;
921		}
922		c1 = (tp[num] + c0) & BN_MASK2;
923		tp[num - 1] = c1;
924		tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
925		}
926
927		if (tp[num] != 0 \|\| tp[num - 1] >= np[num - 1]) {
928		c0 = bn_sub_words(rp, tp, np, num);
929		if (tp[num] != 0 \|\| c0 == 0) {
930		for (i = 0; i < num + 2; i++)
931		vp[i] = 0;
932		return 1;
933		}
934		}
935		for (i = 0; i < num; i++)
936		rp[i] = tp[i], vp[i] = 0;
937		vp[num] = 0;
938		vp[num + 1] = 0;
939		return 1;
940		}
941		# else
942		/*
943		* Return value of 0 indicates that multiplication/convolution was not
944		* performed to signal the caller to fall down to alternative/original
945		* code-path.
946		*/
947		int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
948		const BN_ULONG np, const BN_ULONG n0, int num)
949		{
950		return 0;
951		}
952		# endif /* OPENSSL_BN_ASM_MONT */
953		# endif
954
955		#else /* !BN_MUL_COMBA */
956
957		/* hmm... is it faster just to do a multiply? */
958		void bn_sqr_comba4(BN_ULONG r, const BN_ULONG a)
959		{
960		BN_ULONG t[8];
961		bn_sqr_normal(r, a, 4, t);
962		}
963
964		void bn_sqr_comba8(BN_ULONG r, const BN_ULONG a)
965		{
966		BN_ULONG t[16];
967		bn_sqr_normal(r, a, 8, t);
968		}
969
970		void bn_mul_comba4(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
971		{
972		r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
973		r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
974		r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
975		r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
976		}
977
978		void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
979		{
980		r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
981		r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
982		r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
983		r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
984		r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
985		r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
986		r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
987		r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
988		}
989
990		# ifdef OPENSSL_NO_ASM
991		# ifdef OPENSSL_BN_ASM_MONT
992		# include <alloca.h>
993		int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
994		const BN_ULONG np, const BN_ULONG n0p, int num)
995		{
996		BN_ULONG c0, c1, tp, n0 = n0p;
997		volatile BN_ULONG *vp;
998		int i = 0, j;
999
1000		vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
1001
1002		for (i = 0; i <= num; i++)
1003		tp[i] = 0;
1004
1005		for (i = 0; i < num; i++) {
1006		c0 = bn_mul_add_words(tp, ap, num, bp[i]);
1007		c1 = (tp[num] + c0) & BN_MASK2;
1008		tp[num] = c1;
1009		tp[num + 1] = (c1 < c0 ? 1 : 0);
1010
1011		c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
1012		c1 = (tp[num] + c0) & BN_MASK2;
1013		tp[num] = c1;
1014		tp[num + 1] += (c1 < c0 ? 1 : 0);
1015		for (j = 0; j <= num; j++)
1016		tp[j] = tp[j + 1];
1017		}
1018
1019		if (tp[num] != 0 \|\| tp[num - 1] >= np[num - 1]) {
1020		c0 = bn_sub_words(rp, tp, np, num);
1021		if (tp[num] != 0 \|\| c0 == 0) {
1022		for (i = 0; i < num + 2; i++)
1023		vp[i] = 0;
1024		return 1;
1025		}
1026		}
1027		for (i = 0; i < num; i++)
1028		rp[i] = tp[i], vp[i] = 0;
1029		vp[num] = 0;
1030		vp[num + 1] = 0;
1031		return 1;
1032		}
1033		# else
1034		int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
1035		const BN_ULONG np, const BN_ULONG n0, int num)
1036		{
1037		return 0;
1038		}
1039		# endif /* OPENSSL_BN_ASM_MONT */
1040		# endif
1041
1042		#endif /* !BN_MUL_COMBA */

Coverage Report

Created: 2024-11-21 07:03