/src/openssl/crypto/bn/bn_asm.c

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright 1995-2023 The OpenSSL Project Authors. All Rights Reserved.
3		*
4		* Licensed under the Apache License 2.0 (the "License"). You may not use
5		* this file except in compliance with the License. You can obtain a copy
6		* in the file LICENSE in the source distribution or at
7		* https://www.openssl.org/source/license.html
8		*/
9
10		#include <assert.h>
11		#include <openssl/crypto.h>
12		#include "internal/cryptlib.h"
13		#include "bn_local.h"
14
15		#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
16
17		BN_ULONG bn_mul_add_words(BN_ULONG rp, const BN_ULONG ap, int num,
18		BN_ULONG w)
19		{
20		BN_ULONG c1 = 0;
21
22		assert(num >= 0);
23		if (num <= 0)
24		return c1;
25
26		# ifndef OPENSSL_SMALL_FOOTPRINT
27		while (num & ~3) {
28		mul_add(rp[0], ap[0], w, c1);
29		mul_add(rp[1], ap[1], w, c1);
30		mul_add(rp[2], ap[2], w, c1);
31		mul_add(rp[3], ap[3], w, c1);
32		ap += 4;
33		rp += 4;
34		num -= 4;
35		}
36		# endif
37		while (num) {
38		mul_add(rp[0], ap[0], w, c1);
39		ap++;
40		rp++;
41		num--;
42		}
43
44		return c1;
45		}
46
47		BN_ULONG bn_mul_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
48		{
49		BN_ULONG c1 = 0;
50
51		assert(num >= 0);
52		if (num <= 0)
53		return c1;
54
55		# ifndef OPENSSL_SMALL_FOOTPRINT
56		while (num & ~3) {
57		mul(rp[0], ap[0], w, c1);
58		mul(rp[1], ap[1], w, c1);
59		mul(rp[2], ap[2], w, c1);
60		mul(rp[3], ap[3], w, c1);
61		ap += 4;
62		rp += 4;
63		num -= 4;
64		}
65		# endif
66		while (num) {
67		mul(rp[0], ap[0], w, c1);
68		ap++;
69		rp++;
70		num--;
71		}
72		return c1;
73		}
74
75		void bn_sqr_words(BN_ULONG r, const BN_ULONG a, int n)
76		{
77		assert(n >= 0);
78		if (n <= 0)
79		return;
80
81		# ifndef OPENSSL_SMALL_FOOTPRINT
82		while (n & ~3) {
83		sqr(r[0], r[1], a[0]);
84		sqr(r[2], r[3], a[1]);
85		sqr(r[4], r[5], a[2]);
86		sqr(r[6], r[7], a[3]);
87		a += 4;
88		r += 8;
89		n -= 4;
90		}
91		# endif
92		while (n) {
93		sqr(r[0], r[1], a[0]);
94		a++;
95		r += 2;
96		n--;
97		}
98		}
99
100		#else /* !(defined(BN_LLONG) ||
101		* defined(BN_UMULT_HIGH)) */
102
103		BN_ULONG bn_mul_add_words(BN_ULONG rp, const BN_ULONG ap, int num,
104		BN_ULONG w)
105	0	{
106	0	BN_ULONG c = 0;
107	0	BN_ULONG bl, bh;
108
109	0	assert(num >= 0);
110	0	if (num <= 0)
111	0	return (BN_ULONG)0;
112
113	0	bl = LBITS(w);
114	0	bh = HBITS(w);
115
116	0	# ifndef OPENSSL_SMALL_FOOTPRINT
117	0	while (num & ~3) {
118	0	mul_add(rp[0], ap[0], bl, bh, c);
119	0	mul_add(rp[1], ap[1], bl, bh, c);
120	0	mul_add(rp[2], ap[2], bl, bh, c);
121	0	mul_add(rp[3], ap[3], bl, bh, c);
122	0	ap += 4;
123	0	rp += 4;
124	0	num -= 4;
125	0	}
126	0	# endif
127	0	while (num) {
128	0	mul_add(rp[0], ap[0], bl, bh, c);
129	0	ap++;
130	0	rp++;
131	0	num--;
132	0	}
133	0	return c;
134	0	}
135
136		BN_ULONG bn_mul_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
137	0	{
138	0	BN_ULONG carry = 0;
139	0	BN_ULONG bl, bh;
140
141	0	assert(num >= 0);
142	0	if (num <= 0)
143	0	return (BN_ULONG)0;
144
145	0	bl = LBITS(w);
146	0	bh = HBITS(w);
147
148	0	# ifndef OPENSSL_SMALL_FOOTPRINT
149	0	while (num & ~3) {
150	0	mul(rp[0], ap[0], bl, bh, carry);
151	0	mul(rp[1], ap[1], bl, bh, carry);
152	0	mul(rp[2], ap[2], bl, bh, carry);
153	0	mul(rp[3], ap[3], bl, bh, carry);
154	0	ap += 4;
155	0	rp += 4;
156	0	num -= 4;
157	0	}
158	0	# endif
159	0	while (num) {
160	0	mul(rp[0], ap[0], bl, bh, carry);
161	0	ap++;
162	0	rp++;
163	0	num--;
164	0	}
165	0	return carry;
166	0	}
167
168		void bn_sqr_words(BN_ULONG r, const BN_ULONG a, int n)
169	0	{
170	0	assert(n >= 0);
171	0	if (n <= 0)
172	0	return;
173
174	0	# ifndef OPENSSL_SMALL_FOOTPRINT
175	0	while (n & ~3) {
176	0	sqr64(r[0], r[1], a[0]);
177	0	sqr64(r[2], r[3], a[1]);
178	0	sqr64(r[4], r[5], a[2]);
179	0	sqr64(r[6], r[7], a[3]);
180	0	a += 4;
181	0	r += 8;
182	0	n -= 4;
183	0	}
184	0	# endif
185	0	while (n) {
186	0	sqr64(r[0], r[1], a[0]);
187	0	a++;
188	0	r += 2;
189	0	n--;
190	0	}
191	0	}
192
193		#endif /* !(defined(BN_LLONG) ||
194		* defined(BN_UMULT_HIGH)) */
195
196		#if defined(BN_LLONG) && defined(BN_DIV2W)
197
198		BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
199		{
200		return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) \| l) / (BN_ULLONG) d));
201		}
202
203		#else
204
205		/* Divide h,l by d and return the result. */
206		/* I need to test this some more :-( */
207		BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
208	0	{
209	0	BN_ULONG dh, dl, q, ret = 0, th, tl, t;
210	0	int i, count = 2;
211
212	0	if (d == 0)
213	0	return BN_MASK2;
214
215	0	i = BN_num_bits_word(d);
216	0	assert((i == BN_BITS2) \|\| (h <= (BN_ULONG)1 << i));
217
218	0	i = BN_BITS2 - i;
219	0	if (h >= d)
220	0	h -= d;
221
222	0	if (i) {
223	0	d <<= i;
224	0	h = (h << i) \| (l >> (BN_BITS2 - i));
225	0	l <<= i;
226	0	}
227	0	dh = (d & BN_MASK2h) >> BN_BITS4;
228	0	dl = (d & BN_MASK2l);
229	0	for (;;) {
230	0	if ((h >> BN_BITS4) == dh)
231	0	q = BN_MASK2l;
232	0	else
233	0	q = h / dh;
234
235	0	th = q * dh;
236	0	tl = dl * q;
237	0	for (;;) {
238	0	t = h - th;
239	0	if ((t & BN_MASK2h) \|\|
240	0	((tl) <= ((t << BN_BITS4) \| ((l & BN_MASK2h) >> BN_BITS4))))
241	0	break;
242	0	q--;
243	0	th -= dh;
244	0	tl -= dl;
245	0	}
246	0	t = (tl >> BN_BITS4);
247	0	tl = (tl << BN_BITS4) & BN_MASK2h;
248	0	th += t;
249
250	0	if (l < tl)
251	0	th++;
252	0	l -= tl;
253	0	if (h < th) {
254	0	h += d;
255	0	q--;
256	0	}
257	0	h -= th;
258
259	0	if (--count == 0)
260	0	break;
261
262	0	ret = q << BN_BITS4;
263	0	h = ((h << BN_BITS4) \| (l >> BN_BITS4)) & BN_MASK2;
264	0	l = (l & BN_MASK2l) << BN_BITS4;
265	0	}
266	0	ret \|= q;
267	0	return ret;
268	0	}
269		#endif /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
270
271		#ifdef BN_LLONG
272		BN_ULONG bn_add_words(BN_ULONG r, const BN_ULONG a, const BN_ULONG *b,
273		int n)
274		{
275		BN_ULLONG ll = 0;
276
277		assert(n >= 0);
278		if (n <= 0)
279		return (BN_ULONG)0;
280
281		# ifndef OPENSSL_SMALL_FOOTPRINT
282		while (n & ~3) {
283		ll += (BN_ULLONG) a[0] + b[0];
284		r[0] = (BN_ULONG)ll & BN_MASK2;
285		ll >>= BN_BITS2;
286		ll += (BN_ULLONG) a[1] + b[1];
287		r[1] = (BN_ULONG)ll & BN_MASK2;
288		ll >>= BN_BITS2;
289		ll += (BN_ULLONG) a[2] + b[2];
290		r[2] = (BN_ULONG)ll & BN_MASK2;
291		ll >>= BN_BITS2;
292		ll += (BN_ULLONG) a[3] + b[3];
293		r[3] = (BN_ULONG)ll & BN_MASK2;
294		ll >>= BN_BITS2;
295		a += 4;
296		b += 4;
297		r += 4;
298		n -= 4;
299		}
300		# endif
301		while (n) {
302		ll += (BN_ULLONG) a[0] + b[0];
303		r[0] = (BN_ULONG)ll & BN_MASK2;
304		ll >>= BN_BITS2;
305		a++;
306		b++;
307		r++;
308		n--;
309		}
310		return (BN_ULONG)ll;
311		}
312		#else /* !BN_LLONG */
313		BN_ULONG bn_add_words(BN_ULONG r, const BN_ULONG a, const BN_ULONG *b,
314		int n)
315	0	{
316	0	BN_ULONG c, l, t;
317
318	0	assert(n >= 0);
319	0	if (n <= 0)
320	0	return (BN_ULONG)0;
321
322	0	c = 0;
323	0	# ifndef OPENSSL_SMALL_FOOTPRINT
324	0	while (n & ~3) {
325	0	t = a[0];
326	0	t = (t + c) & BN_MASK2;
327	0	c = (t < c);
328	0	l = (t + b[0]) & BN_MASK2;
329	0	c += (l < t);
330	0	r[0] = l;
331	0	t = a[1];
332	0	t = (t + c) & BN_MASK2;
333	0	c = (t < c);
334	0	l = (t + b[1]) & BN_MASK2;
335	0	c += (l < t);
336	0	r[1] = l;
337	0	t = a[2];
338	0	t = (t + c) & BN_MASK2;
339	0	c = (t < c);
340	0	l = (t + b[2]) & BN_MASK2;
341	0	c += (l < t);
342	0	r[2] = l;
343	0	t = a[3];
344	0	t = (t + c) & BN_MASK2;
345	0	c = (t < c);
346	0	l = (t + b[3]) & BN_MASK2;
347	0	c += (l < t);
348	0	r[3] = l;
349	0	a += 4;
350	0	b += 4;
351	0	r += 4;
352	0	n -= 4;
353	0	}
354	0	# endif
355	0	while (n) {
356	0	t = a[0];
357	0	t = (t + c) & BN_MASK2;
358	0	c = (t < c);
359	0	l = (t + b[0]) & BN_MASK2;
360	0	c += (l < t);
361	0	r[0] = l;
362	0	a++;
363	0	b++;
364	0	r++;
365	0	n--;
366	0	}
367	0	return (BN_ULONG)c;
368	0	}
369		#endif /* !BN_LLONG */
370
371		BN_ULONG bn_sub_words(BN_ULONG r, const BN_ULONG a, const BN_ULONG *b,
372		int n)
373	0	{
374	0	BN_ULONG t1, t2;
375	0	int c = 0;
376
377	0	assert(n >= 0);
378	0	if (n <= 0)
379	0	return (BN_ULONG)0;
380
381	0	#ifndef OPENSSL_SMALL_FOOTPRINT
382	0	while (n & ~3) {
383	0	t1 = a[0];
384	0	t2 = (t1 - c) & BN_MASK2;
385	0	c = (t2 > t1);
386	0	t1 = b[0];
387	0	t1 = (t2 - t1) & BN_MASK2;
388	0	r[0] = t1;
389	0	c += (t1 > t2);
390	0	t1 = a[1];
391	0	t2 = (t1 - c) & BN_MASK2;
392	0	c = (t2 > t1);
393	0	t1 = b[1];
394	0	t1 = (t2 - t1) & BN_MASK2;
395	0	r[1] = t1;
396	0	c += (t1 > t2);
397	0	t1 = a[2];
398	0	t2 = (t1 - c) & BN_MASK2;
399	0	c = (t2 > t1);
400	0	t1 = b[2];
401	0	t1 = (t2 - t1) & BN_MASK2;
402	0	r[2] = t1;
403	0	c += (t1 > t2);
404	0	t1 = a[3];
405	0	t2 = (t1 - c) & BN_MASK2;
406	0	c = (t2 > t1);
407	0	t1 = b[3];
408	0	t1 = (t2 - t1) & BN_MASK2;
409	0	r[3] = t1;
410	0	c += (t1 > t2);
411	0	a += 4;
412	0	b += 4;
413	0	r += 4;
414	0	n -= 4;
415	0	}
416	0	#endif
417	0	while (n) {
418	0	t1 = a[0];
419	0	t2 = (t1 - c) & BN_MASK2;
420	0	c = (t2 > t1);
421	0	t1 = b[0];
422	0	t1 = (t2 - t1) & BN_MASK2;
423	0	r[0] = t1;
424	0	c += (t1 > t2);
425	0	a++;
426	0	b++;
427	0	r++;
428	0	n--;
429	0	}
430	0	return c;
431	0	}
432
433		#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
434
435		/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
436		/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
437		/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
438		/*
439		* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
440		* c=(c2,c1,c0)
441		*/
442
443		# ifdef BN_LLONG
444		/*
445		* Keep in mind that additions to multiplication result can not
446		* overflow, because its high half cannot be all-ones.
447		*/
/* (c2,c1,c0) += a*b, with the product formed in a BN_ULLONG. */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG hi;                            \
        BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi);   \
        } while(0)

/*
 * (c2,c1,c0) += 2*a*b. The product is accumulated twice instead of being
 * doubled: |tt| carries the first accumulation, |t| the second.
 */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG hi;                            \
        BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
        BN_ULLONG tt = t+c0;    /* no carry */  \
        c0 = (BN_ULONG)Lw(tt);                  \
        hi = (BN_ULONG)Hw(tt);                  \
        c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi);   \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi);   \
        } while(0)

/*
 * (c2,c1,c0) += a[i]^2. The array argument is parenthesized so that any
 * pointer expression can be passed, matching the other variants.
 */
#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG hi;                            \
        BN_ULLONG t = (BN_ULLONG)(a)[i]*(a)[i]; \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi);   \
        } while(0)

/* (c2,c1,c0) += 2*a[i]*a[j]. */
#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
481
482		# elif defined(BN_UMULT_LOHI)
483		/*
484		* Keep in mind that additions to hi can not overflow, because
485		* the high word of a multiplication result cannot be all-ones.
486		*/
/* (c2,c1,c0) += a*b, with both product halves from BN_UMULT_LOHI. */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG fa = (a), fb = (b);            \
        BN_ULONG plo, phi;                      \
        BN_UMULT_LOHI(plo,phi,fa,fb);           \
        c0 += plo;                              \
        phi += (c0<plo);                        \
        c1 += phi;                              \
        c2 += (c1<phi);                         \
        } while(0)

/*
 * (c2,c1,c0) += 2*a*b. The product is added in twice; |dbl| is the copy of
 * the high half used for the first pass.
 */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG fa = (a), fb = (b);            \
        BN_ULONG plo, phi, dbl;                 \
        BN_UMULT_LOHI(plo,phi,fa,fb);           \
        c0 += plo;                              \
        dbl = phi + (c0<plo);                   \
        c1 += dbl;                              \
        c2 += (c1<dbl);                         \
        c0 += plo;                              \
        phi += (c0<plo);                        \
        c1 += phi;                              \
        c2 += (c1<phi);                         \
        } while(0)

/* (c2,c1,c0) += a[i]^2. */
#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG fa = (a)[i];                   \
        BN_ULONG plo, phi;                      \
        BN_UMULT_LOHI(plo,phi,fa,fa);           \
        c0 += plo;                              \
        phi += (c0<plo);                        \
        c1 += phi;                              \
        c2 += (c1<phi);                         \
        } while(0)

/* (c2,c1,c0) += 2*a[i]*a[j]. */
#  define sqr_add_c2(a,i,j,c0,c1,c2)    \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
515
516		# elif defined(BN_UMULT_HIGH)
517		/*
518		* Keep in mind that additions to hi can not overflow, because
519		* the high word of a multiplication result cannot be all-ones.
520		*/
/*
 * (c2,c1,c0) += a*b. The low half is a plain word multiply, the high half
 * comes from BN_UMULT_HIGH.
 */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG fa = (a), fb = (b);            \
        BN_ULONG plo = fa * fb;                 \
        BN_ULONG phi = BN_UMULT_HIGH(fa,fb);    \
        c0 += plo;                              \
        phi += (c0<plo);                        \
        c1 += phi;                              \
        c2 += (c1<phi);                         \
        } while(0)

/*
 * (c2,c1,c0) += 2*a*b. The product is added in twice; |dbl| is the copy of
 * the high half used for the first pass.
 */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG fa = (a), fb = (b), dbl;       \
        BN_ULONG plo = fa * fb;                 \
        BN_ULONG phi = BN_UMULT_HIGH(fa,fb);    \
        c0 += plo;                              \
        dbl = phi + (c0<plo);                   \
        c1 += dbl;                              \
        c2 += (c1<dbl);                         \
        c0 += plo;                              \
        phi += (c0<plo);                        \
        c1 += phi;                              \
        c2 += (c1<phi);                         \
        } while(0)

/* (c2,c1,c0) += a[i]^2. */
#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG fa = (a)[i];                   \
        BN_ULONG plo = fa * fa;                 \
        BN_ULONG phi = BN_UMULT_HIGH(fa,fa);    \
        c0 += plo;                              \
        phi += (c0<plo);                        \
        c1 += phi;                              \
        c2 += (c1<phi);                         \
        } while(0)

/* (c2,c1,c0) += 2*a[i]*a[j]. */
#  define sqr_add_c2(a,i,j,c0,c1,c2)    \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
549
550		# else /* !BN_LLONG */
551		/*
552		* Keep in mind that additions to hi can not overflow, because
553		* the high word of a multiplication result cannot be all-ones.
554		*/
/*
 * (c2,c1,c0) += a*b using half-word arithmetic. |plo|/|phi| start as the
 * halves of |a|; mul64() is given them together with the halves of |b| and
 * is expected to leave the two product words in them (defined outside this
 * file).
 */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG plo = LBITS(a), phi = HBITS(a); \
        BN_ULONG qlo = LBITS(b), qhi = HBITS(b); \
        mul64(plo,phi,qlo,qhi);                 \
        c0 = (c0+plo)&BN_MASK2;                 \
        phi += (c0<plo);                        \
        c1 = (c1+phi)&BN_MASK2;                 \
        c2 += (c1<phi);                         \
        } while(0)

/*
 * (c2,c1,c0) += 2*a*b. The product is added in twice; |dbl| is the copy of
 * the high word used for the first pass.
 */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG dbl;                           \
        BN_ULONG plo = LBITS(a), phi = HBITS(a); \
        BN_ULONG qlo = LBITS(b), qhi = HBITS(b); \
        mul64(plo,phi,qlo,qhi);                 \
        dbl = phi;                              \
        c0 = (c0+plo)&BN_MASK2;                 \
        dbl += (c0<plo);                        \
        c1 = (c1+dbl)&BN_MASK2;                 \
        c2 += (c1<dbl);                         \
        c0 = (c0+plo)&BN_MASK2;                 \
        phi += (c0<plo);                        \
        c1 = (c1+phi)&BN_MASK2;                 \
        c2 += (c1<phi);                         \
        } while(0)

/* (c2,c1,c0) += a[i]^2, with the square produced by sqr64(). */
#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG plo, phi;                      \
        sqr64(plo,phi,(a)[i]);                  \
        c0 = (c0+plo)&BN_MASK2;                 \
        phi += (c0<plo);                        \
        c1 = (c1+phi)&BN_MASK2;                 \
        c2 += (c1<phi);                         \
        } while(0)

/* (c2,c1,c0) += 2*a[i]*a[j]. */
#  define sqr_add_c2(a,i,j,c0,c1,c2)    \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
584		# endif /* !BN_LLONG */
585
586		void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
587	0	{
588	0	BN_ULONG c1, c2, c3;
589
590	0	c1 = 0;
591	0	c2 = 0;
592	0	c3 = 0;
593	0	mul_add_c(a[0], b[0], c1, c2, c3);
594	0	r[0] = c1;
595	0	c1 = 0;
596	0	mul_add_c(a[0], b[1], c2, c3, c1);
597	0	mul_add_c(a[1], b[0], c2, c3, c1);
598	0	r[1] = c2;
599	0	c2 = 0;
600	0	mul_add_c(a[2], b[0], c3, c1, c2);
601	0	mul_add_c(a[1], b[1], c3, c1, c2);
602	0	mul_add_c(a[0], b[2], c3, c1, c2);
603	0	r[2] = c3;
604	0	c3 = 0;
605	0	mul_add_c(a[0], b[3], c1, c2, c3);
606	0	mul_add_c(a[1], b[2], c1, c2, c3);
607	0	mul_add_c(a[2], b[1], c1, c2, c3);
608	0	mul_add_c(a[3], b[0], c1, c2, c3);
609	0	r[3] = c1;
610	0	c1 = 0;
611	0	mul_add_c(a[4], b[0], c2, c3, c1);
612	0	mul_add_c(a[3], b[1], c2, c3, c1);
613	0	mul_add_c(a[2], b[2], c2, c3, c1);
614	0	mul_add_c(a[1], b[3], c2, c3, c1);
615	0	mul_add_c(a[0], b[4], c2, c3, c1);
616	0	r[4] = c2;
617	0	c2 = 0;
618	0	mul_add_c(a[0], b[5], c3, c1, c2);
619	0	mul_add_c(a[1], b[4], c3, c1, c2);
620	0	mul_add_c(a[2], b[3], c3, c1, c2);
621	0	mul_add_c(a[3], b[2], c3, c1, c2);
622	0	mul_add_c(a[4], b[1], c3, c1, c2);
623	0	mul_add_c(a[5], b[0], c3, c1, c2);
624	0	r[5] = c3;
625	0	c3 = 0;
626	0	mul_add_c(a[6], b[0], c1, c2, c3);
627	0	mul_add_c(a[5], b[1], c1, c2, c3);
628	0	mul_add_c(a[4], b[2], c1, c2, c3);
629	0	mul_add_c(a[3], b[3], c1, c2, c3);
630	0	mul_add_c(a[2], b[4], c1, c2, c3);
631	0	mul_add_c(a[1], b[5], c1, c2, c3);
632	0	mul_add_c(a[0], b[6], c1, c2, c3);
633	0	r[6] = c1;
634	0	c1 = 0;
635	0	mul_add_c(a[0], b[7], c2, c3, c1);
636	0	mul_add_c(a[1], b[6], c2, c3, c1);
637	0	mul_add_c(a[2], b[5], c2, c3, c1);
638	0	mul_add_c(a[3], b[4], c2, c3, c1);
639	0	mul_add_c(a[4], b[3], c2, c3, c1);
640	0	mul_add_c(a[5], b[2], c2, c3, c1);
641	0	mul_add_c(a[6], b[1], c2, c3, c1);
642	0	mul_add_c(a[7], b[0], c2, c3, c1);
643	0	r[7] = c2;
644	0	c2 = 0;
645	0	mul_add_c(a[7], b[1], c3, c1, c2);
646	0	mul_add_c(a[6], b[2], c3, c1, c2);
647	0	mul_add_c(a[5], b[3], c3, c1, c2);
648	0	mul_add_c(a[4], b[4], c3, c1, c2);
649	0	mul_add_c(a[3], b[5], c3, c1, c2);
650	0	mul_add_c(a[2], b[6], c3, c1, c2);
651	0	mul_add_c(a[1], b[7], c3, c1, c2);
652	0	r[8] = c3;
653	0	c3 = 0;
654	0	mul_add_c(a[2], b[7], c1, c2, c3);
655	0	mul_add_c(a[3], b[6], c1, c2, c3);
656	0	mul_add_c(a[4], b[5], c1, c2, c3);
657	0	mul_add_c(a[5], b[4], c1, c2, c3);
658	0	mul_add_c(a[6], b[3], c1, c2, c3);
659	0	mul_add_c(a[7], b[2], c1, c2, c3);
660	0	r[9] = c1;
661	0	c1 = 0;
662	0	mul_add_c(a[7], b[3], c2, c3, c1);
663	0	mul_add_c(a[6], b[4], c2, c3, c1);
664	0	mul_add_c(a[5], b[5], c2, c3, c1);
665	0	mul_add_c(a[4], b[6], c2, c3, c1);
666	0	mul_add_c(a[3], b[7], c2, c3, c1);
667	0	r[10] = c2;
668	0	c2 = 0;
669	0	mul_add_c(a[4], b[7], c3, c1, c2);
670	0	mul_add_c(a[5], b[6], c3, c1, c2);
671	0	mul_add_c(a[6], b[5], c3, c1, c2);
672	0	mul_add_c(a[7], b[4], c3, c1, c2);
673	0	r[11] = c3;
674	0	c3 = 0;
675	0	mul_add_c(a[7], b[5], c1, c2, c3);
676	0	mul_add_c(a[6], b[6], c1, c2, c3);
677	0	mul_add_c(a[5], b[7], c1, c2, c3);
678	0	r[12] = c1;
679	0	c1 = 0;
680	0	mul_add_c(a[6], b[7], c2, c3, c1);
681	0	mul_add_c(a[7], b[6], c2, c3, c1);
682	0	r[13] = c2;
683	0	c2 = 0;
684	0	mul_add_c(a[7], b[7], c3, c1, c2);
685	0	r[14] = c3;
686	0	r[15] = c1;
687	0	}
688
689		void bn_mul_comba4(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
690	0	{
691	0	BN_ULONG c1, c2, c3;
692
693	0	c1 = 0;
694	0	c2 = 0;
695	0	c3 = 0;
696	0	mul_add_c(a[0], b[0], c1, c2, c3);
697	0	r[0] = c1;
698	0	c1 = 0;
699	0	mul_add_c(a[0], b[1], c2, c3, c1);
700	0	mul_add_c(a[1], b[0], c2, c3, c1);
701	0	r[1] = c2;
702	0	c2 = 0;
703	0	mul_add_c(a[2], b[0], c3, c1, c2);
704	0	mul_add_c(a[1], b[1], c3, c1, c2);
705	0	mul_add_c(a[0], b[2], c3, c1, c2);
706	0	r[2] = c3;
707	0	c3 = 0;
708	0	mul_add_c(a[0], b[3], c1, c2, c3);
709	0	mul_add_c(a[1], b[2], c1, c2, c3);
710	0	mul_add_c(a[2], b[1], c1, c2, c3);
711	0	mul_add_c(a[3], b[0], c1, c2, c3);
712	0	r[3] = c1;
713	0	c1 = 0;
714	0	mul_add_c(a[3], b[1], c2, c3, c1);
715	0	mul_add_c(a[2], b[2], c2, c3, c1);
716	0	mul_add_c(a[1], b[3], c2, c3, c1);
717	0	r[4] = c2;
718	0	c2 = 0;
719	0	mul_add_c(a[2], b[3], c3, c1, c2);
720	0	mul_add_c(a[3], b[2], c3, c1, c2);
721	0	r[5] = c3;
722	0	c3 = 0;
723	0	mul_add_c(a[3], b[3], c1, c2, c3);
724	0	r[6] = c1;
725	0	r[7] = c2;
726	0	}
727
728		void bn_sqr_comba8(BN_ULONG r, const BN_ULONG a)
729	0	{
730	0	BN_ULONG c1, c2, c3;
731
732	0	c1 = 0;
733	0	c2 = 0;
734	0	c3 = 0;
735	0	sqr_add_c(a, 0, c1, c2, c3);
736	0	r[0] = c1;
737	0	c1 = 0;
738	0	sqr_add_c2(a, 1, 0, c2, c3, c1);
739	0	r[1] = c2;
740	0	c2 = 0;
741	0	sqr_add_c(a, 1, c3, c1, c2);
742	0	sqr_add_c2(a, 2, 0, c3, c1, c2);
743	0	r[2] = c3;
744	0	c3 = 0;
745	0	sqr_add_c2(a, 3, 0, c1, c2, c3);
746	0	sqr_add_c2(a, 2, 1, c1, c2, c3);
747	0	r[3] = c1;
748	0	c1 = 0;
749	0	sqr_add_c(a, 2, c2, c3, c1);
750	0	sqr_add_c2(a, 3, 1, c2, c3, c1);
751	0	sqr_add_c2(a, 4, 0, c2, c3, c1);
752	0	r[4] = c2;
753	0	c2 = 0;
754	0	sqr_add_c2(a, 5, 0, c3, c1, c2);
755	0	sqr_add_c2(a, 4, 1, c3, c1, c2);
756	0	sqr_add_c2(a, 3, 2, c3, c1, c2);
757	0	r[5] = c3;
758	0	c3 = 0;
759	0	sqr_add_c(a, 3, c1, c2, c3);
760	0	sqr_add_c2(a, 4, 2, c1, c2, c3);
761	0	sqr_add_c2(a, 5, 1, c1, c2, c3);
762	0	sqr_add_c2(a, 6, 0, c1, c2, c3);
763	0	r[6] = c1;
764	0	c1 = 0;
765	0	sqr_add_c2(a, 7, 0, c2, c3, c1);
766	0	sqr_add_c2(a, 6, 1, c2, c3, c1);
767	0	sqr_add_c2(a, 5, 2, c2, c3, c1);
768	0	sqr_add_c2(a, 4, 3, c2, c3, c1);
769	0	r[7] = c2;
770	0	c2 = 0;
771	0	sqr_add_c(a, 4, c3, c1, c2);
772	0	sqr_add_c2(a, 5, 3, c3, c1, c2);
773	0	sqr_add_c2(a, 6, 2, c3, c1, c2);
774	0	sqr_add_c2(a, 7, 1, c3, c1, c2);
775	0	r[8] = c3;
776	0	c3 = 0;
777	0	sqr_add_c2(a, 7, 2, c1, c2, c3);
778	0	sqr_add_c2(a, 6, 3, c1, c2, c3);
779	0	sqr_add_c2(a, 5, 4, c1, c2, c3);
780	0	r[9] = c1;
781	0	c1 = 0;
782	0	sqr_add_c(a, 5, c2, c3, c1);
783	0	sqr_add_c2(a, 6, 4, c2, c3, c1);
784	0	sqr_add_c2(a, 7, 3, c2, c3, c1);
785	0	r[10] = c2;
786	0	c2 = 0;
787	0	sqr_add_c2(a, 7, 4, c3, c1, c2);
788	0	sqr_add_c2(a, 6, 5, c3, c1, c2);
789	0	r[11] = c3;
790	0	c3 = 0;
791	0	sqr_add_c(a, 6, c1, c2, c3);
792	0	sqr_add_c2(a, 7, 5, c1, c2, c3);
793	0	r[12] = c1;
794	0	c1 = 0;
795	0	sqr_add_c2(a, 7, 6, c2, c3, c1);
796	0	r[13] = c2;
797	0	c2 = 0;
798	0	sqr_add_c(a, 7, c3, c1, c2);
799	0	r[14] = c3;
800	0	r[15] = c1;
801	0	}
802
803		void bn_sqr_comba4(BN_ULONG r, const BN_ULONG a)
804	0	{
805	0	BN_ULONG c1, c2, c3;
806
807	0	c1 = 0;
808	0	c2 = 0;
809	0	c3 = 0;
810	0	sqr_add_c(a, 0, c1, c2, c3);
811	0	r[0] = c1;
812	0	c1 = 0;
813	0	sqr_add_c2(a, 1, 0, c2, c3, c1);
814	0	r[1] = c2;
815	0	c2 = 0;
816	0	sqr_add_c(a, 1, c3, c1, c2);
817	0	sqr_add_c2(a, 2, 0, c3, c1, c2);
818	0	r[2] = c3;
819	0	c3 = 0;
820	0	sqr_add_c2(a, 3, 0, c1, c2, c3);
821	0	sqr_add_c2(a, 2, 1, c1, c2, c3);
822	0	r[3] = c1;
823	0	c1 = 0;
824	0	sqr_add_c(a, 2, c2, c3, c1);
825	0	sqr_add_c2(a, 3, 1, c2, c3, c1);
826	0	r[4] = c2;
827	0	c2 = 0;
828	0	sqr_add_c2(a, 3, 2, c3, c1, c2);
829	0	r[5] = c3;
830	0	c3 = 0;
831	0	sqr_add_c(a, 3, c1, c2, c3);
832	0	r[6] = c1;
833	0	r[7] = c2;
834	0	}
835
836		# ifdef OPENSSL_NO_ASM
837		# ifdef OPENSSL_BN_ASM_MONT
838		# include <alloca.h>
839		/*
840		* This is essentially reference implementation, which may or may not
841		* result in performance improvement. E.g. on IA-32 this routine was
842		* observed to give 40% faster rsa1024 private key operations and 10%
843		* faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
844		* by 10% and worsens rsa4096 sign by 15%. Once again, it's a
845		* reference implementation, one to be used as starting point for
846		* platform-specific assembler. Mentioned numbers apply to compiler
847		* generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
848		* can vary not only from platform to platform, but even for compiler
849		* versions. Assembler vs. assembler improvement coefficients can
850		* [and are known to] differ and are to be documented elsewhere.
851		*/
852		int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
853		const BN_ULONG np, const BN_ULONG n0p, int num)
854		{
855		BN_ULONG c0, c1, ml, *tp, n0;
856		# ifdef mul64
857		BN_ULONG mh;
858		# endif
859		volatile BN_ULONG *vp;
860		int i = 0, j;
861
862		# if 0 /* template for platform-specific
863		* implementation */
864		if (ap == bp)
865		return bn_sqr_mont(rp, ap, np, n0p, num);
866		# endif
867		vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
868
869		n0 = *n0p;
870
871		c0 = 0;
872		ml = bp[0];
873		# ifdef mul64
874		mh = HBITS(ml);
875		ml = LBITS(ml);
876		for (j = 0; j < num; ++j)
877		mul(tp[j], ap[j], ml, mh, c0);
878		# else
879		for (j = 0; j < num; ++j)
880		mul(tp[j], ap[j], ml, c0);
881		# endif
882
883		tp[num] = c0;
884		tp[num + 1] = 0;
885		goto enter;
886
887		for (i = 0; i < num; i++) {
888		c0 = 0;
889		ml = bp[i];
890		# ifdef mul64
891		mh = HBITS(ml);
892		ml = LBITS(ml);
893		for (j = 0; j < num; ++j)
894		mul_add(tp[j], ap[j], ml, mh, c0);
895		# else
896		for (j = 0; j < num; ++j)
897		mul_add(tp[j], ap[j], ml, c0);
898		# endif
899		c1 = (tp[num] + c0) & BN_MASK2;
900		tp[num] = c1;
901		tp[num + 1] = (c1 < c0 ? 1 : 0);
902		enter:
903		c1 = tp[0];
904		ml = (c1 * n0) & BN_MASK2;
905		c0 = 0;
906		# ifdef mul64
907		mh = HBITS(ml);
908		ml = LBITS(ml);
909		mul_add(c1, np[0], ml, mh, c0);
910		# else
911		mul_add(c1, ml, np[0], c0);
912		# endif
913		for (j = 1; j < num; j++) {
914		c1 = tp[j];
915		# ifdef mul64
916		mul_add(c1, np[j], ml, mh, c0);
917		# else
918		mul_add(c1, ml, np[j], c0);
919		# endif
920		tp[j - 1] = c1 & BN_MASK2;
921		}
922		c1 = (tp[num] + c0) & BN_MASK2;
923		tp[num - 1] = c1;
924		tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
925		}
926
927		if (tp[num] != 0 \|\| tp[num - 1] >= np[num - 1]) {
928		c0 = bn_sub_words(rp, tp, np, num);
929		if (tp[num] != 0 \|\| c0 == 0) {
930		for (i = 0; i < num + 2; i++)
931		vp[i] = 0;
932		return 1;
933		}
934		}
935		for (i = 0; i < num; i++)
936		rp[i] = tp[i], vp[i] = 0;
937		vp[num] = 0;
938		vp[num + 1] = 0;
939		return 1;
940		}
941		# else
942		/*
943		* Return value of 0 indicates that multiplication/convolution was not
944		* performed to signal the caller to fall down to alternative/original
945		* code-path.
946		*/
947		int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
948		const BN_ULONG np, const BN_ULONG n0, int num)
949	0	{
950	0	return 0;
951	0	}
952		# endif /* OPENSSL_BN_ASM_MONT */
953		# endif
954
955		#else /* !BN_MUL_COMBA */
956
957		/* hmm... is it faster just to do a multiply? */
958		void bn_sqr_comba4(BN_ULONG r, const BN_ULONG a)
959		{
960		BN_ULONG t[8];
961		bn_sqr_normal(r, a, 4, t);
962		}
963
964		void bn_sqr_comba8(BN_ULONG r, const BN_ULONG a)
965		{
966		BN_ULONG t[16];
967		bn_sqr_normal(r, a, 8, t);
968		}
969
970		void bn_mul_comba4(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
971		{
972		r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
973		r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
974		r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
975		r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
976		}
977
978		void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
979		{
980		r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
981		r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
982		r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
983		r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
984		r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
985		r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
986		r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
987		r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
988		}
989
990		# ifdef OPENSSL_NO_ASM
991		# ifdef OPENSSL_BN_ASM_MONT
992		# include <alloca.h>
993		int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
994		const BN_ULONG np, const BN_ULONG n0p, int num)
995		{
996		BN_ULONG c0, c1, tp, n0 = n0p;
997		volatile BN_ULONG *vp;
998		int i = 0, j;
999
1000		vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
1001
1002		for (i = 0; i <= num; i++)
1003		tp[i] = 0;
1004
1005		for (i = 0; i < num; i++) {
1006		c0 = bn_mul_add_words(tp, ap, num, bp[i]);
1007		c1 = (tp[num] + c0) & BN_MASK2;
1008		tp[num] = c1;
1009		tp[num + 1] = (c1 < c0 ? 1 : 0);
1010
1011		c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
1012		c1 = (tp[num] + c0) & BN_MASK2;
1013		tp[num] = c1;
1014		tp[num + 1] += (c1 < c0 ? 1 : 0);
1015		for (j = 0; j <= num; j++)
1016		tp[j] = tp[j + 1];
1017		}
1018
1019		if (tp[num] != 0 \|\| tp[num - 1] >= np[num - 1]) {
1020		c0 = bn_sub_words(rp, tp, np, num);
1021		if (tp[num] != 0 \|\| c0 == 0) {
1022		for (i = 0; i < num + 2; i++)
1023		vp[i] = 0;
1024		return 1;
1025		}
1026		}
1027		for (i = 0; i < num; i++)
1028		rp[i] = tp[i], vp[i] = 0;
1029		vp[num] = 0;
1030		vp[num + 1] = 0;
1031		return 1;
1032		}
1033		# else
1034		int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
1035		const BN_ULONG np, const BN_ULONG n0, int num)
1036		{
1037		return 0;
1038		}
1039		# endif /* OPENSSL_BN_ASM_MONT */
1040		# endif
1041
1042		#endif /* !BN_MUL_COMBA */

Coverage Report

Created: 2025-06-13 06:36