/src/openssl/crypto/bn/bn_asm.c

Line	Count	Source
1		/*
2		* Copyright 1995-2023 The OpenSSL Project Authors. All Rights Reserved.
3		*
4		* Licensed under the Apache License 2.0 (the "License"). You may not use
5		* this file except in compliance with the License. You can obtain a copy
6		* in the file LICENSE in the source distribution or at
7		* https://www.openssl.org/source/license.html
8		*/
9
10		#include <assert.h>
11		#include <openssl/crypto.h>
12		#include "internal/cryptlib.h"
13		#include "bn_local.h"
14
15		#if defined(BN_LLONG) \|\| defined(BN_UMULT_HIGH)
16
17		BN_ULONG bn_mul_add_words(BN_ULONG rp, const BN_ULONG ap, int num,
18		BN_ULONG w)
19		{
20		BN_ULONG c1 = 0;
21
22		assert(num >= 0);
23		if (num <= 0)
24		return c1;
25
26		#ifndef OPENSSL_SMALL_FOOTPRINT
27		while (num & ~3) {
28		mul_add(rp[0], ap[0], w, c1);
29		mul_add(rp[1], ap[1], w, c1);
30		mul_add(rp[2], ap[2], w, c1);
31		mul_add(rp[3], ap[3], w, c1);
32		ap += 4;
33		rp += 4;
34		num -= 4;
35		}
36		#endif
37		while (num) {
38		mul_add(rp[0], ap[0], w, c1);
39		ap++;
40		rp++;
41		num--;
42		}
43
44		return c1;
45		}
46
47		BN_ULONG bn_mul_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
48		{
49		BN_ULONG c1 = 0;
50
51		assert(num >= 0);
52		if (num <= 0)
53		return c1;
54
55		#ifndef OPENSSL_SMALL_FOOTPRINT
56		while (num & ~3) {
57		mul(rp[0], ap[0], w, c1);
58		mul(rp[1], ap[1], w, c1);
59		mul(rp[2], ap[2], w, c1);
60		mul(rp[3], ap[3], w, c1);
61		ap += 4;
62		rp += 4;
63		num -= 4;
64		}
65		#endif
66		while (num) {
67		mul(rp[0], ap[0], w, c1);
68		ap++;
69		rp++;
70		num--;
71		}
72		return c1;
73		}
74
75		void bn_sqr_words(BN_ULONG r, const BN_ULONG a, int n)
76		{
77		assert(n >= 0);
78		if (n <= 0)
79		return;
80
81		#ifndef OPENSSL_SMALL_FOOTPRINT
82		while (n & ~3) {
83		sqr(r[0], r[1], a[0]);
84		sqr(r[2], r[3], a[1]);
85		sqr(r[4], r[5], a[2]);
86		sqr(r[6], r[7], a[3]);
87		a += 4;
88		r += 8;
89		n -= 4;
90		}
91		#endif
92		while (n) {
93		sqr(r[0], r[1], a[0]);
94		a++;
95		r += 2;
96		n--;
97		}
98		}
99
100		#else /* !(defined(BN_LLONG) || \
101		* defined(BN_UMULT_HIGH)) */
102
103		BN_ULONG bn_mul_add_words(BN_ULONG rp, const BN_ULONG ap, int num,
104		BN_ULONG w)
105	0	{
106	0	BN_ULONG c = 0;
107	0	BN_ULONG bl, bh;
108
109	0	assert(num >= 0);
110	0	if (num <= 0)
111	0	return (BN_ULONG)0;
112
113	0	bl = LBITS(w);
114	0	bh = HBITS(w);
115
116	0	#ifndef OPENSSL_SMALL_FOOTPRINT
117	0	while (num & ~3) {
118	0	mul_add(rp[0], ap[0], bl, bh, c);
119	0	mul_add(rp[1], ap[1], bl, bh, c);
120	0	mul_add(rp[2], ap[2], bl, bh, c);
121	0	mul_add(rp[3], ap[3], bl, bh, c);
122	0	ap += 4;
123	0	rp += 4;
124	0	num -= 4;
125	0	}
126	0	#endif
127	0	while (num) {
128	0	mul_add(rp[0], ap[0], bl, bh, c);
129	0	ap++;
130	0	rp++;
131	0	num--;
132	0	}
133	0	return c;
134	0	}
135
136		BN_ULONG bn_mul_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
137	0	{
138	0	BN_ULONG carry = 0;
139	0	BN_ULONG bl, bh;
140
141	0	assert(num >= 0);
142	0	if (num <= 0)
143	0	return (BN_ULONG)0;
144
145	0	bl = LBITS(w);
146	0	bh = HBITS(w);
147
148	0	#ifndef OPENSSL_SMALL_FOOTPRINT
149	0	while (num & ~3) {
150	0	mul(rp[0], ap[0], bl, bh, carry);
151	0	mul(rp[1], ap[1], bl, bh, carry);
152	0	mul(rp[2], ap[2], bl, bh, carry);
153	0	mul(rp[3], ap[3], bl, bh, carry);
154	0	ap += 4;
155	0	rp += 4;
156	0	num -= 4;
157	0	}
158	0	#endif
159	0	while (num) {
160	0	mul(rp[0], ap[0], bl, bh, carry);
161	0	ap++;
162	0	rp++;
163	0	num--;
164	0	}
165	0	return carry;
166	0	}
167
168		void bn_sqr_words(BN_ULONG r, const BN_ULONG a, int n)
169	0	{
170	0	assert(n >= 0);
171	0	if (n <= 0)
172	0	return;
173
174	0	#ifndef OPENSSL_SMALL_FOOTPRINT
175	0	while (n & ~3) {
176	0	sqr64(r[0], r[1], a[0]);
177	0	sqr64(r[2], r[3], a[1]);
178	0	sqr64(r[4], r[5], a[2]);
179	0	sqr64(r[6], r[7], a[3]);
180	0	a += 4;
181	0	r += 8;
182	0	n -= 4;
183	0	}
184	0	#endif
185	0	while (n) {
186	0	sqr64(r[0], r[1], a[0]);
187	0	a++;
188	0	r += 2;
189	0	n--;
190	0	}
191	0	}
192
193		#endif /* !(defined(BN_LLONG) || \
194		* defined(BN_UMULT_HIGH)) */
195
196		#if defined(BN_LLONG) && defined(BN_DIV2W)
197
198		BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
199		{
200		return ((BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) \| l) / (BN_ULLONG)d));
201		}
202
203		#else
204
205		/* Divide h,l by d and return the result. */
206		/* I need to test this some more :-( */
207		BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
208	0	{
209	0	BN_ULONG dh, dl, q, ret = 0, th, tl, t;
210	0	int i, count = 2;
211
212	0	if (d == 0)
213	0	return BN_MASK2;
214
215	0	i = BN_num_bits_word(d);
216	0	assert((i == BN_BITS2) \|\| (h <= (BN_ULONG)1 << i));
217
218	0	i = BN_BITS2 - i;
219	0	if (h >= d)
220	0	h -= d;
221
222	0	if (i) {
223	0	d <<= i;
224	0	h = (h << i) \| (l >> (BN_BITS2 - i));
225	0	l <<= i;
226	0	}
227	0	dh = (d & BN_MASK2h) >> BN_BITS4;
228	0	dl = (d & BN_MASK2l);
229	0	for (;;) {
230	0	if ((h >> BN_BITS4) == dh)
231	0	q = BN_MASK2l;
232	0	else
233	0	q = h / dh;
234
235	0	th = q * dh;
236	0	tl = dl * q;
237	0	for (;;) {
238	0	t = h - th;
239	0	if ((t & BN_MASK2h) \|\| ((tl) <= ((t << BN_BITS4) \| ((l & BN_MASK2h) >> BN_BITS4))))
240	0	break;
241	0	q--;
242	0	th -= dh;
243	0	tl -= dl;
244	0	}
245	0	t = (tl >> BN_BITS4);
246	0	tl = (tl << BN_BITS4) & BN_MASK2h;
247	0	th += t;
248
249	0	if (l < tl)
250	0	th++;
251	0	l -= tl;
252	0	if (h < th) {
253	0	h += d;
254	0	q--;
255	0	}
256	0	h -= th;
257
258	0	if (--count == 0)
259	0	break;
260
261	0	ret = q << BN_BITS4;
262	0	h = ((h << BN_BITS4) \| (l >> BN_BITS4)) & BN_MASK2;
263	0	l = (l & BN_MASK2l) << BN_BITS4;
264	0	}
265	0	ret \|= q;
266	0	return ret;
267	0	}
268		#endif /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
269
270		#ifdef BN_LLONG
271		BN_ULONG bn_add_words(BN_ULONG r, const BN_ULONG a, const BN_ULONG *b,
272		int n)
273		{
274		BN_ULLONG ll = 0;
275
276		assert(n >= 0);
277		if (n <= 0)
278		return (BN_ULONG)0;
279
280		#ifndef OPENSSL_SMALL_FOOTPRINT
281		while (n & ~3) {
282		ll += (BN_ULLONG)a[0] + b[0];
283		r[0] = (BN_ULONG)ll & BN_MASK2;
284		ll >>= BN_BITS2;
285		ll += (BN_ULLONG)a[1] + b[1];
286		r[1] = (BN_ULONG)ll & BN_MASK2;
287		ll >>= BN_BITS2;
288		ll += (BN_ULLONG)a[2] + b[2];
289		r[2] = (BN_ULONG)ll & BN_MASK2;
290		ll >>= BN_BITS2;
291		ll += (BN_ULLONG)a[3] + b[3];
292		r[3] = (BN_ULONG)ll & BN_MASK2;
293		ll >>= BN_BITS2;
294		a += 4;
295		b += 4;
296		r += 4;
297		n -= 4;
298		}
299		#endif
300		while (n) {
301		ll += (BN_ULLONG)a[0] + b[0];
302		r[0] = (BN_ULONG)ll & BN_MASK2;
303		ll >>= BN_BITS2;
304		a++;
305		b++;
306		r++;
307		n--;
308		}
309		return (BN_ULONG)ll;
310		}
311		#else /* !BN_LLONG */
312		BN_ULONG bn_add_words(BN_ULONG r, const BN_ULONG a, const BN_ULONG *b,
313		int n)
314	0	{
315	0	BN_ULONG c, l, t;
316
317	0	assert(n >= 0);
318	0	if (n <= 0)
319	0	return (BN_ULONG)0;
320
321	0	c = 0;
322	0	#ifndef OPENSSL_SMALL_FOOTPRINT
323	0	while (n & ~3) {
324	0	t = a[0];
325	0	t = (t + c) & BN_MASK2;
326	0	c = (t < c);
327	0	l = (t + b[0]) & BN_MASK2;
328	0	c += (l < t);
329	0	r[0] = l;
330	0	t = a[1];
331	0	t = (t + c) & BN_MASK2;
332	0	c = (t < c);
333	0	l = (t + b[1]) & BN_MASK2;
334	0	c += (l < t);
335	0	r[1] = l;
336	0	t = a[2];
337	0	t = (t + c) & BN_MASK2;
338	0	c = (t < c);
339	0	l = (t + b[2]) & BN_MASK2;
340	0	c += (l < t);
341	0	r[2] = l;
342	0	t = a[3];
343	0	t = (t + c) & BN_MASK2;
344	0	c = (t < c);
345	0	l = (t + b[3]) & BN_MASK2;
346	0	c += (l < t);
347	0	r[3] = l;
348	0	a += 4;
349	0	b += 4;
350	0	r += 4;
351	0	n -= 4;
352	0	}
353	0	#endif
354	0	while (n) {
355	0	t = a[0];
356	0	t = (t + c) & BN_MASK2;
357	0	c = (t < c);
358	0	l = (t + b[0]) & BN_MASK2;
359	0	c += (l < t);
360	0	r[0] = l;
361	0	a++;
362	0	b++;
363	0	r++;
364	0	n--;
365	0	}
366	0	return (BN_ULONG)c;
367	0	}
368		#endif /* !BN_LLONG */
369
370		BN_ULONG bn_sub_words(BN_ULONG r, const BN_ULONG a, const BN_ULONG *b,
371		int n)
372	0	{
373	0	BN_ULONG t1, t2;
374	0	int c = 0;
375
376	0	assert(n >= 0);
377	0	if (n <= 0)
378	0	return (BN_ULONG)0;
379
380	0	#ifndef OPENSSL_SMALL_FOOTPRINT
381	0	while (n & ~3) {
382	0	t1 = a[0];
383	0	t2 = (t1 - c) & BN_MASK2;
384	0	c = (t2 > t1);
385	0	t1 = b[0];
386	0	t1 = (t2 - t1) & BN_MASK2;
387	0	r[0] = t1;
388	0	c += (t1 > t2);
389	0	t1 = a[1];
390	0	t2 = (t1 - c) & BN_MASK2;
391	0	c = (t2 > t1);
392	0	t1 = b[1];
393	0	t1 = (t2 - t1) & BN_MASK2;
394	0	r[1] = t1;
395	0	c += (t1 > t2);
396	0	t1 = a[2];
397	0	t2 = (t1 - c) & BN_MASK2;
398	0	c = (t2 > t1);
399	0	t1 = b[2];
400	0	t1 = (t2 - t1) & BN_MASK2;
401	0	r[2] = t1;
402	0	c += (t1 > t2);
403	0	t1 = a[3];
404	0	t2 = (t1 - c) & BN_MASK2;
405	0	c = (t2 > t1);
406	0	t1 = b[3];
407	0	t1 = (t2 - t1) & BN_MASK2;
408	0	r[3] = t1;
409	0	c += (t1 > t2);
410	0	a += 4;
411	0	b += 4;
412	0	r += 4;
413	0	n -= 4;
414	0	}
415	0	#endif
416	0	while (n) {
417	0	t1 = a[0];
418	0	t2 = (t1 - c) & BN_MASK2;
419	0	c = (t2 > t1);
420	0	t1 = b[0];
421	0	t1 = (t2 - t1) & BN_MASK2;
422	0	r[0] = t1;
423	0	c += (t1 > t2);
424	0	a++;
425	0	b++;
426	0	r++;
427	0	n--;
428	0	}
429	0	return c;
430	0	}
431
432		#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
433
434		/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
435		/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
436		/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
437		/*
438		* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
439		* c=(c2,c1,c0)
440		*/
441
442		#ifdef BN_LLONG
443		/*
444		* Keep in mind that additions to multiplication result can not
445		* overflow, because its high half cannot be all-ones.
446		*/
/*
 * BN_LLONG form of mul_add_c: forms the product a*b in a BN_ULLONG, adds c0,
 * writes Lw(t) back to c0 and adds Hw(t) into c1 (Lw/Hw are defined
 * elsewhere; presumably the low/high word extractors).  c2 is incremented
 * when the addition into c1 wraps (c1 < hi).
 */
447		#define mul_add_c(a, b, c0, c1, c2) \
448		do { \
449		BN_ULONG hi; \
450		BN_ULLONG t = (BN_ULLONG)(a) * (b); \
451		t += c0; /* no carry */ \
452		c0 = (BN_ULONG)Lw(t); \
453		hi = (BN_ULONG)Hw(t); \
454		c1 = (c1 + hi) & BN_MASK2; \
455		c2 += (c1 < hi); \
456		} while (0)
457
/*
 * BN_LLONG form of mul_add_c2: accumulates the product a*b twice.  The first
 * pass works on tt = t + c0, the second on t += c0 with the already updated
 * c0.  After each pass Lw() goes to c0, Hw() is added into c1, and c2 counts
 * a wrap of c1.
 */
458		#define mul_add_c2(a, b, c0, c1, c2) \
459		do { \
460		BN_ULONG hi; \
461		BN_ULLONG t = (BN_ULLONG)(a) * (b); \
462		BN_ULLONG tt = t + c0; /* no carry */ \
463		c0 = (BN_ULONG)Lw(tt); \
464		hi = (BN_ULONG)Hw(tt); \
465		c1 = (c1 + hi) & BN_MASK2; \
466		c2 += (c1 < hi); \
467		t += c0; /* no carry */ \
468		c0 = (BN_ULONG)Lw(t); \
469		hi = (BN_ULONG)Hw(t); \
470		c1 = (c1 + hi) & BN_MASK2; \
471		c2 += (c1 < hi); \
472		} while (0)
473
/*
 * BN_LLONG form of sqr_add_c: same steps as mul_add_c with both factors
 * equal to a[i].  Note that a and i are used unparenthesized and a[i] is
 * evaluated twice.
 */
474		#define sqr_add_c(a, i, c0, c1, c2) \
475		do { \
476		BN_ULONG hi; \
477		BN_ULLONG t = (BN_ULLONG)a[i] * a[i]; \
478		t += c0; /* no carry */ \
479		c0 = (BN_ULONG)Lw(t); \
480		hi = (BN_ULONG)Hw(t); \
481		c1 = (c1 + hi) & BN_MASK2; \
482		c2 += (c1 < hi); \
483		} while (0)
484
/* Doubled cross term: forwards (a)[i] and (a)[j] to mul_add_c2. */
485		#define sqr_add_c2(a, i, j, c0, c1, c2) \
486		mul_add_c2((a)[i], (a)[j], c0, c1, c2)
487
488		#elif defined(BN_UMULT_LOHI)
489		/*
490		* Keep in mind that additions to hi can not overflow, because
491		* the high word of a multiplication result cannot be all-ones.
492		*/
/*
 * BN_UMULT_LOHI form of mul_add_c: BN_UMULT_LOHI(lo, hi, ta, tb) supplies the
 * product words (macro defined elsewhere).  lo is added to c0, the wrap of
 * that addition is folded into hi, hi is added to c1, and a wrap of c1 is
 * counted in c2.  The arguments a and b are evaluated once each.
 */
493		#define mul_add_c(a, b, c0, c1, c2) \
494		do { \
495		BN_ULONG ta = (a), tb = (b); \
496		BN_ULONG lo, hi; \
497		BN_UMULT_LOHI(lo, hi, ta, tb); \
498		c0 += lo; \
499		hi += (c0 < lo); \
500		c1 += hi; \
501		c2 += (c1 < hi); \
502		} while (0)
503
/*
 * BN_UMULT_LOHI form of mul_add_c2: one multiplication, accumulated twice.
 * The first pass uses tt (hi plus the wrap of c0) so that hi itself stays
 * intact for the second pass.
 */
504		#define mul_add_c2(a, b, c0, c1, c2) \
505		do { \
506		BN_ULONG ta = (a), tb = (b); \
507		BN_ULONG lo, hi, tt; \
508		BN_UMULT_LOHI(lo, hi, ta, tb); \
509		c0 += lo; \
510		tt = hi + (c0 < lo); \
511		c1 += tt; \
512		c2 += (c1 < tt); \
513		c0 += lo; \
514		hi += (c0 < lo); \
515		c1 += hi; \
516		c2 += (c1 < hi); \
517		} while (0)
518
/*
 * BN_UMULT_LOHI form of sqr_add_c: as mul_add_c with both factors equal to
 * (a)[i], which is read once into ta.
 */
519		#define sqr_add_c(a, i, c0, c1, c2) \
520		do { \
521		BN_ULONG ta = (a)[i]; \
522		BN_ULONG lo, hi; \
523		BN_UMULT_LOHI(lo, hi, ta, ta); \
524		c0 += lo; \
525		hi += (c0 < lo); \
526		c1 += hi; \
527		c2 += (c1 < hi); \
528		} while (0)
529
/* Doubled cross term: forwards (a)[i] and (a)[j] to mul_add_c2. */
530		#define sqr_add_c2(a, i, j, c0, c1, c2) \
531		mul_add_c2((a)[i], (a)[j], c0, c1, c2)
532
533		#elif defined(BN_UMULT_HIGH)
534		/*
535		* Keep in mind that additions to hi can not overflow, because
536		* the high word of a multiplication result cannot be all-ones.
537		*/
/*
 * BN_UMULT_HIGH form of mul_add_c: the low product word is the truncated
 * ta * tb and the high word comes from BN_UMULT_HIGH(ta, tb) (defined
 * elsewhere).  Accumulation into c0/c1/c2 then proceeds with explicit wrap
 * checks (c0 < lo, c1 < hi).
 */
538		#define mul_add_c(a, b, c0, c1, c2) \
539		do { \
540		BN_ULONG ta = (a), tb = (b); \
541		BN_ULONG lo = ta * tb; \
542		BN_ULONG hi = BN_UMULT_HIGH(ta, tb); \
543		c0 += lo; \
544		hi += (c0 < lo); \
545		c1 += hi; \
546		c2 += (c1 < hi); \
547		} while (0)
548
/*
 * BN_UMULT_HIGH form of mul_add_c2: one multiplication, accumulated twice.
 * tt carries hi plus the first c0 wrap so that hi is still unmodified for
 * the second pass.
 */
549		#define mul_add_c2(a, b, c0, c1, c2) \
550		do { \
551		BN_ULONG ta = (a), tb = (b), tt; \
552		BN_ULONG lo = ta * tb; \
553		BN_ULONG hi = BN_UMULT_HIGH(ta, tb); \
554		c0 += lo; \
555		tt = hi + (c0 < lo); \
556		c1 += tt; \
557		c2 += (c1 < tt); \
558		c0 += lo; \
559		hi += (c0 < lo); \
560		c1 += hi; \
561		c2 += (c1 < hi); \
562		} while (0)
563
/*
 * BN_UMULT_HIGH form of sqr_add_c: as mul_add_c with both factors equal to
 * (a)[i], which is read once into ta.
 */
564		#define sqr_add_c(a, i, c0, c1, c2) \
565		do { \
566		BN_ULONG ta = (a)[i]; \
567		BN_ULONG lo = ta * ta; \
568		BN_ULONG hi = BN_UMULT_HIGH(ta, ta); \
569		c0 += lo; \
570		hi += (c0 < lo); \
571		c1 += hi; \
572		c2 += (c1 < hi); \
573		} while (0)
574
/* Doubled cross term: forwards (a)[i] and (a)[j] to mul_add_c2. */
575		#define sqr_add_c2(a, i, j, c0, c1, c2) \
576		mul_add_c2((a)[i], (a)[j], c0, c1, c2)
577
578		#else /* !BN_LLONG */
579		/*
580		* Keep in mind that additions to hi can not overflow, because
581		* the high word of a multiplication result cannot be all-ones.
582		*/
/*
 * Half-word form of mul_add_c: a and b are split with LBITS/HBITS and passed
 * to mul64(lo, hi, bl, bh) (defined elsewhere; presumably it leaves the low
 * and high product words in lo and hi -- TODO confirm).  lo/hi are then
 * accumulated into c0/c1/c2 with explicit wrap checks.  Note that a and b
 * are each expanded twice.
 */
583		#define mul_add_c(a, b, c0, c1, c2) \
584	0	do { \
585	0	BN_ULONG lo = LBITS(a), hi = HBITS(a); \
586	0	BN_ULONG bl = LBITS(b), bh = HBITS(b); \
587	0	mul64(lo, hi, bl, bh); \
588	0	c0 = (c0 + lo) & BN_MASK2; \
589	0	hi += (c0 < lo); \
590	0	c1 = (c1 + hi) & BN_MASK2; \
591	0	c2 += (c1 < hi); \
592	0	} while (0)
593
/*
 * Half-word form of mul_add_c2: one mul64 call, accumulated twice.  tt takes
 * a copy of hi for the first pass so that hi is still unmodified for the
 * second pass.
 */
594		#define mul_add_c2(a, b, c0, c1, c2) \
595	0	do { \
596	0	BN_ULONG tt; \
597	0	BN_ULONG lo = LBITS(a), hi = HBITS(a); \
598	0	BN_ULONG bl = LBITS(b), bh = HBITS(b); \
599	0	mul64(lo, hi, bl, bh); \
600	0	tt = hi; \
601	0	c0 = (c0 + lo) & BN_MASK2; \
602	0	tt += (c0 < lo); \
603	0	c1 = (c1 + tt) & BN_MASK2; \
604	0	c2 += (c1 < tt); \
605	0	c0 = (c0 + lo) & BN_MASK2; \
606	0	hi += (c0 < lo); \
607	0	c1 = (c1 + hi) & BN_MASK2; \
608	0	c2 += (c1 < hi); \
609	0	} while (0)
610
/*
 * Half-word form of sqr_add_c: sqr64(lo, hi, (a)[i]) (defined elsewhere)
 * supplies the two words that are then accumulated into c0/c1/c2 with
 * explicit wrap checks.
 */
611		#define sqr_add_c(a, i, c0, c1, c2) \
612	0	do { \
613	0	BN_ULONG lo, hi; \
614	0	sqr64(lo, hi, (a)[i]); \
615	0	c0 = (c0 + lo) & BN_MASK2; \
616	0	hi += (c0 < lo); \
617	0	c1 = (c1 + hi) & BN_MASK2; \
618	0	c2 += (c1 < hi); \
619	0	} while (0)
620
/* Doubled cross term: forwards (a)[i] and (a)[j] to mul_add_c2. */
621		#define sqr_add_c2(a, i, j, c0, c1, c2) \
622	0	mul_add_c2((a)[i], (a)[j], c0, c1, c2)
623		#endif /* !BN_LLONG */
624
625		void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
626	0	{
627	0	BN_ULONG c1, c2, c3;
628
629	0	c1 = 0;
630	0	c2 = 0;
631	0	c3 = 0;
632	0	mul_add_c(a[0], b[0], c1, c2, c3);
633	0	r[0] = c1;
634	0	c1 = 0;
635	0	mul_add_c(a[0], b[1], c2, c3, c1);
636	0	mul_add_c(a[1], b[0], c2, c3, c1);
637	0	r[1] = c2;
638	0	c2 = 0;
639	0	mul_add_c(a[2], b[0], c3, c1, c2);
640	0	mul_add_c(a[1], b[1], c3, c1, c2);
641	0	mul_add_c(a[0], b[2], c3, c1, c2);
642	0	r[2] = c3;
643	0	c3 = 0;
644	0	mul_add_c(a[0], b[3], c1, c2, c3);
645	0	mul_add_c(a[1], b[2], c1, c2, c3);
646	0	mul_add_c(a[2], b[1], c1, c2, c3);
647	0	mul_add_c(a[3], b[0], c1, c2, c3);
648	0	r[3] = c1;
649	0	c1 = 0;
650	0	mul_add_c(a[4], b[0], c2, c3, c1);
651	0	mul_add_c(a[3], b[1], c2, c3, c1);
652	0	mul_add_c(a[2], b[2], c2, c3, c1);
653	0	mul_add_c(a[1], b[3], c2, c3, c1);
654	0	mul_add_c(a[0], b[4], c2, c3, c1);
655	0	r[4] = c2;
656	0	c2 = 0;
657	0	mul_add_c(a[0], b[5], c3, c1, c2);
658	0	mul_add_c(a[1], b[4], c3, c1, c2);
659	0	mul_add_c(a[2], b[3], c3, c1, c2);
660	0	mul_add_c(a[3], b[2], c3, c1, c2);
661	0	mul_add_c(a[4], b[1], c3, c1, c2);
662	0	mul_add_c(a[5], b[0], c3, c1, c2);
663	0	r[5] = c3;
664	0	c3 = 0;
665	0	mul_add_c(a[6], b[0], c1, c2, c3);
666	0	mul_add_c(a[5], b[1], c1, c2, c3);
667	0	mul_add_c(a[4], b[2], c1, c2, c3);
668	0	mul_add_c(a[3], b[3], c1, c2, c3);
669	0	mul_add_c(a[2], b[4], c1, c2, c3);
670	0	mul_add_c(a[1], b[5], c1, c2, c3);
671	0	mul_add_c(a[0], b[6], c1, c2, c3);
672	0	r[6] = c1;
673	0	c1 = 0;
674	0	mul_add_c(a[0], b[7], c2, c3, c1);
675	0	mul_add_c(a[1], b[6], c2, c3, c1);
676	0	mul_add_c(a[2], b[5], c2, c3, c1);
677	0	mul_add_c(a[3], b[4], c2, c3, c1);
678	0	mul_add_c(a[4], b[3], c2, c3, c1);
679	0	mul_add_c(a[5], b[2], c2, c3, c1);
680	0	mul_add_c(a[6], b[1], c2, c3, c1);
681	0	mul_add_c(a[7], b[0], c2, c3, c1);
682	0	r[7] = c2;
683	0	c2 = 0;
684	0	mul_add_c(a[7], b[1], c3, c1, c2);
685	0	mul_add_c(a[6], b[2], c3, c1, c2);
686	0	mul_add_c(a[5], b[3], c3, c1, c2);
687	0	mul_add_c(a[4], b[4], c3, c1, c2);
688	0	mul_add_c(a[3], b[5], c3, c1, c2);
689	0	mul_add_c(a[2], b[6], c3, c1, c2);
690	0	mul_add_c(a[1], b[7], c3, c1, c2);
691	0	r[8] = c3;
692	0	c3 = 0;
693	0	mul_add_c(a[2], b[7], c1, c2, c3);
694	0	mul_add_c(a[3], b[6], c1, c2, c3);
695	0	mul_add_c(a[4], b[5], c1, c2, c3);
696	0	mul_add_c(a[5], b[4], c1, c2, c3);
697	0	mul_add_c(a[6], b[3], c1, c2, c3);
698	0	mul_add_c(a[7], b[2], c1, c2, c3);
699	0	r[9] = c1;
700	0	c1 = 0;
701	0	mul_add_c(a[7], b[3], c2, c3, c1);
702	0	mul_add_c(a[6], b[4], c2, c3, c1);
703	0	mul_add_c(a[5], b[5], c2, c3, c1);
704	0	mul_add_c(a[4], b[6], c2, c3, c1);
705	0	mul_add_c(a[3], b[7], c2, c3, c1);
706	0	r[10] = c2;
707	0	c2 = 0;
708	0	mul_add_c(a[4], b[7], c3, c1, c2);
709	0	mul_add_c(a[5], b[6], c3, c1, c2);
710	0	mul_add_c(a[6], b[5], c3, c1, c2);
711	0	mul_add_c(a[7], b[4], c3, c1, c2);
712	0	r[11] = c3;
713	0	c3 = 0;
714	0	mul_add_c(a[7], b[5], c1, c2, c3);
715	0	mul_add_c(a[6], b[6], c1, c2, c3);
716	0	mul_add_c(a[5], b[7], c1, c2, c3);
717	0	r[12] = c1;
718	0	c1 = 0;
719	0	mul_add_c(a[6], b[7], c2, c3, c1);
720	0	mul_add_c(a[7], b[6], c2, c3, c1);
721	0	r[13] = c2;
722	0	c2 = 0;
723	0	mul_add_c(a[7], b[7], c3, c1, c2);
724	0	r[14] = c3;
725	0	r[15] = c1;
726	0	}
727
728		void bn_mul_comba4(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
729	0	{
730	0	BN_ULONG c1, c2, c3;
731
732	0	c1 = 0;
733	0	c2 = 0;
734	0	c3 = 0;
735	0	mul_add_c(a[0], b[0], c1, c2, c3);
736	0	r[0] = c1;
737	0	c1 = 0;
738	0	mul_add_c(a[0], b[1], c2, c3, c1);
739	0	mul_add_c(a[1], b[0], c2, c3, c1);
740	0	r[1] = c2;
741	0	c2 = 0;
742	0	mul_add_c(a[2], b[0], c3, c1, c2);
743	0	mul_add_c(a[1], b[1], c3, c1, c2);
744	0	mul_add_c(a[0], b[2], c3, c1, c2);
745	0	r[2] = c3;
746	0	c3 = 0;
747	0	mul_add_c(a[0], b[3], c1, c2, c3);
748	0	mul_add_c(a[1], b[2], c1, c2, c3);
749	0	mul_add_c(a[2], b[1], c1, c2, c3);
750	0	mul_add_c(a[3], b[0], c1, c2, c3);
751	0	r[3] = c1;
752	0	c1 = 0;
753	0	mul_add_c(a[3], b[1], c2, c3, c1);
754	0	mul_add_c(a[2], b[2], c2, c3, c1);
755	0	mul_add_c(a[1], b[3], c2, c3, c1);
756	0	r[4] = c2;
757	0	c2 = 0;
758	0	mul_add_c(a[2], b[3], c3, c1, c2);
759	0	mul_add_c(a[3], b[2], c3, c1, c2);
760	0	r[5] = c3;
761	0	c3 = 0;
762	0	mul_add_c(a[3], b[3], c1, c2, c3);
763	0	r[6] = c1;
764	0	r[7] = c2;
765	0	}
766
767		void bn_sqr_comba8(BN_ULONG r, const BN_ULONG a)
768	0	{
769	0	BN_ULONG c1, c2, c3;
770
771	0	c1 = 0;
772	0	c2 = 0;
773	0	c3 = 0;
774	0	sqr_add_c(a, 0, c1, c2, c3);
775	0	r[0] = c1;
776	0	c1 = 0;
777	0	sqr_add_c2(a, 1, 0, c2, c3, c1);
778	0	r[1] = c2;
779	0	c2 = 0;
780	0	sqr_add_c(a, 1, c3, c1, c2);
781	0	sqr_add_c2(a, 2, 0, c3, c1, c2);
782	0	r[2] = c3;
783	0	c3 = 0;
784	0	sqr_add_c2(a, 3, 0, c1, c2, c3);
785	0	sqr_add_c2(a, 2, 1, c1, c2, c3);
786	0	r[3] = c1;
787	0	c1 = 0;
788	0	sqr_add_c(a, 2, c2, c3, c1);
789	0	sqr_add_c2(a, 3, 1, c2, c3, c1);
790	0	sqr_add_c2(a, 4, 0, c2, c3, c1);
791	0	r[4] = c2;
792	0	c2 = 0;
793	0	sqr_add_c2(a, 5, 0, c3, c1, c2);
794	0	sqr_add_c2(a, 4, 1, c3, c1, c2);
795	0	sqr_add_c2(a, 3, 2, c3, c1, c2);
796	0	r[5] = c3;
797	0	c3 = 0;
798	0	sqr_add_c(a, 3, c1, c2, c3);
799	0	sqr_add_c2(a, 4, 2, c1, c2, c3);
800	0	sqr_add_c2(a, 5, 1, c1, c2, c3);
801	0	sqr_add_c2(a, 6, 0, c1, c2, c3);
802	0	r[6] = c1;
803	0	c1 = 0;
804	0	sqr_add_c2(a, 7, 0, c2, c3, c1);
805	0	sqr_add_c2(a, 6, 1, c2, c3, c1);
806	0	sqr_add_c2(a, 5, 2, c2, c3, c1);
807	0	sqr_add_c2(a, 4, 3, c2, c3, c1);
808	0	r[7] = c2;
809	0	c2 = 0;
810	0	sqr_add_c(a, 4, c3, c1, c2);
811	0	sqr_add_c2(a, 5, 3, c3, c1, c2);
812	0	sqr_add_c2(a, 6, 2, c3, c1, c2);
813	0	sqr_add_c2(a, 7, 1, c3, c1, c2);
814	0	r[8] = c3;
815	0	c3 = 0;
816	0	sqr_add_c2(a, 7, 2, c1, c2, c3);
817	0	sqr_add_c2(a, 6, 3, c1, c2, c3);
818	0	sqr_add_c2(a, 5, 4, c1, c2, c3);
819	0	r[9] = c1;
820	0	c1 = 0;
821	0	sqr_add_c(a, 5, c2, c3, c1);
822	0	sqr_add_c2(a, 6, 4, c2, c3, c1);
823	0	sqr_add_c2(a, 7, 3, c2, c3, c1);
824	0	r[10] = c2;
825	0	c2 = 0;
826	0	sqr_add_c2(a, 7, 4, c3, c1, c2);
827	0	sqr_add_c2(a, 6, 5, c3, c1, c2);
828	0	r[11] = c3;
829	0	c3 = 0;
830	0	sqr_add_c(a, 6, c1, c2, c3);
831	0	sqr_add_c2(a, 7, 5, c1, c2, c3);
832	0	r[12] = c1;
833	0	c1 = 0;
834	0	sqr_add_c2(a, 7, 6, c2, c3, c1);
835	0	r[13] = c2;
836	0	c2 = 0;
837	0	sqr_add_c(a, 7, c3, c1, c2);
838	0	r[14] = c3;
839	0	r[15] = c1;
840	0	}
841
842		void bn_sqr_comba4(BN_ULONG r, const BN_ULONG a)
843	0	{
844	0	BN_ULONG c1, c2, c3;
845
846	0	c1 = 0;
847	0	c2 = 0;
848	0	c3 = 0;
849	0	sqr_add_c(a, 0, c1, c2, c3);
850	0	r[0] = c1;
851	0	c1 = 0;
852	0	sqr_add_c2(a, 1, 0, c2, c3, c1);
853	0	r[1] = c2;
854	0	c2 = 0;
855	0	sqr_add_c(a, 1, c3, c1, c2);
856	0	sqr_add_c2(a, 2, 0, c3, c1, c2);
857	0	r[2] = c3;
858	0	c3 = 0;
859	0	sqr_add_c2(a, 3, 0, c1, c2, c3);
860	0	sqr_add_c2(a, 2, 1, c1, c2, c3);
861	0	r[3] = c1;
862	0	c1 = 0;
863	0	sqr_add_c(a, 2, c2, c3, c1);
864	0	sqr_add_c2(a, 3, 1, c2, c3, c1);
865	0	r[4] = c2;
866	0	c2 = 0;
867	0	sqr_add_c2(a, 3, 2, c3, c1, c2);
868	0	r[5] = c3;
869	0	c3 = 0;
870	0	sqr_add_c(a, 3, c1, c2, c3);
871	0	r[6] = c1;
872	0	r[7] = c2;
873	0	}
874
875		#ifdef OPENSSL_NO_ASM
876		#ifdef OPENSSL_BN_ASM_MONT
877		#include <alloca.h>
878		/*
879		* This is essentially reference implementation, which may or may not
880		* result in performance improvement. E.g. on IA-32 this routine was
881		* observed to give 40% faster rsa1024 private key operations and 10%
882		* faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
883		* by 10% and worsens rsa4096 sign by 15%. Once again, it's a
884		* reference implementation, one to be used as starting point for
885		* platform-specific assembler. Mentioned numbers apply to compiler
886		* generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
887		* can vary not only from platform to platform, but even for compiler
888		* versions. Assembler vs. assembler improvement coefficients can
889		* [and are known to] differ and are to be documented elsewhere.
890		*/
891		int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
892		const BN_ULONG np, const BN_ULONG n0p, int num)
893		{
894		BN_ULONG c0, c1, ml, *tp, n0;
895		#ifdef mul64
896		BN_ULONG mh;
897		#endif
898		volatile BN_ULONG *vp;
899		int i = 0, j;
900
901		#if 0 /* template for platform-specific \
902		* implementation */
903		if (ap == bp)
904		return bn_sqr_mont(rp, ap, np, n0p, num);
905		#endif
906		vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
907
908		n0 = *n0p;
909
910		c0 = 0;
911		ml = bp[0];
912		#ifdef mul64
913		mh = HBITS(ml);
914		ml = LBITS(ml);
915		for (j = 0; j < num; ++j)
916		mul(tp[j], ap[j], ml, mh, c0);
917		#else
918		for (j = 0; j < num; ++j)
919		mul(tp[j], ap[j], ml, c0);
920		#endif
921
922		tp[num] = c0;
923		tp[num + 1] = 0;
924		goto enter;
925
926		for (i = 0; i < num; i++) {
927		c0 = 0;
928		ml = bp[i];
929		#ifdef mul64
930		mh = HBITS(ml);
931		ml = LBITS(ml);
932		for (j = 0; j < num; ++j)
933		mul_add(tp[j], ap[j], ml, mh, c0);
934		#else
935		for (j = 0; j < num; ++j)
936		mul_add(tp[j], ap[j], ml, c0);
937		#endif
938		c1 = (tp[num] + c0) & BN_MASK2;
939		tp[num] = c1;
940		tp[num + 1] = (c1 < c0 ? 1 : 0);
941		enter:
942		c1 = tp[0];
943		ml = (c1 * n0) & BN_MASK2;
944		c0 = 0;
945		#ifdef mul64
946		mh = HBITS(ml);
947		ml = LBITS(ml);
948		mul_add(c1, np[0], ml, mh, c0);
949		#else
950		mul_add(c1, ml, np[0], c0);
951		#endif
952		for (j = 1; j < num; j++) {
953		c1 = tp[j];
954		#ifdef mul64
955		mul_add(c1, np[j], ml, mh, c0);
956		#else
957		mul_add(c1, ml, np[j], c0);
958		#endif
959		tp[j - 1] = c1 & BN_MASK2;
960		}
961		c1 = (tp[num] + c0) & BN_MASK2;
962		tp[num - 1] = c1;
963		tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
964		}
965
966		if (tp[num] != 0 \|\| tp[num - 1] >= np[num - 1]) {
967		c0 = bn_sub_words(rp, tp, np, num);
968		if (tp[num] != 0 \|\| c0 == 0) {
969		for (i = 0; i < num + 2; i++)
970		vp[i] = 0;
971		return 1;
972		}
973		}
974		for (i = 0; i < num; i++)
975		rp[i] = tp[i], vp[i] = 0;
976		vp[num] = 0;
977		vp[num + 1] = 0;
978		return 1;
979		}
980		#else
981		/*
982		* Return value of 0 indicates that multiplication/convolution was not
983		* performed to signal the caller to fall down to alternative/original
984		* code-path.
985		*/
986		int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
987		const BN_ULONG np, const BN_ULONG n0, int num)
988	0	{
989	0	return 0;
990	0	}
991		#endif /* OPENSSL_BN_ASM_MONT */
992		#endif
993
994		#else /* !BN_MUL_COMBA */
995
996		/* hmm... is it faster just to do a multiply? */
997		void bn_sqr_comba4(BN_ULONG r, const BN_ULONG a)
998		{
999		BN_ULONG t[8];
1000		bn_sqr_normal(r, a, 4, t);
1001		}
1002
1003		void bn_sqr_comba8(BN_ULONG r, const BN_ULONG a)
1004		{
1005		BN_ULONG t[16];
1006		bn_sqr_normal(r, a, 8, t);
1007		}
1008
1009		void bn_mul_comba4(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
1010		{
1011		r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
1012		r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
1013		r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
1014		r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
1015		}
1016
1017		void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
1018		{
1019		r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
1020		r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
1021		r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
1022		r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
1023		r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
1024		r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
1025		r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
1026		r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
1027		}
1028
1029		#ifdef OPENSSL_NO_ASM
1030		#ifdef OPENSSL_BN_ASM_MONT
1031		#include <alloca.h>
1032		int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
1033		const BN_ULONG np, const BN_ULONG n0p, int num)
1034		{
1035		BN_ULONG c0, c1, tp, n0 = n0p;
1036		volatile BN_ULONG *vp;
1037		int i = 0, j;
1038
1039		vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
1040
1041		for (i = 0; i <= num; i++)
1042		tp[i] = 0;
1043
1044		for (i = 0; i < num; i++) {
1045		c0 = bn_mul_add_words(tp, ap, num, bp[i]);
1046		c1 = (tp[num] + c0) & BN_MASK2;
1047		tp[num] = c1;
1048		tp[num + 1] = (c1 < c0 ? 1 : 0);
1049
1050		c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
1051		c1 = (tp[num] + c0) & BN_MASK2;
1052		tp[num] = c1;
1053		tp[num + 1] += (c1 < c0 ? 1 : 0);
1054		for (j = 0; j <= num; j++)
1055		tp[j] = tp[j + 1];
1056		}
1057
1058		if (tp[num] != 0 \|\| tp[num - 1] >= np[num - 1]) {
1059		c0 = bn_sub_words(rp, tp, np, num);
1060		if (tp[num] != 0 \|\| c0 == 0) {
1061		for (i = 0; i < num + 2; i++)
1062		vp[i] = 0;
1063		return 1;
1064		}
1065		}
1066		for (i = 0; i < num; i++)
1067		rp[i] = tp[i], vp[i] = 0;
1068		vp[num] = 0;
1069		vp[num + 1] = 0;
1070		return 1;
1071		}
1072		#else
1073		int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
1074		const BN_ULONG np, const BN_ULONG n0, int num)
1075		{
1076		return 0;
1077		}
1078		#endif /* OPENSSL_BN_ASM_MONT */
1079		#endif
1080
1081		#endif /* !BN_MUL_COMBA */

Coverage Report

Created: 2025-12-10 06:24