Coverage Report

Created: 2025-07-01 06:23

/src/irssi/subprojects/openssl-1.1.1l/crypto/bn/bn_asm.c

Line | Count | Source
   1 |   | /*
   2 |   |  * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
   3 |   |  *
   4 |   |  * Licensed under the OpenSSL license (the "License").  You may not use
   5 |   |  * this file except in compliance with the License.  You can obtain a copy
   6 |   |  * in the file LICENSE in the source distribution or at
   7 |   |  * https://www.openssl.org/source/license.html
   8 |   |  */
   9 |   |
  10 |   | #include <assert.h>
  11 |   | #include <openssl/crypto.h>
  12 |   | #include "internal/cryptlib.h"
  13 |   | #include "bn_local.h"
  14 |   |
  15 |   | #if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
  16 |   |
  17 |   | BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
  18 |   |                           BN_ULONG w)
  19 |   | {
  20 |   |     BN_ULONG c1 = 0;
  21 |   |
  22 |   |     assert(num >= 0);
  23 |   |     if (num <= 0)
  24 |   |         return c1;
  25 |   |
  26 |   | # ifndef OPENSSL_SMALL_FOOTPRINT
  27 |   |     while (num & ~3) {
  28 |   |         mul_add(rp[0], ap[0], w, c1);
  29 |   |         mul_add(rp[1], ap[1], w, c1);
  30 |   |         mul_add(rp[2], ap[2], w, c1);
  31 |   |         mul_add(rp[3], ap[3], w, c1);
  32 |   |         ap += 4;
  33 |   |         rp += 4;
  34 |   |         num -= 4;
  35 |   |     }
  36 |   | # endif
  37 |   |     while (num) {
  38 |   |         mul_add(rp[0], ap[0], w, c1);
  39 |   |         ap++;
  40 |   |         rp++;
  41 |   |         num--;
  42 |   |     }
  43 |   |
  44 |   |     return c1;
  45 |   | }
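For readers skimming the report, the contract of bn_mul_add_words() can be restated in portable C. This is an illustrative sketch only (32-bit words standing in for BN_ULONG), not OpenSSL code:

#include <stdint.h>

/* rp[0..num-1] += ap[0..num-1] * w; the final carry word is returned.
 * The 64-bit intermediate cannot overflow: (2^32-1)^2 + 2*(2^32-1) < 2^64. */
static uint32_t mul_add_words_ref(uint32_t *rp, const uint32_t *ap,
                                  int num, uint32_t w)
{
    uint64_t carry = 0;

    for (int i = 0; i < num; i++) {
        uint64_t t = (uint64_t)ap[i] * w + rp[i] + carry;
        rp[i] = (uint32_t)t;
        carry = t >> 32;
    }
    return (uint32_t)carry;
}

The unrolled-by-four loop in the covered code above is only a scheduling optimization; it computes the same result as this straight loop.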
  46 |   |
  47 |   | BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
  48 |   | {
  49 |   |     BN_ULONG c1 = 0;
  50 |   |
  51 |   |     assert(num >= 0);
  52 |   |     if (num <= 0)
  53 |   |         return c1;
  54 |   |
  55 |   | # ifndef OPENSSL_SMALL_FOOTPRINT
  56 |   |     while (num & ~3) {
  57 |   |         mul(rp[0], ap[0], w, c1);
  58 |   |         mul(rp[1], ap[1], w, c1);
  59 |   |         mul(rp[2], ap[2], w, c1);
  60 |   |         mul(rp[3], ap[3], w, c1);
  61 |   |         ap += 4;
  62 |   |         rp += 4;
  63 |   |         num -= 4;
  64 |   |     }
  65 |   | # endif
  66 |   |     while (num) {
  67 |   |         mul(rp[0], ap[0], w, c1);
  68 |   |         ap++;
  69 |   |         rp++;
  70 |   |         num--;
  71 |   |     }
  72 |   |     return c1;
  73 |   | }
  74 |   |
  75 |   | void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
  76 |   | {
  77 |   |     assert(n >= 0);
  78 |   |     if (n <= 0)
  79 |   |         return;
  80 |   |
  81 |   | # ifndef OPENSSL_SMALL_FOOTPRINT
  82 |   |     while (n & ~3) {
  83 |   |         sqr(r[0], r[1], a[0]);
  84 |   |         sqr(r[2], r[3], a[1]);
  85 |   |         sqr(r[4], r[5], a[2]);
  86 |   |         sqr(r[6], r[7], a[3]);
  87 |   |         a += 4;
  88 |   |         r += 8;
  89 |   |         n -= 4;
  90 |   |     }
  91 |   | # endif
  92 |   |     while (n) {
  93 |   |         sqr(r[0], r[1], a[0]);
  94 |   |         a++;
  95 |   |         r += 2;
  96 |   |         n--;
  97 |   |     }
  98 |   | }
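Similarly, bn_sqr_words() writes the double-word square of each input word. A minimal restatement, again with 32-bit stand-in words and purely illustrative:

#include <stdint.h>

/* For i in [0, n): r[2*i] gets the low word of a[i]^2, r[2*i+1] the high word,
 * so r must have room for 2*n words. */
static void sqr_words_ref(uint32_t *r, const uint32_t *a, int n)
{
    for (int i = 0; i < n; i++) {
        uint64_t t = (uint64_t)a[i] * a[i];
        r[2 * i] = (uint32_t)t;
        r[2 * i + 1] = (uint32_t)(t >> 32);
    }
}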
  99 |   |
 100 |   | #else                           /* !(defined(BN_LLONG) ||
 101 |   |                                  * defined(BN_UMULT_HIGH)) */
 102 |   |
 103 |   | BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
 104 |   |                           BN_ULONG w)
 105 | 0 | {
 106 | 0 |     BN_ULONG c = 0;
 107 | 0 |     BN_ULONG bl, bh;
 108 |   |
 109 | 0 |     assert(num >= 0);
 110 | 0 |     if (num <= 0)
 111 | 0 |         return (BN_ULONG)0;
 112 |   |
 113 | 0 |     bl = LBITS(w);
 114 | 0 |     bh = HBITS(w);
 115 |   |
 116 | 0 | # ifndef OPENSSL_SMALL_FOOTPRINT
 117 | 0 |     while (num & ~3) {
 118 | 0 |         mul_add(rp[0], ap[0], bl, bh, c);
 119 | 0 |         mul_add(rp[1], ap[1], bl, bh, c);
 120 | 0 |         mul_add(rp[2], ap[2], bl, bh, c);
 121 | 0 |         mul_add(rp[3], ap[3], bl, bh, c);
 122 | 0 |         ap += 4;
 123 | 0 |         rp += 4;
 124 | 0 |         num -= 4;
 125 | 0 |     }
 126 | 0 | # endif
 127 | 0 |     while (num) {
 128 | 0 |         mul_add(rp[0], ap[0], bl, bh, c);
 129 | 0 |         ap++;
 130 | 0 |         rp++;
 131 | 0 |         num--;
 132 | 0 |     }
 133 | 0 |     return c;
 134 | 0 | }
 135 |   |
 136 |   | BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 137 | 0 | {
 138 | 0 |     BN_ULONG carry = 0;
 139 | 0 |     BN_ULONG bl, bh;
 140 |   |
 141 | 0 |     assert(num >= 0);
 142 | 0 |     if (num <= 0)
 143 | 0 |         return (BN_ULONG)0;
 144 |   |
 145 | 0 |     bl = LBITS(w);
 146 | 0 |     bh = HBITS(w);
 147 |   |
 148 | 0 | # ifndef OPENSSL_SMALL_FOOTPRINT
 149 | 0 |     while (num & ~3) {
 150 | 0 |         mul(rp[0], ap[0], bl, bh, carry);
 151 | 0 |         mul(rp[1], ap[1], bl, bh, carry);
 152 | 0 |         mul(rp[2], ap[2], bl, bh, carry);
 153 | 0 |         mul(rp[3], ap[3], bl, bh, carry);
 154 | 0 |         ap += 4;
 155 | 0 |         rp += 4;
 156 | 0 |         num -= 4;
 157 | 0 |     }
 158 | 0 | # endif
 159 | 0 |     while (num) {
 160 | 0 |         mul(rp[0], ap[0], bl, bh, carry);
 161 | 0 |         ap++;
 162 | 0 |         rp++;
 163 | 0 |         num--;
 164 | 0 |     }
 165 | 0 |     return carry;
 166 | 0 | }
 167 |   |
 168 |   | void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
 169 | 0 | {
 170 | 0 |     assert(n >= 0);
 171 | 0 |     if (n <= 0)
 172 | 0 |         return;
 173 |   |
 174 | 0 | # ifndef OPENSSL_SMALL_FOOTPRINT
 175 | 0 |     while (n & ~3) {
 176 | 0 |         sqr64(r[0], r[1], a[0]);
 177 | 0 |         sqr64(r[2], r[3], a[1]);
 178 | 0 |         sqr64(r[4], r[5], a[2]);
 179 | 0 |         sqr64(r[6], r[7], a[3]);
 180 | 0 |         a += 4;
 181 | 0 |         r += 8;
 182 | 0 |         n -= 4;
 183 | 0 |     }
 184 | 0 | # endif
 185 | 0 |     while (n) {
 186 | 0 |         sqr64(r[0], r[1], a[0]);
 187 | 0 |         a++;
 188 | 0 |         r += 2;
 189 | 0 |         n--;
 190 | 0 |     }
 191 | 0 | }
 192 |   |
 193 |   | #endif                          /* !(defined(BN_LLONG) ||
 194 |   |                                  * defined(BN_UMULT_HIGH)) */
 195 |   |
 196 |   | #if defined(BN_LLONG) && defined(BN_DIV2W)
 197 |   |
 198 |   | BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 199 |   | {
 200 |   |     return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d));
 201 |   | }
 202 |   |
 203 |   | #else
 204 |   |
 205 |   | /* Divide h,l by d and return the result. */
 206 |   | /* I need to test this some more :-( */
 207 |   | BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 208 | 0 | {
 209 | 0 |     BN_ULONG dh, dl, q, ret = 0, th, tl, t;
 210 | 0 |     int i, count = 2;
 211 |   |
 212 | 0 |     if (d == 0)
 213 | 0 |         return BN_MASK2;
 214 |   |
 215 | 0 |     i = BN_num_bits_word(d);
 216 | 0 |     assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
 217 |   |
 218 | 0 |     i = BN_BITS2 - i;
 219 | 0 |     if (h >= d)
 220 | 0 |         h -= d;
 221 |   |
 222 | 0 |     if (i) {
 223 | 0 |         d <<= i;
 224 | 0 |         h = (h << i) | (l >> (BN_BITS2 - i));
 225 | 0 |         l <<= i;
 226 | 0 |     }
 227 | 0 |     dh = (d & BN_MASK2h) >> BN_BITS4;
 228 | 0 |     dl = (d & BN_MASK2l);
 229 | 0 |     for (;;) {
 230 | 0 |         if ((h >> BN_BITS4) == dh)
 231 | 0 |             q = BN_MASK2l;
 232 | 0 |         else
 233 | 0 |             q = h / dh;
 234 |   |
 235 | 0 |         th = q * dh;
 236 | 0 |         tl = dl * q;
 237 | 0 |         for (;;) {
 238 | 0 |             t = h - th;
 239 | 0 |             if ((t & BN_MASK2h) ||
 240 | 0 |                 ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4))))
 241 | 0 |                 break;
 242 | 0 |             q--;
 243 | 0 |             th -= dh;
 244 | 0 |             tl -= dl;
 245 | 0 |         }
 246 | 0 |         t = (tl >> BN_BITS4);
 247 | 0 |         tl = (tl << BN_BITS4) & BN_MASK2h;
 248 | 0 |         th += t;
 249 |   |
 250 | 0 |         if (l < tl)
 251 | 0 |             th++;
 252 | 0 |         l -= tl;
 253 | 0 |         if (h < th) {
 254 | 0 |             h += d;
 255 | 0 |             q--;
 256 | 0 |         }
 257 | 0 |         h -= th;
 258 |   |
 259 | 0 |         if (--count == 0)
 260 | 0 |             break;
 261 |   |
 262 | 0 |         ret = q << BN_BITS4;
 263 | 0 |         h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
 264 | 0 |         l = (l & BN_MASK2l) << BN_BITS4;
 265 | 0 |     }
 266 | 0 |     ret |= q;
 267 | 0 |     return ret;
 268 | 0 | }
 269 |   | #endif                          /* !defined(BN_LLONG) && defined(BN_DIV2W) */
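The fallback bn_div_words() above computes the single-word quotient of the double word (h,l) divided by d. A small self-contained check of that contract, assuming a 64-bit BN_ULONG and a compiler that provides unsigned __int128 (both assumptions on my part, not stated in the report):

#include <stdint.h>
#include <stdio.h>

/* Reference quotient: floor(((h << 64) | l) / d).  The result only fits in a
 * single 64-bit word when h < d, which is the caller's usual precondition. */
static uint64_t div_words_ref(uint64_t h, uint64_t l, uint64_t d)
{
    unsigned __int128 n = ((unsigned __int128)h << 64) | l;
    return (uint64_t)(n / d);
}

int main(void)
{
    uint64_t h = 0x1, l = 0x23456789abcdef00ULL, d = 0x1000000000ULL;
    /* bn_div_words(h, l, d) is expected to return the same quotient. */
    printf("%llx\n", (unsigned long long)div_words_ref(h, l, d));
    return 0;
}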
 270 |   |
 271 |   | #ifdef BN_LLONG
 272 |   | BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 273 |   |                       int n)
 274 |   | {
 275 |   |     BN_ULLONG ll = 0;
 276 |   |
 277 |   |     assert(n >= 0);
 278 |   |     if (n <= 0)
 279 |   |         return (BN_ULONG)0;
 280 |   |
 281 |   | # ifndef OPENSSL_SMALL_FOOTPRINT
 282 |   |     while (n & ~3) {
 283 |   |         ll += (BN_ULLONG) a[0] + b[0];
 284 |   |         r[0] = (BN_ULONG)ll & BN_MASK2;
 285 |   |         ll >>= BN_BITS2;
 286 |   |         ll += (BN_ULLONG) a[1] + b[1];
 287 |   |         r[1] = (BN_ULONG)ll & BN_MASK2;
 288 |   |         ll >>= BN_BITS2;
 289 |   |         ll += (BN_ULLONG) a[2] + b[2];
 290 |   |         r[2] = (BN_ULONG)ll & BN_MASK2;
 291 |   |         ll >>= BN_BITS2;
 292 |   |         ll += (BN_ULLONG) a[3] + b[3];
 293 |   |         r[3] = (BN_ULONG)ll & BN_MASK2;
 294 |   |         ll >>= BN_BITS2;
 295 |   |         a += 4;
 296 |   |         b += 4;
 297 |   |         r += 4;
 298 |   |         n -= 4;
 299 |   |     }
 300 |   | # endif
 301 |   |     while (n) {
 302 |   |         ll += (BN_ULLONG) a[0] + b[0];
 303 |   |         r[0] = (BN_ULONG)ll & BN_MASK2;
 304 |   |         ll >>= BN_BITS2;
 305 |   |         a++;
 306 |   |         b++;
 307 |   |         r++;
 308 |   |         n--;
 309 |   |     }
 310 |   |     return (BN_ULONG)ll;
 311 |   | }
 312 |   | #else                           /* !BN_LLONG */
 313 |   | BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 314 |   |                       int n)
 315 | 0 | {
 316 | 0 |     BN_ULONG c, l, t;
 317 |   |
 318 | 0 |     assert(n >= 0);
 319 | 0 |     if (n <= 0)
 320 | 0 |         return (BN_ULONG)0;
 321 |   |
 322 | 0 |     c = 0;
 323 | 0 | # ifndef OPENSSL_SMALL_FOOTPRINT
 324 | 0 |     while (n & ~3) {
 325 | 0 |         t = a[0];
 326 | 0 |         t = (t + c) & BN_MASK2;
 327 | 0 |         c = (t < c);
 328 | 0 |         l = (t + b[0]) & BN_MASK2;
 329 | 0 |         c += (l < t);
 330 | 0 |         r[0] = l;
 331 | 0 |         t = a[1];
 332 | 0 |         t = (t + c) & BN_MASK2;
 333 | 0 |         c = (t < c);
 334 | 0 |         l = (t + b[1]) & BN_MASK2;
 335 | 0 |         c += (l < t);
 336 | 0 |         r[1] = l;
 337 | 0 |         t = a[2];
 338 | 0 |         t = (t + c) & BN_MASK2;
 339 | 0 |         c = (t < c);
 340 | 0 |         l = (t + b[2]) & BN_MASK2;
 341 | 0 |         c += (l < t);
 342 | 0 |         r[2] = l;
 343 | 0 |         t = a[3];
 344 | 0 |         t = (t + c) & BN_MASK2;
 345 | 0 |         c = (t < c);
 346 | 0 |         l = (t + b[3]) & BN_MASK2;
 347 | 0 |         c += (l < t);
 348 | 0 |         r[3] = l;
 349 | 0 |         a += 4;
 350 | 0 |         b += 4;
 351 | 0 |         r += 4;
 352 | 0 |         n -= 4;
 353 | 0 |     }
 354 | 0 | # endif
 355 | 0 |     while (n) {
 356 | 0 |         t = a[0];
 357 | 0 |         t = (t + c) & BN_MASK2;
 358 | 0 |         c = (t < c);
 359 | 0 |         l = (t + b[0]) & BN_MASK2;
 360 | 0 |         c += (l < t);
 361 | 0 |         r[0] = l;
 362 | 0 |         a++;
 363 | 0 |         b++;
 364 | 0 |         r++;
 365 | 0 |         n--;
 366 | 0 |     }
 367 | 0 |     return (BN_ULONG)c;
 368 | 0 | }
 369 |   | #endif                          /* !BN_LLONG */
 370 |   |
 371 |   | BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 372 |   |                       int n)
 373 | 0 | {
 374 | 0 |     BN_ULONG t1, t2;
 375 | 0 |     int c = 0;
 376 |   |
 377 | 0 |     assert(n >= 0);
 378 | 0 |     if (n <= 0)
 379 | 0 |         return (BN_ULONG)0;
 380 |   |
 381 | 0 | #ifndef OPENSSL_SMALL_FOOTPRINT
 382 | 0 |     while (n & ~3) {
 383 | 0 |         t1 = a[0];
 384 | 0 |         t2 = b[0];
 385 | 0 |         r[0] = (t1 - t2 - c) & BN_MASK2;
 386 | 0 |         if (t1 != t2)
 387 | 0 |             c = (t1 < t2);
 388 | 0 |         t1 = a[1];
 389 | 0 |         t2 = b[1];
 390 | 0 |         r[1] = (t1 - t2 - c) & BN_MASK2;
 391 | 0 |         if (t1 != t2)
 392 | 0 |             c = (t1 < t2);
 393 | 0 |         t1 = a[2];
 394 | 0 |         t2 = b[2];
 395 | 0 |         r[2] = (t1 - t2 - c) & BN_MASK2;
 396 | 0 |         if (t1 != t2)
 397 | 0 |             c = (t1 < t2);
 398 | 0 |         t1 = a[3];
 399 | 0 |         t2 = b[3];
 400 | 0 |         r[3] = (t1 - t2 - c) & BN_MASK2;
 401 | 0 |         if (t1 != t2)
 402 | 0 |             c = (t1 < t2);
 403 | 0 |         a += 4;
 404 | 0 |         b += 4;
 405 | 0 |         r += 4;
 406 | 0 |         n -= 4;
 407 | 0 |     }
 408 | 0 | #endif
 409 | 0 |     while (n) {
 410 | 0 |         t1 = a[0];
 411 | 0 |         t2 = b[0];
 412 | 0 |         r[0] = (t1 - t2 - c) & BN_MASK2;
 413 | 0 |         if (t1 != t2)
 414 | 0 |             c = (t1 < t2);
 415 | 0 |         a++;
 416 | 0 |         b++;
 417 | 0 |         r++;
 418 | 0 |         n--;
 419 | 0 |     }
 420 | 0 |     return c;
 421 | 0 | }
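One subtlety in bn_sub_words() worth spelling out: the borrow is updated only when t1 != t2, because equal words simply propagate the incoming borrow. A single-step illustration (32-bit stand-in words, not the OpenSSL code):

#include <stdint.h>

/* Returns the borrow out of (t1 - t2 - c), using the same rule as the loop
 * above: if t1 == t2 the result is 0 - c, so the borrow out equals the borrow
 * in and c can be left untouched; otherwise it is simply (t1 < t2). */
static uint32_t sub_with_borrow(uint32_t t1, uint32_t t2, uint32_t c,
                                uint32_t *out)
{
    *out = t1 - t2 - c;          /* wraps mod 2^32, like "& BN_MASK2" */
    if (t1 != t2)
        c = (t1 < t2);
    return c;
}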
 422 |   |
 423 |   | #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
 424 |   |
 425 |   | # undef bn_mul_comba8
 426 |   | # undef bn_mul_comba4
 427 |   | # undef bn_sqr_comba8
 428 |   | # undef bn_sqr_comba4
 429 |   |
 430 |   | /* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
 431 |   | /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
 432 |   | /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
 433 |   | /*
 434 |   |  * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 435 |   |  * c=(c2,c1,c0)
 436 |   |  */
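The comment above describes a three-word accumulator c = (c2,c1,c0). As a concrete illustration of what mul_add_c() does (this is not the OpenSSL macro itself; 32-bit words and a 64-bit product are assumed):

#include <stdint.h>

/* c += a*b, where c is the three-word value (c2,c1,c0).  Adding c0 to the
 * 64-bit product cannot overflow, because the product's high half is at most
 * 2^32 - 2, which is exactly the "cannot be all-ones" remark below. */
static void mul_add_c_demo(uint32_t a, uint32_t b,
                           uint32_t *c0, uint32_t *c1, uint32_t *c2)
{
    uint64_t t = (uint64_t)a * b + *c0;
    uint32_t hi = (uint32_t)(t >> 32);

    *c0 = (uint32_t)t;
    *c1 += hi;
    if (*c1 < hi)               /* carry out of c1 ripples into c2 */
        (*c2)++;
}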
 437 |   |
 438 |   | # ifdef BN_LLONG
 439 |   | /*
 440 |   |  * Keep in mind that additions to multiplication result can not
 441 |   |  * overflow, because its high half cannot be all-ones.
 442 |   |  */
 443 |   | #  define mul_add_c(a,b,c0,c1,c2)       do {    \
 444 |   |         BN_ULONG hi;                            \
 445 |   |         BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
 446 |   |         t += c0;                /* no carry */  \
 447 |   |         c0 = (BN_ULONG)Lw(t);                   \
 448 |   |         hi = (BN_ULONG)Hw(t);                   \
 449 |   |         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
 450 |   |         } while(0)
 451 |   |
 452 |   | #  define mul_add_c2(a,b,c0,c1,c2)      do {    \
 453 |   |         BN_ULONG hi;                            \
 454 |   |         BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
 455 |   |         BN_ULLONG tt = t+c0;    /* no carry */  \
 456 |   |         c0 = (BN_ULONG)Lw(tt);                  \
 457 |   |         hi = (BN_ULONG)Hw(tt);                  \
 458 |   |         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
 459 |   |         t += c0;                /* no carry */  \
 460 |   |         c0 = (BN_ULONG)Lw(t);                   \
 461 |   |         hi = (BN_ULONG)Hw(t);                   \
 462 |   |         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
 463 |   |         } while(0)
 464 |   |
 465 |   | #  define sqr_add_c(a,i,c0,c1,c2)       do {    \
 466 |   |         BN_ULONG hi;                            \
 467 |   |         BN_ULLONG t = (BN_ULLONG)a[i]*a[i];     \
 468 |   |         t += c0;                /* no carry */  \
 469 |   |         c0 = (BN_ULONG)Lw(t);                   \
 470 |   |         hi = (BN_ULONG)Hw(t);                   \
 471 |   |         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
 472 |   |         } while(0)
 473 |   |
 474 |   | #  define sqr_add_c2(a,i,j,c0,c1,c2) \
 475 |   |         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 476 |   |
 477 |   | # elif defined(BN_UMULT_LOHI)
 478 |   | /*
 479 |   |  * Keep in mind that additions to hi can not overflow, because
 480 |   |  * the high word of a multiplication result cannot be all-ones.
 481 |   |  */
 482 |   | #  define mul_add_c(a,b,c0,c1,c2)       do {    \
 483 |   |         BN_ULONG ta = (a), tb = (b);            \
 484 |   |         BN_ULONG lo, hi;                        \
 485 |   |         BN_UMULT_LOHI(lo,hi,ta,tb);             \
 486 |   |         c0 += lo; hi += (c0<lo)?1:0;            \
 487 |   |         c1 += hi; c2 += (c1<hi)?1:0;            \
 488 |   |         } while(0)
 489 |   |
 490 |   | #  define mul_add_c2(a,b,c0,c1,c2)      do {    \
 491 |   |         BN_ULONG ta = (a), tb = (b);            \
 492 |   |         BN_ULONG lo, hi, tt;                    \
 493 |   |         BN_UMULT_LOHI(lo,hi,ta,tb);             \
 494 |   |         c0 += lo; tt = hi+((c0<lo)?1:0);        \
 495 |   |         c1 += tt; c2 += (c1<tt)?1:0;            \
 496 |   |         c0 += lo; hi += (c0<lo)?1:0;            \
 497 |   |         c1 += hi; c2 += (c1<hi)?1:0;            \
 498 |   |         } while(0)
 499 |   |
 500 |   | #  define sqr_add_c(a,i,c0,c1,c2)       do {    \
 501 |   |         BN_ULONG ta = (a)[i];                   \
 502 |   |         BN_ULONG lo, hi;                        \
 503 |   |         BN_UMULT_LOHI(lo,hi,ta,ta);             \
 504 |   |         c0 += lo; hi += (c0<lo)?1:0;            \
 505 |   |         c1 += hi; c2 += (c1<hi)?1:0;            \
 506 |   |         } while(0)
 507 |   |
 508 |   | #  define sqr_add_c2(a,i,j,c0,c1,c2)    \
 509 |   |         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 510 |   |
 511 |   | # elif defined(BN_UMULT_HIGH)
 512 |   | /*
 513 |   |  * Keep in mind that additions to hi can not overflow, because
 514 |   |  * the high word of a multiplication result cannot be all-ones.
 515 |   |  */
 516 |   | #  define mul_add_c(a,b,c0,c1,c2)       do {    \
 517 |   |         BN_ULONG ta = (a), tb = (b);            \
 518 |   |         BN_ULONG lo = ta * tb;                  \
 519 |   |         BN_ULONG hi = BN_UMULT_HIGH(ta,tb);     \
 520 |   |         c0 += lo; hi += (c0<lo)?1:0;            \
 521 |   |         c1 += hi; c2 += (c1<hi)?1:0;            \
 522 |   |         } while(0)
 523 |   |
 524 |   | #  define mul_add_c2(a,b,c0,c1,c2)      do {    \
 525 |   |         BN_ULONG ta = (a), tb = (b), tt;        \
 526 |   |         BN_ULONG lo = ta * tb;                  \
 527 |   |         BN_ULONG hi = BN_UMULT_HIGH(ta,tb);     \
 528 |   |         c0 += lo; tt = hi + ((c0<lo)?1:0);      \
 529 |   |         c1 += tt; c2 += (c1<tt)?1:0;            \
 530 |   |         c0 += lo; hi += (c0<lo)?1:0;            \
 531 |   |         c1 += hi; c2 += (c1<hi)?1:0;            \
 532 |   |         } while(0)
 533 |   |
 534 |   | #  define sqr_add_c(a,i,c0,c1,c2)       do {    \
 535 |   |         BN_ULONG ta = (a)[i];                   \
 536 |   |         BN_ULONG lo = ta * ta;                  \
 537 |   |         BN_ULONG hi = BN_UMULT_HIGH(ta,ta);     \
 538 |   |         c0 += lo; hi += (c0<lo)?1:0;            \
 539 |   |         c1 += hi; c2 += (c1<hi)?1:0;            \
 540 |   |         } while(0)
 541 |   |
 542 |   | #  define sqr_add_c2(a,i,j,c0,c1,c2)      \
 543 |   |         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 544 |   |
 545 |   | # else                          /* !BN_LLONG */
 546 |   | /*
 547 |   |  * Keep in mind that additions to hi can not overflow, because
 548 |   |  * the high word of a multiplication result cannot be all-ones.
 549 |   |  */
 550 | 0 | #  define mul_add_c(a,b,c0,c1,c2)       do {    \
 551 | 0 |         BN_ULONG lo = LBITS(a), hi = HBITS(a);  \
 552 | 0 |         BN_ULONG bl = LBITS(b), bh = HBITS(b);  \
 553 | 0 |         mul64(lo,hi,bl,bh);                     \
 554 | 0 |         c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
 555 | 0 |         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
 556 | 0 |         } while(0)
 557 |   |
 558 | 0 | #  define mul_add_c2(a,b,c0,c1,c2)      do {    \
 559 | 0 |         BN_ULONG tt;                            \
 560 | 0 |         BN_ULONG lo = LBITS(a), hi = HBITS(a);  \
 561 | 0 |         BN_ULONG bl = LBITS(b), bh = HBITS(b);  \
 562 | 0 |         mul64(lo,hi,bl,bh);                     \
 563 | 0 |         tt = hi;                                \
 564 | 0 |         c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \
 565 | 0 |         c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \
 566 | 0 |         c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
 567 | 0 |         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
 568 | 0 |         } while(0)
 569 |   |
 570 | 0 | #  define sqr_add_c(a,i,c0,c1,c2)       do {    \
 571 | 0 |         BN_ULONG lo, hi;                        \
 572 | 0 |         sqr64(lo,hi,(a)[i]);                    \
 573 | 0 |         c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
 574 | 0 |         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
 575 | 0 |         } while(0)
 576 |   |
 577 |   | #  define sqr_add_c2(a,i,j,c0,c1,c2) \
 578 | 0 |         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 579 |   | # endif                         /* !BN_LLONG */
 580 |   |
 581 |   | void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 582 | 0 | {
 583 | 0 |     BN_ULONG c1, c2, c3;
 584 |   |
 585 | 0 |     c1 = 0;
 586 | 0 |     c2 = 0;
 587 | 0 |     c3 = 0;
 588 | 0 |     mul_add_c(a[0], b[0], c1, c2, c3);
 589 | 0 |     r[0] = c1;
 590 | 0 |     c1 = 0;
 591 | 0 |     mul_add_c(a[0], b[1], c2, c3, c1);
 592 | 0 |     mul_add_c(a[1], b[0], c2, c3, c1);
 593 | 0 |     r[1] = c2;
 594 | 0 |     c2 = 0;
 595 | 0 |     mul_add_c(a[2], b[0], c3, c1, c2);
 596 | 0 |     mul_add_c(a[1], b[1], c3, c1, c2);
 597 | 0 |     mul_add_c(a[0], b[2], c3, c1, c2);
 598 | 0 |     r[2] = c3;
 599 | 0 |     c3 = 0;
 600 | 0 |     mul_add_c(a[0], b[3], c1, c2, c3);
 601 | 0 |     mul_add_c(a[1], b[2], c1, c2, c3);
 602 | 0 |     mul_add_c(a[2], b[1], c1, c2, c3);
 603 | 0 |     mul_add_c(a[3], b[0], c1, c2, c3);
 604 | 0 |     r[3] = c1;
 605 | 0 |     c1 = 0;
 606 | 0 |     mul_add_c(a[4], b[0], c2, c3, c1);
 607 | 0 |     mul_add_c(a[3], b[1], c2, c3, c1);
 608 | 0 |     mul_add_c(a[2], b[2], c2, c3, c1);
 609 | 0 |     mul_add_c(a[1], b[3], c2, c3, c1);
 610 | 0 |     mul_add_c(a[0], b[4], c2, c3, c1);
 611 | 0 |     r[4] = c2;
 612 | 0 |     c2 = 0;
 613 | 0 |     mul_add_c(a[0], b[5], c3, c1, c2);
 614 | 0 |     mul_add_c(a[1], b[4], c3, c1, c2);
 615 | 0 |     mul_add_c(a[2], b[3], c3, c1, c2);
 616 | 0 |     mul_add_c(a[3], b[2], c3, c1, c2);
 617 | 0 |     mul_add_c(a[4], b[1], c3, c1, c2);
 618 | 0 |     mul_add_c(a[5], b[0], c3, c1, c2);
 619 | 0 |     r[5] = c3;
 620 | 0 |     c3 = 0;
 621 | 0 |     mul_add_c(a[6], b[0], c1, c2, c3);
 622 | 0 |     mul_add_c(a[5], b[1], c1, c2, c3);
 623 | 0 |     mul_add_c(a[4], b[2], c1, c2, c3);
 624 | 0 |     mul_add_c(a[3], b[3], c1, c2, c3);
 625 | 0 |     mul_add_c(a[2], b[4], c1, c2, c3);
 626 | 0 |     mul_add_c(a[1], b[5], c1, c2, c3);
 627 | 0 |     mul_add_c(a[0], b[6], c1, c2, c3);
 628 | 0 |     r[6] = c1;
 629 | 0 |     c1 = 0;
 630 | 0 |     mul_add_c(a[0], b[7], c2, c3, c1);
 631 | 0 |     mul_add_c(a[1], b[6], c2, c3, c1);
 632 | 0 |     mul_add_c(a[2], b[5], c2, c3, c1);
 633 | 0 |     mul_add_c(a[3], b[4], c2, c3, c1);
 634 | 0 |     mul_add_c(a[4], b[3], c2, c3, c1);
 635 | 0 |     mul_add_c(a[5], b[2], c2, c3, c1);
 636 | 0 |     mul_add_c(a[6], b[1], c2, c3, c1);
 637 | 0 |     mul_add_c(a[7], b[0], c2, c3, c1);
 638 | 0 |     r[7] = c2;
 639 | 0 |     c2 = 0;
 640 | 0 |     mul_add_c(a[7], b[1], c3, c1, c2);
 641 | 0 |     mul_add_c(a[6], b[2], c3, c1, c2);
 642 | 0 |     mul_add_c(a[5], b[3], c3, c1, c2);
 643 | 0 |     mul_add_c(a[4], b[4], c3, c1, c2);
 644 | 0 |     mul_add_c(a[3], b[5], c3, c1, c2);
 645 | 0 |     mul_add_c(a[2], b[6], c3, c1, c2);
 646 | 0 |     mul_add_c(a[1], b[7], c3, c1, c2);
 647 | 0 |     r[8] = c3;
 648 | 0 |     c3 = 0;
 649 | 0 |     mul_add_c(a[2], b[7], c1, c2, c3);
 650 | 0 |     mul_add_c(a[3], b[6], c1, c2, c3);
 651 | 0 |     mul_add_c(a[4], b[5], c1, c2, c3);
 652 | 0 |     mul_add_c(a[5], b[4], c1, c2, c3);
 653 | 0 |     mul_add_c(a[6], b[3], c1, c2, c3);
 654 | 0 |     mul_add_c(a[7], b[2], c1, c2, c3);
 655 | 0 |     r[9] = c1;
 656 | 0 |     c1 = 0;
 657 | 0 |     mul_add_c(a[7], b[3], c2, c3, c1);
 658 | 0 |     mul_add_c(a[6], b[4], c2, c3, c1);
 659 | 0 |     mul_add_c(a[5], b[5], c2, c3, c1);
 660 | 0 |     mul_add_c(a[4], b[6], c2, c3, c1);
 661 | 0 |     mul_add_c(a[3], b[7], c2, c3, c1);
 662 | 0 |     r[10] = c2;
 663 | 0 |     c2 = 0;
 664 | 0 |     mul_add_c(a[4], b[7], c3, c1, c2);
 665 | 0 |     mul_add_c(a[5], b[6], c3, c1, c2);
 666 | 0 |     mul_add_c(a[6], b[5], c3, c1, c2);
 667 | 0 |     mul_add_c(a[7], b[4], c3, c1, c2);
 668 | 0 |     r[11] = c3;
 669 | 0 |     c3 = 0;
 670 | 0 |     mul_add_c(a[7], b[5], c1, c2, c3);
 671 | 0 |     mul_add_c(a[6], b[6], c1, c2, c3);
 672 | 0 |     mul_add_c(a[5], b[7], c1, c2, c3);
 673 | 0 |     r[12] = c1;
 674 | 0 |     c1 = 0;
 675 | 0 |     mul_add_c(a[6], b[7], c2, c3, c1);
 676 | 0 |     mul_add_c(a[7], b[6], c2, c3, c1);
 677 | 0 |     r[13] = c2;
 678 | 0 |     c2 = 0;
 679 | 0 |     mul_add_c(a[7], b[7], c3, c1, c2);
 680 | 0 |     r[14] = c3;
 681 | 0 |     r[15] = c1;
 682 | 0 | }
 683 |   |
 684 |   | void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 685 | 0 | {
 686 | 0 |     BN_ULONG c1, c2, c3;
 687 |   |
 688 | 0 |     c1 = 0;
 689 | 0 |     c2 = 0;
 690 | 0 |     c3 = 0;
 691 | 0 |     mul_add_c(a[0], b[0], c1, c2, c3);
 692 | 0 |     r[0] = c1;
 693 | 0 |     c1 = 0;
 694 | 0 |     mul_add_c(a[0], b[1], c2, c3, c1);
 695 | 0 |     mul_add_c(a[1], b[0], c2, c3, c1);
 696 | 0 |     r[1] = c2;
 697 | 0 |     c2 = 0;
 698 | 0 |     mul_add_c(a[2], b[0], c3, c1, c2);
 699 | 0 |     mul_add_c(a[1], b[1], c3, c1, c2);
 700 | 0 |     mul_add_c(a[0], b[2], c3, c1, c2);
 701 | 0 |     r[2] = c3;
 702 | 0 |     c3 = 0;
 703 | 0 |     mul_add_c(a[0], b[3], c1, c2, c3);
 704 | 0 |     mul_add_c(a[1], b[2], c1, c2, c3);
 705 | 0 |     mul_add_c(a[2], b[1], c1, c2, c3);
 706 | 0 |     mul_add_c(a[3], b[0], c1, c2, c3);
 707 | 0 |     r[3] = c1;
 708 | 0 |     c1 = 0;
 709 | 0 |     mul_add_c(a[3], b[1], c2, c3, c1);
 710 | 0 |     mul_add_c(a[2], b[2], c2, c3, c1);
 711 | 0 |     mul_add_c(a[1], b[3], c2, c3, c1);
 712 | 0 |     r[4] = c2;
 713 | 0 |     c2 = 0;
 714 | 0 |     mul_add_c(a[2], b[3], c3, c1, c2);
 715 | 0 |     mul_add_c(a[3], b[2], c3, c1, c2);
 716 | 0 |     r[5] = c3;
 717 | 0 |     c3 = 0;
 718 | 0 |     mul_add_c(a[3], b[3], c1, c2, c3);
 719 | 0 |     r[6] = c1;
 720 | 0 |     r[7] = c2;
 721 | 0 | }
 722 |   |
 723 |   | void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
 724 | 0 | {
 725 | 0 |     BN_ULONG c1, c2, c3;
 726 |   |
 727 | 0 |     c1 = 0;
 728 | 0 |     c2 = 0;
 729 | 0 |     c3 = 0;
 730 | 0 |     sqr_add_c(a, 0, c1, c2, c3);
 731 | 0 |     r[0] = c1;
 732 | 0 |     c1 = 0;
 733 | 0 |     sqr_add_c2(a, 1, 0, c2, c3, c1);
 734 | 0 |     r[1] = c2;
 735 | 0 |     c2 = 0;
 736 | 0 |     sqr_add_c(a, 1, c3, c1, c2);
 737 | 0 |     sqr_add_c2(a, 2, 0, c3, c1, c2);
 738 | 0 |     r[2] = c3;
 739 | 0 |     c3 = 0;
 740 | 0 |     sqr_add_c2(a, 3, 0, c1, c2, c3);
 741 | 0 |     sqr_add_c2(a, 2, 1, c1, c2, c3);
 742 | 0 |     r[3] = c1;
 743 | 0 |     c1 = 0;
 744 | 0 |     sqr_add_c(a, 2, c2, c3, c1);
 745 | 0 |     sqr_add_c2(a, 3, 1, c2, c3, c1);
 746 | 0 |     sqr_add_c2(a, 4, 0, c2, c3, c1);
 747 | 0 |     r[4] = c2;
 748 | 0 |     c2 = 0;
 749 | 0 |     sqr_add_c2(a, 5, 0, c3, c1, c2);
 750 | 0 |     sqr_add_c2(a, 4, 1, c3, c1, c2);
 751 | 0 |     sqr_add_c2(a, 3, 2, c3, c1, c2);
 752 | 0 |     r[5] = c3;
 753 | 0 |     c3 = 0;
 754 | 0 |     sqr_add_c(a, 3, c1, c2, c3);
 755 | 0 |     sqr_add_c2(a, 4, 2, c1, c2, c3);
 756 | 0 |     sqr_add_c2(a, 5, 1, c1, c2, c3);
 757 | 0 |     sqr_add_c2(a, 6, 0, c1, c2, c3);
 758 | 0 |     r[6] = c1;
 759 | 0 |     c1 = 0;
 760 | 0 |     sqr_add_c2(a, 7, 0, c2, c3, c1);
 761 | 0 |     sqr_add_c2(a, 6, 1, c2, c3, c1);
 762 | 0 |     sqr_add_c2(a, 5, 2, c2, c3, c1);
 763 | 0 |     sqr_add_c2(a, 4, 3, c2, c3, c1);
 764 | 0 |     r[7] = c2;
 765 | 0 |     c2 = 0;
 766 | 0 |     sqr_add_c(a, 4, c3, c1, c2);
 767 | 0 |     sqr_add_c2(a, 5, 3, c3, c1, c2);
 768 | 0 |     sqr_add_c2(a, 6, 2, c3, c1, c2);
 769 | 0 |     sqr_add_c2(a, 7, 1, c3, c1, c2);
 770 | 0 |     r[8] = c3;
 771 | 0 |     c3 = 0;
 772 | 0 |     sqr_add_c2(a, 7, 2, c1, c2, c3);
 773 | 0 |     sqr_add_c2(a, 6, 3, c1, c2, c3);
 774 | 0 |     sqr_add_c2(a, 5, 4, c1, c2, c3);
 775 | 0 |     r[9] = c1;
 776 | 0 |     c1 = 0;
 777 | 0 |     sqr_add_c(a, 5, c2, c3, c1);
 778 | 0 |     sqr_add_c2(a, 6, 4, c2, c3, c1);
 779 | 0 |     sqr_add_c2(a, 7, 3, c2, c3, c1);
 780 | 0 |     r[10] = c2;
 781 | 0 |     c2 = 0;
 782 | 0 |     sqr_add_c2(a, 7, 4, c3, c1, c2);
 783 | 0 |     sqr_add_c2(a, 6, 5, c3, c1, c2);
 784 | 0 |     r[11] = c3;
 785 | 0 |     c3 = 0;
 786 | 0 |     sqr_add_c(a, 6, c1, c2, c3);
 787 | 0 |     sqr_add_c2(a, 7, 5, c1, c2, c3);
 788 | 0 |     r[12] = c1;
 789 | 0 |     c1 = 0;
 790 | 0 |     sqr_add_c2(a, 7, 6, c2, c3, c1);
 791 | 0 |     r[13] = c2;
 792 | 0 |     c2 = 0;
 793 | 0 |     sqr_add_c(a, 7, c3, c1, c2);
 794 | 0 |     r[14] = c3;
 795 | 0 |     r[15] = c1;
 796 | 0 | }
 797 |   |
 798 |   | void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 799 | 0 | {
 800 | 0 |     BN_ULONG c1, c2, c3;
 801 |   |
 802 | 0 |     c1 = 0;
 803 | 0 |     c2 = 0;
 804 | 0 |     c3 = 0;
 805 | 0 |     sqr_add_c(a, 0, c1, c2, c3);
 806 | 0 |     r[0] = c1;
 807 | 0 |     c1 = 0;
 808 | 0 |     sqr_add_c2(a, 1, 0, c2, c3, c1);
 809 | 0 |     r[1] = c2;
 810 | 0 |     c2 = 0;
 811 | 0 |     sqr_add_c(a, 1, c3, c1, c2);
 812 | 0 |     sqr_add_c2(a, 2, 0, c3, c1, c2);
 813 | 0 |     r[2] = c3;
 814 | 0 |     c3 = 0;
 815 | 0 |     sqr_add_c2(a, 3, 0, c1, c2, c3);
 816 | 0 |     sqr_add_c2(a, 2, 1, c1, c2, c3);
 817 | 0 |     r[3] = c1;
 818 | 0 |     c1 = 0;
 819 | 0 |     sqr_add_c(a, 2, c2, c3, c1);
 820 | 0 |     sqr_add_c2(a, 3, 1, c2, c3, c1);
 821 | 0 |     r[4] = c2;
 822 | 0 |     c2 = 0;
 823 | 0 |     sqr_add_c2(a, 3, 2, c3, c1, c2);
 824 | 0 |     r[5] = c3;
 825 | 0 |     c3 = 0;
 826 | 0 |     sqr_add_c(a, 3, c1, c2, c3);
 827 | 0 |     r[6] = c1;
 828 | 0 |     r[7] = c2;
 829 | 0 | }
 830 |   |
 831 |   | # ifdef OPENSSL_NO_ASM
 832 |   | #  ifdef OPENSSL_BN_ASM_MONT
 833 |   | #   include <alloca.h>
 834 |   | /*
 835 |   |  * This is essentially reference implementation, which may or may not
 836 |   |  * result in performance improvement. E.g. on IA-32 this routine was
 837 |   |  * observed to give 40% faster rsa1024 private key operations and 10%
 838 |   |  * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
 839 |   |  * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
 840 |   |  * reference implementation, one to be used as starting point for
 841 |   |  * platform-specific assembler. Mentioned numbers apply to compiler
 842 |   |  * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
 843 |   |  * can vary not only from platform to platform, but even for compiler
 844 |   |  * versions. Assembler vs. assembler improvement coefficients can
 845 |   |  * [and are known to] differ and are to be documented elsewhere.
 846 |   |  */
 847 |   | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 848 |   |                 const BN_ULONG *np, const BN_ULONG *n0p, int num)
 849 |   | {
 850 |   |     BN_ULONG c0, c1, ml, *tp, n0;
 851 |   | #   ifdef mul64
 852 |   |     BN_ULONG mh;
 853 |   | #   endif
 854 |   |     volatile BN_ULONG *vp;
 855 |   |     int i = 0, j;
 856 |   |
 857 |   | #   if 0                        /* template for platform-specific
 858 |   |                                  * implementation */
 859 |   |     if (ap == bp)
 860 |   |         return bn_sqr_mont(rp, ap, np, n0p, num);
 861 |   | #   endif
 862 |   |     vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
 863 |   |
 864 |   |     n0 = *n0p;
 865 |   |
 866 |   |     c0 = 0;
 867 |   |     ml = bp[0];
 868 |   | #   ifdef mul64
 869 |   |     mh = HBITS(ml);
 870 |   |     ml = LBITS(ml);
 871 |   |     for (j = 0; j < num; ++j)
 872 |   |         mul(tp[j], ap[j], ml, mh, c0);
 873 |   | #   else
 874 |   |     for (j = 0; j < num; ++j)
 875 |   |         mul(tp[j], ap[j], ml, c0);
 876 |   | #   endif
 877 |   |
 878 |   |     tp[num] = c0;
 879 |   |     tp[num + 1] = 0;
 880 |   |     goto enter;
 881 |   |
 882 |   |     for (i = 0; i < num; i++) {
 883 |   |         c0 = 0;
 884 |   |         ml = bp[i];
 885 |   | #   ifdef mul64
 886 |   |         mh = HBITS(ml);
 887 |   |         ml = LBITS(ml);
 888 |   |         for (j = 0; j < num; ++j)
 889 |   |             mul_add(tp[j], ap[j], ml, mh, c0);
 890 |   | #   else
 891 |   |         for (j = 0; j < num; ++j)
 892 |   |             mul_add(tp[j], ap[j], ml, c0);
 893 |   | #   endif
 894 |   |         c1 = (tp[num] + c0) & BN_MASK2;
 895 |   |         tp[num] = c1;
 896 |   |         tp[num + 1] = (c1 < c0 ? 1 : 0);
 897 |   |  enter:
 898 |   |         c1 = tp[0];
 899 |   |         ml = (c1 * n0) & BN_MASK2;
 900 |   |         c0 = 0;
 901 |   | #   ifdef mul64
 902 |   |         mh = HBITS(ml);
 903 |   |         ml = LBITS(ml);
 904 |   |         mul_add(c1, np[0], ml, mh, c0);
 905 |   | #   else
 906 |   |         mul_add(c1, ml, np[0], c0);
 907 |   | #   endif
 908 |   |         for (j = 1; j < num; j++) {
 909 |   |             c1 = tp[j];
 910 |   | #   ifdef mul64
 911 |   |             mul_add(c1, np[j], ml, mh, c0);
 912 |   | #   else
 913 |   |             mul_add(c1, ml, np[j], c0);
 914 |   | #   endif
 915 |   |             tp[j - 1] = c1 & BN_MASK2;
 916 |   |         }
 917 |   |         c1 = (tp[num] + c0) & BN_MASK2;
 918 |   |         tp[num - 1] = c1;
 919 |   |         tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
 920 |   |     }
 921 |   |
 922 |   |     if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
 923 |   |         c0 = bn_sub_words(rp, tp, np, num);
 924 |   |         if (tp[num] != 0 || c0 == 0) {
 925 |   |             for (i = 0; i < num + 2; i++)
 926 |   |                 vp[i] = 0;
 927 |   |             return 1;
 928 |   |         }
 929 |   |     }
 930 |   |     for (i = 0; i < num; i++)
 931 |   |         rp[i] = tp[i], vp[i] = 0;
 932 |   |     vp[num] = 0;
 933 |   |     vp[num + 1] = 0;
 934 |   |     return 1;
 935 |   | }
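For orientation, the quantity this reference routine is meant to produce is the word-level Montgomery product. Under the usual convention (an assumption here; the precise contract is fixed by the callers in OpenSSL's bn_mont.c, which this report does not show):

r \equiv a \cdot b \cdot R^{-1} \pmod{n}, \qquad
R = 2^{\mathrm{BN\_BITS2} \cdot num}, \qquad
n_0 \equiv -n^{-1} \pmod{2^{\mathrm{BN\_BITS2}}}

where n0 is the single precomputed word passed in via n0p and the trailing conditional bn_sub_words() performs the final reduction below n.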
 936 |   | #  else
 937 |   | /*
 938 |   |  * Return value of 0 indicates that multiplication/convolution was not
 939 |   |  * performed to signal the caller to fall down to alternative/original
 940 |   |  * code-path.
 941 |   |  */
 942 |   | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 943 |   |                 const BN_ULONG *np, const BN_ULONG *n0, int num)
 944 | 0 | {
 945 | 0 |     return 0;
 946 | 0 | }
 947 |   | #  endif                        /* OPENSSL_BN_ASM_MONT */
 948 |   | # endif
 949 |   |
 950 |   | #else                           /* !BN_MUL_COMBA */
 951 |   |
 952 |   | /* hmm... is it faster just to do a multiply? */
 953 |   | # undef bn_sqr_comba4
 954 |   | # undef bn_sqr_comba8
 955 |   | void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 956 |   | {
 957 |   |     BN_ULONG t[8];
 958 |   |     bn_sqr_normal(r, a, 4, t);
 959 |   | }
 960 |   |
 961 |   | void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
 962 |   | {
 963 |   |     BN_ULONG t[16];
 964 |   |     bn_sqr_normal(r, a, 8, t);
 965 |   | }
 966 |   |
 967 |   | void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 968 |   | {
 969 |   |     r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
 970 |   |     r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
 971 |   |     r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
 972 |   |     r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
 973 |   | }
 974 |   |
 975 |   | void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 976 |   | {
 977 |   |     r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
 978 |   |     r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
 979 |   |     r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
 980 |   |     r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
 981 |   |     r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
 982 |   |     r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
 983 |   |     r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
 984 |   |     r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
 985 |   | }
 986 |   |
 987 |   | # ifdef OPENSSL_NO_ASM
 988 |   | #  ifdef OPENSSL_BN_ASM_MONT
 989 |   | #   include <alloca.h>
 990 |   | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 991 |   |                 const BN_ULONG *np, const BN_ULONG *n0p, int num)
 992 |   | {
 993 |   |     BN_ULONG c0, c1, *tp, n0 = *n0p;
 994 |   |     volatile BN_ULONG *vp;
 995 |   |     int i = 0, j;
 996 |   |
 997 |   |     vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
 998 |   |
 999 |   |     for (i = 0; i <= num; i++)
1000 |   |         tp[i] = 0;
1001 |   |
1002 |   |     for (i = 0; i < num; i++) {
1003 |   |         c0 = bn_mul_add_words(tp, ap, num, bp[i]);
1004 |   |         c1 = (tp[num] + c0) & BN_MASK2;
1005 |   |         tp[num] = c1;
1006 |   |         tp[num + 1] = (c1 < c0 ? 1 : 0);
1007 |   |
1008 |   |         c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
1009 |   |         c1 = (tp[num] + c0) & BN_MASK2;
1010 |   |         tp[num] = c1;
1011 |   |         tp[num + 1] += (c1 < c0 ? 1 : 0);
1012 |   |         for (j = 0; j <= num; j++)
1013 |   |             tp[j] = tp[j + 1];
1014 |   |     }
1015 |   |
1016 |   |     if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
1017 |   |         c0 = bn_sub_words(rp, tp, np, num);
1018 |   |         if (tp[num] != 0 || c0 == 0) {
1019 |   |             for (i = 0; i < num + 2; i++)
1020 |   |                 vp[i] = 0;
1021 |   |             return 1;
1022 |   |         }
1023 |   |     }
1024 |   |     for (i = 0; i < num; i++)
1025 |   |         rp[i] = tp[i], vp[i] = 0;
1026 |   |     vp[num] = 0;
1027 |   |     vp[num + 1] = 0;
1028 |   |     return 1;
1029 |   | }
1030 |   | #  else
1031 |   | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1032 |   |                 const BN_ULONG *np, const BN_ULONG *n0, int num)
1033 |   | {
1034 |   |     return 0;
1035 |   | }
1036 |   | #  endif                        /* OPENSSL_BN_ASM_MONT */
1037 |   | # endif
1038 |   |
1039 |   | #endif                          /* !BN_MUL_COMBA */