/src/openssl/crypto/bn/asm/x86_64-gcc.c

Line	Count	Source (jump to first uncovered line)
1		#include "../bn_lcl.h"
2		#if !(defined(__GNUC__) && __GNUC__>=2)
3		# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
4		#else
5		/*-
6		* x86_64 BIGNUM accelerator version 0.1, December 2002.
7		*
8		* Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9		* project.
10		*
11		* Rights for redistribution and usage in source and binary forms are
12		* granted according to the OpenSSL license. Warranty of any kind is
13		* disclaimed.
14		*
15		* Q. Version 0.1? It doesn't sound like Andy, he used to assign real
16		* versions, like 1.0...
17		* A. Well, that's because this code is basically a quick-n-dirty
18		* proof-of-concept hack. As you can see it's implemented with
19		* inline assembler, which means that you're bound to GCC and that
20		* there might be enough room for further improvement.
21		*
22		* Q. Why inline assembler?
23		* A. x86_64 features its own ABI, which I'm not familiar with. This is
24		* why I decided to let the compiler take care of subroutine
25		* prologue/epilogue as well as register allocation. For reference,
26		* Win64 implements a different ABI for AMD64 than Linux does.
27		*
28		* Q. How much faster does it get?
29		* A. 'apps/openssl speed rsa dsa' output with no-asm:
30		*
31		* sign verify sign/s verify/s
32		* rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
33		* rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
34		* rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
35		* rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
36		* sign verify sign/s verify/s
37		* dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
38		* dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
39		* dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
40		*
41		* 'apps/openssl speed rsa dsa' output with this module:
42		*
43		* sign verify sign/s verify/s
44		* rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
45		* rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
46		* rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
47		* rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
48		* sign verify sign/s verify/s
49		* dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
50		* dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
51		* dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
52		*
53		* For the reference. IA-32 assembler implementation performs
54		* very much like 64-bit code compiled with no-asm on the same
55		* machine.
56		*/
57
58		# if defined(_WIN64) || !defined(__LP64__)
		/*
		 * The asm below uses 64-bit (q-suffixed) instructions, so BN_ULONG has
		 * to be a 64-bit type. On Win64 and any other non-LP64 build, long is
		 * presumably narrower than 64 bits, hence long long.
		 */
59		# define BN_ULONG unsigned long long
60		# else
61	0	# define BN_ULONG unsigned long
62		# endif
63
64		# undef mul
65		# undef mul_add
66
67		/*-
68		* "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
69		* "g"(0) lets the compiler decide where it
70		* wants to keep the value of zero;
		*
		* mul_add(r,a,word,carry): r += low word of (a*word) + carry, and
		* carry is replaced by the resulting high word, i.e.
		* (carry,r) = r + a*word + carry. r must be an lvalue in memory.
71		*/
72	0	# define mul_add(r,a,word,carry) do { \
73	0	register BN_ULONG high,low; \
74	0	asm ("mulq %3" /* high:low = word * a */ \
75	0	: "=a"(low),"=d"(high) \
76	0	: "a"(word),"m"(a) \
77	0	: "cc"); \
78	0	asm ("addq %2,%0; adcq %3,%1" /* carry += low; high += CF */ \
79	0	: "+r"(carry),"+d"(high)\
80	0	: "a"(low),"g"(0) \
81	0	: "cc"); \
82	0	asm ("addq %2,%0; adcq %3,%1" /* r += carry; high += CF */ \
83	0	: "+m"(r),"+d"(high) \
84	0	: "r"(carry),"g"(0) \
85	0	: "cc"); \
86	0	carry=high; /* high word becomes the carry for the next limb */ \
87	0	} while (0)
88
89	0	# define mul(r,a,word,carry) do { /* (carry,r) = a*word + carry */ \
90	0	register BN_ULONG high,low; \
91	0	asm ("mulq %3" /* high:low = word * a */ \
92	0	: "=a"(low),"=d"(high) \
93	0	: "a"(word),"g"(a) \
94	0	: "cc"); \
95	0	asm ("addq %2,%0; adcq %3,%1" /* carry += low; high += CF */ \
96	0	: "+r"(carry),"+d"(high)\
97	0	: "a"(low),"g"(0) \
98	0	: "cc"); \
99	0	(r)=carry, carry=high; /* store the low word, carry out the high word */ \
100	0	} while (0)
101		# undef sqr
102		# define sqr(r0,r1,a) /* (r1,r0) = a*a: r0 gets the low word, r1 the high word */ \
103	0	asm ("mulq %2" /* %2 is the "a" (rax) input itself, so this is rax*rax */ \
104	0	: "=a"(r0),"=d"(r1) \
105	0	: "a"(a) \
106	0	: "cc");
107
108		BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
109		BN_ULONG w)
110	0	{
		/*
		 * rp[i] += ap[i] * w for i in [0, num), with the carry word chained
		 * from one limb to the next through c1.
		 *
		 * rp  - accumulator array, updated in place (num words)
		 * ap  - multiplicand array (num words)
		 * num - number of words; num <= 0 does nothing
		 * w   - single-word multiplier
		 *
		 * Returns the carry word left after the last limb (0 when num <= 0).
		 */
111	0	BN_ULONG c1 = 0;
112
113	0	if (num <= 0)
114	0	return (c1);
115
		/* num > 0 here, so (num & ~3) is non-zero exactly while num >= 4 */
116	0	while (num & ~3) {
117	0	mul_add(rp[0], ap[0], w, c1);
118	0	mul_add(rp[1], ap[1], w, c1);
119	0	mul_add(rp[2], ap[2], w, c1);
120	0	mul_add(rp[3], ap[3], w, c1);
121	0	ap += 4;
122	0	rp += 4;
123	0	num -= 4;
124	0	}
		/* tail: at most three words remain */
125	0	if (num) {
126	0	mul_add(rp[0], ap[0], w, c1);
127	0	if (--num == 0)
128	0	return c1;
129	0	mul_add(rp[1], ap[1], w, c1);
130	0	if (--num == 0)
131	0	return c1;
132	0	mul_add(rp[2], ap[2], w, c1);
133	0	return c1;
134	0	}
135
136	0	return (c1);
137	0	}
138
139		BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
140	0	{
		/*
		 * rp[i] = low word of (ap[i] * w + carry) for i in [0, num); the high
		 * word of each step becomes the carry into the next limb.
		 *
		 * rp  - output array (num words), overwritten
		 * ap  - multiplicand array (num words)
		 * num - number of words; num <= 0 does nothing
		 * w   - single-word multiplier
		 *
		 * Returns the carry word left after the last limb (0 when num <= 0).
		 */
141	0	BN_ULONG c1 = 0;
142
143	0	if (num <= 0)
144	0	return (c1);
145
		/* num > 0 here, so (num & ~3) is non-zero exactly while num >= 4 */
146	0	while (num & ~3) {
147	0	mul(rp[0], ap[0], w, c1);
148	0	mul(rp[1], ap[1], w, c1);
149	0	mul(rp[2], ap[2], w, c1);
150	0	mul(rp[3], ap[3], w, c1);
151	0	ap += 4;
152	0	rp += 4;
153	0	num -= 4;
154	0	}
		/* tail: at most three words remain */
155	0	if (num) {
156	0	mul(rp[0], ap[0], w, c1);
157	0	if (--num == 0)
158	0	return c1;
159	0	mul(rp[1], ap[1], w, c1);
160	0	if (--num == 0)
161	0	return c1;
162	0	mul(rp[2], ap[2], w, c1);
163	0	}
164	0	return (c1);
165	0	}
166
167		void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
168	0	{
		/*
		 * Squares each input word on its own: for i in [0, n),
		 * r[2*i] receives the low word and r[2*i+1] the high word of a[i]^2.
		 * No carries are propagated between words.
		 *
		 * r - output array, 2*n words are written
		 * a - input array (n words)
		 * n - number of input words; n <= 0 does nothing
		 */
169	0	if (n <= 0)
170	0	return;
171
		/* n > 0 here, so (n & ~3) is non-zero exactly while n >= 4 */
172	0	while (n & ~3) {
173	0	sqr(r[0], r[1], a[0]);
174	0	sqr(r[2], r[3], a[1]);
175	0	sqr(r[4], r[5], a[2]);
176	0	sqr(r[6], r[7], a[3]);
177	0	a += 4;
178	0	r += 8;
179	0	n -= 4;
180	0	}
		/* tail: at most three words remain */
181	0	if (n) {
182	0	sqr(r[0], r[1], a[0]);
183	0	if (--n == 0)
184	0	return;
185	0	sqr(r[2], r[3], a[1]);
186	0	if (--n == 0)
187	0	return;
188	0	sqr(r[4], r[5], a[2]);
189	0	}
190	0	}
191
192		BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
193	0	{
		/*
		 * Divides the two-word value (h,l) by d with a single divq and
		 * returns the quotient; the remainder lands in 'waste' and is dropped.
		 *
		 * NOTE(review): divq traps when d is 0 or the quotient does not fit in
		 * one word; nothing here checks for that, so callers presumably
		 * guarantee h < d -- confirm.
		 */
194	0	BN_ULONG ret, waste;
195
		/* inputs: rax = l, rdx = h, %4 = d; outputs: rax = quotient, rdx = remainder */
196	0	asm("divq %4":"=a"(ret), "=d"(waste)
197	0	: "a"(l), "d"(h), "r"(d)
198	0	: "cc");
199
200	0	return ret;
201	0	}
202
203		BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
204		int n)
205	0	{
		/*
		 * rp[i] = ap[i] + bp[i] + carry for i in [0, n), with the carry kept
		 * in the CPU carry flag across the whole loop.
		 *
		 * rp - output array (n words)
		 * ap - first addend (n words)
		 * bp - second addend (n words)
		 * n  - number of words; n <= 0 does nothing
		 *
		 * Returns the final carry, 0 or 1 (0 when n <= 0).
		 */
206	0	BN_ULONG ret;
207	0	size_t i = 0;
208
209	0	if (n <= 0)
210	0	return 0;
211
		/*
		 * Operands: %0 = ret (scratch), %1 = n in the "c" register (counter
		 * for the loop instruction), %2 = i (word index), %3 = rp, %4 = ap,
		 * %5 = bp. The index is bumped with lea and the counter with loop so
		 * that the carry flag survives from one adcq to the next. The final
		 * sbbq turns the carry flag into 0 or all-ones in ret.
		 */
212	0	asm volatile (" subq %0,%0 \n" /* clear carry */
213	0	" jmp 1f \n"
214	0	".p2align 4 \n"
215	0	"1: movq (%4,%2,8),%0 \n"
216	0	" adcq (%5,%2,8),%0 \n"
217	0	" movq %0,(%3,%2,8) \n"
218	0	" lea 1(%2),%2 \n"
219	0	" loop 1b \n"
220	0	" sbbq %0,%0 \n":"=&r" (ret), "+c"(n),
221	0	"+r"(i)
222	0	:"r"(rp), "r"(ap), "r"(bp)
223	0	:"cc", "memory");
224
		/* ret is 0 or all-ones; reduce it to a 0/1 carry */
225	0	return ret & 1;
226	0	}
227
228		# ifndef SIMICS
229		BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
230		int n)
231	0	{
		/*
		 * rp[i] = ap[i] - bp[i] - borrow for i in [0, n), with the borrow kept
		 * in the CPU carry flag across the whole loop.
		 *
		 * rp - output array (n words)
		 * ap - minuend (n words)
		 * bp - subtrahend (n words)
		 * n  - number of words; n <= 0 does nothing
		 *
		 * Returns the final borrow, 0 or 1 (0 when n <= 0).
		 */
232	0	BN_ULONG ret;
233	0	size_t i = 0;
234
235	0	if (n <= 0)
236	0	return 0;
237
		/*
		 * Operands: %0 = ret (scratch), %1 = n in the "c" register (counter
		 * for the loop instruction), %2 = i (word index), %3 = rp, %4 = ap,
		 * %5 = bp. The index is bumped with lea and the counter with loop so
		 * that the borrow flag survives from one sbbq to the next. The final
		 * sbbq turns the flag into 0 or all-ones in ret.
		 */
238	0	asm volatile (" subq %0,%0 \n" /* clear borrow */
239	0	" jmp 1f \n"
240	0	".p2align 4 \n"
241	0	"1: movq (%4,%2,8),%0 \n"
242	0	" sbbq (%5,%2,8),%0 \n"
243	0	" movq %0,(%3,%2,8) \n"
244	0	" lea 1(%2),%2 \n"
245	0	" loop 1b \n"
246	0	" sbbq %0,%0 \n":"=&r" (ret), "+c"(n),
247	0	"+r"(i)
248	0	:"r"(rp), "r"(ap), "r"(bp)
249	0	:"cc", "memory");
250
		/* ret is 0 or all-ones; reduce it to a 0/1 borrow */
251	0	return ret & 1;
252	0	}
253		# else
254		/* Simics 1.4<7 has buggy sbbq:-( */
255		# define BN_MASK2 0xffffffffffffffffL
256		BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
257		{
		/*
		 * Plain C replacement for the asm bn_sub_words, used only when SIMICS
		 * is defined: r[i] = a[i] - b[i] - borrow for i in [0, n).
		 *
		 * The borrow c is recomputed as (a[i] < b[i]) whenever the two words
		 * differ; when they are equal the incoming borrow is simply passed on.
		 * The loop body is unrolled four times and exits as soon as n hits 0.
		 *
		 * Returns the final borrow, 0 or 1 (0 when n <= 0).
		 */
258		BN_ULONG t1, t2;
259		int c = 0;
260
261		if (n <= 0)
262		return ((BN_ULONG)0);
263
264		for (;;) {
265		t1 = a[0];
266		t2 = b[0];
267		r[0] = (t1 - t2 - c) & BN_MASK2;
268		if (t1 != t2)
269		c = (t1 < t2);
270		if (--n <= 0)
271		break;
272
273		t1 = a[1];
274		t2 = b[1];
275		r[1] = (t1 - t2 - c) & BN_MASK2;
276		if (t1 != t2)
277		c = (t1 < t2);
278		if (--n <= 0)
279		break;
280
281		t1 = a[2];
282		t2 = b[2];
283		r[2] = (t1 - t2 - c) & BN_MASK2;
284		if (t1 != t2)
285		c = (t1 < t2);
286		if (--n <= 0)
287		break;
288
289		t1 = a[3];
290		t2 = b[3];
291		r[3] = (t1 - t2 - c) & BN_MASK2;
292		if (t1 != t2)
293		c = (t1 < t2);
294		if (--n <= 0)
295		break;
296
297		a += 4;
298		b += 4;
299		r += 4;
300		}
301		return (c);
302		}
303		# endif
304
305		/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
306		/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
307		/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
308		/*
309		* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
310		* c=(c2,c1,c0)
311		*/
312
313		/*
314		* Keep in mind that carrying into high part of multiplication result
315		* can not overflow, because it cannot be all-ones.
316		*/
317		# if 0
318		/* original macros are kept for reference purposes */
319		# define mul_add_c(a,b,c0,c1,c2) do { /* reference C form: (c2,c1,c0) += a*b */ \
320		BN_ULONG ta = (a), tb = (b); \
321		BN_ULONG lo, hi; \
322		BN_UMULT_LOHI(lo,hi,ta,tb); /* defined elsewhere; presumably (hi,lo) = ta*tb */ \
323		c0 += lo; hi += (c0<lo)?1:0; /* c0<lo after the add means the add wrapped */ \
324		c1 += hi; c2 += (c1<hi)?1:0; \
325		} while(0)
326
327		# define mul_add_c2(a,b,c0,c1,c2) do { /* reference C form: (c2,c1,c0) += 2*a*b */ \
328		BN_ULONG ta = (a), tb = (b); \
329		BN_ULONG lo, hi, tt; \
330		BN_UMULT_LOHI(lo,hi,ta,tb); /* defined elsewhere; presumably (hi,lo) = ta*tb */ \
331		c0 += lo; tt = hi+((c0<lo)?1:0); /* first addition uses a copy so hi stays intact */ \
332		c1 += tt; c2 += (c1<tt)?1:0; \
333		c0 += lo; hi += (c0<lo)?1:0; /* second addition of the same product */ \
334		c1 += hi; c2 += (c1<hi)?1:0; \
335		} while(0)
336
337		# define sqr_add_c(a,i,c0,c1,c2) do { /* reference C form: (c2,c1,c0) += a[i]*a[i] */ \
338		BN_ULONG ta = (a)[i]; \
339		BN_ULONG lo, hi; \
340		BN_UMULT_LOHI(lo,hi,ta,ta); /* defined elsewhere; presumably (hi,lo) = ta*ta */ \
341		c0 += lo; hi += (c0<lo)?1:0; /* c0<lo after the add means the add wrapped */ \
342		c1 += hi; c2 += (c1<hi)?1:0; \
343		} while(0)
344		# else
345	0	# define mul_add_c(a,b,c0,c1,c2) do { /* (c2,c1,c0) += a*b; b must be a memory operand */ \
346	0	BN_ULONG t1,t2; \
347	0	asm ("mulq %3" /* t2:t1 = a * b */ \
348	0	: "=a"(t1),"=d"(t2) \
349	0	: "a"(a),"m"(b) \
350	0	: "cc"); \
351	0	asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" /* c0 += t1; c1 += t2 + CF; c2 += CF */ \
352	0	: "+r"(c0),"+r"(c1),"+r"(c2) \
353	0	: "r"(t1),"r"(t2),"g"(0) \
354	0	: "cc"); \
355	0	} while (0)
356
357	0	# define sqr_add_c(a,i,c0,c1,c2) do { /* (c2,c1,c0) += a[i]*a[i] */ \
358	0	BN_ULONG t1,t2; \
359	0	asm ("mulq %2" /* %2 is the rax input itself: t2:t1 = a[i] * a[i] */ \
360	0	: "=a"(t1),"=d"(t2) \
361	0	: "a"(a[i]) \
362	0	: "cc"); \
363	0	asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" /* c0 += t1; c1 += t2 + CF; c2 += CF */ \
364	0	: "+r"(c0),"+r"(c1),"+r"(c2) \
365	0	: "r"(t1),"r"(t2),"g"(0) \
366	0	: "cc"); \
367	0	} while (0)
368
369	0	# define mul_add_c2(a,b,c0,c1,c2) do { /* (c2,c1,c0) += 2*a*b; b must be a memory operand */ \
370	0	BN_ULONG t1,t2; \
371	0	asm ("mulq %3" /* t2:t1 = a * b, computed once */ \
372	0	: "=a"(t1),"=d"(t2) \
373	0	: "a"(a),"m"(b) \
374	0	: "cc"); \
375	0	asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" /* add the product the first time */ \
376	0	: "+r"(c0),"+r"(c1),"+r"(c2) \
377	0	: "r"(t1),"r"(t2),"g"(0) \
378	0	: "cc"); \
379	0	asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" /* add the same product a second time */ \
380	0	: "+r"(c0),"+r"(c1),"+r"(c2) \
381	0	: "r"(t1),"r"(t2),"g"(0) \
382	0	: "cc"); \
383	0	} while (0)
384		# endif
385
386		# define sqr_add_c2(a,i,j,c0,c1,c2) /* (c2,c1,c0) += 2*a[i]*a[j], via mul_add_c2 */ \
387	0	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
388
389		void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
390	0	{
		/*
		 * Column-wise (Comba) product of two 8-word numbers:
		 * r[0..15] = a[0..7] * b[0..7].
		 *
		 * For each output word k, every product a[i]*b[j] with i + j == k is
		 * added into a three-word accumulator. c1, c2 and c3 take turns being
		 * the low, middle and high word: once a column is complete its low
		 * word is stored to r[k], zeroed, and reused as the high word of the
		 * next column.
		 *
		 * r - output, 16 words are written
		 * a - first operand, 8 words
		 * b - second operand, 8 words
		 */
391	0	BN_ULONG c1, c2, c3;
392
393	0	c1 = 0;
394	0	c2 = 0;
395	0	c3 = 0;
396	0	mul_add_c(a[0], b[0], c1, c2, c3);
397	0	r[0] = c1;
398	0	c1 = 0;
399	0	mul_add_c(a[0], b[1], c2, c3, c1);
400	0	mul_add_c(a[1], b[0], c2, c3, c1);
401	0	r[1] = c2;
402	0	c2 = 0;
403	0	mul_add_c(a[2], b[0], c3, c1, c2);
404	0	mul_add_c(a[1], b[1], c3, c1, c2);
405	0	mul_add_c(a[0], b[2], c3, c1, c2);
406	0	r[2] = c3;
407	0	c3 = 0;
408	0	mul_add_c(a[0], b[3], c1, c2, c3);
409	0	mul_add_c(a[1], b[2], c1, c2, c3);
410	0	mul_add_c(a[2], b[1], c1, c2, c3);
411	0	mul_add_c(a[3], b[0], c1, c2, c3);
412	0	r[3] = c1;
413	0	c1 = 0;
414	0	mul_add_c(a[4], b[0], c2, c3, c1);
415	0	mul_add_c(a[3], b[1], c2, c3, c1);
416	0	mul_add_c(a[2], b[2], c2, c3, c1);
417	0	mul_add_c(a[1], b[3], c2, c3, c1);
418	0	mul_add_c(a[0], b[4], c2, c3, c1);
419	0	r[4] = c2;
420	0	c2 = 0;
421	0	mul_add_c(a[0], b[5], c3, c1, c2);
422	0	mul_add_c(a[1], b[4], c3, c1, c2);
423	0	mul_add_c(a[2], b[3], c3, c1, c2);
424	0	mul_add_c(a[3], b[2], c3, c1, c2);
425	0	mul_add_c(a[4], b[1], c3, c1, c2);
426	0	mul_add_c(a[5], b[0], c3, c1, c2);
427	0	r[5] = c3;
428	0	c3 = 0;
429	0	mul_add_c(a[6], b[0], c1, c2, c3);
430	0	mul_add_c(a[5], b[1], c1, c2, c3);
431	0	mul_add_c(a[4], b[2], c1, c2, c3);
432	0	mul_add_c(a[3], b[3], c1, c2, c3);
433	0	mul_add_c(a[2], b[4], c1, c2, c3);
434	0	mul_add_c(a[1], b[5], c1, c2, c3);
435	0	mul_add_c(a[0], b[6], c1, c2, c3);
436	0	r[6] = c1;
437	0	c1 = 0;
438	0	mul_add_c(a[0], b[7], c2, c3, c1);
439	0	mul_add_c(a[1], b[6], c2, c3, c1);
440	0	mul_add_c(a[2], b[5], c2, c3, c1);
441	0	mul_add_c(a[3], b[4], c2, c3, c1);
442	0	mul_add_c(a[4], b[3], c2, c3, c1);
443	0	mul_add_c(a[5], b[2], c2, c3, c1);
444	0	mul_add_c(a[6], b[1], c2, c3, c1);
445	0	mul_add_c(a[7], b[0], c2, c3, c1);
446	0	r[7] = c2;
447	0	c2 = 0;
448	0	mul_add_c(a[7], b[1], c3, c1, c2);
449	0	mul_add_c(a[6], b[2], c3, c1, c2);
450	0	mul_add_c(a[5], b[3], c3, c1, c2);
451	0	mul_add_c(a[4], b[4], c3, c1, c2);
452	0	mul_add_c(a[3], b[5], c3, c1, c2);
453	0	mul_add_c(a[2], b[6], c3, c1, c2);
454	0	mul_add_c(a[1], b[7], c3, c1, c2);
455	0	r[8] = c3;
456	0	c3 = 0;
457	0	mul_add_c(a[2], b[7], c1, c2, c3);
458	0	mul_add_c(a[3], b[6], c1, c2, c3);
459	0	mul_add_c(a[4], b[5], c1, c2, c3);
460	0	mul_add_c(a[5], b[4], c1, c2, c3);
461	0	mul_add_c(a[6], b[3], c1, c2, c3);
462	0	mul_add_c(a[7], b[2], c1, c2, c3);
463	0	r[9] = c1;
464	0	c1 = 0;
465	0	mul_add_c(a[7], b[3], c2, c3, c1);
466	0	mul_add_c(a[6], b[4], c2, c3, c1);
467	0	mul_add_c(a[5], b[5], c2, c3, c1);
468	0	mul_add_c(a[4], b[6], c2, c3, c1);
469	0	mul_add_c(a[3], b[7], c2, c3, c1);
470	0	r[10] = c2;
471	0	c2 = 0;
472	0	mul_add_c(a[4], b[7], c3, c1, c2);
473	0	mul_add_c(a[5], b[6], c3, c1, c2);
474	0	mul_add_c(a[6], b[5], c3, c1, c2);
475	0	mul_add_c(a[7], b[4], c3, c1, c2);
476	0	r[11] = c3;
477	0	c3 = 0;
478	0	mul_add_c(a[7], b[5], c1, c2, c3);
479	0	mul_add_c(a[6], b[6], c1, c2, c3);
480	0	mul_add_c(a[5], b[7], c1, c2, c3);
481	0	r[12] = c1;
482	0	c1 = 0;
483	0	mul_add_c(a[6], b[7], c2, c3, c1);
484	0	mul_add_c(a[7], b[6], c2, c3, c1);
485	0	r[13] = c2;
486	0	c2 = 0;
487	0	mul_add_c(a[7], b[7], c3, c1, c2);
488	0	r[14] = c3;
489	0	r[15] = c1;
490	0	}
491
492		void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
493	0	{
		/*
		 * Column-wise (Comba) product of two 4-word numbers:
		 * r[0..7] = a[0..3] * b[0..3].
		 *
		 * For each output word k, every product a[i]*b[j] with i + j == k is
		 * added into a three-word accumulator. c1, c2 and c3 take turns being
		 * the low, middle and high word: once a column is complete its low
		 * word is stored to r[k], zeroed, and reused as the high word of the
		 * next column.
		 *
		 * r - output, 8 words are written
		 * a - first operand, 4 words
		 * b - second operand, 4 words
		 */
494	0	BN_ULONG c1, c2, c3;
495
496	0	c1 = 0;
497	0	c2 = 0;
498	0	c3 = 0;
499	0	mul_add_c(a[0], b[0], c1, c2, c3);
500	0	r[0] = c1;
501	0	c1 = 0;
502	0	mul_add_c(a[0], b[1], c2, c3, c1);
503	0	mul_add_c(a[1], b[0], c2, c3, c1);
504	0	r[1] = c2;
505	0	c2 = 0;
506	0	mul_add_c(a[2], b[0], c3, c1, c2);
507	0	mul_add_c(a[1], b[1], c3, c1, c2);
508	0	mul_add_c(a[0], b[2], c3, c1, c2);
509	0	r[2] = c3;
510	0	c3 = 0;
511	0	mul_add_c(a[0], b[3], c1, c2, c3);
512	0	mul_add_c(a[1], b[2], c1, c2, c3);
513	0	mul_add_c(a[2], b[1], c1, c2, c3);
514	0	mul_add_c(a[3], b[0], c1, c2, c3);
515	0	r[3] = c1;
516	0	c1 = 0;
517	0	mul_add_c(a[3], b[1], c2, c3, c1);
518	0	mul_add_c(a[2], b[2], c2, c3, c1);
519	0	mul_add_c(a[1], b[3], c2, c3, c1);
520	0	r[4] = c2;
521	0	c2 = 0;
522	0	mul_add_c(a[2], b[3], c3, c1, c2);
523	0	mul_add_c(a[3], b[2], c3, c1, c2);
524	0	r[5] = c3;
525	0	c3 = 0;
526	0	mul_add_c(a[3], b[3], c1, c2, c3);
527	0	r[6] = c1;
528	0	r[7] = c2;
529	0	}
530
531		void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
532	0	{
		/*
		 * Column-wise (Comba) square of an 8-word number:
		 * r[0..15] = a[0..7]^2.
		 *
		 * For each output word k the diagonal term a[k/2]^2 (even k only) is
		 * added once with sqr_add_c, and each off-diagonal pair a[i]*a[j] with
		 * i + j == k, i > j, is added twice with sqr_add_c2. c1, c2 and c3
		 * take turns being the low, middle and high accumulator word: the low
		 * word is stored to r[k], zeroed, and reused as the next high word.
		 *
		 * r - output, 16 words are written
		 * a - operand, 8 words
		 */
533	0	BN_ULONG c1, c2, c3;
534
535	0	c1 = 0;
536	0	c2 = 0;
537	0	c3 = 0;
538	0	sqr_add_c(a, 0, c1, c2, c3);
539	0	r[0] = c1;
540	0	c1 = 0;
541	0	sqr_add_c2(a, 1, 0, c2, c3, c1);
542	0	r[1] = c2;
543	0	c2 = 0;
544	0	sqr_add_c(a, 1, c3, c1, c2);
545	0	sqr_add_c2(a, 2, 0, c3, c1, c2);
546	0	r[2] = c3;
547	0	c3 = 0;
548	0	sqr_add_c2(a, 3, 0, c1, c2, c3);
549	0	sqr_add_c2(a, 2, 1, c1, c2, c3);
550	0	r[3] = c1;
551	0	c1 = 0;
552	0	sqr_add_c(a, 2, c2, c3, c1);
553	0	sqr_add_c2(a, 3, 1, c2, c3, c1);
554	0	sqr_add_c2(a, 4, 0, c2, c3, c1);
555	0	r[4] = c2;
556	0	c2 = 0;
557	0	sqr_add_c2(a, 5, 0, c3, c1, c2);
558	0	sqr_add_c2(a, 4, 1, c3, c1, c2);
559	0	sqr_add_c2(a, 3, 2, c3, c1, c2);
560	0	r[5] = c3;
561	0	c3 = 0;
562	0	sqr_add_c(a, 3, c1, c2, c3);
563	0	sqr_add_c2(a, 4, 2, c1, c2, c3);
564	0	sqr_add_c2(a, 5, 1, c1, c2, c3);
565	0	sqr_add_c2(a, 6, 0, c1, c2, c3);
566	0	r[6] = c1;
567	0	c1 = 0;
568	0	sqr_add_c2(a, 7, 0, c2, c3, c1);
569	0	sqr_add_c2(a, 6, 1, c2, c3, c1);
570	0	sqr_add_c2(a, 5, 2, c2, c3, c1);
571	0	sqr_add_c2(a, 4, 3, c2, c3, c1);
572	0	r[7] = c2;
573	0	c2 = 0;
574	0	sqr_add_c(a, 4, c3, c1, c2);
575	0	sqr_add_c2(a, 5, 3, c3, c1, c2);
576	0	sqr_add_c2(a, 6, 2, c3, c1, c2);
577	0	sqr_add_c2(a, 7, 1, c3, c1, c2);
578	0	r[8] = c3;
579	0	c3 = 0;
580	0	sqr_add_c2(a, 7, 2, c1, c2, c3);
581	0	sqr_add_c2(a, 6, 3, c1, c2, c3);
582	0	sqr_add_c2(a, 5, 4, c1, c2, c3);
583	0	r[9] = c1;
584	0	c1 = 0;
585	0	sqr_add_c(a, 5, c2, c3, c1);
586	0	sqr_add_c2(a, 6, 4, c2, c3, c1);
587	0	sqr_add_c2(a, 7, 3, c2, c3, c1);
588	0	r[10] = c2;
589	0	c2 = 0;
590	0	sqr_add_c2(a, 7, 4, c3, c1, c2);
591	0	sqr_add_c2(a, 6, 5, c3, c1, c2);
592	0	r[11] = c3;
593	0	c3 = 0;
594	0	sqr_add_c(a, 6, c1, c2, c3);
595	0	sqr_add_c2(a, 7, 5, c1, c2, c3);
596	0	r[12] = c1;
597	0	c1 = 0;
598	0	sqr_add_c2(a, 7, 6, c2, c3, c1);
599	0	r[13] = c2;
600	0	c2 = 0;
601	0	sqr_add_c(a, 7, c3, c1, c2);
602	0	r[14] = c3;
603	0	r[15] = c1;
604	0	}
605
606		void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
607	0	{
		/*
		 * Column-wise (Comba) square of a 4-word number:
		 * r[0..7] = a[0..3]^2.
		 *
		 * For each output word k the diagonal term a[k/2]^2 (even k only) is
		 * added once with sqr_add_c, and each off-diagonal pair a[i]*a[j] with
		 * i + j == k, i > j, is added twice with sqr_add_c2. c1, c2 and c3
		 * take turns being the low, middle and high accumulator word: the low
		 * word is stored to r[k], zeroed, and reused as the next high word.
		 *
		 * r - output, 8 words are written
		 * a - operand, 4 words
		 */
608	0	BN_ULONG c1, c2, c3;
609
610	0	c1 = 0;
611	0	c2 = 0;
612	0	c3 = 0;
613	0	sqr_add_c(a, 0, c1, c2, c3);
614	0	r[0] = c1;
615	0	c1 = 0;
616	0	sqr_add_c2(a, 1, 0, c2, c3, c1);
617	0	r[1] = c2;
618	0	c2 = 0;
619	0	sqr_add_c(a, 1, c3, c1, c2);
620	0	sqr_add_c2(a, 2, 0, c3, c1, c2);
621	0	r[2] = c3;
622	0	c3 = 0;
623	0	sqr_add_c2(a, 3, 0, c1, c2, c3);
624	0	sqr_add_c2(a, 2, 1, c1, c2, c3);
625	0	r[3] = c1;
626	0	c1 = 0;
627	0	sqr_add_c(a, 2, c2, c3, c1);
628	0	sqr_add_c2(a, 3, 1, c2, c3, c1);
629	0	r[4] = c2;
630	0	c2 = 0;
631	0	sqr_add_c2(a, 3, 2, c3, c1, c2);
632	0	r[5] = c3;
633	0	c3 = 0;
634	0	sqr_add_c(a, 3, c1, c2, c3);
635	0	r[6] = c1;
636	0	r[7] = c2;
637	0	}
638		#endif

Coverage Report

Created: 2022-11-30 06:20