/src/openssl/crypto/bn/asm/x86_64-gcc.c

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
3		*
4		* Licensed under the OpenSSL license (the "License"). You may not use
5		* this file except in compliance with the License. You can obtain a copy
6		* in the file LICENSE in the source distribution or at
7		* https://www.openssl.org/source/license.html
8		*/
9
10		#include "../bn_lcl.h"
11		#if !(defined(__GNUC__) && __GNUC__>=2)
12		# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
13		#else
14		/*-
15		* x86_64 BIGNUM accelerator version 0.1, December 2002.
16		*
17		* Implemented by Andy Polyakov <appro@openssl.org> for the OpenSSL
18		* project.
19		*
20		* Rights for redistribution and usage in source and binary forms are
21		* granted according to the OpenSSL license. Warranty of any kind is
22		* disclaimed.
23		*
24		* Q. Version 0.1? It doesn't sound like Andy, he used to assign real
25		* versions, like 1.0...
26		* A. Well, that's because this code is basically a quick-n-dirty
27		* proof-of-concept hack. As you can see it's implemented with
28		* inline assembler, which means that you're bound to GCC and that
29		* there might be enough room for further improvement.
30		*
31		* Q. Why inline assembler?
32		* A. x86_64 features its own ABI, which I'm not familiar with. This is
33		* why I decided to let the compiler take care of subroutine
34		* prologue/epilogue as well as register allocation. For reference,
35		* Win64 implements a different ABI for AMD64 than Linux does.
36		*
37		* Q. How much faster does it get?
38		* A. 'apps/openssl speed rsa dsa' output with no-asm:
39		*
40		* sign verify sign/s verify/s
41		* rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
42		* rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
43		* rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
44		* rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
45		* sign verify sign/s verify/s
46		* dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
47		* dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
48		* dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
49		*
50		* 'apps/openssl speed rsa dsa' output with this module:
51		*
52		* sign verify sign/s verify/s
53		* rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
54		* rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
55		* rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
56		* rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
57		* sign verify sign/s verify/s
58		* dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
59		* dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
60		* dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
61		*
62		* For the reference. IA-32 assembler implementation performs
63		* very much like 64-bit code compiled with no-asm on the same
64		* machine.
65		*/
66
67		# if defined(_WIN64) || !defined(__LP64__)
68		# define BN_ULONG unsigned long long
69		# else
70	51.1M	# define BN_ULONG unsigned long
71		# endif
72
73		# undef mul
74		# undef mul_add
75
76		/*-
77		* "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
78		* "g"(0) lets the compiler decide where it wants
79		* to keep the value of zero;
80		*/
/*
 * mul_add(r,a,word,carry): r += a*word + carry; carry receives the high word.
 *
 * r     - lvalue in memory, updated in place (read-modify-write "+m" operand)
 * a     - multiplicand, handed to mulq as a memory ("m") operand
 * word  - multiplier, placed in the "a" register for mulq
 * carry - lvalue; carry-in on entry, carry-out on exit
 *
 * Step 1: mulq leaves the double-width product in high:low ("d":"a").
 * Step 2: carry += low, with the carry flag folded into high (adcq with 0).
 * Step 3: r += carry, with the carry flag folded into high once more.
 * Finally carry = high.
 */
81	111M	# define mul_add(r,a,word,carry) do { \
82	111M	register BN_ULONG high,low; \
83	111M	asm ("mulq %3" \
84	111M	: "=a"(low),"=d"(high) \
85	111M	: "a"(word),"m"(a) \
86	111M	: "cc"); \
87	111M	asm ("addq %2,%0; adcq %3,%1" \
88	111M	: "+r"(carry),"+d"(high)\
89	111M	: "a"(low),"g"(0) \
90	111M	: "cc"); \
91	111M	asm ("addq %2,%0; adcq %3,%1" \
92	111M	: "+m"(r),"+d"(high) \
93	111M	: "r"(carry),"g"(0) \
94	111M	: "cc"); \
95	111M	carry=high; \
96	111M	} while (0)
97
/*
 * mul(r,a,word,carry): r = low word of (a*word + carry); carry = high word.
 *
 * Unlike mul_add, the previous contents of r are not read: r is simply
 * assigned the low word after the carry-in has been added.
 *
 * NOTE(review): the multiplicand uses the "g" constraint, which also admits
 * an immediate operand; mulq has no immediate form, so this presumably relies
 * on callers always passing an lvalue such as ap[i] -- confirm.
 */
98	9.94M	# define mul(r,a,word,carry) do { \
99	9.90M	register BN_ULONG high,low; \
100	9.90M	asm ("mulq %3" \
101	9.90M	: "=a"(low),"=d"(high) \
102	9.90M	: "a"(word),"g"(a) \
103	9.90M	: "cc"); \
104	9.90M	asm ("addq %2,%0; adcq %3,%1" \
105	9.90M	: "+r"(carry),"+d"(high)\
106	9.90M	: "a"(low),"g"(0) \
107	9.90M	: "cc"); \
108	9.90M	(r)=carry, carry=high; \
109	9.90M	} while (0)
110		# undef sqr
/*
 * sqr(r0,r1,a): (r1:r0) = a*a.
 *
 * a is loaded into the "a" register and mulq is applied to that same
 * operand (%2), so the register is multiplied by itself; r0 receives the
 * low word and r1 the high word.
 *
 * The expansion is a bare statement that already ends in ';' (no do/while
 * wrapper), so "sqr(...);" at a call site yields an extra empty statement.
 */
111		# define sqr(r0,r1,a) \
112	0	asm ("mulq %2" \
113	0	: "=a"(r0),"=d"(r1) \
114	0	: "a"(a) \
115	0	: "cc");
116
/*
 * bn_mul_add_words: rp[i] += ap[i] * w for i = 0 .. num-1, chaining the
 * carry from each word into the next.
 *
 * rp  - accumulator words, updated in place
 * ap  - multiplicand words (read only)
 * num - number of words to process
 * w   - single-word multiplier
 *
 * Returns the carry out of the last word. When num <= 0 nothing is touched
 * and 0 is returned.
 *
 * rp and ap are pointers: the body indexes and advances them, so the
 * declarators carry the '*' that the listing had lost.
 */
117		BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
118		BN_ULONG w)
119	241k	{
120	241k	BN_ULONG c1 = 0;
121	241k
122	241k	if (num <= 0)
123	0	return c1;
124	241k
125	27.9M	while (num & ~3) { /* four words per pass while at least 4 remain */
126	27.7M	mul_add(rp[0], ap[0], w, c1);
127	27.7M	mul_add(rp[1], ap[1], w, c1);
128	27.7M	mul_add(rp[2], ap[2], w, c1);
129	27.7M	mul_add(rp[3], ap[3], w, c1);
130	27.7M	ap += 4;
131	27.7M	rp += 4;
132	27.7M	num -= 4;
133	27.7M	}
134	241k	if (num) { /* 1 to 3 leftover words */
135	148k	mul_add(rp[0], ap[0], w, c1);
136	148k	if (--num == 0)
137	50.2k	return c1;
138	98.6k	mul_add(rp[1], ap[1], w, c1);
139	98.6k	if (--num == 0)
140	66.9k	return c1;
141	31.7k	mul_add(rp[2], ap[2], w, c1);
142	31.7k	return c1;
143	31.7k	}
144	93.0k
145	93.0k	return c1;
146	93.0k	}
147
/*
 * bn_mul_words: rp[i] = low word of (ap[i] * w + carry) for i = 0 .. num-1,
 * where carry is the high word produced by the previous position.
 *
 * rp  - destination words (overwritten, previous contents not read)
 * ap  - multiplicand words (read only)
 * num - number of words to process
 * w   - single-word multiplier
 *
 * Returns the carry out of the last word. When num <= 0 nothing is written
 * and 0 is returned.
 *
 * rp and ap are pointers: the body indexes and advances them, so the
 * declarators carry the '*' that the listing had lost.
 */
148		BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
149	95.3k	{
150	95.3k	BN_ULONG c1 = 0;
151	95.3k
152	95.3k	if (num <= 0)
153	0	return c1;
154	95.3k
155	2.54M	while (num & ~3) { /* four words per pass while at least 4 remain */
156	2.44M	mul(rp[0], ap[0], w, c1);
157	2.44M	mul(rp[1], ap[1], w, c1);
158	2.44M	mul(rp[2], ap[2], w, c1);
159	2.44M	mul(rp[3], ap[3], w, c1);
160	2.44M	ap += 4;
161	2.44M	rp += 4;
162	2.44M	num -= 4;
163	2.44M	}
164	95.3k	if (num) { /* 1 to 3 leftover words */
165	73.8k	mul(rp[0], ap[0], w, c1);
166	73.8k	if (--num == 0)
167	40.1k	return c1;
168	33.6k	mul(rp[1], ap[1], w, c1);
169	33.6k	if (--num == 0)
170	21.4k	return c1;
171	12.2k	mul(rp[2], ap[2], w, c1);
172	12.2k	}
173	95.3k	return c1;
174	95.3k	}
175
/*
 * bn_sqr_words: for i = 0 .. n-1, store the double-width square of a[i]
 * as r[2*i] (low word) and r[2*i + 1] (high word).
 *
 * r - destination, receives 2*n words
 * a - input words (read only)
 * n - number of input words; nothing is written when n <= 0
 *
 * The squares are independent of each other; no carry is propagated
 * between positions.
 *
 * r and a are pointers: the body indexes and advances them, so the
 * declarators carry the '*' that the listing had lost.
 */
176		void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
177	0	{
178	0	if (n <= 0)
179	0	return;
180	0
181	0	while (n & ~3) { /* four inputs (eight outputs) per pass */
182	0	sqr(r[0], r[1], a[0]);
183	0	sqr(r[2], r[3], a[1]);
184	0	sqr(r[4], r[5], a[2]);
185	0	sqr(r[6], r[7], a[3]);
186	0	a += 4;
187	0	r += 8;
188	0	n -= 4;
189	0	}
190	0	if (n) { /* 1 to 3 leftover inputs */
191	0	sqr(r[0], r[1], a[0]);
192	0	if (--n == 0)
193	0	return;
194	0	sqr(r[2], r[3], a[1]);
195	0	if (--n == 0)
196	0	return;
197	0	sqr(r[4], r[5], a[2]);
198	0	}
199	0	}
200
/*
 * bn_div_words: divide the double-word value (h:l) by d with a single divq
 * and return the quotient.
 *
 * h - high word of the dividend (loaded into the "d" register)
 * l - low word of the dividend (loaded into the "a" register)
 * d - divisor (any general register)
 *
 * The second asm output ("d" register, named waste) is ignored.
 *
 * NOTE(review): no check for d == 0 or h >= d is visible here; per the x86
 * ISA divq raises a divide error in both cases, so callers presumably
 * guarantee h < d -- confirm against the call sites.
 */
201		BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
202	19.6M	{
203	19.6M	BN_ULONG ret, waste;
204	19.6M
205	19.6M	asm("divq %4":"=a"(ret), "=d"(waste)
206	19.6M	: "a"(l), "d"(h), "r"(d)
207	19.6M	: "cc");
208	19.6M
209	19.6M	return ret;
210	19.6M	}
211
/*
 * bn_add_words: rp[i] = ap[i] + bp[i] + carry for i = 0 .. n-1.
 *
 * rp - destination words
 * ap - first addend (read only)
 * bp - second addend (read only)
 * n  - number of words
 *
 * Returns the carry out of the most significant word (0 or 1); returns 0
 * without touching memory when n <= 0.
 *
 * The whole loop lives in one asm statement so the CPU carry flag can be
 * chained from word to word through adcq. The index is advanced with lea
 * and the counter with dec, neither of which modifies the carry flag.
 * After the loop, sbbq %0,%0 turns the carry flag into 0 / all-ones and
 * the result is masked down to a single bit.
 *
 * rp and ap are base addresses in the asm, like bp, so their declarators
 * carry the '*' that the listing had lost.
 */
212		BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
213		int n)
214	572k	{
215	572k	BN_ULONG ret;
216	572k	size_t i = 0;
217	572k
218	572k	if (n <= 0)
219	0	return 0;
220	572k
221	572k	asm volatile (" subq %0,%0 \n" /* clear carry */
222	572k	" jmp 1f \n"
223	572k	".p2align 4 \n"
224	572k	"1: movq (%4,%2,8),%0 \n"
225	572k	" adcq (%5,%2,8),%0 \n"
226	572k	" movq %0,(%3,%2,8) \n"
227	572k	" lea 1(%2),%2 \n"
228	572k	" dec %1 \n"
229	572k	" jnz 1b \n"
230	572k	" sbbq %0,%0 \n"
231	572k	:"=&r" (ret), "+c"(n), "+r"(i)
232	572k	:"r"(rp), "r"(ap), "r"(bp)
233	572k	:"cc", "memory");
234	572k
235	572k	return ret & 1;
236	572k	}
237
238		# ifndef SIMICS
/*
 * bn_sub_words: rp[i] = ap[i] - bp[i] - borrow for i = 0 .. n-1.
 *
 * rp - destination words
 * ap - minuend (read only)
 * bp - subtrahend (read only)
 * n  - number of words
 *
 * Returns the borrow out of the most significant word (0 or 1); returns 0
 * without touching memory when n <= 0.
 *
 * Same structure as bn_add_words with sbbq in place of adcq: the borrow is
 * kept in the CPU carry flag across iterations (lea and dec leave it
 * alone), then materialised with sbbq %0,%0 and masked to one bit.
 *
 * rp and ap are base addresses in the asm, like bp, so their declarators
 * carry the '*' that the listing had lost.
 */
239		BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
240		int n)
241	533k	{
242	533k	BN_ULONG ret;
243	533k	size_t i = 0;
244	533k
245	533k	if (n <= 0)
246	8.27k	return 0;
247	525k
248	525k	asm volatile (" subq %0,%0 \n" /* clear borrow */
249	525k	" jmp 1f \n"
250	525k	".p2align 4 \n"
251	525k	"1: movq (%4,%2,8),%0 \n"
252	525k	" sbbq (%5,%2,8),%0 \n"
253	525k	" movq %0,(%3,%2,8) \n"
254	525k	" lea 1(%2),%2 \n"
255	525k	" dec %1 \n"
256	525k	" jnz 1b \n"
257	525k	" sbbq %0,%0 \n"
258	525k	:"=&r" (ret), "+c"(n), "+r"(i)
259	525k	:"r"(rp), "r"(ap), "r"(bp)
260	525k	:"cc", "memory");
261	525k
262	525k	return ret & 1;
263	525k	}
264		# else
/*
 * Plain C fallback for bn_sub_words, compiled only when SIMICS is defined.
 *
 * r[i] = a[i] - b[i] - borrow for i = 0 .. n-1; returns the final borrow
 * (0 or 1), or 0 when n <= 0.
 *
 * The parameters are pointers (the body indexes and advances them), and
 * a/b are const-qualified to agree with the inline-asm variant in the other
 * branch of this conditional.
 *
 * Borrow rule used at every position: when t1 == t2 the borrow out equals
 * the borrow in, so c is left unchanged; otherwise the borrow out is
 * simply (t1 < t2).
 */
265		/* Simics 1.4<7 has buggy sbbq:-( */
266		# define BN_MASK2 0xffffffffffffffffL
267		BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
268		{
269		BN_ULONG t1, t2;
270		int c = 0;
271
272		if (n <= 0)
273		return (BN_ULONG)0;
274
275		for (;;) { /* unrolled four words per pass; exits as soon as n runs out */
276		t1 = a[0];
277		t2 = b[0];
278		r[0] = (t1 - t2 - c) & BN_MASK2;
279		if (t1 != t2)
280		c = (t1 < t2);
281		if (--n <= 0)
282		break;
283
284		t1 = a[1];
285		t2 = b[1];
286		r[1] = (t1 - t2 - c) & BN_MASK2;
287		if (t1 != t2)
288		c = (t1 < t2);
289		if (--n <= 0)
290		break;
291
292		t1 = a[2];
293		t2 = b[2];
294		r[2] = (t1 - t2 - c) & BN_MASK2;
295		if (t1 != t2)
296		c = (t1 < t2);
297		if (--n <= 0)
298		break;
299
300		t1 = a[3];
301		t2 = b[3];
302		r[3] = (t1 - t2 - c) & BN_MASK2;
303		if (t1 != t2)
304		c = (t1 < t2);
305		if (--n <= 0)
306		break;
307
308		a += 4;
309		b += 4;
310		r += 4;
311		}
312		return c;
313		}
314		# endif
315
316		/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
317		/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
318		/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
319		/*
320		* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
321		* c=(c2,c1,c0)
322		*/
323
324		/*
325		* Keep in mind that carrying into high part of multiplication result
326		* can not overflow, because it cannot be all-ones.
327		*/
328		# if 0
329		/* original macros are kept for reference purposes */
/*
 * Reference (disabled by "# if 0") form of mul_add_c: (c2,c1,c0) += a*b.
 * BN_UMULT_LOHI is defined outside this file; presumably it stores the
 * double-width product of ta and tb in hi:lo -- confirm.
 * Each carry is detected by comparing the sum with the addend just added.
 */
330		# define mul_add_c(a,b,c0,c1,c2) do { \
331		BN_ULONG ta = (a), tb = (b); \
332		BN_ULONG lo, hi; \
333		BN_UMULT_LOHI(lo,hi,ta,tb); \
334		c0 += lo; hi += (c0<lo)?1:0; \
335		c1 += hi; c2 += (c1<hi)?1:0; \
336		} while(0)
337
/*
 * Reference (disabled by "# if 0") form of mul_add_c2: the product hi:lo
 * is added into (c2,c1,c0) twice. The first pass uses a temporary (tt) for
 * the carry-adjusted high word so that hi itself is still unmodified for
 * the second pass.
 */
338		# define mul_add_c2(a,b,c0,c1,c2) do { \
339		BN_ULONG ta = (a), tb = (b); \
340		BN_ULONG lo, hi, tt; \
341		BN_UMULT_LOHI(lo,hi,ta,tb); \
342		c0 += lo; tt = hi+((c0<lo)?1:0); \
343		c1 += tt; c2 += (c1<tt)?1:0; \
344		c0 += lo; hi += (c0<lo)?1:0; \
345		c1 += hi; c2 += (c1<hi)?1:0; \
346		} while(0)
347
/*
 * Reference (disabled by "# if 0") form of sqr_add_c:
 * (c2,c1,c0) += a[i]*a[i], with carries detected by comparison as in the
 * reference mul_add_c.
 */
348		# define sqr_add_c(a,i,c0,c1,c2) do { \
349		BN_ULONG ta = (a)[i]; \
350		BN_ULONG lo, hi; \
351		BN_UMULT_LOHI(lo,hi,ta,ta); \
352		c0 += lo; hi += (c0<lo)?1:0; \
353		c1 += hi; c2 += (c1<hi)?1:0; \
354		} while(0)
355		# else
/*
 * mul_add_c(a,b,c0,c1,c2): (c2,c1,c0) += a*b.
 *
 * a goes into the "a" register, b must be addressable memory ("m").
 * mulq leaves the product in t2:t1; a single addq/adcq/adcq chain then adds
 * t1 to c0, t2 plus carry to c1, and the remaining carry to c2.
 * c0, c1 and c2 must be lvalues; they are read-write register operands.
 */
356	29.6M	# define mul_add_c(a,b,c0,c1,c2) do { \
357	29.6M	BN_ULONG t1,t2; \
358	29.6M	asm ("mulq %3" \
359	29.6M	: "=a"(t1),"=d"(t2) \
360	29.6M	: "a"(a),"m"(b) \
361	29.6M	: "cc"); \
362	29.6M	asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
363	29.6M	: "+r"(c0),"+r"(c1),"+r"(c2) \
364	29.6M	: "r"(t1),"r"(t2),"g"(0) \
365	29.6M	: "cc"); \
366	29.6M	} while (0)
367
/*
 * sqr_add_c(a,i,c0,c1,c2): (c2,c1,c0) += a[i]*a[i].
 *
 * a[i] is loaded into the "a" register and mulq is applied to that same
 * operand (%2), squaring it into t2:t1; the product is then added into the
 * three-word accumulator exactly as in mul_add_c.
 * Note that the macro argument a is used unparenthesised (a[i]).
 */
368	0	# define sqr_add_c(a,i,c0,c1,c2) do { \
369	0	BN_ULONG t1,t2; \
370	0	asm ("mulq %2" \
371	0	: "=a"(t1),"=d"(t2) \
372	0	: "a"(a[i]) \
373	0	: "cc"); \
374	0	asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
375	0	: "+r"(c0),"+r"(c1),"+r"(c2) \
376	0	: "r"(t1),"r"(t2),"g"(0) \
377	0	: "cc"); \
378	0	} while (0)
379
/*
 * mul_add_c2(a,b,c0,c1,c2): (c2,c1,c0) += 2*a*b.
 *
 * The product t2:t1 is computed once with mulq and then added into the
 * three-word accumulator twice (two identical addq/adcq/adcq chains)
 * instead of being doubled first.
 */
380	0	# define mul_add_c2(a,b,c0,c1,c2) do { \
381	0	BN_ULONG t1,t2; \
382	0	asm ("mulq %3" \
383	0	: "=a"(t1),"=d"(t2) \
384	0	: "a"(a),"m"(b) \
385	0	: "cc"); \
386	0	asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
387	0	: "+r"(c0),"+r"(c1),"+r"(c2) \
388	0	: "r"(t1),"r"(t2),"g"(0) \
389	0	: "cc"); \
390	0	asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
391	0	: "+r"(c0),"+r"(c1),"+r"(c2) \
392	0	: "r"(t1),"r"(t2),"g"(0) \
393	0	: "cc"); \
394	0	} while (0)
395		# endif
396
/*
 * sqr_add_c2(a,i,j,c0,c1,c2): (c2,c1,c0) += 2*a[i]*a[j].
 * Thin wrapper that forwards the two array elements to mul_add_c2.
 */
397		# define sqr_add_c2(a,i,j,c0,c1,c2) \
398	0	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
399
/*
 * bn_mul_comba8: r[0..15] = a[0..7] * b[0..7].
 *
 * The product is built one result word at a time. For result word k every
 * partial product a[i]*b[j] with i + j == k is added into a three-word
 * accumulator with mul_add_c. The lowest accumulator word is then stored
 * to r[k] and cleared, and the roles of c1/c2/c3 rotate so that the cleared
 * word becomes the new top word for the next result word.
 *
 * r - receives 16 words
 * a - 8 input words
 * b - 8 input words
 *
 * r and a are pointers like b (all three are indexed below), so their
 * declarators carry the '*' that the listing had lost.
 */
400		void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
401	463k	{
402	463k	BN_ULONG c1, c2, c3;
403	463k
404	463k	c1 = 0;
405	463k	c2 = 0;
406	463k	c3 = 0;
407	463k	mul_add_c(a[0], b[0], c1, c2, c3);
408	463k	r[0] = c1;
409	463k	c1 = 0;
410	463k	mul_add_c(a[0], b[1], c2, c3, c1);
411	463k	mul_add_c(a[1], b[0], c2, c3, c1);
412	463k	r[1] = c2;
413	463k	c2 = 0;
414	463k	mul_add_c(a[2], b[0], c3, c1, c2);
415	463k	mul_add_c(a[1], b[1], c3, c1, c2);
416	463k	mul_add_c(a[0], b[2], c3, c1, c2);
417	463k	r[2] = c3;
418	463k	c3 = 0;
419	463k	mul_add_c(a[0], b[3], c1, c2, c3);
420	463k	mul_add_c(a[1], b[2], c1, c2, c3);
421	463k	mul_add_c(a[2], b[1], c1, c2, c3);
422	463k	mul_add_c(a[3], b[0], c1, c2, c3);
423	463k	r[3] = c1;
424	463k	c1 = 0;
425	463k	mul_add_c(a[4], b[0], c2, c3, c1);
426	463k	mul_add_c(a[3], b[1], c2, c3, c1);
427	463k	mul_add_c(a[2], b[2], c2, c3, c1);
428	463k	mul_add_c(a[1], b[3], c2, c3, c1);
429	463k	mul_add_c(a[0], b[4], c2, c3, c1);
430	463k	r[4] = c2;
431	463k	c2 = 0;
432	463k	mul_add_c(a[0], b[5], c3, c1, c2);
433	463k	mul_add_c(a[1], b[4], c3, c1, c2);
434	463k	mul_add_c(a[2], b[3], c3, c1, c2);
435	463k	mul_add_c(a[3], b[2], c3, c1, c2);
436	463k	mul_add_c(a[4], b[1], c3, c1, c2);
437	463k	mul_add_c(a[5], b[0], c3, c1, c2);
438	463k	r[5] = c3;
439	463k	c3 = 0;
440	463k	mul_add_c(a[6], b[0], c1, c2, c3);
441	463k	mul_add_c(a[5], b[1], c1, c2, c3);
442	463k	mul_add_c(a[4], b[2], c1, c2, c3);
443	463k	mul_add_c(a[3], b[3], c1, c2, c3);
444	463k	mul_add_c(a[2], b[4], c1, c2, c3);
445	463k	mul_add_c(a[1], b[5], c1, c2, c3);
446	463k	mul_add_c(a[0], b[6], c1, c2, c3);
447	463k	r[6] = c1;
448	463k	c1 = 0;
449	463k	mul_add_c(a[0], b[7], c2, c3, c1);
450	463k	mul_add_c(a[1], b[6], c2, c3, c1);
451	463k	mul_add_c(a[2], b[5], c2, c3, c1);
452	463k	mul_add_c(a[3], b[4], c2, c3, c1);
453	463k	mul_add_c(a[4], b[3], c2, c3, c1);
454	463k	mul_add_c(a[5], b[2], c2, c3, c1);
455	463k	mul_add_c(a[6], b[1], c2, c3, c1);
456	463k	mul_add_c(a[7], b[0], c2, c3, c1);
457	463k	r[7] = c2;
458	463k	c2 = 0;
459	463k	mul_add_c(a[7], b[1], c3, c1, c2);
460	463k	mul_add_c(a[6], b[2], c3, c1, c2);
461	463k	mul_add_c(a[5], b[3], c3, c1, c2);
462	463k	mul_add_c(a[4], b[4], c3, c1, c2);
463	463k	mul_add_c(a[3], b[5], c3, c1, c2);
464	463k	mul_add_c(a[2], b[6], c3, c1, c2);
465	463k	mul_add_c(a[1], b[7], c3, c1, c2);
466	463k	r[8] = c3;
467	463k	c3 = 0;
468	463k	mul_add_c(a[2], b[7], c1, c2, c3);
469	463k	mul_add_c(a[3], b[6], c1, c2, c3);
470	463k	mul_add_c(a[4], b[5], c1, c2, c3);
471	463k	mul_add_c(a[5], b[4], c1, c2, c3);
472	463k	mul_add_c(a[6], b[3], c1, c2, c3);
473	463k	mul_add_c(a[7], b[2], c1, c2, c3);
474	463k	r[9] = c1;
475	463k	c1 = 0;
476	463k	mul_add_c(a[7], b[3], c2, c3, c1);
477	463k	mul_add_c(a[6], b[4], c2, c3, c1);
478	463k	mul_add_c(a[5], b[5], c2, c3, c1);
479	463k	mul_add_c(a[4], b[6], c2, c3, c1);
480	463k	mul_add_c(a[3], b[7], c2, c3, c1);
481	463k	r[10] = c2;
482	463k	c2 = 0;
483	463k	mul_add_c(a[4], b[7], c3, c1, c2);
484	463k	mul_add_c(a[5], b[6], c3, c1, c2);
485	463k	mul_add_c(a[6], b[5], c3, c1, c2);
486	463k	mul_add_c(a[7], b[4], c3, c1, c2);
487	463k	r[11] = c3;
488	463k	c3 = 0;
489	463k	mul_add_c(a[7], b[5], c1, c2, c3);
490	463k	mul_add_c(a[6], b[6], c1, c2, c3);
491	463k	mul_add_c(a[5], b[7], c1, c2, c3);
492	463k	r[12] = c1;
493	463k	c1 = 0;
494	463k	mul_add_c(a[6], b[7], c2, c3, c1);
495	463k	mul_add_c(a[7], b[6], c2, c3, c1);
496	463k	r[13] = c2;
497	463k	c2 = 0;
498	463k	mul_add_c(a[7], b[7], c3, c1, c2);
499	463k	r[14] = c3;
500	463k	r[15] = c1; /* top word: what is left in the middle accumulator word */
501	463k	}
502
/*
 * bn_mul_comba4: r[0..7] = a[0..3] * b[0..3].
 *
 * Four-word counterpart of bn_mul_comba8. For each result word k, all
 * partial products a[i]*b[j] with i + j == k are added into a three-word
 * accumulator with mul_add_c. The lowest word is stored to r[k] and
 * cleared, then c1/c2/c3 rotate roles.
 *
 * r - receives 8 words
 * a - 4 input words
 * b - 4 input words
 *
 * r and a are pointers like b (all three are indexed below), so their
 * declarators carry the '*' that the listing had lost.
 */
503		void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
504	0	{
505	0	BN_ULONG c1, c2, c3;
506	0
507	0	c1 = 0;
508	0	c2 = 0;
509	0	c3 = 0;
510	0	mul_add_c(a[0], b[0], c1, c2, c3);
511	0	r[0] = c1;
512	0	c1 = 0;
513	0	mul_add_c(a[0], b[1], c2, c3, c1);
514	0	mul_add_c(a[1], b[0], c2, c3, c1);
515	0	r[1] = c2;
516	0	c2 = 0;
517	0	mul_add_c(a[2], b[0], c3, c1, c2);
518	0	mul_add_c(a[1], b[1], c3, c1, c2);
519	0	mul_add_c(a[0], b[2], c3, c1, c2);
520	0	r[2] = c3;
521	0	c3 = 0;
522	0	mul_add_c(a[0], b[3], c1, c2, c3);
523	0	mul_add_c(a[1], b[2], c1, c2, c3);
524	0	mul_add_c(a[2], b[1], c1, c2, c3);
525	0	mul_add_c(a[3], b[0], c1, c2, c3);
526	0	r[3] = c1;
527	0	c1 = 0;
528	0	mul_add_c(a[3], b[1], c2, c3, c1);
529	0	mul_add_c(a[2], b[2], c2, c3, c1);
530	0	mul_add_c(a[1], b[3], c2, c3, c1);
531	0	r[4] = c2;
532	0	c2 = 0;
533	0	mul_add_c(a[2], b[3], c3, c1, c2);
534	0	mul_add_c(a[3], b[2], c3, c1, c2);
535	0	r[5] = c3;
536	0	c3 = 0;
537	0	mul_add_c(a[3], b[3], c1, c2, c3);
538	0	r[6] = c1;
539	0	r[7] = c2; /* top word: what is left in the middle accumulator word */
540	0	}
541
/*
 * bn_sqr_comba8: r[0..15] = a[0..7] squared.
 *
 * Same column-by-column scheme as bn_mul_comba8, using the symmetry of
 * squaring. For result word k, each pair i > j with i + j == k contributes
 * 2*a[i]*a[j] through sqr_add_c2, and when k is even the diagonal term
 * a[k/2]^2 is added once through sqr_add_c. After each result word the
 * lowest accumulator word is stored and cleared and c1/c2/c3 rotate roles.
 *
 * r - receives 16 words
 * a - 8 input words (read only)
 *
 * r and a are pointers (both are indexed below), so their declarators
 * carry the '*' that the listing had lost.
 */
542		void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
543	0	{
544	0	BN_ULONG c1, c2, c3;
545	0
546	0	c1 = 0;
547	0	c2 = 0;
548	0	c3 = 0;
549	0	sqr_add_c(a, 0, c1, c2, c3);
550	0	r[0] = c1;
551	0	c1 = 0;
552	0	sqr_add_c2(a, 1, 0, c2, c3, c1);
553	0	r[1] = c2;
554	0	c2 = 0;
555	0	sqr_add_c(a, 1, c3, c1, c2);
556	0	sqr_add_c2(a, 2, 0, c3, c1, c2);
557	0	r[2] = c3;
558	0	c3 = 0;
559	0	sqr_add_c2(a, 3, 0, c1, c2, c3);
560	0	sqr_add_c2(a, 2, 1, c1, c2, c3);
561	0	r[3] = c1;
562	0	c1 = 0;
563	0	sqr_add_c(a, 2, c2, c3, c1);
564	0	sqr_add_c2(a, 3, 1, c2, c3, c1);
565	0	sqr_add_c2(a, 4, 0, c2, c3, c1);
566	0	r[4] = c2;
567	0	c2 = 0;
568	0	sqr_add_c2(a, 5, 0, c3, c1, c2);
569	0	sqr_add_c2(a, 4, 1, c3, c1, c2);
570	0	sqr_add_c2(a, 3, 2, c3, c1, c2);
571	0	r[5] = c3;
572	0	c3 = 0;
573	0	sqr_add_c(a, 3, c1, c2, c3);
574	0	sqr_add_c2(a, 4, 2, c1, c2, c3);
575	0	sqr_add_c2(a, 5, 1, c1, c2, c3);
576	0	sqr_add_c2(a, 6, 0, c1, c2, c3);
577	0	r[6] = c1;
578	0	c1 = 0;
579	0	sqr_add_c2(a, 7, 0, c2, c3, c1);
580	0	sqr_add_c2(a, 6, 1, c2, c3, c1);
581	0	sqr_add_c2(a, 5, 2, c2, c3, c1);
582	0	sqr_add_c2(a, 4, 3, c2, c3, c1);
583	0	r[7] = c2;
584	0	c2 = 0;
585	0	sqr_add_c(a, 4, c3, c1, c2);
586	0	sqr_add_c2(a, 5, 3, c3, c1, c2);
587	0	sqr_add_c2(a, 6, 2, c3, c1, c2);
588	0	sqr_add_c2(a, 7, 1, c3, c1, c2);
589	0	r[8] = c3;
590	0	c3 = 0;
591	0	sqr_add_c2(a, 7, 2, c1, c2, c3);
592	0	sqr_add_c2(a, 6, 3, c1, c2, c3);
593	0	sqr_add_c2(a, 5, 4, c1, c2, c3);
594	0	r[9] = c1;
595	0	c1 = 0;
596	0	sqr_add_c(a, 5, c2, c3, c1);
597	0	sqr_add_c2(a, 6, 4, c2, c3, c1);
598	0	sqr_add_c2(a, 7, 3, c2, c3, c1);
599	0	r[10] = c2;
600	0	c2 = 0;
601	0	sqr_add_c2(a, 7, 4, c3, c1, c2);
602	0	sqr_add_c2(a, 6, 5, c3, c1, c2);
603	0	r[11] = c3;
604	0	c3 = 0;
605	0	sqr_add_c(a, 6, c1, c2, c3);
606	0	sqr_add_c2(a, 7, 5, c1, c2, c3);
607	0	r[12] = c1;
608	0	c1 = 0;
609	0	sqr_add_c2(a, 7, 6, c2, c3, c1);
610	0	r[13] = c2;
611	0	c2 = 0;
612	0	sqr_add_c(a, 7, c3, c1, c2);
613	0	r[14] = c3;
614	0	r[15] = c1; /* top word: what is left in the middle accumulator word */
615	0	}
616
/*
 * bn_sqr_comba4: r[0..7] = a[0..3] squared.
 *
 * Four-word counterpart of bn_sqr_comba8. For result word k, each pair
 * i > j with i + j == k contributes 2*a[i]*a[j] through sqr_add_c2, and
 * even k adds the diagonal term a[k/2]^2 through sqr_add_c. The lowest
 * accumulator word is stored and cleared after each result word and
 * c1/c2/c3 rotate roles.
 *
 * r - receives 8 words
 * a - 4 input words (read only)
 *
 * r and a are pointers (both are indexed below), so their declarators
 * carry the '*' that the listing had lost.
 */
617		void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
618	0	{
619	0	BN_ULONG c1, c2, c3;
620	0
621	0	c1 = 0;
622	0	c2 = 0;
623	0	c3 = 0;
624	0	sqr_add_c(a, 0, c1, c2, c3);
625	0	r[0] = c1;
626	0	c1 = 0;
627	0	sqr_add_c2(a, 1, 0, c2, c3, c1);
628	0	r[1] = c2;
629	0	c2 = 0;
630	0	sqr_add_c(a, 1, c3, c1, c2);
631	0	sqr_add_c2(a, 2, 0, c3, c1, c2);
632	0	r[2] = c3;
633	0	c3 = 0;
634	0	sqr_add_c2(a, 3, 0, c1, c2, c3);
635	0	sqr_add_c2(a, 2, 1, c1, c2, c3);
636	0	r[3] = c1;
637	0	c1 = 0;
638	0	sqr_add_c(a, 2, c2, c3, c1);
639	0	sqr_add_c2(a, 3, 1, c2, c3, c1);
640	0	r[4] = c2;
641	0	c2 = 0;
642	0	sqr_add_c2(a, 3, 2, c3, c1, c2);
643	0	r[5] = c3;
644	0	c3 = 0;
645	0	sqr_add_c(a, 3, c1, c2, c3);
646	0	r[6] = c1;
647	0	r[7] = c2; /* top word: what is left in the middle accumulator word */
648	0	}
649		#endif

Coverage Report

Created: 2018-08-29 13:53