/src/openssl31/crypto/bn/asm/x86_64-gcc.c

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright 2002-2018 The OpenSSL Project Authors. All Rights Reserved.
3		*
4		* Licensed under the Apache License 2.0 (the "License"). You may not use
5		* this file except in compliance with the License. You can obtain a copy
6		* in the file LICENSE in the source distribution or at
7		* https://www.openssl.org/source/license.html
8		*/
9
10		#include "../bn_local.h"
11		#if !(defined(__GNUC__) && __GNUC__>=2)
12		# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
13		#else
14		/*-
15		* x86_64 BIGNUM accelerator version 0.1, December 2002.
16		*
17		* Implemented by Andy Polyakov <appro@openssl.org> for the OpenSSL
18		* project.
19		*
20		* Rights for redistribution and usage in source and binary forms are
21		* granted according to the License. Warranty of any kind is disclaimed.
22		*
23		* Q. Version 0.1? It doesn't sound like Andy, he used to assign real
24		* versions, like 1.0...
25		* A. Well, that's because this code is basically a quick-n-dirty
26		* proof-of-concept hack. As you can see it's implemented with
27		* inline assembler, which means that you're bound to GCC and that
28		* there might be enough room for further improvement.
29		*
30		* Q. Why inline assembler?
31		* A. x86_64 features own ABI which I'm not familiar with. This is
32		* why I decided to let the compiler take care of subroutine
33		* prologue/epilogue as well as register allocation. For reference.
34		* Win64 implements different ABI for AMD64, different from Linux.
35		*
36		* Q. How much faster does it get?
37		* A. 'apps/openssl speed rsa dsa' output with no-asm:
38		*
39		* sign verify sign/s verify/s
40		* rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
41		* rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
42		* rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
43		* rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
44		* sign verify sign/s verify/s
45		* dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
46		* dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
47		* dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
48		*
49		* 'apps/openssl speed rsa dsa' output with this module:
50		*
51		* sign verify sign/s verify/s
52		* rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
53		* rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
54		* rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
55		* rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
56		* sign verify sign/s verify/s
57		* dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
58		* dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
59		* dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
60		*
61		* For the reference. IA-32 assembler implementation performs
62		* very much like 64-bit code compiled with no-asm on the same
63		* machine.
64		*/
65
66		# undef mul
67		# undef mul_add
68
/*-
 * "m"(a), "+m"(r)      is the way to favor DirectPath µ-code;
 * "g"(0)               lets the compiler decide where it
 *                      wants to keep the value of zero.
 */
/*
 * mul_add(r, a, word, carry): (carry, r) = a * word + r + carry
 *
 *   r     - lvalue in memory; read, then overwritten with the low word
 *   a     - lvalue in memory; multiplicand, only read
 *   word  - multiplier, placed in %rax for mulq
 *   carry - lvalue; carry-in on entry, high word of the sum on exit
 *
 * Step 1 multiplies, step 2 folds the incoming carry into the low word,
 * step 3 adds that low word into r; each step pushes its carry-out into
 * `high`, which finally becomes the new carry.
 *
 * __asm__ (rather than asm) keeps the macro usable under strict -std= modes.
 */
# define mul_add(r,a,word,carry) do {   \
        register BN_ULONG high,low;     \
        __asm__ ("mulq %3"              \
                : "=a"(low),"=d"(high)  \
                : "a"(word),"m"(a)      \
                : "cc");                \
        __asm__ ("addq %2,%0; adcq %3,%1" \
                : "+r"(carry),"+d"(high)\
                : "a"(low),"g"(0)       \
                : "cc");                \
        __asm__ ("addq %2,%0; adcq %3,%1" \
                : "+m"(r),"+d"(high)    \
                : "r"(carry),"g"(0)     \
                : "cc");                \
        carry=high;                     \
        } while (0)
90
/*
 * mul(r, a, word, carry): (carry, r) = a * word + carry
 *
 *   r     - lvalue; receives the low word of the result
 *   a     - multiplicand; may live in a register or in memory
 *   word  - multiplier, placed in %rax for mulq
 *   carry - lvalue; carry-in on entry, high word of the result on exit
 *
 * The multiplicand constraint is "rm" rather than "g": "g" would also
 * allow an immediate operand, which mulq cannot encode.
 *
 * __asm__ (rather than asm) keeps the macro usable under strict -std= modes.
 */
# define mul(r,a,word,carry) do {       \
        register BN_ULONG high,low;     \
        __asm__ ("mulq %3"              \
                : "=a"(low),"=d"(high)  \
                : "a"(word),"rm"(a)     \
                : "cc");                \
        __asm__ ("addq %2,%0; adcq %3,%1" \
                : "+r"(carry),"+d"(high)\
                : "a"(low),"g"(0)       \
                : "cc");                \
        (r)=carry, carry=high;          \
        } while (0)
# undef sqr
/*
 * sqr(r0, r1, a): (r1, r0) = a * a
 *
 *   r0 - lvalue; receives the low word of the square (from %rax)
 *   r1 - lvalue; receives the high word of the square (from %rdx)
 *   a  - value to square
 *
 * The expansion already ends in ';', so this macro must only be used as a
 * complete statement inside a braced block.
 */
# define sqr(r0,r1,a)                   \
        __asm__ ("mulq %2"              \
                : "=a"(r0),"=d"(r1)     \
                : "a"(a)                \
                : "cc");
109
110		BN_ULONG bn_mul_add_words(BN_ULONG rp, const BN_ULONG ap, int num,
111		BN_ULONG w)
112	92.5M	{
113	92.5M	BN_ULONG c1 = 0;
114
115	92.5M	if (num <= 0)
116	0	return c1;
117
118	22.3G	while (num & ~3) {
119	22.2G	mul_add(rp[0], ap[0], w, c1);
120	22.2G	mul_add(rp[1], ap[1], w, c1);
121	22.2G	mul_add(rp[2], ap[2], w, c1);
122	22.2G	mul_add(rp[3], ap[3], w, c1);
123	22.2G	ap += 4;
124	22.2G	rp += 4;
125	22.2G	num -= 4;
126	22.2G	}
127	92.5M	if (num) {
128	57.2M	mul_add(rp[0], ap[0], w, c1);
129	57.2M	if (--num == 0)
130	19.9M	return c1;
131	37.3M	mul_add(rp[1], ap[1], w, c1);
132	37.3M	if (--num == 0)
133	15.2M	return c1;
134	22.0M	mul_add(rp[2], ap[2], w, c1);
135	22.0M	return c1;
136	37.3M	}
137
138	35.2M	return c1;
139	92.5M	}
140
141		BN_ULONG bn_mul_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
142	327M	{
143	327M	BN_ULONG c1 = 0;
144
145	327M	if (num <= 0)
146	0	return c1;
147
148	4.63G	while (num & ~3) {
149	4.30G	mul(rp[0], ap[0], w, c1);
150	4.30G	mul(rp[1], ap[1], w, c1);
151	4.30G	mul(rp[2], ap[2], w, c1);
152	4.30G	mul(rp[3], ap[3], w, c1);
153	4.30G	ap += 4;
154	4.30G	rp += 4;
155	4.30G	num -= 4;
156	4.30G	}
157	327M	if (num) {
158	38.8M	mul(rp[0], ap[0], w, c1);
159	38.8M	if (--num == 0)
160	17.9M	return c1;
161	20.9M	mul(rp[1], ap[1], w, c1);
162	20.9M	if (--num == 0)
163	11.3M	return c1;
164	9.54M	mul(rp[2], ap[2], w, c1);
165	9.54M	}
166	298M	return c1;
167	327M	}
168
169		void bn_sqr_words(BN_ULONG r, const BN_ULONG a, int n)
170	2.67M	{
171	2.67M	if (n <= 0)
172	0	return;
173
174	10.1M	while (n & ~3) {
175	7.45M	sqr(r[0], r[1], a[0]);
176	7.45M	sqr(r[2], r[3], a[1]);
177	7.45M	sqr(r[4], r[5], a[2]);
178	7.45M	sqr(r[6], r[7], a[3]);
179	7.45M	a += 4;
180	7.45M	r += 8;
181	7.45M	n -= 4;
182	7.45M	}
183	2.67M	if (n) {
184	2.62M	sqr(r[0], r[1], a[0]);
185	2.62M	if (--n == 0)
186	2.21M	return;
187	404k	sqr(r[2], r[3], a[1]);
188	404k	if (--n == 0)
189	212k	return;
190	191k	sqr(r[4], r[5], a[2]);
191	191k	}
192	2.67M	}
193
194		BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
195	306M	{
196	306M	BN_ULONG ret, waste;
197
198	306M	asm("divq %4":"=a"(ret), "=d"(waste)
199	306M	: "a"(l), "d"(h), "r"(d)
200	306M	: "cc");
201
202	306M	return ret;
203	306M	}
204
205		BN_ULONG bn_add_words(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
206		int n)
207	474M	{
208	474M	BN_ULONG ret;
209	474M	size_t i = 0;
210
211	474M	if (n <= 0)
212	3.86M	return 0;
213
214	471M	asm volatile (" subq %0,%0 \n" /* clear carry */
215	471M	" jmp 1f \n"
216	471M	".p2align 4 \n"
217	471M	"1: movq (%4,%2,8),%0 \n"
218	471M	" adcq (%5,%2,8),%0 \n"
219	471M	" movq %0,(%3,%2,8) \n"
220	471M	" lea 1(%2),%2 \n"
221	471M	" dec %1 \n"
222	471M	" jnz 1b \n"
223	471M	" sbbq %0,%0 \n"
224	471M	:"=&r" (ret), "+c"(n), "+r"(i)
225	471M	:"r"(rp), "r"(ap), "r"(bp)
226	471M	:"cc", "memory");
227
228	471M	return ret & 1;
229	474M	}
230
231		# ifndef SIMICS
232		BN_ULONG bn_sub_words(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
233		int n)
234	476M	{
235	476M	BN_ULONG ret;
236	476M	size_t i = 0;
237
238	476M	if (n <= 0)
239	3.26M	return 0;
240
241	473M	asm volatile (" subq %0,%0 \n" /* clear borrow */
242	473M	" jmp 1f \n"
243	473M	".p2align 4 \n"
244	473M	"1: movq (%4,%2,8),%0 \n"
245	473M	" sbbq (%5,%2,8),%0 \n"
246	473M	" movq %0,(%3,%2,8) \n"
247	473M	" lea 1(%2),%2 \n"
248	473M	" dec %1 \n"
249	473M	" jnz 1b \n"
250	473M	" sbbq %0,%0 \n"
251	473M	:"=&r" (ret), "+c"(n), "+r"(i)
252	473M	:"r"(rp), "r"(ap), "r"(bp)
253	473M	:"cc", "memory");
254
255	473M	return ret & 1;
256	476M	}
257		# else
258		/* Simics 1.4<7 has buggy sbbq:-( */
259		# define BN_MASK2 0xffffffffffffffffL
260		BN_ULONG bn_sub_words(BN_ULONG r, BN_ULONG a, BN_ULONG *b, int n)
261		{
262		BN_ULONG t1, t2;
263		int c = 0;
264
265		if (n <= 0)
266		return (BN_ULONG)0;
267
268		for (;;) {
269		t1 = a[0];
270		t2 = b[0];
271		r[0] = (t1 - t2 - c) & BN_MASK2;
272		if (t1 != t2)
273		c = (t1 < t2);
274		if (--n <= 0)
275		break;
276
277		t1 = a[1];
278		t2 = b[1];
279		r[1] = (t1 - t2 - c) & BN_MASK2;
280		if (t1 != t2)
281		c = (t1 < t2);
282		if (--n <= 0)
283		break;
284
285		t1 = a[2];
286		t2 = b[2];
287		r[2] = (t1 - t2 - c) & BN_MASK2;
288		if (t1 != t2)
289		c = (t1 < t2);
290		if (--n <= 0)
291		break;
292
293		t1 = a[3];
294		t2 = b[3];
295		r[3] = (t1 - t2 - c) & BN_MASK2;
296		if (t1 != t2)
297		c = (t1 < t2);
298		if (--n <= 0)
299		break;
300
301		a += 4;
302		b += 4;
303		r += 4;
304		}
305		return c;
306		}
307		# endif
308
/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
 * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0)
 */
316
317		/*
318		* Keep in mind that carrying into high part of multiplication result
319		* can not overflow, because it cannot be all-ones.
320		*/
321		# if 0
322		/* original macros are kept for reference purposes */
/*
 * Reference (portable C) form of mul_add_c: adds the double-word product
 * a*b into the three-word accumulator (c2,c1,c0). Each comparison after an
 * addition detects the wrap-around, i.e. the carry into the next word.
 */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo, hi;                        \
        BN_UMULT_LOHI(lo,hi,ta,tb);             \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)
330
/*
 * Reference (portable C) form of mul_add_c2: adds the double-word product
 * a*b into the three-word accumulator (c2,c1,c0) twice. The first pass
 * uses the scratch word tt so that hi is still intact for the second pass.
 */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo, hi, tt;                    \
        BN_UMULT_LOHI(lo,hi,ta,tb);             \
        c0 += lo; tt = hi+((c0<lo)?1:0);        \
        c1 += tt; c2 += (c1<tt)?1:0;            \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)
340
/*
 * Reference (portable C) form of sqr_add_c: adds the double-word square
 * of a[i] into the three-word accumulator (c2,c1,c0).
 */
#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG ta = (a)[i];                   \
        BN_ULONG lo, hi;                        \
        BN_UMULT_LOHI(lo,hi,ta,ta);             \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)
348		# else
/*
 * mul_add_c(a, b, c0, c1, c2): (c2,c1,c0) += a * b
 *
 *   a          - first factor, placed in %rax for mulq
 *   b          - second factor; must be an lvalue in memory ("m")
 *   c0, c1, c2 - lvalues forming the three-word accumulator, low to high
 *
 * The product's low/high words are added to c0/c1 and the final carry is
 * absorbed by c2.
 *
 * __asm__ (rather than asm) keeps the macro usable under strict -std= modes.
 */
#  define mul_add_c(a,b,c0,c1,c2) do {  \
        BN_ULONG t1,t2;                 \
        __asm__ ("mulq %3"              \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
        __asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
                : "+r"(c0),"+r"(c1),"+r"(c2)          \
                : "r"(t1),"r"(t2),"g"(0)              \
                : "cc");                \
        } while (0)
360
/*
 * sqr_add_c(a, i, c0, c1, c2): (c2,c1,c0) += a[i] * a[i]
 *
 *   a          - word array
 *   i          - index of the word to square
 *   c0, c1, c2 - lvalues forming the three-word accumulator, low to high
 *
 * The array argument is parenthesised before indexing so that an
 * expression argument is indexed as a whole.
 *
 * __asm__ (rather than asm) keeps the macro usable under strict -std= modes.
 */
#  define sqr_add_c(a,i,c0,c1,c2) do {  \
        BN_ULONG t1,t2;                 \
        __asm__ ("mulq %2"              \
                : "=a"(t1),"=d"(t2)     \
                : "a"((a)[i])           \
                : "cc");                \
        __asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
                : "+r"(c0),"+r"(c1),"+r"(c2)          \
                : "r"(t1),"r"(t2),"g"(0)              \
                : "cc");                \
        } while (0)
372
/*
 * mul_add_c2(a, b, c0, c1, c2): (c2,c1,c0) += 2 * a * b
 *
 *   a          - first factor, placed in %rax for mulq
 *   b          - second factor; must be an lvalue in memory ("m")
 *   c0, c1, c2 - lvalues forming the three-word accumulator, low to high
 *
 * The product is computed once and then added to the accumulator twice,
 * instead of being doubled first.
 *
 * __asm__ (rather than asm) keeps the macro usable under strict -std= modes.
 */
#  define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG t1,t2;                 \
        __asm__ ("mulq %3"              \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
        __asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
                : "+r"(c0),"+r"(c1),"+r"(c2)          \
                : "r"(t1),"r"(t2),"g"(0)              \
                : "cc");                \
        __asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
                : "+r"(c0),"+r"(c1),"+r"(c2)          \
                : "r"(t1),"r"(t2),"g"(0)              \
                : "cc");                \
        } while (0)
388		# endif
389
/*
 * sqr_add_c2(a, i, j, c0, c1, c2): (c2,c1,c0) += 2 * a[i] * a[j]
 *
 * Thin wrapper that forwards the two array elements to mul_add_c2.
 */
# define sqr_add_c2(a,i,j,c0,c1,c2)      \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
392
393		void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
394	50.7M	{
395	50.7M	BN_ULONG c1, c2, c3;
396
397	50.7M	c1 = 0;
398	50.7M	c2 = 0;
399	50.7M	c3 = 0;
400	50.7M	mul_add_c(a[0], b[0], c1, c2, c3);
401	50.7M	r[0] = c1;
402	50.7M	c1 = 0;
403	50.7M	mul_add_c(a[0], b[1], c2, c3, c1);
404	50.7M	mul_add_c(a[1], b[0], c2, c3, c1);
405	50.7M	r[1] = c2;
406	50.7M	c2 = 0;
407	50.7M	mul_add_c(a[2], b[0], c3, c1, c2);
408	50.7M	mul_add_c(a[1], b[1], c3, c1, c2);
409	50.7M	mul_add_c(a[0], b[2], c3, c1, c2);
410	50.7M	r[2] = c3;
411	50.7M	c3 = 0;
412	50.7M	mul_add_c(a[0], b[3], c1, c2, c3);
413	50.7M	mul_add_c(a[1], b[2], c1, c2, c3);
414	50.7M	mul_add_c(a[2], b[1], c1, c2, c3);
415	50.7M	mul_add_c(a[3], b[0], c1, c2, c3);
416	50.7M	r[3] = c1;
417	50.7M	c1 = 0;
418	50.7M	mul_add_c(a[4], b[0], c2, c3, c1);
419	50.7M	mul_add_c(a[3], b[1], c2, c3, c1);
420	50.7M	mul_add_c(a[2], b[2], c2, c3, c1);
421	50.7M	mul_add_c(a[1], b[3], c2, c3, c1);
422	50.7M	mul_add_c(a[0], b[4], c2, c3, c1);
423	50.7M	r[4] = c2;
424	50.7M	c2 = 0;
425	50.7M	mul_add_c(a[0], b[5], c3, c1, c2);
426	50.7M	mul_add_c(a[1], b[4], c3, c1, c2);
427	50.7M	mul_add_c(a[2], b[3], c3, c1, c2);
428	50.7M	mul_add_c(a[3], b[2], c3, c1, c2);
429	50.7M	mul_add_c(a[4], b[1], c3, c1, c2);
430	50.7M	mul_add_c(a[5], b[0], c3, c1, c2);
431	50.7M	r[5] = c3;
432	50.7M	c3 = 0;
433	50.7M	mul_add_c(a[6], b[0], c1, c2, c3);
434	50.7M	mul_add_c(a[5], b[1], c1, c2, c3);
435	50.7M	mul_add_c(a[4], b[2], c1, c2, c3);
436	50.7M	mul_add_c(a[3], b[3], c1, c2, c3);
437	50.7M	mul_add_c(a[2], b[4], c1, c2, c3);
438	50.7M	mul_add_c(a[1], b[5], c1, c2, c3);
439	50.7M	mul_add_c(a[0], b[6], c1, c2, c3);
440	50.7M	r[6] = c1;
441	50.7M	c1 = 0;
442	50.7M	mul_add_c(a[0], b[7], c2, c3, c1);
443	50.7M	mul_add_c(a[1], b[6], c2, c3, c1);
444	50.7M	mul_add_c(a[2], b[5], c2, c3, c1);
445	50.7M	mul_add_c(a[3], b[4], c2, c3, c1);
446	50.7M	mul_add_c(a[4], b[3], c2, c3, c1);
447	50.7M	mul_add_c(a[5], b[2], c2, c3, c1);
448	50.7M	mul_add_c(a[6], b[1], c2, c3, c1);
449	50.7M	mul_add_c(a[7], b[0], c2, c3, c1);
450	50.7M	r[7] = c2;
451	50.7M	c2 = 0;
452	50.7M	mul_add_c(a[7], b[1], c3, c1, c2);
453	50.7M	mul_add_c(a[6], b[2], c3, c1, c2);
454	50.7M	mul_add_c(a[5], b[3], c3, c1, c2);
455	50.7M	mul_add_c(a[4], b[4], c3, c1, c2);
456	50.7M	mul_add_c(a[3], b[5], c3, c1, c2);
457	50.7M	mul_add_c(a[2], b[6], c3, c1, c2);
458	50.7M	mul_add_c(a[1], b[7], c3, c1, c2);
459	50.7M	r[8] = c3;
460	50.7M	c3 = 0;
461	50.7M	mul_add_c(a[2], b[7], c1, c2, c3);
462	50.7M	mul_add_c(a[3], b[6], c1, c2, c3);
463	50.7M	mul_add_c(a[4], b[5], c1, c2, c3);
464	50.7M	mul_add_c(a[5], b[4], c1, c2, c3);
465	50.7M	mul_add_c(a[6], b[3], c1, c2, c3);
466	50.7M	mul_add_c(a[7], b[2], c1, c2, c3);
467	50.7M	r[9] = c1;
468	50.7M	c1 = 0;
469	50.7M	mul_add_c(a[7], b[3], c2, c3, c1);
470	50.7M	mul_add_c(a[6], b[4], c2, c3, c1);
471	50.7M	mul_add_c(a[5], b[5], c2, c3, c1);
472	50.7M	mul_add_c(a[4], b[6], c2, c3, c1);
473	50.7M	mul_add_c(a[3], b[7], c2, c3, c1);
474	50.7M	r[10] = c2;
475	50.7M	c2 = 0;
476	50.7M	mul_add_c(a[4], b[7], c3, c1, c2);
477	50.7M	mul_add_c(a[5], b[6], c3, c1, c2);
478	50.7M	mul_add_c(a[6], b[5], c3, c1, c2);
479	50.7M	mul_add_c(a[7], b[4], c3, c1, c2);
480	50.7M	r[11] = c3;
481	50.7M	c3 = 0;
482	50.7M	mul_add_c(a[7], b[5], c1, c2, c3);
483	50.7M	mul_add_c(a[6], b[6], c1, c2, c3);
484	50.7M	mul_add_c(a[5], b[7], c1, c2, c3);
485	50.7M	r[12] = c1;
486	50.7M	c1 = 0;
487	50.7M	mul_add_c(a[6], b[7], c2, c3, c1);
488	50.7M	mul_add_c(a[7], b[6], c2, c3, c1);
489	50.7M	r[13] = c2;
490	50.7M	c2 = 0;
491	50.7M	mul_add_c(a[7], b[7], c3, c1, c2);
492	50.7M	r[14] = c3;
493	50.7M	r[15] = c1;
494	50.7M	}
495
496		void bn_mul_comba4(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
497	0	{
498	0	BN_ULONG c1, c2, c3;
499
500	0	c1 = 0;
501	0	c2 = 0;
502	0	c3 = 0;
503	0	mul_add_c(a[0], b[0], c1, c2, c3);
504	0	r[0] = c1;
505	0	c1 = 0;
506	0	mul_add_c(a[0], b[1], c2, c3, c1);
507	0	mul_add_c(a[1], b[0], c2, c3, c1);
508	0	r[1] = c2;
509	0	c2 = 0;
510	0	mul_add_c(a[2], b[0], c3, c1, c2);
511	0	mul_add_c(a[1], b[1], c3, c1, c2);
512	0	mul_add_c(a[0], b[2], c3, c1, c2);
513	0	r[2] = c3;
514	0	c3 = 0;
515	0	mul_add_c(a[0], b[3], c1, c2, c3);
516	0	mul_add_c(a[1], b[2], c1, c2, c3);
517	0	mul_add_c(a[2], b[1], c1, c2, c3);
518	0	mul_add_c(a[3], b[0], c1, c2, c3);
519	0	r[3] = c1;
520	0	c1 = 0;
521	0	mul_add_c(a[3], b[1], c2, c3, c1);
522	0	mul_add_c(a[2], b[2], c2, c3, c1);
523	0	mul_add_c(a[1], b[3], c2, c3, c1);
524	0	r[4] = c2;
525	0	c2 = 0;
526	0	mul_add_c(a[2], b[3], c3, c1, c2);
527	0	mul_add_c(a[3], b[2], c3, c1, c2);
528	0	r[5] = c3;
529	0	c3 = 0;
530	0	mul_add_c(a[3], b[3], c1, c2, c3);
531	0	r[6] = c1;
532	0	r[7] = c2;
533	0	}
534
535		void bn_sqr_comba8(BN_ULONG r, const BN_ULONG a)
536	7.14M	{
537	7.14M	BN_ULONG c1, c2, c3;
538
539	7.14M	c1 = 0;
540	7.14M	c2 = 0;
541	7.14M	c3 = 0;
542	7.14M	sqr_add_c(a, 0, c1, c2, c3);
543	7.14M	r[0] = c1;
544	7.14M	c1 = 0;
545	7.14M	sqr_add_c2(a, 1, 0, c2, c3, c1);
546	7.14M	r[1] = c2;
547	7.14M	c2 = 0;
548	7.14M	sqr_add_c(a, 1, c3, c1, c2);
549	7.14M	sqr_add_c2(a, 2, 0, c3, c1, c2);
550	7.14M	r[2] = c3;
551	7.14M	c3 = 0;
552	7.14M	sqr_add_c2(a, 3, 0, c1, c2, c3);
553	7.14M	sqr_add_c2(a, 2, 1, c1, c2, c3);
554	7.14M	r[3] = c1;
555	7.14M	c1 = 0;
556	7.14M	sqr_add_c(a, 2, c2, c3, c1);
557	7.14M	sqr_add_c2(a, 3, 1, c2, c3, c1);
558	7.14M	sqr_add_c2(a, 4, 0, c2, c3, c1);
559	7.14M	r[4] = c2;
560	7.14M	c2 = 0;
561	7.14M	sqr_add_c2(a, 5, 0, c3, c1, c2);
562	7.14M	sqr_add_c2(a, 4, 1, c3, c1, c2);
563	7.14M	sqr_add_c2(a, 3, 2, c3, c1, c2);
564	7.14M	r[5] = c3;
565	7.14M	c3 = 0;
566	7.14M	sqr_add_c(a, 3, c1, c2, c3);
567	7.14M	sqr_add_c2(a, 4, 2, c1, c2, c3);
568	7.14M	sqr_add_c2(a, 5, 1, c1, c2, c3);
569	7.14M	sqr_add_c2(a, 6, 0, c1, c2, c3);
570	7.14M	r[6] = c1;
571	7.14M	c1 = 0;
572	7.14M	sqr_add_c2(a, 7, 0, c2, c3, c1);
573	7.14M	sqr_add_c2(a, 6, 1, c2, c3, c1);
574	7.14M	sqr_add_c2(a, 5, 2, c2, c3, c1);
575	7.14M	sqr_add_c2(a, 4, 3, c2, c3, c1);
576	7.14M	r[7] = c2;
577	7.14M	c2 = 0;
578	7.14M	sqr_add_c(a, 4, c3, c1, c2);
579	7.14M	sqr_add_c2(a, 5, 3, c3, c1, c2);
580	7.14M	sqr_add_c2(a, 6, 2, c3, c1, c2);
581	7.14M	sqr_add_c2(a, 7, 1, c3, c1, c2);
582	7.14M	r[8] = c3;
583	7.14M	c3 = 0;
584	7.14M	sqr_add_c2(a, 7, 2, c1, c2, c3);
585	7.14M	sqr_add_c2(a, 6, 3, c1, c2, c3);
586	7.14M	sqr_add_c2(a, 5, 4, c1, c2, c3);
587	7.14M	r[9] = c1;
588	7.14M	c1 = 0;
589	7.14M	sqr_add_c(a, 5, c2, c3, c1);
590	7.14M	sqr_add_c2(a, 6, 4, c2, c3, c1);
591	7.14M	sqr_add_c2(a, 7, 3, c2, c3, c1);
592	7.14M	r[10] = c2;
593	7.14M	c2 = 0;
594	7.14M	sqr_add_c2(a, 7, 4, c3, c1, c2);
595	7.14M	sqr_add_c2(a, 6, 5, c3, c1, c2);
596	7.14M	r[11] = c3;
597	7.14M	c3 = 0;
598	7.14M	sqr_add_c(a, 6, c1, c2, c3);
599	7.14M	sqr_add_c2(a, 7, 5, c1, c2, c3);
600	7.14M	r[12] = c1;
601	7.14M	c1 = 0;
602	7.14M	sqr_add_c2(a, 7, 6, c2, c3, c1);
603	7.14M	r[13] = c2;
604	7.14M	c2 = 0;
605	7.14M	sqr_add_c(a, 7, c3, c1, c2);
606	7.14M	r[14] = c3;
607	7.14M	r[15] = c1;
608	7.14M	}
609
610		void bn_sqr_comba4(BN_ULONG r, const BN_ULONG a)
611	62.8M	{
612	62.8M	BN_ULONG c1, c2, c3;
613
614	62.8M	c1 = 0;
615	62.8M	c2 = 0;
616	62.8M	c3 = 0;
617	62.8M	sqr_add_c(a, 0, c1, c2, c3);
618	62.8M	r[0] = c1;
619	62.8M	c1 = 0;
620	62.8M	sqr_add_c2(a, 1, 0, c2, c3, c1);
621	62.8M	r[1] = c2;
622	62.8M	c2 = 0;
623	62.8M	sqr_add_c(a, 1, c3, c1, c2);
624	62.8M	sqr_add_c2(a, 2, 0, c3, c1, c2);
625	62.8M	r[2] = c3;
626	62.8M	c3 = 0;
627	62.8M	sqr_add_c2(a, 3, 0, c1, c2, c3);
628	62.8M	sqr_add_c2(a, 2, 1, c1, c2, c3);
629	62.8M	r[3] = c1;
630	62.8M	c1 = 0;
631	62.8M	sqr_add_c(a, 2, c2, c3, c1);
632	62.8M	sqr_add_c2(a, 3, 1, c2, c3, c1);
633	62.8M	r[4] = c2;
634	62.8M	c2 = 0;
635	62.8M	sqr_add_c2(a, 3, 2, c3, c1, c2);
636	62.8M	r[5] = c3;
637	62.8M	c3 = 0;
638	62.8M	sqr_add_c(a, 3, c1, c2, c3);
639	62.8M	r[6] = c1;
640	62.8M	r[7] = c2;
641	62.8M	}
642		#endif

Coverage Report

Created: 2025-06-13 06:58