/src/openssl111/crypto/bn/asm/x86_64-gcc.c

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright 2002-2018 The OpenSSL Project Authors. All Rights Reserved.
3		*
4		* Licensed under the OpenSSL license (the "License"). You may not use
5		* this file except in compliance with the License. You can obtain a copy
6		* in the file LICENSE in the source distribution or at
7		* https://www.openssl.org/source/license.html
8		*/
9
10		#include "../bn_local.h"
11		#if !(defined(__GNUC__) && __GNUC__>=2)
12		# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
13		#else
14		/*-
15		* x86_64 BIGNUM accelerator version 0.1, December 2002.
16		*
17		* Implemented by Andy Polyakov <appro@openssl.org> for the OpenSSL
18		* project.
19		*
20		* Rights for redistribution and usage in source and binary forms are
21		* granted according to the OpenSSL license. Warranty of any kind is
22		* disclaimed.
23		*
24		* Q. Version 0.1? It doesn't sound like Andy, he used to assign real
25		* versions, like 1.0...
26		* A. Well, that's because this code is basically a quick-n-dirty
27		* proof-of-concept hack. As you can see it's implemented with
28		* inline assembler, which means that you're bound to GCC and that
29		* there might be enough room for further improvement.
30		*
31		* Q. Why inline assembler?
32		* A. x86_64 features own ABI which I'm not familiar with. This is
33		* why I decided to let the compiler take care of subroutine
34		* prologue/epilogue as well as register allocation. For reference.
35		* Win64 implements different ABI for AMD64, different from Linux.
36		*
37		* Q. How much faster does it get?
38		* A. 'apps/openssl speed rsa dsa' output with no-asm:
39		*
40		* sign verify sign/s verify/s
41		* rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
42		* rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
43		* rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
44		* rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
45		* sign verify sign/s verify/s
46		* dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
47		* dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
48		* dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
49		*
50		* 'apps/openssl speed rsa dsa' output with this module:
51		*
52		* sign verify sign/s verify/s
53		* rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
54		* rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
55		* rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
56		* rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
57		* sign verify sign/s verify/s
58		* dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
59		* dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
60		* dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
61		*
62		* For the reference. IA-32 assembler implementation performs
63		* very much like 64-bit code compiled with no-asm on the same
64		* machine.
65		*/
66
67		# undef mul
68		# undef mul_add
69
70		/*-
71		* "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
72		* "g"(0) lets the compiler decide where it wants to
73		* keep the value of zero;
74		*/
/*
 * mul_add(r, a, word, carry): r += a * word + carry.
 *
 * The 128-bit product of a and word is formed with mulq.  Its low half is
 * added to carry (the carry flag folding into the high half), the updated
 * carry is then added to r (again folding the carry flag into the high
 * half), and finally carry is replaced by the accumulated high half.
 * r is accessed as a read/write memory operand and a as a memory operand,
 * so both must be addressable lvalues; carry must be a BN_ULONG lvalue.
 */
# define mul_add(r,a,word,carry) do {                   \
    register BN_ULONG prod_hi, prod_lo;                 \
    asm ("mulq %3"                                      \
         : "=a"(prod_lo), "=d"(prod_hi)                 \
         : "a"(word), "m"(a)                            \
         : "cc");                                       \
    asm ("addq %2,%0; adcq %3,%1"                       \
         : "+r"(carry), "+d"(prod_hi)                   \
         : "a"(prod_lo), "g"(0)                         \
         : "cc");                                       \
    asm ("addq %2,%0; adcq %3,%1"                       \
         : "+m"(r), "+d"(prod_hi)                       \
         : "r"(carry), "g"(0)                           \
         : "cc");                                       \
    carry = prod_hi;                                    \
} while (0)
91
/*
 * mul(r, a, word, carry): r = low word of (a * word + carry), and carry
 * becomes the high word of that sum.
 *
 * The multiplicand uses the "rm" constraint rather than "g": "g" also
 * admits an immediate operand, which mulq cannot encode, so a
 * constant-folded argument would otherwise produce an assembler error.
 * r and carry must be BN_ULONG lvalues.
 */
# define mul(r,a,word,carry) do {                       \
    register BN_ULONG high, low;                        \
    asm ("mulq %3"                                      \
         : "=a"(low), "=d"(high)                        \
         : "a"(word), "rm"(a)                           \
         : "cc");                                       \
    /* fold the incoming carry into the product */      \
    asm ("addq %2,%0; adcq %3,%1"                       \
         : "+r"(carry), "+d"(high)                      \
         : "a"(low), "g"(0)                             \
         : "cc");                                       \
    (r) = carry, carry = high;                          \
} while (0)
104		# undef sqr
/*
 * sqr(r0, r1, a): (r1:r0) = a * a, with r0 receiving the low word and r1
 * the high word of the product.
 *
 * Wrapped in do { } while (0) with no trailing semicolon so that the macro
 * behaves as a single statement (e.g. inside an unbraced if/else); call
 * sites supply the terminating ';'.
 */
# define sqr(r0,r1,a) do {                              \
    asm ("mulq %2"                                      \
         : "=a"(r0), "=d"(r1)                           \
         : "a"(a)                                       \
         : "cc");                                       \
} while (0)
110
111		BN_ULONG bn_mul_add_words(BN_ULONG rp, const BN_ULONG ap, int num,
112		BN_ULONG w)
113	24.0M	{
114	24.0M	BN_ULONG c1 = 0;
115
116	24.0M	if (num <= 0)
117	0	return c1;
118
119	775M	while (num & ~3) {
120	751M	mul_add(rp[0], ap[0], w, c1);
121	751M	mul_add(rp[1], ap[1], w, c1);
122	751M	mul_add(rp[2], ap[2], w, c1);
123	751M	mul_add(rp[3], ap[3], w, c1);
124	751M	ap += 4;
125	751M	rp += 4;
126	751M	num -= 4;
127	751M	}
128	24.0M	if (num) {
129	8.36M	mul_add(rp[0], ap[0], w, c1);
130	8.36M	if (--num == 0)
131	4.39M	return c1;
132	3.97M	mul_add(rp[1], ap[1], w, c1);
133	3.97M	if (--num == 0)
134	1.95M	return c1;
135	2.02M	mul_add(rp[2], ap[2], w, c1);
136	2.02M	return c1;
137	3.97M	}
138
139	15.6M	return c1;
140	24.0M	}
141
142		BN_ULONG bn_mul_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
143	225M	{
144	225M	BN_ULONG c1 = 0;
145
146	225M	if (num <= 0)
147	0	return c1;
148
149	2.66G	while (num & ~3) {
150	2.43G	mul(rp[0], ap[0], w, c1);
151	2.43G	mul(rp[1], ap[1], w, c1);
152	2.43G	mul(rp[2], ap[2], w, c1);
153	2.43G	mul(rp[3], ap[3], w, c1);
154	2.43G	ap += 4;
155	2.43G	rp += 4;
156	2.43G	num -= 4;
157	2.43G	}
158	225M	if (num) {
159	23.0M	mul(rp[0], ap[0], w, c1);
160	23.0M	if (--num == 0)
161	11.4M	return c1;
162	11.5M	mul(rp[1], ap[1], w, c1);
163	11.5M	if (--num == 0)
164	6.11M	return c1;
165	5.48M	mul(rp[2], ap[2], w, c1);
166	5.48M	}
167	207M	return c1;
168	225M	}
169
170		void bn_sqr_words(BN_ULONG r, const BN_ULONG a, int n)
171	1.17M	{
172	1.17M	if (n <= 0)
173	0	return;
174
175	2.11M	while (n & ~3) {
176	932k	sqr(r[0], r[1], a[0]);
177	932k	sqr(r[2], r[3], a[1]);
178	932k	sqr(r[4], r[5], a[2]);
179	932k	sqr(r[6], r[7], a[3]);
180	932k	a += 4;
181	932k	r += 8;
182	932k	n -= 4;
183	932k	}
184	1.17M	if (n) {
185	1.16M	sqr(r[0], r[1], a[0]);
186	1.16M	if (--n == 0)
187	1.01M	return;
188	146k	sqr(r[2], r[3], a[1]);
189	146k	if (--n == 0)
190	66.8k	return;
191	79.9k	sqr(r[4], r[5], a[2]);
192	79.9k	}
193	1.17M	}
194
195		BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
196	213M	{
197	213M	BN_ULONG ret, waste;
198
199	213M	asm("divq %4":"=a"(ret), "=d"(waste)
200	213M	: "a"(l), "d"(h), "r"(d)
201	213M	: "cc");
202
203	213M	return ret;
204	213M	}
205
206		BN_ULONG bn_add_words(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
207		int n)
208	281M	{
209	281M	BN_ULONG ret;
210	281M	size_t i = 0;
211
212	281M	if (n <= 0)
213	480k	return 0;
214
215	281M	asm volatile (" subq %0,%0 \n" /* clear carry */
216	281M	" jmp 1f \n"
217	281M	".p2align 4 \n"
218	281M	"1: movq (%4,%2,8),%0 \n"
219	281M	" adcq (%5,%2,8),%0 \n"
220	281M	" movq %0,(%3,%2,8) \n"
221	281M	" lea 1(%2),%2 \n"
222	281M	" dec %1 \n"
223	281M	" jnz 1b \n"
224	281M	" sbbq %0,%0 \n"
225	281M	:"=&r" (ret), "+c"(n), "+r"(i)
226	281M	:"r"(rp), "r"(ap), "r"(bp)
227	281M	:"cc", "memory");
228
229	281M	return ret & 1;
230	281M	}
231
232		# ifndef SIMICS
233		BN_ULONG bn_sub_words(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
234		int n)
235	283M	{
236	283M	BN_ULONG ret;
237	283M	size_t i = 0;
238
239	283M	if (n <= 0)
240	436k	return 0;
241
242	282M	asm volatile (" subq %0,%0 \n" /* clear borrow */
243	282M	" jmp 1f \n"
244	282M	".p2align 4 \n"
245	282M	"1: movq (%4,%2,8),%0 \n"
246	282M	" sbbq (%5,%2,8),%0 \n"
247	282M	" movq %0,(%3,%2,8) \n"
248	282M	" lea 1(%2),%2 \n"
249	282M	" dec %1 \n"
250	282M	" jnz 1b \n"
251	282M	" sbbq %0,%0 \n"
252	282M	:"=&r" (ret), "+c"(n), "+r"(i)
253	282M	:"r"(rp), "r"(ap), "r"(bp)
254	282M	:"cc", "memory");
255
256	282M	return ret & 1;
257	283M	}
258		# else
259		/* Simics 1.4<7 has buggy sbbq:-( */
260		# define BN_MASK2 0xffffffffffffffffL
261		BN_ULONG bn_sub_words(BN_ULONG r, BN_ULONG a, BN_ULONG *b, int n)
262		{
263		BN_ULONG t1, t2;
264		int c = 0;
265
266		if (n <= 0)
267		return (BN_ULONG)0;
268
269		for (;;) {
270		t1 = a[0];
271		t2 = b[0];
272		r[0] = (t1 - t2 - c) & BN_MASK2;
273		if (t1 != t2)
274		c = (t1 < t2);
275		if (--n <= 0)
276		break;
277
278		t1 = a[1];
279		t2 = b[1];
280		r[1] = (t1 - t2 - c) & BN_MASK2;
281		if (t1 != t2)
282		c = (t1 < t2);
283		if (--n <= 0)
284		break;
285
286		t1 = a[2];
287		t2 = b[2];
288		r[2] = (t1 - t2 - c) & BN_MASK2;
289		if (t1 != t2)
290		c = (t1 < t2);
291		if (--n <= 0)
292		break;
293
294		t1 = a[3];
295		t2 = b[3];
296		r[3] = (t1 - t2 - c) & BN_MASK2;
297		if (t1 != t2)
298		c = (t1 < t2);
299		if (--n <= 0)
300		break;
301
302		a += 4;
303		b += 4;
304		r += 4;
305		}
306		return c;
307		}
308		# endif
309
310		/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
311		/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
312		/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
313		/*
314		* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
315		* c=(c2,c1,c0)
316		*/
317
318		/*
319		* Keep in mind that carrying into high part of multiplication result
320		* can not overflow, because it cannot be all-ones.
321		*/
322		# if 0
323		/* original macros are kept for reference purposes */
/*
 * Reference C form of mul_add_c: adds the double-word product a*b into the
 * three-word accumulator (c2,c1,c0), detecting each carry by checking
 * whether the sum wrapped below the addend.
 */
# define mul_add_c(a,b,c0,c1,c2) do {                   \
    BN_ULONG lhs_ = (a);                                \
    BN_ULONG rhs_ = (b);                                \
    BN_ULONG lo_, hi_;                                  \
    BN_UMULT_LOHI(lo_, hi_, lhs_, rhs_);                \
    c0 += lo_;                                          \
    hi_ += (c0 < lo_);                                  \
    c1 += hi_;                                          \
    c2 += (c1 < hi_);                                   \
} while (0)
331
/*
 * Reference C form of mul_add_c2: adds the double-word product a*b into
 * the three-word accumulator (c2,c1,c0) twice, i.e. c += 2*a*b, without
 * ever doubling the product itself.
 */
# define mul_add_c2(a,b,c0,c1,c2) do {                  \
    BN_ULONG lhs_ = (a);                                \
    BN_ULONG rhs_ = (b);                                \
    BN_ULONG lo_, hi_, first_;                          \
    BN_UMULT_LOHI(lo_, hi_, lhs_, rhs_);                \
    /* first addition of the product */                 \
    c0 += lo_;                                          \
    first_ = hi_ + (c0 < lo_);                          \
    c1 += first_;                                       \
    c2 += (c1 < first_);                                \
    /* second addition of the same product */           \
    c0 += lo_;                                          \
    hi_ += (c0 < lo_);                                  \
    c1 += hi_;                                          \
    c2 += (c1 < hi_);                                   \
} while (0)
341
/*
 * Reference C form of sqr_add_c: adds the double-word square of a[i] into
 * the three-word accumulator (c2,c1,c0).
 */
# define sqr_add_c(a,i,c0,c1,c2) do {                   \
    BN_ULONG word_ = (a)[i];                            \
    BN_ULONG lo_, hi_;                                  \
    BN_UMULT_LOHI(lo_, hi_, word_, word_);              \
    c0 += lo_;                                          \
    hi_ += (c0 < lo_);                                  \
    c1 += hi_;                                          \
    c2 += (c1 < hi_);                                   \
} while (0)
349		# else
/*
 * mul_add_c(a, b, c0, c1, c2): adds the double-word product a*b into the
 * three-word accumulator (c2,c1,c0).
 *
 * mulq forms the product; a single addq/adcq/adcq chain then adds the low
 * word to c0, the high word to c1 and the remaining carry to c2.  b is
 * used as a memory operand and must therefore be an addressable lvalue.
 */
# define mul_add_c(a,b,c0,c1,c2) do {                   \
    BN_ULONG prod_lo, prod_hi;                          \
    asm ("mulq %3"                                      \
         : "=a"(prod_lo), "=d"(prod_hi)                 \
         : "a"(a), "m"(b)                               \
         : "cc");                                       \
    asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"           \
         : "+r"(c0), "+r"(c1), "+r"(c2)                 \
         : "r"(prod_lo), "r"(prod_hi), "g"(0)           \
         : "cc");                                       \
} while (0)
361
/*
 * sqr_add_c(a, i, c0, c1, c2): adds the double-word square of a[i] into
 * the three-word accumulator (c2,c1,c0).
 *
 * a[i] is loaded into the "a" register and mulq multiplies that register
 * by itself; the addq/adcq/adcq chain then accumulates the result.
 */
# define sqr_add_c(a,i,c0,c1,c2) do {                   \
    BN_ULONG prod_lo, prod_hi;                          \
    asm ("mulq %2"                                      \
         : "=a"(prod_lo), "=d"(prod_hi)                 \
         : "a"(a[i])                                    \
         : "cc");                                       \
    asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"           \
         : "+r"(c0), "+r"(c1), "+r"(c2)                 \
         : "r"(prod_lo), "r"(prod_hi), "g"(0)           \
         : "cc");                                       \
} while (0)
373
/*
 * mul_add_c2(a, b, c0, c1, c2): adds 2*a*b into the three-word accumulator
 * (c2,c1,c0).
 *
 * The product is computed once with mulq and then accumulated twice with
 * two identical addq/adcq/adcq chains, so the doubled product never has to
 * be represented on its own.  b is used as a memory operand and must be an
 * addressable lvalue.
 */
# define mul_add_c2(a,b,c0,c1,c2) do {                  \
    BN_ULONG prod_lo, prod_hi;                          \
    asm ("mulq %3"                                      \
         : "=a"(prod_lo), "=d"(prod_hi)                 \
         : "a"(a), "m"(b)                               \
         : "cc");                                       \
    /* first accumulation */                            \
    asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"           \
         : "+r"(c0), "+r"(c1), "+r"(c2)                 \
         : "r"(prod_lo), "r"(prod_hi), "g"(0)           \
         : "cc");                                       \
    /* second accumulation of the same product */       \
    asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"           \
         : "+r"(c0), "+r"(c1), "+r"(c2)                 \
         : "r"(prod_lo), "r"(prod_hi), "g"(0)           \
         : "cc");                                       \
} while (0)
389		# endif
390
/*
 * sqr_add_c2(a, i, j, c0, c1, c2): adds 2*a[i]*a[j] into the three-word
 * accumulator (c2,c1,c0) by forwarding both words to mul_add_c2.
 */
# define sqr_add_c2(a,i,j,c0,c1,c2)                     \
    mul_add_c2((a)[i], (a)[j], c0, c1, c2)
393
394		void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
395	11.0M	{
396	11.0M	BN_ULONG c1, c2, c3;
397
398	11.0M	c1 = 0;
399	11.0M	c2 = 0;
400	11.0M	c3 = 0;
401	11.0M	mul_add_c(a[0], b[0], c1, c2, c3);
402	11.0M	r[0] = c1;
403	11.0M	c1 = 0;
404	11.0M	mul_add_c(a[0], b[1], c2, c3, c1);
405	11.0M	mul_add_c(a[1], b[0], c2, c3, c1);
406	11.0M	r[1] = c2;
407	11.0M	c2 = 0;
408	11.0M	mul_add_c(a[2], b[0], c3, c1, c2);
409	11.0M	mul_add_c(a[1], b[1], c3, c1, c2);
410	11.0M	mul_add_c(a[0], b[2], c3, c1, c2);
411	11.0M	r[2] = c3;
412	11.0M	c3 = 0;
413	11.0M	mul_add_c(a[0], b[3], c1, c2, c3);
414	11.0M	mul_add_c(a[1], b[2], c1, c2, c3);
415	11.0M	mul_add_c(a[2], b[1], c1, c2, c3);
416	11.0M	mul_add_c(a[3], b[0], c1, c2, c3);
417	11.0M	r[3] = c1;
418	11.0M	c1 = 0;
419	11.0M	mul_add_c(a[4], b[0], c2, c3, c1);
420	11.0M	mul_add_c(a[3], b[1], c2, c3, c1);
421	11.0M	mul_add_c(a[2], b[2], c2, c3, c1);
422	11.0M	mul_add_c(a[1], b[3], c2, c3, c1);
423	11.0M	mul_add_c(a[0], b[4], c2, c3, c1);
424	11.0M	r[4] = c2;
425	11.0M	c2 = 0;
426	11.0M	mul_add_c(a[0], b[5], c3, c1, c2);
427	11.0M	mul_add_c(a[1], b[4], c3, c1, c2);
428	11.0M	mul_add_c(a[2], b[3], c3, c1, c2);
429	11.0M	mul_add_c(a[3], b[2], c3, c1, c2);
430	11.0M	mul_add_c(a[4], b[1], c3, c1, c2);
431	11.0M	mul_add_c(a[5], b[0], c3, c1, c2);
432	11.0M	r[5] = c3;
433	11.0M	c3 = 0;
434	11.0M	mul_add_c(a[6], b[0], c1, c2, c3);
435	11.0M	mul_add_c(a[5], b[1], c1, c2, c3);
436	11.0M	mul_add_c(a[4], b[2], c1, c2, c3);
437	11.0M	mul_add_c(a[3], b[3], c1, c2, c3);
438	11.0M	mul_add_c(a[2], b[4], c1, c2, c3);
439	11.0M	mul_add_c(a[1], b[5], c1, c2, c3);
440	11.0M	mul_add_c(a[0], b[6], c1, c2, c3);
441	11.0M	r[6] = c1;
442	11.0M	c1 = 0;
443	11.0M	mul_add_c(a[0], b[7], c2, c3, c1);
444	11.0M	mul_add_c(a[1], b[6], c2, c3, c1);
445	11.0M	mul_add_c(a[2], b[5], c2, c3, c1);
446	11.0M	mul_add_c(a[3], b[4], c2, c3, c1);
447	11.0M	mul_add_c(a[4], b[3], c2, c3, c1);
448	11.0M	mul_add_c(a[5], b[2], c2, c3, c1);
449	11.0M	mul_add_c(a[6], b[1], c2, c3, c1);
450	11.0M	mul_add_c(a[7], b[0], c2, c3, c1);
451	11.0M	r[7] = c2;
452	11.0M	c2 = 0;
453	11.0M	mul_add_c(a[7], b[1], c3, c1, c2);
454	11.0M	mul_add_c(a[6], b[2], c3, c1, c2);
455	11.0M	mul_add_c(a[5], b[3], c3, c1, c2);
456	11.0M	mul_add_c(a[4], b[4], c3, c1, c2);
457	11.0M	mul_add_c(a[3], b[5], c3, c1, c2);
458	11.0M	mul_add_c(a[2], b[6], c3, c1, c2);
459	11.0M	mul_add_c(a[1], b[7], c3, c1, c2);
460	11.0M	r[8] = c3;
461	11.0M	c3 = 0;
462	11.0M	mul_add_c(a[2], b[7], c1, c2, c3);
463	11.0M	mul_add_c(a[3], b[6], c1, c2, c3);
464	11.0M	mul_add_c(a[4], b[5], c1, c2, c3);
465	11.0M	mul_add_c(a[5], b[4], c1, c2, c3);
466	11.0M	mul_add_c(a[6], b[3], c1, c2, c3);
467	11.0M	mul_add_c(a[7], b[2], c1, c2, c3);
468	11.0M	r[9] = c1;
469	11.0M	c1 = 0;
470	11.0M	mul_add_c(a[7], b[3], c2, c3, c1);
471	11.0M	mul_add_c(a[6], b[4], c2, c3, c1);
472	11.0M	mul_add_c(a[5], b[5], c2, c3, c1);
473	11.0M	mul_add_c(a[4], b[6], c2, c3, c1);
474	11.0M	mul_add_c(a[3], b[7], c2, c3, c1);
475	11.0M	r[10] = c2;
476	11.0M	c2 = 0;
477	11.0M	mul_add_c(a[4], b[7], c3, c1, c2);
478	11.0M	mul_add_c(a[5], b[6], c3, c1, c2);
479	11.0M	mul_add_c(a[6], b[5], c3, c1, c2);
480	11.0M	mul_add_c(a[7], b[4], c3, c1, c2);
481	11.0M	r[11] = c3;
482	11.0M	c3 = 0;
483	11.0M	mul_add_c(a[7], b[5], c1, c2, c3);
484	11.0M	mul_add_c(a[6], b[6], c1, c2, c3);
485	11.0M	mul_add_c(a[5], b[7], c1, c2, c3);
486	11.0M	r[12] = c1;
487	11.0M	c1 = 0;
488	11.0M	mul_add_c(a[6], b[7], c2, c3, c1);
489	11.0M	mul_add_c(a[7], b[6], c2, c3, c1);
490	11.0M	r[13] = c2;
491	11.0M	c2 = 0;
492	11.0M	mul_add_c(a[7], b[7], c3, c1, c2);
493	11.0M	r[14] = c3;
494	11.0M	r[15] = c1;
495	11.0M	}
496
497		void bn_mul_comba4(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
498	0	{
499	0	BN_ULONG c1, c2, c3;
500
501	0	c1 = 0;
502	0	c2 = 0;
503	0	c3 = 0;
504	0	mul_add_c(a[0], b[0], c1, c2, c3);
505	0	r[0] = c1;
506	0	c1 = 0;
507	0	mul_add_c(a[0], b[1], c2, c3, c1);
508	0	mul_add_c(a[1], b[0], c2, c3, c1);
509	0	r[1] = c2;
510	0	c2 = 0;
511	0	mul_add_c(a[2], b[0], c3, c1, c2);
512	0	mul_add_c(a[1], b[1], c3, c1, c2);
513	0	mul_add_c(a[0], b[2], c3, c1, c2);
514	0	r[2] = c3;
515	0	c3 = 0;
516	0	mul_add_c(a[0], b[3], c1, c2, c3);
517	0	mul_add_c(a[1], b[2], c1, c2, c3);
518	0	mul_add_c(a[2], b[1], c1, c2, c3);
519	0	mul_add_c(a[3], b[0], c1, c2, c3);
520	0	r[3] = c1;
521	0	c1 = 0;
522	0	mul_add_c(a[3], b[1], c2, c3, c1);
523	0	mul_add_c(a[2], b[2], c2, c3, c1);
524	0	mul_add_c(a[1], b[3], c2, c3, c1);
525	0	r[4] = c2;
526	0	c2 = 0;
527	0	mul_add_c(a[2], b[3], c3, c1, c2);
528	0	mul_add_c(a[3], b[2], c3, c1, c2);
529	0	r[5] = c3;
530	0	c3 = 0;
531	0	mul_add_c(a[3], b[3], c1, c2, c3);
532	0	r[6] = c1;
533	0	r[7] = c2;
534	0	}
535
536		void bn_sqr_comba8(BN_ULONG r, const BN_ULONG a)
537	452k	{
538	452k	BN_ULONG c1, c2, c3;
539
540	452k	c1 = 0;
541	452k	c2 = 0;
542	452k	c3 = 0;
543	452k	sqr_add_c(a, 0, c1, c2, c3);
544	452k	r[0] = c1;
545	452k	c1 = 0;
546	452k	sqr_add_c2(a, 1, 0, c2, c3, c1);
547	452k	r[1] = c2;
548	452k	c2 = 0;
549	452k	sqr_add_c(a, 1, c3, c1, c2);
550	452k	sqr_add_c2(a, 2, 0, c3, c1, c2);
551	452k	r[2] = c3;
552	452k	c3 = 0;
553	452k	sqr_add_c2(a, 3, 0, c1, c2, c3);
554	452k	sqr_add_c2(a, 2, 1, c1, c2, c3);
555	452k	r[3] = c1;
556	452k	c1 = 0;
557	452k	sqr_add_c(a, 2, c2, c3, c1);
558	452k	sqr_add_c2(a, 3, 1, c2, c3, c1);
559	452k	sqr_add_c2(a, 4, 0, c2, c3, c1);
560	452k	r[4] = c2;
561	452k	c2 = 0;
562	452k	sqr_add_c2(a, 5, 0, c3, c1, c2);
563	452k	sqr_add_c2(a, 4, 1, c3, c1, c2);
564	452k	sqr_add_c2(a, 3, 2, c3, c1, c2);
565	452k	r[5] = c3;
566	452k	c3 = 0;
567	452k	sqr_add_c(a, 3, c1, c2, c3);
568	452k	sqr_add_c2(a, 4, 2, c1, c2, c3);
569	452k	sqr_add_c2(a, 5, 1, c1, c2, c3);
570	452k	sqr_add_c2(a, 6, 0, c1, c2, c3);
571	452k	r[6] = c1;
572	452k	c1 = 0;
573	452k	sqr_add_c2(a, 7, 0, c2, c3, c1);
574	452k	sqr_add_c2(a, 6, 1, c2, c3, c1);
575	452k	sqr_add_c2(a, 5, 2, c2, c3, c1);
576	452k	sqr_add_c2(a, 4, 3, c2, c3, c1);
577	452k	r[7] = c2;
578	452k	c2 = 0;
579	452k	sqr_add_c(a, 4, c3, c1, c2);
580	452k	sqr_add_c2(a, 5, 3, c3, c1, c2);
581	452k	sqr_add_c2(a, 6, 2, c3, c1, c2);
582	452k	sqr_add_c2(a, 7, 1, c3, c1, c2);
583	452k	r[8] = c3;
584	452k	c3 = 0;
585	452k	sqr_add_c2(a, 7, 2, c1, c2, c3);
586	452k	sqr_add_c2(a, 6, 3, c1, c2, c3);
587	452k	sqr_add_c2(a, 5, 4, c1, c2, c3);
588	452k	r[9] = c1;
589	452k	c1 = 0;
590	452k	sqr_add_c(a, 5, c2, c3, c1);
591	452k	sqr_add_c2(a, 6, 4, c2, c3, c1);
592	452k	sqr_add_c2(a, 7, 3, c2, c3, c1);
593	452k	r[10] = c2;
594	452k	c2 = 0;
595	452k	sqr_add_c2(a, 7, 4, c3, c1, c2);
596	452k	sqr_add_c2(a, 6, 5, c3, c1, c2);
597	452k	r[11] = c3;
598	452k	c3 = 0;
599	452k	sqr_add_c(a, 6, c1, c2, c3);
600	452k	sqr_add_c2(a, 7, 5, c1, c2, c3);
601	452k	r[12] = c1;
602	452k	c1 = 0;
603	452k	sqr_add_c2(a, 7, 6, c2, c3, c1);
604	452k	r[13] = c2;
605	452k	c2 = 0;
606	452k	sqr_add_c(a, 7, c3, c1, c2);
607	452k	r[14] = c3;
608	452k	r[15] = c1;
609	452k	}
610
611		void bn_sqr_comba4(BN_ULONG r, const BN_ULONG a)
612	44.6M	{
613	44.6M	BN_ULONG c1, c2, c3;
614
615	44.6M	c1 = 0;
616	44.6M	c2 = 0;
617	44.6M	c3 = 0;
618	44.6M	sqr_add_c(a, 0, c1, c2, c3);
619	44.6M	r[0] = c1;
620	44.6M	c1 = 0;
621	44.6M	sqr_add_c2(a, 1, 0, c2, c3, c1);
622	44.6M	r[1] = c2;
623	44.6M	c2 = 0;
624	44.6M	sqr_add_c(a, 1, c3, c1, c2);
625	44.6M	sqr_add_c2(a, 2, 0, c3, c1, c2);
626	44.6M	r[2] = c3;
627	44.6M	c3 = 0;
628	44.6M	sqr_add_c2(a, 3, 0, c1, c2, c3);
629	44.6M	sqr_add_c2(a, 2, 1, c1, c2, c3);
630	44.6M	r[3] = c1;
631	44.6M	c1 = 0;
632	44.6M	sqr_add_c(a, 2, c2, c3, c1);
633	44.6M	sqr_add_c2(a, 3, 1, c2, c3, c1);
634	44.6M	r[4] = c2;
635	44.6M	c2 = 0;
636	44.6M	sqr_add_c2(a, 3, 2, c3, c1, c2);
637	44.6M	r[5] = c3;
638	44.6M	c3 = 0;
639	44.6M	sqr_add_c(a, 3, c1, c2, c3);
640	44.6M	r[6] = c1;
641	44.6M	r[7] = c2;
642	44.6M	}
643		#endif

Coverage Report

Created: 2023-06-08 06:43