/src/openssl/crypto/bn/asm/x86_64-gcc.c

Line	Count	Source
1		/*
2		* Copyright 2002-2025 The OpenSSL Project Authors. All Rights Reserved.
3		*
4		* Licensed under the Apache License 2.0 (the "License"). You may not use
5		* this file except in compliance with the License. You can obtain a copy
6		* in the file LICENSE in the source distribution or at
7		* https://www.openssl.org/source/license.html
8		*/
9
10		#include "../bn_local.h"
11		#if !(defined(__GNUC__) && __GNUC__>=2)
12		/* clang-format off */
13		# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
14		/* clang-format on */
15		#else
16		/*-
17		* x86_64 BIGNUM accelerator version 0.1, December 2002.
18		*
19		* Implemented by Andy Polyakov <https://github.com/dot-asm> for the OpenSSL
20		* project.
21		*
22		* Rights for redistribution and usage in source and binary forms are
23		* granted according to the License. Warranty of any kind is disclaimed.
24		*
25		* Q. Version 0.1? It doesn't sound like Andy, he used to assign real
26		* versions, like 1.0...
27		* A. Well, that's because this code is basically a quick-n-dirty
28		* proof-of-concept hack. As you can see it's implemented with
29		* inline assembler, which means that you're bound to GCC and that
30		* there might be enough room for further improvement.
31		*
32		* Q. Why inline assembler?
33		* A. x86_64 features own ABI which I'm not familiar with. This is
34		* why I decided to let the compiler take care of subroutine
35		* prologue/epilogue as well as register allocation. For reference.
36		* Win64 implements different ABI for AMD64, different from Linux.
37		*
38		* Q. How much faster does it get?
39		* A. 'apps/openssl speed rsa dsa' output with no-asm:
40		*
41		* sign verify sign/s verify/s
42		* rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
43		* rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
44		* rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
45		* rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
46		* sign verify sign/s verify/s
47		* dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
48		* dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
49		* dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
50		*
51		* 'apps/openssl speed rsa dsa' output with this module:
52		*
53		* sign verify sign/s verify/s
54		* rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
55		* rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
56		* rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
57		* rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
58		* sign verify sign/s verify/s
59		* dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
60		* dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
61		* dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
62		*
63		* For the reference. IA-32 assembler implementation performs
64		* very much like 64-bit code compiled with no-asm on the same
65		* machine.
66		*/
67
68		# undef mul
69		# undef mul_add
70
71		/*-
72		* "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
73		* "g"(0) let the compiler to decide where does it
74		* want to keep the value of zero;
75		*/
/*
 * mul_add(r, a, word, carry):
 *   computes r + a * word + carry as a two-word value, stores the low
 *   word back into r and leaves the high word in carry.
 * r is updated in place through a "+m" operand and a is read through an
 * "m" operand, so both must be addressable objects.
 */
# define mul_add(r,a,word,carry) do {           \
        register BN_ULONG prod_hi;              \
        register BN_ULONG prod_lo;              \
        /* prod_hi:prod_lo = word * a */        \
        asm ("mulq %3"                          \
                : "=a"(prod_lo), "=d"(prod_hi)  \
                : "a"(word), "m"(a)             \
                : "cc");                        \
        /* carry += prod_lo, ripple into hi */  \
        asm ("addq %2,%0; adcq %3,%1"           \
                : "+r"(carry), "+d"(prod_hi)    \
                : "a"(prod_lo), "g"(0)          \
                : "cc");                        \
        /* r += carry, ripple into hi */        \
        asm ("addq %2,%0; adcq %3,%1"           \
                : "+m"(r), "+d"(prod_hi)        \
                : "r"(carry), "g"(0)            \
                : "cc");                        \
        carry = prod_hi;                        \
        } while (0)
92
/*
 * mul(r, a, word, carry):
 *   computes a * word + carry as a two-word value, stores the low word
 *   in r and leaves the high word in carry.
 */
# define mul(r,a,word,carry) do {               \
        register BN_ULONG prod_hi;              \
        register BN_ULONG prod_lo;              \
        /* prod_hi:prod_lo = word * a */        \
        asm ("mulq %3"                          \
                : "=a"(prod_lo), "=d"(prod_hi)  \
                : "a"(word), "g"(a)             \
                : "cc");                        \
        /* carry += prod_lo, ripple into hi */  \
        asm ("addq %2,%0; adcq %3,%1"           \
                : "+r"(carry), "+d"(prod_hi)    \
                : "a"(prod_lo), "g"(0)          \
                : "cc");                        \
        (r) = carry;                            \
        carry = prod_hi;                        \
        } while (0)
# undef sqr
/*
 * sqr(r0, r1, a): r1:r0 = a * a (r0 receives the low word, r1 the high
 * word of the product).
 *
 * Wrapped in do { } while (0), like mul() and mul_add(), so that the
 * macro behaves as a single statement and the caller supplies the
 * terminating semicolon.
 */
# define sqr(r0,r1,a) do {                      \
        asm ("mulq %2"                          \
                : "=a"(r0), "=d"(r1)            \
                : "a"(a)                        \
                : "cc");                        \
        } while (0)
111
112		BN_ULONG bn_mul_add_words(BN_ULONG rp, const BN_ULONG ap, int num,
113		BN_ULONG w)
114	0	{
115	0	BN_ULONG c1 = 0;
116
117	0	if (num <= 0)
118	0	return c1;
119
120	0	while (num & ~3) {
121	0	mul_add(rp[0], ap[0], w, c1);
122	0	mul_add(rp[1], ap[1], w, c1);
123	0	mul_add(rp[2], ap[2], w, c1);
124	0	mul_add(rp[3], ap[3], w, c1);
125	0	ap += 4;
126	0	rp += 4;
127	0	num -= 4;
128	0	}
129	0	if (num) {
130	0	mul_add(rp[0], ap[0], w, c1);
131	0	if (--num == 0)
132	0	return c1;
133	0	mul_add(rp[1], ap[1], w, c1);
134	0	if (--num == 0)
135	0	return c1;
136	0	mul_add(rp[2], ap[2], w, c1);
137	0	return c1;
138	0	}
139
140	0	return c1;
141	0	}
142
143		BN_ULONG bn_mul_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
144	0	{
145	0	BN_ULONG c1 = 0;
146
147	0	if (num <= 0)
148	0	return c1;
149
150	0	while (num & ~3) {
151	0	mul(rp[0], ap[0], w, c1);
152	0	mul(rp[1], ap[1], w, c1);
153	0	mul(rp[2], ap[2], w, c1);
154	0	mul(rp[3], ap[3], w, c1);
155	0	ap += 4;
156	0	rp += 4;
157	0	num -= 4;
158	0	}
159	0	if (num) {
160	0	mul(rp[0], ap[0], w, c1);
161	0	if (--num == 0)
162	0	return c1;
163	0	mul(rp[1], ap[1], w, c1);
164	0	if (--num == 0)
165	0	return c1;
166	0	mul(rp[2], ap[2], w, c1);
167	0	}
168	0	return c1;
169	0	}
170
171		void bn_sqr_words(BN_ULONG r, const BN_ULONG a, int n)
172	0	{
173	0	if (n <= 0)
174	0	return;
175
176	0	while (n & ~3) {
177	0	sqr(r[0], r[1], a[0]);
178	0	sqr(r[2], r[3], a[1]);
179	0	sqr(r[4], r[5], a[2]);
180	0	sqr(r[6], r[7], a[3]);
181	0	a += 4;
182	0	r += 8;
183	0	n -= 4;
184	0	}
185	0	if (n) {
186	0	sqr(r[0], r[1], a[0]);
187	0	if (--n == 0)
188	0	return;
189	0	sqr(r[2], r[3], a[1]);
190	0	if (--n == 0)
191	0	return;
192	0	sqr(r[4], r[5], a[2]);
193	0	}
194	0	}
195
196		BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
197	0	{
198	0	BN_ULONG ret, waste;
199
200	0	asm("divq %4":"=a"(ret), "=d"(waste)
201	0	: "a"(l), "d"(h), "r"(d)
202	0	: "cc");
203
204	0	return ret;
205	0	}
206
207		BN_ULONG bn_add_words(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
208		int n)
209	0	{
210	0	BN_ULONG ret;
211	0	size_t i = 0;
212
213	0	if (n <= 0)
214	0	return 0;
215
216	0	asm volatile (" subq %0,%0 \n" /* clear carry */
217	0	" jmp 1f \n"
218	0	".p2align 4 \n"
219	0	"1: movq (%4,%2,8),%0 \n"
220	0	" adcq (%5,%2,8),%0 \n"
221	0	" movq %0,(%3,%2,8) \n"
222	0	" lea 1(%2),%2 \n"
223	0	" dec %1 \n"
224	0	" jnz 1b \n"
225	0	" sbbq %0,%0 \n"
226	0	:"=&r" (ret), "+c"(n), "+r"(i)
227	0	:"r"(rp), "r"(ap), "r"(bp)
228	0	:"cc", "memory");
229
230	0	return ret & 1;
231	0	}
232
233		# ifndef SIMICS
234		BN_ULONG bn_sub_words(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
235		int n)
236	0	{
237	0	BN_ULONG ret;
238	0	size_t i = 0;
239
240	0	if (n <= 0)
241	0	return 0;
242
243	0	asm volatile (" subq %0,%0 \n" /* clear borrow */
244	0	" jmp 1f \n"
245	0	".p2align 4 \n"
246	0	"1: movq (%4,%2,8),%0 \n"
247	0	" sbbq (%5,%2,8),%0 \n"
248	0	" movq %0,(%3,%2,8) \n"
249	0	" lea 1(%2),%2 \n"
250	0	" dec %1 \n"
251	0	" jnz 1b \n"
252	0	" sbbq %0,%0 \n"
253	0	:"=&r" (ret), "+c"(n), "+r"(i)
254	0	:"r"(rp), "r"(ap), "r"(bp)
255	0	:"cc", "memory");
256
257	0	return ret & 1;
258	0	}
259		# else
260		/* Simics 1.4<7 has buggy sbbq:-( */
261		# define BN_MASK2 0xffffffffffffffffL
262		BN_ULONG bn_sub_words(BN_ULONG r, BN_ULONG a, BN_ULONG *b, int n)
263		{
264		BN_ULONG t1, t2;
265		int c = 0;
266
267		if (n <= 0)
268		return (BN_ULONG)0;
269
270		for (;;) {
271		t1 = a[0];
272		t2 = b[0];
273		r[0] = (t1 - t2 - c) & BN_MASK2;
274		if (t1 != t2)
275		c = (t1 < t2);
276		if (--n <= 0)
277		break;
278
279		t1 = a[1];
280		t2 = b[1];
281		r[1] = (t1 - t2 - c) & BN_MASK2;
282		if (t1 != t2)
283		c = (t1 < t2);
284		if (--n <= 0)
285		break;
286
287		t1 = a[2];
288		t2 = b[2];
289		r[2] = (t1 - t2 - c) & BN_MASK2;
290		if (t1 != t2)
291		c = (t1 < t2);
292		if (--n <= 0)
293		break;
294
295		t1 = a[3];
296		t2 = b[3];
297		r[3] = (t1 - t2 - c) & BN_MASK2;
298		if (t1 != t2)
299		c = (t1 < t2);
300		if (--n <= 0)
301		break;
302
303		a += 4;
304		b += 4;
305		r += 4;
306		}
307		return c;
308		}
309		# endif
310
311		/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
312		/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
313		/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
314		/*
315		* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
316		* c=(c2,c1,c0)
317		*/
318
319		/*
320		* Keep in mind that carrying into high part of multiplication result
321		* can not overflow, because it cannot be all-ones.
322		*/
323		# if 0
324		/* original macros are kept for reference purposes */
		/*
		 * Reference version: (c2,c1,c0) += a * b.
		 * The product is obtained with BN_UMULT_LOHI; each carry is
		 * detected by checking whether the sum wrapped below the addend.
		 */
325		# define mul_add_c(a,b,c0,c1,c2) do { \
326		BN_ULONG ta = (a), tb = (b); \
327		BN_ULONG lo, hi; \
328		BN_UMULT_LOHI(lo,hi,ta,tb); \
329		c0 += lo; hi += (c0<lo)?1:0; \
330		c1 += hi; c2 += (c1<hi)?1:0; \
331		} while(0)
332
		/*
		 * Reference version: (c2,c1,c0) += 2 * a * b.
		 * The product is added twice; the first pass uses the temporary
		 * tt so that hi is still intact for the second pass.
		 */
333		# define mul_add_c2(a,b,c0,c1,c2) do { \
334		BN_ULONG ta = (a), tb = (b); \
335		BN_ULONG lo, hi, tt; \
336		BN_UMULT_LOHI(lo,hi,ta,tb); \
337		c0 += lo; tt = hi+((c0<lo)?1:0); \
338		c1 += tt; c2 += (c1<tt)?1:0; \
339		c0 += lo; hi += (c0<lo)?1:0; \
340		c1 += hi; c2 += (c1<hi)?1:0; \
341		} while(0)
342
		/*
		 * Reference version: (c2,c1,c0) += a[i] * a[i].
		 */
343		# define sqr_add_c(a,i,c0,c1,c2) do { \
344		BN_ULONG ta = (a)[i]; \
345		BN_ULONG lo, hi; \
346		BN_UMULT_LOHI(lo,hi,ta,ta); \
347		c0 += lo; hi += (c0<lo)?1:0; \
348		c1 += hi; c2 += (c1<hi)?1:0; \
349		} while(0)
350		# else
/*
 * mul_add_c(a, b, c0, c1, c2): add the two-word product a * b to the
 * three-word accumulator (c2,c1,c0). b is read through an "m" operand
 * and therefore has to be an addressable object.
 */
# define mul_add_c(a,b,c0,c1,c2) do {                   \
        BN_ULONG prod_lo;                               \
        BN_ULONG prod_hi;                               \
        /* prod_hi:prod_lo = a * b */                   \
        asm ("mulq %3"                                  \
                : "=a"(prod_lo), "=d"(prod_hi)          \
                : "a"(a), "m"(b)                        \
                : "cc");                                \
        /* (c2,c1,c0) += prod_hi:prod_lo */             \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0), "+r"(c1), "+r"(c2)          \
                : "r"(prod_lo), "r"(prod_hi), "g"(0)    \
                : "cc");                                \
        } while (0)
362
/*
 * sqr_add_c(a, i, c0, c1, c2): add the two-word square a[i] * a[i] to
 * the three-word accumulator (c2,c1,c0).
 */
# define sqr_add_c(a,i,c0,c1,c2) do {                   \
        BN_ULONG prod_lo;                               \
        BN_ULONG prod_hi;                               \
        /* prod_hi:prod_lo = a[i] * a[i] */             \
        asm ("mulq %2"                                  \
                : "=a"(prod_lo), "=d"(prod_hi)          \
                : "a"(a[i])                             \
                : "cc");                                \
        /* (c2,c1,c0) += prod_hi:prod_lo */             \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0), "+r"(c1), "+r"(c2)          \
                : "r"(prod_lo), "r"(prod_hi), "g"(0)    \
                : "cc");                                \
        } while (0)
374
/*
 * mul_add_c2(a, b, c0, c1, c2): add twice the two-word product a * b to
 * the three-word accumulator (c2,c1,c0). The product is computed once
 * and then accumulated in two identical passes.
 */
# define mul_add_c2(a,b,c0,c1,c2) do {                  \
        BN_ULONG prod_lo;                               \
        BN_ULONG prod_hi;                               \
        /* prod_hi:prod_lo = a * b */                   \
        asm ("mulq %3"                                  \
                : "=a"(prod_lo), "=d"(prod_hi)          \
                : "a"(a), "m"(b)                        \
                : "cc");                                \
        /* first pass: (c2,c1,c0) += product */         \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0), "+r"(c1), "+r"(c2)          \
                : "r"(prod_lo), "r"(prod_hi), "g"(0)    \
                : "cc");                                \
        /* second pass: (c2,c1,c0) += product */        \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0), "+r"(c1), "+r"(c2)          \
                : "r"(prod_lo), "r"(prod_hi), "g"(0)    \
                : "cc");                                \
        } while (0)
390		# endif
391
/*
 * sqr_add_c2(a, i, j, lo, mid, hi): add 2 * a[i] * a[j] to the
 * three-word accumulator (hi,mid,lo) by forwarding to mul_add_c2().
 */
# define sqr_add_c2(a,i,j,lo,mid,hi)    \
        mul_add_c2((a)[i],(a)[j],lo,mid,hi)
394
395		void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
396	0	{
397	0	BN_ULONG c1, c2, c3;
398
399	0	c1 = 0;
400	0	c2 = 0;
401	0	c3 = 0;
402	0	mul_add_c(a[0], b[0], c1, c2, c3);
403	0	r[0] = c1;
404	0	c1 = 0;
405	0	mul_add_c(a[0], b[1], c2, c3, c1);
406	0	mul_add_c(a[1], b[0], c2, c3, c1);
407	0	r[1] = c2;
408	0	c2 = 0;
409	0	mul_add_c(a[2], b[0], c3, c1, c2);
410	0	mul_add_c(a[1], b[1], c3, c1, c2);
411	0	mul_add_c(a[0], b[2], c3, c1, c2);
412	0	r[2] = c3;
413	0	c3 = 0;
414	0	mul_add_c(a[0], b[3], c1, c2, c3);
415	0	mul_add_c(a[1], b[2], c1, c2, c3);
416	0	mul_add_c(a[2], b[1], c1, c2, c3);
417	0	mul_add_c(a[3], b[0], c1, c2, c3);
418	0	r[3] = c1;
419	0	c1 = 0;
420	0	mul_add_c(a[4], b[0], c2, c3, c1);
421	0	mul_add_c(a[3], b[1], c2, c3, c1);
422	0	mul_add_c(a[2], b[2], c2, c3, c1);
423	0	mul_add_c(a[1], b[3], c2, c3, c1);
424	0	mul_add_c(a[0], b[4], c2, c3, c1);
425	0	r[4] = c2;
426	0	c2 = 0;
427	0	mul_add_c(a[0], b[5], c3, c1, c2);
428	0	mul_add_c(a[1], b[4], c3, c1, c2);
429	0	mul_add_c(a[2], b[3], c3, c1, c2);
430	0	mul_add_c(a[3], b[2], c3, c1, c2);
431	0	mul_add_c(a[4], b[1], c3, c1, c2);
432	0	mul_add_c(a[5], b[0], c3, c1, c2);
433	0	r[5] = c3;
434	0	c3 = 0;
435	0	mul_add_c(a[6], b[0], c1, c2, c3);
436	0	mul_add_c(a[5], b[1], c1, c2, c3);
437	0	mul_add_c(a[4], b[2], c1, c2, c3);
438	0	mul_add_c(a[3], b[3], c1, c2, c3);
439	0	mul_add_c(a[2], b[4], c1, c2, c3);
440	0	mul_add_c(a[1], b[5], c1, c2, c3);
441	0	mul_add_c(a[0], b[6], c1, c2, c3);
442	0	r[6] = c1;
443	0	c1 = 0;
444	0	mul_add_c(a[0], b[7], c2, c3, c1);
445	0	mul_add_c(a[1], b[6], c2, c3, c1);
446	0	mul_add_c(a[2], b[5], c2, c3, c1);
447	0	mul_add_c(a[3], b[4], c2, c3, c1);
448	0	mul_add_c(a[4], b[3], c2, c3, c1);
449	0	mul_add_c(a[5], b[2], c2, c3, c1);
450	0	mul_add_c(a[6], b[1], c2, c3, c1);
451	0	mul_add_c(a[7], b[0], c2, c3, c1);
452	0	r[7] = c2;
453	0	c2 = 0;
454	0	mul_add_c(a[7], b[1], c3, c1, c2);
455	0	mul_add_c(a[6], b[2], c3, c1, c2);
456	0	mul_add_c(a[5], b[3], c3, c1, c2);
457	0	mul_add_c(a[4], b[4], c3, c1, c2);
458	0	mul_add_c(a[3], b[5], c3, c1, c2);
459	0	mul_add_c(a[2], b[6], c3, c1, c2);
460	0	mul_add_c(a[1], b[7], c3, c1, c2);
461	0	r[8] = c3;
462	0	c3 = 0;
463	0	mul_add_c(a[2], b[7], c1, c2, c3);
464	0	mul_add_c(a[3], b[6], c1, c2, c3);
465	0	mul_add_c(a[4], b[5], c1, c2, c3);
466	0	mul_add_c(a[5], b[4], c1, c2, c3);
467	0	mul_add_c(a[6], b[3], c1, c2, c3);
468	0	mul_add_c(a[7], b[2], c1, c2, c3);
469	0	r[9] = c1;
470	0	c1 = 0;
471	0	mul_add_c(a[7], b[3], c2, c3, c1);
472	0	mul_add_c(a[6], b[4], c2, c3, c1);
473	0	mul_add_c(a[5], b[5], c2, c3, c1);
474	0	mul_add_c(a[4], b[6], c2, c3, c1);
475	0	mul_add_c(a[3], b[7], c2, c3, c1);
476	0	r[10] = c2;
477	0	c2 = 0;
478	0	mul_add_c(a[4], b[7], c3, c1, c2);
479	0	mul_add_c(a[5], b[6], c3, c1, c2);
480	0	mul_add_c(a[6], b[5], c3, c1, c2);
481	0	mul_add_c(a[7], b[4], c3, c1, c2);
482	0	r[11] = c3;
483	0	c3 = 0;
484	0	mul_add_c(a[7], b[5], c1, c2, c3);
485	0	mul_add_c(a[6], b[6], c1, c2, c3);
486	0	mul_add_c(a[5], b[7], c1, c2, c3);
487	0	r[12] = c1;
488	0	c1 = 0;
489	0	mul_add_c(a[6], b[7], c2, c3, c1);
490	0	mul_add_c(a[7], b[6], c2, c3, c1);
491	0	r[13] = c2;
492	0	c2 = 0;
493	0	mul_add_c(a[7], b[7], c3, c1, c2);
494	0	r[14] = c3;
495	0	r[15] = c1;
496	0	}
497
498		void bn_mul_comba4(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
499	0	{
500	0	BN_ULONG c1, c2, c3;
501
502	0	c1 = 0;
503	0	c2 = 0;
504	0	c3 = 0;
505	0	mul_add_c(a[0], b[0], c1, c2, c3);
506	0	r[0] = c1;
507	0	c1 = 0;
508	0	mul_add_c(a[0], b[1], c2, c3, c1);
509	0	mul_add_c(a[1], b[0], c2, c3, c1);
510	0	r[1] = c2;
511	0	c2 = 0;
512	0	mul_add_c(a[2], b[0], c3, c1, c2);
513	0	mul_add_c(a[1], b[1], c3, c1, c2);
514	0	mul_add_c(a[0], b[2], c3, c1, c2);
515	0	r[2] = c3;
516	0	c3 = 0;
517	0	mul_add_c(a[0], b[3], c1, c2, c3);
518	0	mul_add_c(a[1], b[2], c1, c2, c3);
519	0	mul_add_c(a[2], b[1], c1, c2, c3);
520	0	mul_add_c(a[3], b[0], c1, c2, c3);
521	0	r[3] = c1;
522	0	c1 = 0;
523	0	mul_add_c(a[3], b[1], c2, c3, c1);
524	0	mul_add_c(a[2], b[2], c2, c3, c1);
525	0	mul_add_c(a[1], b[3], c2, c3, c1);
526	0	r[4] = c2;
527	0	c2 = 0;
528	0	mul_add_c(a[2], b[3], c3, c1, c2);
529	0	mul_add_c(a[3], b[2], c3, c1, c2);
530	0	r[5] = c3;
531	0	c3 = 0;
532	0	mul_add_c(a[3], b[3], c1, c2, c3);
533	0	r[6] = c1;
534	0	r[7] = c2;
535	0	}
536
537		void bn_sqr_comba8(BN_ULONG r, const BN_ULONG a)
538	0	{
539	0	BN_ULONG c1, c2, c3;
540
541	0	c1 = 0;
542	0	c2 = 0;
543	0	c3 = 0;
544	0	sqr_add_c(a, 0, c1, c2, c3);
545	0	r[0] = c1;
546	0	c1 = 0;
547	0	sqr_add_c2(a, 1, 0, c2, c3, c1);
548	0	r[1] = c2;
549	0	c2 = 0;
550	0	sqr_add_c(a, 1, c3, c1, c2);
551	0	sqr_add_c2(a, 2, 0, c3, c1, c2);
552	0	r[2] = c3;
553	0	c3 = 0;
554	0	sqr_add_c2(a, 3, 0, c1, c2, c3);
555	0	sqr_add_c2(a, 2, 1, c1, c2, c3);
556	0	r[3] = c1;
557	0	c1 = 0;
558	0	sqr_add_c(a, 2, c2, c3, c1);
559	0	sqr_add_c2(a, 3, 1, c2, c3, c1);
560	0	sqr_add_c2(a, 4, 0, c2, c3, c1);
561	0	r[4] = c2;
562	0	c2 = 0;
563	0	sqr_add_c2(a, 5, 0, c3, c1, c2);
564	0	sqr_add_c2(a, 4, 1, c3, c1, c2);
565	0	sqr_add_c2(a, 3, 2, c3, c1, c2);
566	0	r[5] = c3;
567	0	c3 = 0;
568	0	sqr_add_c(a, 3, c1, c2, c3);
569	0	sqr_add_c2(a, 4, 2, c1, c2, c3);
570	0	sqr_add_c2(a, 5, 1, c1, c2, c3);
571	0	sqr_add_c2(a, 6, 0, c1, c2, c3);
572	0	r[6] = c1;
573	0	c1 = 0;
574	0	sqr_add_c2(a, 7, 0, c2, c3, c1);
575	0	sqr_add_c2(a, 6, 1, c2, c3, c1);
576	0	sqr_add_c2(a, 5, 2, c2, c3, c1);
577	0	sqr_add_c2(a, 4, 3, c2, c3, c1);
578	0	r[7] = c2;
579	0	c2 = 0;
580	0	sqr_add_c(a, 4, c3, c1, c2);
581	0	sqr_add_c2(a, 5, 3, c3, c1, c2);
582	0	sqr_add_c2(a, 6, 2, c3, c1, c2);
583	0	sqr_add_c2(a, 7, 1, c3, c1, c2);
584	0	r[8] = c3;
585	0	c3 = 0;
586	0	sqr_add_c2(a, 7, 2, c1, c2, c3);
587	0	sqr_add_c2(a, 6, 3, c1, c2, c3);
588	0	sqr_add_c2(a, 5, 4, c1, c2, c3);
589	0	r[9] = c1;
590	0	c1 = 0;
591	0	sqr_add_c(a, 5, c2, c3, c1);
592	0	sqr_add_c2(a, 6, 4, c2, c3, c1);
593	0	sqr_add_c2(a, 7, 3, c2, c3, c1);
594	0	r[10] = c2;
595	0	c2 = 0;
596	0	sqr_add_c2(a, 7, 4, c3, c1, c2);
597	0	sqr_add_c2(a, 6, 5, c3, c1, c2);
598	0	r[11] = c3;
599	0	c3 = 0;
600	0	sqr_add_c(a, 6, c1, c2, c3);
601	0	sqr_add_c2(a, 7, 5, c1, c2, c3);
602	0	r[12] = c1;
603	0	c1 = 0;
604	0	sqr_add_c2(a, 7, 6, c2, c3, c1);
605	0	r[13] = c2;
606	0	c2 = 0;
607	0	sqr_add_c(a, 7, c3, c1, c2);
608	0	r[14] = c3;
609	0	r[15] = c1;
610	0	}
611
612		void bn_sqr_comba4(BN_ULONG r, const BN_ULONG a)
613	0	{
614	0	BN_ULONG c1, c2, c3;
615
616	0	c1 = 0;
617	0	c2 = 0;
618	0	c3 = 0;
619	0	sqr_add_c(a, 0, c1, c2, c3);
620	0	r[0] = c1;
621	0	c1 = 0;
622	0	sqr_add_c2(a, 1, 0, c2, c3, c1);
623	0	r[1] = c2;
624	0	c2 = 0;
625	0	sqr_add_c(a, 1, c3, c1, c2);
626	0	sqr_add_c2(a, 2, 0, c3, c1, c2);
627	0	r[2] = c3;
628	0	c3 = 0;
629	0	sqr_add_c2(a, 3, 0, c1, c2, c3);
630	0	sqr_add_c2(a, 2, 1, c1, c2, c3);
631	0	r[3] = c1;
632	0	c1 = 0;
633	0	sqr_add_c(a, 2, c2, c3, c1);
634	0	sqr_add_c2(a, 3, 1, c2, c3, c1);
635	0	r[4] = c2;
636	0	c2 = 0;
637	0	sqr_add_c2(a, 3, 2, c3, c1, c2);
638	0	r[5] = c3;
639	0	c3 = 0;
640	0	sqr_add_c(a, 3, c1, c2, c3);
641	0	r[6] = c1;
642	0	r[7] = c2;
643	0	}
644		#endif

Coverage Report

Created: 2025-12-08 06:22