/src/openssl/crypto/bn/asm/x86_64-gcc.c

Line	Count	Source
1		/*
2		* Copyright 2002-2025 The OpenSSL Project Authors. All Rights Reserved.
3		*
4		* Licensed under the Apache License 2.0 (the "License"). You may not use
5		* this file except in compliance with the License. You can obtain a copy
6		* in the file LICENSE in the source distribution or at
7		* https://www.openssl.org/source/license.html
8		*/
9
10		#include "../bn_local.h"
11		#if !(defined(__GNUC__) && __GNUC__ >= 2)
12		/* clang-format off */
13		# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
14		/* clang-format on */
15		#else
16		/*-
17		* x86_64 BIGNUM accelerator version 0.1, December 2002.
18		*
19		* Implemented by Andy Polyakov <https://github.com/dot-asm> for the OpenSSL
20		* project.
21		*
22		* Rights for redistribution and usage in source and binary forms are
23		* granted according to the License. Warranty of any kind is disclaimed.
24		*
25		* Q. Version 0.1? It doesn't sound like Andy, he used to assign real
26		* versions, like 1.0...
27		* A. Well, that's because this code is basically a quick-n-dirty
28		* proof-of-concept hack. As you can see it's implemented with
29		* inline assembler, which means that you're bound to GCC and that
30		* there might be enough room for further improvement.
31		*
32		* Q. Why inline assembler?
33		* A. x86_64 features its own ABI, which I'm not familiar with. This is
34		* why I decided to let the compiler take care of subroutine
35		* prologue/epilogue as well as register allocation. For reference,
36		* Win64 implements a different ABI for AMD64 than Linux does.
37		*
38		* Q. How much faster does it get?
39		* A. 'apps/openssl speed rsa dsa' output with no-asm:
40		*
41		* sign verify sign/s verify/s
42		* rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
43		* rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
44		* rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
45		* rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
46		* sign verify sign/s verify/s
47		* dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
48		* dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
49		* dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
50		*
51		* 'apps/openssl speed rsa dsa' output with this module:
52		*
53		* sign verify sign/s verify/s
54		* rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
55		* rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
56		* rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
57		* rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
58		* sign verify sign/s verify/s
59		* dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
60		* dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
61		* dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
62		*
63		* For reference, the IA-32 assembler implementation performs
64		* very much like 64-bit code compiled with no-asm on the same
65		* machine.
66		*/
67
68		#undef mul
69		#undef mul_add
70
71		/*-
72		* "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
73		* "g"(0) lets the compiler decide where it
74		* wants to keep the value of zero;
75		*/
		/*
		* mul_add(r, a, word, carry): r += a * word + carry, one word wide.
		* On exit r holds the low word of the sum and carry holds the high word.
		* - 1st asm: mulq multiplies rax (word) by the memory operand a and
		* leaves the two-word product in rdx:rax (high:low).
		* - 2nd asm: carry += low, then the carry flag is folded into high.
		* - 3rd asm: r += carry (directly in memory), then the carry flag is
		* folded into high again.
		* - carry is finally replaced by high.
		* carry must be a modifiable lvalue; it is both read and written.
		*/
76		#define mul_add(r, a, word, carry) \
77	210G	do { \
78	210G	register BN_ULONG high, low; \
79	210G	asm("mulq %3" \
80	210G	: "=a"(low), "=d"(high) \
81	210G	: "a"(word), "m"(a) \
82	210G	: "cc"); \
83	210G	asm("addq %2,%0; adcq %3,%1" \
84	210G	: "+r"(carry), "+d"(high) \
85	210G	: "a"(low), "g"(0) \
86	210G	: "cc"); \
87	210G	asm("addq %2,%0; adcq %3,%1" \
88	210G	: "+m"(r), "+d"(high) \
89	210G	: "r"(carry), "g"(0) \
90	210G	: "cc"); \
91	210G	carry = high; \
92	210G	} while (0)
93
		/*
		* mul(r, a, word, carry): r = low word of (a * word + carry), and
		* carry = high word of the same sum.
		* Same first two asm steps as mul_add, except that a is passed with the
		* "g" constraint; r is then written with a plain C assignment instead of
		* being accumulated into.
		*/
94		#define mul(r, a, word, carry) \
95	26.8G	do { \
96	26.8G	register BN_ULONG high, low; \
97	26.8G	asm("mulq %3" \
98	26.8G	: "=a"(low), "=d"(high) \
99	26.8G	: "a"(word), "g"(a) \
100	26.8G	: "cc"); \
101	26.8G	asm("addq %2,%0; adcq %3,%1" \
102	26.8G	: "+r"(carry), "+d"(high) \
103	26.8G	: "a"(low), "g"(0) \
104	26.8G	: "cc"); \
105	26.8G	(r) = carry, carry = high; \
106	26.8G	} while (0)
107		#undef sqr
		/*
		* sqr(r0, r1, a): (r1:r0) = a * a.
		* a is loaded into rax and %2 names that same operand, so mulq
		* multiplies rax by itself; the low word lands in r0 and the high word
		* in r1.
		* Note that the expansion already ends with a semicolon.
		*/
108		#define sqr(r0, r1, a) \
109	87.0M	asm("mulq %2" \
110	87.0M	: "=a"(r0), "=d"(r1) \
111	87.0M	: "a"(a) \
112	87.0M	: "cc");
113
114		BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
115		BN_ULONG w)
116	223M	{
		/*
		* Multiply-accumulate a word vector by a single word:
		* rp[i] += ap[i] * w for i in [0, num), with the carry word of each
		* step chained into the next one through c1.
		*
		* rp  - accumulator words, updated in place (must be a pointer: the
		*       body subscripts it and passes rp[i] as an asm memory operand)
		* ap  - multiplicand words, read only
		* num - number of words to process; nothing is touched when num <= 0
		* w   - the single-word multiplier
		*
		* Returns the carry word left over after the last step (0 if num <= 0).
		*/
117	223M	BN_ULONG c1 = 0;
118
119	223M	if (num <= 0)
120	0	return c1;
121
		/* num is positive here, so (num & ~3) is non-zero exactly while num >= 4. */
122	52.8G	while (num & ~3) {
123	52.6G	mul_add(rp[0], ap[0], w, c1);
124	52.6G	mul_add(rp[1], ap[1], w, c1);
125	52.6G	mul_add(rp[2], ap[2], w, c1);
126	52.6G	mul_add(rp[3], ap[3], w, c1);
127	52.6G	ap += 4;
128	52.6G	rp += 4;
129	52.6G	num -= 4;
130	52.6G	}
		/* One to three words may be left over. */
131	223M	if (num) {
132	147M	mul_add(rp[0], ap[0], w, c1);
133	147M	if (--num == 0)
134	58.8M	return c1;
135	88.3M	mul_add(rp[1], ap[1], w, c1);
136	88.3M	if (--num == 0)
137	42.4M	return c1;
138	45.8M	mul_add(rp[2], ap[2], w, c1);
139	45.8M	return c1;
140	88.3M	}
141
142	76.3M	return c1;
143	223M	}
144
145		BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
146	502M	{
		/*
		* Multiply a word vector by a single word:
		* rp[i] = low word of (ap[i] * w + carry) for i in [0, num), where the
		* carry is the high word produced by the previous step (initially 0).
		*
		* rp  - destination words, overwritten (must be a pointer: the body
		*       subscripts it)
		* ap  - multiplicand words, read only
		* num - number of words to process; nothing is touched when num <= 0
		* w   - the single-word multiplier
		*
		* Returns the carry word left over after the last step (0 if num <= 0).
		*/
147	502M	BN_ULONG c1 = 0;
148
149	502M	if (num <= 0)
150	0	return c1;
151
		/* num is positive here, so (num & ~3) is non-zero exactly while num >= 4. */
152	7.19G	while (num & ~3) {
153	6.69G	mul(rp[0], ap[0], w, c1);
154	6.69G	mul(rp[1], ap[1], w, c1);
155	6.69G	mul(rp[2], ap[2], w, c1);
156	6.69G	mul(rp[3], ap[3], w, c1);
157	6.69G	ap += 4;
158	6.69G	rp += 4;
159	6.69G	num -= 4;
160	6.69G	}
		/* One to three words may be left over. */
161	502M	if (num) {
162	71.8M	mul(rp[0], ap[0], w, c1);
163	71.8M	if (--num == 0)
164	32.5M	return c1;
165	39.2M	mul(rp[1], ap[1], w, c1);
166	39.2M	if (--num == 0)
167	21.9M	return c1;
168	17.3M	mul(rp[2], ap[2], w, c1);
169	17.3M	}
170	448M	return c1;
171	502M	}
172
173		void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
174	5.37M	{
		/*
		* Square each input word independently:
		* (r[2*i + 1] : r[2*i]) = a[i] * a[i] for i in [0, n).
		* No carries cross between words, so r receives 2 * n words.
		*
		* r - destination, two words written per input word (must be a
		*     pointer: the body subscripts it)
		* a - input words, read only
		* n - number of input words; nothing is touched when n <= 0
		*/
175	5.37M	if (n <= 0)
176	0	return;
177
		/* n is positive here, so (n & ~3) is non-zero exactly while n >= 4. */
178	25.5M	while (n & ~3) {
179	20.1M	sqr(r[0], r[1], a[0]);
180	20.1M	sqr(r[2], r[3], a[1]);
181	20.1M	sqr(r[4], r[5], a[2]);
182	20.1M	sqr(r[6], r[7], a[3]);
183	20.1M	a += 4;
184	20.1M	r += 8;
185	20.1M	n -= 4;
186	20.1M	}
		/* One to three words may be left over. */
187	5.37M	if (n) {
188	5.21M	sqr(r[0], r[1], a[0]);
189	5.21M	if (--n == 0)
190	4.52M	return;
191	685k	sqr(r[2], r[3], a[1]);
192	685k	if (--n == 0)
193	381k	return;
194	303k	sqr(r[4], r[5], a[2]);
195	303k	}
196	5.37M	}
197
198		BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
199	466M	{
		/*
		* Divides the two-word value (h:l) by d with a single divq and returns
		* the quotient.
		* l is loaded into rax and h into rdx; d may live in any general
		* register. The quotient comes back in rax (ret); whatever divq leaves
		* in rdx is captured in waste and discarded.
		*
		* NOTE(review): nothing here checks d or h. divq raises a divide error
		* when d is zero or when the quotient does not fit in one word, so
		* callers presumably guarantee h < d -- confirm against call sites.
		*/
200	466M	BN_ULONG ret, waste;
201
202	466M	asm("divq %4" : "=a"(ret), "=d"(waste)
203	466M	: "a"(l), "d"(h), "r"(d)
204	466M	: "cc");
205
206	466M	return ret;
207	466M	}
208
209		BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
210		int n)
211	800M	{
		/*
		* Word-vector addition with carry propagation:
		* rp[i] = ap[i] + bp[i] + carry for i in [0, n).
		*
		* rp - destination words
		* ap - first addend, read only
		* bp - second addend, read only
		* n  - number of words; returns 0 without touching rp when n <= 0
		*
		* Returns the carry out of the most significant word (0 or 1).
		*
		* The loop keeps the running carry in the CPU carry flag between
		* iterations, so the index is advanced with lea and the counter with
		* dec, neither of which modifies the carry flag.
		*/
212	800M	BN_ULONG ret;
213	800M	size_t i = 0;
214
215	800M	if (n <= 0)
216	9.28M	return 0;
217
218	790M	asm volatile(" subq %0,%0 \n" /* clear carry */
219	790M	" jmp 1f \n" /* skip the alignment padding */
220	790M	".p2align 4 \n"
221	790M	"1: movq (%4,%2,8),%0 \n" /* ret = ap[i] */
222	790M	" adcq (%5,%2,8),%0 \n" /* ret += bp[i] + carry */
223	790M	" movq %0,(%3,%2,8) \n" /* rp[i] = ret */
224	790M	" lea 1(%2),%2 \n" /* i++ without touching flags */
225	790M	" dec %1 \n" /* n--; carry flag is preserved */
226	790M	" jnz 1b \n"
227	790M	" sbbq %0,%0 \n" /* ret = 0 - carry: all-ones if carry set */
228	790M	: "=&r"(ret), "+c"(n), "+r"(i)
229	790M	: "r"(rp), "r"(ap), "r"(bp)
230	790M	: "cc", "memory");
231
		/* Reduce the all-ones/zero mask to a 0/1 carry value. */
232	790M	return ret & 1;
233	800M	}
234
235		#ifndef SIMICS
236		BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
237		int n)
238	806M	{
		/*
		* Word-vector subtraction with borrow propagation:
		* rp[i] = ap[i] - bp[i] - borrow for i in [0, n).
		*
		* rp - destination words
		* ap - minuend, read only
		* bp - subtrahend, read only
		* n  - number of words; returns 0 without touching rp when n <= 0
		*
		* Returns the borrow out of the most significant word (0 or 1).
		*
		* The loop keeps the running borrow in the CPU carry flag between
		* iterations, so the index is advanced with lea and the counter with
		* dec, neither of which modifies the carry flag.
		*/
239	806M	BN_ULONG ret;
240	806M	size_t i = 0;
241
242	806M	if (n <= 0)
243	8.69M	return 0;
244
245	797M	asm volatile(" subq %0,%0 \n" /* clear borrow */
246	797M	" jmp 1f \n" /* skip the alignment padding */
247	797M	".p2align 4 \n"
248	797M	"1: movq (%4,%2,8),%0 \n" /* ret = ap[i] */
249	797M	" sbbq (%5,%2,8),%0 \n" /* ret -= bp[i] + borrow */
250	797M	" movq %0,(%3,%2,8) \n" /* rp[i] = ret */
251	797M	" lea 1(%2),%2 \n" /* i++ without touching flags */
252	797M	" dec %1 \n" /* n--; carry flag is preserved */
253	797M	" jnz 1b \n"
254	797M	" sbbq %0,%0 \n" /* ret = 0 - borrow: all-ones if borrow set */
255	797M	: "=&r"(ret), "+c"(n), "+r"(i)
256	797M	: "r"(rp), "r"(ap), "r"(bp)
257	797M	: "cc", "memory");
258
		/* Reduce the all-ones/zero mask to a 0/1 borrow value. */
259	797M	return ret & 1;
260	806M	}
261		#else
262		/* Simics 1.4<7 has buggy sbbq:-( */
263		#define BN_MASK2 0xffffffffffffffffL
264		BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
265		{
		/*
		* Plain C word-vector subtraction, used instead of the sbbq loop when
		* SIMICS is defined:
		* r[i] = a[i] - b[i] - borrow for i in [0, n).
		*
		* r - destination words
		* a - minuend, read only
		* b - subtrahend, read only
		* n - number of words; returns 0 without touching r when n <= 0
		*
		* Returns the final borrow (0 or 1).
		*
		* The borrow c is recomputed only when the two words differ; when they
		* are equal the incoming borrow passes through unchanged. The loop body
		* is unrolled four times, with an exit test after every word.
		*/
266		BN_ULONG t1, t2;
267		int c = 0;
268
269		if (n <= 0)
270		return (BN_ULONG)0;
271
272		for (;;) {
273		t1 = a[0];
274		t2 = b[0];
275		r[0] = (t1 - t2 - c) & BN_MASK2;
276		if (t1 != t2)
277		c = (t1 < t2);
278		if (--n <= 0)
279		break;
280
281		t1 = a[1];
282		t2 = b[1];
283		r[1] = (t1 - t2 - c) & BN_MASK2;
284		if (t1 != t2)
285		c = (t1 < t2);
286		if (--n <= 0)
287		break;
288
289		t1 = a[2];
290		t2 = b[2];
291		r[2] = (t1 - t2 - c) & BN_MASK2;
292		if (t1 != t2)
293		c = (t1 < t2);
294		if (--n <= 0)
295		break;
296
297		t1 = a[3];
298		t2 = b[3];
299		r[3] = (t1 - t2 - c) & BN_MASK2;
300		if (t1 != t2)
301		c = (t1 < t2);
302		if (--n <= 0)
303		break;
304
305		a += 4;
306		b += 4;
307		r += 4;
308		}
309		return c;
310		}
311		#endif
312
313		/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
314		/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
315		/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
316		/*
317		* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
318		* c=(c2,c1,c0)
319		*/
320
321		/*
322		* Keep in mind that carrying into high part of multiplication result
323		* can not overflow, because it cannot be all-ones.
324		*/
325		#if 0
326		/* original macros are kept for reference purposes */
		/*
		* Reference mul_add_c: (lo, hi) = a * b via BN_UMULT_LOHI, then the
		* two-word product is added into the three-word accumulator
		* (c2, c1, c0). Each carry is detected by comparing the sum with the
		* value just added (sum < addend means the addition wrapped).
		*/
327		#define mul_add_c(a, b, c0, c1, c2) \
328		do { \
329		BN_ULONG ta = (a), tb = (b); \
330		BN_ULONG lo, hi; \
331		BN_UMULT_LOHI(lo, hi, ta, tb); \
332		c0 += lo; \
333		hi += (c0 < lo) ? 1 : 0; \
334		c1 += hi; \
335		c2 += (c1 < hi) ? 1 : 0; \
336		} while (0)
337
		/*
		* Reference mul_add_c2: adds the product a * b into (c2, c1, c0) twice.
		* The first pass works on a copy of hi (tt) so that the unmodified hi
		* is still available for the second pass.
		*/
338		#define mul_add_c2(a, b, c0, c1, c2) \
339		do { \
340		BN_ULONG ta = (a), tb = (b); \
341		BN_ULONG lo, hi, tt; \
342		BN_UMULT_LOHI(lo, hi, ta, tb); \
343		c0 += lo; \
344		tt = hi + ((c0 < lo) ? 1 : 0); \
345		c1 += tt; \
346		c2 += (c1 < tt) ? 1 : 0; \
347		c0 += lo; \
348		hi += (c0 < lo) ? 1 : 0; \
349		c1 += hi; \
350		c2 += (c1 < hi) ? 1 : 0; \
351		} while (0)
352
		/*
		* Reference sqr_add_c: same as mul_add_c with both factors equal to
		* (a)[i].
		*/
353		#define sqr_add_c(a, i, c0, c1, c2) \
354		do { \
355		BN_ULONG ta = (a)[i]; \
356		BN_ULONG lo, hi; \
357		BN_UMULT_LOHI(lo, hi, ta, ta); \
358		c0 += lo; \
359		hi += (c0 < lo) ? 1 : 0; \
360		c1 += hi; \
361		c2 += (c1 < hi) ? 1 : 0; \
362		} while (0)
363		#else
		/*
		* mul_add_c(a, b, c0, c1, c2): (c2, c1, c0) += a * b.
		* - 1st asm: mulq multiplies rax (a) by the memory operand b and leaves
		* the product in rdx:rax (t2:t1). Because of the "m" constraint, b
		* has to be an lvalue.
		* - 2nd asm: c0 += t1; c1 += t2 + carry; c2 += 0 + carry, as one
		* addq/adcq/adcq chain.
		* c0, c1 and c2 must be modifiable lvalues.
		*/
364		#define mul_add_c(a, b, c0, c1, c2) \
365	7.71G	do { \
366	7.71G	BN_ULONG t1, t2; \
367	7.71G	asm("mulq %3" \
368	7.71G	: "=a"(t1), "=d"(t2) \
369	7.71G	: "a"(a), "m"(b) \
370	7.71G	: "cc"); \
371	7.71G	asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
372	7.71G	: "+r"(c0), "+r"(c1), "+r"(c2) \
373	7.71G	: "r"(t1), "r"(t2), "g"(0) \
374	7.71G	: "cc"); \
375	7.71G	} while (0)
376
		/*
		* sqr_add_c(a, i, c0, c1, c2): (c2, c1, c0) += a[i] * a[i].
		* a[i] is loaded into rax and %2 names that same operand, so mulq
		* multiplies rax by itself; the accumulate step is the same as in
		* mul_add_c.
		*/
377		#define sqr_add_c(a, i, c0, c1, c2) \
378	540M	do { \
379	540M	BN_ULONG t1, t2; \
380	540M	asm("mulq %2" \
381	540M	: "=a"(t1), "=d"(t2) \
382	540M	: "a"(a[i]) \
383	540M	: "cc"); \
384	540M	asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
385	540M	: "+r"(c0), "+r"(c1), "+r"(c2) \
386	540M	: "r"(t1), "r"(t2), "g"(0) \
387	540M	: "cc"); \
388	540M	} while (0)
389
		/*
		* mul_add_c2(a, b, c0, c1, c2): (c2, c1, c0) += 2 * a * b.
		* The product is computed once and the three-word accumulate step is
		* then executed twice with the same t2:t1.
		*/
390		#define mul_add_c2(a, b, c0, c1, c2) \
391	1.14G	do { \
392	1.14G	BN_ULONG t1, t2; \
393	1.14G	asm("mulq %3" \
394	1.14G	: "=a"(t1), "=d"(t2) \
395	1.14G	: "a"(a), "m"(b) \
396	1.14G	: "cc"); \
397	1.14G	asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
398	1.14G	: "+r"(c0), "+r"(c1), "+r"(c2) \
399	1.14G	: "r"(t1), "r"(t2), "g"(0) \
400	1.14G	: "cc"); \
401	1.14G	asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
402	1.14G	: "+r"(c0), "+r"(c1), "+r"(c2) \
403	1.14G	: "r"(t1), "r"(t2), "g"(0) \
404	1.14G	: "cc"); \
405	1.14G	} while (0)
406		#endif
407
		/*
		* sqr_add_c2(a, i, j, c0, c1, c2): (c2, c1, c0) += 2 * a[i] * a[j].
		* Thin wrapper that forwards (a)[i] and (a)[j] to mul_add_c2.
		*/
408		#define sqr_add_c2(a, i, j, c0, c1, c2) \
409	1.14G	mul_add_c2((a)[i], (a)[j], c0, c1, c2)
410
411		void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
412	120M	{
		/*
		* Fixed-size 8 x 8 word multiplication: r[0..15] = a[0..7] * b[0..7].
		*
		* r - destination, 16 words written (must be a pointer: the body
		*     subscripts it)
		* a - first factor, 8 words read (must be a pointer: the body
		*     subscripts it)
		* b - second factor, 8 words read
		*
		* The product is built one output word (column) at a time. For column k
		* every partial product a[i] * b[j] with i + j == k is added into a
		* three-word accumulator. The lowest accumulator word is then stored to
		* r[k] and cleared, and the roles of c1, c2, c3 rotate so that the
		* cleared word becomes the top word for the next column.
		*/
413	120M	BN_ULONG c1, c2, c3;
414
415	120M	c1 = 0;
416	120M	c2 = 0;
417	120M	c3 = 0;
418	120M	mul_add_c(a[0], b[0], c1, c2, c3);
419	120M	r[0] = c1;
420	120M	c1 = 0;
421	120M	mul_add_c(a[0], b[1], c2, c3, c1);
422	120M	mul_add_c(a[1], b[0], c2, c3, c1);
423	120M	r[1] = c2;
424	120M	c2 = 0;
425	120M	mul_add_c(a[2], b[0], c3, c1, c2);
426	120M	mul_add_c(a[1], b[1], c3, c1, c2);
427	120M	mul_add_c(a[0], b[2], c3, c1, c2);
428	120M	r[2] = c3;
429	120M	c3 = 0;
430	120M	mul_add_c(a[0], b[3], c1, c2, c3);
431	120M	mul_add_c(a[1], b[2], c1, c2, c3);
432	120M	mul_add_c(a[2], b[1], c1, c2, c3);
433	120M	mul_add_c(a[3], b[0], c1, c2, c3);
434	120M	r[3] = c1;
435	120M	c1 = 0;
436	120M	mul_add_c(a[4], b[0], c2, c3, c1);
437	120M	mul_add_c(a[3], b[1], c2, c3, c1);
438	120M	mul_add_c(a[2], b[2], c2, c3, c1);
439	120M	mul_add_c(a[1], b[3], c2, c3, c1);
440	120M	mul_add_c(a[0], b[4], c2, c3, c1);
441	120M	r[4] = c2;
442	120M	c2 = 0;
443	120M	mul_add_c(a[0], b[5], c3, c1, c2);
444	120M	mul_add_c(a[1], b[4], c3, c1, c2);
445	120M	mul_add_c(a[2], b[3], c3, c1, c2);
446	120M	mul_add_c(a[3], b[2], c3, c1, c2);
447	120M	mul_add_c(a[4], b[1], c3, c1, c2);
448	120M	mul_add_c(a[5], b[0], c3, c1, c2);
449	120M	r[5] = c3;
450	120M	c3 = 0;
451	120M	mul_add_c(a[6], b[0], c1, c2, c3);
452	120M	mul_add_c(a[5], b[1], c1, c2, c3);
453	120M	mul_add_c(a[4], b[2], c1, c2, c3);
454	120M	mul_add_c(a[3], b[3], c1, c2, c3);
455	120M	mul_add_c(a[2], b[4], c1, c2, c3);
456	120M	mul_add_c(a[1], b[5], c1, c2, c3);
457	120M	mul_add_c(a[0], b[6], c1, c2, c3);
458	120M	r[6] = c1;
459	120M	c1 = 0;
460	120M	mul_add_c(a[0], b[7], c2, c3, c1);
461	120M	mul_add_c(a[1], b[6], c2, c3, c1);
462	120M	mul_add_c(a[2], b[5], c2, c3, c1);
463	120M	mul_add_c(a[3], b[4], c2, c3, c1);
464	120M	mul_add_c(a[4], b[3], c2, c3, c1);
465	120M	mul_add_c(a[5], b[2], c2, c3, c1);
466	120M	mul_add_c(a[6], b[1], c2, c3, c1);
467	120M	mul_add_c(a[7], b[0], c2, c3, c1);
468	120M	r[7] = c2;
469	120M	c2 = 0;
470	120M	mul_add_c(a[7], b[1], c3, c1, c2);
471	120M	mul_add_c(a[6], b[2], c3, c1, c2);
472	120M	mul_add_c(a[5], b[3], c3, c1, c2);
473	120M	mul_add_c(a[4], b[4], c3, c1, c2);
474	120M	mul_add_c(a[3], b[5], c3, c1, c2);
475	120M	mul_add_c(a[2], b[6], c3, c1, c2);
476	120M	mul_add_c(a[1], b[7], c3, c1, c2);
477	120M	r[8] = c3;
478	120M	c3 = 0;
479	120M	mul_add_c(a[2], b[7], c1, c2, c3);
480	120M	mul_add_c(a[3], b[6], c1, c2, c3);
481	120M	mul_add_c(a[4], b[5], c1, c2, c3);
482	120M	mul_add_c(a[5], b[4], c1, c2, c3);
483	120M	mul_add_c(a[6], b[3], c1, c2, c3);
484	120M	mul_add_c(a[7], b[2], c1, c2, c3);
485	120M	r[9] = c1;
486	120M	c1 = 0;
487	120M	mul_add_c(a[7], b[3], c2, c3, c1);
488	120M	mul_add_c(a[6], b[4], c2, c3, c1);
489	120M	mul_add_c(a[5], b[5], c2, c3, c1);
490	120M	mul_add_c(a[4], b[6], c2, c3, c1);
491	120M	mul_add_c(a[3], b[7], c2, c3, c1);
492	120M	r[10] = c2;
493	120M	c2 = 0;
494	120M	mul_add_c(a[4], b[7], c3, c1, c2);
495	120M	mul_add_c(a[5], b[6], c3, c1, c2);
496	120M	mul_add_c(a[6], b[5], c3, c1, c2);
497	120M	mul_add_c(a[7], b[4], c3, c1, c2);
498	120M	r[11] = c3;
499	120M	c3 = 0;
500	120M	mul_add_c(a[7], b[5], c1, c2, c3);
501	120M	mul_add_c(a[6], b[6], c1, c2, c3);
502	120M	mul_add_c(a[5], b[7], c1, c2, c3);
503	120M	r[12] = c1;
504	120M	c1 = 0;
505	120M	mul_add_c(a[6], b[7], c2, c3, c1);
506	120M	mul_add_c(a[7], b[6], c2, c3, c1);
507	120M	r[13] = c2;
508	120M	c2 = 0;
509	120M	mul_add_c(a[7], b[7], c3, c1, c2);
510	120M	r[14] = c3;
511	120M	r[15] = c1;
512	120M	}
513
514		void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
515	0	{
		/*
		* Fixed-size 4 x 4 word multiplication: r[0..7] = a[0..3] * b[0..3].
		*
		* r - destination, 8 words written (must be a pointer: the body
		*     subscripts it)
		* a - first factor, 4 words read (must be a pointer: the body
		*     subscripts it)
		* b - second factor, 4 words read
		*
		* For output word k every partial product a[i] * b[j] with i + j == k
		* is added into a three-word accumulator. The lowest word is stored to
		* r[k] and cleared, and the roles of c1, c2, c3 rotate for the next
		* column.
		*/
516	0	BN_ULONG c1, c2, c3;
517
518	0	c1 = 0;
519	0	c2 = 0;
520	0	c3 = 0;
521	0	mul_add_c(a[0], b[0], c1, c2, c3);
522	0	r[0] = c1;
523	0	c1 = 0;
524	0	mul_add_c(a[0], b[1], c2, c3, c1);
525	0	mul_add_c(a[1], b[0], c2, c3, c1);
526	0	r[1] = c2;
527	0	c2 = 0;
528	0	mul_add_c(a[2], b[0], c3, c1, c2);
529	0	mul_add_c(a[1], b[1], c3, c1, c2);
530	0	mul_add_c(a[0], b[2], c3, c1, c2);
531	0	r[2] = c3;
532	0	c3 = 0;
533	0	mul_add_c(a[0], b[3], c1, c2, c3);
534	0	mul_add_c(a[1], b[2], c1, c2, c3);
535	0	mul_add_c(a[2], b[1], c1, c2, c3);
536	0	mul_add_c(a[3], b[0], c1, c2, c3);
537	0	r[3] = c1;
538	0	c1 = 0;
539	0	mul_add_c(a[3], b[1], c2, c3, c1);
540	0	mul_add_c(a[2], b[2], c2, c3, c1);
541	0	mul_add_c(a[1], b[3], c2, c3, c1);
542	0	r[4] = c2;
543	0	c2 = 0;
544	0	mul_add_c(a[2], b[3], c3, c1, c2);
545	0	mul_add_c(a[3], b[2], c3, c1, c2);
546	0	r[5] = c3;
547	0	c3 = 0;
548	0	mul_add_c(a[3], b[3], c1, c2, c3);
549	0	r[6] = c1;
550	0	r[7] = c2;
551	0	}
552
553		void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
554	20.6M	{
		/*
		* Fixed-size 8-word squaring: r[0..15] = a[0..7] * a[0..7].
		*
		* r - destination, 16 words written (must be a pointer: the body
		*     subscripts it)
		* a - input, 8 words read (must be a pointer: the macros subscript it)
		*
		* Built column by column like the comba multiply, but each
		* off-diagonal pair (i != j) is multiplied once and added twice via
		* sqr_add_c2, while the diagonal terms a[i]^2 are added once via
		* sqr_add_c. After each column the lowest accumulator word is stored
		* to r[k] and cleared, and the roles of c1, c2, c3 rotate.
		*/
555	20.6M	BN_ULONG c1, c2, c3;
556
557	20.6M	c1 = 0;
558	20.6M	c2 = 0;
559	20.6M	c3 = 0;
560	20.6M	sqr_add_c(a, 0, c1, c2, c3);
561	20.6M	r[0] = c1;
562	20.6M	c1 = 0;
563	20.6M	sqr_add_c2(a, 1, 0, c2, c3, c1);
564	20.6M	r[1] = c2;
565	20.6M	c2 = 0;
566	20.6M	sqr_add_c(a, 1, c3, c1, c2);
567	20.6M	sqr_add_c2(a, 2, 0, c3, c1, c2);
568	20.6M	r[2] = c3;
569	20.6M	c3 = 0;
570	20.6M	sqr_add_c2(a, 3, 0, c1, c2, c3);
571	20.6M	sqr_add_c2(a, 2, 1, c1, c2, c3);
572	20.6M	r[3] = c1;
573	20.6M	c1 = 0;
574	20.6M	sqr_add_c(a, 2, c2, c3, c1);
575	20.6M	sqr_add_c2(a, 3, 1, c2, c3, c1);
576	20.6M	sqr_add_c2(a, 4, 0, c2, c3, c1);
577	20.6M	r[4] = c2;
578	20.6M	c2 = 0;
579	20.6M	sqr_add_c2(a, 5, 0, c3, c1, c2);
580	20.6M	sqr_add_c2(a, 4, 1, c3, c1, c2);
581	20.6M	sqr_add_c2(a, 3, 2, c3, c1, c2);
582	20.6M	r[5] = c3;
583	20.6M	c3 = 0;
584	20.6M	sqr_add_c(a, 3, c1, c2, c3);
585	20.6M	sqr_add_c2(a, 4, 2, c1, c2, c3);
586	20.6M	sqr_add_c2(a, 5, 1, c1, c2, c3);
587	20.6M	sqr_add_c2(a, 6, 0, c1, c2, c3);
588	20.6M	r[6] = c1;
589	20.6M	c1 = 0;
590	20.6M	sqr_add_c2(a, 7, 0, c2, c3, c1);
591	20.6M	sqr_add_c2(a, 6, 1, c2, c3, c1);
592	20.6M	sqr_add_c2(a, 5, 2, c2, c3, c1);
593	20.6M	sqr_add_c2(a, 4, 3, c2, c3, c1);
594	20.6M	r[7] = c2;
595	20.6M	c2 = 0;
596	20.6M	sqr_add_c(a, 4, c3, c1, c2);
597	20.6M	sqr_add_c2(a, 5, 3, c3, c1, c2);
598	20.6M	sqr_add_c2(a, 6, 2, c3, c1, c2);
599	20.6M	sqr_add_c2(a, 7, 1, c3, c1, c2);
600	20.6M	r[8] = c3;
601	20.6M	c3 = 0;
602	20.6M	sqr_add_c2(a, 7, 2, c1, c2, c3);
603	20.6M	sqr_add_c2(a, 6, 3, c1, c2, c3);
604	20.6M	sqr_add_c2(a, 5, 4, c1, c2, c3);
605	20.6M	r[9] = c1;
606	20.6M	c1 = 0;
607	20.6M	sqr_add_c(a, 5, c2, c3, c1);
608	20.6M	sqr_add_c2(a, 6, 4, c2, c3, c1);
609	20.6M	sqr_add_c2(a, 7, 3, c2, c3, c1);
610	20.6M	r[10] = c2;
611	20.6M	c2 = 0;
612	20.6M	sqr_add_c2(a, 7, 4, c3, c1, c2);
613	20.6M	sqr_add_c2(a, 6, 5, c3, c1, c2);
614	20.6M	r[11] = c3;
615	20.6M	c3 = 0;
616	20.6M	sqr_add_c(a, 6, c1, c2, c3);
617	20.6M	sqr_add_c2(a, 7, 5, c1, c2, c3);
618	20.6M	r[12] = c1;
619	20.6M	c1 = 0;
620	20.6M	sqr_add_c2(a, 7, 6, c2, c3, c1);
621	20.6M	r[13] = c2;
622	20.6M	c2 = 0;
623	20.6M	sqr_add_c(a, 7, c3, c1, c2);
624	20.6M	r[14] = c3;
625	20.6M	r[15] = c1;
626	20.6M	}
627
628		void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
629	93.7M	{
		/*
		* Fixed-size 4-word squaring: r[0..7] = a[0..3] * a[0..3].
		*
		* r - destination, 8 words written (must be a pointer: the body
		*     subscripts it)
		* a - input, 4 words read (must be a pointer: the macros subscript it)
		*
		* Each off-diagonal pair (i != j) is multiplied once and added twice
		* via sqr_add_c2; diagonal terms a[i]^2 are added once via sqr_add_c.
		* After each column the lowest accumulator word is stored to r[k] and
		* cleared, and the roles of c1, c2, c3 rotate.
		*/
630	93.7M	BN_ULONG c1, c2, c3;
631
632	93.7M	c1 = 0;
633	93.7M	c2 = 0;
634	93.7M	c3 = 0;
635	93.7M	sqr_add_c(a, 0, c1, c2, c3);
636	93.7M	r[0] = c1;
637	93.7M	c1 = 0;
638	93.7M	sqr_add_c2(a, 1, 0, c2, c3, c1);
639	93.7M	r[1] = c2;
640	93.7M	c2 = 0;
641	93.7M	sqr_add_c(a, 1, c3, c1, c2);
642	93.7M	sqr_add_c2(a, 2, 0, c3, c1, c2);
643	93.7M	r[2] = c3;
644	93.7M	c3 = 0;
645	93.7M	sqr_add_c2(a, 3, 0, c1, c2, c3);
646	93.7M	sqr_add_c2(a, 2, 1, c1, c2, c3);
647	93.7M	r[3] = c1;
648	93.7M	c1 = 0;
649	93.7M	sqr_add_c(a, 2, c2, c3, c1);
650	93.7M	sqr_add_c2(a, 3, 1, c2, c3, c1);
651	93.7M	r[4] = c2;
652	93.7M	c2 = 0;
653	93.7M	sqr_add_c2(a, 3, 2, c3, c1, c2);
654	93.7M	r[5] = c3;
655	93.7M	c3 = 0;
656	93.7M	sqr_add_c(a, 3, c1, c2, c3);
657	93.7M	r[6] = c1;
658	93.7M	r[7] = c2;
659	93.7M	}
660		#endif

Coverage Report

Created: 2026-04-01 06:39