Coverage Report

Created: 2026-02-14 07:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/wolfssl-openssl-api/wolfcrypt/src/sp_int.c
Line
Count
Source
1
/* sp_int.c
2
 *
3
 * Copyright (C) 2006-2025 wolfSSL Inc.
4
 *
5
 * This file is part of wolfSSL.
6
 *
7
 * wolfSSL is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
 * the Free Software Foundation; either version 3 of the License, or
10
 * (at your option) any later version.
11
 *
12
 * wolfSSL is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU General Public License
18
 * along with this program; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
20
 */
21
22
/* Implementation by Sean Parkinson. */
23
24
/*
25
DESCRIPTION
26
This library provides single precision (SP) integer math functions.
27
28
*/
29
30
#include <wolfssl/wolfcrypt/libwolfssl_sources.h>
31
32
#if defined(WOLFSSL_SP_MATH) || defined(WOLFSSL_SP_MATH_ALL)
33
34
#ifdef NO_INLINE
35
    #include <wolfssl/wolfcrypt/misc.h>
36
#else
37
    #define WOLFSSL_MISC_INCLUDED
38
    #include <wolfcrypt/src/misc.c>
39
#endif
40
41
/* SP Build Options:
42
 * WOLFSSL_HAVE_SP_RSA:         Enable SP RSA support
43
 * WOLFSSL_HAVE_SP_DH:          Enable SP DH support
44
 * WOLFSSL_HAVE_SP_ECC:         Enable SP ECC support
45
 * WOLFSSL_SP_MATH:             Use only single precision math and algorithms
46
 *      it supports (no fastmath tfm.c or normal integer.c)
47
 * WOLFSSL_SP_MATH_ALL          Implementation of all MP functions
48
 *      (replacement for tfm.c and integer.c)
49
 * WOLFSSL_SP_SMALL:            Use smaller version of code and avoid large
50
 *      stack variables
51
 * WOLFSSL_SP_NO_MALLOC:        Always use stack, no heap XMALLOC/XFREE allowed
52
 * WOLFSSL_SP_NO_2048:          Disable RSA/DH 2048-bit support
53
 * WOLFSSL_SP_NO_3072:          Disable RSA/DH 3072-bit support
54
 * WOLFSSL_SP_4096:             Enable RSA/DH 4096-bit support
55
 * WOLFSSL_SP_NO_256            Disable ECC 256-bit SECP256R1 support
56
 * WOLFSSL_SP_384               Enable ECC 384-bit SECP384R1 support
57
 * WOLFSSL_SP_521               Enable ECC 521-bit SECP521R1 support
58
 * WOLFSSL_SP_ASM               Enable assembly speedups (detect platform)
59
 * WOLFSSL_SP_X86_64_ASM        Enable Intel x64 assembly implementation
60
 * WOLFSSL_SP_ARM32_ASM         Enable Aarch32 assembly implementation
61
 * WOLFSSL_SP_ARM64_ASM         Enable Aarch64 assembly implementation
62
 * WOLFSSL_SP_ARM_CORTEX_M_ASM  Enable Cortex-M assembly implementation
63
 * WOLFSSL_SP_ARM_THUMB_ASM     Enable ARM Thumb assembly implementation
64
 *      (used with -mthumb)
65
 * WOLFSSL_SP_X86_64            Enable Intel x86 64-bit assembly speedups
66
 * WOLFSSL_SP_X86               Enable Intel x86 assembly speedups
67
 * WOLFSSL_SP_ARM64             Enable Aarch64 assembly speedups
68
 * WOLFSSL_SP_ARM32             Enable ARM32 assembly speedups
69
 * WOLFSSL_SP_ARM32_UDIV        Enable word divide asm that uses UDIV instr
70
 * WOLFSSL_SP_ARM_THUMB         Enable ARM Thumb assembly speedups
71
 *                              (explicitly uses register 'r7')
72
 * WOLFSSL_SP_PPC64             Enable PPC64 assembly speedups
73
 * WOLFSSL_SP_PPC               Enable PPC assembly speedups
74
 * WOLFSSL_SP_MIPS64            Enable MIPS64 assembly speedups
75
 * WOLFSSL_SP_MIPS              Enable MIPS assembly speedups
76
 * WOLFSSL_SP_RISCV64           Enable RISCV64 assembly speedups
77
 * WOLFSSL_SP_RISCV32           Enable RISCV32 assembly speedups
78
 * WOLFSSL_SP_S390X             Enable S390X assembly speedups
79
 * SP_WORD_SIZE                 Force 32 or 64 bit mode
80
 * WOLFSSL_SP_NONBLOCK          Enables "non blocking" mode for SP math, which
81
 *      will return FP_WOULDBLOCK for long operations and function must be
82
 *      called again until complete.
83
 * WOLFSSL_SP_FAST_NCT_EXPTMOD  Enables the faster non-constant time modular
84
 *      exponentiation implementation.
85
 * WOLFSSL_SP_INT_NEGATIVE      Enables negative values to be used.
86
 * WOLFSSL_SP_INT_DIGIT_ALIGN   Enable when unaligned access of sp_int_digit
87
 *                              pointer is not allowed.
88
 * WOLFSSL_SP_NO_DYN_STACK      Disable use of dynamic stack items.
89
 *                              Dynamic arrays used when not small stack.
90
 * WOLFSSL_SP_FAST_MODEXP       Allow fast mod_exp with small C code
91
 * WOLFSSL_SP_LOW_MEM           Use algorithms that use less memory.
92
 */
93
94
/* TODO: WOLFSSL_SP_SMALL is incompatible with clang-12+ -Os.
 *
 * Workaround: when compiling with clang 12 or later, WOLFSSL_SP_SMALL is
 * undefined so that the rest of this file is built without it. The condition
 * tests only the compiler and its major version - it applies at every
 * optimization level, not just -Os.
 */
95
#if defined(__clang__) && defined(__clang_major__) && \
96
    (__clang_major__ >= 12) && defined(WOLFSSL_SP_SMALL)
97
    #undef WOLFSSL_SP_SMALL
98
#endif
99
100
#include <wolfssl/wolfcrypt/sp_int.h>
101
102
#ifdef WOLFSSL_SP_DYN_STACK
103
/* We are statically declaring a variable smaller than sp_int.
104
 * We track available memory in the 'size' field.
105
 * Disable warnings of sp_int being partly outside array bounds of variable.
 *
 * NOTE(review): the diagnostic state is pushed here; the matching pop is not
 * visible in this part of the file - confirm it is restored further down.
106
 */
107
    PRAGMA_GCC_DIAG_PUSH
108
    PRAGMA_GCC("GCC diagnostic ignored \"-Warray-bounds\"")
109
#endif
110
111
#if defined(WOLFSSL_USE_SAVE_VECTOR_REGISTERS) && !defined(WOLFSSL_SP_ASM)
112
    /* Force off unneeded vector register save/restore.
     * When WOLFSSL_SP_ASM is not defined, both macros are redefined to forward
     * to the SAVE_NO_VECTOR_REGISTERS / RESTORE_NO_VECTOR_REGISTERS variants
     * (not defined in this part of the file).
     */
113
    #undef SAVE_VECTOR_REGISTERS
114
    #define SAVE_VECTOR_REGISTERS(fail_clause) SAVE_NO_VECTOR_REGISTERS(fail_clause)
115
    #undef RESTORE_VECTOR_REGISTERS
116
    #define RESTORE_VECTOR_REGISTERS() RESTORE_NO_VECTOR_REGISTERS()
117
#endif
118
119
/* DECL_SP_INT: Declare one variable of type 'sp_int'.
 *
 * n  Name of the variable to declare. In every variant it can be used as a
 *    pointer to an sp_int.
 * s  Size passed to MP_INT_SIZEOF_DIGITS() in the dynamic stack variant;
 *    not used by the other two variants.
 *
 * No trailing semicolon is emitted - the user of the macro supplies it.
 */
120
#if (defined(WOLFSSL_SMALL_STACK) || defined(SP_ALLOC)) && \
121
    !defined(WOLFSSL_SP_NO_MALLOC)
122
    /* Declare a pointer, initialized to NULL, that will be assigned a value
     * on XMALLOC. */
123
    #define DECL_SP_INT(n, s)   \
124
5.00M
        sp_int* n = NULL
125
#else
126
    #ifdef WOLFSSL_SP_DYN_STACK
127
        /* Declare a digit array, n##d, on the stack with the required data
         * size and a pointer, n, to it that is cast to sp_int*. */
128
        #define DECL_SP_INT(n, s)                       \
129
            sp_int_digit n##d[MP_INT_SIZEOF_DIGITS(s)]; \
130
            sp_int* (n) = (sp_int*)n##d
131
    #else
132
        /* Declare a one element array of sp_int on the stack. */
133
        #define DECL_SP_INT(n, s)               \
134
            sp_int n[1]
135
    #endif
136
#endif
137
138
/* ALLOC_SP_INT: Allocate an 'sp_int' of required size.
 *
 * n    sp_int pointer: assigned the XMALLOC result in the heap variant, not
 *      touched by the stack variant of ALLOC_SP_INT.
 * s    Requested size; a value greater than SP_INT_DIGITS is rejected.
 * err  Error variable. Nothing is done unless it is MP_OKAY on entry. Set to
 *      MP_VAL when s is too large and, in the heap variant, to MP_MEM when
 *      XMALLOC returns NULL.
 * h    Heap hint passed to XMALLOC; not used by the stack variant.
 *
 * ALLOC_SP_INT_SIZE does the same and, when err is still MP_OKAY, stores s in
 * the 'size' field of n.
 *
 * NOTE(review): the stack variant casts SP_INT_DIGITS to int before comparing
 * while the heap variant compares it uncast - confirm this is intended.
 */
139
#if (defined(WOLFSSL_SMALL_STACK) || defined(SP_ALLOC)) && \
140
    !defined(WOLFSSL_SP_NO_MALLOC)
141
    /* Dynamically allocate just enough data to support size. */
142
    #define ALLOC_SP_INT(n, s, err, h)                                         \
143
5.00M
    do {                                                                       \
144
5.00M
        if (((err) == MP_OKAY) && ((s) > SP_INT_DIGITS)) {                     \
145
10
            (err) = MP_VAL;                                                    \
146
10
        }                                                                      \
147
5.00M
        if ((err) == MP_OKAY) {                                                \
148
5.00M
            (n) = (sp_int*)XMALLOC(MP_INT_SIZEOF(s), (h),                      \
149
5.00M
                DYNAMIC_TYPE_BIGINT);                                          \
150
5.00M
            if ((n) == NULL) {                                                 \
151
1.12k
                (err) = MP_MEM;                                                \
152
1.12k
            }                                                                  \
153
5.00M
        }                                                                      \
154
5.00M
    }                                                                          \
155
5.00M
    while (0)
156
157
    /* Dynamically allocate just enough data to support size - and set size. */
158
    #define ALLOC_SP_INT_SIZE(n, s, err, h)                                    \
159
4.95M
    do {                                                                       \
160
4.95M
        ALLOC_SP_INT(n, s, err, h);                                            \
161
4.95M
        if ((err) == MP_OKAY) {                                                \
162
4.95M
            (n)->size = (sp_size_t)(s);                                        \
163
4.95M
        }                                                                      \
164
4.95M
    }                                                                          \
165
4.95M
    while (0)
166
#else
167
    /* Array declared on stack - check size is valid. */
168
    #define ALLOC_SP_INT(n, s, err, h)                                         \
169
    do {                                                                       \
170
        if (((err) == MP_OKAY) && ((s) > (int)SP_INT_DIGITS)) {                \
171
            (err) = MP_VAL;                                                    \
172
        }                                                                      \
173
    }                                                                          \
174
    while (0)
175
176
    /* Array declared on stack - set the size field. */
177
    #define ALLOC_SP_INT_SIZE(n, s, err, h)                                    \
178
    do {                                                                       \
179
        ALLOC_SP_INT(n, s, err, h);                                            \
180
        if ((err) == MP_OKAY) {                                                \
181
            (n)->size = (sp_size_t)(s);                                        \
182
        }                                                                      \
183
    }                                                                          \
184
    while (0)
185
#endif
186
187
/* FREE_SP_INT: Free an 'sp_int' variable.
 *
 * n  sp_int pointer to dispose of. May be NULL in the heap variant.
 * h  Heap hint passed to XFREE; not used by the stack variant.
 *
 * The pointer itself is not reset to NULL after being freed.
 */
188
#if (defined(WOLFSSL_SMALL_STACK) || defined(SP_ALLOC)) && \
189
    !defined(WOLFSSL_SP_NO_MALLOC)
190
    /* Free dynamically allocated data - XFREE is only called when the pointer
     * is not NULL. */
191
    #define FREE_SP_INT(n, h)                   \
192
5.00M
    do {                                        \
193
5.00M
        if ((n) != NULL) {                      \
194
5.00M
            XFREE(n, h, DYNAMIC_TYPE_BIGINT);   \
195
5.00M
        }                                       \
196
5.00M
    }                                           \
197
5.00M
    while (0)
198
#else
199
    /* Nothing to do as declared on stack. */
200
    #define FREE_SP_INT(n, h) WC_DO_NOTHING
201
#endif
202
203
204
/* Declare an array of 'sp_int' pointers; data is assigned on XMALLOC.
 *
 * Declares three variables:
 *   n##d           Pointer to the data, initialized to NULL.
 *   n              Array of c pointers to sp_int.
 *   n##_dummy_var  Holds the result of XMEMSET; its initializer zeroes n
 *                  while remaining a declaration.
 *
 * s is not used by this macro.
 * No trailing semicolon is emitted - the user of the macro supplies it.
 */
205
#define DECL_DYN_SP_INT_ARRAY(n, s, c)               \
206
22.2M
    sp_int* n##d = NULL;                             \
207
22.2M
    sp_int* (n)[c];                                  \
208
22.2M
    void *n ## _dummy_var = XMEMSET(n, 0, sizeof(n))
209
210
/* DECL_SP_INT_ARRAY: Declare array of 'sp_int'.
 *
 * n  Name of the array of sp_int pointers; the backing data is named n##d.
 * s  Size passed to MP_INT_SIZEOF_DIGITS() in the dynamic stack variant;
 *    not used by the fixed size stack variant.
 * c  Number of elements in the array.
 */
211
#if (defined(WOLFSSL_SMALL_STACK) || defined(SP_ALLOC)) && \
212
    !defined(WOLFSSL_SP_NO_MALLOC)
213
    /* Declare a variable that will be assigned a value on XMALLOC. */
214
    #define DECL_SP_INT_ARRAY(n, s, c)  \
215
22.2M
        DECL_DYN_SP_INT_ARRAY(n, s, c)
216
#elif defined(WOLFSSL_SP_DYN_STACK)
217
    /* Declare a digit array on the stack with the required data size for c
     * values and an array of pointers initialized to NULL. */
218
    #define DECL_SP_INT_ARRAY(n, s, c)                    \
219
        sp_int_digit n##d[MP_INT_SIZEOF_DIGITS(s) * (c)]; \
220
        sp_int* (n)[c] = { NULL, }
221
#else
222
    /* Declare c full sp_ints on the stack and an array of pointers.
     * The pointer array is not initialized by this variant. */
223
    #define DECL_SP_INT_ARRAY(n, s, c)      \
224
        sp_int n##d[c];                     \
225
        sp_int* (n)[c]
226
#endif
227
228
/* Dynamically allocate just enough data to support multiple sp_ints of the
229
 * required size. Use pointers into data to make up array and set sizes.
 *
 * n    Array of sp_int pointers; n##d and n##_dummy_var must also exist.
 * s    Size of each sp_int; greater than SP_INT_DIGITS results in MP_VAL.
 * c    Number of sp_ints.
 * err  Error variable. Nothing is done unless it is MP_OKAY on entry. Set to
 *      MP_VAL when s is too large or MP_MEM when XMALLOC returns NULL.
 * h    Heap hint passed to XMALLOC.
 *
 * A single block of MP_INT_SIZEOF(s) * (c) bytes is allocated into n##d.
 * n[0] is the start of the block and each following entry is
 * MP_INT_NEXT() of the previous one. Every entry has its 'size' set to s.
230
 */
231
14.4M
#define ALLOC_DYN_SP_INT_ARRAY(n, s, c, err, h)                                \
232
14.4M
do {                                                                           \
233
14.4M
    (void)n ## _dummy_var;                                                     \
234
14.4M
    if (((err) == MP_OKAY) && ((s) > SP_INT_DIGITS)) {                         \
235
0
        (err) = MP_VAL;                                                        \
236
0
    }                                                                          \
237
14.4M
    if ((err) == MP_OKAY) {                                                    \
238
14.4M
        n##d = (sp_int*)XMALLOC(MP_INT_SIZEOF(s) * (c), (h),                   \
239
14.4M
                                                         DYNAMIC_TYPE_BIGINT); \
240
14.4M
        if (n##d == NULL) {                                                    \
241
1.64k
            (err) = MP_MEM;                                                    \
242
1.64k
        }                                                                      \
243
14.4M
        else {                                                                 \
244
14.4M
            int n##ii;                                                         \
245
14.4M
            (n)[0] = n##d;                                                     \
246
14.4M
            (n)[0]->size = (sp_size_t)(s);                                     \
247
36.5M
            for (n##ii = 1; n##ii < (int)(c); n##ii++) {                       \
248
22.1M
                (n)[n##ii] = MP_INT_NEXT((n)[n##ii-1], s);                     \
249
22.1M
                (n)[n##ii]->size = (sp_size_t)(s);                             \
250
22.1M
            }                                                                  \
251
14.4M
        }                                                                      \
252
14.4M
    }                                                                          \
253
14.4M
}                                                                              \
254
14.4M
while (0)
255
256
/* ALLOC_SP_INT_ARRAY: Allocate an array of 'sp_int's of required size.
 *
 * n    Array of sp_int pointers to fill in; backing data is n##d.
 * s    Size of each sp_int; greater than SP_INT_DIGITS results in MP_VAL.
 * c    Number of sp_ints.
 * err  Error variable. Nothing is done unless it is MP_OKAY on entry.
 * h    Heap hint; only used by the heap variant.
 */
257
#if (defined(WOLFSSL_SMALL_STACK) || defined(SP_ALLOC)) && \
258
    !defined(WOLFSSL_SP_NO_MALLOC)
    /* Heap variant - forwards to ALLOC_DYN_SP_INT_ARRAY. */
259
    #define ALLOC_SP_INT_ARRAY(n, s, c, err, h) \
260
14.4M
        ALLOC_DYN_SP_INT_ARRAY(n, s, c, err, h)
261
#elif defined(WOLFSSL_SP_DYN_STACK)
262
    /* Data declared on stack that supports multiple sp_ints of the
263
     * required size. Use pointers into data to make up array and set sizes.
     * The 'size' field is written through a cast to sp_int_minimal*.
264
     */
265
    #define ALLOC_SP_INT_ARRAY(n, s, c, err, h)                                \
266
    do {                                                                       \
267
        if (((err) == MP_OKAY) && ((s) > SP_INT_DIGITS)) {                     \
268
            (err) = MP_VAL;                                                    \
269
        }                                                                      \
270
        if ((err) == MP_OKAY) {                                                \
271
            int n##ii;                                                         \
272
            (n)[0] = (sp_int*)n##d;                                            \
273
            ((sp_int_minimal*)(n)[0])->size = (sp_size_t)(s);                  \
274
            for (n##ii = 1; n##ii < (int)(c); n##ii++) {                       \
275
                (n)[n##ii] = MP_INT_NEXT((n)[n##ii-1], s);                     \
276
                ((sp_int_minimal*)(n)[n##ii])->size = (sp_size_t)(s);          \
277
            }                                                                  \
278
        }                                                                      \
279
    }                                                                          \
280
    while (0)
281
#else
282
    /* Data declared on stack that supports multiple sp_ints of the
283
     * required size. Set into array and set sizes.
     * Each pointer is the address of the matching element of n##d.
284
     */
285
    #define ALLOC_SP_INT_ARRAY(n, s, c, err, h)                                \
286
    do {                                                                       \
287
        if (((err) == MP_OKAY) && ((s) > SP_INT_DIGITS)) {                     \
288
            (err) = MP_VAL;                                                    \
289
        }                                                                      \
290
        if ((err) == MP_OKAY) {                                                \
291
            int n##ii;                                                         \
292
            for (n##ii = 0; n##ii < (int)(c); n##ii++) {                       \
293
                (n)[n##ii] = &n##d[n##ii];                                     \
294
                (n)[n##ii]->size = (sp_size_t)(s);                             \
295
            }                                                                  \
296
        }                                                                      \
297
    }                                                                          \
298
    while (0)
299
#endif
300
301
/* Free data variable that was dynamically allocated.
 *
 * n  Name of the array; the data pointer freed is n##d.
 * h  Heap hint passed to XFREE.
 *
 * XFREE is only called when n##d is not NULL. Neither n##d nor the entries of
 * n are reset after the data is freed.
 */
302
22.2M
#define FREE_DYN_SP_INT_ARRAY(n, h)             \
303
22.2M
do {                                            \
304
22.2M
    if (n##d != NULL) {                         \
305
14.4M
        XFREE(n##d, h, DYNAMIC_TYPE_BIGINT);    \
306
14.4M
    }                                           \
307
22.2M
}                                               \
308
22.2M
while (0)
309
310
/* FREE_SP_INT_ARRAY: Free an array of 'sp_int'.
 *
 * n  Name of the array.
 * h  Heap hint; only used by the heap variant.
 */
311
#if (defined(WOLFSSL_SMALL_STACK) || defined(SP_ALLOC)) && \
312
    !defined(WOLFSSL_SP_NO_MALLOC)
    /* Heap variant - forwards to FREE_DYN_SP_INT_ARRAY. */
313
    #define FREE_SP_INT_ARRAY(n, h)                 \
314
22.2M
        FREE_DYN_SP_INT_ARRAY(n, h)
315
#else
316
    /* Nothing to do as data declared on stack. */
317
    #define FREE_SP_INT_ARRAY(n, h) WC_DO_NOTHING
318
#endif
319
320
321
#ifndef WOLFSSL_NO_ASM
322
    /* Map the __asm__ and __volatile__ spellings onto the keywords defined
     * here for the IAR and Keil compilers. */
    #ifdef __IAR_SYSTEMS_ICC__
323
        #define __asm__        asm
324
        #define __volatile__   volatile
325
    #endif /* __IAR_SYSTEMS_ICC__ */
326
    #ifdef __KEIL__
327
        #define __asm__        __asm
328
        #define __volatile__   volatile
329
    #endif
330
331
    #if defined(WOLFSSL_SP_X86_64) && SP_WORD_SIZE == 64
332
/*
333
 * CPU: x86_64
334
 */
335
336
#ifndef _MSC_VER
337
/* Multiply va by vb and store double size result in: vh | vl
 *
 * vb is loaded into RAX and multiplied by va with MULQ; RAX is then copied to
 * vl and RDX to vh. va and vb may be in a register or memory ("rm").
 * vl and vh are only written by the instructions but are declared as
 * read-write ("+r") register operands.
 * RAX, RDX and the flags are declared as clobbered.
 */
338
#define SP_ASM_MUL(vl, vh, va, vb)                       \
339
163M
    __asm__ __volatile__ (                               \
340
163M
        "movq %[b], %%rax \n\t"                    \
341
163M
        "mulq %[a]    \n\t"                    \
342
163M
        "movq %%rax, %[l] \n\t"                    \
343
163M
        "movq %%rdx, %[h] \n\t"                    \
344
163M
        : [h] "+r" (vh), [l] "+r" (vl)                   \
345
163M
        : [a] "rm" (va), [b] "rm" (vb)                   \
346
163M
        : "%rax", "%rdx", "cc"                           \
347
163M
    )
348
/* Multiply va by vb and store double size result in: vo | vh | vl
 *
 * vo is set to zero after the multiply; RAX is copied to vl and RDX to vh.
 * vo is a write-only ("=r") operand. va and vb are memory-only ("m") operands
 * here, unlike the "rm" inputs of the other multiply macros.
 * RAX, RDX and the flags are declared as clobbered.
 */
349
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb)               \
350
6.11M
    __asm__ __volatile__ (                               \
351
6.11M
        "movq %[b], %%rax \n\t"                    \
352
6.11M
        "mulq %[a]    \n\t"                    \
353
6.11M
        "movq $0   , %[o] \n\t"                    \
354
6.11M
        "movq %%rax, %[l] \n\t"                    \
355
6.11M
        "movq %%rdx, %[h] \n\t"                    \
356
6.11M
        : [l] "+r" (vl), [h] "+r" (vh), [o] "=r" (vo)    \
357
6.11M
        : [a] "m" (va), [b] "m" (vb)                     \
358
6.11M
        : "%rax", "%rdx", "cc"                           \
359
6.11M
    )
360
/* Multiply va by vb and add double size result into: vo | vh | vl
 *
 * RAX is added to vl, RDX plus carry to vh and the remaining carry to vo.
 * va and vb may be in a register or memory ("rm").
 * RAX, RDX and the flags are declared as clobbered.
 */
361
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb)               \
362
2.53G
    __asm__ __volatile__ (                               \
363
2.53G
        "movq %[b], %%rax \n\t"                    \
364
2.53G
        "mulq %[a]    \n\t"                    \
365
2.53G
        "addq %%rax, %[l] \n\t"                    \
366
2.53G
        "adcq %%rdx, %[h] \n\t"                    \
367
2.53G
        "adcq $0   , %[o] \n\t"                    \
368
2.53G
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
369
2.53G
        : [a] "rm" (va), [b] "rm" (vb)                   \
370
2.53G
        : "%rax", "%rdx", "cc"                           \
371
2.53G
    )
372
/* Multiply va by vb and add double size result into: vh | vl
 *
 * RAX is added to vl and RDX plus carry to vh. There is no third word: a
 * carry out of vh is not recorded.
 * RAX, RDX and the flags are declared as clobbered.
 */
373
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb)                \
374
4.99G
    __asm__ __volatile__ (                               \
375
4.99G
        "movq %[b], %%rax \n\t"                    \
376
4.99G
        "mulq %[a]    \n\t"                    \
377
4.99G
        "addq %%rax, %[l] \n\t"                    \
378
4.99G
        "adcq %%rdx, %[h] \n\t"                    \
379
4.99G
        : [l] "+r" (vl), [h] "+r" (vh)                   \
380
4.99G
        : [a] "rm" (va), [b] "rm" (vb)                   \
381
4.99G
        : "%rax", "%rdx", "cc"                           \
382
4.99G
    )
383
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 *
 * The multiply is performed once and the three instruction add sequence is
 * performed twice, with the carry out of vh added into vo each time.
 * RAX, RDX and the flags are declared as clobbered.
 */
384
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb)              \
385
630M
    __asm__ __volatile__ (                               \
386
630M
        "movq %[b], %%rax \n\t"                    \
387
630M
        "mulq %[a]    \n\t"                    \
388
630M
        "addq %%rax, %[l] \n\t"                    \
389
630M
        "adcq %%rdx, %[h] \n\t"                    \
390
630M
        "adcq $0   , %[o] \n\t"                    \
391
630M
        "addq %%rax, %[l] \n\t"                    \
392
630M
        "adcq %%rdx, %[h] \n\t"                    \
393
630M
        "adcq $0   , %[o] \n\t"                    \
394
630M
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
395
630M
        : [a] "rm" (va), [b] "rm" (vb)                   \
396
630M
        : "%rax", "%rdx", "cc"                           \
397
630M
    )
398
/* Multiply va by vb and add double size result twice into: vo | vh | vl
399
 * Assumes first add will not overflow vh | vl
 *
 * Only the carry out of the second add is added into vo; there is no
 * add-with-carry into vo after the first add.
 * RAX, RDX and the flags are declared as clobbered.
400
 */
401
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)           \
402
53.3M
    __asm__ __volatile__ (                               \
403
53.3M
        "movq %[b], %%rax \n\t"                    \
404
53.3M
        "mulq %[a]    \n\t"                    \
405
53.3M
        "addq %%rax, %[l] \n\t"                    \
406
53.3M
        "adcq %%rdx, %[h] \n\t"                    \
407
53.3M
        "addq %%rax, %[l] \n\t"                    \
408
53.3M
        "adcq %%rdx, %[h] \n\t"                    \
409
53.3M
        "adcq $0   , %[o] \n\t"                    \
410
53.3M
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
411
53.3M
        : [a] "rm" (va), [b] "rm" (vb)                   \
412
53.3M
        : "%rax", "%rdx", "cc"                           \
413
53.3M
    )
414
/* Square va and store double size result in: vh | vl
 *
 * va is loaded into RAX and RAX is multiplied by itself with MULQ; RAX is
 * then copied to vl and RDX to vh. va may be in a register or memory ("rm").
 * RAX, RDX and the flags are declared as clobbered.
 */
415
#define SP_ASM_SQR(vl, vh, va)                           \
416
43.5M
    __asm__ __volatile__ (                               \
417
43.5M
        "movq %[a], %%rax \n\t"                    \
418
43.5M
        "mulq %%rax   \n\t"                    \
419
43.5M
        "movq %%rax, %[l] \n\t"                    \
420
43.5M
        "movq %%rdx, %[h] \n\t"                    \
421
43.5M
        : [h] "+r" (vh), [l] "+r" (vl)                   \
422
43.5M
        : [a] "rm" (va)                                  \
423
43.5M
        : "%rax", "%rdx", "cc"                           \
424
43.5M
    )
425
/* Square va and add double size result into: vo | vh | vl
 *
 * RAX is added to vl, RDX plus carry to vh and the remaining carry to vo.
 * RAX, RDX and the flags are declared as clobbered.
 */
426
#define SP_ASM_SQR_ADD(vl, vh, vo, va)                   \
427
143M
    __asm__ __volatile__ (                               \
428
143M
        "movq %[a], %%rax \n\t"                    \
429
143M
        "mulq %%rax   \n\t"                    \
430
143M
        "addq %%rax, %[l] \n\t"                    \
431
143M
        "adcq %%rdx, %[h] \n\t"                    \
432
143M
        "adcq $0   , %[o] \n\t"                    \
433
143M
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
434
143M
        : [a] "rm" (va)                                  \
435
143M
        : "%rax", "%rdx", "cc"                           \
436
143M
    )
437
/* Square va and add double size result into: vh | vl
 *
 * RAX is added to vl and RDX plus carry to vh. There is no third word: a
 * carry out of vh is not recorded.
 * RAX, RDX and the flags are declared as clobbered.
 */
438
#define SP_ASM_SQR_ADD_NO(vl, vh, va)                    \
439
26.6M
    __asm__ __volatile__ (                               \
440
26.6M
        "movq %[a], %%rax \n\t"                    \
441
26.6M
        "mulq %%rax   \n\t"                    \
442
26.6M
        "addq %%rax, %[l] \n\t"                    \
443
26.6M
        "adcq %%rdx, %[h] \n\t"                    \
444
26.6M
        : [l] "+r" (vl), [h] "+r" (vh)                   \
445
26.6M
        : [a] "rm" (va)                                  \
446
26.6M
        : "%rax", "%rdx", "cc"                           \
447
26.6M
    )
448
/* Add va into: vh | vl
 *
 * va is added to vl and the carry is added to vh. va may be in a register or
 * memory ("rm"). Only the flags are declared as clobbered.
 */
449
#define SP_ASM_ADDC(vl, vh, va)                          \
450
5.98G
    __asm__ __volatile__ (                               \
451
5.98G
        "addq %[a], %[l]  \n\t"                    \
452
5.98G
        "adcq $0  , %[h]  \n\t"                    \
453
5.98G
        : [l] "+r" (vl), [h] "+r" (vh)                   \
454
5.98G
        : [a] "rm" (va)                                  \
455
5.98G
        : "cc"                                           \
456
5.98G
    )
457
/* Add va, variable in a register, into: vh | vl
 *
 * Same instructions as SP_ASM_ADDC but va is a register-only ("r") operand.
 * Only the flags are declared as clobbered.
 */
#define SP_ASM_ADDC_REG(vl, vh, va)                      \
458
1.52G
    __asm__ __volatile__ (                               \
459
1.52G
        "addq %[a], %[l]  \n\t"                    \
460
1.52G
        "adcq $0  , %[h]  \n\t"                    \
461
1.52G
        : [l] "+r" (vl), [h] "+r" (vh)                   \
462
1.52G
        : [a] "r" (va)                                   \
463
1.52G
        : "cc"                                           \
464
1.52G
    )
465
/* Sub va from: vh | vl
 *
 * va is subtracted from vl and the borrow is subtracted from vh. va may be in
 * a register or memory ("rm"). Only the flags are declared as clobbered.
 */
466
#define SP_ASM_SUBB(vl, vh, va)                          \
467
1.97G
    __asm__ __volatile__ (                               \
468
1.97G
        "subq %[a], %[l]  \n\t"                    \
469
1.97G
        "sbbq $0  , %[h]  \n\t"                    \
470
1.97G
        : [l] "+r" (vl), [h] "+r" (vh)                   \
471
1.97G
        : [a] "rm" (va)                                  \
472
1.97G
        : "cc"                                           \
473
1.97G
    )
474
/* Sub va, variable in a register, from: vh | vl
 *
 * Same instructions as SP_ASM_SUBB but va is a register-only ("r") operand.
 * Only the flags are declared as clobbered.
 */
475
#define SP_ASM_SUBB_REG(vl, vh, va)                      \
476
315M
    __asm__ __volatile__ (                               \
477
315M
        "subq %[a], %[l]  \n\t"                    \
478
315M
        "sbbq $0  , %[h]  \n\t"                    \
479
315M
        : [l] "+r" (vl), [h] "+r" (vh)                   \
480
315M
        : [a] "r" (va)                                   \
481
315M
        : "cc"                                           \
482
315M
    )
483
/* Add two times vc | vb | va into vo | vh | vl
 *
 * The three word add-with-carry sequence (va to vl, vb to vh, vc to vo) is
 * performed twice. All three inputs are register-only ("r") operands.
 * A carry out of vo is not recorded. Only the flags are declared as clobbered.
 */
484
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc)         \
485
6.11M
    __asm__ __volatile__ (                               \
486
6.11M
        "addq %[a], %[l]  \n\t"                    \
487
6.11M
        "adcq %[b], %[h]  \n\t"                    \
488
6.11M
        "adcq %[c], %[o]  \n\t"                    \
489
6.11M
        "addq %[a], %[l]  \n\t"                    \
490
6.11M
        "adcq %[b], %[h]  \n\t"                    \
491
6.11M
        "adcq %[c], %[o]  \n\t"                    \
492
6.11M
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
493
6.11M
        : [a] "r" (va), [b] "r" (vb), [c] "r" (vc)       \
494
6.11M
        : "cc"                                           \
495
6.11M
    )
496
/* Index of highest bit set.
 *
 * Uses the BSR instruction with va as the source and vi as the destination;
 * both are register operands and vi is write-only ("=r").
 * Only the flags are declared as clobbered.
 * NOTE(review): BSR leaves its destination undefined when the source is zero
 * (see the Intel SDM entry for BSR) - confirm callers never pass va == 0.
 */
497
#define SP_ASM_HI_BIT_SET_IDX(va, vi)                    \
498
119M
    __asm__ __volatile__ (                               \
499
119M
        "bsr  %[a], %[i]  \n\t"                    \
500
119M
        : [i] "=r" (vi)                                  \
501
119M
        : [a] "r" (va)                                   \
502
119M
        : "cc"                                           \
503
119M
    )
504
#else
505
#include <intrin.h>
506
507
/* Multiply va by vb and store double size result in: vh | vl
 *
 * Uses the _umul128() intrinsic: the return value is assigned to vl and the
 * address of vh is passed to receive the other word.
 * Expands to a bare assignment (no do/while wrapper). Arguments are used
 * unparenthesized and vh has its address taken, so vl and vh must be simple
 * lvalues.
 */
508
#define SP_ASM_MUL(vl, vh, va, vb)                       \
509
    vl = _umul128(va, vb, &vh)
510
511
/* Multiply va by vb and store double size result in: vo | vh | vl
 *
 * Uses the _umul128() intrinsic: the return value is assigned to vl and the
 * address of vh is passed to receive the other word. vo is set to 0.
 */
512
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb)               \
513
    do {                                                 \
514
        vl = _umul128(va, vb, &vh);                      \
515
        vo = 0;                                          \
516
    }                                                    \
517
    while (0)
518
519
/* Multiply va by vb and add double size result into: vo | vh | vl
 *
 * The product is computed into the locals vtl/vth with _umul128() and then
 * added with a chain of _addcarry_u64() calls: vtl into vl, vth plus carry
 * into vh and the last carry into vo. The return of the final call is
 * discarded.
 * The locals are named vtl, vth and c - an argument expression using one of
 * those names would refer to the local instead.
 */
520
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb)               \
521
    do {                                                 \
522
        unsigned __int64 vtl, vth;                       \
523
        unsigned char c;                                 \
524
        vtl = _umul128(va, vb, &vth);                    \
525
        c = _addcarry_u64(0, vl, vtl, &vl);              \
526
        c = _addcarry_u64(c, vh, vth, &vh);              \
527
            _addcarry_u64(c, vo,   0, &vo);              \
528
    }                                                    \
529
    while (0)
530
531
/* Multiply va by vb and add double size result into: vh | vl
 *
 * The product is computed into the locals vtl/vth with _umul128(); vtl is
 * added into vl and vth plus carry into vh with _addcarry_u64(). The return
 * of the second call is discarded - a carry out of vh is not recorded.
 */
532
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb)                \
533
    do {                                                 \
534
        unsigned __int64 vtl, vth;                       \
535
        unsigned char c;                                 \
536
        vtl = _umul128(va, vb, &vth);                    \
537
        c = _addcarry_u64(0, vl, vtl, &vl);              \
538
            _addcarry_u64(c, vh, vth, &vh);              \
539
    }                                                    \
540
    while (0)
541
542
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 *
 * The product is computed once into the locals vtl/vth with _umul128(). The
 * three call _addcarry_u64() chain (vtl into vl, vth plus carry into vh,
 * carry into vo) is then performed twice.
 */
543
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb)              \
544
    do {                                                 \
545
        unsigned __int64 vtl, vth;                       \
546
        unsigned char c;                                 \
547
        vtl = _umul128(va, vb, &vth);                    \
548
        c = _addcarry_u64(0, vl, vtl, &vl);              \
549
        c = _addcarry_u64(c, vh, vth, &vh);              \
550
            _addcarry_u64(c, vo,   0, &vo);              \
551
        c = _addcarry_u64(0, vl, vtl, &vl);              \
552
        c = _addcarry_u64(c, vh, vth, &vh);              \
553
            _addcarry_u64(c, vo,   0, &vo);              \
554
    }                                                    \
555
    while (0)
556
/* Multiply va by vb and add double size result twice into: vo | vh | vl
557
 * Assumes first add will not overflow vh | vl
 *
 * The product is computed once into the locals vtl/vth with _umul128().
 * The return of the _addcarry_u64() call into vh is discarded for the first
 * add; only the carry of the second add is added into vo.
558
 */
559
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)           \
560
    do {                                                 \
561
        unsigned __int64 vtl, vth;                       \
562
        unsigned char c;                                 \
563
        vtl = _umul128(va, vb, &vth);                    \
564
        c = _addcarry_u64(0, vl, vtl, &vl);              \
565
            _addcarry_u64(c, vh, vth, &vh);              \
566
        c = _addcarry_u64(0, vl, vtl, &vl);              \
567
        c = _addcarry_u64(c, vh, vth, &vh);              \
568
            _addcarry_u64(c, vo,   0, &vo);              \
569
    }                                                    \
570
    while (0)
571
572
 /* Square va and store double size result in: vh | vl
  *
  * Uses the _umul128() intrinsic with va as both multiplicands: the return
  * value is assigned to vl and the address of vh is passed to receive the
  * other word. va appears twice in the expansion.
  * Expands to a bare assignment (no do/while wrapper).
  */
573
#define SP_ASM_SQR(vl, vh, va)                           \
574
    vl = _umul128(va, va, &vh)
575
576
/* Square va and add double size result into: vo | vh | vl
 *
 * The square is computed with _umul128 and added with a carry chain that
 * runs from vl through vh into vo.
 */
577
#define SP_ASM_SQR_ADD(vl, vh, vo, va)                   \
578
    do {                                                 \
579
        unsigned __int64 vtl, vth;                       \
580
        unsigned char c;                                 \
581
        vtl = _umul128(va, va, &vth);                    \
582
        c = _addcarry_u64(0, vl, vtl, &vl);              \
583
        c = _addcarry_u64(c, vh, vth, &vh);              \
584
            _addcarry_u64(c, vo,   0, &vo);              \
585
    }                                                    \
586
    while (0)
587
588
/* Square va and add double size result into: vh | vl
 *
 * The carry out of vh is discarded - there is no third word.
 */
589
#define SP_ASM_SQR_ADD_NO(vl, vh, va)                    \
590
    do {                                                 \
591
        unsigned __int64 vtl, vth;                       \
592
        unsigned char c;                                 \
593
        vtl = _umul128(va, va, &vth);                    \
594
        c = _addcarry_u64(0, vl, vtl, &vl);              \
595
            _addcarry_u64(c, vh, vth, &vh);              \
596
    }                                                    \
597
    while (0)
598
599
/* Add va into: vh | vl
 *
 * va is added to vl and the resulting carry is added to vh.
 */
600
#define SP_ASM_ADDC(vl, vh, va)                          \
601
    do {                                                 \
602
        unsigned char c;                                 \
603
        c = _addcarry_u64(0, vl, va, &vl);               \
604
            _addcarry_u64(c, vh,  0, &vh);               \
605
    }                                                    \
606
    while (0)
607
608
/* Add va, variable in a register, into: vh | vl
 *
 * In this intrinsic implementation the body is the same as SP_ASM_ADDC.
 */
609
#define SP_ASM_ADDC_REG(vl, vh, va)                      \
610
    do {                                                 \
611
        unsigned char c;                                 \
612
        c = _addcarry_u64(0, vl, va, &vl);               \
613
            _addcarry_u64(c, vh,  0, &vh);               \
614
    }                                                    \
615
    while (0)
616
617
/* Sub va from: vh | vl
 *
 * va is subtracted from vl and the resulting borrow is subtracted from vh.
 */
618
#define SP_ASM_SUBB(vl, vh, va)                          \
619
    do {                                                 \
620
        unsigned char c;                                 \
621
        c = _subborrow_u64(0, vl, va, &vl);              \
622
            _subborrow_u64(c, vh,  0, &vh);              \
623
    }                                                    \
624
    while (0)
625
626
/* Add two times vc | vb | va into vo | vh | vl
 *
 * Done as two identical three-word additions rather than doubling the
 * source first; the carry out of vo is discarded each time.
 */
627
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc)         \
628
    do {                                                 \
629
        unsigned char c;                                 \
630
        c = _addcarry_u64(0, vl, va, &vl);               \
631
        c = _addcarry_u64(c, vh, vb, &vh);               \
632
            _addcarry_u64(c, vo, vc, &vo);               \
633
        c = _addcarry_u64(0, vl, va, &vl);               \
634
        c = _addcarry_u64(c, vh, vb, &vh);               \
635
            _addcarry_u64(c, vo, vc, &vo);               \
636
    }                                                    \
637
    while (0)
638
/* Index of highest bit set.
 *
 * Stores the index found by _BitScanReverse64 for va into vi. The return
 * value of _BitScanReverse64 is not checked here.
 */
639
#define SP_ASM_HI_BIT_SET_IDX(va, vi)                    \
640
    do {                                                 \
641
        unsigned long idx;                               \
642
        _BitScanReverse64(&idx, va);                     \
643
        vi = idx;                                        \
644
    }                                                    \
645
    while (0)
646
#endif
647
648
#if !defined(WOLFSSL_SP_DIV_WORD_HALF) && (!defined(_MSC_VER) || \
649
    _MSC_VER >= 1920)
650
/* Divide a two digit number by a digit number and return. (hi | lo) / d
651
 *
652
 * Using divq instruction on Intel x64.
653
 *
654
 * @param  [in]  hi  SP integer digit. High digit of the dividend.
655
 * @param  [in]  lo  SP integer digit. Lower digit of the dividend.
656
 * @param  [in]  d   SP integer digit. Number to divide by.
657
 * @return  The division result.
658
 */
659
static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
660
                                          sp_int_digit d)
661
55.2M
{
662
55.2M
#ifndef _MSC_VER
663
55.2M
    __asm__ __volatile__ (
664
55.2M
        "divq %2"
665
55.2M
        : "+a" (lo)
666
55.2M
        : "d" (hi), "r" (d)
667
55.2M
        : "cc"
668
55.2M
    );
669
55.2M
    return lo;
670
#elif defined(_MSC_VER) && _MSC_VER >= 1920
671
    return _udiv128(hi, lo, d, NULL);
672
#endif
673
55.2M
}
674
#define SP_ASM_DIV_WORD
675
#endif
676
677
#define SP_INT_ASM_AVAILABLE
678
679
    #endif /* WOLFSSL_SP_X86_64 && SP_WORD_SIZE == 64 */
680
681
    #if defined(WOLFSSL_SP_X86) && SP_WORD_SIZE == 32
682
/*
683
 * CPU: x86
684
 */
685
686
/* Multiply va by vb and store double size result in: vh | vl
 *
 * vb is loaded into eax and multiplied by va with mull; the result is then
 * copied out of edx:eax. eax, edx and the flags are clobbered.
 */
687
#define SP_ASM_MUL(vl, vh, va, vb)                       \
688
    __asm__ __volatile__ (                               \
689
        "movl %[b], %%eax \n\t"                    \
690
        "mull %[a]    \n\t"                    \
691
        "movl %%eax, %[l] \n\t"                    \
692
        "movl %%edx, %[h] \n\t"                    \
693
        : [h] "+r" (vh), [l] "+r" (vl)                   \
694
        : [a] "rm" (va), [b] "rm" (vb)                   \
695
        : "eax", "edx", "cc"                             \
696
    )
697
/* Multiply va by vb and store double size result in: vo | vh | vl
 *
 * vo is set to zero. Unlike the other multiply macros in this section, va
 * and vb use the "m" constraint here, so both must be memory operands.
 * eax, edx and the flags are clobbered.
 */
698
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb)               \
699
    __asm__ __volatile__ (                               \
700
        "movl %[b], %%eax \n\t"                    \
701
        "mull %[a]    \n\t"                    \
702
        "movl $0   , %[o] \n\t"                    \
703
        "movl %%eax, %[l] \n\t"                    \
704
        "movl %%edx, %[h] \n\t"                    \
705
        : [l] "+r" (vl), [h] "+r" (vh), [o] "=r" (vo)    \
706
        : [a] "m" (va), [b] "m" (vb)                     \
707
        : "eax", "edx", "cc"                             \
708
    )
709
/* Multiply va by vb and add double size result into: vo | vh | vl
 *
 * The product in edx:eax is added with addl/adcl and the final carry is
 * added into vo. eax, edx and the flags are clobbered.
 */
710
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb)               \
711
    __asm__ __volatile__ (                               \
712
        "movl %[b], %%eax \n\t"                    \
713
        "mull %[a]    \n\t"                    \
714
        "addl %%eax, %[l] \n\t"                    \
715
        "adcl %%edx, %[h] \n\t"                    \
716
        "adcl $0   , %[o] \n\t"                    \
717
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
718
        : [a] "rm" (va), [b] "rm" (vb)                   \
719
        : "eax", "edx", "cc"                             \
720
    )
721
/* Multiply va by vb and add double size result into: vh | vl
 *
 * The carry out of vh is not captured. eax, edx and the flags are
 * clobbered.
 */
722
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb)                \
723
    __asm__ __volatile__ (                               \
724
        "movl %[b], %%eax \n\t"                    \
725
        "mull %[a]    \n\t"                    \
726
        "addl %%eax, %[l] \n\t"                    \
727
        "adcl %%edx, %[h] \n\t"                    \
728
        : [l] "+r" (vl), [h] "+r" (vh)                   \
729
        : [a] "rm" (va), [b] "rm" (vb)                   \
730
        : "eax", "edx", "cc"                             \
731
    )
732
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 *
 * One mull, then the edx:eax product is added in twice with a full carry
 * chain into vo each time. eax, edx and the flags are clobbered.
 */
733
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb)              \
734
    __asm__ __volatile__ (                               \
735
        "movl %[b], %%eax \n\t"                    \
736
        "mull %[a]    \n\t"                    \
737
        "addl %%eax, %[l] \n\t"                    \
738
        "adcl %%edx, %[h] \n\t"                    \
739
        "adcl $0   , %[o] \n\t"                    \
740
        "addl %%eax, %[l] \n\t"                    \
741
        "adcl %%edx, %[h] \n\t"                    \
742
        "adcl $0   , %[o] \n\t"                    \
743
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
744
        : [a] "rm" (va), [b] "rm" (vb)                   \
745
        : "eax", "edx", "cc"                             \
746
    )
747
/* Multiply va by vb and add double size result twice into: vo | vh | vl
748
 * Assumes first add will not overflow vh | vl
749
 *
 * Only the second add carries into vo. eax, edx and the flags are
 * clobbered.
 */
750
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)           \
751
    __asm__ __volatile__ (                               \
752
        "movl %[b], %%eax \n\t"                    \
753
        "mull %[a]    \n\t"                    \
754
        "addl %%eax, %[l] \n\t"                    \
755
        "adcl %%edx, %[h] \n\t"                    \
756
        "addl %%eax, %[l] \n\t"                    \
757
        "adcl %%edx, %[h] \n\t"                    \
758
        "adcl $0   , %[o] \n\t"                    \
759
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
760
        : [a] "rm" (va), [b] "rm" (vb)                   \
761
        : "eax", "edx", "cc"                             \
762
    )
763
/* Square va and store double size result in: vh | vl
 *
 * va is loaded into eax and multiplied by itself with mull. eax, edx and
 * the flags are clobbered.
 */
764
#define SP_ASM_SQR(vl, vh, va)                           \
765
    __asm__ __volatile__ (                               \
766
        "movl %[a], %%eax \n\t"                    \
767
        "mull %%eax   \n\t"                    \
768
        "movl %%eax, %[l] \n\t"                    \
769
        "movl %%edx, %[h] \n\t"                    \
770
        : [h] "+r" (vh), [l] "+r" (vl)                   \
771
        : [a] "rm" (va)                                  \
772
        : "eax", "edx", "cc"                             \
773
    )
774
/* Square va and add double size result into: vo | vh | vl
 *
 * The final carry is added into vo. eax, edx and the flags are clobbered.
 */
775
#define SP_ASM_SQR_ADD(vl, vh, vo, va)                   \
776
    __asm__ __volatile__ (                               \
777
        "movl %[a], %%eax \n\t"                    \
778
        "mull %%eax   \n\t"                    \
779
        "addl %%eax, %[l] \n\t"                    \
780
        "adcl %%edx, %[h] \n\t"                    \
781
        "adcl $0   , %[o] \n\t"                    \
782
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
783
        : [a] "rm" (va)                                  \
784
        : "eax", "edx", "cc"                             \
785
    )
786
/* Square va and add double size result into: vh | vl
 *
 * The carry out of vh is not captured. eax, edx and the flags are
 * clobbered.
 */
787
#define SP_ASM_SQR_ADD_NO(vl, vh, va)                    \
788
    __asm__ __volatile__ (                               \
789
        "movl %[a], %%eax \n\t"                    \
790
        "mull %%eax   \n\t"                    \
791
        "addl %%eax, %[l] \n\t"                    \
792
        "adcl %%edx, %[h] \n\t"                    \
793
        : [l] "+r" (vl), [h] "+r" (vh)                   \
794
        : [a] "rm" (va)                                  \
795
        : "eax", "edx", "cc"                             \
796
    )
797
/* Add va into: vh | vl
 *
 * va may be in a register or in memory ("rm" constraint). The carry from
 * the low word is added into vh.
 */
798
#define SP_ASM_ADDC(vl, vh, va)                          \
799
    __asm__ __volatile__ (                               \
800
        "addl %[a], %[l]  \n\t"                    \
801
        "adcl $0  , %[h]  \n\t"                    \
802
        : [l] "+r" (vl), [h] "+r" (vh)                   \
803
        : [a] "rm" (va)                                  \
804
        : "cc"                                           \
805
    )
806
/* Add va, variable in a register, into: vh | vl
 *
 * Same instructions as SP_ASM_ADDC but va is constrained to a register.
 */
#define SP_ASM_ADDC_REG(vl, vh, va)                      \
807
    __asm__ __volatile__ (                               \
808
        "addl %[a], %[l]  \n\t"                    \
809
        "adcl $0  , %[h]  \n\t"                    \
810
        : [l] "+r" (vl), [h] "+r" (vh)                   \
811
        : [a] "r" (va)                                   \
812
        : "cc"                                           \
813
    )
814
/* Sub va from: vh | vl
 *
 * va may be in a register or in memory ("rm" constraint). The borrow from
 * the low word is subtracted from vh.
 */
815
#define SP_ASM_SUBB(vl, vh, va)                          \
816
    __asm__ __volatile__ (                               \
817
        "subl %[a], %[l]  \n\t"                    \
818
        "sbbl $0  , %[h]  \n\t"                    \
819
        : [l] "+r" (vl), [h] "+r" (vh)                   \
820
        : [a] "rm" (va)                                  \
821
        : "cc"                                           \
822
    )
823
/* Sub va, variable in a register, from: vh | vl
 *
 * Same instructions as SP_ASM_SUBB but va is constrained to a register.
 */
824
#define SP_ASM_SUBB_REG(vl, vh, va)                      \
825
    __asm__ __volatile__ (                               \
826
        "subl %[a], %[l]  \n\t"                    \
827
        "sbbl $0  , %[h]  \n\t"                    \
828
        : [l] "+r" (vl), [h] "+r" (vh)                   \
829
        : [a] "r" (va)                                   \
830
        : "cc"                                           \
831
    )
832
/* Add two times vc | vb | va into vo | vh | vl
 *
 * Performed as two identical three-word add-with-carry sequences; all three
 * source words must be in registers.
 */
833
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc)         \
834
    __asm__ __volatile__ (                               \
835
        "addl %[a], %[l]  \n\t"                    \
836
        "adcl %[b], %[h]  \n\t"                    \
837
        "adcl %[c], %[o]  \n\t"                    \
838
        "addl %[a], %[l]  \n\t"                    \
839
        "adcl %[b], %[h]  \n\t"                    \
840
        "adcl %[c], %[o]  \n\t"                    \
841
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
842
        : [a] "r" (va), [b] "r" (vb), [c] "r" (vc)       \
843
        : "cc"                                           \
844
    )
845
/* Index of highest bit set.
 *
 * Uses the bsr instruction to write the bit index for va into vi.
 * NOTE(review): bsr does not define its destination for a zero source, so
 * callers presumably only pass non-zero va - confirm at the call sites.
 */
846
#define SP_ASM_HI_BIT_SET_IDX(va, vi)                    \
847
    __asm__ __volatile__ (                               \
848
        "bsr  %[a], %[i]  \n\t"                    \
849
        : [i] "=r" (vi)                                  \
850
        : [a] "r" (va)                                   \
851
        : "cc"                                           \
852
    )
853
854
#ifndef WOLFSSL_SP_DIV_WORD_HALF
855
/* Divide a two digit number by a digit number and return. (hi | lo) / d
856
 *
857
 * Using divl instruction on Intel x64.
858
 *
859
 * @param  [in]  hi  SP integer digit. High digit of the dividend.
860
 * @param  [in]  lo  SP integer digit. Lower digit of the dividend.
861
 * @param  [in]  d   SP integer digit. Number to divide by.
862
 * @return  The division result.
863
 */
864
static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
865
                                          sp_int_digit d)
866
{
867
    __asm__ __volatile__ (
868
        "divl %2"
869
        : "+a" (lo)
870
        : "d" (hi), "r" (d)
871
        : "cc"
872
    );
873
    return lo;
874
}
875
#define SP_ASM_DIV_WORD
876
#endif
877
878
#define SP_INT_ASM_AVAILABLE
879
880
    #endif /* WOLFSSL_SP_X86 && SP_WORD_SIZE == 32 */
881
882
    #if defined(WOLFSSL_SP_ARM64) && SP_WORD_SIZE == 64
883
/*
884
 * CPU: Aarch64
885
 */
886
887
/* Multiply va by vb and store double size result in: vh | vl
 *
 * mul produces the low word and umulh the high word of the product.
 */
888
#define SP_ASM_MUL(vl, vh, va, vb)                       \
889
    __asm__ __volatile__ (                               \
890
        "mul  %[l], %[a], %[b]  \n\t"            \
891
        "umulh  %[h], %[a], %[b]  \n\t"            \
892
        : [h] "+r" (vh), [l] "+r" (vl)                   \
893
        : [a] "r" (va), [b] "r" (vb)                     \
894
        : "cc"                                           \
895
    )
896
/* Multiply va by vb and store double size result in: vo | vh | vl
 *
 * The low word is computed into the scratch register x8 and moved to vl
 * after umulh has run; vo is set to zero. x8 is clobbered.
 */
897
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb)               \
898
    __asm__ __volatile__ (                               \
899
        "mul  x8, %[a], %[b]    \n\t"            \
900
        "umulh  %[h], %[a], %[b]  \n\t"            \
901
        "mov  %[l], x8    \n\t"            \
902
        "mov  %[o], xzr   \n\t"            \
903
        : [l] "+r" (vl), [h] "+r" (vh), [o] "=r" (vo)    \
904
        : [a] "r" (va), [b] "r" (vb)                     \
905
        : "x8", "cc"                                     \
906
    )
907
/* Multiply va by vb and add double size result into: vo | vh | vl
 *
 * The product is formed in scratch registers x9:x8 and added with
 * adds/adcs, with the final carry added into vo. x8 and x9 are clobbered.
 */
908
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb)               \
909
    __asm__ __volatile__ (                               \
910
        "mul  x8, %[a], %[b]    \n\t"            \
911
        "umulh  x9, %[a], %[b]    \n\t"            \
912
        "adds %[l], %[l], x8    \n\t"            \
913
        "adcs %[h], %[h], x9    \n\t"            \
914
        "adc  %[o], %[o], xzr   \n\t"            \
915
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
916
        : [a] "r" (va), [b] "r" (vb)                     \
917
        : "x8", "x9", "cc"                               \
918
    )
919
/* Multiply va by vb and add double size result into: vh | vl
 *
 * The carry out of vh is not captured. x8 and x9 are clobbered.
 */
920
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb)                \
921
    __asm__ __volatile__ (                               \
922
        "mul  x8, %[a], %[b]    \n\t"            \
923
        "umulh  x9, %[a], %[b]    \n\t"            \
924
        "adds %[l], %[l], x8    \n\t"            \
925
        "adc  %[h], %[h], x9    \n\t"            \
926
        : [l] "+r" (vl), [h] "+r" (vh)                   \
927
        : [a] "r" (va), [b] "r" (vb)                     \
928
        : "x8", "x9", "cc"                               \
929
    )
930
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 *
 * The product is formed once in x9:x8 and added in twice, each time with a
 * full carry chain into vo. x8 and x9 are clobbered.
 */
931
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb)              \
932
    __asm__ __volatile__ (                               \
933
        "mul  x8, %[a], %[b]    \n\t"            \
934
        "umulh  x9, %[a], %[b]    \n\t"            \
935
        "adds %[l], %[l], x8    \n\t"            \
936
        "adcs %[h], %[h], x9    \n\t"            \
937
        "adc  %[o], %[o], xzr   \n\t"            \
938
        "adds %[l], %[l], x8    \n\t"            \
939
        "adcs %[h], %[h], x9    \n\t"            \
940
        "adc  %[o], %[o], xzr   \n\t"            \
941
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
942
        : [a] "r" (va), [b] "r" (vb)                     \
943
        : "x8", "x9", "cc"                               \
944
    )
945
/* Multiply va by vb and add double size result twice into: vo | vh | vl
946
 * Assumes first add will not overflow vh | vl
947
 *
 * Only the second add carries into vo. x8 and x9 are clobbered.
 */
948
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)           \
949
    __asm__ __volatile__ (                               \
950
        "mul  x8, %[a], %[b]    \n\t"            \
951
        "umulh  x9, %[a], %[b]    \n\t"            \
952
        "adds %[l], %[l], x8    \n\t"            \
953
        "adc  %[h], %[h], x9    \n\t"            \
954
        "adds %[l], %[l], x8    \n\t"            \
955
        "adcs %[h], %[h], x9    \n\t"            \
956
        "adc  %[o], %[o], xzr   \n\t"            \
957
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
958
        : [a] "r" (va), [b] "r" (vb)                     \
959
        : "x8", "x9", "cc"                               \
960
    )
961
/* Square va and store double size result in: vh | vl
 *
 * mul produces the low word and umulh the high word of va * va.
 */
962
#define SP_ASM_SQR(vl, vh, va)                           \
963
    __asm__ __volatile__ (                               \
964
        "mul  %[l], %[a], %[a]  \n\t"            \
965
        "umulh  %[h], %[a], %[a]  \n\t"            \
966
        : [h] "+r" (vh), [l] "+r" (vl)                   \
967
        : [a] "r" (va)                                   \
968
        : "cc"                                           \
969
    )
970
/* Square va and add double size result into: vo | vh | vl
 *
 * The square is formed in x9:x8 and added with the final carry going into
 * vo. x8 and x9 are clobbered.
 */
971
#define SP_ASM_SQR_ADD(vl, vh, vo, va)                   \
972
    __asm__ __volatile__ (                               \
973
        "mul  x8, %[a], %[a]    \n\t"            \
974
        "umulh  x9, %[a], %[a]    \n\t"            \
975
        "adds %[l], %[l], x8    \n\t"            \
976
        "adcs %[h], %[h], x9    \n\t"            \
977
        "adc  %[o], %[o], xzr   \n\t"            \
978
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
979
        : [a] "r" (va)                                   \
980
        : "x8", "x9", "cc"                               \
981
    )
982
/* Square va and add double size result into: vh | vl
 *
 * The carry out of vh is not captured. x8 and x9 are clobbered.
 */
983
#define SP_ASM_SQR_ADD_NO(vl, vh, va)                    \
984
    __asm__ __volatile__ (                               \
985
        "mul  x8, %[a], %[a]    \n\t"            \
986
        "umulh  x9, %[a], %[a]    \n\t"            \
987
        "adds %[l], %[l], x8    \n\t"            \
988
        "adc  %[h], %[h], x9    \n\t"            \
989
        : [l] "+r" (vl), [h] "+r" (vh)                   \
990
        : [a] "r" (va)                                   \
991
        : "x8", "x9", "cc"                               \
992
    )
993
/* Add va into: vh | vl
 *
 * va is added to vl and the carry is added into vh.
 */
994
#define SP_ASM_ADDC(vl, vh, va)                          \
995
    __asm__ __volatile__ (                               \
996
        "adds %[l], %[l], %[a]  \n\t"            \
997
        "adc  %[h], %[h], xzr   \n\t"            \
998
        : [l] "+r" (vl), [h] "+r" (vh)                   \
999
        : [a] "r" (va)                                   \
1000
        : "cc"                                           \
1001
    )
1002
/* Sub va from: vh | vl
 *
 * va is subtracted from vl and the borrow is subtracted from vh.
 */
1003
#define SP_ASM_SUBB(vl, vh, va)                          \
1004
    __asm__ __volatile__ (                               \
1005
        "subs %[l], %[l], %[a]  \n\t"            \
1006
        "sbc  %[h], %[h], xzr   \n\t"            \
1007
        : [l] "+r" (vl), [h] "+r" (vh)                   \
1008
        : [a] "r" (va)                                   \
1009
        : "cc"                                           \
1010
    )
1011
/* Add two times vc | vb | va into vo | vh | vl
 *
 * Performed as two identical three-word add-with-carry sequences.
 */
1012
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc)         \
1013
    __asm__ __volatile__ (                               \
1014
        "adds %[l], %[l], %[a]  \n\t"            \
1015
        "adcs %[h], %[h], %[b]  \n\t"            \
1016
        "adc  %[o], %[o], %[c]  \n\t"            \
1017
        "adds %[l], %[l], %[a]  \n\t"            \
1018
        "adcs %[h], %[h], %[b]  \n\t"            \
1019
        "adc  %[o], %[o], %[c]  \n\t"            \
1020
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
1021
        : [a] "r" (va), [b] "r" (vb), [c] "r" (vc)       \
1022
        : "cc"                                           \
1023
    )
1024
/* Count leading zeros.
 *
 * Writes the clz result for va into vn. No registers or flags are listed
 * as clobbered.
 */
1025
#define SP_ASM_LZCNT(va, vn)                             \
1026
    __asm__ __volatile__ (                               \
1027
        "clz  %[n], %[a]  \n\t"                    \
1028
        : [n] "=r" (vn)                                  \
1029
        : [a] "r" (va)                                   \
1030
        :                                                \
1031
    )
1032
1033
#ifndef WOLFSSL_SP_DIV_WORD_HALF
1034
/* Divide a two digit number by a digit number and return. (hi | lo) / d
1035
 *
1036
 * Using udiv instruction on Aarch64.
1037
 * Constant time.
1038
 *
 * The quotient is accumulated in x6 from a series of udiv estimates that use
 * the top 32 bits of d plus one as the divisor; after each estimate the
 * matching multiple of d is subtracted from the running dividend. A final
 * udiv by d itself supplies the last correction. hi, lo and d are all
 * modified by the assembly and x3-x6 are used as scratch registers.
 *
1039
 * @param  [in]  hi  SP integer digit. High digit of the dividend.
1040
 * @param  [in]  lo  SP integer digit. Lower digit of the dividend.
1041
 * @param  [in]  d   SP integer digit. Number to divide by.
1042
 * @return  The division result.
1043
 */
1044
static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
1045
                                          sp_int_digit d)
1046
{
1047
    __asm__ __volatile__ (
1048
        /* x3 = 16 when the top 16 bits of d are zero, otherwise 0. */
        "lsr  x3, %[d], 48\n\t"
1049
        "mov  x5, 16\n\t"
1050
        "cmp  x3, 0\n\t"
1051
        "mov  x4, 63\n\t"
1052
        "csel x3, x5, xzr, eq\n\t"
1053
        /* x4 = 63 - x3 */
        "sub  x4, x4, x3\n\t"
1054
        /* Shift d and (hi | lo) left by x3 bits. The bits moving from lo
         * into hi are obtained with a shift by (63 - x3) followed by a
         * shift by 1. */
        "lsl  %[d], %[d], x3\n\t"
1055
        "lsl  %[hi], %[hi], x3\n\t"
1056
        "lsr  x5, %[lo], x4\n\t"
1057
        "lsl  %[lo], %[lo], x3\n\t"
1058
        "orr  %[hi], %[hi], x5, lsr 1\n\t"
1059
1060
        /* x5 = (top 32 bits of d) + 1: divisor used for the estimates. */
        "lsr  x5, %[d], 32\n\t"
1061
        "add  x5, x5, 1\n\t"
1062
1063
        /* First estimate from hi, placed in the upper 32 bits of x6;
         * subtract d * estimate from (hi | lo). */
        "udiv x3, %[hi], x5\n\t"
1064
        "lsl  x6, x3, 32\n\t"
1065
        "mul  x4, %[d], x6\n\t"
1066
        "umulh  x3, %[d], x6\n\t"
1067
        "subs %[lo], %[lo], x4\n\t"
1068
        "sbc  %[hi], %[hi], x3\n\t"
1069
1070
        /* Second estimate from the reduced hi, again shifted up 32 bits
         * and added to x6. */
        "udiv x3, %[hi], x5\n\t"
1071
        "lsl  x3, x3, 32\n\t"
1072
        "add  x6, x6, x3\n\t"
1073
        "mul  x4, %[d], x3\n\t"
1074
        "umulh  x3, %[d], x3\n\t"
1075
        "subs %[lo], %[lo], x4\n\t"
1076
        "sbc  %[hi], %[hi], x3\n\t"
1077
1078
        /* Third estimate from the middle 64 bits of (hi | lo): low 32 bits
         * of hi above the high 32 bits of lo. */
        "lsr  x3, %[lo], 32\n\t"
1079
        "orr  x3, x3, %[hi], lsl 32\n\t"
1080
1081
        "udiv x3, x3, x5\n\t"
1082
        "add  x6, x6, x3\n\t"
1083
        "mul  x4, %[d], x3\n\t"
1084
        "umulh  x3, %[d], x3\n\t"
1085
        "subs %[lo], %[lo], x4\n\t"
1086
        "sbc  %[hi], %[hi], x3\n\t"
1087
1088
        /* Fourth estimate, same form; only the low word of the remainder
         * is updated this time. */
        "lsr  x3, %[lo], 32\n\t"
1089
        "orr  x3, x3, %[hi], lsl 32\n\t"
1090
1091
        "udiv x3, x3, x5\n\t"
1092
        "add  x6, x6, x3\n\t"
1093
        "mul  x4, %[d], x3\n\t"
1094
        "sub  %[lo], %[lo], x4\n\t"
1095
1096
        /* Final correction: divide what is left in lo by d and add it to
         * the accumulated quotient, which is returned through hi. */
        "udiv x3, %[lo], %[d]\n\t"
1097
        "add  %[hi], x6, x3\n\t"
1098
1099
        : [hi] "+r" (hi), [lo] "+r" (lo), [d] "+r" (d)
1100
        :
1101
        : "x3", "x4", "x5", "x6", "cc"
1102
    );
1103
1104
    return hi;
1105
}
1106
#define SP_ASM_DIV_WORD
1107
#endif
1108
1109
#define SP_INT_ASM_AVAILABLE
1110
1111
    #endif /* WOLFSSL_SP_ARM64 && SP_WORD_SIZE == 64 */
1112
1113
    #if (defined(WOLFSSL_SP_ARM32) || defined(WOLFSSL_SP_ARM_CORTEX_M)) && \
1114
        SP_WORD_SIZE == 32
1115
/*
1116
 * CPU: ARM32 or Cortex-M4 and similar
1117
 */
1118
1119
/* Multiply va by vb and store double size result in: vh | vl
 *
 * A single umull writes both result words; no clobbers are declared.
 */
1120
#define SP_ASM_MUL(vl, vh, va, vb)                       \
1121
    __asm__ __volatile__ (                               \
1122
        "umull  %[l], %[h], %[a], %[b]  \n\t"            \
1123
        : [h] "+r" (vh), [l] "+r" (vl)                   \
1124
        : [a] "r" (va), [b] "r" (vb)                     \
1125
    )
1126
/* Multiply va by vb and store double size result in: vo | vh | vl
 *
 * umull writes vh | vl and vo is then set to zero; no clobbers are
 * declared.
 */
1127
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb)               \
1128
    __asm__ __volatile__ (                               \
1129
        "umull  %[l], %[h], %[a], %[b]  \n\t"            \
1130
        "mov  %[o], #0    \n\t"            \
1131
        : [l] "+r" (vl), [h] "+r" (vh), [o] "=r" (vo)    \
1132
        : [a] "r" (va), [b] "r" (vb)                     \
1133
    )
1134
/* Multiply va by vb and add double size result into: vo | vh | vl
 *
 * The product is formed in scratch registers r9:r8 and added with
 * adds/adcs, with the final carry added into vo. r8 and r9 are clobbered.
 */
1135
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb)               \
1136
    __asm__ __volatile__ (                               \
1137
        "umull  r8, r9, %[a], %[b]  \n\t"            \
1138
        "adds %[l], %[l], r8    \n\t"            \
1139
        "adcs %[h], %[h], r9    \n\t"            \
1140
        "adc  %[o], %[o], #0    \n\t"            \
1141
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
1142
        : [a] "r" (va), [b] "r" (vb)                     \
1143
        : "r8", "r9", "cc"                               \
1144
    )
1145
/* Multiply va by vb and add double size result into: vh | vl
 *
 * Implemented with a single umlal (multiply-accumulate long), so no scratch
 * registers are needed and no clobbers are declared.
 */
1146
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb)                \
1147
    __asm__ __volatile__ (                               \
1148
        "umlal  %[l], %[h], %[a], %[b]  \n\t"            \
1149
        : [l] "+r" (vl), [h] "+r" (vh)                   \
1150
        : [a] "r" (va), [b] "r" (vb)                     \
1151
    )
1152
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 *
 * The product is formed once in r9:r8 and added in twice, each time with a
 * full carry chain into vo. r8 and r9 are clobbered.
 */
1153
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb)              \
1154
    __asm__ __volatile__ (                               \
1155
        "umull  r8, r9, %[a], %[b]  \n\t"            \
1156
        "adds %[l], %[l], r8    \n\t"            \
1157
        "adcs %[h], %[h], r9    \n\t"            \
1158
        "adc  %[o], %[o], #0    \n\t"            \
1159
        "adds %[l], %[l], r8    \n\t"            \
1160
        "adcs %[h], %[h], r9    \n\t"            \
1161
        "adc  %[o], %[o], #0    \n\t"            \
1162
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
1163
        : [a] "r" (va), [b] "r" (vb)                     \
1164
        : "r8", "r9", "cc"                               \
1165
    )
1166
/* Multiply va by vb and add double size result twice into: vo | vh | vl
1167
 * Assumes first add will not overflow vh | vl
1168
 *
 * Only the second add carries into vo. r8 and r9 are clobbered.
 */
1169
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)           \
1170
    __asm__ __volatile__ (                               \
1171
        "umull  r8, r9, %[a], %[b]  \n\t"            \
1172
        "adds %[l], %[l], r8    \n\t"            \
1173
        "adc  %[h], %[h], r9    \n\t"            \
1174
        "adds %[l], %[l], r8    \n\t"            \
1175
        "adcs %[h], %[h], r9    \n\t"            \
1176
        "adc  %[o], %[o], #0    \n\t"            \
1177
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
1178
        : [a] "r" (va), [b] "r" (vb)                     \
1179
        : "r8", "r9", "cc"                               \
1180
    )
1181
/* Square va and store double size result in: vh | vl
 *
 * A single umull of va with itself; no clobbers are declared.
 */
1182
#define SP_ASM_SQR(vl, vh, va)                           \
1183
    __asm__ __volatile__ (                               \
1184
        "umull  %[l], %[h], %[a], %[a]  \n\t"            \
1185
        : [h] "+r" (vh), [l] "+r" (vl)                   \
1186
        : [a] "r" (va)                                   \
1187
    )
1188
/* Square va and add double size result into: vo | vh | vl
 *
 * The square is formed in r9:r8 and added with the final carry going into
 * vo. r8 and r9 are clobbered.
 */
1189
#define SP_ASM_SQR_ADD(vl, vh, vo, va)                   \
1190
    __asm__ __volatile__ (                               \
1191
        "umull  r8, r9, %[a], %[a]  \n\t"            \
1192
        "adds %[l], %[l], r8    \n\t"            \
1193
        "adcs %[h], %[h], r9    \n\t"            \
1194
        "adc  %[o], %[o], #0    \n\t"            \
1195
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
1196
        : [a] "r" (va)                                   \
1197
        : "r8", "r9", "cc"                               \
1198
    )
1199
/* Square va and add double size result into: vh | vl
 *
 * Implemented with a single umlal of va with itself. The flags are listed
 * as clobbered here, unlike SP_ASM_MUL_ADD_NO which uses the same
 * instruction.
 */
1200
#define SP_ASM_SQR_ADD_NO(vl, vh, va)                    \
1201
    __asm__ __volatile__ (                               \
1202
        "umlal  %[l], %[h], %[a], %[a]  \n\t"            \
1203
        : [l] "+r" (vl), [h] "+r" (vh)                   \
1204
        : [a] "r" (va)                                   \
1205
        : "cc"                                           \
1206
    )
1207
/* Add va into: vh | vl
 *
 * va is added to vl and the carry is added into vh.
 */
1208
#define SP_ASM_ADDC(vl, vh, va)                          \
1209
    __asm__ __volatile__ (                               \
1210
        "adds %[l], %[l], %[a]  \n\t"            \
1211
        "adc  %[h], %[h], #0    \n\t"            \
1212
        : [l] "+r" (vl), [h] "+r" (vh)                   \
1213
        : [a] "r" (va)                                   \
1214
        : "cc"                                           \
1215
    )
1216
/* Sub va from: vh | vl
 *
 * va is subtracted from vl and the borrow is subtracted from vh.
 */
1217
#define SP_ASM_SUBB(vl, vh, va)                          \
1218
    __asm__ __volatile__ (                               \
1219
        "subs %[l], %[l], %[a]  \n\t"            \
1220
        "sbc  %[h], %[h], #0    \n\t"            \
1221
        : [l] "+r" (vl), [h] "+r" (vh)                   \
1222
        : [a] "r" (va)                                   \
1223
        : "cc"                                           \
1224
    )
1225
/* Add two times vc | vb | va into vo | vh | vl
 *
 * Performed as two identical three-word add-with-carry sequences.
 */
1226
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc)         \
1227
    __asm__ __volatile__ (                               \
1228
        "adds %[l], %[l], %[a]  \n\t"            \
1229
        "adcs %[h], %[h], %[b]  \n\t"            \
1230
        "adc  %[o], %[o], %[c]  \n\t"            \
1231
        "adds %[l], %[l], %[a]  \n\t"            \
1232
        "adcs %[h], %[h], %[b]  \n\t"            \
1233
        "adc  %[o], %[o], %[c]  \n\t"            \
1234
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
1235
        : [a] "r" (va), [b] "r" (vb), [c] "r" (vc)       \
1236
        : "cc"                                           \
1237
    )
1238
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH >= 7)
1239
/* Count leading zeros - instruction only available on ARMv7 and newer.
 *
 * Only defined when WOLFSSL_ARM_ARCH is defined and is 7 or greater.
 *
 * va: word to count the leading zero bits of (read only).
 * vn: receives the result of the 'clz' instruction (write only).
 */
1240
#define SP_ASM_LZCNT(va, vn)                             \
1241
    __asm__ __volatile__ (                               \
1242
        "clz  %[n], %[a]  \n\t"                    \
1243
        : [n] "=r" (vn)                                  \
1244
        : [a] "r" (va)                                   \
1245
    )
1246
#endif
1247
1248
#ifndef WOLFSSL_SP_DIV_WORD_HALF
1249
#ifndef WOLFSSL_SP_ARM32_UDIV
1250
/* Divide a two digit number by a digit number and return. (hi | lo) / d
 *
 * No division instruction used - does operation bit by bit.
 * Constant time.
 *
 * Outline of the assembly:
 *   1. Normalize: shift d, hi and lo left by the number of leading zero bits
 *      of d so that the top bit of d is set. The count comes from 'clz', or,
 *      when WOLFSSL_ARM_ARCH < 7, from a multiply by debruijn32_mul and a
 *      lookup in the debruijn32 table.
 *   2. Estimate: build a quotient estimate one bit per step (1 step, then a
 *      loop of 30 steps) by comparing the running remainder with
 *      (d >> 1) + 1. The comparison result is turned into an all-ones or
 *      all-zeros mask so that no branch depends on the data. The estimate is
 *      then doubled and 1 is added.
 *   3. Correct: twice add the high word of (hi | lo) - r * d to r, then add 1
 *      if the low word of (hi | lo) - r * d is greater than d and add 1 if it
 *      is equal to d.
 *
 * hi, lo and d are read-write operands of the assembly: the local copies are
 * modified by the normalization step.
 *
 * NOTE(review): presumably requires d != 0 and hi < d so that the quotient
 * fits in one digit - confirm against callers.
 *
 * @param  [in]  hi  SP integer digit. High digit of the dividend.
 * @param  [in]  lo  SP integer digit. Lower digit of the dividend.
 * @param  [in]  d   SP integer digit. Number to divide by.
 * @return  The division result.
 */
1260
static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
1261
                                          sp_int_digit d)
1262
{
1263
    /* Quotient being built up by the assembly. */
    sp_int_digit r = 0;
1264
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
1265
    /* Lookup table and multiplier used in place of the 'clz' instruction. */
    static const char debruijn32[32] = {
1266
        0, 31, 9, 30, 3, 8, 13, 29, 2, 5, 7, 21, 12, 24, 28, 19,
1267
        1, 10, 4, 14, 6, 22, 25, 20, 11, 15, 23, 26, 16, 27, 17, 18
1268
    };
1269
    static const sp_uint32 debruijn32_mul = 0x076be629;
1270
#endif
1271
1272
    __asm__ __volatile__ (
1273
        /* Shift d so that top bit is set. */
1274
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
1275
        /* Smear the top set bit of d into all lower bits, add 1, multiply by
         * the constant and use the top 5 bits of the product as table index.
         * r5 receives the table entry, used below as the shift count. */
        "ldr  r4, %[m]\n\t"
1276
        "mov  r5, %[d]\n\t"
1277
        "orr  r5, r5, r5, lsr #1\n\t"
1278
        "orr  r5, r5, r5, lsr #2\n\t"
1279
        "orr  r5, r5, r5, lsr #4\n\t"
1280
        "orr  r5, r5, r5, lsr #8\n\t"
1281
        "orr  r5, r5, r5, lsr #16\n\t"
1282
        "add  r5, r5, #1\n\t"
1283
        "mul  r6, r5, r4\n\t"
1284
        "lsr  r5, r6, #27\n\t"
1285
        "ldrb r5, [%[t], r5]\n\t"
1286
#else
1287
        "clz  r5, %[d]\n\t"
1288
#endif
1289
        /* r5 = shift count, r6 = 31 - shift count. The bits of lo that move
         * into hi are shifted right by r6 and then by 1 more in the 'orr'. */
        "rsb  r6, r5, #31\n\t"
1290
        "lsl  %[d], %[d], r5\n\t"
1291
        "lsl  %[hi], %[hi], r5\n\t"
1292
        "lsr  r9, %[lo], r6\n\t"
1293
        "lsl  %[lo], %[lo], r5\n\t"
1294
        "orr  %[hi], %[hi], r9, lsr #1\n\t"
1295
1296
        /* r5 = (d >> 1) + 1: value compared against in each step.
         * r6 = working copy of lo, r9 = working copy of hi (remainder). */
        "lsr  r5, %[d], #1\n\t"
1297
        "add  r5, r5, #1\n\t"
1298
        "mov  r6, %[lo]\n\t"
1299
        "mov  r9, %[hi]\n\t"
1300
        /* Do top 32 */
1301
        /* r8 = all ones when r9 > r5, otherwise zero. The mask adds the next
         * quotient bit to r and selects whether r5 is subtracted from r9. */
        "subs r8, r5, r9\n\t"
1302
        "sbc  r8, r8, r8\n\t"
1303
        "add  %[r], %[r], %[r]\n\t"
1304
        "sub  %[r], %[r], r8\n\t"
1305
        "and  r8, r8, r5\n\t"
1306
        "subs r9, r9, r8\n\t"
1307
        /* Next 30 bits */
1308
        /* r4 counts down from 29 to 0; loop runs while result is not negative. */
        "mov  r4, #29\n\t"
1309
        "\n1:\n\t"
1310
        /* Shift the next bit of r6 into the bottom of r9. */
        "movs r6, r6, lsl #1\n\t"
1311
        "adc  r9, r9, r9\n\t"
1312
        "subs r8, r5, r9\n\t"
1313
        "sbc  r8, r8, r8\n\t"
1314
        "add  %[r], %[r], %[r]\n\t"
1315
        "sub  %[r], %[r], r8\n\t"
1316
        "and  r8, r8, r5\n\t"
1317
        "subs r9, r9, r8\n\t"
1318
        "subs r4, r4, #1\n\t"
1319
        "bpl  1b\n\t"
1320
1321
        /* r = 2 * r + 1 */
        "add  %[r], %[r], %[r]\n\t"
1322
        "add  %[r], %[r], #1\n\t"
1323
1324
        /* Handle the case where the difference has a high word > 0:
         * add the high word of (hi | lo) - r * d to r. Done twice. */
1325
        "umull  r4, r5, %[r], %[d]\n\t"
1326
        "subs r4, %[lo], r4\n\t"
1327
        "sbc  r5, %[hi], r5\n\t"
1328
        "add  %[r], %[r], r5\n\t"
1329
        "umull  r4, r5, %[r], %[d]\n\t"
1330
        "subs r4, %[lo], r4\n\t"
1331
        "sbc  r5, %[hi], r5\n\t"
1332
        "add  %[r], %[r], r5\n\t"
1333
1334
        /* Add 1 to result if bottom half of difference is >= d.
         * First mask: difference > d. Second mask: difference == d. */
1335
        "mul  r4, %[r], %[d]\n\t"
1336
        "subs r4, %[lo], r4\n\t"
1337
        "subs r9, %[d], r4\n\t"
1338
        "sbc  r8, r8, r8\n\t"
1339
        "sub  %[r], %[r], r8\n\t"
1340
        "subs r9, r9, #1\n\t"
1341
        "sbc  r8, r8, r8\n\t"
1342
        "sub  %[r], %[r], r8\n\t"
1343
        : [r] "+r" (r), [hi] "+r" (hi), [lo] "+r" (lo), [d] "+r" (d)
1344
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
1345
        : [t] "r" (debruijn32), [m] "m" (debruijn32_mul)
1346
#else
1347
        :
1348
#endif
1349
        : "r4", "r5", "r6", "r8", "r9", "cc"
1350
    );
1351
1352
    return r;
1353
}
1354
#else
1355
/* Divide a two digit number by a digit number and return. (hi | lo) / d
 *
 * Using udiv instruction on arm32
 * Constant time.
 * NOTE(review): the sequence below has no data dependent branches, but whether
 * 'udiv' itself executes in constant time depends on the CPU - confirm for
 * the target core.
 *
 * Outline of the assembly:
 *   1. If the top 8 bits of d are all zero, shift d and (hi | lo) left by 8.
 *   2. r5 = (d >> 16) + 1 is used as the divisor for the estimates.
 *   3. Four estimate steps: divide the current top of the remainder by r5,
 *      add the (possibly shifted) result to the quotient in r6 and subtract
 *      estimate * d from the remainder.
 *   4. Final step: divide the remaining low word by d and add the result
 *      to the quotient.
 *
 * hi, lo and d are read-write operands of the assembly; the quotient is
 * written into hi and returned.
 *
 * @param  [in]  hi  SP integer digit. High digit of the dividend.
 * @param  [in]  lo  SP integer digit. Lower digit of the dividend.
 * @param  [in]  d   SP integer digit. Number to divide by.
 * @return  The division result.
 */
1365
static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
1366
                                          sp_int_digit d)
1367
{
1368
    __asm__ __volatile__ (
1369
        /* r3 = 8 when the top byte of d is zero, otherwise 0. */
        "lsrs r3, %[d], #24\n\t"
1370
  "it eq\n\t"
1371
        "moveq  r3, #8\n\t"
1372
  "it ne\n\t"
1373
        "movne  r3, #0\n\t"
1374
        /* Shift d and (hi | lo) left by r3. The bits of lo moving into hi are
         * shifted right by (31 - r3) and then by 1 more in the 'orr'. */
        "rsb  r4, r3, #31\n\t"
1375
        "lsl  %[d], %[d], r3\n\t"
1376
        "lsl  %[hi], %[hi], r3\n\t"
1377
        "lsr  r5, %[lo], r4\n\t"
1378
        "lsl  %[lo], %[lo], r3\n\t"
1379
        "orr  %[hi], %[hi], r5, lsr #1\n\t"
1380
1381
        /* r5 = (d >> 16) + 1 */
        "lsr  r5, %[d], 16\n\t"
1382
        "add  r5, r5, 1\n\t"
1383
1384
        /* Estimate 1: r6 = (hi / r5) << 16; remainder -= d * r6. */
        "udiv r3, %[hi], r5\n\t"
1385
        "lsl  r6, r3, 16\n\t"
1386
        "umull  r4, r3, %[d], r6\n\t"
1387
        "subs %[lo], %[lo], r4\n\t"
1388
        "sbc  %[hi], %[hi], r3\n\t"
1389
1390
        /* Estimate 2: same again on the reduced remainder; add into r6. */
        "udiv r3, %[hi], r5\n\t"
1391
        "lsl  r3, r3, 16\n\t"
1392
        "add  r6, r6, r3\n\t"
1393
        "umull  r4, r3, %[d], r3\n\t"
1394
        "subs %[lo], %[lo], r4\n\t"
1395
        "sbc  %[hi], %[hi], r3\n\t"
1396
1397
        /* Estimate 3: divide the middle 32 bits of the remainder
         * ((hi << 16) | (lo >> 16)) by r5; result is added unshifted. */
        "lsr  r3, %[lo], 16\n\t"
1398
        "orr  r3, r3, %[hi], lsl 16\n\t"
1399
1400
        "udiv r3, r3, r5\n\t"
1401
        "add  r6, r6, r3\n\t"
1402
        "umull  r4, r3, %[d], r3\n\t"
1403
        "subs %[lo], %[lo], r4\n\t"
1404
        "sbc  %[hi], %[hi], r3\n\t"
1405
1406
        /* Estimate 4: as estimate 3, but only the low word of the remainder
         * is updated. */
        "lsr  r3, %[lo], 16\n\t"
1407
        "orr  r3, r3, %[hi], lsl 16\n\t"
1408
1409
        "udiv r3, r3, r5\n\t"
1410
        "add  r6, r6, r3\n\t"
1411
        "mul  r4, %[d], r3\n\t"
1412
        "sub  %[lo], %[lo], r4\n\t"
1413
1414
        /* Final: quotient = r6 + (lo / d), stored in hi. */
        "udiv r3, %[lo], %[d]\n\t"
1415
        "add  %[hi], r6, r3\n\t"
1416
1417
        : [hi] "+r" (hi), [lo] "+r" (lo), [d] "+r" (d)
1418
        :
1419
        : "r3", "r4", "r5", "r6", "cc"
1420
    );
1421
1422
    return hi;
1423
}
1424
#endif
1425
1426
#define SP_ASM_DIV_WORD
1427
#endif
1428
1429
#define SP_INT_ASM_AVAILABLE
1430
1431
    #endif /* (WOLFSSL_SP_ARM32 || ARM_CORTEX_M) && SP_WORD_SIZE == 32 */
1432
1433
    #if defined(WOLFSSL_SP_ARM_THUMB) && SP_WORD_SIZE == 32
1434
/*
1435
 * CPU: ARM Thumb (like Cortex-M0)
1436
 */
1437
1438
/* Compile with -fomit-frame-pointer, or similar, if compiler complains about
1439
 * usage of register 'r7'.
1440
 */
1441
1442
#if defined(__clang__)
1443
1444
/* Multiply va by vb and store double size result in: vh | vl
 *
 * Each operand is split into 16-bit halves (ah | al, bh | bl) and the four
 * partial products are formed with 'muls' and summed:
 *   va * vb = (ah * bh) << 32  +  (al * bh + ah * bl) << 16  +  al * bl
 *
 * vl and vh are declared read-write ("+l") but their incoming values are
 * overwritten before being read.
 * NOTE(review): presumably "+l" is used so that the outputs never share a
 * register with va or vb - confirm.
 *
 * Scratch registers: r4, r5 (zero for carry propagation), r6.
 */
1445
#define SP_ASM_MUL(vl, vh, va, vb)                       \
1446
    __asm__ __volatile__ (                               \
1447
        /* al * bl */                                    \
1448
        "uxth r6, %[a]    \n\t"            \
1449
        "uxth %[l], %[b]    \n\t"            \
1450
        "muls %[l], r6    \n\t"            \
1451
        /* al * bh */                                    \
1452
        "lsrs r4, %[b], #16   \n\t"            \
1453
        "muls r6, r4      \n\t"            \
1454
        "lsrs %[h], r6, #16   \n\t"            \
1455
        "lsls r6, r6, #16   \n\t"            \
1456
        "adds %[l], %[l], r6    \n\t"            \
1457
        "movs r5, #0      \n\t"            \
1458
        "adcs %[h], r5    \n\t"            \
1459
        /* ah * bh */                                    \
1460
        "lsrs r6, %[a], #16   \n\t"            \
1461
        "muls r4, r6      \n\t"            \
1462
        "adds %[h], %[h], r4    \n\t"            \
1463
        /* ah * bl */                                    \
1464
        "uxth r4, %[b]    \n\t"            \
1465
        "muls r6, r4      \n\t"            \
1466
        "lsrs r4, r6, #16   \n\t"            \
1467
        "lsls r6, r6, #16   \n\t"            \
1468
        "adds %[l], %[l], r6    \n\t"            \
1469
        "adcs %[h], r4    \n\t"            \
1470
        : [h] "+l" (vh), [l] "+l" (vl)                   \
1471
        : [a] "l" (va), [b] "l" (vb)                     \
1472
        : "r4", "r5", "r6", "cc"                         \
1473
    )
1474
/* Multiply va by vb and store double size result in: vo | vh | vl
 *
 * Same partial product scheme as a 32x32 multiply built from 16-bit halves
 * (ah | al, bh | bl). vo is set to zero by the assembly and is also used as
 * the zero operand when adding the carry into vh.
 *
 * vl, vh and vo are declared read-write ("+l") but their incoming values are
 * overwritten before being read.
 *
 * Scratch registers: r5, r6.
 */
1475
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb)               \
1476
    __asm__ __volatile__ (                               \
1477
        /* al * bl */                                    \
1478
        "uxth r6, %[a]    \n\t"            \
1479
        "uxth %[l], %[b]    \n\t"            \
1480
        "muls %[l], r6    \n\t"            \
1481
        /* al * bh */                                    \
1482
        "lsrs r5, %[b], #16   \n\t"            \
1483
        "muls r6, r5      \n\t"            \
1484
        "lsrs %[h], r6, #16   \n\t"            \
1485
        "lsls r6, r6, #16   \n\t"            \
1486
        "adds %[l], %[l], r6    \n\t"            \
1487
        "movs %[o], #0    \n\t"            \
1488
        "adcs %[h], %[o]    \n\t"            \
1489
        /* ah * bh */                                    \
1490
        "lsrs r6, %[a], #16   \n\t"            \
1491
        "muls r5, r6      \n\t"            \
1492
        "adds %[h], %[h], r5    \n\t"            \
1493
        /* ah * bl */                                    \
1494
        "uxth r5, %[b]    \n\t"            \
1495
        "muls r6, r5      \n\t"            \
1496
        "lsrs r5, r6, #16   \n\t"            \
1497
        "lsls r6, r6, #16   \n\t"            \
1498
        "adds %[l], %[l], r6    \n\t"            \
1499
        "adcs %[h], r5    \n\t"            \
1500
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
1501
        : [a] "l" (va), [b] "l" (vb)                     \
1502
        : "r5", "r6", "cc"                               \
1503
    )
1504
#if !defined(WOLFSSL_SP_SMALL) && !defined(DEBUG)
1505
/* Multiply va by vb and add double size result into: vo | vh | vl
 *
 * Variant used when neither WOLFSSL_SP_SMALL nor DEBUG is defined.
 * Operands are split into 16-bit halves (ah | al, bh | bl); each of the four
 * partial products is added into the accumulator at its position and the
 * carry is rippled up into vo after every addition.
 *
 * Scratch registers: r5 (held at zero for carry propagation), r6, r7.
 * This variant uses r7 - see the note on -fomit-frame-pointer earlier in
 * this section if the compiler complains.
 */
1506
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb)               \
1507
    __asm__ __volatile__ (                               \
1508
        /* al * bl */                                    \
1509
        "uxth r6, %[a]    \n\t"            \
1510
        "uxth r7, %[b]    \n\t"            \
1511
        "muls r7, r6      \n\t"            \
1512
        "adds %[l], %[l], r7    \n\t"            \
1513
        "movs r5, #0      \n\t"            \
1514
        "adcs %[h], r5    \n\t"            \
1515
        "adcs %[o], r5    \n\t"            \
1516
        /* al * bh */                                    \
1517
        "lsrs r7, %[b], #16   \n\t"            \
1518
        "muls r6, r7      \n\t"            \
1519
        "lsrs r7, r6, #16   \n\t"            \
1520
        "lsls r6, r6, #16   \n\t"            \
1521
        "adds %[l], %[l], r6    \n\t"            \
1522
        "adcs %[h], r7    \n\t"            \
1523
        "adcs %[o], r5    \n\t"            \
1524
        /* ah * bh */                                    \
1525
        "lsrs r6, %[a], #16   \n\t"            \
1526
        "lsrs r7, %[b], #16   \n\t"            \
1527
        "muls r7, r6      \n\t"            \
1528
        "adds %[h], %[h], r7    \n\t"            \
1529
        "adcs %[o], r5    \n\t"            \
1530
        /* ah * bl */                                    \
1531
        "uxth r7, %[b]    \n\t"            \
1532
        "muls r6, r7      \n\t"            \
1533
        "lsrs r7, r6, #16   \n\t"            \
1534
        "lsls r6, r6, #16   \n\t"            \
1535
        "adds %[l], %[l], r6    \n\t"            \
1536
        "adcs %[h], r7    \n\t"            \
1537
        "adcs %[o], r5    \n\t"            \
1538
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
1539
        : [a] "l" (va), [b] "l" (vb)                     \
1540
        : "r5", "r6", "r7", "cc"                         \
1541
    )
1542
#else
1543
/* Multiply va by vb and add double size result into: vo | vh | vl
 *
 * Variant used when WOLFSSL_SP_SMALL or DEBUG is defined. It needs only two
 * scratch registers (r5, r6) and does not use r7: r5 holds a partial product
 * operand and is reloaded with zero each time a carry has to be added
 * into vo.
 *
 * Operands are split into 16-bit halves (ah | al, bh | bl); each of the four
 * partial products is added into the accumulator at its position.
 */
1544
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb)               \
1545
    __asm__ __volatile__ (                               \
1546
        /* al * bl */                                    \
1547
        "uxth r6, %[a]    \n\t"            \
1548
        "uxth r5, %[b]    \n\t"            \
1549
        "muls r5, r6      \n\t"            \
1550
        "adds %[l], %[l], r5    \n\t"            \
1551
        "movs r5, #0      \n\t"            \
1552
        "adcs %[h], r5    \n\t"            \
1553
        "adcs %[o], r5    \n\t"            \
1554
        /* al * bh */                                    \
1555
        "lsrs r5, %[b], #16   \n\t"            \
1556
        "muls r6, r5      \n\t"            \
1557
        "lsrs r5, r6, #16   \n\t"            \
1558
        "lsls r6, r6, #16   \n\t"            \
1559
        "adds %[l], %[l], r6    \n\t"            \
1560
        "adcs %[h], r5    \n\t"            \
1561
        "movs r5, #0      \n\t"            \
1562
        "adcs %[o], r5    \n\t"            \
1563
        /* ah * bh */                                    \
1564
        "lsrs r6, %[a], #16   \n\t"            \
1565
        "lsrs r5, %[b], #16   \n\t"            \
1566
        "muls r5, r6      \n\t"            \
1567
        "adds %[h], %[h], r5    \n\t"            \
1568
        "movs r5, #0      \n\t"            \
1569
        "adcs %[o], r5    \n\t"            \
1570
        /* ah * bl */                                    \
1571
        "uxth r5, %[b]    \n\t"            \
1572
        "muls r6, r5      \n\t"            \
1573
        "lsrs r5, r6, #16   \n\t"            \
1574
        "lsls r6, r6, #16   \n\t"            \
1575
        "adds %[l], %[l], r6    \n\t"            \
1576
        "adcs %[h], r5    \n\t"            \
1577
        "movs r5, #0      \n\t"            \
1578
        "adcs %[o], r5    \n\t"            \
1579
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
1580
        : [a] "l" (va), [b] "l" (vb)                     \
1581
        : "r5", "r6", "cc"                               \
1582
    )
1583
#endif
1584
/* Multiply va by vb and add double size result into: vh | vl
 *
 * Operands are split into 16-bit halves (ah | al, bh | bl); each of the four
 * partial products is added into vh | vl at its position. There is no third
 * word: a carry out of vh is not recorded.
 *
 * Scratch registers: r4, r5 (zero for carry propagation), r6.
 */
1585
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb)                \
1586
    __asm__ __volatile__ (                               \
1587
        /* al * bl */                                    \
1588
        "uxth r6, %[a]    \n\t"            \
1589
        "uxth r4, %[b]    \n\t"            \
1590
        "muls r4, r6      \n\t"            \
1591
        "adds %[l], %[l], r4    \n\t"            \
1592
        "movs r5, #0      \n\t"            \
1593
        "adcs %[h], r5    \n\t"            \
1594
        /* al * bh */                                    \
1595
        "lsrs r4, %[b], #16   \n\t"            \
1596
        "muls r6, r4      \n\t"            \
1597
        "lsrs r4, r6, #16   \n\t"            \
1598
        "lsls r6, r6, #16   \n\t"            \
1599
        "adds %[l], %[l], r6    \n\t"            \
1600
        "adcs %[h], r4    \n\t"            \
1601
        /* ah * bh */                                    \
1602
        "lsrs r6, %[a], #16   \n\t"            \
1603
        "lsrs r4, %[b], #16   \n\t"            \
1604
        "muls r4, r6      \n\t"            \
1605
        "adds %[h], %[h], r4    \n\t"            \
1606
        /* ah * bl */                                    \
1607
        "uxth r4, %[b]    \n\t"            \
1608
        "muls r6, r4      \n\t"            \
1609
        "lsrs r4, r6, #16   \n\t"            \
1610
        "lsls r6, r6, #16   \n\t"            \
1611
        "adds %[l], %[l], r6    \n\t"            \
1612
        "adcs %[h], r4    \n\t"            \
1613
        : [l] "+l" (vl), [h] "+l" (vh)                   \
1614
        : [a] "l" (va), [b] "l" (vb)                     \
1615
        : "r4", "r5", "r6", "cc"                         \
1616
    )
1617
#if !defined(WOLFSSL_SP_SMALL) && !defined(DEBUG)
1618
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 *
 * Variant used when neither WOLFSSL_SP_SMALL nor DEBUG is defined.
 * Operands are split into 16-bit halves (ah | al, bh | bl). Each of the four
 * partial products is computed once and then added into the accumulator two
 * times, with the carry rippled up into vo after every addition.
 *
 * Scratch registers: r5 (held at zero for carry propagation), r6, r7.
 * This variant uses r7 - see the note on -fomit-frame-pointer earlier in
 * this section if the compiler complains.
 */
1619
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb)              \
1620
    __asm__ __volatile__ (                               \
1621
        /* al * bl */                                    \
1622
        "uxth r6, %[a]    \n\t"            \
1623
        "uxth r7, %[b]    \n\t"            \
1624
        "muls r7, r6      \n\t"            \
1625
        "adds %[l], %[l], r7    \n\t"            \
1626
        "movs r5, #0      \n\t"            \
1627
        "adcs %[h], r5    \n\t"            \
1628
        "adcs %[o], r5    \n\t"            \
1629
        "adds %[l], %[l], r7    \n\t"            \
1630
        "adcs %[h], r5    \n\t"            \
1631
        "adcs %[o], r5    \n\t"            \
1632
        /* al * bh */                                    \
1633
        "lsrs r7, %[b], #16   \n\t"            \
1634
        "muls r6, r7      \n\t"            \
1635
        "lsrs r7, r6, #16   \n\t"            \
1636
        "lsls r6, r6, #16   \n\t"            \
1637
        "adds %[l], %[l], r6    \n\t"            \
1638
        "adcs %[h], r7    \n\t"            \
1639
        "adcs %[o], r5    \n\t"            \
1640
        "adds %[l], %[l], r6    \n\t"            \
1641
        "adcs %[h], r7    \n\t"            \
1642
        "adcs %[o], r5    \n\t"            \
1643
        /* ah * bh */                                    \
1644
        "lsrs r6, %[a], #16   \n\t"            \
1645
        "lsrs r7, %[b], #16   \n\t"            \
1646
        "muls r7, r6      \n\t"            \
1647
        "adds %[h], %[h], r7    \n\t"            \
1648
        "adcs %[o], r5    \n\t"            \
1649
        "adds %[h], %[h], r7    \n\t"            \
1650
        "adcs %[o], r5    \n\t"            \
1651
        /* ah * bl */                                    \
1652
        "uxth r7, %[b]    \n\t"            \
1653
        "muls r6, r7      \n\t"            \
1654
        "lsrs r7, r6, #16   \n\t"            \
1655
        "lsls r6, r6, #16   \n\t"            \
1656
        "adds %[l], %[l], r6    \n\t"            \
1657
        "adcs %[h], r7    \n\t"            \
1658
        "adcs %[o], r5    \n\t"            \
1659
        "adds %[l], %[l], r6    \n\t"            \
1660
        "adcs %[h], r7    \n\t"            \
1661
        "adcs %[o], r5    \n\t"            \
1662
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
1663
        : [a] "l" (va), [b] "l" (vb)                     \
1664
        : "r5", "r6", "r7", "cc"                         \
1665
    )
1666
#else
1667
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 *
 * Variant used when WOLFSSL_SP_SMALL or DEBUG is defined. Only r5, r6 and r8
 * are clobbered and r7 is not used. To manage with two low scratch registers,
 * va is saved in r8 and the register holding va doubles as the zero operand
 * for carry propagation. va is reloaded from r8 when its high half is needed
 * and again before the assembly ends, so the caller sees va unchanged.
 *
 * Operands are split into 16-bit halves (ah | al, bh | bl). Each of the four
 * partial products is computed once and added into the accumulator two times,
 * with the carry rippled up into vo after every addition.
 *
 * Transfers to and from r8 use 'mov' rather than 'movs': the flag-setting
 * form only accepts low registers in the 16-bit Thumb instruction set, and
 * 'mov' leaves the flags untouched.
 */
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb)              \
    __asm__ __volatile__ (                               \
        /* Save va - its register is reused as zero. */  \
        "mov  r8, %[a]    \n\t"            \
        /* al * bl */                                    \
        "uxth r6, %[a]    \n\t"            \
        "uxth r5, %[b]    \n\t"            \
        "muls r5, r6      \n\t"            \
        "adds %[l], %[l], r5    \n\t"            \
        "movs %[a], #0    \n\t"            \
        "adcs %[h], %[a]    \n\t"            \
        "adcs %[o], %[a]    \n\t"            \
        "adds %[l], %[l], r5    \n\t"            \
        "adcs %[h], %[a]    \n\t"            \
        "adcs %[o], %[a]    \n\t"            \
        /* al * bh */                                    \
        "lsrs r5, %[b], #16   \n\t"            \
        "muls r6, r5      \n\t"            \
        "lsrs r5, r6, #16   \n\t"            \
        "lsls r6, r6, #16   \n\t"            \
        "adds %[l], %[l], r6    \n\t"            \
        "adcs %[h], r5    \n\t"            \
        "adcs %[o], %[a]    \n\t"            \
        "adds %[l], %[l], r6    \n\t"            \
        "adcs %[h], r5    \n\t"            \
        "adcs %[o], %[a]    \n\t"            \
        /* ah * bh */                                    \
        /* Reload va to get at its high half. */         \
        "mov  %[a], r8    \n\t"            \
        "lsrs r6, %[a], #16   \n\t"            \
        "lsrs r5, %[b], #16   \n\t"            \
        "muls r5, r6      \n\t"            \
        "adds %[h], %[h], r5    \n\t"            \
        "movs %[a], #0    \n\t"            \
        "adcs %[o], %[a]    \n\t"            \
        "adds %[h], %[h], r5    \n\t"            \
        "adcs %[o], %[a]    \n\t"            \
        /* ah * bl */                                    \
        "uxth r5, %[b]    \n\t"            \
        "muls r6, r5      \n\t"            \
        "lsrs r5, r6, #16   \n\t"            \
        "lsls r6, r6, #16   \n\t"            \
        "adds %[l], %[l], r6    \n\t"            \
        "adcs %[h], r5    \n\t"            \
        "adcs %[o], %[a]    \n\t"            \
        "adds %[l], %[l], r6    \n\t"            \
        "adcs %[h], r5    \n\t"            \
        "adcs %[o], %[a]    \n\t"            \
        /* Restore va for the caller. */                 \
        "mov  %[a], r8    \n\t"            \
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
        : [a] "l" (va), [b] "l" (vb)                     \
        : "r5", "r6", "r8", "cc"                         \
    )
1719
#endif
1720
#ifndef DEBUG
1721
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 * Assumes first add will not overflow vh | vl
 *
 * Variant used when DEBUG is not defined.
 * Operands are split into 16-bit halves (ah | al, bh | bl). Each of the four
 * partial products is computed once and added into the accumulator two times.
 * In line with the assumption above, the carry into vo is not added after the
 * two al * bl additions nor after the first al * bh addition.
 *
 * Scratch registers: r5 (held at zero for carry propagation), r6, r7.
 * This variant uses r7 - see the note on -fomit-frame-pointer earlier in
 * this section if the compiler complains.
 */
1724
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)           \
1725
    __asm__ __volatile__ (                               \
1726
        /* al * bl */                                    \
1727
        "uxth r6, %[a]    \n\t"            \
1728
        "uxth r7, %[b]    \n\t"            \
1729
        "muls r7, r6      \n\t"            \
1730
        "adds %[l], %[l], r7    \n\t"            \
1731
        "movs r5, #0      \n\t"            \
1732
        "adcs %[h], r5    \n\t"            \
1733
        "adds %[l], %[l], r7    \n\t"            \
1734
        "adcs %[h], r5    \n\t"            \
1735
        /* al * bh */                                    \
1736
        "lsrs r7, %[b], #16   \n\t"            \
1737
        "muls r6, r7      \n\t"            \
1738
        "lsrs r7, r6, #16   \n\t"            \
1739
        "lsls r6, r6, #16   \n\t"            \
1740
        "adds %[l], %[l], r6    \n\t"            \
1741
        "adcs %[h], r7    \n\t"            \
1742
        "adds %[l], %[l], r6    \n\t"            \
1743
        "adcs %[h], r7    \n\t"            \
1744
        "adcs %[o], r5    \n\t"            \
1745
        /* ah * bh */                                    \
1746
        "lsrs r6, %[a], #16   \n\t"            \
1747
        "lsrs r7, %[b], #16   \n\t"            \
1748
        "muls r7, r6      \n\t"            \
1749
        "adds %[h], %[h], r7    \n\t"            \
1750
        "adcs %[o], r5    \n\t"            \
1751
        "adds %[h], %[h], r7    \n\t"            \
1752
        "adcs %[o], r5    \n\t"            \
1753
        /* ah * bl */                                    \
1754
        "uxth r7, %[b]    \n\t"            \
1755
        "muls r6, r7      \n\t"            \
1756
        "lsrs r7, r6, #16   \n\t"            \
1757
        "lsls r6, r6, #16   \n\t"            \
1758
        "adds %[l], %[l], r6    \n\t"            \
1759
        "adcs %[h], r7    \n\t"            \
1760
        "adcs %[o], r5    \n\t"            \
1761
        "adds %[l], %[l], r6    \n\t"            \
1762
        "adcs %[h], r7    \n\t"            \
1763
        "adcs %[o], r5    \n\t"            \
1764
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
1765
        : [a] "l" (va), [b] "l" (vb)                     \
1766
        : "r5", "r6", "r7", "cc"                         \
1767
    )
1768
#else
1769
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 * Assumes first add will not overflow vh | vl
 *
 * Variant used when DEBUG is defined. Only r5, r6 and r8 are clobbered and
 * r7 is not used. To manage with two low scratch registers, va is saved in r8
 * and the register holding va doubles as the zero operand for carry
 * propagation. va is reloaded from r8 when its high half is needed and again
 * before the assembly ends, so the caller sees va unchanged.
 *
 * Operands are split into 16-bit halves (ah | al, bh | bl). Each of the four
 * partial products is computed once and added into the accumulator two times.
 * In line with the assumption above, the carry into vo is not added after the
 * two al * bl additions nor after the first al * bh addition.
 *
 * Transfers to and from r8 use 'mov' rather than 'movs': the flag-setting
 * form only accepts low registers in the 16-bit Thumb instruction set, and
 * 'mov' leaves the flags untouched.
 */
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)           \
    __asm__ __volatile__ (                               \
        /* Save va - its register is reused as zero. */  \
        "mov  r8, %[a]    \n\t"            \
        /* al * bl */                                    \
        "uxth r5, %[a]    \n\t"            \
        "uxth r6, %[b]    \n\t"            \
        "muls r6, r5      \n\t"            \
        "adds %[l], %[l], r6    \n\t"            \
        "movs %[a], #0    \n\t"            \
        "adcs %[h], %[a]    \n\t"            \
        "adds %[l], %[l], r6    \n\t"            \
        "adcs %[h], %[a]    \n\t"            \
        /* al * bh */                                    \
        "lsrs r6, %[b], #16   \n\t"            \
        "muls r5, r6      \n\t"            \
        "lsrs r6, r5, #16   \n\t"            \
        "lsls r5, r5, #16   \n\t"            \
        "adds %[l], %[l], r5    \n\t"            \
        "adcs %[h], r6    \n\t"            \
        "adds %[l], %[l], r5    \n\t"            \
        "adcs %[h], r6    \n\t"            \
        "adcs %[o], %[a]    \n\t"            \
        /* ah * bh */                                    \
        /* Reload va to get at its high half. */         \
        "mov  %[a], r8    \n\t"            \
        "lsrs r5, %[a], #16   \n\t"            \
        "lsrs r6, %[b], #16   \n\t"            \
        "muls r6, r5      \n\t"            \
        "movs %[a], #0    \n\t"            \
        "adds %[h], %[h], r6    \n\t"            \
        "adcs %[o], %[a]    \n\t"            \
        "adds %[h], %[h], r6    \n\t"            \
        "adcs %[o], %[a]    \n\t"            \
        /* ah * bl */                                    \
        "uxth r6, %[b]    \n\t"            \
        "muls r5, r6      \n\t"            \
        "lsrs r6, r5, #16   \n\t"            \
        "lsls r5, r5, #16   \n\t"            \
        "adds %[l], %[l], r5    \n\t"            \
        "adcs %[h], r6    \n\t"            \
        "adcs %[o], %[a]    \n\t"            \
        "adds %[l], %[l], r5    \n\t"            \
        "adcs %[h], r6    \n\t"            \
        "adcs %[o], %[a]    \n\t"            \
        /* Restore va for the caller. */                 \
        "mov  %[a], r8    \n\t"            \
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
        : [a] "l" (va), [b] "l" (vb)                     \
        : "r5", "r6", "r8", "cc"                         \
    )
1820
#endif
1821
/* Square va and store double size result in: vh | vl
 *
 * va is split into 16-bit halves (ah | al):
 *   va * va = (ah * ah) << 32  +  (al * ah) << 17  +  al * al
 * The cross product al * ah is computed once; shifting it left by 17 (low
 * word) and right by 15 (high word) supplies the factor of two.
 *
 * vl and vh are declared read-write ("+l") but their incoming values are
 * overwritten before being read.
 *
 * Scratch registers: r5, r6.
 */
1822
#define SP_ASM_SQR(vl, vh, va)                           \
1823
    __asm__ __volatile__ (                               \
1824
        "lsrs r5, %[a], #16   \n\t"            \
1825
        "uxth r6, %[a]    \n\t"            \
1826
        "mov  %[l], r6    \n\t"            \
1827
        "mov  %[h], r5    \n\t"            \
1828
        /* al * al */                                    \
1829
        "muls %[l], %[l]    \n\t"            \
1830
        /* ah * ah */                                    \
1831
        "muls %[h], %[h]    \n\t"            \
1832
        /* 2 * al * ah */                                \
1833
        "muls r6, r5      \n\t"            \
1834
        "lsrs r5, r6, #15   \n\t"            \
1835
        "lsls r6, r6, #17   \n\t"            \
1836
        "adds %[l], %[l], r6    \n\t"            \
1837
        "adcs %[h], r5    \n\t"            \
1838
        : [h] "+l" (vh), [l] "+l" (vl)                   \
1839
        : [a] "l" (va)                                   \
1840
        : "r5", "r6", "cc"                               \
1841
    )
1842
/* Square va and add double size result into: vo | vh | vl
 *
 * va is split into 16-bit halves (ah | al):
 *   va * va = (ah * ah) << 32  +  (al * ah) << 17  +  al * al
 * al * al and ah * ah are added into vl and vh first, then the cross product
 * al * ah is added shifted left by 17 bits (right by 15 for the high word).
 * The carry is rippled up into vo after each of the two additions.
 *
 * Scratch registers: r4, r5 (held at zero for carry propagation), r6.
 */
1843
#define SP_ASM_SQR_ADD(vl, vh, vo, va)                   \
1844
    __asm__ __volatile__ (                               \
1845
        "lsrs r4, %[a], #16   \n\t"            \
1846
        "uxth r6, %[a]    \n\t"            \
1847
        /* al * al */                                    \
1848
        "muls r6, r6      \n\t"            \
1849
        /* ah * ah */                                    \
1850
        "muls r4, r4      \n\t"            \
1851
        "adds %[l], %[l], r6    \n\t"            \
1852
        "adcs %[h], r4    \n\t"            \
1853
        "movs r5, #0      \n\t"            \
1854
        "adcs %[o], r5    \n\t"            \
1855
        /* Reload the halves - r4 and r6 were overwritten by the squares. */ \
        "lsrs r4, %[a], #16   \n\t"            \
1856
        "uxth r6, %[a]    \n\t"            \
1857
        /* 2 * al * ah */                                \
1858
        "muls r6, r4      \n\t"            \
1859
        "lsrs r4, r6, #15   \n\t"            \
1860
        "lsls r6, r6, #17   \n\t"            \
1861
        "adds %[l], %[l], r6    \n\t"            \
1862
        "adcs %[h], r4    \n\t"            \
1863
        "adcs %[o], r5    \n\t"            \
1864
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
1865
        : [a] "l" (va)                                   \
1866
        : "r4", "r5", "r6", "cc"                         \
1867
    )
1868
/* Square va and add double size result into: vh | vl
 *
 * va is split into 16-bit halves (ah | al):
 *   va * va = (ah * ah) << 32  +  (al * ah) << 17  +  al * al
 * al * al and ah * ah are added into vl and vh first, then the cross product
 * al * ah is added shifted left by 17 bits (right by 15 for the high word).
 * There is no third word: a carry out of vh is not recorded.
 *
 * ah is kept in r5 and al in r6. The two halves must live in different
 * registers - loading both into the same register loses ah and produces a
 * wrong result.
 *
 * Scratch registers: r5, r6.
 */
#define SP_ASM_SQR_ADD_NO(vl, vh, va)                    \
    __asm__ __volatile__ (                               \
        "lsrs r5, %[a], #16   \n\t"            \
        "uxth r6, %[a]    \n\t"            \
        /* al * al */                                    \
        "muls r6, r6      \n\t"            \
        /* ah * ah */                                    \
        "muls r5, r5      \n\t"            \
        "adds %[l], %[l], r6    \n\t"            \
        "adcs %[h], r5    \n\t"            \
        /* Reload the halves - r5 and r6 were overwritten by the squares. */ \
        "lsrs r5, %[a], #16   \n\t"            \
        "uxth r6, %[a]    \n\t"            \
        /* 2 * al * ah */                                \
        "muls r6, r5      \n\t"            \
        "lsrs r5, r6, #15   \n\t"            \
        "lsls r6, r6, #17   \n\t"            \
        "adds %[l], %[l], r6    \n\t"            \
        "adcs %[h], r5    \n\t"            \
        : [l] "+l" (vl), [h] "+l" (vh)                   \
        : [a] "l" (va)                                   \
        : "r5", "r6", "cc"                               \
    )
1891
/* Add va into: vh | vl
 *
 * vl = vl + va, then the carry out of that addition is added to vh.
 * r5 is loaded with zero so that 'adcs' adds only the carry.
 * A carry out of vh is not recorded.
 *
 * Scratch register: r5.
 */
1892
#define SP_ASM_ADDC(vl, vh, va)                          \
1893
    __asm__ __volatile__ (                               \
1894
        "adds %[l], %[l], %[a]  \n\t"            \
1895
        "movs r5, #0      \n\t"            \
1896
        "adcs %[h], r5    \n\t"            \
1897
        : [l] "+l" (vl), [h] "+l" (vh)                   \
1898
        : [a] "l" (va)                                   \
1899
        : "r5", "cc"                                     \
1900
    )
1901
/* Sub va from: vh | vl
 *
 * Subtracts va from vl, then subtracts only the resulting borrow from vh by
 * subtracting a zeroed r5 with carry. Clobbers r5 and the condition flags.
 *
 * vl  [in,out] Low word of accumulator.
 * vh  [in,out] High word of accumulator.
 * va  [in]     Word to subtract.
 */
1902
#define SP_ASM_SUBB(vl, vh, va)                          \
1903
    __asm__ __volatile__ (                               \
1904
        "subs %[l], %[l], %[a]  \n\t"            \
1905
        "movs r5, #0      \n\t"            \
1906
        "sbcs %[h], r5    \n\t"            \
1907
        : [l] "+l" (vl), [h] "+l" (vh)                   \
1908
        : [a] "l" (va)                                   \
1909
        : "r5", "cc"                                     \
1910
    )
1911
/* Add two times vc | vb | va into vo | vh | vl
 *
 * The three-word value is added word by word with carry propagation
 * (vl, then vh, then vo), and the same sequence is then repeated a second
 * time. No scratch registers are used; only the condition flags are
 * clobbered.
 *
 * vl  [in,out] Low word of accumulator.
 * vh  [in,out] Middle word of accumulator.
 * vo  [in,out] High word of accumulator.
 * va  [in]     Low word of value to add twice.
 * vb  [in]     Middle word of value to add twice.
 * vc  [in]     High word of value to add twice.
 */
1912
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc)         \
1913
    __asm__ __volatile__ (                               \
1914
        "adds %[l], %[l], %[a]  \n\t"            \
1915
        "adcs %[h], %[b]    \n\t"            \
1916
        "adcs %[o], %[c]    \n\t"            \
1917
        "adds %[l], %[l], %[a]  \n\t"            \
1918
        "adcs %[h], %[b]    \n\t"            \
1919
        "adcs %[o], %[c]    \n\t"            \
1920
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
1921
        : [a] "l" (va), [b] "l" (vb), [c] "l" (vc)       \
1922
        : "cc"                                           \
1923
    )
1924
1925
#elif defined(WOLFSSL_KEIL)
1926
1927
/* Multiply va by vb and store double size result in: vh | vl
 *
 * KEIL variant (three-operand instruction forms). The product is assembled
 * from the four partial products of the 16-bit halves of va and vb
 * (al * bl, al * bh, ah * bh, ah * bl). Each cross product is split with
 * 16-bit shifts and added across vl and vh with carry. The previous values
 * of vl and vh are overwritten, although both are declared read-write.
 *
 * Clobbers r4, r5, r6 and the condition flags.
 *
 * vl  [out] Low word of product.
 * vh  [out] High word of product.
 * va  [in]  First multiplicand.
 * vb  [in]  Second multiplicand.
 */
1928
#define SP_ASM_MUL(vl, vh, va, vb)                       \
1929
    __asm__ __volatile__ (                               \
1930
        /* al * bl */                                    \
1931
        "uxth r6, %[a]    \n\t"            \
1932
        "uxth %[l], %[b]    \n\t"            \
1933
        "muls %[l], r6, %[l]    \n\t"            \
1934
        /* al * bh */                                    \
1935
        "lsrs r4, %[b], #16   \n\t"            \
1936
        "muls r6, r4, r6    \n\t"            \
1937
        "lsrs %[h], r6, #16   \n\t"            \
1938
        "lsls r6, r6, #16   \n\t"            \
1939
        "adds %[l], %[l], r6    \n\t"            \
1940
        "movs r5, #0      \n\t"            \
1941
        "adcs %[h], %[h], r5    \n\t"            \
1942
        /* ah * bh */                                    \
1943
        "lsrs r6, %[a], #16   \n\t"            \
1944
        "muls r4, r6, r4    \n\t"            \
1945
        "adds %[h], %[h], r4    \n\t"            \
1946
        /* ah * bl */                                    \
1947
        "uxth r4, %[b]    \n\t"            \
1948
        "muls r6, r4, r6    \n\t"            \
1949
        "lsrs r4, r6, #16   \n\t"            \
1950
        "lsls r6, r6, #16   \n\t"            \
1951
        "adds %[l], %[l], r6    \n\t"            \
1952
        "adcs %[h], %[h], r4    \n\t"            \
1953
        : [h] "+l" (vh), [l] "+l" (vl)                   \
1954
        : [a] "l" (va), [b] "l" (vb)                     \
1955
        : "r4", "r5", "r6", "cc"                         \
1956
    )
1957
/* Multiply va by vb and store double size result in: vo | vh | vl
 *
 * KEIL variant. Same four partial-product scheme as SP_ASM_MUL, but vo is
 * set to zero and doubles as the zero register for the first carry
 * propagation into vh. The previous values of vl, vh and vo are
 * overwritten.
 *
 * Clobbers r5, r6 and the condition flags.
 *
 * vl  [out] Low word of product.
 * vh  [out] High word of product.
 * vo  [out] Set to zero.
 * va  [in]  First multiplicand.
 * vb  [in]  Second multiplicand.
 */
1958
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb)               \
1959
    __asm__ __volatile__ (                               \
1960
        /* al * bl */                                    \
1961
        "uxth r6, %[a]    \n\t"            \
1962
        "uxth %[l], %[b]    \n\t"            \
1963
        "muls %[l], r6, %[l]    \n\t"            \
1964
        /* al * bh */                                    \
1965
        "lsrs r5, %[b], #16   \n\t"            \
1966
        "muls r6, r5, r6    \n\t"            \
1967
        "lsrs %[h], r6, #16   \n\t"            \
1968
        "lsls r6, r6, #16   \n\t"            \
1969
        "adds %[l], %[l], r6    \n\t"            \
1970
        "movs %[o], #0    \n\t"            \
1971
        "adcs %[h], %[h], %[o]  \n\t"            \
1972
        /* ah * bh */                                    \
1973
        "lsrs r6, %[a], #16   \n\t"            \
1974
        "muls r5, r6, r5    \n\t"            \
1975
        "adds %[h], %[h], r5    \n\t"            \
1976
        /* ah * bl */                                    \
1977
        "uxth r5, %[b]    \n\t"            \
1978
        "muls r6, r5, r6    \n\t"            \
1979
        "lsrs r5, r6, #16   \n\t"            \
1980
        "lsls r6, r6, #16   \n\t"            \
1981
        "adds %[l], %[l], r6    \n\t"            \
1982
        "adcs %[h], %[h], r5    \n\t"            \
1983
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
1984
        : [a] "l" (va), [b] "l" (vb)                     \
1985
        : "r5", "r6", "cc"                               \
1986
    )
1987
#if !defined(WOLFSSL_SP_SMALL) && !defined(DEBUG)
1988
/* Multiply va by vb and add double size result into: vo | vh | vl
 *
 * KEIL variant used when neither WOLFSSL_SP_SMALL nor DEBUG is defined.
 * Each of the four partial products of the 16-bit halves of va and vb is
 * added into the accumulator as soon as it is computed, with the carry
 * propagated up to vo. r5 is zeroed once and kept as the zero register;
 * r6 and r7 hold the halves and partial products.
 *
 * Clobbers r5, r6, r7 and the condition flags.
 *
 * vl  [in,out] Low word of accumulator.
 * vh  [in,out] Middle word of accumulator.
 * vo  [in,out] High word of accumulator.
 * va  [in]     First multiplicand.
 * vb  [in]     Second multiplicand.
 */
1989
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb)               \
1990
    __asm__ __volatile__ (                               \
1991
        /* al * bl */                                    \
1992
        "uxth r6, %[a]    \n\t"            \
1993
        "uxth r7, %[b]    \n\t"            \
1994
        "muls r7, r6, r7    \n\t"            \
1995
        "adds %[l], %[l], r7    \n\t"            \
1996
        "movs r5, #0      \n\t"            \
1997
        "adcs %[h], %[h], r5    \n\t"            \
1998
        "adcs %[o], %[o], r5    \n\t"            \
1999
        /* al * bh */                                    \
2000
        "lsrs r7, %[b], #16   \n\t"            \
2001
        "muls r6, r7, r6    \n\t"            \
2002
        "lsrs r7, r6, #16   \n\t"            \
2003
        "lsls r6, r6, #16   \n\t"            \
2004
        "adds %[l], %[l], r6    \n\t"            \
2005
        "adcs %[h], %[h], r7    \n\t"            \
2006
        "adcs %[o], %[o], r5    \n\t"            \
2007
        /* ah * bh */                                    \
2008
        "lsrs r6, %[a], #16   \n\t"            \
2009
        "lsrs r7, %[b], #16   \n\t"            \
2010
        "muls r7, r6, r7    \n\t"            \
2011
        "adds %[h], %[h], r7    \n\t"            \
2012
        "adcs %[o], %[o], r5    \n\t"            \
2013
        /* ah * bl */                                    \
2014
        "uxth r7, %[b]    \n\t"            \
2015
        "muls r6, r7, r6    \n\t"            \
2016
        "lsrs r7, r6, #16   \n\t"            \
2017
        "lsls r6, r6, #16   \n\t"            \
2018
        "adds %[l], %[l], r6    \n\t"            \
2019
        "adcs %[h], %[h], r7    \n\t"            \
2020
        "adcs %[o], %[o], r5    \n\t"            \
2021
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
2022
        : [a] "l" (va), [b] "l" (vb)                     \
2023
        : "r5", "r6", "r7", "cc"                         \
2024
    )
2025
#else
2026
/* Multiply va by vb and add double size result into: vo | vh | vl
 *
 * KEIL variant used when WOLFSSL_SP_SMALL or DEBUG is defined. Performs the
 * same four partial-product accumulation as the r7-based variant but uses
 * only r5 and r6 as scratch: r5 alternates between holding a half or
 * partial product and being re-zeroed before each carry into vo.
 *
 * Clobbers r5, r6 and the condition flags.
 *
 * vl  [in,out] Low word of accumulator.
 * vh  [in,out] Middle word of accumulator.
 * vo  [in,out] High word of accumulator.
 * va  [in]     First multiplicand.
 * vb  [in]     Second multiplicand.
 */
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb)               \
2027
    __asm__ __volatile__ (                               \
2028
        /* al * bl */                                    \
2029
        "uxth   r6, %[a]                \n\t"            \
2030
        "uxth   r5, %[b]                \n\t"            \
2031
        "muls   r5, r6, r5              \n\t"            \
2032
        "adds   %[l], %[l], r5          \n\t"            \
2033
        "movs   r5, #0                  \n\t"            \
2034
        "adcs   %[h], %[h], r5          \n\t"            \
2035
        "adcs   %[o], %[o], r5          \n\t"            \
2036
        /* al * bh */                                    \
2037
        "lsrs   r5, %[b], #16           \n\t"            \
2038
        "muls   r6, r5, r6              \n\t"            \
2039
        "lsrs   r5, r6, #16             \n\t"            \
2040
        "lsls   r6, r6, #16             \n\t"            \
2041
        "adds   %[l], %[l], r6          \n\t"            \
2042
        "adcs   %[h], %[h], r5          \n\t"            \
2043
        "movs   r5, #0                  \n\t"            \
2044
        "adcs   %[o], %[o], r5          \n\t"            \
2045
        /* ah * bh */                                    \
2046
        "lsrs   r6, %[a], #16           \n\t"            \
2047
        "lsrs   r5, %[b], #16           \n\t"            \
2048
        "muls   r5, r6, r5              \n\t"            \
2049
        "adds   %[h], %[h], r5          \n\t"            \
2050
        "movs   r5, #0                  \n\t"            \
2051
        "adcs   %[o], %[o], r5          \n\t"            \
2052
        /* ah * bl */                                    \
2053
        "uxth   r5, %[b]                \n\t"            \
2054
        "muls   r6, r5, r6              \n\t"            \
2055
        "lsrs   r5, r6, #16             \n\t"            \
2056
        "lsls   r6, r6, #16             \n\t"            \
2057
        "adds   %[l], %[l], r6          \n\t"            \
2058
        "adcs   %[h], %[h], r5          \n\t"            \
2059
        "movs   r5, #0                  \n\t"            \
2060
        "adcs   %[o], %[o], r5          \n\t"            \
2061
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
2062
        : [a] "l" (va), [b] "l" (vb)                     \
2063
        : "r5", "r6", "cc"                               \
2064
    )
2065
#endif
2066
/* Multiply va by vb and add double size result into: vh | vl
 *
 * KEIL variant. Accumulates the four partial products of the 16-bit halves
 * of va and vb into the two-word accumulator. There is no third word: any
 * carry out of vh is not recorded.
 *
 * Clobbers r4, r5, r6 and the condition flags.
 *
 * vl  [in,out] Low word of accumulator.
 * vh  [in,out] High word of accumulator.
 * va  [in]     First multiplicand.
 * vb  [in]     Second multiplicand.
 */
2067
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb)                \
2068
    __asm__ __volatile__ (                               \
2069
        /* al * bl */                                    \
2070
        "uxth r6, %[a]    \n\t"            \
2071
        "uxth r4, %[b]    \n\t"            \
2072
        "muls r4, r6, r4    \n\t"            \
2073
        "adds %[l], %[l], r4    \n\t"            \
2074
        "movs r5, #0      \n\t"            \
2075
        "adcs %[h], %[h], r5    \n\t"            \
2076
        /* al * bh */                                    \
2077
        "lsrs r4, %[b], #16   \n\t"            \
2078
        "muls r6, r4, r6    \n\t"            \
2079
        "lsrs r4, r6, #16   \n\t"            \
2080
        "lsls r6, r6, #16   \n\t"            \
2081
        "adds %[l], %[l], r6    \n\t"            \
2082
        "adcs %[h], %[h], r4    \n\t"            \
2083
        /* ah * bh */                                    \
2084
        "lsrs r6, %[a], #16   \n\t"            \
2085
        "lsrs r4, %[b], #16   \n\t"            \
2086
        "muls r4, r6, r4    \n\t"            \
2087
        "adds %[h], %[h], r4    \n\t"            \
2088
        /* ah * bl */                                    \
2089
        "uxth r4, %[b]    \n\t"            \
2090
        "muls r6, r4, r6    \n\t"            \
2091
        "lsrs r4, r6, #16   \n\t"            \
2092
        "lsls r6, r6, #16   \n\t"            \
2093
        "adds %[l], %[l], r6    \n\t"            \
2094
        "adcs %[h], %[h], r4    \n\t"            \
2095
        : [l] "+l" (vl), [h] "+l" (vh)                   \
2096
        : [a] "l" (va), [b] "l" (vb)                     \
2097
        : "r4", "r5", "r6", "cc"                         \
2098
    )
2099
#if !defined(WOLFSSL_SP_SMALL) && !defined(DEBUG)
2100
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 *
 * KEIL variant used when neither WOLFSSL_SP_SMALL nor DEBUG is defined.
 * Each of the four partial products of the 16-bit halves of va and vb is
 * computed once and then added into the accumulator two times, with the
 * carry propagated up to vo after every addition. r5 is zeroed once and
 * kept as the zero register.
 *
 * Clobbers r5, r6, r7 and the condition flags.
 *
 * vl  [in,out] Low word of accumulator.
 * vh  [in,out] Middle word of accumulator.
 * vo  [in,out] High word of accumulator.
 * va  [in]     First multiplicand.
 * vb  [in]     Second multiplicand.
 */
2101
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb)              \
2102
    __asm__ __volatile__ (                               \
2103
        /* al * bl */                                    \
2104
        "uxth r6, %[a]    \n\t"            \
2105
        "uxth r7, %[b]    \n\t"            \
2106
        "muls r7, r6, r7    \n\t"            \
2107
        "adds %[l], %[l], r7    \n\t"            \
2108
        "movs r5, #0      \n\t"            \
2109
        "adcs %[h], %[h], r5    \n\t"            \
2110
        "adcs %[o], %[o], r5    \n\t"            \
2111
        "adds %[l], %[l], r7    \n\t"            \
2112
        "adcs %[h], %[h], r5    \n\t"            \
2113
        "adcs %[o], %[o], r5    \n\t"            \
2114
        /* al * bh */                                    \
2115
        "lsrs r7, %[b], #16   \n\t"            \
2116
        "muls r6, r7, r6    \n\t"            \
2117
        "lsrs r7, r6, #16   \n\t"            \
2118
        "lsls r6, r6, #16   \n\t"            \
2119
        "adds %[l], %[l], r6    \n\t"            \
2120
        "adcs %[h], %[h], r7    \n\t"            \
2121
        "adcs %[o], %[o], r5    \n\t"            \
2122
        "adds %[l], %[l], r6    \n\t"            \
2123
        "adcs %[h], %[h], r7    \n\t"            \
2124
        "adcs %[o], %[o], r5    \n\t"            \
2125
        /* ah * bh */                                    \
2126
        "lsrs r6, %[a], #16   \n\t"            \
2127
        "lsrs r7, %[b], #16   \n\t"            \
2128
        "muls r7, r6, r7    \n\t"            \
2129
        "adds %[h], %[h], r7    \n\t"            \
2130
        "adcs %[o], %[o], r5    \n\t"            \
2131
        "adds %[h], %[h], r7    \n\t"            \
2132
        "adcs %[o], %[o], r5    \n\t"            \
2133
        /* ah * bl */                                    \
2134
        "uxth r7, %[b]    \n\t"            \
2135
        "muls r6, r7, r6    \n\t"            \
2136
        "lsrs r7, r6, #16   \n\t"            \
2137
        "lsls r6, r6, #16   \n\t"            \
2138
        "adds %[l], %[l], r6    \n\t"            \
2139
        "adcs %[h], %[h], r7    \n\t"            \
2140
        "adcs %[o], %[o], r5    \n\t"            \
2141
        "adds %[l], %[l], r6    \n\t"            \
2142
        "adcs %[h], %[h], r7    \n\t"            \
2143
        "adcs %[o], %[o], r5    \n\t"            \
2144
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
2145
        : [a] "l" (va), [b] "l" (vb)                     \
2146
        : "r5", "r6", "r7", "cc"                         \
2147
    )
2148
#else
2149
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 *
 * KEIL variant used when WOLFSSL_SP_SMALL or DEBUG is defined. Avoids r7:
 * the register holding va is saved in r8, then reused as the zero register
 * for carry propagation. It is reloaded from r8 before the high half of va
 * is needed and again at the end of the sequence so the operand register
 * holds va on exit.
 *
 * NOTE(review): %[a] is declared as an input-only operand but is written
 * inside the asm statement; confirm the toolchains in use tolerate this.
 *
 * Clobbers r5, r6, r8 and the condition flags.
 *
 * vl  [in,out] Low word of accumulator.
 * vh  [in,out] Middle word of accumulator.
 * vo  [in,out] High word of accumulator.
 * va  [in]     First multiplicand.
 * vb  [in]     Second multiplicand.
 */
2150
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb)              \
2151
    __asm__ __volatile__ (                               \
2152
        "movs r8, %[a]    \n\t"            \
2153
        /* al * bl */                                    \
2154
        "uxth r6, %[a]    \n\t"            \
2155
        "uxth r5, %[b]    \n\t"            \
2156
        "muls r5, r6, r5    \n\t"            \
2157
        "adds %[l], %[l], r5    \n\t"            \
2158
        "movs %[a], #0    \n\t"            \
2159
        "adcs %[h], %[h], %[a]  \n\t"            \
2160
        "adcs %[o], %[o], %[a]  \n\t"            \
2161
        "adds %[l], %[l], r5    \n\t"            \
2162
        "adcs %[h], %[h], %[a]  \n\t"            \
2163
        "adcs %[o], %[o], %[a]  \n\t"            \
2164
        /* al * bh */                                    \
2165
        "lsrs r5, %[b], #16   \n\t"            \
2166
        "muls r6, r5, r6    \n\t"            \
2167
        "lsrs r5, r6, #16   \n\t"            \
2168
        "lsls r6, r6, #16   \n\t"            \
2169
        "adds %[l], %[l], r6    \n\t"            \
2170
        "adcs %[h], %[h], r5    \n\t"            \
2171
        "adcs %[o], %[o], %[a]  \n\t"            \
2172
        "adds %[l], %[l], r6    \n\t"            \
2173
        "adcs %[h], %[h], r5    \n\t"            \
2174
        "adcs %[o], %[o], %[a]  \n\t"            \
2175
        /* ah * bh */                                    \
2176
        "movs %[a], r8    \n\t"            \
2177
        "lsrs r6, %[a], #16   \n\t"            \
2178
        "lsrs r5, %[b], #16   \n\t"            \
2179
        "muls r5, r6, r5    \n\t"            \
2180
        "adds %[h], %[h], r5    \n\t"            \
2181
        "movs %[a], #0    \n\t"            \
2182
        "adcs %[o], %[o], %[a]  \n\t"            \
2183
        "adds %[h], %[h], r5    \n\t"            \
2184
        "adcs %[o], %[o], %[a]  \n\t"            \
2185
        /* ah * bl */                                    \
2186
        "uxth r5, %[b]    \n\t"            \
2187
        "muls r6, r5, r6    \n\t"            \
2188
        "lsrs r5, r6, #16   \n\t"            \
2189
        "lsls r6, r6, #16   \n\t"            \
2190
        "adds %[l], %[l], r6    \n\t"            \
2191
        "adcs %[h], %[h], r5    \n\t"            \
2192
        "adcs %[o], %[o], %[a]  \n\t"            \
2193
        "adds %[l], %[l], r6    \n\t"            \
2194
        "adcs %[h], %[h], r5    \n\t"            \
2195
        "adcs %[o], %[o], %[a]  \n\t"            \
2196
        "movs %[a], r8    \n\t"            \
2197
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
2198
        : [a] "l" (va), [b] "l" (vb)                     \
2199
        : "r5", "r6", "r8", "cc"                         \
2200
    )
2201
#endif
2202
#ifndef DEBUG
2203
/* Multiply va by vb and add double size result twice into: vo | vh | vl
2204
 * Assumes first add will not overflow vh | vl
2205
 *
 * KEIL variant used when DEBUG is not defined. Each partial product of the
 * 16-bit halves of va and vb is computed once and added twice. In line with
 * the assumption above, no carry is taken into vo for the two al * bl
 * additions or for the first al * bh addition; every later addition
 * propagates its carry into vo. r5 is zeroed once and kept as the zero
 * register.
 *
 * Clobbers r5, r6, r7 and the condition flags.
 *
 * vl  [in,out] Low word of accumulator.
 * vh  [in,out] Middle word of accumulator.
 * vo  [in,out] High word of accumulator.
 * va  [in]     First multiplicand.
 * vb  [in]     Second multiplicand.
 */
2206
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)           \
2207
    __asm__ __volatile__ (                               \
2208
        /* al * bl */                                    \
2209
        "uxth r6, %[a]    \n\t"            \
2210
        "uxth r7, %[b]    \n\t"            \
2211
        "muls r7, r6, r7    \n\t"            \
2212
        "adds %[l], %[l], r7    \n\t"            \
2213
        "movs r5, #0      \n\t"            \
2214
        "adcs %[h], %[h], r5    \n\t"            \
2215
        "adds %[l], %[l], r7    \n\t"            \
2216
        "adcs %[h], %[h], r5    \n\t"            \
2217
        /* al * bh */                                    \
2218
        "lsrs r7, %[b], #16   \n\t"            \
2219
        "muls r6, r7, r6    \n\t"            \
2220
        "lsrs r7, r6, #16   \n\t"            \
2221
        "lsls r6, r6, #16   \n\t"            \
2222
        "adds %[l], %[l], r6    \n\t"            \
2223
        "adcs %[h], %[h], r7    \n\t"            \
2224
        "adds %[l], %[l], r6    \n\t"            \
2225
        "adcs %[h], %[h], r7    \n\t"            \
2226
        "adcs %[o], %[o], r5    \n\t"            \
2227
        /* ah * bh */                                    \
2228
        "lsrs r6, %[a], #16   \n\t"            \
2229
        "lsrs r7, %[b], #16   \n\t"            \
2230
        "muls r7, r6, r7    \n\t"            \
2231
        "adds %[h], %[h], r7    \n\t"            \
2232
        "adcs %[o], %[o], r5    \n\t"            \
2233
        "adds %[h], %[h], r7    \n\t"            \
2234
        "adcs %[o], %[o], r5    \n\t"            \
2235
        /* ah * bl */                                    \
2236
        "uxth r7, %[b]    \n\t"            \
2237
        "muls r6, r7, r6    \n\t"            \
2238
        "lsrs r7, r6, #16   \n\t"            \
2239
        "lsls r6, r6, #16   \n\t"            \
2240
        "adds %[l], %[l], r6    \n\t"            \
2241
        "adcs %[h], %[h], r7    \n\t"            \
2242
        "adcs %[o], %[o], r5    \n\t"            \
2243
        "adds %[l], %[l], r6    \n\t"            \
2244
        "adcs %[h], %[h], r7    \n\t"            \
2245
        "adcs %[o], %[o], r5    \n\t"            \
2246
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
2247
        : [a] "l" (va), [b] "l" (vb)                     \
2248
        : "r5", "r6", "r7", "cc"                         \
2249
    )
2250
#else
2251
/* Multiply va by vb and add double size result twice into: vo | vh | vl
2252
 * Assumes first add will not overflow vh | vl
2253
 *
 * KEIL variant used when DEBUG is defined. Avoids r7: the register holding
 * va is saved in r8 and then reused as the zero register for carry
 * propagation. It is reloaded from r8 before the high half of va is needed
 * and again at the end so the operand register holds va on exit. Here r5
 * holds the half of va and r6 the half of vb.
 *
 * As in the r7-based variant, no carry is taken into vo for the two
 * al * bl additions or for the first al * bh addition.
 *
 * NOTE(review): %[a] is declared as an input-only operand but is written
 * inside the asm statement; confirm the toolchains in use tolerate this.
 *
 * Clobbers r5, r6, r8 and the condition flags.
 *
 * vl  [in,out] Low word of accumulator.
 * vh  [in,out] Middle word of accumulator.
 * vo  [in,out] High word of accumulator.
 * va  [in]     First multiplicand.
 * vb  [in]     Second multiplicand.
 */
2254
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)           \
2255
    __asm__ __volatile__ (                               \
2256
        "movs r8, %[a]    \n\t"            \
2257
        /* al * bl */                                    \
2258
        "uxth r5, %[a]    \n\t"            \
2259
        "uxth r6, %[b]    \n\t"            \
2260
        "muls r6, r5, r6    \n\t"            \
2261
        "adds %[l], %[l], r6    \n\t"            \
2262
        "movs %[a], #0    \n\t"            \
2263
        "adcs %[h], %[h], %[a]  \n\t"            \
2264
        "adds %[l], %[l], r6    \n\t"            \
2265
        "adcs %[h], %[h], %[a]  \n\t"            \
2266
        /* al * bh */                                    \
2267
        "lsrs r6, %[b], #16   \n\t"            \
2268
        "muls r5, r6, r5    \n\t"            \
2269
        "lsrs r6, r5, #16   \n\t"            \
2270
        "lsls r5, r5, #16   \n\t"            \
2271
        "adds %[l], %[l], r5    \n\t"            \
2272
        "adcs %[h], %[h], r6    \n\t"            \
2273
        "adds %[l], %[l], r5    \n\t"            \
2274
        "adcs %[h], %[h], r6    \n\t"            \
2275
        "adcs %[o], %[o], %[a]  \n\t"            \
2276
        /* ah * bh */                                    \
2277
        "movs %[a], r8    \n\t"            \
2278
        "lsrs r5, %[a], #16   \n\t"            \
2279
        "lsrs r6, %[b], #16   \n\t"            \
2280
        "muls r6, r5, r6    \n\t"            \
2281
        "movs %[a], #0    \n\t"            \
2282
        "adds %[h], %[h], r6    \n\t"            \
2283
        "adcs %[o], %[o], %[a]  \n\t"            \
2284
        "adds %[h], %[h], r6    \n\t"            \
2285
        "adcs %[o], %[o], %[a]  \n\t"            \
2286
        /* ah * bl */                                    \
2287
        "uxth r6, %[b]    \n\t"            \
2288
        "muls r5, r6, r5    \n\t"            \
2289
        "lsrs r6, r5, #16   \n\t"            \
2290
        "lsls r5, r5, #16   \n\t"            \
2291
        "adds %[l], %[l], r5    \n\t"            \
2292
        "adcs %[h], %[h], r6    \n\t"            \
2293
        "adcs %[o], %[o], %[a]  \n\t"            \
2294
        "adds %[l], %[l], r5    \n\t"            \
2295
        "adcs %[h], %[h], r6    \n\t"            \
2296
        "adcs %[o], %[o], %[a]  \n\t"            \
2297
        "movs %[a], r8    \n\t"            \
2298
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
2299
        : [a] "l" (va), [b] "l" (vb)                     \
2300
        : "r5", "r6", "r8", "cc"                         \
2301
    )
2302
#endif
2303
/* Square va and store double size result in: vh | vl
 *
 * KEIL variant. va is split into 16-bit halves (al in r6, ah in r5). vl is
 * set to al * al and vh to ah * ah, then the cross term 2 * al * ah is
 * formed by shifting al * ah (right by 15 for the high part, left by 17 for
 * the low part) and added across vl and vh with carry. The previous values
 * of vl and vh are overwritten, although both are declared read-write.
 *
 * Clobbers r5, r6 and the condition flags.
 *
 * vl  [out] Low word of square.
 * vh  [out] High word of square.
 * va  [in]  Word to square.
 */
2304
#define SP_ASM_SQR(vl, vh, va)                           \
2305
    __asm__ __volatile__ (                               \
2306
        "lsrs r5, %[a], #16   \n\t"            \
2307
        "uxth r6, %[a]    \n\t"            \
2308
        "mov  %[l], r6    \n\t"            \
2309
        "mov  %[h], r5    \n\t"            \
2310
        /* al * al */                                    \
2311
        "muls %[l], %[l], %[l]  \n\t"            \
2312
        /* ah * ah */                                    \
2313
        "muls %[h], %[h], %[h]  \n\t"            \
2314
        /* 2 * al * ah */                                \
2315
        "muls r6, r5, r6    \n\t"            \
2316
        "lsrs r5, r6, #15   \n\t"            \
2317
        "lsls r6, r6, #17   \n\t"            \
2318
        "adds %[l], %[l], r6    \n\t"            \
2319
        "adcs %[h], %[h], r5    \n\t"            \
2320
        : [h] "+l" (vh), [l] "+l" (vl)                   \
2321
        : [a] "l" (va)                                   \
2322
        : "r5", "r6", "cc"                               \
2323
    )
2324
/* Square va and add double size result into: vo | vh | vl
 *
 * KEIL variant. va is split into 16-bit halves (al in r6, ah in r4).
 * al * al is added to vl and ah * ah to vh with the carry taken into vo.
 * The halves are then reloaded and the cross term 2 * al * ah (al * ah
 * shifted right by 15 / left by 17) is added across all three words.
 * r5 is the zero register used to propagate carries into vo.
 *
 * Clobbers r4, r5, r6 and the condition flags.
 *
 * vl  [in,out] Low word of accumulator.
 * vh  [in,out] Middle word of accumulator.
 * vo  [in,out] High word of accumulator.
 * va  [in]     Word to square.
 */
2325
#define SP_ASM_SQR_ADD(vl, vh, vo, va)                   \
2326
    __asm__ __volatile__ (                               \
2327
        "lsrs r4, %[a], #16   \n\t"            \
2328
        "uxth r6, %[a]    \n\t"            \
2329
        /* al * al */                                    \
2330
        "muls r6, r6, r6    \n\t"            \
2331
        /* ah * ah */                                    \
2332
        "muls r4, r4, r4    \n\t"            \
2333
        "adds %[l], %[l], r6    \n\t"            \
2334
        "adcs %[h], %[h], r4    \n\t"            \
2335
        "movs r5, #0      \n\t"            \
2336
        "adcs %[o], %[o], r5    \n\t"            \
2337
        "lsrs r4, %[a], #16   \n\t"            \
2338
        "uxth r6, %[a]    \n\t"            \
2339
        /* 2 * al * ah */                                \
2340
        "muls r6, r4, r6    \n\t"            \
2341
        "lsrs r4, r6, #15   \n\t"            \
2342
        "lsls r6, r6, #17   \n\t"            \
2343
        "adds %[l], %[l], r6    \n\t"            \
2344
        "adcs %[h], %[h], r4    \n\t"            \
2345
        "adcs %[o], %[o], r5    \n\t"            \
2346
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
2347
        : [a] "l" (va)                                   \
2348
        : "r4", "r5", "r6", "cc"                         \
2349
    )
2350
/* Square va and add double size result into: vh | vl
 *
 * KEIL variant. va is split into 16-bit halves (al in r6, ah in r5).
 * al * al is added to vl and ah * ah to vh, then the halves are reloaded
 * and the cross term 2 * al * ah (al * ah shifted right by 15 / left by 17)
 * is added across both words. There is no third word: any carry out of vh
 * is not recorded.
 *
 * Clobbers r5, r6 and the condition flags.
 *
 * vl  [in,out] Low word of accumulator.
 * vh  [in,out] High word of accumulator.
 * va  [in]     Word to square.
 */
2351
#define SP_ASM_SQR_ADD_NO(vl, vh, va)                    \
2352
    __asm__ __volatile__ (                               \
2353
        "lsrs r5, %[a], #16   \n\t"            \
2354
        "uxth r6, %[a]    \n\t"            \
2355
        /* al * al */                                    \
2356
        "muls r6, r6, r6    \n\t"            \
2357
        /* ah * ah */                                    \
2358
        "muls r5, r5, r5    \n\t"            \
2359
        "adds %[l], %[l], r6    \n\t"            \
2360
        "adcs %[h], %[h], r5    \n\t"            \
2361
        "lsrs r5, %[a], #16   \n\t"            \
2362
        "uxth r6, %[a]    \n\t"            \
2363
        /* 2 * al * ah */                                \
2364
        "muls r6, r5, r6    \n\t"            \
2365
        "lsrs r5, r6, #15   \n\t"            \
2366
        "lsls r6, r6, #17   \n\t"            \
2367
        "adds %[l], %[l], r6    \n\t"            \
2368
        "adcs %[h], %[h], r5    \n\t"            \
2369
        : [l] "+l" (vl), [h] "+l" (vh)                   \
2370
        : [a] "l" (va)                                   \
2371
        : "r5", "r6", "cc"                               \
2372
    )
2373
/* Add va into: vh | vl
 *
 * KEIL variant (three-operand instruction forms). Adds va to vl, then adds
 * only the resulting carry to vh by adding a zeroed r5 with carry.
 * Clobbers r5 and the condition flags.
 *
 * vl  [in,out] Low word of accumulator.
 * vh  [in,out] High word of accumulator.
 * va  [in]     Word to add.
 */
2374
#define SP_ASM_ADDC(vl, vh, va)                          \
2375
    __asm__ __volatile__ (                               \
2376
        "adds %[l], %[l], %[a]  \n\t"            \
2377
        "movs r5, #0      \n\t"            \
2378
        "adcs %[h], %[h], r5    \n\t"            \
2379
        : [l] "+l" (vl), [h] "+l" (vh)                   \
2380
        : [a] "l" (va)                                   \
2381
        : "r5", "cc"                                     \
2382
    )
2383
/* Sub va from: vh | vl
 *
 * KEIL variant (three-operand instruction forms). Subtracts va from vl,
 * then subtracts only the resulting borrow from vh by subtracting a zeroed
 * r5 with carry. Clobbers r5 and the condition flags.
 *
 * vl  [in,out] Low word of accumulator.
 * vh  [in,out] High word of accumulator.
 * va  [in]     Word to subtract.
 */
2384
#define SP_ASM_SUBB(vl, vh, va)                          \
2385
    __asm__ __volatile__ (                               \
2386
        "subs %[l], %[l], %[a]  \n\t"            \
2387
        "movs r5, #0      \n\t"            \
2388
        "sbcs %[h], %[h], r5    \n\t"            \
2389
        : [l] "+l" (vl), [h] "+l" (vh)                   \
2390
        : [a] "l" (va)                                   \
2391
        : "r5", "cc"                                     \
2392
    )
2393
/* Add two times vc | vb | va into vo | vh | vl
 *
 * KEIL variant (three-operand instruction forms). The three-word value is
 * added word by word with carry propagation (vl, then vh, then vo), and the
 * same sequence is then repeated a second time. No scratch registers are
 * used; only the condition flags are clobbered.
 *
 * vl  [in,out] Low word of accumulator.
 * vh  [in,out] Middle word of accumulator.
 * vo  [in,out] High word of accumulator.
 * va  [in]     Low word of value to add twice.
 * vb  [in]     Middle word of value to add twice.
 * vc  [in]     High word of value to add twice.
 */
2394
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc)         \
2395
    __asm__ __volatile__ (                               \
2396
        "adds %[l], %[l], %[a]  \n\t"            \
2397
        "adcs %[h], %[h], %[b]  \n\t"            \
2398
        "adcs %[o], %[o], %[c]  \n\t"            \
2399
        "adds %[l], %[l], %[a]  \n\t"            \
2400
        "adcs %[h], %[h], %[b]  \n\t"            \
2401
        "adcs %[o], %[o], %[c]  \n\t"            \
2402
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
2403
        : [a] "l" (va), [b] "l" (vb), [c] "l" (vc)       \
2404
        : "cc"                                           \
2405
    )
2406
2407
#elif defined(__GNUC__)
2408
2409
/* Multiply va by vb and store double size result in: vh | vl
 *
 * __GNUC__ variant (two-operand mul/adc mnemonics without the 's' suffix).
 * The product is assembled from the four partial products of the 16-bit
 * halves of va and vb (al * bl, al * bh, ah * bh, ah * bl). Each cross
 * product is split with 16-bit shifts and added across vl and vh with
 * carry. The previous values of vl and vh are overwritten, although both
 * are declared read-write.
 *
 * Clobbers r4, r5, r6 and the condition flags.
 *
 * vl  [out] Low word of product.
 * vh  [out] High word of product.
 * va  [in]  First multiplicand.
 * vb  [in]  Second multiplicand.
 */
2410
#define SP_ASM_MUL(vl, vh, va, vb)                       \
2411
    __asm__ __volatile__ (                               \
2412
        /* al * bl */                                    \
2413
        "uxth r6, %[a]    \n\t"            \
2414
        "uxth %[l], %[b]    \n\t"            \
2415
        "mul  %[l], r6    \n\t"            \
2416
        /* al * bh */                                    \
2417
        "lsr  r4, %[b], #16   \n\t"            \
2418
        "mul  r6, r4      \n\t"            \
2419
        "lsr  %[h], r6, #16   \n\t"            \
2420
        "lsl  r6, r6, #16   \n\t"            \
2421
        "add  %[l], %[l], r6    \n\t"            \
2422
        "mov  r5, #0      \n\t"            \
2423
        "adc  %[h], r5    \n\t"            \
2424
        /* ah * bh */                                    \
2425
        "lsr  r6, %[a], #16   \n\t"            \
2426
        "mul  r4, r6      \n\t"            \
2427
        "add  %[h], %[h], r4    \n\t"            \
2428
        /* ah * bl */                                    \
2429
        "uxth r4, %[b]    \n\t"            \
2430
        "mul  r6, r4      \n\t"            \
2431
        "lsr  r4, r6, #16   \n\t"            \
2432
        "lsl  r6, r6, #16   \n\t"            \
2433
        "add  %[l], %[l], r6    \n\t"            \
2434
        "adc  %[h], r4    \n\t"            \
2435
        : [h] "+l" (vh), [l] "+l" (vl)                   \
2436
        : [a] "l" (va), [b] "l" (vb)                     \
2437
        : "r4", "r5", "r6", "cc"                         \
2438
    )
2439
/* Multiply va by vb and store double size result in: vo | vh | vl
 *
 * __GNUC__ variant. Same four partial-product scheme as SP_ASM_MUL, but vo
 * is set to zero and doubles as the zero register for the first carry
 * propagation into vh. The previous values of vl, vh and vo are
 * overwritten.
 *
 * Clobbers r5, r6 and the condition flags.
 *
 * vl  [out] Low word of product.
 * vh  [out] High word of product.
 * vo  [out] Set to zero.
 * va  [in]  First multiplicand.
 * vb  [in]  Second multiplicand.
 */
2440
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb)               \
2441
    __asm__ __volatile__ (                               \
2442
        /* al * bl */                                    \
2443
        "uxth r6, %[a]    \n\t"            \
2444
        "uxth %[l], %[b]    \n\t"            \
2445
        "mul  %[l], r6    \n\t"            \
2446
        /* al * bh */                                    \
2447
        "lsr  r5, %[b], #16   \n\t"            \
2448
        "mul  r6, r5      \n\t"            \
2449
        "lsr  %[h], r6, #16   \n\t"            \
2450
        "lsl  r6, r6, #16   \n\t"            \
2451
        "add  %[l], %[l], r6    \n\t"            \
2452
        "mov  %[o], #0    \n\t"            \
2453
        "adc  %[h], %[o]    \n\t"            \
2454
        /* ah * bh */                                    \
2455
        "lsr  r6, %[a], #16   \n\t"            \
2456
        "mul  r5, r6      \n\t"            \
2457
        "add  %[h], %[h], r5    \n\t"            \
2458
        /* ah * bl */                                    \
2459
        "uxth r5, %[b]    \n\t"            \
2460
        "mul  r6, r5      \n\t"            \
2461
        "lsr  r5, r6, #16   \n\t"            \
2462
        "lsl  r6, r6, #16   \n\t"            \
2463
        "add  %[l], %[l], r6    \n\t"            \
2464
        "adc  %[h], r5    \n\t"            \
2465
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
2466
        : [a] "l" (va), [b] "l" (vb)                     \
2467
        : "r5", "r6", "cc"                               \
2468
    )
2469
#if !defined(WOLFSSL_SP_SMALL) && !defined(DEBUG)
2470
/* Multiply va by vb and add double size result into: vo | vh | vl
 *
 * __GNUC__ variant used when neither WOLFSSL_SP_SMALL nor DEBUG is defined.
 * Each of the four partial products of the 16-bit halves of va and vb is
 * added into the accumulator as soon as it is computed, with the carry
 * propagated up to vo. r5 is zeroed once and kept as the zero register;
 * r6 and r7 hold the halves and partial products.
 *
 * Clobbers r5, r6, r7 and the condition flags.
 *
 * vl  [in,out] Low word of accumulator.
 * vh  [in,out] Middle word of accumulator.
 * vo  [in,out] High word of accumulator.
 * va  [in]     First multiplicand.
 * vb  [in]     Second multiplicand.
 */
2471
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb)               \
2472
    __asm__ __volatile__ (                               \
2473
        /* al * bl */                                    \
2474
        "uxth r6, %[a]    \n\t"            \
2475
        "uxth r7, %[b]    \n\t"            \
2476
        "mul  r7, r6      \n\t"            \
2477
        "add  %[l], %[l], r7    \n\t"            \
2478
        "mov  r5, #0      \n\t"            \
2479
        "adc  %[h], r5    \n\t"            \
2480
        "adc  %[o], r5    \n\t"            \
2481
        /* al * bh */                                    \
2482
        "lsr  r7, %[b], #16   \n\t"            \
2483
        "mul  r6, r7      \n\t"            \
2484
        "lsr  r7, r6, #16   \n\t"            \
2485
        "lsl  r6, r6, #16   \n\t"            \
2486
        "add  %[l], %[l], r6    \n\t"            \
2487
        "adc  %[h], r7    \n\t"            \
2488
        "adc  %[o], r5    \n\t"            \
2489
        /* ah * bh */                                    \
2490
        "lsr  r6, %[a], #16   \n\t"            \
2491
        "lsr  r7, %[b], #16   \n\t"            \
2492
        "mul  r7, r6      \n\t"            \
2493
        "add  %[h], %[h], r7    \n\t"            \
2494
        "adc  %[o], r5    \n\t"            \
2495
        /* ah * bl */                                    \
2496
        "uxth r7, %[b]    \n\t"            \
2497
        "mul  r6, r7      \n\t"            \
2498
        "lsr  r7, r6, #16   \n\t"            \
2499
        "lsl  r6, r6, #16   \n\t"            \
2500
        "add  %[l], %[l], r6    \n\t"            \
2501
        "adc  %[h], r7    \n\t"            \
2502
        "adc  %[o], r5    \n\t"            \
2503
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
2504
        : [a] "l" (va), [b] "l" (vb)                     \
2505
        : "r5", "r6", "r7", "cc"                         \
2506
    )
2507
#else
2508
/* Multiply va by vb and add double size result into: vo | vh | vl
 *
 * Variant compiled when WOLFSSL_SP_SMALL or DEBUG is defined. It performs
 * the same four half-word partial products (al * bl, al * bh, ah * bh,
 * ah * bl) as the other variant but only uses r5 and r6 as scratch.
 * NOTE(review): presumably r7 is avoided because it is not available as a
 * scratch register in such builds - confirm.
 *
 * Because r5 also holds operands and the high part of the cross products,
 * it is reloaded with 0 before every "adc" into vo. Each of those "mov"
 * instructions sits between an add/adc and the "adc" that consumes its
 * carry, so the sequence depends on "mov" of an immediate leaving the carry
 * flag intact.
 *
 * @param  [in,out]  vl  Low word of the accumulator.
 * @param  [in,out]  vh  Middle word of the accumulator.
 * @param  [in,out]  vo  High word of the accumulator.
 * @param  [in]      va  First word to multiply.
 * @param  [in]      vb  Second word to multiply.
 */
2509
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb)               \
2510
    __asm__ __volatile__ (                               \
2511
        /* al * bl */                                    \
2512
        "uxth   r6, %[a]                \n\t"            \
2513
        "uxth   r5, %[b]                \n\t"            \
2514
        "mul    r5, r6                  \n\t"            \
2515
        "add    %[l], %[l], r5          \n\t"            \
2516
        "mov    r5, #0                  \n\t"            \
2517
        "adc    %[h], r5                \n\t"            \
2518
        "adc    %[o], r5                \n\t"            \
2519
        /* al * bh */                                    \
2520
        "lsr    r5, %[b], #16           \n\t"            \
2521
        "mul    r6, r5                  \n\t"            \
2522
        "lsr    r5, r6, #16             \n\t"            \
2523
        "lsl    r6, r6, #16             \n\t"            \
2524
        "add    %[l], %[l], r6          \n\t"            \
2525
        "adc    %[h], r5                \n\t"            \
2526
        "mov    r5, #0                  \n\t"            \
2527
        "adc    %[o], r5                \n\t"            \
2528
        /* ah * bh */                                    \
2529
        "lsr    r6, %[a], #16           \n\t"            \
2530
        "lsr    r5, %[b], #16           \n\t"            \
2531
        "mul    r5, r6                  \n\t"            \
2532
        "add    %[h], %[h], r5          \n\t"            \
2533
        "mov    r5, #0                  \n\t"            \
2534
        "adc    %[o], r5                \n\t"            \
2535
        /* ah * bl */                                    \
2536
        "uxth   r5, %[b]                \n\t"            \
2537
        "mul    r6, r5                  \n\t"            \
2538
        "lsr    r5, r6, #16             \n\t"            \
2539
        "lsl    r6, r6, #16             \n\t"            \
2540
        "add    %[l], %[l], r6          \n\t"            \
2541
        "adc    %[h], r5                \n\t"            \
2542
        "mov    r5, #0                  \n\t"            \
2543
        "adc    %[o], r5                \n\t"            \
2544
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
2545
        : [a] "l" (va), [b] "l" (vb)                     \
2546
        : "r5", "r6", "cc"                               \
2547
    )
2548
#endif
2549
/* Multiply va by vb and add double size result into: vh | vl
 *
 * Two-word accumulator form: there is no third word, so any carry out of vh
 * is discarded. Callers therefore need vh | vl plus the product to fit in
 * two words.
 *
 * The product is formed from four half-word partial products (al * bl,
 * al * bh, ah * bh, ah * bl); the low parts are added into vl and the high
 * parts, with carry, into vh.
 *
 * Scratch registers: r4 and r6 hold operands and partial products, r5 is
 * loaded with 0 and used once as the zero addend for the first carry. All
 * three are listed as clobbered together with the condition flags.
 *
 * @param  [in,out]  vl  Low word of the accumulator.
 * @param  [in,out]  vh  High word of the accumulator.
 * @param  [in]      va  First word to multiply.
 * @param  [in]      vb  Second word to multiply.
 */
2550
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb)                \
2551
    __asm__ __volatile__ (                               \
2552
        /* al * bl */                                    \
2553
        "uxth r6, %[a]    \n\t"            \
2554
        "uxth r4, %[b]    \n\t"            \
2555
        "mul  r4, r6      \n\t"            \
2556
        "add  %[l], %[l], r4    \n\t"            \
2557
        "mov  r5, #0      \n\t"            \
2558
        "adc  %[h], r5    \n\t"            \
2559
        /* al * bh */                                    \
2560
        "lsr  r4, %[b], #16   \n\t"            \
2561
        "mul  r6, r4      \n\t"            \
2562
        "lsr  r4, r6, #16   \n\t"            \
2563
        "lsl  r6, r6, #16   \n\t"            \
2564
        "add  %[l], %[l], r6    \n\t"            \
2565
        "adc  %[h], r4    \n\t"            \
2566
        /* ah * bh */                                    \
2567
        "lsr  r6, %[a], #16   \n\t"            \
2568
        "lsr  r4, %[b], #16   \n\t"            \
2569
        "mul  r4, r6      \n\t"            \
2570
        "add  %[h], %[h], r4    \n\t"            \
2571
        /* ah * bl */                                    \
2572
        "uxth r4, %[b]    \n\t"            \
2573
        "mul  r6, r4      \n\t"            \
2574
        "lsr  r4, r6, #16   \n\t"            \
2575
        "lsl  r6, r6, #16   \n\t"            \
2576
        "add  %[l], %[l], r6    \n\t"            \
2577
        "adc  %[h], r4    \n\t"            \
2578
        : [l] "+l" (vl), [h] "+l" (vh)                   \
2579
        : [a] "l" (va), [b] "l" (vb)                     \
2580
        : "r4", "r5", "r6", "cc"                         \
2581
    )
2582
#if !defined(WOLFSSL_SP_SMALL) && !defined(DEBUG)
2583
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 *
 * Variant compiled when neither WOLFSSL_SP_SMALL nor DEBUG is defined.
 *
 * The product is never doubled in a register. Instead each of the four
 * half-word partial products (al * bl, al * bh, ah * bh, ah * bl) is
 * computed once and its add/adc sequence is issued twice, with the carry
 * propagated through vh into vo both times.
 *
 * Scratch registers: r5 is loaded with 0 after the first "add" and is then
 * only used as the zero addend for carry propagation; r6 and r7 hold the
 * half-word operands and the partial products. All three are listed as
 * clobbered together with the condition flags.
 *
 * @param  [in,out]  vl  Low word of the accumulator.
 * @param  [in,out]  vh  Middle word of the accumulator.
 * @param  [in,out]  vo  High word of the accumulator.
 * @param  [in]      va  First word to multiply.
 * @param  [in]      vb  Second word to multiply.
 */
2584
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb)              \
2585
    __asm__ __volatile__ (                               \
2586
        /* al * bl */                                    \
2587
        "uxth r6, %[a]    \n\t"            \
2588
        "uxth r7, %[b]    \n\t"            \
2589
        "mul  r7, r6      \n\t"            \
2590
        "add  %[l], %[l], r7    \n\t"            \
2591
        "mov  r5, #0      \n\t"            \
2592
        "adc  %[h], r5    \n\t"            \
2593
        "adc  %[o], r5    \n\t"            \
2594
        "add  %[l], %[l], r7    \n\t"            \
2595
        "adc  %[h], r5    \n\t"            \
2596
        "adc  %[o], r5    \n\t"            \
2597
        /* al * bh */                                    \
2598
        "lsr  r7, %[b], #16   \n\t"            \
2599
        "mul  r6, r7      \n\t"            \
2600
        "lsr  r7, r6, #16   \n\t"            \
2601
        "lsl  r6, r6, #16   \n\t"            \
2602
        "add  %[l], %[l], r6    \n\t"            \
2603
        "adc  %[h], r7    \n\t"            \
2604
        "adc  %[o], r5    \n\t"            \
2605
        "add  %[l], %[l], r6    \n\t"            \
2606
        "adc  %[h], r7    \n\t"            \
2607
        "adc  %[o], r5    \n\t"            \
2608
        /* ah * bh */                                    \
2609
        "lsr  r6, %[a], #16   \n\t"            \
2610
        "lsr  r7, %[b], #16   \n\t"            \
2611
        "mul  r7, r6      \n\t"            \
2612
        "add  %[h], %[h], r7    \n\t"            \
2613
        "adc  %[o], r5    \n\t"            \
2614
        "add  %[h], %[h], r7    \n\t"            \
2615
        "adc  %[o], r5    \n\t"            \
2616
        /* ah * bl */                                    \
2617
        "uxth r7, %[b]    \n\t"            \
2618
        "mul  r6, r7      \n\t"            \
2619
        "lsr  r7, r6, #16   \n\t"            \
2620
        "lsl  r6, r6, #16   \n\t"            \
2621
        "add  %[l], %[l], r6    \n\t"            \
2622
        "adc  %[h], r7    \n\t"            \
2623
        "adc  %[o], r5    \n\t"            \
2624
        "add  %[l], %[l], r6    \n\t"            \
2625
        "adc  %[h], r7    \n\t"            \
2626
        "adc  %[o], r5    \n\t"            \
2627
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
2628
        : [a] "l" (va), [b] "l" (vb)                     \
2629
        : "r5", "r6", "r7", "cc"                         \
2630
    )
2631
#else
2632
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 *
 * Variant compiled when WOLFSSL_SP_SMALL or DEBUG is defined. Each of the
 * four half-word partial products (al * bl, al * bh, ah * bh, ah * bl) is
 * computed once and added twice, as in the other variant, but r7 is not
 * used.
 *
 * To get by with r5 and r6 as the only low scratch registers, the register
 * holding va doubles as the zero addend for carry propagation: va is saved
 * in r8 on entry, %[a] is set to 0 after al has been extracted, reloaded
 * from r8 to extract ah, zeroed again, and finally restored from r8 before
 * the statement ends.
 *
 * NOTE(review): %[a] is declared as an input-only operand but is written
 * inside the statement. It holds its original value again on exit, but if
 * the compiler ever assigned %[a] and %[b] to the same register (e.g. when
 * va and vb are the same value) zeroing %[a] would also destroy vb -
 * confirm this cannot happen at the call sites.
 *
 * r5, r6, r8 and the condition flags are listed as clobbered.
 *
 * @param  [in,out]  vl  Low word of the accumulator.
 * @param  [in,out]  vh  Middle word of the accumulator.
 * @param  [in,out]  vo  High word of the accumulator.
 * @param  [in]      va  First word to multiply.
 * @param  [in]      vb  Second word to multiply.
 */
2633
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb)              \
2634
    __asm__ __volatile__ (                               \
2635
        "mov    r8, %[a]                \n\t"            \
2636
        /* al * bl */                                    \
2637
        "uxth   r6, %[a]                \n\t"            \
2638
        "uxth   r5, %[b]                \n\t"            \
2639
        "mul    r5, r6                  \n\t"            \
2640
        "add    %[l], %[l], r5          \n\t"            \
2641
        "mov    %[a], #0                \n\t"            \
2642
        "adc    %[h], %[a]              \n\t"            \
2643
        "adc    %[o], %[a]              \n\t"            \
2644
        "add    %[l], %[l], r5          \n\t"            \
2645
        "adc    %[h], %[a]              \n\t"            \
2646
        "adc    %[o], %[a]              \n\t"            \
2647
        /* al * bh */                                    \
2648
        "lsr    r5, %[b], #16           \n\t"            \
2649
        "mul    r6, r5                  \n\t"            \
2650
        "lsr    r5, r6, #16             \n\t"            \
2651
        "lsl    r6, r6, #16             \n\t"            \
2652
        "add    %[l], %[l], r6          \n\t"            \
2653
        "adc    %[h], r5                \n\t"            \
2654
        "adc    %[o], %[a]              \n\t"            \
2655
        "add    %[l], %[l], r6          \n\t"            \
2656
        "adc    %[h], r5                \n\t"            \
2657
        "adc    %[o], %[a]              \n\t"            \
2658
        /* ah * bh */                                    \
2659
        "mov    %[a], r8                \n\t"            \
2660
        "lsr    r6, %[a], #16           \n\t"            \
2661
        "lsr    r5, %[b], #16           \n\t"            \
2662
        "mul    r5, r6                  \n\t"            \
2663
        "add    %[h], %[h], r5          \n\t"            \
2664
        "mov    %[a], #0                \n\t"            \
2665
        "adc    %[o], %[a]              \n\t"            \
2666
        "add    %[h], %[h], r5          \n\t"            \
2667
        "adc    %[o], %[a]              \n\t"            \
2668
        /* ah * bl */                                    \
2669
        "uxth   r5, %[b]                \n\t"            \
2670
        "mul    r6, r5                  \n\t"            \
2671
        "lsr    r5, r6, #16             \n\t"            \
2672
        "lsl    r6, r6, #16             \n\t"            \
2673
        "add    %[l], %[l], r6          \n\t"            \
2674
        "adc    %[h], r5                \n\t"            \
2675
        "adc    %[o], %[a]              \n\t"            \
2676
        "add    %[l], %[l], r6          \n\t"            \
2677
        "adc    %[h], r5                \n\t"            \
2678
        "adc    %[o], %[a]              \n\t"            \
2679
        "mov    %[a], r8                \n\t"            \
2680
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
2681
        : [a] "l" (va), [b] "l" (vb)                     \
2682
        : "r5", "r6", "r8", "cc"                         \
2683
    )
2684
#endif
2685
#ifndef DEBUG
2686
/* Multiply va by vb and add double size result twice into: vo | vh | vl
2687
 * Assumes first add will not overflow vh | vl
 *
 * Variant compiled when DEBUG is not defined.
 *
 * Each of the four half-word partial products (al * bl, al * bh, ah * bh,
 * ah * bl) is computed once and added twice. Relying on the assumption
 * above, some carry propagation into vo is left out: neither add of
 * al * bl carries into vo, and for al * bh only the second add does. Every
 * add of ah * bh and ah * bl propagates its carry into vo.
 *
 * Scratch registers: r5 is loaded with 0 after the first "add" and is then
 * only used as the zero addend for carry propagation; r6 and r7 hold the
 * half-word operands and the partial products. All three are listed as
 * clobbered together with the condition flags.
 *
 * @param  [in,out]  vl  Low word of the accumulator.
 * @param  [in,out]  vh  Middle word of the accumulator.
 * @param  [in,out]  vo  High word of the accumulator.
 * @param  [in]      va  First word to multiply.
 * @param  [in]      vb  Second word to multiply.
2688
 */
2689
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)           \
2690
    __asm__ __volatile__ (                               \
2691
        /* al * bl */                                    \
2692
        "uxth r6, %[a]    \n\t"            \
2693
        "uxth r7, %[b]    \n\t"            \
2694
        "mul  r7, r6      \n\t"            \
2695
        "add  %[l], %[l], r7    \n\t"            \
2696
        "mov  r5, #0      \n\t"            \
2697
        "adc  %[h], r5    \n\t"            \
2698
        "add  %[l], %[l], r7    \n\t"            \
2699
        "adc  %[h], r5    \n\t"            \
2700
        /* al * bh */                                    \
2701
        "lsr  r7, %[b], #16   \n\t"            \
2702
        "mul  r6, r7      \n\t"            \
2703
        "lsr  r7, r6, #16   \n\t"            \
2704
        "lsl  r6, r6, #16   \n\t"            \
2705
        "add  %[l], %[l], r6    \n\t"            \
2706
        "adc  %[h], r7    \n\t"            \
2707
        "add  %[l], %[l], r6    \n\t"            \
2708
        "adc  %[h], r7    \n\t"            \
2709
        "adc  %[o], r5    \n\t"            \
2710
        /* ah * bh */                                    \
2711
        "lsr  r6, %[a], #16   \n\t"            \
2712
        "lsr  r7, %[b], #16   \n\t"            \
2713
        "mul  r7, r6      \n\t"            \
2714
        "add  %[h], %[h], r7    \n\t"            \
2715
        "adc  %[o], r5    \n\t"            \
2716
        "add  %[h], %[h], r7    \n\t"            \
2717
        "adc  %[o], r5    \n\t"            \
2718
        /* ah * bl */                                    \
2719
        "uxth r7, %[b]    \n\t"            \
2720
        "mul  r6, r7      \n\t"            \
2721
        "lsr  r7, r6, #16   \n\t"            \
2722
        "lsl  r6, r6, #16   \n\t"            \
2723
        "add  %[l], %[l], r6    \n\t"            \
2724
        "adc  %[h], r7    \n\t"            \
2725
        "adc  %[o], r5    \n\t"            \
2726
        "add  %[l], %[l], r6    \n\t"            \
2727
        "adc  %[h], r7    \n\t"            \
2728
        "adc  %[o], r5    \n\t"            \
2729
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
2730
        : [a] "l" (va), [b] "l" (vb)                     \
2731
        : "r5", "r6", "r7", "cc"                         \
2732
    )
2733
#else
2734
/* Multiply va by vb and add double size result twice into: vo | vh | vl
2735
 * Assumes first add will not overflow vh | vl
 *
 * Variant compiled when DEBUG is defined. It performs the same additions as
 * the non-DEBUG variant (no carry into vo for al * bl, and only after the
 * second add for al * bh) but does not use r7.
 *
 * To get by with r5 and r6 as the only low scratch registers, the register
 * holding va doubles as the zero addend for carry propagation: va is saved
 * in r8 on entry, %[a] is set to 0 after al has been extracted, reloaded
 * from r8 to extract ah, zeroed again, and finally restored from r8 before
 * the statement ends.
 *
 * NOTE(review): %[a] is declared as an input-only operand but is written
 * inside the statement. It holds its original value again on exit, but if
 * the compiler ever assigned %[a] and %[b] to the same register (e.g. when
 * va and vb are the same value) zeroing %[a] would also destroy vb -
 * confirm this cannot happen at the call sites.
 *
 * r5, r6, r8 and the condition flags are listed as clobbered.
 *
 * @param  [in,out]  vl  Low word of the accumulator.
 * @param  [in,out]  vh  Middle word of the accumulator.
 * @param  [in,out]  vo  High word of the accumulator.
 * @param  [in]      va  First word to multiply.
 * @param  [in]      vb  Second word to multiply.
2736
 */
2737
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)           \
2738
    __asm__ __volatile__ (                               \
2739
        "mov  r8, %[a]    \n\t"            \
2740
        /* al * bl */                                    \
2741
        "uxth r5, %[a]    \n\t"            \
2742
        "uxth r6, %[b]    \n\t"            \
2743
        "mul  r6, r5      \n\t"            \
2744
        "add  %[l], %[l], r6    \n\t"            \
2745
        "mov  %[a], #0    \n\t"            \
2746
        "adc  %[h], %[a]    \n\t"            \
2747
        "add  %[l], %[l], r6    \n\t"            \
2748
        "adc  %[h], %[a]    \n\t"            \
2749
        /* al * bh */                                    \
2750
        "lsr  r6, %[b], #16   \n\t"            \
2751
        "mul  r5, r6      \n\t"            \
2752
        "lsr  r6, r5, #16   \n\t"            \
2753
        "lsl  r5, r5, #16   \n\t"            \
2754
        "add  %[l], %[l], r5    \n\t"            \
2755
        "adc  %[h], r6    \n\t"            \
2756
        "add  %[l], %[l], r5    \n\t"            \
2757
        "adc  %[h], r6    \n\t"            \
2758
        "adc  %[o], %[a]    \n\t"            \
2759
        /* ah * bh */                                    \
2760
        "mov    %[a], r8                \n\t"            \
2761
        "lsr  r5, %[a], #16   \n\t"            \
2762
        "lsr  r6, %[b], #16   \n\t"            \
2763
        "mul  r6, r5      \n\t"            \
2764
        "mov    %[a], #0                \n\t"            \
2765
        "add  %[h], %[h], r6    \n\t"            \
2766
        "adc  %[o], %[a]    \n\t"            \
2767
        "add  %[h], %[h], r6    \n\t"            \
2768
        "adc  %[o], %[a]    \n\t"            \
2769
        /* ah * bl */                                    \
2770
        "uxth r6, %[b]    \n\t"            \
2771
        "mul  r5, r6      \n\t"            \
2772
        "lsr  r6, r5, #16   \n\t"            \
2773
        "lsl  r5, r5, #16   \n\t"            \
2774
        "add  %[l], %[l], r5    \n\t"            \
2775
        "adc  %[h], r6    \n\t"            \
2776
        "adc  %[o], %[a]    \n\t"            \
2777
        "add  %[l], %[l], r5    \n\t"            \
2778
        "adc  %[h], r6    \n\t"            \
2779
        "adc  %[o], %[a]    \n\t"            \
2780
        "mov    %[a], r8                \n\t"            \
2781
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
2782
        : [a] "l" (va), [b] "l" (vb)                     \
2783
        : "r5", "r6", "r8", "cc"                         \
2784
    )
2785
#endif
2786
/* Square va and store double size result in: vh | vl
 *
 * va is split into its half-words: al is copied into vl and ah into vh,
 * and each is squared in place, giving al * al in vl and ah * ah in vh.
 * The cross product al * ah is computed once and doubled while being
 * aligned: it is shifted left by 17 for the part added into vl and right by
 * 15 for the part added, with carry, into vh.
 *
 * vl and vh are overwritten without being read, although both are declared
 * read-write ("+l"). r5 and r6 are scratch registers and are listed as
 * clobbered together with the condition flags.
 *
 * @param  [out]  vl  Low word of the result.
 * @param  [out]  vh  High word of the result.
 * @param  [in]   va  Word to square.
 */
2787
#define SP_ASM_SQR(vl, vh, va)                           \
2788
    __asm__ __volatile__ (                               \
2789
        "lsr  r5, %[a], #16   \n\t"            \
2790
        "uxth r6, %[a]    \n\t"            \
2791
        "mov  %[l], r6    \n\t"            \
2792
        "mov  %[h], r5    \n\t"            \
2793
        /* al * al */                                    \
2794
        "mul  %[l], %[l]    \n\t"            \
2795
        /* ah * ah */                                    \
2796
        "mul  %[h], %[h]    \n\t"            \
2797
        /* 2 * al * ah */                                \
2798
        "mul  r6, r5      \n\t"            \
2799
        "lsr  r5, r6, #15   \n\t"            \
2800
        "lsl  r6, r6, #17   \n\t"            \
2801
        "add  %[l], %[l], r6    \n\t"            \
2802
        "adc  %[h], r5    \n\t"            \
2803
        : [h] "+l" (vh), [l] "+l" (vl)                   \
2804
        : [a] "l" (va)                                   \
2805
        : "r5", "r6", "cc"                               \
2806
    )
2807
/* Square va and add double size result into: vo | vh | vl
 *
 * First al * al is added into vl and ah * ah, with carry, into vh; the
 * carry out of vh goes into vo. The half-words of va are then extracted
 * again (the squaring overwrote them) and the cross product al * ah is
 * added once in doubled form: shifted left by 17 into vl and right by 15,
 * with carry, into vh, again carrying into vo.
 *
 * Scratch registers: r4 and r6 hold the half-words and products, r5 is
 * loaded with 0 and used as the zero addend when carrying into vo. The
 * "mov" of #0 sits between an "adc" and the "adc" that consumes its carry,
 * so the sequence depends on that "mov" leaving the carry flag intact.
 * r4, r5, r6 and the condition flags are listed as clobbered.
 *
 * @param  [in,out]  vl  Low word of the accumulator.
 * @param  [in,out]  vh  Middle word of the accumulator.
 * @param  [in,out]  vo  High word of the accumulator.
 * @param  [in]      va  Word to square.
 */
2808
#define SP_ASM_SQR_ADD(vl, vh, vo, va)                   \
2809
    __asm__ __volatile__ (                               \
2810
        "lsr  r4, %[a], #16   \n\t"            \
2811
        "uxth r6, %[a]    \n\t"            \
2812
        /* al * al */                                    \
2813
        "mul  r6, r6      \n\t"            \
2814
        /* ah * ah */                                    \
2815
        "mul  r4, r4      \n\t"            \
2816
        "add  %[l], %[l], r6    \n\t"            \
2817
        "adc  %[h], r4    \n\t"            \
2818
        "mov  r5, #0      \n\t"            \
2819
        "adc  %[o], r5    \n\t"            \
2820
        "lsr  r4, %[a], #16   \n\t"            \
2821
        "uxth r6, %[a]    \n\t"            \
2822
        /* 2 * al * ah */                                \
2823
        "mul  r6, r4      \n\t"            \
2824
        "lsr  r4, r6, #15   \n\t"            \
2825
        "lsl  r6, r6, #17   \n\t"            \
2826
        "add  %[l], %[l], r6    \n\t"            \
2827
        "adc  %[h], r4    \n\t"            \
2828
        "adc  %[o], r5    \n\t"            \
2829
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
2830
        : [a] "l" (va)                                   \
2831
        : "r4", "r5", "r6", "cc"                         \
2832
    )
2833
/* Square va and add double size result into: vh | vl
 *
 * Two-word accumulator form: there is no third word, so any carry out of vh
 * is discarded.
 *
 * First al * al is added into vl and ah * ah, with carry, into vh. The
 * half-words of va are then extracted again (the squaring overwrote them)
 * and the cross product al * ah is added once in doubled form: shifted left
 * by 17 into vl and right by 15, with carry, into vh.
 *
 * r5 and r6 are scratch registers and are listed as clobbered together with
 * the condition flags.
 *
 * @param  [in,out]  vl  Low word of the accumulator.
 * @param  [in,out]  vh  High word of the accumulator.
 * @param  [in]      va  Word to square.
 */
2834
#define SP_ASM_SQR_ADD_NO(vl, vh, va)                    \
2835
    __asm__ __volatile__ (                               \
2836
        "lsr  r5, %[a], #16   \n\t"            \
2837
        "uxth r6, %[a]    \n\t"            \
2838
        /* al * al */                                    \
2839
        "mul  r6, r6      \n\t"            \
2840
        /* ah * ah */                                    \
2841
        "mul  r5, r5      \n\t"            \
2842
        "add  %[l], %[l], r6    \n\t"            \
2843
        "adc  %[h], r5    \n\t"            \
2844
        "lsr  r5, %[a], #16   \n\t"            \
2845
        "uxth r6, %[a]    \n\t"            \
2846
        /* 2 * al * ah */                                \
2847
        "mul  r6, r5      \n\t"            \
2848
        "lsr  r5, r6, #15   \n\t"            \
2849
        "lsl  r6, r6, #17   \n\t"            \
2850
        "add  %[l], %[l], r6    \n\t"            \
2851
        "adc  %[h], r5    \n\t"            \
2852
        : [l] "+l" (vl), [h] "+l" (vh)                   \
2853
        : [a] "l" (va)                                   \
2854
        : "r5", "r6", "cc"                               \
2855
    )
2856
/* Add va into: vh | vl
 *
 * va is added to vl and the resulting carry is added to vh by an "adc" of
 * a zero held in r5. Any carry out of vh is discarded. The "mov" of #0
 * sits between the "add" and the "adc", so the sequence depends on that
 * "mov" leaving the carry flag intact.
 *
 * r5 and the condition flags are listed as clobbered.
 *
 * @param  [in,out]  vl  Low word of the accumulator.
 * @param  [in,out]  vh  High word of the accumulator.
 * @param  [in]      va  Word to add.
 */
2857
#define SP_ASM_ADDC(vl, vh, va)                          \
2858
    __asm__ __volatile__ (                               \
2859
        "add  %[l], %[l], %[a]  \n\t"            \
2860
        "mov  r5, #0      \n\t"            \
2861
        "adc  %[h], r5    \n\t"            \
2862
        : [l] "+l" (vl), [h] "+l" (vh)                   \
2863
        : [a] "l" (va)                                   \
2864
        : "r5", "cc"                                     \
2865
    )
2866
/* Sub va from: vh | vl
 *
 * va is subtracted from vl and the resulting borrow is taken from vh by an
 * "sbc" of a zero held in r5. The "mov" of #0 sits between the "sub" and
 * the "sbc", so the sequence depends on that "mov" leaving the carry flag
 * intact.
 *
 * r5 and the condition flags are listed as clobbered.
 *
 * @param  [in,out]  vl  Low word of the value subtracted from.
 * @param  [in,out]  vh  High word of the value subtracted from.
 * @param  [in]      va  Word to subtract.
 */
2867
#define SP_ASM_SUBB(vl, vh, va)                          \
2868
    __asm__ __volatile__ (                               \
2869
        "sub  %[l], %[l], %[a]  \n\t"            \
2870
        "mov  r5, #0      \n\t"            \
2871
        "sbc  %[h], r5    \n\t"            \
2872
        : [l] "+l" (vl), [h] "+l" (vh)                   \
2873
        : [a] "l" (va)                                   \
2874
        : "r5", "cc"                                     \
2875
    )
2876
/* Add two times vc | vb | va into vo | vh | vl
 *
 * The three-word value is not doubled in a register; the same
 * add/adc/adc chain is simply issued twice, each time carrying from vl
 * through vh into vo. Any carry out of vo is discarded.
 *
 * No scratch registers are used; only the condition flags are listed as
 * clobbered.
 *
 * @param  [in,out]  vl  Low word of the accumulator.
 * @param  [in,out]  vh  Middle word of the accumulator.
 * @param  [in,out]  vo  High word of the accumulator.
 * @param  [in]      va  Low word of the value to add twice.
 * @param  [in]      vb  Middle word of the value to add twice.
 * @param  [in]      vc  High word of the value to add twice.
 */
2877
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc)         \
2878
    __asm__ __volatile__ (                               \
2879
        "add  %[l], %[l], %[a]  \n\t"            \
2880
        "adc  %[h], %[b]    \n\t"            \
2881
        "adc  %[o], %[c]    \n\t"            \
2882
        "add  %[l], %[l], %[a]  \n\t"            \
2883
        "adc  %[h], %[b]    \n\t"            \
2884
        "adc  %[o], %[c]    \n\t"            \
2885
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)    \
2886
        : [a] "l" (va), [b] "l" (vb), [c] "l" (vc)       \
2887
        : "cc"                                           \
2888
    )
2889
2890
#endif
2891
2892
#ifdef WOLFSSL_SP_DIV_WORD_HALF
2893
/* Divide a two digit number by a digit number and return. (hi | lo) / d
2894
 *
2895
 * No division instruction used - does operation bit by bit.
2896
 * Constant time.
2897
 *
2898
 * @param  [in]  hi  SP integer digit. High digit of the dividend.
2899
 * @param  [in]  lo  SP integer digit. Lower digit of the dividend.
2900
 * @param  [in]  d   SP integer digit. Number to divide by.
2901
 * @return  The division result.
2902
 */
2903
static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
2904
                                          sp_int_digit d)
2905
{
2906
    __asm__ __volatile__ (
2907
#if defined(__clang__) || defined(WOLFSSL_KEIL)
2908
        "lsrs r3, %[d], #24\n\t"
2909
#else
2910
        "lsr  r3, %[d], #24\n\t"
2911
#endif
2912
        "beq  2%=f\n\t"
2913
  "\n1%=:\n\t"
2914
        "movs r3, #0\n\t"
2915
        "b  3%=f\n\t"
2916
  "\n2%=:\n\t"
2917
        "mov  r3, #8\n\t"
2918
  "\n3%=:\n\t"
2919
        "movs r4, #31\n\t"
2920
#if defined(__clang__) || defined(WOLFSSL_KEIL)
2921
        "subs r4, r4, r3\n\t"
2922
#else
2923
        "sub  r4, r4, r3\n\t"
2924
#endif
2925
#if defined(__clang__) || defined(WOLFSSL_KEIL)
2926
        "lsls %[d], %[d], r3\n\t"
2927
#else
2928
        "lsl  %[d], %[d], r3\n\t"
2929
#endif
2930
#if defined(__clang__) || defined(WOLFSSL_KEIL)
2931
        "lsls %[hi], %[hi], r3\n\t"
2932
#else
2933
        "lsl  %[hi], %[hi], r3\n\t"
2934
#endif
2935
        "mov  r5, %[lo]\n\t"
2936
#if defined(__clang__) || defined(WOLFSSL_KEIL)
2937
        "lsrs r5, r5, r4\n\t"
2938
#else
2939
        "lsr  r5, r5, r4\n\t"
2940
#endif
2941
#if defined(__clang__) || defined(WOLFSSL_KEIL)
2942
        "lsls %[lo], %[lo], r3\n\t"
2943
#else
2944
        "lsl  %[lo], %[lo], r3\n\t"
2945
#endif
2946
#if defined(__clang__) || defined(WOLFSSL_KEIL)
2947
        "lsrs r5, r5, #1\n\t"
2948
#else
2949
        "lsr  r5, r5, #1\n\t"
2950
#endif
2951
#if defined(WOLFSSL_KEIL)
2952
        "orrs %[hi], %[hi], r5\n\t"
2953
#elif defined(__clang__)
2954
        "orrs %[hi], r5\n\t"
2955
#else
2956
        "orr  %[hi], r5\n\t"
2957
#endif
2958
2959
        "movs   r3, #0\n\t"
2960
#if defined(__clang__) || defined(WOLFSSL_KEIL)
2961
        "lsrs   r5, %[d], #1\n\t"
2962
#else
2963
        "lsr    r5, %[d], #1\n\t"
2964
#endif
2965
#if defined(__clang__) || defined(WOLFSSL_KEIL)
2966
        "adds   r5, r5, #1\n\t"
2967
#else
2968
        "add    r5, r5, #1\n\t"
2969
#endif
2970
        "mov    r8, %[lo]\n\t"
2971
        "mov    r9, %[hi]\n\t"
2972
        /* Do top 32 */
2973
        "movs   r6, r5\n\t"
2974
#if defined(__clang__) || defined(WOLFSSL_KEIL)
2975
        "subs   r6, r6, %[hi]\n\t"
2976
#else
2977
        "sub    r6, r6, %[hi]\n\t"
2978
#endif
2979
#ifdef WOLFSSL_KEIL
2980
        "sbcs   r6, r6, r6\n\t"
2981
#elif defined(__clang__)
2982
        "sbcs   r6, r6\n\t"
2983
#else
2984
        "sbc    r6, r6\n\t"
2985
#endif
2986
#if defined(__clang__) || defined(WOLFSSL_KEIL)
2987
        "adds   r3, r3, r3\n\t"
2988
#else
2989
        "add    r3, r3, r3\n\t"
2990
#endif
2991
#if defined(__clang__) || defined(WOLFSSL_KEIL)
2992
        "subs   r3, r3, r6\n\t"
2993
#else
2994
        "sub    r3, r3, r6\n\t"
2995
#endif
2996
#ifdef WOLFSSL_KEIL
2997
        "ands   r6, r6, r5\n\t"
2998
#elif defined(__clang__)
2999
        "ands   r6, r5\n\t"
3000
#else
3001
        "and    r6, r5\n\t"
3002
#endif
3003
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3004
        "subs   %[hi], %[hi], r6\n\t"
3005
#else
3006
        "sub    %[hi], %[hi], r6\n\t"
3007
#endif
3008
        "movs   r4, #29\n\t"
3009
        "\n"
3010
    "L_sp_div_word_loop%=:\n\t"
3011
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3012
        "lsls   %[lo], %[lo], #1\n\t"
3013
#else
3014
        "lsl    %[lo], %[lo], #1\n\t"
3015
#endif
3016
#ifdef WOLFSSL_KEIL
3017
        "adcs   %[hi], %[hi], %[hi]\n\t"
3018
#elif defined(__clang__)
3019
        "adcs   %[hi], %[hi]\n\t"
3020
#else
3021
        "adc    %[hi], %[hi]\n\t"
3022
#endif
3023
        "movs   r6, r5\n\t"
3024
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3025
        "subs   r6, r6, %[hi]\n\t"
3026
#else
3027
        "sub    r6, r6, %[hi]\n\t"
3028
#endif
3029
#ifdef WOLFSSL_KEIL
3030
        "sbcs   r6, r6, r6\n\t"
3031
#elif defined(__clang__)
3032
        "sbcs   r6, r6\n\t"
3033
#else
3034
        "sbc    r6, r6\n\t"
3035
#endif
3036
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3037
        "adds   r3, r3, r3\n\t"
3038
#else
3039
        "add    r3, r3, r3\n\t"
3040
#endif
3041
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3042
        "subs   r3, r3, r6\n\t"
3043
#else
3044
        "sub    r3, r3, r6\n\t"
3045
#endif
3046
#ifdef WOLFSSL_KEIL
3047
        "ands   r6, r6, r5\n\t"
3048
#elif defined(__clang__)
3049
        "ands   r6, r5\n\t"
3050
#else
3051
        "and    r6, r5\n\t"
3052
#endif
3053
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3054
        "subs   %[hi], %[hi], r6\n\t"
3055
#else
3056
        "sub    %[hi], %[hi], r6\n\t"
3057
#endif
3058
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3059
        "subs   r4, r4, #1\n\t"
3060
#else
3061
        "sub    r4, r4, #1\n\t"
3062
#endif
3063
        "bpl    L_sp_div_word_loop%=\n\t"
3064
        "movs   r7, #0\n\t"
3065
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3066
        "adds   r3, r3, r3\n\t"
3067
#else
3068
        "add    r3, r3, r3\n\t"
3069
#endif
3070
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3071
        "adds   r3, r3, #1\n\t"
3072
#else
3073
        "add    r3, r3, #1\n\t"
3074
#endif
3075
        /* r * d - Start */
3076
        "uxth   %[hi], r3\n\t"
3077
        "uxth   r4, %[d]\n\t"
3078
#ifdef WOLFSSL_KEIL
3079
        "muls   r4, %[hi], r4\n\t"
3080
#elif defined(__clang__)
3081
        "muls   r4, %[hi]\n\t"
3082
#else
3083
        "mul    r4, %[hi]\n\t"
3084
#endif
3085
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3086
        "lsrs   r6, %[d], #16\n\t"
3087
#else
3088
        "lsr    r6, %[d], #16\n\t"
3089
#endif
3090
#ifdef WOLFSSL_KEIL
3091
        "muls   %[hi], r6, %[hi]\n\t"
3092
#elif defined(__clang__)
3093
        "muls   %[hi], r6\n\t"
3094
#else
3095
        "mul    %[hi], r6\n\t"
3096
#endif
3097
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3098
        "lsrs   r5, %[hi], #16\n\t"
3099
#else
3100
        "lsr    r5, %[hi], #16\n\t"
3101
#endif
3102
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3103
        "lsls   %[hi], %[hi], #16\n\t"
3104
#else
3105
        "lsl    %[hi], %[hi], #16\n\t"
3106
#endif
3107
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3108
        "adds   r4, r4, %[hi]\n\t"
3109
#else
3110
        "add    r4, r4, %[hi]\n\t"
3111
#endif
3112
#ifdef WOLFSSL_KEIL
3113
        "adcs   r5, r5, r7\n\t"
3114
#elif defined(__clang__)
3115
        "adcs   r5, r7\n\t"
3116
#else
3117
        "adc    r5, r7\n\t"
3118
#endif
3119
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3120
        "lsrs   %[hi], r3, #16\n\t"
3121
#else
3122
        "lsr    %[hi], r3, #16\n\t"
3123
#endif
3124
#ifdef WOLFSSL_KEIL
3125
        "muls   r6, %[hi], r6\n\t"
3126
#elif defined(__clang__)
3127
        "muls   r6, %[hi]\n\t"
3128
#else
3129
        "mul    r6, %[hi]\n\t"
3130
#endif
3131
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3132
        "adds   r5, r5, r6\n\t"
3133
#else
3134
        "add    r5, r5, r6\n\t"
3135
#endif
3136
        "uxth   r6, %[d]\n\t"
3137
#ifdef WOLFSSL_KEIL
3138
        "muls   %[hi], r6, %[hi]\n\t"
3139
#elif defined(__clang__)
3140
        "muls   %[hi], r6\n\t"
3141
#else
3142
        "mul    %[hi], r6\n\t"
3143
#endif
3144
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3145
        "lsrs   r6, %[hi], #16\n\t"
3146
#else
3147
        "lsr    r6, %[hi], #16\n\t"
3148
#endif
3149
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3150
        "lsls   %[hi], %[hi], #16\n\t"
3151
#else
3152
        "lsl    %[hi], %[hi], #16\n\t"
3153
#endif
3154
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3155
        "adds   r4, r4, %[hi]\n\t"
3156
#else
3157
        "add    r4, r4, %[hi]\n\t"
3158
#endif
3159
#ifdef WOLFSSL_KEIL
3160
        "adcs   r5, r5, r6\n\t"
3161
#elif defined(__clang__)
3162
        "adcs   r5, r6\n\t"
3163
#else
3164
        "adc    r5, r6\n\t"
3165
#endif
3166
        /* r * d - Done */
3167
        "mov    %[hi], r8\n\t"
3168
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3169
        "subs   %[hi], %[hi], r4\n\t"
3170
#else
3171
        "sub    %[hi], %[hi], r4\n\t"
3172
#endif
3173
        "movs   r4, %[hi]\n\t"
3174
        "mov    %[hi], r9\n\t"
3175
#ifdef WOLFSSL_KEIL
3176
        "sbcs   %[hi], %[hi], r5\n\t"
3177
#elif defined(__clang__)
3178
        "sbcs   %[hi], r5\n\t"
3179
#else
3180
        "sbc    %[hi], r5\n\t"
3181
#endif
3182
        "movs   r5, %[hi]\n\t"
3183
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3184
        "adds   r3, r3, r5\n\t"
3185
#else
3186
        "add    r3, r3, r5\n\t"
3187
#endif
3188
        /* r * d - Start */
3189
        "uxth   %[hi], r3\n\t"
3190
        "uxth   r4, %[d]\n\t"
3191
#ifdef WOLFSSL_KEIL
3192
        "muls   r4, %[hi], r4\n\t"
3193
#elif defined(__clang__)
3194
        "muls   r4, %[hi]\n\t"
3195
#else
3196
        "mul    r4, %[hi]\n\t"
3197
#endif
3198
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3199
        "lsrs   r6, %[d], #16\n\t"
3200
#else
3201
        "lsr    r6, %[d], #16\n\t"
3202
#endif
3203
#ifdef WOLFSSL_KEIL
3204
        "muls   %[hi], r6, %[hi]\n\t"
3205
#elif defined(__clang__)
3206
        "muls   %[hi], r6\n\t"
3207
#else
3208
        "mul    %[hi], r6\n\t"
3209
#endif
3210
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3211
        "lsrs   r5, %[hi], #16\n\t"
3212
#else
3213
        "lsr    r5, %[hi], #16\n\t"
3214
#endif
3215
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3216
        "lsls   %[hi], %[hi], #16\n\t"
3217
#else
3218
        "lsl    %[hi], %[hi], #16\n\t"
3219
#endif
3220
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3221
        "adds   r4, r4, %[hi]\n\t"
3222
#else
3223
        "add    r4, r4, %[hi]\n\t"
3224
#endif
3225
#ifdef WOLFSSL_KEIL
3226
        "adcs   r5, r5, r7\n\t"
3227
#elif defined(__clang__)
3228
        "adcs   r5, r7\n\t"
3229
#else
3230
        "adc    r5, r7\n\t"
3231
#endif
3232
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3233
        "lsrs   %[hi], r3, #16\n\t"
3234
#else
3235
        "lsr    %[hi], r3, #16\n\t"
3236
#endif
3237
#ifdef WOLFSSL_KEIL
3238
        "muls   r6, %[hi], r6\n\t"
3239
#elif defined(__clang__)
3240
        "muls   r6, %[hi]\n\t"
3241
#else
3242
        "mul    r6, %[hi]\n\t"
3243
#endif
3244
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3245
        "adds   r5, r5, r6\n\t"
3246
#else
3247
        "add    r5, r5, r6\n\t"
3248
#endif
3249
        "uxth   r6, %[d]\n\t"
3250
#ifdef WOLFSSL_KEIL
3251
        "muls   %[hi], r6, %[hi]\n\t"
3252
#elif defined(__clang__)
3253
        "muls   %[hi], r6\n\t"
3254
#else
3255
        "mul    %[hi], r6\n\t"
3256
#endif
3257
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3258
        "lsrs   r6, %[hi], #16\n\t"
3259
#else
3260
        "lsr    r6, %[hi], #16\n\t"
3261
#endif
3262
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3263
        "lsls   %[hi], %[hi], #16\n\t"
3264
#else
3265
        "lsl    %[hi], %[hi], #16\n\t"
3266
#endif
3267
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3268
        "adds   r4, r4, %[hi]\n\t"
3269
#else
3270
        "add    r4, r4, %[hi]\n\t"
3271
#endif
3272
#ifdef WOLFSSL_KEIL
3273
        "adcs   r5, r5, r6\n\t"
3274
#elif defined(__clang__)
3275
        "adcs   r5, r6\n\t"
3276
#else
3277
        "adc    r5, r6\n\t"
3278
#endif
3279
        /* r * d - Done */
3280
        "mov    %[hi], r8\n\t"
3281
        "mov    r6, r9\n\t"
3282
#ifdef WOLFSSL_KEIL
3283
        "subs   r4, %[hi], r4\n\t"
3284
#else
3285
#ifdef __clang__
3286
        "subs   r4, %[hi], r4\n\t"
3287
#else
3288
        "sub    r4, %[hi], r4\n\t"
3289
#endif
3290
#endif
3291
#ifdef WOLFSSL_KEIL
3292
        "sbcs   r6, r6, r5\n\t"
3293
#elif defined(__clang__)
3294
        "sbcs   r6, r5\n\t"
3295
#else
3296
        "sbc    r6, r5\n\t"
3297
#endif
3298
        "movs   r5, r6\n\t"
3299
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3300
        "adds   r3, r3, r5\n\t"
3301
#else
3302
        "add    r3, r3, r5\n\t"
3303
#endif
3304
        /* r * d - Start */
3305
        "uxth   %[hi], r3\n\t"
3306
        "uxth   r4, %[d]\n\t"
3307
#ifdef WOLFSSL_KEIL
3308
        "muls   r4, %[hi], r4\n\t"
3309
#elif defined(__clang__)
3310
        "muls   r4, %[hi]\n\t"
3311
#else
3312
        "mul    r4, %[hi]\n\t"
3313
#endif
3314
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3315
        "lsrs   r6, %[d], #16\n\t"
3316
#else
3317
        "lsr    r6, %[d], #16\n\t"
3318
#endif
3319
#ifdef WOLFSSL_KEIL
3320
        "muls   %[hi], r6, %[hi]\n\t"
3321
#elif defined(__clang__)
3322
        "muls   %[hi], r6\n\t"
3323
#else
3324
        "mul    %[hi], r6\n\t"
3325
#endif
3326
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3327
        "lsrs   r5, %[hi], #16\n\t"
3328
#else
3329
        "lsr    r5, %[hi], #16\n\t"
3330
#endif
3331
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3332
        "lsls   %[hi], %[hi], #16\n\t"
3333
#else
3334
        "lsl    %[hi], %[hi], #16\n\t"
3335
#endif
3336
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3337
        "adds   r4, r4, %[hi]\n\t"
3338
#else
3339
        "add    r4, r4, %[hi]\n\t"
3340
#endif
3341
#ifdef WOLFSSL_KEIL
3342
        "adcs   r5, r5, r7\n\t"
3343
#elif defined(__clang__)
3344
        "adcs   r5, r7\n\t"
3345
#else
3346
        "adc    r5, r7\n\t"
3347
#endif
3348
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3349
        "lsrs   %[hi], r3, #16\n\t"
3350
#else
3351
        "lsr    %[hi], r3, #16\n\t"
3352
#endif
3353
#ifdef WOLFSSL_KEIL
3354
        "muls   r6, %[hi], r6\n\t"
3355
#elif defined(__clang__)
3356
        "muls   r6, %[hi]\n\t"
3357
#else
3358
        "mul    r6, %[hi]\n\t"
3359
#endif
3360
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3361
        "adds   r5, r5, r6\n\t"
3362
#else
3363
        "add    r5, r5, r6\n\t"
3364
#endif
3365
        "uxth   r6, %[d]\n\t"
3366
#ifdef WOLFSSL_KEIL
3367
        "muls   %[hi], r6, %[hi]\n\t"
3368
#elif defined(__clang__)
3369
        "muls   %[hi], r6\n\t"
3370
#else
3371
        "mul    %[hi], r6\n\t"
3372
#endif
3373
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3374
        "lsrs   r6, %[hi], #16\n\t"
3375
#else
3376
        "lsr    r6, %[hi], #16\n\t"
3377
#endif
3378
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3379
        "lsls   %[hi], %[hi], #16\n\t"
3380
#else
3381
        "lsl    %[hi], %[hi], #16\n\t"
3382
#endif
3383
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3384
        "adds   r4, r4, %[hi]\n\t"
3385
#else
3386
        "add    r4, r4, %[hi]\n\t"
3387
#endif
3388
#ifdef WOLFSSL_KEIL
3389
        "adcs   r5, r5, r6\n\t"
3390
#elif defined(__clang__)
3391
        "adcs   r5, r6\n\t"
3392
#else
3393
        "adc    r5, r6\n\t"
3394
#endif
3395
        /* r * d - Done */
3396
        "mov    %[hi], r8\n\t"
3397
        "mov    r6, r9\n\t"
3398
#ifdef WOLFSSL_KEIL
3399
        "subs   r4, %[hi], r4\n\t"
3400
#else
3401
#ifdef __clang__
3402
        "subs   r4, %[hi], r4\n\t"
3403
#else
3404
        "sub    r4, %[hi], r4\n\t"
3405
#endif
3406
#endif
3407
#ifdef WOLFSSL_KEIL
3408
        "sbcs   r6, r6, r5\n\t"
3409
#elif defined(__clang__)
3410
        "sbcs   r6, r5\n\t"
3411
#else
3412
        "sbc    r6, r5\n\t"
3413
#endif
3414
        "movs   r5, r6\n\t"
3415
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3416
        "adds   r3, r3, r5\n\t"
3417
#else
3418
        "add    r3, r3, r5\n\t"
3419
#endif
3420
        "movs   r6, %[d]\n\t"
3421
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3422
        "subs   r6, r6, r4\n\t"
3423
#else
3424
        "sub    r6, r6, r4\n\t"
3425
#endif
3426
#ifdef WOLFSSL_KEIL
3427
        "sbcs   r6, r6, r6\n\t"
3428
#elif defined(__clang__)
3429
        "sbcs   r6, r6\n\t"
3430
#else
3431
        "sbc    r6, r6\n\t"
3432
#endif
3433
#if defined(__clang__) || defined(WOLFSSL_KEIL)
3434
        "subs   r3, r3, r6\n\t"
3435
#else
3436
        "sub    r3, r3, r6\n\t"
3437
#endif
3438
        "movs   %[hi], r3\n\t"
3439
        : [hi] "+l" (hi), [lo] "+l" (lo), [d] "+l" (d)
3440
        :
3441
        : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
3442
    );
3443
    return (sp_uint32)(size_t)hi;
3444
}
3445
3446
/* Marker macro with no value, defined directly after the inline-assembly
 * routine above and inside the !WOLFSSL_SP_DIV_WORD_HALF conditional.
 * NOTE(review): presumably tested elsewhere to select that routine for word
 * division - confirm at the use sites. */
#define SP_ASM_DIV_WORD
3447
#endif /* !WOLFSSL_SP_DIV_WORD_HALF */
3448
3449
/* Marker macro with no value, defined at the end of the
 * WOLFSSL_SP_ARM_THUMB && SP_WORD_SIZE == 32 section.
 * NOTE(review): presumably signals that architecture-specific assembly has
 * been provided - confirm at the use sites. */
#define SP_INT_ASM_AVAILABLE
3450
3451
    #endif /* WOLFSSL_SP_ARM_THUMB && SP_WORD_SIZE == 32 */
3452
3453
    #if defined(WOLFSSL_SP_PPC64) && SP_WORD_SIZE == 64
3454
/*
3455
 * CPU: PPC64
3456
 */
3457
3458
    #ifdef __APPLE__
3459
3460
/* PPC64 (Apple register syntax): unsigned word multiply.
 *
 *   vh | vl = va * vb
 *
 * mulld writes the low word of the product and mulhdu the high word.
 * vl and vh are declared as read-write ("+r") operands.
 */
#define SP_ASM_MUL(vl, vh, va, vb)                         \
    __asm__ __volatile__ (                                 \
        "mulld   %[lo], %[x], %[y]\n\t"                    \
        "mulhdu  %[hi], %[x], %[y]\n\t"                    \
        : [lo] "+r" (vl), [hi] "+r" (vh)                   \
        : [x] "r" (va), [y] "r" (vb)                       \
        :                                                  \
    )
3469
/* PPC64 (Apple register syntax): unsigned word multiply into three words.
 *
 *   vo | vh | vl = va * vb      (vo is set to 0)
 *
 * The high word is computed first, then the low word, and vo is cleared
 * last with li.
 */
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb)                 \
    __asm__ __volatile__ (                                 \
        "mulhdu  %[hi], %[x], %[y]\n\t"                    \
        "mulld   %[lo], %[x], %[y]\n\t"                    \
        "li      %[ov], 0\n\t"                             \
        : [lo] "+r" (vl), [hi] "+r" (vh), [ov] "=r" (vo)   \
        : [x] "r" (va), [y] "r" (vb)                       \
        :                                                  \
    )
3479
/* PPC64 (Apple register syntax): multiply and accumulate into three words.
 *
 *   vo | vh | vl += va * vb
 *
 * The product is formed in scratch registers r16 (low) and r17 (high), both
 * listed as clobbers, and added with the carry rippling from vl through vh
 * into vo.
 */
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb)                 \
    __asm__ __volatile__ (                                 \
        "mulld   r16, %[x], %[y]\n\t"                      \
        "mulhdu  r17, %[x], %[y]\n\t"                      \
        "addc    %[lo], %[lo], r16\n\t"                    \
        "adde    %[hi], %[hi], r17\n\t"                    \
        "addze   %[ov], %[ov]\n\t"                         \
        : [lo] "+r" (vl), [hi] "+r" (vh), [ov] "+r" (vo)   \
        : [x] "r" (va), [y] "r" (vb)                       \
        : "r16", "r17", "cc"                               \
    )
3491
/* PPC64 (Apple register syntax): multiply and accumulate into two words.
 *
 *   vh | vl += va * vb
 *
 * Same as the three word form but any carry out of vh is not recorded.
 * r16/r17 hold the product and are listed as clobbers.
 */
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb)                  \
    __asm__ __volatile__ (                                 \
        "mulld   r16, %[x], %[y]\n\t"                      \
        "mulhdu  r17, %[x], %[y]\n\t"                      \
        "addc    %[lo], %[lo], r16\n\t"                    \
        "adde    %[hi], %[hi], r17\n\t"                    \
        : [lo] "+r" (vl), [hi] "+r" (vh)                   \
        : [x] "r" (va), [y] "r" (vb)                       \
        : "r16", "r17", "cc"                               \
    )
3502
/* PPC64 (Apple register syntax): multiply and accumulate the product twice.
 *
 *   vo | vh | vl += 2 * (va * vb)
 *
 * The product is computed once into r16/r17 (clobbered) and the three word
 * add-with-carry sequence is executed two times.
 */
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb)                \
    __asm__ __volatile__ (                                 \
        "mulld   r16, %[x], %[y]\n\t"                      \
        "mulhdu  r17, %[x], %[y]\n\t"                      \
        "addc    %[lo], %[lo], r16\n\t"                    \
        "adde    %[hi], %[hi], r17\n\t"                    \
        "addze   %[ov], %[ov]\n\t"                         \
        "addc    %[lo], %[lo], r16\n\t"                    \
        "adde    %[hi], %[hi], r17\n\t"                    \
        "addze   %[ov], %[ov]\n\t"                         \
        : [lo] "+r" (vl), [hi] "+r" (vh), [ov] "+r" (vo)   \
        : [x] "r" (va), [y] "r" (vb)                       \
        : "r16", "r17", "cc"                               \
    )
3517
/* PPC64 (Apple register syntax): multiply and accumulate the product twice.
 *
 *   vo | vh | vl += 2 * (va * vb)
 *
 * The caller guarantees the first addition cannot carry out of vh | vl, so
 * vo is only updated after the second addition. r16/r17 are clobbered.
 */
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)             \
    __asm__ __volatile__ (                                 \
        "mulld   r16, %[x], %[y]\n\t"                      \
        "mulhdu  r17, %[x], %[y]\n\t"                      \
        "addc    %[lo], %[lo], r16\n\t"                    \
        "adde    %[hi], %[hi], r17\n\t"                    \
        "addc    %[lo], %[lo], r16\n\t"                    \
        "adde    %[hi], %[hi], r17\n\t"                    \
        "addze   %[ov], %[ov]\n\t"                         \
        : [lo] "+r" (vl), [hi] "+r" (vh), [ov] "+r" (vo)   \
        : [x] "r" (va), [y] "r" (vb)                       \
        : "r16", "r17", "cc"                               \
    )
3533
/* PPC64 (Apple register syntax): unsigned word square.
 *
 *   vh | vl = va * va
 *
 * vl and vh are declared as read-write ("+r") operands.
 */
#define SP_ASM_SQR(vl, vh, va)                             \
    __asm__ __volatile__ (                                 \
        "mulld   %[lo], %[x], %[x]\n\t"                    \
        "mulhdu  %[hi], %[x], %[x]\n\t"                    \
        : [lo] "+r" (vl), [hi] "+r" (vh)                   \
        : [x] "r" (va)                                     \
        :                                                  \
    )
3542
/* PPC64 (Apple register syntax): square and accumulate into three words.
 *
 *   vo | vh | vl += va * va
 *
 * The square is formed in scratch registers r16/r17 (clobbered) and added
 * with the carry rippling from vl through vh into vo.
 */
#define SP_ASM_SQR_ADD(vl, vh, vo, va)                     \
    __asm__ __volatile__ (                                 \
        "mulld   r16, %[x], %[x]\n\t"                      \
        "mulhdu  r17, %[x], %[x]\n\t"                      \
        "addc    %[lo], %[lo], r16\n\t"                    \
        "adde    %[hi], %[hi], r17\n\t"                    \
        "addze   %[ov], %[ov]\n\t"                         \
        : [lo] "+r" (vl), [hi] "+r" (vh), [ov] "+r" (vo)   \
        : [x] "r" (va)                                     \
        : "r16", "r17", "cc"                               \
    )
3554
/* PPC64 (Apple register syntax): square and accumulate into two words.
 *
 *   vh | vl += va * va
 *
 * Any carry out of vh is not recorded. r16/r17 hold the square and are
 * listed as clobbers.
 */
#define SP_ASM_SQR_ADD_NO(vl, vh, va)                      \
    __asm__ __volatile__ (                                 \
        "mulld   r16, %[x], %[x]\n\t"                      \
        "mulhdu  r17, %[x], %[x]\n\t"                      \
        "addc    %[lo], %[lo], r16\n\t"                    \
        "adde    %[hi], %[hi], r17\n\t"                    \
        : [lo] "+r" (vl), [hi] "+r" (vh)                   \
        : [x] "r" (va)                                     \
        : "r16", "r17", "cc"                               \
    )
3565
/* PPC64 (Apple register syntax): add one word into a two word value.
 *
 *   vh | vl += va
 *
 * addc adds va to vl and records the carry; addze folds that carry into vh.
 */
#define SP_ASM_ADDC(vl, vh, va)                            \
    __asm__ __volatile__ (                                 \
        "addc    %[lo], %[lo], %[x]\n\t"                   \
        "addze   %[hi], %[hi]\n\t"                         \
        : [lo] "+r" (vl), [hi] "+r" (vh)                   \
        : [x] "r" (va)                                     \
        : "cc"                                             \
    )
3574
/* PPC64 (Apple register syntax): subtract one word from a two word value.
 *
 *   vh | vl -= va
 *
 * subfc subtracts va from vl; r16 (clobbered) is then zeroed so that subfe
 * subtracts only the borrow from vh.
 */
#define SP_ASM_SUBB(vl, vh, va)                            \
    __asm__ __volatile__ (                                 \
        "subfc   %[lo], %[x], %[lo]\n\t"                   \
        "li      r16, 0\n\t"                               \
        "subfe   %[hi], r16, %[hi]\n\t"                    \
        : [lo] "+r" (vl), [hi] "+r" (vh)                   \
        : [x] "r" (va)                                     \
        : "r16", "cc"                                      \
    )
3584
/* PPC64 (Apple register syntax): add a three word value twice.
 *
 *   vo | vh | vl += 2 * (vc | vb | va)
 *
 * The three word add-with-carry chain is simply executed two times.
 */
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc)           \
    __asm__ __volatile__ (                                 \
        "addc    %[lo], %[lo], %[x]\n\t"                   \
        "adde    %[hi], %[hi], %[y]\n\t"                   \
        "adde    %[ov], %[ov], %[z]\n\t"                   \
        "addc    %[lo], %[lo], %[x]\n\t"                   \
        "adde    %[hi], %[hi], %[y]\n\t"                   \
        "adde    %[ov], %[ov], %[z]\n\t"                   \
        : [lo] "+r" (vl), [hi] "+r" (vh), [ov] "+r" (vo)   \
        : [x] "r" (va), [y] "r" (vb), [z] "r" (vc)         \
        : "cc"                                             \
    )
3597
/* PPC64 (Apple register syntax): count leading zero bits.
 *
 *   vn = number of leading zero bits in the 64-bit value va (cntlzd)
 */
#define SP_ASM_LZCNT(va, vn)                               \
    __asm__ __volatile__ (                                 \
        "cntlzd  %[cnt], %[x]\n\t"                         \
        : [cnt] "=r" (vn)                                  \
        : [x] "r" (va)                                     \
        :                                                  \
    )
3605
3606
    #else  /* !defined(__APPLE__) */
3607
3608
/* PPC64 (bare register-number syntax): unsigned word multiply.
 *
 *   vh | vl = va * vb
 *
 * mulld writes the low word of the product and mulhdu the high word.
 * vl and vh are declared as read-write ("+r") operands.
 */
#define SP_ASM_MUL(vl, vh, va, vb)                         \
    __asm__ __volatile__ (                                 \
        "mulld   %[lo], %[x], %[y]\n\t"                    \
        "mulhdu  %[hi], %[x], %[y]\n\t"                    \
        : [lo] "+r" (vl), [hi] "+r" (vh)                   \
        : [x] "r" (va), [y] "r" (vb)                       \
        :                                                  \
    )
3617
/* PPC64 (bare register-number syntax): unsigned word multiply into three
 * words.
 *
 *   vo | vh | vl = va * vb      (vo is set to 0)
 *
 * The high word is computed first, then the low word, and vo is cleared
 * last with li.
 */
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb)                 \
    __asm__ __volatile__ (                                 \
        "mulhdu  %[hi], %[x], %[y]\n\t"                    \
        "mulld   %[lo], %[x], %[y]\n\t"                    \
        "li      %[ov], 0\n\t"                             \
        : [lo] "+r" (vl), [hi] "+r" (vh), [ov] "=r" (vo)   \
        : [x] "r" (va), [y] "r" (vb)                       \
        :                                                  \
    )
3627
/* PPC64 (bare register-number syntax): multiply and accumulate into three
 * words.
 *
 *   vo | vh | vl += va * vb
 *
 * The product is formed in scratch registers 16 (low) and 17 (high), both
 * listed as clobbers, and added with the carry rippling from vl through vh
 * into vo.
 */
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb)                 \
    __asm__ __volatile__ (                                 \
        "mulld   16, %[x], %[y]\n\t"                       \
        "mulhdu  17, %[x], %[y]\n\t"                       \
        "addc    %[lo], %[lo], 16\n\t"                     \
        "adde    %[hi], %[hi], 17\n\t"                     \
        "addze   %[ov], %[ov]\n\t"                         \
        : [lo] "+r" (vl), [hi] "+r" (vh), [ov] "+r" (vo)   \
        : [x] "r" (va), [y] "r" (vb)                       \
        : "16", "17", "cc"                                 \
    )
3639
/* PPC64 (bare register-number syntax): multiply and accumulate into two
 * words.
 *
 *   vh | vl += va * vb
 *
 * Any carry out of vh is not recorded. Registers 16/17 hold the product and
 * are listed as clobbers.
 */
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb)                  \
    __asm__ __volatile__ (                                 \
        "mulld   16, %[x], %[y]\n\t"                       \
        "mulhdu  17, %[x], %[y]\n\t"                       \
        "addc    %[lo], %[lo], 16\n\t"                     \
        "adde    %[hi], %[hi], 17\n\t"                     \
        : [lo] "+r" (vl), [hi] "+r" (vh)                   \
        : [x] "r" (va), [y] "r" (vb)                       \
        : "16", "17", "cc"                                 \
    )
3650
/* PPC64 (bare register-number syntax): multiply and accumulate the product
 * twice.
 *
 *   vo | vh | vl += 2 * (va * vb)
 *
 * The product is computed once into registers 16/17 (clobbered) and the
 * three word add-with-carry sequence is executed two times.
 */
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb)                \
    __asm__ __volatile__ (                                 \
        "mulld   16, %[x], %[y]\n\t"                       \
        "mulhdu  17, %[x], %[y]\n\t"                       \
        "addc    %[lo], %[lo], 16\n\t"                     \
        "adde    %[hi], %[hi], 17\n\t"                     \
        "addze   %[ov], %[ov]\n\t"                         \
        "addc    %[lo], %[lo], 16\n\t"                     \
        "adde    %[hi], %[hi], 17\n\t"                     \
        "addze   %[ov], %[ov]\n\t"                         \
        : [lo] "+r" (vl), [hi] "+r" (vh), [ov] "+r" (vo)   \
        : [x] "r" (va), [y] "r" (vb)                       \
        : "16", "17", "cc"                                 \
    )
3665
/* PPC64 (bare register-number syntax): multiply and accumulate the product
 * twice.
 *
 *   vo | vh | vl += 2 * (va * vb)
 *
 * The caller guarantees the first addition cannot carry out of vh | vl, so
 * vo is only updated after the second addition. Registers 16/17 are
 * clobbered.
 */
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)             \
    __asm__ __volatile__ (                                 \
        "mulld   16, %[x], %[y]\n\t"                       \
        "mulhdu  17, %[x], %[y]\n\t"                       \
        "addc    %[lo], %[lo], 16\n\t"                     \
        "adde    %[hi], %[hi], 17\n\t"                     \
        "addc    %[lo], %[lo], 16\n\t"                     \
        "adde    %[hi], %[hi], 17\n\t"                     \
        "addze   %[ov], %[ov]\n\t"                         \
        : [lo] "+r" (vl), [hi] "+r" (vh), [ov] "+r" (vo)   \
        : [x] "r" (va), [y] "r" (vb)                       \
        : "16", "17", "cc"                                 \
    )
3681
/* PPC64 (bare register-number syntax): unsigned word square.
 *
 *   vh | vl = va * va
 *
 * vl and vh are declared as read-write ("+r") operands.
 */
#define SP_ASM_SQR(vl, vh, va)                             \
    __asm__ __volatile__ (                                 \
        "mulld   %[lo], %[x], %[x]\n\t"                    \
        "mulhdu  %[hi], %[x], %[x]\n\t"                    \
        : [lo] "+r" (vl), [hi] "+r" (vh)                   \
        : [x] "r" (va)                                     \
        :                                                  \
    )
3690
/* PPC64 (bare register-number syntax): square and accumulate into three
 * words.
 *
 *   vo | vh | vl += va * va
 *
 * The square is formed in scratch registers 16/17 (clobbered) and added
 * with the carry rippling from vl through vh into vo.
 */
#define SP_ASM_SQR_ADD(vl, vh, vo, va)                     \
    __asm__ __volatile__ (                                 \
        "mulld   16, %[x], %[x]\n\t"                       \
        "mulhdu  17, %[x], %[x]\n\t"                       \
        "addc    %[lo], %[lo], 16\n\t"                     \
        "adde    %[hi], %[hi], 17\n\t"                     \
        "addze   %[ov], %[ov]\n\t"                         \
        : [lo] "+r" (vl), [hi] "+r" (vh), [ov] "+r" (vo)   \
        : [x] "r" (va)                                     \
        : "16", "17", "cc"                                 \
    )
3702
/* PPC64 (bare register-number syntax): square and accumulate into two words.
 *
 *   vh | vl += va * va
 *
 * Any carry out of vh is not recorded. Registers 16/17 hold the square and
 * are listed as clobbers.
 */
#define SP_ASM_SQR_ADD_NO(vl, vh, va)                      \
    __asm__ __volatile__ (                                 \
        "mulld   16, %[x], %[x]\n\t"                       \
        "mulhdu  17, %[x], %[x]\n\t"                       \
        "addc    %[lo], %[lo], 16\n\t"                     \
        "adde    %[hi], %[hi], 17\n\t"                     \
        : [lo] "+r" (vl), [hi] "+r" (vh)                   \
        : [x] "r" (va)                                     \
        : "16", "17", "cc"                                 \
    )
3713
/* PPC64 (bare register-number syntax): add one word into a two word value.
 *
 *   vh | vl += va
 *
 * addc adds va to vl and records the carry; addze folds that carry into vh.
 */
#define SP_ASM_ADDC(vl, vh, va)                            \
    __asm__ __volatile__ (                                 \
        "addc    %[lo], %[lo], %[x]\n\t"                   \
        "addze   %[hi], %[hi]\n\t"                         \
        : [lo] "+r" (vl), [hi] "+r" (vh)                   \
        : [x] "r" (va)                                     \
        : "cc"                                             \
    )
3722
/* PPC64 (bare register-number syntax): subtract one word from a two word
 * value.
 *
 *   vh | vl -= va
 *
 * subfc subtracts va from vl; register 16 (clobbered) is then zeroed so
 * that subfe subtracts only the borrow from vh.
 */
#define SP_ASM_SUBB(vl, vh, va)                            \
    __asm__ __volatile__ (                                 \
        "subfc   %[lo], %[x], %[lo]\n\t"                   \
        "li      16, 0\n\t"                                \
        "subfe   %[hi], 16, %[hi]\n\t"                     \
        : [lo] "+r" (vl), [hi] "+r" (vh)                   \
        : [x] "r" (va)                                     \
        : "16", "cc"                                       \
    )
3732
/* PPC64 (bare register-number syntax): add a three word value twice.
 *
 *   vo | vh | vl += 2 * (vc | vb | va)
 *
 * The three word add-with-carry chain is simply executed two times.
 */
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc)           \
    __asm__ __volatile__ (                                 \
        "addc    %[lo], %[lo], %[x]\n\t"                   \
        "adde    %[hi], %[hi], %[y]\n\t"                   \
        "adde    %[ov], %[ov], %[z]\n\t"                   \
        "addc    %[lo], %[lo], %[x]\n\t"                   \
        "adde    %[hi], %[hi], %[y]\n\t"                   \
        "adde    %[ov], %[ov], %[z]\n\t"                   \
        : [lo] "+r" (vl), [hi] "+r" (vh), [ov] "+r" (vo)   \
        : [x] "r" (va), [y] "r" (vb), [z] "r" (vc)         \
        : "cc"                                             \
    )
3745
/* PPC64 (bare register-number syntax): count leading zero bits.
 *
 *   vn = number of leading zero bits in the 64-bit value va (cntlzd)
 */
#define SP_ASM_LZCNT(va, vn)                               \
    __asm__ __volatile__ (                                 \
        "cntlzd  %[cnt], %[x]\n\t"                         \
        : [cnt] "=r" (vn)                                  \
        : [x] "r" (va)                                     \
        :                                                  \
    )
3753
3754
    #endif /* !defined(__APPLE__) */
3755
3756
/* Marker macro with no value, defined at the end of the
 * WOLFSSL_SP_PPC64 && SP_WORD_SIZE == 64 section.
 * NOTE(review): presumably signals that architecture-specific assembly has
 * been provided - confirm at the use sites. */
#define SP_INT_ASM_AVAILABLE
3757
3758
    #endif /* WOLFSSL_SP_PPC64 && SP_WORD_SIZE == 64 */
3759
3760
    #if defined(WOLFSSL_SP_PPC) && SP_WORD_SIZE == 32
3761
/*
3762
 * CPU: PPC 32-bit
3763
 */
3764
3765
    #ifdef __APPLE__
3766
3767
/* PPC 32-bit (Apple register syntax): unsigned word multiply.
 *
 *   vh | vl = va * vb
 *
 * mullw writes the low word of the product and mulhwu the high word.
 * vl and vh are declared as read-write ("+r") operands.
 */
#define SP_ASM_MUL(vl, vh, va, vb)                         \
    __asm__ __volatile__ (                                 \
        "mullw   %[lo], %[x], %[y]\n\t"                    \
        "mulhwu  %[hi], %[x], %[y]\n\t"                    \
        : [lo] "+r" (vl), [hi] "+r" (vh)                   \
        : [x] "r" (va), [y] "r" (vb)                       \
        :                                                  \
    )
3776
/* PPC 32-bit (Apple register syntax): unsigned word multiply into three
 * words.
 *
 *   vo | vh | vl = va * vb      (vo is set to 0)
 *
 * The high word is computed first, then the low word, and vo is cleared
 * last with li. No clobber list is given.
 */
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb)                 \
    __asm__ __volatile__ (                                 \
        "mulhwu  %[hi], %[x], %[y]\n\t"                    \
        "mullw   %[lo], %[x], %[y]\n\t"                    \
        "li      %[ov], 0\n\t"                             \
        : [lo] "+r" (vl), [hi] "+r" (vh), [ov] "=r" (vo)   \
        : [x] "r" (va), [y] "r" (vb)                       \
    )
3785
/* PPC 32-bit (Apple register syntax): multiply and accumulate into three
 * words.
 *
 *   vo | vh | vl += va * vb
 *
 * The product is formed in scratch registers r16 (low) and r17 (high), both
 * listed as clobbers, and added with the carry rippling from vl through vh
 * into vo.
 */
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb)                 \
    __asm__ __volatile__ (                                 \
        "mullw   r16, %[x], %[y]\n\t"                      \
        "mulhwu  r17, %[x], %[y]\n\t"                      \
        "addc    %[lo], %[lo], r16\n\t"                    \
        "adde    %[hi], %[hi], r17\n\t"                    \
        "addze   %[ov], %[ov]\n\t"                         \
        : [lo] "+r" (vl), [hi] "+r" (vh), [ov] "+r" (vo)   \
        : [x] "r" (va), [y] "r" (vb)                       \
        : "r16", "r17", "cc"                               \
    )
3797
/* PPC 32-bit (Apple register syntax): multiply and accumulate into two
 * words.
 *
 *   vh | vl += va * vb
 *
 * Any carry out of vh is not recorded. r16/r17 hold the product and are
 * listed as clobbers.
 */
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb)                  \
    __asm__ __volatile__ (                                 \
        "mullw   r16, %[x], %[y]\n\t"                      \
        "mulhwu  r17, %[x], %[y]\n\t"                      \
        "addc    %[lo], %[lo], r16\n\t"                    \
        "adde    %[hi], %[hi], r17\n\t"                    \
        : [lo] "+r" (vl), [hi] "+r" (vh)                   \
        : [x] "r" (va), [y] "r" (vb)                       \
        : "r16", "r17", "cc"                               \
    )
3808
/* PPC 32-bit (Apple register syntax): multiply and accumulate the product
 * twice.
 *
 *   vo | vh | vl += 2 * (va * vb)
 *
 * The product is computed once into r16/r17 (clobbered) and the three word
 * add-with-carry sequence is executed two times.
 */
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb)                \
    __asm__ __volatile__ (                                 \
        "mullw   r16, %[x], %[y]\n\t"                      \
        "mulhwu  r17, %[x], %[y]\n\t"                      \
        "addc    %[lo], %[lo], r16\n\t"                    \
        "adde    %[hi], %[hi], r17\n\t"                    \
        "addze   %[ov], %[ov]\n\t"                         \
        "addc    %[lo], %[lo], r16\n\t"                    \
        "adde    %[hi], %[hi], r17\n\t"                    \
        "addze   %[ov], %[ov]\n\t"                         \
        : [lo] "+r" (vl), [hi] "+r" (vh), [ov] "+r" (vo)   \
        : [x] "r" (va), [y] "r" (vb)                       \
        : "r16", "r17", "cc"                               \
    )
3823
/* PPC 32-bit (Apple register syntax): multiply and accumulate the product
 * twice.
 *
 *   vo | vh | vl += 2 * (va * vb)
 *
 * The caller guarantees the first addition cannot carry out of vh | vl, so
 * vo is only updated after the second addition. r16/r17 are clobbered.
 */
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)             \
    __asm__ __volatile__ (                                 \
        "mullw   r16, %[x], %[y]\n\t"                      \
        "mulhwu  r17, %[x], %[y]\n\t"                      \
        "addc    %[lo], %[lo], r16\n\t"                    \
        "adde    %[hi], %[hi], r17\n\t"                    \
        "addc    %[lo], %[lo], r16\n\t"                    \
        "adde    %[hi], %[hi], r17\n\t"                    \
        "addze   %[ov], %[ov]\n\t"                         \
        : [lo] "+r" (vl), [hi] "+r" (vh), [ov] "+r" (vo)   \
        : [x] "r" (va), [y] "r" (vb)                       \
        : "r16", "r17", "cc"                               \
    )
3839
/* PPC 32-bit (Apple register syntax): unsigned word square.
 *
 *   vh | vl = va * va
 *
 * vl and vh are declared as read-write ("+r") operands.
 */
#define SP_ASM_SQR(vl, vh, va)                             \
    __asm__ __volatile__ (                                 \
        "mullw   %[lo], %[x], %[x]\n\t"                    \
        "mulhwu  %[hi], %[x], %[x]\n\t"                    \
        : [lo] "+r" (vl), [hi] "+r" (vh)                   \
        : [x] "r" (va)                                     \
        :                                                  \
    )
3848
/* PPC 32-bit (Apple register syntax): square and accumulate into three
 * words.
 *
 *   vo | vh | vl += va * va
 *
 * The square is formed in scratch registers r16/r17 (clobbered) and added
 * with the carry rippling from vl through vh into vo.
 */
#define SP_ASM_SQR_ADD(vl, vh, vo, va)                     \
    __asm__ __volatile__ (                                 \
        "mullw   r16, %[x], %[x]\n\t"                      \
        "mulhwu  r17, %[x], %[x]\n\t"                      \
        "addc    %[lo], %[lo], r16\n\t"                    \
        "adde    %[hi], %[hi], r17\n\t"                    \
        "addze   %[ov], %[ov]\n\t"                         \
        : [lo] "+r" (vl), [hi] "+r" (vh), [ov] "+r" (vo)   \
        : [x] "r" (va)                                     \
        : "r16", "r17", "cc"                               \
    )
3860
/* PPC 32-bit (Apple register syntax): square and accumulate into two words.
 *
 *   vh | vl += va * va
 *
 * Any carry out of vh is not recorded. r16/r17 hold the square and are
 * listed as clobbers.
 */
#define SP_ASM_SQR_ADD_NO(vl, vh, va)                      \
    __asm__ __volatile__ (                                 \
        "mullw   r16, %[x], %[x]\n\t"                      \
        "mulhwu  r17, %[x], %[x]\n\t"                      \
        "addc    %[lo], %[lo], r16\n\t"                    \
        "adde    %[hi], %[hi], r17\n\t"                    \
        : [lo] "+r" (vl), [hi] "+r" (vh)                   \
        : [x] "r" (va)                                     \
        : "r16", "r17", "cc"                               \
    )
3871
/* PPC 32-bit (Apple register syntax): add one word into a two word value.
 *
 *   vh | vl += va
 *
 * addc adds va to vl and records the carry; addze folds that carry into vh.
 */
#define SP_ASM_ADDC(vl, vh, va)                            \
    __asm__ __volatile__ (                                 \
        "addc    %[lo], %[lo], %[x]\n\t"                   \
        "addze   %[hi], %[hi]\n\t"                         \
        : [lo] "+r" (vl), [hi] "+r" (vh)                   \
        : [x] "r" (va)                                     \
        : "cc"                                             \
    )
3880
/* PPC 32-bit (Apple register syntax): subtract one word from a two word
 * value.
 *
 *   vh | vl -= va
 *
 * subfc subtracts va from vl; r16 (clobbered) is then zeroed so that subfe
 * subtracts only the borrow from vh.
 */
#define SP_ASM_SUBB(vl, vh, va)                            \
    __asm__ __volatile__ (                                 \
        "subfc   %[lo], %[x], %[lo]\n\t"                   \
        "li      r16, 0\n\t"                               \
        "subfe   %[hi], r16, %[hi]\n\t"                    \
        : [lo] "+r" (vl), [hi] "+r" (vh)                   \
        : [x] "r" (va)                                     \
        : "r16", "cc"                                      \
    )
3890
/* PPC 32-bit (Apple register syntax): add a three word value twice.
 *
 *   vo | vh | vl += 2 * (vc | vb | va)
 *
 * The three word add-with-carry chain is simply executed two times.
 */
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc)           \
    __asm__ __volatile__ (                                 \
        "addc    %[lo], %[lo], %[x]\n\t"                   \
        "adde    %[hi], %[hi], %[y]\n\t"                   \
        "adde    %[ov], %[ov], %[z]\n\t"                   \
        "addc    %[lo], %[lo], %[x]\n\t"                   \
        "adde    %[hi], %[hi], %[y]\n\t"                   \
        "adde    %[ov], %[ov], %[z]\n\t"                   \
        : [lo] "+r" (vl), [hi] "+r" (vh), [ov] "+r" (vo)   \
        : [x] "r" (va), [y] "r" (vb), [z] "r" (vc)         \
        : "cc"                                             \
    )
3903
/* PPC 32-bit (Apple register syntax): count leading zero bits.
 *
 *   vn = number of leading zero bits in the 32-bit value va (cntlzw)
 *
 * No clobber list is given.
 */
#define SP_ASM_LZCNT(va, vn)                               \
    __asm__ __volatile__ (                                 \
        "cntlzw  %[cnt], %[x]\n\t"                         \
        : [cnt] "=r" (vn)                                  \
        : [x] "r" (va)                                     \
    )
3910
3911
    #else /* !defined(__APPLE__) */
3912
3913
/* Multiply va by vb and store double size result in: vh | vl
 *
 * mullw writes the low word of the product to vl and mulhwu writes the
 * high word of the unsigned product to vh.
 *
 * vl, vh: declared read-write ("+r") although their incoming values are
 * not used by the instructions. va, vb: input register operands.
 * The clobber list is empty.
 * NOTE(review): va and vb are read by mulhwu after vl has been written and
 * vl is not marked early-clobber - confirm they can never share a register.
 */
3914
#define SP_ASM_MUL(vl, vh, va, vb)                       \
3915
    __asm__ __volatile__ (                               \
3916
        "mullw  %[l], %[a], %[b]  \n\t"            \
3917
        "mulhwu %[h], %[a], %[b]  \n\t"            \
3918
        : [h] "+r" (vh), [l] "+r" (vl)                   \
3919
        : [a] "r" (va), [b] "r" (vb)                     \
3920
        :                                                \
3921
    )
3922
/* Multiply va by vb and store double size result in: vo | vh | vl
 *
 * mulhwu writes the high word of the unsigned product to vh, mullw writes
 * the low word to vl, and vo is then zeroed by XORing it with itself.
 *
 * vl, vh: declared read-write ("+r") although their incoming values are
 * not used. vo: write-only register operand. va, vb: input register
 * operands. No clobber list is given.
 * NOTE(review): va and vb are read by mullw after vh has been written and
 * vh is not marked early-clobber - confirm they can never share a register.
 */
3923
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb)               \
3924
    __asm__ __volatile__ (                               \
3925
        "mulhwu %[h], %[a], %[b]  \n\t"            \
3926
        "mullw  %[l], %[a], %[b]  \n\t"            \
3927
        "xor  %[o], %[o], %[o]  \n\t"            \
3928
        : [l] "+r" (vl), [h] "+r" (vh), [o] "=r" (vo)    \
3929
        : [a] "r" (va), [b] "r" (vb)                     \
3930
    )
3931
/* Multiply va by vb and add double size result into: vo | vh | vl
 *
 * The low and high words of the product are computed into the fixed
 * scratch registers 16 and 17. addc/adde add them to vl and vh, and addze
 * adds the resulting carry to vo.
 *
 * vl, vh, vo: read-write register operands. va, vb: input register
 * operands. Declares the "16", "17" and "cc" clobbers.
 */
3932
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb)               \
3933
    __asm__ __volatile__ (                               \
3934
        "mullw  16, %[a], %[b]    \n\t"            \
3935
        "mulhwu 17, %[a], %[b]    \n\t"            \
3936
        "addc %[l], %[l], 16    \n\t"            \
3937
        "adde %[h], %[h], 17    \n\t"            \
3938
        "addze  %[o], %[o]    \n\t"            \
3939
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
3940
        : [a] "r" (va), [b] "r" (vb)                     \
3941
        : "16", "17", "cc"                               \
3942
    )
3943
/* Multiply va by vb and add double size result into: vh | vl
 *
 * The low and high words of the product are computed into the fixed
 * scratch registers 16 and 17 and added to vl and vh with addc/adde.
 * There is no third word: a carry out of vh is not kept.
 *
 * vl, vh: read-write register operands. va, vb: input register operands.
 * Declares the "16", "17" and "cc" clobbers.
 */
3944
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb)                \
3945
    __asm__ __volatile__ (                               \
3946
        "mullw  16, %[a], %[b]    \n\t"            \
3947
        "mulhwu 17, %[a], %[b]    \n\t"            \
3948
        "addc %[l], %[l], 16    \n\t"            \
3949
        "adde %[h], %[h], 17    \n\t"            \
3950
        : [l] "+r" (vl), [h] "+r" (vh)                   \
3951
        : [a] "r" (va), [b] "r" (vb)                     \
3952
        : "16", "17", "cc"                               \
3953
    )
3954
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 *
 * The product is computed once into the fixed scratch registers 16 (low
 * word) and 17 (high word). The addc/adde/addze sequence that adds it to
 * vl, vh and carries into vo is then issued two times.
 *
 * vl, vh, vo: read-write register operands. va, vb: input register
 * operands. Declares the "16", "17" and "cc" clobbers.
 */
3955
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb)              \
3956
    __asm__ __volatile__ (                               \
3957
        "mullw  16, %[a], %[b]    \n\t"            \
3958
        "mulhwu 17, %[a], %[b]    \n\t"            \
3959
        "addc %[l], %[l], 16    \n\t"            \
3960
        "adde %[h], %[h], 17    \n\t"            \
3961
        "addze  %[o], %[o]    \n\t"            \
3962
        "addc %[l], %[l], 16    \n\t"            \
3963
        "adde %[h], %[h], 17    \n\t"            \
3964
        "addze  %[o], %[o]    \n\t"            \
3965
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
3966
        : [a] "r" (va), [b] "r" (vb)                     \
3967
        : "16", "17", "cc"                               \
3968
    )
3969
/* Multiply va by vb and add double size result twice into: vo | vh | vl
3970
 * Assumes first add will not overflow vh | vl
3971
 *
 * The product is computed once into the fixed scratch registers 16 (low
 * word) and 17 (high word). The first addc/adde pair has no addze after it,
 * so a carry out of vh from that first add is not propagated to vo; only
 * the second add carries into vo.
 *
 * vl, vh, vo: read-write register operands. va, vb: input register
 * operands. Declares the "16", "17" and "cc" clobbers.
 */
3972
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)           \
3973
    __asm__ __volatile__ (                               \
3974
        "mullw  16, %[a], %[b]    \n\t"            \
3975
        "mulhwu 17, %[a], %[b]    \n\t"            \
3976
        "addc %[l], %[l], 16    \n\t"            \
3977
        "adde %[h], %[h], 17    \n\t"            \
3978
        "addc %[l], %[l], 16    \n\t"            \
3979
        "adde %[h], %[h], 17    \n\t"            \
3980
        "addze  %[o], %[o]    \n\t"            \
3981
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
3982
        : [a] "r" (va), [b] "r" (vb)                     \
3983
        : "16", "17", "cc"                               \
3984
    )
3985
/* Square va and store double size result in: vh | vl
 *
 * mullw writes the low word of va * va to vl and mulhwu writes the high
 * word of the unsigned product to vh.
 *
 * vl, vh: declared read-write ("+r") although their incoming values are
 * not used. va: input register operand. The clobber list is empty.
 * NOTE(review): va is read by mulhwu after vl has been written and vl is
 * not marked early-clobber - confirm they can never share a register.
 */
3986
#define SP_ASM_SQR(vl, vh, va)                           \
3987
    __asm__ __volatile__ (                               \
3988
        "mullw  %[l], %[a], %[a]  \n\t"            \
3989
        "mulhwu %[h], %[a], %[a]  \n\t"            \
3990
        : [h] "+r" (vh), [l] "+r" (vl)                   \
3991
        : [a] "r" (va)                                   \
3992
        :                                                \
3993
    )
3994
/* Square va and add double size result into: vo | vh | vl
 *
 * The low and high words of va * va are computed into the fixed scratch
 * registers 16 and 17. addc/adde add them to vl and vh, and addze adds the
 * resulting carry to vo.
 *
 * vl, vh, vo: read-write register operands. va: input register operand.
 * Declares the "16", "17" and "cc" clobbers.
 */
3995
#define SP_ASM_SQR_ADD(vl, vh, vo, va)                   \
3996
    __asm__ __volatile__ (                               \
3997
        "mullw  16, %[a], %[a]    \n\t"            \
3998
        "mulhwu 17, %[a], %[a]    \n\t"            \
3999
        "addc %[l], %[l], 16    \n\t"            \
4000
        "adde %[h], %[h], 17    \n\t"            \
4001
        "addze  %[o], %[o]    \n\t"            \
4002
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4003
        : [a] "r" (va)                                   \
4004
        : "16", "17", "cc"                               \
4005
    )
4006
/* Square va and add double size result into: vh | vl
 *
 * The low and high words of va * va are computed into the fixed scratch
 * registers 16 and 17 and added to vl and vh with addc/adde. There is no
 * third word: a carry out of vh is not kept.
 *
 * vl, vh: read-write register operands. va: input register operand.
 * Declares the "16", "17" and "cc" clobbers.
 */
4007
#define SP_ASM_SQR_ADD_NO(vl, vh, va)                    \
4008
    __asm__ __volatile__ (                               \
4009
        "mullw  16, %[a], %[a]    \n\t"            \
4010
        "mulhwu 17, %[a], %[a]    \n\t"            \
4011
        "addc %[l], %[l], 16    \n\t"            \
4012
        "adde %[h], %[h], 17    \n\t"            \
4013
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4014
        : [a] "r" (va)                                   \
4015
        : "16", "17", "cc"                               \
4016
    )
4017
/* Add va into: vh | vl
 *
 * addc adds va to vl and records the carry; addze then adds that carry to
 * vh. A carry out of vh is not kept.
 *
 * vl, vh: read-write register operands. va: input register operand.
 * Declares the "cc" clobber.
 */
4018
#define SP_ASM_ADDC(vl, vh, va)                          \
4019
    __asm__ __volatile__ (                               \
4020
        "addc %[l], %[l], %[a]  \n\t"            \
4021
        "addze  %[h], %[h]    \n\t"            \
4022
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4023
        : [a] "r" (va)                                   \
4024
        : "cc"                                           \
4025
    )
4026
/* Sub va from: vh | vl
 *
 * subfc computes vl - va and records the borrow in the carry bit. Register
 * 16 is then zeroed (XOR with itself) so that subfe subtracts only that
 * borrow from vh.
 *
 * vl, vh: read-write register operands. va: input register operand.
 * Uses register 16 as scratch; declares the "16" and "cc" clobbers.
 */
4027
#define SP_ASM_SUBB(vl, vh, va)                          \
4028
    __asm__ __volatile__ (                               \
4029
        "subfc  %[l], %[a], %[l]  \n\t"            \
4030
        "xor  16, 16, 16    \n\t"            \
4031
        "subfe  %[h], 16, %[h]    \n\t"            \
4032
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4033
        : [a] "r" (va)                                   \
4034
        : "16", "cc"                                     \
4035
    )
4036
/* Add two times vc | vb | va into vo | vh | vl
 *
 * The same three instruction carry chain (addc, adde, adde) is issued twice,
 * so the three word value vc | vb | va is added to vo | vh | vl two times.
 * A carry out of vo is not kept.
 *
 * vl, vh, vo: read-write register operands. va, vb, vc: input register
 * operands. Declares the "cc" clobber.
 * NOTE(review): va, vb and vc are read again after vl, vh and vo have been
 * written and the outputs are not marked early-clobber - confirm an input
 * can never share a register with an output.
 */
4037
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc)         \
4038
    __asm__ __volatile__ (                               \
4039
        "addc %[l], %[l], %[a]  \n\t"            \
4040
        "adde %[h], %[h], %[b]  \n\t"            \
4041
        "adde %[o], %[o], %[c]  \n\t"            \
4042
        "addc %[l], %[l], %[a]  \n\t"            \
4043
        "adde %[h], %[h], %[b]  \n\t"            \
4044
        "adde %[o], %[o], %[c]  \n\t"            \
4045
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4046
        : [a] "r" (va), [b] "r" (vb), [c] "r" (vc)       \
4047
        : "cc"                                           \
4048
    )
4049
/* Count leading zeros.
 *
 * Issues a single cntlzw (count leading zeros, word) on va and places the
 * count in vn.
 *
 * va: input register operand. vn: write-only register operand.
 * No clobber list is given.
 */
4050
#define SP_ASM_LZCNT(va, vn)                             \
4051
    __asm__ __volatile__ (                               \
4052
        "cntlzw %[n], %[a]  \n\t"                    \
4053
        : [n] "=r" (vn)                                  \
4054
        : [a] "r" (va)                                   \
4055
    )
4056
4057
    #endif /* !defined(__APPLE__) */
4058
4059
#define SP_INT_ASM_AVAILABLE
4060
4061
    #endif /* WOLFSSL_SP_PPC && SP_WORD_SIZE == 32 */
4062
4063
    #if defined(WOLFSSL_SP_MIPS64) && SP_WORD_SIZE == 64
4064
/*
4065
 * CPU: MIPS 64-bit
4066
 */
4067
4068
/* Multiply va by vb and store double size result in: vh | vl
 *
 * dmultu leaves the unsigned product in the lo/hi register pair; mflo and
 * mfhi then copy the low word to vl and the high word to vh.
 *
 * vl, vh: declared read-write ("+r") although their incoming values are
 * not used. va, vb: input register operands.
 * Declares the "$lo" and "$hi" clobbers.
 */
4069
#define SP_ASM_MUL(vl, vh, va, vb)                       \
4070
    __asm__ __volatile__ (                               \
4071
        "dmultu %[a], %[b]    \n\t"            \
4072
        "mflo %[l]      \n\t"            \
4073
        "mfhi %[h]      \n\t"            \
4074
        : [h] "+r" (vh), [l] "+r" (vl)                   \
4075
        : [a] "r" (va), [b] "r" (vb)                     \
4076
        : "$lo", "$hi"                                   \
4077
    )
4078
/* Multiply va by vb and store double size result in: vo | vh | vl
 *
 * dmultu leaves the unsigned product in lo/hi; mflo and mfhi copy it to vl
 * and vh, and vo is set to zero by moving register $0 into it.
 *
 * vl, vh: declared read-write ("+r") although their incoming values are
 * not used. vo: write-only register operand. va, vb: input register
 * operands. Declares the "$lo" and "$hi" clobbers.
 */
4079
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb)               \
4080
    __asm__ __volatile__ (                               \
4081
        "dmultu %[a], %[b]    \n\t"            \
4082
        "mflo %[l]      \n\t"            \
4083
        "mfhi %[h]      \n\t"            \
4084
        "move %[o], $0    \n\t"            \
4085
        : [l] "+r" (vl), [h] "+r" (vh), [o] "=r" (vo)    \
4086
        : [a] "r" (va), [b] "r" (vb)                     \
4087
        : "$lo", "$hi"                                   \
4088
    )
4089
/* Multiply va by vb and add double size result into: vo | vh | vl
 *
 * The product is copied from lo/hi into scratch $10 (low word) and $11
 * (high word). There is no carry flag, so after each addition sltu sets
 * $12 to 1 when the sum is below the value just added (the add wrapped);
 * that carry is then added to the next word up. The carry from vl is added
 * to vh before the high word so that each step carries at most once.
 *
 * vl, vh, vo: read-write register operands. va, vb: input register
 * operands. Declares the "$10", "$11", "$12", "$lo" and "$hi" clobbers.
 */
4090
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb)               \
4091
    __asm__ __volatile__ (                               \
4092
        "dmultu %[a], %[b]    \n\t"            \
4093
        "mflo $10     \n\t"            \
4094
        "mfhi $11     \n\t"            \
4095
        "daddu  %[l], %[l], $10   \n\t"            \
4096
        "sltu $12, %[l], $10    \n\t"            \
4097
        "daddu  %[h], %[h], $12   \n\t"            \
4098
        "sltu $12, %[h], $12    \n\t"            \
4099
        "daddu  %[o], %[o], $12   \n\t"            \
4100
        "daddu  %[h], %[h], $11   \n\t"            \
4101
        "sltu $12, %[h], $11    \n\t"            \
4102
        "daddu  %[o], %[o], $12   \n\t"            \
4103
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4104
        : [a] "r" (va), [b] "r" (vb)                     \
4105
        : "$10", "$11", "$12", "$lo", "$hi"              \
4106
    )
4107
/* Multiply va by vb and add double size result into: vh | vl
 *
 * The product is copied from lo/hi into scratch $10 (low word) and $11
 * (high word). The low word is added to vl and sltu records in $12 whether
 * that add wrapped; the high word and that carry are then added to vh.
 * There is no third word: a carry out of vh is not kept.
 *
 * vl, vh: read-write register operands. va, vb: input register operands.
 * Declares the "$10", "$11", "$12", "$lo" and "$hi" clobbers.
 */
4108
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb)                \
4109
    __asm__ __volatile__ (                               \
4110
        "dmultu %[a], %[b]    \n\t"            \
4111
        "mflo $10     \n\t"            \
4112
        "mfhi $11     \n\t"            \
4113
        "daddu  %[l], %[l], $10   \n\t"            \
4114
        "sltu $12, %[l], $10    \n\t"            \
4115
        "daddu  %[h], %[h], $11   \n\t"            \
4116
        "daddu  %[h], %[h], $12   \n\t"            \
4117
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4118
        : [a] "r" (va), [b] "r" (vb)                     \
4119
        : "$10", "$11", "$12", "$lo", "$hi"              \
4120
    )
4121
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 *
 * The product is computed once and copied from lo/hi into scratch $10 (low
 * word) and $11 (high word). The eight instruction add-with-carry sequence
 * (add, then sltu to detect a wrapped sum, then add that carry to the next
 * word up) is issued two times with the same product.
 *
 * vl, vh, vo: read-write register operands. va, vb: input register
 * operands. Declares the "$10", "$11", "$12", "$lo" and "$hi" clobbers.
 */
4122
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb)              \
4123
    __asm__ __volatile__ (                               \
4124
        "dmultu %[a], %[b]    \n\t"            \
4125
        "mflo $10     \n\t"            \
4126
        "mfhi $11     \n\t"            \
4127
        "daddu  %[l], %[l], $10   \n\t"            \
4128
        "sltu $12, %[l], $10    \n\t"            \
4129
        "daddu  %[h], %[h], $12   \n\t"            \
4130
        "sltu $12, %[h], $12    \n\t"            \
4131
        "daddu  %[o], %[o], $12   \n\t"            \
4132
        "daddu  %[h], %[h], $11   \n\t"            \
4133
        "sltu $12, %[h], $11    \n\t"            \
4134
        "daddu  %[o], %[o], $12   \n\t"            \
4135
        "daddu  %[l], %[l], $10   \n\t"            \
4136
        "sltu $12, %[l], $10    \n\t"            \
4137
        "daddu  %[h], %[h], $12   \n\t"            \
4138
        "sltu $12, %[h], $12    \n\t"            \
4139
        "daddu  %[o], %[o], $12   \n\t"            \
4140
        "daddu  %[h], %[h], $11   \n\t"            \
4141
        "sltu $12, %[h], $11    \n\t"            \
4142
        "daddu  %[o], %[o], $12   \n\t"            \
4143
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4144
        : [a] "r" (va), [b] "r" (vb)                     \
4145
        : "$10", "$11", "$12", "$lo", "$hi"              \
4146
    )
4147
/* Multiply va by vb and add double size result twice into: vo | vh | vl
4148
 * Assumes first add will not overflow vh | vl
4149
 *
 * The product is computed once and copied from lo/hi into scratch $10 (low
 * word) and $11 (high word). The first add only carries from vl into vh;
 * nothing is added to vo. The second add uses the full sequence, with sltu
 * detecting each wrapped sum and the carry being added to the next word up,
 * including vo.
 *
 * vl, vh, vo: read-write register operands. va, vb: input register
 * operands. Declares the "$10", "$11", "$12", "$lo" and "$hi" clobbers.
 */
4150
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)           \
4151
    __asm__ __volatile__ (                               \
4152
        "dmultu %[a], %[b]    \n\t"            \
4153
        "mflo $10     \n\t"            \
4154
        "mfhi $11     \n\t"            \
4155
        "daddu  %[l], %[l], $10   \n\t"            \
4156
        "sltu $12, %[l], $10    \n\t"            \
4157
        "daddu  %[h], %[h], $11   \n\t"            \
4158
        "daddu  %[h], %[h], $12   \n\t"            \
4159
        "daddu  %[l], %[l], $10   \n\t"            \
4160
        "sltu $12, %[l], $10    \n\t"            \
4161
        "daddu  %[h], %[h], $12   \n\t"            \
4162
        "sltu $12, %[h], $12    \n\t"            \
4163
        "daddu  %[o], %[o], $12   \n\t"            \
4164
        "daddu  %[h], %[h], $11   \n\t"            \
4165
        "sltu $12, %[h], $11    \n\t"            \
4166
        "daddu  %[o], %[o], $12   \n\t"            \
4167
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4168
        : [a] "r" (va), [b] "r" (vb)                     \
4169
        : "$10", "$11", "$12", "$lo", "$hi"              \
4170
    )
4171
/* Square va and store double size result in: vh | vl
 *
 * dmultu multiplies va by itself, leaving the unsigned product in lo/hi;
 * mflo and mfhi then copy the low word to vl and the high word to vh.
 *
 * vl, vh: declared read-write ("+r") although their incoming values are
 * not used. va: input register operand.
 * Declares the "$lo" and "$hi" clobbers.
 */
4172
#define SP_ASM_SQR(vl, vh, va)                           \
4173
    __asm__ __volatile__ (                               \
4174
        "dmultu %[a], %[a]    \n\t"            \
4175
        "mflo %[l]      \n\t"            \
4176
        "mfhi %[h]      \n\t"            \
4177
        : [h] "+r" (vh), [l] "+r" (vl)                   \
4178
        : [a] "r" (va)                                   \
4179
        : "$lo", "$hi"                                   \
4180
    )
4181
/* Square va and add double size result into: vo | vh | vl
 *
 * va * va is copied from lo/hi into scratch $10 (low word) and $11 (high
 * word). After each addition sltu sets $12 to 1 when the sum is below the
 * value just added (the add wrapped); that carry is then added to the next
 * word up.
 *
 * vl, vh, vo: read-write register operands. va: input register operand.
 * Declares the "$10", "$11", "$12", "$lo" and "$hi" clobbers.
 */
4182
#define SP_ASM_SQR_ADD(vl, vh, vo, va)                   \
4183
    __asm__ __volatile__ (                               \
4184
        "dmultu %[a], %[a]    \n\t"            \
4185
        "mflo $10     \n\t"            \
4186
        "mfhi $11     \n\t"            \
4187
        "daddu  %[l], %[l], $10   \n\t"            \
4188
        "sltu $12, %[l], $10    \n\t"            \
4189
        "daddu  %[h], %[h], $12   \n\t"            \
4190
        "sltu $12, %[h], $12    \n\t"            \
4191
        "daddu  %[o], %[o], $12   \n\t"            \
4192
        "daddu  %[h], %[h], $11   \n\t"            \
4193
        "sltu $12, %[h], $11    \n\t"            \
4194
        "daddu  %[o], %[o], $12   \n\t"            \
4195
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4196
        : [a] "r" (va)                                   \
4197
        : "$10", "$11", "$12", "$lo", "$hi"              \
4198
    )
4199
/* Square va and add double size result into: vh | vl
 *
 * va * va is copied from lo/hi into scratch $10 (low word) and $11 (high
 * word). The low word is added to vl and sltu records in $12 whether that
 * add wrapped; the high word and that carry are then added to vh.
 * There is no third word: a carry out of vh is not kept.
 *
 * vl, vh: read-write register operands. va: input register operand.
 * Declares the "$10", "$11", "$12", "$lo" and "$hi" clobbers.
 */
4200
#define SP_ASM_SQR_ADD_NO(vl, vh, va)                    \
4201
    __asm__ __volatile__ (                               \
4202
        "dmultu %[a], %[a]    \n\t"            \
4203
        "mflo $10     \n\t"            \
4204
        "mfhi $11     \n\t"            \
4205
        "daddu  %[l], %[l], $10   \n\t"            \
4206
        "sltu $12, %[l], $10    \n\t"            \
4207
        "daddu  %[h], %[h], $11   \n\t"            \
4208
        "daddu  %[h], %[h], $12   \n\t"            \
4209
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4210
        : [a] "r" (va)                                   \
4211
        : "$10", "$11", "$12", "$lo", "$hi"              \
4212
    )
4213
/* Add va into: vh | vl
 *
 * va is added to vl, sltu sets scratch $12 to 1 when the sum is below va
 * (the add wrapped), and that carry is added to vh. A carry out of vh is
 * not kept.
 *
 * vl, vh: read-write register operands. va: input register operand.
 * Declares the "$12" clobber.
 * NOTE(review): va is read by sltu after vl has been written and vl is not
 * marked early-clobber - confirm they can never share a register.
 */
4214
#define SP_ASM_ADDC(vl, vh, va)                          \
4215
    __asm__ __volatile__ (                               \
4216
        "daddu  %[l], %[l], %[a]  \n\t"            \
4217
        "sltu $12, %[l], %[a]   \n\t"            \
4218
        "daddu  %[h], %[h], $12   \n\t"            \
4219
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4220
        : [a] "r" (va)                                   \
4221
        : "$12"                                          \
4222
    )
4223
/* Sub va from: vh | vl
 *
 * The incoming vl is saved in scratch $12 and va is subtracted from it into
 * vl. sltu then sets $12 to 1 when the saved value is below the result (the
 * subtraction wrapped), and that borrow is subtracted from vh.
 *
 * vl, vh: read-write register operands. va: input register operand.
 * Declares the "$12" clobber.
 */
4224
#define SP_ASM_SUBB(vl, vh, va)                          \
4225
    __asm__ __volatile__ (                               \
4226
        "move $12, %[l]   \n\t"            \
4227
        "dsubu  %[l], $12, %[a]   \n\t"            \
4228
        "sltu $12, $12, %[l]    \n\t"            \
4229
        "dsubu  %[h], %[h], $12   \n\t"            \
4230
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4231
        : [a] "r" (va)                                   \
4232
        : "$12"                                          \
4233
    )
4234
/* Add two times vc | vb | va into vo | vh | vl
 *
 * One pass adds va to vl, carries into vh and vo, adds vb to vh, then adds
 * vc and the carry from vh to vo; sltu computes each carry into scratch $12
 * by testing whether the sum is below the value just added. The nine
 * instruction pass is issued two times. A carry out of vo is not kept.
 *
 * vl, vh, vo: read-write register operands. va, vb, vc: input register
 * operands. Declares the "$12" clobber.
 * NOTE(review): va, vb and vc are read after vl, vh and vo have been
 * written and the outputs are not marked early-clobber - confirm an input
 * can never share a register with an output.
 */
4235
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc)         \
4236
    __asm__ __volatile__ (                               \
4237
        "daddu  %[l], %[l], %[a]  \n\t"            \
4238
        "sltu $12, %[l], %[a]   \n\t"            \
4239
        "daddu  %[h], %[h], $12   \n\t"            \
4240
        "sltu $12, %[h], $12    \n\t"            \
4241
        "daddu  %[o], %[o], $12   \n\t"            \
4242
        "daddu  %[h], %[h], %[b]  \n\t"            \
4243
        "sltu $12, %[h], %[b]   \n\t"            \
4244
        "daddu  %[o], %[o], %[c]  \n\t"            \
4245
        "daddu  %[o], %[o], $12   \n\t"            \
4246
        "daddu  %[l], %[l], %[a]  \n\t"            \
4247
        "sltu $12, %[l], %[a]   \n\t"            \
4248
        "daddu  %[h], %[h], $12   \n\t"            \
4249
        "sltu $12, %[h], $12    \n\t"            \
4250
        "daddu  %[o], %[o], $12   \n\t"            \
4251
        "daddu  %[h], %[h], %[b]  \n\t"            \
4252
        "sltu $12, %[h], %[b]   \n\t"            \
4253
        "daddu  %[o], %[o], %[c]  \n\t"            \
4254
        "daddu  %[o], %[o], $12   \n\t"            \
4255
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4256
        : [a] "r" (va), [b] "r" (vb), [c] "r" (vc)       \
4257
        : "$12"                                          \
4258
    )
4259
4260
#define SP_INT_ASM_AVAILABLE
4261
4262
    #endif /* WOLFSSL_SP_MIPS64 && SP_WORD_SIZE == 64 */
4263
4264
    #if defined(WOLFSSL_SP_MIPS) && SP_WORD_SIZE == 32
4265
/*
4266
 * CPU: MIPS 32-bit
4267
 */
4268
4269
/* Multiply va by vb and store double size result in: vh | vl
 *
 * multu leaves the unsigned product in the lo/hi register pair; mflo and
 * mfhi then copy the low word to vl and the high word to vh.
 *
 * vl, vh: declared read-write ("+r") although their incoming values are
 * not used. va, vb: input register operands.
 * Declares the "%lo" and "%hi" clobbers.
 */
4270
#define SP_ASM_MUL(vl, vh, va, vb)                       \
4271
    __asm__ __volatile__ (                               \
4272
        "multu  %[a], %[b]    \n\t"            \
4273
        "mflo %[l]      \n\t"            \
4274
        "mfhi %[h]      \n\t"            \
4275
        : [h] "+r" (vh), [l] "+r" (vl)                   \
4276
        : [a] "r" (va), [b] "r" (vb)                     \
4277
        : "%lo", "%hi"                                   \
4278
    )
4279
/* Multiply va by vb and store double size result in: vo | vh | vl
 *
 * multu leaves the unsigned product in lo/hi; mflo and mfhi copy it to vl
 * and vh, and vo is set to zero by moving register $0 into it.
 *
 * vl, vh: declared read-write ("+r") although their incoming values are
 * not used. vo: write-only register operand. va, vb: input register
 * operands. Declares the "%lo" and "%hi" clobbers.
 */
4280
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb)               \
4281
    __asm__ __volatile__ (                               \
4282
        "multu  %[a], %[b]    \n\t"            \
4283
        "mflo %[l]      \n\t"            \
4284
        "mfhi %[h]      \n\t"            \
4285
        "move %[o], $0    \n\t"            \
4286
        : [l] "+r" (vl), [h] "+r" (vh), [o] "=r" (vo)    \
4287
        : [a] "r" (va), [b] "r" (vb)                     \
4288
        : "%lo", "%hi"                                   \
4289
    )
4290
/* Multiply va by vb and add double size result into: vo | vh | vl
 *
 * The product is copied from lo/hi into scratch $10 (low word) and $11
 * (high word). There is no carry flag, so after each addition sltu sets
 * $12 to 1 when the sum is below the value just added (the add wrapped);
 * that carry is then added to the next word up. The carry from vl is added
 * to vh before the high word so that each step carries at most once.
 *
 * vl, vh, vo: read-write register operands. va, vb: input register
 * operands. Declares the "$10", "$11", "$12", "%lo" and "%hi" clobbers.
 */
4291
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb)               \
4292
    __asm__ __volatile__ (                               \
4293
        "multu  %[a], %[b]    \n\t"            \
4294
        "mflo $10     \n\t"            \
4295
        "mfhi $11     \n\t"            \
4296
        "addu %[l], %[l], $10   \n\t"            \
4297
        "sltu $12, %[l], $10    \n\t"            \
4298
        "addu %[h], %[h], $12   \n\t"            \
4299
        "sltu $12, %[h], $12    \n\t"            \
4300
        "addu %[o], %[o], $12   \n\t"            \
4301
        "addu %[h], %[h], $11   \n\t"            \
4302
        "sltu $12, %[h], $11    \n\t"            \
4303
        "addu %[o], %[o], $12   \n\t"            \
4304
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4305
        : [a] "r" (va), [b] "r" (vb)                     \
4306
        : "$10", "$11", "$12", "%lo", "%hi"              \
4307
    )
4308
/* Multiply va by vb and add double size result into: vh | vl
 *
 * The product is copied from lo/hi into scratch $10 (low word) and $11
 * (high word). The low word is added to vl and sltu records in $12 whether
 * that add wrapped; the high word and that carry are then added to vh.
 * There is no third word: a carry out of vh is not kept.
 *
 * vl, vh: read-write register operands. va, vb: input register operands.
 * Declares the "$10", "$11", "$12", "%lo" and "%hi" clobbers.
 */
4309
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb)                \
4310
    __asm__ __volatile__ (                               \
4311
        "multu  %[a], %[b]    \n\t"            \
4312
        "mflo $10     \n\t"            \
4313
        "mfhi $11     \n\t"            \
4314
        "addu %[l], %[l], $10   \n\t"            \
4315
        "sltu $12, %[l], $10    \n\t"            \
4316
        "addu %[h], %[h], $11   \n\t"            \
4317
        "addu %[h], %[h], $12   \n\t"            \
4318
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4319
        : [a] "r" (va), [b] "r" (vb)                     \
4320
        : "$10", "$11", "$12", "%lo", "%hi"              \
4321
    )
4322
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 *
 * The product is computed once and copied from lo/hi into scratch $10 (low
 * word) and $11 (high word). The eight instruction add-with-carry sequence
 * (add, then sltu to detect a wrapped sum, then add that carry to the next
 * word up) is issued two times with the same product.
 *
 * vl, vh, vo: read-write register operands. va, vb: input register
 * operands. Declares the "$10", "$11", "$12", "%lo" and "%hi" clobbers.
 */
4323
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb)              \
4324
    __asm__ __volatile__ (                               \
4325
        "multu  %[a], %[b]    \n\t"            \
4326
        "mflo $10     \n\t"            \
4327
        "mfhi $11     \n\t"            \
4328
        "addu %[l], %[l], $10   \n\t"            \
4329
        "sltu $12, %[l], $10    \n\t"            \
4330
        "addu %[h], %[h], $12   \n\t"            \
4331
        "sltu $12, %[h], $12    \n\t"            \
4332
        "addu %[o], %[o], $12   \n\t"            \
4333
        "addu %[h], %[h], $11   \n\t"            \
4334
        "sltu $12, %[h], $11    \n\t"            \
4335
        "addu %[o], %[o], $12   \n\t"            \
4336
        "addu %[l], %[l], $10   \n\t"            \
4337
        "sltu $12, %[l], $10    \n\t"            \
4338
        "addu %[h], %[h], $12   \n\t"            \
4339
        "sltu $12, %[h], $12    \n\t"            \
4340
        "addu %[o], %[o], $12   \n\t"            \
4341
        "addu %[h], %[h], $11   \n\t"            \
4342
        "sltu $12, %[h], $11    \n\t"            \
4343
        "addu %[o], %[o], $12   \n\t"            \
4344
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4345
        : [a] "r" (va), [b] "r" (vb)                     \
4346
        : "$10", "$11", "$12", "%lo", "%hi"              \
4347
    )
4348
/* Multiply va by vb and add double size result twice into: vo | vh | vl
4349
 * Assumes first add will not overflow vh | vl
4350
 *
 * The product is computed once and copied from lo/hi into scratch $10 (low
 * word) and $11 (high word). The first add only carries from vl into vh;
 * nothing is added to vo. The second add uses the full sequence, with sltu
 * detecting each wrapped sum and the carry being added to the next word up,
 * including vo.
 *
 * vl, vh, vo: read-write register operands. va, vb: input register
 * operands. Declares the "$10", "$11", "$12", "%lo" and "%hi" clobbers.
 */
4351
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)           \
4352
    __asm__ __volatile__ (                               \
4353
        "multu  %[a], %[b]    \n\t"            \
4354
        "mflo $10     \n\t"            \
4355
        "mfhi $11     \n\t"            \
4356
        "addu %[l], %[l], $10   \n\t"            \
4357
        "sltu $12, %[l], $10    \n\t"            \
4358
        "addu %[h], %[h], $11   \n\t"            \
4359
        "addu %[h], %[h], $12   \n\t"            \
4360
        "addu %[l], %[l], $10   \n\t"            \
4361
        "sltu $12, %[l], $10    \n\t"            \
4362
        "addu %[h], %[h], $12   \n\t"            \
4363
        "sltu $12, %[h], $12    \n\t"            \
4364
        "addu %[o], %[o], $12   \n\t"            \
4365
        "addu %[h], %[h], $11   \n\t"            \
4366
        "sltu $12, %[h], $11    \n\t"            \
4367
        "addu %[o], %[o], $12   \n\t"            \
4368
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4369
        : [a] "r" (va), [b] "r" (vb)                     \
4370
        : "$10", "$11", "$12", "%lo", "%hi"              \
4371
    )
4372
/* Square va and store double size result in: vh | vl
 *
 * multu multiplies va by itself, leaving the unsigned product in lo/hi;
 * mflo and mfhi then copy the low word to vl and the high word to vh.
 *
 * vl, vh: declared read-write ("+r") although their incoming values are
 * not used. va: input register operand.
 * Declares the "%lo" and "%hi" clobbers.
 */
4373
#define SP_ASM_SQR(vl, vh, va)                           \
4374
    __asm__ __volatile__ (                               \
4375
        "multu  %[a], %[a]    \n\t"            \
4376
        "mflo %[l]      \n\t"            \
4377
        "mfhi %[h]      \n\t"            \
4378
        : [h] "+r" (vh), [l] "+r" (vl)                   \
4379
        : [a] "r" (va)                                   \
4380
        : "%lo", "%hi"                                   \
4381
    )
4382
/* Square va and add double size result into: vo | vh | vl
 *
 * va * va is copied from lo/hi into scratch $10 (low word) and $11 (high
 * word). After each addition sltu sets $12 to 1 when the sum is below the
 * value just added (the add wrapped); that carry is then added to the next
 * word up.
 *
 * vl, vh, vo: read-write register operands. va: input register operand.
 * Declares the "$10", "$11", "$12", "%lo" and "%hi" clobbers.
 */
4383
#define SP_ASM_SQR_ADD(vl, vh, vo, va)                   \
4384
    __asm__ __volatile__ (                               \
4385
        "multu  %[a], %[a]    \n\t"            \
4386
        "mflo $10     \n\t"            \
4387
        "mfhi $11     \n\t"            \
4388
        "addu %[l], %[l], $10   \n\t"            \
4389
        "sltu $12, %[l], $10    \n\t"            \
4390
        "addu %[h], %[h], $12   \n\t"            \
4391
        "sltu $12, %[h], $12    \n\t"            \
4392
        "addu %[o], %[o], $12   \n\t"            \
4393
        "addu %[h], %[h], $11   \n\t"            \
4394
        "sltu $12, %[h], $11    \n\t"            \
4395
        "addu %[o], %[o], $12   \n\t"            \
4396
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4397
        : [a] "r" (va)                                   \
4398
        : "$10", "$11", "$12", "%lo", "%hi"              \
4399
    )
4400
/* Square va and add double size result into: vh | vl
 *
 * va * va is copied from lo/hi into scratch $10 (low word) and $11 (high
 * word). The low word is added to vl and sltu records in $12 whether that
 * add wrapped; the high word and that carry are then added to vh.
 * There is no third word: a carry out of vh is not kept.
 *
 * vl, vh: read-write register operands. va: input register operand.
 * Declares the "$10", "$11", "$12", "%lo" and "%hi" clobbers.
 */
4401
#define SP_ASM_SQR_ADD_NO(vl, vh, va)                    \
4402
    __asm__ __volatile__ (                               \
4403
        "multu  %[a], %[a]    \n\t"            \
4404
        "mflo $10     \n\t"            \
4405
        "mfhi $11     \n\t"            \
4406
        "addu %[l], %[l], $10   \n\t"            \
4407
        "sltu $12, %[l], $10    \n\t"            \
4408
        "addu %[h], %[h], $11   \n\t"            \
4409
        "addu %[h], %[h], $12   \n\t"            \
4410
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4411
        : [a] "r" (va)                                   \
4412
        : "$10", "$11", "$12", "%lo", "%hi"              \
4413
    )
4414
/* Add va into: vh | vl
 *
 * va is added to vl, sltu sets scratch $12 to 1 when the sum is below va
 * (the add wrapped), and that carry is added to vh. A carry out of vh is
 * not kept.
 *
 * vl, vh: read-write register operands. va: input register operand.
 * Declares the "$12" clobber.
 * NOTE(review): va is read by sltu after vl has been written and vl is not
 * marked early-clobber - confirm they can never share a register.
 */
4415
#define SP_ASM_ADDC(vl, vh, va)                          \
4416
    __asm__ __volatile__ (                               \
4417
        "addu %[l], %[l], %[a]  \n\t"            \
4418
        "sltu $12, %[l], %[a]   \n\t"            \
4419
        "addu %[h], %[h], $12   \n\t"            \
4420
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4421
        : [a] "r" (va)                                   \
4422
        : "$12"                                          \
4423
    )
4424
/* Sub va from: vh | vl
 *
 * The incoming vl is saved in scratch $12 and va is subtracted from it into
 * vl. sltu then sets $12 to 1 when the saved value is below the result (the
 * subtraction wrapped), and that borrow is subtracted from vh.
 *
 * vl, vh: read-write register operands. va: input register operand.
 * Declares the "$12" clobber.
 */
4425
#define SP_ASM_SUBB(vl, vh, va)                          \
4426
    __asm__ __volatile__ (                               \
4427
        "move $12, %[l]   \n\t"            \
4428
        "subu %[l], $12, %[a]   \n\t"            \
4429
        "sltu $12, $12, %[l]    \n\t"            \
4430
        "subu %[h], %[h], $12   \n\t"            \
4431
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4432
        : [a] "r" (va)                                   \
4433
        : "$12"                                          \
4434
    )
4435
/* Add two times vc | vb | va into vo | vh | vl
 *
 * One pass adds va to vl, carries into vh and vo, adds vb to vh, then adds
 * vc and the carry from vh to vo; sltu computes each carry into scratch $12
 * by testing whether the sum is below the value just added. The nine
 * instruction pass is issued two times. A carry out of vo is not kept.
 *
 * vl, vh, vo: read-write register operands. va, vb, vc: input register
 * operands. Declares the "$12" clobber.
 * NOTE(review): va, vb and vc are read after vl, vh and vo have been
 * written and the outputs are not marked early-clobber - confirm an input
 * can never share a register with an output.
 */
4436
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc)         \
4437
    __asm__ __volatile__ (                               \
4438
        "addu %[l], %[l], %[a]  \n\t"            \
4439
        "sltu $12, %[l], %[a]   \n\t"            \
4440
        "addu %[h], %[h], $12   \n\t"            \
4441
        "sltu $12, %[h], $12    \n\t"            \
4442
        "addu %[o], %[o], $12   \n\t"            \
4443
        "addu %[h], %[h], %[b]  \n\t"            \
4444
        "sltu $12, %[h], %[b]   \n\t"            \
4445
        "addu %[o], %[o], %[c]  \n\t"            \
4446
        "addu %[o], %[o], $12   \n\t"            \
4447
        "addu %[l], %[l], %[a]  \n\t"            \
4448
        "sltu $12, %[l], %[a]   \n\t"            \
4449
        "addu %[h], %[h], $12   \n\t"            \
4450
        "sltu $12, %[h], $12    \n\t"            \
4451
        "addu %[o], %[o], $12   \n\t"            \
4452
        "addu %[h], %[h], %[b]  \n\t"            \
4453
        "sltu $12, %[h], %[b]   \n\t"            \
4454
        "addu %[o], %[o], %[c]  \n\t"            \
4455
        "addu %[o], %[o], $12   \n\t"            \
4456
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4457
        : [a] "r" (va), [b] "r" (vb), [c] "r" (vc)       \
4458
        : "$12"                                          \
4459
    )
4460
4461
#define SP_INT_ASM_AVAILABLE
4462
4463
    #endif /* WOLFSSL_SP_MIPS && SP_WORD_SIZE == 32 */
4464
4465
    #if defined(WOLFSSL_SP_RISCV64) && SP_WORD_SIZE == 64
4466
/*
4467
 * CPU: RISCV 64-bit
4468
 */
4469
4470
/* Multiply va by vb and store double size result in: vh | vl
 *
 * mul writes the low word of the product to vl and mulhu writes the high
 * word of the unsigned product to vh.
 *
 * vl, vh: declared read-write ("+r") although their incoming values are
 * not used. va, vb: input register operands. The clobber list is empty.
 * NOTE(review): va and vb are read by mulhu after vl has been written and
 * vl is not marked early-clobber - confirm they can never share a register.
 */
4471
#define SP_ASM_MUL(vl, vh, va, vb)                       \
4472
    __asm__ __volatile__ (                               \
4473
        "mul  %[l], %[a], %[b]  \n\t"            \
4474
        "mulhu  %[h], %[a], %[b]  \n\t"            \
4475
        : [h] "+r" (vh), [l] "+r" (vl)                   \
4476
        : [a] "r" (va), [b] "r" (vb)                     \
4477
        :                                                \
4478
    )
4479
/* Multiply va by vb and store double size result in: vo | vh | vl
 *
 * mulhu writes the high word of the unsigned product to vh, mul writes the
 * low word to vl, and vo is set to zero by adding the zero register to
 * itself.
 *
 * vl, vh: declared read-write ("+r") although their incoming values are
 * not used. vo: write-only register operand. va, vb: input register
 * operands. The clobber list is empty.
 * NOTE(review): va and vb are read by mul after vh has been written and vh
 * is not marked early-clobber - confirm they can never share a register.
 */
4480
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb)               \
4481
    __asm__ __volatile__ (                               \
4482
        "mulhu  %[h], %[a], %[b]  \n\t"            \
4483
        "mul  %[l], %[a], %[b]  \n\t"            \
4484
        "add  %[o], zero, zero  \n\t"            \
4485
        : [l] "+r" (vl), [h] "+r" (vh), [o] "=r" (vo)    \
4486
        : [a] "r" (va), [b] "r" (vb)                     \
4487
        :                                                \
4488
    )
4489
/* Multiply va by vb and add double size result into: vo | vh | vl
 *
 * The low and high words of the product are computed into scratch a5 and
 * a6. There is no carry flag, so after each addition sltu sets a7 to 1
 * when the sum is below the value just added (the add wrapped); that carry
 * is then added to the next word up. The carry from vl is added to vh
 * before the high word so that each step carries at most once.
 *
 * vl, vh, vo: read-write register operands. va, vb: input register
 * operands. Declares the "a5", "a6" and "a7" clobbers.
 */
4490
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb)               \
4491
    __asm__ __volatile__ (                               \
4492
        "mul  a5, %[a], %[b]    \n\t"            \
4493
        "mulhu  a6, %[a], %[b]    \n\t"            \
4494
        "add  %[l], %[l], a5    \n\t"            \
4495
        "sltu a7, %[l], a5    \n\t"            \
4496
        "add  %[h], %[h], a7    \n\t"            \
4497
        "sltu a7, %[h], a7    \n\t"            \
4498
        "add  %[o], %[o], a7    \n\t"            \
4499
        "add  %[h], %[h], a6    \n\t"            \
4500
        "sltu a7, %[h], a6    \n\t"            \
4501
        "add  %[o], %[o], a7    \n\t"            \
4502
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4503
        : [a] "r" (va), [b] "r" (vb)                     \
4504
        : "a5", "a6", "a7"                               \
4505
    )
4506
/* Multiply va by vb and add double size result into: vh | vl
 *
 * The low and high words of the product are computed into scratch a5 and
 * a6. The low word is added to vl and sltu records in a7 whether that add
 * wrapped; the high word and that carry are then added to vh.
 * There is no third word: a carry out of vh is not kept.
 *
 * vl, vh: read-write register operands. va, vb: input register operands.
 * Declares the "a5", "a6" and "a7" clobbers.
 */
4507
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb)                \
4508
    __asm__ __volatile__ (                               \
4509
        "mul  a5, %[a], %[b]    \n\t"            \
4510
        "mulhu  a6, %[a], %[b]    \n\t"            \
4511
        "add  %[l], %[l], a5    \n\t"            \
4512
        "sltu a7, %[l], a5    \n\t"            \
4513
        "add  %[h], %[h], a6    \n\t"            \
4514
        "add  %[h], %[h], a7    \n\t"            \
4515
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4516
        : [a] "r" (va), [b] "r" (vb)                     \
4517
        : "a5", "a6", "a7"                               \
4518
    )
4519
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 *
 * The product is computed once (a5 low word, a6 high word) and the same
 * add-and-propagate-carry sequence is then executed two times. a7 holds each
 * carry recovered by sltu. Clobbers a5, a6 and a7.
 */
4520
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb)              \
4521
    __asm__ __volatile__ (                               \
4522
        "mul  a5, %[a], %[b]    \n\t"            \
4523
        "mulhu  a6, %[a], %[b]    \n\t"            \
4524
        "add  %[l], %[l], a5    \n\t"            \
4525
        "sltu a7, %[l], a5    \n\t"            \
4526
        "add  %[h], %[h], a7    \n\t"            \
4527
        "sltu a7, %[h], a7    \n\t"            \
4528
        "add  %[o], %[o], a7    \n\t"            \
4529
        "add  %[h], %[h], a6    \n\t"            \
4530
        "sltu a7, %[h], a6    \n\t"            \
4531
        "add  %[o], %[o], a7    \n\t"            \
4532
        "add  %[l], %[l], a5    \n\t"            \
4533
        "sltu a7, %[l], a5    \n\t"            \
4534
        "add  %[h], %[h], a7    \n\t"            \
4535
        "sltu a7, %[h], a7    \n\t"            \
4536
        "add  %[o], %[o], a7    \n\t"            \
4537
        "add  %[h], %[h], a6    \n\t"            \
4538
        "sltu a7, %[h], a6    \n\t"            \
4539
        "add  %[o], %[o], a7    \n\t"            \
4540
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4541
        : [a] "r" (va), [b] "r" (vb)                     \
4542
        : "a5", "a6", "a7"                               \
4543
    )
4544
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 * Assumes first add will not overflow vh | vl
 *
 * The product is computed once (a5 low word, a6 high word). The first
 * addition only carries from vl into vh; the second addition also carries
 * into vo. Clobbers a5, a6 and a7.
 */
4547
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)           \
4548
    __asm__ __volatile__ (                               \
4549
        "mul  a5, %[a], %[b]    \n\t"            \
4550
        "mulhu  a6, %[a], %[b]    \n\t"            \
4551
        "add  %[l], %[l], a5    \n\t"            \
4552
        "sltu a7, %[l], a5    \n\t"            \
4553
        "add  %[h], %[h], a6    \n\t"            \
4554
        "add  %[h], %[h], a7    \n\t"            \
4555
        "add  %[l], %[l], a5    \n\t"            \
4556
        "sltu a7, %[l], a5    \n\t"            \
4557
        "add  %[h], %[h], a7    \n\t"            \
4558
        "sltu a7, %[h], a7    \n\t"            \
4559
        "add  %[o], %[o], a7    \n\t"            \
4560
        "add  %[h], %[h], a6    \n\t"            \
4561
        "sltu a7, %[h], a6    \n\t"            \
4562
        "add  %[o], %[o], a7    \n\t"            \
4563
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4564
        : [a] "r" (va), [b] "r" (vb)                     \
4565
        : "a5", "a6", "a7"                               \
4566
    )
4567
/* Square va and store double size result in: vh | vl
 *
 * mul writes the low word of va * va to vl and mulhu writes the high word to
 * vh. No scratch registers are used.
 */
4568
#define SP_ASM_SQR(vl, vh, va)                           \
4569
    __asm__ __volatile__ (                               \
4570
        "mul  %[l], %[a], %[a]  \n\t"            \
4571
        "mulhu  %[h], %[a], %[a]  \n\t"            \
4572
        : [h] "+r" (vh), [l] "+r" (vl)                   \
4573
        : [a] "r" (va)                                   \
4574
        :                                                \
4575
    )
4576
/* Square va and add double size result into: vo | vh | vl
 *
 * a5 and a6 receive the low and high words of va * va. After each add, sltu
 * leaves the carry in a7, which is added into the next word up.
 * Clobbers a5, a6 and a7.
 */
4577
#define SP_ASM_SQR_ADD(vl, vh, vo, va)                   \
4578
    __asm__ __volatile__ (                               \
4579
        "mul  a5, %[a], %[a]    \n\t"            \
4580
        "mulhu  a6, %[a], %[a]    \n\t"            \
4581
        "add  %[l], %[l], a5    \n\t"            \
4582
        "sltu a7, %[l], a5    \n\t"            \
4583
        "add  %[h], %[h], a7    \n\t"            \
4584
        "sltu a7, %[h], a7    \n\t"            \
4585
        "add  %[o], %[o], a7    \n\t"            \
4586
        "add  %[h], %[h], a6    \n\t"            \
4587
        "sltu a7, %[h], a6    \n\t"            \
4588
        "add  %[o], %[o], a7    \n\t"            \
4589
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4590
        : [a] "r" (va)                                   \
4591
        : "a5", "a6", "a7"                               \
4592
    )
4593
/* Square va and add double size result into: vh | vl
 *
 * a5 and a6 receive the low and high words of va * va. The carry out of vl
 * is recovered with sltu into a7 and added to vh; no carry out of vh is
 * computed. Clobbers a5, a6 and a7.
 */
4594
#define SP_ASM_SQR_ADD_NO(vl, vh, va)                    \
4595
    __asm__ __volatile__ (                               \
4596
        "mul  a5, %[a], %[a]    \n\t"            \
4597
        "mulhu  a6, %[a], %[a]    \n\t"            \
4598
        "add  %[l], %[l], a5    \n\t"            \
4599
        "sltu a7, %[l], a5    \n\t"            \
4600
        "add  %[h], %[h], a6    \n\t"            \
4601
        "add  %[h], %[h], a7    \n\t"            \
4602
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4603
        : [a] "r" (va)                                   \
4604
        : "a5", "a6", "a7"                               \
4605
    )
4606
/* Add va into: vh | vl
 *
 * va is added to vl; sltu then sets a7 to 1 when the sum is below va (a
 * carry out of vl) and a7 is added to vh. Clobbers a7.
 */
4607
#define SP_ASM_ADDC(vl, vh, va)                          \
4608
    __asm__ __volatile__ (                               \
4609
        "add  %[l], %[l], %[a]  \n\t"            \
4610
        "sltu a7, %[l], %[a]    \n\t"            \
4611
        "add  %[h], %[h], a7    \n\t"            \
4612
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4613
        : [a] "r" (va)                                   \
4614
        : "a7"                                           \
4615
    )
4616
/* Sub va from: vh | vl
 *
 * a7 first keeps a copy of the original vl. After the subtraction sltu turns
 * a7 into the borrow (1 when the original vl is below the new vl), which is
 * then subtracted from vh. Clobbers a7.
 */
4617
#define SP_ASM_SUBB(vl, vh, va)                          \
4618
    __asm__ __volatile__ (                               \
4619
        "add  a7, %[l], zero    \n\t"            \
4620
        "sub  %[l], a7, %[a]    \n\t"            \
4621
        "sltu a7, a7, %[l]    \n\t"            \
4622
        "sub  %[h], %[h], a7    \n\t"            \
4623
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4624
        : [a] "r" (va)                                   \
4625
        : "a7"                                           \
4626
    )
4627
/* Add two times vc | vb | va into vo | vh | vl
 *
 * The three-word addition is written out twice rather than doubling the
 * operand. Within each pass va is added to vl, vb to vh and vc to vo, with
 * sltu recovering every carry into a7 for the next word up. Clobbers a7.
 */
4628
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc)         \
4629
    __asm__ __volatile__ (                               \
4630
        "add  %[l], %[l], %[a]  \n\t"            \
4631
        "sltu a7, %[l], %[a]    \n\t"            \
4632
        "add  %[h], %[h], a7    \n\t"            \
4633
        "sltu a7, %[h], a7    \n\t"            \
4634
        "add  %[o], %[o], a7    \n\t"            \
4635
        "add  %[h], %[h], %[b]  \n\t"            \
4636
        "sltu a7, %[h], %[b]    \n\t"            \
4637
        "add  %[o], %[o], %[c]  \n\t"            \
4638
        "add  %[o], %[o], a7    \n\t"            \
4639
        "add  %[l], %[l], %[a]  \n\t"            \
4640
        "sltu a7, %[l], %[a]    \n\t"            \
4641
        "add  %[h], %[h], a7    \n\t"            \
4642
        "sltu a7, %[h], a7    \n\t"            \
4643
        "add  %[o], %[o], a7    \n\t"            \
4644
        "add  %[h], %[h], %[b]  \n\t"            \
4645
        "sltu a7, %[h], %[b]    \n\t"            \
4646
        "add  %[o], %[o], %[c]  \n\t"            \
4647
        "add  %[o], %[o], a7    \n\t"            \
4648
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4649
        : [a] "r" (va), [b] "r" (vb), [c] "r" (vc)       \
4650
        : "a7"                                           \
4651
    )
4652
4653
/* Marks that the RISC-V 64-bit SP_ASM_* macros are defined; tested by the
 * #ifdef SP_INT_ASM_AVAILABLE block that follows the CPU sections. */
#define SP_INT_ASM_AVAILABLE
4654
4655
    #endif /* WOLFSSL_SP_RISCV64 && SP_WORD_SIZE == 64 */
4656
4657
    #if defined(WOLFSSL_SP_RISCV32) && SP_WORD_SIZE == 32
4658
/*
4659
 * CPU: RISCV 32-bit
4660
 */
4661
4662
/* Multiply va by vb and store double size result in: vh | vl
 *
 * mul writes the low word of the product to vl and mulhu writes the high
 * word to vh. No scratch registers are used.
 * NOTE(review): vl is written before mulhu reads va/vb; presumably callers
 * never pass the same variable for vl and va/vb - confirm.
 */
4663
#define SP_ASM_MUL(vl, vh, va, vb)                       \
4664
    __asm__ __volatile__ (                               \
4665
        "mul  %[l], %[a], %[b]  \n\t"            \
4666
        "mulhu  %[h], %[a], %[b]  \n\t"            \
4667
        : [h] "+r" (vh), [l] "+r" (vl)                   \
4668
        : [a] "r" (va), [b] "r" (vb)                     \
4669
        :                                                \
4670
    )
4671
/* Multiply va by vb and store double size result in: vo | vh | vl
 *
 * mulhu writes the high word to vh, mul writes the low word to vl, and vo is
 * cleared last by adding the zero register to itself. vo is a write-only
 * ("=r") operand; vl and vh are declared read-write.
 */
4672
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb)               \
4673
    __asm__ __volatile__ (                               \
4674
        "mulhu  %[h], %[a], %[b]  \n\t"            \
4675
        "mul  %[l], %[a], %[b]  \n\t"            \
4676
        "add  %[o], zero, zero  \n\t"            \
4677
        : [l] "+r" (vl), [h] "+r" (vh), [o] "=r" (vo)    \
4678
        : [a] "r" (va), [b] "r" (vb)                     \
4679
        :                                                \
4680
    )
4681
/* Multiply va by vb and add double size result into: vo | vh | vl
 *
 * a5 and a6 receive the low and high words of the product. After each add,
 * sltu compares the sum with the value just added and leaves the carry
 * (0 or 1) in a7, which is then added into the next word up.
 * Clobbers a5, a6 and a7.
 */
4682
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb)               \
4683
    __asm__ __volatile__ (                               \
4684
        "mul  a5, %[a], %[b]    \n\t"            \
4685
        "mulhu  a6, %[a], %[b]    \n\t"            \
4686
        "add  %[l], %[l], a5    \n\t"            \
4687
        "sltu a7, %[l], a5    \n\t"            \
4688
        "add  %[h], %[h], a7    \n\t"            \
4689
        "sltu a7, %[h], a7    \n\t"            \
4690
        "add  %[o], %[o], a7    \n\t"            \
4691
        "add  %[h], %[h], a6    \n\t"            \
4692
        "sltu a7, %[h], a6    \n\t"            \
4693
        "add  %[o], %[o], a7    \n\t"            \
4694
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4695
        : [a] "r" (va), [b] "r" (vb)                     \
4696
        : "a5", "a6", "a7"                               \
4697
    )
4698
/* Multiply va by vb and add double size result into: vh | vl
 *
 * a5 and a6 receive the low and high product words. The carry out of vl is
 * recovered with sltu into a7 and added to vh; no carry out of vh is
 * computed. Clobbers a5, a6 and a7.
 */
4699
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb)                \
4700
    __asm__ __volatile__ (                               \
4701
        "mul  a5, %[a], %[b]    \n\t"            \
4702
        "mulhu  a6, %[a], %[b]    \n\t"            \
4703
        "add  %[l], %[l], a5    \n\t"            \
4704
        "sltu a7, %[l], a5    \n\t"            \
4705
        "add  %[h], %[h], a6    \n\t"            \
4706
        "add  %[h], %[h], a7    \n\t"            \
4707
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4708
        : [a] "r" (va), [b] "r" (vb)                     \
4709
        : "a5", "a6", "a7"                               \
4710
    )
4711
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 *
 * The product is computed once (a5 low word, a6 high word) and the same
 * add-and-propagate-carry sequence is then executed two times. a7 holds each
 * carry recovered by sltu. Clobbers a5, a6 and a7.
 */
4712
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb)              \
4713
    __asm__ __volatile__ (                               \
4714
        "mul  a5, %[a], %[b]    \n\t"            \
4715
        "mulhu  a6, %[a], %[b]    \n\t"            \
4716
        "add  %[l], %[l], a5    \n\t"            \
4717
        "sltu a7, %[l], a5    \n\t"            \
4718
        "add  %[h], %[h], a7    \n\t"            \
4719
        "sltu a7, %[h], a7    \n\t"            \
4720
        "add  %[o], %[o], a7    \n\t"            \
4721
        "add  %[h], %[h], a6    \n\t"            \
4722
        "sltu a7, %[h], a6    \n\t"            \
4723
        "add  %[o], %[o], a7    \n\t"            \
4724
        "add  %[l], %[l], a5    \n\t"            \
4725
        "sltu a7, %[l], a5    \n\t"            \
4726
        "add  %[h], %[h], a7    \n\t"            \
4727
        "sltu a7, %[h], a7    \n\t"            \
4728
        "add  %[o], %[o], a7    \n\t"            \
4729
        "add  %[h], %[h], a6    \n\t"            \
4730
        "sltu a7, %[h], a6    \n\t"            \
4731
        "add  %[o], %[o], a7    \n\t"            \
4732
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4733
        : [a] "r" (va), [b] "r" (vb)                     \
4734
        : "a5", "a6", "a7"                               \
4735
    )
4736
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 * Assumes first add will not overflow vh | vl
 *
 * The product is computed once (a5 low word, a6 high word). The first
 * addition only carries from vl into vh; the second addition also carries
 * into vo. Clobbers a5, a6 and a7.
 */
4739
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)           \
4740
    __asm__ __volatile__ (                               \
4741
        "mul  a5, %[a], %[b]    \n\t"            \
4742
        "mulhu  a6, %[a], %[b]    \n\t"            \
4743
        "add  %[l], %[l], a5    \n\t"            \
4744
        "sltu a7, %[l], a5    \n\t"            \
4745
        "add  %[h], %[h], a6    \n\t"            \
4746
        "add  %[h], %[h], a7    \n\t"            \
4747
        "add  %[l], %[l], a5    \n\t"            \
4748
        "sltu a7, %[l], a5    \n\t"            \
4749
        "add  %[h], %[h], a7    \n\t"            \
4750
        "sltu a7, %[h], a7    \n\t"            \
4751
        "add  %[o], %[o], a7    \n\t"            \
4752
        "add  %[h], %[h], a6    \n\t"            \
4753
        "sltu a7, %[h], a6    \n\t"            \
4754
        "add  %[o], %[o], a7    \n\t"            \
4755
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4756
        : [a] "r" (va), [b] "r" (vb)                     \
4757
        : "a5", "a6", "a7"                               \
4758
    )
4759
/* Square va and store double size result in: vh | vl
 *
 * mul writes the low word of va * va to vl and mulhu writes the high word to
 * vh. No scratch registers are used.
 */
4760
#define SP_ASM_SQR(vl, vh, va)                           \
4761
    __asm__ __volatile__ (                               \
4762
        "mul  %[l], %[a], %[a]  \n\t"            \
4763
        "mulhu  %[h], %[a], %[a]  \n\t"            \
4764
        : [h] "+r" (vh), [l] "+r" (vl)                   \
4765
        : [a] "r" (va)                                   \
4766
        :                                                \
4767
    )
4768
/* Square va and add double size result into: vo | vh | vl
 *
 * a5 and a6 receive the low and high words of va * va. After each add, sltu
 * leaves the carry in a7, which is added into the next word up.
 * Clobbers a5, a6 and a7.
 */
4769
#define SP_ASM_SQR_ADD(vl, vh, vo, va)                   \
4770
    __asm__ __volatile__ (                               \
4771
        "mul  a5, %[a], %[a]    \n\t"            \
4772
        "mulhu  a6, %[a], %[a]    \n\t"            \
4773
        "add  %[l], %[l], a5    \n\t"            \
4774
        "sltu a7, %[l], a5    \n\t"            \
4775
        "add  %[h], %[h], a7    \n\t"            \
4776
        "sltu a7, %[h], a7    \n\t"            \
4777
        "add  %[o], %[o], a7    \n\t"            \
4778
        "add  %[h], %[h], a6    \n\t"            \
4779
        "sltu a7, %[h], a6    \n\t"            \
4780
        "add  %[o], %[o], a7    \n\t"            \
4781
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4782
        : [a] "r" (va)                                   \
4783
        : "a5", "a6", "a7"                               \
4784
    )
4785
/* Square va and add double size result into: vh | vl
 *
 * a5 and a6 receive the low and high words of va * va. The carry out of vl
 * is recovered with sltu into a7 and added to vh; no carry out of vh is
 * computed. Clobbers a5, a6 and a7.
 */
4786
#define SP_ASM_SQR_ADD_NO(vl, vh, va)                    \
4787
    __asm__ __volatile__ (                               \
4788
        "mul  a5, %[a], %[a]    \n\t"            \
4789
        "mulhu  a6, %[a], %[a]    \n\t"            \
4790
        "add  %[l], %[l], a5    \n\t"            \
4791
        "sltu a7, %[l], a5    \n\t"            \
4792
        "add  %[h], %[h], a6    \n\t"            \
4793
        "add  %[h], %[h], a7    \n\t"            \
4794
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4795
        : [a] "r" (va)                                   \
4796
        : "a5", "a6", "a7"                               \
4797
    )
4798
/* Add va into: vh | vl
 *
 * va is added to vl; sltu then sets a7 to 1 when the sum is below va (a
 * carry out of vl) and a7 is added to vh. Clobbers a7.
 */
4799
#define SP_ASM_ADDC(vl, vh, va)                          \
4800
    __asm__ __volatile__ (                               \
4801
        "add  %[l], %[l], %[a]  \n\t"            \
4802
        "sltu a7, %[l], %[a]    \n\t"            \
4803
        "add  %[h], %[h], a7    \n\t"            \
4804
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4805
        : [a] "r" (va)                                   \
4806
        : "a7"                                           \
4807
    )
4808
/* Sub va from: vh | vl
 *
 * a7 first keeps a copy of the original vl. After the subtraction sltu turns
 * a7 into the borrow (1 when the original vl is below the new vl), which is
 * then subtracted from vh. Clobbers a7.
 */
4809
#define SP_ASM_SUBB(vl, vh, va)                          \
4810
    __asm__ __volatile__ (                               \
4811
        "add  a7, %[l], zero    \n\t"            \
4812
        "sub  %[l], a7, %[a]    \n\t"            \
4813
        "sltu a7, a7, %[l]    \n\t"            \
4814
        "sub  %[h], %[h], a7    \n\t"            \
4815
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4816
        : [a] "r" (va)                                   \
4817
        : "a7"                                           \
4818
    )
4819
/* Add two times vc | vb | va into vo | vh | vl
 *
 * The three-word addition is written out twice rather than doubling the
 * operand. Within each pass va is added to vl, vb to vh and vc to vo, with
 * sltu recovering every carry into a7 for the next word up. Clobbers a7.
 */
4820
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc)         \
4821
    __asm__ __volatile__ (                               \
4822
        "add  %[l], %[l], %[a]  \n\t"            \
4823
        "sltu a7, %[l], %[a]    \n\t"            \
4824
        "add  %[h], %[h], a7    \n\t"            \
4825
        "sltu a7, %[h], a7    \n\t"            \
4826
        "add  %[o], %[o], a7    \n\t"            \
4827
        "add  %[h], %[h], %[b]  \n\t"            \
4828
        "sltu a7, %[h], %[b]    \n\t"            \
4829
        "add  %[o], %[o], %[c]  \n\t"            \
4830
        "add  %[o], %[o], a7    \n\t"            \
4831
        "add  %[l], %[l], %[a]  \n\t"            \
4832
        "sltu a7, %[l], %[a]    \n\t"            \
4833
        "add  %[h], %[h], a7    \n\t"            \
4834
        "sltu a7, %[h], a7    \n\t"            \
4835
        "add  %[o], %[o], a7    \n\t"            \
4836
        "add  %[h], %[h], %[b]  \n\t"            \
4837
        "sltu a7, %[h], %[b]    \n\t"            \
4838
        "add  %[o], %[o], %[c]  \n\t"            \
4839
        "add  %[o], %[o], a7    \n\t"            \
4840
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4841
        : [a] "r" (va), [b] "r" (vb), [c] "r" (vc)       \
4842
        : "a7"                                           \
4843
    )
4844
4845
/* Marks that the RISC-V 32-bit SP_ASM_* macros are defined; tested by the
 * #ifdef SP_INT_ASM_AVAILABLE block that follows the CPU sections. */
#define SP_INT_ASM_AVAILABLE
4846
4847
    #endif /* WOLFSSL_SP_RISCV32 && SP_WORD_SIZE == 32 */
4848
4849
    #if defined(WOLFSSL_SP_S390X) && SP_WORD_SIZE == 64
4850
/*
4851
 * CPU: IBM s390x
4852
 */
4853
4854
/* Multiply va by vb and store double size result in: vh | vl
 *
 * va is copied into r1, mlgr multiplies by vb, and the result is copied out
 * of r1 (low word, to vl) and r0 (high word, to vh).
 * Clobbers r0 and r1.
 */
4855
#define SP_ASM_MUL(vl, vh, va, vb)                       \
4856
    __asm__ __volatile__ (                               \
4857
        "lgr  %%r1, %[a]    \n\t"            \
4858
        "mlgr %%r0, %[b]    \n\t"            \
4859
        "lgr  %[l], %%r1    \n\t"            \
4860
        "lgr  %[h], %%r0    \n\t"            \
4861
        : [h] "+r" (vh), [l] "+r" (vl)                   \
4862
        : [a] "r" (va), [b] "r" (vb)                     \
4863
        : "r0", "r1"                                     \
4864
    )
4865
/* Multiply va by vb and store double size result in: vo | vh | vl
 *
 * va is copied into r1 and multiplied by vb with mlgr; vo is loaded with 0
 * after the multiply, then vl and vh are copied from r1 and r0.
 * vo is a write-only ("=r") operand. Clobbers r0 and r1.
 */
4866
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb)               \
4867
    __asm__ __volatile__ (                               \
4868
        "lgr  %%r1, %[a]    \n\t"            \
4869
        "mlgr %%r0, %[b]    \n\t"            \
4870
        "lghi %[o], 0     \n\t"            \
4871
        "lgr  %[l], %%r1    \n\t"            \
4872
        "lgr  %[h], %%r0    \n\t"            \
4873
        : [l] "+r" (vl), [h] "+r" (vh), [o] "=r" (vo)    \
4874
        : [a] "r" (va), [b] "r" (vb)                     \
4875
        : "r0", "r1"                                     \
4876
    )
4877
/* Multiply va by vb and add double size result into: vo | vh | vl
 *
 * The product is formed in r0 (high) and r1 (low) by mlgr. algr adds the low
 * word to vl and the alcgr instructions continue the carry chain into vh and
 * vo; r10 is loaded with 0 so that only the carry is added to vo.
 * Clobbers r0, r1, r10 and the condition code.
 */
4878
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb)               \
4879
    __asm__ __volatile__ (                               \
4880
        "lghi %%r10, 0  \n\t"                    \
4881
        "lgr  %%r1, %[a]    \n\t"            \
4882
        "mlgr %%r0, %[b]    \n\t"            \
4883
        "algr %[l], %%r1  \n\t"                    \
4884
        "alcgr  %[h], %%r0  \n\t"                    \
4885
        "alcgr  %[o], %%r10 \n\t"                    \
4886
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4887
        : [a] "r" (va), [b] "r" (vb)                     \
4888
        : "r0", "r1", "r10", "cc"                        \
4889
    )
4890
/* Multiply va by vb and add double size result into: vh | vl
 *
 * The product is formed in r0 (high) and r1 (low) by mlgr. algr adds the low
 * word to vl and alcgr adds the high word plus carry to vh; the carry out of
 * vh is not used. Clobbers r0, r1 and the condition code.
 */
4891
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb)                \
4892
    __asm__ __volatile__ (                               \
4893
        "lgr  %%r1, %[a]    \n\t"            \
4894
        "mlgr %%r0, %[b]    \n\t"            \
4895
        "algr %[l], %%r1  \n\t"                    \
4896
        "alcgr  %[h], %%r0  \n\t"                    \
4897
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4898
        : [a] "r" (va), [b] "r" (vb)                     \
4899
        : "r0", "r1", "cc"                               \
4900
    )
4901
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 *
 * The product is formed once in r0 (high) and r1 (low); the algr/alcgr/alcgr
 * carry chain is then executed two times. r10 holds 0 so that only the carry
 * is added to vo. Clobbers r0, r1, r10 and the condition code.
 */
4902
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb)              \
4903
    __asm__ __volatile__ (                               \
4904
        "lghi %%r10, 0  \n\t"                    \
4905
        "lgr  %%r1, %[a]    \n\t"            \
4906
        "mlgr %%r0, %[b]    \n\t"            \
4907
        "algr %[l], %%r1  \n\t"                    \
4908
        "alcgr  %[h], %%r0  \n\t"                    \
4909
        "alcgr  %[o], %%r10 \n\t"                    \
4910
        "algr %[l], %%r1  \n\t"                    \
4911
        "alcgr  %[h], %%r0  \n\t"                    \
4912
        "alcgr  %[o], %%r10 \n\t"                    \
4913
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4914
        : [a] "r" (va), [b] "r" (vb)                     \
4915
        : "r0", "r1", "r10", "cc"                        \
4916
    )
4917
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 * Assumes first add will not overflow vh | vl
 *
 * The product is formed once in r0 (high) and r1 (low). The first algr/alcgr
 * pair stops at vh; only the second pair is followed by an alcgr that adds
 * the carry (with r10 = 0) into vo.
 * Clobbers r0, r1, r10 and the condition code.
 */
4920
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)           \
4921
    __asm__ __volatile__ (                               \
4922
        "lghi %%r10, 0  \n\t"                    \
4923
        "lgr  %%r1, %[a]    \n\t"            \
4924
        "mlgr %%r0, %[b]    \n\t"            \
4925
        "algr %[l], %%r1  \n\t"                    \
4926
        "alcgr  %[h], %%r0  \n\t"                    \
4927
        "algr %[l], %%r1  \n\t"                    \
4928
        "alcgr  %[h], %%r0  \n\t"                    \
4929
        "alcgr  %[o], %%r10 \n\t"                    \
4930
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4931
        : [a] "r" (va), [b] "r" (vb)                     \
4932
        : "r0", "r1", "r10", "cc"                        \
4933
    )
4934
/* Square va and store double size result in: vh | vl
 *
 * va is copied into r1 and mlgr multiplies r1 by itself; the result is
 * copied out of r1 (low word, to vl) and r0 (high word, to vh).
 * Clobbers r0 and r1.
 */
4935
#define SP_ASM_SQR(vl, vh, va)                           \
4936
    __asm__ __volatile__ (                               \
4937
        "lgr  %%r1, %[a]    \n\t"            \
4938
        "mlgr %%r0, %%r1    \n\t"            \
4939
        "lgr  %[l], %%r1    \n\t"            \
4940
        "lgr  %[h], %%r0    \n\t"            \
4941
        : [h] "+r" (vh), [l] "+r" (vl)                   \
4942
        : [a] "r" (va)                                   \
4943
        : "r0", "r1"                                     \
4944
    )
4945
/* Square va and add double size result into: vo | vh | vl
 *
 * The square is formed in r0 (high) and r1 (low) by multiplying r1 by
 * itself. algr adds the low word to vl and the alcgr instructions continue
 * the carry chain into vh and vo; r10 holds 0 so that only the carry is
 * added to vo. Clobbers r0, r1, r10 and the condition code.
 */
4946
#define SP_ASM_SQR_ADD(vl, vh, vo, va)                   \
4947
    __asm__ __volatile__ (                               \
4948
        "lghi %%r10, 0  \n\t"                    \
4949
        "lgr  %%r1, %[a]    \n\t"            \
4950
        "mlgr %%r0, %%r1    \n\t"            \
4951
        "algr %[l], %%r1  \n\t"                    \
4952
        "alcgr  %[h], %%r0  \n\t"                    \
4953
        "alcgr  %[o], %%r10 \n\t"                    \
4954
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4955
        : [a] "r" (va)                                   \
4956
        : "r0", "r1", "r10", "cc"                        \
4957
    )
4958
/* Square va and add double size result into: vh | vl
 *
 * The square is formed in r0 (high) and r1 (low) by multiplying r1 by
 * itself. algr adds the low word to vl and alcgr adds the high word plus
 * carry to vh; the carry out of vh is not used.
 * Clobbers r0, r1 and the condition code.
 */
4959
#define SP_ASM_SQR_ADD_NO(vl, vh, va)                    \
4960
    __asm__ __volatile__ (                               \
4961
        "lgr  %%r1, %[a]    \n\t"            \
4962
        "mlgr %%r0, %%r1    \n\t"            \
4963
        "algr %[l], %%r1  \n\t"                    \
4964
        "alcgr  %[h], %%r0  \n\t"                    \
4965
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4966
        : [a] "r" (va)                                   \
4967
        : "r0", "r1", "cc"                               \
4968
    )
4969
/* Add va into: vh | vl
 *
 * algr adds va to vl; alcgr then adds r10, which was loaded with 0, to vh so
 * that only the carry from the first addition reaches vh.
 * Clobbers r10 and the condition code.
 */
4970
#define SP_ASM_ADDC(vl, vh, va)                          \
4971
    __asm__ __volatile__ (                               \
4972
        "lghi %%r10, 0  \n\t"                    \
4973
        "algr %[l], %[a]  \n\t"                    \
4974
        "alcgr  %[h], %%r10 \n\t"                    \
4975
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4976
        : [a] "r" (va)                                   \
4977
        : "r10", "cc"                                    \
4978
    )
4979
/* Sub va from: vh | vl
 *
 * slgr subtracts va from vl; slbgr then subtracts r10, which was loaded with
 * 0, from vh so that only the borrow from the first subtraction reaches vh.
 * Clobbers r10 and the condition code.
 */
4980
#define SP_ASM_SUBB(vl, vh, va)                          \
4981
    __asm__ __volatile__ (                               \
4982
        "lghi %%r10, 0  \n\t"                    \
4983
        "slgr %[l], %[a]  \n\t"                    \
4984
        "slbgr  %[h], %%r10 \n\t"                    \
4985
        : [l] "+r" (vl), [h] "+r" (vh)                   \
4986
        : [a] "r" (va)                                   \
4987
        : "r10", "cc"                                    \
4988
    )
4989
/* Add two times vc | vb | va into vo | vh | vl
 *
 * The three-word algr/alcgr/alcgr carry chain is written out twice rather
 * than doubling the operand. No scratch registers are used; only the
 * condition code is clobbered.
 */
4990
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc)         \
4991
    __asm__ __volatile__ (                               \
4992
        "algr %[l], %[a]  \n\t"                    \
4993
        "alcgr  %[h], %[b]  \n\t"                    \
4994
        "alcgr  %[o], %[c]  \n\t"                    \
4995
        "algr %[l], %[a]  \n\t"                    \
4996
        "alcgr  %[h], %[b]  \n\t"                    \
4997
        "alcgr  %[o], %[c]  \n\t"                    \
4998
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
4999
        : [a] "r" (va), [b] "r" (vb), [c] "r" (vc)       \
5000
        : "cc"                                           \
5001
    )
5002
5003
/* Marks that the s390x SP_ASM_* macros are defined; tested by the
 * #ifdef SP_INT_ASM_AVAILABLE block that follows the CPU sections. */
#define SP_INT_ASM_AVAILABLE
5004
5005
    #endif /* WOLFSSL_SP_S390X && SP_WORD_SIZE == 64 */
5006
5007
/* When a CPU section above supplied the SP_ASM_* macros: define SQR_MUL_ASM
 * unless SP_INT_NO_ASM is set, and default the _REG macro variants to the
 * plain macros if they were not already defined. */
#ifdef SP_INT_ASM_AVAILABLE
5008
    #ifndef SP_INT_NO_ASM
5009
        #define SQR_MUL_ASM
5010
    #endif
5011
    #ifndef SP_ASM_ADDC_REG
5012
        #define SP_ASM_ADDC_REG  SP_ASM_ADDC
5013
    #endif /* SP_ASM_ADDC_REG */
5014
    #ifndef SP_ASM_SUBB_REG
5015
        #define SP_ASM_SUBB_REG  SP_ASM_SUBB
5016
    #endif /* SP_ASM_SUBB_REG */
5017
#endif /* SP_INT_ASM_AVAILABLE */
5018
5019
#endif /* !WOLFSSL_NO_ASM */
5020
5021
5022
#if (!defined(NO_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \
5023
    !defined(NO_DSA) || !defined(NO_DH) || \
5024
    (defined(HAVE_ECC) && defined(HAVE_COMP_KEY)) || defined(OPENSSL_EXTRA) || \
5025
    (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_PUBLIC_ONLY))
5026
#ifndef WC_NO_CACHE_RESISTANT
5027
    /* Address masks for constant time operations: entry 0 has no bits set and
     * entry 1 has every bit set. */
    const size_t sp_off_on_addr[2] = { (size_t)0, ~(size_t)0 };
5033
#endif
5034
#endif
5035
5036
5037
#if defined(WOLFSSL_HAVE_SP_DH) || defined(WOLFSSL_HAVE_SP_RSA)
5038
5039
#ifdef __cplusplus
5040
extern "C" {
5041
#endif
5042
5043
/* Modular exponentiation implementations using Single Precision.
 *
 * One prototype per size suffix (1024, 1536, 2048, 3072, 4096); all share the
 * same parameter list and return an int. They are only declared here, with
 * WOLFSSL_LOCAL linkage.
 * NOTE(review): presumably res receives base^exp mod mod and the suffix is
 * the modulus size in bits - confirm against the SP implementation.
 */
5044
WOLFSSL_LOCAL int sp_ModExp_1024(sp_int* base, sp_int* exp, sp_int* mod,
5045
    sp_int* res);
5046
WOLFSSL_LOCAL int sp_ModExp_1536(sp_int* base, sp_int* exp, sp_int* mod,
5047
    sp_int* res);
5048
WOLFSSL_LOCAL int sp_ModExp_2048(sp_int* base, sp_int* exp, sp_int* mod,
5049
    sp_int* res);
5050
WOLFSSL_LOCAL int sp_ModExp_3072(sp_int* base, sp_int* exp, sp_int* mod,
5051
    sp_int* res);
5052
WOLFSSL_LOCAL int sp_ModExp_4096(sp_int* base, sp_int* exp, sp_int* mod,
5053
    sp_int* res);
5054
5055
#ifdef __cplusplus
5056
} /* extern "C" */
5057
#endif
5058
5059
#endif /* WOLFSSL_HAVE_SP_DH || WOLFSSL_HAVE_SP_RSA */
5060
5061
5062
/* Prototype only: _sp_mont_red is static, so its definition has to appear
 * elsewhere in this file.
 * NOTE(review): the name suggests Montgomery reduction of a modulo m, with mp
 * the Montgomery multiplier and ct a constant-time flag - confirm at the
 * definition. */
#if defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_HAVE_SP_DH) || \
5063
    defined(OPENSSL_ALL)
5064
static int _sp_mont_red(sp_int* a, const sp_int* m, sp_int_digit mp, int ct);
5065
#endif
5066
/* Prototype only: _sp_mont_setup is static, so its definition has to appear
 * elsewhere in this file.
 * NOTE(review): presumably computes the Montgomery constant for modulus m and
 * stores it through rho - confirm at the definition. */
#if defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_HAVE_SP_DH) || \
5066
    defined(WOLFCRYPT_HAVE_ECCSI) || defined(WOLFCRYPT_HAVE_SAKKE) || \
5067
    defined(OPENSSL_ALL)
5068
static void _sp_mont_setup(const sp_int* m, sp_int_digit* rho);
5069
#endif
5071
5072
5073
/* Set the multi-precision number to zero.
5074
 *
5075
 * Assumes a is not NULL.
5076
 *
5077
 * @param  [out]  a  SP integer to set to zero.
5078
 */
5079
static void _sp_zero(sp_int* a)
5080
124M
{
5081
124M
    sp_int_minimal* am = (sp_int_minimal *)a;
5082
5083
124M
    am->used = 0;
5084
124M
    am->dp[0] = 0;
5085
124M
#ifdef WOLFSSL_SP_INT_NEGATIVE
5086
124M
    am->sign = MP_ZPOS;
5087
124M
#endif
5088
124M
}
5089
5090
5091
/* Initialize the multi-precision number to be zero with a given max size.
5092
 *
5093
 * @param  [out]  a     SP integer.
5094
 * @param  [in]   size  Number of words to say are available.
5095
 */
5096
static void _sp_init_size(sp_int* a, unsigned int size)
5097
91.4M
{
5098
91.4M
    volatile sp_int_minimal* am = (sp_int_minimal *)a;
5099
5100
#ifdef HAVE_WOLF_BIGINT
5101
    wc_bigint_init((struct WC_BIGINT*)&am->raw);
5102
#endif
5103
91.4M
    _sp_zero((sp_int*)am);
5104
5105
91.4M
    am->size = (sp_size_t)size;
5106
91.4M
}
5107
5108
/* Initialize the multi-precision number to be zero with a given max size.
5109
 *
5110
 * @param  [out]  a     SP integer.
5111
 * @param  [in]   size  Number of words to say are available.
5112
 *
5113
 * @return  MP_OKAY on success.
5114
 * @return  MP_VAL when a is NULL.
5115
 */
5116
int sp_init_size(sp_int* a, unsigned int size)
5117
28.9M
{
5118
28.9M
    int err = MP_OKAY;
5119
5120
    /* Validate parameters. Don't use size more than max compiled. */
5121
28.9M
    if ((a == NULL) || ((size == 0) || (size > SP_INT_DIGITS))) {
5122
94.5k
        err = MP_VAL;
5123
94.5k
    }
5124
5125
28.9M
    if (err == MP_OKAY) {
5126
28.8M
        _sp_init_size(a, size);
5127
28.8M
    }
5128
5129
28.9M
    return err;
5130
28.9M
}
5131
5132
/* Initialize the multi-precision number to be zero.
5133
 *
5134
 * @param  [out]  a  SP integer.
5135
 *
5136
 * @return  MP_OKAY on success.
5137
 * @return  MP_VAL when a is NULL.
5138
 */
5139
int sp_init(sp_int* a)
5140
732k
{
5141
732k
    int err = MP_OKAY;
5142
5143
    /* Validate parameter. */
5144
732k
    if (a == NULL) {
5145
0
        err = MP_VAL;
5146
0
    }
5147
732k
    else {
5148
        /* Assume complete sp_int with SP_INT_DIGITS digits. */
5149
732k
        _sp_init_size(a, SP_INT_DIGITS);
5150
732k
    }
5151
5152
732k
    return err;
5153
732k
}
5154
5155
#if !defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(NO_DH) || defined(HAVE_ECC)
5156
/* Initialize up to six multi-precision numbers to be zero.
5157
 *
5158
 * @param  [out]  n1  SP integer.
5159
 * @param  [out]  n2  SP integer.
5160
 * @param  [out]  n3  SP integer.
5161
 * @param  [out]  n4  SP integer.
5162
 * @param  [out]  n5  SP integer.
5163
 * @param  [out]  n6  SP integer.
5164
 *
5165
 * @return  MP_OKAY on success.
5166
 */
5167
int sp_init_multi(sp_int* n1, sp_int* n2, sp_int* n3, sp_int* n4, sp_int* n5,
5168
    sp_int* n6)
5169
408k
{
5170
    /* Initialize only those pointers that are valid. */
5171
408k
    if (n1 != NULL) {
5172
408k
        _sp_init_size(n1, SP_INT_DIGITS);
5173
408k
    }
5174
408k
    if (n2 != NULL) {
5175
408k
        _sp_init_size(n2, SP_INT_DIGITS);
5176
408k
    }
5177
408k
    if (n3 != NULL) {
5178
358k
        _sp_init_size(n3, SP_INT_DIGITS);
5179
358k
    }
5180
408k
    if (n4 != NULL) {
5181
98.2k
        _sp_init_size(n4, SP_INT_DIGITS);
5182
98.2k
    }
5183
408k
    if (n5 != NULL) {
5184
25.8k
        _sp_init_size(n5, SP_INT_DIGITS);
5185
25.8k
    }
5186
408k
    if (n6 != NULL) {
5187
20.2k
        _sp_init_size(n6, SP_INT_DIGITS);
5188
20.2k
    }
5189
5190
408k
    return MP_OKAY;
5191
408k
}
5192
#endif /* !WOLFSSL_RSA_PUBLIC_ONLY || !NO_DH || HAVE_ECC */
5193
5194
/* Free the memory allocated in the multi-precision number.
5195
 *
5196
 * @param  [in]  a  SP integer.
5197
 */
5198
void sp_free(sp_int* a)
5199
30.6M
{
5200
30.6M
    if (a != NULL) {
5201
    #ifdef HAVE_WOLF_BIGINT
5202
        wc_bigint_free(&a->raw);
5203
    #endif
5204
30.6M
    }
5205
30.6M
}
5206
5207
#if (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
5208
    !defined(NO_DH) || defined(HAVE_ECC)
5209
/* Grow multi-precision number to be able to hold l digits.
5210
 * The number of digits is fixed so no memory is allocated - digits from used
 * up to l are set to zero.
5211
 *
5212
 * @param  [in,out]  a  SP integer.
5213
 * @param  [in]      l  Number of digits to grow to.
5214
 *
5215
 * @return  MP_OKAY on success
5216
 * @return  MP_VAL when a is NULL or l is negative.
 * @return  MP_MEM if the number of digits requested is more than available.
5217
 */
5218
int sp_grow(sp_int* a, int l)
5219
104k
{
5220
104k
    int err = MP_OKAY;
5221
5222
    /* Validate parameter. */
5223
104k
    if ((a == NULL) || (l < 0)) {
5224
0
        err = MP_VAL;
5225
0
    }
5226
    /* Ensure enough words allocated for grow. */
5227
104k
    if ((err == MP_OKAY) && ((unsigned int)l > a->size)) {
5228
34
        err = MP_MEM;
5229
34
    }
5230
104k
    if (err == MP_OKAY) {
5231
104k
        unsigned int i;
5232
5233
        /* Put in zeros up to the new length. */
5234
394k
        for (i = a->used; i < (unsigned int)l; i++) {
5235
290k
            a->dp[i] = 0;
5236
290k
        }
5237
104k
    }
5238
5239
104k
    return err;
5240
104k
}
5241
#endif /* (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) || !NO_DH || HAVE_ECC */
5242
5243
#if (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
5244
    defined(HAVE_ECC) || defined(WOLFSSL_PUBLIC_MP)
5245
/* Set the multi-precision number to zero.
5246
 *
5247
 * @param  [out]  a  SP integer to set to zero.
5248
 */
5249
void sp_zero(sp_int* a)
5250
1.31k
{
5251
    /* Make an sp_int with valid pointer zero. */
5252
1.31k
    if (a != NULL) {
5253
1.31k
        _sp_zero(a);
5254
1.31k
    }
5255
1.31k
}
5256
#endif /* (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) || HAVE_ECC */
5257
5258
/* Clear the data from the multi-precision number, set to zero and free.
5259
 *
5260
 * @param  [out]  a  SP integer.
5261
 */
5262
void sp_clear(sp_int* a)
5263
30.3M
{
5264
#ifdef HAVE_FIPS
5265
    sp_forcezero(a);
5266
#else
5267
    /* Clear when valid pointer passed in. */
5268
30.3M
    if (a != NULL) {
5269
30.3M
        unsigned int i;
5270
5271
        /* Only clear the digits being used. */
5272
244M
        for (i = 0; i < a->used; i++) {
5273
214M
            a->dp[i] = 0;
5274
214M
        }
5275
        /* Set back to zero and free. */
5276
30.3M
        _sp_zero(a);
5277
30.3M
        sp_free(a);
5278
30.3M
    }
5279
30.3M
#endif
5280
30.3M
}
5281
5282
#if !defined(NO_RSA) || !defined(NO_DH) || defined(HAVE_ECC) || \
5283
    !defined(NO_DSA) || defined(WOLFSSL_SP_PRIME_GEN)
5284
/* Ensure the data in the multi-precision number is zeroed.
5285
 *
5286
 * Use when security sensitive data needs to be wiped.
5287
 *
5288
 * @param  [in]  a  SP integer.
5289
 */
5290
void sp_forcezero(sp_int* a)
5291
183k
{
5292
    /* Zeroize when a valid pointer passed in. */
5293
183k
    if (a != NULL) {
5294
        /* Ensure all data zeroized - data not zeroed when used decreases. */
5295
183k
        ForceZero(a->dp, a->size * (word32)SP_WORD_SIZEOF);
5296
        /* Set back to zero. */
5297
    #ifdef HAVE_WOLF_BIGINT
5298
        /* Zeroize the raw data as well. */
5299
        wc_bigint_zero(&a->raw);
5300
    #endif
5301
        /* Make value zero and free. */
5302
183k
        _sp_zero(a);
5303
183k
        sp_free(a);
5304
183k
    }
5305
183k
}
5306
#endif /* !WOLFSSL_RSA_VERIFY_ONLY || !NO_DH || HAVE_ECC */
5307
5308
#if defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_DH) || defined(HAVE_ECC) || \
5309
    !defined(NO_RSA) || defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY)
5310
/* Copy value of multi-precision number a into r.
5311
 *
5312
 * @param  [in]   a  SP integer - source.
5313
 * @param  [out]  r  SP integer - destination.
5314
 */
5315
static void _sp_copy(const sp_int* a, sp_int* r)
5316
349M
{
5317
    /* Copy words across. */
5318
349M
    if (a->used == 0) {
5319
1.82M
        r->dp[0] = 0;
5320
1.82M
    }
5321
347M
    else {
5322
347M
        XMEMCPY(r->dp, a->dp, a->used * (word32)SP_WORD_SIZEOF);
5323
347M
    }
5324
    /* Set number of used words in result. */
5325
349M
    r->used = a->used;/* // NOLINT(clang-analyzer-core.uninitialized.Assign) */
5326
349M
#ifdef WOLFSSL_SP_INT_NEGATIVE
5327
    /* Set sign of result. */
5328
349M
    r->sign = a->sign;/* // NOLINT(clang-analyzer-core.uninitialized.Assign) */
5329
349M
#endif
5330
349M
}
5331
5332
/* Copy value of multi-precision number a into r.
5333
 *
5334
 * @param  [in]   a  SP integer - source.
5335
 * @param  [out]  r  SP integer - destination.
5336
 *
5337
 * @return  MP_OKAY on success.
 * @return  MP_VAL when a or r is NULL, or r has too few digits to hold a.
5338
 */
5339
int sp_copy(const sp_int* a, sp_int* r)
5340
44.7M
{
5341
44.7M
    int err = MP_OKAY;
5342
5343
    /* Validate parameters. */
5344
44.7M
    if ((a == NULL) || (r == NULL)) {
5345
0
        err = MP_VAL;
5346
0
    }
5347
    /* Only copy if different pointers. */
5348
44.7M
    if (a != r) {
5349
        /* Validated space in result. */
5350
2.28M
        if ((err == MP_OKAY) && (a->used > r->size)) {
5351
1.50k
            err = MP_VAL;
5352
1.50k
        }
5353
2.28M
        if (err == MP_OKAY) {
5354
2.28M
            _sp_copy(a, r);
5355
2.28M
        }
5356
2.28M
    }
5357
5358
44.7M
    return err;
5359
44.7M
}
5360
#endif
5361
5362
#if ((defined(WOLFSSL_SP_MATH_ALL) && ((!defined(WOLFSSL_RSA_VERIFY_ONLY) && \
5363
      !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || !defined(NO_DH))) || \
5364
     defined(OPENSSL_ALL)) && defined(WC_PROTECT_ENCRYPTED_MEM)
5365
5366
/* Copy 2 numbers into two results based on y. Copy a fixed number of digits.
5367
 *
5368
 * Constant time implementation.
5369
 * When y is 0, r1 = a2 and r2 = a1.
5370
 * When y is 1, r1 = a1 and r2 = a2.
5371
 *
5372
 * @param [in]  a1    First number to copy.
5373
 * @param [in]  a2    Second number to copy.
5374
 * @param [out] r1    First result number to copy into.
5375
 * @param [out] r2    Second result number to copy into.
5376
 * @param [in]  y     Indicates which number goes into which result number.
5377
 * @param [in]  used  Number of digits to copy.
5378
 */
5379
static void _sp_copy_2_ct(const sp_int* a1, const sp_int* a2, sp_int* r1,
5380
    sp_int* r2, int y, unsigned int used)
5381
{
5382
    unsigned int i;
5383
5384
    /* Copy data - constant time. */
5385
    for (i = 0; i < used; i++) {
5386
        r1->dp[i] = (a1->dp[i] & ((sp_int_digit)wc_off_on_addr[y  ])) +
5387
                    (a2->dp[i] & ((sp_int_digit)wc_off_on_addr[y^1]));
5388
        r2->dp[i] = (a1->dp[i] & ((sp_int_digit)wc_off_on_addr[y^1])) +
5389
                    (a2->dp[i] & ((sp_int_digit)wc_off_on_addr[y  ]));
5390
    }
5391
    /* Copy used. */
5392
    r1->used = (a1->used & ((int)wc_off_on_addr[y  ])) +
5393
               (a2->used & ((int)wc_off_on_addr[y^1]));
5394
    r2->used = (a1->used & ((int)wc_off_on_addr[y^1])) +
5395
               (a2->used & ((int)wc_off_on_addr[y  ]));
5396
#ifdef WOLFSSL_SP_INT_NEGATIVE
5397
    /* Copy sign. */
5398
    r1->sign = (a1->sign & ((int)wc_off_on_addr[y  ])) +
5399
               (a2->sign & ((int)wc_off_on_addr[y^1]));
5400
    r2->sign = (a1->sign & ((int)wc_off_on_addr[y^1])) +
5401
               (a2->sign & ((int)wc_off_on_addr[y  ]));
5402
#endif
5403
}
5404
5405
#endif
5406
5407
#if defined(WOLFSSL_SP_MATH_ALL) || (defined(HAVE_ECC) && defined(FP_ECC))
5408
/* Initializes r and copies in value from a.
5409
 *
5410
 * @param  [out]  r  SP integer - destination.
5411
 * @param  [in]   a  SP integer - source.
5412
 *
5413
 * @return  MP_OKAY on success.
5414
 * @return  MP_VAL when a or r is NULL.
5415
 */
5416
int sp_init_copy(sp_int* r, const sp_int* a)
5417
0
{
5418
0
    int err;
5419
5420
    /* Initialize r and copy value in a into it. */
5421
0
    err = sp_init(r);
5422
0
    if (err == MP_OKAY) {
5423
0
        err = sp_copy(a, r);
5424
0
    }
5425
5426
0
    return err;
5427
0
}
5428
#endif /* WOLFSSL_SP_MATH_ALL || (HAVE_ECC && FP_ECC) */
5429
5430
#if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
5431
    !defined(NO_DH) || !defined(NO_DSA)
5432
/* Exchange the values in a and b.
5433
 *
5434
 * Avoid using this API as three copy operations are performed.
5435
 *
5436
 * @param  [in,out]  a  SP integer to swap.
5437
 * @param  [in,out]  b  SP integer to swap.
5438
 *
5439
 * @return  MP_OKAY on success.
5440
 * @return  MP_VAL when a or b is NULL.
5441
 * @return  MP_MEM when dynamic memory allocation fails.
5442
 */
5443
int sp_exch(sp_int* a, sp_int* b)
5444
167
{
5445
167
    int err = MP_OKAY;
5446
5447
    /* Validate parameters. */
5448
167
    if ((a == NULL) || (b == NULL)) {
5449
0
        err = MP_VAL;
5450
0
    }
5451
    /* Check space for a in b and b in a. */
5452
167
    if ((err == MP_OKAY) && ((a->size < b->used) || (b->size < a->used))) {
5453
25
        err = MP_VAL;
5454
25
    }
5455
5456
167
    if (err == MP_OKAY) {
5457
        /* Declare temporary for swapping. */
5458
142
        DECL_SP_INT(t, a->used);
5459
5460
        /* Create temporary for swapping. */
5461
142
        ALLOC_SP_INT(t, a->used, err, NULL);
5462
142
        if (err == MP_OKAY) {
5463
            /* Cache allocated size of a and b. */
5464
101
            sp_size_t asize = a->size;
5465
101
            sp_size_t bsize = b->size;
5466
            /* Copy all of SP int: t <- a, a <- b, b <- t. */
5467
101
            XMEMCPY(t, a, MP_INT_SIZEOF(a->used));
5468
101
            XMEMCPY(a, b, MP_INT_SIZEOF(b->used));
5469
101
            XMEMCPY(b, t, MP_INT_SIZEOF(t->used));
5470
            /* Put back size of a and b. */
5471
101
            a->size = asize;
5472
101
            b->size = bsize;
5473
101
        }
5474
5475
142
        FREE_SP_INT(t, NULL);
5476
142
    }
5477
5478
167
    return err;
5479
167
}
5480
#endif /* (WOLFSSL_SP_MATH_ALL && !WOLFSSL_RSA_VERIFY_ONLY) || !NO_DH ||
5481
        * !NO_DSA */
5482
5483
#if defined(HAVE_ECC) && defined(ECC_TIMING_RESISTANT) && \
5484
    !defined(WC_NO_CACHE_RESISTANT)
5485
/* Conditional swap of SP int values in constant time.
5486
 *
5487
 * @param [in]  a     First SP int to conditionally swap.
5488
 * @param [in]  b     Second SP int to conditionally swap.
5489
 * @param [in]  cnt   Count of words to copy.
5490
 * @param [in]  swap  When value is 1 then swap.
5491
 * @param [in]  t     Temporary SP int to use in swap.
5492
 * @return  MP_OKAY on success.
5493
 * @return  MP_MEM when dynamic memory allocation fails.
5494
 */
5495
int sp_cond_swap_ct_ex(sp_int* a, sp_int* b, int cnt, int swap, sp_int* t)
5496
19.8M
{
5497
19.8M
    unsigned int i;
5498
19.8M
    volatile sp_int_digit mask = (sp_int_digit)0 - (sp_int_digit)swap;
5499
5500
    /* XOR other fields in sp_int into temp - mask set when swapping. */
5501
19.8M
    t->used = (a->used ^ b->used) & (sp_size_t)mask;
5502
19.8M
#ifdef WOLFSSL_SP_INT_NEGATIVE
5503
19.8M
    t->sign = (a->sign ^ b->sign) & (sp_uint8)mask;
5504
19.8M
#endif
5505
5506
    /* XOR requested words into temp - mask set when swapping. */
5507
170M
    for (i = 0; i < (unsigned int)cnt; i++) {
5508
150M
        t->dp[i] = (a->dp[i] ^ b->dp[i]) & mask;
5509
150M
    }
5510
5511
    /* XOR temporary - when mask set then result will be b. */
5512
19.8M
    a->used ^= t->used;
5513
19.8M
#ifdef WOLFSSL_SP_INT_NEGATIVE
5514
19.8M
    a->sign ^= t->sign;
5515
19.8M
#endif
5516
170M
    for (i = 0; i < (unsigned int)cnt; i++) {
5517
150M
        a->dp[i] ^= t->dp[i];
5518
150M
    }
5519
5520
    /* XOR temporary - when mask set then result will be a. */
5521
19.8M
    b->used ^= t->used;
5522
19.8M
#ifdef WOLFSSL_SP_INT_NEGATIVE
5523
19.8M
    b->sign ^= b->sign;
5524
19.8M
#endif
5525
170M
    for (i = 0; i < (unsigned int)cnt; i++) {
5526
150M
        b->dp[i] ^= t->dp[i];
5527
150M
    }
5528
5529
19.8M
    return MP_OKAY;
5530
19.8M
}
5531
5532
/* Conditional swap of SP int values in constant time.
5533
 *
5534
 * @param [in]  a     First SP int to conditionally swap.
5535
 * @param [in]  b     Second SP int to conditionally swap.
5536
 * @param [in]  cnt   Count of words to copy.
5537
 * @param [in]  swap  When value is 1 then swap.
5538
 * @return  MP_OKAY on success.
5539
 * @return  MP_MEM when dynamic memory allocation fails.
5540
 */
5541
int sp_cond_swap_ct(sp_int* a, sp_int* b, int cnt, int swap)
5542
0
{
5543
0
    int err = MP_OKAY;
5544
0
    DECL_SP_INT(t, (size_t)cnt);
5545
5546
    /* Allocate temporary to hold masked xor of a and b. */
5547
0
    ALLOC_SP_INT(t, cnt, err, NULL);
5548
5549
0
    if (err == MP_OKAY) {
5550
0
        err = sp_cond_swap_ct_ex(a, b, cnt, swap, t);
5551
0
        FREE_SP_INT(t, NULL);
5552
0
    }
5553
5554
0
    return err;
5555
0
}
5556
#endif /* HAVE_ECC && ECC_TIMING_RESISTANT && !WC_NO_CACHE_RESISTANT */
5557
5558
#ifdef WOLFSSL_SP_INT_NEGATIVE
5559
/* Calculate the absolute value of the multi-precision number.
5560
 *
5561
 * @param  [in]   a  SP integer to calculate absolute value of.
5562
 * @param  [out]  r  SP integer to hold result.
5563
 *
5564
 * @return  MP_OKAY on success.
5565
 * @return  MP_VAL when a or r is NULL.
5566
 */
5567
int sp_abs(const sp_int* a, sp_int* r)
5568
69
{
5569
69
    int err;
5570
5571
    /* Copy a into r - copy fails when r is NULL. */
5572
69
    err = sp_copy(a, r);
5573
69
    if (err == MP_OKAY) {
5574
54
        r->sign = MP_ZPOS;
5575
54
    }
5576
5577
69
    return err;
5578
69
}
5579
#endif /* WOLFSSL_SP_INT_NEGATIVE */
5580
5581
#if defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_DH) || defined(HAVE_ECC) || \
5582
    !defined(NO_RSA)
5583
/* Compare absolute value of two multi-precision numbers.
5584
 *
5585
 * @param [in] a  SP integer.
5586
 * @param [in] b  SP integer.
5587
 *
5588
 * @return  MP_GT when a is greater than b.
5589
 * @return  MP_LT when a is less than b.
5590
 * @return  MP_EQ when a is equals b.
5591
 */
5592
static int _sp_cmp_abs(const sp_int* a, const sp_int* b)
5593
361M
{
5594
361M
    int ret = MP_EQ;
5595
5596
    /* Check number of words first. */
5597
361M
    if (a->used > b->used) {
5598
51.0M
        ret = MP_GT;
5599
51.0M
    }
5600
310M
    else if (a->used < b->used) {
5601
10.5M
        ret = MP_LT;
5602
10.5M
    }
5603
299M
    else {
5604
299M
        int i;
5605
5606
        /* Starting from most significant word, compare words.
5607
         * Stop when different and set comparison return.
5608
         */
5609
352M
        for (i = (int)(a->used - 1); i >= 0; i--) {
5610
349M
            if (a->dp[i] > b->dp[i]) {
5611
69.0M
                ret = MP_GT;
5612
69.0M
                break;
5613
69.0M
            }
5614
280M
            else if (a->dp[i] < b->dp[i]) {
5615
226M
                ret = MP_LT;
5616
226M
                break;
5617
226M
            }
5618
349M
        }
5619
        /* If we made to the end then ret is MP_EQ from initialization. */
5620
299M
    }
5621
5622
361M
    return ret;
5623
361M
}
5624
#endif
5625
5626
#if defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)
5627
/* Compare absolute value of two multi-precision numbers.
5628
 *
5629
 * Pointers are compared such that NULL is less than not NULL.
5630
 *
5631
 * @param [in] a  SP integer.
5632
 * @param [in] b  SP integer.
5633
 *
5634
 * @return  MP_GT when a is greater than b.
5635
 * @return  MP_LT when a is less than b.
5636
 * @return  MP_EQ when a equals b.
5637
 */
5638
int sp_cmp_mag(const sp_int* a, const sp_int* b)
5639
0
{
5640
0
    int ret;
5641
5642
    /* Do pointer checks first. Both NULL returns equal. */
5643
0
    if (a == b) {
5644
0
        ret = MP_EQ;
5645
0
    }
5646
    /* Nothing is smaller than something. */
5647
0
    else if (a == NULL) {
5648
0
        ret = MP_LT;
5649
0
    }
5650
    /* Something is larger than nothing. */
5651
0
    else if (b == NULL) {
5652
0
        ret = MP_GT;
5653
0
    }
5654
0
    else
5655
0
    {
5656
        /* Compare values - a and b are not NULL. */
5657
0
        ret = _sp_cmp_abs(a, b);
5658
0
    }
5659
5660
0
    return ret;
5661
0
}
5662
#endif
5663
5664
#if defined(WOLFSSL_SP_MATH_ALL) || defined(HAVE_ECC) || !defined(NO_DSA) || \
5665
    defined(OPENSSL_EXTRA) || !defined(NO_DH) || !defined(NO_RSA)
5666
/* Compare two multi-precision numbers.
5667
 *
5668
 * Assumes a and b are not NULL.
5669
 *
5670
 * @param [in] a  SP integer.
5671
 * @param [in] b  SP integer.
5672
 *
5673
 * @return  MP_GT when a is greater than b.
5674
 * @return  MP_LT when a is less than b.
5675
 * @return  MP_EQ when a is equals b.
5676
 */
5677
static int _sp_cmp(const sp_int* a, const sp_int* b)
5678
9.91M
{
5679
9.91M
    int ret;
5680
5681
9.91M
#ifdef WOLFSSL_SP_INT_NEGATIVE
5682
    /* Check sign first. */
5683
9.91M
    if (a->sign > b->sign) {
5684
1.30k
        ret = MP_LT;
5685
1.30k
    }
5686
9.91M
    else if (a->sign < b->sign) {
5687
521
        ret = MP_GT;
5688
521
    }
5689
9.91M
    else /* (a->sign == b->sign) */ {
5690
9.91M
#endif
5691
        /* Compare values. */
5692
9.91M
        ret = _sp_cmp_abs(a, b);
5693
9.91M
#ifdef WOLFSSL_SP_INT_NEGATIVE
5694
9.91M
        if (a->sign == MP_NEG) {
5695
            /* MP_GT = 1, MP_LT = -1, MP_EQ = 0
5696
             * Swapping MP_GT and MP_LT results.
5697
             */
5698
654
            ret = -ret;
5699
654
        }
5700
9.91M
    }
5701
9.91M
#endif
5702
5703
9.91M
    return ret;
5704
9.91M
}
5705
#endif
5706
5707
#if !defined(NO_RSA) || !defined(NO_DSA) || defined(HAVE_ECC) || \
5708
    !defined(NO_DH) || defined(WOLFSSL_SP_MATH_ALL)
5709
/* Compare two multi-precision numbers.
5710
 *
5711
 * Pointers are compared such that NULL is less than not NULL.
5712
 *
5713
 * @param [in] a  SP integer.
5714
 * @param [in] b  SP integer.
5715
 *
5716
 * @return  MP_GT when a is greater than b.
5717
 * @return  MP_LT when a is less than b.
5718
 * @return  MP_EQ when a is equals b.
5719
 */
5720
int sp_cmp(const sp_int* a, const sp_int* b)
5721
13.6M
{
5722
13.6M
    int ret;
5723
5724
    /* Check pointers first. Both NULL returns equal. */
5725
13.6M
    if (a == b) {
5726
2.09k
        ret = MP_EQ;
5727
2.09k
    }
5728
    /* Nothing is smaller than something. */
5729
13.6M
    else if (a == NULL) {
5730
0
        ret = MP_LT;
5731
0
    }
5732
    /* Something is larger than nothing. */
5733
13.6M
    else if (b == NULL) {
5734
0
        ret = MP_GT;
5735
0
    }
5736
13.6M
    else
5737
13.6M
    {
5738
        /* Compare values - a and b are not NULL. */
5739
13.6M
        ret = _sp_cmp(a, b);
5740
13.6M
    }
5741
5742
13.6M
    return ret;
5743
13.6M
}
5744
#endif
5745
5746
#if defined(HAVE_ECC) && !defined(WC_NO_RNG) && \
5747
    defined(WOLFSSL_ECC_GEN_REJECT_SAMPLING)
5748
/* Compare two multi-precision numbers in constant time.
5749
 *
5750
 * Assumes a and b are not NULL.
5751
 * Assumes a and b are positive.
5752
 *
5753
 * @param [in] a  SP integer.
5754
 * @param [in] b  SP integer.
5755
 * @param [in] n  Number of digits to compare.
5756
 *
5757
 * @return  MP_GT when a is greater than b.
5758
 * @return  MP_LT when a is less than b.
5759
 * @return  MP_EQ when a is equals b.
5760
 */
5761
static int _sp_cmp_ct(const sp_int* a, const sp_int* b, unsigned int n)
5762
{
5763
    int ret = MP_EQ;
5764
    int i;
5765
    volatile int mask = -1;
5766
5767
    for (i = n - 1; i >= 0; i--) {
5768
        sp_int_digit ad = a->dp[i] & ((sp_int_digit)0 - (i < (int)a->used));
5769
        sp_int_digit bd = b->dp[i] & ((sp_int_digit)0 - (i < (int)b->used));
5770
5771
        ret |= mask & ((0 - (ad < bd)) & MP_LT);
5772
        mask &= 0 - (ret == MP_EQ);
5773
        ret |= mask & ((0 - (ad > bd)) & MP_GT);
5774
        mask &= 0 - (ret == MP_EQ);
5775
    }
5776
5777
    return ret;
5778
}
5779
5780
/* Compare two multi-precision numbers in constant time.
5781
 *
5782
 * Pointers are compared such that NULL is less than not NULL.
5783
 * Assumes a and b are positive.
5784
 * Assumes a and b have n digits set at sometime.
5785
 *
5786
 * @param [in] a  SP integer.
5787
 * @param [in] b  SP integer.
5788
 * @param [in] n  Number of digits to compare.
5789
 *
5790
 * @return  MP_GT when a is greater than b.
5791
 * @return  MP_LT when a is less than b.
5792
 * @return  MP_EQ when a is equals b.
5793
 */
5794
int sp_cmp_ct(const sp_int* a, const sp_int* b, unsigned int n)
5795
{
5796
    int ret;
5797
5798
    /* Check pointers first. Both NULL returns equal. */
5799
    if (a == b) {
5800
        ret = MP_EQ;
5801
    }
5802
    /* Nothing is smaller than something. */
5803
    else if (a == NULL) {
5804
        ret = MP_LT;
5805
    }
5806
    /* Something is larger than nothing. */
5807
    else if (b == NULL) {
5808
        ret = MP_GT;
5809
    }
5810
    else
5811
    {
5812
        /* Compare values - a and b are not NULL. */
5813
        ret = _sp_cmp_ct(a, b, n);
5814
    }
5815
5816
    return ret;
5817
}
5818
#endif /* HAVE_ECC && !WC_NO_RNG && WOLFSSL_ECC_GEN_REJECT_SAMPLING */
5819
5820
/*************************
5821
 * Bit check/set functions
5822
 *************************/
5823
5824
#if (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
5825
    ((defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_SP_SM2)) && \
5826
     defined(HAVE_ECC)) || defined(OPENSSL_EXTRA) || defined(WOLFSSL_PUBLIC_MP)
5827
/* Check if a bit is set
5828
 *
5829
 * When a is NULL, result is 0.
5830
 *
5831
 * @param  [in]  a  SP integer.
5832
 * @param  [in]  b  Bit position to check.
5833
 *
5834
 * @return  0 when bit is not set.
5835
 * @return  1 when bit is set.
5836
 */
5837
int sp_is_bit_set(const sp_int* a, unsigned int b)
5838
4.76M
{
5839
4.76M
    int ret = 0;
5840
    /* Index of word. */
5841
4.76M
    unsigned int i = b >> SP_WORD_SHIFT;
5842
5843
    /* Check parameters. */
5844
4.76M
    if ((a != NULL) && (i < a->used)) {
5845
        /* Shift amount to get bit down to index 0. */
5846
4.76M
        unsigned int s = b & SP_WORD_MASK;
5847
5848
        /* Get and mask bit. */
5849
4.76M
        ret = (int)((a->dp[i] >> s) & (sp_int_digit)1);
5850
4.76M
    }
5851
5852
4.76M
    return ret;
5853
4.76M
}
5854
#endif /* (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) ||
5855
        * (WOLFSSL_SP_MATH_ALL && HAVE_ECC) */
5856
5857
/* Count the number of bits in the multi-precision number.
5858
 *
5859
 * When a is NULL, result is 0.
5860
 *
5861
 * @param  [in]  a  SP integer.
5862
 *
5863
 * @return  Number of bits in the SP integer value.
5864
 */
5865
int sp_count_bits(const sp_int* a)
5866
119M
{
5867
119M
    int n = -1;
5868
5869
    /* Check parameter. */
5870
119M
    if ((a != NULL) && (a->used > 0)) {
5871
        /* Get index of last word. */
5872
119M
        n = (int)(a->used - 1);
5873
        /* Don't count leading zeros. */
5874
119M
        while ((n >= 0) && (a->dp[n] == 0)) {
5875
2.86k
            n--;
5876
2.86k
        }
5877
119M
    }
5878
5879
    /* -1 indicates SP integer value was zero. */
5880
119M
    if (n < 0) {
5881
12.2k
        n = 0;
5882
12.2k
    }
5883
119M
    else {
5884
        /* Get the most significant word. */
5885
119M
        sp_int_digit d = a->dp[n];
5886
        /* Count of bits up to last word. */
5887
119M
        n *= SP_WORD_SIZE;
5888
5889
119M
    #ifdef SP_ASM_HI_BIT_SET_IDX
5890
119M
        {
5891
119M
            sp_int_digit hi;
5892
            /* Get index of highest set bit. */
5893
119M
            SP_ASM_HI_BIT_SET_IDX(d, hi);
5894
            /* Add bits up to and including index. */
5895
119M
            n += (int)hi + 1;
5896
119M
        }
5897
    #elif defined(SP_ASM_LZCNT)
5898
        {
5899
            sp_int_digit lz;
5900
            /* Count number of leading zeros in highest non-zero digit. */
5901
            SP_ASM_LZCNT(d, lz);
5902
            /* Add non-leading zero bits count. */
5903
            n += SP_WORD_SIZE - (int)lz;
5904
        }
5905
    #else
5906
        /* Check if top word has more than half the bits set. */
5907
        if (d > SP_HALF_MAX) {
5908
            /* Set count to a full last word. */
5909
            n += SP_WORD_SIZE;
5910
            /* Don't count leading zero bits. */
5911
            while ((d & ((sp_int_digit)1 << (SP_WORD_SIZE - 1))) == 0) {
5912
                n--;
5913
                d <<= 1;
5914
            }
5915
        }
5916
        else {
5917
            /* Add to count until highest set bit is shifted out. */
5918
            while (d != 0) {
5919
                n++;
5920
                d >>= 1;
5921
            }
5922
        }
5923
    #endif
5924
119M
    }
5925
5926
119M
    return n;
5927
119M
}
5928
5929
#if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY) && \
5930
    !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || !defined(NO_DH) || \
5931
    (defined(HAVE_ECC) && defined(FP_ECC)) || \
5932
    (!defined(NO_RSA) && defined(WOLFSSL_KEY_GEN))
5933
5934
/* Number of entries in array of number of least significant zero bits. */
5935
#define SP_LNZ_CNT      16
5936
/* Number of bits the array checks. */
5937
79.6k
#define SP_LNZ_BITS     4
5938
/* Mask to apply to check with array. */
5939
422k
#define SP_LNZ_MASK     0xf
5940
/* Number of least significant zero bits in first SP_LNZ_CNT numbers. */
5941
static const int sp_lnz[SP_LNZ_CNT] = {
5942
   4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
5943
};
5944
5945
/* Count the number of least significant zero bits.
5946
 *
5947
 * When a is NULL or zero, result is 0.
5948
 *
5949
 * @param  [in]   a  SP integer to use.
5950
 *
5951
 * @return  Number of least significant zero bits.
5952
 */
5953
#if !defined(HAVE_ECC) || !defined(HAVE_COMP_KEY)
5954
static
5955
#endif /* !HAVE_ECC || HAVE_COMP_KEY */
5956
int sp_cnt_lsb(const sp_int* a)
5957
342k
{
5958
342k
    unsigned int bc = 0;
5959
5960
    /* Check for number with a value. */
5961
342k
    if ((a != NULL) && (!sp_iszero(a))) {
5962
342k
        unsigned int i;
5963
342k
        unsigned int j;
5964
5965
        /* Count least significant words that are zero. */
5966
349k
        for (i = 0; (i < a->used) && (a->dp[i] == 0); i++, bc += SP_WORD_SIZE) {
5967
7.13k
        }
5968
5969
        /* Use 4-bit table to get count. */
5970
422k
        for (j = 0; j < SP_WORD_SIZE; j += SP_LNZ_BITS) {
5971
            /* Get number of least significant 0 bits in nibble. */
5972
422k
            int cnt = sp_lnz[(a->dp[i] >> j) & SP_LNZ_MASK];
5973
            /* Done if not all 4 bits are zero. */
5974
422k
            if (cnt != 4) {
5975
                /* Add checked bits and count in last 4 bits checked. */
5976
342k
                bc += j + (unsigned int)cnt;
5977
342k
                break;
5978
342k
            }
5979
422k
        }
5980
342k
    }
5981
5982
342k
    return (int)bc;
5983
342k
}
5984
#endif /* WOLFSSL_SP_MATH_ALL || WOLFSSL_HAVE_SP_DH || (HAVE_ECC && FP_ECC) */
5985
5986
#if !defined(WOLFSSL_RSA_VERIFY_ONLY) || defined(WOLFSSL_ASN_TEMPLATE) || \
5987
    (defined(WOLFSSL_SP_MATH_ALL) && !defined(NO_ASN))
5988
/* Determine if the most significant byte of the encoded multi-precision number
5989
 * has the top bit set.
5990
 *
5991
 * When a is NULL, result is 0.
5992
 *
5993
 * @param  [in]  a  SP integer.
5994
 *
5995
 * @return  1 when the top bit of top byte is set.
5996
 * @return  0 when the top bit of top byte is not set.
5997
 */
5998
int sp_leading_bit(const sp_int* a)
5999
36.3k
{
6000
36.3k
    int bit = 0;
6001
6002
    /* Check if we have a number and value to use. */
6003
36.3k
    if ((a != NULL) && (a->used > 0)) {
6004
        /* Get top word. */
6005
36.3k
        sp_int_digit d = a->dp[a->used - 1];
6006
6007
36.3k
    #if SP_WORD_SIZE > 8
6008
        /* Remove bottom 8 bits until highest 8 bits left. */
6009
203k
        while (d > (sp_int_digit)0xff) {
6010
167k
            d >>= 8;
6011
167k
        }
6012
36.3k
    #endif
6013
        /* Get the highest bit of the 8-bit value. */
6014
36.3k
        bit = (int)(d >> 7);
6015
36.3k
    }
6016
6017
36.3k
    return bit;
6018
36.3k
}
6019
#endif /* !WOLFSSL_RSA_VERIFY_ONLY */
6020
6021
#if defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_HAVE_SP_DH) || \
6022
    defined(HAVE_ECC) || defined(WOLFSSL_KEY_GEN) || defined(OPENSSL_EXTRA) || \
6023
    !defined(NO_RSA)
6024
/* Set one bit of a: a |= 1 << i
6025
 * The field 'used' is updated in a.
6026
 *
6027
 * @param  [in,out]  a  SP integer to set bit into.
6028
 * @param  [in]      i  Index of bit to set.
6029
 *
6030
 * @return  MP_OKAY on success.
6031
 * @return  MP_VAL when a is NULL, index is negative or index is too large.
6032
 */
6033
int sp_set_bit(sp_int* a, int i)
6034
784k
{
6035
784k
    int err = MP_OKAY;
6036
    /* Get index of word to set. */
6037
784k
    sp_size_t w = (sp_size_t)(i >> SP_WORD_SHIFT);
6038
6039
    /* Check for valid number and and space for bit. */
6040
784k
    if ((a == NULL) || (i < 0) || (w >= a->size)) {
6041
112
        err = MP_VAL;
6042
112
    }
6043
784k
    if (err == MP_OKAY) {
6044
        /* Amount to shift up to set bit in word. */
6045
784k
        unsigned int s = (unsigned int)(i & (SP_WORD_SIZE - 1));
6046
784k
        unsigned int j;
6047
6048
        /* Set to zero all unused words up to and including word to have bit
6049
         * set.
6050
         */
6051
6.26M
        for (j = a->used; j <= w; j++) {
6052
5.48M
            a->dp[j] = 0;
6053
5.48M
        }
6054
        /* Set bit in word. */
6055
784k
        a->dp[w] |= (sp_int_digit)1 << s;
6056
        /* Update used if necessary */
6057
784k
        if (a->used <= w) {
6058
784k
            a->used = (sp_size_t)(w + 1U);
6059
784k
        }
6060
784k
    }
6061
6062
784k
    return err;
6063
784k
}
6064
#endif /* WOLFSSL_SP_MATH_ALL || WOLFSSL_HAVE_SP_DH || HAVE_ECC ||
6065
        * WOLFSSL_KEY_GEN || OPENSSL_EXTRA || !NO_RSA */
6066
6067
#if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
6068
    defined(WOLFSSL_KEY_GEN) || !defined(NO_DH)
6069
/* Exponentiate 2 to the power of e: a = 2^e
6070
 * This is done by setting the 'e'th bit.
6071
 *
6072
 * @param  [out]  a  SP integer to hold result.
6073
 * @param  [in]   e  Exponent.
6074
 *
6075
 * @return  MP_OKAY on success.
6076
 * @return  MP_VAL when a is NULL, e is negative or 2^exponent is too large.
6077
 */
6078
int sp_2expt(sp_int* a, int e)
6079
285
{
6080
285
    int err = MP_OKAY;
6081
6082
    /* Validate parameters. */
6083
285
    if ((a == NULL) || (e < 0)) {
6084
0
        err = MP_VAL;
6085
0
    }
6086
285
    if (err == MP_OKAY) {
6087
        /* Set number to zero and then set bit. */
6088
285
        _sp_zero(a);
6089
285
        err = sp_set_bit(a, e);
6090
285
    }
6091
6092
285
    return err;
6093
285
}
6094
#endif /* (WOLFSSL_SP_MATH_ALL && !WOLFSSL_RSA_VERIFY_ONLY) ||
6095
        * WOLFSSL_KEY_GEN || !NO_DH */
6096
6097
/**********************
6098
 * Digit/Long functions
6099
 **********************/
6100
6101
#if defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_RSA) || !defined(NO_DH) || \
6102
    defined(HAVE_ECC)
6103
/* Set the multi-precision number to be the value of the digit.
6104
 *
6105
 * @param  [out]  a  SP integer to become number.
6106
 * @param  [in]   d  Digit to be set.
6107
 */
6108
static void _sp_set(sp_int* a, sp_int_digit d)
6109
4.47M
{
6110
    /* Use sp_int_minimal to support allocated byte arrays as sp_ints. */
6111
4.47M
    sp_int_minimal* am = (sp_int_minimal*)a;
6112
6113
4.47M
    am->dp[0] = d;
6114
    /* d == 0 => used = 0, d > 0 => used = 1 */
6115
4.47M
    am->used = (d > 0);
6116
4.47M
#ifdef WOLFSSL_SP_INT_NEGATIVE
6117
4.47M
    am->sign = MP_ZPOS;
6118
4.47M
#endif
6119
4.47M
}
6120
6121
/* Set the multi-precision number to be the value of the digit.
6122
 *
6123
 * @param  [out]  a  SP integer to become number.
6124
 * @param  [in]   d  Digit to be set.
6125
 *
6126
 * @return  MP_OKAY on success.
6127
 * @return  MP_VAL when a is NULL.
6128
 */
6129
int sp_set(sp_int* a, sp_int_digit d)
6130
325k
{
6131
325k
    int err = MP_OKAY;
6132
6133
    /* Validate parameters. */
6134
325k
    if (a == NULL) {
6135
0
        err = MP_VAL;
6136
0
    }
6137
325k
    if (err == MP_OKAY) {
6138
325k
        _sp_set(a, d);
6139
325k
    }
6140
6141
325k
    return err;
6142
325k
}
6143
#endif
6144
6145
#if defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_RSA) || defined(OPENSSL_EXTRA)
6146
/* Set a number into the multi-precision number.
6147
 *
6148
 * Number may be larger than the size of a digit.
6149
 *
6150
 * @param  [out]  a  SP integer to set.
6151
 * @param  [in]   n  Long value to set.
6152
 *
6153
 * @return  MP_OKAY on success.
6154
 * @return  MP_VAL when a is NULL.
6155
 */
6156
int sp_set_int(sp_int* a, unsigned long n)
6157
3.44k
{
6158
3.44k
    int err = MP_OKAY;
6159
6160
3.44k
    if (a == NULL) {
6161
0
        err = MP_VAL;
6162
0
    }
6163
6164
3.44k
    if (err == MP_OKAY) {
6165
    #if SP_WORD_SIZE < SP_ULONG_BITS
6166
        /* Assign if value fits in one word. */
6167
        if (n <= (sp_int_digit)SP_DIGIT_MAX) {
6168
    #endif
6169
3.44k
            a->dp[0] = (sp_int_digit)n;
6170
3.44k
            a->used = (n != 0);
6171
    #if SP_WORD_SIZE < SP_ULONG_BITS
6172
        }
6173
        else {
6174
            unsigned int i;
6175
6176
            /* Assign value word by word. */
6177
            for (i = 0; (i < a->size) && (n > 0); i++,n >>= SP_WORD_SIZE) {
6178
                a->dp[i] = (sp_int_digit)n;
6179
            }
6180
            /* Update number of words used. */
6181
            a->used = i;
6182
            /* Check for overflow. */
6183
            if ((i == a->size) && (n != 0)) {
6184
                err = MP_VAL;
6185
            }
6186
        }
6187
    #endif
6188
3.44k
    #ifdef WOLFSSL_SP_INT_NEGATIVE
6189
3.44k
        a->sign = MP_ZPOS;
6190
3.44k
    #endif
6191
3.44k
    }
6192
6193
3.44k
    return err;
6194
3.44k
}
6195
#endif /* WOLFSSL_SP_MATH_ALL || !NO_RSA || OPENSSL_EXTRA */
6196
6197
#if defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_RSA) || !defined(NO_DH) || \
6198
    defined(HAVE_ECC)
6199
/* Compare a one digit number with a multi-precision number.
6200
 *
6201
 * When a is NULL, MP_LT is returned.
6202
 *
6203
 * @param  [in]  a  SP integer to compare.
6204
 * @param  [in]  d  Digit to compare with.
6205
 *
6206
 * @return  MP_GT when a is greater than d.
6207
 * @return  MP_LT when a is less than d.
6208
 * @return  MP_EQ when a equals d.
6209
 */
6210
int sp_cmp_d(const sp_int* a, sp_int_digit d)
6211
5.98M
{
6212
5.98M
    int ret = MP_EQ;
6213
6214
    /* No SP integer is always less - even when d is zero. */
6215
5.98M
    if (a == NULL) {
6216
0
        ret = MP_LT;
6217
0
    }
6218
5.98M
    else
6219
5.98M
#ifdef WOLFSSL_SP_INT_NEGATIVE
6220
    /* Check sign first. */
6221
5.98M
    if (a->sign == MP_NEG) {
6222
132
        ret = MP_LT;
6223
132
    }
6224
5.98M
    else
6225
5.98M
#endif
6226
5.98M
    {
6227
        /* Check if SP integer has more than one word. */
6228
5.98M
        if (a->used > 1) {
6229
1.34M
            ret = MP_GT;
6230
1.34M
        }
6231
        /* Special case for zero. */
6232
4.63M
        else if (a->used == 0) {
6233
130k
            if (d != 0) {
6234
129k
                ret = MP_LT;
6235
129k
            }
6236
            /* ret initialized to equal. */
6237
130k
        }
6238
4.50M
        else {
6239
            /* The single word in the SP integer can now be compared with d. */
6240
4.50M
            if (a->dp[0] > d) {
6241
729k
                ret = MP_GT;
6242
729k
            }
6243
3.77M
            else if (a->dp[0] < d) {
6244
29.0k
                ret = MP_LT;
6245
29.0k
            }
6246
            /* ret initialized to equal. */
6247
4.50M
        }
6248
5.98M
    }
6249
6250
5.98M
    return ret;
6251
5.98M
}
6252
#endif
6253
6254
#if defined(WOLFSSL_SP_ADD_D) || (defined(WOLFSSL_SP_INT_NEGATIVE) && \
6255
    defined(WOLFSSL_SP_SUB_D)) || defined(WOLFSSL_SP_READ_RADIX_10)
6256
/* Add a one digit number to the multi-precision number.
6257
 *
6258
 * @param  [in]   a  SP integer to be added to.
6259
 * @param  [in]   d  Digit to add.
6260
 * @param  [out]  r  SP integer to store result in.
6261
 *
6262
 * @return  MP_OKAY on success.
6263
 * @return  MP_VAL when result is too large for fixed size dp array.
6264
 */
6265
static int _sp_add_d(const sp_int* a, sp_int_digit d, sp_int* r)
6266
6.43M
{
6267
6.43M
    int err = MP_OKAY;
6268
6269
    /* Special case of zero means we want result to have a digit when not adding
6270
     * zero. */
6271
6.43M
    if (a->used == 0) {
6272
76.1k
        r->dp[0] = d;
6273
76.1k
        r->used = (d > 0);
6274
76.1k
    }
6275
6.35M
    else {
6276
6.35M
        unsigned int i = 0;
6277
6.35M
        sp_int_digit a0 = a->dp[0];
6278
6279
        /* Set used of result - updated if overflow seen. */
6280
6.35M
        r->used = a->used;
6281
6282
6.35M
        r->dp[0] = a0 + d;
6283
        /* Check for carry. */
6284
6.35M
        if (r->dp[0] < a0) {
6285
            /* Do carry through all words. */
6286
23.5k
            for (++i; i < a->used; i++) {
6287
22.8k
                r->dp[i] = a->dp[i] + 1;
6288
22.8k
                if (r->dp[i] != 0) {
6289
14.7k
                   break;
6290
14.7k
                }
6291
22.8k
            }
6292
            /* Add another word if required. */
6293
15.4k
            if (i == a->used) {
6294
                /* Check result has enough space for another word. */
6295
694
                if (i < r->size) {
6296
694
                    r->used++;
6297
694
                    r->dp[i] = 1;
6298
694
                }
6299
0
                else {
6300
0
                    err = MP_VAL;
6301
0
                }
6302
694
            }
6303
15.4k
        }
6304
        /* When result is not the same as input, copy rest of digits. */
6305
6.35M
        if ((err == MP_OKAY) && (r != a)) {
6306
            /* Copy any words that didn't update with carry. */
6307
12.8k
            for (++i; i < a->used; i++) {
6308
10.1k
                r->dp[i] = a->dp[i];
6309
10.1k
            }
6310
2.67k
        }
6311
6.35M
    }
6312
6313
6.43M
    return err;
6314
6.43M
}
6315
#endif /* WOLFSSL_SP_ADD_D || (WOLFSSL_SP_INT_NEGATIVE && WOLFSSL_SP_SUB_D) ||
6316
        * defined(WOLFSSL_SP_READ_RADIX_10) */
6317
6318
#if (defined(WOLFSSL_SP_INT_NEGATIVE) && defined(WOLFSSL_SP_ADD_D)) || \
6319
    defined(WOLFSSL_SP_SUB_D) || defined(WOLFSSL_SP_INVMOD) || \
6320
    defined(WOLFSSL_SP_INVMOD_MONT_CT) || (defined(WOLFSSL_SP_PRIME_GEN) && \
6321
    !defined(WC_NO_RNG))
6322
/* Sub a one digit number from the multi-precision number.
6323
 *
6324
 * @param  [in]   a  SP integer to be subtracted from.
6325
 * @param  [in]   d  Digit to subtract.
6326
 * @param  [out]  r  SP integer to store result in.
6327
 */
6328
static void _sp_sub_d(const sp_int* a, sp_int_digit d, sp_int* r)
6329
71.6k
{
6330
    /* Set result used to be same as input. Updated with clamp. */
6331
71.6k
    r->used = a->used;
6332
    /* Only possible when not handling negatives. */
6333
71.6k
    if (a->used == 0) {
6334
        /* Set result to zero as no negative support. */
6335
23
        r->dp[0] = 0;
6336
23
    }
6337
71.6k
    else {
6338
71.6k
        unsigned int i = 0;
6339
71.6k
        sp_int_digit a0 = a->dp[0];
6340
6341
71.6k
        r->dp[0] = a0 - d;
6342
        /* Check for borrow. */
6343
71.6k
        if (r->dp[0] > a0) {
6344
            /* Do borrow through all words. */
6345
7.86k
            for (++i; i < a->used; i++) {
6346
7.84k
                r->dp[i] = a->dp[i] - 1;
6347
7.84k
                if (r->dp[i] != SP_DIGIT_MAX) {
6348
2.10k
                   break;
6349
2.10k
                }
6350
7.84k
            }
6351
2.12k
        }
6352
        /* When result is not the same as input, copy rest of digits. */
6353
71.6k
        if (r != a) {
6354
            /* Copy any words that didn't update with borrow. */
6355
341k
            for (++i; i < a->used; i++) {
6356
285k
                r->dp[i] = a->dp[i];
6357
285k
            }
6358
56.0k
        }
6359
        /* Remove leading zero words. */
6360
71.6k
        sp_clamp(r);
6361
71.6k
    }
6362
71.6k
}
6363
#endif /* (WOLFSSL_SP_INT_NEGATIVE && WOLFSSL_SP_ADD_D) || WOLFSSL_SP_SUB_D
6364
        * WOLFSSL_SP_INVMOD || WOLFSSL_SP_INVMOD_MONT_CT ||
6365
        * WOLFSSL_SP_PRIME_GEN */
6366
6367
#ifdef WOLFSSL_SP_ADD_D
6368
/* Add a one digit number to the multi-precision number.
6369
 *
6370
 * @param  [in]   a  SP integer to be added to.
6371
 * @param  [in]   d  Digit to add.
6372
 * @param  [out]  r  SP integer to store result in.
6373
 *
6374
 * @return  MP_OKAY on success.
6375
 * @return  MP_VAL when result is too large for fixed size dp array.
6376
 */
6377
int sp_add_d(const sp_int* a, sp_int_digit d, sp_int* r)
6378
49.3k
{
6379
49.3k
    int err = MP_OKAY;
6380
6381
    /* Check validity of parameters. */
6382
49.3k
    if ((a == NULL) || (r == NULL)) {
6383
0
        err = MP_VAL;
6384
0
    }
6385
6386
#ifndef WOLFSSL_SP_INT_NEGATIVE
6387
    /* Check for space in result especially when carry adds a new word. */
6388
    if ((err == MP_OKAY) && (a->used + 1 > r->size)) {
6389
         err = MP_VAL;
6390
    }
6391
    if (err == MP_OKAY) {
6392
        /* Positive only so just use internal function. */
6393
        err = _sp_add_d(a, d, r);
6394
    }
6395
#else
6396
    /* Check for space in result especially when carry adds a new word. */
6397
49.3k
    if ((err == MP_OKAY) && (a->sign == MP_ZPOS) && (a->used + 1 > r->size)) {
6398
11
         err = MP_VAL;
6399
11
    }
6400
    /* Check for space in result - no carry but borrow possible. */
6401
49.3k
    if ((err == MP_OKAY) && (a->sign == MP_NEG) && (a->used > r->size)) {
6402
11
         err = MP_VAL;
6403
11
    }
6404
49.3k
    if (err == MP_OKAY) {
6405
49.2k
        if (a->sign == MP_ZPOS) {
6406
            /* Positive, so use internal function. */
6407
49.0k
            r->sign = MP_ZPOS;
6408
49.0k
            err = _sp_add_d(a, d, r);
6409
49.0k
        }
6410
203
        else if ((a->used > 1) || (a->dp[0] > d)) {
6411
            /* Negative value bigger than digit so subtract digit. */
6412
111
            r->sign = MP_NEG;
6413
111
            _sp_sub_d(a, d, r);
6414
111
        }
6415
92
        else {
6416
            /* Negative value smaller or equal to digit. */
6417
92
            r->sign = MP_ZPOS;
6418
            /* Subtract negative value from digit. */
6419
92
            r->dp[0] = d - a->dp[0];
6420
            /* Result is a digit equal to or greater than zero. */
6421
92
            r->used = (r->dp[0] > 0);
6422
92
        }
6423
49.2k
    }
6424
49.3k
#endif
6425
6426
49.3k
    return err;
6427
49.3k
}
6428
#endif /* WOLFSSL_SP_ADD_D */
6429
6430
#ifdef WOLFSSL_SP_SUB_D
6431
/* Sub a one digit number from the multi-precision number.
6432
 *
6433
 * @param  [in]   a  SP integer to be subtracted from.
6434
 * @param  [in]   d  Digit to subtract.
6435
 * @param  [out]  r  SP integer to store result in.
6436
 *
6437
 * @return  MP_OKAY on success.
6438
 * @return  MP_VAL when a or r is NULL.
6439
 */
6440
int sp_sub_d(const sp_int* a, sp_int_digit d, sp_int* r)
6441
38.0k
{
6442
38.0k
    int err = MP_OKAY;
6443
6444
    /* Check validity of parameters. */
6445
38.0k
    if ((a == NULL) || (r == NULL)) {
6446
0
        err = MP_VAL;
6447
0
    }
6448
#ifndef WOLFSSL_SP_INT_NEGATIVE
6449
    /* Check for space in result. */
6450
    if ((err == MP_OKAY) && (a->used > r->size)) {
6451
         err = MP_VAL;
6452
    }
6453
    if (err == MP_OKAY) {
6454
        /* Positive only so just use internal function. */
6455
        _sp_sub_d(a, d, r);
6456
    }
6457
#else
6458
    /* Check for space in result especially when carry adds a new word. */
6459
38.0k
    if ((err == MP_OKAY) && (a->sign == MP_NEG) && (a->used + 1 > r->size)) {
6460
9
         err = MP_VAL;
6461
9
    }
6462
    /* Check for space in result - no carry but borrow possible. */
6463
38.0k
    if ((err == MP_OKAY) && (a->sign == MP_ZPOS) && (a->used > r->size)) {
6464
6
         err = MP_VAL;
6465
6
    }
6466
38.0k
    if (err == MP_OKAY) {
6467
37.9k
        if (a->sign == MP_NEG) {
6468
            /* Subtracting from negative use internal add. */
6469
118
            r->sign = MP_NEG;
6470
118
            err = _sp_add_d(a, d, r);
6471
118
        }
6472
37.8k
        else if ((a->used > 1) || (a->dp[0] >= d)) {
6473
            /* Positive number greater than or equal to digit - subtract digit.
6474
             */
6475
37.8k
            r->sign = MP_ZPOS;
6476
37.8k
            _sp_sub_d(a, d, r);
6477
37.8k
        }
6478
16
        else {
6479
            /* Positive value smaller than digit. */
6480
16
            r->sign = MP_NEG;
6481
            /* Subtract positive value from digit. */
6482
16
            r->dp[0] = d - a->dp[0];
6483
            /* Result is a single digit greater than zero. */
6484
16
            r->used = 1;
6485
16
        }
6486
37.9k
    }
6487
38.0k
#endif
6488
6489
38.0k
    return err;
6490
38.0k
}
6491
#endif /* WOLFSSL_SP_SUB_D */
6492
6493
#if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
6494
    defined(WOLFSSL_SP_SMALL) && (defined(WOLFSSL_SP_MATH_ALL) || \
6495
    !defined(NO_DH) || defined(HAVE_ECC) || \
6496
    (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY) && \
6497
     !defined(WOLFSSL_RSA_PUBLIC_ONLY))) || \
6498
    (defined(WOLFSSL_KEY_GEN) && !defined(NO_RSA)) || \
6499
    defined(WOLFSSL_SP_MUL_D)
6500
/* Multiply a by digit d and put result into r shifting up o digits.
6501
 *   r = (a * d) << (o * SP_WORD_SIZE)
6502
 *
6503
 * @param  [in]   a  SP integer to be multiplied.
6504
 * @param  [in]   d  SP digit to multiply by.
6505
 * @param  [out]  r  SP integer result.
6506
 * @param  [in]   o  Number of digits to move result up by.
6507
 * @return  MP_OKAY on success.
6508
 * @return  MP_VAL when result is too large for sp_int.
6509
 */
6510
static int _sp_mul_d(const sp_int* a, sp_int_digit d, sp_int* r, unsigned int o)
6511
6.38M
{
6512
6.38M
    int err = MP_OKAY;
6513
6.38M
    unsigned int i;
6514
#ifndef SQR_MUL_ASM
6515
    sp_int_word t = 0;
6516
#else
6517
6.38M
    sp_int_digit l = 0;
6518
6.38M
    sp_int_digit h = 0;
6519
6.38M
#endif
6520
6521
#ifdef WOLFSSL_SP_SMALL
6522
    /* Zero out offset words. */
6523
    for (i = 0; i < o; i++) {
6524
        r->dp[i] = 0;
6525
    }
6526
#else
6527
    /* Don't use the offset. Only when doing small code size div. */
6528
6.38M
    (void)o;
6529
6.38M
#endif
6530
6531
    /* Multiply each word of a by d. */
6532
254M
    for (i = 0; i < a->used; i++, o++) {
6533
    #ifndef SQR_MUL_ASM
6534
        /* Add product to top word of previous result. */
6535
        t += (sp_int_word)a->dp[i] * d;
6536
        /* Store low word. */
6537
        r->dp[o] = (sp_int_digit)t;
6538
        /* Move top word down. */
6539
        t >>= SP_WORD_SIZE;
6540
    #else
6541
        /* Multiply and add into low and high from previous result.
6542
         * No overflow is possible with add. */
6543
247M
        SP_ASM_MUL_ADD_NO(l, h, a->dp[i], d);
6544
        /* Store low word. */
6545
247M
        r->dp[o] = l;
6546
        /* Move high word into low word and set high word to 0. */
6547
247M
        l = h;
6548
247M
        h = 0;
6549
247M
    #endif
6550
247M
    }
6551
6552
    /* Check whether new word to be appended to result. */
6553
#ifndef SQR_MUL_ASM
6554
    if (t > 0)
6555
#else
6556
6.38M
    if (l > 0)
6557
677k
#endif
6558
677k
    {
6559
        /* Validate space available in result. */
6560
677k
        if (o == r->size) {
6561
114
            err = MP_VAL;
6562
114
        }
6563
677k
        else {
6564
            /* Store new top word. */
6565
        #ifndef SQR_MUL_ASM
6566
            r->dp[o++] = (sp_int_digit)t;
6567
        #else
6568
677k
            r->dp[o++] = l;
6569
677k
        #endif
6570
677k
        }
6571
677k
    }
6572
    /* Update number of words in result. */
6573
6.38M
    r->used = (sp_size_t)o;
6574
    /* In case d is zero. */
6575
6.38M
    sp_clamp(r);
6576
6577
6.38M
    return err;
6578
6.38M
}
6579
#endif /* (WOLFSSL_SP_MATH_ALL && !WOLFSSL_RSA_VERIFY_ONLY) ||
6580
        *  WOLFSSL_SP_SMALL || (WOLFSSL_KEY_GEN && !NO_RSA) */
6581
6582
#ifdef WOLFSSL_SP_MUL_D
6583
/* Multiply a by digit d and put result into r. r = a * d
6584
 *
6585
 * @param  [in]   a  SP integer to multiply.
6586
 * @param  [in]   d  Digit to multiply by.
6587
 * @param  [out]  r  SP integer to hold result.
6588
 *
6589
 * @return  MP_OKAY on success.
6590
 * @return  MP_VAL when a or r is NULL, or the result does not fit in r.
6591
 */
6592
int sp_mul_d(const sp_int* a, sp_int_digit d, sp_int* r)
6593
236
{
6594
236
    int err = MP_OKAY;
6595
6596
    /* Validate parameters. */
6597
236
    if ((a == NULL) || (r == NULL)) {
6598
0
        err = MP_VAL;
6599
0
    }
6600
    /* Check space for product result - _sp_mul_d checks when new word added. */
6601
236
    if ((err == MP_OKAY) && (a->used > r->size)) {
6602
4
        err = MP_VAL;
6603
4
    }
6604
6605
236
    if (err == MP_OKAY) {
6606
232
        err = _sp_mul_d(a, d, r, 0);
6607
232
    #ifdef WOLFSSL_SP_INT_NEGATIVE
6608
        /* Update sign. */
6609
232
        if (d == 0) {
6610
32
            r->sign = MP_ZPOS;
6611
32
        }
6612
200
        else {
6613
200
            r->sign = a->sign;
6614
200
        }
6615
232
    #endif
6616
232
    }
6617
6618
236
    return err;
6619
236
}
6620
#endif /* WOLFSSL_SP_MUL_D */
6621
6622
/* Predefine complicated rules of when to compile in sp_div_d and sp_mod_d. */
6623
#if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
6624
    defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) || \
6625
    defined(OPENSSL_EXTRA) || defined(WC_MP_TO_RADIX)
6626
#define WOLFSSL_SP_DIV_D
6627
#endif
6628
#if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
6629
    !defined(NO_DH) || \
6630
    (defined(HAVE_ECC) && (defined(FP_ECC) || defined(HAVE_COMP_KEY))) || \
6631
    (!defined(NO_RSA) && defined(WOLFSSL_KEY_GEN))
6632
#define WOLFSSL_SP_MOD_D
6633
#endif
6634
6635
#if (defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_DH) || defined(HAVE_ECC) || \
6636
     (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY) && \
6637
      !defined(WOLFSSL_RSA_PUBLIC_ONLY))) || \
6638
    defined(WOLFSSL_SP_DIV_D) || defined(WOLFSSL_SP_MOD_D)
6639
#ifndef SP_ASM_DIV_WORD
6640
/* Divide a two digit number by a digit number and return. (hi | lo) / d
6641
 *
6642
 * @param  [in]  hi  SP integer digit. High digit of the dividend.
6643
 * @param  [in]  lo  SP integer digit. Lower digit of the dividend.
6644
 * @param  [in]  d   SP integer digit. Number to divide by.
6645
 * @return  The division result.
6646
 */
6647
static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
6648
    sp_int_digit d)
6649
{
6650
#ifdef WOLFSSL_SP_DIV_WORD_HALF
6651
    sp_int_digit r;
6652
6653
    /* Trial division using half of the bits in d. */
6654
6655
    /* Check for shortcut when no high word set. */
6656
    if (hi == 0) {
6657
        r = lo / d;
6658
    }
6659
    else {
6660
        /* Half the bits of d. */
6661
        sp_int_digit divh = d >> SP_HALF_SIZE;
6662
        /* Number to divide in one value. */
6663
        sp_int_word w = ((sp_int_word)hi << SP_WORD_SIZE) | lo;
6664
        sp_int_word trial;
6665
        sp_int_digit r2;
6666
6667
        /* Calculation for top SP_WORD_SIZE / 2 bits of dividend. */
6668
        /* Divide high word by top half of divisor. */
6669
        r = hi / divh;
6670
        /* When result too big then assume only max value. */
6671
        if (r > SP_HALF_MAX) {
6672
            r = SP_HALF_MAX;
6673
        }
6674
        /* Shift up result for trial division calculation. */
6675
        r <<= SP_HALF_SIZE;
6676
        /* Calculate trial value. */
6677
        trial = r * (sp_int_word)d;
6678
        /* Decrease r while trial is too big. */
6679
        while (trial > w) {
6680
            r -= (sp_int_digit)1 << SP_HALF_SIZE;
6681
            trial -= (sp_int_word)d << SP_HALF_SIZE;
6682
        }
6683
        /* Subtract trial. */
6684
        w -= trial;
6685
6686
        /* Calculation for remaining second SP_WORD_SIZE / 2 bits. */
6687
        /* Divide top SP_WORD_SIZE of remainder by top half of divisor. */
6688
        r2 = ((sp_int_digit)(w >> SP_HALF_SIZE)) / divh;
6689
        /* Calculate trial value. */
6690
        trial = r2 * (sp_int_word)d;
6691
        /* Decrease r2 while trial is too big. */
6692
        while (trial > w) {
6693
            r2--;
6694
            trial -= d;
6695
        }
6696
        /* Subtract trial. */
6697
        w -= trial;
6698
        /* Update result. */
6699
        r += r2;
6700
6701
        /* Calculation for remaining bottom SP_WORD_SIZE bits. */
6702
        r2 = ((sp_int_digit)w) / d;
6703
        /* Update result. */
6704
        r += r2;
6705
    }
6706
6707
    return r;
6708
#else
6709
    sp_int_word w;
6710
    sp_int_digit r;
6711
6712
    /* Use built-in divide. */
6713
    w = ((sp_int_word)hi << SP_WORD_SIZE) | lo;
6714
    w /= d;
6715
    r = (sp_int_digit)w;
6716
6717
    return r;
6718
#endif /* WOLFSSL_SP_DIV_WORD_HALF */
6719
}
6720
#endif /* !SP_ASM_DIV_WORD */
6721
#endif /* WOLFSSL_SP_MATH_ALL || !NO_DH || HAVE_ECC ||
6722
        * (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) */
6723
6724
#if (defined(WOLFSSL_SP_DIV_D) || defined(WOLFSSL_SP_MOD_D)) && \
6725
    !defined(WOLFSSL_SP_SMALL)
6726
6727
#if SP_WORD_SIZE == 64
6728
    /* 2^64 / 3 */
6729
    #define SP_DIV_3_CONST      0x5555555555555555L
6730
    /* 2^64 / 10 */
6731
106M
    #define SP_DIV_10_CONST     0x1999999999999999L
6732
#elif SP_WORD_SIZE == 32
6733
    /* 2^32 / 3 */
6734
    #define SP_DIV_3_CONST      0x55555555
6735
    /* 2^32 / 10 */
6736
    #define SP_DIV_10_CONST     0x19999999
6737
#elif SP_WORD_SIZE == 16
6738
    /* 2^16 / 3 */
6739
    #define SP_DIV_3_CONST      0x5555
6740
    /* 2^16 / 10 */
6741
    #define SP_DIV_10_CONST     0x1999
6742
#elif SP_WORD_SIZE == 8
6743
    /* 2^8 / 3 */
6744
    #define SP_DIV_3_CONST      0x55
6745
    /* 2^8 / 10 */
6746
    #define SP_DIV_10_CONST     0x19
6747
#endif
6748
6749
#if !defined(WOLFSSL_SP_SMALL) && (SP_WORD_SIZE < 64)
6750
/* Divide by 3: r = a / 3 and rem = a % 3
6751
 *
6752
 * Used in checking prime: (a % 3) == 0?.
6753
 *
6754
 * @param  [in]   a    SP integer to be divided.
6755
 * @param  [out]  r    SP integer that is the quotient. May be NULL.
6756
 * @param  [out]  rem  Remainder digit. May be NULL only when r is not NULL.
6757
 */
6758
static void _sp_div_3(const sp_int* a, sp_int* r, sp_int_digit* rem)
6759
{
6760
#ifndef SQR_MUL_ASM
6761
    sp_int_word t;
6762
    sp_int_digit tt;
6763
#else
6764
    sp_int_digit l = 0;
6765
    sp_int_digit tt = 0;
6766
    sp_int_digit t = SP_DIV_3_CONST;
6767
    sp_int_digit lm = 0;
6768
    sp_int_digit hm = 0;
6769
#endif
6770
    sp_int_digit tr = 0;
6771
    /* Quotient fixup. */
6772
    static const unsigned char sp_r6[6] = { 0, 0, 0, 1, 1, 1 };
6773
    /* Remainder fixup. */
6774
    static const unsigned char sp_rem6[6] = { 0, 1, 2, 0, 1, 2 };
6775
6776
    /* Check whether only mod value needed. */
6777
    if (r == NULL) {
6778
        unsigned int i;
6779
6780
        /*    2^2 mod 3 = 4 mod 3 = 1.
6781
         * => 2^(2*n) mod 3 = (2^2 mod 3)^n mod 3 = 1^n mod 3 = 1
6782
         * => (2^(2*n) * x) mod 3 = (2^(2*n) mod 3) * (x mod 3) = x mod 3
6783
         *
6784
         * Calculate mod 3 on sum of digits as SP_WORD_SIZE is a multiple of 2.
6785
         */
6786
    #ifndef SQR_MUL_ASM
6787
        t = 0;
6788
        /* Sum the digits. */
6789
        for (i = 0; i < a->used; i++) {
6790
            t += a->dp[i];
6791
        }
6792
        /* Sum digits of sum. */
6793
        t = (t >> SP_WORD_SIZE) + (t & SP_MASK);
6794
        /* Get top digit after multiplying by (2^SP_WORD_SIZE) / 3. */
6795
        tt = (sp_int_digit)((t * SP_DIV_3_CONST) >> SP_WORD_SIZE);
6796
        /* Subtract trial division. */
6797
        tr = (sp_int_digit)(t - (sp_int_word)tt * 3);
6798
    #else
6799
        /* Sum the digits. */
6800
        for (i = 0; i < a->used; i++) {
6801
            SP_ASM_ADDC_REG(l, tr, a->dp[i]);
6802
        }
6803
        /* Sum digits of sum - can get carry. */
6804
        SP_ASM_ADDC_REG(l, tt, tr);
6805
        /* Multiply digit by (2^SP_WORD_SIZE) / 3. */
6806
        SP_ASM_MUL(lm, hm, l, t);
6807
        /* Add remainder multiplied by (2^SP_WORD_SIZE) / 3 to top digit. */
6808
        hm += tt * SP_DIV_3_CONST;
6809
        /* Subtract trial division from digit. */
6810
        tr = l - (hm * 3);
6811
    #endif
6812
        /* tr is 0..5 but need 0..2 */
6813
        /* Fix up remainder. */
6814
        tr = sp_rem6[tr];
6815
        *rem = tr;
6816
    }
6817
    /* At least result needed - remainder is calculated anyway. */
6818
    else {
6819
        int i;
6820
6821
        /* Divide starting at most significant word down to least. */
6822
        for (i = (int)(a->used - 1); i >= 0; i--) {
6823
    #ifndef SQR_MUL_ASM
6824
            /* Combine remainder from last operation with this word. */
6825
            t = ((sp_int_word)tr << SP_WORD_SIZE) | a->dp[i];
6826
            /* Get top digit after multiplying by (2^SP_WORD_SIZE) / 3. */
6827
            tt = (sp_int_digit)((t * SP_DIV_3_CONST) >> SP_WORD_SIZE);
6828
            /* Subtract trial division. */
6829
            tr = (sp_int_digit)(t - (sp_int_word)tt * 3);
6830
    #else
6831
            /* Multiply digit by (2^SP_WORD_SIZE) / 3. */
6832
            SP_ASM_MUL(l, tt, a->dp[i], t);
6833
            /* Add remainder multiplied by (2^SP_WORD_SIZE) / 3 to top digit. */
6834
            tt += tr * SP_DIV_3_CONST;
6835
            /* Subtract trial division from digit. */
6836
            tr = a->dp[i] - (tt * 3);
6837
    #endif
6838
            /* tr is 0..5 but need 0..2 */
6839
            /* Fix up result. */
6840
            tt += sp_r6[tr];
6841
            /* Fix up remainder. */
6842
            tr = sp_rem6[tr];
6843
            /* Store result of digit divided by 3. */
6844
            r->dp[i] = tt;
6845
        }
6846
6847
        /* Set the used amount to maximal amount. */
6848
        r->used = a->used;
6849
        /* Remove leading zeros. */
6850
        sp_clamp(r);
6851
        /* Return remainder if required. */
6852
        if (rem != NULL) {
6853
            *rem = tr;
6854
        }
6855
    }
6856
}
6857
#endif /* !WOLFSSL_SP_SMALL && (SP_WORD_SIZE < 64) */
6858
6859
/* Divide by 10: r = a / 10 and rem = a % 10
6860
 *
6861
 * Used when writing with a radix of 10 - decimal number.
6862
 *
6863
 * @param  [in]   a    SP integer to be divided.
6864
 * @param  [out]  r    SP integer that is the quotient. May be NULL.
6865
 * @param  [out]  rem  Remainder digit. May be NULL only when r is not NULL.
6866
 */
6867
static void _sp_div_10(const sp_int* a, sp_int* r, sp_int_digit* rem)
6868
5.42M
{
6869
5.42M
    int i;
6870
#ifndef SQR_MUL_ASM
6871
    sp_int_word t;
6872
    sp_int_digit tt;
6873
#else
6874
5.42M
    sp_int_digit l = 0;
6875
5.42M
    sp_int_digit tt = 0;
6876
5.42M
    sp_int_digit t = SP_DIV_10_CONST;
6877
5.42M
#endif
6878
5.42M
    sp_int_digit tr = 0;
6879
6880
    /* Check whether only mod value needed. */
6881
5.42M
    if (r == NULL) {
6882
        /* Divide starting at most significant word down to least. */
6883
664
        for (i = (int)(a->used - 1); i >= 0; i--) {
6884
    #ifndef SQR_MUL_ASM
6885
            /* Combine remainder from last operation with this word. */
6886
            t = ((sp_int_word)tr << SP_WORD_SIZE) | a->dp[i];
6887
            /* Get top digit after multiplying by (2^SP_WORD_SIZE) / 10. */
6888
            tt = (sp_int_digit)((t * SP_DIV_10_CONST) >> SP_WORD_SIZE);
6889
            /* Subtract trial division. */
6890
            tr = (sp_int_digit)(t - (sp_int_word)tt * 10);
6891
    #else
6892
            /* Multiply digit by (2^SP_WORD_SIZE) / 10. */
6893
583
            SP_ASM_MUL(l, tt, a->dp[i], t);
6894
            /* Add remainder multiplied by (2^SP_WORD_SIZE) / 10 to top digit.
6895
             */
6896
583
            tt += tr * SP_DIV_10_CONST;
6897
            /* Subtract trial division from digit. */
6898
583
            tr = a->dp[i] - (tt * 10);
6899
583
    #endif
6900
            /* tr is 0..99 but need 0..9 */
6901
            /* Fix up remainder. */
6902
583
            tr = tr % 10;
6903
583
        }
6904
81
        *rem = tr;
6905
81
    }
6906
    /* At least result needed - remainder is calculated anyway. */
6907
5.42M
    else {
6908
        /* Divide starting at most significant word down to least. */
6909
106M
        for (i = (int)(a->used - 1); i >= 0; i--) {
6910
    #ifndef SQR_MUL_ASM
6911
            /* Combine remainder from last operation with this word. */
6912
            t = ((sp_int_word)tr << SP_WORD_SIZE) | a->dp[i];
6913
            /* Get top digit after multiplying by (2^SP_WORD_SIZE) / 10. */
6914
            tt = (sp_int_digit)((t * SP_DIV_10_CONST) >> SP_WORD_SIZE);
6915
            /* Subtract trial division. */
6916
            tr = (sp_int_digit)(t - (sp_int_word)tt * 10);
6917
    #else
6918
            /* Multiply digit by (2^SP_WORD_SIZE) / 10. */
6919
100M
            SP_ASM_MUL(l, tt, a->dp[i], t);
6920
            /* Add remainder multiplied by (2^SP_WORD_SIZE) / 10 to top digit.
6921
             */
6922
100M
            tt += tr * SP_DIV_10_CONST;
6923
            /* Subtract trial division from digit. */
6924
100M
            tr = a->dp[i] - (tt * 10);
6925
100M
    #endif
6926
            /* tr is 0..99 but need 0..9 */
6927
            /* Fix up result. */
6928
100M
            tt += tr / 10;
6929
            /* Fix up remainder. */
6930
100M
            tr %= 10;
6931
            /* Store result of digit divided by 10. */
6932
100M
            r->dp[i] = tt;
6933
100M
        }
6934
6935
        /* Set the used amount to maximal amount. */
6936
5.42M
        r->used = a->used;
6937
        /* Remove leading zeros. */
6938
5.42M
        sp_clamp(r);
6939
        /* Return remainder if required. */
6940
5.42M
        if (rem != NULL) {
6941
5.42M
            *rem = tr;
6942
5.42M
        }
6943
5.42M
    }
6944
5.42M
}
6945
#endif /* (WOLFSSL_SP_DIV_D || WOLFSSL_SP_MOD_D) && !WOLFSSL_SP_SMALL */
6946
6947
#if defined(WOLFSSL_SP_DIV_D) || defined(WOLFSSL_SP_MOD_D)
6948
/* Divide by small number: r = a / d and rem = a % d
6949
 *
6950
 * @param  [in]   a    SP integer to be divided.
6951
 * @param  [in]   d    Digit to divide by.
6952
 * @param  [out]  r    SP integer that is the quotient. May be NULL.
6953
 * @param  [out]  rem  SP integer that is the remainder. May be NULL.
6954
 */
6955
static void _sp_div_small(const sp_int* a, sp_int_digit d, sp_int* r,
6956
    sp_int_digit* rem)
6957
19.1k
{
6958
19.1k
    int i;
6959
#ifndef SQR_MUL_ASM
6960
    sp_int_word t;
6961
    sp_int_digit tt;
6962
#else
6963
19.1k
    sp_int_digit l = 0;
6964
19.1k
    sp_int_digit tt = 0;
6965
19.1k
#endif
6966
19.1k
    sp_int_digit tr = 0;
6967
19.1k
    sp_int_digit m = SP_DIGIT_MAX / d;
6968
6969
19.1k
#ifndef WOLFSSL_SP_SMALL
6970
    /* Check whether only mod value needed. */
6971
19.1k
    if (r == NULL) {
6972
        /* Divide starting at most significant word down to least. */
6973
692k
        for (i = (int)(a->used - 1); i >= 0; i--) {
6974
        #ifndef SQR_MUL_ASM
6975
            /* Combine remainder from last operation with this word. */
6976
            t = ((sp_int_word)tr << SP_WORD_SIZE) | a->dp[i];
6977
            /* Get top digit after multiplying. */
6978
            tt = (sp_int_digit)((t * m) >> SP_WORD_SIZE);
6979
            /* Subtract trial division. */
6980
            tr = (sp_int_digit)t - (sp_int_digit)(tt * d);
6981
        #else
6982
            /* Multiply digit. */
6983
673k
            SP_ASM_MUL(l, tt, a->dp[i], m);
6984
            /* Add multiplied remainder to top digit. */
6985
673k
            tt += tr * m;
6986
            /* Subtract trial division from digit. */
6987
673k
            tr = a->dp[i] - (tt * d);
6988
673k
        #endif
6989
            /* tr < d * d */
6990
            /* Fix up remainder. */
6991
673k
            tr = tr % d;
6992
673k
        }
6993
19.0k
        *rem = tr;
6994
19.0k
    }
6995
    /* At least result needed - remainder is calculated anyway. */
6996
103
    else
6997
103
#endif /* !WOLFSSL_SP_SMALL */
6998
103
    {
6999
        /* Divide starting at most significant word down to least. */
7000
1.11k
        for (i = (int)(a->used - 1); i >= 0; i--) {
7001
        #ifndef SQR_MUL_ASM
7002
            /* Combine remainder from last operation with this word. */
7003
            t = ((sp_int_word)tr << SP_WORD_SIZE) | a->dp[i];
7004
            /* Get top digit after multiplying. */
7005
            tt = (sp_int_digit)((t * m) >> SP_WORD_SIZE);
7006
            /* Subtract trial division. */
7007
            tr = (sp_int_digit)t - (sp_int_digit)(tt * d);
7008
        #else
7009
            /* Multiply digit. */
7010
1.01k
            SP_ASM_MUL(l, tt, a->dp[i], m);
7011
            /* Add multiplied remainder to top digit. */
7012
1.01k
            tt += tr * m;
7013
            /* Subtract trial division from digit. */
7014
1.01k
            tr = a->dp[i] - (tt * d);
7015
1.01k
        #endif
7016
            /* tr < d * d */
7017
            /* Fix up result. */
7018
1.01k
            tt += tr / d;
7019
            /* Fix up remainder. */
7020
1.01k
            tr %= d;
7021
            /* Store result of dividing the digit. */
7022
        #ifdef WOLFSSL_SP_SMALL
7023
            if (r != NULL)
7024
        #endif
7025
1.01k
            {
7026
1.01k
                r->dp[i] = tt;
7027
1.01k
            }
7028
1.01k
        }
7029
7030
    #ifdef WOLFSSL_SP_SMALL
7031
        if (r != NULL)
7032
    #endif
7033
103
        {
7034
            /* Set the used amount to maximal amount. */
7035
103
            r->used = a->used;
7036
            /* Remove leading zeros. */
7037
103
            sp_clamp(r);
7038
103
        }
7039
        /* Return remainder if required. */
7040
103
        if (rem != NULL) {
7041
47
            *rem = tr;
7042
47
        }
7043
103
    }
7044
19.1k
}
7045
#endif
7046
7047
#ifdef WOLFSSL_SP_DIV_D
7048
/* Divide a multi-precision number by a digit size number and calculate
7049
 * remainder.
7050
 *   r = a / d; rem = a % d
7051
 *
7052
 * Use trial division algorithm.
7053
 *
7054
 * @param  [in]   a    SP integer to be divided.
7055
 * @param  [in]   d    Digit to divide by.
7056
 * @param  [out]  r    SP integer that is the quotient. May be NULL.
7057
 * @param  [out]  rem  Digit that is the remainder. May be NULL.
7058
 */
7059
static void _sp_div_d(const sp_int* a, sp_int_digit d, sp_int* r,
7060
    sp_int_digit* rem)
7061
180
{
7062
180
    int i;
7063
#ifndef SQR_MUL_ASM
7064
    sp_int_word w = 0;
7065
#else
7066
180
    sp_int_digit l;
7067
180
    sp_int_digit h = 0;
7068
180
#endif
7069
180
    sp_int_digit t;
7070
7071
    /* Divide starting at most significant word down to least. */
7072
1.45k
    for (i = (int)(a->used - 1); i >= 0; i--) {
7073
    #ifndef SQR_MUL_ASM
7074
        /* Combine remainder from last operation with this word and divide. */
7075
        t = sp_div_word((sp_int_digit)w, a->dp[i], d);
7076
        /* Combine remainder from last operation with this word. */
7077
        w = (w << SP_WORD_SIZE) | a->dp[i];
7078
        /* Subtract to get modulo result. */
7079
        w -= (sp_int_word)t * d;
7080
    #else
7081
        /* Get current word. */
7082
1.27k
        l = a->dp[i];
7083
        /* Combine remainder from last operation with this word and divide. */
7084
1.27k
        t = sp_div_word(h, l, d);
7085
        /* Subtract to get modulo result. */
7086
1.27k
        h = l - t * d;
7087
1.27k
    #endif
7088
        /* Store result of dividing the digit. */
7089
1.27k
        if (r != NULL) {
7090
1.27k
            r->dp[i] = t;
7091
1.27k
        }
7092
1.27k
    }
7093
180
    if (r != NULL) {
7094
        /* Set the used amount to maximal amount. */
7095
180
        r->used = a->used;
7096
        /* Remove leading zeros. */
7097
180
        sp_clamp(r);
7098
180
    }
7099
7100
    /* Return remainder if required. */
7101
180
    if (rem != NULL) {
7102
    #ifndef SQR_MUL_ASM
7103
        *rem = (sp_int_digit)w;
7104
    #else
7105
81
        *rem = h;
7106
81
    #endif
7107
81
    }
7108
180
}
7109
7110
/* Divide a multi-precision number by a digit size number and calculate
7111
 * remainder.
7112
 *   r = a / d; rem = a % d
7113
 *
7114
 * @param  [in]   a    SP integer to be divided.
7115
 * @param  [in]   d    Digit to divide by.
7116
 * @param  [out]  r    SP integer that is the quotient. May be NULL.
7117
 * @param  [out]  rem  Digit that is the remainder. May be NULL.
7118
 *
7119
 * @return  MP_OKAY on success.
7120
 * @return  MP_VAL when a is NULL or d is 0.
7121
 */
7122
int sp_div_d(const sp_int* a, sp_int_digit d, sp_int* r, sp_int_digit* rem)
7123
2.64M
{
7124
2.64M
    int err = MP_OKAY;
7125
7126
    /* Validate parameters. */
7127
2.64M
    if ((a == NULL) || (d == 0)) {
7128
7
        err = MP_VAL;
7129
7
    }
7130
    /* Check space for maximal sized result. */
7131
2.64M
    if ((err == MP_OKAY) && (r != NULL) && (a->used > r->size)) {
7132
2
        err = MP_VAL;
7133
2
    }
7134
7135
2.64M
    if (err == MP_OKAY) {
7136
2.64M
#if !defined(WOLFSSL_SP_SMALL)
7137
    #if SP_WORD_SIZE < 64
7138
        if (d == 3) {
7139
            /* Fast implementation for divisor of 3. */
7140
            _sp_div_3(a, r, rem);
7141
        }
7142
        else
7143
    #endif
7144
2.64M
        if (d == 10) {
7145
            /* Fast implementation for divisor of 10 - sp_todecimal(). */
7146
2.64M
            _sp_div_10(a, r, rem);
7147
2.64M
        }
7148
127
        else
7149
127
#endif
7150
127
        if (d <= SP_HALF_MAX) {
7151
            /* For small divisors. */
7152
38
            _sp_div_small(a, d, r, rem);
7153
38
        }
7154
89
        else
7155
89
        {
7156
89
            _sp_div_d(a, d, r, rem);
7157
89
        }
7158
7159
2.64M
    #ifdef WOLFSSL_SP_INT_NEGATIVE
7160
2.64M
        if (r != NULL) {
7161
2.64M
            r->sign = a->sign;
7162
2.64M
        }
7163
2.64M
    #endif
7164
2.64M
    }
7165
7166
2.64M
    return err;
7167
2.64M
}
7168
#endif /* WOLFSSL_SP_DIV_D */
7169
7170
#ifdef WOLFSSL_SP_MOD_D
7171
/* Calculate a modulo the digit d into r: r = a mod d
7172
 *
7173
 * @param  [in]   a  SP integer to reduce.
7174
 * @param  [in]   d  Digit to that is the modulus.
7175
 * @param  [out]  r  Digit that is the result.
7176
 */
7177
static void _sp_mod_d(const sp_int* a, const sp_int_digit d, sp_int_digit* r)
7178
401k
{
7179
401k
    int i;
7180
#ifndef SQR_MUL_ASM
7181
    sp_int_word w = 0;
7182
#else
7183
401k
    sp_int_digit h = 0;
7184
401k
#endif
7185
7186
    /* Divide starting at most significant word down to least. */
7187
6.45M
    for (i = (int)(a->used - 1); i >= 0; i--) {
7188
    #ifndef SQR_MUL_ASM
7189
        /* Combine remainder from last operation with this word and divide. */
7190
        sp_int_digit t = sp_div_word((sp_int_digit)w, a->dp[i], d);
7191
        /* Combine remainder from last operation with this word. */
7192
        w = (w << SP_WORD_SIZE) | a->dp[i];
7193
        /* Subtract to get modulo result. */
7194
        w -= (sp_int_word)t * d;
7195
    #else
7196
        /* Combine remainder from last operation with this word and divide. */
7197
6.05M
        sp_int_digit t = sp_div_word(h, a->dp[i], d);
7198
        /* Subtract to get modulo result. */
7199
6.05M
        h = a->dp[i] - t * d;
7200
6.05M
    #endif
7201
6.05M
    }
7202
7203
    /* Return remainder. */
7204
#ifndef SQR_MUL_ASM
7205
    *r = (sp_int_digit)w;
7206
#else
7207
401k
    *r = h;
7208
401k
#endif
7209
401k
}
7210
7211
/* Calculate a modulo the digit d into r: r = a mod d
7212
 *
7213
 * @param  [in]   a  SP integer to reduce.
7214
 * @param  [in]   d  Digit to that is the modulus.
7215
 * @param  [out]  r  Digit that is the result.
7216
 *
7217
 * @return  MP_OKAY on success.
7218
 * @return  MP_VAL when a is NULL or d is 0.
7219
 */
7220
#if !defined(WOLFSSL_SP_MATH_ALL) && (!defined(HAVE_ECC) || \
7221
    !defined(HAVE_COMP_KEY)) && !defined(OPENSSL_EXTRA)
7222
static
7223
#endif /* !WOLFSSL_SP_MATH_ALL && (!HAVE_ECC || !HAVE_COMP_KEY) */
7224
int sp_mod_d(const sp_int* a, sp_int_digit d, sp_int_digit* r)
7225
150k
{
7226
150k
    int err = MP_OKAY;
7227
7228
    /* Validate parameters. */
7229
150k
    if ((a == NULL) || (r == NULL) || (d == 0)) {
7230
11
        err = MP_VAL;
7231
11
    }
7232
7233
#if 0
7234
    sp_print(a, "a");
7235
    sp_print_digit(d, "m");
7236
#endif
7237
7238
150k
    if (err == MP_OKAY) {
7239
        /* Check whether d is a power of 2. */
7240
150k
        if ((d & (d - 1)) == 0) {
7241
2.24k
            if (a->used == 0) {
7242
3
                *r = 0;
7243
3
            }
7244
2.24k
            else {
7245
2.24k
                *r = a->dp[0] & (d - 1);
7246
2.24k
            }
7247
2.24k
        }
7248
148k
#if !defined(WOLFSSL_SP_SMALL)
7249
    #if SP_WORD_SIZE < 64
7250
        else if (d == 3) {
7251
            /* Fast implementation for divisor of 3. */
7252
            _sp_div_3(a, NULL, r);
7253
        }
7254
    #endif
7255
148k
        else if (d == 10) {
7256
            /* Fast implementation for divisor of 10. */
7257
23
            _sp_div_10(a, NULL, r);
7258
23
        }
7259
148k
#endif
7260
148k
        else if (d <= SP_HALF_MAX) {
7261
            /* For small divisors. */
7262
114
            _sp_div_small(a, d, NULL, r);
7263
114
        }
7264
148k
        else {
7265
148k
            _sp_mod_d(a, d, r);
7266
148k
        }
7267
7268
150k
    #ifdef WOLFSSL_SP_INT_NEGATIVE
7269
150k
        if (a->sign == MP_NEG) {
7270
0
            *r = d - *r;
7271
0
        }
7272
150k
    #endif
7273
150k
    }
7274
7275
#if 0
7276
    sp_print_digit(*r, "rmod");
7277
#endif
7278
7279
150k
    return err;
7280
150k
}
7281
#endif /* WOLFSSL_SP_MOD_D */
7282
7283
#if defined(HAVE_ECC) || !defined(NO_DSA) || defined(OPENSSL_EXTRA) || \
7284
    (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY) && \
7285
     !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_SP_INVMOD)
7286
/* Divides a by 2 and stores in r: r = a >> 1
7287
 *
7288
 * @param  [in]   a  SP integer to divide.
7289
 * @param  [out]  r  SP integer to hold result.
7290
 */
7291
static void _sp_div_2(const sp_int* a, sp_int* r)
7292
25.9M
{
7293
25.9M
    int i;
7294
7295
    /* Shift down each word by 1 and include bottom bit of next at top. */
7296
199M
    for (i = 0; i < (int)a->used - 1; i++) {
7297
173M
        r->dp[i]  = a->dp[i] >> 1;
7298
173M
        r->dp[i] |= a->dp[i+1] << (SP_WORD_SIZE - 1);
7299
173M
    }
7300
    /* Last word only needs to be shifted down. */
7301
25.9M
    r->dp[i] = a->dp[i] >> 1;
7302
    /* Set used to be all words seen. */
7303
25.9M
    r->used = (sp_size_t)(i + 1);
7304
    /* Remove leading zeros. */
7305
25.9M
    sp_clamp(r);
7306
25.9M
#ifdef WOLFSSL_SP_INT_NEGATIVE
7307
    /* Same sign in result. */
7308
25.9M
    r->sign = a->sign;
7309
25.9M
#endif
7310
25.9M
}
7311
7312
#if defined(WOLFSSL_SP_MATH_ALL) && defined(HAVE_ECC)
7313
/* Divides a by 2 and stores in r: r = a >> 1
7314
 *
7315
 * @param  [in]   a  SP integer to divide.
7316
 * @param  [out]  r  SP integer to hold result.
7317
 *
7318
 * @return  MP_OKAY on success.
7319
 * @return  MP_VAL when a or r is NULL.
7320
 */
7321
int sp_div_2(const sp_int* a, sp_int* r)
7322
42.2k
{
7323
42.2k
    int err = MP_OKAY;
7324
7325
    /* Only when a public API. */
7326
42.2k
    if ((a == NULL) || (r == NULL)) {
7327
0
        err = MP_VAL;
7328
0
    }
7329
    /* Ensure maximal size is supported by result. */
7330
42.2k
    if ((err == MP_OKAY) && (a->used > r->size)) {
7331
3
        err = MP_VAL;
7332
3
    }
7333
7334
42.2k
    if (err == MP_OKAY) {
7335
42.2k
        _sp_div_2(a, r);
7336
42.2k
    }
7337
7338
42.2k
    return err;
7339
42.2k
}
7340
#endif /* WOLFSSL_SP_MATH_ALL && HAVE_ECC */
7341
#endif /* HAVE_ECC || !NO_DSA || OPENSSL_EXTRA ||
7342
        * (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) */
7343
7344
#if defined(WOLFSSL_SP_MATH_ALL) && defined(HAVE_ECC)
7345
/* Divides a by 2 mod m and stores in r: r = (a / 2) mod m
7346
 *
7347
 * r = a / 2 (mod m) - constant time (a < m and positive)
7348
 *
7349
 * @param  [in]   a  SP integer to divide.
7350
 * @param  [in]   m  SP integer that is modulus.
7351
 * @param  [out]  r  SP integer to hold result.
7352
 *
7353
 * @return  MP_OKAY on success.
7354
 * @return  MP_VAL when a, m or r is NULL.
7355
 */
7356
int sp_div_2_mod_ct(const sp_int* a, const sp_int* m, sp_int* r)
7357
13.9M
{
7358
13.9M
    int err = MP_OKAY;
7359
7360
    /* Validate parameters. */
7361
13.9M
    if ((a == NULL) || (m == NULL) || (r == NULL)) {
7362
0
        err = MP_VAL;
7363
0
    }
7364
    /* Check result has enough space for a + m. */
7365
13.9M
    if ((err == MP_OKAY) && (m->used + 1 > r->size)) {
7366
17
        err = MP_VAL;
7367
17
    }
7368
7369
13.9M
    if (err == MP_OKAY) {
7370
    #ifndef SQR_MUL_ASM
7371
        sp_int_word  w = 0;
7372
    #else
7373
13.9M
        sp_int_digit l = 0;
7374
13.9M
        sp_int_digit h;
7375
13.9M
        sp_int_digit t;
7376
13.9M
    #endif
7377
        /* Mask to apply to modulus. */
7378
13.9M
        volatile sp_int_digit mask = (sp_int_digit)0 - (a->dp[0] & 1);
7379
13.9M
        sp_size_t i;
7380
7381
    #if 0
7382
        sp_print(a, "a");
7383
        sp_print(m, "m");
7384
    #endif
7385
7386
        /* Add a to m, if a is odd, into r in constant time. */
7387
116M
        for (i = 0; i < m->used; i++) {
7388
            /* Mask to apply to a - set when used value at index. */
7389
102M
            volatile sp_int_digit mask_a = (sp_int_digit)0 - (i < a->used);
7390
7391
        #ifndef SQR_MUL_ASM
7392
            /* Conditionally add modulus. */
7393
            w         += m->dp[i] & mask;
7394
            /* Conditionally add a. */
7395
            w         += a->dp[i] & mask_a;
7396
            /* Store low digit in result. */
7397
            r->dp[i]   = (sp_int_digit)w;
7398
            /* Move high digit down. */
7399
            w        >>= DIGIT_BIT;
7400
        #else
7401
            /* No high digit. */
7402
102M
            h        = 0;
7403
            /* Conditionally use modulus. */
7404
102M
            t        = m->dp[i] & mask;
7405
            /* Add with carry modulus. */
7406
102M
            SP_ASM_ADDC_REG(l, h, t);
7407
            /* Conditionally use a. */
7408
102M
            t        = a->dp[i] & mask_a;
7409
            /* Add with carry a. */
7410
102M
            SP_ASM_ADDC_REG(l, h, t);
7411
            /* Store low digit in result. */
7412
102M
            r->dp[i] = l;
7413
            /* Move high digit down. */
7414
102M
            l        = h;
7415
102M
        #endif
7416
102M
        }
7417
        /* Store carry. */
7418
    #ifndef SQR_MUL_ASM
7419
        r->dp[i] = (sp_int_digit)w;
7420
    #else
7421
13.9M
        r->dp[i] = l;
7422
13.9M
    #endif
7423
        /* Used includes carry - set or not. */
7424
13.9M
        r->used = (sp_size_t)(i + 1);
7425
13.9M
    #ifdef WOLFSSL_SP_INT_NEGATIVE
7426
13.9M
        r->sign = MP_ZPOS;
7427
13.9M
    #endif
7428
        /* Divide conditional sum by 2. */
7429
13.9M
        _sp_div_2(r, r);
7430
7431
    #if 0
7432
        sp_print(r, "rd2");
7433
    #endif
7434
13.9M
    }
7435
7436
13.9M
    return err;
7437
13.9M
}
7438
#endif /* WOLFSSL_SP_MATH_ALL && HAVE_ECC */
7439
7440
/************************
7441
 * Add/Subtract Functions
7442
 ************************/
7443
7444
#if !defined(WOLFSSL_RSA_VERIFY_ONLY) || defined(WOLFSSL_SP_INVMOD)
7445
/* Add offset b to a into r: r = a + (b << (o * SP_WORD_SIZEOF))
7446
 *
7447
 * @param  [in]   a  SP integer to add to.
7448
 * @param  [in]   b  SP integer to add.
7449
 * @param  [out]  r  SP integer to store result in.
7450
 * @param  [in]   o  Number of digits to offset b.
7451
 */
7452
static void _sp_add_off(const sp_int* a, const sp_int* b, sp_int* r, int o)
7453
15.3M
{
7454
15.3M
    sp_size_t i = 0;
7455
#ifndef SQR_MUL_ASM
7456
    sp_int_word t = 0;
7457
#else
7458
15.3M
    sp_int_digit l = 0;
7459
15.3M
    sp_int_digit h = 0;
7460
15.3M
    sp_int_digit t = 0;
7461
15.3M
#endif
7462
7463
#ifdef SP_MATH_NEED_ADD_OFF
7464
    unsigned int j;
7465
7466
    /* Copy a into result up to offset. */
7467
    for (; (i < o) && (i < a->used); i++) {
7468
        r->dp[i] = a->dp[i];
7469
    }
7470
    /* Set result to 0 for digits beyonf those in a. */
7471
    for (; i < o; i++) {
7472
        r->dp[i] = 0;
7473
    }
7474
7475
    /* Add each digit from a and b where both have values. */
7476
    for (j = 0; (i < a->used) && (j < b->used); i++, j++) {
7477
    #ifndef SQR_MUL_ASM
7478
        t += a->dp[i];
7479
        t += b->dp[j];
7480
        r->dp[i] = (sp_int_digit)t;
7481
        t >>= SP_WORD_SIZE;
7482
    #else
7483
        t = a->dp[i];
7484
        SP_ASM_ADDC(l, h, t);
7485
        t = b->dp[j];
7486
        SP_ASM_ADDC(l, h, t);
7487
        r->dp[i] = l;
7488
        l = h;
7489
        h = 0;
7490
    #endif
7491
    }
7492
    /* Either a and/or b are out of digits. Add carry and remaining a digits. */
7493
    for (; i < a->used; i++) {
7494
    #ifndef SQR_MUL_ASM
7495
        t += a->dp[i];
7496
        r->dp[i] = (sp_int_digit)t;
7497
        t >>= SP_WORD_SIZE;
7498
    #else
7499
        t = a->dp[i];
7500
        SP_ASM_ADDC(l, h, t);
7501
        r->dp[i] = l;
7502
        l = h;
7503
        h = 0;
7504
    #endif
7505
    }
7506
    /* a is out of digits. Add carry and remaining b digits. */
7507
    for (; j < b->used; i++, j++) {
7508
    #ifndef SQR_MUL_ASM
7509
        t += b->dp[j];
7510
        r->dp[i] = (sp_int_digit)t;
7511
        t >>= SP_WORD_SIZE;
7512
    #else
7513
        t = b->dp[j];
7514
        SP_ASM_ADDC(l, h, t);
7515
        r->dp[i] = l;
7516
        l = h;
7517
        h = 0;
7518
    #endif
7519
    }
7520
#else
7521
15.3M
    (void)o;
7522
7523
    /* Add each digit from a and b where both have values. */
7524
218M
    for (; (i < a->used) && (i < b->used); i++) {
7525
    #ifndef SQR_MUL_ASM
7526
        t += a->dp[i];
7527
        t += b->dp[i];
7528
        r->dp[i] = (sp_int_digit)t;
7529
        t >>= SP_WORD_SIZE;
7530
    #else
7531
202M
        t = a->dp[i];
7532
202M
        SP_ASM_ADDC(l, h, t);
7533
202M
        t = b->dp[i];
7534
202M
        SP_ASM_ADDC(l, h, t);
7535
202M
        r->dp[i] = l;
7536
202M
        l = h;
7537
202M
        h = 0;
7538
202M
    #endif
7539
202M
    }
7540
    /* Either a and/or b are out of digits. Add carry and remaining a digits. */
7541
15.4M
    for (; i < a->used; i++) {
7542
    #ifndef SQR_MUL_ASM
7543
        t += a->dp[i];
7544
        r->dp[i] = (sp_int_digit)t;
7545
        t >>= SP_WORD_SIZE;
7546
    #else
7547
171k
        t = a->dp[i];
7548
171k
        SP_ASM_ADDC(l, h, t);
7549
171k
        r->dp[i] = l;
7550
171k
        l = h;
7551
171k
        h = 0;
7552
171k
    #endif
7553
171k
    }
7554
    /* a is out of digits. Add carry and remaining b digits. */
7555
22.0M
    for (; i < b->used; i++) {
7556
    #ifndef SQR_MUL_ASM
7557
        t += b->dp[i];
7558
        r->dp[i] = (sp_int_digit)t;
7559
        t >>= SP_WORD_SIZE;
7560
    #else
7561
6.75M
        t = b->dp[i];
7562
6.75M
        SP_ASM_ADDC(l, h, t);
7563
6.75M
        r->dp[i] = l;
7564
6.75M
        l = h;
7565
6.75M
        h = 0;
7566
6.75M
    #endif
7567
6.75M
    }
7568
15.3M
#endif
7569
7570
    /* Set used based on last digit put in. */
7571
15.3M
    r->used = i;
7572
    /* Put in carry. */
7573
#ifndef SQR_MUL_ASM
7574
    r->dp[i] = (sp_int_digit)t;
7575
    r->used = (sp_size_t)(r->used + (sp_size_t)(t != 0));
7576
#else
7577
15.3M
    r->dp[i] = l;
7578
15.3M
    r->used = (sp_size_t)(r->used + (sp_size_t)(l != 0));
7579
15.3M
#endif
7580
7581
    /* Remove leading zeros. */
7582
15.3M
    sp_clamp(r);
7583
15.3M
}
7584
#endif /* !WOLFSSL_RSA_VERIFY_ONLY || WOLFSSL_SP_INVMOD */
7585
7586
#if defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_SP_INT_NEGATIVE) || \
7587
    !defined(NO_DH) || defined(HAVE_ECC) || (!defined(NO_RSA) && \
7588
    !defined(WOLFSSL_RSA_VERIFY_ONLY))
7589
/* Sub offset b from a into r: r = a - (b << (o * SP_WORD_SIZEOF))
7590
 * a must be greater than b.
7591
 *
7592
 * When using offset, r == a is faster.
7593
 *
7594
 * @param  [in]   a  SP integer to subtract from.
7595
 * @param  [in]   b  SP integer to subtract.
7596
 * @param  [out]  r  SP integer to store result in.
7597
 * @param  [in]   o  Number of digits to offset b.
7598
 */
7599
static void _sp_sub_off(const sp_int* a, const sp_int* b, sp_int* r,
7600
    sp_size_t o)
7601
95.6M
{
7602
95.6M
    sp_size_t i = 0;
7603
95.6M
    sp_size_t j;
7604
#ifndef SQR_MUL_ASM
7605
    sp_int_sword t = 0;
7606
#else
7607
95.6M
    sp_int_digit l = 0;
7608
95.6M
    sp_int_digit h = 0;
7609
95.6M
#endif
7610
7611
    /* Need to copy digits up to offset into result. */
7612
95.6M
    if (r != a) {
7613
6.31M
        for (; (i < o) && (i < a->used); i++) {
7614
0
            r->dp[i] = a->dp[i];
7615
0
        }
7616
6.31M
    }
7617
89.3M
    else {
7618
89.3M
        i = o;
7619
89.3M
    }
7620
    /* Index to add at is the offset now. */
7621
7622
841M
    for (j = 0; (i < a->used) && (j < b->used); i++, j++) {
7623
    #ifndef SQR_MUL_ASM
7624
        /* Add a into and subtract b from current value. */
7625
        t += a->dp[i];
7626
        t -= b->dp[j];
7627
        /* Store low digit in result. */
7628
        r->dp[i] = (sp_int_digit)t;
7629
        /* Move high digit down. */
7630
        t >>= SP_WORD_SIZE;
7631
    #else
7632
        /* Add a into and subtract b from current value. */
7633
745M
        SP_ASM_ADDC(l, h, a->dp[i]);
7634
745M
        SP_ASM_SUBB(l, h, b->dp[j]);
7635
        /* Store low digit in result. */
7636
745M
        r->dp[i] = l;
7637
        /* Move high digit down. */
7638
745M
        l = h;
7639
        /* High digit is 0 when positive or -1 on negative. */
7640
745M
        h = (sp_int_digit)0 - (h >> (SP_WORD_SIZE - 1));
7641
745M
    #endif
7642
745M
    }
7643
144M
    for (; i < a->used; i++) {
7644
    #ifndef SQR_MUL_ASM
7645
        /* Add a into current value. */
7646
        t += a->dp[i];
7647
        /* Store low digit in result. */
7648
        r->dp[i] = (sp_int_digit)t;
7649
        /* Move high digit down. */
7650
        t >>= SP_WORD_SIZE;
7651
    #else
7652
        /* Add a into current value. */
7653
48.8M
        SP_ASM_ADDC(l, h, a->dp[i]);
7654
        /* Store low digit in result. */
7655
48.8M
        r->dp[i] = l;
7656
        /* Move high digit down. */
7657
48.8M
        l = h;
7658
        /* High digit is 0 when positive or -1 on negative. */
7659
48.8M
        h = (sp_int_digit)0 - (h >> (SP_WORD_SIZE - 1));
7660
48.8M
    #endif
7661
48.8M
    }
7662
7663
    /* Set used based on last digit put in. */
7664
95.6M
    r->used = i;
7665
    /* Remove leading zeros. */
7666
95.6M
    sp_clamp(r);
7667
95.6M
}
7668
#endif /* WOLFSSL_SP_MATH_ALL || WOLFSSL_SP_INT_NEGATIVE || !NO_DH ||
7669
        * HAVE_ECC || (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) */
7670
7671
#if !defined(WOLFSSL_RSA_VERIFY_ONLY) || defined(WOLFSSL_SP_INVMOD)
7672
/* Add b to a into r: r = a + b
7673
 *
7674
 * @param  [in]   a  SP integer to add to.
7675
 * @param  [in]   b  SP integer to add.
7676
 * @param  [out]  r  SP integer to store result in.
7677
 *
7678
 * @return  MP_OKAY on success.
7679
 * @return  MP_VAL when a, b, or r is NULL.
7680
 */
7681
int sp_add(const sp_int* a, const sp_int* b, sp_int* r)
7682
246k
{
7683
246k
    int err = MP_OKAY;
7684
7685
    /* Validate parameters. */
7686
246k
    if ((a == NULL) || (b == NULL) || (r == NULL)) {
7687
0
        err = MP_VAL;
7688
0
    }
7689
    /* Check that r as big as a and b plus one word. */
7690
246k
    if ((err == MP_OKAY) && ((a->used >= r->size) || (b->used >= r->size))) {
7691
39
        err = MP_VAL;
7692
39
    }
7693
7694
246k
    if (err == MP_OKAY) {
7695
    #ifndef WOLFSSL_SP_INT_NEGATIVE
7696
        /* Add two positive numbers. */
7697
        _sp_add_off(a, b, r, 0);
7698
    #else
7699
        /* Same sign then add absolute values and use sign. */
7700
246k
        if (a->sign == b->sign) {
7701
221k
            _sp_add_off(a, b, r, 0);
7702
221k
            r->sign = a->sign;
7703
221k
        }
7704
        /* Different sign and abs(a) >= abs(b). */
7705
24.9k
        else if (_sp_cmp_abs(a, b) != MP_LT) {
7706
            /* Subtract absolute values and use sign of a unless result 0. */
7707
239
            _sp_sub_off(a, b, r, 0);
7708
239
            if (sp_iszero(r)) {
7709
20
                r->sign = MP_ZPOS;
7710
20
            }
7711
219
            else {
7712
219
                r->sign = a->sign;
7713
219
            }
7714
239
        }
7715
        /* Different sign and abs(a) < abs(b). */
7716
24.7k
        else {
7717
            /* Reverse subtract absolute values and use sign of b. */
7718
24.7k
            _sp_sub_off(b, a, r, 0);
7719
24.7k
            r->sign = b->sign;
7720
24.7k
        }
7721
246k
    #endif
7722
246k
    }
7723
7724
246k
    return err;
7725
246k
}
7726
#endif /* !WOLFSSL_RSA_VERIFY_ONLY || WOLFSSL_SP_INVMOD */
7727
7728
#if defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_DH) || defined(HAVE_ECC) || \
7729
    (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY))
7730
/* Subtract b from a into r: r = a - b
7731
 *
7732
 * a must be greater than b unless WOLFSSL_SP_INT_NEGATIVE is defined.
7733
 *
7734
 * @param  [in]   a  SP integer to subtract from.
7735
 * @param  [in]   b  SP integer to subtract.
7736
 * @param  [out]  r  SP integer to store result in.
7737
 *
7738
 * @return  MP_OKAY on success.
7739
 * @return  MP_VAL when a, b, or r is NULL.
7740
 */
7741
int sp_sub(const sp_int* a, const sp_int* b, sp_int* r)
7742
15.3M
{
7743
15.3M
    int err = MP_OKAY;
7744
7745
    /* Validate parameters. */
7746
15.3M
    if ((a == NULL) || (b == NULL) || (r == NULL)) {
7747
0
        err = MP_VAL;
7748
0
    }
7749
    /* Check that r as big as a and b plus one word. */
7750
15.3M
    if ((err == MP_OKAY) && ((a->used >= r->size) || (b->used >= r->size))) {
7751
46
        err = MP_VAL;
7752
46
    }
7753
7754
15.3M
    if (err == MP_OKAY) {
7755
    #ifndef WOLFSSL_SP_INT_NEGATIVE
7756
        /* Subtract positive numbers b from a. */
7757
        _sp_sub_off(a, b, r, 0);
7758
    #else
7759
        /* Different sign. */
7760
15.3M
        if (a->sign != b->sign) {
7761
            /* Add absolute values and use sign of a. */
7762
9.71M
            _sp_add_off(a, b, r, 0);
7763
9.71M
            r->sign = a->sign;
7764
9.71M
        }
7765
        /* Same sign and abs(a) >= abs(b). */
7766
5.68M
        else if (_sp_cmp_abs(a, b) != MP_LT) {
7767
            /* Subtract absolute values and use sign of a unless result 0. */
7768
5.66M
            _sp_sub_off(a, b, r, 0);
7769
5.66M
            if (sp_iszero(r)) {
7770
560
                r->sign = MP_ZPOS;
7771
560
            }
7772
5.66M
            else {
7773
5.66M
                r->sign = a->sign;
7774
5.66M
            }
7775
5.66M
        }
7776
        /* Same sign and abs(a) < abs(b). */
7777
15.6k
        else {
7778
            /* Reverse subtract absolute values and use opposite sign of a */
7779
15.6k
            _sp_sub_off(b, a, r, 0);
7780
15.6k
            r->sign = 1 - a->sign;
7781
15.6k
        }
7782
15.3M
    #endif
7783
15.3M
    }
7784
7785
15.3M
    return err;
7786
15.3M
}
7787
#endif /* WOLFSSL_SP_MATH_ALL || !NO_DH || HAVE_ECC ||
7788
        * (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY)*/
7789
7790
/****************************
7791
 * Add/Subtract mod functions
7792
 ****************************/
7793
7794
#if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
7795
    (!defined(WOLFSSL_SP_MATH) && defined(WOLFSSL_CUSTOM_CURVES)) || \
7796
    defined(WOLFCRYPT_HAVE_ECCSI) || defined(WOLFCRYPT_HAVE_SAKKE)
7797
/* Add two value and reduce: r = (a + b) % m
7798
 *
7799
 * @param  [in]   a  SP integer to add.
7800
 * @param  [in]   b  SP integer to add with.
7801
 * @param  [in]   m  SP integer that is the modulus.
7802
 * @param  [out]  r  SP integer to hold result.
7803
 *
7804
 * @return  MP_OKAY on success.
7805
 * @return  MP_MEM when dynamic memory allocation fails.
7806
 */
7807
static int _sp_addmod(const sp_int* a, const sp_int* b, const sp_int* m,
7808
    sp_int* r)
7809
5.72k
{
7810
5.72k
    int err = MP_OKAY;
7811
    /* Calculate used based on digits used in a and b. */
7812
5.72k
    sp_size_t used = (sp_size_t)(((a->used >= b->used) ? a->used + 1U : b->used + 1U));
7813
5.72k
    DECL_SP_INT(t, used);
7814
7815
    /* Allocate a temporary SP int to hold sum. */
7816
5.72k
    ALLOC_SP_INT_SIZE(t, used, err, NULL);
7817
7818
5.72k
    if (err == MP_OKAY) {
7819
        /* Do sum. */
7820
5.69k
        err = sp_add(a, b, t);
7821
5.69k
    }
7822
5.72k
    if (err == MP_OKAY) {
7823
        /* Mod result. */
7824
5.69k
        err = sp_mod(t, m, r);
7825
5.69k
    }
7826
7827
5.72k
    FREE_SP_INT(t, NULL);
7828
5.72k
    return err;
7829
5.72k
}
7830
7831
/* Add two value and reduce: r = (a + b) % m
7832
 *
7833
 * @param  [in]   a  SP integer to add.
7834
 * @param  [in]   b  SP integer to add with.
7835
 * @param  [in]   m  SP integer that is the modulus.
7836
 * @param  [out]  r  SP integer to hold result.
7837
 *
7838
 * @return  MP_OKAY on success.
7839
 * @return  MP_VAL when a, b, m or r is NULL.
7840
 * @return  MP_MEM when dynamic memory allocation fails.
7841
 */
7842
int sp_addmod(const sp_int* a, const sp_int* b, const sp_int* m, sp_int* r)
7843
5.73k
{
7844
5.73k
    int err = MP_OKAY;
7845
7846
    /* Validate parameters. */
7847
5.73k
    if ((a == NULL) || (b == NULL) || (m == NULL) || (r == NULL)) {
7848
0
        err = MP_VAL;
7849
0
    }
7850
    /* Ensure a and b aren't too big a number to operate on. */
7851
5.73k
    else if (a->used >= SP_INT_DIGITS) {
7852
6
        err = MP_VAL;
7853
6
    }
7854
5.72k
    else if (b->used >= SP_INT_DIGITS) {
7855
6
        err = MP_VAL;
7856
6
    }
7857
7858
7859
#if 0
7860
    if (err == MP_OKAY) {
7861
        sp_print(a, "a");
7862
        sp_print(b, "b");
7863
        sp_print(m, "m");
7864
    }
7865
#endif
7866
5.73k
    if (err == MP_OKAY) {
7867
        /* Do add and modular reduction. */
7868
5.72k
        err = _sp_addmod(a, b, m, r);
7869
5.72k
    }
7870
#if 0
7871
    if (err == MP_OKAY) {
7872
        sp_print(r, "rma");
7873
    }
7874
#endif
7875
7876
5.73k
    return err;
7877
5.73k
}
7878
#endif /* WOLFSSL_SP_MATH_ALL || WOLFSSL_CUSTOM_CURVES) ||
7879
        * WOLFCRYPT_HAVE_ECCSI || WOLFCRYPT_HAVE_SAKKE */
7880
7881
#if defined(WOLFSSL_SP_MATH_ALL) && (!defined(WOLFSSL_RSA_VERIFY_ONLY) || \
7882
    defined(HAVE_ECC))
7883
/* Sub b from a and reduce: r = (a - b) % m
7884
 * Result is always positive.
7885
 *
7886
 * @param  [in]   a  SP integer to subtract from
7887
 * @param  [in]   b  SP integer to subtract.
7888
 * @param  [in]   m  SP integer that is the modulus.
7889
 * @param  [out]  r  SP integer to hold result.
7890
 *
7891
 * @return  MP_OKAY on success.
7892
 * @return  MP_MEM when dynamic memory allocation fails.
7893
 */
7894
static int _sp_submod(const sp_int* a, const sp_int* b, const sp_int* m,
7895
    sp_int* r)
7896
4.92M
{
7897
4.92M
    int err = MP_OKAY;
7898
#ifndef WOLFSSL_SP_INT_NEGATIVE
7899
    unsigned int used = ((a->used >= m->used) ?
7900
        ((a->used >= b->used) ? (a->used + 1U) : (b->used + 1U)) :
7901
        ((b->used >= m->used)) ? (b->used + 1U) : (m->used + 1U));
7902
    DECL_SP_INT(t0, used);
7903
    DECL_SP_INT(t1, used);
7904
7905
    ALLOC_SP_INT_SIZE(t0, used, err, NULL);
7906
    ALLOC_SP_INT_SIZE(t1, used, err, NULL);
7907
    if (err == MP_OKAY) {
7908
        /* Reduce a to less than m. */
7909
        if (_sp_cmp(a, m) != MP_LT) {
7910
            err = sp_mod(a, m, t0);
7911
            a = t0;
7912
        }
7913
    }
7914
    if (err == MP_OKAY) {
7915
        /* Reduce b to less than m. */
7916
        if (_sp_cmp(b, m) != MP_LT) {
7917
            err = sp_mod(b, m, t1);
7918
            b = t1;
7919
        }
7920
    }
7921
    if (err == MP_OKAY) {
7922
        /* Add m to a if a smaller than b. */
7923
        if (_sp_cmp(a, b) == MP_LT) {
7924
            err = sp_add(a, m, t0);
7925
            a = t0;
7926
        }
7927
    }
7928
    if (err == MP_OKAY) {
7929
        /* Subtract b from a. */
7930
        err = sp_sub(a, b, r);
7931
    }
7932
7933
    FREE_SP_INT(t0, NULL);
7934
    FREE_SP_INT(t1, NULL);
7935
#else /* WOLFSSL_SP_INT_NEGATIVE */
7936
4.92M
    sp_size_t used = ((a->used >= b->used) ? a->used + 1 : b->used + 1);
7937
4.92M
    DECL_SP_INT(t, used);
7938
7939
4.92M
    ALLOC_SP_INT_SIZE(t, used, err, NULL);
7940
    /* Subtract b from a into temporary. */
7941
4.92M
    if (err == MP_OKAY) {
7942
4.92M
        err = sp_sub(a, b, t);
7943
4.92M
    }
7944
4.92M
    if (err == MP_OKAY) {
7945
        /* Reduce result mod m into result. */
7946
4.92M
        err = sp_mod(t, m, r);
7947
4.92M
    }
7948
4.92M
    FREE_SP_INT(t, NULL);
7949
4.92M
#endif /* WOLFSSL_SP_INT_NEGATIVE */
7950
7951
4.92M
    return err;
7952
4.92M
}
7953
7954
/* Sub b from a and reduce: r = (a - b) % m
7955
 * Result is always positive.
7956
 *
7957
 * @param  [in]   a  SP integer to subtract from
7958
 * @param  [in]   b  SP integer to subtract.
7959
 * @param  [in]   m  SP integer that is the modulus.
7960
 * @param  [out]  r  SP integer to hold result.
7961
 *
7962
 * @return  MP_OKAY on success.
7963
 * @return  MP_VAL when a, b, m or r is NULL.
7964
 * @return  MP_MEM when dynamic memory allocation fails.
7965
 */
7966
int sp_submod(const sp_int* a, const sp_int* b, const sp_int* m, sp_int* r)
7967
6.21M
{
7968
6.21M
    int err = MP_OKAY;
7969
    /* Validate parameters. */
7970
6.21M
    if ((a == NULL) || (b == NULL) || (m == NULL) || (r == NULL)) {
7971
0
        err = MP_VAL;
7972
0
    }
7973
    /* Ensure a, b and m aren't too big a number to operate on. */
7974
6.21M
    else if (a->used >= SP_INT_DIGITS) {
7975
6
        err = MP_VAL;
7976
6
    }
7977
6.21M
    else if (b->used >= SP_INT_DIGITS) {
7978
6
        err = MP_VAL;
7979
6
    }
7980
6.21M
    else if (m->used >= SP_INT_DIGITS) {
7981
6
        err = MP_VAL;
7982
6
    }
7983
7984
#if 0
7985
    if (err == MP_OKAY) {
7986
        sp_print(a, "a");
7987
        sp_print(b, "b");
7988
        sp_print(m, "m");
7989
    }
7990
#endif
7991
6.21M
    if (err == MP_OKAY) {
7992
        /* Do submod. */
7993
6.21M
        err = _sp_submod(a, b, m, r);
7994
6.21M
    }
7995
#if 0
7996
    if (err == MP_OKAY) {
7997
        sp_print(r, "rms");
7998
    }
7999
#endif
8000
8001
6.21M
    return err;
8002
6.21M
}
8003
#endif /* WOLFSSL_SP_MATH_ALL */
8004
8005
/* Constant time clamping.
8006
 *
8007
 * @param [in, out] a  SP integer to clamp.
8008
 */
8009
static void sp_clamp_ct(sp_int* a)
8010
143M
{
8011
143M
    int i;
8012
143M
    sp_size_t used = a->used;
8013
143M
    volatile sp_size_t mask = (sp_size_t)-1;
8014
8015
1.20G
    for (i = (int)a->used - 1; i >= 0; i--) {
8016
#if ((SP_WORD_SIZE == 64) && \
8017
     (defined(_WIN64) || !defined(WOLFSSL_UINT128_T_DEFINED))) || \
8018
    ((SP_WORD_SIZE == 32) && defined(NO_64BIT))
8019
        sp_int_digit negVal = ~a->dp[i];
8020
        sp_int_digit minusOne = a->dp[i] - 1;
8021
        sp_int_digit zeroMask =
8022
            (sp_int_digit)((sp_int_sdigit)(negVal & minusOne) >>
8023
                           (SP_WORD_SIZE - 1));
8024
#else
8025
1.05G
        sp_size_t zeroMask =
8026
1.05G
            (sp_size_t)((((sp_int_sword)a->dp[i]) - 1) >> SP_WORD_SIZE);
8027
1.05G
#endif
8028
1.05G
        mask &= (sp_size_t)zeroMask;
8029
1.05G
        used = (sp_size_t)(used + mask);
8030
1.05G
    }
8031
143M
    a->used = used;
8032
143M
}
8033
8034
#if defined(WOLFSSL_SP_MATH_ALL) && defined(HAVE_ECC)
8035
/* Add two values and reduce: r = (a + b) % m
8036
 *
8037
 * r = a + b (mod m) - constant time (a < m and b < m, a, b and m are positive)
8038
 *
8039
 * Assumes a, b, m and r are not NULL.
8040
 * m and r must not be the same pointer.
8041
 *
8042
 * @param  [in]   a  SP integer to add.
8043
 * @param  [in]   b  SP integer to add with.
8044
 * @param  [in]   m  SP integer that is the modulus.
8045
 * @param  [out]  r  SP integer to hold result.
8046
 *
8047
 * @return  MP_OKAY on success.
8048
 */
8049
int sp_addmod_ct(const sp_int* a, const sp_int* b, const sp_int* m, sp_int* r)
8050
26.6M
{
8051
26.6M
    int err = MP_OKAY;
8052
#ifndef SQR_MUL_ASM
8053
    sp_int_sword w;
8054
    sp_int_sword s;
8055
#else
8056
26.6M
    sp_int_digit wl;
8057
26.6M
    sp_int_digit wh;
8058
26.6M
    sp_int_digit sl;
8059
26.6M
    sp_int_digit sh;
8060
26.6M
    sp_int_digit t;
8061
26.6M
#endif
8062
26.6M
    volatile sp_int_digit mask;
8063
26.6M
    volatile sp_int_digit mask_a = (sp_int_digit)-1;
8064
26.6M
    volatile sp_int_digit mask_b = (sp_int_digit)-1;
8065
26.6M
    sp_size_t i;
8066
8067
    /* Check result is as big as modulus. */
8068
26.6M
    if (m->used > r->size) {
8069
13
        err = MP_VAL;
8070
13
    }
8071
    /* Validate parameters. */
8072
26.6M
    if ((err == MP_OKAY) && (r == m)) {
8073
6
        err = MP_VAL;
8074
6
    }
8075
8076
26.6M
    if (err == MP_OKAY) {
8077
#if 0
8078
        sp_print(a, "a");
8079
        sp_print(b, "b");
8080
        sp_print(m, "m");
8081
#endif
8082
8083
        /* Add a to b into r. Do the subtract of modulus but don't store result.
8084
         * When subtract result is negative, the overflow will be negative.
8085
         * Only need to subtract mod when result is positive - overflow is
8086
         * positive.
8087
         */
8088
    #ifndef SQR_MUL_ASM
8089
        w = 0;
8090
        s = 0;
8091
    #else
8092
26.6M
        wl = 0;
8093
26.6M
        sl = 0;
8094
26.6M
        sh = 0;
8095
26.6M
    #endif
8096
        /* Constant time - add modulus digits worth from a and b. */
8097
164M
        for (i = 0; i < m->used; i++) {
8098
            /* Values past 'used' are not initialized. */
8099
137M
            mask_a += (i == a->used);
8100
137M
            mask_b += (i == b->used);
8101
8102
        #ifndef SQR_MUL_ASM
8103
            /* Add next digits from a and b to current value. */
8104
            w         += a->dp[i] & mask_a;
8105
            w         += b->dp[i] & mask_b;
8106
            /* Store low digit in result. */
8107
            r->dp[i]   = (sp_int_digit)w;
8108
            /* Add result to reducing value. */
8109
            s         += (sp_int_digit)w;
8110
            /* Subtract next digit of modulus. */
8111
            s         -= m->dp[i];
8112
            /* Move high digit of reduced result down. */
8113
            s        >>= DIGIT_BIT;
8114
            /* Move high digit of sum result down. */
8115
            w        >>= DIGIT_BIT;
8116
        #else
8117
137M
            wh = 0;
8118
            /* Add next digits from a and b to current value. */
8119
137M
            t = a->dp[i] & mask_a;
8120
137M
            SP_ASM_ADDC_REG(wl, wh, t);
8121
137M
            t = b->dp[i] & mask_b;
8122
137M
            SP_ASM_ADDC_REG(wl, wh, t);
8123
            /* Store low digit in result. */
8124
137M
            r->dp[i] = wl;
8125
            /* Add result to reducing value. */
8126
137M
            SP_ASM_ADDC_REG(sl, sh, wl);
8127
            /* Subtract next digit of modulus. */
8128
137M
            SP_ASM_SUBB(sl, sh, m->dp[i]);
8129
            /* Move high digit of reduced result down. */
8130
137M
            sl = sh;
8131
            /* High digit is 0 when positive or -1 on negative. */
8132
137M
            sh = (sp_int_digit)0 - (sh >> (SP_WORD_SIZE-1));
8133
            /* Move high digit of sum result down. */
8134
137M
            wl = wh;
8135
137M
        #endif
8136
137M
        }
8137
    #ifndef SQR_MUL_ASM
8138
        /* Add carry into reduced result. */
8139
        s += (sp_int_digit)w;
8140
        /* s will be positive when subtracting modulus is needed. */
8141
        mask = (sp_int_digit)0 - (s >= 0);
8142
    #else
8143
        /* Add carry into reduced result. */
8144
26.6M
        SP_ASM_ADDC_REG(sl, sh, wl);
8145
        /* s will be positive when subtracting modulus is needed. */
8146
26.6M
        mask = (sh >> (SP_WORD_SIZE-1)) - 1;
8147
26.6M
    #endif
8148
8149
        /* Constant time, conditionally, subtract modulus from sum. */
8150
    #ifndef SQR_MUL_ASM
8151
        w = 0;
8152
    #else
8153
26.6M
        wl = 0;
8154
26.6M
        wh = 0;
8155
26.6M
    #endif
8156
164M
        for (i = 0; i < m->used; i++) {
8157
        #ifndef SQR_MUL_ASM
8158
            /* Add result to current value and conditionally subtract modulus.
8159
             */
8160
            w         += r->dp[i];
8161
            w         -= m->dp[i] & mask;
8162
            /* Store low digit in result. */
8163
            r->dp[i]   = (sp_int_digit)w;
8164
            /* Move high digit of sum result down. */
8165
            w        >>= DIGIT_BIT;
8166
        #else
8167
            /* Add result to current value and conditionally subtract modulus.
8168
             */
8169
137M
            SP_ASM_ADDC(wl, wh, r->dp[i]);
8170
137M
            t = m->dp[i] & mask;
8171
137M
            SP_ASM_SUBB_REG(wl, wh, t);
8172
            /* Store low digit in result. */
8173
137M
            r->dp[i] = wl;
8174
            /* Move high digit of sum result down. */
8175
137M
            wl = wh;
8176
            /* High digit is 0 when positive or -1 on negative. */
8177
137M
            wh = (sp_int_digit)0 - (wl >> (SP_WORD_SIZE-1));
8178
137M
        #endif
8179
137M
        }
8180
        /* Result will always have digits equal to or less than those in
8181
         * modulus. */
8182
26.6M
        r->used = i;
8183
26.6M
    #ifdef WOLFSSL_SP_INT_NEGATIVE
8184
26.6M
        r->sign = MP_ZPOS;
8185
26.6M
    #endif /* WOLFSSL_SP_INT_NEGATIVE */
8186
        /* Remove leading zeros. */
8187
26.6M
        sp_clamp_ct(r);
8188
8189
#if 0
8190
        sp_print(r, "rma");
8191
#endif
8192
26.6M
    }
8193
8194
26.6M
    return err;
8195
26.6M
}
8196
#endif /* WOLFSSL_SP_MATH_ALL && HAVE_ECC */
8197
8198
#if (defined(WOLFSSL_SP_MATH_ALL) && defined(HAVE_ECC)) || \
8199
    (defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_HAVE_SP_DH) || \
8200
     defined(WOLFCRYPT_HAVE_ECCSI) || defined(WOLFCRYPT_HAVE_SAKKE) || \
8201
     defined(OPENSSL_ALL))
8202
/* Sub b from a modulo m: r = (a - b) % m
8203
 *
8204
 * Result is always positive.
8205
 *
8206
 * Assumes a, b, m and r are not NULL.
8207
 * m and r must not be the same pointer.
8208
 *
8209
 * @param  [in]   a  SP integer to subtract from
8210
 * @param  [in]   b  SP integer to subtract.
8211
 * @param  [in]   m  SP integer that is the modulus.
8212
 * @param  [out]  r  SP integer to hold result.
8213
 *
8214
 * No value is returned; max_size digits of a and b are subtracted into r.
8215
 */
8216
static void _sp_submod_ct(const sp_int* a, const sp_int* b, const sp_int* m,
8217
    unsigned int max_size, sp_int* r)
8218
34.6M
{
8219
#ifndef SQR_MUL_ASM
8220
    sp_int_sword w;
8221
#else
8222
34.6M
    sp_int_digit l;
8223
34.6M
    sp_int_digit h;
8224
34.6M
    sp_int_digit t;
8225
34.6M
#endif
8226
34.6M
    volatile sp_int_digit mask;
8227
34.6M
    volatile sp_int_digit mask_a = (sp_int_digit)-1;
8228
34.6M
    volatile sp_int_digit mask_b = (sp_int_digit)-1;
8229
34.6M
    unsigned int i;
8230
8231
    /* In constant time, subtract b from a putting result in r. */
8232
#ifndef SQR_MUL_ASM
8233
    w = 0;
8234
#else
8235
34.6M
    l = 0;
8236
34.6M
    h = 0;
8237
34.6M
#endif
8238
213M
    for (i = 0; i < max_size; i++) {
8239
        /* Values past 'used' are not initialized. */
8240
178M
        mask_a += (i == a->used);
8241
178M
        mask_b += (i == b->used);
8242
8243
    #ifndef SQR_MUL_ASM
8244
        /* Add a to and subtract b from current value. */
8245
        w         += a->dp[i] & mask_a;
8246
        w         -= b->dp[i] & mask_b;
8247
        /* Store low digit in result. */
8248
        r->dp[i]   = (sp_int_digit)w;
8249
        /* Move high digit down. */
8250
        w        >>= DIGIT_BIT;
8251
    #else
8252
        /* Add a and subtract b from current value. */
8253
178M
        t = a->dp[i] & mask_a;
8254
178M
        SP_ASM_ADDC_REG(l, h, t);
8255
178M
        t = b->dp[i] & mask_b;
8256
178M
        SP_ASM_SUBB_REG(l, h, t);
8257
        /* Store low digit in result. */
8258
178M
        r->dp[i] = l;
8259
        /* Move high digit down. */
8260
178M
        l = h;
8261
        /* High digit is 0 when positive or -1 on negative. */
8262
178M
        h = (sp_int_digit)0 - (l >> (SP_WORD_SIZE - 1));
8263
178M
    #endif
8264
178M
    }
8265
    /* When w is negative then we need to add modulus to make result
8266
     * positive. */
8267
#ifndef SQR_MUL_ASM
8268
    mask = (sp_int_digit)0 - (w < 0);
8269
#else
8270
34.6M
    mask = h;
8271
34.6M
#endif
8272
8273
    /* Constant time, conditionally, add modulus to difference. */
8274
#ifndef SQR_MUL_ASM
8275
    w = 0;
8276
#else
8277
34.6M
    l = 0;
8278
34.6M
#endif
8279
213M
    for (i = 0; i < m->used; i++) {
8280
    #ifndef SQR_MUL_ASM
8281
        /* Add result and conditionally modulus to current value. */
8282
        w         += r->dp[i];
8283
        w         += m->dp[i] & mask;
8284
        /* Store low digit in result. */
8285
        r->dp[i]   = (sp_int_digit)w;
8286
        /* Move high digit down. */
8287
        w        >>= DIGIT_BIT;
8288
    #else
8289
178M
        h = 0;
8290
        /* Add result and conditionally modulus to current value. */
8291
178M
        SP_ASM_ADDC(l, h, r->dp[i]);
8292
178M
        t = m->dp[i] & mask;
8293
178M
        SP_ASM_ADDC_REG(l, h, t);
8294
        /* Store low digit in result. */
8295
178M
        r->dp[i] = l;
8296
        /* Move high digit down. */
8297
178M
        l = h;
8298
178M
    #endif
8299
178M
    }
8300
    /* Result will always have digits equal to or less than those in
8301
     * modulus. */
8302
34.6M
    r->used = (sp_size_t)i;
8303
34.6M
#ifdef WOLFSSL_SP_INT_NEGATIVE
8304
34.6M
    r->sign = MP_ZPOS;
8305
34.6M
#endif /* WOLFSSL_SP_INT_NEGATIVE */
8306
    /* Remove leading zeros. */
8307
34.6M
    sp_clamp_ct(r);
8308
34.6M
}
8309
#endif
8310
8311
#if defined(WOLFSSL_SP_MATH_ALL) && defined(HAVE_ECC)
8312
/* Sub b from a modulo m: r = (a - b) % m
8313
 * Result is always positive.
8314
 *
8315
 * r = a - b (mod m) - constant time (a < m and b < m, a, b and m are positive)
8316
 *
8317
 * Assumes a, b, m and r are not NULL.
8318
 * m and r must not be the same pointer.
8319
 *
8320
 * @param  [in]   a  SP integer to subtract from
8321
 * @param  [in]   b  SP integer to subtract.
8322
 * @param  [in]   m  SP integer that is the modulus.
8323
 * @param  [out]  r  SP integer to hold result.
8324
 *
8325
 * @return  MP_OKAY on success.
8326
 */
8327
int sp_submod_ct(const sp_int* a, const sp_int* b, const sp_int* m, sp_int* r)
8328
81.3M
{
8329
81.3M
    int err = MP_OKAY;
8330
8331
    /* Check result is as big as modulus. */
8332
81.3M
    if (m->used > r->size) {
8333
35
        err = MP_VAL;
8334
35
    }
8335
    /* Validate parameters. */
8336
81.3M
    if ((err == MP_OKAY) && (r == m)) {
8337
11
        err = MP_VAL;
8338
11
    }
8339
8340
81.3M
    if (err == MP_OKAY) {
8341
#if 0
8342
        sp_print(a, "a");
8343
        sp_print(b, "b");
8344
        sp_print(m, "m");
8345
#endif
8346
8347
81.3M
        _sp_submod_ct(a, b, m, m->used, r);
8348
8349
#if 0
8350
        sp_print(r, "rms");
8351
#endif
8352
81.3M
    }
8353
8354
81.3M
    return err;
8355
81.3M
}
8356
#endif /* WOLFSSL_SP_MATH_ALL && HAVE_ECC */
8357
8358
#if defined(WOLFSSL_SP_MATH_ALL) && defined(HAVE_ECC) && \
8359
    defined(WOLFSSL_ECC_BLIND_K)
8360
void sp_xor_ct(const sp_int* a, const sp_int* b, int len, sp_int* r)
8361
{
8362
    if ((a != NULL) && (b != NULL) && (r != NULL)) {
8363
        unsigned int i;
8364
8365
        r->used = (len * 8 + SP_WORD_SIZE - 1) / SP_WORD_SIZE;
8366
        for (i = 0; i < r->used; i++) {
8367
            r->dp[i] = a->dp[i] ^ b->dp[i];
8368
        }
8369
        i = (len * 8) % SP_WORD_SIZE;
8370
        if (i > 0) {
8371
            r->dp[r->used - 1] &= ((sp_int_digit)1 << i) - 1;
8372
        }
8373
        /* Remove leading zeros. */
8374
        sp_clamp_ct(r);
8375
    }
8376
}
8377
#endif
8378
8379
/********************
8380
 * Shifting functions
8381
 ********************/
8382
8383
#if !defined(NO_DH) || defined(HAVE_ECC) || (!defined(NO_RSA) && \
8384
    defined(WC_RSA_BLINDING) && !defined(WOLFSSL_RSA_VERIFY_ONLY))
8385
/* Left shift the multi-precision number by a number of digits.
8386
 *
8387
 * @param  [in,out]  a  SP integer to shift.
8388
 * @param  [in]      s  Number of digits to shift.
8389
 *
8390
 * @return  MP_OKAY on success.
8391
 * @return  MP_VAL when a is NULL, s is negative or the result is too big.
8392
 */
8393
int sp_lshd(sp_int* a, int s)
8394
44
{
8395
44
    int err = MP_OKAY;
8396
8397
    /* Validate parameters. */
8398
44
    if ((a == NULL) || (s < 0)) {
8399
0
        err = MP_VAL;
8400
0
    }
8401
    /* Ensure number has enough digits for operation. */
8402
44
    if ((err == MP_OKAY) && (a->used + (unsigned int)s > a->size)) {
8403
4
        err = MP_VAL;
8404
4
    }
8405
44
    if (err == MP_OKAY) {
8406
        /* Move up digits. */
8407
40
        XMEMMOVE(a->dp + s, a->dp, a->used * (word32)SP_WORD_SIZEOF);
8408
        /* Back fill with zeros. */
8409
40
        XMEMSET(a->dp, 0, (size_t)s * SP_WORD_SIZEOF);
8410
        /* Update used. */
8411
40
        a->used = (sp_size_t)(a->used + s);
8412
        /* Remove leading zeros. */
8413
40
        sp_clamp(a);
8414
40
    }
8415
8416
44
    return err;
8417
44
}
8418
#endif
8419
8420
#if defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_DH) || defined(HAVE_ECC) || \
8421
    (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY) && \
8422
     !defined(WOLFSSL_RSA_PUBLIC_ONLY))
8423
/* Left shift the multi-precision number by n bits.
8424
 * Bits may be larger than the word size.
8425
 *
8426
 * Used by sp_mul_2d() and other internal functions.
8427
 *
8428
 * @param  [in,out]  a  SP integer to shift.
8429
 * @param  [in]      n  Number of bits to shift left.
8430
 *
8431
 * @return  MP_OKAY on success.
8432
 * @return  MP_VAL when the result is too big.
8433
 */
8434
static int sp_lshb(sp_int* a, int n)
8435
32.0M
{
8436
32.0M
    int err = MP_OKAY;
8437
8438
32.0M
    if (a->used != 0) {
8439
        /* Calculate number of digits to shift. */
8440
32.0M
        sp_size_t s = (sp_size_t)n >> SP_WORD_SHIFT;
8441
8442
        /* Ensure number has enough digits for result. */
8443
32.0M
        if (a->used + s >= a->size) {
8444
37
            err = MP_VAL;
8445
37
        }
8446
32.0M
        if (err == MP_OKAY) {
8447
            /* Get count of bits to move in digit. */
8448
32.0M
            n &= (int)SP_WORD_MASK;
8449
            /* Check whether this is a complicated case. */
8450
32.0M
            if (n != 0) {
8451
31.9M
                unsigned int i;
8452
8453
                /* Shift up starting at most significant digit. */
8454
                /* Get new most significant digit. */
8455
31.9M
                sp_int_digit v = a->dp[a->used - 1] >> (SP_WORD_SIZE - n);
8456
                /* Shift up each digit. */
8457
320M
                for (i = a->used - 1U; i >= 1U; i--) {
8458
288M
                    a->dp[i + s] = (a->dp[i] << n) |
8459
288M
                                   (a->dp[i - 1] >> (SP_WORD_SIZE - n));
8460
288M
                }
8461
                /* Shift up least significant digit. */
8462
31.9M
                a->dp[s] = a->dp[0] << n;
8463
                /* Add new high digit unless zero. */
8464
31.9M
                if (v != 0) {
8465
13.3M
                    a->dp[a->used + s] = v;
8466
13.3M
                    a->used++;
8467
13.3M
                }
8468
31.9M
            }
8469
            /* Only digits to move and ensure not zero. */
8470
102k
            else if (s > 0) {
8471
                /* Move up digits. */
8472
36
                XMEMMOVE(a->dp + s, a->dp, a->used * (word32)SP_WORD_SIZEOF);
8473
36
            }
8474
8475
            /* Update used digit count. */
8476
32.0M
            a->used = (sp_size_t)(a->used + s);
8477
            /* Back fill with zeros. */
8478
32.0M
            XMEMSET(a->dp, 0, (word32)SP_WORD_SIZEOF * s);
8479
32.0M
        }
8480
32.0M
    }
8481
8482
32.0M
    return err;
8483
32.0M
}
8484
#endif /* WOLFSSL_SP_MATH_ALL || !NO_DH || HAVE_ECC ||
8485
        * (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) */
8486
8487
#ifdef WOLFSSL_SP_MATH_ALL
8488
/* Shift a right by c digits: a = a >> (c * SP_WORD_SIZE)
8489
 *
8490
 * @param  [in, out] a  SP integer to shift.
8491
 * @param  [in]      c  Number of digits to shift.
8492
 */
8493
void sp_rshd(sp_int* a, int c)
8494
101
{
8495
    /* Do shift if we have an SP int. */
8496
101
    if ((a != NULL) && (c > 0)) {
8497
        /* Make zero if shift removes all digits. */
8498
63
        if ((sp_size_t)c >= a->used) {
8499
35
            _sp_zero(a);
8500
35
        }
8501
28
        else {
8502
28
            sp_size_t i;
8503
8504
            /* Update used digits count. */
8505
28
            a->used = (sp_size_t)(a->used - c);
8506
            /* Move digits down. */
8507
485
            for (i = 0; i < a->used; i++, c++) {
8508
457
                a->dp[i] = a->dp[c];
8509
457
            }
8510
28
        }
8511
63
    }
8512
101
}
8513
#endif /* WOLFSSL_SP_MATH_ALL */
8514
8515
#if defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_DH) || defined(HAVE_ECC) || \
8516
    (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
8517
    defined(WOLFSSL_HAVE_SP_DH)
8518
/* Shift a right by n bits into r: r = a >> n
8519
 *
8520
 * @param  [in]   a  SP integer to shift.
8521
 * @param  [in]   n  Number of bits to shift.
8522
 * @param  [out]  r  SP integer to store result in.
8523
 */
8524
int sp_rshb(const sp_int* a, int n, sp_int* r)
8525
221M
{
8526
221M
    int err = MP_OKAY;
8527
    /* Number of digits to shift down. */
8528
221M
    sp_size_t i;
8529
8530
221M
    if ((a == NULL) || (n < 0)) {
8531
0
        err = MP_VAL;
8532
0
    }
8533
    /* Handle case where shifting out all digits. */
8534
221M
    else if ((i = (sp_size_t)(n >> SP_WORD_SHIFT)) >= a->used) {
8535
278k
        _sp_zero(r);
8536
278k
    }
8537
    /* Change callers when more error cases returned. */
8538
221M
    else if ((err == MP_OKAY) && (a->used - i > r->size)) {
8539
13
        err = MP_VAL;
8540
13
    }
8541
221M
    else if (err == MP_OKAY) {
8542
221M
        sp_size_t j;
8543
8544
        /* Number of bits to shift in digits. */
8545
221M
        n &= SP_WORD_SIZE - 1;
8546
        /* Handle simple case. */
8547
221M
        if (n == 0) {
8548
            /* Set the count of used digits. */
8549
109M
            r->used = (sp_size_t)(a->used - i);
8550
            /* Move digits down. */
8551
109M
            if (r == a) {
8552
109M
                XMEMMOVE(r->dp, r->dp + i, (word32)SP_WORD_SIZEOF * r->used);
8553
109M
            }
8554
242
            else {
8555
242
                XMEMCPY(r->dp, a->dp + i, (word32)SP_WORD_SIZEOF * r->used);
8556
242
            }
8557
109M
        }
8558
111M
        else {
8559
            /* Move the bits down starting at least significant digit. */
8560
782M
            for (j = 0; j < (sp_size_t)(a->used - 1 - i); j++)
8561
670M
                r->dp[j] = (a->dp[j+i] >> n) |
8562
670M
                    (a->dp[j+i+1] << (SP_WORD_SIZE - n));
8563
            /* Most significant digit has no higher digit to pull from. */
8564
111M
            r->dp[j] = a->dp[j+i] >> n;
8565
            /* Set the count of used digits. */
8566
111M
            r->used = (sp_size_t)(j + (r->dp[j] > 0));
8567
111M
        }
8568
221M
#ifdef WOLFSSL_SP_INT_NEGATIVE
8569
221M
        if (sp_iszero(r)) {
8570
            /* Set zero sign. */
8571
16.0k
            r->sign = MP_ZPOS;
8572
16.0k
        }
8573
221M
        else {
8574
            /* Retain sign. */
8575
221M
            r->sign = a->sign;
8576
221M
        }
8577
221M
#endif
8578
221M
    }
8579
8580
221M
    return err;
8581
221M
}
8582
#endif /* WOLFSSL_SP_MATH_ALL || !NO_DH || HAVE_ECC ||
8583
        * (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) || WOLFSSL_HAVE_SP_DH */
8584
8585
#if defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_DH) || defined(HAVE_ECC) || \
8586
    (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY) && \
8587
     !defined(WOLFSSL_RSA_PUBLIC_ONLY))
8588
static void _sp_div_same_size(sp_int* a, const sp_int* d, sp_int* r)
8589
43.1M
{
8590
43.1M
    sp_size_t i;
8591
8592
    /* Compare top digits of dividend with those of divisor up to last. */
8593
43.4M
    for (i = (sp_size_t)(d->used - 1U); i > 0; i--) {
8594
        /* Break if top divisor is not equal to dividend. */
8595
25.9M
        if (a->dp[a->used - d->used + i] != d->dp[i]) {
8596
25.6M
            break;
8597
25.6M
        }
8598
25.9M
    }
8599
    /* Check if top dividend is greater than or equal to divisor. */
8600
43.1M
    if (a->dp[a->used - d->used + i] >= d->dp[i]) {
8601
        /* Update quotient result. */
8602
72.8k
        r->dp[a->used - d->used] += 1;
8603
        /* Get 'used' to restore - ensure zeros put into quotient. */
8604
72.8k
        i = a->used;
8605
        /* Subtract d from top of a. */
8606
72.8k
        _sp_sub_off(a, d, a, (sp_size_t)(a->used - d->used));
8607
        /* Restore 'used' on remainder. */
8608
72.8k
        a->used = i;
8609
72.8k
    }
8610
43.1M
}
8611
8612
/* Divide a by d and return the quotient in r and the remainder in a.
8613
 *   r = a / d; a = a % d
8614
 *
8615
 * Note: a is constantly having multiples of d subtracted.
8616
 *
8617
 * @param  [in, out] a      SP integer to be divided and remainder on out.
8618
 * @param  [in]      d      SP integer to divide by.
8619
 * @param  [out]     r      SP integer that is the quotient.
8620
 * @param  [out]     trial  SP integer that is product in trial division.
8621
 *
8622
 * @return  MP_OKAY on success.
8623
 * @return  MP_VAL when operation fails - only when compiling small code.
8624
 */
8625
static int _sp_div_impl(sp_int* a, const sp_int* d, sp_int* r, sp_int* trial)
8626
21.5M
{
8627
21.5M
    int err = MP_OKAY;
8628
21.5M
    sp_size_t i;
8629
#ifdef WOLFSSL_SP_SMALL
8630
    int c;
8631
#else
8632
21.5M
    sp_size_t j;
8633
21.5M
    sp_size_t o;
8634
    #ifndef SQR_MUL_ASM
8635
    sp_int_sword sw;
8636
    #else
8637
21.5M
    sp_int_digit sl;
8638
21.5M
    sp_int_digit sh;
8639
21.5M
    sp_int_digit st;
8640
21.5M
    #endif
8641
21.5M
#endif /* WOLFSSL_SP_SMALL */
8642
21.5M
    sp_int_digit t;
8643
21.5M
    sp_int_digit dt;
8644
8645
    /* Set result size to clear. */
8646
21.5M
    r->used = (sp_size_t)(a->used - d->used + 1);
8647
    /* Set all potentially used digits to zero. */
8648
92.5M
    for (i = 0; i < r->used; i++) {
8649
70.9M
        r->dp[i] = 0;
8650
70.9M
    }
8651
21.5M
#ifdef WOLFSSL_SP_INT_NEGATIVE
8652
21.5M
    r->sign = MP_ZPOS;
8653
21.5M
#endif
8654
    /* Get the most significant digit (will have top bit set). */
8655
21.5M
    dt = d->dp[d->used-1];
8656
8657
    /* Handle when a >= d * (2 ^ (SP_WORD_SIZE * x)). */
8658
21.5M
    _sp_div_same_size(a, d, r);
8659
8660
    /* Keep subtracting multiples of d as long as the digit count of a is
8661
     * greater than equal to d.
8662
     */
8663
70.9M
    for (i = (sp_size_t)(a->used - 1U); i >= d->used; i--) {
8664
        /* When top digits equal, guesstimate maximum multiplier.
8665
         * Worst case, multiplier is actually SP_DIGIT_MAX - 1.
8666
         * That is, for w (word size in bits) > 1, n > 1, let:
8667
         *   a = 2^((n+1)*w-1), d = 2^(n*w-1) + 2^((n-1)*w) - 1, t = 2^w - 2
8668
         * Then,
8669
         *     d * t
8670
         *   = (2^(n*w-1) + 2^((n-1)*w) - 1) * (2^w - 2)
8671
         *   = 2^((n+1)*w-1) - 2^(n*w) + 2^(n*w) - 2^((n-1)*w+1) - 2^w + 2
8672
         *   = 2^((n+1)*w-1) - 2^((n-1)*w+1) - 2^w + 2
8673
         *   = a - 2^((n-1)*w+1) - 2^w + 2
8674
         * d > 2^((n-1)*w+1) + 2^w - 2, when w > 1, n > 1
8675
         */
8676
49.3M
        if (a->dp[i] == dt) {
8677
176k
            t = SP_DIGIT_MAX;
8678
176k
        }
8679
49.1M
        else {
8680
            /* Calculate trial quotient by dividing top word of dividend by top
8681
             * digit of divisor.
8682
             * Some implementations segfault when quotient > SP_DIGIT_MAX.
8683
             * Implementations in assembly, using builtins or using
8684
             * digits only (WOLFSSL_SP_DIV_WORD_HALF).
8685
             */
8686
49.1M
            t = sp_div_word(a->dp[i], a->dp[i-1], dt);
8687
49.1M
        }
8688
#ifdef WOLFSSL_SP_SMALL
8689
        do {
8690
            /* Calculate trial from trial quotient. */
8691
            err = _sp_mul_d(d, t, trial, i - d->used);
8692
            if (err != MP_OKAY) {
8693
                break;
8694
            }
8695
            /* Check if trial is bigger. */
8696
            c = _sp_cmp_abs(trial, a);
8697
            if (c == MP_GT) {
8698
                /* Decrement trial quotient and try again. */
8699
                t--;
8700
            }
8701
        }
8702
        while (c == MP_GT);
8703
8704
        if (err != MP_OKAY) {
8705
            break;
8706
        }
8707
8708
        /* Subtract the trial and add quotient to result. */
8709
        _sp_sub_off(a, trial, a, 0);
8710
        r->dp[i - d->used] += t;
8711
        /* Handle overflow of digit. */
8712
        if (r->dp[i - d->used] < t) {
8713
            r->dp[i + 1 - d->used]++;
8714
        }
8715
#else
8716
        /* Index of lowest digit trial is subtracted from. */
8717
49.3M
        o = (sp_size_t)(i - d->used);
8718
58.6M
        do {
8719
        #ifndef SQR_MUL_ASM
8720
            sp_int_word tw = 0;
8721
        #else
8722
58.6M
            sp_int_digit tl = 0;
8723
58.6M
            sp_int_digit th = 0;
8724
58.6M
        #endif
8725
8726
            /* Multiply divisor by trial quotient. */
8727
1.41G
            for (j = 0; j < d->used; j++) {
8728
            #ifndef SQR_MUL_ASM
8729
                tw += (sp_int_word)d->dp[j] * t;
8730
                trial->dp[j] = (sp_int_digit)tw;
8731
                tw >>= SP_WORD_SIZE;
8732
            #else
8733
1.36G
                SP_ASM_MUL_ADD_NO(tl, th, d->dp[j], t);
8734
1.36G
                trial->dp[j] = tl;
8735
1.36G
                tl = th;
8736
1.36G
                th = 0;
8737
1.36G
            #endif
8738
1.36G
            }
8739
          #ifndef SQR_MUL_ASM
8740
            trial->dp[j] = (sp_int_digit)tw;
8741
          #else
8742
58.6M
            trial->dp[j] = tl;
8743
58.6M
          #endif
8744
8745
            /* Check trial quotient isn't larger than dividend. */
8746
93.7M
            for (j = d->used; j > 0; j--) {
8747
88.6M
                if (trial->dp[j] != a->dp[j + o]) {
8748
53.4M
                    break;
8749
53.4M
                }
8750
88.6M
            }
8751
            /* Decrement trial quotient if larger and try again. */
8752
58.6M
            if (trial->dp[j] > a->dp[j + o]) {
8753
9.33M
                t--;
8754
9.33M
            }
8755
58.6M
        }
8756
58.6M
        while (trial->dp[j] > a->dp[j + o]);
8757
8758
    #ifndef SQR_MUL_ASM
8759
        sw = 0;
8760
    #else
8761
49.3M
        sl = 0;
8762
49.3M
        sh = 0;
8763
49.3M
    #endif
8764
        /* Subtract trial - don't need to update used. */
8765
1.14G
        for (j = 0; j <= d->used; j++) {
8766
        #ifndef SQR_MUL_ASM
8767
            sw += a->dp[j + o];
8768
            sw -= trial->dp[j];
8769
            a->dp[j + o] = (sp_int_digit)sw;
8770
            sw >>= SP_WORD_SIZE;
8771
        #else
8772
1.09G
            st = a->dp[j + o];
8773
1.09G
            SP_ASM_ADDC(sl, sh, st);
8774
1.09G
            st = trial->dp[j];
8775
1.09G
            SP_ASM_SUBB(sl, sh, st);
8776
1.09G
            a->dp[j + o] = sl;
8777
1.09G
            sl = sh;
8778
1.09G
            sh = (sp_int_digit)0 - (sl >> (SP_WORD_SIZE - 1));
8779
1.09G
        #endif
8780
1.09G
        }
8781
8782
49.3M
        r->dp[o] = t;
8783
49.3M
#endif /* WOLFSSL_SP_SMALL */
8784
49.3M
    }
8785
    /* Update used. */
8786
21.5M
    a->used = (sp_size_t)(i + 1U);
8787
21.5M
    if (a->used == d->used) {
8788
        /* Finish div now that length of dividend is same as divisor. */
8789
21.5M
        _sp_div_same_size(a, d, r);
8790
21.5M
    }
8791
8792
21.5M
    return err;
8793
21.5M
}
8794
8795
/* Divide a by d and return the quotient in r and the remainder in rem.
8796
 *   r = a / d; rem = a % d
8797
 *
8798
 * @param  [in]   a     SP integer to be divided.
8799
 * @param  [in]   d     SP integer to divide by.
8800
 * @param  [out]  r     SP integer that is the quotient.
8801
 * @param  [out]  rem   SP integer that is the remainder.
8802
 * @param  [in]   used  Number of digits in temporaries to use.
8803
 *
8804
 * @return  MP_OKAY on success.
8805
 * @return  MP_MEM when dynamic memory allocation fails.
8806
 */
8807
static int _sp_div(const sp_int* a, const sp_int* d, sp_int* r, sp_int* rem,
8808
    unsigned int used)
8809
21.4M
{
    /* Outline: the three simple cases (|a| < |d|, |a| == |d|, equal bit
     * lengths) are answered directly and set 'done'; otherwise the operands
     * are normalized and _sp_div_impl() performs the division. */
8810
21.4M
    int err = MP_OKAY;
8811
21.4M
    int ret;
8812
21.4M
    /* Non-zero once one of the simple cases has produced the results. */
    int done = 0;
8813
21.4M
    /* Normalizing shift in bits; SP_WORD_SIZE means no shift was applied. */
    int s = 0;
8814
21.4M
    sp_int* sa = NULL;
8815
21.4M
    sp_int* sd = NULL;
8816
21.4M
    sp_int* tr = NULL;
8817
21.4M
    sp_int* trial = NULL;
8818
21.4M
#ifdef WOLFSSL_SP_INT_NEGATIVE
8819
21.4M
    sp_uint8 signA = MP_ZPOS;
8820
21.4M
    sp_uint8 signD = MP_ZPOS;
8821
21.4M
#endif /* WOLFSSL_SP_INT_NEGATIVE */
8822
    /* Intermediates will always be less than or equal to dividend. */
8823
21.4M
    DECL_SP_INT_ARRAY(td, used, 4);
8824
8825
21.4M
#ifdef WOLFSSL_SP_INT_NEGATIVE
8826
    /* Cache sign for results. */
8827
21.4M
    signA = a->sign;
8828
21.4M
    signD = d->sign;
8829
21.4M
#endif /* WOLFSSL_SP_INT_NEGATIVE */
8830
8831
    /* Handle simple case of: dividend < divisor. */
8832
21.4M
    ret = _sp_cmp_abs(a, d);
8833
21.4M
    if (ret == MP_LT) {
8834
        /* a = 0 * d + a */
8835
5.02M
        if ((rem != NULL) && (a != rem)) {
8836
4.28M
            _sp_copy(a, rem);
8837
4.28M
        }
8838
5.02M
        if (r != NULL) {
8839
46
            _sp_set(r, 0);
8840
46
        }
8841
5.02M
        done = 1;
8842
5.02M
    }
8843
    /* Handle simple case of: dividend == divisor. */
8844
16.4M
    else if (ret == MP_EQ) {
8845
        /* a = 1 * d + 0 */
8846
653k
        if (rem != NULL) {
8847
653k
            _sp_set(rem, 0);
8848
653k
        }
8849
653k
        if (r != NULL) {
8850
59
            _sp_set(r, 1);
8851
59
        #ifdef WOLFSSL_SP_INT_NEGATIVE
8852
59
            r->sign = (signA == signD) ? MP_ZPOS : MP_NEG;
8853
59
        #endif /* WOLFSSL_SP_INT_NEGATIVE */
8854
59
        }
8855
653k
        done = 1;
8856
653k
    }
8857
15.7M
    else if (sp_count_bits(a) == sp_count_bits(d)) {
8858
        /* a is greater than d but same bit length - subtract. */
        /* Equal bit lengths with |a| > |d| imply |a| < 2 * |d|, so the
         * quotient magnitude is exactly 1 and the remainder is a - d. */
8859
2.17M
        if (rem != NULL) {
8860
2.17M
            _sp_sub_off(a, d, rem, 0);
8861
2.17M
        #ifdef WOLFSSL_SP_INT_NEGATIVE
8862
2.17M
            rem->sign = signA;
8863
2.17M
        #endif
8864
2.17M
        }
8865
2.17M
        if (r != NULL) {
8866
2.15M
            _sp_set(r, 1);
8867
2.15M
        #ifdef WOLFSSL_SP_INT_NEGATIVE
8868
2.15M
            r->sign = (signA == signD) ? MP_ZPOS : MP_NEG;
8869
2.15M
        #endif /* WOLFSSL_SP_INT_NEGATIVE */
8870
2.15M
        }
8871
2.17M
        done = 1;
8872
2.17M
    }
8873
8874
    /* Allocate temporary 'sp_int's and assign. */
8875
21.4M
    if ((!done) && (err == MP_OKAY)) {
8876
13.5M
    #if (defined(WOLFSSL_SMALL_STACK) || defined(SP_ALLOC)) && \
8877
13.5M
        !defined(WOLFSSL_SP_NO_MALLOC)
8878
13.5M
        /* Number of temporaries to allocate; reduced for each caller-supplied
         * sp_int that can stand in for one. */
        unsigned int cnt = 4;
8879
        /* Reuse remainder sp_int where possible. */
8880
13.5M
        if ((rem != NULL) && (rem != d) && (rem->size > a->used)) {
8881
13.5M
            sa = rem;
8882
13.5M
            cnt--;
8883
13.5M
        }
8884
        /* Reuse result sp_int where possible. */
8885
13.5M
        if ((r != NULL) && (r != d)) {
8886
7.56M
            tr = r;
8887
7.56M
            cnt--;
8888
7.56M
        }
8889
        /* Macro always has code associated with it and checks err first. */
8890
13.5M
        ALLOC_SP_INT_ARRAY(td, used, cnt, err, NULL);
8891
    #else
8892
        ALLOC_SP_INT_ARRAY(td, used, 4, err, NULL);
8893
    #endif
8894
13.5M
    }
8895
21.4M
    if ((!done) && (err == MP_OKAY)) {
8896
13.5M
    #if (defined(WOLFSSL_SMALL_STACK) || defined(SP_ALLOC)) && \
8897
13.5M
        !defined(WOLFSSL_SP_NO_MALLOC)
8898
13.5M
        /* td[0] and td[1] are always sd and trial (assigned below); the
         * optional temporaries are taken from index 2 upwards. */
        int i = 2;
8899
8900
        /* Set to temporary when not reusing. */
8901
13.5M
        if (sa == NULL) {
8902
886
            sa = td[i++];
8903
886
            _sp_init_size(sa, used);
8904
886
        }
8905
13.5M
        if (tr == NULL) {
8906
6.02M
            tr = td[i];
8907
6.02M
            _sp_init_size(tr, (unsigned int)(a->used - d->used + 2));
8908
6.02M
        }
8909
    #else
8910
        sa    = td[2];
8911
        tr    = td[3];
8912
8913
        _sp_init_size(sa, used);
8914
        _sp_init_size(tr, (unsigned int)(a->used - d->used + 2));
8915
    #endif
8916
13.5M
        sd    = td[0];
8917
13.5M
        trial = td[1];
8918
8919
        /* Initialize sizes to minimal values. */
8920
13.5M
        _sp_init_size(sd, (sp_size_t)(d->used + 1U));
8921
13.5M
        _sp_init_size(trial, used);
8922
8923
        /* Move divisor to top of word. Adjust dividend as well. */
8924
13.5M
        s = sp_count_bits(d);
8925
13.5M
        /* s becomes the left shift that puts the divisor's top bit at the top
         * of its highest digit (presumably SP_WORD_MASK is SP_WORD_SIZE - 1 -
         * TODO confirm). A result of SP_WORD_SIZE means already aligned. */
        s = SP_WORD_SIZE - (s & (int)SP_WORD_MASK);
8926
13.5M
        _sp_copy(a, sa);
8927
        /* Only shift if top bit of divisor not set. */
8928
13.5M
        if (s != SP_WORD_SIZE) {
8929
10.5M
            err = sp_lshb(sa, s);
8930
10.5M
            if (err == MP_OKAY) {
8931
10.5M
                _sp_copy(d, sd);
8932
10.5M
                /* From here on d refers to the shifted copy; the caller's
                 * divisor is not modified. */
                d = sd;
8933
10.5M
                err = sp_lshb(sd, s);
8934
10.5M
            }
8935
10.5M
        }
8936
13.5M
    }
8937
21.4M
    if ((!done) && (err == MP_OKAY) && (d->used > 0)) {
8938
        /* Do division: tr = sa / d, sa = sa % d. */
8939
13.5M
        err = _sp_div_impl(sa, d, tr, trial);
8940
        /* Return the remainder if required. */
8941
13.5M
        if ((err == MP_OKAY) && (rem != NULL)) {
8942
            /* Move result back down if moved up for divisor value. */
8943
13.5M
            if (s != SP_WORD_SIZE) {
8944
10.5M
                (void)sp_rshb(sa, s, sa);
8945
10.5M
            }
8946
13.5M
            _sp_copy(sa, rem);
8947
13.5M
            sp_clamp(rem);
8948
13.5M
        #ifdef WOLFSSL_SP_INT_NEGATIVE
8949
13.5M
            /* A non-zero remainder carries the sign of the dividend. */
            rem->sign = (rem->used == 0) ? MP_ZPOS : signA;
8950
13.5M
        #endif
8951
13.5M
        }
8952
        /* Return the quotient if required. */
8953
13.5M
        if ((err == MP_OKAY) && (r != NULL)) {
8954
7.56M
            _sp_copy(tr, r);
8955
7.56M
            sp_clamp(r);
8956
7.56M
        #ifdef WOLFSSL_SP_INT_NEGATIVE
8957
7.56M
            /* Quotient is negative only when non-zero and signs differ. */
            if ((r->used == 0) || (signA == signD)) {
8958
7.56M
                r->sign = MP_ZPOS;
8959
7.56M
            }
8960
7
            else {
8961
7
                r->sign = MP_NEG;
8962
7
            }
8963
7.56M
        #endif /* WOLFSSL_SP_INT_NEGATIVE */
8964
7.56M
        }
8965
13.5M
    }
8966
8967
21.4M
    FREE_SP_INT_ARRAY(td, NULL);
8968
21.4M
    return err;
8969
21.4M
}
8970
8971
/* Divide a by d and return the quotient in r and the remainder in rem.
8972
 *   r = a / d; rem = a % d
8973
 *
8974
 * @param  [in]   a    SP integer to be divided.
8975
 * @param  [in]   d    SP integer to divide by.
8976
 * @param  [out]  r    SP integer that is the quotient.
8977
 * @param  [out]  rem  SP integer that is the remainder.
8978
 *
8979
 * @return  MP_OKAY on success.
8980
 * @return  MP_VAL when a or d is NULL, r and rem are NULL, or d is 0.
8981
 * @return  MP_MEM when dynamic memory allocation fails.
8982
 */
8983
int sp_div(const sp_int* a, const sp_int* d, sp_int* r, sp_int* rem)
8984
30.4M
{
8985
30.4M
    int err = MP_OKAY;
8986
30.4M
    /* Digit count passed to _sp_div() for sizing its temporaries. */
    unsigned int used = 1;
8987
8988
    /* Validate parameters. */
8989
30.4M
    /* At least one of the two outputs must be requested. */
    if ((a == NULL) || (d == NULL) || ((r == NULL) && (rem == NULL))) {
8990
0
        err = MP_VAL;
8991
0
    }
8992
    /* a / 0 = infinity. */
8993
30.4M
    if ((err == MP_OKAY) && sp_iszero(d)) {
8994
587
        err = MP_VAL;
8995
587
    }
8996
    /* Ensure quotient result has enough memory. */
8997
30.4M
    if ((err == MP_OKAY) && (r != NULL) && (r->size < a->used - d->used + 2)) {
8998
23
        err = MP_VAL;
8999
23
    }
9000
30.4M
    if ((err == MP_OKAY) && (rem != NULL)) {
9001
        /* Ensure remainder has enough memory. */
        /* The required size is one more than the shorter operand's digits. */
9002
30.4M
        if ((a->used <= d->used) && (rem->size < a->used + 1)) {
9003
46
            err = MP_VAL;
9004
46
        }
9005
30.4M
        else if ((a->used > d->used) && (rem->size < d->used + 1)) {
9006
16
            err = MP_VAL;
9007
16
        }
9008
30.4M
    }
9009
30.4M
    if (err == MP_OKAY) {
9010
30.4M
        if (a->used == SP_INT_DIGITS) {
9011
            /* May need to shift number being divided left into a new word. */
            /* bits is the shift needed to align the divisor's top bit with
             * the top of a digit; fail if applying it to a would need more
             * than SP_INT_DIGITS digits. */
9012
36
            int bits = SP_WORD_SIZE - (sp_count_bits(d) % SP_WORD_SIZE);
9013
36
            if ((bits != SP_WORD_SIZE) &&
9014
31
                    (sp_count_bits(a) + bits > (int)(SP_INT_DIGITS * SP_WORD_SIZE))) {
9015
11
                err = MP_VAL;
9016
11
            }
9017
25
            else {
9018
25
                used = SP_INT_DIGITS;
9019
25
            }
9020
36
        }
9021
30.4M
        else {
9022
30.4M
            /* One digit more than a has, leaving room for the shift. */
            used = (sp_size_t)(a->used + 1U);
9023
30.4M
        }
9024
30.4M
    }
9025
9026
30.4M
    if (err == MP_OKAY) {
9027
    /* Debug printing of the inputs - compiled out. */
    #if 0
9028
        sp_print(a, "a");
9029
        sp_print(d, "b");
9030
    #endif
9031
        /* Do operation. */
9032
30.4M
        err = _sp_div(a, d, r, rem, used);
9033
    /* Debug printing of the results - compiled out. */
    #if 0
9034
        if (err == MP_OKAY) {
9035
            if (rem != NULL) {
9036
                sp_print(rem, "rdr");
9037
            }
9038
            if (r != NULL) {
9039
                sp_print(r, "rdw");
9040
            }
9041
        }
9042
    #endif
9043
30.4M
    }
9044
9045
30.4M
    return err;
9046
30.4M
}
9047
#endif /* WOLFSSL_SP_MATH_ALL || !NO_DH || HAVE_ECC || \
9048
        * (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) */
9049
9050
#if defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_DH) || defined(HAVE_ECC) || \
9051
    (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY) && \
9052
     !defined(WOLFSSL_RSA_PUBLIC_ONLY))
9053
#ifndef FREESCALE_LTC_TFM
9054
#ifdef WOLFSSL_SP_INT_NEGATIVE
9055
/* Calculate the remainder of dividing a by m: r = a mod m. Used when r is m.
9056
 *
9057
 * @param  [in]   a  SP integer to reduce.
9058
 * @param  [in]   m  SP integer that is the modulus.
9059
 * @param  [out]  r  SP integer to store result in.
9060
 *
9061
 * @return  MP_OKAY on success.
9062
 * @return  MP_MEM when dynamic memory allocation fails.
9063
 */
9064
static int _sp_mod(const sp_int* a, const sp_int* m, sp_int* r)
9065
94
{
9066
94
    int err = MP_OKAY;
9067
    /* Remainder will start as a. */
9068
94
    DECL_SP_INT(t, (a == NULL) ? 1 : a->used + 1);
9069
9070
    /* In case remainder is modulus - allocate temporary. */
9071
94
    ALLOC_SP_INT(t, a->used + 1, err, NULL);
9072
94
    if (err == MP_OKAY) {
9073
81
        _sp_init_size(t, a->used + 1);
9074
        /* Use divide to calculate remainder and don't get quotient. */
9075
81
        err = sp_div(a, m, NULL, t);
9076
81
    }
9077
94
    if (err == MP_OKAY) {
9078
        /* Make remainder positive and copy into result. */
        /* m is read here, so r (which may be m) is only written after the
         * remainder has been fully computed in t. */
9079
66
        if ((!sp_iszero(t)) && (t->sign != m->sign)) {
9080
30
            err = sp_add(t, m, r);
9081
30
        }
9082
36
        else {
9083
36
            _sp_copy(t, r);
9084
36
        }
9085
66
    }
9086
94
    FREE_SP_INT(t, NULL);
9087
9088
94
    return err;
9089
94
}
9090
#endif
9091
9092
/* Calculate the remainder of dividing a by m: r = a mod m.
9093
 *
9094
 * @param  [in]   a  SP integer to reduce.
9095
 * @param  [in]   m  SP integer that is the modulus.
9096
 * @param  [out]  r  SP integer to store result in.
9097
 *
9098
 * @return  MP_OKAY on success.
9099
 * @return  MP_VAL when a, m or r is NULL or m is 0.
9100
 * @return  MP_MEM when dynamic memory allocation fails.
9101
 */
9102
int sp_mod(const sp_int* a, const sp_int* m, sp_int* r)
9103
11.1M
{
9104
11.1M
    int err = MP_OKAY;
9105
9106
    /* Validate parameters. */
9107
11.1M
    if ((a == NULL) || (m == NULL) || (r == NULL)) {
9108
0
        err = MP_VAL;
9109
0
    }
9110
    /* Ensure a isn't too big a number to operate on. */
9111
11.1M
    else if (a->used >= SP_INT_DIGITS) {
9112
9
        err = MP_VAL;
9113
9
    }
9114
9115
#ifndef WOLFSSL_SP_INT_NEGATIVE
9116
    if (err == MP_OKAY) {
9117
        /* Use divide to calculate remainder and don't get quotient. */
9118
        err = sp_div(a, m, NULL, r);
9119
    }
9120
#else
9121
11.1M
    /* r is distinct from m: compute the remainder directly into r. */
    if ((err == MP_OKAY) && (r != m)) {
9122
11.1M
        err = sp_div(a, m, NULL, r);
9123
11.1M
        /* A non-zero remainder whose sign differs from m's is shifted by m
         * so that the result has the same sign as the modulus. */
        if ((err == MP_OKAY) && (!sp_iszero(r)) && (r->sign != m->sign)) {
9124
20.5k
            err = sp_add(r, m, r);
9125
20.5k
        }
9126
11.1M
    }
9127
103
    /* r is m: m is still needed after the divide, so _sp_mod() works in a
     * temporary before writing r. */
    else if (err == MP_OKAY) {
9128
94
        err = _sp_mod(a, m, r);
9129
94
    }
9130
11.1M
#endif /* WOLFSSL_SP_INT_NEGATIVE */
9131
9132
11.1M
    return err;
9133
11.1M
}
9134
#endif /* !FREESCALE_LTC_TFM */
9135
#endif /* WOLFSSL_SP_MATH_ALL || !NO_DH || HAVE_ECC || \
9136
        * (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) */
9137
9138
#if defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_HAVE_SP_DH) || \
9139
    defined(HAVE_ECC) || !defined(NO_RSA)
9140
9141
/* START SP_MUL implementations. */
9142
/* This code is generated.
9143
 * To generate:
9144
 *   cd scripts/sp/sp_int
9145
 *   ./gen.sh
9146
 * File sp_mul.c contains code.
9147
 */
9148
9149
#ifdef SQR_MUL_ASM
9150
/* Multiply a by b into r where a and b have same no. digits. r = a * b
9151
 *
9152
 * Optimised code for when number of digits in a and b are the same.
9153
 *
9154
 * @param  [in]   a    SP integer to multiply.
9155
 * @param  [in]   b    SP integer to multiply by.
9156
 * @param  [out]  r    SP integer to hold result.
9157
 *
9158
 * @return  MP_OKAY on success.
9159
 * @return  MP_MEM when dynamic memory allocation fails.
9160
 */
9161
static int _sp_mul_nxn(const sp_int* a, const sp_int* b, sp_int* r)
9162
22.6M
{
9163
22.6M
    int err = MP_OKAY;
9164
22.6M
    unsigned int i;
9165
22.6M
    int j;
9166
22.6M
    unsigned int k;
9167
22.6M
    /* t buffers the low a->used digits of the product. */
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
9168
22.6M
    sp_int_digit* t = NULL;
9169
#elif defined(WOLFSSL_SP_DYN_STACK)
9170
    sp_int_digit t[a->used];
9171
#else
9172
    sp_int_digit t[SP_INT_DIGITS / 2];
9173
#endif
9174
9175
22.6M
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
9176
22.6M
    t = (sp_int_digit*)XMALLOC(sizeof(sp_int_digit) * a->used, NULL,
9177
22.6M
        DYNAMIC_TYPE_BIGINT);
9178
22.6M
    if (t == NULL) {
9179
171
        err = MP_MEM;
9180
171
    }
9181
22.6M
#endif
9182
22.6M
    if (err == MP_OKAY) {
9183
22.6M
        /* l, h and o form the running column accumulator: l is emitted as
         * the column's digit, then h and o move down one place. */
        sp_int_digit l;
9184
22.6M
        sp_int_digit h;
9185
22.6M
        sp_int_digit o;
9186
22.6M
        const sp_int_digit* dp;
9187
9188
22.6M
        h = 0;
9189
22.6M
        l = 0;
9190
22.6M
        /* Column 0: h receives the digit stored as t[0]; l is left holding
         * the carry into column 1 (presumably the product's upper half). */
        SP_ASM_MUL(h, l, a->dp[0], b->dp[0]);
9191
22.6M
        t[0] = h;
9192
22.6M
        h = 0;
9193
22.6M
        o = 0;
9194
134M
        /* Columns 1 .. used-1: accumulate a->dp[i] * b->dp[k - i] for
         * i = 0 .. k; the digits go to t. */
        for (k = 1; k <= (unsigned int)a->used - 1; k++) {
9195
111M
            j = (int)k;
9196
111M
            dp = a->dp;
9197
826M
            for (; j >= 0; dp++, j--) {
9198
715M
                SP_ASM_MUL_ADD(l, h, o, dp[0], b->dp[j]);
9199
715M
            }
9200
111M
            t[k] = l;
9201
111M
            l = h;
9202
111M
            h = o;
9203
111M
            o = 0;
9204
111M
        }
9205
134M
        /* Columns used .. 2*(used-1): i starts at k - (used-1) and dp walks
         * b from its top digit downwards; the digits go straight to r. */
        for (; k <= ((unsigned int)a->used - 1) * 2; k++) {
9206
111M
            i = k - (sp_size_t)(b->used - 1);
9207
111M
            dp = &b->dp[b->used - 1];
9208
715M
            for (; i < a->used; i++, dp--) {
9209
603M
                SP_ASM_MUL_ADD(l, h, o, a->dp[i], dp[0]);
9210
603M
            }
9211
111M
            r->dp[k] = l;
9212
111M
            l = h;
9213
111M
            h = o;
9214
111M
            o = 0;
9215
111M
        }
9216
22.6M
        /* Final carry is the top digit. */
        r->dp[k] = l;
9217
22.6M
        /* Low digits are copied in only after every input digit has been
         * read - presumably so that r may alias a or b (TODO confirm). */
        XMEMCPY(r->dp, t, a->used * sizeof(sp_int_digit));
9218
22.6M
        r->used = (sp_size_t)(k + 1);
9219
22.6M
        sp_clamp(r);
9220
22.6M
    }
9221
9222
22.6M
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
9223
22.6M
    XFREE(t, NULL, DYNAMIC_TYPE_BIGINT);
9224
22.6M
#endif
9225
22.6M
    return err;
9226
22.6M
}
9227
9228
/* Multiply a by b into r. r = a * b
9229
 *
9230
 * @param  [in]   a    SP integer to multiply.
9231
 * @param  [in]   b    SP integer to multiply by.
9232
 * @param  [out]  r    SP integer to hold result.
9233
 *
9234
 * @return  MP_OKAY on success.
9235
 * @return  MP_MEM when dynamic memory allocation fails.
9236
 */
9237
static int _sp_mul(const sp_int* a, const sp_int* b, sp_int* r)
9238
1.64M
{
9239
1.64M
    int err = MP_OKAY;
9240
1.64M
    sp_size_t i;
9241
1.64M
    int j;
9242
1.64M
    sp_size_t k;
9243
1.64M
    /* t buffers the whole product (a->used + b->used digits when sized
     * dynamically) before it is copied into r. */
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
9244
1.64M
    sp_int_digit* t = NULL;
9245
#elif defined(WOLFSSL_SP_DYN_STACK)
9246
    sp_int_digit t[a->used + b->used];
9247
#else
9248
    sp_int_digit t[SP_INT_DIGITS];
9249
#endif
9250
9251
1.64M
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
9252
1.64M
    t = (sp_int_digit*)XMALLOC(sizeof(sp_int_digit) *
9253
1.64M
                               (size_t)(a->used + b->used), NULL,
9254
1.64M
                               DYNAMIC_TYPE_BIGINT);
9255
1.64M
    if (t == NULL) {
9256
135
        err = MP_MEM;
9257
135
    }
9258
1.64M
#endif
9259
1.64M
    if (err == MP_OKAY) {
9260
1.64M
        /* l, h and o form the running column accumulator: l is emitted as
         * the column's digit, then h and o move down one place. */
        sp_int_digit l;
9261
1.64M
        sp_int_digit h;
9262
1.64M
        sp_int_digit o;
9263
9264
1.64M
        h = 0;
9265
1.64M
        l = 0;
9266
1.64M
        /* Column 0: h receives the digit stored as t[0]; l is left holding
         * the carry into column 1 (presumably the product's upper half). */
        SP_ASM_MUL(h, l, a->dp[0], b->dp[0]);
9267
1.64M
        t[0] = h;
9268
1.64M
        h = 0;
9269
1.64M
        o = 0;
9270
6.95M
        /* Columns 1 .. b->used-1: pairs start at (i = 0, j = k) and stop
         * when either index runs out of its operand. */
        for (k = 1; k <= (sp_size_t)(b->used - 1); k++) {
9271
5.30M
            i = 0;
9272
5.30M
            j = (int)k;
9273
17.6M
            for (; (i < a->used) && (j >= 0); i++, j--) {
9274
12.3M
                SP_ASM_MUL_ADD(l, h, o, a->dp[i], b->dp[j]);
9275
12.3M
            }
9276
5.30M
            t[k] = l;
9277
5.30M
            l = h;
9278
5.30M
            h = o;
9279
5.30M
            o = 0;
9280
5.30M
        }
9281
5.12M
        /* Remaining columns: pairs start at b's top digit, with i chosen so
         * that i + j == k. */
        for (; k <= (sp_size_t)((a->used - 1) + (b->used - 1)); k++) {
9282
3.47M
            j = (int)(b->used - 1);
9283
3.47M
            i = (sp_size_t)(k - (sp_size_t)j);
9284
12.1M
            for (; (i < a->used) && (j >= 0); i++, j--) {
9285
8.68M
                SP_ASM_MUL_ADD(l, h, o, a->dp[i], b->dp[j]);
9286
8.68M
            }
9287
3.47M
            t[k] = l;
9288
3.47M
            l = h;
9289
3.47M
            h = o;
9290
3.47M
            o = 0;
9291
3.47M
        }
9292
1.64M
        /* Final carry is the top digit. */
        t[k] = l;
9293
1.64M
        r->used = (sp_size_t)(k + 1);
9294
1.64M
        XMEMCPY(r->dp, t, r->used * sizeof(sp_int_digit));
9295
1.64M
        sp_clamp(r);
9296
1.64M
    }
9297
9298
1.64M
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
9299
1.64M
    XFREE(t, NULL, DYNAMIC_TYPE_BIGINT);
9300
1.64M
#endif
9301
1.64M
    return err;
9302
1.64M
}
9303
#else
9304
/* Multiply a by b into r. r = a * b
9305
 *
9306
 * @param  [in]   a    SP integer to multiply.
9307
 * @param  [in]   b    SP integer to multiply by.
9308
 * @param  [out]  r    SP integer to hold result.
9309
 *
9310
 * @return  MP_OKAY on success.
9311
 * @return  MP_MEM when dynamic memory allocation fails.
9312
 */
9313
static int _sp_mul(const sp_int* a, const sp_int* b, sp_int* r)
9314
{
9315
    int err = MP_OKAY;
9316
    sp_size_t i;
9317
    int j;
9318
    sp_size_t k;
9319
    /* t buffers the whole product before it is copied into r. */
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
9320
    sp_int_digit* t = NULL;
9321
#elif defined(WOLFSSL_SP_DYN_STACK)
9322
    sp_int_digit t[a->used + b->used];
9323
#else
9324
    sp_int_digit t[SP_INT_DIGITS];
9325
#endif
9326
9327
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
9328
    t = (sp_int_digit*)XMALLOC(sizeof(sp_int_digit) *
9329
                               (size_t)(a->used + b->used), NULL,
9330
                               DYNAMIC_TYPE_BIGINT);
9331
    if (t == NULL) {
9332
        err = MP_MEM;
9333
    }
9334
#endif
9335
    if (err == MP_OKAY) {
9336
        /* w holds one digit-by-digit product; l and h accumulate the low and
         * high halves of the products in the current column. */
        sp_int_word w;
9337
        sp_int_word l;
9338
        sp_int_word h;
9339
    #ifdef SP_WORD_OVERFLOW
9340
        /* Extra accumulator for carries out of h in this configuration. */
        sp_int_word o;
9341
    #endif
9342
9343
        /* Column 0 is the single product a[0] * b[0]. */
        w = (sp_int_word)a->dp[0] * b->dp[0];
9344
        t[0] = (sp_int_digit)w;
9345
        l = (sp_int_digit)(w >> SP_WORD_SIZE);
9346
        h = 0;
9347
    #ifdef SP_WORD_OVERFLOW
9348
        o = 0;
9349
    #endif
9350
        for (k = 1; (int)k <= ((int)a->used - 1) + ((int)b->used - 1); k++) {
9351
            i = (sp_size_t)(k - (b->used - 1));
9352
            /* Force i to 0 when its top bit is set (the subtraction above
             * wrapped): the mask is 0 in that case and all ones otherwise. */
            i &= (sp_size_t)(((unsigned int)i >> (sizeof(i) * 8 - 1)) - 1U);
9353
            j = (int)(k - i);
9354
            /* Sum every a->dp[i] * b->dp[j] with i + j == k. */
            for (; (i < a->used) && (j >= 0); i++, j--) {
9355
                w = (sp_int_word)a->dp[i] * b->dp[j];
9356
                l += (sp_int_digit)w;
9357
                h += (sp_int_digit)(w >> SP_WORD_SIZE);
9358
            #ifdef SP_WORD_OVERFLOW
9359
                h += (sp_int_digit)(l >> SP_WORD_SIZE);
9360
                l &= SP_MASK;
9361
                o += (sp_int_digit)(h >> SP_WORD_SIZE);
9362
                h &= SP_MASK;
9363
            #endif
9364
            }
9365
            /* Emit the column digit and carry the rest into the next one. */
            t[k] = (sp_int_digit)l;
9366
            l >>= SP_WORD_SIZE;
9367
            l += (sp_int_digit)h;
9368
            h >>= SP_WORD_SIZE;
9369
        #ifdef SP_WORD_OVERFLOW
9370
            h += o & SP_MASK;
9371
            o >>= SP_WORD_SIZE;
9372
        #endif
9373
        }
9374
        /* Final carry is the top digit. */
        t[k] = (sp_int_digit)l;
9375
        r->used = (sp_size_t)(k + 1);
9376
        XMEMCPY(r->dp, t, r->used * sizeof(sp_int_digit));
9377
        sp_clamp(r);
9378
    }
9379
9380
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
9381
    XFREE(t, NULL, DYNAMIC_TYPE_BIGINT);
9382
#endif
9383
    return err;
9384
}
9385
#endif
9386
9387
#ifndef WOLFSSL_SP_SMALL
9388
#if !defined(WOLFSSL_HAVE_SP_ECC) && defined(HAVE_ECC)
9389
#if (SP_WORD_SIZE == 64 && SP_INT_BITS >= 256)
9390
#ifndef SQR_MUL_ASM
9391
/* Multiply a by b and store in r: r = a * b
9392
 *
9393
 * Long-hand implementation.
9394
 *
9395
 * @param  [in]   a  SP integer to multiply.
9396
 * @param  [in]   b  SP integer to multiply.
9397
 * @param  [out]  r  SP integer result.
9398
 *
9399
 * @return  MP_OKAY on success.
9400
 * @return  MP_MEM when dynamic memory allocation fails.
9401
 */
9402
static int _sp_mul_4(const sp_int* a, const sp_int* b, sp_int* r)
9403
{
9404
    int err = MP_OKAY;
9405
    /* w holds the 16 double-width partial products of the 4x4 multiply. */
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
9406
    sp_int_word* w = NULL;
9407
#else
9408
    sp_int_word w[16];
9409
#endif
9410
    const sp_int_digit* da = a->dp;
9411
    const sp_int_digit* db = b->dp;
9412
9413
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
9414
    w = (sp_int_word*)XMALLOC(sizeof(sp_int_word) * 16, NULL,
9415
        DYNAMIC_TYPE_BIGINT);
9416
    if (w == NULL) {
9417
        err = MP_MEM;
9418
    }
9419
#endif
9420
9421
    if (err == MP_OKAY) {
        /* All products are computed first, grouped by column (index sum of
         * the two digits): w[0] is column 0, w[1..2] column 1, w[3..5]
         * column 2, w[6..9] column 3, w[10..12] column 4, w[13..14] column 5
         * and w[15] column 6. */
9422
        w[0] = (sp_int_word)da[0] * db[0];
9423
        w[1] = (sp_int_word)da[0] * db[1];
9424
        w[2] = (sp_int_word)da[1] * db[0];
9425
        w[3] = (sp_int_word)da[0] * db[2];
9426
        w[4] = (sp_int_word)da[1] * db[1];
9427
        w[5] = (sp_int_word)da[2] * db[0];
9428
        w[6] = (sp_int_word)da[0] * db[3];
9429
        w[7] = (sp_int_word)da[1] * db[2];
9430
        w[8] = (sp_int_word)da[2] * db[1];
9431
        w[9] = (sp_int_word)da[3] * db[0];
9432
        w[10] = (sp_int_word)da[1] * db[3];
9433
        w[11] = (sp_int_word)da[2] * db[2];
9434
        w[12] = (sp_int_word)da[3] * db[1];
9435
        w[13] = (sp_int_word)da[2] * db[3];
9436
        w[14] = (sp_int_word)da[3] * db[2];
9437
        w[15] = (sp_int_word)da[3] * db[3];
9438
9439
        /* w[0] doubles as the running accumulator: each result digit is its
         * low half, then it is shifted down and receives the high halves of
         * the previous column's products and the low halves of the next. */
        r->dp[0] = (sp_int_digit)w[0];
9440
        w[0] >>= SP_WORD_SIZE;
9441
        w[0] += (sp_int_digit)w[1];
9442
        w[0] += (sp_int_digit)w[2];
9443
        r->dp[1] = (sp_int_digit)w[0];
9444
        w[0] >>= SP_WORD_SIZE;
9445
        w[1] >>= SP_WORD_SIZE;
9446
        w[0] += (sp_int_digit)w[1];
9447
        w[2] >>= SP_WORD_SIZE;
9448
        w[0] += (sp_int_digit)w[2];
9449
        w[0] += (sp_int_digit)w[3];
9450
        w[0] += (sp_int_digit)w[4];
9451
        w[0] += (sp_int_digit)w[5];
9452
        r->dp[2] = (sp_int_digit)w[0];
9453
        w[0] >>= SP_WORD_SIZE;
9454
        w[3] >>= SP_WORD_SIZE;
9455
        w[0] += (sp_int_digit)w[3];
9456
        w[4] >>= SP_WORD_SIZE;
9457
        w[0] += (sp_int_digit)w[4];
9458
        w[5] >>= SP_WORD_SIZE;
9459
        w[0] += (sp_int_digit)w[5];
9460
        w[0] += (sp_int_digit)w[6];
9461
        w[0] += (sp_int_digit)w[7];
9462
        w[0] += (sp_int_digit)w[8];
9463
        w[0] += (sp_int_digit)w[9];
9464
        r->dp[3] = (sp_int_digit)w[0];
9465
        w[0] >>= SP_WORD_SIZE;
9466
        w[6] >>= SP_WORD_SIZE;
9467
        w[0] += (sp_int_digit)w[6];
9468
        w[7] >>= SP_WORD_SIZE;
9469
        w[0] += (sp_int_digit)w[7];
9470
        w[8] >>= SP_WORD_SIZE;
9471
        w[0] += (sp_int_digit)w[8];
9472
        w[9] >>= SP_WORD_SIZE;
9473
        w[0] += (sp_int_digit)w[9];
9474
        w[0] += (sp_int_digit)w[10];
9475
        w[0] += (sp_int_digit)w[11];
9476
        w[0] += (sp_int_digit)w[12];
9477
        r->dp[4] = (sp_int_digit)w[0];
9478
        w[0] >>= SP_WORD_SIZE;
9479
        w[10] >>= SP_WORD_SIZE;
9480
        w[0] += (sp_int_digit)w[10];
9481
        w[11] >>= SP_WORD_SIZE;
9482
        w[0] += (sp_int_digit)w[11];
9483
        w[12] >>= SP_WORD_SIZE;
9484
        w[0] += (sp_int_digit)w[12];
9485
        w[0] += (sp_int_digit)w[13];
9486
        w[0] += (sp_int_digit)w[14];
9487
        r->dp[5] = (sp_int_digit)w[0];
9488
        w[0] >>= SP_WORD_SIZE;
9489
        w[13] >>= SP_WORD_SIZE;
9490
        w[0] += (sp_int_digit)w[13];
9491
        w[14] >>= SP_WORD_SIZE;
9492
        w[0] += (sp_int_digit)w[14];
9493
        w[0] += (sp_int_digit)w[15];
9494
        r->dp[6] = (sp_int_digit)w[0];
9495
        w[0] >>= SP_WORD_SIZE;
9496
        w[15] >>= SP_WORD_SIZE;
9497
        w[0] += (sp_int_digit)w[15];
9498
        r->dp[7] = (sp_int_digit)w[0];
9499
9500
        r->used = 8;
9501
        sp_clamp(r);
9502
    }
9503
9504
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
9505
    XFREE(w, NULL, DYNAMIC_TYPE_BIGINT);
9506
#endif
9507
    return err;
9508
}
9509
#else /* SQR_MUL_ASM */
9510
/* Multiply a by b and store in r: r = a * b
9511
 *
9512
 * Comba implementation.
9513
 *
9514
 * @param  [in]   a  SP integer to multiply.
9515
 * @param  [in]   b  SP integer to multiply.
9516
 * @param  [out]  r  SP integer result.
9517
 *
9518
 * @return  MP_OKAY on success.
9519
 * @return  MP_MEM when dynamic memory allocation fails.
9520
 */
9521
static int _sp_mul_4(const sp_int* a, const sp_int* b, sp_int* r)
9522
28.5M
{
    /* This variant allocates nothing and only ever returns MP_OKAY. */
9523
28.5M
    /* l, h and o form the running column accumulator: l is emitted as the
     * column's digit, then h and o move down one place. */
    sp_int_digit l = 0;
9524
28.5M
    sp_int_digit h = 0;
9525
28.5M
    sp_int_digit o = 0;
9526
28.5M
    /* Buffer for the low four result digits; copied into r at the end. */
    sp_int_digit t[4];
9527
9528
28.5M
    /* Column 0: h receives the digit stored as t[0]; l is left holding the
     * carry into column 1 (presumably the product's upper half). */
    SP_ASM_MUL(h, l, a->dp[0], b->dp[0]);
9529
28.5M
    t[0] = h;
9530
28.5M
    h = 0;
9531
28.5M
    /* Column 1. */
    SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[1]);
9532
28.5M
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[0]);
9533
28.5M
    t[1] = l;
9534
28.5M
    l = h;
9535
28.5M
    h = o;
9536
28.5M
    o = 0;
9537
28.5M
    /* Column 2. */
    SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[2]);
9538
28.5M
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[1]);
9539
28.5M
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[0]);
9540
28.5M
    t[2] = l;
9541
28.5M
    l = h;
9542
28.5M
    h = o;
9543
28.5M
    o = 0;
9544
28.5M
    /* Column 3. */
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[3]);
9545
28.5M
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[2]);
9546
28.5M
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[1]);
9547
28.5M
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[0]);
9548
28.5M
    t[3] = l;
9549
28.5M
    l = h;
9550
28.5M
    h = o;
9551
28.5M
    o = 0;
9552
28.5M
    /* Column 4 onwards is written straight to r. */
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[3]);
9553
28.5M
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[2]);
9554
28.5M
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[1]);
9555
28.5M
    r->dp[4] = l;
9556
28.5M
    l = h;
9557
28.5M
    h = o;
9558
28.5M
    o = 0;
9559
28.5M
    /* Column 5. */
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[3]);
9560
28.5M
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[2]);
9561
28.5M
    r->dp[5] = l;
9562
28.5M
    l = h;
9563
28.5M
    h = o;
9564
28.5M
    /* Column 6; its upper part in h becomes the top digit. */
    SP_ASM_MUL_ADD_NO(l, h, a->dp[3], b->dp[3]);
9565
28.5M
    r->dp[6] = l;
9566
28.5M
    r->dp[7] = h;
9567
28.5M
    /* Low digits are copied in only after every input digit has been read -
     * presumably so that r may alias a or b (TODO confirm). */
    XMEMCPY(r->dp, t, 4 * sizeof(sp_int_digit));
9568
28.5M
    r->used = 8;
9569
28.5M
    sp_clamp(r);
9570
9571
28.5M
    return MP_OKAY;
9572
28.5M
}
9573
#endif /* SQR_MUL_ASM */
9574
#endif /* SP_WORD_SIZE == 64 */
9575
#if (SP_WORD_SIZE == 64 && SP_INT_BITS >= 384)
9576
#ifdef SQR_MUL_ASM
9577
/* Multiply a by b and store in r: r = a * b
9578
 *
9579
 * Comba implementation.
9580
 *
9581
 * @param  [in]   a  SP integer to multiply.
9582
 * @param  [in]   b  SP integer to multiply.
9583
 * @param  [out]  r  SP integer result.
9584
 *
9585
 * @return  MP_OKAY on success.
9586
 * @return  MP_MEM when dynamic memory allocation fails.
9587
 */
9588
static int _sp_mul_6(const sp_int* a, const sp_int* b, sp_int* r)
9589
9.75M
{
9590
9.75M
    sp_int_digit l = 0;
9591
9.75M
    sp_int_digit h = 0;
9592
9.75M
    sp_int_digit o = 0;
9593
9.75M
    sp_int_digit t[6];
9594
9595
9.75M
    SP_ASM_MUL(h, l, a->dp[0], b->dp[0]);
9596
9.75M
    t[0] = h;
9597
9.75M
    h = 0;
9598
9.75M
    SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[1]);
9599
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[0]);
9600
9.75M
    t[1] = l;
9601
9.75M
    l = h;
9602
9.75M
    h = o;
9603
9.75M
    o = 0;
9604
9.75M
    SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[2]);
9605
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[1]);
9606
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[0]);
9607
9.75M
    t[2] = l;
9608
9.75M
    l = h;
9609
9.75M
    h = o;
9610
9.75M
    o = 0;
9611
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[3]);
9612
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[2]);
9613
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[1]);
9614
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[0]);
9615
9.75M
    t[3] = l;
9616
9.75M
    l = h;
9617
9.75M
    h = o;
9618
9.75M
    o = 0;
9619
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[4]);
9620
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[3]);
9621
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[2]);
9622
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[1]);
9623
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[0]);
9624
9.75M
    t[4] = l;
9625
9.75M
    l = h;
9626
9.75M
    h = o;
9627
9.75M
    o = 0;
9628
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[5]);
9629
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[4]);
9630
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[3]);
9631
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[2]);
9632
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[1]);
9633
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[0]);
9634
9.75M
    t[5] = l;
9635
9.75M
    l = h;
9636
9.75M
    h = o;
9637
9.75M
    o = 0;
9638
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[5]);
9639
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[4]);
9640
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[3]);
9641
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[2]);
9642
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[1]);
9643
9.75M
    r->dp[6] = l;
9644
9.75M
    l = h;
9645
9.75M
    h = o;
9646
9.75M
    o = 0;
9647
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[5]);
9648
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[4]);
9649
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[3]);
9650
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[2]);
9651
9.75M
    r->dp[7] = l;
9652
9.75M
    l = h;
9653
9.75M
    h = o;
9654
9.75M
    o = 0;
9655
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[5]);
9656
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[4]);
9657
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[3]);
9658
9.75M
    r->dp[8] = l;
9659
9.75M
    l = h;
9660
9.75M
    h = o;
9661
9.75M
    o = 0;
9662
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[5]);
9663
9.75M
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[4]);
9664
9.75M
    r->dp[9] = l;
9665
9.75M
    l = h;
9666
9.75M
    h = o;
9667
9.75M
    SP_ASM_MUL_ADD_NO(l, h, a->dp[5], b->dp[5]);
9668
9.75M
    r->dp[10] = l;
9669
9.75M
    r->dp[11] = h;
9670
9.75M
    XMEMCPY(r->dp, t, 6 * sizeof(sp_int_digit));
9671
9.75M
    r->used = 12;
9672
9.75M
    sp_clamp(r);
9673
9674
9.75M
    return MP_OKAY;
9675
9.75M
}
9676
#endif /* SQR_MUL_ASM */
9677
#endif /* SP_WORD_SIZE == 64 */
9678
#if (SP_WORD_SIZE == 32 && SP_INT_BITS >= 256)
9679
#ifdef SQR_MUL_ASM
9680
/* Multiply a by b and store in r: r = a * b
9681
 *
9682
 * Comba implementation.
9683
 *
9684
 * @param  [in]   a  SP integer to multiply.
9685
 * @param  [in]   b  SP integer to multiply.
9686
 * @param  [out]  r  SP integer result.
9687
 *
9688
 * @return  MP_OKAY on success.
9689
 * @return  MP_MEM when dynamic memory allocation fails.
9690
 */
9691
static int _sp_mul_8(const sp_int* a, const sp_int* b, sp_int* r)
9692
{
9693
    sp_int_digit l = 0;
9694
    sp_int_digit h = 0;
9695
    sp_int_digit o = 0;
9696
    sp_int_digit t[8];
9697
9698
    SP_ASM_MUL(h, l, a->dp[0], b->dp[0]);
9699
    t[0] = h;
9700
    h = 0;
9701
    SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[1]);
9702
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[0]);
9703
    t[1] = l;
9704
    l = h;
9705
    h = o;
9706
    o = 0;
9707
    SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[2]);
9708
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[1]);
9709
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[0]);
9710
    t[2] = l;
9711
    l = h;
9712
    h = o;
9713
    o = 0;
9714
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[3]);
9715
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[2]);
9716
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[1]);
9717
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[0]);
9718
    t[3] = l;
9719
    l = h;
9720
    h = o;
9721
    o = 0;
9722
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[4]);
9723
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[3]);
9724
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[2]);
9725
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[1]);
9726
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[0]);
9727
    t[4] = l;
9728
    l = h;
9729
    h = o;
9730
    o = 0;
9731
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[5]);
9732
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[4]);
9733
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[3]);
9734
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[2]);
9735
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[1]);
9736
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[0]);
9737
    t[5] = l;
9738
    l = h;
9739
    h = o;
9740
    o = 0;
9741
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[6]);
9742
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[5]);
9743
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[4]);
9744
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[3]);
9745
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[2]);
9746
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[1]);
9747
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[0]);
9748
    t[6] = l;
9749
    l = h;
9750
    h = o;
9751
    o = 0;
9752
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[7]);
9753
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[6]);
9754
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[5]);
9755
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[4]);
9756
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[3]);
9757
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[2]);
9758
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[1]);
9759
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[0]);
9760
    t[7] = l;
9761
    l = h;
9762
    h = o;
9763
    o = 0;
9764
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[7]);
9765
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[6]);
9766
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[5]);
9767
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[4]);
9768
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[3]);
9769
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[2]);
9770
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[1]);
9771
    r->dp[8] = l;
9772
    l = h;
9773
    h = o;
9774
    o = 0;
9775
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[7]);
9776
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[6]);
9777
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[5]);
9778
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[4]);
9779
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[3]);
9780
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[2]);
9781
    r->dp[9] = l;
9782
    l = h;
9783
    h = o;
9784
    o = 0;
9785
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[7]);
9786
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[6]);
9787
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[5]);
9788
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[4]);
9789
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[3]);
9790
    r->dp[10] = l;
9791
    l = h;
9792
    h = o;
9793
    o = 0;
9794
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[7]);
9795
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[6]);
9796
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[5]);
9797
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[4]);
9798
    r->dp[11] = l;
9799
    l = h;
9800
    h = o;
9801
    o = 0;
9802
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[7]);
9803
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[6]);
9804
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[5]);
9805
    r->dp[12] = l;
9806
    l = h;
9807
    h = o;
9808
    o = 0;
9809
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[7]);
9810
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[6]);
9811
    r->dp[13] = l;
9812
    l = h;
9813
    h = o;
9814
    SP_ASM_MUL_ADD_NO(l, h, a->dp[7], b->dp[7]);
9815
    r->dp[14] = l;
9816
    r->dp[15] = h;
9817
    XMEMCPY(r->dp, t, 8 * sizeof(sp_int_digit));
9818
    r->used = 16;
9819
    sp_clamp(r);
9820
9821
    return MP_OKAY;
9822
}
9823
#endif /* SQR_MUL_ASM */
9824
#endif /* SP_WORD_SIZE == 32 */
9825
#if (SP_WORD_SIZE == 32 && SP_INT_BITS >= 384)
9826
#ifdef SQR_MUL_ASM
9827
/* Multiply a by b and store in r: r = a * b
9828
 *
9829
 * Comba implementation.
9830
 *
9831
 * @param  [in]   a  SP integer to multiply.
9832
 * @param  [in]   b  SP integer to multiply.
9833
 * @param  [out]  r  SP integer result.
9834
 *
9835
 * @return  MP_OKAY on success.
9836
 * @return  MP_MEM when dynamic memory allocation fails.
9837
 */
9838
static int _sp_mul_12(const sp_int* a, const sp_int* b, sp_int* r)
9839
{
9840
    sp_int_digit l = 0;
9841
    sp_int_digit h = 0;
9842
    sp_int_digit o = 0;
9843
    sp_int_digit t[12];
9844
9845
    SP_ASM_MUL(h, l, a->dp[0], b->dp[0]);
9846
    t[0] = h;
9847
    h = 0;
9848
    SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[1]);
9849
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[0]);
9850
    t[1] = l;
9851
    l = h;
9852
    h = o;
9853
    o = 0;
9854
    SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[2]);
9855
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[1]);
9856
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[0]);
9857
    t[2] = l;
9858
    l = h;
9859
    h = o;
9860
    o = 0;
9861
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[3]);
9862
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[2]);
9863
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[1]);
9864
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[0]);
9865
    t[3] = l;
9866
    l = h;
9867
    h = o;
9868
    o = 0;
9869
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[4]);
9870
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[3]);
9871
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[2]);
9872
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[1]);
9873
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[0]);
9874
    t[4] = l;
9875
    l = h;
9876
    h = o;
9877
    o = 0;
9878
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[5]);
9879
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[4]);
9880
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[3]);
9881
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[2]);
9882
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[1]);
9883
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[0]);
9884
    t[5] = l;
9885
    l = h;
9886
    h = o;
9887
    o = 0;
9888
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[6]);
9889
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[5]);
9890
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[4]);
9891
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[3]);
9892
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[2]);
9893
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[1]);
9894
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[0]);
9895
    t[6] = l;
9896
    l = h;
9897
    h = o;
9898
    o = 0;
9899
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[7]);
9900
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[6]);
9901
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[5]);
9902
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[4]);
9903
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[3]);
9904
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[2]);
9905
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[1]);
9906
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[0]);
9907
    t[7] = l;
9908
    l = h;
9909
    h = o;
9910
    o = 0;
9911
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[8]);
9912
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[7]);
9913
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[6]);
9914
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[5]);
9915
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[4]);
9916
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[3]);
9917
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[2]);
9918
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[1]);
9919
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[0]);
9920
    t[8] = l;
9921
    l = h;
9922
    h = o;
9923
    o = 0;
9924
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[9]);
9925
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[8]);
9926
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[7]);
9927
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[6]);
9928
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[5]);
9929
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[4]);
9930
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[3]);
9931
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[2]);
9932
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[1]);
9933
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[0]);
9934
    t[9] = l;
9935
    l = h;
9936
    h = o;
9937
    o = 0;
9938
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[10]);
9939
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[9]);
9940
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[8]);
9941
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[7]);
9942
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[6]);
9943
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[5]);
9944
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[4]);
9945
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[3]);
9946
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[2]);
9947
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[1]);
9948
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[0]);
9949
    t[10] = l;
9950
    l = h;
9951
    h = o;
9952
    o = 0;
9953
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[11]);
9954
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[10]);
9955
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[9]);
9956
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[8]);
9957
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[7]);
9958
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[6]);
9959
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[5]);
9960
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[4]);
9961
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[3]);
9962
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[2]);
9963
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[1]);
9964
    SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[0]);
9965
    t[11] = l;
9966
    l = h;
9967
    h = o;
9968
    o = 0;
9969
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[11]);
9970
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[10]);
9971
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[9]);
9972
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[8]);
9973
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[7]);
9974
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[6]);
9975
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[5]);
9976
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[4]);
9977
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[3]);
9978
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[2]);
9979
    SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[1]);
9980
    r->dp[12] = l;
9981
    l = h;
9982
    h = o;
9983
    o = 0;
9984
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[11]);
9985
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[10]);
9986
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[9]);
9987
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[8]);
9988
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[7]);
9989
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[6]);
9990
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[5]);
9991
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[4]);
9992
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[3]);
9993
    SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[2]);
9994
    r->dp[13] = l;
9995
    l = h;
9996
    h = o;
9997
    o = 0;
9998
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[11]);
9999
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[10]);
10000
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[9]);
10001
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[8]);
10002
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[7]);
10003
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[6]);
10004
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[5]);
10005
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[4]);
10006
    SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[3]);
10007
    r->dp[14] = l;
10008
    l = h;
10009
    h = o;
10010
    o = 0;
10011
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[11]);
10012
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[10]);
10013
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[9]);
10014
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[8]);
10015
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[7]);
10016
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[6]);
10017
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[5]);
10018
    SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[4]);
10019
    r->dp[15] = l;
10020
    l = h;
10021
    h = o;
10022
    o = 0;
10023
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[11]);
10024
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[10]);
10025
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[9]);
10026
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[8]);
10027
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[7]);
10028
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[6]);
10029
    SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[5]);
10030
    r->dp[16] = l;
10031
    l = h;
10032
    h = o;
10033
    o = 0;
10034
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[11]);
10035
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[10]);
10036
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[9]);
10037
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[8]);
10038
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[7]);
10039
    SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[6]);
10040
    r->dp[17] = l;
10041
    l = h;
10042
    h = o;
10043
    o = 0;
10044
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[11]);
10045
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[10]);
10046
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[9]);
10047
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[8]);
10048
    SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[7]);
10049
    r->dp[18] = l;
10050
    l = h;
10051
    h = o;
10052
    o = 0;
10053
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[11]);
10054
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[10]);
10055
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[9]);
10056
    SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[8]);
10057
    r->dp[19] = l;
10058
    l = h;
10059
    h = o;
10060
    o = 0;
10061
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[11]);
10062
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[10]);
10063
    SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[9]);
10064
    r->dp[20] = l;
10065
    l = h;
10066
    h = o;
10067
    o = 0;
10068
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[11]);
10069
    SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[10]);
10070
    r->dp[21] = l;
10071
    l = h;
10072
    h = o;
10073
    SP_ASM_MUL_ADD_NO(l, h, a->dp[11], b->dp[11]);
10074
    r->dp[22] = l;
10075
    r->dp[23] = h;
10076
    XMEMCPY(r->dp, t, 12 * sizeof(sp_int_digit));
10077
    r->used = 24;
10078
    sp_clamp(r);
10079
10080
    return MP_OKAY;
10081
}
10082
#endif /* SQR_MUL_ASM */
10083
#endif /* SP_WORD_SIZE == 32 */
10084
#endif /* !WOLFSSL_HAVE_SP_ECC && HAVE_ECC */
10085
10086
#if defined(SQR_MUL_ASM) && (defined(WOLFSSL_SP_INT_LARGE_COMBA) || \
10087
    (!defined(WOLFSSL_SP_MATH) && defined(WOLFCRYPT_HAVE_SAKKE) && \
10088
    (SP_WORD_SIZE == 64)))
10089
    #if SP_INT_DIGITS >= 32
10090
/* Multiply a by b and store in r: r = a * b
10091
 *
10092
 * Comba implementation.
10093
 *
10094
 * @param  [in]   a  SP integer to multiply.
10095
 * @param  [in]   b  SP integer to multiply.
10096
 * @param  [out]  r  SP integer result.
10097
 *
10098
 * @return  MP_OKAY on success.
10099
 * @return  MP_MEM when dynamic memory allocation fails.
10100
 */
10101
static int _sp_mul_16(const sp_int* a, const sp_int* b, sp_int* r)
10102
{
10103
    int err = MP_OKAY;
10104
    sp_int_digit l = 0;
10105
    sp_int_digit h = 0;
10106
    sp_int_digit o = 0;
10107
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
10108
    sp_int_digit* t = NULL;
10109
#else
10110
    sp_int_digit t[16];
10111
#endif
10112
10113
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
10114
     t = (sp_int_digit*)XMALLOC(sizeof(sp_int_digit) * 16, NULL,
10115
         DYNAMIC_TYPE_BIGINT);
10116
     if (t == NULL) {
10117
         err = MP_MEM;
10118
     }
10119
#endif
10120
    if (err == MP_OKAY) {
10121
        SP_ASM_MUL(h, l, a->dp[0], b->dp[0]);
10122
        t[0] = h;
10123
        h = 0;
10124
        SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[1]);
10125
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[0]);
10126
        t[1] = l;
10127
        l = h;
10128
        h = o;
10129
        o = 0;
10130
        SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[2]);
10131
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[1]);
10132
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[0]);
10133
        t[2] = l;
10134
        l = h;
10135
        h = o;
10136
        o = 0;
10137
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[3]);
10138
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[2]);
10139
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[1]);
10140
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[0]);
10141
        t[3] = l;
10142
        l = h;
10143
        h = o;
10144
        o = 0;
10145
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[4]);
10146
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[3]);
10147
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[2]);
10148
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[1]);
10149
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[0]);
10150
        t[4] = l;
10151
        l = h;
10152
        h = o;
10153
        o = 0;
10154
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[5]);
10155
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[4]);
10156
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[3]);
10157
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[2]);
10158
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[1]);
10159
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[0]);
10160
        t[5] = l;
10161
        l = h;
10162
        h = o;
10163
        o = 0;
10164
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[6]);
10165
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[5]);
10166
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[4]);
10167
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[3]);
10168
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[2]);
10169
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[1]);
10170
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[0]);
10171
        t[6] = l;
10172
        l = h;
10173
        h = o;
10174
        o = 0;
10175
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[7]);
10176
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[6]);
10177
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[5]);
10178
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[4]);
10179
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[3]);
10180
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[2]);
10181
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[1]);
10182
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[0]);
10183
        t[7] = l;
10184
        l = h;
10185
        h = o;
10186
        o = 0;
10187
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[8]);
10188
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[7]);
10189
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[6]);
10190
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[5]);
10191
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[4]);
10192
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[3]);
10193
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[2]);
10194
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[1]);
10195
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[0]);
10196
        t[8] = l;
10197
        l = h;
10198
        h = o;
10199
        o = 0;
10200
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[9]);
10201
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[8]);
10202
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[7]);
10203
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[6]);
10204
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[5]);
10205
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[4]);
10206
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[3]);
10207
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[2]);
10208
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[1]);
10209
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[0]);
10210
        t[9] = l;
10211
        l = h;
10212
        h = o;
10213
        o = 0;
10214
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[10]);
10215
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[9]);
10216
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[8]);
10217
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[7]);
10218
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[6]);
10219
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[5]);
10220
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[4]);
10221
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[3]);
10222
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[2]);
10223
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[1]);
10224
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[0]);
10225
        t[10] = l;
10226
        l = h;
10227
        h = o;
10228
        o = 0;
10229
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[11]);
10230
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[10]);
10231
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[9]);
10232
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[8]);
10233
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[7]);
10234
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[6]);
10235
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[5]);
10236
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[4]);
10237
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[3]);
10238
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[2]);
10239
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[1]);
10240
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[0]);
10241
        t[11] = l;
10242
        l = h;
10243
        h = o;
10244
        o = 0;
10245
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[12]);
10246
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[11]);
10247
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[10]);
10248
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[9]);
10249
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[8]);
10250
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[7]);
10251
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[6]);
10252
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[5]);
10253
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[4]);
10254
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[3]);
10255
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[2]);
10256
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[1]);
10257
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[0]);
10258
        t[12] = l;
10259
        l = h;
10260
        h = o;
10261
        o = 0;
10262
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[13]);
10263
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[12]);
10264
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[11]);
10265
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[10]);
10266
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[9]);
10267
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[8]);
10268
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[7]);
10269
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[6]);
10270
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[5]);
10271
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[4]);
10272
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[3]);
10273
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[2]);
10274
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[1]);
10275
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[0]);
10276
        t[13] = l;
10277
        l = h;
10278
        h = o;
10279
        o = 0;
10280
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[14]);
10281
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[13]);
10282
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[12]);
10283
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[11]);
10284
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[10]);
10285
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[9]);
10286
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[8]);
10287
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[7]);
10288
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[6]);
10289
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[5]);
10290
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[4]);
10291
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[3]);
10292
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[2]);
10293
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[1]);
10294
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[0]);
10295
        t[14] = l;
10296
        l = h;
10297
        h = o;
10298
        o = 0;
10299
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[15]);
10300
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[14]);
10301
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[13]);
10302
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[12]);
10303
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[11]);
10304
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[10]);
10305
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[9]);
10306
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[8]);
10307
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[7]);
10308
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[6]);
10309
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[5]);
10310
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[4]);
10311
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[3]);
10312
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[2]);
10313
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[1]);
10314
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[0]);
10315
        t[15] = l;
10316
        l = h;
10317
        h = o;
10318
        o = 0;
10319
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[15]);
10320
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[14]);
10321
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[13]);
10322
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[12]);
10323
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[11]);
10324
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[10]);
10325
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[9]);
10326
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[8]);
10327
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[7]);
10328
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[6]);
10329
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[5]);
10330
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[4]);
10331
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[3]);
10332
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[2]);
10333
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[1]);
10334
        r->dp[16] = l;
10335
        l = h;
10336
        h = o;
10337
        o = 0;
10338
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[15]);
10339
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[14]);
10340
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[13]);
10341
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[12]);
10342
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[11]);
10343
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[10]);
10344
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[9]);
10345
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[8]);
10346
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[7]);
10347
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[6]);
10348
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[5]);
10349
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[4]);
10350
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[3]);
10351
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[2]);
10352
        r->dp[17] = l;
10353
        l = h;
10354
        h = o;
10355
        o = 0;
10356
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[15]);
10357
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[14]);
10358
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[13]);
10359
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[12]);
10360
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[11]);
10361
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[10]);
10362
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[9]);
10363
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[8]);
10364
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[7]);
10365
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[6]);
10366
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[5]);
10367
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[4]);
10368
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[3]);
10369
        r->dp[18] = l;
10370
        l = h;
10371
        h = o;
10372
        o = 0;
10373
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[15]);
10374
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[14]);
10375
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[13]);
10376
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[12]);
10377
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[11]);
10378
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[10]);
10379
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[9]);
10380
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[8]);
10381
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[7]);
10382
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[6]);
10383
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[5]);
10384
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[4]);
10385
        r->dp[19] = l;
10386
        l = h;
10387
        h = o;
10388
        o = 0;
10389
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[15]);
10390
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[14]);
10391
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[13]);
10392
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[12]);
10393
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[11]);
10394
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[10]);
10395
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[9]);
10396
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[8]);
10397
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[7]);
10398
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[6]);
10399
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[5]);
10400
        r->dp[20] = l;
10401
        l = h;
10402
        h = o;
10403
        o = 0;
10404
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[15]);
10405
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[14]);
10406
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[13]);
10407
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[12]);
10408
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[11]);
10409
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[10]);
10410
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[9]);
10411
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[8]);
10412
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[7]);
10413
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[6]);
10414
        r->dp[21] = l;
10415
        l = h;
10416
        h = o;
10417
        o = 0;
10418
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[15]);
10419
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[14]);
10420
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[13]);
10421
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[12]);
10422
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[11]);
10423
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[10]);
10424
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[9]);
10425
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[8]);
10426
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[7]);
10427
        r->dp[22] = l;
10428
        l = h;
10429
        h = o;
10430
        o = 0;
10431
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[15]);
10432
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[14]);
10433
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[13]);
10434
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[12]);
10435
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[11]);
10436
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[10]);
10437
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[9]);
10438
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[8]);
10439
        r->dp[23] = l;
10440
        l = h;
10441
        h = o;
10442
        o = 0;
10443
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[15]);
10444
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[14]);
10445
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[13]);
10446
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[12]);
10447
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[11]);
10448
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[10]);
10449
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[9]);
10450
        r->dp[24] = l;
10451
        l = h;
10452
        h = o;
10453
        o = 0;
10454
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[15]);
10455
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[14]);
10456
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[13]);
10457
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[12]);
10458
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[11]);
10459
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[10]);
10460
        r->dp[25] = l;
10461
        l = h;
10462
        h = o;
10463
        o = 0;
10464
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[15]);
10465
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[14]);
10466
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[13]);
10467
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[12]);
10468
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[11]);
10469
        r->dp[26] = l;
10470
        l = h;
10471
        h = o;
10472
        o = 0;
10473
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[15]);
10474
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[14]);
10475
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[13]);
10476
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[12]);
10477
        r->dp[27] = l;
10478
        l = h;
10479
        h = o;
10480
        o = 0;
10481
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[15]);
10482
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[14]);
10483
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[13]);
10484
        r->dp[28] = l;
10485
        l = h;
10486
        h = o;
10487
        o = 0;
10488
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[15]);
10489
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[14]);
10490
        r->dp[29] = l;
10491
        l = h;
10492
        h = o;
10493
        SP_ASM_MUL_ADD_NO(l, h, a->dp[15], b->dp[15]);
10494
        r->dp[30] = l;
10495
        r->dp[31] = h;
10496
        XMEMCPY(r->dp, t, 16 * sizeof(sp_int_digit));
10497
        r->used = 32;
10498
        sp_clamp(r);
10499
    }
10500
10501
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
10502
    XFREE(t, NULL, DYNAMIC_TYPE_BIGINT);
10503
#endif
10504
    return err;
10505
}
10506
    #endif /* SP_INT_DIGITS >= 32 */
10507
#endif /* SQR_MUL_ASM && (WOLFSSL_SP_INT_LARGE_COMBA || !WOLFSSL_SP_MATH &&
10508
        * WOLFCRYPT_HAVE_SAKKE && SP_WORD_SIZE == 64) */
10509
10510
#if defined(SQR_MUL_ASM) && defined(WOLFSSL_SP_INT_LARGE_COMBA)
10511
    #if SP_INT_DIGITS >= 48
10512
/* Multiply a by b and store in r: r = a * b
10513
 *
10514
 * Comba implementation.
10515
 *
10516
 * @param  [in]   a  SP integer to multiply.
10517
 * @param  [in]   b  SP integer to multiply.
10518
 * @param  [out]  r  SP integer result.
10519
 *
10520
 * @return  MP_OKAY on success.
10521
 * @return  MP_MEM when dynamic memory allocation fails.
10522
 */
10523
static int _sp_mul_24(const sp_int* a, const sp_int* b, sp_int* r)
10524
{
10525
    int err = MP_OKAY;
10526
    sp_int_digit l = 0;
10527
    sp_int_digit h = 0;
10528
    sp_int_digit o = 0;
10529
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
10530
    sp_int_digit* t = NULL;
10531
#else
10532
    sp_int_digit t[24];
10533
#endif
10534
10535
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
10536
     t = (sp_int_digit*)XMALLOC(sizeof(sp_int_digit) * 24, NULL,
10537
         DYNAMIC_TYPE_BIGINT);
10538
     if (t == NULL) {
10539
         err = MP_MEM;
10540
     }
10541
#endif
10542
    if (err == MP_OKAY) {
10543
        SP_ASM_MUL(h, l, a->dp[0], b->dp[0]);
10544
        t[0] = h;
10545
        h = 0;
10546
        SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[1]);
10547
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[0]);
10548
        t[1] = l;
10549
        l = h;
10550
        h = o;
10551
        o = 0;
10552
        SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[2]);
10553
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[1]);
10554
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[0]);
10555
        t[2] = l;
10556
        l = h;
10557
        h = o;
10558
        o = 0;
10559
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[3]);
10560
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[2]);
10561
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[1]);
10562
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[0]);
10563
        t[3] = l;
10564
        l = h;
10565
        h = o;
10566
        o = 0;
10567
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[4]);
10568
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[3]);
10569
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[2]);
10570
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[1]);
10571
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[0]);
10572
        t[4] = l;
10573
        l = h;
10574
        h = o;
10575
        o = 0;
10576
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[5]);
10577
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[4]);
10578
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[3]);
10579
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[2]);
10580
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[1]);
10581
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[0]);
10582
        t[5] = l;
10583
        l = h;
10584
        h = o;
10585
        o = 0;
10586
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[6]);
10587
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[5]);
10588
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[4]);
10589
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[3]);
10590
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[2]);
10591
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[1]);
10592
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[0]);
10593
        t[6] = l;
10594
        l = h;
10595
        h = o;
10596
        o = 0;
10597
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[7]);
10598
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[6]);
10599
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[5]);
10600
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[4]);
10601
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[3]);
10602
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[2]);
10603
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[1]);
10604
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[0]);
10605
        t[7] = l;
10606
        l = h;
10607
        h = o;
10608
        o = 0;
10609
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[8]);
10610
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[7]);
10611
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[6]);
10612
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[5]);
10613
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[4]);
10614
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[3]);
10615
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[2]);
10616
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[1]);
10617
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[0]);
10618
        t[8] = l;
10619
        l = h;
10620
        h = o;
10621
        o = 0;
10622
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[9]);
10623
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[8]);
10624
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[7]);
10625
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[6]);
10626
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[5]);
10627
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[4]);
10628
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[3]);
10629
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[2]);
10630
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[1]);
10631
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[0]);
10632
        t[9] = l;
10633
        l = h;
10634
        h = o;
10635
        o = 0;
10636
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[10]);
10637
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[9]);
10638
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[8]);
10639
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[7]);
10640
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[6]);
10641
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[5]);
10642
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[4]);
10643
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[3]);
10644
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[2]);
10645
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[1]);
10646
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[0]);
10647
        t[10] = l;
10648
        l = h;
10649
        h = o;
10650
        o = 0;
10651
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[11]);
10652
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[10]);
10653
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[9]);
10654
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[8]);
10655
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[7]);
10656
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[6]);
10657
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[5]);
10658
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[4]);
10659
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[3]);
10660
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[2]);
10661
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[1]);
10662
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[0]);
10663
        t[11] = l;
10664
        l = h;
10665
        h = o;
10666
        o = 0;
10667
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[12]);
10668
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[11]);
10669
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[10]);
10670
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[9]);
10671
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[8]);
10672
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[7]);
10673
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[6]);
10674
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[5]);
10675
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[4]);
10676
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[3]);
10677
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[2]);
10678
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[1]);
10679
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[0]);
10680
        t[12] = l;
10681
        l = h;
10682
        h = o;
10683
        o = 0;
10684
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[13]);
10685
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[12]);
10686
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[11]);
10687
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[10]);
10688
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[9]);
10689
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[8]);
10690
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[7]);
10691
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[6]);
10692
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[5]);
10693
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[4]);
10694
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[3]);
10695
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[2]);
10696
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[1]);
10697
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[0]);
10698
        t[13] = l;
10699
        l = h;
10700
        h = o;
10701
        o = 0;
10702
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[14]);
10703
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[13]);
10704
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[12]);
10705
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[11]);
10706
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[10]);
10707
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[9]);
10708
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[8]);
10709
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[7]);
10710
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[6]);
10711
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[5]);
10712
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[4]);
10713
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[3]);
10714
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[2]);
10715
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[1]);
10716
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[0]);
10717
        t[14] = l;
10718
        l = h;
10719
        h = o;
10720
        o = 0;
10721
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[15]);
10722
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[14]);
10723
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[13]);
10724
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[12]);
10725
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[11]);
10726
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[10]);
10727
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[9]);
10728
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[8]);
10729
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[7]);
10730
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[6]);
10731
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[5]);
10732
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[4]);
10733
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[3]);
10734
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[2]);
10735
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[1]);
10736
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[0]);
10737
        t[15] = l;
10738
        l = h;
10739
        h = o;
10740
        o = 0;
10741
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[16]);
10742
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[15]);
10743
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[14]);
10744
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[13]);
10745
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[12]);
10746
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[11]);
10747
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[10]);
10748
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[9]);
10749
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[8]);
10750
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[7]);
10751
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[6]);
10752
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[5]);
10753
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[4]);
10754
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[3]);
10755
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[2]);
10756
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[1]);
10757
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[0]);
10758
        t[16] = l;
10759
        l = h;
10760
        h = o;
10761
        o = 0;
10762
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[17]);
10763
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[16]);
10764
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[15]);
10765
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[14]);
10766
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[13]);
10767
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[12]);
10768
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[11]);
10769
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[10]);
10770
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[9]);
10771
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[8]);
10772
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[7]);
10773
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[6]);
10774
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[5]);
10775
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[4]);
10776
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[3]);
10777
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[2]);
10778
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[1]);
10779
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[0]);
10780
        t[17] = l;
10781
        l = h;
10782
        h = o;
10783
        o = 0;
10784
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[18]);
10785
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[17]);
10786
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[16]);
10787
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[15]);
10788
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[14]);
10789
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[13]);
10790
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[12]);
10791
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[11]);
10792
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[10]);
10793
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[9]);
10794
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[8]);
10795
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[7]);
10796
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[6]);
10797
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[5]);
10798
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[4]);
10799
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[3]);
10800
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[2]);
10801
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[1]);
10802
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[0]);
10803
        t[18] = l;
10804
        l = h;
10805
        h = o;
10806
        o = 0;
10807
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[19]);
10808
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[18]);
10809
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[17]);
10810
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[16]);
10811
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[15]);
10812
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[14]);
10813
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[13]);
10814
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[12]);
10815
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[11]);
10816
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[10]);
10817
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[9]);
10818
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[8]);
10819
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[7]);
10820
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[6]);
10821
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[5]);
10822
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[4]);
10823
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[3]);
10824
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[2]);
10825
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[1]);
10826
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[0]);
10827
        t[19] = l;
10828
        l = h;
10829
        h = o;
10830
        o = 0;
10831
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[20]);
10832
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[19]);
10833
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[18]);
10834
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[17]);
10835
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[16]);
10836
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[15]);
10837
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[14]);
10838
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[13]);
10839
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[12]);
10840
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[11]);
10841
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[10]);
10842
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[9]);
10843
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[8]);
10844
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[7]);
10845
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[6]);
10846
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[5]);
10847
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[4]);
10848
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[3]);
10849
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[2]);
10850
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[1]);
10851
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[0]);
10852
        t[20] = l;
10853
        l = h;
10854
        h = o;
10855
        o = 0;
10856
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[21]);
10857
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[20]);
10858
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[19]);
10859
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[18]);
10860
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[17]);
10861
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[16]);
10862
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[15]);
10863
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[14]);
10864
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[13]);
10865
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[12]);
10866
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[11]);
10867
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[10]);
10868
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[9]);
10869
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[8]);
10870
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[7]);
10871
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[6]);
10872
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[5]);
10873
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[4]);
10874
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[3]);
10875
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[2]);
10876
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[1]);
10877
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[0]);
10878
        t[21] = l;
10879
        l = h;
10880
        h = o;
10881
        o = 0;
10882
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[22]);
10883
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[21]);
10884
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[20]);
10885
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[19]);
10886
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[18]);
10887
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[17]);
10888
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[16]);
10889
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[15]);
10890
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[14]);
10891
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[13]);
10892
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[12]);
10893
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[11]);
10894
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[10]);
10895
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[9]);
10896
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[8]);
10897
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[7]);
10898
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[6]);
10899
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[5]);
10900
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[4]);
10901
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[3]);
10902
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[2]);
10903
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[1]);
10904
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[0]);
10905
        t[22] = l;
10906
        l = h;
10907
        h = o;
10908
        o = 0;
10909
        SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[23]);
10910
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[22]);
10911
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[21]);
10912
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[20]);
10913
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[19]);
10914
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[18]);
10915
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[17]);
10916
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[16]);
10917
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[15]);
10918
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[14]);
10919
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[13]);
10920
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[12]);
10921
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[11]);
10922
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[10]);
10923
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[9]);
10924
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[8]);
10925
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[7]);
10926
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[6]);
10927
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[5]);
10928
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[4]);
10929
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[3]);
10930
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[2]);
10931
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[1]);
10932
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[0]);
10933
        t[23] = l;
10934
        l = h;
10935
        h = o;
10936
        o = 0;
10937
        SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[23]);
10938
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[22]);
10939
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[21]);
10940
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[20]);
10941
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[19]);
10942
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[18]);
10943
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[17]);
10944
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[16]);
10945
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[15]);
10946
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[14]);
10947
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[13]);
10948
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[12]);
10949
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[11]);
10950
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[10]);
10951
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[9]);
10952
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[8]);
10953
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[7]);
10954
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[6]);
10955
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[5]);
10956
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[4]);
10957
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[3]);
10958
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[2]);
10959
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[1]);
10960
        r->dp[24] = l;
10961
        l = h;
10962
        h = o;
10963
        o = 0;
10964
        SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[23]);
10965
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[22]);
10966
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[21]);
10967
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[20]);
10968
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[19]);
10969
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[18]);
10970
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[17]);
10971
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[16]);
10972
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[15]);
10973
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[14]);
10974
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[13]);
10975
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[12]);
10976
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[11]);
10977
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[10]);
10978
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[9]);
10979
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[8]);
10980
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[7]);
10981
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[6]);
10982
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[5]);
10983
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[4]);
10984
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[3]);
10985
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[2]);
10986
        r->dp[25] = l;
10987
        l = h;
10988
        h = o;
10989
        o = 0;
10990
        SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[23]);
10991
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[22]);
10992
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[21]);
10993
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[20]);
10994
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[19]);
10995
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[18]);
10996
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[17]);
10997
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[16]);
10998
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[15]);
10999
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[14]);
11000
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[13]);
11001
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[12]);
11002
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[11]);
11003
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[10]);
11004
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[9]);
11005
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[8]);
11006
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[7]);
11007
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[6]);
11008
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[5]);
11009
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[4]);
11010
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[3]);
11011
        r->dp[26] = l;
11012
        l = h;
11013
        h = o;
11014
        o = 0;
11015
        SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[23]);
11016
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[22]);
11017
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[21]);
11018
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[20]);
11019
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[19]);
11020
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[18]);
11021
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[17]);
11022
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[16]);
11023
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[15]);
11024
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[14]);
11025
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[13]);
11026
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[12]);
11027
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[11]);
11028
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[10]);
11029
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[9]);
11030
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[8]);
11031
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[7]);
11032
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[6]);
11033
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[5]);
11034
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[4]);
11035
        r->dp[27] = l;
11036
        l = h;
11037
        h = o;
11038
        o = 0;
11039
        SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[23]);
11040
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[22]);
11041
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[21]);
11042
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[20]);
11043
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[19]);
11044
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[18]);
11045
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[17]);
11046
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[16]);
11047
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[15]);
11048
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[14]);
11049
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[13]);
11050
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[12]);
11051
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[11]);
11052
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[10]);
11053
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[9]);
11054
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[8]);
11055
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[7]);
11056
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[6]);
11057
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[5]);
11058
        r->dp[28] = l;
11059
        l = h;
11060
        h = o;
11061
        o = 0;
11062
        SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[23]);
11063
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[22]);
11064
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[21]);
11065
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[20]);
11066
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[19]);
11067
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[18]);
11068
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[17]);
11069
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[16]);
11070
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[15]);
11071
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[14]);
11072
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[13]);
11073
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[12]);
11074
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[11]);
11075
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[10]);
11076
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[9]);
11077
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[8]);
11078
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[7]);
11079
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[6]);
11080
        r->dp[29] = l;
11081
        l = h;
11082
        h = o;
11083
        o = 0;
11084
        SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[23]);
11085
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[22]);
11086
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[21]);
11087
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[20]);
11088
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[19]);
11089
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[18]);
11090
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[17]);
11091
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[16]);
11092
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[15]);
11093
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[14]);
11094
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[13]);
11095
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[12]);
11096
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[11]);
11097
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[10]);
11098
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[9]);
11099
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[8]);
11100
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[7]);
11101
        r->dp[30] = l;
11102
        l = h;
11103
        h = o;
11104
        o = 0;
11105
        SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[23]);
11106
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[22]);
11107
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[21]);
11108
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[20]);
11109
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[19]);
11110
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[18]);
11111
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[17]);
11112
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[16]);
11113
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[15]);
11114
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[14]);
11115
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[13]);
11116
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[12]);
11117
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[11]);
11118
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[10]);
11119
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[9]);
11120
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[8]);
11121
        r->dp[31] = l;
11122
        l = h;
11123
        h = o;
11124
        o = 0;
11125
        SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[23]);
11126
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[22]);
11127
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[21]);
11128
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[20]);
11129
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[19]);
11130
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[18]);
11131
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[17]);
11132
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[16]);
11133
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[15]);
11134
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[14]);
11135
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[13]);
11136
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[12]);
11137
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[11]);
11138
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[10]);
11139
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[9]);
11140
        r->dp[32] = l;
11141
        l = h;
11142
        h = o;
11143
        o = 0;
11144
        SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[23]);
11145
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[22]);
11146
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[21]);
11147
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[20]);
11148
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[19]);
11149
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[18]);
11150
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[17]);
11151
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[16]);
11152
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[15]);
11153
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[14]);
11154
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[13]);
11155
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[12]);
11156
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[11]);
11157
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[10]);
11158
        r->dp[33] = l;
11159
        l = h;
11160
        h = o;
11161
        o = 0;
11162
        SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[23]);
11163
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[22]);
11164
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[21]);
11165
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[20]);
11166
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[19]);
11167
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[18]);
11168
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[17]);
11169
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[16]);
11170
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[15]);
11171
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[14]);
11172
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[13]);
11173
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[12]);
11174
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[11]);
11175
        r->dp[34] = l;
11176
        l = h;
11177
        h = o;
11178
        o = 0;
11179
        SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[23]);
11180
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[22]);
11181
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[21]);
11182
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[20]);
11183
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[19]);
11184
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[18]);
11185
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[17]);
11186
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[16]);
11187
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[15]);
11188
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[14]);
11189
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[13]);
11190
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[12]);
11191
        r->dp[35] = l;
11192
        l = h;
11193
        h = o;
11194
        o = 0;
11195
        SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[23]);
11196
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[22]);
11197
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[21]);
11198
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[20]);
11199
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[19]);
11200
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[18]);
11201
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[17]);
11202
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[16]);
11203
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[15]);
11204
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[14]);
11205
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[13]);
11206
        r->dp[36] = l;
11207
        l = h;
11208
        h = o;
11209
        o = 0;
11210
        SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[23]);
11211
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[22]);
11212
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[21]);
11213
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[20]);
11214
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[19]);
11215
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[18]);
11216
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[17]);
11217
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[16]);
11218
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[15]);
11219
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[14]);
11220
        r->dp[37] = l;
11221
        l = h;
11222
        h = o;
11223
        o = 0;
11224
        SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[23]);
11225
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[22]);
11226
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[21]);
11227
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[20]);
11228
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[19]);
11229
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[18]);
11230
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[17]);
11231
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[16]);
11232
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[15]);
11233
        r->dp[38] = l;
11234
        l = h;
11235
        h = o;
11236
        o = 0;
11237
        SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[23]);
11238
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[22]);
11239
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[21]);
11240
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[20]);
11241
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[19]);
11242
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[18]);
11243
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[17]);
11244
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[16]);
11245
        r->dp[39] = l;
11246
        l = h;
11247
        h = o;
11248
        o = 0;
11249
        SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[23]);
11250
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[22]);
11251
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[21]);
11252
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[20]);
11253
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[19]);
11254
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[18]);
11255
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[17]);
11256
        r->dp[40] = l;
11257
        l = h;
11258
        h = o;
11259
        o = 0;
11260
        SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[23]);
11261
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[22]);
11262
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[21]);
11263
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[20]);
11264
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[19]);
11265
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[18]);
11266
        r->dp[41] = l;
11267
        l = h;
11268
        h = o;
11269
        o = 0;
11270
        SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[23]);
11271
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[22]);
11272
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[21]);
11273
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[20]);
11274
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[19]);
11275
        r->dp[42] = l;
11276
        l = h;
11277
        h = o;
11278
        o = 0;
11279
        SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[23]);
11280
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[22]);
11281
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[21]);
11282
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[20]);
11283
        r->dp[43] = l;
11284
        l = h;
11285
        h = o;
11286
        o = 0;
11287
        SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[23]);
11288
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[22]);
11289
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[21]);
11290
        r->dp[44] = l;
11291
        l = h;
11292
        h = o;
11293
        o = 0;
11294
        SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[23]);
11295
        SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[22]);
11296
        r->dp[45] = l;
11297
        l = h;
11298
        h = o;
11299
        SP_ASM_MUL_ADD_NO(l, h, a->dp[23], b->dp[23]);
11300
        r->dp[46] = l;
11301
        r->dp[47] = h;
11302
        XMEMCPY(r->dp, t, 24 * sizeof(sp_int_digit));
11303
        r->used = 48;
11304
        sp_clamp(r);
11305
    }
11306
11307
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
11308
    XFREE(t, NULL, DYNAMIC_TYPE_BIGINT);
11309
#endif
11310
    return err;
11311
}
11312
    #endif /* SP_INT_DIGITS >= 48 */
11313
11314
    #if SP_INT_DIGITS >= 64
11315
/* Multiply a by b and store in r: r = a * b
11316
 *
11317
 * Karatsuba implementation.
11318
 *
11319
 * @param  [in]   a  SP integer to multiply.
11320
 * @param  [in]   b  SP integer to multiply.
11321
 * @param  [out]  r  SP integer result.
11322
 *
11323
 * @return  MP_OKAY on success.
11324
 * @return  MP_MEM when dynamic memory allocation fails.
11325
 */
11326
static int _sp_mul_32(const sp_int* a, const sp_int* b, sp_int* r)
11327
{
11328
    int err = MP_OKAY;
11329
    unsigned int i;
11330
    sp_int_digit l;
11331
    sp_int_digit h;
11332
    sp_int* a1;
11333
    sp_int* b1;
11334
    sp_int* z0;
11335
    sp_int* z1;
11336
    sp_int* z2;
11337
    sp_int_digit ca;
11338
    sp_int_digit cb;
11339
    DECL_SP_INT_ARRAY(t, 16, 2);
11340
    DECL_SP_INT_ARRAY(z, 33, 2);
11341
11342
    ALLOC_SP_INT_ARRAY(t, 16, 2, err, NULL);
11343
    ALLOC_SP_INT_ARRAY(z, 33, 2, err, NULL);
11344
    if (err == MP_OKAY) {
11345
        a1 = t[0];
11346
        b1 = t[1];
11347
        z1 = z[0];
11348
        z2 = z[1];
11349
        z0 = r;
11350
11351
        XMEMCPY(a1->dp, &a->dp[16], sizeof(sp_int_digit) * 16);
11352
        a1->used = 16;
11353
        XMEMCPY(b1->dp, &b->dp[16], sizeof(sp_int_digit) * 16);
11354
        b1->used = 16;
11355
11356
        /* z2 = a1 * b1 */
11357
        err = _sp_mul_16(a1, b1, z2);
11358
    }
11359
    if (err == MP_OKAY) {
11360
        l = a1->dp[0];
11361
        h = 0;
11362
        SP_ASM_ADDC(l, h, a->dp[0]);
11363
        a1->dp[0] = l;
11364
        l = h;
11365
        h = 0;
11366
        for (i = 1; i < 16; i++) {
11367
            SP_ASM_ADDC(l, h, a1->dp[i]);
11368
            SP_ASM_ADDC(l, h, a->dp[i]);
11369
            a1->dp[i] = l;
11370
            l = h;
11371
            h = 0;
11372
        }
11373
        ca = l;
11374
        /* b01 = b0 + b1 */
11375
        l = b1->dp[0];
11376
        h = 0;
11377
        SP_ASM_ADDC(l, h, b->dp[0]);
11378
        b1->dp[0] = l;
11379
        l = h;
11380
        h = 0;
11381
        for (i = 1; i < 16; i++) {
11382
            SP_ASM_ADDC(l, h, b1->dp[i]);
11383
            SP_ASM_ADDC(l, h, b->dp[i]);
11384
            b1->dp[i] = l;
11385
            l = h;
11386
            h = 0;
11387
        }
11388
        cb = l;
11389
11390
        /* z0 = a0 * b0 */
11391
        err = _sp_mul_16(a, b, z0);
11392
    }
11393
    if (err == MP_OKAY) {
11394
        /* z1 = (a0 + a1) * (b0 + b1) */
11395
        err = _sp_mul_16(a1, b1, z1);
11396
    }
11397
    if (err == MP_OKAY) {
11398
        /* r = (z2 << 32) + (z1 - z0 - z2) << 16) + z0 */
11399
        /* r = z0 */
11400
        /* r += (z1 - z0 - z2) << 16 */
11401
        z1->dp[32] = ca & cb;
11402
        l = 0;
11403
        if (ca) {
11404
            h = 0;
11405
            for (i = 0; i < 16; i++) {
11406
                SP_ASM_ADDC(l, h, z1->dp[i + 16]);
11407
                SP_ASM_ADDC(l, h, b1->dp[i]);
11408
                z1->dp[i + 16] = l;
11409
                l = h;
11410
                h = 0;
11411
            }
11412
        }
11413
        z1->dp[32] += l;
11414
        l = 0;
11415
        if (cb) {
11416
            h = 0;
11417
            for (i = 0; i < 16; i++) {
11418
                SP_ASM_ADDC(l, h, z1->dp[i + 16]);
11419
                SP_ASM_ADDC(l, h, a1->dp[i]);
11420
                z1->dp[i + 16] = l;
11421
                l = h;
11422
                h = 0;
11423
            }
11424
        }
11425
        z1->dp[32] += l;
11426
        /* z1 = z1 - z0 - z1 */
11427
        l = 0;
11428
        h = 0;
11429
        for (i = 0; i < 32; i++) {
11430
            l += z1->dp[i];
11431
            SP_ASM_SUBB(l, h, z0->dp[i]);
11432
            SP_ASM_SUBB(l, h, z2->dp[i]);
11433
            z1->dp[i] = l;
11434
            l = h;
11435
            h = 0;
11436
        }
11437
        z1->dp[i] += l;
11438
        /* r += z1 << 16 */
11439
        l = 0;
11440
        h = 0;
11441
        for (i = 0; i < 16; i++) {
11442
            SP_ASM_ADDC(l, h, r->dp[i + 16]);
11443
            SP_ASM_ADDC(l, h, z1->dp[i]);
11444
            r->dp[i + 16] = l;
11445
            l = h;
11446
            h = 0;
11447
        }
11448
        for (; i < 33; i++) {
11449
            SP_ASM_ADDC(l, h, z1->dp[i]);
11450
            r->dp[i + 16] = l;
11451
            l = h;
11452
            h = 0;
11453
        }
11454
        /* r += z2 << 32  */
11455
        l = 0;
11456
        h = 0;
11457
        for (i = 0; i < 17; i++) {
11458
            SP_ASM_ADDC(l, h, r->dp[i + 32]);
11459
            SP_ASM_ADDC(l, h, z2->dp[i]);
11460
            r->dp[i + 32] = l;
11461
            l = h;
11462
            h = 0;
11463
        }
11464
        for (; i < 32; i++) {
11465
            SP_ASM_ADDC(l, h, z2->dp[i]);
11466
            r->dp[i + 32] = l;
11467
            l = h;
11468
            h = 0;
11469
        }
11470
        r->used = 64;
11471
        sp_clamp(r);
11472
    }
11473
11474
    FREE_SP_INT_ARRAY(z, NULL);
11475
    FREE_SP_INT_ARRAY(t, NULL);
11476
    return err;
11477
}
11478
    #endif /* SP_INT_DIGITS >= 64 */
11479
11480
    #if SP_INT_DIGITS >= 96
11481
/* Multiply a by b and store in r: r = a * b
11482
 *
11483
 * Karatsuba implementation.
11484
 *
11485
 * @param  [in]   a  SP integer to multiply.
11486
 * @param  [in]   b  SP integer to multiply.
11487
 * @param  [out]  r  SP integer result.
11488
 *
11489
 * @return  MP_OKAY on success.
11490
 * @return  MP_MEM when dynamic memory allocation fails.
11491
 */
11492
static int _sp_mul_48(const sp_int* a, const sp_int* b, sp_int* r)
11493
{
11494
    int err = MP_OKAY;
11495
    unsigned int i;
11496
    sp_int_digit l;
11497
    sp_int_digit h;
11498
    sp_int* a1;
11499
    sp_int* b1;
11500
    sp_int* z0;
11501
    sp_int* z1;
11502
    sp_int* z2;
11503
    sp_int_digit ca;
11504
    sp_int_digit cb;
11505
    DECL_SP_INT_ARRAY(t, 24, 2);
11506
    DECL_SP_INT_ARRAY(z, 49, 2);
11507
11508
    ALLOC_SP_INT_ARRAY(t, 24, 2, err, NULL);
11509
    ALLOC_SP_INT_ARRAY(z, 49, 2, err, NULL);
11510
    if (err == MP_OKAY) {
11511
        a1 = t[0];
11512
        b1 = t[1];
11513
        z1 = z[0];
11514
        z2 = z[1];
11515
        z0 = r;
11516
11517
        XMEMCPY(a1->dp, &a->dp[24], sizeof(sp_int_digit) * 24);
11518
        a1->used = 24;
11519
        XMEMCPY(b1->dp, &b->dp[24], sizeof(sp_int_digit) * 24);
11520
        b1->used = 24;
11521
11522
        /* z2 = a1 * b1 */
11523
        err = _sp_mul_24(a1, b1, z2);
11524
    }
11525
    if (err == MP_OKAY) {
11526
        l = a1->dp[0];
11527
        h = 0;
11528
        SP_ASM_ADDC(l, h, a->dp[0]);
11529
        a1->dp[0] = l;
11530
        l = h;
11531
        h = 0;
11532
        for (i = 1; i < 24; i++) {
11533
            SP_ASM_ADDC(l, h, a1->dp[i]);
11534
            SP_ASM_ADDC(l, h, a->dp[i]);
11535
            a1->dp[i] = l;
11536
            l = h;
11537
            h = 0;
11538
        }
11539
        ca = l;
11540
        /* b01 = b0 + b1 */
11541
        l = b1->dp[0];
11542
        h = 0;
11543
        SP_ASM_ADDC(l, h, b->dp[0]);
11544
        b1->dp[0] = l;
11545
        l = h;
11546
        h = 0;
11547
        for (i = 1; i < 24; i++) {
11548
            SP_ASM_ADDC(l, h, b1->dp[i]);
11549
            SP_ASM_ADDC(l, h, b->dp[i]);
11550
            b1->dp[i] = l;
11551
            l = h;
11552
            h = 0;
11553
        }
11554
        cb = l;
11555
11556
        /* z0 = a0 * b0 */
11557
        err = _sp_mul_24(a, b, z0);
11558
    }
11559
    if (err == MP_OKAY) {
11560
        /* z1 = (a0 + a1) * (b0 + b1) */
11561
        err = _sp_mul_24(a1, b1, z1);
11562
    }
11563
    if (err == MP_OKAY) {
11564
        /* r = (z2 << 48) + (z1 - z0 - z2) << 24) + z0 */
11565
        /* r = z0 */
11566
        /* r += (z1 - z0 - z2) << 24 */
11567
        z1->dp[48] = ca & cb;
11568
        l = 0;
11569
        if (ca) {
11570
            h = 0;
11571
            for (i = 0; i < 24; i++) {
11572
                SP_ASM_ADDC(l, h, z1->dp[i + 24]);
11573
                SP_ASM_ADDC(l, h, b1->dp[i]);
11574
                z1->dp[i + 24] = l;
11575
                l = h;
11576
                h = 0;
11577
            }
11578
        }
11579
        z1->dp[48] += l;
11580
        l = 0;
11581
        if (cb) {
11582
            h = 0;
11583
            for (i = 0; i < 24; i++) {
11584
                SP_ASM_ADDC(l, h, z1->dp[i + 24]);
11585
                SP_ASM_ADDC(l, h, a1->dp[i]);
11586
                z1->dp[i + 24] = l;
11587
                l = h;
11588
                h = 0;
11589
            }
11590
        }
11591
        z1->dp[48] += l;
11592
        /* z1 = z1 - z0 - z1 */
11593
        l = 0;
11594
        h = 0;
11595
        for (i = 0; i < 48; i++) {
11596
            l += z1->dp[i];
11597
            SP_ASM_SUBB(l, h, z0->dp[i]);
11598
            SP_ASM_SUBB(l, h, z2->dp[i]);
11599
            z1->dp[i] = l;
11600
            l = h;
11601
            h = 0;
11602
        }
11603
        z1->dp[i] += l;
11604
        /* r += z1 << 16 */
11605
        l = 0;
11606
        h = 0;
11607
        for (i = 0; i < 24; i++) {
11608
            SP_ASM_ADDC(l, h, r->dp[i + 24]);
11609
            SP_ASM_ADDC(l, h, z1->dp[i]);
11610
            r->dp[i + 24] = l;
11611
            l = h;
11612
            h = 0;
11613
        }
11614
        for (; i < 49; i++) {
11615
            SP_ASM_ADDC(l, h, z1->dp[i]);
11616
            r->dp[i + 24] = l;
11617
            l = h;
11618
            h = 0;
11619
        }
11620
        /* r += z2 << 48  */
11621
        l = 0;
11622
        h = 0;
11623
        for (i = 0; i < 25; i++) {
11624
            SP_ASM_ADDC(l, h, r->dp[i + 48]);
11625
            SP_ASM_ADDC(l, h, z2->dp[i]);
11626
            r->dp[i + 48] = l;
11627
            l = h;
11628
            h = 0;
11629
        }
11630
        for (; i < 48; i++) {
11631
            SP_ASM_ADDC(l, h, z2->dp[i]);
11632
            r->dp[i + 48] = l;
11633
            l = h;
11634
            h = 0;
11635
        }
11636
        r->used = 96;
11637
        sp_clamp(r);
11638
    }
11639
11640
    FREE_SP_INT_ARRAY(z, NULL);
11641
    FREE_SP_INT_ARRAY(t, NULL);
11642
    return err;
11643
}
11644
    #endif /* SP_INT_DIGITS >= 96 */
11645
11646
    #if SP_INT_DIGITS >= 128
11647
/* Multiply a by b and store in r: r = a * b
11648
 *
11649
 * Karatsuba implementation.
11650
 *
11651
 * @param  [in]   a  SP integer to multiply.
11652
 * @param  [in]   b  SP integer to multiply.
11653
 * @param  [out]  r  SP integer result.
11654
 *
11655
 * @return  MP_OKAY on success.
11656
 * @return  MP_MEM when dynamic memory allocation fails.
11657
 */
11658
static int _sp_mul_64(const sp_int* a, const sp_int* b, sp_int* r)
11659
{
11660
    int err = MP_OKAY;
11661
    unsigned int i;
11662
    sp_int_digit l;
11663
    sp_int_digit h;
11664
    sp_int* a1;
11665
    sp_int* b1;
11666
    sp_int* z0;
11667
    sp_int* z1;
11668
    sp_int* z2;
11669
    sp_int_digit ca;
11670
    sp_int_digit cb;
11671
    DECL_SP_INT_ARRAY(t, 32, 2);
11672
    DECL_SP_INT_ARRAY(z, 65, 2);
11673
11674
    ALLOC_SP_INT_ARRAY(t, 32, 2, err, NULL);
11675
    ALLOC_SP_INT_ARRAY(z, 65, 2, err, NULL);
11676
    if (err == MP_OKAY) {
11677
        a1 = t[0];
11678
        b1 = t[1];
11679
        z1 = z[0];
11680
        z2 = z[1];
11681
        z0 = r;
11682
11683
        XMEMCPY(a1->dp, &a->dp[32], sizeof(sp_int_digit) * 32);
11684
        a1->used = 32;
11685
        XMEMCPY(b1->dp, &b->dp[32], sizeof(sp_int_digit) * 32);
11686
        b1->used = 32;
11687
11688
        /* z2 = a1 * b1 */
11689
        err = _sp_mul_32(a1, b1, z2);
11690
    }
11691
    if (err == MP_OKAY) {
11692
        l = a1->dp[0];
11693
        h = 0;
11694
        SP_ASM_ADDC(l, h, a->dp[0]);
11695
        a1->dp[0] = l;
11696
        l = h;
11697
        h = 0;
11698
        for (i = 1; i < 32; i++) {
11699
            SP_ASM_ADDC(l, h, a1->dp[i]);
11700
            SP_ASM_ADDC(l, h, a->dp[i]);
11701
            a1->dp[i] = l;
11702
            l = h;
11703
            h = 0;
11704
        }
11705
        ca = l;
11706
        /* b01 = b0 + b1 */
11707
        l = b1->dp[0];
11708
        h = 0;
11709
        SP_ASM_ADDC(l, h, b->dp[0]);
11710
        b1->dp[0] = l;
11711
        l = h;
11712
        h = 0;
11713
        for (i = 1; i < 32; i++) {
11714
            SP_ASM_ADDC(l, h, b1->dp[i]);
11715
            SP_ASM_ADDC(l, h, b->dp[i]);
11716
            b1->dp[i] = l;
11717
            l = h;
11718
            h = 0;
11719
        }
11720
        cb = l;
11721
11722
        /* z0 = a0 * b0 */
11723
        err = _sp_mul_32(a, b, z0);
11724
    }
11725
    if (err == MP_OKAY) {
11726
        /* z1 = (a0 + a1) * (b0 + b1) */
11727
        err = _sp_mul_32(a1, b1, z1);
11728
    }
11729
    if (err == MP_OKAY) {
11730
        /* r = (z2 << 64) + (z1 - z0 - z2) << 32) + z0 */
11731
        /* r = z0 */
11732
        /* r += (z1 - z0 - z2) << 32 */
11733
        z1->dp[64] = ca & cb;
11734
        l = 0;
11735
        if (ca) {
11736
            h = 0;
11737
            for (i = 0; i < 32; i++) {
11738
                SP_ASM_ADDC(l, h, z1->dp[i + 32]);
11739
                SP_ASM_ADDC(l, h, b1->dp[i]);
11740
                z1->dp[i + 32] = l;
11741
                l = h;
11742
                h = 0;
11743
            }
11744
        }
11745
        z1->dp[64] += l;
11746
        l = 0;
11747
        if (cb) {
11748
            h = 0;
11749
            for (i = 0; i < 32; i++) {
11750
                SP_ASM_ADDC(l, h, z1->dp[i + 32]);
11751
                SP_ASM_ADDC(l, h, a1->dp[i]);
11752
                z1->dp[i + 32] = l;
11753
                l = h;
11754
                h = 0;
11755
            }
11756
        }
11757
        z1->dp[64] += l;
11758
        /* z1 = z1 - z0 - z1 */
11759
        l = 0;
11760
        h = 0;
11761
        for (i = 0; i < 64; i++) {
11762
            l += z1->dp[i];
11763
            SP_ASM_SUBB(l, h, z0->dp[i]);
11764
            SP_ASM_SUBB(l, h, z2->dp[i]);
11765
            z1->dp[i] = l;
11766
            l = h;
11767
            h = 0;
11768
        }
11769
        z1->dp[i] += l;
11770
        /* r += z1 << 16 */
11771
        l = 0;
11772
        h = 0;
11773
        for (i = 0; i < 32; i++) {
11774
            SP_ASM_ADDC(l, h, r->dp[i + 32]);
11775
            SP_ASM_ADDC(l, h, z1->dp[i]);
11776
            r->dp[i + 32] = l;
11777
            l = h;
11778
            h = 0;
11779
        }
11780
        for (; i < 65; i++) {
11781
            SP_ASM_ADDC(l, h, z1->dp[i]);
11782
            r->dp[i + 32] = l;
11783
            l = h;
11784
            h = 0;
11785
        }
11786
        /* r += z2 << 64  */
11787
        l = 0;
11788
        h = 0;
11789
        for (i = 0; i < 33; i++) {
11790
            SP_ASM_ADDC(l, h, r->dp[i + 64]);
11791
            SP_ASM_ADDC(l, h, z2->dp[i]);
11792
            r->dp[i + 64] = l;
11793
            l = h;
11794
            h = 0;
11795
        }
11796
        for (; i < 64; i++) {
11797
            SP_ASM_ADDC(l, h, z2->dp[i]);
11798
            r->dp[i + 64] = l;
11799
            l = h;
11800
            h = 0;
11801
        }
11802
        r->used = 128;
11803
        sp_clamp(r);
11804
    }
11805
11806
    FREE_SP_INT_ARRAY(z, NULL);
11807
    FREE_SP_INT_ARRAY(t, NULL);
11808
    return err;
11809
}
11810
    #endif /* SP_INT_DIGITS >= 128 */
11811
11812
    #if SP_INT_DIGITS >= 192
11813
/* Multiply a by b and store in r: r = a * b
11814
 *
11815
 * Karatsuba implementation.
 * Each operand is treated as two 48-digit halves (a = a1:a0, b = b1:b0) and
 * the product is assembled from three _sp_mul_48() results:
 *   z0 = a0 * b0, z2 = a1 * b1, z1 = (a0 + a1) * (b0 + b1)
 *   r  = (z2 << 96) + ((z1 - z0 - z2) << 48) + z0    (shifts are in digits)
 * The high halves are taken from dp[48..95] of a and b; r is written up to
 * digit 191 and then clamped.
11816
 *
11817
 * @param  [in]   a  SP integer to multiply.
11818
 * @param  [in]   b  SP integer to multiply.
11819
 * @param  [out]  r  SP integer result.
11820
 *
11821
 * @return  MP_OKAY on success.
11822
 * @return  MP_MEM when dynamic memory allocation fails.
11823
 */
11824
static int _sp_mul_96(const sp_int* a, const sp_int* b, sp_int* r)
11825
{
11826
    int err = MP_OKAY;
11827
    unsigned int i;
11828
    sp_int_digit l;
11829
    sp_int_digit h;
11830
    sp_int* a1;
11831
    sp_int* b1;
11832
    sp_int* z0;
11833
    sp_int* z1;
11834
    sp_int* z2;
11835
    sp_int_digit ca;
11836
    sp_int_digit cb;
11837
    DECL_SP_INT_ARRAY(t, 48, 2);
11838
    DECL_SP_INT_ARRAY(z, 97, 2);
11839
11840
    ALLOC_SP_INT_ARRAY(t, 48, 2, err, NULL);
11841
    ALLOC_SP_INT_ARRAY(z, 97, 2, err, NULL);
11842
    if (err == MP_OKAY) {
11843
        a1 = t[0];
11844
        b1 = t[1];
11845
        z1 = z[0];
11846
        z2 = z[1];
        /* z0 is computed directly into the result. */
11847
        z0 = r;
11848
11849
        /* a1 and b1 start as copies of the high halves of a and b. */
        XMEMCPY(a1->dp, &a->dp[48], sizeof(sp_int_digit) * 48);
11850
        a1->used = 48;
11851
        XMEMCPY(b1->dp, &b->dp[48], sizeof(sp_int_digit) * 48);
11852
        b1->used = 48;
11853
11854
        /* z2 = a1 * b1 */
11855
        err = _sp_mul_48(a1, b1, z2);
11856
    }
11857
    if (err == MP_OKAY) {
        /* a01 = a0 + a1 (sum kept in a1, carry out kept in ca) */
11858
        l = a1->dp[0];
11859
        h = 0;
11860
        SP_ASM_ADDC(l, h, a->dp[0]);
11861
        a1->dp[0] = l;
11862
        l = h;
11863
        h = 0;
11864
        for (i = 1; i < 48; i++) {
11865
            SP_ASM_ADDC(l, h, a1->dp[i]);
11866
            SP_ASM_ADDC(l, h, a->dp[i]);
11867
            a1->dp[i] = l;
11868
            l = h;
11869
            h = 0;
11870
        }
11871
        ca = l;
11872
        /* b01 = b0 + b1 (sum kept in b1, carry out kept in cb) */
11873
        l = b1->dp[0];
11874
        h = 0;
11875
        SP_ASM_ADDC(l, h, b->dp[0]);
11876
        b1->dp[0] = l;
11877
        l = h;
11878
        h = 0;
11879
        for (i = 1; i < 48; i++) {
11880
            SP_ASM_ADDC(l, h, b1->dp[i]);
11881
            SP_ASM_ADDC(l, h, b->dp[i]);
11882
            b1->dp[i] = l;
11883
            l = h;
11884
            h = 0;
11885
        }
11886
        cb = l;
11887
11888
        /* z0 = a0 * b0 */
11889
        err = _sp_mul_48(a, b, z0);
11890
    }
11891
    if (err == MP_OKAY) {
11892
        /* z1 = (a0 + a1) * (b0 + b1) */
11893
        err = _sp_mul_48(a1, b1, z1);
11894
    }
11895
    if (err == MP_OKAY) {
11896
        /* r = (z2 << 96) + ((z1 - z0 - z2) << 48) + z0 */
11897
        /* r = z0 */
11898
        /* r += (z1 - z0 - z2) << 48 */
        /* Account for the carries dropped from the 48-digit sums: the top
         * digit gets ca * cb, and the upper half of z1 gets b01 when ca is set
         * and a01 when cb is set.
         */
11899
        z1->dp[96] = ca & cb;
11900
        l = 0;
11901
        if (ca) {
11902
            h = 0;
11903
            for (i = 0; i < 48; i++) {
11904
                SP_ASM_ADDC(l, h, z1->dp[i + 48]);
11905
                SP_ASM_ADDC(l, h, b1->dp[i]);
11906
                z1->dp[i + 48] = l;
11907
                l = h;
11908
                h = 0;
11909
            }
11910
        }
11911
        z1->dp[96] += l;
11912
        l = 0;
11913
        if (cb) {
11914
            h = 0;
11915
            for (i = 0; i < 48; i++) {
11916
                SP_ASM_ADDC(l, h, z1->dp[i + 48]);
11917
                SP_ASM_ADDC(l, h, a1->dp[i]);
11918
                z1->dp[i + 48] = l;
11919
                l = h;
11920
                h = 0;
11921
            }
11922
        }
11923
        z1->dp[96] += l;
11924
        /* z1 = z1 - z0 - z2 */
11925
        l = 0;
11926
        h = 0;
11927
        for (i = 0; i < 96; i++) {
11928
            l += z1->dp[i];
11929
            SP_ASM_SUBB(l, h, z0->dp[i]);
11930
            SP_ASM_SUBB(l, h, z2->dp[i]);
11931
            z1->dp[i] = l;
11932
            l = h;
11933
            h = 0;
11934
        }
        /* Fold the final borrow/carry word into the top digit of z1. */
11935
        z1->dp[i] += l;
11936
        /* r += z1 << 48 */
11937
        l = 0;
11938
        h = 0;
11939
        for (i = 0; i < 48; i++) {
11940
            SP_ASM_ADDC(l, h, r->dp[i + 48]);
11941
            SP_ASM_ADDC(l, h, z1->dp[i]);
11942
            r->dp[i + 48] = l;
11943
            l = h;
11944
            h = 0;
11945
        }
        /* Digits of r from 96 upward are not read here; they are overwritten
         * with the rest of z1 plus the running carry.
         */
11946
        for (; i < 97; i++) {
11947
            SP_ASM_ADDC(l, h, z1->dp[i]);
11948
            r->dp[i + 48] = l;
11949
            l = h;
11950
            h = 0;
11951
        }
11952
        /* r += z2 << 96  */
11953
        l = 0;
11954
        h = 0;
        /* r->dp[96..144] were written by the z1 step above, so add into them. */
11955
        for (i = 0; i < 49; i++) {
11956
            SP_ASM_ADDC(l, h, r->dp[i + 96]);
11957
            SP_ASM_ADDC(l, h, z2->dp[i]);
11958
            r->dp[i + 96] = l;
11959
            l = h;
11960
            h = 0;
11961
        }
        /* Remaining digits of r are not read; store z2 plus carry. */
11962
        for (; i < 96; i++) {
11963
            SP_ASM_ADDC(l, h, z2->dp[i]);
11964
            r->dp[i + 96] = l;
11965
            l = h;
11966
            h = 0;
11967
        }
11968
        r->used = 192;
11969
        sp_clamp(r);
11970
    }
11971
11972
    FREE_SP_INT_ARRAY(z, NULL);
11973
    FREE_SP_INT_ARRAY(t, NULL);
11974
    return err;
11975
}
11976
    #endif /* SP_INT_DIGITS >= 192 */
11977
11978
#endif /* SQR_MUL_ASM && WOLFSSL_SP_INT_LARGE_COMBA */
11979
#endif /* !WOLFSSL_SP_SMALL */
11980
11981
/* Multiply a by b and store in r: r = a * b
11982
 *
 * Zero operands give a zero result. Otherwise the work is handed to a
 * fixed-size routine when both operands have exactly the digit count that
 * routine handles (which routines exist depends on the build options), and to
 * a generic routine in all other cases.
 *
11983
 * @param  [in]   a  SP integer to multiply.
11984
 * @param  [in]   b  SP integer to multiply.
11985
 * @param  [out]  r  SP integer result.
11986
 *
11987
 * @return  MP_OKAY on success.
11988
 * @return  MP_VAL when a, b or r is NULL; or the result will be too big for
11989
 *          fixed data length.
11990
 * @return  MP_MEM when dynamic memory allocation fails.
11991
 */
11992
int sp_mul(const sp_int* a, const sp_int* b, sp_int* r)
11993
36.5M
{
11994
36.5M
    int err = MP_OKAY;
11995
36.5M
#ifdef WOLFSSL_SP_INT_NEGATIVE
11996
36.5M
    sp_uint8 sign = MP_ZPOS;
11997
36.5M
#endif
11998
11999
36.5M
    if ((a == NULL) || (b == NULL) || (r == NULL)) {
12000
0
        err = MP_VAL;
12001
0
    }
12002
12003
    /* Need extra digit during calculation. */
12004
    /* NOLINTBEGIN(clang-analyzer-core.UndefinedBinaryOperatorResult) */
12005
    /* clang-tidy falsely believes that r->size was corrupted by the _sp_copy()
12006
     * to "Copy base into working variable" in _sp_exptmod_ex().
12007
     */
    /* The result must have room for a->used + b->used digits. */
12008
36.5M
    if ((err == MP_OKAY) && (a->used + b->used > r->size)) {
12009
29
        err = MP_VAL;
12010
29
    }
12011
    /* NOLINTEND(clang-analyzer-core.UndefinedBinaryOperatorResult) */
12012
12013
#if 0
12014
    if (err == MP_OKAY) {
12015
        sp_print(a, "a");
12016
        sp_print(b, "b");
12017
    }
12018
#endif
12019
12020
36.5M
    if (err == MP_OKAY) {
12021
36.5M
    #ifdef WOLFSSL_SP_INT_NEGATIVE
        /* Capture the result sign before r is written. */
12022
36.5M
        sign = a->sign ^ b->sign;
12023
36.5M
    #endif
12024
12025
        /* Anything times zero is zero. */
36.5M
        if ((a->used == 0) || (b->used == 0)) {
12026
159k
            _sp_zero(r);
12027
159k
        }
12028
36.3M
        else
        /* From here on this is one if/else-if chain; each conditionally
         * compiled case ends in a dangling 'else' that falls through to the
         * next enabled case.
         */
12029
36.3M
#ifndef WOLFSSL_SP_SMALL
12030
36.3M
#if !defined(WOLFSSL_HAVE_SP_ECC) && defined(HAVE_ECC)
12031
36.3M
#if (SP_WORD_SIZE == 64 && SP_INT_BITS >= 256)
12032
36.3M
        if ((a->used == 4) && (b->used == 4)) {
12033
15.5M
            err = _sp_mul_4(a, b, r);
12034
15.5M
        }
12035
20.8M
        else
12036
20.8M
#endif /* SP_WORD_SIZE == 64 */
12037
20.8M
#if (SP_WORD_SIZE == 64 && SP_INT_BITS >= 384)
12038
20.8M
#ifdef SQR_MUL_ASM
12039
20.8M
        if ((a->used == 6) && (b->used == 6)) {
12040
5.81M
            err = _sp_mul_6(a, b, r);
12041
5.81M
        }
12042
15.0M
        else
12043
15.0M
#endif /* SQR_MUL_ASM */
12044
15.0M
#endif /* SP_WORD_SIZE == 64 */
12045
#if (SP_WORD_SIZE == 32 && SP_INT_BITS >= 256)
12046
#ifdef SQR_MUL_ASM
12047
        if ((a->used == 8) && (b->used == 8)) {
12048
            err = _sp_mul_8(a, b, r);
12049
        }
12050
        else
12051
#endif /* SQR_MUL_ASM */
12052
#endif /* SP_WORD_SIZE == 32 */
12053
#if (SP_WORD_SIZE == 32 && SP_INT_BITS >= 384)
12054
#ifdef SQR_MUL_ASM
12055
        if ((a->used == 12) && (b->used == 12)) {
12056
            err = _sp_mul_12(a, b, r);
12057
        }
12058
        else
12059
#endif /* SQR_MUL_ASM */
12060
#endif /* SP_WORD_SIZE == 32 */
12061
15.0M
#endif /* !WOLFSSL_HAVE_SP_ECC && HAVE_ECC */
12062
#if defined(SQR_MUL_ASM) && (defined(WOLFSSL_SP_INT_LARGE_COMBA) || \
12063
    (!defined(WOLFSSL_SP_MATH) && defined(WOLFCRYPT_HAVE_SAKKE) && \
12064
    (SP_WORD_SIZE == 64)))
12065
    #if SP_INT_DIGITS >= 32
12066
        if ((a->used == 16) && (b->used == 16)) {
12067
            err = _sp_mul_16(a, b, r);
12068
        }
12069
        else
12070
    #endif /* SP_INT_DIGITS >= 32 */
12071
#endif /* SQR_MUL_ASM && (WOLFSSL_SP_INT_LARGE_COMBA || !WOLFSSL_SP_MATH &&
12072
        * WOLFCRYPT_HAVE_SAKKE && SP_WORD_SIZE == 64 */
12073
#if defined(SQR_MUL_ASM) && defined(WOLFSSL_SP_INT_LARGE_COMBA)
12074
    #if SP_INT_DIGITS >= 48
12075
        if ((a->used == 24) && (b->used == 24)) {
12076
            err = _sp_mul_24(a, b, r);
12077
        }
12078
        else
12079
    #endif /* SP_INT_DIGITS >= 48 */
12080
    #if SP_INT_DIGITS >= 64
12081
        if ((a->used == 32) && (b->used == 32)) {
12082
            err = _sp_mul_32(a, b, r);
12083
        }
12084
        else
12085
    #endif /* SP_INT_DIGITS >= 64 */
12086
    #if SP_INT_DIGITS >= 96
12087
        if ((a->used == 48) && (b->used == 48)) {
12088
            err = _sp_mul_48(a, b, r);
12089
        }
12090
        else
12091
    #endif /* SP_INT_DIGITS >= 96 */
12092
    #if SP_INT_DIGITS >= 128
12093
        if ((a->used == 64) && (b->used == 64)) {
12094
            err = _sp_mul_64(a, b, r);
12095
        }
12096
        else
12097
    #endif /* SP_INT_DIGITS >= 128 */
12098
    #if SP_INT_DIGITS >= 192
12099
        if ((a->used == 96) && (b->used == 96)) {
12100
            err = _sp_mul_96(a, b, r);
12101
        }
12102
        else
12103
    #endif /* SP_INT_DIGITS >= 192 */
12104
#endif /* SQR_MUL_ASM && WOLFSSL_SP_INT_LARGE_COMBA */
12105
15.0M
#endif /* !WOLFSSL_SP_SMALL */
12106
12107
        /* No fixed-size case matched: use the equal-length routine when it is
         * available and the operands have the same digit count, otherwise the
         * generic one.
         */
15.0M
#ifdef SQR_MUL_ASM
12108
15.0M
        if (a->used == b->used) {
12109
14.1M
            err = _sp_mul_nxn(a, b, r);
12110
14.1M
        }
12111
867k
        else
12112
867k
#endif
12113
867k
        {
12114
867k
            err = _sp_mul(a, b, r);
12115
867k
        }
12116
36.5M
    }
12117
12118
36.5M
#ifdef WOLFSSL_SP_INT_NEGATIVE
12119
36.5M
    if (err == MP_OKAY) {
        /* A zero result is always positive. */
12120
36.5M
        r->sign = (r->used == 0) ? MP_ZPOS : sign;
12121
36.5M
    }
12122
36.5M
#endif
12123
12124
#if 0
12125
    if (err == MP_OKAY) {
12126
        sp_print(r, "rmul");
12127
    }
12128
#endif
12129
12130
36.5M
    return err;
12131
36.5M
}
12132
/* END SP_MUL implementations. */
12133
12134
#endif
12135
12136
#if defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_HAVE_SP_DH) || \
12137
    defined(WOLFCRYPT_HAVE_ECCSI) || \
12138
    (!defined(NO_RSA) && defined(WOLFSSL_KEY_GEN)) || defined(OPENSSL_ALL)
12139
/* Multiply a by b mod m and store in r: r = (a * b) mod m
12140
 *
12141
 * @param  [in]   a  SP integer to multiply.
12142
 * @param  [in]   b  SP integer to multiply.
12143
 * @param  [in]   m  SP integer that is the modulus.
12144
 * @param  [out]  r  SP integer result.
12145
 *
12146
 * @return  MP_OKAY on success.
12147
 * @return  MP_MEM when dynamic memory allocation fails.
12148
 */
12149
static int _sp_mulmod_tmp(const sp_int* a, const sp_int* b, const sp_int* m,
12150
    sp_int* r)
12151
95
{
12152
95
    int err = MP_OKAY;
12153
    /* Create temporary for multiplication result. */
12154
95
    DECL_SP_INT(t, a->used + b->used);
12155
12156
95
    ALLOC_SP_INT(t, a->used + b->used, err, NULL);
12157
95
    if (err == MP_OKAY) {
12158
60
        err = sp_init_size(t, (sp_size_t)(a->used + b->used));
12159
60
    }
12160
12161
    /* Multiply and reduce. */
12162
95
    if (err == MP_OKAY) {
12163
53
        err = sp_mul(a, b, t);
12164
53
    }
12165
95
    if (err == MP_OKAY) {
12166
48
        err = sp_mod(t, m, r);
12167
48
    }
12168
12169
    /* Dispose of an allocated SP int. */
12170
95
    FREE_SP_INT(t, NULL);
12171
12172
95
    return err;
12173
95
}
12174
12175
/* Multiply a by b mod m and store in r: r = (a * b) mod m
12176
 *
12177
 * @param  [in]   a  SP integer to multiply.
12178
 * @param  [in]   b  SP integer to multiply.
12179
 * @param  [in]   m  SP integer that is the modulus.
12180
 * @param  [out]  r  SP integer result.
12181
 *
12182
 * @return  MP_OKAY on success.
12183
 * @return  MP_MEM when dynamic memory allocation fails.
12184
 */
12185
static int _sp_mulmod(const sp_int* a, const sp_int* b, const sp_int* m,
12186
    sp_int* r)
12187
7.01M
{
12188
7.01M
    int err = MP_OKAY;
12189
12190
    /* Use r as intermediate result if not same as pointer m which is needed
12191
     * after first intermediate result.
12192
     */
12193
7.01M
    if (r != m) {
12194
        /* Multiply and reduce. */
12195
7.01M
        err = sp_mul(a, b, r);
12196
7.01M
        if (err == MP_OKAY) {
12197
7.01M
            err = sp_mod(r, m, r);
12198
7.01M
        }
12199
7.01M
    }
12200
95
    else {
12201
        /* Do operation using temporary. */
12202
95
        err = _sp_mulmod_tmp(a, b, m, r);
12203
95
    }
12204
12205
7.01M
    return err;
12206
7.01M
}
12207
12208
/* Multiply a by b mod m and store in r: r = (a * b) mod m
12209
 *
12210
 * @param  [in]   a  SP integer to multiply.
12211
 * @param  [in]   b  SP integer to multiply.
12212
 * @param  [in]   m  SP integer that is the modulus.
12213
 * @param  [out]  r  SP integer result.
12214
 *
12215
 * @return  MP_OKAY on success.
12216
 * @return  MP_VAL when a, b, m or r is NULL; m is 0; or a * b is too big for
12217
 *          fixed data length.
12218
 * @return  MP_MEM when dynamic memory allocation fails.
12219
 */
12220
int sp_mulmod(const sp_int* a, const sp_int* b, const sp_int* m, sp_int* r)
12221
983k
{
12222
983k
    int err = MP_OKAY;
12223
12224
    /* Validate parameters. */
12225
983k
    if ((a == NULL) || (b == NULL) || (m == NULL) || (r == NULL)) {
12226
0
        err = MP_VAL;
12227
0
    }
12228
    /* Ensure result SP int is big enough for intermediates. */
12229
983k
    if ((err == MP_OKAY) && (r != m) && (a->used + b->used > r->size)) {
12230
46
        err = MP_VAL;
12231
46
    }
12232
12233
#if 0
12234
    if (err == 0) {
12235
        sp_print(a, "a");
12236
        sp_print(b, "b");
12237
        sp_print(m, "m");
12238
    }
12239
#endif
12240
12241
983k
    if (err == MP_OKAY) {
12242
983k
        err = _sp_mulmod(a, b, m, r);
12243
983k
    }
12244
12245
#if 0
12246
    if (err == 0) {
12247
        sp_print(r, "rmm");
12248
    }
12249
#endif
12250
12251
983k
    return err;
12252
983k
}
12253
#endif
12254
12255
#ifdef WOLFSSL_SP_INVMOD
12256
/* Calculates the multiplicative inverse in the field. r*a = x*m + 1
12257
 * Right-shift Algorithm. NOT constant time.
12258
 *
12259
 * Algorithm:
12260
 *   1. u = m, v = a, b = 0, c = 1
12261
 *   2. While v != 1 and u != 0
12262
 *     2.1. If u even
12263
 *       2.1.1. u /= 2
12264
 *       2.1.2. b = (b / 2) mod m
12265
 *     2.2. Else if v even
12266
 *       2.2.1. v /= 2
12267
 *       2.2.2. c = (c / 2) mod m
12268
 *     2.3. Else if u >= v
12269
 *       2.3.1. u -= v
12270
 *       2.3.2. b = (c - b) mod m
12271
 *     2.4. Else (v > u)
12272
 *       2.4.1. v -= u
12273
 *       2.4.2. c = (b - c) mod m
12274
 *  3. NO_INVERSE if u == 0
12275
 *
12276
 * @param  [in]   a  SP integer to find inverse of.
12277
 * @param  [in]   m  SP integer this is the modulus.
12278
 * @param  [in]   u  SP integer to use in calculation.
12279
 * @param  [in]   v  SP integer to use in calculation.
12280
 * @param  [in]   b  SP integer to use in calculation
12281
 * @param  [out]  c  SP integer that is the inverse.
12282
 *
12283
 * @return  MP_OKAY on success.
12284
 * @return  MP_VAL when no inverse.
12285
 */
12286
static int _sp_invmod_bin(const sp_int* a, const sp_int* m, sp_int* u,
12287
    sp_int* v, sp_int* b, sp_int* c)
12288
11.5k
{
12289
11.5k
    int err = MP_OKAY;
12290
12291
    /* 1. u = m, v = a, b = 0, c = 1 */
12292
11.5k
    _sp_copy(m, u);
12293
11.5k
    if (a != v) {
12294
11.2k
        _sp_copy(a, v);
12295
11.2k
    }
12296
11.5k
    _sp_zero(b);
12297
11.5k
    _sp_set(c, 1);
12298
12299
    /* 2. While v != 1 and u != 0 */
12300
5.62M
    while (!sp_isone(v) && !sp_iszero(u)) {
12301
        /* 2.1. If u even */
12302
5.61M
        if ((u->dp[0] & 1) == 0) {
12303
            /* 2.1.1. u /= 2 */
12304
1.87M
            _sp_div_2(u, u);
12305
            /* 2.1.2. b = (b / 2) mod m */
12306
1.87M
            if (sp_isodd(b)) {
12307
1.01M
                _sp_add_off(b, m, b, 0);
12308
1.01M
            }
12309
1.87M
            _sp_div_2(b, b);
12310
1.87M
        }
12311
        /* 2.2. Else if v even */
12312
3.73M
        else if ((v->dp[0] & 1) == 0) {
12313
            /* 2.2.1. v /= 2 */
12314
1.92M
            _sp_div_2(v, v);
12315
            /* 2.1.2. c = (c / 2) mod m */
12316
1.92M
            if (sp_isodd(c)) {
12317
779k
                _sp_add_off(c, m, c, 0);
12318
779k
            }
12319
1.92M
            _sp_div_2(c, c);
12320
1.92M
        }
12321
        /* 2.3. Else if u >= v */
12322
1.80M
        else if (_sp_cmp_abs(u, v) != MP_LT) {
12323
            /* 2.3.1. u -= v */
12324
918k
            _sp_sub_off(u, v, u, 0);
12325
            /* 2.3.2. b = (c - b) mod m */
12326
918k
            if (_sp_cmp_abs(b, c) == MP_LT) {
12327
384k
                _sp_add_off(b, m, b, 0);
12328
384k
            }
12329
918k
            _sp_sub_off(b, c, b, 0);
12330
918k
        }
12331
        /* 2.4. Else (v > u) */
12332
891k
        else {
12333
            /* 2.4.1. v -= u */
12334
891k
            _sp_sub_off(v, u, v, 0);
12335
            /* 2.4.2. c = (b - c) mod m */
12336
891k
            if (_sp_cmp_abs(c, b) == MP_LT) {
12337
496k
                _sp_add_off(c, m, c, 0);
12338
496k
            }
12339
891k
            _sp_sub_off(c, b, c, 0);
12340
891k
        }
12341
5.61M
    }
12342
    /* 3. NO_INVERSE if u == 0 */
12343
11.5k
    if (sp_iszero(u)) {
12344
148
        err = MP_VAL;
12345
148
    }
12346
12347
11.5k
    return err;
12348
11.5k
}
12349
12350
#if !defined(WOLFSSL_SP_LOW_MEM) && !defined(WOLFSSL_SP_SMALL) && \
12351
    (!defined(NO_RSA) || !defined(NO_DH))
12352
/* Calculates the multiplicative inverse in the field. r*a = x*m + 1
12353
 * Extended Euclidean Algorithm. NOT constant time.
12354
 *
12355
 * Creates two new SP ints.
12356
 *
12357
 * Algorithm:
12358
 *  1. x = m, y = a, b = 1, c = 0
12359
 *  2. while x > 1
12360
 *   2.1. d = x / y, r = x mod y
12361
 *   2.2. c -= d * b
12362
 *   2.3. x = y, y = r
12363
 *   2.4. s = b, b = c, c = s
12364
 *  3. If y != 0 then NO_INVERSE
12365
 *  4. If c < 0 then c += m
12366
 *  5. inv = c
12367
 *
12368
 * @param  [in]   a    SP integer to find inverse of.
12369
 * @param  [in]   m    SP integer this is the modulus.
12370
 * @param  [in]   u    SP integer to use in calculation.
12371
 * @param  [in]   v    SP integer to use in calculation.
12372
 * @param  [in]   b    SP integer to use in calculation
12373
 * @param  [in]   c    SP integer to use in calculation
12374
 * @param  [out]  inv  SP integer that is the inverse.
12375
 *
12376
 * @return  MP_OKAY on success.
12377
 * @return  MP_VAL when no inverse.
12378
 * @return  MP_MEM when dynamic memory allocation fails.
12379
 */
12380
static int _sp_invmod_div(const sp_int* a, const sp_int* m, sp_int* x,
12381
    sp_int* y, sp_int* b, sp_int* c, sp_int* inv)
12382
8.96k
{
12383
8.96k
    int err = MP_OKAY;
12384
8.96k
    sp_int* s;
12385
#ifndef WOLFSSL_SP_INT_NEGATIVE
12386
    int bneg = 0;
12387
    int cneg = 0;
12388
    int neg;
12389
#endif
12390
8.96k
    DECL_SP_INT(d, m->used + 1);
12391
12392
8.96k
    ALLOC_SP_INT(d, m->used + 1, err, NULL);
12393
8.96k
    if (err == MP_OKAY) {
12394
8.95k
        err = sp_init_size(d, (sp_size_t)(m->used + 1U));
12395
8.95k
    }
12396
12397
8.96k
    if (err == MP_OKAY) {
12398
        /* 1. x = m, y = a, b = 1, c = 0 */
12399
8.95k
        if (a != y) {
12400
8.90k
            _sp_copy(a, y);
12401
8.90k
        }
12402
8.95k
        _sp_copy(m, x);
12403
8.95k
        _sp_set(b, 1);
12404
8.95k
        _sp_zero(c);
12405
8.95k
    }
12406
8.96k
#ifdef WOLFSSL_SP_INT_NEGATIVE
12407
    /* 2. while x > 1 */
12408
9.73M
    while ((err == MP_OKAY) && (!sp_isone(x)) && (!sp_iszero(x))) {
12409
        /* 2.1. d = x / y, r = x mod y */
12410
9.72M
        err = sp_div(x, y, d, x);
12411
9.72M
        if (err == MP_OKAY) {
12412
            /* 2.2. c -= d * b */
12413
9.72M
            if (sp_isone(d)) {
12414
                /* c -= 1 * b */
12415
4.02M
                err = sp_sub(c, b, c);
12416
4.02M
            }
12417
5.69M
            else {
12418
                /* d *= b */
12419
5.69M
                err = sp_mul(d, b, d);
12420
                /* c -= d */
12421
5.69M
                if (err == MP_OKAY) {
12422
5.69M
                    err = sp_sub(c, d, c);
12423
5.69M
                }
12424
5.69M
            }
12425
            /* 2.3. x = y, y = r */
12426
9.72M
            s = y; y = x; x = s;
12427
            /* 2.4. s = b, b = c, c = s */
12428
9.72M
            s = b; b = c; c = s;
12429
9.72M
        }
12430
9.72M
    }
12431
    /* 3. If y != 0 then NO_INVERSE */
12432
8.96k
    if ((err == MP_OKAY) && (!sp_iszero(y))) {
12433
0
        err = MP_VAL;
12434
0
    }
12435
    /* 4. If c < 0 then c += m */
12436
8.96k
    if ((err == MP_OKAY) && sp_isneg(c)) {
12437
4.14k
        err = sp_add(c, m, c);
12438
4.14k
    }
12439
8.96k
    if (err == MP_OKAY) {
12440
        /* 5. inv = c */
12441
8.21k
        err = sp_copy(c, inv);
12442
8.21k
    }
12443
#else
12444
    /* 2. while x > 1 */
12445
    while ((err == MP_OKAY) && (!sp_isone(x)) && (!sp_iszero(x))) {
12446
        /* 2.1. d = x / y, r = x mod y */
12447
        err = sp_div(x, y, d, x);
12448
        if (err == MP_OKAY) {
12449
            if (sp_isone(d)) {
12450
                /* c -= 1 * b */
12451
                if ((bneg ^ cneg) == 1) {
12452
                    /* c -= -b or -c -= b, therefore add. */
12453
                    _sp_add_off(c, b, c, 0);
12454
                }
12455
                else if (_sp_cmp_abs(c, b) == MP_LT) {
12456
                    /* |c| < |b| and same sign, reverse subtract and negate. */
12457
                    _sp_sub_off(b, c, c, 0);
12458
                    cneg = !cneg;
12459
                }
12460
                else {
12461
                    /* |c| >= |b| */
12462
                    _sp_sub_off(c, b, c, 0);
12463
                }
12464
            }
12465
            else {
12466
                /* d *= b */
12467
                err = sp_mul(d, b, d);
12468
                /* c -= d */
12469
                if (err == MP_OKAY) {
12470
                    if ((bneg ^ cneg) == 1) {
12471
                        /* c -= -d or -c -= d, therefore add. */
12472
                        _sp_add_off(c, d, c, 0);
12473
                    }
12474
                    else if (_sp_cmp_abs(c, d) == MP_LT) {
12475
                        /* |c| < |d| and same sign, reverse subtract and negate.
12476
                         */
12477
                        _sp_sub_off(d, c, c, 0);
12478
                        cneg = !cneg;
12479
                    }
12480
                    else {
12481
                        _sp_sub_off(c, d, c, 0);
12482
                    }
12483
                }
12484
            }
12485
            /* 2.3. x = y, y = r */
12486
            s = y; y = x; x = s;
12487
            /* 2.4. s = b, b = c, c = s */
12488
            s = b; b = c; c = s;
12489
            neg = bneg; bneg = cneg; cneg = neg;
12490
        }
12491
    }
12492
    /* 3. If y != 0 then NO_INVERSE */
12493
    if ((err == MP_OKAY) && (!sp_iszero(y))) {
12494
        err = MP_VAL;
12495
    }
12496
    /* 4. If c < 0 then c += m */
12497
    if ((err == MP_OKAY) && cneg) {
12498
        /* c = m - |c| */
12499
        _sp_sub_off(m, c, c, 0);
12500
    }
12501
    if (err == MP_OKAY) {
12502
        /* 5. inv = c */
12503
        err = sp_copy(c, inv);
12504
    }
12505
#endif
12506
12507
8.96k
    FREE_SP_INT(d, NULL);
12508
8.96k
    return err;
12509
8.96k
}
12510
#endif
12511
12512
/* Calculates the multiplicative inverse in the field.
12513
 * Right-shift Algorithm or Extended Euclidean Algorithm. NOT constant time.
12514
 *
12515
 * r*a = x*m + 1
12516
 *
12517
 * @param  [in]   a  SP integer to find inverse of.
12518
 * @param  [in]   m  SP integer this is the modulus.
12519
 * @param  [out]  r  SP integer to hold result. r cannot be m.
12520
 *
12521
 * @return  MP_OKAY on success.
12522
 * @return  MP_VAL when m is even and a divides m evenly.
12523
 * @return  MP_MEM when dynamic memory allocation fails.
12524
 */
12525
static int _sp_invmod(const sp_int* a, const sp_int* m, sp_int* r)
12526
26.9k
{
12527
26.9k
    int err = MP_OKAY;
12528
26.9k
    sp_int* u = NULL;
12529
26.9k
    sp_int* v = NULL;
12530
26.9k
    sp_int* b = NULL;
12531
26.9k
    DECL_SP_INT_ARRAY(t, m->used + 1, 3);
12532
26.9k
    DECL_SP_INT(c, 2 * m->used + 1);
12533
12534
    /* Allocate SP ints:
12535
     *  - x3 one word larger than modulus
12536
     *  - x1 one word longer than twice modulus used
12537
     */
12538
26.9k
    ALLOC_SP_INT_ARRAY(t, m->used + 1U, 3, err, NULL);
12539
26.9k
    ALLOC_SP_INT(c, 2 * m->used + 1, err, NULL);
12540
26.9k
    if (err == MP_OKAY) {
12541
26.8k
        u = t[0];
12542
26.8k
        v = t[1];
12543
26.8k
        b = t[2];
12544
        /* c allocated separately and larger for even mod case. */
12545
26.8k
    }
12546
12547
    /* Initialize intermediate values with minimal sizes. */
12548
26.9k
    if (err == MP_OKAY) {
12549
26.8k
        err = sp_init_size(u, (sp_size_t)(m->used + 1U));
12550
26.8k
    }
12551
26.9k
    if (err == MP_OKAY) {
12552
26.8k
        err = sp_init_size(v, (sp_size_t)(m->used + 1U));
12553
26.8k
    }
12554
26.9k
    if (err == MP_OKAY) {
12555
26.8k
        err = sp_init_size(b, (sp_size_t)(m->used + 1U));
12556
26.8k
    }
12557
26.9k
    if (err == MP_OKAY) {
12558
26.8k
        err = sp_init_size(c, (sp_size_t)(2U * m->used + 1U));
12559
26.8k
    }
12560
12561
26.9k
    if (err == MP_OKAY) {
12562
26.8k
        const sp_int* mm = m;
12563
26.8k
        const sp_int* ma = a;
12564
26.8k
        int evenMod = 0;
12565
12566
26.8k
        if (sp_iseven(m)) {
12567
            /* a^-1 mod m = m + ((1 - m*(m^-1 % a)) / a) */
12568
1.15k
            mm = a;
12569
1.15k
            ma = v;
12570
1.15k
            _sp_copy(a, u);
12571
1.15k
            err = sp_mod(m, a, v);
12572
            /* v == 0 when a divides m evenly - no inverse.  */
12573
1.15k
            if ((err == MP_OKAY) && sp_iszero(v)) {
12574
50
                err = MP_VAL;
12575
50
            }
12576
1.15k
            evenMod = 1;
12577
1.15k
        }
12578
12579
26.8k
        if (err == MP_OKAY) {
12580
            /* Calculate inverse. */
12581
26.7k
        #if !defined(WOLFSSL_SP_LOW_MEM) && !defined(WOLFSSL_SP_SMALL) && \
12582
26.7k
            (!defined(NO_RSA) || !defined(NO_DH))
12583
26.7k
            if (sp_count_bits(mm) >= 1024) {
12584
9.26k
                err = _sp_invmod_div(ma, mm, u, v, b, c, c);
12585
9.26k
            }
12586
17.5k
            else
12587
17.5k
        #endif
12588
17.5k
            {
12589
17.5k
                err = _sp_invmod_bin(ma, mm, u, v, b, c);
12590
17.5k
            }
12591
26.7k
        }
12592
12593
        /* Fixup for even modulus. */
12594
26.8k
        if ((err == MP_OKAY) && evenMod) {
12595
            /* Finish operation.
12596
             *    a^-1 mod m = m + ((1 - m*c) / a)
12597
             * => a^-1 mod m = m - ((m*c - 1) / a)
12598
             */
12599
919
            err = sp_mul(c, m, c);
12600
919
            if (err == MP_OKAY) {
12601
915
                _sp_sub_d(c, 1, c);
12602
915
                err = sp_div(c, a, c, NULL);
12603
915
            }
12604
919
            if (err == MP_OKAY) {
12605
913
                err = sp_sub(m, c, r);
12606
913
            }
12607
919
        }
12608
25.9k
        else if (err == MP_OKAY) {
12609
24.7k
            _sp_copy(c, r);
12610
24.7k
        }
12611
26.8k
    }
12612
12613
26.9k
    FREE_SP_INT(c, NULL);
12614
26.9k
    FREE_SP_INT_ARRAY(t, NULL);
12615
26.9k
    return err;
12616
26.9k
}
12617
12618
/* Calculates the multiplicative inverse in the field.
12619
 * Right-shift Algorithm or Extended Euclidean Algorithm. NOT constant time.
12620
 *
12621
 * r*a = x*m + 1
12622
 *
12623
 * @param  [in]   a  SP integer to find inverse of.
12624
 * @param  [in]   m  SP integer this is the modulus.
12625
 * @param  [out]  r  SP integer to hold result. r cannot be m.
12626
 *
12627
 * @return  MP_OKAY on success.
12628
 * @return  MP_VAL when a, m or r is NULL; a or m is zero; a and m are even or
12629
 *          m is negative.
12630
 * @return  MP_MEM when dynamic memory allocation fails.
12631
 */
12632
int sp_invmod(const sp_int* a, const sp_int* m, sp_int* r)
12633
21.3k
{
12634
21.3k
    int err = MP_OKAY;
12635
12636
    /* Validate parameters. */
12637
21.3k
    if ((a == NULL) || (m == NULL) || (r == NULL) || (r == m)) {
12638
6
        err = MP_VAL;
12639
6
    }
12640
21.3k
    if ((err == MP_OKAY) && (m->used * 2 > r->size)) {
12641
26
        err = MP_VAL;
12642
26
    }
12643
12644
21.3k
#ifdef WOLFSSL_SP_INT_NEGATIVE
12645
    /* Don't support negative modulus. */
12646
21.3k
    if ((err == MP_OKAY) && (m->sign == MP_NEG)) {
12647
25
        err = MP_VAL;
12648
25
    }
12649
21.3k
#endif
12650
12651
21.3k
    if (err == MP_OKAY) {
12652
        /* Ensure number is less than modulus. */
12653
21.3k
        if (_sp_cmp_abs(a, m) != MP_LT) {
12654
2.90k
            err = sp_mod(a, m, r);
12655
2.90k
            a = r;
12656
2.90k
        }
12657
21.3k
    }
12658
12659
21.3k
#ifdef WOLFSSL_SP_INT_NEGATIVE
12660
21.3k
    if ((err == MP_OKAY) && (a->sign == MP_NEG)) {
12661
        /* Make 'a' positive */
12662
161
        err = sp_add(m, a, r);
12663
161
        a = r;
12664
161
    }
12665
21.3k
#endif
12666
12667
    /* 0 != n*m + 1 (+ve m), r*a mod 0 is always 0 (never 1)  */
12668
21.3k
    if ((err == MP_OKAY) && (sp_iszero(a) || sp_iszero(m))) {
12669
32
        err = MP_VAL;
12670
32
    }
12671
    /* r*2*x != n*2*y + 1 for integer x,y */
12672
21.3k
    if ((err == MP_OKAY) && sp_iseven(a) && sp_iseven(m)) {
12673
31
        err = MP_VAL;
12674
31
    }
12675
    /* 1*1 = 0*m + 1  */
12676
21.3k
    if ((err == MP_OKAY) && sp_isone(a)) {
12677
601
        _sp_set(r, 1);
12678
601
    }
12679
20.7k
    else if (err == MP_OKAY) {
12680
20.6k
        err = _sp_invmod(a, m, r);
12681
20.6k
    }
12682
12683
21.3k
    return err;
12684
21.3k
}
12685
#endif /* WOLFSSL_SP_INVMOD */
12686
12687
#ifdef WOLFSSL_SP_INVMOD_MONT_CT
12688
12689
/* Number of entries to pre-compute.
12690
 * Many pre-defined primes have runs of consecutive 1 bits whose length is a
 * multiple of 8.
12691
 * P-256 modulus - 2 => 32x1, 31x0, 1x1, 96x0, 94x1, 1x0, 1x1.
12692
 */
12693
4.92M
#define CT_INV_MOD_PRE_CNT      8
12694
12695
/* Calculates the multiplicative inverse in the field - constant time.
12696
 *
12697
 * Modulus (m) must be a prime and greater than 2.
12698
 * For prime m, inv = a ^ (m-2) mod m as 1 = a ^ (m-1) mod m.
12699
 *
12700
 * Algorithm:
12701
 *  pre = pre-computed values, m = modulus, a = value to find inverse of,
12702
 *  e = exponent
12703
 *  Pre-calc:
12704
 *   1. pre[0] = 2^0 * a mod m
12705
 *   2. For i in 2..CT_INV_MOD_PRE_CNT
12706
 *    2.1. pre[i-1] = ((pre[i-2] ^ 2) * a) mod m
12707
 *  Calc inverse:
12708
 *   1. e = m - 2
12709
 *   2. j = Count leading 1's up to CT_INV_MOD_PRE_CNT
12710
 *   3. t = pre[j-1]
12711
 *   4. s = 0
12712
 *   5. j = 0
12713
 *   6. For i index of next top bit..0
12714
 *    6.1. bit = e[i]
12715
 *    6.2. j += bit
12716
 *    6.3. s += 1
12717
 *    6.4. if j == CT_INV_MOD_PRE_CNT or (bit == 0 and j > 0)
12718
 *     6.4.1. s -= 1 - bit
12719
 *     6.4.2. For s downto 1
12720
 *      6.4.2.1. t = (t ^ 2) mod m
12721
 *     6.4.3. s = 1 - bit
12722
 *     6.4.4. t = (t * pre[j-1]) mod m
12723
 *     6.4.5. j = 0
12724
 *   7. For s downto 1
12725
 *    7.1. t = (t ^ 2) mod m
12726
 *   8. If j > 0 then r = (t * pre[j-1]) mod m
12727
 *   9. Else r = t
12728
 *
12729
 * @param  [in]   a   SP integer, Montgomery form, to find inverse of.
12730
 * @param  [in]   m   SP integer this is the modulus.
12731
 * @param  [out]  r   SP integer to hold result.
12732
 * @param  [in]   mp  SP integer digit that is the bottom digit of inv(-m).
12733
 *
12734
 * @return  MP_OKAY on success.
12735
 * @return  MP_MEM when dynamic memory allocation fails.
12736
 */
12737
static int _sp_invmod_mont_ct(const sp_int* a, const sp_int* m, sp_int* r,
12738
    sp_int_digit mp)
12739
17.4k
{
12740
17.4k
    int err = MP_OKAY;
12741
17.4k
    int i;
12742
17.4k
    int j = 0;
12743
17.4k
    int s = 0;
12744
17.4k
    sp_int* t = NULL;
12745
17.4k
    sp_int* e = NULL;
12746
17.4k
#ifndef WOLFSSL_SP_NO_MALLOC
12747
17.4k
    DECL_DYN_SP_INT_ARRAY(pre, m->used * 2 + 1, CT_INV_MOD_PRE_CNT + 2);
12748
#else
12749
    DECL_SP_INT_ARRAY(pre, m->used * 2 + 1, CT_INV_MOD_PRE_CNT + 2);
12750
#endif
12751
12752
17.4k
#ifndef WOLFSSL_SP_NO_MALLOC
12753
17.4k
    ALLOC_DYN_SP_INT_ARRAY(pre, m->used * 2U + 1U, CT_INV_MOD_PRE_CNT + 2, err,
12754
17.4k
        NULL);
12755
#else
12756
    ALLOC_SP_INT_ARRAY(pre, m->used * 2U + 1U, CT_INV_MOD_PRE_CNT + 2, err, NULL);
12757
#endif
12758
17.4k
    if (err == MP_OKAY) {
12759
17.3k
        /* The two entries after the table are used as the working value (t)
         * and the exponent (e) rather than as table entries.
         */
        t = pre[CT_INV_MOD_PRE_CNT + 0];
12760
17.3k
        e = pre[CT_INV_MOD_PRE_CNT + 1];
12761
        /* Space for sqr and mul result. */
12762
17.3k
        _sp_init_size(t, (sp_size_t)(m->used * 2 + 1));
12763
        /* e = mod - 2 */
12764
17.3k
        _sp_init_size(e, (sp_size_t)(m->used + 1));
12765
12766
        /* Create pre-computation results:
         *   pre[k-1] = a ^ ((2^k)-1) for k in 1..CT_INV_MOD_PRE_CNT.
         */
12767
17.3k
        _sp_init_size(pre[0], (sp_size_t)(m->used * 2 + 1));
12768
        /* 1. pre[0] = 2^0 * a mod m
12769
         *    Start with 1.a = a.
12770
         */
12771
17.3k
        _sp_copy(a, pre[0]);
12772
        /* 2. For i in 2..CT_INV_MOD_PRE_CNT
12773
         *    For rest of entries in table.
12774
         */
12775
138k
        for (i = 1; (err == MP_OKAY) && (i < CT_INV_MOD_PRE_CNT); i++) {
12776
            /* 2.1 pre[i] = ((pre[i-1] ^ 2) * a) mod m
             *     (i here is 0-based; the algorithm description is 1-based.)
             */
12777
            /* Previous value ..1 -> ..10 */
12778
121k
            _sp_init_size(pre[i], (sp_size_t)(m->used * 2 + 1));
12779
121k
            err = sp_sqr(pre[i-1], pre[i]);
12780
121k
            if (err == MP_OKAY) {
12781
121k
                err = _sp_mont_red(pre[i], m, mp, 0);
12782
121k
            }
12783
            /* ..10 -> ..11 */
12784
121k
            if (err == MP_OKAY) {
12785
121k
                err = sp_mul(pre[i], a, pre[i]);
12786
121k
            }
12787
121k
            if (err == MP_OKAY) {
12788
121k
                err = _sp_mont_red(pre[i], m, mp, 0);
12789
121k
            }
12790
121k
        }
12791
17.3k
    }
12792
12793
17.4k
    if (err == MP_OKAY) {
12794
        /* 1. e = m - 2 */
12795
17.2k
        _sp_sub_d(m, 2, e);
12796
        /* 2. j = Count leading 1's up to CT_INV_MOD_PRE_CNT
12797
         *    One or more of the top bits is 1 so count.
12798
         */
12799
126k
        for (i = sp_count_bits(e)-2, j = 1; i >= 0; i--, j++) {
12800
126k
            if ((!sp_is_bit_set(e, (unsigned int)i)) ||
12801
124k
                    (j == CT_INV_MOD_PRE_CNT)) {
12802
17.2k
                break;
12803
17.2k
            }
12804
126k
        }
12805
        /* 3. t = pre[j-1]
         *    Start with the table entry for the run of j leading 1 bits.
         */
12806
17.2k
        _sp_copy(pre[j-1], t);
12807
12808
        /* 4. s = 0 */
12809
17.2k
        s = 0;
12810
        /* 5. j = 0 */
12811
17.2k
        j = 0;
12812
        /* 6. For i index of next top bit..0
12813
         *    Do remaining bits in exponent.
12814
         */
12815
4.64M
        for (; (err == MP_OKAY) && (i >= 0); i--) {
12816
            /* 6.1. bit = e[i] */
12817
4.63M
            int bit = sp_is_bit_set(e, (unsigned int)i);
12818
12819
            /* 6.2. j += bit
12820
             *      Update count of consecutive 1 bits.
12821
             */
12822
4.63M
            j += bit;
12823
            /* 6.3. s += 1
12824
             *      Update count of squares required.
12825
             */
12826
4.63M
            s++;
12827
12828
            /* 6.4. if j == CT_INV_MOD_PRE_CNT or (bit == 0 and j > 0)
12829
             *      Check if max 1 bits or 0 and have seen at least one 1 bit.
12830
             */
12831
4.63M
            if ((j == CT_INV_MOD_PRE_CNT) || ((!bit) && (j > 0))) {
12832
                /* 6.4.1. s -= 1 - bit
                 *        bit is inverted here so that a terminating 0 bit has
                 *        its square deferred until after the multiplication.
                 */
12833
478k
                bit = 1 - bit;
12834
478k
                s -= bit;
12835
                /* 6.4.2. For s downto 1
12836
                 *        Do s squares.
12837
                 */
12838
5.07M
                for (; (err == MP_OKAY) && (s > 0); s--) {
12839
                    /* 6.4.2.1. t = (t ^ 2) mod m */
12840
4.59M
                    err = sp_sqr(t, t);
12841
4.59M
                    if (err == MP_OKAY) {
12842
4.59M
                        err = _sp_mont_red(t, m, mp, 0);
12843
4.59M
                    }
12844
4.59M
                }
12845
                /* 6.4.3. s = 1 - bit */
12846
478k
                s = bit;
12847
12848
                /* 6.4.4. t = (t * pre[j-1]) mod m */
12849
478k
                if (err == MP_OKAY) {
12850
478k
                    err = sp_mul(t, pre[j-1], t);
12851
478k
                }
12852
478k
                if (err == MP_OKAY) {
12853
478k
                    err = _sp_mont_red(t, m, mp, 0);
12854
478k
                }
12855
                /* 6.4.5. j = 0
12856
                 *        Reset number of 1 bits seen.
12857
                 */
12858
478k
                j = 0;
12859
478k
            }
12860
4.63M
        }
12861
17.2k
    }
12862
17.4k
    if (err == MP_OKAY) {
12863
        /* 7. For s downto 1
12864
         *    Do s squares - total remaining. */
12865
52.8k
        for (; (err == MP_OKAY) && (s > 0); s--) {
12866
            /* 7.1. t = (t ^ 2) mod m */
12867
35.7k
            err = sp_sqr(t, t);
12868
35.7k
            if (err == MP_OKAY) {
12869
35.6k
                err = _sp_mont_red(t, m, mp, 0);
12870
35.6k
            }
12871
35.7k
        }
12872
17.1k
    }
12873
17.4k
    if (err == MP_OKAY) {
12874
        /* 8. If j > 0 then r = (t * pre[j-1]) mod m
         *    A run of 1 bits reached the bottom of the exponent.
         */
12875
17.1k
        if (j > 0) {
12876
16.5k
            err = sp_mul(t, pre[j-1], r);
12877
16.5k
            if (err == MP_OKAY) {
12878
16.5k
                err = _sp_mont_red(r, m, mp, 0);
12879
16.5k
            }
12880
16.5k
        }
12881
        /* 9. Else r = t */
12882
593
        else {
12883
593
            _sp_copy(t, r);
12884
593
        }
12885
17.1k
    }
12886
12887
17.4k
#ifndef WOLFSSL_SP_NO_MALLOC
12888
17.4k
    FREE_DYN_SP_INT_ARRAY(pre, NULL);
12889
#else
12890
    FREE_SP_INT_ARRAY(pre, NULL);
12891
#endif
12892
17.4k
    return err;
12893
17.4k
}
12894
12895
/* Calculates the multiplicative inverse in the field - constant time.
12896
 *
12897
 * Modulus (m) must be a prime and greater than 2.
12898
 * For prime m, inv = a ^ (m-2) mod m as 1 = a ^ (m-1) mod m.
12899
 *
12900
 * @param  [in]   a   SP integer, Montgomery form, to find inverse of.
12901
 * @param  [in]   m   SP integer this is the modulus.
12902
 * @param  [out]  r   SP integer to hold result.
12903
 * @param  [in]   mp  SP integer digit that is the bottom digit of inv(-m).
12904
 *
12905
 * @return  MP_OKAY on success.
12906
 * @return  MP_VAL when a, m or r is NULL; a is 0; m is less than 3; m is too
 *          big for the double-width temporaries or r is too small to hold a
 *          value the size of m.
12907
 * @return  MP_MEM when dynamic memory allocation fails.
12908
 */
12909
int sp_invmod_mont_ct(const sp_int* a, const sp_int* m, sp_int* r,
12910
    sp_int_digit mp)
12911
17.4k
{
12912
17.4k
    int err = MP_OKAY;
12913
12914
    /* Validate parameters. */
12915
17.4k
    if ((a == NULL) || (m == NULL) || (r == NULL)) {
12916
0
        err = MP_VAL;
12917
0
    }
12918
    /* Ensure m is not too big - twice its digit count must fit in an SP int. */
12919
17.4k
    else if (m->used * 2 >= SP_INT_DIGITS) {
12920
0
        err = MP_VAL;
12921
0
    }
12922
    /* Check that r can hold a result with as many digits as the modulus. */
12923
17.4k
    else if (m->used > r->size) {
12924
14
        err = MP_VAL;
12925
14
    }
12926
12927
    /* Reject values with no valid result: a == 0 has no inverse, and a
     * modulus of 0, 1 or 2 is not a prime greater than 2.
     */
12928
17.4k
    if ((err == MP_OKAY) && (sp_iszero(a) || sp_iszero(m) ||
12929
17.4k
            ((m->used == 1) && (m->dp[0] < 3)))) {
12930
31
        err = MP_VAL;
12931
31
    }
12932
12933
17.4k
    if (err == MP_OKAY) {
12934
        /* Do operation. */
12935
17.4k
        err = _sp_invmod_mont_ct(a, m, r, mp);
12936
17.4k
    }
12937
12938
17.4k
    return err;
12939
17.4k
}
12940
12941
#endif /* WOLFSSL_SP_INVMOD_MONT_CT */
12942
12943
12944
/**************************
12945
 * Exponentiation functions
12946
 **************************/
12947
12948
#if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY) && \
12949
    !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || !defined(NO_DH) || \
12950
    defined(OPENSSL_ALL)
12951
12952
#ifndef WC_PROTECT_ENCRYPTED_MEM
12953
12954
/* Internal. Exponentiates b to the power of e modulo m into r: r = b ^ e mod m
12955
 * Process the exponent one bit at a time.
12956
 * Is constant time and, unless WC_NO_CACHE_RESISTANT is defined, cache attack
 * resistant.
12957
 *
12958
 * Algorithm:
12959
 *  b: base, e: exponent, m: modulus, r: result, bits: #bits to use
12960
 *  1. s = 0
12961
 *  2. t[0] = b mod m.
12962
 *  3. t[1] = t[0]
12963
 *  4. For i in (bits-1)...0
12964
 *   4.1. t[s] = t[s] ^ 2
12965
 *   4.2. y = e[i]
12966
 *   4.3  j = y & s
12967
 *   4.4  s = s | y
12968
 *   4.5. t[j] = t[j] * b
12969
 *  5. r = t[1]
12970
 *
12971
 * @param  [in]   b     SP integer that is the base.
12972
 * @param  [in]   e     SP integer that is the exponent.
12973
 * @param  [in]   bits  Number of bits in exponent to use. May be greater than
12974
 *                      count of bits in e.
12975
 * @param  [in]   m     SP integer that is the modulus.
12976
 * @param  [out]  r     SP integer to hold result.
12977
 *
12978
 * @return  MP_OKAY on success.
12979
 * @return  MP_MEM when dynamic memory allocation fails.
12980
 */
12981
static int _sp_exptmod_ex(const sp_int* b, const sp_int* e, int bits,
12982
    const sp_int* m, sp_int* r)
12983
79.3k
{
12984
79.3k
    int i;
12985
79.3k
    int err = MP_OKAY;
12986
79.3k
    int done = 0;
12987
    /* 1. s = 0
     *    Becomes 1 once the first 1 bit of the exponent has been processed.
     */
12988
79.3k
    int s = 0;
12989
#ifdef WC_NO_CACHE_RESISTANT
12990
    DECL_SP_INT_ARRAY(t, 2 * m->used + 1, 2);
12991
#else
12992
79.3k
    DECL_SP_INT_ARRAY(t, 2 * m->used + 1, 3);
12993
79.3k
#endif
12994
12995
    /* Allocate temporaries. */
12996
#ifdef WC_NO_CACHE_RESISTANT
12997
    ALLOC_SP_INT_ARRAY(t, 2 * m->used + 1, 2, err, NULL);
12998
#else
12999
    /* Working SP int needed when cache resistant. */
13000
79.3k
    ALLOC_SP_INT_ARRAY(t, 2U * m->used + 1U, 3, err, NULL);
13001
79.3k
#endif
13002
79.3k
    if (err == MP_OKAY) {
13003
        /* Initialize temporaries. */
13004
79.2k
        _sp_init_size(t[0], (sp_size_t)(m->used * 2 + 1));
13005
79.2k
        _sp_init_size(t[1], (sp_size_t)(m->used * 2 + 1));
13006
79.2k
    #ifndef WC_NO_CACHE_RESISTANT
13007
79.2k
        _sp_init_size(t[2], (sp_size_t)(m->used * 2 + 1));
13008
79.2k
    #endif
13009
13010
        /* 2. t[0] = b mod m
13011
         * Ensure base is less than modulus - set fake working value to base.
13012
         */
13013
79.2k
        if (_sp_cmp_abs(b, m) != MP_LT) {
13014
69
            err = sp_mod(b, m, t[0]);
13015
            /* Handle base == modulus. */
13016
69
            if ((err == MP_OKAY) && sp_iszero(t[0])) {
13017
8
                _sp_set(r, 0);
13018
8
                done = 1;
13019
8
            }
13020
69
        }
13021
79.1k
        else {
13022
            /* Copy base into working variable. */
13023
79.1k
            _sp_copy(b, t[0]);
13024
79.1k
        }
13025
79.2k
    }
13026
13027
79.3k
    if ((!done) && (err == MP_OKAY)) {
13028
        /* 3. t[1] = t[0]
13029
         *    Set real working value to base.
13030
         */
13031
79.2k
        _sp_copy(t[0], t[1]);
13032
13033
        /* 4. For i in (bits-1)...0 */
13034
6.09M
        for (i = bits - 1; (err == MP_OKAY) && (i >= 0); i--) {
13035
#ifdef WC_NO_CACHE_RESISTANT
13036
            /* 4.1. t[s] = t[s] ^ 2 */
13037
            err = sp_sqrmod(t[s], m, t[s]);
13038
            if (err == MP_OKAY) {
13039
                /* 4.2. y = e[i] */
13040
                int y = (e->dp[i >> SP_WORD_SHIFT] >> (i & SP_WORD_MASK)) & 1;
13041
                /* 4.3. j = y & s */
13042
                int j = y & s;
13043
                /* 4.4  s = s | y */
13044
                s |= y;
13045
                /* 4.5. t[j] = t[j] * b */
13046
                err = _sp_mulmod(t[j], b, m, t[j]);
13047
            }
13048
#else
13049
            /* 4.1. t[s] = t[s] ^ 2
             *      t[s] is copied into t[2], squared there and copied back.
             *      The address of t[s] is built by masking both candidate
             *      addresses with sp_off_on_addr entries (table defined
             *      elsewhere; presumably all-zeros/all-ones masks) rather
             *      than by indexing t directly.
             */
13050
6.01M
            _sp_copy((sp_int*)(((size_t)t[0] & sp_off_on_addr[s^1]) +
13051
6.01M
                               ((size_t)t[1] & sp_off_on_addr[s  ])),
13052
6.01M
                     t[2]);
13053
6.01M
            err = sp_sqrmod(t[2], m, t[2]);
13054
6.01M
            _sp_copy(t[2],
13055
6.01M
                     (sp_int*)(((size_t)t[0] & sp_off_on_addr[s^1]) +
13056
6.01M
                               ((size_t)t[1] & sp_off_on_addr[s  ])));
13057
13058
6.01M
            if (err == MP_OKAY) {
13059
                /* 4.2. y = e[i] */
13060
6.01M
                int y = (int)((e->dp[i >> SP_WORD_SHIFT] >> (i & (int)SP_WORD_MASK)) & 1);
13061
                /* 4.3. j = y & s
                 *      Uses s from before this bit: the first 1 bit is already
                 *      accounted for by t[1] = b, so its multiply goes to the
                 *      fake value t[0].
                 */
13062
6.01M
                int j = y & s;
13063
                /* 4.4  s = s | y */
13064
6.01M
                s |= y;
13065
                /* 4.5. t[j] = t[j] * b */
13066
6.01M
                _sp_copy((sp_int*)(((size_t)t[0] & sp_off_on_addr[j^1]) +
13067
6.01M
                                   ((size_t)t[1] & sp_off_on_addr[j  ])),
13068
6.01M
                         t[2]);
13069
6.01M
                err = _sp_mulmod(t[2], b, m, t[2]);
13070
6.01M
                _sp_copy(t[2],
13071
6.01M
                         (sp_int*)(((size_t)t[0] & sp_off_on_addr[j^1]) +
13072
6.01M
                                   ((size_t)t[1] & sp_off_on_addr[j  ])));
13073
6.01M
            }
13074
6.01M
#endif
13075
6.01M
        }
13076
79.2k
    }
13077
79.3k
    if ((!done) && (err == MP_OKAY)) {
13078
        /* 5. r = t[1] */
13079
79.0k
        _sp_copy(t[1], r);
13080
79.0k
    }
13081
13082
79.3k
    FREE_SP_INT_ARRAY(t, NULL);
13083
79.3k
    return err;
13084
79.3k
}
13085
13086
#else
13087
13088
/* Internal. Exponentiates b to the power of e modulo m into r: r = b ^ e mod m
13089
 * Process the exponent one bit at a time using a Montgomery powering ladder;
 * values are kept in normal (not Montgomery) form.
13090
 * Is constant time and cache attack resistant.
13091
 *
13092
 * Based on work by Marc Joye, Sung-Ming Yen, "The Montgomery Powering Ladder",
13093
 * Cryptographic Hardware and Embedded Systems, CHES 2002
13094
 *
13095
 * Algorithm:
13096
 *  b: base, e: exponent, m: modulus, r: result, bits: #bits to use
13097
 *  1. t[1] = b mod m.
13098
 *  2. t[0] = 1
13099
 *  3. For i in (bits-1)...0
13100
 *   3.1. y = e[i]
13101
 *   3.2. t[2] = t[0] * t[1]
13102
 *   3.3. t[3] = t[y] ^ 2
13103
 *   3.4. t[y] = t[3], t[y^1] = t[2]
13104
 *  4. r = t[0]
13105
 *
13106
 * @param  [in]   b     SP integer that is the base.
13107
 * @param  [in]   e     SP integer that is the exponent.
13108
 * @param  [in]   bits  Number of bits in exponent to use. May be greater than
13109
 *                      count of bits in e.
13110
 * @param  [in]   m     SP integer that is the modulus.
13111
 * @param  [out]  r     SP integer to hold result.
13112
 *
13113
 * @return  MP_OKAY on success.
13114
 * @return  MP_MEM when dynamic memory allocation fails.
13115
 */
13116
static int _sp_exptmod_ex(const sp_int* b, const sp_int* e, int bits,
13117
    const sp_int* m, sp_int* r)
13118
{
13119
    int err = MP_OKAY;
13120
    int done = 0;
13121
    DECL_SP_INT_ARRAY(t, m->used * 2 + 1, 4);
13122
13123
    /* Allocate temporaries. */
13124
    ALLOC_SP_INT_ARRAY(t, m->used * 2 + 1, 4, err, NULL);
13125
    if (err == MP_OKAY) {
13126
        /* Initialize temporaries. */
13127
        _sp_init_size(t[0], m->used * 2 + 1);
13128
        _sp_init_size(t[1], m->used * 2 + 1);
13129
        _sp_init_size(t[2], m->used * 2 + 1);
13130
        _sp_init_size(t[3], m->used * 2 + 1);
13131
13132
        /* 1. t[1] = b mod m
         *    Ensure base is less than modulus.
         */
13133
        if (_sp_cmp_abs(b, m) != MP_LT) {
13134
            err = sp_mod(b, m, t[1]);
13135
            /* Handle base == modulus. */
13136
            if ((err == MP_OKAY) && sp_iszero(t[1])) {
13137
                _sp_set(r, 0);
13138
                done = 1;
13139
            }
13140
        }
13141
        else {
13142
            /* Copy base into working variable. */
13143
            err = sp_copy(b, t[1]);
13144
        }
13145
    }
13146
13147
    if ((!done) && (err == MP_OKAY)) {
13148
        int i;
13149
13150
        /* 2. t[0] = 1 */
13151
        _sp_set(t[0], 1);
13152
13153
        /* 3. For i in (bits-1)...0 */
13154
        for (i = bits - 1; (err == MP_OKAY) && (i >= 0); i--) {
13155
            /* 3.1. y = e[i] */
13156
            int y = (e->dp[i >> SP_WORD_SHIFT] >> (i & SP_WORD_MASK)) & 1;
13157
13158
            /* 3.2. t[2] = t[0] * t[1]
             *      Computed for every bit, whatever the value of y.
             */
13159
            err = sp_mulmod(t[0], t[1], m, t[2]);
13160
            /* 3.3. t[3] = t[y] ^ 2
             *      The address of t[y] is built by masking both candidate
             *      addresses with sp_off_on_addr entries (table defined
             *      elsewhere) rather than by indexing t directly.
             */
13161
            if (err == MP_OKAY) {
13162
                _sp_copy((sp_int*)(((size_t)t[0] & sp_off_on_addr[y^1]) +
13163
                                   ((size_t)t[1] & sp_off_on_addr[y  ])),
13164
                         t[3]);
13165
                err = sp_sqrmod(t[3], m, t[3]);
13166
            }
13167
            /* 3.4. t[y] = t[3], t[y^1] = t[2]
             *      NOTE(review): _sp_copy_2_ct is defined elsewhere; presumably
             *      it stores both results without branching on y - confirm.
             */
13168
            if (err == MP_OKAY) {
13169
                _sp_copy_2_ct(t[2], t[3], t[0], t[1], y, m->used);
13170
            }
13171
        }
13172
    }
13173
    if ((!done) && (err == MP_OKAY)) {
13174
        /* 4. r = t[0] */
13175
        err = sp_copy(t[0], r);
13176
    }
13177
13178
    FREE_SP_INT_ARRAY(t, NULL);
13179
    return err;
13180
}
13181
13182
#endif /* WC_PROTECT_ENCRYPTED_MEM */
13183
13184
#endif
13185
13186
#if (defined(WOLFSSL_SP_MATH_ALL) && ((!defined(WOLFSSL_RSA_VERIFY_ONLY) && \
13187
    !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || !defined(NO_DH))) || \
13188
    defined(OPENSSL_ALL)
13189
#ifndef WC_NO_HARDEN
13190
#if !defined(WC_NO_CACHE_RESISTANT)
13191
13192
#ifndef WC_PROTECT_ENCRYPTED_MEM
13193
13194
/* Internal. Exponentiates b to the power of e modulo m into r: r = b ^ e mod m
13195
 * Process the exponent one bit at a time with base in Montgomery form.
13196
 * Is constant time and cache attack resistant.
13197
 *
13198
 * Algorithm:
13199
 *  b: base, e: exponent, m: modulus, r: result, bits: #bits to use
13200
 *  1. t[0] = b mod m.
13201
 *  2. s = 0
13202
 *  3. t[0] = ToMont(t[0])
13203
 *  4. t[1] = t[0]
13204
 *  5. bm = t[0]
13205
 *  6. For i in (bits-1)...0
13206
 *   6.1. t[s] = t[s] ^ 2
13207
 *   6.2. y = e[i]
13208
 *   6.3  j = y & s
13209
 *   6.4  s = s | y
13210
 *   6.5. t[j] = t[j] * bm
13211
 *  7. t[1] = FromMont(t[1])
13212
 *  8. r = t[1]
13213
 *
13214
 * @param  [in]   b     SP integer that is the base.
13215
 * @param  [in]   e     SP integer that is the exponent.
13216
 * @param  [in]   bits  Number of bits in exponent to use. May be greater than
13217
 *                      count of bits in e.
13218
 * @param  [in]   m     SP integer that is the modulus.
13219
 * @param  [out]  r     SP integer to hold result.
13220
 *
13221
 * @return  MP_OKAY on success.
13222
 * @return  MP_MEM when dynamic memory allocation fails.
13223
 */
13224
static int _sp_exptmod_mont_ex(const sp_int* b, const sp_int* e, int bits,
13225
    const sp_int* m, sp_int* r)
13226
679k
{
13227
679k
    int err = MP_OKAY;
13228
679k
    int done = 0;
13229
679k
    DECL_SP_INT_ARRAY(t, m->used * 2 + 1, 4);
13230
13231
    /* Allocate temporaries. */
13232
679k
    ALLOC_SP_INT_ARRAY(t, m->used * 2U + 1U, 4, err, NULL);
13233
679k
    if (err == MP_OKAY) {
13234
        /* Initialize temporaries. */
13235
679k
        _sp_init_size(t[0], (sp_size_t)(m->used * 2 + 1));
13236
679k
        _sp_init_size(t[1], (sp_size_t)(m->used * 2 + 1));
13237
679k
        _sp_init_size(t[2], (sp_size_t)(m->used * 2 + 1));
13238
679k
        _sp_init_size(t[3], (sp_size_t)(m->used * 2 + 1));
13239
13240
        /* 1. t[0] = b mod m
         *    Ensure base is less than modulus.
         */
13241
679k
        if (_sp_cmp_abs(b, m) != MP_LT) {
13242
0
            err = sp_mod(b, m, t[0]);
13243
            /* Handle base == modulus. */
13244
0
            if ((err == MP_OKAY) && sp_iszero(t[0])) {
13245
0
                _sp_set(r, 0);
13246
0
                done = 1;
13247
0
            }
13248
0
        }
13249
679k
        else {
13250
            /* Copy base into working variable. */
13251
679k
            _sp_copy(b, t[0]);
13252
679k
        }
13253
679k
    }
13254
13255
679k
    if ((!done) && (err == MP_OKAY)) {
13256
679k
        int i;
13257
        /* 2. s = 0
         *    Becomes 1 once the first 1 bit of the exponent has been processed.
         */
13258
679k
        int s = 0;
13259
679k
        sp_int_digit mp;
13260
13261
        /* Calculate Montgomery multiplier for reduction. */
13262
679k
        _sp_mont_setup(m, &mp);
13263
        /* 3. t[0] = ToMont(t[0])
13264
         *    Convert base to Montgomery form - as fake working value.
13265
         *    t[1] temporarily holds the value from sp_mont_norm(); it is
         *    overwritten in step 4.
         */
13266
679k
        err = sp_mont_norm(t[1], m);
13267
679k
        if (err == MP_OKAY) {
13268
679k
            err = sp_mul(t[0], t[1], t[0]);
13269
679k
        }
13270
679k
        if (err == MP_OKAY) {
13271
            /* t[0] = t[0] mod m, temporary size has to be bigger than t[0]. */
13272
679k
            err = _sp_div(t[0], m, NULL, t[0], t[0]->used + 1U);
13273
679k
        }
13274
679k
        if (err == MP_OKAY) {
13275
            /* 4. t[1] = t[0]
13276
             *    Set real working value to base.
13277
             */
13278
679k
            _sp_copy(t[0], t[1]);
13279
            /* 5. bm = t[0]. */
13280
679k
            _sp_copy(t[0], t[2]);
13281
679k
        }
13282
13283
        /* 6. For i in (bits-1)...0 */
13284
62.6M
        for (i = bits - 1; (err == MP_OKAY) && (i >= 0); i--) {
13285
            /* 6.1. t[s] = t[s] ^ 2
             *      t[s] is copied into t[3], squared and reduced there, then
             *      copied back. The address of t[s] is built by masking both
             *      candidate addresses with sp_off_on_addr entries (table
             *      defined elsewhere) rather than by indexing t directly.
             */
13286
62.0M
            _sp_copy((sp_int*)(((size_t)t[0] & sp_off_on_addr[s^1]) +
13287
62.0M
                               ((size_t)t[1] & sp_off_on_addr[s  ])),
13288
62.0M
                     t[3]);
13289
62.0M
            err = sp_sqr(t[3], t[3]);
13290
62.0M
            if (err == MP_OKAY) {
13291
62.0M
                err = _sp_mont_red(t[3], m, mp, 0);
13292
62.0M
            }
13293
62.0M
            _sp_copy(t[3],
13294
62.0M
                     (sp_int*)(((size_t)t[0] & sp_off_on_addr[s^1]) +
13295
62.0M
                               ((size_t)t[1] & sp_off_on_addr[s  ])));
13296
13297
62.0M
            if (err == MP_OKAY) {
13298
                /* 6.2. y = e[i] */
13299
62.0M
                int y = (int)((e->dp[i >> SP_WORD_SHIFT] >> (i & (int)SP_WORD_MASK)) & 1);
13300
                /* 6.3  j = y & s
                 *      Uses s from before this bit: the first 1 bit is already
                 *      accounted for by t[1] = bm, so its multiply goes to the
                 *      fake value t[0].
                 */
13301
62.0M
                int j = y & s;
13302
                /* 6.4  s = s | y */
13303
62.0M
                s |= y;
13304
13305
                /* 6.5. t[j] = t[j] * bm */
13306
62.0M
                _sp_copy((sp_int*)(((size_t)t[0] & sp_off_on_addr[j^1]) +
13307
62.0M
                                   ((size_t)t[1] & sp_off_on_addr[j  ])),
13308
62.0M
                         t[3]);
13309
62.0M
                err = sp_mul(t[3], t[2], t[3]);
13310
62.0M
                if (err == MP_OKAY) {
13311
62.0M
                    err = _sp_mont_red(t[3], m, mp, 0);
13312
62.0M
                }
13313
62.0M
                _sp_copy(t[3],
13314
62.0M
                         (sp_int*)(((size_t)t[0] & sp_off_on_addr[j^1]) +
13315
62.0M
                                   ((size_t)t[1] & sp_off_on_addr[j  ])));
13316
62.0M
            }
13317
62.0M
        }
13318
679k
        if (err == MP_OKAY) {
13319
            /* 7. t[1] = FromMont(t[1]) */
13320
678k
            err = _sp_mont_red(t[1], m, mp, 0);
13321
            /* Reduction implementation returns number to range: 0..m-1. */
13322
678k
        }
13323
679k
    }
13324
679k
    if ((!done) && (err == MP_OKAY)) {
13325
        /* 8. r = t[1] */
13326
678k
        _sp_copy(t[1], r);
13327
678k
    }
13328
13329
679k
    FREE_SP_INT_ARRAY(t, NULL);
13330
679k
    return err;
13331
679k
}
13332
13333
#else
13334
13335
/* Internal. Exponentiates b to the power of e modulo m into r: r = b ^ e mod m
13336
 * Process the exponent one bit at a time with base in Montgomery form.
13337
 * Is constant time and cache attack resistant.
13338
 *
13339
 * Based on work by Marc Joye, Sung-Ming Yen, "The Montgomery Powering Ladder",
13340
 * Cryptographic Hardware and Embedded Systems, CHES 2002
13341
 *
13342
 * Algorithm:
13343
 *  b: base, e: exponent, m: modulus, r: result, bits: #bits to use
13344
 *  1. t[1] = b mod m.
13345
 *  2. t[0] = ToMont(1)
13346
 *  3. t[1] = ToMont(t[1])
13347
 *  4. For i in (bits-1)...0
13348
 *   4.1. y = e[i]
13349
 *   4.2. t[2] = t[0] * t[1]
13350
 *   4.3. t[3] = t[y] ^ 2
13351
 *   4.4. t[y] = t[3], t[y^1] = t[2]
13352
 *  5. t[0] = FromMont(t[0])
13353
 *  6. r = t[0]
13354
 *
13355
 * @param  [in]   b     SP integer that is the base.
13356
 * @param  [in]   e     SP integer that is the exponent.
13357
 * @param  [in]   bits  Number of bits in exponent to use. May be greater than
13358
 *                      count of bits in e.
13359
 * @param  [in]   m     SP integer that is the modulus.
13360
 * @param  [out]  r     SP integer to hold result.
13361
 *
13362
 * @return  MP_OKAY on success.
13363
 * @return  MP_MEM when dynamic memory allocation fails.
13364
 */
13365
static int _sp_exptmod_mont_ex(const sp_int* b, const sp_int* e, int bits,
13366
    const sp_int* m, sp_int* r)
13367
{
13368
    int err = MP_OKAY;
13369
    int done = 0;
13370
    DECL_SP_INT_ARRAY(t, m->used * 2 + 1, 4);
13371
13372
    /* Allocate temporaries. */
13373
    ALLOC_SP_INT_ARRAY(t, m->used * 2 + 1, 4, err, NULL);
13374
    if (err == MP_OKAY) {
13375
        /* Initialize temporaries. */
13376
        _sp_init_size(t[0], m->used * 2 + 1);
13377
        _sp_init_size(t[1], m->used * 2 + 1);
13378
        _sp_init_size(t[2], m->used * 2 + 1);
13379
        _sp_init_size(t[3], m->used * 2 + 1);
13380
13381
        /* 1. t[1] = b mod m
         *    Ensure base is less than modulus.
         */
13382
        if (_sp_cmp_abs(b, m) != MP_LT) {
13383
            err = sp_mod(b, m, t[1]);
13384
            /* Handle base == modulus. */
13385
            if ((err == MP_OKAY) && sp_iszero(t[1])) {
13386
                _sp_set(r, 0);
13387
                done = 1;
13388
            }
13389
        }
13390
        else {
13391
            /* Copy base into working variable. */
13392
            err = sp_copy(b, t[1]);
13393
        }
13394
    }
13395
13396
    if ((!done) && (err == MP_OKAY)) {
13397
        int i;
13398
        sp_int_digit mp;
13399
13400
        /* Calculate Montgomery multiplier for reduction. */
13401
        _sp_mont_setup(m, &mp);
13402
        /* 2. t[0] = ToMont(1)
13403
         *    Calculate 1 in Montgomery form.
13404
         */
13405
        err = sp_mont_norm(t[0], m);
13406
        if (err == MP_OKAY) {
13407
            /* 3. t[1] = ToMont(t[1])
13408
             *    Convert base to Montgomery form.
13409
             */
13410
            err = sp_mulmod(t[1], t[0], m, t[1]);
13411
        }
13412
13413
        /* 4. For i in (bits-1)...0 */
13414
        for (i = bits - 1; (err == MP_OKAY) && (i >= 0); i--) {
13415
            /* 4.1. y = e[i] */
13416
            int y = (e->dp[i >> SP_WORD_SHIFT] >> (i & SP_WORD_MASK)) & 1;
13417
13418
            /* 4.2. t[2] = t[0] * t[1]
             *      Computed for every bit, whatever the value of y.
             */
13419
            err = sp_mul(t[0], t[1], t[2]);
13420
            if (err == MP_OKAY) {
13421
                err = _sp_mont_red(t[2], m, mp, 0);
13422
            }
13423
            /* 4.3. t[3] = t[y] ^ 2
             *      The address of t[y] is built by masking both candidate
             *      addresses with sp_off_on_addr entries (table defined
             *      elsewhere) rather than by indexing t directly.
             */
13424
            if (err == MP_OKAY) {
13425
                _sp_copy((sp_int*)(((size_t)t[0] & sp_off_on_addr[y^1]) +
13426
                                   ((size_t)t[1] & sp_off_on_addr[y  ])),
13427
                         t[3]);
13428
                err = sp_sqr(t[3], t[3]);
13429
            }
13430
            if (err == MP_OKAY) {
13431
                err = _sp_mont_red(t[3], m, mp, 0);
13432
            }
13433
            /* 4.4. t[y] = t[3], t[y^1] = t[2]
             *      NOTE(review): _sp_copy_2_ct is defined elsewhere; presumably
             *      it stores both results without branching on y - confirm.
             */
13434
            if (err == MP_OKAY) {
13435
                _sp_copy_2_ct(t[2], t[3], t[0], t[1], y, m->used);
13436
            }
13437
        }
13438
13439
        if (err == MP_OKAY) {
13440
            /* 5. t[0] = FromMont(t[0]) */
13441
            err = _sp_mont_red(t[0], m, mp, 0);
13442
            /* Reduction implementation returns number to range: 0..m-1. */
13443
        }
13444
    }
13445
    if ((!done) && (err == MP_OKAY)) {
13446
        /* 6. r = t[0] */
13447
        err = sp_copy(t[0], r);
13448
    }
13449
13450
    FREE_SP_INT_ARRAY(t, NULL);
13451
    return err;
13452
}
13453
13454
#endif /* WC_PROTECT_ENCRYPTED_MEM */
13455
13456
#else
13457
13458
#ifdef SP_ALLOC
13459
#define SP_ALLOC_PREDEFINED
13460
#endif
13461
/* Always allocate large array of sp_ints unless defined WOLFSSL_SP_NO_MALLOC */
13462
#define SP_ALLOC
13463
13464
/* Internal. Exponentiates b to the power of e modulo m into r: r = b ^ e mod m
13465
 * Creates a window of precalculated exponents with base in Montgomery form.
13466
 * Is constant time but NOT cache attack resistant.
13467
 *
13468
 * Algorithm:
13469
 *  b: base, e: exponent, m: modulus, r: result, bits: #bits to use
13470
 *  w: window size based on bits.
13471
 *  1. t[1] = b mod m.
13472
 *  2. t[0] = MontNorm(m) = ToMont(1)
13473
 *  3. t[1] = ToMont(t[1])
13474
 *  4. For i in 2..(2 ^ w) - 1
13475
 *   4.1 if i[0] == 0 then t[i] = t[i/2] ^ 2
13476
 *   4.2 if i[0] == 1 then t[i] = t[i-1] * t[1]
13477
 *  5. cb = w * (bits / w)
13478
 *  5. tr = t[e / (2 ^ cb)]
13479
 *  6. For i in cb..w
13480
 *   6.1. y = e[(i-1)..(i-w)]
13481
 *   6.2. tr = tr ^ (2 ^ w)
13482
 *   6.3. tr = tr * t[y]
13483
 *  7. tr = FromMont(tr)
13484
 *  8. r = tr
13485
 *
13486
 * @param  [in]   b     SP integer that is the base.
13487
 * @param  [in]   e     SP integer that is the exponent.
13488
 * @param  [in]   bits  Number of bits in exponent to use. May be greater than
13489
 *                      count of bits in e.
13490
 * @param  [in]   m     SP integer that is the modulus.
13491
 * @param  [out]  r     SP integer to hold result.
13492
 *
13493
 * @return  MP_OKAY on success.
13494
 * @return  MP_MEM when dynamic memory allocation fails.
13495
 */
13496
static int _sp_exptmod_mont_ex(const sp_int* b, const sp_int* e, int bits,
13497
    const sp_int* m, sp_int* r)
13498
{
13499
    int i;
13500
    int c;
13501
    int y;
13502
    int winBits;
13503
    int preCnt;
13504
    int err = MP_OKAY;
13505
    int done = 0;
13506
    sp_int_digit mask;
13507
    sp_int* tr = NULL;
13508
    DECL_SP_INT_ARRAY(t, m->used * 2 + 1, (1 << 6) + 1);
13509
13510
    /* Window bits based on number of pre-calculations versus number of loop
13511
     * calculations.
13512
     * Exponents for RSA and DH will result in 6-bit windows.
13513
     */
13514
    if (bits > 450) {
13515
        winBits = 6;
13516
    }
13517
    else if (bits <= 21) {
13518
        winBits = 1;
13519
    }
13520
    else if (bits <= 36) {
13521
        winBits = 3;
13522
    }
13523
    else if (bits <= 140) {
13524
        winBits = 4;
13525
    }
13526
    else {
13527
        winBits = 5;
13528
    }
13529
    /* An entry for each possible 0..2^winBits-1 value. */
13530
    preCnt = 1 << winBits;
13531
    /* Mask for calculating index into pre-computed table. */
13532
    mask = preCnt - 1;
13533
13534
    /* Allocate sp_ints for:
13535
     *  - pre-computation table
13536
     *  - temporary result
13537
     */
13538
    ALLOC_SP_INT_ARRAY(t, m->used * 2 + 1, preCnt + 1, err, NULL);
13539
    if (err == MP_OKAY) {
13540
        /* Set variable to use allocate memory. */
13541
        tr = t[preCnt];
13542
13543
        /* Initialize all allocated. */
13544
        for (i = 0; i < preCnt; i++) {
13545
            _sp_init_size(t[i], m->used * 2 + 1);
13546
        }
13547
        _sp_init_size(tr, m->used * 2 + 1);
13548
13549
        /* 1. t[1] = b mod m. */
13550
        if (_sp_cmp_abs(b, m) != MP_LT) {
13551
            err = sp_mod(b, m, t[1]);
13552
            /* Handle base == modulus. */
13553
            if ((err == MP_OKAY) && sp_iszero(t[1])) {
13554
                _sp_set(r, 0);
13555
                done = 1;
13556
            }
13557
        }
13558
        else {
13559
            /* Copy base into entry of table to contain b^1. */
13560
            _sp_copy(b, t[1]);
13561
        }
13562
    }
13563
13564
    if ((!done) && (err == MP_OKAY)) {
13565
        sp_int_digit mp;
13566
        sp_int_digit n;
13567
13568
        /* Calculate Montgomery multiplier for reduction. */
13569
        _sp_mont_setup(m, &mp);
13570
        /* 2. t[0] = MontNorm(m) = ToMont(1) */
13571
        err = sp_mont_norm(t[0], m);
13572
        if (err == MP_OKAY) {
13573
            /* 3. t[1] = ToMont(t[1]) */
13574
            err = sp_mul(t[1], t[0], t[1]);
13575
        }
13576
        if (err == MP_OKAY) {
13577
            /* t[1] = t[1] mod m, temporary size has to be bigger than t[1]. */
13578
            err = _sp_div(t[1], m, NULL, t[1], t[1]->used + 1);
13579
        }
13580
13581
        /* 4. For i in 2..(2 ^ w) - 1 */
13582
        for (i = 2; (i < preCnt) && (err == MP_OKAY); i++) {
13583
            /* 4.1 if i[0] == 0 then t[i] = t[i/2] ^ 2 */
13584
            if ((i & 1) == 0) {
13585
                err = sp_sqr(t[i/2], t[i]);
13586
            }
13587
            /* 4.2 if i[0] == 1 then t[i] = t[i-1] * t[1] */
13588
            else {
13589
                err = sp_mul(t[i-1], t[1], t[i]);
13590
            }
13591
            /* Montgomery reduce square or multiplication result. */
13592
            if (err == MP_OKAY) {
13593
                err = _sp_mont_red(t[i], m, mp, 0);
13594
            }
13595
        }
13596
13597
        if (err == MP_OKAY) {
13598
            /* 5. cb = w * (bits / w) */
13599
            i = (bits - 1) >> SP_WORD_SHIFT;
13600
            n = e->dp[i--];
13601
            /* Find top bit index in last word. */
13602
            c = bits & (SP_WORD_SIZE - 1);
13603
            if (c == 0) {
13604
                c = SP_WORD_SIZE;
13605
            }
13606
            /* Use as many bits from top to make remaining a multiple of window
13607
             * size.
13608
             */
13609
            if ((bits % winBits) != 0) {
13610
                c -= bits % winBits;
13611
            }
13612
            else {
13613
                c -= winBits;
13614
            }
13615
13616
            /* 5. tr = t[e / (2 ^ cb)] */
13617
            y = (int)(n >> c);
13618
            n <<= SP_WORD_SIZE - c;
13619
            /* 5. Copy table value for first window. */
13620
            _sp_copy(t[y], tr);
13621
13622
            /* 6. For i in cb..w */
13623
            for (; (i >= 0) || (c >= winBits); ) {
13624
                int j;
13625
13626
                /* 6.1. y = e[(i-1)..(i-w)] */
13627
                if (c == 0) {
13628
                    /* Bits up to end of digit */
13629
                    n = e->dp[i--];
13630
                    y = (int)(n >> (SP_WORD_SIZE - winBits));
13631
                    n <<= winBits;
13632
                    c = SP_WORD_SIZE - winBits;
13633
                }
13634
                else if (c < winBits) {
13635
                    /* Bits to end of digit and part of next */
13636
                    y = (int)(n >> (SP_WORD_SIZE - winBits));
13637
                    n = e->dp[i--];
13638
                    c = winBits - c;
13639
                    y |= (int)(n >> (SP_WORD_SIZE - c));
13640
                    n <<= c;
13641
                    c = SP_WORD_SIZE - c;
13642
                }
13643
                else {
13644
                    /* Bits from middle of digit */
13645
                    y = (int)((n >> (SP_WORD_SIZE - winBits)) & mask);
13646
                    n <<= winBits;
13647
                    c -= winBits;
13648
                }
13649
13650
                /* 6.2. tr = tr ^ (2 * w) */
13651
                for (j = 0; (j < winBits) && (err == MP_OKAY); j++) {
13652
                    err = sp_sqr(tr, tr);
13653
                    if (err == MP_OKAY) {
13654
                        err = _sp_mont_red(tr, m, mp, 0);
13655
                    }
13656
                }
13657
13658
                /* 6.3. tr = tr * t[y] */
13659
                if (err == MP_OKAY) {
13660
                    err = sp_mul(tr, t[y], tr);
13661
                }
13662
                if (err == MP_OKAY) {
13663
                    err = _sp_mont_red(tr, m, mp, 0);
13664
                }
13665
            }
13666
        }
13667
13668
        if (err == MP_OKAY) {
13669
            /* 7. tr = FromMont(tr) */
13670
            err = _sp_mont_red(tr, m, mp, 0);
13671
            /* Reduction implementation returns number to range: 0..m-1. */
13672
        }
13673
    }
13674
    if ((!done) && (err == MP_OKAY)) {
13675
        /* 8. r = tr */
13676
        _sp_copy(tr, r);
13677
    }
13678
13679
    FREE_SP_INT_ARRAY(t, NULL);
13680
    return err;
13681
}
13682
13683
#ifndef SP_ALLOC_PREDEFINED
13684
#undef SP_ALLOC
13685
#undef SP_ALLOC_PREDEFINED
13686
#endif
13687
13688
#endif /* !WC_NO_CACHE_RESISTANT */
13689
#endif /* !WC_NO_HARDEN */
13690
13691
/* w = Log2(SP_WORD_SIZE) - 1 */
13692
#if SP_WORD_SIZE == 8
13693
    #define EXP2_WINSIZE    2
13694
#elif SP_WORD_SIZE == 16
13695
    #define EXP2_WINSIZE    3
13696
#elif SP_WORD_SIZE == 32
13697
    #define EXP2_WINSIZE    4
13698
#elif SP_WORD_SIZE == 64
13699
2.45M
    #define EXP2_WINSIZE    5
13700
#else
13701
    #error "sp_exptmod_base_2: Unexpected SP_WORD_SIZE"
13702
#endif
13703
/* Mask is all bits in window set. */
13704
204k
#define EXP2_MASK           ((1 << EXP2_WINSIZE) - 1)
13705
13706
/* Internal. Exponentiates 2 to the power of e modulo m into r: r = 2 ^ e mod m
13707
 * Is constant time and cache attack resistant.
13708
 *
13709
 * Calculates value to make mod operations constant time except when
13710
 * WC_NO_HARDEN is defined or modulus fits in one word.
13711
 *
13712
 * Algorithm:
13713
 *  b: base, e: exponent, m: modulus, r: result, bits: #bits to use
13714
 *  w: window size based on #bits in word.
13715
 *  1. if Words(m) > 1 then tr = MontNorm(m) = ToMont(1)
13716
 *     else                 tr = 1
13717
 *  2. if Words(m) > 1 and HARDEN then a = m * (2 ^ (2^w))
13718
 *     else                            a = 0
13719
 *  3. cb = w * (bits / w)
13720
 *  4. y = e / (2 ^ cb)
13721
 *  5. tr = (tr * (2 ^ y) + a) mod m
13722
 *  6. For i in cb..w
13723
 *   6.1. y = e[(i-1)..(i-w)]
13724
 *   6.2. tr = tr ^ (2 ^ w)
13725
 *   6.3. tr = (tr * (2 ^ y) + a) mod m
13726
 *  7. if Words(m) > 1 then tr = FromMont(tr)
13727
 *  8. r = tr
13728
 *
13729
 * @param  [in]   e       SP integer that is the exponent.
13730
 * @param  [in]   digits  Number of digits in exponent to use. May be greater
13731
 *                        than count of digits in e.
13732
 * @param  [in]   m       SP integer that is the modulus.
13733
 * @param  [out]  r       SP integer to hold result.
13734
 *
13735
 * @return  MP_OKAY on success.
13736
 * @return  MP_MEM when dynamic memory allocation fails.
 *
 * Note: e->dp[digits - 1] is the first digit read, so digits needs to be at
 * least 1. NOTE(review): nothing in this function checks that - presumably
 * callers guarantee it; confirm.
13737
 */
13738
static int _sp_exptmod_base_2(const sp_int* e, int digits, const sp_int* m,
13739
    sp_int* r)
13740
9.28k
{
13741
9.28k
    int i = 0;
13742
9.28k
    int c = 0;
13743
9.28k
    int y;
13744
9.28k
    int err = MP_OKAY;
13745
9.28k
    sp_int_digit mp = 0;
13746
9.28k
    sp_int_digit n = 0;
13747
9.28k
#ifndef WC_NO_HARDEN
13748
9.28k
    sp_int* a = NULL;
13749
9.28k
    sp_int* tr = NULL;
13750
9.28k
    DECL_SP_INT_ARRAY(d, m->used * 2 + 1, 2);
13751
#else
13752
    DECL_SP_INT(tr, m->used * 2 + 1);
13753
#endif
13754
9.28k
    /* Montgomery form is only used when the modulus has more than one digit. */
    int useMont = (m->used > 1);
13755
13756
#if 0
13757
    sp_print_int(2, "a");
13758
    sp_print(e, "b");
13759
    sp_print(m, "m");
13760
#endif
13761
13762
9.28k
#ifndef WC_NO_HARDEN
13763
    /* Allocate sp_ints for:
13764
     *  - constant time add value for mod operation
13765
     *  - temporary result
13766
     */
13767
9.28k
    ALLOC_SP_INT_ARRAY(d, m->used * 2U + 1U, 2, err, NULL);
13768
#else
13769
    /* Allocate sp_int for temporary result. */
13770
    ALLOC_SP_INT(tr, m->used * 2U + 1U, err, NULL);
13771
#endif
13772
9.28k
    if (err == MP_OKAY) {
13773
9.27k
    #ifndef WC_NO_HARDEN
13774
9.27k
        /* Set variables to use allocated memory. */
        a  = d[0];
13775
9.27k
        tr = d[1];
13776
13777
9.27k
        _sp_init_size(a, (sp_size_t)(m->used * 2 + 1));
13778
9.27k
    #endif
13779
9.27k
        _sp_init_size(tr, (sp_size_t)(m->used * 2 + 1));
13780
13781
9.27k
    }
13782
13783
9.28k
    if ((err == MP_OKAY) && useMont) {
13784
        /* Calculate Montgomery multiplier for reduction. */
13785
8.80k
        _sp_mont_setup(m, &mp);
13786
8.80k
    }
13787
9.28k
    if (err == MP_OKAY) {
13788
        /* 1. if Words(m) > 1 then tr = MontNorm(m) = ToMont(1)
13789
         *    else                 tr = 1
13790
         */
13791
9.27k
        if (useMont) {
13792
            /* Calculate Montgomery normalizer for modulus, i.e. 1 in Montgomery
13793
             * form.
13794
             */
13795
8.80k
            err = sp_mont_norm(tr, m);
13796
8.80k
        }
13797
472
        else {
13798
             /* For single word modulus don't use Montgomery form. */
13799
472
            err = sp_set(tr, 1);
13800
472
        }
13801
9.27k
    }
13802
    /* 2. if Words(m) > 1 and HARDEN then a = m * (2 ^ (2^w))
13803
     *    else                            a = 0
13804
     */
13805
9.28k
#ifndef WC_NO_HARDEN
13806
9.28k
    if ((err == MP_OKAY) && useMont) {
13807
8.80k
        err = sp_mul_2d(m, 1 << EXP2_WINSIZE, a);
13808
8.80k
    }
13809
9.28k
#endif
13810
13811
9.28k
    if (err == MP_OKAY) {
13812
        /* 3. cb = w * (bits / w)
         * Start with the top digit to use. From here n holds the unused bits
         * of the current digit at the top of the word and c counts them. */
13813
9.27k
        i = digits - 1;
13814
9.27k
        n = e->dp[i--];
13815
9.27k
        c = SP_WORD_SIZE;
13816
9.27k
    #if EXP2_WINSIZE != 1
13817
9.27k
        /* Take the odd top bits first so the rest is a multiple of w. */
        c -= (digits * SP_WORD_SIZE) % EXP2_WINSIZE;
13818
9.27k
        if (c != SP_WORD_SIZE) {
13819
            /* 4. y = e / (2 ^ cb) */
13820
9.23k
            y = (int)(n >> c);
13821
9.23k
            n <<= SP_WORD_SIZE - c;
13822
9.23k
        }
13823
38
        else
13824
38
    #endif
13825
38
        {
13826
            /* 4. y = e / (2 ^ cb) - no odd bits so use a full window. */
13827
38
            y = (int)((n >> (SP_WORD_SIZE - EXP2_WINSIZE)) & EXP2_MASK);
13828
38
            n <<= EXP2_WINSIZE;
13829
38
            c -= EXP2_WINSIZE;
13830
38
        }
13831
13832
        /* 5. tr = (tr * (2 ^ y) + a) mod m */
13833
9.27k
        err = sp_mul_2d(tr, y, tr);
13834
9.27k
    }
13835
9.28k
#ifndef WC_NO_HARDEN
13836
9.28k
    if ((err == MP_OKAY) && useMont) {
13837
        /* Add value to make mod operation constant time. */
13838
8.80k
        err = sp_add(tr, a, tr);
13839
8.80k
    }
13840
9.28k
#endif
13841
9.28k
    if (err == MP_OKAY) {
13842
9.27k
        err = sp_mod(tr, m, tr);
13843
9.27k
    }
13844
    /* 6. For i in cb..w */
13845
221k
    for (; (err == MP_OKAY) && ((i >= 0) || (c >= EXP2_WINSIZE)); ) {
13846
212k
        int j;
13847
13848
        /* 6.1. y = e[(i-1)..(i-w)] */
13849
212k
        if (c == 0) {
13850
            /* Bits from next digit. */
13851
417
            n = e->dp[i--];
13852
417
            y = (int)(n >> (SP_WORD_SIZE - EXP2_WINSIZE));
13853
417
            n <<= EXP2_WINSIZE;
13854
417
            c = SP_WORD_SIZE - EXP2_WINSIZE;
13855
417
        }
13856
211k
    #if (EXP2_WINSIZE != 1) && (EXP2_WINSIZE != 2) && (EXP2_WINSIZE != 4)
13857
211k
        else if (c < EXP2_WINSIZE) {
13858
            /* Bits to end of digit and part of next */
13859
7.56k
            y = (int)(n >> (SP_WORD_SIZE - EXP2_WINSIZE));
13860
7.56k
            n = e->dp[i--];
13861
7.56k
            c = EXP2_WINSIZE - c;
13862
7.56k
            y |= (int)(n >> (SP_WORD_SIZE - c));
13863
7.56k
            n <<= c;
13864
7.56k
            c = SP_WORD_SIZE - c;
13865
7.56k
        }
13866
204k
    #endif
13867
204k
        else {
13868
            /* Bits from middle of digit */
13869
204k
            y = (int)((n >> (SP_WORD_SIZE - EXP2_WINSIZE)) & EXP2_MASK);
13870
204k
            n <<= EXP2_WINSIZE;
13871
204k
            c -= EXP2_WINSIZE;
13872
204k
        }
13873
13874
        /* 6.2. tr = tr ^ (2 ^ w): square once per window bit. */
13875
1.27M
        for (j = 0; (j < EXP2_WINSIZE) && (err == MP_OKAY); j++) {
13876
1.06M
            err = sp_sqr(tr, tr);
13877
1.06M
            if (err == MP_OKAY) {
13878
1.06M
                if (useMont) {
13879
964k
                    err = _sp_mont_red(tr, m, mp, 0);
13880
964k
                }
13881
96.2k
                else {
13882
96.2k
                    err = sp_mod(tr, m, tr);
13883
96.2k
                }
13884
1.06M
            }
13885
1.06M
        }
13886
13887
        /* 6.3. tr = (tr * (2 ^ y) + a) mod m */
13888
212k
        if (err == MP_OKAY) {
13889
212k
            err = sp_mul_2d(tr, y, tr);
13890
212k
        }
13891
212k
    #ifndef WC_NO_HARDEN
13892
212k
        if ((err == MP_OKAY) && useMont) {
13893
            /* Add value to make mod operation constant time. */
13894
192k
            err = sp_add(tr, a, tr);
13895
192k
        }
13896
212k
    #endif
13897
212k
        if (err == MP_OKAY) {
13898
            /* Reduce current result by modulus. */
13899
212k
            err = sp_mod(tr, m, tr);
13900
212k
        }
13901
212k
    }
13902
13903
    /* 7. if Words(m) > 1 then tr = FromMont(tr) */
13904
9.28k
    if ((err == MP_OKAY) && useMont) {
13905
8.49k
        err = _sp_mont_red(tr, m, mp, 0);
13906
        /* Reduction implementation returns number to range: 0..m-1. */
13907
8.49k
    }
13908
9.28k
    if (err == MP_OKAY) {
13909
        /* 8. r = tr */
13910
8.95k
        _sp_copy(tr, r);
13911
8.95k
    }
13912
13913
#if 0
13914
    sp_print(r, "rme");
13915
#endif
13916
13917
9.28k
#ifndef WC_NO_HARDEN
13918
9.28k
    FREE_SP_INT_ARRAY(d, NULL);
13919
#else
13920
    FREE_SP_INT(tr, NULL);
13921
#endif
13922
9.28k
    return err;
13923
9.28k
}
13924
#endif
13925
13926
#if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
13927
    !defined(NO_DH) || (!defined(NO_RSA) && defined(WOLFSSL_KEY_GEN)) || \
13928
    defined(OPENSSL_ALL)
13929
/* Exponentiates b to the power of e modulo m into r: r = b ^ e mod m
13930
 *
13931
 * Error returned when parameters r == e or r == m and base >= modulus, as r
 * is used to hold the reduced base before the exponentiation.
13932
 *
13933
 * @param  [in]   b       SP integer that is the base.
13934
 * @param  [in]   e       SP integer that is the exponent.
13935
 * @param  [in]   digits  Number of digits in exponent to use. May be greater
13936
 *                        than count of digits in e.
13937
 * @param  [in]   m       SP integer that is the modulus.
13938
 * @param  [out]  r       SP integer to hold result.
13939
 *
13940
 * @return  MP_OKAY on success.
13941
 * @return  MP_VAL when b, e, m or r is NULL, digits is negative, or m <= 0 or
13942
 *          e is negative; when m is too large (m->used * 2 >= SP_INT_DIGITS);
 *          when r is too small for intermediate values
 *          (m->used * 2 >= r->size); or when no implementation is compiled in.
13943
 * @return  MP_MEM when dynamic memory allocation fails.
13944
 */
13945
int sp_exptmod_ex(const sp_int* b, const sp_int* e, int digits, const sp_int* m,
13946
    sp_int* r)
13947
651k
{
13948
651k
    int err = MP_OKAY;
13949
651k
    int done = 0;
13950
651k
    /* NOTE(review): bit counts are taken before the NULL checks below -
     * presumably sp_count_bits tolerates a NULL argument; confirm. */
    int mBits = sp_count_bits(m);
13951
651k
    int bBits = sp_count_bits(b);
13952
651k
    int eBits = sp_count_bits(e);
13953
13954
651k
    /* Validate parameters.
     * NOTE(review): digits == 0 with a non-zero exponent is not rejected here;
     * _sp_exptmod_base_2 reads e->dp[digits - 1] - confirm callers never pass
     * 0. */
    if ((b == NULL) || (e == NULL) || (m == NULL) || (r == NULL) ||
13955
651k
             (digits < 0)) {
13956
0
        err = MP_VAL;
13957
0
    }
13958
    /* Ensure m is not too big. */
13959
651k
    else if (m->used * 2 >= SP_INT_DIGITS) {
13960
20
        err = MP_VAL;
13961
20
    }
13962
13963
#if 0
13964
    if (err == MP_OKAY) {
13965
        sp_print(b, "a");
13966
        sp_print(e, "b");
13967
        sp_print(m, "m");
13968
    }
13969
#endif
13970
13971
    /* Check for invalid modulus. */
13972
651k
    if ((err == MP_OKAY) && sp_iszero(m)) {
13973
157
        err = MP_VAL;
13974
157
    }
13975
651k
#ifdef WOLFSSL_SP_INT_NEGATIVE
13976
    /* Check for unsupported negative values of exponent and modulus. */
13977
651k
    if ((err == MP_OKAY) && ((e->sign == MP_NEG) || (m->sign == MP_NEG))) {
13978
112
        err = MP_VAL;
13979
112
    }
13980
651k
#endif
13981
13982
    /* Check for degenerate cases: m == 1 gives 0, e == 0 gives 1. */
13983
651k
    if ((err == MP_OKAY) && sp_isone(m)) {
13984
88
        _sp_set(r, 0);
13985
88
        done = 1;
13986
88
    }
13987
651k
    if ((!done) && (err == MP_OKAY) && sp_iszero(e)) {
13988
7.05k
        _sp_set(r, 1);
13989
7.05k
        done = 1;
13990
7.05k
    }
13991
13992
    /* Ensure base is less than modulus. */
13993
651k
    if ((!done) && (err == MP_OKAY) && (_sp_cmp_abs(b, m) != MP_LT)) {
13994
14.7k
        /* r is about to be overwritten with the reduced base so it must not
         * be the exponent or the modulus. */
        if ((r == e) || (r == m)) {
13995
9
            err = MP_VAL;
13996
9
        }
13997
14.7k
        if (err == MP_OKAY) {
13998
14.7k
            err = sp_mod(b, m, r);
13999
14.7k
        }
14000
14.7k
        if (err == MP_OKAY) {
14001
14.7k
            /* Use reduced value as the base from here on. */
            b = r;
14002
14.7k
        }
14003
14.7k
    }
14004
    /* Check for degenerate case of base. */
14005
651k
    if ((!done) && (err == MP_OKAY) && sp_iszero(b)) {
14006
168
        _sp_set(r, 0);
14007
168
        done = 1;
14008
168
    }
14009
14010
    /* Ensure SP integers have space for intermediate values. */
14011
651k
    if ((!done) && (err == MP_OKAY) && (m->used * 2 >= r->size)) {
14012
48
        err = MP_VAL;
14013
48
    }
14014
14015
651k
    if ((!done) && (err == MP_OKAY)) {
14016
        /* Use code optimized for specific sizes if possible */
14017
#if (defined(WOLFSSL_SP_MATH) || defined(WOLFSSL_SP_MATH_ALL)) && \
14018
    ((defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \
14019
        defined(WOLFSSL_HAVE_SP_DH))
14020
    #ifndef WOLFSSL_SP_NO_2048
14021
        if ((mBits == 1024) && sp_isodd(m) && (bBits <= 1024) &&
14022
                (eBits <= 1024)) {
14023
            err = sp_ModExp_1024((sp_int*)b, (sp_int*)e, (sp_int*)m, r);
14024
            done = 1;
14025
        }
14026
        else if ((mBits == 2048) && sp_isodd(m) && (bBits <= 2048) &&
14027
                 (eBits <= 2048)) {
14028
            err = sp_ModExp_2048((sp_int*)b, (sp_int*)e, (sp_int*)m, r);
14029
            done = 1;
14030
        }
14031
        else
14032
    #endif
14033
    #ifndef WOLFSSL_SP_NO_3072
14034
        if ((mBits == 1536) && sp_isodd(m) && (bBits <= 1536) &&
14035
                (eBits <= 1536)) {
14036
            err = sp_ModExp_1536((sp_int*)b, (sp_int*)e, (sp_int*)m, r);
14037
            done = 1;
14038
        }
14039
        else if ((mBits == 3072) && sp_isodd(m) && (bBits <= 3072) &&
14040
                 (eBits <= 3072)) {
14041
            err = sp_ModExp_3072((sp_int*)b, (sp_int*)e, (sp_int*)m, r);
14042
            done = 1;
14043
        }
14044
        else
14045
    #endif
14046
    #ifdef WOLFSSL_SP_4096
14047
        if ((mBits == 4096) && sp_isodd(m) && (bBits <= 4096) &&
14048
                (eBits <= 4096)) {
14049
            err = sp_ModExp_4096((sp_int*)b, (sp_int*)e, (sp_int*)m, r);
14050
            done = 1;
14051
        }
14052
        else
14053
    #endif
14054
#endif
14055
643k
        {
14056
            /* SP does not support size. */
14057
643k
        }
14058
643k
    }
14059
651k
#if defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_DH) || defined(OPENSSL_ALL)
14060
#if (defined(WOLFSSL_RSA_VERIFY_ONLY) || defined(WOLFSSL_RSA_PUBLIC_ONLY)) && \
14061
    defined(NO_DH)
14062
    if ((!done) && (err == MP_OKAY)) {
14063
        /* Use non-constant time version - fastest. */
14064
        err = sp_exptmod_nct(b, e, m, r);
14065
    }
14066
#else
14067
651k
#if defined(WOLFSSL_SP_MATH_ALL) || defined(OPENSSL_ALL)
14068
651k
    /* Base is the single digit 2 and the modulus is odd. */
    if ((!done) && (err == MP_OKAY) && (b->used == 1) && (b->dp[0] == 2) &&
14069
9.62k
         mp_isodd(m)) {
14070
        /* Use the generic base 2 implementation. */
14071
8.94k
        err = _sp_exptmod_base_2(e, digits, m, r);
14072
8.94k
    }
14073
642k
    /* Odd modulus of more than one digit: Montgomery form can be used. */
    else if ((!done) && (err == MP_OKAY) && ((m->used > 1) && mp_isodd(m))) {
14074
608k
    #ifndef WC_NO_HARDEN
14075
        /* Use constant time version hardened against timing attacks and
14076
         * cache attacks when WC_NO_CACHE_RESISTANT not defined. */
14077
608k
        err = _sp_exptmod_mont_ex(b, e, digits * SP_WORD_SIZE, m, r);
14078
    #else
14079
        /* Use non-constant time version - fastest. */
14080
        err = sp_exptmod_nct(b, e, m, r);
14081
    #endif
14082
608k
    }
14083
34.0k
    else
14084
34.0k
#endif /* WOLFSSL_SP_MATH_ALL || OPENSSL_ALL */
14085
34.0k
    if ((!done) && (err == MP_OKAY)) {
14086
        /* Otherwise use the generic implementation hardened against
14087
         * timing and cache attacks. */
14088
26.4k
        err = _sp_exptmod_ex(b, e, digits * SP_WORD_SIZE, m, r);
14089
26.4k
    }
14090
651k
#endif /* (WOLFSSL_RSA_VERIFY_ONLY || WOLFSSL_RSA_PUBLIC_ONLY) && NO_DH */
14091
#else
14092
    /* No exponentiation implementation compiled in. */
    if ((!done) && (err == MP_OKAY)) {
14093
        err = MP_VAL;
14094
    }
14095
#endif /* WOLFSSL_SP_MATH_ALL || !NO_DH || OPENSSL_ALL */
14096
14097
651k
    /* Not every build configuration uses these. */
    (void)mBits;
14098
651k
    (void)bBits;
14099
651k
    (void)eBits;
14100
651k
    (void)digits;
14101
14102
#if 0
14103
    if (err == MP_OKAY) {
14104
        sp_print(r, "rme");
14105
    }
14106
#endif
14107
651k
    return err;
14108
651k
}
14109
#endif
14110
14111
#if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
14112
    !defined(NO_DH) || (!defined(NO_RSA) && defined(WOLFSSL_KEY_GEN)) || \
14113
    defined(OPENSSL_ALL)
14114
/* Exponentiates b to the power of e modulo m into r: r = b ^ e mod m
14115
 *
 * Calls sp_exptmod_ex using all of the digits in use in the exponent.
 *
14116
 * @param  [in]   b  SP integer that is the base.
14117
 * @param  [in]   e  SP integer that is the exponent.
14118
 * @param  [in]   m  SP integer that is the modulus.
14119
 * @param  [out]  r  SP integer to hold result.
14120
 *
14121
 * @return  MP_OKAY on success.
14122
 * @return  MP_VAL when b, e, m or r is NULL; or m <= 0 or e is negative.
14123
 * @return  MP_MEM when dynamic memory allocation fails.
14124
 */
14125
int sp_exptmod(const sp_int* b, const sp_int* e, const sp_int* m, sp_int* r)
14126
779k
{
14127
779k
    int err = MP_OKAY;
14128
14129
    /* Validate parameters. */
14130
779k
    if ((b == NULL) || (e == NULL) || (m == NULL) || (r == NULL)) {
14131
0
        err = MP_VAL;
14132
0
    }
14133
779k
    /* Macro is defined elsewhere; the argument is presumably the statement
     * run when saving fails, setting err - confirm against its definition. */
    SAVE_VECTOR_REGISTERS(err = _svr_ret;);
14134
779k
    if (err == MP_OKAY) {
14135
779k
        /* Exponentiate with every digit in use in the exponent. */
        err = sp_exptmod_ex(b, e, (int)e->used, m, r);
14136
779k
    }
14137
779k
    /* NOTE(review): restore is invoked on every path, including after a
     * failure above - presumably the macro pair tolerates that; confirm. */
    RESTORE_VECTOR_REGISTERS();
14138
779k
    return err;
14139
779k
}
14140
#endif
14141
14142
#if defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_HAVE_SP_DH)
14143
#if defined(WOLFSSL_SP_FAST_NCT_EXPTMOD) || !defined(WOLFSSL_SP_SMALL)
14144
14145
/* Internal. Exponentiates b to the power of e modulo m into r: r = b ^ e mod m
14146
 * Creates a window of precalculated exponents with base in Montgomery form.
14147
 * Sliding window and is NOT constant time.
14148
 *
14149
 * n-bit window is: (b^(2^(n-1))*b^0)...(b^(2^(n-1))*b^(2^(n-1)-1))
14150
 * e.g. when n=6, b^32..b^63
14151
 * Algorithm:
14152
 *   1. Ensure base is less than modulus.
14153
 *   2. Convert base to Montgomery form
14154
 *   3. Set result to table entry for top window bits, or
14155
 *      if less than windows bits in exponent, 1 in Montgomery form.
14156
 *   4. While at least window bits left:
14157
 *     4.1. Count number of and skip leading 0 bits unless less than window bits
14158
 *          left.
14159
 *     4.2. Montgomery square result for each leading 0 and window bits if bits
14160
 *          left.
14161
 *     4.3. Break if less than window bits left.
14162
 *     4.4. Get top window bits from exponent and drop.
14163
 *     4.5. Montgomery multiply result by table entry.
14164
 *   5. While bits left:
14165
 *     5.1. Montgomery square result
14166
 *     5.2. If exponent bit set
14167
 *       5.2.1. Montgomery multiply result by Montgomery form of base.
14168
 *   6. Convert result back from Montgomery form.
14169
 *
14170
 * @param  [in]   b     SP integer that is the base.
14171
 * @param  [in]   e     SP integer that is the exponent.
14172
 * @param  [in]   bits  Number of bits in exponent to use. May be greater than
14173
 *                      count of bits in e.
14174
 * @param  [in]   m     SP integer that is the modulus.
14175
 * @param  [out]  r     SP integer to hold result.
14176
 *
14177
 * @return  MP_OKAY on success.
14178
 * @return  MP_MEM when dynamic memory allocation fails.
14179
 */
14180
static int _sp_exptmod_nct(const sp_int* b, const sp_int* e, const sp_int* m,
    sp_int* r)
{
    int i = 0;
    int bits;
    int winBits;
    int preCnt;
    int err = MP_OKAY;
    int done = 0;
    sp_int* tr = NULL;
    sp_int* bm = NULL;
    /* Maximum winBits is 6 and preCnt is (1 << (winBits - 1)).
     * Two extra entries hold the temporary result and the Montgomery form of
     * the base.
     */
#ifndef WOLFSSL_SP_NO_MALLOC
    DECL_DYN_SP_INT_ARRAY(t, m->used * 2 + 1, (1 << 5) + 2);
#else
    DECL_SP_INT_ARRAY(t, m->used * 2 + 1, (1 << 5) + 2);
#endif

    bits = sp_count_bits(e);

    /* Window bits based on number of pre-calculations versus number of loop
     * calculations.
     * Exponents for RSA and DH will result in 6-bit windows.
     * Note: for 4096-bit values, 7-bit window is slightly better.
     */
    if (bits > 450) {
        winBits = 6;
    }
    else if (bits <= 21) {
        winBits = 2;
    }
    else if (bits <= 36) {
        winBits = 3;
    }
    else if (bits <= 140) {
        winBits = 4;
    }
    else {
        winBits = 5;
    }
    /* Top bit of exponent fixed as 1 for pre-calculated window. */
    preCnt = 1 << (winBits - 1);

    /* Allocate sp_ints for:
     *  - pre-computation table
     *  - temporary result
     *  - Montgomery form of base
     */
#ifndef WOLFSSL_SP_NO_MALLOC
    ALLOC_DYN_SP_INT_ARRAY(t, m->used * 2U + 1U, (size_t)preCnt + 2, err, NULL);
#else
    ALLOC_SP_INT_ARRAY(t, m->used * 2U + 1U, (size_t)preCnt + 2, err, NULL);
#endif
    if (err == MP_OKAY) {
        /* Set variables to use allocated memory. */
        tr = t[preCnt + 0];
        bm = t[preCnt + 1];

        /* Initialize all allocated sp_ints. */
        for (i = 0; i < preCnt; i++) {
            _sp_init_size(t[i], (sp_size_t)(m->used * 2 + 1));
        }
        _sp_init_size(tr, (sp_size_t)(m->used * 2 + 1));
        _sp_init_size(bm, (sp_size_t)(m->used * 2 + 1));

        /* 1. Ensure base is less than modulus. */
        if (_sp_cmp_abs(b, m) != MP_LT) {
            err = sp_mod(b, m, bm);
            /* Handle base == modulus. */
            if ((err == MP_OKAY) && sp_iszero(bm)) {
                _sp_set(r, 0);
                done = 1;
            }
        }
        else {
            /* Copy base into Montgomery base variable. */
            _sp_copy(b, bm);
        }
    }

    if ((!done) && (err == MP_OKAY)) {
        int y = 0;
        int c = 0;
        sp_int_digit mp;

        /* Calculate Montgomery multiplier for reduction. */
        _sp_mont_setup(m, &mp);
        /* Calculate Montgomery normalizer for modulus. */
        err = sp_mont_norm(t[0], m);
        if (err == MP_OKAY) {
            /* 2. Convert base to Montgomery form. */
            err = sp_mul(bm, t[0], bm);
        }
        if (err == MP_OKAY) {
            /* bm = bm mod m, temporary size has to be bigger than bm->used. */
            err = _sp_div(bm, m, NULL, bm, bm->used + 1U);
        }
        if (err == MP_OKAY) {
            /* Copy Montgomery form of base into first element of table. */
            _sp_copy(bm, t[0]);
        }
        /* Calculate b^(2^(winBits-1)) */
        for (i = 1; (i < winBits) && (err == MP_OKAY); i++) {
            err = sp_sqr(t[0], t[0]);
            if (err == MP_OKAY) {
                err = _sp_mont_red(t[0], m, mp, 0);
            }
        }
        /* For each table entry after first. */
        for (i = 1; (i < preCnt) && (err == MP_OKAY); i++) {
            /* Multiply previous entry by the base in Mont form into table. */
            err = sp_mul(t[i-1], bm, t[i]);
            if (err == MP_OKAY) {
                err = _sp_mont_red(t[i], m, mp, 0);
            }
        }

        /* 3. Set result to table entry for top window bits, or
         *    if less than windows bits in exponent, 1 in Montgomery form.
         */
        if (err == MP_OKAY) {
            sp_int_digit n;
            /* Mask for calculating index into pre-computed table. */
            sp_int_digit mask = (sp_int_digit)preCnt - 1;

            /* Find the top bit. */
            i = (bits - 1) >> SP_WORD_SHIFT;
            n = e->dp[i--];
            c = bits % SP_WORD_SIZE;
            if (c == 0) {
                c = SP_WORD_SIZE;
            }
            /* Put top bit at highest offset in digit. */
            n <<= SP_WORD_SIZE - c;

            if (bits >= winBits) {
                /* Top bit set. Copy from window. */
                if (c < winBits) {
                    /* Bits to end of digit and part of next */
                    y = (int)((n >> (SP_WORD_SIZE - winBits)) & mask);
                    n = e->dp[i--];
                    c = winBits - c;
                    y |= (int)(n >> (SP_WORD_SIZE - c));
                    n <<= c;
                    c = SP_WORD_SIZE - c;
                }
                else {
                    /* Bits from middle of digit */
                    y = (int)((n >> (SP_WORD_SIZE - winBits)) & mask);
                    n <<= winBits;
                    c -= winBits;
                }
                _sp_copy(t[y], tr);
            }
            else {
                /* 1 in Montgomery form. */
                err = sp_mont_norm(tr, m);
            }

            /* 4. While at least window bits left. */
            while ((err == MP_OKAY) && ((i >= 0) || (c >= winBits))) {
                /* Number of squares to do first due to top bits being 0. */
                int sqrs = 0;

                /* 4.1. Count number of and skip leading 0 bits unless less
                 *      than window bits.
                 */
                do {
                    /* Make sure n has bits from the right digit. */
                    if (c == 0) {
                        n = e->dp[i--];
                        c = SP_WORD_SIZE;
                    }
                    /* Mask off the next bit. */
                    if ((n & ((sp_int_digit)1 << (SP_WORD_SIZE - 1))) != 0) {
                        break;
                    }

                    /* Another square needed. */
                    sqrs++;
                    /* Skip bit. */
                    n <<= 1;
                    c--;
                }
                while ((err == MP_OKAY) && ((i >= 0) || (c >= winBits)));

                if ((err == MP_OKAY) && ((i >= 0) || (c >= winBits))) {
                    /* Add squares needed before using table entry. */
                    sqrs += winBits;
                }

                /* 4.2. Montgomery square result for each leading 0 and window
                 *      bits if bits left.
                 */
                for (; (err == MP_OKAY) && (sqrs > 0); sqrs--) {
                    err = sp_sqr(tr, tr);
                    if (err == MP_OKAY) {
                        err = _sp_mont_red(tr, m, mp, 0);
                    }
                }

                /* 4.3. Break if less than window bits left. */
                if ((err == MP_OKAY) && (i < 0) && (c < winBits)) {
                    break;
                }

                /* 4.4. Get top window bits from exponent and drop.
                 * The loop in 4.1 was left with a set bit at the top of n,
                 * so at least one bit remains in the current digit (c > 0).
                 */
                if (err == MP_OKAY) {
                    if (c < winBits) {
                        /* Bits to end of digit and part of next. */
                        y = (int)(n >> (SP_WORD_SIZE - winBits));
                        n = e->dp[i--];
                        c = winBits - c;
                        y |= (int)(n >> (SP_WORD_SIZE - c));
                        n <<= c;
                        c = SP_WORD_SIZE - c;
                    }
                    else {
                        /* Bits from middle of digit. */
                        y = (int)(n >> (SP_WORD_SIZE - winBits));
                        n <<= winBits;
                        c -= winBits;
                    }
                    /* Table is indexed without the always set top bit. */
                    y &= (int)mask;
                }

                /* 4.5. Montgomery multiply result by table entry. */
                if (err == MP_OKAY) {
                    err = sp_mul(tr, t[y], tr);
                }
                if (err == MP_OKAY) {
                    err = _sp_mont_red(tr, m, mp, 0);
                }
            }

            /* Finished multiplying in table entries. */
            if ((err == MP_OKAY) && (c > 0)) {
                /* Handle remaining bits.
                 * Window values have top bit set and can't be used. */
                n = e->dp[0];
                /*  5. While bits left: */
                for (--c; (err == MP_OKAY) && (c >= 0); c--) {
                    /* 5.1. Montgomery square result */
                    err = sp_sqr(tr, tr);
                    if (err == MP_OKAY) {
                        err = _sp_mont_red(tr, m, mp, 0);
                    }
                    /* 5.2. If exponent bit set */
                    if ((err == MP_OKAY) && ((n >> c) & 1)) {
                        /* 5.2.1. Montgomery multiply result by Montgomery form
                         * of base.
                         */
                        err = sp_mul(tr, bm, tr);
                        if (err == MP_OKAY) {
                            err = _sp_mont_red(tr, m, mp, 0);
                        }
                    }
                }
            }
        }

        if (err == MP_OKAY) {
            /* 6. Convert result back from Montgomery form. */
            err = _sp_mont_red(tr, m, mp, 0);
            /* Reduction implementation returns number to range: 0..m-1. */
        }
    }
    if ((!done) && (err == MP_OKAY)) {
        /* Copy temporary result into parameter. */
        _sp_copy(tr, r);
    }

#ifndef WOLFSSL_SP_NO_MALLOC
    FREE_DYN_SP_INT_ARRAY(t, NULL);
#else
    FREE_SP_INT_ARRAY(t, NULL);
#endif
    return err;
}
14466
14467
#else
14468
/* Exponentiates b to the power of e modulo m into r: r = b ^ e mod m
14469
 * Non-constant time implementation.
14470
 *
14471
 * Algorithm:
14472
 *   1. Convert base to Montgomery form
14473
 *   2. Set result to base (assumes exponent is not zero)
14474
 *   3. For each bit in exponent starting at second highest
14475
 *     3.1. Montogmery square result
14476
 *     3.2. If exponent bit set
14477
 *       3.2.1. Montgomery multiply result by Montgomery form of base.
14478
 *   4. Convert result back from Montgomery form.
14479
 *
14480
 * @param  [in]   b  SP integer that is the base.
14481
 * @param  [in]   e  SP integer that is the exponent.
14482
 * @param  [in]   m  SP integer that is the modulus.
14483
 * @param  [out]  r  SP integer to hold result.
14484
 *
14485
 * @return  MP_OKAY on success.
14486
 * @return  MP_VAL when b, e, m or r is NULL; or m <= 0 or e is negative.
14487
 * @return  MP_MEM when dynamic memory allocation fails.
14488
 */
14489
static int _sp_exptmod_nct(const sp_int* b, const sp_int* e, const sp_int* m,
14490
    sp_int* r)
14491
{
14492
    int i;
14493
    int err = MP_OKAY;
14494
    int done = 0;
14495
    int y = 0;
14496
    int bits = sp_count_bits(e);
14497
    sp_int_digit mp;
14498
    DECL_SP_INT_ARRAY(t, m->used * 2 + 1, 2);
14499
14500
    /* Allocate memory for:
14501
     *  - Montgomery form of base
14502
     *  - Temporary result (in case r is same var as another parameter). */
14503
    ALLOC_SP_INT_ARRAY(t, m->used * 2 + 1, 2, err, NULL);
14504
    if (err == MP_OKAY) {
14505
        _sp_init_size(t[0], m->used * 2 + 1);
14506
        _sp_init_size(t[1], m->used * 2 + 1);
14507
14508
        /* Ensure base is less than modulus and copy into temp. */
14509
        if (_sp_cmp_abs(b, m) != MP_LT) {
14510
            err = sp_mod(b, m, t[0]);
14511
            /* Handle base == modulus. */
14512
            if ((err == MP_OKAY) && sp_iszero(t[0])) {
14513
                _sp_set(r, 0);
14514
                done = 1;
14515
            }
14516
        }
14517
        else {
14518
            /* Copy base into temp. */
14519
            _sp_copy(b, t[0]);
14520
        }
14521
    }
14522
14523
    if ((!done) && (err == MP_OKAY)) {
14524
        /* Calculate Montgomery multiplier for reduction. */
14525
        _sp_mont_setup(m, &mp);
14526
        /* Calculate Montgomery normalizer for modulus. */
14527
        err = sp_mont_norm(t[1], m);
14528
        if (err == MP_OKAY) {
14529
            /* 1. Convert base to Montgomery form. */
14530
            err = sp_mul(t[0], t[1], t[0]);
14531
        }
14532
        if (err == MP_OKAY) {
14533
            /* t[0] = t[0] mod m, temporary size has to be bigger than t[0]. */
14534
            err = _sp_div(t[0], m, NULL, t[0], t[0]->used + 1);
14535
        }
14536
        if (err == MP_OKAY) {
14537
            /* 2. Result starts as Montgomery form of base (assuming e > 0). */
14538
            _sp_copy(t[0], t[1]);
14539
        }
14540
14541
        /* 3. For each bit in exponent starting at second highest. */
14542
        for (i = bits - 2; (err == MP_OKAY) && (i >= 0); i--) {
14543
            /* 3.1. Montgomery square result. */
14544
            err = sp_sqr(t[0], t[0]);
14545
            if (err == MP_OKAY) {
14546
                err = _sp_mont_red(t[0], m, mp, 0);
14547
            }
14548
            if (err == MP_OKAY) {
14549
                /* Get bit and index i. */
14550
                y = (e->dp[i >> SP_WORD_SHIFT] >> (i & SP_WORD_MASK)) & 1;
14551
                /* 3.2. If exponent bit set */
14552
                if (y != 0) {
14553
                    /* 3.2.1. Montgomery multiply result by Mont of base. */
14554
                    err = sp_mul(t[0], t[1], t[0]);
14555
                    if (err == MP_OKAY) {
14556
                        err = _sp_mont_red(t[0], m, mp, 0);
14557
                    }
14558
                }
14559
            }
14560
        }
14561
        if (err == MP_OKAY) {
14562
            /* 4. Convert from Montgomery form. */
14563
            err = _sp_mont_red(t[0], m, mp, 0);
14564
            /* Reduction implementation returns number of range 0..m-1. */
14565
        }
14566
    }
14567
    if ((!done) && (err == MP_OKAY)) {
14568
        /* Copy temporary result into parameter. */
14569
        _sp_copy(t[0], r);
14570
    }
14571
14572
    FREE_SP_INT_ARRAY(t, NULL);
14573
    return err;
14574
}
14575
#endif /* WOLFSSL_SP_FAST_NCT_EXPTMOD || !WOLFSSL_SP_SMALL */
14576
14577
/* Exponentiates b to the power of e modulo m into r: r = b ^ e mod m
14578
 * Non-constant time implementation.
14579
 *
14580
 * @param  [in]   b  SP integer that is the base.
14581
 * @param  [in]   e  SP integer that is the exponent.
14582
 * @param  [in]   m  SP integer that is the modulus.
14583
 * @param  [out]  r  SP integer to hold result.
14584
 *
14585
 * @return  MP_OKAY on success.
14586
 * @return  MP_VAL when b, e, m or r is NULL; or m <= 0 or e is negative.
14587
 * @return  MP_MEM when dynamic memory allocation fails.
14588
 */
14589
int sp_exptmod_nct(const sp_int* b, const sp_int* e, const sp_int* m, sp_int* r)
14590
16.5k
{
14591
16.5k
    int err = MP_OKAY;
14592
14593
    /* Validate parameters. */
14594
16.5k
    if ((b == NULL) || (e == NULL) || (m == NULL) || (r == NULL)) {
14595
0
        err = MP_VAL;
14596
0
    }
14597
14598
#if 0
14599
    if (err == MP_OKAY) {
14600
        sp_print(b, "a");
14601
        sp_print(e, "b");
14602
        sp_print(m, "m");
14603
    }
14604
#endif
14605
14606
16.5k
    if (err != MP_OKAY) {
14607
0
    }
14608
    /* Handle special cases. */
14609
16.5k
    else if (sp_iszero(m)) {
14610
19
        err = MP_VAL;
14611
19
    }
14612
16.5k
#ifdef WOLFSSL_SP_INT_NEGATIVE
14613
16.5k
    else if ((e->sign == MP_NEG) || (m->sign == MP_NEG)) {
14614
24
        err = MP_VAL;
14615
24
    }
14616
16.5k
#endif
14617
    /* x mod 1 is always 0. */
14618
16.5k
    else if (sp_isone(m)) {
14619
2
        _sp_set(r, 0);
14620
2
    }
14621
    /* b^0 mod m = 1 mod m = 1. */
14622
16.5k
    else if (sp_iszero(e)) {
14623
129
        _sp_set(r, 1);
14624
129
    }
14625
    /* 0^x mod m = 0 mod m = 0. */
14626
16.4k
    else if (sp_iszero(b)) {
14627
9
        _sp_set(r, 0);
14628
9
    }
14629
    /* Ensure SP integers have space for intermediate values. */
14630
16.4k
    else if (m->used * 2 >= r->size) {
14631
6
        err = MP_VAL;
14632
6
    }
14633
16.4k
#if !defined(WOLFSSL_RSA_VERIFY_ONLY) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)
14634
16.4k
    else if (mp_iseven(m)) {
14635
51
        err = _sp_exptmod_ex(b, e, (int)(e->used * SP_WORD_SIZE), m, r);
14636
51
    }
14637
16.3k
#endif
14638
16.3k
    else {
14639
16.3k
        err = _sp_exptmod_nct(b, e, m, r);
14640
16.3k
    }
14641
14642
#if 0
14643
    if (err == MP_OKAY) {
14644
        sp_print(r, "rme");
14645
    }
14646
#endif
14647
14648
16.5k
    return err;
14649
16.5k
}
14650
#endif /* WOLFSSL_SP_MATH_ALL || WOLFSSL_HAVE_SP_DH */
14651
14652
/***************
14653
 * 2^e functions
14654
 ***************/
14655
14656
#if defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)
14657
/* Divide by 2^e: r = a >> e and rem = bits shifted out
14658
 *
14659
 * @param  [in]   a    SP integer to divide.
14660
 * @param  [in]   e    Exponent bits (dividing by 2^e).
14661
 * @param  [in]   m    SP integer that is the modulus.
14662
 * @param  [out]  r    SP integer to hold result.
14663
 * @param  [out]  rem  SP integer to hold remainder.
14664
 *
14665
 * @return  MP_OKAY on success.
14666
 * @return  MP_VAL when a is NULL or e is negative.
14667
 */
14668
int sp_div_2d(const sp_int* a, int e, sp_int* r, sp_int* rem)
14669
305
{
14670
305
    int err = MP_OKAY;
14671
14672
305
    if ((a == NULL) || (e < 0)) {
14673
0
        err = MP_VAL;
14674
0
    }
14675
14676
305
    if (err == MP_OKAY) {
14677
        /* Number of bits remaining after shift. */
14678
305
        int remBits = sp_count_bits(a) - e;
14679
14680
305
        if (remBits <= 0) {
14681
            /* Shifting down by more bits than in number. */
14682
121
            _sp_zero(r);
14683
121
            if (rem != NULL) {
14684
81
                err = sp_copy(a, rem);
14685
81
            }
14686
121
        }
14687
184
        else {
14688
184
            if (rem != NULL) {
14689
                /* Copy a in to remainder. */
14690
118
                err = sp_copy(a, rem);
14691
118
            }
14692
184
            if (err == MP_OKAY) {
14693
                /* Shift a down by into result. */
14694
175
                err = sp_rshb(a, e, r);
14695
175
            }
14696
184
            if ((err == MP_OKAY) && (rem != NULL)) {
14697
                /* Set used and mask off top digit of remainder. */
14698
104
                rem->used = (sp_size_t)((e + SP_WORD_SIZE - 1) >>
14699
104
                                        SP_WORD_SHIFT);
14700
104
                e &= SP_WORD_MASK;
14701
104
                if (e > 0) {
14702
41
                    rem->dp[rem->used - 1] &= ((sp_int_digit)1 << e) - 1;
14703
41
                }
14704
14705
                /* Remove leading zeros from remainder. */
14706
104
                sp_clamp(rem);
14707
104
            #ifdef WOLFSSL_SP_INT_NEGATIVE
14708
104
                rem->sign = MP_ZPOS;
14709
104
            #endif
14710
104
            }
14711
184
        }
14712
305
    }
14713
14714
305
    return err;
14715
305
}
14716
#endif /* WOLFSSL_SP_MATH_ALL && !WOLFSSL_RSA_VERIFY_ONLY */
14717
14718
#if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
14719
    defined(HAVE_ECC)
14720
/* The bottom e bits: r = a & ((1 << e) - 1)
14721
 *
14722
 * @param  [in]   a  SP integer to reduce.
14723
 * @param  [in]   e  Modulus bits (modulus equals 2^e).
14724
 * @param  [out]  r  SP integer to hold result.
14725
 *
14726
 * @return  MP_OKAY on success.
14727
 * @return  MP_VAL when a or r is NULL, e is negative or e is too large for
14728
 *          result.
14729
 */
14730
int sp_mod_2d(const sp_int* a, int e, sp_int* r)
14731
99
{
14732
99
    int err = MP_OKAY;
14733
99
    sp_size_t digits = (sp_size_t)((e + SP_WORD_SIZE - 1) >> SP_WORD_SHIFT);
14734
14735
99
    if ((a == NULL) || (r == NULL) || (e < 0)) {
14736
0
        err = MP_VAL;
14737
0
    }
14738
99
    if ((err == MP_OKAY) && (digits > r->size)) {
14739
0
        err = MP_VAL;
14740
0
    }
14741
14742
99
    if (err == MP_OKAY) {
14743
        /* Copy a into r if not same pointer. */
14744
99
        if (a != r) {
14745
4
            XMEMCPY(r->dp, a->dp, digits * (word32)SP_WORD_SIZEOF);
14746
4
            r->used = a->used;
14747
4
        #ifdef WOLFSSL_SP_INT_NEGATIVE
14748
4
            r->sign = a->sign;
14749
4
        #endif
14750
4
        }
14751
14752
        /* Modify result if a is bigger or same digit size. */
14753
    #ifndef WOLFSSL_SP_INT_NEGATIVE
14754
        if (digits <= a->used)
14755
    #else
14756
        /* Need to make negative positive and mask. */
14757
99
        if ((a->sign == MP_NEG) || (digits <= a->used))
14758
84
    #endif
14759
84
        {
14760
84
        #ifdef WOLFSSL_SP_INT_NEGATIVE
14761
84
            if (a->sign == MP_NEG) {
14762
31
                unsigned int i;
14763
31
                sp_int_digit carry = 0;
14764
14765
                /* Negate value. */
14766
638
                for (i = 0; i < r->used; i++) {
14767
607
                    sp_int_digit next = r->dp[i] > 0;
14768
607
                    r->dp[i] = (sp_int_digit)0 - r->dp[i] - carry;
14769
607
                    carry |= next;
14770
607
                }
14771
101
                for (; i < digits; i++) {
14772
70
                    r->dp[i] = (sp_int_digit)0 - carry;
14773
70
                }
14774
31
                r->sign = MP_ZPOS;
14775
31
            }
14776
84
        #endif
14777
            /* Set used and mask off top digit of result. */
14778
84
            r->used = digits;
14779
84
            e &= SP_WORD_MASK;
14780
84
            if (e > 0) {
14781
57
                r->dp[r->used - 1] &= ((sp_int_digit)1 << e) - 1;
14782
57
            }
14783
84
            sp_clamp(r);
14784
84
        }
14785
99
    }
14786
14787
99
    return err;
14788
99
}
14789
#endif /* (WOLFSSL_SP_MATH_ALL && !WOLFSSL_RSA_VERIFY_ONLY) || HAVE_ECC */
14790
14791
#if (defined(WOLFSSL_SP_MATH_ALL) && (!defined(WOLFSSL_RSA_VERIFY_ONLY) || \
14792
    !defined(NO_DH))) || defined(OPENSSL_ALL)
14793
/* Multiply by 2^e: r = a << e
14794
 *
14795
 * @param  [in]   a  SP integer to multiply.
14796
 * @param  [in]   e  Multiplier bits (multiplier equals 2^e).
14797
 * @param  [out]  r  SP integer to hold result.
14798
 *
14799
 * @return  MP_OKAY on success.
14800
 * @return  MP_VAL when a or r is NULL, e is negative, or result is too big for
14801
 *          result size.
14802
 */
14803
int sp_mul_2d(const sp_int* a, int e, sp_int* r)
14804
284k
{
14805
284k
    int err = MP_OKAY;
14806
14807
    /* Validate parameters. */
14808
284k
    if ((a == NULL) || (r == NULL) || (e < 0)) {
14809
0
        err = MP_VAL;
14810
0
    }
14811
14812
    /* Ensure result has enough allocated digits for result. */
14813
284k
    if ((err == MP_OKAY) &&
14814
284k
            ((unsigned int)(sp_count_bits(a) + e) >
14815
284k
             (unsigned int)r->size * SP_WORD_SIZE)) {
14816
34
        err = MP_VAL;
14817
34
    }
14818
14819
284k
    if (err == MP_OKAY) {
14820
        /* Copy a into r as left shift function works on the number. */
14821
284k
        if (a != r) {
14822
10.2k
            err = sp_copy(a, r);
14823
10.2k
        }
14824
284k
    }
14825
14826
284k
    if (err == MP_OKAY) {
14827
#if 0
14828
        sp_print(a, "a");
14829
        sp_print_int(e, "n");
14830
#endif
14831
284k
        err = sp_lshb(r, e);
14832
#if 0
14833
        sp_print(r, "rsl");
14834
#endif
14835
284k
    }
14836
14837
284k
    return err;
14838
284k
}
14839
#endif /* (WOLFSSL_SP_MATH_ALL && (!WOLFSSL_RSA_VERIFY_ONLY || !NO_DH)) ||
        * OPENSSL_ALL */
14840
14841
#if defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_HAVE_SP_DH) || \
14842
    defined(HAVE_ECC) || (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY))
14843
14844
/* START SP_SQR implementations */
14845
/* This code is generated.
14846
 * To generate:
14847
 *   cd scripts/sp/sp_int
14848
 *   ./gen.sh
14849
 * File sp_sqr.c contains code.
14850
 */
14851
14852
#if !defined(WOLFSSL_SP_MATH) || !defined(WOLFSSL_SP_SMALL)
14853
#ifdef SQR_MUL_ASM
14854
/* Square a and store in r. r = a * a
14855
 *
14856
 * @param  [in]   a  SP integer to square.
14857
 * @param  [out]  r  SP integer result.
14858
 *
14859
 * @return  MP_OKAY on success.
14860
 * @return  MP_MEM when dynamic memory allocation fails.
14861
 */
14862
static int _sp_sqr(const sp_int* a, sp_int* r)
14863
16.8M
{
14864
16.8M
    int err = MP_OKAY;
14865
16.8M
    sp_size_t i;
14866
16.8M
    int j;
14867
16.8M
    sp_size_t k;
14868
16.8M
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
14869
16.8M
    sp_int_digit* t = NULL;
14870
#elif defined(WOLFSSL_SP_DYN_STACK)
14871
    sp_int_digit t[((a->used + 1) / 2) * 2 + 1];
14872
#else
14873
    sp_int_digit t[(SP_INT_DIGITS + 1) / 2];
14874
#endif
14875
14876
16.8M
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
14877
16.8M
    t = (sp_int_digit*)XMALLOC(
14878
16.8M
        sizeof(sp_int_digit) * (size_t)(((a->used + 1) / 2) * 2 + 1), NULL,
14879
16.8M
        DYNAMIC_TYPE_BIGINT);
14880
16.8M
    if (t == NULL) {
14881
294
        err = MP_MEM;
14882
294
    }
14883
16.8M
#endif
14884
16.8M
    if ((err == MP_OKAY) && (a->used <= 1)) {
14885
3.36M
        sp_int_digit l;
14886
3.36M
        sp_int_digit h;
14887
14888
3.36M
        h = 0;
14889
3.36M
        l = 0;
14890
3.36M
        SP_ASM_SQR(h, l, a->dp[0]);
14891
3.36M
        r->dp[0] = h;
14892
3.36M
        r->dp[1] = l;
14893
3.36M
    }
14894
13.4M
    else if (err == MP_OKAY) {
14895
13.4M
        sp_int_digit l;
14896
13.4M
        sp_int_digit h;
14897
13.4M
        sp_int_digit o;
14898
13.4M
        sp_int_digit* p = t;
14899
14900
13.4M
        h = 0;
14901
13.4M
        l = 0;
14902
13.4M
        SP_ASM_SQR(h, l, a->dp[0]);
14903
13.4M
        t[0] = h;
14904
13.4M
        h = 0;
14905
13.4M
        o = 0;
14906
50.6M
        for (k = 1; k < (sp_size_t)((a->used + 1) / 2); k++) {
14907
37.1M
            i = k;
14908
37.1M
            j = (int)(k - 1);
14909
163M
            for (; (j >= 0); i++, j--) {
14910
126M
                SP_ASM_MUL_ADD2(l, h, o, a->dp[i], a->dp[j]);
14911
126M
            }
14912
37.1M
            t[k * 2 - 1] = l;
14913
37.1M
            l = h;
14914
37.1M
            h = o;
14915
37.1M
            o = 0;
14916
14917
37.1M
            SP_ASM_SQR_ADD(l, h, o, a->dp[k]);
14918
37.1M
            i = (sp_size_t)(k + 1);
14919
37.1M
            j = (int)(k - 1);
14920
163M
            for (; (j >= 0); i++, j--) {
14921
126M
                SP_ASM_MUL_ADD2(l, h, o, a->dp[i], a->dp[j]);
14922
126M
            }
14923
37.1M
            t[k * 2] = l;
14924
37.1M
            l = h;
14925
37.1M
            h = o;
14926
37.1M
            o = 0;
14927
37.1M
        }
14928
54.4M
        for (; k < a->used; k++) {
14929
40.9M
            i = k;
14930
40.9M
            j = (int)(k - 1);
14931
178M
            for (; (i < a->used); i++, j--) {
14932
137M
                SP_ASM_MUL_ADD2(l, h, o, a->dp[i], a->dp[j]);
14933
137M
            }
14934
40.9M
            p[k * 2 - 1] = l;
14935
40.9M
            l = h;
14936
40.9M
            h = o;
14937
40.9M
            o = 0;
14938
14939
40.9M
            SP_ASM_SQR_ADD(l, h, o, a->dp[k]);
14940
40.9M
            i = (sp_size_t)(k + 1);
14941
40.9M
            j = (int)(k - 1);
14942
137M
            for (; (i < a->used); i++, j--) {
14943
96.4M
                SP_ASM_MUL_ADD2(l, h, o, a->dp[i], a->dp[j]);
14944
96.4M
            }
14945
40.9M
            p[k * 2] = l;
14946
40.9M
            l = h;
14947
40.9M
            h = o;
14948
40.9M
            o = 0;
14949
14950
40.9M
            p = r->dp;
14951
40.9M
        }
14952
13.4M
        r->dp[k * 2 - 1] = l;
14953
13.4M
        XMEMCPY(r->dp, t, (size_t)(((a->used + 1) / 2) * 2 + 1) *
14954
13.4M
            sizeof(sp_int_digit));
14955
13.4M
    }
14956
14957
16.8M
    if (err == MP_OKAY) {
14958
16.8M
        r->used = (sp_size_t)(a->used * 2U);
14959
16.8M
        sp_clamp(r);
14960
16.8M
    }
14961
14962
16.8M
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
14963
16.8M
    XFREE(t, NULL, DYNAMIC_TYPE_BIGINT);
14964
16.8M
#endif
14965
16.8M
    return err;
14966
16.8M
}
14967
#else /* !SQR_MUL_ASM */
14968
/* Square a and store in r. r = a * a
14969
 *
14970
 * @param  [in]   a  SP integer to square.
14971
 * @param  [out]  r  SP integer result.
14972
 *
14973
 * @return  MP_OKAY on success.
14974
 * @return  MP_MEM when dynamic memory allocation fails.
14975
 */
14976
static int _sp_sqr(const sp_int* a, sp_int* r)
14977
{
14978
    int err = MP_OKAY;
14979
    sp_size_t i;
14980
    int j;
14981
    sp_size_t k;
14982
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
14983
    sp_int_digit* t = NULL;
14984
#elif defined(WOLFSSL_SP_DYN_STACK)
14985
    sp_int_digit t[a->used * 2];
14986
#else
14987
    sp_int_digit t[SP_INT_DIGITS];
14988
#endif
14989
14990
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
14991
    t = (sp_int_digit*)XMALLOC(sizeof(sp_int_digit) *
14992
                               (size_t)(a->used * 2), NULL,
14993
                               DYNAMIC_TYPE_BIGINT);
14994
    if (t == NULL) {
14995
        err = MP_MEM;
14996
    }
14997
#endif
14998
    if (err == MP_OKAY) {
14999
    #ifndef WOLFSSL_SP_INT_SQR_VOLATILE
15000
        sp_int_word w;
15001
        sp_int_word l;
15002
        sp_int_word h;
15003
    #else
15004
        volatile sp_int_word w;
15005
        volatile sp_int_word l;
15006
        volatile sp_int_word h;
15007
    #endif
15008
    #ifdef SP_WORD_OVERFLOW
15009
        sp_int_word o;
15010
    #endif
15011
15012
        w = (sp_int_word)a->dp[0] * a->dp[0];
15013
        t[0] = (sp_int_digit)w;
15014
        l = (sp_int_digit)(w >> SP_WORD_SIZE);
15015
        h = 0;
15016
    #ifdef SP_WORD_OVERFLOW
15017
        o = 0;
15018
    #endif
15019
        for (k = 1; k <= (sp_size_t)((a->used - 1) * 2); k++) {
15020
            i = k / 2;
15021
            j = (int)(k - i);
15022
            if (i == (unsigned int)j) {
15023
                w = (sp_int_word)a->dp[i] * a->dp[j];
15024
                l += (sp_int_digit)w;
15025
                h += (sp_int_digit)(w >> SP_WORD_SIZE);
15026
            #ifdef SP_WORD_OVERFLOW
15027
                h += (sp_int_digit)(l >> SP_WORD_SIZE);
15028
                l &= SP_MASK;
15029
                o += (sp_int_digit)(h >> SP_WORD_SIZE);
15030
                h &= SP_MASK;
15031
            #endif
15032
            }
15033
            for (++i, --j; (i < a->used) && (j >= 0); i++, j--) {
15034
                w = (sp_int_word)a->dp[i] * a->dp[j];
15035
                l += (sp_int_digit)w;
15036
                h += (sp_int_digit)(w >> SP_WORD_SIZE);
15037
            #ifdef SP_WORD_OVERFLOW
15038
                h += (sp_int_digit)(l >> SP_WORD_SIZE);
15039
                l &= SP_MASK;
15040
                o += (sp_int_digit)(h >> SP_WORD_SIZE);
15041
                h &= SP_MASK;
15042
            #endif
15043
                l += (sp_int_digit)w;
15044
                h += (sp_int_digit)(w >> SP_WORD_SIZE);
15045
            #ifdef SP_WORD_OVERFLOW
15046
                h += (sp_int_digit)(l >> SP_WORD_SIZE);
15047
                l &= SP_MASK;
15048
                o += (sp_int_digit)(h >> SP_WORD_SIZE);
15049
                h &= SP_MASK;
15050
            #endif
15051
            }
15052
            t[k] = (sp_int_digit)l;
15053
            l >>= SP_WORD_SIZE;
15054
            l += (sp_int_digit)h;
15055
            h >>= SP_WORD_SIZE;
15056
        #ifdef SP_WORD_OVERFLOW
15057
            h += o & SP_MASK;
15058
            o >>= SP_WORD_SIZE;
15059
        #endif
15060
        }
15061
        t[k] = (sp_int_digit)l;
15062
        r->used = (sp_size_t)(k + 1);
15063
        XMEMCPY(r->dp, t, r->used * sizeof(sp_int_digit));
15064
        sp_clamp(r);
15065
    }
15066
15067
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
15068
    XFREE(t, NULL, DYNAMIC_TYPE_BIGINT);
15069
#endif
15070
    return err;
15071
}
15072
#endif /* SQR_MUL_ASM */
15073
#endif /* !WOLFSSL_SP_MATH || !WOLFSSL_SP_SMALL */
15074
15075
#ifndef WOLFSSL_SP_SMALL
15076
#if !defined(WOLFSSL_HAVE_SP_ECC) && defined(HAVE_ECC)
15077
#if (SP_WORD_SIZE == 64 && SP_INT_BITS >= 256)
15078
#ifndef SQR_MUL_ASM
15079
/* Square a and store in r. r = a * a
15080
 *
15081
 * Long-hand implementation: reads exactly four digits of a (dp[0..3]) and
 * writes eight digits of r before clamping.
15082
 *
15083
 * @param  [in]   a  SP integer to square.
15084
 * @param  [out]  r  SP integer result.
15085
 *
15086
 * @return  MP_OKAY on success.
15087
 * @return  MP_MEM when dynamic memory allocation fails.
15088
 */
15089
static int _sp_sqr_4(const sp_int* a, sp_int* r)
15090
{
15091
    int err = MP_OKAY;
15092
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
15093
    /* Buffer of ten double-width products; heap allocated in this build. */
    sp_int_word* w = NULL;
15094
#else
15095
    sp_int_word w[10];
15096
#endif
15097
    const sp_int_digit* da = a->dp;
15098
15099
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
15100
    w = (sp_int_word*)XMALLOC(sizeof(sp_int_word) * 10, NULL,
15101
        DYNAMIC_TYPE_BIGINT);
15102
    if (w == NULL) {
15103
        err = MP_MEM;
15104
    }
15105
#endif
15106
15107
15108
    if (err == MP_OKAY) {
15109
        /* Compute the ten distinct products da[i] * da[j], i <= j, up front.
         * All reads of a's digits happen here, before r->dp is written. */
        w[0] = (sp_int_word)da[0] * da[0];
15110
        w[1] = (sp_int_word)da[0] * da[1];
15111
        w[2] = (sp_int_word)da[0] * da[2];
15112
        w[3] = (sp_int_word)da[1] * da[1];
15113
        w[4] = (sp_int_word)da[0] * da[3];
15114
        w[5] = (sp_int_word)da[1] * da[2];
15115
        w[6] = (sp_int_word)da[1] * da[3];
15116
        w[7] = (sp_int_word)da[2] * da[2];
15117
        w[8] = (sp_int_word)da[2] * da[3];
15118
        w[9] = (sp_int_word)da[3] * da[3];
15119
15120
        /* Digit 0: low word of da[0]^2. From here w[0] is reused as the
         * running accumulator: after each digit is stored it is shifted down
         * one word so that only the carry remains. */
        r->dp[0] = (sp_int_digit)w[0];
15121
        w[0] >>= SP_WORD_SIZE;
15122
        /* Digit 1: carry + 2 * low(da[0]*da[1]). Cross products (i != j) are
         * always added twice. */
        w[0] += (sp_int_digit)w[1];
15123
        w[0] += (sp_int_digit)w[1];
15124
        r->dp[1] = (sp_int_digit)w[0];
15125
        w[0] >>= SP_WORD_SIZE;
15126
        /* Digit 2: carry + 2 * high(da[0]*da[1]) + 2 * low(da[0]*da[2])
         *          + low(da[1]^2). */
        w[1] >>= SP_WORD_SIZE;
15127
        w[0] += (sp_int_digit)w[1];
15128
        w[0] += (sp_int_digit)w[1];
15129
        w[0] += (sp_int_digit)w[2];
15130
        w[0] += (sp_int_digit)w[2];
15131
        w[0] += (sp_int_digit)w[3];
15132
        r->dp[2] = (sp_int_digit)w[0];
15133
        w[0] >>= SP_WORD_SIZE;
15134
        /* Digit 3: carry + 2 * high(da[0]*da[2]) + high(da[1]^2)
         *          + 2 * low(da[0]*da[3]) + 2 * low(da[1]*da[2]). */
        w[2] >>= SP_WORD_SIZE;
15135
        w[0] += (sp_int_digit)w[2];
15136
        w[0] += (sp_int_digit)w[2];
15137
        w[3] >>= SP_WORD_SIZE;
15138
        w[0] += (sp_int_digit)w[3];
15139
        w[0] += (sp_int_digit)w[4];
15140
        w[0] += (sp_int_digit)w[4];
15141
        w[0] += (sp_int_digit)w[5];
15142
        w[0] += (sp_int_digit)w[5];
15143
        r->dp[3] = (sp_int_digit)w[0];
15144
        w[0] >>= SP_WORD_SIZE;
15145
        /* Digit 4: carry + 2 * high(da[0]*da[3]) + 2 * high(da[1]*da[2])
         *          + 2 * low(da[1]*da[3]) + low(da[2]^2). */
        w[4] >>= SP_WORD_SIZE;
15146
        w[0] += (sp_int_digit)w[4];
15147
        w[0] += (sp_int_digit)w[4];
15148
        w[5] >>= SP_WORD_SIZE;
15149
        w[0] += (sp_int_digit)w[5];
15150
        w[0] += (sp_int_digit)w[5];
15151
        w[0] += (sp_int_digit)w[6];
15152
        w[0] += (sp_int_digit)w[6];
15153
        w[0] += (sp_int_digit)w[7];
15154
        r->dp[4] = (sp_int_digit)w[0];
15155
        w[0] >>= SP_WORD_SIZE;
15156
        /* Digit 5: carry + 2 * high(da[1]*da[3]) + high(da[2]^2)
         *          + 2 * low(da[2]*da[3]). */
        w[6] >>= SP_WORD_SIZE;
15157
        w[0] += (sp_int_digit)w[6];
15158
        w[0] += (sp_int_digit)w[6];
15159
        w[7] >>= SP_WORD_SIZE;
15160
        w[0] += (sp_int_digit)w[7];
15161
        w[0] += (sp_int_digit)w[8];
15162
        w[0] += (sp_int_digit)w[8];
15163
        r->dp[5] = (sp_int_digit)w[0];
15164
        w[0] >>= SP_WORD_SIZE;
15165
        /* Digit 6: carry + 2 * high(da[2]*da[3]) + low(da[3]^2). */
        w[8] >>= SP_WORD_SIZE;
15166
        w[0] += (sp_int_digit)w[8];
15167
        w[0] += (sp_int_digit)w[8];
15168
        w[0] += (sp_int_digit)w[9];
15169
        r->dp[6] = (sp_int_digit)w[0];
15170
        w[0] >>= SP_WORD_SIZE;
15171
        /* Digit 7: carry + high(da[3]^2). */
        w[9] >>= SP_WORD_SIZE;
15172
        w[0] += (sp_int_digit)w[9];
15173
        r->dp[7] = (sp_int_digit)w[0];
15174
15175
        /* Eight digits were written; sp_clamp() is then applied to r. */
        r->used = 8;
15176
        sp_clamp(r);
15177
    }
15178
15179
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
15180
    /* Reached on the failure path too, where w is still NULL. */
    XFREE(w, NULL, DYNAMIC_TYPE_BIGINT);
15181
#endif
15182
    return err;
15183
}
15184
#else /* SQR_MUL_ASM */
15185
/* Square a and store in r. r = a * a
15186
 *
15187
 * Comba implementation, fully unrolled for four digits: a->dp[0..3] are read
 * and eight digits of r are written before clamping. Each "Column k" section
 * below handles the products a->dp[i] * a->dp[j] with i + j == k and emits
 * result digit k.
 * NOTE(review): the SP_ASM_* macros are defined outside this function; the
 * roles of l/h/o (running low/high/overflow words) are inferred from how they
 * are rotated after each column - confirm against the macro definitions.
15188
 *
15189
 * @param  [in]   a  SP integer to square.
15190
 * @param  [out]  r  SP integer result.
15191
 *
15192
 * @return  MP_OKAY always.
15193
 * MP_MEM is never returned: this variant allocates no dynamic memory.
15194
 */
15195
static int _sp_sqr_4(const sp_int* a, sp_int* r)
15196
20.5M
{
15197
20.5M
    sp_int_digit l = 0;
15198
20.5M
    sp_int_digit h = 0;
15199
20.5M
    sp_int_digit o = 0;
15200
20.5M
    /* Result digits 0..3 are buffered here and copied into r->dp last
     * (presumably so that r may alias a - confirm with callers). */
    sp_int_digit t[4];
15201
15202
20.5M
    /* Column 0: a->dp[0]^2. The word left in h is stored as digit 0; l is
     * carried into column 1. */
    SP_ASM_SQR(h, l, a->dp[0]);
15203
20.5M
    t[0] = h;
15204
20.5M
    h = 0;
15205
20.5M
    /* Column 1 */
    SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[1]);
15206
20.5M
    t[1] = l;
15207
20.5M
    /* Rotate the accumulator down one word for the next column. */
    l = h;
15208
20.5M
    h = o;
15209
20.5M
    o = 0;
15210
20.5M
    /* Column 2 */
    SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[2]);
15211
20.5M
    SP_ASM_SQR_ADD(l, h, o, a->dp[1]);
15212
20.5M
    t[2] = l;
15213
20.5M
    l = h;
15214
20.5M
    h = o;
15215
20.5M
    o = 0;
15216
20.5M
    /* Column 3 */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[0], a->dp[3]);
15217
20.5M
    SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[2]);
15218
20.5M
    t[3] = l;
15219
20.5M
    l = h;
15220
20.5M
    h = o;
15221
20.5M
    o = 0;
15222
20.5M
    /* Column 4: from here on digits are written straight into r->dp. */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[3]);
15223
20.5M
    SP_ASM_SQR_ADD(l, h, o, a->dp[2]);
15224
20.5M
    r->dp[4] = l;
15225
20.5M
    l = h;
15226
20.5M
    h = o;
15227
20.5M
    o = 0;
15228
20.5M
    /* Column 5 */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[2], a->dp[3]);
15229
20.5M
    r->dp[5] = l;
15230
20.5M
    l = h;
15231
20.5M
    h = o;
15232
20.5M
    /* Columns 6 and 7: a->dp[3]^2 is added to l/h; h is the top digit. */
    SP_ASM_SQR_ADD_NO(l, h, a->dp[3]);
15233
20.5M
    r->dp[6] = l;
15234
20.5M
    r->dp[7] = h;
15235
20.5M
    /* Move the buffered low digits 0..3 into r. */
    XMEMCPY(r->dp, t, 4 * sizeof(sp_int_digit));
15236
20.5M
    r->used = 8;
15237
20.5M
    sp_clamp(r);
15238
15239
20.5M
    return MP_OKAY;
15240
20.5M
}
15241
#endif /* SQR_MUL_ASM */
15242
#endif /* SP_WORD_SIZE == 64 */
15243
#if (SP_WORD_SIZE == 64 && SP_INT_BITS >= 384)
15244
#ifdef SQR_MUL_ASM
15245
/* Square a and store in r. r = a * a
15246
 *
15247
 * Comba implementation, fully unrolled for six digits: a->dp[0..5] are read
 * and twelve digits of r are written before clamping. Each "Column k" section
 * below handles the products a->dp[i] * a->dp[j] with i + j == k and emits
 * result digit k.
 * NOTE(review): the SP_ASM_* macros are defined outside this function; the
 * roles of l/h/o and tl/th/to (running and temporary low/high/overflow words)
 * are inferred from usage - confirm against the macro definitions.
15248
 *
15249
 * @param  [in]   a  SP integer to square.
15250
 * @param  [out]  r  SP integer result.
15251
 *
15252
 * @return  MP_OKAY always.
15253
 * MP_MEM is never returned: this variant allocates no dynamic memory.
15254
 */
15255
static int _sp_sqr_6(const sp_int* a, sp_int* r)
15256
6.11M
{
15257
6.11M
    sp_int_digit l = 0;
15258
6.11M
    sp_int_digit h = 0;
15259
6.11M
    sp_int_digit o = 0;
15260
6.11M
    sp_int_digit tl = 0;
15261
6.11M
    sp_int_digit th = 0;
15262
6.11M
    /* Not initialised here; presumably SP_ASM_MUL_SET assigns it before any
     * read - confirm against the macro definition. */
    sp_int_digit to;
15263
6.11M
    /* Result digits 0..5 are buffered here and copied into r->dp last
     * (presumably so that r may alias a - confirm with callers). */
    sp_int_digit t[6];
15264
15265
    /* NOTE(review): the enclosing section is guarded by SP_WORD_SIZE == 64,
     * so this 32-bit condition looks unreachable here - confirm. */
#if defined(WOLFSSL_SP_ARM_THUMB) && SP_WORD_SIZE == 32
15266
    to = 0;
15267
#endif
15268
15269
6.11M
    /* Column 0: a->dp[0]^2. The word left in h is stored as digit 0; l is
     * carried into column 1. */
    SP_ASM_SQR(h, l, a->dp[0]);
15270
6.11M
    t[0] = h;
15271
6.11M
    h = 0;
15272
6.11M
    /* Column 1 */
    SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[1]);
15273
6.11M
    t[1] = l;
15274
6.11M
    /* Rotate the accumulator down one word for the next column. */
    l = h;
15275
6.11M
    h = o;
15276
6.11M
    o = 0;
15277
6.11M
    /* Column 2 */
    SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[2]);
15278
6.11M
    SP_ASM_SQR_ADD(l, h, o, a->dp[1]);
15279
6.11M
    t[2] = l;
15280
6.11M
    l = h;
15281
6.11M
    h = o;
15282
6.11M
    o = 0;
15283
6.11M
    /* Column 3 */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[0], a->dp[3]);
15284
6.11M
    SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[2]);
15285
6.11M
    t[3] = l;
15286
6.11M
    l = h;
15287
6.11M
    h = o;
15288
6.11M
    o = 0;
15289
6.11M
    /* Column 4 */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[0], a->dp[4]);
15290
6.11M
    SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[3]);
15291
6.11M
    SP_ASM_SQR_ADD(l, h, o, a->dp[2]);
15292
6.11M
    t[4] = l;
15293
6.11M
    l = h;
15294
6.11M
    h = o;
15295
6.11M
    o = 0;
15296
6.11M
    /* Column 5: the three cross products are gathered in tl/th/to, then
     * folded into l/h/o by SP_ASM_ADD_DBL_3 (presumably doubled, per the
     * macro name - confirm). */
    SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[5]);
15297
6.11M
    SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[4]);
15298
6.11M
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[3]);
15299
6.11M
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15300
6.11M
    t[5] = l;
15301
6.11M
    l = h;
15302
6.11M
    h = o;
15303
6.11M
    o = 0;
15304
6.11M
    /* Column 6: from here on digits are written straight into r->dp. */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[5]);
15305
6.11M
    SP_ASM_MUL_ADD2(l, h, o, a->dp[2], a->dp[4]);
15306
6.11M
    SP_ASM_SQR_ADD(l, h, o, a->dp[3]);
15307
6.11M
    r->dp[6] = l;
15308
6.11M
    l = h;
15309
6.11M
    h = o;
15310
6.11M
    o = 0;
15311
6.11M
    /* Column 7 */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[2], a->dp[5]);
15312
6.11M
    SP_ASM_MUL_ADD2(l, h, o, a->dp[3], a->dp[4]);
15313
6.11M
    r->dp[7] = l;
15314
6.11M
    l = h;
15315
6.11M
    h = o;
15316
6.11M
    o = 0;
15317
6.11M
    /* Column 8 */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[3], a->dp[5]);
15318
6.11M
    SP_ASM_SQR_ADD(l, h, o, a->dp[4]);
15319
6.11M
    r->dp[8] = l;
15320
6.11M
    l = h;
15321
6.11M
    h = o;
15322
6.11M
    o = 0;
15323
6.11M
    /* Column 9 */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[4], a->dp[5]);
15324
6.11M
    r->dp[9] = l;
15325
6.11M
    l = h;
15326
6.11M
    h = o;
15327
6.11M
    /* Columns 10 and 11: a->dp[5]^2 is added to l/h; h is the top digit. */
    SP_ASM_SQR_ADD_NO(l, h, a->dp[5]);
15328
6.11M
    r->dp[10] = l;
15329
6.11M
    r->dp[11] = h;
15330
6.11M
    /* Move the buffered low digits 0..5 into r. */
    XMEMCPY(r->dp, t, 6 * sizeof(sp_int_digit));
15331
6.11M
    r->used = 12;
15332
6.11M
    sp_clamp(r);
15333
15334
6.11M
    return MP_OKAY;
15335
6.11M
}
15336
#endif /* SQR_MUL_ASM */
15337
#endif /* SP_WORD_SIZE == 64 */
15338
#if (SP_WORD_SIZE == 32 && SP_INT_BITS >= 256)
15339
#ifdef SQR_MUL_ASM
15340
/* Square a and store in r. r = a * a
15341
 *
15342
 * Comba implementation, fully unrolled for eight digits: a->dp[0..7] are read
 * and sixteen digits of r are written before clamping. Each "Column k"
 * section below handles the products a->dp[i] * a->dp[j] with i + j == k and
 * emits result digit k.
 * NOTE(review): the SP_ASM_* macros are defined outside this function; the
 * roles of l/h/o and tl/th/to (running and temporary low/high/overflow words)
 * are inferred from usage - confirm against the macro definitions.
15343
 *
15344
 * @param  [in]   a  SP integer to square.
15345
 * @param  [out]  r  SP integer result.
15346
 *
15347
 * @return  MP_OKAY always.
15348
 * MP_MEM is never returned: this variant allocates no dynamic memory.
15349
 */
15350
static int _sp_sqr_8(const sp_int* a, sp_int* r)
15351
{
15352
    sp_int_digit l = 0;
15353
    sp_int_digit h = 0;
15354
    sp_int_digit o = 0;
15355
    sp_int_digit tl = 0;
15356
    sp_int_digit th = 0;
15357
    /* Only pre-set below for ARM Thumb builds; presumably SP_ASM_MUL_SET
     * assigns it before any read - confirm against the macro definition. */
    sp_int_digit to;
15358
    /* Result digits 0..7 are buffered here and copied into r->dp last
     * (presumably so that r may alias a - confirm with callers). */
    sp_int_digit t[8];
15359
15360
#if defined(WOLFSSL_SP_ARM_THUMB) && SP_WORD_SIZE == 32
15361
    to = 0;
15362
#endif
15363
15364
    /* Column 0: a->dp[0]^2. The word left in h is stored as digit 0; l is
     * carried into column 1. */
    SP_ASM_SQR(h, l, a->dp[0]);
15365
    t[0] = h;
15366
    h = 0;
15367
    /* Column 1 */
    SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[1]);
15368
    t[1] = l;
15369
    /* Rotate the accumulator down one word for the next column. */
    l = h;
15370
    h = o;
15371
    o = 0;
15372
    /* Column 2 */
    SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[2]);
15373
    SP_ASM_SQR_ADD(l, h, o, a->dp[1]);
15374
    t[2] = l;
15375
    l = h;
15376
    h = o;
15377
    o = 0;
15378
    /* Column 3 */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[0], a->dp[3]);
15379
    SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[2]);
15380
    t[3] = l;
15381
    l = h;
15382
    h = o;
15383
    o = 0;
15384
    /* Column 4 */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[0], a->dp[4]);
15385
    SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[3]);
15386
    SP_ASM_SQR_ADD(l, h, o, a->dp[2]);
15387
    t[4] = l;
15388
    l = h;
15389
    h = o;
15390
    o = 0;
15391
    /* Column 5: cross products are gathered in tl/th/to, then folded into
     * l/h/o by SP_ASM_ADD_DBL_3 (presumably doubled, per the macro name -
     * confirm). The same pattern is used for columns 6 to 9. */
    SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[5]);
15392
    SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[4]);
15393
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[3]);
15394
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15395
    t[5] = l;
15396
    l = h;
15397
    h = o;
15398
    o = 0;
15399
    /* Column 6 */
    SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[6]);
15400
    SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[5]);
15401
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[4]);
15402
    SP_ASM_SQR_ADD(l, h, o, a->dp[3]);
15403
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15404
    t[6] = l;
15405
    l = h;
15406
    h = o;
15407
    o = 0;
15408
    /* Column 7 */
    SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[7]);
15409
    SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[6]);
15410
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[5]);
15411
    SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[4]);
15412
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15413
    t[7] = l;
15414
    l = h;
15415
    h = o;
15416
    o = 0;
15417
    /* Column 8: from here on digits are written straight into r->dp. */
    SP_ASM_MUL_SET(tl, th, to, a->dp[1], a->dp[7]);
15418
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[6]);
15419
    SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[5]);
15420
    SP_ASM_SQR_ADD(l, h, o, a->dp[4]);
15421
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15422
    r->dp[8] = l;
15423
    l = h;
15424
    h = o;
15425
    o = 0;
15426
    /* Column 9 */
    SP_ASM_MUL_SET(tl, th, to, a->dp[2], a->dp[7]);
15427
    SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[6]);
15428
    SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[5]);
15429
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15430
    r->dp[9] = l;
15431
    l = h;
15432
    h = o;
15433
    o = 0;
15434
    /* Column 10 */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[3], a->dp[7]);
15435
    SP_ASM_MUL_ADD2(l, h, o, a->dp[4], a->dp[6]);
15436
    SP_ASM_SQR_ADD(l, h, o, a->dp[5]);
15437
    r->dp[10] = l;
15438
    l = h;
15439
    h = o;
15440
    o = 0;
15441
    /* Column 11 */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[4], a->dp[7]);
15442
    SP_ASM_MUL_ADD2(l, h, o, a->dp[5], a->dp[6]);
15443
    r->dp[11] = l;
15444
    l = h;
15445
    h = o;
15446
    o = 0;
15447
    /* Column 12 */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[5], a->dp[7]);
15448
    SP_ASM_SQR_ADD(l, h, o, a->dp[6]);
15449
    r->dp[12] = l;
15450
    l = h;
15451
    h = o;
15452
    o = 0;
15453
    /* Column 13 */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[6], a->dp[7]);
15454
    r->dp[13] = l;
15455
    l = h;
15456
    h = o;
15457
    /* Columns 14 and 15: a->dp[7]^2 is added to l/h; h is the top digit. */
    SP_ASM_SQR_ADD_NO(l, h, a->dp[7]);
15458
    r->dp[14] = l;
15459
    r->dp[15] = h;
15460
    /* Move the buffered low digits 0..7 into r. */
    XMEMCPY(r->dp, t, 8 * sizeof(sp_int_digit));
15461
    r->used = 16;
15462
    sp_clamp(r);
15463
15464
    return MP_OKAY;
15465
}
15466
#endif /* SQR_MUL_ASM */
15467
#endif /* SP_WORD_SIZE == 32 */
15468
#if (SP_WORD_SIZE == 32 && SP_INT_BITS >= 384)
15469
#ifdef SQR_MUL_ASM
15470
/* Square a and store in r. r = a * a
15471
 *
15472
 * Comba implementation, fully unrolled for twelve digits: a->dp[0..11] are
 * read and twenty-four digits of r are written before clamping. Each
 * "Column k" section below handles the products a->dp[i] * a->dp[j] with
 * i + j == k and emits result digit k.
 * NOTE(review): the SP_ASM_* macros are defined outside this function; the
 * roles of l/h/o and tl/th/to (running and temporary low/high/overflow words)
 * are inferred from usage - confirm against the macro definitions.
15473
 *
15474
 * @param  [in]   a  SP integer to square.
15475
 * @param  [out]  r  SP integer result.
15476
 *
15477
 * @return  MP_OKAY always.
15478
 * MP_MEM is never returned: this variant allocates no dynamic memory.
15479
 */
15480
static int _sp_sqr_12(const sp_int* a, sp_int* r)
15481
{
15482
    sp_int_digit l = 0;
15483
    sp_int_digit h = 0;
15484
    sp_int_digit o = 0;
15485
    sp_int_digit tl = 0;
15486
    sp_int_digit th = 0;
15487
    /* Only pre-set below for ARM Thumb builds; presumably SP_ASM_MUL_SET
     * assigns it before any read - confirm against the macro definition. */
    sp_int_digit to;
15488
    /* Result digits 0..11 are buffered here and copied into r->dp last
     * (presumably so that r may alias a - confirm with callers). */
    sp_int_digit t[12];
15489
15490
#if defined(WOLFSSL_SP_ARM_THUMB) && SP_WORD_SIZE == 32
15491
    to = 0;
15492
#endif
15493
15494
    /* Column 0: a->dp[0]^2. The word left in h is stored as digit 0; l is
     * carried into column 1. */
    SP_ASM_SQR(h, l, a->dp[0]);
15495
    t[0] = h;
15496
    h = 0;
15497
    /* Column 1 */
    SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[1]);
15498
    t[1] = l;
15499
    /* Rotate the accumulator down one word for the next column. */
    l = h;
15500
    h = o;
15501
    o = 0;
15502
    /* Column 2 */
    SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[2]);
15503
    SP_ASM_SQR_ADD(l, h, o, a->dp[1]);
15504
    t[2] = l;
15505
    l = h;
15506
    h = o;
15507
    o = 0;
15508
    /* Column 3 */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[0], a->dp[3]);
15509
    SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[2]);
15510
    t[3] = l;
15511
    l = h;
15512
    h = o;
15513
    o = 0;
15514
    /* Column 4 */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[0], a->dp[4]);
15515
    SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[3]);
15516
    SP_ASM_SQR_ADD(l, h, o, a->dp[2]);
15517
    t[4] = l;
15518
    l = h;
15519
    h = o;
15520
    o = 0;
15521
    /* Column 5: cross products are gathered in tl/th/to, then folded into
     * l/h/o by SP_ASM_ADD_DBL_3 (presumably doubled, per the macro name -
     * confirm). The same pattern is used for columns 6 to 17. */
    SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[5]);
15522
    SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[4]);
15523
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[3]);
15524
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15525
    t[5] = l;
15526
    l = h;
15527
    h = o;
15528
    o = 0;
15529
    /* Column 6 */
    SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[6]);
15530
    SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[5]);
15531
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[4]);
15532
    SP_ASM_SQR_ADD(l, h, o, a->dp[3]);
15533
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15534
    t[6] = l;
15535
    l = h;
15536
    h = o;
15537
    o = 0;
15538
    /* Column 7 */
    SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[7]);
15539
    SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[6]);
15540
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[5]);
15541
    SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[4]);
15542
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15543
    t[7] = l;
15544
    l = h;
15545
    h = o;
15546
    o = 0;
15547
    /* Column 8 */
    SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[8]);
15548
    SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[7]);
15549
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[6]);
15550
    SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[5]);
15551
    SP_ASM_SQR_ADD(l, h, o, a->dp[4]);
15552
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15553
    t[8] = l;
15554
    l = h;
15555
    h = o;
15556
    o = 0;
15557
    /* Column 9 */
    SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[9]);
15558
    SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[8]);
15559
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[7]);
15560
    SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[6]);
15561
    SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[5]);
15562
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15563
    t[9] = l;
15564
    l = h;
15565
    h = o;
15566
    o = 0;
15567
    /* Column 10 */
    SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[10]);
15568
    SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[9]);
15569
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[8]);
15570
    SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[7]);
15571
    SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[6]);
15572
    SP_ASM_SQR_ADD(l, h, o, a->dp[5]);
15573
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15574
    t[10] = l;
15575
    l = h;
15576
    h = o;
15577
    o = 0;
15578
    /* Column 11 */
    SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[11]);
15579
    SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[10]);
15580
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[9]);
15581
    SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[8]);
15582
    SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[7]);
15583
    SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[6]);
15584
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15585
    t[11] = l;
15586
    l = h;
15587
    h = o;
15588
    o = 0;
15589
    /* Column 12: from here on digits are written straight into r->dp. */
    SP_ASM_MUL_SET(tl, th, to, a->dp[1], a->dp[11]);
15590
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[10]);
15591
    SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[9]);
15592
    SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[8]);
15593
    SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[7]);
15594
    SP_ASM_SQR_ADD(l, h, o, a->dp[6]);
15595
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15596
    r->dp[12] = l;
15597
    l = h;
15598
    h = o;
15599
    o = 0;
15600
    /* Column 13 */
    SP_ASM_MUL_SET(tl, th, to, a->dp[2], a->dp[11]);
15601
    SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[10]);
15602
    SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[9]);
15603
    SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[8]);
15604
    SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[7]);
15605
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15606
    r->dp[13] = l;
15607
    l = h;
15608
    h = o;
15609
    o = 0;
15610
    /* Column 14 */
    SP_ASM_MUL_SET(tl, th, to, a->dp[3], a->dp[11]);
15611
    SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[10]);
15612
    SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[9]);
15613
    SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[8]);
15614
    SP_ASM_SQR_ADD(l, h, o, a->dp[7]);
15615
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15616
    r->dp[14] = l;
15617
    l = h;
15618
    h = o;
15619
    o = 0;
15620
    /* Column 15 */
    SP_ASM_MUL_SET(tl, th, to, a->dp[4], a->dp[11]);
15621
    SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[10]);
15622
    SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[9]);
15623
    SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[8]);
15624
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15625
    r->dp[15] = l;
15626
    l = h;
15627
    h = o;
15628
    o = 0;
15629
    /* Column 16 */
    SP_ASM_MUL_SET(tl, th, to, a->dp[5], a->dp[11]);
15630
    SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[10]);
15631
    SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[9]);
15632
    SP_ASM_SQR_ADD(l, h, o, a->dp[8]);
15633
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15634
    r->dp[16] = l;
15635
    l = h;
15636
    h = o;
15637
    o = 0;
15638
    /* Column 17 */
    SP_ASM_MUL_SET(tl, th, to, a->dp[6], a->dp[11]);
15639
    SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[10]);
15640
    SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[9]);
15641
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15642
    r->dp[17] = l;
15643
    l = h;
15644
    h = o;
15645
    o = 0;
15646
    /* Column 18 */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[7], a->dp[11]);
15647
    SP_ASM_MUL_ADD2(l, h, o, a->dp[8], a->dp[10]);
15648
    SP_ASM_SQR_ADD(l, h, o, a->dp[9]);
15649
    r->dp[18] = l;
15650
    l = h;
15651
    h = o;
15652
    o = 0;
15653
    /* Column 19 */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[8], a->dp[11]);
15654
    SP_ASM_MUL_ADD2(l, h, o, a->dp[9], a->dp[10]);
15655
    r->dp[19] = l;
15656
    l = h;
15657
    h = o;
15658
    o = 0;
15659
    /* Column 20 */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[9], a->dp[11]);
15660
    SP_ASM_SQR_ADD(l, h, o, a->dp[10]);
15661
    r->dp[20] = l;
15662
    l = h;
15663
    h = o;
15664
    o = 0;
15665
    /* Column 21 */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[10], a->dp[11]);
15666
    r->dp[21] = l;
15667
    l = h;
15668
    h = o;
15669
    /* Columns 22 and 23: a->dp[11]^2 is added to l/h; h is the top digit. */
    SP_ASM_SQR_ADD_NO(l, h, a->dp[11]);
15670
    r->dp[22] = l;
15671
    r->dp[23] = h;
15672
    /* Move the buffered low digits 0..11 into r. */
    XMEMCPY(r->dp, t, 12 * sizeof(sp_int_digit));
15673
    r->used = 24;
15674
    sp_clamp(r);
15675
15676
    return MP_OKAY;
15677
}
15678
#endif /* SQR_MUL_ASM */
15679
#endif /* SP_WORD_SIZE == 32 */
15680
#endif /* !WOLFSSL_HAVE_SP_ECC && HAVE_ECC */
15681
15682
#if defined(SQR_MUL_ASM) && (defined(WOLFSSL_SP_INT_LARGE_COMBA) || \
15683
    (!defined(WOLFSSL_SP_MATH) && defined(WOLFCRYPT_HAVE_SAKKE) && \
15684
    (SP_WORD_SIZE == 64)))
15685
    #if SP_INT_DIGITS >= 32
15686
/* Square a and store in r. r = a * a
15687
 *
15688
 * Comba implementation, fully unrolled for sixteen digits: a->dp[0..15] are
 * read and thirty-two digits of r are written before clamping. Each
 * "Column k" section below handles the products a->dp[i] * a->dp[j] with
 * i + j == k and emits result digit k.
 * NOTE(review): the SP_ASM_* macros are defined outside this function; the
 * roles of l/h/o and tl/th/to (running and temporary low/high/overflow words)
 * are inferred from usage - confirm against the macro definitions.
15689
 *
15690
 * @param  [in]   a  SP integer to square.
15691
 * @param  [out]  r  SP integer result.
15692
 *
15693
 * @return  MP_OKAY on success.
15694
 * @return  MP_MEM when dynamic memory allocation fails (only possible when
 *          WOLFSSL_SMALL_STACK is defined and WOLFSSL_SP_NO_MALLOC is not).
15695
 */
15696
static int _sp_sqr_16(const sp_int* a, sp_int* r)
15697
{
15698
    int err = MP_OKAY;
15699
    sp_int_digit l = 0;
15700
    sp_int_digit h = 0;
15701
    sp_int_digit o = 0;
15702
    sp_int_digit tl = 0;
15703
    sp_int_digit th = 0;
15704
    /* Only pre-set below for ARM Thumb 32-bit builds; presumably
     * SP_ASM_MUL_SET assigns it before any read - confirm. */
    sp_int_digit to;
15705
    /* t buffers result digits 0..15 until they are copied into r->dp last
     * (presumably so that r may alias a - confirm with callers). */
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
15706
    sp_int_digit* t = NULL;
15707
#else
15708
    sp_int_digit t[16];
15709
#endif
15710
15711
#if defined(WOLFSSL_SP_ARM_THUMB) && SP_WORD_SIZE == 32
15712
    to = 0;
15713
#endif
15714
15715
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
15716
     t = (sp_int_digit*)XMALLOC(sizeof(sp_int_digit) * 16, NULL,
15717
         DYNAMIC_TYPE_BIGINT);
15718
     if (t == NULL) {
15719
         err = MP_MEM;
15720
     }
15721
#endif
15722
    if (err == MP_OKAY) {
15723
        /* Column 0: a->dp[0]^2. The word left in h is stored as digit 0; l is
         * carried into column 1. */
        SP_ASM_SQR(h, l, a->dp[0]);
15724
        t[0] = h;
15725
        h = 0;
15726
        /* Column 1 */
        SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[1]);
15727
        t[1] = l;
15728
        /* Rotate the accumulator down one word for the next column. */
        l = h;
15729
        h = o;
15730
        o = 0;
15731
        /* Column 2 */
        SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[2]);
15732
        SP_ASM_SQR_ADD(l, h, o, a->dp[1]);
15733
        t[2] = l;
15734
        l = h;
15735
        h = o;
15736
        o = 0;
15737
        /* Column 3 */
        SP_ASM_MUL_ADD2(l, h, o, a->dp[0], a->dp[3]);
15738
        SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[2]);
15739
        t[3] = l;
15740
        l = h;
15741
        h = o;
15742
        o = 0;
15743
        /* Column 4 */
        SP_ASM_MUL_ADD2(l, h, o, a->dp[0], a->dp[4]);
15744
        SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[3]);
15745
        SP_ASM_SQR_ADD(l, h, o, a->dp[2]);
15746
        t[4] = l;
15747
        l = h;
15748
        h = o;
15749
        o = 0;
15750
        /* Column 5: cross products are gathered in tl/th/to, then folded into
         * l/h/o by SP_ASM_ADD_DBL_3 (presumably doubled, per the macro name -
         * confirm). The same pattern is used for columns 6 to 25. */
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[5]);
15751
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[4]);
15752
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[3]);
15753
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15754
        t[5] = l;
15755
        l = h;
15756
        h = o;
15757
        o = 0;
15758
        /* Column 6 */
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[6]);
15759
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[5]);
15760
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[4]);
15761
        SP_ASM_SQR_ADD(l, h, o, a->dp[3]);
15762
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15763
        t[6] = l;
15764
        l = h;
15765
        h = o;
15766
        o = 0;
15767
        /* Column 7 */
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[7]);
15768
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[6]);
15769
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[5]);
15770
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[4]);
15771
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15772
        t[7] = l;
15773
        l = h;
15774
        h = o;
15775
        o = 0;
15776
        /* Column 8 */
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[8]);
15777
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[7]);
15778
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[6]);
15779
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[5]);
15780
        SP_ASM_SQR_ADD(l, h, o, a->dp[4]);
15781
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15782
        t[8] = l;
15783
        l = h;
15784
        h = o;
15785
        o = 0;
15786
        /* Column 9 */
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[9]);
15787
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[8]);
15788
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[7]);
15789
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[6]);
15790
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[5]);
15791
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15792
        t[9] = l;
15793
        l = h;
15794
        h = o;
15795
        o = 0;
15796
        /* Column 10 */
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[10]);
15797
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[9]);
15798
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[8]);
15799
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[7]);
15800
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[6]);
15801
        SP_ASM_SQR_ADD(l, h, o, a->dp[5]);
15802
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15803
        t[10] = l;
15804
        l = h;
15805
        h = o;
15806
        o = 0;
15807
        /* Column 11 */
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[11]);
15808
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[10]);
15809
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[9]);
15810
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[8]);
15811
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[7]);
15812
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[6]);
15813
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15814
        t[11] = l;
15815
        l = h;
15816
        h = o;
15817
        o = 0;
15818
        /* Column 12 */
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[12]);
15819
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[11]);
15820
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[10]);
15821
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[9]);
15822
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[8]);
15823
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[7]);
15824
        SP_ASM_SQR_ADD(l, h, o, a->dp[6]);
15825
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15826
        t[12] = l;
15827
        l = h;
15828
        h = o;
15829
        o = 0;
15830
        /* Column 13 */
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[13]);
15831
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[12]);
15832
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[11]);
15833
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[10]);
15834
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[9]);
15835
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[8]);
15836
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[7]);
15837
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15838
        t[13] = l;
15839
        l = h;
15840
        h = o;
15841
        o = 0;
15842
        /* Column 14 */
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[14]);
15843
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[13]);
15844
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[12]);
15845
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[11]);
15846
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[10]);
15847
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[9]);
15848
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[8]);
15849
        SP_ASM_SQR_ADD(l, h, o, a->dp[7]);
15850
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15851
        t[14] = l;
15852
        l = h;
15853
        h = o;
15854
        o = 0;
15855
        /* Column 15 */
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[15]);
15856
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[14]);
15857
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[13]);
15858
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[12]);
15859
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[11]);
15860
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[10]);
15861
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[9]);
15862
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[8]);
15863
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15864
        t[15] = l;
15865
        l = h;
15866
        h = o;
15867
        o = 0;
15868
        /* Column 16: from here on digits are written straight into r->dp. */
        SP_ASM_MUL_SET(tl, th, to, a->dp[1], a->dp[15]);
15869
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[14]);
15870
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[13]);
15871
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[12]);
15872
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[11]);
15873
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[10]);
15874
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[9]);
15875
        SP_ASM_SQR_ADD(l, h, o, a->dp[8]);
15876
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15877
        r->dp[16] = l;
15878
        l = h;
15879
        h = o;
15880
        o = 0;
15881
        /* Column 17 */
        SP_ASM_MUL_SET(tl, th, to, a->dp[2], a->dp[15]);
15882
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[14]);
15883
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[13]);
15884
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[12]);
15885
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[11]);
15886
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[10]);
15887
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[9]);
15888
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15889
        r->dp[17] = l;
15890
        l = h;
15891
        h = o;
15892
        o = 0;
15893
        /* Column 18 */
        SP_ASM_MUL_SET(tl, th, to, a->dp[3], a->dp[15]);
15894
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[14]);
15895
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[13]);
15896
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[12]);
15897
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[11]);
15898
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[10]);
15899
        SP_ASM_SQR_ADD(l, h, o, a->dp[9]);
15900
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15901
        r->dp[18] = l;
15902
        l = h;
15903
        h = o;
15904
        o = 0;
15905
        /* Column 19 */
        SP_ASM_MUL_SET(tl, th, to, a->dp[4], a->dp[15]);
15906
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[14]);
15907
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[13]);
15908
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[12]);
15909
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[11]);
15910
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[10]);
15911
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15912
        r->dp[19] = l;
15913
        l = h;
15914
        h = o;
15915
        o = 0;
15916
        /* Column 20 */
        SP_ASM_MUL_SET(tl, th, to, a->dp[5], a->dp[15]);
15917
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[14]);
15918
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[13]);
15919
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[12]);
15920
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[11]);
15921
        SP_ASM_SQR_ADD(l, h, o, a->dp[10]);
15922
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15923
        r->dp[20] = l;
15924
        l = h;
15925
        h = o;
15926
        o = 0;
15927
        /* Column 21 */
        SP_ASM_MUL_SET(tl, th, to, a->dp[6], a->dp[15]);
15928
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[14]);
15929
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[13]);
15930
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[12]);
15931
        SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[11]);
15932
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15933
        r->dp[21] = l;
15934
        l = h;
15935
        h = o;
15936
        o = 0;
15937
        /* Column 22 */
        SP_ASM_MUL_SET(tl, th, to, a->dp[7], a->dp[15]);
15938
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[14]);
15939
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[13]);
15940
        SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[12]);
15941
        SP_ASM_SQR_ADD(l, h, o, a->dp[11]);
15942
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15943
        r->dp[22] = l;
15944
        l = h;
15945
        h = o;
15946
        o = 0;
15947
        /* Column 23 */
        SP_ASM_MUL_SET(tl, th, to, a->dp[8], a->dp[15]);
15948
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[14]);
15949
        SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[13]);
15950
        SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[12]);
15951
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15952
        r->dp[23] = l;
15953
        l = h;
15954
        h = o;
15955
        o = 0;
15956
        /* Column 24 */
        SP_ASM_MUL_SET(tl, th, to, a->dp[9], a->dp[15]);
15957
        SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[14]);
15958
        SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[13]);
15959
        SP_ASM_SQR_ADD(l, h, o, a->dp[12]);
15960
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15961
        r->dp[24] = l;
15962
        l = h;
15963
        h = o;
15964
        o = 0;
15965
        /* Column 25 */
        SP_ASM_MUL_SET(tl, th, to, a->dp[10], a->dp[15]);
15966
        SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[14]);
15967
        SP_ASM_MUL_ADD(tl, th, to, a->dp[12], a->dp[13]);
15968
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
15969
        r->dp[25] = l;
15970
        l = h;
15971
        h = o;
15972
        o = 0;
15973
        /* Column 26 */
        SP_ASM_MUL_ADD2(l, h, o, a->dp[11], a->dp[15]);
15974
        SP_ASM_MUL_ADD2(l, h, o, a->dp[12], a->dp[14]);
15975
        SP_ASM_SQR_ADD(l, h, o, a->dp[13]);
15976
        r->dp[26] = l;
15977
        l = h;
15978
        h = o;
15979
        o = 0;
15980
        /* Column 27 */
        SP_ASM_MUL_ADD2(l, h, o, a->dp[12], a->dp[15]);
15981
        SP_ASM_MUL_ADD2(l, h, o, a->dp[13], a->dp[14]);
15982
        r->dp[27] = l;
15983
        l = h;
15984
        h = o;
15985
        o = 0;
15986
        /* Column 28 */
        SP_ASM_MUL_ADD2(l, h, o, a->dp[13], a->dp[15]);
15987
        SP_ASM_SQR_ADD(l, h, o, a->dp[14]);
15988
        r->dp[28] = l;
15989
        l = h;
15990
        h = o;
15991
        o = 0;
15992
        /* Column 29 */
        SP_ASM_MUL_ADD2(l, h, o, a->dp[14], a->dp[15]);
15993
        r->dp[29] = l;
15994
        l = h;
15995
        h = o;
15996
        /* Columns 30 and 31: a->dp[15]^2 is added to l/h; h is the top
         * digit. */
        SP_ASM_SQR_ADD_NO(l, h, a->dp[15]);
15997
        r->dp[30] = l;
15998
        r->dp[31] = h;
15999
        /* Move the buffered low digits 0..15 into r. */
        XMEMCPY(r->dp, t, 16 * sizeof(sp_int_digit));
16000
        r->used = 32;
16001
        sp_clamp(r);
16002
    }
16003
16004
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
16005
    /* Reached on the failure path too, where t is still NULL. */
    XFREE(t, NULL, DYNAMIC_TYPE_BIGINT);
16006
#endif
16007
    return err;
16008
}
16009
    #endif /* SP_INT_DIGITS >= 32 */
16010
#endif /* SQR_MUL_ASM && (WOLFSSL_SP_INT_LARGE_COMBA || !WOLFSSL_SP_MATH &&
16011
        * WOLFCRYPT_HAVE_SAKKE && SP_WORD_SIZE == 64 */
16012
16013
#if defined(SQR_MUL_ASM) && defined(WOLFSSL_SP_INT_LARGE_COMBA)
16014
    #if SP_INT_DIGITS >= 48
16015
/* Square a and store in r. r = a * a
 *
 * Comba implementation, fully unrolled for a 24-digit operand. Result digit
 * (column) k accumulates the digit products a->dp[i] * a->dp[j] with
 * i + j == k; the carry out of one column is the starting value of the next.
 * Each cross product (i != j) is listed once per column; the *_ADD2 and
 * *_ADD_DBL_3 macros (defined elsewhere) are expected to count it twice -
 * NOTE(review): confirm against the macro definitions.
 *
 * Result digits 0..23 are built in the temporary t and copied into r->dp at
 * the end; digits 24..47 are written straight to r->dp.
 * NOTE(review): presumably this is what allows r to be the same object as a,
 * since a->dp[0..23] is read until the last column - confirm with callers.
 *
 * a->dp[0..23] are read without consulting a->used, and r->dp[0..47] are
 * written without a size check, so the caller must guarantee both.
 *
 * @param  [in]   a  SP integer to square.
 * @param  [out]  r  SP integer result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
16025
static int _sp_sqr_24(const sp_int* a, sp_int* r)
16026
{
16027
    int err = MP_OKAY;
16028
    /* l, h, o: running column accumulator (low, high, overflow words). */
    sp_int_digit l = 0;
16029
    sp_int_digit h = 0;
16030
    sp_int_digit o = 0;
16031
    /* tl, th, to: second accumulator that collects a column's cross products
     * before they are merged into l, h, o by SP_ASM_ADD_DBL_3. */
    sp_int_digit tl = 0;
16032
    sp_int_digit th = 0;
16033
    sp_int_digit to;
16034
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
16035
    sp_int_digit* t = NULL;
16036
#else
16037
    sp_int_digit t[24];
16038
#endif
16039
16040
    /* NOTE(review): 'to' is only initialized here for 32-bit ARM Thumb; in
     * other builds it is presumably first assigned by SP_ASM_MUL_SET - confirm
     * against the macro definition. */
#if defined(WOLFSSL_SP_ARM_THUMB) && SP_WORD_SIZE == 32
16041
    to = 0;
16042
#endif
16043
16044
    /* Small-stack builds take the 24-digit scratch buffer from the heap. */
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
16045
     t = (sp_int_digit*)XMALLOC(sizeof(sp_int_digit) * 24, NULL,
16046
         DYNAMIC_TYPE_BIGINT);
16047
     if (t == NULL) {
16048
         err = MP_MEM;
16049
     }
16050
#endif
16051
    if (err == MP_OKAY) {
16052
        /* Column 0. The arguments are (h, l): the word stored as digit 0 is
         * taken from h, and l carries on as the low word of column 1. */
        SP_ASM_SQR(h, l, a->dp[0]);
16053
        t[0] = h;
16054
        h = 0;
16055
        /* Columns 1..4: few products, accumulated directly into l, h, o. */
        SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[1]);
16056
        t[1] = l;
16057
        l = h;
16058
        h = o;
16059
        o = 0;
16060
        SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[2]);
16061
        SP_ASM_SQR_ADD(l, h, o, a->dp[1]);
16062
        t[2] = l;
16063
        l = h;
16064
        h = o;
16065
        o = 0;
16066
        SP_ASM_MUL_ADD2(l, h, o, a->dp[0], a->dp[3]);
16067
        SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[2]);
16068
        t[3] = l;
16069
        l = h;
16070
        h = o;
16071
        o = 0;
16072
        SP_ASM_MUL_ADD2(l, h, o, a->dp[0], a->dp[4]);
16073
        SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[3]);
16074
        SP_ASM_SQR_ADD(l, h, o, a->dp[2]);
16075
        t[4] = l;
16076
        l = h;
16077
        h = o;
16078
        o = 0;
16079
        /* Columns 5..41: cross products are summed once in tl, th, to and
         * merged with SP_ASM_ADD_DBL_3; even columns also add the square of
         * the middle digit directly. */
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[5]);
16080
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[4]);
16081
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[3]);
16082
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16083
        t[5] = l;
16084
        l = h;
16085
        h = o;
16086
        o = 0;
16087
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[6]);
16088
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[5]);
16089
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[4]);
16090
        SP_ASM_SQR_ADD(l, h, o, a->dp[3]);
16091
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16092
        t[6] = l;
16093
        l = h;
16094
        h = o;
16095
        o = 0;
16096
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[7]);
16097
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[6]);
16098
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[5]);
16099
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[4]);
16100
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16101
        t[7] = l;
16102
        l = h;
16103
        h = o;
16104
        o = 0;
16105
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[8]);
16106
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[7]);
16107
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[6]);
16108
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[5]);
16109
        SP_ASM_SQR_ADD(l, h, o, a->dp[4]);
16110
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16111
        t[8] = l;
16112
        l = h;
16113
        h = o;
16114
        o = 0;
16115
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[9]);
16116
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[8]);
16117
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[7]);
16118
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[6]);
16119
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[5]);
16120
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16121
        t[9] = l;
16122
        l = h;
16123
        h = o;
16124
        o = 0;
16125
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[10]);
16126
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[9]);
16127
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[8]);
16128
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[7]);
16129
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[6]);
16130
        SP_ASM_SQR_ADD(l, h, o, a->dp[5]);
16131
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16132
        t[10] = l;
16133
        l = h;
16134
        h = o;
16135
        o = 0;
16136
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[11]);
16137
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[10]);
16138
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[9]);
16139
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[8]);
16140
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[7]);
16141
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[6]);
16142
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16143
        t[11] = l;
16144
        l = h;
16145
        h = o;
16146
        o = 0;
16147
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[12]);
16148
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[11]);
16149
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[10]);
16150
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[9]);
16151
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[8]);
16152
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[7]);
16153
        SP_ASM_SQR_ADD(l, h, o, a->dp[6]);
16154
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16155
        t[12] = l;
16156
        l = h;
16157
        h = o;
16158
        o = 0;
16159
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[13]);
16160
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[12]);
16161
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[11]);
16162
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[10]);
16163
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[9]);
16164
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[8]);
16165
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[7]);
16166
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16167
        t[13] = l;
16168
        l = h;
16169
        h = o;
16170
        o = 0;
16171
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[14]);
16172
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[13]);
16173
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[12]);
16174
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[11]);
16175
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[10]);
16176
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[9]);
16177
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[8]);
16178
        SP_ASM_SQR_ADD(l, h, o, a->dp[7]);
16179
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16180
        t[14] = l;
16181
        l = h;
16182
        h = o;
16183
        o = 0;
16184
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[15]);
16185
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[14]);
16186
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[13]);
16187
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[12]);
16188
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[11]);
16189
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[10]);
16190
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[9]);
16191
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[8]);
16192
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16193
        t[15] = l;
16194
        l = h;
16195
        h = o;
16196
        o = 0;
16197
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[16]);
16198
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[15]);
16199
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[14]);
16200
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[13]);
16201
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[12]);
16202
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[11]);
16203
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[10]);
16204
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[9]);
16205
        SP_ASM_SQR_ADD(l, h, o, a->dp[8]);
16206
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16207
        t[16] = l;
16208
        l = h;
16209
        h = o;
16210
        o = 0;
16211
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[17]);
16212
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[16]);
16213
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[15]);
16214
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[14]);
16215
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[13]);
16216
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[12]);
16217
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[11]);
16218
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[10]);
16219
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[9]);
16220
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16221
        t[17] = l;
16222
        l = h;
16223
        h = o;
16224
        o = 0;
16225
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[18]);
16226
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[17]);
16227
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[16]);
16228
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[15]);
16229
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[14]);
16230
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[13]);
16231
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[12]);
16232
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[11]);
16233
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[10]);
16234
        SP_ASM_SQR_ADD(l, h, o, a->dp[9]);
16235
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16236
        t[18] = l;
16237
        l = h;
16238
        h = o;
16239
        o = 0;
16240
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[19]);
16241
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[18]);
16242
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[17]);
16243
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[16]);
16244
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[15]);
16245
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[14]);
16246
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[13]);
16247
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[12]);
16248
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[11]);
16249
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[10]);
16250
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16251
        t[19] = l;
16252
        l = h;
16253
        h = o;
16254
        o = 0;
16255
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[20]);
16256
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[19]);
16257
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[18]);
16258
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[17]);
16259
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[16]);
16260
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[15]);
16261
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[14]);
16262
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[13]);
16263
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[12]);
16264
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[11]);
16265
        SP_ASM_SQR_ADD(l, h, o, a->dp[10]);
16266
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16267
        t[20] = l;
16268
        l = h;
16269
        h = o;
16270
        o = 0;
16271
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[21]);
16272
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[20]);
16273
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[19]);
16274
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[18]);
16275
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[17]);
16276
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[16]);
16277
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[15]);
16278
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[14]);
16279
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[13]);
16280
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[12]);
16281
        SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[11]);
16282
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16283
        t[21] = l;
16284
        l = h;
16285
        h = o;
16286
        o = 0;
16287
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[22]);
16288
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[21]);
16289
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[20]);
16290
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[19]);
16291
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[18]);
16292
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[17]);
16293
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[16]);
16294
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[15]);
16295
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[14]);
16296
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[13]);
16297
        SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[12]);
16298
        SP_ASM_SQR_ADD(l, h, o, a->dp[11]);
16299
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16300
        t[22] = l;
16301
        l = h;
16302
        h = o;
16303
        o = 0;
16304
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[23]);
16305
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[22]);
16306
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[21]);
16307
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[20]);
16308
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[19]);
16309
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[18]);
16310
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[17]);
16311
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[16]);
16312
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[15]);
16313
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[14]);
16314
        SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[13]);
16315
        SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[12]);
16316
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16317
        t[23] = l;
16318
        l = h;
16319
        h = o;
16320
        o = 0;
16321
        /* From column 24 on, result digits are stored directly in r->dp
         * (indices 24..47) instead of the temporary t. */
        SP_ASM_MUL_SET(tl, th, to, a->dp[1], a->dp[23]);
16322
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[22]);
16323
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[21]);
16324
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[20]);
16325
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[19]);
16326
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[18]);
16327
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[17]);
16328
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[16]);
16329
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[15]);
16330
        SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[14]);
16331
        SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[13]);
16332
        SP_ASM_SQR_ADD(l, h, o, a->dp[12]);
16333
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16334
        r->dp[24] = l;
16335
        l = h;
16336
        h = o;
16337
        o = 0;
16338
        SP_ASM_MUL_SET(tl, th, to, a->dp[2], a->dp[23]);
16339
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[22]);
16340
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[21]);
16341
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[20]);
16342
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[19]);
16343
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[18]);
16344
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[17]);
16345
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[16]);
16346
        SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[15]);
16347
        SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[14]);
16348
        SP_ASM_MUL_ADD(tl, th, to, a->dp[12], a->dp[13]);
16349
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16350
        r->dp[25] = l;
16351
        l = h;
16352
        h = o;
16353
        o = 0;
16354
        SP_ASM_MUL_SET(tl, th, to, a->dp[3], a->dp[23]);
16355
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[22]);
16356
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[21]);
16357
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[20]);
16358
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[19]);
16359
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[18]);
16360
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[17]);
16361
        SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[16]);
16362
        SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[15]);
16363
        SP_ASM_MUL_ADD(tl, th, to, a->dp[12], a->dp[14]);
16364
        SP_ASM_SQR_ADD(l, h, o, a->dp[13]);
16365
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16366
        r->dp[26] = l;
16367
        l = h;
16368
        h = o;
16369
        o = 0;
16370
        SP_ASM_MUL_SET(tl, th, to, a->dp[4], a->dp[23]);
16371
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[22]);
16372
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[21]);
16373
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[20]);
16374
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[19]);
16375
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[18]);
16376
        SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[17]);
16377
        SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[16]);
16378
        SP_ASM_MUL_ADD(tl, th, to, a->dp[12], a->dp[15]);
16379
        SP_ASM_MUL_ADD(tl, th, to, a->dp[13], a->dp[14]);
16380
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16381
        r->dp[27] = l;
16382
        l = h;
16383
        h = o;
16384
        o = 0;
16385
        SP_ASM_MUL_SET(tl, th, to, a->dp[5], a->dp[23]);
16386
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[22]);
16387
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[21]);
16388
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[20]);
16389
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[19]);
16390
        SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[18]);
16391
        SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[17]);
16392
        SP_ASM_MUL_ADD(tl, th, to, a->dp[12], a->dp[16]);
16393
        SP_ASM_MUL_ADD(tl, th, to, a->dp[13], a->dp[15]);
16394
        SP_ASM_SQR_ADD(l, h, o, a->dp[14]);
16395
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16396
        r->dp[28] = l;
16397
        l = h;
16398
        h = o;
16399
        o = 0;
16400
        SP_ASM_MUL_SET(tl, th, to, a->dp[6], a->dp[23]);
16401
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[22]);
16402
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[21]);
16403
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[20]);
16404
        SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[19]);
16405
        SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[18]);
16406
        SP_ASM_MUL_ADD(tl, th, to, a->dp[12], a->dp[17]);
16407
        SP_ASM_MUL_ADD(tl, th, to, a->dp[13], a->dp[16]);
16408
        SP_ASM_MUL_ADD(tl, th, to, a->dp[14], a->dp[15]);
16409
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16410
        r->dp[29] = l;
16411
        l = h;
16412
        h = o;
16413
        o = 0;
16414
        SP_ASM_MUL_SET(tl, th, to, a->dp[7], a->dp[23]);
16415
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[22]);
16416
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[21]);
16417
        SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[20]);
16418
        SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[19]);
16419
        SP_ASM_MUL_ADD(tl, th, to, a->dp[12], a->dp[18]);
16420
        SP_ASM_MUL_ADD(tl, th, to, a->dp[13], a->dp[17]);
16421
        SP_ASM_MUL_ADD(tl, th, to, a->dp[14], a->dp[16]);
16422
        SP_ASM_SQR_ADD(l, h, o, a->dp[15]);
16423
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16424
        r->dp[30] = l;
16425
        l = h;
16426
        h = o;
16427
        o = 0;
16428
        SP_ASM_MUL_SET(tl, th, to, a->dp[8], a->dp[23]);
16429
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[22]);
16430
        SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[21]);
16431
        SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[20]);
16432
        SP_ASM_MUL_ADD(tl, th, to, a->dp[12], a->dp[19]);
16433
        SP_ASM_MUL_ADD(tl, th, to, a->dp[13], a->dp[18]);
16434
        SP_ASM_MUL_ADD(tl, th, to, a->dp[14], a->dp[17]);
16435
        SP_ASM_MUL_ADD(tl, th, to, a->dp[15], a->dp[16]);
16436
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16437
        r->dp[31] = l;
16438
        l = h;
16439
        h = o;
16440
        o = 0;
16441
        SP_ASM_MUL_SET(tl, th, to, a->dp[9], a->dp[23]);
16442
        SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[22]);
16443
        SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[21]);
16444
        SP_ASM_MUL_ADD(tl, th, to, a->dp[12], a->dp[20]);
16445
        SP_ASM_MUL_ADD(tl, th, to, a->dp[13], a->dp[19]);
16446
        SP_ASM_MUL_ADD(tl, th, to, a->dp[14], a->dp[18]);
16447
        SP_ASM_MUL_ADD(tl, th, to, a->dp[15], a->dp[17]);
16448
        SP_ASM_SQR_ADD(l, h, o, a->dp[16]);
16449
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16450
        r->dp[32] = l;
16451
        l = h;
16452
        h = o;
16453
        o = 0;
16454
        SP_ASM_MUL_SET(tl, th, to, a->dp[10], a->dp[23]);
16455
        SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[22]);
16456
        SP_ASM_MUL_ADD(tl, th, to, a->dp[12], a->dp[21]);
16457
        SP_ASM_MUL_ADD(tl, th, to, a->dp[13], a->dp[20]);
16458
        SP_ASM_MUL_ADD(tl, th, to, a->dp[14], a->dp[19]);
16459
        SP_ASM_MUL_ADD(tl, th, to, a->dp[15], a->dp[18]);
16460
        SP_ASM_MUL_ADD(tl, th, to, a->dp[16], a->dp[17]);
16461
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16462
        r->dp[33] = l;
16463
        l = h;
16464
        h = o;
16465
        o = 0;
16466
        SP_ASM_MUL_SET(tl, th, to, a->dp[11], a->dp[23]);
16467
        SP_ASM_MUL_ADD(tl, th, to, a->dp[12], a->dp[22]);
16468
        SP_ASM_MUL_ADD(tl, th, to, a->dp[13], a->dp[21]);
16469
        SP_ASM_MUL_ADD(tl, th, to, a->dp[14], a->dp[20]);
16470
        SP_ASM_MUL_ADD(tl, th, to, a->dp[15], a->dp[19]);
16471
        SP_ASM_MUL_ADD(tl, th, to, a->dp[16], a->dp[18]);
16472
        SP_ASM_SQR_ADD(l, h, o, a->dp[17]);
16473
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16474
        r->dp[34] = l;
16475
        l = h;
16476
        h = o;
16477
        o = 0;
16478
        SP_ASM_MUL_SET(tl, th, to, a->dp[12], a->dp[23]);
16479
        SP_ASM_MUL_ADD(tl, th, to, a->dp[13], a->dp[22]);
16480
        SP_ASM_MUL_ADD(tl, th, to, a->dp[14], a->dp[21]);
16481
        SP_ASM_MUL_ADD(tl, th, to, a->dp[15], a->dp[20]);
16482
        SP_ASM_MUL_ADD(tl, th, to, a->dp[16], a->dp[19]);
16483
        SP_ASM_MUL_ADD(tl, th, to, a->dp[17], a->dp[18]);
16484
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16485
        r->dp[35] = l;
16486
        l = h;
16487
        h = o;
16488
        o = 0;
16489
        SP_ASM_MUL_SET(tl, th, to, a->dp[13], a->dp[23]);
16490
        SP_ASM_MUL_ADD(tl, th, to, a->dp[14], a->dp[22]);
16491
        SP_ASM_MUL_ADD(tl, th, to, a->dp[15], a->dp[21]);
16492
        SP_ASM_MUL_ADD(tl, th, to, a->dp[16], a->dp[20]);
16493
        SP_ASM_MUL_ADD(tl, th, to, a->dp[17], a->dp[19]);
16494
        SP_ASM_SQR_ADD(l, h, o, a->dp[18]);
16495
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16496
        r->dp[36] = l;
16497
        l = h;
16498
        h = o;
16499
        o = 0;
16500
        SP_ASM_MUL_SET(tl, th, to, a->dp[14], a->dp[23]);
16501
        SP_ASM_MUL_ADD(tl, th, to, a->dp[15], a->dp[22]);
16502
        SP_ASM_MUL_ADD(tl, th, to, a->dp[16], a->dp[21]);
16503
        SP_ASM_MUL_ADD(tl, th, to, a->dp[17], a->dp[20]);
16504
        SP_ASM_MUL_ADD(tl, th, to, a->dp[18], a->dp[19]);
16505
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16506
        r->dp[37] = l;
16507
        l = h;
16508
        h = o;
16509
        o = 0;
16510
        SP_ASM_MUL_SET(tl, th, to, a->dp[15], a->dp[23]);
16511
        SP_ASM_MUL_ADD(tl, th, to, a->dp[16], a->dp[22]);
16512
        SP_ASM_MUL_ADD(tl, th, to, a->dp[17], a->dp[21]);
16513
        SP_ASM_MUL_ADD(tl, th, to, a->dp[18], a->dp[20]);
16514
        SP_ASM_SQR_ADD(l, h, o, a->dp[19]);
16515
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16516
        r->dp[38] = l;
16517
        l = h;
16518
        h = o;
16519
        o = 0;
16520
        SP_ASM_MUL_SET(tl, th, to, a->dp[16], a->dp[23]);
16521
        SP_ASM_MUL_ADD(tl, th, to, a->dp[17], a->dp[22]);
16522
        SP_ASM_MUL_ADD(tl, th, to, a->dp[18], a->dp[21]);
16523
        SP_ASM_MUL_ADD(tl, th, to, a->dp[19], a->dp[20]);
16524
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16525
        r->dp[39] = l;
16526
        l = h;
16527
        h = o;
16528
        o = 0;
16529
        SP_ASM_MUL_SET(tl, th, to, a->dp[17], a->dp[23]);
16530
        SP_ASM_MUL_ADD(tl, th, to, a->dp[18], a->dp[22]);
16531
        SP_ASM_MUL_ADD(tl, th, to, a->dp[19], a->dp[21]);
16532
        SP_ASM_SQR_ADD(l, h, o, a->dp[20]);
16533
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16534
        r->dp[40] = l;
16535
        l = h;
16536
        h = o;
16537
        o = 0;
16538
        SP_ASM_MUL_SET(tl, th, to, a->dp[18], a->dp[23]);
16539
        SP_ASM_MUL_ADD(tl, th, to, a->dp[19], a->dp[22]);
16540
        SP_ASM_MUL_ADD(tl, th, to, a->dp[20], a->dp[21]);
16541
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
16542
        r->dp[41] = l;
16543
        l = h;
16544
        h = o;
16545
        o = 0;
16546
        /* Columns 42..45: few products again, accumulated directly with
         * SP_ASM_MUL_ADD2 rather than through tl, th, to. */
        SP_ASM_MUL_ADD2(l, h, o, a->dp[19], a->dp[23]);
16547
        SP_ASM_MUL_ADD2(l, h, o, a->dp[20], a->dp[22]);
16548
        SP_ASM_SQR_ADD(l, h, o, a->dp[21]);
16549
        r->dp[42] = l;
16550
        l = h;
16551
        h = o;
16552
        o = 0;
16553
        SP_ASM_MUL_ADD2(l, h, o, a->dp[20], a->dp[23]);
16554
        SP_ASM_MUL_ADD2(l, h, o, a->dp[21], a->dp[22]);
16555
        r->dp[43] = l;
16556
        l = h;
16557
        h = o;
16558
        o = 0;
16559
        SP_ASM_MUL_ADD2(l, h, o, a->dp[21], a->dp[23]);
16560
        SP_ASM_SQR_ADD(l, h, o, a->dp[22]);
16561
        r->dp[44] = l;
16562
        l = h;
16563
        h = o;
16564
        o = 0;
16565
        SP_ASM_MUL_ADD2(l, h, o, a->dp[22], a->dp[23]);
16566
        r->dp[45] = l;
16567
        l = h;
16568
        h = o;
16569
        /* Last column: the square of the top digit supplies digits 46 and 47;
         * no overflow word is tracked for it. */
        SP_ASM_SQR_ADD_NO(l, h, a->dp[23]);
16570
        r->dp[46] = l;
16571
        r->dp[47] = h;
16572
        /* Only now move the buffered low 24 digits into the result. */
        XMEMCPY(r->dp, t, 24 * sizeof(sp_int_digit));
16573
        r->used = 48;
16574
        /* sp_clamp is defined elsewhere; presumably it lowers r->used past
         * leading zero digits. */
        sp_clamp(r);
16575
    }
16576
16577
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
16578
    XFREE(t, NULL, DYNAMIC_TYPE_BIGINT);
16579
#endif
16580
    return err;
16581
}
16582
    #endif /* SP_INT_DIGITS >= 48 */
16583
16584
    #if SP_INT_DIGITS >= 64
16585
/* Square a and store in r. r = a * a
16586
 *
16587
 * Karatsuba implementation.
16588
 *
16589
 * @param  [in]   a  SP integer to square.
16590
 * @param  [out]  r  SP integer result.
16591
 *
16592
 * @return  MP_OKAY on success.
16593
 * @return  MP_MEM when dynamic memory allocation fails.
16594
 */
16595
static int _sp_sqr_32(const sp_int* a, sp_int* r)
16596
{
16597
    int err = MP_OKAY;
16598
    unsigned int i;
16599
    sp_int_digit l;
16600
    sp_int_digit h;
16601
    sp_int* z0;
16602
    sp_int* z1;
16603
    sp_int* z2;
16604
    sp_int_digit ca;
16605
    DECL_SP_INT(a1, 16);
16606
    DECL_SP_INT_ARRAY(z, 33, 2);
16607
16608
    ALLOC_SP_INT(a1, 16, err, NULL);
16609
    ALLOC_SP_INT_ARRAY(z, 33, 2, err, NULL);
16610
    if (err == MP_OKAY) {
16611
        z1 = z[0];
16612
        z2 = z[1];
16613
        z0 = r;
16614
16615
        XMEMCPY(a1->dp, &a->dp[16], sizeof(sp_int_digit) * 16);
16616
        a1->used = 16;
16617
16618
        /* z2 = a1 ^ 2 */
16619
        err = _sp_sqr_16(a1, z2);
16620
    }
16621
    if (err == MP_OKAY) {
16622
        l = 0;
16623
        h = 0;
16624
        for (i = 0; i < 16; i++) {
16625
            SP_ASM_ADDC(l, h, a1->dp[i]);
16626
            SP_ASM_ADDC(l, h, a->dp[i]);
16627
            a1->dp[i] = l;
16628
            l = h;
16629
            h = 0;
16630
        }
16631
        ca = l;
16632
16633
        /* z0 = a0 ^ 2 */
16634
        err = _sp_sqr_16(a, z0);
16635
    }
16636
    if (err == MP_OKAY) {
16637
        /* z1 = (a0 + a1) ^ 2 */
16638
        err = _sp_sqr_16(a1, z1);
16639
    }
16640
    if (err == MP_OKAY) {
16641
        /* r = (z2 << 32) + (z1 - z0 - z2) << 16) + z0 */
16642
        /* r = z0 */
16643
        /* r += (z1 - z0 - z2) << 16 */
16644
        z1->dp[32] = ca;
16645
        l = 0;
16646
        if (ca) {
16647
            l = z1->dp[0 + 16];
16648
            h = 0;
16649
            SP_ASM_ADDC(l, h, a1->dp[0]);
16650
            SP_ASM_ADDC(l, h, a1->dp[0]);
16651
            z1->dp[0 + 16] = l;
16652
            l = h;
16653
            h = 0;
16654
            for (i = 1; i < 16; i++) {
16655
                SP_ASM_ADDC(l, h, z1->dp[i + 16]);
16656
                SP_ASM_ADDC(l, h, a1->dp[i]);
16657
                SP_ASM_ADDC(l, h, a1->dp[i]);
16658
                z1->dp[i + 16] = l;
16659
                l = h;
16660
                h = 0;
16661
            }
16662
        }
16663
        z1->dp[32] += l;
16664
        /* z1 = z1 - z0 - z1 */
16665
        l = z1->dp[0];
16666
        h = 0;
16667
        SP_ASM_SUBB(l, h, z0->dp[0]);
16668
        SP_ASM_SUBB(l, h, z2->dp[0]);
16669
        z1->dp[0] = l;
16670
        l = h;
16671
        h = 0;
16672
        for (i = 1; i < 32; i++) {
16673
            l += z1->dp[i];
16674
            SP_ASM_SUBB(l, h, z0->dp[i]);
16675
            SP_ASM_SUBB(l, h, z2->dp[i]);
16676
            z1->dp[i] = l;
16677
            l = h;
16678
            h = 0;
16679
        }
16680
        z1->dp[i] += l;
16681
        /* r += z1 << 16 */
16682
        l = 0;
16683
        h = 0;
16684
        for (i = 0; i < 16; i++) {
16685
            SP_ASM_ADDC(l, h, r->dp[i + 16]);
16686
            SP_ASM_ADDC(l, h, z1->dp[i]);
16687
            r->dp[i + 16] = l;
16688
            l = h;
16689
            h = 0;
16690
        }
16691
        for (; i < 33; i++) {
16692
            SP_ASM_ADDC(l, h, z1->dp[i]);
16693
            r->dp[i + 16] = l;
16694
            l = h;
16695
            h = 0;
16696
        }
16697
        /* r += z2 << 32  */
16698
        l = 0;
16699
        h = 0;
16700
        for (i = 0; i < 17; i++) {
16701
            SP_ASM_ADDC(l, h, r->dp[i + 32]);
16702
            SP_ASM_ADDC(l, h, z2->dp[i]);
16703
            r->dp[i + 32] = l;
16704
            l = h;
16705
            h = 0;
16706
        }
16707
        for (; i < 32; i++) {
16708
            SP_ASM_ADDC(l, h, z2->dp[i]);
16709
            r->dp[i + 32] = l;
16710
            l = h;
16711
            h = 0;
16712
        }
16713
        r->used = 64;
16714
        sp_clamp(r);
16715
    }
16716
16717
    FREE_SP_INT_ARRAY(z, NULL);
16718
    FREE_SP_INT(a1, NULL);
16719
    return err;
16720
}
16721
    #endif /* SP_INT_DIGITS >= 64 */
16722
16723
    #if SP_INT_DIGITS >= 96
16724
/* Square a and store in r. r = a * a
16725
 *
16726
 * Karatsuba implementation.
16727
 *
16728
 * @param  [in]   a  SP integer to square.
16729
 * @param  [out]  r  SP integer result.
16730
 *
16731
 * @return  MP_OKAY on success.
16732
 * @return  MP_MEM when dynamic memory allocation fails.
16733
 */
16734
static int _sp_sqr_48(const sp_int* a, sp_int* r)
16735
{
16736
    int err = MP_OKAY;
16737
    unsigned int i;
16738
    sp_int_digit l;
16739
    sp_int_digit h;
16740
    sp_int* z0;
16741
    sp_int* z1;
16742
    sp_int* z2;
16743
    sp_int_digit ca;
16744
    DECL_SP_INT(a1, 24);
16745
    DECL_SP_INT_ARRAY(z, 49, 2);
16746
16747
    ALLOC_SP_INT(a1, 24, err, NULL);
16748
    ALLOC_SP_INT_ARRAY(z, 49, 2, err, NULL);
16749
    if (err == MP_OKAY) {
16750
        z1 = z[0];
16751
        z2 = z[1];
16752
        z0 = r;
16753
16754
        XMEMCPY(a1->dp, &a->dp[24], sizeof(sp_int_digit) * 24);
16755
        a1->used = 24;
16756
16757
        /* z2 = a1 ^ 2 */
16758
        err = _sp_sqr_24(a1, z2);
16759
    }
16760
    if (err == MP_OKAY) {
16761
        l = 0;
16762
        h = 0;
16763
        for (i = 0; i < 24; i++) {
16764
            SP_ASM_ADDC(l, h, a1->dp[i]);
16765
            SP_ASM_ADDC(l, h, a->dp[i]);
16766
            a1->dp[i] = l;
16767
            l = h;
16768
            h = 0;
16769
        }
16770
        ca = l;
16771
16772
        /* z0 = a0 ^ 2 */
16773
        err = _sp_sqr_24(a, z0);
16774
    }
16775
    if (err == MP_OKAY) {
16776
        /* z1 = (a0 + a1) ^ 2 */
16777
        err = _sp_sqr_24(a1, z1);
16778
    }
16779
    if (err == MP_OKAY) {
16780
        /* r = (z2 << 48) + (z1 - z0 - z2) << 24) + z0 */
16781
        /* r = z0 */
16782
        /* r += (z1 - z0 - z2) << 24 */
16783
        z1->dp[48] = ca;
16784
        l = 0;
16785
        if (ca) {
16786
            l = z1->dp[0 + 24];
16787
            h = 0;
16788
            SP_ASM_ADDC(l, h, a1->dp[0]);
16789
            SP_ASM_ADDC(l, h, a1->dp[0]);
16790
            z1->dp[0 + 24] = l;
16791
            l = h;
16792
            h = 0;
16793
            for (i = 1; i < 24; i++) {
16794
                SP_ASM_ADDC(l, h, z1->dp[i + 24]);
16795
                SP_ASM_ADDC(l, h, a1->dp[i]);
16796
                SP_ASM_ADDC(l, h, a1->dp[i]);
16797
                z1->dp[i + 24] = l;
16798
                l = h;
16799
                h = 0;
16800
            }
16801
        }
16802
        z1->dp[48] += l;
16803
        /* z1 = z1 - z0 - z1 */
16804
        l = z1->dp[0];
16805
        h = 0;
16806
        SP_ASM_SUBB(l, h, z0->dp[0]);
16807
        SP_ASM_SUBB(l, h, z2->dp[0]);
16808
        z1->dp[0] = l;
16809
        l = h;
16810
        h = 0;
16811
        for (i = 1; i < 48; i++) {
16812
            l += z1->dp[i];
16813
            SP_ASM_SUBB(l, h, z0->dp[i]);
16814
            SP_ASM_SUBB(l, h, z2->dp[i]);
16815
            z1->dp[i] = l;
16816
            l = h;
16817
            h = 0;
16818
        }
16819
        z1->dp[i] += l;
16820
        /* r += z1 << 16 */
16821
        l = 0;
16822
        h = 0;
16823
        for (i = 0; i < 24; i++) {
16824
            SP_ASM_ADDC(l, h, r->dp[i + 24]);
16825
            SP_ASM_ADDC(l, h, z1->dp[i]);
16826
            r->dp[i + 24] = l;
16827
            l = h;
16828
            h = 0;
16829
        }
16830
        for (; i < 49; i++) {
16831
            SP_ASM_ADDC(l, h, z1->dp[i]);
16832
            r->dp[i + 24] = l;
16833
            l = h;
16834
            h = 0;
16835
        }
16836
        /* r += z2 << 48  */
16837
        l = 0;
16838
        h = 0;
16839
        for (i = 0; i < 25; i++) {
16840
            SP_ASM_ADDC(l, h, r->dp[i + 48]);
16841
            SP_ASM_ADDC(l, h, z2->dp[i]);
16842
            r->dp[i + 48] = l;
16843
            l = h;
16844
            h = 0;
16845
        }
16846
        for (; i < 48; i++) {
16847
            SP_ASM_ADDC(l, h, z2->dp[i]);
16848
            r->dp[i + 48] = l;
16849
            l = h;
16850
            h = 0;
16851
        }
16852
        r->used = 96;
16853
        sp_clamp(r);
16854
    }
16855
16856
    FREE_SP_INT_ARRAY(z, NULL);
16857
    FREE_SP_INT(a1, NULL);
16858
    return err;
16859
}
16860
    #endif /* SP_INT_DIGITS >= 96 */
16861
16862
    #if SP_INT_DIGITS >= 128
16863
/* Square a and store in r. r = a * a
16864
 *
16865
 * Karatsuba implementation.
16866
 *
16867
 * @param  [in]   a  SP integer to square.
16868
 * @param  [out]  r  SP integer result.
16869
 *
16870
 * @return  MP_OKAY on success.
16871
 * @return  MP_MEM when dynamic memory allocation fails.
16872
 */
16873
static int _sp_sqr_64(const sp_int* a, sp_int* r)
16874
{
16875
    int err = MP_OKAY;
16876
    unsigned int i;
16877
    sp_int_digit l;
16878
    sp_int_digit h;
16879
    sp_int* z0;
16880
    sp_int* z1;
16881
    sp_int* z2;
16882
    sp_int_digit ca;
16883
    DECL_SP_INT(a1, 32);
16884
    DECL_SP_INT_ARRAY(z, 65, 2);
16885
16886
    ALLOC_SP_INT(a1, 32, err, NULL);
16887
    ALLOC_SP_INT_ARRAY(z, 65, 2, err, NULL);
16888
    if (err == MP_OKAY) {
16889
        z1 = z[0];
16890
        z2 = z[1];
16891
        z0 = r;
16892
16893
        XMEMCPY(a1->dp, &a->dp[32], sizeof(sp_int_digit) * 32);
16894
        a1->used = 32;
16895
16896
        /* z2 = a1 ^ 2 */
16897
        err = _sp_sqr_32(a1, z2);
16898
    }
16899
    if (err == MP_OKAY) {
16900
        l = 0;
16901
        h = 0;
16902
        for (i = 0; i < 32; i++) {
16903
            SP_ASM_ADDC(l, h, a1->dp[i]);
16904
            SP_ASM_ADDC(l, h, a->dp[i]);
16905
            a1->dp[i] = l;
16906
            l = h;
16907
            h = 0;
16908
        }
16909
        ca = l;
16910
16911
        /* z0 = a0 ^ 2 */
16912
        err = _sp_sqr_32(a, z0);
16913
    }
16914
    if (err == MP_OKAY) {
16915
        /* z1 = (a0 + a1) ^ 2 */
16916
        err = _sp_sqr_32(a1, z1);
16917
    }
16918
    if (err == MP_OKAY) {
16919
        /* r = (z2 << 64) + (z1 - z0 - z2) << 32) + z0 */
16920
        /* r = z0 */
16921
        /* r += (z1 - z0 - z2) << 32 */
16922
        z1->dp[64] = ca;
16923
        l = 0;
16924
        if (ca) {
16925
            l = z1->dp[0 + 32];
16926
            h = 0;
16927
            SP_ASM_ADDC(l, h, a1->dp[0]);
16928
            SP_ASM_ADDC(l, h, a1->dp[0]);
16929
            z1->dp[0 + 32] = l;
16930
            l = h;
16931
            h = 0;
16932
            for (i = 1; i < 32; i++) {
16933
                SP_ASM_ADDC(l, h, z1->dp[i + 32]);
16934
                SP_ASM_ADDC(l, h, a1->dp[i]);
16935
                SP_ASM_ADDC(l, h, a1->dp[i]);
16936
                z1->dp[i + 32] = l;
16937
                l = h;
16938
                h = 0;
16939
            }
16940
        }
16941
        z1->dp[64] += l;
16942
        /* z1 = z1 - z0 - z1 */
16943
        l = z1->dp[0];
16944
        h = 0;
16945
        SP_ASM_SUBB(l, h, z0->dp[0]);
16946
        SP_ASM_SUBB(l, h, z2->dp[0]);
16947
        z1->dp[0] = l;
16948
        l = h;
16949
        h = 0;
16950
        for (i = 1; i < 64; i++) {
16951
            l += z1->dp[i];
16952
            SP_ASM_SUBB(l, h, z0->dp[i]);
16953
            SP_ASM_SUBB(l, h, z2->dp[i]);
16954
            z1->dp[i] = l;
16955
            l = h;
16956
            h = 0;
16957
        }
16958
        z1->dp[i] += l;
16959
        /* r += z1 << 16 */
16960
        l = 0;
16961
        h = 0;
16962
        for (i = 0; i < 32; i++) {
16963
            SP_ASM_ADDC(l, h, r->dp[i + 32]);
16964
            SP_ASM_ADDC(l, h, z1->dp[i]);
16965
            r->dp[i + 32] = l;
16966
            l = h;
16967
            h = 0;
16968
        }
16969
        for (; i < 65; i++) {
16970
            SP_ASM_ADDC(l, h, z1->dp[i]);
16971
            r->dp[i + 32] = l;
16972
            l = h;
16973
            h = 0;
16974
        }
16975
        /* r += z2 << 64  */
16976
        l = 0;
16977
        h = 0;
16978
        for (i = 0; i < 33; i++) {
16979
            SP_ASM_ADDC(l, h, r->dp[i + 64]);
16980
            SP_ASM_ADDC(l, h, z2->dp[i]);
16981
            r->dp[i + 64] = l;
16982
            l = h;
16983
            h = 0;
16984
        }
16985
        for (; i < 64; i++) {
16986
            SP_ASM_ADDC(l, h, z2->dp[i]);
16987
            r->dp[i + 64] = l;
16988
            l = h;
16989
            h = 0;
16990
        }
16991
        r->used = 128;
16992
        sp_clamp(r);
16993
    }
16994
16995
    FREE_SP_INT_ARRAY(z, NULL);
16996
    FREE_SP_INT(a1, NULL);
16997
    return err;
16998
}
16999
    #endif /* SP_INT_DIGITS >= 128 */
17000
17001
    #if SP_INT_DIGITS >= 192
17002
/* Square a and store in r. r = a * a
17003
 *
17004
 * Karatsuba implementation.
17005
 *
17006
 * @param  [in]   a  SP integer to square.
17007
 * @param  [out]  r  SP integer result.
17008
 *
17009
 * @return  MP_OKAY on success.
17010
 * @return  MP_MEM when dynamic memory allocation fails.
17011
 */
17012
static int _sp_sqr_96(const sp_int* a, sp_int* r)
17013
{
17014
    int err = MP_OKAY;
17015
    unsigned int i;
17016
    sp_int_digit l;
17017
    sp_int_digit h;
17018
    sp_int* z0;
17019
    sp_int* z1;
17020
    sp_int* z2;
17021
    sp_int_digit ca;
17022
    DECL_SP_INT(a1, 48);
17023
    DECL_SP_INT_ARRAY(z, 97, 2);
17024
17025
    ALLOC_SP_INT(a1, 48, err, NULL);
17026
    ALLOC_SP_INT_ARRAY(z, 97, 2, err, NULL);
17027
    if (err == MP_OKAY) {
17028
        z1 = z[0];
17029
        z2 = z[1];
17030
        z0 = r;
17031
17032
        XMEMCPY(a1->dp, &a->dp[48], sizeof(sp_int_digit) * 48);
17033
        a1->used = 48;
17034
17035
        /* z2 = a1 ^ 2 */
17036
        err = _sp_sqr_48(a1, z2);
17037
    }
17038
    if (err == MP_OKAY) {
17039
        l = 0;
17040
        h = 0;
17041
        for (i = 0; i < 48; i++) {
17042
            SP_ASM_ADDC(l, h, a1->dp[i]);
17043
            SP_ASM_ADDC(l, h, a->dp[i]);
17044
            a1->dp[i] = l;
17045
            l = h;
17046
            h = 0;
17047
        }
17048
        ca = l;
17049
17050
        /* z0 = a0 ^ 2 */
17051
        err = _sp_sqr_48(a, z0);
17052
    }
17053
    if (err == MP_OKAY) {
17054
        /* z1 = (a0 + a1) ^ 2 */
17055
        err = _sp_sqr_48(a1, z1);
17056
    }
17057
    if (err == MP_OKAY) {
17058
        /* r = (z2 << 96) + (z1 - z0 - z2) << 48) + z0 */
17059
        /* r = z0 */
17060
        /* r += (z1 - z0 - z2) << 48 */
17061
        z1->dp[96] = ca;
17062
        l = 0;
17063
        if (ca) {
17064
            l = z1->dp[0 + 48];
17065
            h = 0;
17066
            SP_ASM_ADDC(l, h, a1->dp[0]);
17067
            SP_ASM_ADDC(l, h, a1->dp[0]);
17068
            z1->dp[0 + 48] = l;
17069
            l = h;
17070
            h = 0;
17071
            for (i = 1; i < 48; i++) {
17072
                SP_ASM_ADDC(l, h, z1->dp[i + 48]);
17073
                SP_ASM_ADDC(l, h, a1->dp[i]);
17074
                SP_ASM_ADDC(l, h, a1->dp[i]);
17075
                z1->dp[i + 48] = l;
17076
                l = h;
17077
                h = 0;
17078
            }
17079
        }
17080
        z1->dp[96] += l;
17081
        /* z1 = z1 - z0 - z1 */
17082
        l = z1->dp[0];
17083
        h = 0;
17084
        SP_ASM_SUBB(l, h, z0->dp[0]);
17085
        SP_ASM_SUBB(l, h, z2->dp[0]);
17086
        z1->dp[0] = l;
17087
        l = h;
17088
        h = 0;
17089
        for (i = 1; i < 96; i++) {
17090
            l += z1->dp[i];
17091
            SP_ASM_SUBB(l, h, z0->dp[i]);
17092
            SP_ASM_SUBB(l, h, z2->dp[i]);
17093
            z1->dp[i] = l;
17094
            l = h;
17095
            h = 0;
17096
        }
17097
        z1->dp[i] += l;
17098
        /* r += z1 << 16 */
17099
        l = 0;
17100
        h = 0;
17101
        for (i = 0; i < 48; i++) {
17102
            SP_ASM_ADDC(l, h, r->dp[i + 48]);
17103
            SP_ASM_ADDC(l, h, z1->dp[i]);
17104
            r->dp[i + 48] = l;
17105
            l = h;
17106
            h = 0;
17107
        }
17108
        for (; i < 97; i++) {
17109
            SP_ASM_ADDC(l, h, z1->dp[i]);
17110
            r->dp[i + 48] = l;
17111
            l = h;
17112
            h = 0;
17113
        }
17114
        /* r += z2 << 96  */
17115
        l = 0;
17116
        h = 0;
17117
        for (i = 0; i < 49; i++) {
17118
            SP_ASM_ADDC(l, h, r->dp[i + 96]);
17119
            SP_ASM_ADDC(l, h, z2->dp[i]);
17120
            r->dp[i + 96] = l;
17121
            l = h;
17122
            h = 0;
17123
        }
17124
        for (; i < 96; i++) {
17125
            SP_ASM_ADDC(l, h, z2->dp[i]);
17126
            r->dp[i + 96] = l;
17127
            l = h;
17128
            h = 0;
17129
        }
17130
        r->used = 192;
17131
        sp_clamp(r);
17132
    }
17133
17134
    FREE_SP_INT_ARRAY(z, NULL);
17135
    FREE_SP_INT(a1, NULL);
17136
    return err;
17137
}
17138
    #endif /* SP_INT_DIGITS >= 192 */
17139
17140
#endif /* SQR_MUL_ASM && WOLFSSL_SP_INT_LARGE_COMBA */
17141
#endif /* !WOLFSSL_SP_SMALL */
17142
17143
/* Square a and store in r. r = a * a
17144
 *
 * Public entry point for squaring: validates the parameters, handles a zero
 * input, then uses a fixed-size implementation matching a->used when one is
 * compiled in, falling back to the generic _sp_sqr() otherwise.
 * In a WOLFSSL_SP_MATH and WOLFSSL_SP_SMALL build this is sp_mul(a, a, r).
 *
17145
 * @param  [in]   a  SP integer to square.
17146
 * @param  [out]  r  SP integer result.
17147
 *
17148
 * @return  MP_OKAY on success.
17149
 * @return  MP_VAL when a or r is NULL, or the result will be too big for fixed
17150
 *          data length.
17151
 * @return  MP_MEM when dynamic memory allocation fails.
17152
 */
17153
int sp_sqr(const sp_int* a, sp_int* r)
17154
25.1M
{
17155
#if defined(WOLFSSL_SP_MATH) && defined(WOLFSSL_SP_SMALL)
17156
    /* Small SP-math-only build: square by multiplying a by itself. */
    return sp_mul(a, a, r);
17157
#else
17158
25.1M
    int err = MP_OKAY;
17159
17160
25.1M
    /* Validate parameters. */
    if ((a == NULL) || (r == NULL)) {
17161
0
        err = MP_VAL;
17162
0
    }
17163
    /* Need extra digit during calculation. */
    /* Result may use up to a->used * 2 digits - r must be able to hold them. */
17164
25.1M
    if ((err == MP_OKAY) && (a->used * 2 > r->size)) {
17165
18
        err = MP_VAL;
17166
18
    }
17167
17168
#if 0
17169
    if (err == MP_OKAY) {
17170
        sp_print(a, "a");
17171
    }
17172
#endif
17173
17174
25.1M
    if (err == MP_OKAY) {
17175
25.1M
        /* Zero squared is zero - no calculation needed. */
        if (a->used == 0) {
17176
237k
            _sp_zero(r);
17177
237k
        }
17178
24.9M
    /* Start of a dangling-else chain: each compiled-in size check below ends
     * with 'else' so that control falls through to the next candidate and
     * finally to the generic block at the bottom. Which checks exist depends
     * on the build configuration.
     */
    else
17179
24.9M
#ifndef WOLFSSL_SP_SMALL
17180
24.9M
#if !defined(WOLFSSL_HAVE_SP_ECC) && defined(HAVE_ECC)
17181
24.9M
#if (SP_WORD_SIZE == 64 && SP_INT_BITS >= 256)
17182
24.9M
        /* 4 digits of 64 bits: 256-bit number. */
        if (a->used == 4) {
17183
11.0M
            err = _sp_sqr_4(a, r);
17184
11.0M
        }
17185
13.9M
        else
17186
13.9M
#endif /* SP_WORD_SIZE == 64 */
17187
13.9M
#if (SP_WORD_SIZE == 64 && SP_INT_BITS >= 384)
17188
13.9M
#ifdef SQR_MUL_ASM
17189
13.9M
        /* 6 digits of 64 bits: 384-bit number. */
        if (a->used == 6) {
17190
3.72M
            err = _sp_sqr_6(a, r);
17191
3.72M
        }
17192
10.2M
        else
17193
10.2M
#endif /* SQR_MUL_ASM */
17194
10.2M
#endif /* SP_WORD_SIZE == 64 */
17195
#if (SP_WORD_SIZE == 32 && SP_INT_BITS >= 256)
17196
#ifdef SQR_MUL_ASM
17197
        /* 8 digits of 32 bits: 256-bit number. */
        if (a->used == 8) {
17198
            err = _sp_sqr_8(a, r);
17199
        }
17200
        else
17201
#endif /* SQR_MUL_ASM */
17202
#endif /* SP_WORD_SIZE == 32 */
17203
#if (SP_WORD_SIZE == 32 && SP_INT_BITS >= 384)
17204
#ifdef SQR_MUL_ASM
17205
        /* 12 digits of 32 bits: 384-bit number. */
        if (a->used == 12) {
17206
            err = _sp_sqr_12(a, r);
17207
        }
17208
        else
17209
#endif /* SQR_MUL_ASM */
17210
#endif /* SP_WORD_SIZE == 32 */
17211
10.2M
#endif /* !WOLFSSL_HAVE_SP_ECC && HAVE_ECC */
17212
#if defined(SQR_MUL_ASM) && (defined(WOLFSSL_SP_INT_LARGE_COMBA) || \
17213
    (!defined(WOLFSSL_SP_MATH) && defined(WOLFCRYPT_HAVE_SAKKE) && \
17214
    (SP_WORD_SIZE == 64)))
17215
    #if SP_INT_DIGITS >= 32
17216
        if (a->used == 16) {
17217
            err = _sp_sqr_16(a, r);
17218
        }
17219
        else
17220
    #endif /* SP_INT_DIGITS >= 32 */
17221
#endif /* SQR_MUL_ASM && (WOLFSSL_SP_INT_LARGE_COMBA || !WOLFSSL_SP_MATH &&
17222
        * WOLFCRYPT_HAVE_SAKKE && SP_WORD_SIZE == 64 */
17223
#if defined(SQR_MUL_ASM) && defined(WOLFSSL_SP_INT_LARGE_COMBA)
17224
    /* Each size is only available when a result of twice as many digits fits
     * in SP_INT_DIGITS.
     */
    #if SP_INT_DIGITS >= 48
17225
        if (a->used == 24) {
17226
            err = _sp_sqr_24(a, r);
17227
        }
17228
        else
17229
    #endif /* SP_INT_DIGITS >= 48 */
17230
    #if SP_INT_DIGITS >= 64
17231
        if (a->used == 32) {
17232
            err = _sp_sqr_32(a, r);
17233
        }
17234
        else
17235
    #endif /* SP_INT_DIGITS >= 64 */
17236
    #if SP_INT_DIGITS >= 96
17237
        if (a->used == 48) {
17238
            err = _sp_sqr_48(a, r);
17239
        }
17240
        else
17241
    #endif /* SP_INT_DIGITS >= 96 */
17242
    #if SP_INT_DIGITS >= 128
17243
        if (a->used == 64) {
17244
            err = _sp_sqr_64(a, r);
17245
        }
17246
        else
17247
    #endif /* SP_INT_DIGITS >= 128 */
17248
    #if SP_INT_DIGITS >= 192
17249
        if (a->used == 96) {
17250
            err = _sp_sqr_96(a, r);
17251
        }
17252
        else
17253
    #endif /* SP_INT_DIGITS >= 192 */
17254
#endif /* SQR_MUL_ASM && WOLFSSL_SP_INT_LARGE_COMBA */
17255
10.2M
#endif /* !WOLFSSL_SP_SMALL */
17256
10.2M
        /* No fixed-size implementation matched - use generic squaring. */
        {
17257
10.2M
            err = _sp_sqr(a, r);
17258
10.2M
        }
17259
25.1M
    }
17260
17261
25.1M
#ifdef WOLFSSL_SP_INT_NEGATIVE
17262
25.1M
    /* A square is never negative. */
    if (err == MP_OKAY) {
17263
25.1M
        r->sign = MP_ZPOS;
17264
25.1M
    }
17265
25.1M
#endif
17266
17267
#if 0
17268
    if (err == MP_OKAY) {
17269
        sp_print(r, "rsqr");
17270
    }
17271
#endif
17272
17273
25.1M
    return err;
17274
25.1M
#endif /* WOLFSSL_SP_MATH && WOLFSSL_SP_SMALL */
17275
25.1M
}
17276
/* END SP_SQR implementations */
17277
17278
#endif /* WOLFSSL_SP_MATH_ALL || WOLFSSL_HAVE_SP_DH || HAVE_ECC ||
17279
        * (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) */
17280
17281
#if defined(WOLFSSL_SP_MATH_ALL) || \
17282
    (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY) && \
17283
    !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || !defined(NO_DH) || defined(HAVE_ECC)
17284
/* Square a mod m and store in r: r = (a * a) mod m
17285
 *
17286
 * @param  [in]   a  SP integer to square.
17287
 * @param  [in]   m  SP integer that is the modulus.
17288
 * @param  [out]  r  SP integer result.
17289
 *
17290
 * @return  MP_OKAY on success.
17291
 * @return  MP_MEM when dynamic memory allocation fails.
17292
 */
17293
static int _sp_sqrmod(const sp_int* a, const sp_int* m, sp_int* r)
17294
103
{
17295
103
    int err = MP_OKAY;
17296
    /* Create temporary for multiplication result. */
17297
103
    DECL_SP_INT(t, a->used * 2);
17298
17299
103
    ALLOC_SP_INT(t, a->used * 2, err, NULL);
17300
103
    if (err == MP_OKAY) {
17301
82
        err = sp_init_size(t, a->used * 2U);
17302
82
    }
17303
17304
    /* Square and reduce. */
17305
103
    if (err == MP_OKAY) {
17306
62
        err = sp_sqr(a, t);
17307
62
    }
17308
103
    if (err == MP_OKAY) {
17309
52
        err = sp_mod(t, m, r);
17310
52
    }
17311
17312
    /* Dispose of an allocated SP int. */
17313
103
    FREE_SP_INT(t, NULL);
17314
103
    return err;
17315
103
}
17316
17317
/* Square a mod m and store in r: r = (a * a) mod m
17318
 *
17319
 * @param  [in]   a  SP integer to square.
17320
 * @param  [in]   m  SP integer that is the modulus.
17321
 * @param  [out]  r  SP integer result.
17322
 *
17323
 * @return  MP_OKAY on success.
17324
 * @return  MP_VAL when a, m or r is NULL; or m is 0; or a squared is too big
17325
 *          for fixed data length.
17326
 * @return  MP_MEM when dynamic memory allocation fails.
17327
 */
17328
int sp_sqrmod(const sp_int* a, const sp_int* m, sp_int* r)
17329
6.58M
{
17330
6.58M
    int err = MP_OKAY;
17331
17332
    /* Validate parameters. */
17333
6.58M
    if ((a == NULL) || (m == NULL) || (r == NULL)) {
17334
0
        err = MP_VAL;
17335
0
    }
17336
    /* Ensure r has space for intermediate result. */
17337
6.58M
    if ((err == MP_OKAY) && (r != m) && (a->used * 2 > r->size)) {
17338
37
        err = MP_VAL;
17339
37
    }
17340
    /* Ensure a is not too big. */
17341
6.58M
    if ((err == MP_OKAY) && (r == m) && (a->used * 2 > SP_INT_DIGITS)) {
17342
8
        err = MP_VAL;
17343
8
    }
17344
17345
    /* Use r as intermediate result if not same as pointer m which is needed
17346
     * after first intermediate result.
17347
     */
17348
6.58M
    if ((err == MP_OKAY) && (r != m)) {
17349
        /* Square and reduce. */
17350
6.58M
        err = sp_sqr(a, r);
17351
6.58M
        if (err == MP_OKAY) {
17352
6.58M
            err = sp_mod(r, m, r);
17353
6.58M
        }
17354
6.58M
    }
17355
148
    else if (err == MP_OKAY) {
17356
        /* Do operation with temporary. */
17357
103
        err = _sp_sqrmod(a, m, r);
17358
103
    }
17359
17360
6.58M
    return err;
17361
6.58M
}
17362
#endif /* WOLFSSL_SP_MATH_ALL || (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY &&
        * !WOLFSSL_RSA_PUBLIC_ONLY) || !NO_DH || HAVE_ECC */
17363
17364
/**********************
17365
 * Montgomery functions
17366
 **********************/
17367
17368
#if defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_HAVE_SP_DH) || \
17369
    defined(WOLFCRYPT_HAVE_ECCSI) || defined(WOLFCRYPT_HAVE_SAKKE) || \
17370
    defined(OPENSSL_ALL)
17371
/* Reduce a number in Montgomery form.
17372
 *
17373
 * Assumes a and m are not NULL and m is not 0.
17374
 *
17375
 * DigitMask(a,i) := mask out the 'i'th digit in place.
17376
 *
17377
 * Algorithm:
17378
 *  1. mask = (1 << (NumBits(m) % WORD_SIZE)) - 1
17379
 *  2. For i = 0..NumDigits(m)-1
17380
 *   2.1. mu = (mp * DigitMask(a, i)) & WORD_MASK
17381
 *   2.2. If i == NumDigits(m)-1 and mask != 0 then mu & = mask
17382
 *   2.3. a += mu * DigitMask(m, 0)
17383
 *   2.4. For j = 1 up to NumDigits(m)-2
17384
 *    2.4.1 a += mu * DigitMask(m, j)
17385
 *   2.5 a += mu * DigitMask(m, NumDigits(m)-1))
17386
 * 3. a >>= NumBits(m)
17387
 * 4. a = a % m
17388
 *
17389
 * @param  [in,out]  a   SP integer to Montgomery reduce.
17390
 * @param  [in]      m   SP integer that is the modulus.
17391
 * @param  [in]      mp  SP integer digit that is the bottom digit of inv(-m).
17392
 * @param  [in]      ct  Indicates operation must be constant time.
17393
 *
17394
 * @return  MP_OKAY on success.
17395
 */
17396
static int _sp_mont_red(sp_int* a, const sp_int* m, sp_int_digit mp, int ct)
17397
100M
{
17398
#if !defined(SQR_MUL_ASM)
17399
    unsigned int i;
17400
    int bits;
17401
    sp_int_word w;
17402
    sp_int_digit mu;
17403
17404
#if 0
17405
    sp_print(a, "a");
17406
    sp_print(m, "m");
17407
#endif
17408
17409
    /* Count bits in modulus. */
17410
    bits = sp_count_bits(m);
17411
17412
    /* Adding numbers into m->used * 2 digits - zero out unused digits. */
17413
#ifndef WOLFSSL_NO_CT_OPS
17414
    if (ct) {
17415
        for (i = 0; i < (unsigned int)m->used * 2; i++) {
17416
            a->dp[i] &=
17417
                (sp_int_digit)
17418
                (sp_int_sdigit)ctMaskIntGTE((int)(a->used-1), (int)i);
17419
        }
17420
    }
17421
    else
17422
#endif /* !WOLFSSL_NO_CT_OPS */
17423
    {
17424
        for (i = a->used; i < (unsigned int)m->used * 2; i++) {
17425
            a->dp[i] = 0;
17426
        }
17427
    }
17428
17429
    /* Special case when modulus is 1 digit or less. */
17430
    if (m->used <= 1) {
17431
        /* mu = (mp * DigitMask(a, i)) & WORD_MASK */
17432
        mu = mp * a->dp[0];
17433
        /* a += mu * m */
17434
        w = a->dp[0];
17435
        w += (sp_int_word)mu * m->dp[0];
17436
        a->dp[0] = (sp_int_digit)w;
17437
        w >>= SP_WORD_SIZE;
17438
        w += a->dp[1];
17439
        a->dp[1] = (sp_int_digit)w;
17440
        w >>= SP_WORD_SIZE;
17441
        a->dp[2] = (sp_int_digit)w;
17442
        a->used = 3;
17443
        /* mp is SP_WORD_SIZE */
17444
        bits = SP_WORD_SIZE;
17445
    }
17446
    else {
17447
        /* 1. mask = (1 << (NumBits(m) % WORD_SIZE)) - 1
17448
         *    Mask when last digit of modulus doesn't have highest bit set.
17449
         */
17450
        volatile sp_int_digit mask = (sp_int_digit)
17451
            (((sp_int_digit)1 << (bits & (SP_WORD_SIZE - 1))) - 1);
17452
        /* Overflow. */
17453
        sp_int_word o = 0;
17454
17455
        /* 2. For i = 0..NumDigits(m)-1 */
17456
        for (i = 0; i < m->used; i++) {
17457
            unsigned int j;
17458
17459
            /* 2.1. mu = (mp * DigitMask(a, i)) & WORD_MASK */
17460
            mu = mp * a->dp[i];
17461
            /* 2.2. If i == NumDigits(m)-1 and mask != 0 then mu & = mask */
17462
            if ((i == (unsigned int)m->used - 1) && (mask != 0)) {
17463
                mu &= mask;
17464
            }
17465
17466
            /* 2.3. a += mu * DigitMask(m, 0) */
17467
            w = a->dp[i];
17468
            w += (sp_int_word)mu * m->dp[0];
17469
            a->dp[i] = (sp_int_digit)w;
17470
            w >>= SP_WORD_SIZE;
17471
            /* 2.4. For j = 1 up to NumDigits(m)-2 */
17472
            for (j = 1; j < (unsigned int)m->used - 1; j++) {
17473
                /* 2.4.1 a += mu * DigitMask(m, j) */
17474
                w += a->dp[i + j];
17475
                w += (sp_int_word)mu * m->dp[j];
17476
                a->dp[i + j] = (sp_int_digit)w;
17477
                w >>= SP_WORD_SIZE;
17478
            }
17479
            /* Handle overflow. */
17480
            w += o;
17481
            w += a->dp[i + j];
17482
            o = (sp_int_digit)(w >> SP_WORD_SIZE);
17483
            /* 2.5 a += mu * DigitMask(m, NumDigits(m)-1)) */
17484
            w = ((sp_int_word)mu * m->dp[j]) + (sp_int_digit)w;
17485
            a->dp[i + j] = (sp_int_digit)w;
17486
            w >>= SP_WORD_SIZE;
17487
            o += w;
17488
        }
17489
        /* Handle overflow. */
17490
        o += a->dp[m->used * 2 - 1];
17491
        a->dp[m->used * 2 - 1] = (sp_int_digit)o;
17492
        o >>= SP_WORD_SIZE;
17493
        a->dp[m->used * 2] = (sp_int_digit)o;
17494
        a->used = (sp_size_t)(m->used * 2 + 1);
17495
    }
17496
17497
    if (!ct) {
17498
        /* Remove leading zeros. */
17499
        sp_clamp(a);
17500
        /* 3. a >>= NumBits(m) */
17501
        (void)sp_rshb(a, bits, a);
17502
        /* 4. a = a mod m */
17503
        if (_sp_cmp_abs(a, m) != MP_LT) {
17504
            _sp_sub_off(a, m, a, 0);
17505
        }
17506
    }
17507
    else {
17508
        /* 3. a >>= NumBits(m) */
17509
        (void)sp_rshb(a, bits, a);
17510
        /* Constant time clamping. */
17511
        sp_clamp_ct(a);
17512
17513
        /* 4. a = a mod m
17514
         * Always subtract but at a too high offset if a is less than m.
17515
         */
17516
        _sp_submod_ct(a, m, m, m->used + 1U, a);
17517
    }
17518
17519
17520
#if 0
17521
    sp_print(a, "rr");
17522
#endif
17523
17524
    return MP_OKAY;
17525
#else /* !SQR_MUL_ASM */
17526
100M
    unsigned int i;
17527
100M
    unsigned int j;
17528
100M
    int bits;
17529
100M
    sp_int_digit mu;
17530
100M
    sp_int_digit o;
17531
100M
    volatile sp_int_digit mask;
17532
17533
#if 0
17534
    sp_print(a, "a");
17535
    sp_print(m, "m");
17536
#endif
17537
17538
100M
    bits = sp_count_bits(m);
17539
100M
    mask = ((sp_int_digit)1 << (bits & (SP_WORD_SIZE - 1))) - 1;
17540
17541
100M
#ifndef WOLFSSL_NO_CT_OPS
17542
100M
    if (ct) {
17543
895
        for (i = 0; i < (unsigned int)m->used * 2; i++) {
17544
856
            a->dp[i] &=
17545
856
                (sp_int_digit)
17546
856
                (sp_int_sdigit)ctMaskIntGTE((int)(a->used-1), (int)i);
17547
856
        }
17548
39
    }
17549
100M
    else
17550
100M
#endif
17551
100M
    {
17552
162M
        for (i = a->used; i < (unsigned int)m->used * 2; i++) {
17553
61.3M
            a->dp[i] = 0;
17554
61.3M
        }
17555
100M
    }
17556
17557
100M
    if (m->used <= 1) {
17558
120k
        sp_int_digit l;
17559
120k
        sp_int_digit h;
17560
17561
        /* mu = (mp * DigitMask(a, i)) & WORD_MASK */
17562
120k
        mu = mp * a->dp[0];
17563
        /* a += mu * m */
17564
120k
        l = a->dp[0];
17565
120k
        h = 0;
17566
120k
        SP_ASM_MUL_ADD_NO(l, h, mu, m->dp[0]);
17567
120k
        a->dp[0] = l;
17568
120k
        l = h;
17569
120k
        h = 0;
17570
120k
        SP_ASM_ADDC(l, h, a->dp[1]);
17571
120k
        a->dp[1] = l;
17572
120k
        a->dp[2] = h;
17573
120k
        a->used = (sp_size_t)(m->used * 2 + 1);
17574
        /* mp is SP_WORD_SIZE */
17575
120k
        bits = SP_WORD_SIZE;
17576
120k
    }
17577
100M
#if !defined(WOLFSSL_SP_MATH) && defined(HAVE_ECC)
17578
100M
#if SP_WORD_SIZE == 64
17579
100M
#if SP_INT_DIGITS >= 8
17580
100M
    else if ((m->used == 4) && (mask == 0)) {
17581
21.9M
        sp_int_digit l;
17582
21.9M
        sp_int_digit h;
17583
21.9M
        sp_int_digit o2;
17584
17585
21.9M
        l = 0;
17586
21.9M
        h = 0;
17587
21.9M
        o = 0;
17588
21.9M
        o2 = 0;
17589
        /* For i = 0..NumDigits(m)-1 */
17590
109M
        for (i = 0; i < 4; i++) {
17591
            /* mu = (mp * DigitMask(a, i)) & WORD_MASK */
17592
87.6M
            mu = mp * a->dp[0];
17593
87.6M
            l = a->dp[0];
17594
            /* a = (a + mu * m) >> WORD_SIZE */
17595
87.6M
            SP_ASM_MUL_ADD_NO(l, h, mu, m->dp[0]);
17596
87.6M
            l = h;
17597
87.6M
            h = 0;
17598
87.6M
            SP_ASM_ADDC(l, h, a->dp[1]);
17599
87.6M
            SP_ASM_MUL_ADD_NO(l, h, mu, m->dp[1]);
17600
87.6M
            a->dp[0] = l;
17601
87.6M
            l = h;
17602
87.6M
            h = 0;
17603
87.6M
            SP_ASM_ADDC(l, h, a->dp[2]);
17604
87.6M
            SP_ASM_MUL_ADD_NO(l, h, mu, m->dp[2]);
17605
87.6M
            a->dp[1] = l;
17606
87.6M
            l = h;
17607
87.6M
            h = o2;
17608
87.6M
            o2 = 0;
17609
87.6M
            SP_ASM_ADDC_REG(l, h, o);
17610
87.6M
            SP_ASM_ADDC(l, h, a->dp[i + 3]);
17611
87.6M
            SP_ASM_MUL_ADD(l, h, o2, mu, m->dp[3]);
17612
87.6M
            a->dp[2] = l;
17613
87.6M
            o = h;
17614
87.6M
            l = h;
17615
87.6M
            h = 0;
17616
87.6M
        }
17617
        /* Handle overflow. */
17618
21.9M
        SP_ASM_ADDC(l, o2, a->dp[7]);
17619
21.9M
        a->dp[3] = l;
17620
21.9M
        a->dp[4] = o2;
17621
21.9M
        a->used = 5;
17622
17623
        /* Remove leading zeros. */
17624
21.9M
        sp_clamp(a);
17625
17626
        /* a = a mod m */
17627
21.9M
        if (_sp_cmp_abs(a, m) != MP_LT) {
17628
5.77M
            _sp_sub_off(a, m, a, 0);
17629
5.77M
        }
17630
17631
21.9M
        return MP_OKAY;
17632
21.9M
    }
17633
78.6M
#endif /* SP_INT_DIGITS >= 8 */
17634
78.6M
#if SP_INT_DIGITS >= 12
17635
78.6M
    else if ((m->used == 6) && (mask == 0)) {
17636
14.4M
        sp_int_digit l;
17637
14.4M
        sp_int_digit h;
17638
14.4M
        sp_int_digit o2;
17639
17640
14.4M
        l = 0;
17641
14.4M
        h = 0;
17642
14.4M
        o = 0;
17643
14.4M
        o2 = 0;
17644
        /* For i = 0..NumDigits(m)-1 */
17645
100M
        for (i = 0; i < 6; i++) {
17646
            /* mu = (mp * DigitMask(a, i)) & WORD_MASK */
17647
86.4M
            mu = mp * a->dp[0];
17648
86.4M
            l = a->dp[0];
17649
            /* a = (a + mu * m) >> WORD_SIZE */
17650
86.4M
            SP_ASM_MUL_ADD_NO(l, h, mu, m->dp[0]);
17651
86.4M
            l = h;
17652
86.4M
            h = 0;
17653
86.4M
            SP_ASM_ADDC(l, h, a->dp[1]);
17654
86.4M
            SP_ASM_MUL_ADD_NO(l, h, mu, m->dp[1]);
17655
86.4M
            a->dp[0] = l;
17656
86.4M
            l = h;
17657
86.4M
            h = 0;
17658
86.4M
            SP_ASM_ADDC(l, h, a->dp[2]);
17659
86.4M
            SP_ASM_MUL_ADD_NO(l, h, mu, m->dp[2]);
17660
86.4M
            a->dp[1] = l;
17661
86.4M
            l = h;
17662
86.4M
            h = 0;
17663
86.4M
            SP_ASM_ADDC(l, h, a->dp[3]);
17664
86.4M
            SP_ASM_MUL_ADD_NO(l, h, mu, m->dp[3]);
17665
86.4M
            a->dp[2] = l;
17666
86.4M
            l = h;
17667
86.4M
            h = 0;
17668
86.4M
            SP_ASM_ADDC(l, h, a->dp[4]);
17669
86.4M
            SP_ASM_MUL_ADD_NO(l, h, mu, m->dp[4]);
17670
86.4M
            a->dp[3] = l;
17671
86.4M
            l = h;
17672
86.4M
            h = o2;
17673
86.4M
            o2 = 0;
17674
86.4M
            SP_ASM_ADDC_REG(l, h, o);
17675
86.4M
            SP_ASM_ADDC(l, h, a->dp[i + 5]);
17676
86.4M
            SP_ASM_MUL_ADD(l, h, o2, mu, m->dp[5]);
17677
86.4M
            a->dp[4] = l;
17678
86.4M
            o = h;
17679
86.4M
            l = h;
17680
86.4M
            h = 0;
17681
86.4M
        }
17682
        /* Handle overflow. */
17683
14.4M
        SP_ASM_ADDC(l, o2, a->dp[11]);
17684
14.4M
        a->dp[5] = l;
17685
14.4M
        a->dp[6] = o2;
17686
14.4M
        a->used = 7;
17687
17688
        /* Remove leading zeros. */
17689
14.4M
        sp_clamp(a);
17690
17691
        /* a = a mod m */
17692
14.4M
        if (_sp_cmp_abs(a, m) != MP_LT) {
17693
3.53M
            _sp_sub_off(a, m, a, 0);
17694
3.53M
        }
17695
17696
14.4M
        return MP_OKAY;
17697
14.4M
    }
17698
64.2M
#endif /* SP_INT_DIGITS >= 12 */
17699
#elif SP_WORD_SIZE == 32
17700
    else if ((m->used <= 12) && (mask == 0)) {
17701
        sp_int_digit l;
17702
        sp_int_digit h;
17703
        sp_int_digit o2;
17704
        sp_int_digit* ad;
17705
        const sp_int_digit* md;
17706
17707
        o = 0;
17708
        o2 = 0;
17709
        ad = a->dp;
17710
        /* For i = 0..NumDigits(m)-1 */
17711
        for (i = 0; i < m->used; i++) {
17712
            md = m->dp;
17713
            /*  mu = (mp * DigitMask(a, i)) & WORD_MASK */
17714
            mu = mp * ad[0];
17715
17716
            /* a = (a + mu * m, 0) >> WORD_SIZE */
17717
            l = ad[0];
17718
            h = 0;
17719
            SP_ASM_MUL_ADD_NO(l, h, mu, *(md++));
17720
            l = h;
17721
            for (j = 1; j < (unsigned int)m->used - 2; j += 2) {
17722
                h = 0;
17723
                SP_ASM_ADDC(l, h, ad[j]);
17724
                SP_ASM_MUL_ADD_NO(l, h, mu, *(md++));
17725
                ad[j - 1] = l;
17726
                l = 0;
17727
                SP_ASM_ADDC(h, l, ad[j + 1]);
17728
                SP_ASM_MUL_ADD_NO(h, l, mu, *(md++));
17729
                ad[j] = h;
17730
            }
17731
            for (; j < (unsigned int)m->used - 1; j++) {
17732
                h = 0;
17733
                SP_ASM_ADDC(l, h, ad[j]);
17734
                SP_ASM_MUL_ADD_NO(l, h, mu, *(md++));
17735
                ad[j - 1] = l;
17736
                l = h;
17737
            }
17738
            h = o2;
17739
            o2 = 0;
17740
            SP_ASM_ADDC_REG(l, h, o);
17741
            SP_ASM_ADDC(l, h, ad[i + j]);
17742
            SP_ASM_MUL_ADD(l, h, o2, mu, *md);
17743
            ad[j - 1] = l;
17744
            o = h;
17745
        }
17746
        /* Handle overflow. */
17747
        SP_ASM_ADDC(o, o2, a->dp[m->used * 2 - 1]);
17748
        a->dp[m->used  - 1] = o;
17749
        a->dp[m->used] = o2;
17750
        a->used = m->used + 1;
17751
17752
        /* Remove leading zeros. */
17753
        sp_clamp(a);
17754
17755
        /* a = a mod m */
17756
        if (_sp_cmp_abs(a, m) != MP_LT) {
17757
            _sp_sub_off(a, m, a, 0);
17758
        }
17759
17760
        return MP_OKAY;
17761
    }
17762
#endif /* SP_WORD_SIZE == 64 | 32 */
17763
64.2M
#endif /* !WOLFSSL_SP_MATH && HAVE_ECC */
17764
64.2M
    else {
17765
64.2M
        sp_int_digit l;
17766
64.2M
        sp_int_digit h;
17767
64.2M
        sp_int_digit o2;
17768
64.2M
        sp_int_digit* ad;
17769
64.2M
        const sp_int_digit* md;
17770
17771
64.2M
        o = 0;
17772
64.2M
        o2 = 0;
17773
64.2M
        ad = a->dp;
17774
        /* 2. For i = 0..NumDigits(m)-1 */
17775
414M
        for (i = 0; i < m->used; i++, ad++) {
17776
350M
            md = m->dp;
17777
            /* 2.1. mu = (mp * DigitMask(a, i)) & WORD_MASK */
17778
350M
            mu = mp * ad[0];
17779
            /* 2.2. If i == NumDigits(m)-1 and mask != 0 then mu & = mask */
17780
350M
            if ((i == (unsigned int)m->used - 1) && (mask != 0)) {
17781
53.2M
                mu &= mask;
17782
53.2M
            }
17783
17784
            /* 2.3 a += mu * DigitMask(m, 0) */
17785
350M
            l = ad[0];
17786
350M
            h = 0;
17787
350M
            SP_ASM_MUL_ADD_NO(l, h, mu, *(md++));
17788
350M
            ad[0] = l;
17789
350M
            l = h;
17790
            /* 2.4. For j = 1 up to NumDigits(m)-2 */
17791
1.37G
            for (j = 1; j < (unsigned int)m->used - 2; j += 2) {
17792
1.02G
                h = 0;
17793
                /* 2.4.1. a += mu * DigitMask(m, j) */
17794
1.02G
                SP_ASM_ADDC(l, h, ad[j + 0]);
17795
1.02G
                SP_ASM_MUL_ADD_NO(l, h, mu, *(md++));
17796
1.02G
                ad[j + 0] = l;
17797
1.02G
                l = 0;
17798
                /* 2.4.1. a += mu * DigitMask(m, j) */
17799
1.02G
                SP_ASM_ADDC(h, l, ad[j + 1]);
17800
1.02G
                SP_ASM_MUL_ADD_NO(h, l, mu, *(md++));
17801
1.02G
                ad[j + 1] = h;
17802
1.02G
            }
17803
531M
            for (; j < (unsigned int)m->used - 1; j++) {
17804
181M
                h = 0;
17805
                /* 2.4.1. a += mu * DigitMask(m, j) */
17806
181M
                SP_ASM_ADDC(l, h, ad[j]);
17807
181M
                SP_ASM_MUL_ADD_NO(l, h, mu, *(md++));
17808
181M
                ad[j] = l;
17809
181M
                l = h;
17810
181M
            }
17811
350M
            h = o2;
17812
350M
            o2 = 0;
17813
350M
            SP_ASM_ADDC_REG(l, h, o);
17814
            /* 2.5 a += mu * DigitMask(m, NumDigits(m)-1) */
17815
350M
            SP_ASM_ADDC(l, h, ad[j]);
17816
350M
            SP_ASM_MUL_ADD(l, h, o2, mu, *md);
17817
350M
            ad[j] = l;
17818
350M
            o = h;
17819
350M
        }
17820
        /* Handle overflow. */
17821
64.2M
        SP_ASM_ADDC(o, o2, a->dp[m->used * 2 - 1]);
17822
64.2M
        a->dp[m->used * 2 - 1] = o;
17823
64.2M
        a->dp[m->used * 2] = o2;
17824
64.2M
        a->used = (sp_size_t)(m->used * 2 + 1);
17825
64.2M
    }
17826
17827
64.3M
    if (!ct) {
17828
        /* Remove leading zeros. */
17829
64.3M
        sp_clamp(a);
17830
64.3M
        (void)sp_rshb(a, bits, a);
17831
        /* a = a mod m */
17832
64.3M
        if (_sp_cmp_abs(a, m) != MP_LT) {
17833
16.9M
            _sp_sub_off(a, m, a, 0);
17834
16.9M
        }
17835
64.3M
    }
17836
39
    else {
17837
39
        (void)sp_rshb(a, bits, a);
17838
        /* Constant time clamping. */
17839
39
        sp_clamp_ct(a);
17840
17841
39
        _sp_submod_ct(a, m, m, m->used + 1U, a);
17842
39
    }
17843
17844
#if 0
17845
    sp_print(a, "rr");
17846
#endif
17847
17848
64.3M
    return MP_OKAY;
17849
100M
#endif /* !SQR_MUL_ASM */
17850
100M
}
17851
17852
#if !defined(WOLFSSL_RSA_VERIFY_ONLY) || \
17853
    (defined(WOLFSSL_SP_MATH_ALL) && defined(HAVE_ECC))
17854
/* Reduce a number in Montgomery form.
17855
 *
17856
 * @param  [in,out]  a   SP integer to Montgomery reduce.
17857
 * @param  [in]      m   SP integer that is the modulus.
17858
 * @param  [in]      mp  SP integer digit that is the bottom digit of inv(-m).
17859
 * @param  [in]      ct  Indicates operation must be constant time.
17860
 *
17861
 * @return  MP_OKAY on success.
17862
 * @return  MP_VAL when a or m is NULL or m is zero.
17863
 */
17864
int sp_mont_red_ex(sp_int* a, const sp_int* m, sp_int_digit mp, int ct)
17865
114M
{
17866
114M
    int err;
17867
17868
    /* Validate parameters. */
17869
114M
    if ((a == NULL) || (m == NULL) || sp_iszero(m)) {
17870
0
        err = MP_VAL;
17871
0
    }
17872
114M
#ifdef WOLFSSL_SP_INT_NEGATIVE
17873
114M
    else if ((a->sign == MP_NEG) || (m->sign == MP_NEG)) {
17874
123
        err = MP_VAL;
17875
123
    }
17876
114M
#endif
17877
    /* Ensure a has enough space for calculation. */
17878
114M
    else if (a->size < m->used * 2 + 1) {
17879
10
        err = MP_VAL;
17880
10
    }
17881
114M
    else {
17882
        /* Perform Montogomery Reduction. */
17883
114M
        err = _sp_mont_red(a, m, mp, ct);
17884
114M
    }
17885
17886
114M
    return err;
17887
114M
}
17888
#endif
17889
17890
/* Calculate the bottom digit of the inverse of negative m.
17891
 * (rho * m) mod 2^n = -1, where n is the number of bits in a digit.
17892
 *
17893
 * Used when performing Montgomery Reduction.
17894
 * m must be odd.
17895
 * Jeffrey Hurchalla's method.
17896
 *   https://arxiv.org/pdf/2204.04342.pdf
17897
 *
17898
 * @param  [in]   m   SP integer that is the modulus.
17899
 * @param  [out]  mp  SP integer digit that is the bottom digit of inv(-m).
17900
 */
17901
static void _sp_mont_setup(const sp_int* m, sp_int_digit* rho)
17902
769k
{
17903
769k
    sp_int_digit d = m->dp[0];
17904
769k
    sp_int_digit x = (3 * d) ^ 2;
17905
769k
    sp_int_digit y = 1 - d * x;
17906
17907
769k
#if SP_WORD_SIZE >= 16
17908
769k
    x *= 1 + y; y *= y;
17909
769k
#endif
17910
769k
#if SP_WORD_SIZE >= 32
17911
769k
    x *= 1 + y; y *= y;
17912
769k
#endif
17913
769k
#if SP_WORD_SIZE >= 64
17914
769k
    x *= 1 + y; y *= y;
17915
769k
#endif
17916
769k
    x *= 1 + y;
17917
17918
    /* rho = -1/m mod d, subtract x (unsigned) from 0, assign negative */
17919
769k
    *rho = (sp_int_digit)((sp_int_sdigit)0 - (sp_int_sdigit)x);
17920
769k
}
17921
17922
/* Calculate the bottom digit of the inverse of negative m.
17923
 * (rho * m) mod 2^n = -1, where n is the number of bits in a digit.
17924
 *
17925
 * Used when performing Montgomery Reduction.
17926
 *
17927
 * @param  [in]   m   SP integer that is the modulus.
17928
 * @param  [out]  mp  SP integer digit that is the bottom digit of inv(-m).
17929
 *
17930
 * @return  MP_OKAY on success.
17931
 * @return  MP_VAL when m or rho is NULL.
17932
 */
17933
int sp_mont_setup(const sp_int* m, sp_int_digit* rho)
17934
62.3k
{
17935
62.3k
    int err = MP_OKAY;
17936
17937
    /* Validate parameters. */
17938
62.3k
    if ((m == NULL) || (rho == NULL)) {
17939
0
        err = MP_VAL;
17940
0
    }
17941
    /* Calculation only works with odd modulus. */
17942
62.3k
    if ((err == MP_OKAY) && !sp_isodd(m)) {
17943
59
        err = MP_VAL;
17944
59
    }
17945
17946
62.3k
    if (err == MP_OKAY) {
17947
        /* Calculate negative of inverse mod 2^n. */
17948
62.2k
        _sp_mont_setup(m, rho);
17949
62.2k
    }
17950
17951
62.3k
    return err;
17952
62.3k
}
17953
17954
/* Calculate the normalization value of m.
17955
 *   norm = 2^k - m, where k is the number of bits in m
17956
 *
17957
 * @param  [out]  norm   SP integer that normalises numbers into Montgomery
17958
 *                       form.
17959
 * @param  [in]   m      SP integer that is the modulus.
17960
 *
17961
 * @return  MP_OKAY on success.
17962
 * @return  MP_VAL when norm or m is NULL, or number of bits in m is maximual.
17963
 */
17964
int sp_mont_norm(sp_int* norm, const sp_int* m)
17965
783k
{
17966
783k
    int err = MP_OKAY;
17967
783k
    unsigned int bits = 0;
17968
17969
    /* Validate parameters. */
17970
783k
    if ((norm == NULL) || (m == NULL)) {
17971
0
        err = MP_VAL;
17972
0
    }
17973
783k
    if (err == MP_OKAY) {
17974
        /* Find top bit and ensure norm has enough space. */
17975
783k
        bits = (unsigned int)sp_count_bits(m);
17976
        /* NOLINTBEGIN(clang-analyzer-core.UndefinedBinaryOperatorResult) */
17977
        /* clang-tidy falsely believes that norm->size was corrupted by the
17978
         * _sp_copy() to "Set real working value to base." in _sp_exptmod_ex().
17979
         */
17980
783k
        if (bits >= (unsigned int)norm->size * SP_WORD_SIZE) {
17981
65
            err = MP_VAL;
17982
65
        }
17983
        /* NOLINTEND(clang-analyzer-core.UndefinedBinaryOperatorResult) */
17984
783k
    }
17985
783k
    if (err == MP_OKAY) {
17986
        /* Round up for case when m is less than a word - no advantage in using
17987
         * a smaller mask and would take more operations.
17988
         */
17989
783k
        if (bits < SP_WORD_SIZE) {
17990
662
            bits = SP_WORD_SIZE;
17991
662
        }
17992
        /* Smallest number greater than m of form 2^n. */
17993
783k
        _sp_zero(norm);
17994
783k
        err = sp_set_bit(norm, (int)bits);
17995
783k
    }
17996
783k
    if (err == MP_OKAY) {
17997
        /* norm = 2^n % m */
17998
783k
        err = sp_sub(norm, m, norm);
17999
783k
    }
18000
783k
    if ((err == MP_OKAY) && (bits == SP_WORD_SIZE)) {
18001
        /* Sub made norm one word and now finish calculation. */
18002
882
        norm->dp[0] %= m->dp[0];
18003
882
    }
18004
783k
    if (err == MP_OKAY) {
18005
        /* Remove leading zeros. */
18006
783k
        sp_clamp(norm);
18007
783k
    }
18008
18009
783k
    return err;
18010
783k
}
18011
#endif /* WOLFSSL_SP_MATH_ALL || WOLFSSL_HAVE_SP_DH ||
18012
        * WOLFCRYPT_HAVE_ECCSI || WOLFCRYPT_HAVE_SAKKE */
18013
18014
/*********************************
18015
 * To and from binary and strings.
18016
 *********************************/
18017
18018
/* Calculate the number of 8-bit values required to represent the
18019
 * multi-precision number.
18020
 *
18021
 * When a is NULL, return s 0.
18022
 *
18023
 * @param  [in]  a  SP integer.
18024
 *
18025
 * @return  The count of 8-bit values.
18026
 * @return  0 when a is NULL.
18027
 */
18028
int sp_unsigned_bin_size(const sp_int* a)
18029
247k
{
18030
247k
    int cnt = 0;
18031
18032
247k
    if (a != NULL) {
18033
247k
        cnt = (sp_count_bits(a) + 7) >> 3;
18034
247k
    }
18035
18036
247k
    return cnt;
18037
247k
}
18038
18039
/* Convert a number as an array of bytes in big-endian format to a
18040
 * multi-precision number.
18041
 *
18042
 * @param  [out]  a     SP integer.
18043
 * @param  [in]   in    Array of bytes.
18044
 * @param  [in]   inSz  Number of data bytes in array.
18045
 *
18046
 * @return  MP_OKAY on success.
18047
 * @return  MP_VAL when the number is too big to fit in an SP.
18048
 */
18049
int sp_read_unsigned_bin(sp_int* a, const byte* in, word32 inSz)
18050
247k
{
18051
247k
    int err = MP_OKAY;
18052
18053
    /* Validate parameters. */
18054
247k
    if ((a == NULL) || ((in == NULL) && (inSz > 0))) {
18055
0
        err = MP_VAL;
18056
0
    }
18057
18058
    /* Check a has enough space for number. */
18059
247k
    if ((err == MP_OKAY) && (inSz > (word32)a->size * SP_WORD_SIZEOF)) {
18060
10.4k
        err = MP_VAL;
18061
10.4k
    }
18062
18063
247k
    if (err == MP_OKAY) {
18064
        /* Load full digits at a time from in. */
18065
236k
        int i;
18066
236k
        int j = 0;
18067
18068
236k
        a->used = (sp_size_t)((inSz + SP_WORD_SIZEOF - 1) / SP_WORD_SIZEOF);
18069
18070
    #if defined(BIG_ENDIAN_ORDER) && !defined(WOLFSSL_SP_INT_DIGIT_ALIGN)
18071
        /* Data endian matches representation of number.
18072
         * Directly copy if we don't have alignment issues.
18073
         */
18074
        for (i = (int)(inSz-1); i > SP_WORD_SIZEOF-1; i -= SP_WORD_SIZEOF) {
18075
            a->dp[j++] = *(sp_int_digit*)(in + i - (SP_WORD_SIZEOF - 1));
18076
        }
18077
    #else
18078
        /* Construct digit from required number of bytes. */
18079
2.98M
        for (i = (int)(inSz-1); i >= SP_WORD_SIZEOF - 1; i -= SP_WORD_SIZEOF) {
18080
2.74M
            a->dp[j]  = ((sp_int_digit)in[i - 0] <<  0)
18081
2.74M
        #if SP_WORD_SIZE >= 16
18082
2.74M
                      | ((sp_int_digit)in[i - 1] <<  8)
18083
2.74M
        #endif
18084
2.74M
        #if SP_WORD_SIZE >= 32
18085
2.74M
                      | ((sp_int_digit)in[i - 2] << 16) |
18086
2.74M
                        ((sp_int_digit)in[i - 3] << 24)
18087
2.74M
        #endif
18088
2.74M
        #if SP_WORD_SIZE >= 64
18089
2.74M
                      | ((sp_int_digit)in[i - 4] << 32) |
18090
2.74M
                        ((sp_int_digit)in[i - 5] << 40) |
18091
2.74M
                        ((sp_int_digit)in[i - 6] << 48) |
18092
2.74M
                        ((sp_int_digit)in[i - 7] << 56)
18093
2.74M
        #endif
18094
2.74M
                                                       ;
18095
2.74M
            j++;
18096
2.74M
        }
18097
236k
    #endif
18098
18099
236k
#if SP_WORD_SIZE >= 16
18100
        /* Handle leftovers. */
18101
236k
        if (i >= 0) {
18102
    #ifdef BIG_ENDIAN_ORDER
18103
            int s;
18104
18105
            /* Place remaining bytes into last digit. */
18106
            a->dp[a->used - 1] = 0;
18107
            for (s = 0; i >= 0; i--,s += 8) {
18108
                a->dp[j] |= ((sp_int_digit)in[i]) << s;
18109
            }
18110
    #else
18111
            /* Cast digits to an array of bytes so we can insert directly. */
18112
81.8k
            byte *d = (byte*)a->dp;
18113
18114
            /* Zero out all bytes in last digit. */
18115
81.8k
            a->dp[a->used - 1] = 0;
18116
            /* Place remaining bytes directly into digit. */
18117
81.8k
            switch (i) {
18118
0
            #if SP_WORD_SIZE >= 64
18119
4.35k
                case 6: d[inSz - 1 - 6] = in[6]; FALL_THROUGH;
18120
9.21k
                case 5: d[inSz - 1 - 5] = in[5]; FALL_THROUGH;
18121
15.1k
                case 4: d[inSz - 1 - 4] = in[4]; FALL_THROUGH;
18122
26.5k
                case 3: d[inSz - 1 - 3] = in[3]; FALL_THROUGH;
18123
26.5k
            #endif
18124
26.5k
            #if SP_WORD_SIZE >= 32
18125
39.4k
                case 2: d[inSz - 1 - 2] = in[2]; FALL_THROUGH;
18126
52.1k
                case 1: d[inSz - 1 - 1] = in[1]; FALL_THROUGH;
18127
52.1k
            #endif
18128
81.8k
                case 0: d[inSz - 1 - 0] = in[0];
18129
81.8k
            }
18130
81.8k
    #endif /* LITTLE_ENDIAN_ORDER */
18131
81.8k
        }
18132
236k
#endif
18133
236k
        sp_clamp_ct(a);
18134
236k
    }
18135
18136
247k
    return err;
18137
247k
}
18138
18139
/* Convert the multi-precision number to an array of bytes in big-endian format.
18140
 *
18141
 * The array must be large enough for encoded number - use mp_unsigned_bin_size
18142
 * to calculate the number of bytes required.
18143
 *
18144
 * @param  [in]   a    SP integer.
18145
 * @param  [out]  out  Array to put encoding into.
18146
 *
18147
 * @return  MP_OKAY on success.
18148
 * @return  MP_VAL when a or out is NULL.
18149
 */
18150
int sp_to_unsigned_bin(const sp_int* a, byte* out)
18151
53.4k
{
18152
    /* Write assuming output buffer is big enough. */
18153
53.4k
    return sp_to_unsigned_bin_len(a, out, sp_unsigned_bin_size(a));
18154
53.4k
}
18155
18156
/* Convert the multi-precision number to an array of bytes in big-endian format.
18157
 *
18158
 * The array must be large enough for encoded number - use mp_unsigned_bin_size
18159
 * to calculate the number of bytes required.
18160
 * Front-pads the output array with zeros to make number the size of the array.
18161
 *
18162
 * @param  [in]   a      SP integer.
18163
 * @param  [out]  out    Array to put encoding into.
18164
 * @param  [in]   outSz  Size of the array in bytes.
18165
 *
18166
 * @return  MP_OKAY on success.
18167
 * @return  MP_VAL when a or out is NULL.
18168
 */
18169
int sp_to_unsigned_bin_len(const sp_int* a, byte* out, int outSz)
18170
88.7k
{
18171
88.7k
    int err = MP_OKAY;
18172
18173
    /* Validate parameters. */
18174
88.7k
    if ((a == NULL) || (out == NULL) || (outSz < 0)) {
18175
3.86k
        err = MP_VAL;
18176
3.86k
    }
18177
18178
88.7k
#if SP_WORD_SIZE > 8
18179
88.7k
    if (err == MP_OKAY) {
18180
        /* Start at the end of the buffer - least significant byte. */
18181
84.9k
        int j = outSz - 1;
18182
18183
84.9k
        if (!sp_iszero(a)) {
18184
79.1k
            unsigned int i;
18185
18186
            /* Put each digit in. */
18187
590k
            for (i = 0; (j >= 0) && (i < a->used); i++) {
18188
511k
                int b;
18189
511k
                sp_int_digit d = a->dp[i];
18190
                /* Place each byte of a digit into the buffer. */
18191
4.39M
                for (b = 0; b < SP_WORD_SIZE; b += 8) {
18192
3.95M
                    out[j--] = (byte)d;
18193
3.95M
                    d >>= 8;
18194
                    /* Stop if the output buffer is filled. */
18195
3.95M
                    if (j < 0) {
18196
69.3k
                        if ((i < (unsigned int)a->used - 1) || (d > 0)) {
18197
227
                            err = MP_VAL;
18198
227
                        }
18199
69.3k
                        break;
18200
69.3k
                    }
18201
3.95M
                }
18202
511k
            }
18203
79.1k
        }
18204
        /* Front pad buffer with 0s. */
18205
289M
        for (; j >= 0; j--) {
18206
289M
            out[j] = 0;
18207
289M
        }
18208
84.9k
    }
18209
#else
18210
    if ((err == MP_OKAY) && ((unsigned int)outSz < a->used)) {
18211
        err = MP_VAL;
18212
    }
18213
    if (err == MP_OKAY) {
18214
        unsigned int i;
18215
        int j;
18216
18217
        XMEMSET(out, 0, (unsigned int)outSz - a->used);
18218
18219
        for (i = 0, j = outSz - 1; i < a->used; i++, j--) {
18220
            out[j] = a->dp[i];
18221
        }
18222
    }
18223
#endif
18224
18225
88.7k
    return err;
18226
88.7k
}
18227
18228
/* Convert the multi-precision number to an array of bytes in big-endian format.
18229
 *
18230
 * Constant-time implementation.
18231
 *
18232
 * The array must be large enough for encoded number - use mp_unsigned_bin_size
18233
 * to calculate the number of bytes required.
18234
 * Front-pads the output array with zeros to make number the size of the array.
18235
 *
18236
 * @param  [in]   a      SP integer.
18237
 * @param  [out]  out    Array to put encoding into.
18238
 * @param  [in]   outSz  Size of the array in bytes.
18239
 *
18240
 * @return  MP_OKAY on success.
18241
 * @return  MP_VAL when a or out is NULL.
18242
 */
18243
int sp_to_unsigned_bin_len_ct(const sp_int* a, byte* out, int outSz)
18244
16.0k
{
18245
16.0k
    int err = MP_OKAY;
18246
18247
    /* Validate parameters. */
18248
16.0k
    if ((a == NULL) || (out == NULL) || (outSz < 0)) {
18249
0
        err = MP_VAL;
18250
0
    }
18251
18252
16.0k
#if SP_WORD_SIZE > 8
18253
16.0k
    if (err == MP_OKAY) {
18254
        /* Start at the end of the buffer - least significant byte. */
18255
16.0k
        int j;
18256
16.0k
        unsigned int i;
18257
16.0k
        byte mask = (byte)-1;
18258
16.0k
        sp_int_digit d;
18259
18260
        /* Put each digit in. */
18261
16.0k
        i = 0;
18262
498k
        for (j = outSz - 1; j >= 0; ) {
18263
482k
            unsigned int b;
18264
482k
            volatile byte notFull = ctMaskLT((int)i, (int)a->used - 1);
18265
18266
482k
            d = a->dp[i];
18267
            /* Place each byte of a digit into the buffer. */
18268
4.33M
            for (b = 0; (j >= 0) && (b < SP_WORD_SIZEOF); b++) {
18269
3.85M
                out[j--] = (byte)(d & mask);
18270
3.85M
                d >>= 8;
18271
3.85M
            }
18272
482k
            mask &= notFull;
18273
482k
            i += (unsigned int)(1 & mask);
18274
482k
        }
18275
16.0k
    }
18276
#else
18277
    if ((err == MP_OKAY) && ((unsigned int)outSz < a->used)) {
18278
        err = MP_VAL;
18279
    }
18280
    if (err == MP_OKAY) {
18281
        unsigned int i;
18282
        int j;
18283
        volatile sp_int_digit mask = (sp_int_digit)-1;
18284
18285
        i = 0;
18286
        for (j = outSz - 1; j >= 0; j--) {
18287
            out[j] = a->dp[i] & mask;
18288
            mask &= (sp_int_digit)0 - (i < (unsigned int)a->used - 1);
18289
            i += (unsigned int)(1 & mask);
18290
        }
18291
    }
18292
#endif
18293
18294
16.0k
    return err;
18295
16.0k
}
18296
18297
#if defined(WOLFSSL_SP_MATH_ALL) && !defined(NO_RSA) && \
    !defined(WOLFSSL_RSA_VERIFY_ONLY)
/* Store the number in big-endian format in array at an offset.
 * The array must be large enough for encoded number - use mp_unsigned_bin_size
 * to calculate the number of bytes required.
 *
 * @param  [in]   o    Offset into array to start encoding.
 * @param  [in]   a    SP integer.
 * @param  [out]  out  Array to put encoding into.
 *
 * @return  Index of next byte after data.
 * @return  MP_VAL when a or out is NULL.
 */
int sp_to_unsigned_bin_at_pos(int o, const sp_int* a, unsigned char* out)
{
    int ret;

    /* Check out before applying the offset: out + o is not NULL for a
     * non-zero offset and would pass the NULL check of
     * sp_to_unsigned_bin_len().
     */
    if (out == NULL) {
        ret = MP_VAL;
    }
    else {
        /* Get length of data that will be written. */
        int len = sp_unsigned_bin_size(a);

        /* Write number to buffer at offset. A NULL a is rejected here. */
        ret = sp_to_unsigned_bin_len(a, out + o, len);
        if (ret == MP_OKAY) {
            /* Return offset of next byte after number. */
            ret = o + len;
        }
    }

    return ret;
}
#endif /* WOLFSSL_SP_MATH_ALL && !NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY */
18325
18326
#ifdef WOLFSSL_SP_READ_RADIX_16
/* Convert hexadecimal number as string in big-endian format to a
 * multi-precision number.
 *
 * Assumes negative sign and leading zeros have been stripped.
 * The string is read from its end. White space is skipped only until the
 * first hexadecimal character has been seen.
 *
 * @param  [out]  a   SP integer.
 * @param  [in]   in  NUL terminated string.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when a character is not valid or the number does not fit
 *          in a.
 */
static int _sp_read_radix_16(sp_int* a, const char* in)
{
    int err = MP_OKAY;
    int pos;
    /* Bit position in the current digit of the next nibble. */
    unsigned int shift = 0;
    /* Number of complete digits stored in a. */
    sp_size_t cnt = 0;
    /* Digit being assembled - all nibbles start as 0. */
    sp_int_digit digit = 0;
    /* Set once a hexadecimal character has been seen. */
    int seenHex = 0;

    /* Step through string a character at a time starting at end - least
     * significant nibble. */
    for (pos = (int)(XSTRLEN(in) - 1); pos >= 0; pos--) {
        volatile char c = in[pos];
        /* Convert character from hex. */
        int nibble = (int)HexCharToByte(c);

        if (nibble < 0) {
            /* Only white space at the end of the string is tolerated. */
            if (seenHex || !CharIsWhiteSpace(c)) {
                err = MP_VAL;
                break;
            }
            continue;
        }
        seenHex = 1;

        if (shift == SP_WORD_SIZE) {
            /* Digit is full - store it and move to the next one. */
            a->dp[cnt++] = digit;
            /* Fail if we are out of space in a. */
            if (cnt >= a->size) {
                err = MP_VAL;
                break;
            }
            /* Start a fresh digit at the lowest nibble. */
            shift = 0;
            digit = 0;
        }

        /* Put next nibble into digit and advance to the next nibble. */
        digit |= ((sp_int_digit)nibble) << shift;
        shift += 4;
    }

    if (err == MP_OKAY) {
        /* If space, store last digit. */
        if (cnt < a->size) {
            a->dp[cnt] = digit;
        }
        /* Update used count. */
        a->used = (sp_size_t)(cnt + 1U);
        /* Remove leading zeros. */
        sp_clamp(a);
    }

    return err;
}
#endif /* WOLFSSL_SP_READ_RADIX_16 */
18401
18402
#ifdef WOLFSSL_SP_READ_RADIX_10
/* Convert decimal number as string in big-endian format to a multi-precision
 * number.
 *
 * Assumes negative sign and leading zeros have been stripped.
 * White space characters anywhere in the string are skipped.
 *
 * @param  [out]  a   SP integer.
 * @param  [in]   in  NUL terminated string.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when a character is not valid.
 * @return  Error returned by _sp_mul_d() or _sp_add_d() when accumulating the
 *          value fails.
 */
static int _sp_read_radix_10(sp_int* a, const char* in)
{
    int err = MP_OKAY;
    int pos;

    /* Start with a being zero. */
    _sp_zero(a);

    /* Process all characters, most significant first. */
    for (pos = 0; in[pos] != '\0'; pos++) {
        /* Get character. */
        volatile char ch = in[pos];

        if ((ch < '0') || (ch > '9')) {
            /* Not a decimal digit - white space is ignored. */
            if (CharIsWhiteSpace(ch))
                continue;
            /* Return error on invalid character. */
            err = MP_VAL;
            break;
        }
        /* Assume '0'..'9' are continuous values as characters. */
        ch = (char)(ch - '0');

        /* a = a * 10 + value of character. */
        err = _sp_mul_d(a, 10, a, 0);
        if (err == MP_OKAY) {
            err = _sp_add_d(a, (sp_int_digit)ch, a);
        }
        if (err != MP_OKAY) {
            break;
        }
    }

    return err;
}
#endif /* WOLFSSL_SP_READ_RADIX_10 */
18455
18456
#if defined(WOLFSSL_SP_READ_RADIX_16) || defined(WOLFSSL_SP_READ_RADIX_10)
/* Convert a number as string in big-endian format to a big number.
 * Only supports base-16 (hexadecimal) and base-10 (decimal).
 *
 * A leading '-' is accepted only when WOLFSSL_SP_INT_NEGATIVE is defined.
 * Leading '0' characters are skipped before the digits are converted.
 *
 * @param  [out]  a      SP integer.
 * @param  [in]   in     NUL terminated string.
 * @param  [in]   radix  Number of values in a digit.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when a or in is NULL, radix not supported, value is negative
 *          and negative numbers are not supported, or a character is not
 *          valid.
 */
int sp_read_radix(sp_int* a, const char* in, int radix)
{
    int err;
#ifdef WOLFSSL_SP_INT_NEGATIVE
    sp_uint8 sign = MP_ZPOS;
#endif

    /* Validate parameters. */
    if ((a == NULL) || (in == NULL)) {
        return MP_VAL;
    }

    if (*in == '-') {
    #ifdef WOLFSSL_SP_INT_NEGATIVE
        /* Make number negative if signed string. */
        sign = MP_NEG;
        in++;
    #else
        /* Negative numbers are not supported in this build. */
        return MP_VAL;
    #endif
    }

    /* Skip leading zeros. */
    while (*in == '0') {
        in++;
    }

    /* Convert the remaining characters with the reader for the radix. */
    if (radix == 16) {
        err = _sp_read_radix_16(a, in);
    }
#ifdef WOLFSSL_SP_READ_RADIX_10
    else if (radix == 10) {
        err = _sp_read_radix_10(a, in);
    }
#endif
    else {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_INT_NEGATIVE
    if (err == MP_OKAY) {
        /* Ensure not negative when zero. */
        if (sp_iszero(a)) {
            a->sign = MP_ZPOS;
        }
        else {
            a->sign = sign;
        }
    }
#endif

    return err;
}
#endif /* WOLFSSL_SP_READ_RADIX_16 || WOLFSSL_SP_READ_RADIX_10 */
18530
18531
#if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
    defined(WC_MP_TO_RADIX)
/* Put the big-endian, hex string encoding of a into str.
 *
 * Assumes str is large enough for result.
 * Use sp_radix_size() to calculate required length.
 *
 * Unless WC_DISABLE_RADIX_ZERO_PAD is defined, the most-significant byte is
 * written as two characters so that the string represents complete bytes.
 * When WOLFSSL_SP_INT_NEGATIVE is defined, a negative number is prefixed with
 * a '-' character.
 *
 * @param  [in]   a    SP integer to convert.
 * @param  [out]  str  String to hold hex string result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when a or str is NULL.
 */
int sp_tohex(const sp_int* a, char* str)
{
    int err = MP_OKAY;

    /* Validate parameters. */
    if ((a == NULL) || (str == NULL)) {
        err = MP_VAL;
    }

    if (err == MP_OKAY) {
        /* Quick out if number is zero. */
        if (sp_iszero(a) == MP_YES) {
        #ifndef WC_DISABLE_RADIX_ZERO_PAD
            /* Make string represent complete bytes. */
            *str++ = '0';
        #endif /* WC_DISABLE_RADIX_ZERO_PAD */
            *str++ = '0';
        }
        else {
            int i;
            int j;
            sp_int_digit d;

        #ifdef WOLFSSL_SP_INT_NEGATIVE
            if (a->sign == MP_NEG) {
                /* Add negative sign character. */
                *str = '-';
                str++;
            }
        #endif /* WOLFSSL_SP_INT_NEGATIVE */

            /* Start at last digit - most significant digit. */
            i = (int)(a->used - 1);
            /* Step over zero top digits but never move below digit 0 so that
             * the digit array is not read out of bounds.
             */
            while ((i > 0) && (a->dp[i] == 0)) {
                i--;
            }
            d = a->dp[i];
        #ifndef WC_DISABLE_RADIX_ZERO_PAD
            /* Find highest non-zero byte in most-significant word.
             * Search stops at the lowest byte which is always written.
             */
            for (j = SP_WORD_SIZE - 8; j > 0; j -= 8) {
                if (((d >> j) & 0xff) != 0) {
                    break;
                }
            }
            /* Start with high nibble of byte. */
            j += 4;
        #else
            /* Find highest non-zero nibble in most-significant word.
             * Search stops at the lowest nibble which is always written.
             */
            for (j = SP_WORD_SIZE - 4; j > 0; j -= 4) {
                if (((d >> j) & 0xf) != 0) {
                    break;
                }
            }
        #endif /* WC_DISABLE_RADIX_ZERO_PAD */
            /* Write out as much as required from most-significant digit. */
            for (; j >= 0; j -= 4) {
                *(str++) = (char)ByteToHex((byte)(d >> j));
            }
            /* Write rest of digits. */
            for (--i; i >= 0; i--) {
                /* Get digit from memory. */
                d = a->dp[i];
                /* Write out all nibbles of digit. */
                for (j = SP_WORD_SIZE - 4; j >= 0; j -= 4) {
                    *(str++) = (char)ByteToHex((byte)(d >> j));
                }
            }
        }
        /* Terminate string. */
        *str = '\0';
    }

    return err;
}
#endif /* (WOLFSSL_SP_MATH_ALL && !WOLFSSL_RSA_VERIFY_ONLY) || WC_MP_TO_RADIX */
18632
18633
#if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
    defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) || \
    defined(WC_MP_TO_RADIX)
/* Put the big-endian, decimal string encoding of a into str.
 *
 * Assumes str is large enough for result.
 * Use sp_radix_size() to calculate required length.
 *
 * The digits are produced least-significant first by repeated division by 10
 * and then reversed in place.
 *
 * @param  [in]   a    SP integer to convert.
 * @param  [out]  str  String to hold decimal string result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when a or str is NULL, or a uses too many digits.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
int sp_todecimal(const sp_int* a, char* str)
{
    int err = MP_OKAY;
    int i;
    int j;
    sp_int_digit d = 0;

    /* Validate parameters. */
    if ((a == NULL) || (str == NULL)) {
        err = MP_VAL;
    }
    /* Quick out if number is zero. */
    else if (sp_iszero(a) == MP_YES) {
        *str++ = '0';
        *str = '\0';
    }
    /* Temporary needs one more digit than a. */
    else if (a->used >= SP_INT_DIGITS) {
        err = MP_VAL;
    }
    else {
        /* Temporary that is divided by 10. */
        DECL_SP_INT(t, a->used + 1);

        ALLOC_SP_INT_SIZE(t, a->used + 1, err, NULL);
        if (err == MP_OKAY) {
            _sp_copy(a, t);

        #ifdef WOLFSSL_SP_INT_NEGATIVE
            if (a->sign == MP_NEG) {
                /* Add negative sign character. */
                *str = '-';
                str++;
            }
        #endif /* WOLFSSL_SP_INT_NEGATIVE */

            /* Write out little endian. */
            i = 0;
            do {
                /* Divide by 10 and get remainder of division. */
                err = sp_div_d(t, 10, t, &d);
                if (err != MP_OKAY) {
                    /* Stop rather than loop on a value that isn't shrinking. */
                    break;
                }
                /* Write out remainder as a character. */
                str[i++] = (char)('0' + d);
            }
            /* Keep going while there is a value to write. */
            while (!sp_iszero(t));
            /* Terminate string. */
            str[i] = '\0';

            if (err == MP_OKAY) {
                /* Reverse string to big endian. */
                for (j = 0; j <= (i - 1) / 2; j++) {
                    int c = (unsigned char)str[j];
                    str[j] = str[i - 1 - j];
                    str[i - 1 - j] = (char)c;
                }
            }
        }

        FREE_SP_INT(t, NULL);
    }

    return err;
}
#endif /* (WOLFSSL_SP_MATH_ALL && !WOLFSSL_RSA_VERIFY_ONLY) || WOLFSSL_KEY_GEN ||
        * HAVE_COMP_KEY || WC_MP_TO_RADIX */
18713
18714
#if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
    defined(WC_MP_TO_RADIX)
/* Encode a as a big-endian string in the requested radix.
 *
 * Dispatches to sp_tohex() for base 16 and, when compiled in, to
 * sp_todecimal() for base 10.
 *
 * @param  [in]   a      SP integer to convert.
 * @param  [out]  str    String to hold the encoded result.
 * @param  [in]   radix  Base of character.
 *                       Valid values: MP_RADIX_HEX, MP_RADIX_DEC.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when a or str is NULL, or radix not supported.
 */
int sp_toradix(const sp_int* a, char* str, int radix)
{
    int err;

    /* Only convert when both the number and the output buffer are present. */
    if ((a != NULL) && (str != NULL)) {
        if (radix == MP_RADIX_HEX) {
            /* Base 16 requested. */
            err = sp_tohex(a, str);
        }
    #if defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_KEY_GEN) || \
        defined(HAVE_COMP_KEY)
        else if (radix == MP_RADIX_DEC) {
            /* Base 10 requested. */
            err = sp_todecimal(a, str);
        }
    #endif /* WOLFSSL_SP_MATH_ALL || WOLFSSL_KEY_GEN || HAVE_COMP_KEY */
        else {
            /* Base not supported. */
            err = MP_VAL;
        }
    }
    else {
        /* Missing parameter. */
        err = MP_VAL;
    }

    return err;
}
#endif /* (WOLFSSL_SP_MATH_ALL && !WOLFSSL_RSA_VERIFY_ONLY) || WC_MP_TO_RADIX */
18753
18754
#if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
    defined(WC_MP_TO_RADIX)
/* Work out how many characters, including the terminating '\0', are needed to
 * hold the big-endian string encoding of a in the given radix.
 *
 * @param  [in]   a      SP integer to convert.
 * @param  [in]   radix  Base of character.
 *                       Valid values: MP_RADIX_HEX, MP_RADIX_DEC.
 * @param  [out]  size   The number of characters in encoding.
 *                       Not written when an error is returned.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when a or size is NULL, or radix not supported.
 */
int sp_radix_size(const sp_int* a, int radix, int* size)
{
    int err = MP_OKAY;

    /* Validate parameters. */
    if ((a == NULL) || (size == NULL)) {
        err = MP_VAL;
    }
    /* Base 16. */
    else if (radix == MP_RADIX_HEX) {
        int chars;

        if (a->used != 0) {
            /* Number of nibbles needed for the bits of a. */
            chars = (sp_count_bits(a) + 3) >> 2;
        #ifndef WC_DISABLE_RADIX_ZERO_PAD
            /* Round up to an even count so only complete bytes are encoded. */
            chars += chars & 1;
        #endif /* WC_DISABLE_RADIX_ZERO_PAD */
        #ifdef WOLFSSL_SP_INT_NEGATIVE
            /* Room for the negative sign. */
            if (a->sign == MP_NEG) {
                chars++;
            }
        #endif /* WOLFSSL_SP_INT_NEGATIVE */
        }
        else {
        #ifndef WC_DISABLE_RADIX_ZERO_PAD
            /* "00" */
            chars = 2;
        #else
            /* "0" */
            chars = 1;
        #endif /* WC_DISABLE_RADIX_ZERO_PAD */
        }
        /* One more for \0 */
        *size = chars + 1;
    }
#if defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_KEY_GEN) || \
    defined(HAVE_COMP_KEY)
    /* Base 10. */
    else if (radix == MP_RADIX_DEC) {
        if (sp_iszero(a) == MP_YES) {
            /* "0" and '\0' */
            *size = 1 + 1;
        }
        else {
            int digits = 0;
            sp_int_digit rem;
            DECL_SP_INT(t, a->used);

            /* Scratch copy of a that is repeatedly divided by 10. */
            ALLOC_SP_INT(t, a->used, err, NULL);
            if (err == MP_OKAY) {
                t->size = a->used;
                _sp_copy(a, t);

                /* Each division by 10 strips one decimal digit. */
                while (!sp_iszero(t)) {
                    (void)sp_div_d(t, 10, t, &rem);
                    digits++;
                }
            #ifdef WOLFSSL_SP_INT_NEGATIVE
                /* Room for the negative sign. */
                if (a->sign == MP_NEG) {
                    digits++;
                }
            #endif /* WOLFSSL_SP_INT_NEGATIVE */
                /* One more for \0 */
                *size = digits + 1;
            }

            FREE_SP_INT(t, NULL);
        }
    }
#endif /* WOLFSSL_SP_MATH_ALL || WOLFSSL_KEY_GEN || HAVE_COMP_KEY */
    else {
        /* Base not supported. */
        err = MP_VAL;
    }

    return err;
}
#endif /* (WOLFSSL_SP_MATH_ALL && !WOLFSSL_RSA_VERIFY_ONLY) || WC_MP_TO_RADIX */
18854
18855
/***************************************
18856
 * Prime number generation and checking.
18857
 ***************************************/
18858
18859
#if defined(WOLFSSL_KEY_GEN) && (!defined(NO_RSA) || !defined(NO_DH) || \
    !defined(NO_DSA)) && !defined(WC_NO_RNG)
#ifndef WOLFSSL_SP_MILLER_RABIN_CNT
/* Number of Miller-Rabin iterations performed on each candidate when
 * generating a prime. Defaults to 8 when not set by the build.
 */
#define WOLFSSL_SP_MILLER_RABIN_CNT     8
#endif

/* Generate a random prime of the requested byte length.
 *
 * Random candidates have their top two bits and mandatory low bits set and are
 * tested with sp_prime_is_prime_ex() until one is reported as prime.
 * r is only modified once all parameters have been accepted.
 *
 * @param  [out]  r     SP integer to hold result.
 * @param  [in]   len   Number of bytes in prime. Use -ve to indicate the two
 *                      lowest bits must be set.
 * @param  [in]   rng   Random number generator.
 * @param  [in]   heap  Heap hint. Unused.
 *
 * @return  MP_OKAY on success
 * @return  MP_VAL when r or rng is NULL, length is not supported or random
 *          number generator fails.
 * @return  Other error codes as returned by sp_prime_is_prime_ex().
 */
int sp_rand_prime(sp_int* r, int len, WC_RNG* rng, void* heap)
{
    static const byte USE_BBS = 3;
    int  err = MP_OKAY;
    byte low_bits = 1;
    int  isPrime = MP_NO;
#if defined(WOLFSSL_SP_MATH_ALL) || defined(BIG_ENDIAN_ORDER)
    int  bits = 0;
#endif /* WOLFSSL_SP_MATH_ALL || BIG_ENDIAN_ORDER */
    unsigned int digits = 0;

    (void)heap;

    /* Check NULL parameters and 0 is not prime so 0 bytes is invalid. */
    if ((r == NULL) || (rng == NULL) || (len == 0)) {
        err = MP_VAL;
    }

    if (err == MP_OKAY) {
        /* Get type. */
        if (len < 0) {
            low_bits = USE_BBS;
            len = -len;
        }

        /* Get number of digits required to handle required number of bytes. */
        digits = ((unsigned int)len + SP_WORD_SIZEOF - 1) / SP_WORD_SIZEOF;
        /* Ensure result has space. */
        if (r->size < digits) {
            err = MP_VAL;
        }
    }

#ifndef WOLFSSL_SP_MATH_ALL
    if (err == MP_OKAY) {
        /* For minimal maths, support only what's in SP and needed for DH. */
    #if defined(WOLFSSL_HAVE_SP_DH) && defined(WOLFSSL_KEY_GEN)
        if (len == 32) {
        }
        else
    #endif /* WOLFSSL_HAVE_SP_DH && WOLFSSL_KEY_GEN */
        /* Generate RSA primes that are half the modulus length. */
    #ifdef WOLFSSL_SP_4096
        if (len == 256) {
            /* Support 2048-bit operations compiled in. */
        }
        else
    #endif
    #ifndef WOLFSSL_SP_NO_3072
        if (len == 192) {
            /* Support 1536-bit operations compiled in. */
        }
        else
    #endif
    #ifndef WOLFSSL_SP_NO_2048
        if (len == 128) {
            /* Support 1024-bit operations compiled in. */
        }
        else
    #endif
        {
            /* Bit length not supported in SP. */
            err = MP_VAL;
        }
    }
#endif /* !WOLFSSL_SP_MATH_ALL */

    /* Only touch the result once the length is known to be supported. */
    if (err == MP_OKAY) {
    #ifdef WOLFSSL_SP_INT_NEGATIVE
        /* Generated number is always positive. */
        r->sign = MP_ZPOS;
    #endif /* WOLFSSL_SP_INT_NEGATIVE */
        /* Set number of digits that will be used. */
        r->used = (sp_size_t)digits;
    #if defined(WOLFSSL_SP_MATH_ALL) || defined(BIG_ENDIAN_ORDER)
        /* Calculate number of bits in last digit. */
        bits = (len * 8) & SP_WORD_MASK;
    #endif /* WOLFSSL_SP_MATH_ALL || BIG_ENDIAN_ORDER */
    }

    /* Assume the candidate is probably prime and then test until it is proven
     * composite.
     */
    while ((err == MP_OKAY) && (isPrime == MP_NO)) {
#ifdef SHOW_GEN
        printf(".");
        fflush(stdout);
#endif /* SHOW_GEN */
        /* Generate bytes into digit array. */
        err = wc_RNG_GenerateBlock(rng, (byte*)r->dp, (word32)len);
        if (err != 0) {
            err = MP_VAL;
            break;
        }

        /* Set top bits to ensure bit length required is generated.
         * Also set second top to help ensure product of two primes is
         * going to be twice the number of bits of each.
         */
#ifdef LITTLE_ENDIAN_ORDER
        ((byte*)r->dp)[len-1]             |= 0x80 | 0x40;
#else
        ((byte*)(r->dp + r->used - 1))[0] |= 0x80 | 0x40;
#endif /* LITTLE_ENDIAN_ORDER */

#ifdef BIG_ENDIAN_ORDER
        /* Bytes were put into wrong place when less than full digit. */
        if (bits != 0) {
            r->dp[r->used - 1] >>= SP_WORD_SIZE - bits;
        }
#endif /* BIG_ENDIAN_ORDER */
#ifdef WOLFSSL_SP_MATH_ALL
        /* Mask top digit when less than a digit requested. */
        if (bits > 0) {
            r->dp[r->used - 1] &= ((sp_int_digit)1 << bits) - 1;
        }
#endif /* WOLFSSL_SP_MATH_ALL */
        /* Set mandatory low bits
         *  - bottom bit to make odd.
         *  - For BBS, second lowest too to make Blum integer (3 mod 4).
         */
        r->dp[0] |= low_bits;

        /* Note 4.49 of Handbook of Applied Cryptography: running Miller-Rabin
         * 3 times gives a 2^{-80} chance of a 1024-bit candidate being a
         * false positive. WOLFSSL_SP_MILLER_RABIN_CNT iterations are requested
         * here.
         */
        err = sp_prime_is_prime_ex(r, WOLFSSL_SP_MILLER_RABIN_CNT, &isPrime,
            rng);
    }

    return err;
}
#endif /* WOLFSSL_KEY_GEN && (!NO_RSA || !NO_DH || !NO_DSA) && !WC_NO_RNG */
19012
19013
#ifdef WOLFSSL_SP_PRIME_GEN
19014
/* Miller-Rabin test of "a" to the base of "b" as described in
19015
 * HAC pp. 139 Algorithm 4.24
19016
 *
19017
 * Sets result to 0 if definitely composite or 1 if probably prime.
19018
 * Randomly the chance of error is no more than 1/4 and often
19019
 * very much lower.
19020
 *
19021
 * a is assumed to be odd.
19022
 *
19023
 * @param  [in]   a       SP integer to check.
19024
 * @param  [in]   b       SP integer that is a small prime.
19025
 * @param  [out]  result  MP_YES when number is likely prime.
19026
 *                        MP_NO otherwise.
19027
 * @param  [in]   n1      SP integer temporary.
19028
 * @param  [in]   r       SP integer temporary.
19029
 *
19030
 * @return  MP_OKAY on success.
19031
 * @return  MP_MEM when dynamic memory allocation fails.
19032
 */
19033
static int sp_prime_miller_rabin(const sp_int* a, sp_int* b, int* result,
19034
    sp_int* n1, sp_int* r)
19035
102k
{
19036
102k
    int err = MP_OKAY;
19037
102k
    int s = 0;
19038
102k
    sp_int* y = b;
19039
19040
    /* Assume not prime. */
19041
102k
    *result = MP_NO;
19042
19043
    /* Ensure small prime is 2 or more. */
19044
102k
    if (sp_cmp_d(b, 1) != MP_GT) {
19045
0
        err = MP_VAL;
19046
0
    }
19047
102k
    if (err == MP_OKAY) {
19048
        /* n1 = a - 1 (a is assumed odd.) */
19049
102k
        (void)sp_copy(a, n1);
19050
102k
        n1->dp[0]--;
19051
19052
        /* Set 2**s * r = n1 */
19053
        /* Count the number of least significant bits which are zero. */
19054
102k
        s = sp_cnt_lsb(n1);
19055
        /* Divide n - 1 by 2**s into r. */
19056
102k
        (void)sp_rshb(n1, s, r);
19057
19058
        /* Compute y = b**r mod a */
19059
102k
        err = sp_exptmod(b, r, a, y);
19060
102k
    }
19061
102k
    if (err == MP_OKAY) {
19062
        /* Assume probably prime until shown otherwise. */
19063
102k
        *result = MP_YES;
19064
19065
        /* If y != 1 and y != n1 do */
19066
102k
        if ((sp_cmp_d(y, 1) != MP_EQ) && (_sp_cmp(y, n1) != MP_EQ)) {
19067
63.0k
            int j = 1;
19068
            /* While j <= s-1 and y != n1 */
19069
600k
            while ((j <= (s - 1)) && (_sp_cmp(y, n1) != MP_EQ)) {
19070
                /* Square for bit shifted down. */
19071
537k
                err = sp_sqrmod(y, a, y);
19072
537k
                if (err != MP_OKAY) {
19073
27
                    break;
19074
27
                }
19075
19076
                /* If y == 1 then composite. */
19077
537k
                if (sp_cmp_d(y, 1) == MP_EQ) {
19078
0
                    *result = MP_NO;
19079
0
                    break;
19080
0
                }
19081
537k
                ++j;
19082
537k
            }
19083
19084
            /* If y != n1 then composite. */
19085
63.0k
            if ((*result == MP_YES) && (_sp_cmp(y, n1) != MP_EQ)) {
19086
6.33k
                *result = MP_NO;
19087
6.33k
            }
19088
63.0k
        }
19089
102k
    }
19090
19091
102k
    return err;
19092
102k
}
19093
19094
#if SP_WORD_SIZE == 8
19095
/* Number of pre-computed primes. First n primes - fitting in a digit. */
19096
#define SP_PRIME_SIZE      54
19097
19098
static const sp_int_digit sp_primes[SP_PRIME_SIZE] = {
19099
    0x02, 0x03, 0x05, 0x07, 0x0B, 0x0D, 0x11, 0x13,
19100
    0x17, 0x1D, 0x1F, 0x25, 0x29, 0x2B, 0x2F, 0x35,
19101
    0x3B, 0x3D, 0x43, 0x47, 0x49, 0x4F, 0x53, 0x59,
19102
    0x61, 0x65, 0x67, 0x6B, 0x6D, 0x71, 0x7F, 0x83,
19103
    0x89, 0x8B, 0x95, 0x97, 0x9D, 0xA3, 0xA7, 0xAD,
19104
    0xB3, 0xB5, 0xBF, 0xC1, 0xC5, 0xC7, 0xD3, 0xDF,
19105
    0xE3, 0xE5, 0xE9, 0xEF, 0xF1, 0xFB
19106
};
19107
#else
19108
/* Number of pre-computed primes. First n primes. */
19109
157k
#define SP_PRIME_SIZE      256
19110
19111
/* The first 256 primes. */
19112
static const sp_uint16 sp_primes[SP_PRIME_SIZE] = {
19113
    0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
19114
    0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
19115
    0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
19116
    0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
19117
    0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
19118
    0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
19119
    0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
19120
    0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
19121
19122
    0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
19123
    0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
19124
    0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
19125
    0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
19126
    0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
19127
    0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
19128
    0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
19129
    0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
19130
19131
    0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
19132
    0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
19133
    0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
19134
    0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
19135
    0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
19136
    0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
19137
    0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
19138
    0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
19139
19140
    0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
19141
    0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
19142
    0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
19143
    0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
19144
    0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
19145
    0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
19146
    0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
19147
    0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
19148
};
19149
#endif
19150
19151
/* Compare the first n primes with a.
19152
 *
19153
 * @param [in]  a       Number to check.
19154
 * @param [out] result  Whether number was found to be prime.
19155
 * @return  0 when no small prime matches.
19156
 * @return  1 when small prime matches.
19157
 */
19158
static WC_INLINE int sp_cmp_primes(const sp_int* a, int* result)
19159
875
{
19160
875
    int i;
19161
875
    int haveRes = 0;
19162
19163
875
    *result = MP_NO;
19164
    /* Check one digit a against primes table. */
19165
128k
    for (i = 0; i < SP_PRIME_SIZE; i++) {
19166
128k
        if (sp_cmp_d(a, sp_primes[i]) == MP_EQ) {
19167
286
            *result = MP_YES;
19168
286
            haveRes = 1;
19169
286
            break;
19170
286
        }
19171
128k
    }
19172
19173
875
    return haveRes;
19174
875
}
19175
19176
/* Using composites is only faster when using 64-bit values. */
#if !defined(WOLFSSL_SP_SMALL) && (SP_WORD_SIZE == 64)
/* Number of composites. */
#define SP_COMP_CNT     38

/* Products of small primes that fit into 64-bits.
 * Read-only lookup table.
 */
static const sp_int_digit sp_comp[SP_COMP_CNT] = {
    0x088886ffdb344692, 0x34091fa96ffdf47b, 0x3c47d8d728a77ebb,
    0x077ab7da9d709ea9, 0x310df3e7bd4bc897, 0xe657d7a1fd5161d1,
    0x02ad3dbe0cca85ff, 0x0787f9a02c3388a7, 0x1113c5cc6d101657,
    0x2456c94f936bdb15, 0x4236a30b85ffe139, 0x805437b38eada69d,
    0x00723e97bddcd2af, 0x00a5a792ee239667, 0x00e451352ebca269,
    0x013a7955f14b7805, 0x01d37cbd653b06ff, 0x0288fe4eca4d7cdf,
    0x039fddb60d3af63d, 0x04cd73f19080fb03, 0x0639c390b9313f05,
    0x08a1c420d25d388f, 0x0b4b5322977db499, 0x0e94c170a802ee29,
    0x11f6a0e8356100df, 0x166c8898f7b3d683, 0x1babda0a0afd724b,
    0x2471b07c44024abf, 0x2d866dbc2558ad71, 0x3891410d45fb47df,
    0x425d5866b049e263, 0x51f767298e2cf13b, 0x6d9f9ece5fc74f13,
    0x7f5ffdb0f56ee64d, 0x943740d46a1bc71f, 0xaf2d7ca25cec848f,
    0xcec010484e4ad877, 0xef972c3cfafbcd25
};

/* Index of next prime after those used to create composite.
 * Entry j is the index into sp_primes at which trial division for
 * sp_comp[j] stops. Read-only lookup table.
 */
static const int sp_comp_idx[SP_COMP_CNT] = {
     15,  25,  34,  42,  50,  58,  65,  72,  79,  86,  93, 100, 106, 112, 118,
    124, 130, 136, 142, 148, 154, 160, 166, 172, 178, 184, 190, 196, 202, 208,
    214, 220, 226, 232, 238, 244, 250, 256
};
#endif
19205
19206
/* Determines whether any of the first n small primes divide a evenly.
19207
 *
19208
 * @param [in]      a        Number to check.
19209
 * @param [in, out] haveRes  Boolean indicating a no prime result found.
19210
 * @param [in, out] result   Whether a is known to be prime.
19211
 * @return  MP_OKAY on success.
19212
 * @return  Negative on failure.
19213
 */
19214
static WC_INLINE int sp_div_primes(const sp_int* a, int* haveRes, int* result)
19215
34.7k
{
19216
34.7k
    int i;
19217
34.7k
#if !defined(WOLFSSL_SP_SMALL) && (SP_WORD_SIZE == 64)
19218
34.7k
    int j;
19219
34.7k
#endif
19220
34.7k
    sp_int_digit d;
19221
34.7k
    int err = MP_OKAY;
19222
19223
#if defined(WOLFSSL_SP_SMALL) || (SP_WORD_SIZE < 64)
19224
    /* Do trial division of a with all known small primes. */
19225
    for (i = 0; i < SP_PRIME_SIZE; i++) {
19226
        /* Small prime divides a when remainder is 0. */
19227
        err = sp_mod_d(a, (sp_int_digit)sp_primes[i], &d);
19228
        if ((err != MP_OKAY) || (d == 0)) {
19229
            *result = MP_NO;
19230
            *haveRes = 1;
19231
            break;
19232
        }
19233
    }
19234
#else
19235
    /* Start with first prime in composite. */
19236
34.7k
    i = 0;
19237
320k
    for (j = 0; (!(*haveRes)) && (j < SP_COMP_CNT); j++) {
19238
        /* Reduce a down to a single word.  */
19239
285k
        err = sp_mod_d(a, sp_comp[j], &d);
19240
285k
        if ((err != MP_OKAY) || (d == 0)) {
19241
56
            *result = MP_NO;
19242
56
            *haveRes = 1;
19243
56
            break;
19244
56
        }
19245
        /* Do trial division of d with small primes that make up composite. */
19246
2.14M
        for (; i < sp_comp_idx[j]; i++) {
19247
            /* Small prime divides a when remainder is 0. */
19248
1.89M
            if (d % sp_primes[i] == 0) {
19249
28.5k
                *result = MP_NO;
19250
28.5k
                *haveRes = 1;
19251
28.5k
                break;
19252
28.5k
            }
19253
1.89M
        }
19254
285k
    }
19255
34.7k
#endif
19256
19257
34.7k
    return err;
19258
34.7k
}
19259
19260
/* Check whether a is prime by checking t iterations of Miller-Rabin.
19261
 *
19262
 * @param  [in]   a       SP integer to check.
19263
 * @param  [in]   trials  Number of trials of Miller-Rabin test to perform.
19264
 * @param  [out]  result  MP_YES when number is prime.
19265
 *                        MP_NO otherwise.
19266
 *
19267
 * @return  MP_OKAY on success.
19268
 * @return  MP_MEM when dynamic memory allocation fails.
19269
 */
19270
static int _sp_prime_trials(const sp_int* a, int trials, int* result)
19271
0
{
19272
0
    int err = MP_OKAY;
19273
0
    int i;
19274
0
    DECL_SP_INT(n1, a->used + 1);
19275
0
    DECL_SP_INT(r, a->used + 1);
19276
0
    DECL_SP_INT(b, a->used * 2 + 1);
19277
19278
0
    ALLOC_SP_INT(n1, a->used + 1, err, NULL);
19279
0
    ALLOC_SP_INT(r, a->used + 1, err, NULL);
19280
    /* Allocate number that will hold modular exponentiation result. */
19281
0
    ALLOC_SP_INT(b, a->used * 2 + 1, err, NULL);
19282
0
    if (err == MP_OKAY) {
19283
0
        _sp_init_size(n1, a->used + 1U);
19284
0
        _sp_init_size(r, a->used + 1U);
19285
0
        _sp_init_size(b, (sp_size_t)(a->used * 2U + 1U));
19286
19287
        /* Do requested number of trials of Miller-Rabin test. */
19288
0
        for (i = 0; i < trials; i++) {
19289
            /* Miller-Rabin test with known small prime. */
19290
0
            _sp_set(b, sp_primes[i]);
19291
0
            err = sp_prime_miller_rabin(a, b, result, n1, r);
19292
0
            if ((err != MP_OKAY) || (*result == MP_NO)) {
19293
0
                break;
19294
0
            }
19295
0
        }
19296
19297
        /* Clear temporary values. */
19298
0
        sp_clear(n1);
19299
0
        sp_clear(r);
19300
0
        sp_clear(b);
19301
0
    }
19302
19303
    /* Free allocated temporary. */
19304
0
    FREE_SP_INT(b, NULL);
19305
0
    FREE_SP_INT(r, NULL);
19306
0
    FREE_SP_INT(n1, NULL);
19307
0
    return err;
19308
0
}
19309
19310
/* Check whether a is prime.
19311
 * Checks against a number of small primes and does t iterations of
19312
 * Miller-Rabin.
19313
 *
19314
 * @param  [in]   a       SP integer to check.
19315
 * @param  [in]   trials  Number of trials of Miller-Rabin test to perform.
19316
 * @param  [out]  result  MP_YES when number is prime.
19317
 *                        MP_NO otherwise.
19318
 *
19319
 * @return  MP_OKAY on success.
19320
 * @return  MP_VAL when a or result is NULL, or trials is out of range.
19321
 * @return  MP_MEM when dynamic memory allocation fails.
19322
 */
19323
int sp_prime_is_prime(const sp_int* a, int trials, int* result)
19324
0
{
19325
0
    int         err = MP_OKAY;
19326
0
    int         haveRes = 0;
19327
19328
    /* Validate parameters. */
19329
0
    if ((a == NULL) || (result == NULL)) {
19330
0
        if (result != NULL) {
19331
0
            *result = MP_NO;
19332
0
        }
19333
0
        err = MP_VAL;
19334
0
    }
19335
0
    else if (a->used * 2 >= SP_INT_DIGITS) {
19336
0
        err = MP_VAL;
19337
0
    }
19338
    /* Check validity of Miller-Rabin iterations count.
19339
     * Must do at least one and need a unique pre-computed prime for each
19340
     * iteration.
19341
     */
19342
0
    if ((err == MP_OKAY) && ((trials <= 0) || (trials > SP_PRIME_SIZE))) {
19343
0
        *result = MP_NO;
19344
0
        err = MP_VAL;
19345
0
    }
19346
19347
    /* Short-cut, 1 is not prime. */
19348
0
    if ((err == MP_OKAY) && sp_isone(a)) {
19349
0
        *result = MP_NO;
19350
0
        haveRes = 1;
19351
0
    }
19352
19353
0
    SAVE_VECTOR_REGISTERS(err = _svr_ret;);
19354
19355
    /* Check against known small primes when a has 1 digit. */
19356
0
    if ((err == MP_OKAY) && (!haveRes) && (a->used == 1) &&
19357
0
            (a->dp[0] <= sp_primes[SP_PRIME_SIZE - 1])) {
19358
0
        haveRes = sp_cmp_primes(a, result);
19359
0
    }
19360
19361
    /* Check all small primes for even divisibility. */
19362
0
    if ((err == MP_OKAY) && (!haveRes)) {
19363
0
        err = sp_div_primes(a, &haveRes, result);
19364
0
    }
19365
19366
    /* Check a number of iterations of Miller-Rabin with small primes. */
19367
0
    if ((err == MP_OKAY) && (!haveRes)) {
19368
0
        err = _sp_prime_trials(a, trials, result);
19369
0
    }
19370
19371
0
    RESTORE_VECTOR_REGISTERS();
19372
19373
0
    return err;
19374
0
}
19375
19376
#ifndef WC_NO_RNG
19377
/* Check whether a is prime by doing t iterations of Miller-Rabin.
19378
 *
19379
 * t random numbers should give a (1/4)^t chance of a false prime.
19380
 *
19381
 * @param  [in]   a       SP integer to check.
19382
 * @param  [in]   trials  Number of iterations of Miller-Rabin test to perform.
19383
 * @param  [out]  result  MP_YES when number is prime.
19384
 *                        MP_NO otherwise.
19385
 * @param  [in]   rng     Random number generator for Miller-Rabin testing.
19386
 *
19387
 * @return  MP_OKAY on success.
19388
 * @return  MP_VAL when a, result or rng is NULL.
19389
 * @return  MP_MEM when dynamic memory allocation fails.
19390
 */
19391
static int _sp_prime_random_trials(const sp_int* a, int trials, int* result,
19392
    WC_RNG* rng)
19393
8.10k
{
19394
8.10k
    int err = MP_OKAY;
19395
8.10k
    int bits = sp_count_bits(a);
19396
8.10k
    word32 baseSz = ((word32)bits + 7) >> 3;
19397
8.10k
    DECL_SP_INT_ARRAY(ds, a->used + 1, 2);
19398
8.10k
    DECL_SP_INT_ARRAY(d, a->used * 2 + 1, 2);
19399
19400
8.10k
    ALLOC_SP_INT_ARRAY(ds, a->used + 1, 2, err, NULL);
19401
8.10k
    ALLOC_SP_INT_ARRAY(d, a->used * 2 + 1, 2, err, NULL);
19402
8.10k
    if (err == MP_OKAY) {
19403
8.02k
        sp_int* c  = ds[0];
19404
8.02k
        sp_int* n1 = ds[1];
19405
8.02k
        sp_int* b  = d[0];
19406
8.02k
        sp_int* r  = d[1];
19407
19408
8.02k
        _sp_init_size(c , a->used + 1U);
19409
8.02k
        _sp_init_size(n1, a->used + 1U);
19410
8.02k
        _sp_init_size(b , (sp_size_t)(a->used * 2U + 1U));
19411
8.02k
        _sp_init_size(r , (sp_size_t)(a->used * 2U + 1U));
19412
19413
8.02k
        _sp_sub_d(a, 2, c);
19414
19415
8.02k
        bits &= SP_WORD_MASK;
19416
19417
        /* Keep trying random numbers until all trials complete. */
19418
145k
        while (trials > 0) {
19419
            /* Generate random trial number. */
19420
144k
            err = wc_RNG_GenerateBlock(rng, (byte*)b->dp, baseSz);
19421
144k
            if (err != MP_OKAY) {
19422
101
                break;
19423
101
            }
19424
144k
            b->used = a->used;
19425
        #ifdef BIG_ENDIAN_ORDER
19426
            /* Fix top digit if fewer bytes than a full digit generated. */
19427
            if (((baseSz * 8) & SP_WORD_MASK) != 0) {
19428
                b->dp[b->used-1] >>=
19429
                    SP_WORD_SIZE - ((baseSz * 8) & SP_WORD_MASK);
19430
            }
19431
        #endif /* BIG_ENDIAN_ORDER */
19432
19433
            /* Ensure the top word has no more bits than necessary. */
19434
144k
            if (bits > 0) {
19435
106k
                b->dp[b->used - 1] &= ((sp_int_digit)1 << bits) - 1;
19436
106k
                sp_clamp(b);
19437
106k
            }
19438
19439
            /* Can't use random value it is: 0, 1, a-2, a-1, >= a  */
19440
144k
            if ((sp_cmp_d(b, 2) != MP_GT) || (_sp_cmp(b, c) != MP_LT)) {
19441
41.7k
                continue;
19442
41.7k
            }
19443
19444
            /* Perform Miller-Rabin test with random value. */
19445
102k
            err = sp_prime_miller_rabin(a, b, result, n1, r);
19446
102k
            if ((err != MP_OKAY) || (*result == MP_NO)) {
19447
6.58k
                break;
19448
6.58k
            }
19449
19450
            /* Trial complete. */
19451
96.0k
            trials--;
19452
96.0k
        }
19453
19454
        /* Zeroize temporary values used when generating private prime. */
19455
8.02k
        sp_forcezero(n1);
19456
8.02k
        sp_forcezero(r);
19457
8.02k
        sp_forcezero(b);
19458
8.02k
        sp_forcezero(c);
19459
8.02k
    }
19460
19461
8.10k
    FREE_SP_INT_ARRAY(d, NULL);
19462
8.10k
    FREE_SP_INT_ARRAY(ds, NULL);
19463
8.10k
    return err;
19464
8.10k
}
19465
#endif /*!WC_NO_RNG */
19466
19467
/* Check whether a is prime.
19468
 * Checks against a number of small primes and does t iterations of
19469
 * Miller-Rabin.
19470
 *
19471
 * @param  [in]   a       SP integer to check.
19472
 * @param  [in]   trials  Number of iterations of Miller-Rabin test to perform.
19473
 * @param  [out]  result  MP_YES when number is prime.
19474
 *                        MP_NO otherwise.
19475
 * @param  [in]   rng     Random number generator for Miller-Rabin testing.
19476
 *
19477
 * @return  MP_OKAY on success.
19478
 * @return  MP_VAL when a, result or rng is NULL.
19479
 * @return  MP_MEM when dynamic memory allocation fails.
19480
 */
19481
int sp_prime_is_prime_ex(const sp_int* a, int trials, int* result, WC_RNG* rng)
19482
26.1k
{
19483
26.1k
    int err = MP_OKAY;
19484
26.1k
    int ret = MP_YES;
19485
26.1k
    int haveRes = 0;
19486
19487
26.1k
    if ((a == NULL) || (result == NULL) || (rng == NULL)) {
19488
0
        err = MP_VAL;
19489
0
    }
19490
26.1k
#ifndef WC_NO_RNG
19491
26.1k
    if ((err == MP_OKAY) && (a->used * 2 >= SP_INT_DIGITS)) {
19492
3
        err = MP_VAL;
19493
3
    }
19494
26.1k
#endif
19495
26.1k
#ifdef WOLFSSL_SP_INT_NEGATIVE
19496
26.1k
    if ((err == MP_OKAY) && (a->sign == MP_NEG)) {
19497
17
        err = MP_VAL;
19498
17
    }
19499
26.1k
#endif
19500
19501
    /* Ensure trials is valid. Maximum based on number of small primes
19502
     * available. */
19503
26.1k
    if ((err == MP_OKAY) && ((trials <= 0) || (trials > SP_PRIME_SIZE))) {
19504
13
        err = MP_VAL;
19505
13
    }
19506
19507
26.1k
    if ((err == MP_OKAY) && sp_isone(a)) {
19508
11
        ret = MP_NO;
19509
11
        haveRes = 1;
19510
11
    }
19511
19512
26.1k
    SAVE_VECTOR_REGISTERS(err = _svr_ret;);
19513
19514
    /* Check against known small primes when a has 1 digit. */
19515
26.1k
    if ((err == MP_OKAY) && (!haveRes) && (a->used == 1) &&
19516
2.84k
            (a->dp[0] <= (sp_int_digit)sp_primes[SP_PRIME_SIZE - 1])) {
19517
423
        haveRes = sp_cmp_primes(a, &ret);
19518
423
    }
19519
19520
    /* Check all small primes for even divisibility. */
19521
26.1k
    if ((err == MP_OKAY) && (!haveRes)) {
19522
25.9k
        err = sp_div_primes(a, &haveRes, &ret);
19523
25.9k
    }
19524
19525
26.1k
#ifndef WC_NO_RNG
19526
    /* Check a number of iterations of Miller-Rabin with random large values. */
19527
26.1k
    if ((err == MP_OKAY) && (!haveRes)) {
19528
5.20k
        err = _sp_prime_random_trials(a, trials, &ret, rng);
19529
5.20k
    }
19530
#else
19531
    (void)trials;
19532
#endif /* !WC_NO_RNG */
19533
19534
26.1k
    if (result != NULL) {
19535
26.1k
        *result = ret;
19536
26.1k
    }
19537
19538
26.1k
    RESTORE_VECTOR_REGISTERS();
19539
19540
26.1k
    return err;
19541
26.1k
}
19542
#endif /* WOLFSSL_SP_PRIME_GEN */
19543
19544
#if !defined(NO_RSA) && defined(WOLFSSL_KEY_GEN)
19545
19546
/* Calculates the Greatest Common Denominator (GCD) of a and b into r.
19547
 *
19548
 * Find the largest number that divides both a and b without remainder.
19549
 * r <= a, r <= b, a % r == 0, b % r == 0
19550
 *
19551
 * a and b are positive integers.
19552
 *
19553
 * Euclidean Algorithm:
19554
 *  1. If a > b then a = b, b = a
19555
 *  2. u = a
19556
 *  3. v = b % a
19557
 *  4. While v != 0
19558
 *   4.1. t = u % v
19559
 *   4.2. u <= v, v <= t, t <= u
19560
 *  5. r = u
19561
 *
19562
 * @param  [in]   a  SP integer of first operand.
19563
 * @param  [in]   b  SP integer of second operand.
19564
 * @param  [out]  r  SP integer to hold result.
19565
 *
19566
 * @return  MP_OKAY on success.
19567
 * @return  MP_MEM when dynamic memory allocation fails.
19568
 */
19569
static WC_INLINE int _sp_gcd(const sp_int* a, const sp_int* b, sp_int* r)
19570
1.36k
{
19571
1.36k
    int err = MP_OKAY;
19572
1.36k
    sp_int* u = NULL;
19573
1.36k
    sp_int* v = NULL;
19574
1.36k
    sp_int* t = NULL;
19575
    /* Used for swapping sp_ints. */
19576
1.36k
    sp_int* s;
19577
    /* Determine maximum digit length numbers will reach. */
19578
1.36k
    unsigned int used = (a->used >= b->used) ? a->used + 1U : b->used + 1U;
19579
1.36k
    DECL_SP_INT_ARRAY(d, used, 3);
19580
19581
1.36k
    SAVE_VECTOR_REGISTERS(err = _svr_ret;);
19582
19583
1.36k
    ALLOC_SP_INT_ARRAY(d, used, 3, err, NULL);
19584
1.36k
    if (err == MP_OKAY) {
19585
1.34k
        u = d[0];
19586
1.34k
        v = d[1];
19587
1.34k
        t = d[2];
19588
19589
1.34k
        _sp_init_size(u, used);
19590
1.34k
        _sp_init_size(v, used);
19591
1.34k
        _sp_init_size(t, used);
19592
19593
        /* 1. If a > b then a = b, b = a.
19594
         *    Make a <= b.
19595
         */
19596
1.34k
        if (_sp_cmp(a, b) == MP_GT) {
19597
658
            const sp_int* tmp;
19598
658
            tmp = a;
19599
658
            a = b;
19600
658
            b = tmp;
19601
658
        }
19602
        /* 2. u = a, v = b mod a */
19603
1.34k
        _sp_copy(a, u);
19604
        /* 3. v = b mod a */
19605
1.34k
        if (a->used == 1) {
19606
855
            err = sp_mod_d(b, a->dp[0], &v->dp[0]);
19607
855
            v->used = (v->dp[0] != 0);
19608
855
        }
19609
492
        else {
19610
492
            err = sp_mod(b, a, v);
19611
492
        }
19612
1.34k
    }
19613
19614
    /* 4. While v != 0 */
19615
    /* Keep reducing larger by smaller until smaller is 0 or u and v both one
19616
     * digit.
19617
     */
19618
36.0k
    while ((err == MP_OKAY) && (!sp_iszero(v)) && (u->used > 1)) {
19619
        /* u' = v, v' = u mod v */
19620
        /* 4.1 t = u mod v */
19621
34.6k
        if (v->used == 1) {
19622
469
            err = sp_mod_d(u, v->dp[0], &t->dp[0]);
19623
469
            t->used = (t->dp[0] != 0);
19624
469
        }
19625
34.2k
        else {
19626
34.2k
            err = sp_mod(u, v, t);
19627
34.2k
        }
19628
        /* 4.2. u <= v, v <= t, t <= u */
19629
34.6k
        s = u; u = v; v = t; t = s;
19630
34.6k
    }
19631
    /* Only one digit remaining in u and v. */
19632
22.1k
    while ((err == MP_OKAY) && (!sp_iszero(v))) {
19633
        /* u' = v, v' = u mod v */
19634
        /* 4.1 t = u mod v */
19635
20.7k
        t->dp[0] = u->dp[0] % v->dp[0];
19636
20.7k
        t->used = (t->dp[0] != 0);
19637
        /* 4.2. u <= v, v <= t, t <= u */
19638
20.7k
        s = u; u = v; v = t; t = s;
19639
20.7k
    }
19640
1.36k
    if (err == MP_OKAY) {
19641
        /* 5. r = u */
19642
1.34k
        _sp_copy(u, r);
19643
1.34k
    }
19644
19645
1.36k
    FREE_SP_INT_ARRAY(d, NULL);
19646
19647
1.36k
    RESTORE_VECTOR_REGISTERS();
19648
19649
1.36k
    return err;
19650
1.36k
}
19651
19652
/* Calculates the Greatest Common Denominator (GCD) of a and b into r.
19653
 *
19654
 * Find the largest number that divides both a and b without remainder.
19655
 * r <= a, r <= b, a % r == 0, b % r == 0
19656
 *
19657
 * a and b are positive integers.
19658
 *
19659
 * @param  [in]   a  SP integer of first operand.
19660
 * @param  [in]   b  SP integer of second operand.
19661
 * @param  [out]  r  SP integer to hold result.
19662
 *
19663
 * @return  MP_OKAY on success.
19664
 * @return  MP_VAL when a, b or r is NULL or too large.
19665
 * @return  MP_MEM when dynamic memory allocation fails.
19666
 */
19667
int sp_gcd(const sp_int* a, const sp_int* b, sp_int* r)
19668
696
{
19669
696
    int err = MP_OKAY;
19670
19671
    /* Validate parameters. */
19672
696
    if ((a == NULL) || (b == NULL) || (r == NULL)) {
19673
0
        err = MP_VAL;
19674
0
    }
19675
    /* Check that we have space in numbers to do work. */
19676
696
    else if ((a->used >= SP_INT_DIGITS) || (b->used >= SP_INT_DIGITS)) {
19677
8
        err = MP_VAL;
19678
8
    }
19679
    /* Check that r is large enough to hold maximum sized result. */
19680
688
    else if (((a->used <= b->used) && (r->size < a->used)) ||
19681
684
             ((b->used < a->used) && (r->size < b->used))) {
19682
8
        err = MP_VAL;
19683
8
    }
19684
680
#ifdef WOLFSSL_SP_INT_NEGATIVE
19685
    /* Algorithm doesn't work with negative numbers. */
19686
680
    else if ((a->sign == MP_NEG) || (b->sign == MP_NEG)) {
19687
0
        err = MP_VAL;
19688
0
    }
19689
680
#endif
19690
680
    else if (sp_iszero(a)) {
19691
        /* GCD of 0 and 0 is undefined - all integers divide 0. */
19692
88
        if (sp_iszero(b)) {
19693
35
            err = MP_VAL;
19694
35
        }
19695
53
        else {
19696
            /* GCD of 0 and b is b - b divides 0. */
19697
53
            err = sp_copy(b, r);
19698
53
        }
19699
88
    }
19700
592
    else if (sp_iszero(b)) {
19701
        /* GCD of 0 and a is a - a divides 0. */
19702
46
        err = sp_copy(a, r);
19703
46
    }
19704
546
    else {
19705
        /* Calculate GCD. */
19706
546
        err = _sp_gcd(a, b, r);
19707
546
    }
19708
19709
696
    return err;
19710
696
}
19711
19712
#endif /* !NO_RSA && WOLFSSL_KEY_GEN */
19713
19714
#if !defined(NO_RSA) && defined(WOLFSSL_KEY_GEN) && \
19715
    (!defined(WC_RSA_BLINDING) || defined(HAVE_FIPS) || defined(HAVE_SELFTEST))
19716
19717
/* Calculates the Lowest Common Multiple (LCM) of a and b and stores in r.
19718
 * Smallest number divisible by both numbers.
19719
 *
19720
 * a and b are positive integers.
19721
 *
19722
 * lcm(a, b) = (a / gcd(a, b)) * b
19723
 * Divide the common divisor from a and multiply by b.
19724
 *
19725
 * Algorithm:
19726
 *  1. t0 = gcd(a, b)
19727
 *  2. If a > b then
19728
 *   2.1. t1 = a / t0
19729
 *   2.2. r = b * t1
19730
 *  3. Else
19731
 *   3.1. t1 = b / t0
19732
 *   3.2. r = a * t1
19733
 *
19734
 * @param  [in]   a  SP integer of first operand.
19735
 * @param  [in]   b  SP integer of second operand.
19736
 * @param  [out]  r  SP integer to hold result.
19737
 *
19738
 * @return  MP_OKAY on success.
19739
 * @return  MP_MEM when dynamic memory allocation fails.
19740
 */
19741
static int _sp_lcm(const sp_int* a, const sp_int* b, sp_int* r)
19742
{
19743
    int err = MP_OKAY;
19744
    /* Determine maximum digit length numbers will reach. */
19745
    unsigned int used = ((a->used >= b->used) ? a->used + 1: b->used + 1);
19746
    DECL_SP_INT_ARRAY(t, used, 2);
19747
19748
    ALLOC_SP_INT_ARRAY(t, used, 2, err, NULL);
19749
    if (err == MP_OKAY) {
19750
        _sp_init_size(t[0], used);
19751
        _sp_init_size(t[1], used);
19752
19753
        SAVE_VECTOR_REGISTERS(err = _svr_ret;);
19754
19755
        if (err == MP_OKAY) {
19756
            /* 1. t0 = gcd(a, b) */
19757
            err = sp_gcd(a, b, t[0]);
19758
        }
19759
19760
        if (err == MP_OKAY) {
19761
            /* Divide the greater by the common divisor and multiply by other
19762
             * to operate on the smallest length numbers.
19763
             */
19764
            /* 2. If a > b then */
19765
            if (_sp_cmp_abs(a, b) == MP_GT) {
19766
                /* 2.1. t1 = a / t0 */
19767
                err = sp_div(a, t[0], t[1], NULL);
19768
                if (err == MP_OKAY) {
19769
                    /* 2.2. r = b * t1 */
19770
                    err = sp_mul(b, t[1], r);
19771
                }
19772
            }
19773
            /* 3. Else */
19774
            else {
19775
                /* 3.1. t1 = b / t0 */
19776
                err = sp_div(b, t[0], t[1], NULL);
19777
                if (err == MP_OKAY) {
19778
                    /* 3.2. r = a * t1 */
19779
                    err = sp_mul(a, t[1], r);
19780
                }
19781
            }
19782
        }
19783
19784
        RESTORE_VECTOR_REGISTERS();
19785
    }
19786
19787
    FREE_SP_INT_ARRAY(t, NULL);
19788
    return err;
19789
}
19790
19791
/* Calculates the Lowest Common Multiple (LCM) of a and b and stores in r.
19792
 * Smallest number divisible by both numbers.
19793
 *
19794
 * a and b are positive integers.
19795
 *
19796
 * @param  [in]   a  SP integer of first operand.
19797
 * @param  [in]   b  SP integer of second operand.
19798
 * @param  [out]  r  SP integer to hold result.
19799
 *
19800
 * @return  MP_OKAY on success.
19801
 * @return  MP_VAL when a, b or r is NULL; or a or b is zero.
19802
 * @return  MP_MEM when dynamic memory allocation fails.
19803
 */
19804
int sp_lcm(const sp_int* a, const sp_int* b, sp_int* r)
19805
{
19806
    int err = MP_OKAY;
19807
19808
    /* Validate parameters. */
19809
    if ((a == NULL) || (b == NULL) || (r == NULL)) {
19810
        err = MP_VAL;
19811
    }
19812
#ifdef WOLFSSL_SP_INT_NEGATIVE
19813
    /* Ensure a and b are positive. */
19814
    else if ((a->sign == MP_NEG) || (b->sign >= MP_NEG)) {
19815
        err = MP_VAL;
19816
    }
19817
#endif
19818
    /* Ensure r has space for maximumal result. */
19819
    else if (r->size < a->used + b->used) {
19820
        err = MP_VAL;
19821
    }
19822
19823
    /* LCM of 0 and any number is undefined as 0 is not in the set of values
19824
     * being used.
19825
     */
19826
    if ((err == MP_OKAY) && (mp_iszero(a) || mp_iszero(b))) {
19827
        err = MP_VAL;
19828
    }
19829
19830
    if (err == MP_OKAY) {
19831
        /* Do operation. */
19832
        err = _sp_lcm(a, b, r);
19833
    }
19834
19835
    return err;
19836
}
19837
19838
#endif /* !NO_RSA && WOLFSSL_KEY_GEN && (!WC_RSA_BLINDING || HAVE_FIPS ||
19839
        * HAVE_SELFTEST) */
19840
19841
/* Returns the run time settings.
19842
 *
19843
 * @return  Settings value.
19844
 */
19845
word32 CheckRunTimeSettings(void)
19846
0
{
19847
0
    return CTC_SETTINGS;
19848
0
}
19849
19850
/* Returns the fast math settings.
19851
 *
19852
 * @return  Setting - number of bits in a digit.
19853
 */
19854
word32 CheckRunTimeFastMath(void)
19855
0
{
19856
0
    return SP_WORD_SIZE;
19857
0
}
19858
19859
#ifdef WOLFSSL_CHECK_MEM_ZERO
19860
/* Add an MP to check.
19861
 *
19862
 * @param [in] name  Name of address to check.
19863
 * @param [in] sp    sp_int that needs to be checked.
19864
 */
19865
void sp_memzero_add(const char* name, sp_int* sp)
19866
{
19867
    wc_MemZero_Add(name, sp->dp, sp->size * sizeof(sp_int_digit));
19868
}
19869
19870
/* Check the memory in the data pointer for memory that must be zero.
19871
 *
19872
 * @param [in] sp    sp_int that needs to be checked.
19873
 */
19874
void sp_memzero_check(sp_int* sp)
19875
{
19876
    wc_MemZero_Check(sp->dp, sp->size * sizeof(sp_int_digit));
19877
}
19878
#endif /* WOLFSSL_CHECK_MEM_ZERO */
19879
19880
#ifdef WOLFSSL_SP_DYN_STACK
19881
    PRAGMA_GCC_DIAG_POP
19882
#endif
19883
19884
#endif /* WOLFSSL_SP_MATH || WOLFSSL_SP_MATH_ALL */