Coverage Report

Created: 2024-11-21 07:03

/src/libgcrypt/mpi/ec-inline.h
Line
Count
Source
1
/* ec-inline.h - EC inline addition/subtraction helpers
2
 * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
3
 *
4
 * This file is part of Libgcrypt.
5
 *
6
 * Libgcrypt is free software; you can redistribute it and/or modify
7
 * it under the terms of the GNU Lesser General Public License as
8
 * published by the Free Software Foundation; either version 2.1 of
9
 * the License, or (at your option) any later version.
10
 *
11
 * Libgcrypt is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
 * GNU Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
18
 */
19
20
#ifndef GCRY_EC_INLINE_H
21
#define GCRY_EC_INLINE_H
22
23
#include "mpi-internal.h"
24
#include "longlong.h"
25
#include "ec-context.h"
26
#include "../cipher/bithelp.h"
27
#include "../cipher/bufhelp.h"
28
29
30
#if BYTES_PER_MPI_LIMB == 8
31
32
/* 64-bit limb definitions for 64-bit architectures.  */
33
34
9.23M
#define LIMBS_PER_LIMB64 1
35
448k
#define LOAD64(x, pos) ((x)[pos])
36
14.7M
#define STORE64(x, pos, v) ((x)[pos] = (mpi_limb_t)(v))
37
9.68M
#define LIMB_TO64(v) ((mpi_limb_t)(v))
38
#define LIMB_FROM64(v) ((mpi_limb_t)(v))
39
#define HIBIT_LIMB64(v) ((mpi_limb_t)(v) >> (BITS_PER_MPI_LIMB - 1))
40
2.11M
#define HI32_LIMB64(v) (u32)((mpi_limb_t)(v) >> (BITS_PER_MPI_LIMB - 32))
41
20.9M
#define LO32_LIMB64(v) ((u32)(v))
42
575M
#define LIMB64_C(hi, lo) (((mpi_limb_t)(u32)(hi) << 32) | (u32)(lo))
43
#define MASK_AND64(mask, val) ((mask) & (val))
44
29.5M
#define LIMB_OR64(val1, val2) ((val1) | (val2))
45
#define STORE64_COND(x, pos, mask1, val1, mask2, val2) \
46
32.4M
    ((x)[(pos)] = ((mask1) & (val1)) | ((mask2) & (val2)))
47
48
typedef mpi_limb_t mpi_limb64_t;
49
50
static inline u32
51
LOAD32(mpi_ptr_t x, unsigned int pos)
52
135M
{
53
135M
  unsigned int shr = (pos % 2) * 32;
54
135M
  return (x[pos / 2] >> shr);
55
135M
}
56
57
static inline mpi_limb64_t
58
LIMB64_HILO(u32 hi, u32 lo)
59
135M
{
60
135M
  mpi_limb64_t v = hi;
61
135M
  return (v << 32) | lo;
62
135M
}
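
As an editor-added illustration (not part of ec-inline.h), the sketch below shows a typical constant-time use of the helpers defined above: LOAD64/STORE64_COND do the limb access while an all-ones/all-zero mask pair selects between two values without branching. The function name and the fixed width of four 64-bit limbs are assumptions made only for this example.

/* Editor-added sketch, not from ec-inline.h: constant-time selection
 * r = take_b ? b : a over four 64-bit limbs (e.g. a 256-bit value).
 * 'take_b' must be 0 or 1.  */
static void
example_select_256 (mpi_ptr_t r, mpi_ptr_t a, mpi_ptr_t b, mpi_limb_t take_b)
{
  mpi_limb_t mask_b = ~(take_b - 1);   /* all-ones when take_b == 1 */
  mpi_limb_t mask_a = ~mask_b;         /* all-ones when take_b == 0 */
  unsigned int i;

  for (i = 0; i < 4; i++)
    STORE64_COND (r, i, mask_a, LOAD64 (a, i), mask_b, LOAD64 (b, i));
}
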
63
64
65
/* x86-64 addition/subtraction helpers.  */
66
#if defined (__x86_64__) && defined(HAVE_CPU_ARCH_X86) && __GNUC__ >= 4
67
68
#define ADD3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \
69
14.1M
  __asm__ ("addq %8, %2\n" \
70
14.1M
     "adcq %7, %1\n" \
71
14.1M
     "adcq %6, %0\n" \
72
14.1M
     : "=r" (A2), \
73
14.1M
       "=&r" (A1), \
74
14.1M
       "=&r" (A0) \
75
14.1M
     : "0" ((mpi_limb_t)(B2)), \
76
14.1M
       "1" ((mpi_limb_t)(B1)), \
77
14.1M
       "2" ((mpi_limb_t)(B0)), \
78
14.1M
       "rme" ((mpi_limb_t)(C2)), \
79
14.1M
       "rme" ((mpi_limb_t)(C1)), \
80
14.1M
       "rme" ((mpi_limb_t)(C0)) \
81
14.1M
     : "cc")
82
83
#define SUB3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \
84
  __asm__ ("subq %8, %2\n" \
85
     "sbbq %7, %1\n" \
86
     "sbbq %6, %0\n" \
87
     : "=r" (A2), \
88
       "=&r" (A1), \
89
       "=&r" (A0) \
90
     : "0" ((mpi_limb_t)(B2)), \
91
       "1" ((mpi_limb_t)(B1)), \
92
       "2" ((mpi_limb_t)(B0)), \
93
       "rme" ((mpi_limb_t)(C2)), \
94
       "rme" ((mpi_limb_t)(C1)), \
95
       "rme" ((mpi_limb_t)(C0)) \
96
     : "cc")
97
98
#define ADD4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
99
35.1M
  __asm__ ("addq %11, %3\n" \
100
35.1M
     "adcq %10, %2\n" \
101
35.1M
     "adcq %9, %1\n" \
102
35.1M
     "adcq %8, %0\n" \
103
35.1M
     : "=r" (A3), \
104
35.1M
       "=&r" (A2), \
105
35.1M
       "=&r" (A1), \
106
35.1M
       "=&r" (A0) \
107
35.1M
     : "0" ((mpi_limb_t)(B3)), \
108
35.1M
       "1" ((mpi_limb_t)(B2)), \
109
35.1M
       "2" ((mpi_limb_t)(B1)), \
110
35.1M
       "3" ((mpi_limb_t)(B0)), \
111
35.1M
       "rme" ((mpi_limb_t)(C3)), \
112
35.1M
       "rme" ((mpi_limb_t)(C2)), \
113
35.1M
       "rme" ((mpi_limb_t)(C1)), \
114
35.1M
       "rme" ((mpi_limb_t)(C0)) \
115
35.1M
     : "cc")
116
117
#define SUB4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
118
11.5M
  __asm__ ("subq %11, %3\n" \
119
11.5M
     "sbbq %10, %2\n" \
120
11.5M
     "sbbq %9, %1\n" \
121
11.5M
     "sbbq %8, %0\n" \
122
11.5M
     : "=r" (A3), \
123
11.5M
       "=&r" (A2), \
124
11.5M
       "=&r" (A1), \
125
11.5M
       "=&r" (A0) \
126
11.5M
     : "0" ((mpi_limb_t)(B3)), \
127
11.5M
       "1" ((mpi_limb_t)(B2)), \
128
11.5M
       "2" ((mpi_limb_t)(B1)), \
129
11.5M
       "3" ((mpi_limb_t)(B0)), \
130
11.5M
       "rme" ((mpi_limb_t)(C3)), \
131
11.5M
       "rme" ((mpi_limb_t)(C2)), \
132
11.5M
       "rme" ((mpi_limb_t)(C1)), \
133
11.5M
       "rme" ((mpi_limb_t)(C0)) \
134
11.5M
     : "cc")
135
136
#define ADD5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
137
                    C4, C3, C2, C1, C0) \
138
52.7M
  __asm__ ("addq %14, %4\n" \
139
52.7M
     "adcq %13, %3\n" \
140
52.7M
     "adcq %12, %2\n" \
141
52.7M
     "adcq %11, %1\n" \
142
52.7M
     "adcq %10, %0\n" \
143
52.7M
     : "=r" (A4), \
144
52.7M
       "=&r" (A3), \
145
52.7M
       "=&r" (A2), \
146
52.7M
       "=&r" (A1), \
147
52.7M
       "=&r" (A0) \
148
52.7M
     : "0" ((mpi_limb_t)(B4)), \
149
52.7M
       "1" ((mpi_limb_t)(B3)), \
150
52.7M
       "2" ((mpi_limb_t)(B2)), \
151
52.7M
       "3" ((mpi_limb_t)(B1)), \
152
52.7M
       "4" ((mpi_limb_t)(B0)), \
153
52.7M
       "rme" ((mpi_limb_t)(C4)), \
154
52.7M
       "rme" ((mpi_limb_t)(C3)), \
155
52.7M
       "rme" ((mpi_limb_t)(C2)), \
156
52.7M
       "rme" ((mpi_limb_t)(C1)), \
157
52.7M
       "rme" ((mpi_limb_t)(C0)) \
158
52.7M
     : "cc")
159
160
#define SUB5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
161
                    C4, C3, C2, C1, C0) \
162
20.0M
  __asm__ ("subq %14, %4\n" \
163
20.0M
     "sbbq %13, %3\n" \
164
20.0M
     "sbbq %12, %2\n" \
165
20.0M
     "sbbq %11, %1\n" \
166
20.0M
     "sbbq %10, %0\n" \
167
20.0M
     : "=r" (A4), \
168
20.0M
       "=&r" (A3), \
169
20.0M
       "=&r" (A2), \
170
20.0M
       "=&r" (A1), \
171
20.0M
       "=&r" (A0) \
172
20.0M
     : "0" ((mpi_limb_t)(B4)), \
173
20.0M
       "1" ((mpi_limb_t)(B3)), \
174
20.0M
       "2" ((mpi_limb_t)(B2)), \
175
20.0M
       "3" ((mpi_limb_t)(B1)), \
176
20.0M
       "4" ((mpi_limb_t)(B0)), \
177
20.0M
       "rme" ((mpi_limb_t)(C4)), \
178
20.0M
       "rme" ((mpi_limb_t)(C3)), \
179
20.0M
       "rme" ((mpi_limb_t)(C2)), \
180
20.0M
       "rme" ((mpi_limb_t)(C1)), \
181
20.0M
       "rme" ((mpi_limb_t)(C0)) \
182
20.0M
     : "cc")
183
184
#endif /* __x86_64__ */
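
For readers unfamiliar with the inline-assembly idiom above, this editor-added sketch (not part of ec-inline.h) spells out what ADD3_LIMB64 computes on a 64-bit build: a three-limb addition with the carry rippling from the lowest limb upward, the carry out of the top limb being discarded, exactly as the addq/adcq chain does.

/* Editor-added sketch, not from ec-inline.h: portable equivalent of
 * ADD3_LIMB64 with 64-bit limbs, (a2:a1:a0) = (b2:b1:b0) + (c2:c1:c0).  */
static void
example_add3_limb64 (mpi_limb_t a[3], const mpi_limb_t b[3],
                     const mpi_limb_t c[3])
{
  mpi_limb_t t, carry;

  t = b[0] + c[0];
  carry = (t < b[0]);                           /* carry out of limb 0 */
  a[0] = t;
  t = b[1] + c[1] + carry;
  carry = (t < b[1]) || (t == b[1] && carry);   /* carry out of limb 1 */
  a[1] = t;
  a[2] = b[2] + c[2] + carry;                   /* carry out of limb 2 dropped */
}
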
185
186
187
/* ARM AArch64 addition/subtraction helpers.  */
188
#if defined (__aarch64__) && defined(HAVE_CPU_ARCH_ARM) && __GNUC__ >= 4
189
190
#define ADD3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \
191
  __asm__ ("adds %2, %5, %8\n" \
192
     "adcs %1, %4, %7\n" \
193
     "adc  %0, %3, %6\n" \
194
     : "=r" (A2), \
195
       "=&r" (A1), \
196
       "=&r" (A0) \
197
     : "r" ((mpi_limb_t)(B2)), \
198
       "r" ((mpi_limb_t)(B1)), \
199
       "r" ((mpi_limb_t)(B0)), \
200
       "r" ((mpi_limb_t)(C2)), \
201
       "r" ((mpi_limb_t)(C1)), \
202
       "r" ((mpi_limb_t)(C0)) \
203
     : "cc")
204
205
#define SUB3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \
206
  __asm__ ("subs %2, %5, %8\n" \
207
     "sbcs %1, %4, %7\n" \
208
     "sbc  %0, %3, %6\n" \
209
     : "=r" (A2), \
210
       "=&r" (A1), \
211
       "=&r" (A0) \
212
     : "r" ((mpi_limb_t)(B2)), \
213
       "r" ((mpi_limb_t)(B1)), \
214
       "r" ((mpi_limb_t)(B0)), \
215
       "r" ((mpi_limb_t)(C2)), \
216
       "r" ((mpi_limb_t)(C1)), \
217
       "r" ((mpi_limb_t)(C0)) \
218
     : "cc")
219
220
#define ADD4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
221
  __asm__ ("adds %3, %7, %11\n" \
222
     "adcs %2, %6, %10\n" \
223
     "adcs %1, %5, %9\n" \
224
     "adc  %0, %4, %8\n" \
225
     : "=r" (A3), \
226
       "=&r" (A2), \
227
       "=&r" (A1), \
228
       "=&r" (A0) \
229
     : "r" ((mpi_limb_t)(B3)), \
230
       "r" ((mpi_limb_t)(B2)), \
231
       "r" ((mpi_limb_t)(B1)), \
232
       "r" ((mpi_limb_t)(B0)), \
233
       "r" ((mpi_limb_t)(C3)), \
234
       "r" ((mpi_limb_t)(C2)), \
235
       "r" ((mpi_limb_t)(C1)), \
236
       "r" ((mpi_limb_t)(C0)) \
237
     : "cc")
238
239
#define SUB4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
240
  __asm__ ("subs %3, %7, %11\n" \
241
     "sbcs %2, %6, %10\n" \
242
     "sbcs %1, %5, %9\n" \
243
     "sbc  %0, %4, %8\n" \
244
     : "=r" (A3), \
245
       "=&r" (A2), \
246
       "=&r" (A1), \
247
       "=&r" (A0) \
248
     : "r" ((mpi_limb_t)(B3)), \
249
       "r" ((mpi_limb_t)(B2)), \
250
       "r" ((mpi_limb_t)(B1)), \
251
       "r" ((mpi_limb_t)(B0)), \
252
       "r" ((mpi_limb_t)(C3)), \
253
       "r" ((mpi_limb_t)(C2)), \
254
       "r" ((mpi_limb_t)(C1)), \
255
       "r" ((mpi_limb_t)(C0)) \
256
     : "cc")
257
258
#define ADD5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
259
                    C4, C3, C2, C1, C0) \
260
  __asm__ ("adds %4, %9, %14\n" \
261
     "adcs %3, %8, %13\n" \
262
     "adcs %2, %7, %12\n" \
263
     "adcs %1, %6, %11\n" \
264
     "adc  %0, %5, %10\n" \
265
     : "=r" (A4), \
266
       "=&r" (A3), \
267
       "=&r" (A2), \
268
       "=&r" (A1), \
269
       "=&r" (A0) \
270
     : "r" ((mpi_limb_t)(B4)), \
271
       "r" ((mpi_limb_t)(B3)), \
272
       "r" ((mpi_limb_t)(B2)), \
273
       "r" ((mpi_limb_t)(B1)), \
274
       "r" ((mpi_limb_t)(B0)), \
275
       "r" ((mpi_limb_t)(C4)), \
276
       "r" ((mpi_limb_t)(C3)), \
277
       "r" ((mpi_limb_t)(C2)), \
278
       "r" ((mpi_limb_t)(C1)), \
279
       "r" ((mpi_limb_t)(C0)) \
280
     : "cc")
281
282
#define SUB5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
283
                    C4, C3, C2, C1, C0) \
284
  __asm__ ("subs %4, %9, %14\n" \
285
     "sbcs %3, %8, %13\n" \
286
     "sbcs %2, %7, %12\n" \
287
     "sbcs %1, %6, %11\n" \
288
     "sbc  %0, %5, %10\n" \
289
     : "=r" (A4), \
290
       "=&r" (A3), \
291
       "=&r" (A2), \
292
       "=&r" (A1), \
293
       "=&r" (A0) \
294
     : "r" ((mpi_limb_t)(B4)), \
295
       "r" ((mpi_limb_t)(B3)), \
296
       "r" ((mpi_limb_t)(B2)), \
297
       "r" ((mpi_limb_t)(B1)), \
298
       "r" ((mpi_limb_t)(B0)), \
299
       "r" ((mpi_limb_t)(C4)), \
300
       "r" ((mpi_limb_t)(C3)), \
301
       "r" ((mpi_limb_t)(C2)), \
302
       "r" ((mpi_limb_t)(C1)), \
303
       "r" ((mpi_limb_t)(C0)) \
304
     : "cc")
305
306
#endif /* __aarch64__ */
307
308
309
/* PowerPC64 addition/subtraction helpers.  */
310
#if defined (__powerpc__) && defined(HAVE_CPU_ARCH_PPC) && __GNUC__ >= 4
311
312
#define ADD3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \
313
  __asm__ ("addc %2, %8, %5\n" \
314
     "adde %1, %7, %4\n" \
315
     "adde %0, %6, %3\n" \
316
     : "=r" (A2), \
317
       "=&r" (A1), \
318
       "=&r" (A0) \
319
     : "r" ((mpi_limb_t)(B2)), \
320
       "r" ((mpi_limb_t)(B1)), \
321
       "r" ((mpi_limb_t)(B0)), \
322
       "r" ((mpi_limb_t)(C2)), \
323
       "r" ((mpi_limb_t)(C1)), \
324
       "r" ((mpi_limb_t)(C0)) \
325
     : "cc", "r0")
326
327
#define SUB3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \
328
  __asm__ ("subfc %2, %8, %5\n" \
329
     "subfe %1, %7, %4\n" \
330
     "subfe %0, %6, %3\n" \
331
     : "=r" (A2), \
332
       "=&r" (A1), \
333
       "=&r" (A0) \
334
     : "r" ((mpi_limb_t)(B2)), \
335
       "r" ((mpi_limb_t)(B1)), \
336
       "r" ((mpi_limb_t)(B0)), \
337
       "r" ((mpi_limb_t)(C2)), \
338
       "r" ((mpi_limb_t)(C1)), \
339
       "r" ((mpi_limb_t)(C0)) \
340
     : "cc", "r0")
341
342
#define ADD4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
343
  __asm__ ("addc %3, %11, %7\n" \
344
     "adde %2, %10, %6\n" \
345
     "adde %1, %9, %5\n" \
346
     "adde %0, %8, %4\n" \
347
     : "=r" (A3), \
348
       "=&r" (A2), \
349
       "=&r" (A1), \
350
       "=&r" (A0) \
351
     : "r" ((mpi_limb_t)(B3)), \
352
       "r" ((mpi_limb_t)(B2)), \
353
       "r" ((mpi_limb_t)(B1)), \
354
       "r" ((mpi_limb_t)(B0)), \
355
       "r" ((mpi_limb_t)(C3)), \
356
       "r" ((mpi_limb_t)(C2)), \
357
       "r" ((mpi_limb_t)(C1)), \
358
       "r" ((mpi_limb_t)(C0)) \
359
     : "cc")
360
361
#define SUB4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
362
  __asm__ ("subfc %3, %11, %7\n" \
363
     "subfe %2, %10, %6\n" \
364
     "subfe %1, %9, %5\n" \
365
     "subfe %0, %8, %4\n" \
366
     : "=r" (A3), \
367
       "=&r" (A2), \
368
       "=&r" (A1), \
369
       "=&r" (A0) \
370
     : "r" ((mpi_limb_t)(B3)), \
371
       "r" ((mpi_limb_t)(B2)), \
372
       "r" ((mpi_limb_t)(B1)), \
373
       "r" ((mpi_limb_t)(B0)), \
374
       "r" ((mpi_limb_t)(C3)), \
375
       "r" ((mpi_limb_t)(C2)), \
376
       "r" ((mpi_limb_t)(C1)), \
377
       "r" ((mpi_limb_t)(C0)) \
378
     : "cc")
379
380
#define ADD5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
381
                      C4, C3, C2, C1, C0) \
382
  __asm__ ("addc %4, %14, %9\n" \
383
     "adde %3, %13, %8\n" \
384
     "adde %2, %12, %7\n" \
385
     "adde %1, %11, %6\n" \
386
     "adde %0, %10, %5\n" \
387
     : "=r" (A4), \
388
       "=&r" (A3), \
389
       "=&r" (A2), \
390
       "=&r" (A1), \
391
       "=&r" (A0) \
392
     : "r" ((mpi_limb_t)(B4)), \
393
       "r" ((mpi_limb_t)(B3)), \
394
       "r" ((mpi_limb_t)(B2)), \
395
       "r" ((mpi_limb_t)(B1)), \
396
       "r" ((mpi_limb_t)(B0)), \
397
       "r" ((mpi_limb_t)(C4)), \
398
       "r" ((mpi_limb_t)(C3)), \
399
       "r" ((mpi_limb_t)(C2)), \
400
       "r" ((mpi_limb_t)(C1)), \
401
       "r" ((mpi_limb_t)(C0)) \
402
     : "cc")
403
404
#define SUB5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
405
                      C4, C3, C2, C1, C0) \
406
  __asm__ ("subfc %4, %14, %9\n" \
407
     "subfe %3, %13, %8\n" \
408
     "subfe %2, %12, %7\n" \
409
     "subfe %1, %11, %6\n" \
410
     "subfe %0, %10, %5\n" \
411
     : "=r" (A4), \
412
       "=&r" (A3), \
413
       "=&r" (A2), \
414
       "=&r" (A1), \
415
       "=&r" (A0) \
416
     : "r" ((mpi_limb_t)(B4)), \
417
       "r" ((mpi_limb_t)(B3)), \
418
       "r" ((mpi_limb_t)(B2)), \
419
       "r" ((mpi_limb_t)(B1)), \
420
       "r" ((mpi_limb_t)(B0)), \
421
       "r" ((mpi_limb_t)(C4)), \
422
       "r" ((mpi_limb_t)(C3)), \
423
       "r" ((mpi_limb_t)(C2)), \
424
       "r" ((mpi_limb_t)(C1)), \
425
       "r" ((mpi_limb_t)(C0)) \
426
     : "cc")
427
428
#endif /* __powerpc__ */
429
430
431
/* s390x/zSeries addition/subtraction helpers.  */
432
#if defined (__s390x__) && defined(HAVE_CPU_ARCH_S390X) && __GNUC__ >= 4
433
434
#define ADD3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \
435
  __asm__ ("algr %2, %8\n" \
436
     "alcgr %1, %7\n" \
437
     "alcgr %0, %6\n" \
438
     : "=r" (A2), \
439
       "=&r" (A1), \
440
       "=&r" (A0) \
441
     : "0" ((mpi_limb_t)(B2)), \
442
       "1" ((mpi_limb_t)(B1)), \
443
       "2" ((mpi_limb_t)(B0)), \
444
       "r" ((mpi_limb_t)(C2)), \
445
       "r" ((mpi_limb_t)(C1)), \
446
       "r" ((mpi_limb_t)(C0)) \
447
     : "cc")
448
449
#define SUB3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \
450
  __asm__ ("slgr %2, %8\n" \
451
     "slbgr %1, %7\n" \
452
     "slbgr %0, %6\n" \
453
     : "=r" (A2), \
454
       "=&r" (A1), \
455
       "=&r" (A0) \
456
     : "0" ((mpi_limb_t)(B2)), \
457
       "1" ((mpi_limb_t)(B1)), \
458
       "2" ((mpi_limb_t)(B0)), \
459
       "r" ((mpi_limb_t)(C2)), \
460
       "r" ((mpi_limb_t)(C1)), \
461
       "r" ((mpi_limb_t)(C0)) \
462
     : "cc")
463
464
#define ADD4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
465
  __asm__ ("algr %3, %11\n" \
466
     "alcgr %2, %10\n" \
467
     "alcgr %1, %9\n" \
468
     "alcgr %0, %8\n" \
469
     : "=r" (A3), \
470
       "=&r" (A2), \
471
       "=&r" (A1), \
472
       "=&r" (A0) \
473
     : "0" ((mpi_limb_t)(B3)), \
474
       "1" ((mpi_limb_t)(B2)), \
475
       "2" ((mpi_limb_t)(B1)), \
476
       "3" ((mpi_limb_t)(B0)), \
477
       "r" ((mpi_limb_t)(C3)), \
478
       "r" ((mpi_limb_t)(C2)), \
479
       "r" ((mpi_limb_t)(C1)), \
480
       "r" ((mpi_limb_t)(C0)) \
481
     : "cc")
482
483
#define SUB4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
484
  __asm__ ("slgr %3, %11\n" \
485
     "slbgr %2, %10\n" \
486
     "slbgr %1, %9\n" \
487
     "slbgr %0, %8\n" \
488
     : "=r" (A3), \
489
       "=&r" (A2), \
490
       "=&r" (A1), \
491
       "=&r" (A0) \
492
     : "0" ((mpi_limb_t)(B3)), \
493
       "1" ((mpi_limb_t)(B2)), \
494
       "2" ((mpi_limb_t)(B1)), \
495
       "3" ((mpi_limb_t)(B0)), \
496
       "r" ((mpi_limb_t)(C3)), \
497
       "r" ((mpi_limb_t)(C2)), \
498
       "r" ((mpi_limb_t)(C1)), \
499
       "r" ((mpi_limb_t)(C0)) \
500
     : "cc")
501
502
#define ADD5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
503
                    C4, C3, C2, C1, C0) \
504
  __asm__ ("algr %4, %14\n" \
505
     "alcgr %3, %13\n" \
506
     "alcgr %2, %12\n" \
507
     "alcgr %1, %11\n" \
508
     "alcgr %0, %10\n" \
509
     : "=r" (A4), \
510
       "=&r" (A3), \
511
       "=&r" (A2), \
512
       "=&r" (A1), \
513
       "=&r" (A0) \
514
     : "0" ((mpi_limb_t)(B4)), \
515
       "1" ((mpi_limb_t)(B3)), \
516
       "2" ((mpi_limb_t)(B2)), \
517
       "3" ((mpi_limb_t)(B1)), \
518
       "4" ((mpi_limb_t)(B0)), \
519
       "r" ((mpi_limb_t)(C4)), \
520
       "r" ((mpi_limb_t)(C3)), \
521
       "r" ((mpi_limb_t)(C2)), \
522
       "r" ((mpi_limb_t)(C1)), \
523
       "r" ((mpi_limb_t)(C0)) \
524
     : "cc")
525
526
#define SUB5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
527
                    C4, C3, C2, C1, C0) \
528
  __asm__ ("slgr %4, %14\n" \
529
     "slbgr %3, %13\n" \
530
     "slbgr %2, %12\n" \
531
     "slbgr %1, %11\n" \
532
     "slbgr %0, %10\n" \
533
     : "=r" (A4), \
534
       "=&r" (A3), \
535
       "=&r" (A2), \
536
       "=&r" (A1), \
537
       "=&r" (A0) \
538
     : "0" ((mpi_limb_t)(B4)), \
539
       "1" ((mpi_limb_t)(B3)), \
540
       "2" ((mpi_limb_t)(B2)), \
541
       "3" ((mpi_limb_t)(B1)), \
542
       "4" ((mpi_limb_t)(B0)), \
543
       "r" ((mpi_limb_t)(C4)), \
544
       "r" ((mpi_limb_t)(C3)), \
545
       "r" ((mpi_limb_t)(C2)), \
546
       "r" ((mpi_limb_t)(C1)), \
547
       "r" ((mpi_limb_t)(C0)) \
548
     : "cc")
549
550
#endif /* __s390x__ */
551
552
553
/* Common 64-bit arch addition/subtraction macros.  */
554
555
#define ADD2_LIMB64(A1, A0, B1, B0, C1, C0) \
556
12.6M
  add_ssaaaa(A1, A0, B1, B0, C1, C0)
557
558
#define SUB2_LIMB64(A1, A0, B1, B0, C1, C0) \
559
  sub_ddmmss(A1, A0, B1, B0, C1, C0)
560
561
#endif /* BYTES_PER_MPI_LIMB == 8 */
562
563
564
#if BYTES_PER_MPI_LIMB == 4
565
566
/* 64-bit limb definitions for 32-bit architectures.  */
567
568
#define LIMBS_PER_LIMB64 2
569
#define LIMB_FROM64(v) ((v).lo)
570
#define HIBIT_LIMB64(v) ((v).hi >> (BITS_PER_MPI_LIMB - 1))
571
#define HI32_LIMB64(v) ((v).hi)
572
#define LO32_LIMB64(v) ((v).lo)
573
#define LOAD32(x, pos) ((x)[pos])
574
#define LIMB64_C(hi, lo) { (lo), (hi) }
575
576
typedef struct
577
{
578
  mpi_limb_t lo;
579
  mpi_limb_t hi;
580
} mpi_limb64_t;
581
582
static inline mpi_limb64_t
583
LOAD64(const mpi_ptr_t x, unsigned int pos)
584
{
585
  mpi_limb64_t v;
586
  v.lo = x[pos * 2 + 0];
587
  v.hi = x[pos * 2 + 1];
588
  return v;
589
}
590
591
static inline void
592
STORE64(mpi_ptr_t x, unsigned int pos, mpi_limb64_t v)
593
{
594
  x[pos * 2 + 0] = v.lo;
595
  x[pos * 2 + 1] = v.hi;
596
}
597
598
static inline mpi_limb64_t
599
MASK_AND64(mpi_limb_t mask, mpi_limb64_t val)
600
{
601
  val.lo &= mask;
602
  val.hi &= mask;
603
  return val;
604
}
605
606
static inline mpi_limb64_t
607
LIMB_OR64(mpi_limb64_t val1, mpi_limb64_t val2)
608
{
609
  val1.lo |= val2.lo;
610
  val1.hi |= val2.hi;
611
  return val1;
612
}
613
614
static inline void
615
STORE64_COND(mpi_ptr_t x, unsigned int pos, mpi_limb_t mask1,
616
       mpi_limb64_t val1, mpi_limb_t mask2, mpi_limb64_t val2)
617
{
618
  x[pos * 2 + 0] = (mask1 & val1.lo) | (mask2 & val2.lo);
619
  x[pos * 2 + 1] = (mask1 & val1.hi) | (mask2 & val2.hi);
620
}
621
622
static inline mpi_limb64_t
623
LIMB_TO64(mpi_limb_t x)
624
{
625
  mpi_limb64_t v;
626
  v.lo = x;
627
  v.hi = 0;
628
  return v;
629
}
630
631
static inline mpi_limb64_t
632
LIMB64_HILO(mpi_limb_t hi, mpi_limb_t lo)
633
{
634
  mpi_limb64_t v;
635
  v.lo = lo;
636
  v.hi = hi;
637
  return v;
638
}
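
A brief editor-added note on the 32-bit representation above: mpi_limb64_t is a two-member struct here, so LIMB64_C expands to a brace initializer and is only usable where an initializer is allowed, while LIMB64_HILO builds the same value at run time. A minimal sketch (the names and values are assumptions for the example, not code from this file):

/* Editor-added sketch, not from ec-inline.h (32-bit limb build).  */
static const mpi_limb64_t example_const = LIMB64_C (0xffffffffU, 0x00000001U);

static mpi_limb64_t
example_same_value_at_runtime (void)
{
  return LIMB64_HILO (0xffffffffU, 0x00000001U);
}
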
639
640
641
/* i386 addition/subtraction helpers.  */
642
#if defined (__i386__) && defined(HAVE_CPU_ARCH_X86) && __GNUC__ >= 4
643
644
#define ADD2_LIMB32_CARRY_OUT(carry, a1, a0, b1, b0, c1, c0) \
645
  __asm__ ("addl %7, %2\n" \
646
     "adcl %6, %1\n" \
647
     "sbbl %0, %0\n" \
648
     : "=r" (carry), \
649
       "=&r" (a1), \
650
       "=&r" (a0) \
651
     : "0" ((mpi_limb_t)(0)), \
652
       "1" ((mpi_limb_t)(b1)), \
653
       "2" ((mpi_limb_t)(b0)), \
654
       "re" ((mpi_limb_t)(c1)), \
655
       "re" ((mpi_limb_t)(c0)) \
656
     : "cc")
657
658
#define ADD2_LIMB32_CARRY_IN_OUT(a1, a0, b1, b0, c1, c0, carry) \
659
  __asm__ ("addl $1, %0\n" \
660
     "adcl %7, %2\n" \
661
     "adcl %6, %1\n" \
662
     "sbbl %0, %0\n" \
663
     : "=r" (carry), \
664
       "=&r" (a1), \
665
       "=&r" (a0) \
666
     : "0" ((mpi_limb_t)(carry)), \
667
       "1" ((mpi_limb_t)(b1)), \
668
       "2" ((mpi_limb_t)(b0)), \
669
       "re" ((mpi_limb_t)(c1)), \
670
       "re" ((mpi_limb_t)(c0)) \
671
     : "cc")
672
673
#define ADD2_LIMB32_CARRY_IN(a1, a0, b1, b0, c1, c0, carry) \
674
    __asm__ ("addl $1, %2\n" \
675
       "adcl %7, %1\n" \
676
       "adcl %6, %0\n" \
677
       : "=r" (a1), \
678
         "=&r" (a0), \
679
         "=&g" (carry) \
680
       : "0" ((mpi_limb_t)(b1)), \
681
         "1" ((mpi_limb_t)(b0)), \
682
         "2" ((mpi_limb_t)(carry)), \
683
         "re" ((mpi_limb_t)(c1)), \
684
         "re" ((mpi_limb_t)(c0)) \
685
     : "cc")
686
687
#define ADD4_LIMB32(a3, a2, a1, a0, b3, b2, b1, b0, c3, c2, c1, c0) do { \
688
    mpi_limb_t __carry4_32; \
689
    ADD2_LIMB32_CARRY_OUT(__carry4_32, a1, a0, b1, b0, c1, c0); \
690
    ADD2_LIMB32_CARRY_IN(a3, a2, b3, b2, c3, c2, __carry4_32); \
691
  } while (0)
692
693
#define ADD6_LIMB32(a5, a4, a3, a2, a1, a0, b5, b4, b3, b2, b1, b0, \
694
        c5, c4, c3, c2, c1, c0) do { \
695
    mpi_limb_t __carry6_32; \
696
    ADD2_LIMB32_CARRY_OUT(__carry6_32, a1, a0, b1, b0, c1, c0); \
697
    ADD2_LIMB32_CARRY_IN_OUT(a3, a2, b3, b2, c3, c2, __carry6_32); \
698
    ADD2_LIMB32_CARRY_IN(a5, a4, b5, b4, c5, c4, __carry6_32); \
699
  } while (0)
700
701
#define ADD8_LIMB32(a7, a6, a5, a4, a3, a2, a1, a0, \
702
        b7, b6, b5, b4, b3, b2, b1, b0, \
703
        c7, c6, c5, c4, c3, c2, c1, c0) do { \
704
    mpi_limb_t __carry8_32; \
705
    ADD2_LIMB32_CARRY_OUT(__carry8_32, a1, a0, b1, b0, c1, c0); \
706
    ADD2_LIMB32_CARRY_IN_OUT(a3, a2, b3, b2, c3, c2, __carry8_32); \
707
    ADD2_LIMB32_CARRY_IN_OUT(a5, a4, b5, b4, c5, c4, __carry8_32); \
708
    ADD2_LIMB32_CARRY_IN(a7, a6, b7, b6, c7, c6, __carry8_32); \
709
  } while (0)
710
711
#define ADD10_LIMB32(a9, a8, a7, a6, a5, a4, a3, a2, a1, a0, \
712
         b9, b8, b7, b6, b5, b4, b3, b2, b1, b0, \
713
         c9, c8, c7, c6, c5, c4, c3, c2, c1, c0) do { \
714
    mpi_limb_t __carry10_32; \
715
    ADD2_LIMB32_CARRY_OUT(__carry10_32, a1, a0, b1, b0, c1, c0); \
716
    ADD2_LIMB32_CARRY_IN_OUT(a3, a2, b3, b2, c3, c2, __carry10_32); \
717
    ADD2_LIMB32_CARRY_IN_OUT(a5, a4, b5, b4, c5, c4, __carry10_32); \
718
    ADD2_LIMB32_CARRY_IN_OUT(a7, a6, b7, b6, c7, c6, __carry10_32); \
719
    ADD2_LIMB32_CARRY_IN(a9, a8, b9, b8, c9, c8, __carry10_32); \
720
  } while (0)
721
722
#define ADD14_LIMB32(a13, a12, a11, a10, a9, a8, a7, \
723
         a6, a5, a4, a3, a2, a1, a0, \
724
         b13, b12, b11, b10, b9, b8, b7, \
725
         b6, b5, b4, b3, b2, b1, b0, \
726
         c13, c12, c11, c10, c9, c8, c7, \
727
         c6, c5, c4, c3, c2, c1, c0) do { \
728
    mpi_limb_t __carry14_32; \
729
    ADD2_LIMB32_CARRY_OUT(__carry14_32, a1, a0, b1, b0, c1, c0); \
730
    ADD2_LIMB32_CARRY_IN_OUT(a3, a2, b3, b2, c3, c2, __carry14_32); \
731
    ADD2_LIMB32_CARRY_IN_OUT(a5, a4, b5, b4, c5, c4, __carry14_32); \
732
    ADD2_LIMB32_CARRY_IN_OUT(a7, a6, b7, b6, c7, c6, __carry14_32); \
733
    ADD2_LIMB32_CARRY_IN_OUT(a9, a8, b9, b8, c9, c8, __carry14_32); \
734
    ADD2_LIMB32_CARRY_IN_OUT(a11, a10, b11, b10, c11, c10, __carry14_32); \
735
    ADD2_LIMB32_CARRY_IN(a13, a12, b13, b12, c13, c12, __carry14_32); \
736
  } while (0)
737
738
#define SUB2_LIMB32_CARRY_OUT(carry, a1, a0, b1, b0, c1, c0) \
739
  __asm__ ("subl %7, %2\n" \
740
     "sbbl %6, %1\n" \
741
     "sbbl %0, %0\n" \
742
     : "=r" (carry), \
743
       "=&r" (a1), \
744
       "=&r" (a0) \
745
     : "0" ((mpi_limb_t)(0)), \
746
       "1" ((mpi_limb_t)(b1)), \
747
       "2" ((mpi_limb_t)(b0)), \
748
       "re" ((mpi_limb_t)(c1)), \
749
       "re" ((mpi_limb_t)(c0)) \
750
     : "cc")
751
752
#define SUB2_LIMB32_CARRY_IN_OUT(a1, a0, b1, b0, c1, c0, carry) \
753
  __asm__ ("addl $1, %0\n" \
754
     "sbbl %7, %2\n" \
755
     "sbbl %6, %1\n" \
756
     "sbbl %0, %0\n" \
757
     : "=r" (carry), \
758
       "=&r" (a1), \
759
       "=&r" (a0) \
760
     : "0" ((mpi_limb_t)(carry)), \
761
       "1" ((mpi_limb_t)(b1)), \
762
       "2" ((mpi_limb_t)(b0)), \
763
       "re" ((mpi_limb_t)(c1)), \
764
       "re" ((mpi_limb_t)(c0)) \
765
     : "cc")
766
767
#define SUB2_LIMB32_CARRY_IN(a1, a0, b1, b0, c1, c0, carry) \
768
    __asm__ ("addl $1, %2\n" \
769
       "sbbl %7, %1\n" \
770
       "sbbl %6, %0\n" \
771
       : "=r" (a1), \
772
         "=&r" (a0), \
773
         "=&g" (carry) \
774
       : "0" ((mpi_limb_t)(b1)), \
775
         "1" ((mpi_limb_t)(b0)), \
776
         "2" ((mpi_limb_t)(carry)), \
777
         "re" ((mpi_limb_t)(c1)), \
778
         "re" ((mpi_limb_t)(c0)) \
779
     : "cc")
780
781
#define SUB4_LIMB32(a3, a2, a1, a0, b3, b2, b1, b0, c3, c2, c1, c0) do { \
782
    mpi_limb_t __carry4_32; \
783
    SUB2_LIMB32_CARRY_OUT(__carry4_32, a1, a0, b1, b0, c1, c0); \
784
    SUB2_LIMB32_CARRY_IN(a3, a2, b3, b2, c3, c2, __carry4_32); \
785
  } while (0)
786
787
#define SUB6_LIMB32(a5, a4, a3, a2, a1, a0, b5, b4, b3, b2, b1, b0, \
788
        c5, c4, c3, c2, c1, c0) do { \
789
    mpi_limb_t __carry6_32; \
790
    SUB2_LIMB32_CARRY_OUT(__carry6_32, a1, a0, b1, b0, c1, c0); \
791
    SUB2_LIMB32_CARRY_IN_OUT(a3, a2, b3, b2, c3, c2, __carry6_32); \
792
    SUB2_LIMB32_CARRY_IN(a5, a4, b5, b4, c5, c4, __carry6_32); \
793
  } while (0)
794
795
#define SUB8_LIMB32(a7, a6, a5, a4, a3, a2, a1, a0, \
796
        b7, b6, b5, b4, b3, b2, b1, b0, \
797
        c7, c6, c5, c4, c3, c2, c1, c0) do { \
798
    mpi_limb_t __carry8_32; \
799
    SUB2_LIMB32_CARRY_OUT(__carry8_32, a1, a0, b1, b0, c1, c0); \
800
    SUB2_LIMB32_CARRY_IN_OUT(a3, a2, b3, b2, c3, c2, __carry8_32); \
801
    SUB2_LIMB32_CARRY_IN_OUT(a5, a4, b5, b4, c5, c4, __carry8_32); \
802
    SUB2_LIMB32_CARRY_IN(a7, a6, b7, b6, c7, c6, __carry8_32); \
803
  } while (0)
804
805
#define SUB10_LIMB32(a9, a8, a7, a6, a5, a4, a3, a2, a1, a0, \
806
         b9, b8, b7, b6, b5, b4, b3, b2, b1, b0, \
807
         c9, c8, c7, c6, c5, c4, c3, c2, c1, c0) do { \
808
    mpi_limb_t __carry10_32; \
809
    SUB2_LIMB32_CARRY_OUT(__carry10_32, a1, a0, b1, b0, c1, c0); \
810
    SUB2_LIMB32_CARRY_IN_OUT(a3, a2, b3, b2, c3, c2, __carry10_32); \
811
    SUB2_LIMB32_CARRY_IN_OUT(a5, a4, b5, b4, c5, c4, __carry10_32); \
812
    SUB2_LIMB32_CARRY_IN_OUT(a7, a6, b7, b6, c7, c6, __carry10_32); \
813
    SUB2_LIMB32_CARRY_IN(a9, a8, b9, b8, c9, c8, __carry10_32); \
814
  } while (0)
815
816
#define SUB14_LIMB32(a13, a12, a11, a10, a9, a8, a7, \
817
         a6, a5, a4, a3, a2, a1, a0, \
818
         b13, b12, b11, b10, b9, b8, b7, \
819
         b6, b5, b4, b3, b2, b1, b0, \
820
         c13, c12, c11, c10, c9, c8, c7, \
821
         c6, c5, c4, c3, c2, c1, c0) do { \
822
    mpi_limb_t __carry14_32; \
823
    SUB2_LIMB32_CARRY_OUT(__carry14_32, a1, a0, b1, b0, c1, c0); \
824
    SUB2_LIMB32_CARRY_IN_OUT(a3, a2, b3, b2, c3, c2, __carry14_32); \
825
    SUB2_LIMB32_CARRY_IN_OUT(a5, a4, b5, b4, c5, c4, __carry14_32); \
826
    SUB2_LIMB32_CARRY_IN_OUT(a7, a6, b7, b6, c7, c6, __carry14_32); \
827
    SUB2_LIMB32_CARRY_IN_OUT(a9, a8, b9, b8, c9, c8, __carry14_32); \
828
    SUB2_LIMB32_CARRY_IN_OUT(a11, a10, b11, b10, c11, c10, __carry14_32); \
829
    SUB2_LIMB32_CARRY_IN(a13, a12, b13, b12, c13, c12, __carry14_32); \
830
  } while (0)
831
832
#endif /* __i386__ */
833
834
835
/* ARM addition/subtraction helpers.  */
836
#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
837
838
#define ADD4_LIMB32(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
839
  __asm__ ("adds %3, %3, %11\n" \
840
     "adcs %2, %2, %10\n" \
841
     "adcs %1, %1, %9\n" \
842
     "adc  %0, %0, %8\n" \
843
     : "=r" (A3), \
844
       "=&r" (A2), \
845
       "=&r" (A1), \
846
       "=&r" (A0) \
847
     : "0" ((mpi_limb_t)(B3)), \
848
       "1" ((mpi_limb_t)(B2)), \
849
       "2" ((mpi_limb_t)(B1)), \
850
       "3" ((mpi_limb_t)(B0)), \
851
       "Ir" ((mpi_limb_t)(C3)), \
852
       "Ir" ((mpi_limb_t)(C2)), \
853
       "Ir" ((mpi_limb_t)(C1)), \
854
       "Ir" ((mpi_limb_t)(C0)) \
855
     : "cc")
856
857
#define ADD6_LIMB32(A5, A4, A3, A2, A1, A0, B5, B4, B3, B2, B1, B0, \
858
        C5, C4, C3, C2, C1, C0) do { \
859
    mpi_limb_t __carry6_32; \
860
    __asm__ ("adds %3, %3, %10\n" \
861
       "adcs %2, %2, %9\n" \
862
       "adcs %1, %1, %8\n" \
863
       "adc  %0, %0, %0\n" \
864
       : "=r" (__carry6_32), \
865
         "=&r" (A2), \
866
         "=&r" (A1), \
867
         "=&r" (A0) \
868
       : "0" ((mpi_limb_t)(0)), \
869
         "1" ((mpi_limb_t)(B2)), \
870
         "2" ((mpi_limb_t)(B1)), \
871
         "3" ((mpi_limb_t)(B0)), \
872
         "Ir" ((mpi_limb_t)(C2)), \
873
         "Ir" ((mpi_limb_t)(C1)), \
874
         "Ir" ((mpi_limb_t)(C0)) \
875
       : "cc"); \
876
    ADD4_LIMB32(A5, A4, A3, __carry6_32, B5, B4, B3, __carry6_32, \
877
    C5, C4, C3, 0xffffffffU); \
878
  } while (0)
879
880
#define SUB4_LIMB32(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
881
  __asm__ ("subs %3, %3, %11\n" \
882
     "sbcs %2, %2, %10\n" \
883
     "sbcs %1, %1, %9\n" \
884
     "sbc  %0, %0, %8\n" \
885
     : "=r" (A3), \
886
       "=&r" (A2), \
887
       "=&r" (A1), \
888
       "=&r" (A0) \
889
     : "0" ((mpi_limb_t)(B3)), \
890
       "1" ((mpi_limb_t)(B2)), \
891
       "2" ((mpi_limb_t)(B1)), \
892
       "3" ((mpi_limb_t)(B0)), \
893
       "Ir" ((mpi_limb_t)(C3)), \
894
       "Ir" ((mpi_limb_t)(C2)), \
895
       "Ir" ((mpi_limb_t)(C1)), \
896
       "Ir" ((mpi_limb_t)(C0)) \
897
     : "cc")
898
899
#define SUB6_LIMB32(A5, A4, A3, A2, A1, A0, B5, B4, B3, B2, B1, B0, \
900
        C5, C4, C3, C2, C1, C0) do { \
901
    mpi_limb_t __borrow6_32; \
902
    __asm__ ("subs %3, %3, %9\n" \
903
       "sbcs %2, %2, %8\n" \
904
       "sbcs %1, %1, %7\n" \
905
       "sbc  %0, %0, %0\n" \
906
       : "=r" (__borrow6_32), \
907
         "=&r" (A2), \
908
         "=&r" (A1), \
909
         "=&r" (A0) \
910
       : "1" ((mpi_limb_t)(B2)), \
911
         "2" ((mpi_limb_t)(B1)), \
912
         "3" ((mpi_limb_t)(B0)), \
913
         "Ir" ((mpi_limb_t)(C2)), \
914
         "Ir" ((mpi_limb_t)(C1)), \
915
         "Ir" ((mpi_limb_t)(C0)) \
916
       : "cc"); \
917
    SUB4_LIMB32(A5, A4, A3, __borrow6_32, B5, B4, B3, 0, \
918
    C5, C4, C3, -__borrow6_32); \
919
  } while (0)
920
921
#endif /* HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS */
922
923
#if defined (__hppa) && __GNUC__ >= 4
924
#define ADD4_LIMB32(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
925
  __asm__ ("add %7,%11,%3\n\t" \
926
     "addc %6,%10,%2\n\t" \
927
     "addc %5,%9,%1\n\t" \
928
     "addc %4,%8,%0" \
929
     : "=r" (A3), \
930
       "=&r" (A2), \
931
       "=&r" (A1), \
932
       "=&r" (A0) \
933
     : "rM" ((mpi_limb_t)(B3)), \
934
       "rM" ((mpi_limb_t)(B2)), \
935
       "rM" ((mpi_limb_t)(B1)), \
936
       "rM" ((mpi_limb_t)(B0)), \
937
       "rM" ((mpi_limb_t)(C3)), \
938
       "rM" ((mpi_limb_t)(C2)), \
939
       "rM" ((mpi_limb_t)(C1)), \
940
       "rM" ((mpi_limb_t)(C0)) \
941
     : "cc")
942
943
#define SUB4_LIMB32(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
944
  __asm__ ("sub %7,%11,%3\n\t" \
945
     "subb %6,%10,%2\n\t" \
946
     "subb %5,%9,%1\n\t" \
947
     "subb %4,%8,%0\n\t" \
948
     : "=r" (A3), \
949
       "=&r" (A2), \
950
       "=&r" (A1), \
951
       "=&r" (A0) \
952
     : "rM" ((mpi_limb_t)(B3)), \
953
       "rM" ((mpi_limb_t)(B2)), \
954
       "rM" ((mpi_limb_t)(B1)), \
955
       "rM" ((mpi_limb_t)(B0)), \
956
       "rM" ((mpi_limb_t)(C3)), \
957
       "rM" ((mpi_limb_t)(C2)), \
958
       "rM" ((mpi_limb_t)(C1)), \
959
       "rM" ((mpi_limb_t)(C0)) \
960
     : "cc")
961
962
#endif /* __hppa */
963
964
/* Common 32-bit arch addition/subtraction macros.  */
965
966
#if defined(ADD4_LIMB32)
967
/* A[0..1] = B[0..1] + C[0..1] */
968
#define ADD2_LIMB64(A1, A0, B1, B0, C1, C0) \
969
  ADD4_LIMB32(A1.hi, A1.lo, A0.hi, A0.lo, \
970
        B1.hi, B1.lo, B0.hi, B0.lo, \
971
        C1.hi, C1.lo, C0.hi, C0.lo)
972
#else
973
/* A[0..1] = B[0..1] + C[0..1] */
974
#define ADD2_LIMB64(A1, A0, B1, B0, C1, C0) do { \
975
    mpi_limb_t __carry2_0, __carry2_1; \
976
    add_ssaaaa(__carry2_0, A0.lo, 0, B0.lo, 0, C0.lo); \
977
    add_ssaaaa(__carry2_1, A0.hi, 0, B0.hi, 0, C0.hi); \
978
    add_ssaaaa(__carry2_1, A0.hi, __carry2_1, A0.hi, 0, __carry2_0); \
979
    add_ssaaaa(A1.hi, A1.lo, B1.hi, B1.lo, C1.hi, C1.lo); \
980
    add_ssaaaa(A1.hi, A1.lo, A1.hi, A1.lo, 0, __carry2_1); \
981
  } while (0)
982
#endif
983
984
#if defined(ADD6_LIMB32)
985
/* A[0..2] = B[0..2] + C[0..2] */
986
#define ADD3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \
987
  ADD6_LIMB32(A2.hi, A2.lo, A1.hi, A1.lo, A0.hi, A0.lo, \
988
        B2.hi, B2.lo, B1.hi, B1.lo, B0.hi, B0.lo, \
989
        C2.hi, C2.lo, C1.hi, C1.lo, C0.hi, C0.lo)
990
#endif
991
992
#if defined(ADD8_LIMB32)
993
/* A[0..3] = B[0..3] + C[0..3] */
994
#define ADD4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
995
    ADD8_LIMB32(A3.hi, A3.lo, A2.hi, A2.lo, A1.hi, A1.lo, A0.hi, A0.lo, \
996
    B3.hi, B3.lo, B2.hi, B2.lo, B1.hi, B1.lo, B0.hi, B0.lo, \
997
    C3.hi, C3.lo, C2.hi, C2.lo, C1.hi, C1.lo, C0.hi, C0.lo)
998
#elif defined(ADD6_LIMB32)
999
/* A[0..3] = B[0..3] + C[0..3] */
1000
#define ADD4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) do { \
1001
    mpi_limb_t __carry4; \
1002
    ADD6_LIMB32(__carry4, A2.lo, A1.hi, A1.lo, A0.hi, A0.lo, \
1003
    0, B2.lo, B1.hi, B1.lo, B0.hi, B0.lo, \
1004
    0, C2.lo, C1.hi, C1.lo, C0.hi, C0.lo); \
1005
    ADD4_LIMB32(A3.hi, A3.lo, A2.hi, __carry4, \
1006
    B3.hi, B3.lo, B2.hi, __carry4, \
1007
    C3.hi, C3.lo, C2.hi, 0xffffffffU); \
1008
  } while (0)
1009
#endif
1010
1011
#if defined(ADD10_LIMB32)
1012
/* A[0..4] = B[0..4] + C[0..4] */
1013
#define ADD5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
1014
        C4, C3, C2, C1, C0) \
1015
    ADD10_LIMB32(A4.hi, A4.lo, A3.hi, A3.lo, A2.hi, A2.lo, A1.hi, A1.lo, \
1016
     A0.hi, A0.lo, B4.hi, B4.lo, B3.hi, B3.lo, B2.hi, B2.lo, \
1017
     B1.hi, B1.lo, B0.hi, B0.lo, C4.hi, C4.lo, C3.hi, C3.lo, \
1018
     C2.hi, C2.lo, C1.hi, C1.lo, C0.hi, C0.lo)
1019
#endif
1020
1021
#if defined(ADD14_LIMB32)
1022
/* A[0..6] = B[0..6] + C[0..6] */
1023
#define ADD7_LIMB64(A6, A5, A4, A3, A2, A1, A0, B6, B5, B4, B3, B2, B1, B0, \
1024
        C6, C5, C4, C3, C2, C1, C0) \
1025
    ADD14_LIMB32(A6.hi, A6.lo, A5.hi, A5.lo, A4.hi, A4.lo, A3.hi, A3.lo, \
1026
     A2.hi, A2.lo, A1.hi, A1.lo, A0.hi, A0.lo, B6.hi, B6.lo, \
1027
     B5.hi, B5.lo, B4.hi, B4.lo, B3.hi, B3.lo, B2.hi, B2.lo, \
1028
     B1.hi, B1.lo, B0.hi, B0.lo, C6.hi, C6.lo, C5.hi, C5.lo, \
1029
     C4.hi, C4.lo, C3.hi, C3.lo, C2.hi, C2.lo, C1.hi, C1.lo, \
1030
     C0.hi, C0.lo)
1031
#endif
1032
1033
#if defined(SUB4_LIMB32)
1034
/* A[0..1] = B[0..1] - C[0..1] */
1035
#define SUB2_LIMB64(A1, A0, B1, B0, C1, C0) \
1036
  SUB4_LIMB32(A1.hi, A1.lo, A0.hi, A0.lo, \
1037
        B1.hi, B1.lo, B0.hi, B0.lo, \
1038
        C1.hi, C1.lo, C0.hi, C0.lo)
1039
#else
1040
/* A[0..1] = B[0..1] - C[0..1] */
1041
#define SUB2_LIMB64(A1, A0, B1, B0, C1, C0) do { \
1042
    mpi_limb_t __borrow2_0, __borrow2_1; \
1043
    sub_ddmmss(__borrow2_0, A0.lo, 0, B0.lo, 0, C0.lo); \
1044
    sub_ddmmss(__borrow2_1, A0.hi, 0, B0.hi, 0, C0.hi); \
1045
    sub_ddmmss(__borrow2_1, A0.hi, __borrow2_1, A0.hi, 0, -__borrow2_0); \
1046
    sub_ddmmss(A1.hi, A1.lo, B1.hi, B1.lo, C1.hi, C1.lo); \
1047
    sub_ddmmss(A1.hi, A1.lo, A1.hi, A1.lo, 0, -__borrow2_1); \
1048
  } while (0)
1049
#endif
1050
1051
#if defined(SUB6_LIMB32)
1052
/* A[0..2] = B[0..2] - C[0..2] */
1053
#define SUB3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \
1054
  SUB6_LIMB32(A2.hi, A2.lo, A1.hi, A1.lo, A0.hi, A0.lo, \
1055
        B2.hi, B2.lo, B1.hi, B1.lo, B0.hi, B0.lo, \
1056
        C2.hi, C2.lo, C1.hi, C1.lo, C0.hi, C0.lo)
1057
#endif
1058
1059
#if defined(SUB8_LIMB32)
1060
/* A[0..3] = B[0..3] - C[0..3] */
1061
#define SUB4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
1062
    SUB8_LIMB32(A3.hi, A3.lo, A2.hi, A2.lo, A1.hi, A1.lo, A0.hi, A0.lo, \
1063
    B3.hi, B3.lo, B2.hi, B2.lo, B1.hi, B1.lo, B0.hi, B0.lo, \
1064
    C3.hi, C3.lo, C2.hi, C2.lo, C1.hi, C1.lo, C0.hi, C0.lo)
1065
#elif defined(SUB6_LIMB32)
1066
/* A[0..3] = B[0..3] - C[0..3] */
1067
#define SUB4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) do { \
1068
    mpi_limb_t __borrow4; \
1069
    SUB6_LIMB32(__borrow4, A2.lo, A1.hi, A1.lo, A0.hi, A0.lo, \
1070
    0, B2.lo, B1.hi, B1.lo, B0.hi, B0.lo, \
1071
    0, C2.lo, C1.hi, C1.lo, C0.hi, C0.lo); \
1072
    SUB4_LIMB32(A3.hi, A3.lo, A2.hi, __borrow4, \
1073
    B3.hi, B3.lo, B2.hi, 0, \
1074
    C3.hi, C3.lo, C2.hi, -__borrow4); \
1075
  } while (0)
1076
#endif
1077
1078
#if defined(SUB10_LIMB32)
1079
/* A[0..4] = B[0..4] - C[0..4] */
1080
#define SUB5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
1081
        C4, C3, C2, C1, C0) \
1082
    SUB10_LIMB32(A4.hi, A4.lo, A3.hi, A3.lo, A2.hi, A2.lo, A1.hi, A1.lo, \
1083
     A0.hi, A0.lo, B4.hi, B4.lo, B3.hi, B3.lo, B2.hi, B2.lo, \
1084
     B1.hi, B1.lo, B0.hi, B0.lo, C4.hi, C4.lo, C3.hi, C3.lo, \
1085
     C2.hi, C2.lo, C1.hi, C1.lo, C0.hi, C0.lo)
1086
#endif
1087
1088
#if defined(SUB14_LIMB32)
1089
/* A[0..6] = B[0..6] - C[0..6] */
1090
#define SUB7_LIMB64(A6, A5, A4, A3, A2, A1, A0, B6, B5, B4, B3, B2, B1, B0, \
1091
        C6, C5, C4, C3, C2, C1, C0) \
1092
    SUB14_LIMB32(A6.hi, A6.lo, A5.hi, A5.lo, A4.hi, A4.lo, A3.hi, A3.lo, \
1093
     A2.hi, A2.lo, A1.hi, A1.lo, A0.hi, A0.lo, B6.hi, B6.lo, \
1094
     B5.hi, B5.lo, B4.hi, B4.lo, B3.hi, B3.lo, B2.hi, B2.lo, \
1095
     B1.hi, B1.lo, B0.hi, B0.lo, C6.hi, C6.lo, C5.hi, C5.lo, \
1096
     C4.hi, C4.lo, C3.hi, C3.lo, C2.hi, C2.lo, C1.hi, C1.lo, \
1097
     C0.hi, C0.lo)
1098
#endif
1099
1100
#endif /* BYTES_PER_MPI_LIMB == 4 */
1101
1102
1103
/* Common definitions.  */
1104
#define BITS_PER_MPI_LIMB64 (BITS_PER_MPI_LIMB * LIMBS_PER_LIMB64)
1105
#define BYTES_PER_MPI_LIMB64 (BYTES_PER_MPI_LIMB * LIMBS_PER_LIMB64)
1106
1107
1108
/* Common addition/subtraction macros.  */
1109
1110
#ifndef ADD3_LIMB64
1111
/* A[0..2] = B[0..2] + C[0..2] */
1112
#define ADD3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) do { \
1113
    mpi_limb64_t __carry3; \
1114
    ADD2_LIMB64(__carry3, A0, zero, B0, zero, C0); \
1115
    ADD2_LIMB64(A2, A1, B2, B1, C2, C1); \
1116
    ADD2_LIMB64(A2, A1, A2, A1, zero, __carry3); \
1117
  } while (0)
1118
#endif
1119
1120
#ifndef ADD4_LIMB64
1121
/* A[0..3] = B[0..3] + C[0..3] */
1122
#define ADD4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) do { \
1123
    mpi_limb64_t __carry4; \
1124
    ADD3_LIMB64(__carry4, A1, A0, zero, B1, B0, zero, C1, C0); \
1125
    ADD2_LIMB64(A3, A2, B3, B2, C3, C2); \
1126
    ADD2_LIMB64(A3, A2, A3, A2, zero, __carry4); \
1127
  } while (0)
1128
#endif
1129
1130
#ifndef ADD5_LIMB64
1131
/* A[0..4] = B[0..4] + C[0..4] */
1132
#define ADD5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
1133
                    C4, C3, C2, C1, C0) do { \
1134
    mpi_limb64_t __carry5; \
1135
    ADD4_LIMB64(__carry5, A2, A1, A0, zero, B2, B1, B0, zero, C2, C1, C0); \
1136
    ADD2_LIMB64(A4, A3, B4, B3, C4, C3); \
1137
    ADD2_LIMB64(A4, A3, A4, A3, zero, __carry5); \
1138
  } while (0)
1139
#endif
1140
1141
#ifndef ADD7_LIMB64
1142
/* A[0..6] = B[0..6] + C[0..6] */
1143
#define ADD7_LIMB64(A6, A5, A4, A3, A2, A1, A0, B6, B5, B4, B3, B2, B1, B0, \
1144
26.9M
                    C6, C5, C4, C3, C2, C1, C0) do { \
1145
26.9M
    mpi_limb64_t __carry7; \
1146
26.9M
    ADD4_LIMB64(__carry7, A2, A1, A0, zero, B2, B1, B0, \
1147
26.9M
    zero, C2, C1, C0); \
1148
26.9M
    ADD5_LIMB64(A6, A5, A4, A3, __carry7, B6, B5, B4, B3, \
1149
26.9M
    __carry7, C6, C5, C4, C3, LIMB64_HILO(-1, -1)); \
1150
26.9M
  } while (0)
1151
#endif
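
Two editor-added notes on the generic ADD7_LIMB64 above (they apply to the other generic fallbacks as well). First, these macros reference an identifier 'zero', which the caller is expected to have in scope as a 64-bit zero limb. Second, the second ADD5_LIMB64 re-injects the carry from the low three limbs by adding the all-ones constant LIMB64_HILO(-1, -1) to __carry7 in the lowest position; that addition wraps exactly when __carry7 is 1, so the carry enters the A3..A6 chain without a branch. A small sketch of that property (assuming 64-bit limbs; not code from this file):

/* Editor-added sketch, not from ec-inline.h: the carry-fold used by the
 * generic ADD7_LIMB64.  Adding the all-ones limb to a carry value that is
 * 0 or 1 wraps exactly when the carry is 1, so the carry out of this limb
 * equals the original carry.  */
static mpi_limb_t
example_carry_fold (mpi_limb_t carry01)
{
  mpi_limb_t sum = carry01 + ~(mpi_limb_t) 0;   /* carry01 + (2^n - 1) */
  return (mpi_limb_t) (sum < carry01);          /* 1 iff carry01 == 1 */
}
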
1152
1153
#ifndef SUB3_LIMB64
1154
/* A[0..2] = B[0..2] - C[0..2] */
1155
#define SUB3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) do { \
1156
    mpi_limb64_t __borrow3; \
1157
    SUB2_LIMB64(__borrow3, A0, zero, B0, zero, C0); \
1158
    SUB2_LIMB64(A2, A1, B2, B1, C2, C1); \
1159
    SUB2_LIMB64(A2, A1, A2, A1, zero, LIMB_TO64(-LIMB_FROM64(__borrow3))); \
1160
  } while (0)
1161
#endif
1162
1163
#ifndef SUB4_LIMB64
1164
/* A[0..3] = B[0..3] - C[0..3] */
1165
#define SUB4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) do { \
1166
    mpi_limb64_t __borrow4; \
1167
    SUB3_LIMB64(__borrow4, A1, A0, zero, B1, B0, zero, C1, C0); \
1168
    SUB2_LIMB64(A3, A2, B3, B2, C3, C2); \
1169
    SUB2_LIMB64(A3, A2, A3, A2, zero, LIMB_TO64(-LIMB_FROM64(__borrow4))); \
1170
  } while (0)
1171
#endif
1172
1173
#ifndef SUB5_LIMB64
1174
/* A[0..4] = B[0..4] - C[0..4] */
1175
#define SUB5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
1176
                    C4, C3, C2, C1, C0) do { \
1177
    mpi_limb64_t __borrow5; \
1178
    SUB4_LIMB64(__borrow5, A2, A1, A0, zero, B2, B1, B0, zero, C2, C1, C0); \
1179
    SUB2_LIMB64(A4, A3, B4, B3, C4, C3); \
1180
    SUB2_LIMB64(A4, A3, A4, A3, zero, LIMB_TO64(-LIMB_FROM64(__borrow5))); \
1181
  } while (0)
1182
#endif
1183
1184
#ifndef SUB7_LIMB64
1185
/* A[0..6] = B[0..6] - C[0..6] */
1186
#define SUB7_LIMB64(A6, A5, A4, A3, A2, A1, A0, B6, B5, B4, B3, B2, B1, B0, \
1187
8.97M
                    C6, C5, C4, C3, C2, C1, C0) do { \
1188
8.97M
    mpi_limb64_t __borrow7; \
1189
8.97M
    SUB4_LIMB64(__borrow7, A2, A1, A0, zero, B2, B1, B0, \
1190
8.97M
    zero, C2, C1, C0); \
1191
8.97M
    SUB5_LIMB64(A6, A5, A4, A3, __borrow7, B6, B5, B4, B3, zero, \
1192
8.97M
    C6, C5, C4, C3, LIMB_TO64(-LIMB_FROM64(__borrow7))); \
1193
8.97M
  } while (0)
1194
#endif
1195
1196
1197
#if defined(WORDS_BIGENDIAN) || (BITS_PER_MPI_LIMB64 != BITS_PER_MPI_LIMB)
1198
#define LOAD64_UNALIGNED(x, pos) \
1199
  LIMB64_HILO(LOAD32(x, 2 * (pos) + 2), LOAD32(x, 2 * (pos) + 1))
1200
#else
1201
#define LOAD64_UNALIGNED(x, pos) \
1202
  buf_get_le64((const byte *)(&(x)[pos]) + 4)
1203
#endif
1204
1205
1206
/* Helper functions.  */
1207
1208
static inline int
1209
mpi_nbits_more_than (gcry_mpi_t w, unsigned int nbits)
1210
14.1M
{
1211
14.1M
  unsigned int nbits_nlimbs;
1212
14.1M
  mpi_limb_t wlimb;
1213
14.1M
  unsigned int n;
1214
1215
14.1M
  nbits_nlimbs = (nbits + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB;
1216
1217
  /* Note: Assumes that 'w' is normalized. */
1218
1219
14.1M
  if (w->nlimbs > nbits_nlimbs)
1220
0
    return 1;
1221
14.1M
  if (w->nlimbs < nbits_nlimbs)
1222
5.67M
    return 0;
1223
8.44M
  if ((nbits % BITS_PER_MPI_LIMB) == 0)
1224
5.80M
    return 0;
1225
1226
2.64M
  wlimb = w->d[nbits_nlimbs - 1];
1227
2.64M
  if (wlimb == 0)
1228
0
    log_bug ("mpi_nbits_more_than: input mpi not normalized\n");
1229
1230
2.64M
  count_leading_zeros (n, wlimb);
1231
1232
2.64M
  return (BITS_PER_MPI_LIMB - n) > (nbits % BITS_PER_MPI_LIMB);
1233
2.64M
}
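
mpi_nbits_more_than() gives callers a cheap "is this value wider than N bits?" test that avoids a full bit count in the common cases. An editor-added usage sketch follows; the function name, the bound handling and the use of _gcry_mpi_sub() are assumptions for illustration only, not how any particular curve's reduction is implemented.

/* Editor-added sketch, not from ec-inline.h: trim a value back under a
 * field-size bit bound by repeated subtraction of the prime 'p', which has
 * 'pbits' significant bits.  A real reduction would also handle the case
 * where 'w' and 'p' have the same width but w >= p.  */
static void
example_trim_to_field_width (gcry_mpi_t w, gcry_mpi_t p, unsigned int pbits)
{
  while (mpi_nbits_more_than (w, pbits))
    _gcry_mpi_sub (w, w, p);
}
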
1234
1235
#endif /* GCRY_EC_INLINE_H */