/src/botan/src/lib/pubkey/curve448/curve448_gf.cpp

Source (jump to first uncovered line)
/*
* X448 Gf Modulo 2^448 - 2^224 - 1
* (C) 2024 Jack Lloyd
*     2024 Fabian Albert - Rohde & Schwarz Cybersecurity
*
* Botan is released under the Simplified BSD License (see license.txt)
*/
#include <botan/internal/curve448_gf.h>

#include <botan/internal/ct_utils.h>
#include <botan/internal/loadstor.h>
#include <botan/internal/mp_asmi.h>
#include <botan/internal/mp_core.h>

#include <algorithm>

namespace Botan {

namespace {

/**
 * @brief Compute (a + b). The carry is returned in the carry parameter.
 *  The carry is not included for the addition.
 */
inline uint64_t u64_add(uint64_t a, uint64_t b, bool* carry) {
   // Let the compiler optimize this into fancy instructions
   const uint64_t sum = a + b;
   *carry = sum < a;
   return sum;
}

/**
 * @brief Compute (a + b + carry), where carry is in {0, 1}. The carry of this computation
 * is store in the in/out @p carry parameter.
 */
inline uint64_t u64_add_with_carry(uint64_t a, uint64_t b, bool* carry) {
   // Let the compiler optimize this into fancy instructions
   uint64_t sum = a + b;
   const bool carry_a_plus_b = (sum < a);
   sum += static_cast<uint64_t>(*carry);
   *carry = static_cast<uint64_t>(carry_a_plus_b) | static_cast<uint64_t>(sum < static_cast<uint64_t>(*carry));
   return sum;
}

/**
 * @brief Compute (a - (b + borrow)). The borrow is returned in the carry parameter.
 *
 * I.e. borrow = 1 if a < b + borrow, else 0.
 */
inline uint64_t u64_sub_with_borrow(uint64_t a, uint64_t b, bool* borrow) {
   // Let the compiler optimize this into fancy instructions
   const uint64_t diff = a - b;
   const bool borrow_a_min_b = diff > a;
   const uint64_t z = diff - static_cast<uint64_t>(*borrow);
   *borrow = static_cast<uint64_t>(borrow_a_min_b) | static_cast<uint64_t>(z > diff);
   return z;
}

/**
 * @brief Reduce the result of a addition modulo 2^448 - 2^224 - 1.
 *
 * Algorithm 1 of paper "Reduction Modulo 2^448 - 2^224 - 1", from line 27.
 *
 * @param h_3 Output
 * @param h_1 Input
 */
void reduce_after_add(std::span<uint64_t, WORDS_448> h_3, std::span<const uint64_t, 8> h_1) {
   std::array<uint64_t, 8> h_2;
   bool carry;

   // Line 27+ (of the paper's algorithm 1)
   h_2[0] = u64_add(h_1[0], h_1[7], &carry);

   h_2[1] = u64_add(h_1[1], carry, &carry);
   h_2[2] = u64_add(h_1[2], carry, &carry);

   // Line 30
   h_2[3] = u64_add_with_carry(h_1[3], h_1[7] << 32, &carry);

   // Line 31+
   h_2[4] = u64_add(h_1[4], carry, &carry);
   h_2[5] = u64_add(h_1[5], carry, &carry);
   h_2[6] = u64_add(h_1[6], carry, &carry);

   h_2[7] = carry;

   h_3[0] = u64_add(h_2[0], h_2[7], &carry);
   h_3[1] = u64_add(h_2[1], carry, &carry);
   h_3[2] = u64_add(h_2[2], carry, &carry);
   // Line 37
   h_3[3] = h_2[3] + (h_2[7] << 32) + carry;

   // Line 38
   h_3[4] = h_2[4];
   h_3[5] = h_2[5];
   h_3[6] = h_2[6];
}

/**
 * @brief Reduce the result of a addition modulo 2^448 - 2^224 - 1.
 *
 * Algorithm 1 of paper "Reduction Modulo 2^448 - 2^224 - 1".
 */
void reduce_after_mul(std::span<uint64_t, WORDS_448> out, std::span<const uint64_t, 14> in) {
   std::array<uint64_t, 8> r;
   std::array<uint64_t, 8> s;
   std::array<uint64_t, 8> t_0;
   std::array<uint64_t, 8> h_1;

   bool carry;

   // Line 4 (of the paper's algorithm 1)
   r[0] = u64_add(in[0], in[7], &carry);

   // Line 5-7
   for(size_t i = 1; i < 7; ++i) {
      r[i] = u64_add_with_carry(in[i], in[i + 7], &carry);
   }
   r[7] = carry;
   s[0] = r[0];
   s[1] = r[1];
   s[2] = r[2];
   // Line 10
   s[3] = u64_add(r[3], in[10] & 0xFFFFFFFF00000000, &carry);
   // Line 11-13
   for(size_t i = 4; i < 7; ++i) {
      s[i] = u64_add_with_carry(r[i], in[i + 7], &carry);
   }
   s[7] = r[7] + carry;

   // Line 15-17
   for(size_t i = 0; i < 3; ++i) {
      t_0[i] = (in[i + 11] << 32) | (in[i + 10] >> 32);
   }
   // Line 18
   t_0[3] = (in[7] << 32) | (in[13] >> 32);
   // Line 19-21
   for(size_t i = 4; i < 7; ++i) {
      t_0[i] = (in[i + 4] << 32) | (in[i + 3] >> 32);
   }
   h_1[0] = u64_add(s[0], t_0[0], &carry);
   // Line 23-25
   for(size_t i = 1; i < 7; ++i) {
      h_1[i] = u64_add_with_carry(s[i], t_0[i], &carry);
   }
   h_1[7] = s[7] + carry;

   reduce_after_add(out, h_1);
}

void gf_mul(std::span<uint64_t, WORDS_448> out,
            std::span<const uint64_t, WORDS_448> a,
            std::span<const uint64_t, WORDS_448> b) {
   std::array<uint64_t, 14> ws;
   comba_mul<7>(ws.data(), a.data(), b.data());
   reduce_after_mul(out, ws);
}

void gf_square(std::span<uint64_t, WORDS_448> out, std::span<const uint64_t, WORDS_448> a) {
   std::array<uint64_t, 14> ws;
   comba_sqr<7>(ws.data(), a.data());
   reduce_after_mul(out, ws);
}

void gf_add(std::span<uint64_t, WORDS_448> out,
            std::span<const uint64_t, WORDS_448> a,
            std::span<const uint64_t, WORDS_448> b) {
   std::array<uint64_t, WORDS_448 + 1> ws;
   copy_mem(std::span(ws).first<WORDS_448>(), a);
   ws[WORDS_448] = 0;

   bool carry = false;
   for(size_t i = 0; i < WORDS_448; ++i) {
      ws[i] = u64_add_with_carry(a[i], b[i], &carry);
   }
   ws[WORDS_448] = carry;

   reduce_after_add(out, ws);
}

/**
 * @brief Subtract two elements in GF(P). out = a - b
 *
 * Algorithm 2 of paper: "Reduction Modulo 2^448 - 2^224 - 1"
 */
void gf_sub(std::span<uint64_t, WORDS_448> out,
            std::span<const uint64_t, WORDS_448> a,
            std::span<const uint64_t, WORDS_448> b) {
   std::array<uint64_t, WORDS_448> h_0;
   std::array<uint64_t, WORDS_448> h_1;

   bool borrow = false;
   for(size_t i = 0; i < WORDS_448; ++i) {
      h_0[i] = u64_sub_with_borrow(a[i], b[i], &borrow);
   }
   uint64_t delta = borrow;
   uint64_t delta_p = delta << 32;
   borrow = false;

   h_1[0] = u64_sub_with_borrow(h_0[0], delta, &borrow);
   h_1[1] = u64_sub_with_borrow(h_0[1], 0, &borrow);
   h_1[2] = u64_sub_with_borrow(h_0[2], 0, &borrow);
   h_1[3] = u64_sub_with_borrow(h_0[3], delta_p, &borrow);
   h_1[4] = u64_sub_with_borrow(h_0[4], 0, &borrow);
   h_1[5] = u64_sub_with_borrow(h_0[5], 0, &borrow);
   h_1[6] = u64_sub_with_borrow(h_0[6], 0, &borrow);

   delta = borrow;
   delta_p = delta << 32;
   borrow = false;

   out[0] = u64_sub_with_borrow(h_1[0], delta, &borrow);
   out[1] = u64_sub_with_borrow(h_1[1], 0, &borrow);
   out[2] = u64_sub_with_borrow(h_1[2], 0, &borrow);
   out[3] = u64_sub_with_borrow(h_1[3], delta_p, &borrow);
   out[4] = h_1[4];
   out[5] = h_1[5];
   out[6] = h_1[6];
}

/**
 * @brief Inversion in GF(P) using Fermat's little theorem:
 * x^-1 = x^(P-2) mod P
 */
void gf_inv(std::span<uint64_t, WORDS_448> out, std::span<const uint64_t, WORDS_448> a) {
   clear_mem(out);
   out[0] = 1;
   // Square and multiply
   for(int16_t t = 448; t >= 0; --t) {
      gf_square(out, out);
      // (P-2) has zero bits at indices 1, 224, 448. All others are one.
      if(t != 448 && t != 224 && t != 1) {
         gf_mul(out, out, a);
      }
   }
}

/**
 * @brief Convert a number to its canonical representation.
 *
 * I.e. if the number is greater than P, subtract P. The number cannot be >= 2P
 * since 2*P > 2^(7*64).
 */
std::array<uint64_t, WORDS_448> to_canonical(std::span<const uint64_t, WORDS_448> in) {
   const std::array<uint64_t, WORDS_448> p = {0xffffffffffffffff,
                                              0xffffffffffffffff,
                                              0xffffffffffffffff,
                                              0xfffffffeffffffff,
                                              0xffffffffffffffff,
                                              0xffffffffffffffff,
                                              0xffffffffffffffff};

   std::array<uint64_t, WORDS_448> in_minus_p;
   bool borrow = false;
   for(size_t i = 0; i < WORDS_448; ++i) {
      in_minus_p[i] = u64_sub_with_borrow(in[i], p[i], &borrow);
   }
   std::array<uint64_t, WORDS_448> out;
   CT::Mask<uint64_t>::expand(borrow).select_n(out.data(), in.data(), in_minus_p.data(), WORDS_448);
   return out;
}

}  // namespace

Gf448Elem::Gf448Elem(std::span<const uint8_t, BYTES_448> x) {
   load_le(m_x, x);
}

Gf448Elem::Gf448Elem(uint64_t least_sig_word) {
   clear_mem(m_x);
   m_x[0] = least_sig_word;
}

void Gf448Elem::to_bytes(std::span<uint8_t, BYTES_448> out) const {
   store_le(out, to_canonical(m_x));
}

std::array<uint8_t, BYTES_448> Gf448Elem::to_bytes() const {
   std::array<uint8_t, BYTES_448> bytes;
   to_bytes(bytes);
   return bytes;
}

void Gf448Elem::ct_cond_swap(bool b, Gf448Elem& other) {
   for(size_t i = 0; i < WORDS_448; ++i) {
      CT::conditional_swap(b, m_x[i], other.m_x[i]);
   }
}

void Gf448Elem::ct_cond_assign(bool b, const Gf448Elem& other) {
   CT::conditional_assign_mem(static_cast<uint64_t>(b), m_x.data(), other.m_x.data(), WORDS_448);
}

Gf448Elem Gf448Elem::operator+(const Gf448Elem& other) const {
   Gf448Elem res(0);
   gf_add(res.m_x, m_x, other.m_x);
   return res;
}

Gf448Elem Gf448Elem::operator-(const Gf448Elem& other) const {
   Gf448Elem res(0);
   gf_sub(res.m_x, m_x, other.m_x);
   return res;
}

Gf448Elem Gf448Elem::operator-() const {
   Gf448Elem res(0);
   gf_sub(res.m_x, res.m_x, m_x);
   return res;
}

Gf448Elem Gf448Elem::operator*(const Gf448Elem& other) const {
   Gf448Elem res(0);
   gf_mul(res.m_x, m_x, other.m_x);
   return res;
}

Gf448Elem Gf448Elem::operator/(const Gf448Elem& other) const {
   Gf448Elem res(0);
   gf_inv(res.m_x, other.m_x);
   gf_mul(res.m_x, m_x, res.m_x);
   return res;
}

bool Gf448Elem::operator==(const Gf448Elem& other) const {
   const auto canonical_form_this = to_canonical(m_x);
   const auto canonical_form_other = to_canonical(other.m_x);
   return CT::is_equal(canonical_form_this.data(), canonical_form_other.data(), WORDS_448).as_bool();
}

bool Gf448Elem::is_zero() const {
   const auto canonical_form = to_canonical(m_x);

   return CT::all_zeros(canonical_form.data(), WORDS_448).as_bool();
}

bool Gf448Elem::is_odd() const {
   const auto canonical_form = to_canonical(m_x);
   return (canonical_form[0] & 1) == 1;
}

bool Gf448Elem::bytes_are_canonical_representation(std::span<const uint8_t, BYTES_448> x) {
   const auto x_words = load_le<std::array<uint64_t, WORDS_448>>(x);
   const auto x_words_canonical = to_canonical(x_words);
   return CT::is_equal(x_words.data(), x_words_canonical.data(), WORDS_448).as_bool();
}

Gf448Elem square(const Gf448Elem& elem) {
   Gf448Elem res(0);
   gf_square(res.words(), elem.words());
   return res;
}

Gf448Elem root(const Gf448Elem& elem) {
   Gf448Elem res(1);

   // (P-3)/4 is an 445 bit integer with one zero bits at 222. All others are one.
   for(int16_t t = 445; t >= 0; --t) {
      gf_square(res.words(), res.words());
      if(t != 222) {
         gf_mul(res.words(), res.words(), elem.words());
      }
   }

   return res;
}

}  // namespace Botan

Coverage Report

Created: 2024-11-29 06:10

Line	Count	Source (jump to first uncovered line)
1		/*
2		* X448 Gf Modulo 2^448 - 2^224 - 1
3		* (C) 2024 Jack Lloyd
4		* 2024 Fabian Albert - Rohde & Schwarz Cybersecurity
5		*
6		* Botan is released under the Simplified BSD License (see license.txt)
7		*/
8		#include <botan/internal/curve448_gf.h>
9
10		#include <botan/internal/ct_utils.h>
11		#include <botan/internal/loadstor.h>
12		#include <botan/internal/mp_asmi.h>
13		#include <botan/internal/mp_core.h>
14
15		#include <algorithm>
16
17		namespace Botan {
18
19		namespace {
20
21		/**
22		* @brief Compute (a + b). The carry is returned in the carry parameter.
23		* The carry is not included for the addition.
24		*/
25	212M	inline uint64_t u64_add(uint64_t a, uint64_t b, bool* carry) {
26		// Let the compiler optimize this into fancy instructions
27	212M	const uint64_t sum = a + b;
28	212M	*carry = sum < a;
29	212M	return sum;
30	212M	}
31
32		/**
33		* @brief Compute (a + b + carry), where carry is in {0, 1}. The carry of this computation
34		* is store in the in/out @p carry parameter.
35		*/
36	264M	inline uint64_t u64_add_with_carry(uint64_t a, uint64_t b, bool* carry) {
37		// Let the compiler optimize this into fancy instructions
38	264M	uint64_t sum = a + b;
39	264M	const bool carry_a_plus_b = (sum < a);
40	264M	sum += static_cast<uint64_t>(*carry);
41	264M	carry = static_cast<uint64_t>(carry_a_plus_b) \| static_cast<uint64_t>(sum < static_cast<uint64_t>(carry));
42	264M	return sum;
43	264M	}
44
45		/**
46		* @brief Compute (a - (b + borrow)). The borrow is returned in the carry parameter.
47		*
48		* I.e. borrow = 1 if a < b + borrow, else 0.
49		*/
50	85.8M	inline uint64_t u64_sub_with_borrow(uint64_t a, uint64_t b, bool* borrow) {
51		// Let the compiler optimize this into fancy instructions
52	85.8M	const uint64_t diff = a - b;
53	85.8M	const bool borrow_a_min_b = diff > a;
54	85.8M	const uint64_t z = diff - static_cast<uint64_t>(*borrow);
55	85.8M	*borrow = static_cast<uint64_t>(borrow_a_min_b) \| static_cast<uint64_t>(z > diff);
56	85.8M	return z;
57	85.8M	}
58
59		/**
60		* @brief Reduce the result of a addition modulo 2^448 - 2^224 - 1.
61		*
62		* Algorithm 1 of paper "Reduction Modulo 2^448 - 2^224 - 1", from line 27.
63		*
64		* @param h_3 Output
65		* @param h_1 Input
66		*/
67	18.8M	void reduce_after_add(std::span<uint64_t, WORDS_448> h_3, std::span<const uint64_t, 8> h_1) {
68	18.8M	std::array<uint64_t, 8> h_2;
69	18.8M	bool carry;
70
71		// Line 27+ (of the paper's algorithm 1)
72	18.8M	h_2[0] = u64_add(h_1[0], h_1[7], &carry);
73
74	18.8M	h_2[1] = u64_add(h_1[1], carry, &carry);
75	18.8M	h_2[2] = u64_add(h_1[2], carry, &carry);
76
77		// Line 30
78	18.8M	h_2[3] = u64_add_with_carry(h_1[3], h_1[7] << 32, &carry);
79
80		// Line 31+
81	18.8M	h_2[4] = u64_add(h_1[4], carry, &carry);
82	18.8M	h_2[5] = u64_add(h_1[5], carry, &carry);
83	18.8M	h_2[6] = u64_add(h_1[6], carry, &carry);
84
85	18.8M	h_2[7] = carry;
86
87	18.8M	h_3[0] = u64_add(h_2[0], h_2[7], &carry);
88	18.8M	h_3[1] = u64_add(h_2[1], carry, &carry);
89	18.8M	h_3[2] = u64_add(h_2[2], carry, &carry);
90		// Line 37
91	18.8M	h_3[3] = h_2[3] + (h_2[7] << 32) + carry;
92
93		// Line 38
94	18.8M	h_3[4] = h_2[4];
95	18.8M	h_3[5] = h_2[5];
96	18.8M	h_3[6] = h_2[6];
97	18.8M	}
98
99		/**
100		* @brief Reduce the result of a addition modulo 2^448 - 2^224 - 1.
101		*
102		* Algorithm 1 of paper "Reduction Modulo 2^448 - 2^224 - 1".
103		*/
104	14.2M	void reduce_after_mul(std::span<uint64_t, WORDS_448> out, std::span<const uint64_t, 14> in) {
105	14.2M	std::array<uint64_t, 8> r;
106	14.2M	std::array<uint64_t, 8> s;
107	14.2M	std::array<uint64_t, 8> t_0;
108	14.2M	std::array<uint64_t, 8> h_1;
109
110	14.2M	bool carry;
111
112		// Line 4 (of the paper's algorithm 1)
113	14.2M	r[0] = u64_add(in[0], in[7], &carry);
114
115		// Line 5-7
116	99.6M	for(size_t i = 1; i < 7; ++i) {
117	85.3M	r[i] = u64_add_with_carry(in[i], in[i + 7], &carry);
118	85.3M	}
119	14.2M	r[7] = carry;
120	14.2M	s[0] = r[0];
121	14.2M	s[1] = r[1];
122	14.2M	s[2] = r[2];
123		// Line 10
124	14.2M	s[3] = u64_add(r[3], in[10] & 0xFFFFFFFF00000000, &carry);
125		// Line 11-13
126	56.9M	for(size_t i = 4; i < 7; ++i) {
127	42.6M	s[i] = u64_add_with_carry(r[i], in[i + 7], &carry);
128	42.6M	}
129	14.2M	s[7] = r[7] + carry;
130
131		// Line 15-17
132	56.9M	for(size_t i = 0; i < 3; ++i) {
133	42.6M	t_0[i] = (in[i + 11] << 32) \| (in[i + 10] >> 32);
134	42.6M	}
135		// Line 18
136	14.2M	t_0[3] = (in[7] << 32) \| (in[13] >> 32);
137		// Line 19-21
138	56.9M	for(size_t i = 4; i < 7; ++i) {
139	42.6M	t_0[i] = (in[i + 4] << 32) \| (in[i + 3] >> 32);
140	42.6M	}
141	14.2M	h_1[0] = u64_add(s[0], t_0[0], &carry);
142		// Line 23-25
143	99.6M	for(size_t i = 1; i < 7; ++i) {
144	85.3M	h_1[i] = u64_add_with_carry(s[i], t_0[i], &carry);
145	85.3M	}
146	14.2M	h_1[7] = s[7] + carry;
147
148	14.2M	reduce_after_add(out, h_1);
149	14.2M	}
150
151		void gf_mul(std::span<uint64_t, WORDS_448> out,
152		std::span<const uint64_t, WORDS_448> a,
153	8.48M	std::span<const uint64_t, WORDS_448> b) {
154	8.48M	std::array<uint64_t, 14> ws;
155	8.48M	comba_mul<7>(ws.data(), a.data(), b.data());
156	8.48M	reduce_after_mul(out, ws);
157	8.48M	}
158
159	5.74M	void gf_square(std::span<uint64_t, WORDS_448> out, std::span<const uint64_t, WORDS_448> a) {
160	5.74M	std::array<uint64_t, 14> ws;
161	5.74M	comba_sqr<7>(ws.data(), a.data());
162	5.74M	reduce_after_mul(out, ws);
163	5.74M	}
164
165		void gf_add(std::span<uint64_t, WORDS_448> out,
166		std::span<const uint64_t, WORDS_448> a,
167	4.62M	std::span<const uint64_t, WORDS_448> b) {
168	4.62M	std::array<uint64_t, WORDS_448 + 1> ws;
169	4.62M	copy_mem(std::span(ws).first<WORDS_448>(), a);
170	4.62M	ws[WORDS_448] = 0;
171
172	4.62M	bool carry = false;
173	36.9M	for(size_t i = 0; i < WORDS_448; ++i) {
174	32.3M	ws[i] = u64_add_with_carry(a[i], b[i], &carry);
175	32.3M	}
176	4.62M	ws[WORDS_448] = carry;
177
178	4.62M	reduce_after_add(out, ws);
179	4.62M	}
180
181		/**
182		* @brief Subtract two elements in GF(P). out = a - b
183		*
184		* Algorithm 2 of paper: "Reduction Modulo 2^448 - 2^224 - 1"
185		*/
186		void gf_sub(std::span<uint64_t, WORDS_448> out,
187		std::span<const uint64_t, WORDS_448> a,
188	4.76M	std::span<const uint64_t, WORDS_448> b) {
189	4.76M	std::array<uint64_t, WORDS_448> h_0;
190	4.76M	std::array<uint64_t, WORDS_448> h_1;
191
192	4.76M	bool borrow = false;
193	38.1M	for(size_t i = 0; i < WORDS_448; ++i) {
194	33.3M	h_0[i] = u64_sub_with_borrow(a[i], b[i], &borrow);
195	33.3M	}
196	4.76M	uint64_t delta = borrow;
197	4.76M	uint64_t delta_p = delta << 32;
198	4.76M	borrow = false;
199
200	4.76M	h_1[0] = u64_sub_with_borrow(h_0[0], delta, &borrow);
201	4.76M	h_1[1] = u64_sub_with_borrow(h_0[1], 0, &borrow);
202	4.76M	h_1[2] = u64_sub_with_borrow(h_0[2], 0, &borrow);
203	4.76M	h_1[3] = u64_sub_with_borrow(h_0[3], delta_p, &borrow);
204	4.76M	h_1[4] = u64_sub_with_borrow(h_0[4], 0, &borrow);
205	4.76M	h_1[5] = u64_sub_with_borrow(h_0[5], 0, &borrow);
206	4.76M	h_1[6] = u64_sub_with_borrow(h_0[6], 0, &borrow);
207
208	4.76M	delta = borrow;
209	4.76M	delta_p = delta << 32;
210	4.76M	borrow = false;
211
212	4.76M	out[0] = u64_sub_with_borrow(h_1[0], delta, &borrow);
213	4.76M	out[1] = u64_sub_with_borrow(h_1[1], 0, &borrow);
214	4.76M	out[2] = u64_sub_with_borrow(h_1[2], 0, &borrow);
215	4.76M	out[3] = u64_sub_with_borrow(h_1[3], delta_p, &borrow);
216	4.76M	out[4] = h_1[4];
217	4.76M	out[5] = h_1[5];
218	4.76M	out[6] = h_1[6];
219	4.76M	}
220
221		/**
222		* @brief Inversion in GF(P) using Fermat's little theorem:
223		* x^-1 = x^(P-2) mod P
224		*/
225	2.66k	void gf_inv(std::span<uint64_t, WORDS_448> out, std::span<const uint64_t, WORDS_448> a) {
226	2.66k	clear_mem(out);
227	2.66k	out[0] = 1;
228		// Square and multiply
229	1.19M	for(int16_t t = 448; t >= 0; --t) {
230	1.19M	gf_square(out, out);
231		// (P-2) has zero bits at indices 1, 224, 448. All others are one.
232	1.19M	if(t != 448 && t != 224 && t != 1) {
233	1.18M	gf_mul(out, out, a);
234	1.18M	}
235	1.19M	}
236	2.66k	}
237
238		/**
239		* @brief Convert a number to its canonical representation.
240		*
241		* I.e. if the number is greater than P, subtract P. The number cannot be >= 2P
242		* since 2P > 2^(764).
243		*/
244	2.66k	std::array<uint64_t, WORDS_448> to_canonical(std::span<const uint64_t, WORDS_448> in) {
245	2.66k	const std::array<uint64_t, WORDS_448> p = {0xffffffffffffffff,
246	2.66k	0xffffffffffffffff,
247	2.66k	0xffffffffffffffff,
248	2.66k	0xfffffffeffffffff,
249	2.66k	0xffffffffffffffff,
250	2.66k	0xffffffffffffffff,
251	2.66k	0xffffffffffffffff};
252
253	2.66k	std::array<uint64_t, WORDS_448> in_minus_p;
254	2.66k	bool borrow = false;
255	21.2k	for(size_t i = 0; i < WORDS_448; ++i) {
256	18.6k	in_minus_p[i] = u64_sub_with_borrow(in[i], p[i], &borrow);
257	18.6k	}
258	2.66k	std::array<uint64_t, WORDS_448> out;
259	2.66k	CT::Mask<uint64_t>::expand(borrow).select_n(out.data(), in.data(), in_minus_p.data(), WORDS_448);
260	2.66k	return out;
261	2.66k	}
262
263		} // namespace
264
265	4.66k	Gf448Elem::Gf448Elem(std::span<const uint8_t, BYTES_448> x) {
266	4.66k	load_le(m_x, x);
267	4.66k	}
268
269	21.3M	Gf448Elem::Gf448Elem(uint64_t least_sig_word) {
270	21.3M	clear_mem(m_x);
271	21.3M	m_x[0] = least_sig_word;
272	21.3M	}
273
274	2.49k	void Gf448Elem::to_bytes(std::span<uint8_t, BYTES_448> out) const {
275	2.49k	store_le(out, to_canonical(m_x));
276	2.49k	}
277
278	2.33k	std::array<uint8_t, BYTES_448> Gf448Elem::to_bytes() const {
279	2.33k	std::array<uint8_t, BYTES_448> bytes;
280	2.33k	to_bytes(bytes);
281	2.33k	return bytes;
282	2.33k	}
283
284	2.09M	void Gf448Elem::ct_cond_swap(bool b, Gf448Elem& other) {
285	16.7M	for(size_t i = 0; i < WORDS_448; ++i) {
286	14.6M	CT::conditional_swap(b, m_x[i], other.m_x[i]);
287	14.6M	}
288	2.09M	}
289
290	219k	void Gf448Elem::ct_cond_assign(bool b, const Gf448Elem& other) {
291	219k	CT::conditional_assign_mem(static_cast<uint64_t>(b), m_x.data(), other.m_x.data(), WORDS_448);
292	219k	}
293
294	4.62M	Gf448Elem Gf448Elem::operator+(const Gf448Elem& other) const {
295	4.62M	Gf448Elem res(0);
296	4.62M	gf_add(res.m_x, m_x, other.m_x);
297	4.62M	return res;
298	4.62M	}
299
300	4.69M	Gf448Elem Gf448Elem::operator-(const Gf448Elem& other) const {
301	4.69M	Gf448Elem res(0);
302	4.69M	gf_sub(res.m_x, m_x, other.m_x);
303	4.69M	return res;
304	4.69M	}
305
306	73.1k	Gf448Elem Gf448Elem::operator-() const {
307	73.1k	Gf448Elem res(0);
308	73.1k	gf_sub(res.m_x, res.m_x, m_x);
309	73.1k	return res;
310	73.1k	}
311
312	7.29M	Gf448Elem Gf448Elem::operator*(const Gf448Elem& other) const {
313	7.29M	Gf448Elem res(0);
314	7.29M	gf_mul(res.m_x, m_x, other.m_x);
315	7.29M	return res;
316	7.29M	}
317
318	2.66k	Gf448Elem Gf448Elem::operator/(const Gf448Elem& other) const {
319	2.66k	Gf448Elem res(0);
320	2.66k	gf_inv(res.m_x, other.m_x);
321	2.66k	gf_mul(res.m_x, m_x, res.m_x);
322	2.66k	return res;
323	2.66k	}
324
325	0	bool Gf448Elem::operator==(const Gf448Elem& other) const {
326	0	const auto canonical_form_this = to_canonical(m_x);
327	0	const auto canonical_form_other = to_canonical(other.m_x);
328	0	return CT::is_equal(canonical_form_this.data(), canonical_form_other.data(), WORDS_448).as_bool();
329	0	}
330
331	0	bool Gf448Elem::is_zero() const {
332	0	const auto canonical_form = to_canonical(m_x);
333
334	0	return CT::all_zeros(canonical_form.data(), WORDS_448).as_bool();
335	0	}
336
337	164	bool Gf448Elem::is_odd() const {
338	164	const auto canonical_form = to_canonical(m_x);
339	164	return (canonical_form[0] & 1) == 1;
340	164	}
341
342	0	bool Gf448Elem::bytes_are_canonical_representation(std::span<const uint8_t, BYTES_448> x) {
343	0	const auto x_words = load_le<std::array<uint64_t, WORDS_448>>(x);
344	0	const auto x_words_canonical = to_canonical(x_words);
345	0	return CT::is_equal(x_words.data(), x_words_canonical.data(), WORDS_448).as_bool();
346	0	}
347
348	4.54M	Gf448Elem square(const Gf448Elem& elem) {
349	4.54M	Gf448Elem res(0);
350	4.54M	gf_square(res.words(), elem.words());
351	4.54M	return res;
352	4.54M	}
353
354	0	Gf448Elem root(const Gf448Elem& elem) {
355	0	Gf448Elem res(1);
356
357		// (P-3)/4 is an 445 bit integer with one zero bits at 222. All others are one.
358	0	for(int16_t t = 445; t >= 0; --t) {
359	0	gf_square(res.words(), res.words());
360	0	if(t != 222) {
361	0	gf_mul(res.words(), res.words(), elem.words());
362	0	}
363	0	}
364
365	0	return res;
366	0	}
367
368		} // namespace Botan