/src/BearSSL/src/symcipher/chacha20_sse2.c

Source
/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_ENABLE_INTRINSICS   1
#include "inner.h"

#if BR_SSE2

/*
 * This file contains a ChaCha20 implementation that leverages SSE2
 * opcodes for better performance.
 */

/* see bearssl_block.h */
br_chacha20_run
br_chacha20_sse2_get(void)
{
#if BR_amd64
  /*
   * 64-bit mode: SSE2 opcodes are part of the ABI, so they are
   * assumed to be available and no runtime test is performed.
   */
  return &br_chacha20_sse2_run;
#else
  /*
   * 32-bit mode: SSE2 must be detected with CPUID, which reports
   * it through bit 26 of EDX (mask 0x04000000). When the test
   * fails, a null pointer is returned instead of the SSE2
   * implementation.
   * NOTE(review): the fourth argument of br_cpuid() is presumably
   * the mask applied to EDX -- confirm against its definition.
   */
  return br_cpuid(0, 0, 0, 0x04000000) ? &br_chacha20_sse2_run : 0;
#endif
}

BR_TARGETS_X86_UP

/* see bearssl_block.h */
BR_TARGET("sse2")
uint32_t
br_chacha20_sse2_run(const void *key,
  const void *iv, uint32_t cc, void *data, size_t len)
{
  /*
   * ChaCha20 with SSE2 opcodes. The keystream is produced one
   * 64-byte block at a time and XORed in place into data[] (len
   * bytes). The key is read as 32 bytes and the IV as 12 bytes;
   * cc is the 32-bit counter value used for the first block.
   *
   * Returned value is the block counter after processing, i.e. cc
   * plus the number of generated blocks (a trailing partial block
   * counts as one), modulo 2^32.
   *
   * The 4x4 matrix of 32-bit state words is kept as four 128-bit
   * rows, so each vector operation applies the same quarter-round
   * step to the four columns simultaneously.
   */
  const unsigned char *kb;
  unsigned char *out;
  uint32_t ctr_nonce[4];
  __m128i sigma, key_lo, key_hi, last_row, ctr_step;

  kb = key;
  out = data;

  /*
   * Row 0: the constant words ("expand 32-byte k" in little-endian
   * encoding). _mm_set_epi32() takes the highest lane first.
   */
  sigma = _mm_set_epi32(
    0x6b206574, 0x79622d32, 0x3320646e, 0x61707865);

  /*
   * Rows 1 and 2: the two halves of the key.
   */
  key_lo = _mm_loadu_si128((const void *)kb);
  key_hi = _mm_loadu_si128((const void *)(kb + 16));

  /*
   * Row 3: block counter in lane 0, followed by the 12 IV bytes.
   */
  ctr_nonce[0] = cc;
  memcpy(&ctr_nonce[1], iv, 12);
  last_row = _mm_loadu_si128((const void *)ctr_nonce);

  /*
   * Adding this value increments lane 0 only; since additions are
   * per-lane, a counter wrap-around does not spill into the IV.
   */
  ctr_step = _mm_set_epi32(0, 0, 0, 1);

  /*
   * These are macros rather than helper functions so that the
   * intrinsics are expanded inside this function.
   */
#define SSE2_ROTL(v, n) \
  _mm_or_si128(_mm_slli_epi32((v), (n)), _mm_srli_epi32((v), 32 - (n)))

#define SSE2_QROUND(a, b, c, d)   do { \
    (a) = _mm_add_epi32((a), (b)); \
    (d) = _mm_xor_si128((d), (a)); \
    (d) = SSE2_ROTL((d), 16); \
    (c) = _mm_add_epi32((c), (d)); \
    (b) = _mm_xor_si128((b), (c)); \
    (b) = SSE2_ROTL((b), 12); \
    (a) = _mm_add_epi32((a), (b)); \
    (d) = _mm_xor_si128((d), (a)); \
    (d) = SSE2_ROTL((d), 8); \
    (c) = _mm_add_epi32((c), (d)); \
    (b) = _mm_xor_si128((b), (c)); \
    (b) = SSE2_ROTL((b), 7); \
  } while (0)

  while (len > 0) {
    /*
     * rj holds state words 4*j to 4*j+3.
     */
    __m128i r0, r1, r2, r3;
    int dr;

    r0 = sigma;
    r1 = key_lo;
    r2 = key_hi;
    r3 = last_row;

    /*
     * Ten double rounds.
     */
    for (dr = 10; dr > 0; dr --) {
      /*
       * First round of the pair: the quarter rounds
       * work directly on the four columns.
       */
      SSE2_QROUND(r0, r1, r2, r3);

      /*
       * Second round of the pair: rows 1, 2 and 3 are
       * rotated by one, two and three lanes so that
       * the words to combine end up in the same lane.
       */
      r1 = _mm_shuffle_epi32(r1, 0x39);
      r2 = _mm_shuffle_epi32(r2, 0x4E);
      r3 = _mm_shuffle_epi32(r3, 0x93);

      SSE2_QROUND(r0, r1, r2, r3);

      /*
       * Inverse lane rotations, to restore the layout
       * expected by the next iteration.
       */
      r1 = _mm_shuffle_epi32(r1, 0x93);
      r2 = _mm_shuffle_epi32(r2, 0x4E);
      r3 = _mm_shuffle_epi32(r3, 0x39);
    }

    /*
     * Feed-forward: add the input state to get the keystream
     * block.
     */
    r0 = _mm_add_epi32(r0, sigma);
    r1 = _mm_add_epi32(r1, key_lo);
    r2 = _mm_add_epi32(r2, key_hi);
    r3 = _mm_add_epi32(r3, last_row);

    /*
     * The counter is advanced for every generated block,
     * including a final partial one.
     */
    last_row = _mm_add_epi32(last_row, ctr_step);

    if (len >= 64) {
      /*
       * Full block: XOR 16 bytes at a time, directly in
       * the (possibly unaligned) data buffer.
       */
      _mm_storeu_si128((void *)(out +  0), _mm_xor_si128(
        _mm_loadu_si128((const void *)(out +  0)), r0));
      _mm_storeu_si128((void *)(out + 16), _mm_xor_si128(
        _mm_loadu_si128((const void *)(out + 16)), r1));
      _mm_storeu_si128((void *)(out + 32), _mm_xor_si128(
        _mm_loadu_si128((const void *)(out + 32)), r2));
      _mm_storeu_si128((void *)(out + 48), _mm_xor_si128(
        _mm_loadu_si128((const void *)(out + 48)), r3));
      out += 64;
      len -= 64;
    } else {
      /*
       * Trailing partial block: the keystream is written
       * to a temporary buffer, and only the first len
       * bytes are applied. This is the last block.
       */
      unsigned char ks[64];
      size_t k;

      _mm_storeu_si128((void *)(ks +  0), r0);
      _mm_storeu_si128((void *)(ks + 16), r1);
      _mm_storeu_si128((void *)(ks + 32), r2);
      _mm_storeu_si128((void *)(ks + 48), r3);
      for (k = 0; k < len; k ++) {
        out[k] ^= ks[k];
      }
      break;
    }
  }

#undef SSE2_QROUND
#undef SSE2_ROTL

  /*
   * The updated counter is in lane 0. _mm_cvtsi128_si32() reads
   * that lane and needs only SSE2 (unlike _mm_extract_epi32(),
   * which requires SSE4.1).
   */
  return (uint32_t)_mm_cvtsi128_si32(last_row);
}

BR_TARGETS_X86_DOWN

#else

/* see bearssl_block.h */
br_chacha20_run
br_chacha20_sse2_get(void)
{
  return 0;
}

#endif

Coverage Report

Created: 2026-06-08 07:04

Line	Count	Source
1		/*
2		* Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3		*
4		* Permission is hereby granted, free of charge, to any person obtaining
5		* a copy of this software and associated documentation files (the
6		* "Software"), to deal in the Software without restriction, including
7		* without limitation the rights to use, copy, modify, merge, publish,
8		* distribute, sublicense, and/or sell copies of the Software, and to
9		* permit persons to whom the Software is furnished to do so, subject to
10		* the following conditions:
11		*
12		* The above copyright notice and this permission notice shall be
13		* included in all copies or substantial portions of the Software.
14		*
15		* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16		* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17		* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18		* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19		* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20		* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21		* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22		* SOFTWARE.
23		*/
24
25		#define BR_ENABLE_INTRINSICS 1
26		#include "inner.h"
27
28		#if BR_SSE2
29
30		/*
31		* This file contains a ChaCha20 implementation that leverages SSE2
32		* opcodes for better performance.
33		*/
34
35		/* see bearssl_block.h */
36		br_chacha20_run
37		br_chacha20_sse2_get(void)
38	177	{
39		/*
40		* If using 64-bit mode, then SSE2 opcodes should be automatically
41		* available, since they are part of the ABI.
42		*
43		* In 32-bit mode, we use CPUID to detect the SSE2 feature.
44		*/
45
46	177	#if BR_amd64
47	177	return &br_chacha20_sse2_run;
48		#else
49
50		/*
51		* SSE2 support is indicated by bit 26 in EDX.
52		*/
53		if (br_cpuid(0, 0, 0, 0x04000000)) {
54		return &br_chacha20_sse2_run;
55		} else {
56		return 0;
57		}
58		#endif
59	177	}
60
61		BR_TARGETS_X86_UP
62
63		/* see bearssl_block.h */
64		BR_TARGET("sse2")
65		uint32_t
66		br_chacha20_sse2_run(const void *key,
67		const void *iv, uint32_t cc, void *data, size_t len)
68	244	{
69	244	unsigned char *buf;
70	244	uint32_t ivtmp[4];
71	244	__m128i kw0, kw1;
72	244	__m128i iw, cw;
73	244	__m128i one;
74
75	244	static const uint32_t CW[] = {
76	244	0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
77	244	};
78
79	244	buf = data;
80	244	kw0 = _mm_loadu_si128(key);
81	244	kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16));
82	244	ivtmp[0] = cc;
83	244	memcpy(ivtmp + 1, iv, 12);
84	244	iw = _mm_loadu_si128((const void *)ivtmp);
85	244	cw = _mm_loadu_si128((const void *)CW);
86	244	one = _mm_set_epi32(0, 0, 0, 1);
87
88	1.61k	while (len > 0) {
89		/*
90		* sj contains state words 4*j to 4*j+3.
91		*/
92	1.52k	__m128i s0, s1, s2, s3;
93	1.52k	int i;
94
95	1.52k	s0 = cw;
96	1.52k	s1 = kw0;
97	1.52k	s2 = kw1;
98	1.52k	s3 = iw;
99	16.8k	for (i = 0; i < 10; i ++) {
100		/*
101		* Even round is straightforward application on
102		* the state words.
103		*/
104	15.2k	s0 = _mm_add_epi32(s0, s1);
105	15.2k	s3 = _mm_xor_si128(s3, s0);
106	15.2k	s3 = _mm_or_si128(
107	15.2k	_mm_slli_epi32(s3, 16),
108	15.2k	_mm_srli_epi32(s3, 16));
109
110	15.2k	s2 = _mm_add_epi32(s2, s3);
111	15.2k	s1 = _mm_xor_si128(s1, s2);
112	15.2k	s1 = _mm_or_si128(
113	15.2k	_mm_slli_epi32(s1, 12),
114	15.2k	_mm_srli_epi32(s1, 20));
115
116	15.2k	s0 = _mm_add_epi32(s0, s1);
117	15.2k	s3 = _mm_xor_si128(s3, s0);
118	15.2k	s3 = _mm_or_si128(
119	15.2k	_mm_slli_epi32(s3, 8),
120	15.2k	_mm_srli_epi32(s3, 24));
121
122	15.2k	s2 = _mm_add_epi32(s2, s3);
123	15.2k	s1 = _mm_xor_si128(s1, s2);
124	15.2k	s1 = _mm_or_si128(
125	15.2k	_mm_slli_epi32(s1, 7),
126	15.2k	_mm_srli_epi32(s1, 25));
127
128		/*
129		* For the odd round, we must rotate some state
130		* words so that the computations apply on the
131		* right combinations of words.
132		*/
133	15.2k	s1 = _mm_shuffle_epi32(s1, 0x39);
134	15.2k	s2 = _mm_shuffle_epi32(s2, 0x4E);
135	15.2k	s3 = _mm_shuffle_epi32(s3, 0x93);
136
137	15.2k	s0 = _mm_add_epi32(s0, s1);
138	15.2k	s3 = _mm_xor_si128(s3, s0);
139	15.2k	s3 = _mm_or_si128(
140	15.2k	_mm_slli_epi32(s3, 16),
141	15.2k	_mm_srli_epi32(s3, 16));
142
143	15.2k	s2 = _mm_add_epi32(s2, s3);
144	15.2k	s1 = _mm_xor_si128(s1, s2);
145	15.2k	s1 = _mm_or_si128(
146	15.2k	_mm_slli_epi32(s1, 12),
147	15.2k	_mm_srli_epi32(s1, 20));
148
149	15.2k	s0 = _mm_add_epi32(s0, s1);
150	15.2k	s3 = _mm_xor_si128(s3, s0);
151	15.2k	s3 = _mm_or_si128(
152	15.2k	_mm_slli_epi32(s3, 8),
153	15.2k	_mm_srli_epi32(s3, 24));
154
155	15.2k	s2 = _mm_add_epi32(s2, s3);
156	15.2k	s1 = _mm_xor_si128(s1, s2);
157	15.2k	s1 = _mm_or_si128(
158	15.2k	_mm_slli_epi32(s1, 7),
159	15.2k	_mm_srli_epi32(s1, 25));
160
161		/*
162		* After the odd round, we rotate back the values
163		* to undo the rotate at the start of the odd round.
164		*/
165	15.2k	s1 = _mm_shuffle_epi32(s1, 0x93);
166	15.2k	s2 = _mm_shuffle_epi32(s2, 0x4E);
167	15.2k	s3 = _mm_shuffle_epi32(s3, 0x39);
168	15.2k	}
169
170		/*
171		* Addition with the initial state.
172		*/
173	1.52k	s0 = _mm_add_epi32(s0, cw);
174	1.52k	s1 = _mm_add_epi32(s1, kw0);
175	1.52k	s2 = _mm_add_epi32(s2, kw1);
176	1.52k	s3 = _mm_add_epi32(s3, iw);
177
178		/*
179		* Increment block counter.
180		*/
181	1.52k	iw = _mm_add_epi32(iw, one);
182
183		/*
184		* XOR final state with the data.
185		*/
186	1.52k	if (len < 64) {
187	158	unsigned char tmp[64];
188	158	size_t u;
189
190	158	_mm_storeu_si128((void *)(tmp + 0), s0);
191	158	_mm_storeu_si128((void *)(tmp + 16), s1);
192	158	_mm_storeu_si128((void *)(tmp + 32), s2);
193	158	_mm_storeu_si128((void *)(tmp + 48), s3);
194	3.58k	for (u = 0; u < len; u ++) {
195	3.42k	buf[u] ^= tmp[u];
196	3.42k	}
197	158	break;
198	1.37k	} else {
199	1.37k	__m128i b0, b1, b2, b3;
200
201	1.37k	b0 = _mm_loadu_si128((const void *)(buf + 0));
202	1.37k	b1 = _mm_loadu_si128((const void *)(buf + 16));
203	1.37k	b2 = _mm_loadu_si128((const void *)(buf + 32));
204	1.37k	b3 = _mm_loadu_si128((const void *)(buf + 48));
205	1.37k	b0 = _mm_xor_si128(b0, s0);
206	1.37k	b1 = _mm_xor_si128(b1, s1);
207	1.37k	b2 = _mm_xor_si128(b2, s2);
208	1.37k	b3 = _mm_xor_si128(b3, s3);
209	1.37k	_mm_storeu_si128((void *)(buf + 0), b0);
210	1.37k	_mm_storeu_si128((void *)(buf + 16), b1);
211	1.37k	_mm_storeu_si128((void *)(buf + 32), b2);
212	1.37k	_mm_storeu_si128((void *)(buf + 48), b3);
213	1.37k	buf += 64;
214	1.37k	len -= 64;
215	1.37k	}
216	1.52k	}
217
218		/*
219		* _mm_extract_epi32() requires SSE4.1. We prefer to stick to
220		* raw SSE2, thus we use _mm_extract_epi16().
221		*/
222	244	return (uint32_t)_mm_extract_epi16(iw, 0)
223		| ((uint32_t)_mm_extract_epi16(iw, 1) << 16);
224	244	}
225
226		BR_TARGETS_X86_DOWN
227
228		#else
229
230		/* see bearssl_block.h */
231		br_chacha20_run
232		br_chacha20_sse2_get(void)
233		{
234		return 0;
235		}
236
237		#endif