Coverage Report

Created: 2026-06-08 07:04

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/BearSSL/src/symcipher/chacha20_sse2.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining 
5
 * a copy of this software and associated documentation files (the
6
 * "Software"), to deal in the Software without restriction, including
7
 * without limitation the rights to use, copy, modify, merge, publish,
8
 * distribute, sublicense, and/or sell copies of the Software, and to
9
 * permit persons to whom the Software is furnished to do so, subject to
10
 * the following conditions:
11
 *
12
 * The above copyright notice and this permission notice shall be 
13
 * included in all copies or substantial portions of the Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
16
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
18
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
 * SOFTWARE.
23
 */
24
25
#define BR_ENABLE_INTRINSICS   1
26
#include "inner.h"
27
28
#if BR_SSE2
29
30
/*
31
 * This file contains a ChaCha20 implementation that leverages SSE2
32
 * opcodes for better performance.
33
 */
34
35
/* see bearssl_block.h */
36
br_chacha20_run
37
br_chacha20_sse2_get(void)
38
177
{
39
  /*
40
   * If using 64-bit mode, then SSE2 opcodes should be automatically
41
   * available, since they are part of the ABI.
42
   *
43
   * In 32-bit mode, we use CPUID to detect the SSE2 feature.
44
   */
45
46
177
#if BR_amd64
47
177
  return &br_chacha20_sse2_run;
48
#else
49
50
  /*
51
   * SSE2 support is indicated by bit 26 in EDX.
52
   */
53
  if (br_cpuid(0, 0, 0, 0x04000000)) {
54
    return &br_chacha20_sse2_run;
55
  } else {
56
    return 0;
57
  }
58
#endif
59
177
}
60
61
BR_TARGETS_X86_UP
62
63
/* see bearssl_block.h */
64
BR_TARGET("sse2")
65
uint32_t
66
br_chacha20_sse2_run(const void *key,
67
  const void *iv, uint32_t cc, void *data, size_t len)
68
244
{
69
244
  unsigned char *buf;
70
244
  uint32_t ivtmp[4];
71
244
  __m128i kw0, kw1;
72
244
  __m128i iw, cw;
73
244
  __m128i one;
74
75
244
  static const uint32_t CW[] = {
76
244
    0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
77
244
  };
78
79
244
  buf = data;
80
244
  kw0 = _mm_loadu_si128(key);
81
244
  kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16));
82
244
  ivtmp[0] = cc;
83
244
  memcpy(ivtmp + 1, iv, 12);
84
244
  iw = _mm_loadu_si128((const void *)ivtmp);
85
244
  cw = _mm_loadu_si128((const void *)CW);
86
244
  one = _mm_set_epi32(0, 0, 0, 1);
87
88
1.61k
  while (len > 0) {
89
    /*
90
     * sj contains state words 4*j to 4*j+3.
91
     */
92
1.52k
    __m128i s0, s1, s2, s3;
93
1.52k
    int i;
94
95
1.52k
    s0 = cw;
96
1.52k
    s1 = kw0;
97
1.52k
    s2 = kw1;
98
1.52k
    s3 = iw;
99
16.8k
    for (i = 0; i < 10; i ++) {
100
      /*
101
       * Even round is straightforward application on
102
       * the state words.
103
       */
104
15.2k
      s0 = _mm_add_epi32(s0, s1);
105
15.2k
      s3 = _mm_xor_si128(s3, s0);
106
15.2k
      s3 = _mm_or_si128(
107
15.2k
        _mm_slli_epi32(s3, 16),
108
15.2k
        _mm_srli_epi32(s3, 16));
109
110
15.2k
      s2 = _mm_add_epi32(s2, s3);
111
15.2k
      s1 = _mm_xor_si128(s1, s2);
112
15.2k
      s1 = _mm_or_si128(
113
15.2k
        _mm_slli_epi32(s1, 12),
114
15.2k
        _mm_srli_epi32(s1, 20));
115
116
15.2k
      s0 = _mm_add_epi32(s0, s1);
117
15.2k
      s3 = _mm_xor_si128(s3, s0);
118
15.2k
      s3 = _mm_or_si128(
119
15.2k
        _mm_slli_epi32(s3, 8),
120
15.2k
        _mm_srli_epi32(s3, 24));
121
122
15.2k
      s2 = _mm_add_epi32(s2, s3);
123
15.2k
      s1 = _mm_xor_si128(s1, s2);
124
15.2k
      s1 = _mm_or_si128(
125
15.2k
        _mm_slli_epi32(s1, 7),
126
15.2k
        _mm_srli_epi32(s1, 25));
127
128
      /*
129
       * For the odd round, we must rotate some state
130
       * words so that the computations apply on the
131
       * right combinations of words.
132
       */
133
15.2k
      s1 = _mm_shuffle_epi32(s1, 0x39);
134
15.2k
      s2 = _mm_shuffle_epi32(s2, 0x4E);
135
15.2k
      s3 = _mm_shuffle_epi32(s3, 0x93);
136
137
15.2k
      s0 = _mm_add_epi32(s0, s1);
138
15.2k
      s3 = _mm_xor_si128(s3, s0);
139
15.2k
      s3 = _mm_or_si128(
140
15.2k
        _mm_slli_epi32(s3, 16),
141
15.2k
        _mm_srli_epi32(s3, 16));
142
143
15.2k
      s2 = _mm_add_epi32(s2, s3);
144
15.2k
      s1 = _mm_xor_si128(s1, s2);
145
15.2k
      s1 = _mm_or_si128(
146
15.2k
        _mm_slli_epi32(s1, 12),
147
15.2k
        _mm_srli_epi32(s1, 20));
148
149
15.2k
      s0 = _mm_add_epi32(s0, s1);
150
15.2k
      s3 = _mm_xor_si128(s3, s0);
151
15.2k
      s3 = _mm_or_si128(
152
15.2k
        _mm_slli_epi32(s3, 8),
153
15.2k
        _mm_srli_epi32(s3, 24));
154
155
15.2k
      s2 = _mm_add_epi32(s2, s3);
156
15.2k
      s1 = _mm_xor_si128(s1, s2);
157
15.2k
      s1 = _mm_or_si128(
158
15.2k
        _mm_slli_epi32(s1, 7),
159
15.2k
        _mm_srli_epi32(s1, 25));
160
161
      /*
162
       * After the odd round, we rotate back the values
163
       * to undo the rotate at the start of the odd round.
164
       */
165
15.2k
      s1 = _mm_shuffle_epi32(s1, 0x93);
166
15.2k
      s2 = _mm_shuffle_epi32(s2, 0x4E);
167
15.2k
      s3 = _mm_shuffle_epi32(s3, 0x39);
168
15.2k
    }
169
170
    /*
171
     * Addition with the initial state.
172
     */
173
1.52k
    s0 = _mm_add_epi32(s0, cw);
174
1.52k
    s1 = _mm_add_epi32(s1, kw0);
175
1.52k
    s2 = _mm_add_epi32(s2, kw1);
176
1.52k
    s3 = _mm_add_epi32(s3, iw);
177
178
    /*
179
     * Increment block counter.
180
     */
181
1.52k
    iw = _mm_add_epi32(iw, one);
182
183
    /*
184
     * XOR final state with the data.
185
     */
186
1.52k
    if (len < 64) {
187
158
      unsigned char tmp[64];
188
158
      size_t u;
189
190
158
      _mm_storeu_si128((void *)(tmp +  0), s0);
191
158
      _mm_storeu_si128((void *)(tmp + 16), s1);
192
158
      _mm_storeu_si128((void *)(tmp + 32), s2);
193
158
      _mm_storeu_si128((void *)(tmp + 48), s3);
194
3.58k
      for (u = 0; u < len; u ++) {
195
3.42k
        buf[u] ^= tmp[u];
196
3.42k
      }
197
158
      break;
198
1.37k
    } else {
199
1.37k
      __m128i b0, b1, b2, b3;
200
201
1.37k
      b0 = _mm_loadu_si128((const void *)(buf +  0));
202
1.37k
      b1 = _mm_loadu_si128((const void *)(buf + 16));
203
1.37k
      b2 = _mm_loadu_si128((const void *)(buf + 32));
204
1.37k
      b3 = _mm_loadu_si128((const void *)(buf + 48));
205
1.37k
      b0 = _mm_xor_si128(b0, s0);
206
1.37k
      b1 = _mm_xor_si128(b1, s1);
207
1.37k
      b2 = _mm_xor_si128(b2, s2);
208
1.37k
      b3 = _mm_xor_si128(b3, s3);
209
1.37k
      _mm_storeu_si128((void *)(buf +  0), b0);
210
1.37k
      _mm_storeu_si128((void *)(buf + 16), b1);
211
1.37k
      _mm_storeu_si128((void *)(buf + 32), b2);
212
1.37k
      _mm_storeu_si128((void *)(buf + 48), b3);
213
1.37k
      buf += 64;
214
1.37k
      len -= 64;
215
1.37k
    }
216
1.52k
  }
217
218
  /*
219
   * _mm_extract_epi32() requires SSE4.1. We prefer to stick to
220
   * raw SSE2, thus we use _mm_extract_epi16().
221
   */
222
244
  return (uint32_t)_mm_extract_epi16(iw, 0)
223
    | ((uint32_t)_mm_extract_epi16(iw, 1) << 16);
224
244
}
225
226
BR_TARGETS_X86_DOWN
227
228
#else
229
230
/* see bearssl_block.h */
231
br_chacha20_run
232
br_chacha20_sse2_get(void)
233
{
234
  return 0;
235
}
236
237
#endif