Coverage Report

Created: 2026-05-19 06:33

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/nss/lib/freebl/sha256-x86.c
Line
Count
Source
1
/* This Source Code Form is subject to the terms of the Mozilla Public
2
 * License, v. 2.0. If a copy of the MPL was not distributed with this
3
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5
#ifdef USE_HW_SHA2
6
7
#include <immintrin.h>
8
9
#ifdef FREEBL_NO_DEPEND
10
#include "stubs.h"
11
#endif
12
13
#include "blapii.h"
14
#include "prcpucfg.h"
15
#include "prtypes.h" /* for PRUintXX */
16
#include "prlong.h"
17
#include "blapi.h"
18
#include "sha256.h"
19
20
/* SHA-256 constants, K256. */
21
pre_align static const PRUint32 K256[64] post_align = {
22
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
23
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
24
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
25
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
26
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
27
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
28
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
29
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
30
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
31
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
32
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
33
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
34
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
35
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
36
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
37
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
38
};
39
40
/*
 * ROUND(n, a, b, c, d): advance the hash by four SHA-256 rounds and, for
 * n < 12, replace `a` with the next four message-schedule words.
 *
 * The macro is not self-contained. The enclosing scope must provide:
 *   - w0, w1: the two __m128i working-state vectors (updated in place);
 *   - k<n>:   the __m128i holding the four round constants for this group
 *             (the name is formed by token-pasting k##n).
 *
 * a, b, c, d are the four __m128i message vectors, passed in rotating
 * order by the callers; only `a` is written. The low two words of
 * (a + k<n>) feed the first _mm_sha256rnds2_epu32, and the 0x0e shuffle
 * moves the high two words down for the second.
 *
 * The expansion is a bare brace block, so call sites are written without
 * a trailing semicolon.
 */
#define ROUND(n, a, b, c, d)                                            \
    {                                                                   \
        const __m128i wk_lo = _mm_add_epi32(a, k##n);                   \
        const __m128i wk_hi = _mm_shuffle_epi32(wk_lo, 0x0e);           \
        w1 = _mm_sha256rnds2_epu32(w1, w0, wk_lo);                      \
        w0 = _mm_sha256rnds2_epu32(w0, w1, wk_hi);                      \
        if ((n) < 12) {                                                 \
            a = _mm_sha256msg2_epu32(                                   \
                _mm_add_epi32(_mm_sha256msg1_epu32(a, b),               \
                              _mm_alignr_epi8(d, c, 4)),                \
                d);                                                     \
        }                                                               \
    }
52
53
void
54
SHA256_Compress_Native(SHA256Context *ctx)
55
9.45M
{
56
9.45M
    __m128i h0, h1, th;
57
9.45M
    __m128i a, b, c, d;
58
9.45M
    __m128i w0, w1;
59
9.45M
    const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
60
61
9.45M
    const __m128i *K = (__m128i *)K256;
62
9.45M
    const __m128i k0 = _mm_load_si128(K);
63
9.45M
    const __m128i k1 = _mm_load_si128(K + 1);
64
9.45M
    const __m128i k2 = _mm_load_si128(K + 2);
65
9.45M
    const __m128i k3 = _mm_load_si128(K + 3);
66
9.45M
    const __m128i k4 = _mm_load_si128(K + 4);
67
9.45M
    const __m128i k5 = _mm_load_si128(K + 5);
68
9.45M
    const __m128i k6 = _mm_load_si128(K + 6);
69
9.45M
    const __m128i k7 = _mm_load_si128(K + 7);
70
9.45M
    const __m128i k8 = _mm_load_si128(K + 8);
71
9.45M
    const __m128i k9 = _mm_load_si128(K + 9);
72
9.45M
    const __m128i k10 = _mm_load_si128(K + 10);
73
9.45M
    const __m128i k11 = _mm_load_si128(K + 11);
74
9.45M
    const __m128i k12 = _mm_load_si128(K + 12);
75
9.45M
    const __m128i k13 = _mm_load_si128(K + 13);
76
9.45M
    const __m128i k14 = _mm_load_si128(K + 14);
77
9.45M
    const __m128i k15 = _mm_load_si128(K + 15);
78
79
9.45M
    const __m128i *input = (__m128i *)ctx->u.b;
80
81
9.45M
    h0 = _mm_loadu_si128((__m128i *)(ctx->h));
82
9.45M
    h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));
83
84
    /* H0123:4567 -> H01256:H2367 */
85
9.45M
    th = _mm_shuffle_epi32(h0, 0xb1);
86
9.45M
    h1 = _mm_shuffle_epi32(h1, 0x1b);
87
9.45M
    h0 = _mm_alignr_epi8(th, h1, 8);
88
9.45M
    h1 = _mm_blend_epi16(h1, th, 0xf0);
89
90
9.45M
    a = _mm_shuffle_epi8(_mm_loadu_si128(input), shuffle);
91
9.45M
    b = _mm_shuffle_epi8(_mm_loadu_si128(input + 1), shuffle);
92
9.45M
    c = _mm_shuffle_epi8(_mm_loadu_si128(input + 2), shuffle);
93
9.45M
    d = _mm_shuffle_epi8(_mm_loadu_si128(input + 3), shuffle);
94
95
9.45M
    w0 = h0;
96
9.45M
    w1 = h1;
97
98
9.45M
    ROUND(0, a, b, c, d)
99
9.45M
    ROUND(1, b, c, d, a)
100
9.45M
    ROUND(2, c, d, a, b)
101
9.45M
    ROUND(3, d, a, b, c)
102
9.45M
    ROUND(4, a, b, c, d)
103
9.45M
    ROUND(5, b, c, d, a)
104
9.45M
    ROUND(6, c, d, a, b)
105
9.45M
    ROUND(7, d, a, b, c)
106
9.45M
    ROUND(8, a, b, c, d)
107
9.45M
    ROUND(9, b, c, d, a)
108
9.45M
    ROUND(10, c, d, a, b)
109
9.45M
    ROUND(11, d, a, b, c)
110
9.45M
    ROUND(12, a, b, c, d)
111
9.45M
    ROUND(13, b, c, d, a)
112
9.45M
    ROUND(14, c, d, a, b)
113
9.45M
    ROUND(15, d, a, b, c)
114
115
9.45M
    h0 = _mm_add_epi32(h0, w0);
116
9.45M
    h1 = _mm_add_epi32(h1, w1);
117
118
    /* H0145:2367 -> H0123:4567 */
119
9.45M
    th = _mm_shuffle_epi32(h0, 0x1b);
120
9.45M
    h1 = _mm_shuffle_epi32(h1, 0xb1);
121
9.45M
    h0 = _mm_blend_epi16(th, h1, 0xf0);
122
9.45M
    h1 = _mm_alignr_epi8(h1, th, 8);
123
124
9.45M
    _mm_storeu_si128((__m128i *)ctx->h, h0);
125
9.45M
    _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);
126
9.45M
}
127
128
void
129
SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input,
130
                     unsigned int inputLen)
131
28.4M
{
132
28.4M
    __m128i h0, h1, th;
133
28.4M
    const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
134
135
28.4M
    const __m128i *K = (__m128i *)K256;
136
28.4M
    const __m128i k0 = _mm_load_si128(K);
137
28.4M
    const __m128i k1 = _mm_load_si128(K + 1);
138
28.4M
    const __m128i k2 = _mm_load_si128(K + 2);
139
28.4M
    const __m128i k3 = _mm_load_si128(K + 3);
140
28.4M
    const __m128i k4 = _mm_load_si128(K + 4);
141
28.4M
    const __m128i k5 = _mm_load_si128(K + 5);
142
28.4M
    const __m128i k6 = _mm_load_si128(K + 6);
143
28.4M
    const __m128i k7 = _mm_load_si128(K + 7);
144
28.4M
    const __m128i k8 = _mm_load_si128(K + 8);
145
28.4M
    const __m128i k9 = _mm_load_si128(K + 9);
146
28.4M
    const __m128i k10 = _mm_load_si128(K + 10);
147
28.4M
    const __m128i k11 = _mm_load_si128(K + 11);
148
28.4M
    const __m128i k12 = _mm_load_si128(K + 12);
149
28.4M
    const __m128i k13 = _mm_load_si128(K + 13);
150
28.4M
    const __m128i k14 = _mm_load_si128(K + 14);
151
28.4M
    const __m128i k15 = _mm_load_si128(K + 15);
152
153
28.4M
    unsigned int inBuf = ctx->sizeLo & 0x3f;
154
28.4M
    if (!inputLen) {
155
90.4k
        return;
156
90.4k
    }
157
158
    /* Add inputLen into the count of bytes processed, before processing */
159
28.3M
    if ((ctx->sizeLo += inputLen) < inputLen) {
160
0
        ctx->sizeHi++;
161
0
    }
162
163
    /* if data already in buffer, attempt to fill rest of buffer */
164
28.3M
    if (inBuf) {
165
11.1M
        unsigned int todo = SHA256_BLOCK_LENGTH - inBuf;
166
11.1M
        if (inputLen < todo) {
167
10.4M
            todo = inputLen;
168
10.4M
        }
169
11.1M
        memcpy(ctx->u.b + inBuf, input, todo);
170
11.1M
        input += todo;
171
11.1M
        inputLen -= todo;
172
11.1M
        if (inBuf + todo == SHA256_BLOCK_LENGTH) {
173
752k
            SHA256_Compress_Native(ctx);
174
752k
        }
175
11.1M
    }
176
177
28.3M
    h0 = _mm_loadu_si128((__m128i *)(ctx->h));
178
28.3M
    h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));
179
180
    /* H0123:4567 -> H01256:H2367 */
181
28.3M
    th = _mm_shuffle_epi32(h0, 0xb1);
182
28.3M
    h1 = _mm_shuffle_epi32(h1, 0x1b);
183
28.3M
    h0 = _mm_alignr_epi8(th, h1, 8);
184
28.3M
    h1 = _mm_blend_epi16(h1, th, 0xf0);
185
186
    /* if enough data to fill one or more whole buffers, process them. */
187
40.8M
    while (inputLen >= SHA256_BLOCK_LENGTH) {
188
12.4M
        __m128i a, b, c, d;
189
12.4M
        __m128i w0, w1;
190
12.4M
        a = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)input), shuffle);
191
12.4M
        b = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 16)), shuffle);
192
12.4M
        c = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 32)), shuffle);
193
12.4M
        d = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 48)), shuffle);
194
12.4M
        input += SHA256_BLOCK_LENGTH;
195
12.4M
        inputLen -= SHA256_BLOCK_LENGTH;
196
197
12.4M
        w0 = h0;
198
12.4M
        w1 = h1;
199
200
12.4M
        ROUND(0, a, b, c, d)
201
12.4M
        ROUND(1, b, c, d, a)
202
12.4M
        ROUND(2, c, d, a, b)
203
12.4M
        ROUND(3, d, a, b, c)
204
12.4M
        ROUND(4, a, b, c, d)
205
12.4M
        ROUND(5, b, c, d, a)
206
12.4M
        ROUND(6, c, d, a, b)
207
12.4M
        ROUND(7, d, a, b, c)
208
12.4M
        ROUND(8, a, b, c, d)
209
12.4M
        ROUND(9, b, c, d, a)
210
12.4M
        ROUND(10, c, d, a, b)
211
12.4M
        ROUND(11, d, a, b, c)
212
12.4M
        ROUND(12, a, b, c, d)
213
12.4M
        ROUND(13, b, c, d, a)
214
12.4M
        ROUND(14, c, d, a, b)
215
12.4M
        ROUND(15, d, a, b, c)
216
217
12.4M
        h0 = _mm_add_epi32(h0, w0);
218
12.4M
        h1 = _mm_add_epi32(h1, w1);
219
12.4M
    }
220
221
    // H01234567 -> H01256 and H2367
222
28.3M
    th = _mm_shuffle_epi32(h0, 0x1b);
223
28.3M
    h1 = _mm_shuffle_epi32(h1, 0xb1);
224
28.3M
    h0 = _mm_blend_epi16(th, h1, 0xf0);
225
28.3M
    h1 = _mm_alignr_epi8(h1, th, 8);
226
227
28.3M
    _mm_storeu_si128((__m128i *)ctx->h, h0);
228
28.3M
    _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);
229
230
    /* if data left over, fill it into buffer */
231
28.3M
    if (inputLen) {
232
9.59M
        memcpy(ctx->u.b, input, inputLen);
233
9.59M
    }
234
28.3M
}
235
236
#endif /* USE_HW_SHA2 */