Coverage Report

Created: 2025-09-17 07:05

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/nss/lib/freebl/sha256-x86.c
Line
Count
Source
1
/* This Source Code Form is subject to the terms of the Mozilla Public
2
 * License, v. 2.0. If a copy of the MPL was not distributed with this
3
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5
#ifdef USE_HW_SHA2
6
7
#include <immintrin.h>
8
9
#ifdef FREEBL_NO_DEPEND
10
#include "stubs.h"
11
#endif
12
13
#include "blapii.h"
14
#include "prcpucfg.h"
15
#include "prtypes.h" /* for PRUintXX */
16
#include "prlong.h"
17
#include "blapi.h"
18
#include "sha256.h"
19
20
/* SHA-256 constants, K256. */
21
pre_align static const PRUint32 K256[64] post_align = {
22
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
23
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
24
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
25
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
26
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
27
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
28
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
29
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
30
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
31
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
32
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
33
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
34
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
35
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
36
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
37
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
38
};
39
40
/*
 * ROUND(n, a, b, c, d): one step of the unrolled compression loop.
 *
 * Expects the caller to have `w0`, `w1` (working state) and `k<n>` (the n-th
 * vector of round constants) in scope.  The message vector `a` plus k<n> is
 * fed to two _mm_sha256rnds2_epu32 calls (two rounds each per the intrinsic's
 * documentation), the second one consuming the upper half of the sum.  The
 * calls alternate which of w0/w1 is updated.
 *
 * For n < 12, `a` is then overwritten with the next four message-schedule
 * words derived from a, b, c and d; the last four steps need no further
 * schedule words.  `n` must be a literal so that k##n names a variable; the
 * macro expands to a bare block and is invoked without a trailing semicolon.
 */
#define ROUND(n, a, b, c, d)                                              \
    {                                                                     \
        __m128i wk = _mm_add_epi32(a, k##n);                              \
        w1 = _mm_sha256rnds2_epu32(w1, w0, wk);                           \
        w0 = _mm_sha256rnds2_epu32(w0, w1, _mm_shuffle_epi32(wk, 0x0e));  \
        if (n < 12) {                                                     \
            __m128i sched = _mm_sha256msg1_epu32(a, b);                   \
            sched = _mm_add_epi32(sched, _mm_alignr_epi8(d, c, 4));       \
            a = _mm_sha256msg2_epu32(sched, d);                           \
        }                                                                 \
    }
52
53
void
54
SHA256_Compress_Native(SHA256Context *ctx)
55
4.00M
{
56
4.00M
    __m128i h0, h1, th;
57
4.00M
    __m128i a, b, c, d;
58
4.00M
    __m128i w0, w1;
59
4.00M
    const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
60
61
4.00M
    const __m128i *K = (__m128i *)K256;
62
4.00M
    const __m128i k0 = _mm_load_si128(K);
63
4.00M
    const __m128i k1 = _mm_load_si128(K + 1);
64
4.00M
    const __m128i k2 = _mm_load_si128(K + 2);
65
4.00M
    const __m128i k3 = _mm_load_si128(K + 3);
66
4.00M
    const __m128i k4 = _mm_load_si128(K + 4);
67
4.00M
    const __m128i k5 = _mm_load_si128(K + 5);
68
4.00M
    const __m128i k6 = _mm_load_si128(K + 6);
69
4.00M
    const __m128i k7 = _mm_load_si128(K + 7);
70
4.00M
    const __m128i k8 = _mm_load_si128(K + 8);
71
4.00M
    const __m128i k9 = _mm_load_si128(K + 9);
72
4.00M
    const __m128i k10 = _mm_load_si128(K + 10);
73
4.00M
    const __m128i k11 = _mm_load_si128(K + 11);
74
4.00M
    const __m128i k12 = _mm_load_si128(K + 12);
75
4.00M
    const __m128i k13 = _mm_load_si128(K + 13);
76
4.00M
    const __m128i k14 = _mm_load_si128(K + 14);
77
4.00M
    const __m128i k15 = _mm_load_si128(K + 15);
78
79
4.00M
    const __m128i *input = (__m128i *)ctx->u.b;
80
81
4.00M
    h0 = _mm_loadu_si128((__m128i *)(ctx->h));
82
4.00M
    h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));
83
84
    /* H0123:4567 -> H01256:H2367 */
85
4.00M
    th = _mm_shuffle_epi32(h0, 0xb1);
86
4.00M
    h1 = _mm_shuffle_epi32(h1, 0x1b);
87
4.00M
    h0 = _mm_alignr_epi8(th, h1, 8);
88
4.00M
    h1 = _mm_blend_epi16(h1, th, 0xf0);
89
90
4.00M
    a = _mm_shuffle_epi8(_mm_loadu_si128(input), shuffle);
91
4.00M
    b = _mm_shuffle_epi8(_mm_loadu_si128(input + 1), shuffle);
92
4.00M
    c = _mm_shuffle_epi8(_mm_loadu_si128(input + 2), shuffle);
93
4.00M
    d = _mm_shuffle_epi8(_mm_loadu_si128(input + 3), shuffle);
94
95
4.00M
    w0 = h0;
96
4.00M
    w1 = h1;
97
98
4.00M
    ROUND(0, a, b, c, d)
99
4.00M
    ROUND(1, b, c, d, a)
100
4.00M
    ROUND(2, c, d, a, b)
101
4.00M
    ROUND(3, d, a, b, c)
102
4.00M
    ROUND(4, a, b, c, d)
103
4.00M
    ROUND(5, b, c, d, a)
104
4.00M
    ROUND(6, c, d, a, b)
105
4.00M
    ROUND(7, d, a, b, c)
106
4.00M
    ROUND(8, a, b, c, d)
107
4.00M
    ROUND(9, b, c, d, a)
108
4.00M
    ROUND(10, c, d, a, b)
109
4.00M
    ROUND(11, d, a, b, c)
110
4.00M
    ROUND(12, a, b, c, d)
111
4.00M
    ROUND(13, b, c, d, a)
112
4.00M
    ROUND(14, c, d, a, b)
113
4.00M
    ROUND(15, d, a, b, c)
114
115
4.00M
    h0 = _mm_add_epi32(h0, w0);
116
4.00M
    h1 = _mm_add_epi32(h1, w1);
117
118
    /* H0145:2367 -> H0123:4567 */
119
4.00M
    th = _mm_shuffle_epi32(h0, 0x1b);
120
4.00M
    h1 = _mm_shuffle_epi32(h1, 0xb1);
121
4.00M
    h0 = _mm_blend_epi16(th, h1, 0xf0);
122
4.00M
    h1 = _mm_alignr_epi8(h1, th, 8);
123
124
4.00M
    _mm_storeu_si128((__m128i *)ctx->h, h0);
125
4.00M
    _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);
126
4.00M
}
127
128
void
129
SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input,
130
                     unsigned int inputLen)
131
13.4M
{
132
13.4M
    __m128i h0, h1, th;
133
13.4M
    const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
134
135
13.4M
    const __m128i *K = (__m128i *)K256;
136
13.4M
    const __m128i k0 = _mm_load_si128(K);
137
13.4M
    const __m128i k1 = _mm_load_si128(K + 1);
138
13.4M
    const __m128i k2 = _mm_load_si128(K + 2);
139
13.4M
    const __m128i k3 = _mm_load_si128(K + 3);
140
13.4M
    const __m128i k4 = _mm_load_si128(K + 4);
141
13.4M
    const __m128i k5 = _mm_load_si128(K + 5);
142
13.4M
    const __m128i k6 = _mm_load_si128(K + 6);
143
13.4M
    const __m128i k7 = _mm_load_si128(K + 7);
144
13.4M
    const __m128i k8 = _mm_load_si128(K + 8);
145
13.4M
    const __m128i k9 = _mm_load_si128(K + 9);
146
13.4M
    const __m128i k10 = _mm_load_si128(K + 10);
147
13.4M
    const __m128i k11 = _mm_load_si128(K + 11);
148
13.4M
    const __m128i k12 = _mm_load_si128(K + 12);
149
13.4M
    const __m128i k13 = _mm_load_si128(K + 13);
150
13.4M
    const __m128i k14 = _mm_load_si128(K + 14);
151
13.4M
    const __m128i k15 = _mm_load_si128(K + 15);
152
153
13.4M
    unsigned int inBuf = ctx->sizeLo & 0x3f;
154
13.4M
    if (!inputLen) {
155
77.1k
        return;
156
77.1k
    }
157
158
    /* Add inputLen into the count of bytes processed, before processing */
159
13.4M
    if ((ctx->sizeLo += inputLen) < inputLen) {
160
0
        ctx->sizeHi++;
161
0
    }
162
163
    /* if data already in buffer, attempt to fill rest of buffer */
164
13.4M
    if (inBuf) {
165
6.85M
        unsigned int todo = SHA256_BLOCK_LENGTH - inBuf;
166
6.85M
        if (inputLen < todo) {
167
6.11M
            todo = inputLen;
168
6.11M
        }
169
6.85M
        memcpy(ctx->u.b + inBuf, input, todo);
170
6.85M
        input += todo;
171
6.85M
        inputLen -= todo;
172
6.85M
        if (inBuf + todo == SHA256_BLOCK_LENGTH) {
173
740k
            SHA256_Compress_Native(ctx);
174
740k
        }
175
6.85M
    }
176
177
13.4M
    h0 = _mm_loadu_si128((__m128i *)(ctx->h));
178
13.4M
    h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));
179
180
    /* H0123:4567 -> H01256:H2367 */
181
13.4M
    th = _mm_shuffle_epi32(h0, 0xb1);
182
13.4M
    h1 = _mm_shuffle_epi32(h1, 0x1b);
183
13.4M
    h0 = _mm_alignr_epi8(th, h1, 8);
184
13.4M
    h1 = _mm_blend_epi16(h1, th, 0xf0);
185
186
    /* if enough data to fill one or more whole buffers, process them. */
187
20.0M
    while (inputLen >= SHA256_BLOCK_LENGTH) {
188
6.59M
        __m128i a, b, c, d;
189
6.59M
        __m128i w0, w1;
190
6.59M
        a = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)input), shuffle);
191
6.59M
        b = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 16)), shuffle);
192
6.59M
        c = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 32)), shuffle);
193
6.59M
        d = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 48)), shuffle);
194
6.59M
        input += SHA256_BLOCK_LENGTH;
195
6.59M
        inputLen -= SHA256_BLOCK_LENGTH;
196
197
6.59M
        w0 = h0;
198
6.59M
        w1 = h1;
199
200
6.59M
        ROUND(0, a, b, c, d)
201
6.59M
        ROUND(1, b, c, d, a)
202
6.59M
        ROUND(2, c, d, a, b)
203
6.59M
        ROUND(3, d, a, b, c)
204
6.59M
        ROUND(4, a, b, c, d)
205
6.59M
        ROUND(5, b, c, d, a)
206
6.59M
        ROUND(6, c, d, a, b)
207
6.59M
        ROUND(7, d, a, b, c)
208
6.59M
        ROUND(8, a, b, c, d)
209
6.59M
        ROUND(9, b, c, d, a)
210
6.59M
        ROUND(10, c, d, a, b)
211
6.59M
        ROUND(11, d, a, b, c)
212
6.59M
        ROUND(12, a, b, c, d)
213
6.59M
        ROUND(13, b, c, d, a)
214
6.59M
        ROUND(14, c, d, a, b)
215
6.59M
        ROUND(15, d, a, b, c)
216
217
6.59M
        h0 = _mm_add_epi32(h0, w0);
218
6.59M
        h1 = _mm_add_epi32(h1, w1);
219
6.59M
    }
220
221
    // H01234567 -> H01256 and H2367
222
13.4M
    th = _mm_shuffle_epi32(h0, 0x1b);
223
13.4M
    h1 = _mm_shuffle_epi32(h1, 0xb1);
224
13.4M
    h0 = _mm_blend_epi16(th, h1, 0xf0);
225
13.4M
    h1 = _mm_alignr_epi8(h1, th, 8);
226
227
13.4M
    _mm_storeu_si128((__m128i *)ctx->h, h0);
228
13.4M
    _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);
229
230
    /* if data left over, fill it into buffer */
231
13.4M
    if (inputLen) {
232
4.09M
        memcpy(ctx->u.b, input, inputLen);
233
4.09M
    }
234
13.4M
}
235
236
#endif /* USE_HW_SHA2 */