Coverage Report

Created: 2024-11-21 07:03

/src/nss-nspr/nss/lib/freebl/sha256-x86.c
Line
Count
Source (jump to first uncovered line)
1
/* This Source Code Form is subject to the terms of the Mozilla Public
2
 * License, v. 2.0. If a copy of the MPL was not distributed with this
3
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5
#ifdef USE_HW_SHA2
6
7
#include <immintrin.h>
8
9
#ifdef FREEBL_NO_DEPEND
10
#include "stubs.h"
11
#endif
12
13
#include "blapii.h"
14
#include "prcpucfg.h"
15
#include "prtypes.h" /* for PRUintXX */
16
#include "prlong.h"
17
#include "blapi.h"
18
#include "sha256.h"
19
20
/* SHA-256 constants, K256: 64 32-bit words, written four per row.
 * Both functions in this file read the table in 16-byte groups with
 * _mm_load_si128 (an aligned load) into k0..k15. pre_align/post_align are
 * defined elsewhere; presumably they supply the 16-byte alignment those
 * loads need (TODO confirm). */
21
pre_align static const PRUint32 K256[64] post_align = {
22
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
23
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
24
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
25
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
26
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
27
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
28
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
29
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
30
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
31
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
32
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
33
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
34
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
35
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
36
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
37
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
38
};
39
40
/*
 * ROUND(n, a, b, c, d): one group of SHA-256 rounds on the working state
 * held in w0/w1. The invoking scope must declare w0, w1 and the constant
 * vector k<n>; n is token-pasted onto "k", so it has to be a literal index
 * (call sites in this file use 0..15).
 *
 *  - t = a + k<n> adds the round constants to four message-schedule words.
 *  - Two _mm_sha256rnds2_epu32 calls consume t: the 0x0e shuffle moves the
 *    upper two 32-bit lanes of t into the lower two for the second call.
 *    Intel documents each call as performing two rounds, so one ROUND
 *    covers four.
 *  - For n < 12, `a` is overwritten with four further schedule words built
 *    from a, b, c, d (sha256msg1 / alignr / sha256msg2); for n >= 12 the
 *    arguments are left unchanged.
 *
 * The expansion is a bare brace block, so call sites are written without a
 * trailing semicolon.
 */
#define ROUND(n, a, b, c, d)                                \
41
1.68M
    {                                                       \
42
1.68M
        __m128i t = _mm_add_epi32(a, k##n);                 \
43
1.68M
        w1 = _mm_sha256rnds2_epu32(w1, w0, t);              \
44
1.68M
        t = _mm_shuffle_epi32(t, 0x0e);                     \
45
1.68M
        w0 = _mm_sha256rnds2_epu32(w0, w1, t);              \
46
1.68M
        if (n < 12) {                                       \
47
1.26M
            a = _mm_sha256msg1_epu32(a, b);                 \
48
1.26M
            a = _mm_add_epi32(a, _mm_alignr_epi8(d, c, 4)); \
49
1.26M
            a = _mm_sha256msg2_epu32(a, d);                 \
50
1.26M
        }                                                   \
51
1.68M
    }
52
53
/*
 * SHA256_Compress_Native: run the SHA-256 compression over the 64 bytes
 * held in ctx->u.b (four 16-byte loads) and add the result into the state
 * in ctx->h, using the x86 SHA intrinsics.
 *
 * ctx - hash context; ctx->u.b and ctx->h are read, ctx->h is rewritten
 *       (two 16-byte halves, at ctx->h and ctx->h + 4).
 */
void
54
SHA256_Compress_Native(SHA256Context *ctx)
55
3.24k
{
56
3.24k
    __m128i h0, h1, th;
57
3.24k
    __m128i a, b, c, d;
58
3.24k
    __m128i w0, w1;
59
3.24k
    /* Byte-reversal mask for each 32-bit lane, applied to the message
     * words after they are loaded. */
    const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
60
61
3.24k
    /* Round constants, four words per vector, consumed by ROUND as k<n>. */
    const __m128i *K = (__m128i *)K256;
62
3.24k
    const __m128i k0 = _mm_load_si128(K);
63
3.24k
    const __m128i k1 = _mm_load_si128(K + 1);
64
3.24k
    const __m128i k2 = _mm_load_si128(K + 2);
65
3.24k
    const __m128i k3 = _mm_load_si128(K + 3);
66
3.24k
    const __m128i k4 = _mm_load_si128(K + 4);
67
3.24k
    const __m128i k5 = _mm_load_si128(K + 5);
68
3.24k
    const __m128i k6 = _mm_load_si128(K + 6);
69
3.24k
    const __m128i k7 = _mm_load_si128(K + 7);
70
3.24k
    const __m128i k8 = _mm_load_si128(K + 8);
71
3.24k
    const __m128i k9 = _mm_load_si128(K + 9);
72
3.24k
    const __m128i k10 = _mm_load_si128(K + 10);
73
3.24k
    const __m128i k11 = _mm_load_si128(K + 11);
74
3.24k
    const __m128i k12 = _mm_load_si128(K + 12);
75
3.24k
    const __m128i k13 = _mm_load_si128(K + 13);
76
3.24k
    const __m128i k14 = _mm_load_si128(K + 14);
77
3.24k
    const __m128i k15 = _mm_load_si128(K + 15);
78
79
3.24k
    const __m128i *input = (__m128i *)ctx->u.b;
80
81
3.24k
    h0 = _mm_loadu_si128((__m128i *)(ctx->h));
82
3.24k
    h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));
83
84
    /* H0123:4567 -> H0145:2367 (the lane order _mm_sha256rnds2_epu32 uses) */
85
3.24k
    th = _mm_shuffle_epi32(h0, 0xb1);
86
3.24k
    h1 = _mm_shuffle_epi32(h1, 0x1b);
87
3.24k
    h0 = _mm_alignr_epi8(th, h1, 8);
88
3.24k
    h1 = _mm_blend_epi16(h1, th, 0xf0);
89
90
3.24k
    a = _mm_shuffle_epi8(_mm_loadu_si128(input), shuffle);
91
3.24k
    b = _mm_shuffle_epi8(_mm_loadu_si128(input + 1), shuffle);
92
3.24k
    c = _mm_shuffle_epi8(_mm_loadu_si128(input + 2), shuffle);
93
3.24k
    d = _mm_shuffle_epi8(_mm_loadu_si128(input + 3), shuffle);
94
95
3.24k
    /* w0/w1 are the working copy updated by ROUND; h0/h1 keep the incoming
     * state for the final addition. */
    w0 = h0;
96
3.24k
    w1 = h1;
97
98
3.24k
    ROUND(0, a, b, c, d)
99
3.24k
    ROUND(1, b, c, d, a)
100
3.24k
    ROUND(2, c, d, a, b)
101
3.24k
    ROUND(3, d, a, b, c)
102
3.24k
    ROUND(4, a, b, c, d)
103
3.24k
    ROUND(5, b, c, d, a)
104
3.24k
    ROUND(6, c, d, a, b)
105
3.24k
    ROUND(7, d, a, b, c)
106
3.24k
    ROUND(8, a, b, c, d)
107
3.24k
    ROUND(9, b, c, d, a)
108
3.24k
    ROUND(10, c, d, a, b)
109
3.24k
    ROUND(11, d, a, b, c)
110
3.24k
    ROUND(12, a, b, c, d)
111
3.24k
    ROUND(13, b, c, d, a)
112
3.24k
    ROUND(14, c, d, a, b)
113
3.24k
    ROUND(15, d, a, b, c)
114
115
3.24k
    h0 = _mm_add_epi32(h0, w0);
116
3.24k
    h1 = _mm_add_epi32(h1, w1);
117
118
    /* H0145:2367 -> H0123:4567 */
119
3.24k
    th = _mm_shuffle_epi32(h0, 0x1b);
120
3.24k
    h1 = _mm_shuffle_epi32(h1, 0xb1);
121
3.24k
    h0 = _mm_blend_epi16(th, h1, 0xf0);
122
3.24k
    h1 = _mm_alignr_epi8(h1, th, 8);
123
124
3.24k
    _mm_storeu_si128((__m128i *)ctx->h, h0);
125
3.24k
    _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);
126
3.24k
}
127
128
/*
 * SHA256_Update_Native: absorb inputLen bytes from input into the hash
 * context, using the x86 SHA intrinsics for every complete block.
 *
 * ctx      - hash context; ctx->sizeLo/sizeHi, ctx->u.b and ctx->h are
 *            updated.
 * input    - bytes to hash; blocks are read with unaligned loads.
 * inputLen - number of bytes at input; 0 returns before ctx is modified.
 *
 * Steps: advance the byte count (carrying into sizeHi when sizeLo wraps),
 * top up a partially filled ctx->u.b and compress it once full, process
 * whole blocks straight from input, then copy any tail into ctx->u.b.
 */
void
129
SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input,
130
                     unsigned int inputLen)
131
8.93k
{
132
8.93k
    __m128i h0, h1, th;
133
8.93k
    /* Byte-reversal mask for each 32-bit lane, applied to the message
     * words after they are loaded. */
    const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
134
135
8.93k
    /* Round constants, four words per vector, consumed by ROUND as k<n>. */
    const __m128i *K = (__m128i *)K256;
136
8.93k
    const __m128i k0 = _mm_load_si128(K);
137
8.93k
    const __m128i k1 = _mm_load_si128(K + 1);
138
8.93k
    const __m128i k2 = _mm_load_si128(K + 2);
139
8.93k
    const __m128i k3 = _mm_load_si128(K + 3);
140
8.93k
    const __m128i k4 = _mm_load_si128(K + 4);
141
8.93k
    const __m128i k5 = _mm_load_si128(K + 5);
142
8.93k
    const __m128i k6 = _mm_load_si128(K + 6);
143
8.93k
    const __m128i k7 = _mm_load_si128(K + 7);
144
8.93k
    const __m128i k8 = _mm_load_si128(K + 8);
145
8.93k
    const __m128i k9 = _mm_load_si128(K + 9);
146
8.93k
    const __m128i k10 = _mm_load_si128(K + 10);
147
8.93k
    const __m128i k11 = _mm_load_si128(K + 11);
148
8.93k
    const __m128i k12 = _mm_load_si128(K + 12);
149
8.93k
    const __m128i k13 = _mm_load_si128(K + 13);
150
8.93k
    const __m128i k14 = _mm_load_si128(K + 14);
151
8.93k
    const __m128i k15 = _mm_load_si128(K + 15);
152
153
8.93k
    /* Bytes already buffered, sampled before the count is advanced. The
     * 0x3f mask assumes a 64-byte block; SHA256_BLOCK_LENGTH is defined
     * elsewhere (TODO confirm it is 64). */
    unsigned int inBuf = ctx->sizeLo & 0x3f;
154
8.93k
    if (!inputLen) {
155
0
        return;
156
0
    }
157
158
    /* Add inputLen into the count of bytes processed, before processing */
159
8.93k
    if ((ctx->sizeLo += inputLen) < inputLen) {
160
0
        ctx->sizeHi++;
161
0
    }
162
163
    /* if data already in buffer, attempt to fill rest of buffer */
164
8.93k
    if (inBuf) {
165
4.38k
        unsigned int todo = SHA256_BLOCK_LENGTH - inBuf;
166
4.38k
        if (inputLen < todo) {
167
3.44k
            todo = inputLen;
168
3.44k
        }
169
4.38k
        memcpy(ctx->u.b + inBuf, input, todo);
170
4.38k
        input += todo;
171
4.38k
        inputLen -= todo;
172
4.38k
        if (inBuf + todo == SHA256_BLOCK_LENGTH) {
173
939
            SHA256_Compress_Native(ctx);
174
939
        }
175
4.38k
    }
176
177
8.93k
    /* Loaded only now, so the state includes the compress call above. */
    h0 = _mm_loadu_si128((__m128i *)(ctx->h));
178
8.93k
    h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));
179
180
    /* H0123:4567 -> H0145:2367 (the lane order _mm_sha256rnds2_epu32 uses) */
181
8.93k
    th = _mm_shuffle_epi32(h0, 0xb1);
182
8.93k
    h1 = _mm_shuffle_epi32(h1, 0x1b);
183
8.93k
    h0 = _mm_alignr_epi8(th, h1, 8);
184
8.93k
    h1 = _mm_blend_epi16(h1, th, 0xf0);
185
186
    /* if enough data to fill one or more whole buffers, process them. */
187
110k
    while (inputLen >= SHA256_BLOCK_LENGTH) {
188
101k
        __m128i a, b, c, d;
189
101k
        __m128i w0, w1;
190
101k
        /* Four unaligned 16-byte loads straight from the caller's data,
         * byte-swapped per 32-bit lane; no copy through ctx->u.b. */
        a = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)input), shuffle);
191
101k
        b = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 16)), shuffle);
192
101k
        c = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 32)), shuffle);
193
101k
        d = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 48)), shuffle);
194
101k
        input += SHA256_BLOCK_LENGTH;
195
101k
        inputLen -= SHA256_BLOCK_LENGTH;
196
197
101k
        /* w0/w1 are the working copy updated by ROUND; h0/h1 keep the
         * state from before this block for the addition below. */
        w0 = h0;
198
101k
        w1 = h1;
199
200
101k
        ROUND(0, a, b, c, d)
201
101k
        ROUND(1, b, c, d, a)
202
101k
        ROUND(2, c, d, a, b)
203
101k
        ROUND(3, d, a, b, c)
204
101k
        ROUND(4, a, b, c, d)
205
101k
        ROUND(5, b, c, d, a)
206
101k
        ROUND(6, c, d, a, b)
207
101k
        ROUND(7, d, a, b, c)
208
101k
        ROUND(8, a, b, c, d)
209
101k
        ROUND(9, b, c, d, a)
210
101k
        ROUND(10, c, d, a, b)
211
101k
        ROUND(11, d, a, b, c)
212
101k
        ROUND(12, a, b, c, d)
213
101k
        ROUND(13, b, c, d, a)
214
101k
        ROUND(14, c, d, a, b)
215
101k
        ROUND(15, d, a, b, c)
216
217
101k
        h0 = _mm_add_epi32(h0, w0);
218
101k
        h1 = _mm_add_epi32(h1, w1);
219
101k
    }
220
221
    /* H0145:2367 -> H0123:4567 */
222
8.93k
    th = _mm_shuffle_epi32(h0, 0x1b);
223
8.93k
    h1 = _mm_shuffle_epi32(h1, 0xb1);
224
8.93k
    h0 = _mm_blend_epi16(th, h1, 0xf0);
225
8.93k
    h1 = _mm_alignr_epi8(h1, th, 8);
226
227
8.93k
    _mm_storeu_si128((__m128i *)ctx->h, h0);
228
8.93k
    _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);
229
230
    /* if data left over, fill it into buffer */
231
8.93k
    if (inputLen) {
232
3.24k
        memcpy(ctx->u.b, input, inputLen);
233
3.24k
    }
234
8.93k
}
235
236
#endif /* USE_HW_SHA2 */