/src/nss/lib/freebl/sha256-x86.c

Source
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifdef USE_HW_SHA2

#include <immintrin.h>

#ifdef FREEBL_NO_DEPEND
#include "stubs.h"
#endif

#include "blapii.h"
#include "prcpucfg.h"
#include "prtypes.h" /* for PRUintXX */
#include "prlong.h"
#include "blapi.h"
#include "sha256.h"

/* SHA-256 constants, K256. */
pre_align static const PRUint32 K256[64] post_align = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

#define ROUND(n, a, b, c, d)                                \
    {                                                       \
        __m128i t = _mm_add_epi32(a, k##n);                 \
        w1 = _mm_sha256rnds2_epu32(w1, w0, t);              \
        t = _mm_shuffle_epi32(t, 0x0e);                     \
        w0 = _mm_sha256rnds2_epu32(w0, w1, t);              \
        if (n < 12) {                                       \
            a = _mm_sha256msg1_epu32(a, b);                 \
            a = _mm_add_epi32(a, _mm_alignr_epi8(d, c, 4)); \
            a = _mm_sha256msg2_epu32(a, d);                 \
        }                                                   \
    }

void
SHA256_Compress_Native(SHA256Context *ctx)
{
    __m128i h0, h1, th;
    __m128i a, b, c, d;
    __m128i w0, w1;
    const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);

    const __m128i *K = (__m128i *)K256;
    const __m128i k0 = _mm_load_si128(K);
    const __m128i k1 = _mm_load_si128(K + 1);
    const __m128i k2 = _mm_load_si128(K + 2);
    const __m128i k3 = _mm_load_si128(K + 3);
    const __m128i k4 = _mm_load_si128(K + 4);
    const __m128i k5 = _mm_load_si128(K + 5);
    const __m128i k6 = _mm_load_si128(K + 6);
    const __m128i k7 = _mm_load_si128(K + 7);
    const __m128i k8 = _mm_load_si128(K + 8);
    const __m128i k9 = _mm_load_si128(K + 9);
    const __m128i k10 = _mm_load_si128(K + 10);
    const __m128i k11 = _mm_load_si128(K + 11);
    const __m128i k12 = _mm_load_si128(K + 12);
    const __m128i k13 = _mm_load_si128(K + 13);
    const __m128i k14 = _mm_load_si128(K + 14);
    const __m128i k15 = _mm_load_si128(K + 15);

    const __m128i *input = (__m128i *)ctx->u.b;

    h0 = _mm_loadu_si128((__m128i *)(ctx->h));
    h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));

    /* H0123:4567 -> H01256:H2367 */
    th = _mm_shuffle_epi32(h0, 0xb1);
    h1 = _mm_shuffle_epi32(h1, 0x1b);
    h0 = _mm_alignr_epi8(th, h1, 8);
    h1 = _mm_blend_epi16(h1, th, 0xf0);

    a = _mm_shuffle_epi8(_mm_loadu_si128(input), shuffle);
    b = _mm_shuffle_epi8(_mm_loadu_si128(input + 1), shuffle);
    c = _mm_shuffle_epi8(_mm_loadu_si128(input + 2), shuffle);
    d = _mm_shuffle_epi8(_mm_loadu_si128(input + 3), shuffle);

    w0 = h0;
    w1 = h1;

    ROUND(0, a, b, c, d)
    ROUND(1, b, c, d, a)
    ROUND(2, c, d, a, b)
    ROUND(3, d, a, b, c)
    ROUND(4, a, b, c, d)
    ROUND(5, b, c, d, a)
    ROUND(6, c, d, a, b)
    ROUND(7, d, a, b, c)
    ROUND(8, a, b, c, d)
    ROUND(9, b, c, d, a)
    ROUND(10, c, d, a, b)
    ROUND(11, d, a, b, c)
    ROUND(12, a, b, c, d)
    ROUND(13, b, c, d, a)
    ROUND(14, c, d, a, b)
    ROUND(15, d, a, b, c)

    h0 = _mm_add_epi32(h0, w0);
    h1 = _mm_add_epi32(h1, w1);

    /* H0145:2367 -> H0123:4567 */
    th = _mm_shuffle_epi32(h0, 0x1b);
    h1 = _mm_shuffle_epi32(h1, 0xb1);
    h0 = _mm_blend_epi16(th, h1, 0xf0);
    h1 = _mm_alignr_epi8(h1, th, 8);

    _mm_storeu_si128((__m128i *)ctx->h, h0);
    _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);
}

void
SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input,
                     unsigned int inputLen)
{
    __m128i h0, h1, th;
    const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);

    const __m128i *K = (__m128i *)K256;
    const __m128i k0 = _mm_load_si128(K);
    const __m128i k1 = _mm_load_si128(K + 1);
    const __m128i k2 = _mm_load_si128(K + 2);
    const __m128i k3 = _mm_load_si128(K + 3);
    const __m128i k4 = _mm_load_si128(K + 4);
    const __m128i k5 = _mm_load_si128(K + 5);
    const __m128i k6 = _mm_load_si128(K + 6);
    const __m128i k7 = _mm_load_si128(K + 7);
    const __m128i k8 = _mm_load_si128(K + 8);
    const __m128i k9 = _mm_load_si128(K + 9);
    const __m128i k10 = _mm_load_si128(K + 10);
    const __m128i k11 = _mm_load_si128(K + 11);
    const __m128i k12 = _mm_load_si128(K + 12);
    const __m128i k13 = _mm_load_si128(K + 13);
    const __m128i k14 = _mm_load_si128(K + 14);
    const __m128i k15 = _mm_load_si128(K + 15);

    unsigned int inBuf = ctx->sizeLo & 0x3f;
    if (!inputLen) {
        return;
    }

    /* Add inputLen into the count of bytes processed, before processing */
    if ((ctx->sizeLo += inputLen) < inputLen) {
        ctx->sizeHi++;
    }

    /* if data already in buffer, attempt to fill rest of buffer */
    if (inBuf) {
        unsigned int todo = SHA256_BLOCK_LENGTH - inBuf;
        if (inputLen < todo) {
            todo = inputLen;
        }
        memcpy(ctx->u.b + inBuf, input, todo);
        input += todo;
        inputLen -= todo;
        if (inBuf + todo == SHA256_BLOCK_LENGTH) {
            SHA256_Compress_Native(ctx);
        }
    }

    h0 = _mm_loadu_si128((__m128i *)(ctx->h));
    h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));

    /* H0123:4567 -> H01256:H2367 */
    th = _mm_shuffle_epi32(h0, 0xb1);
    h1 = _mm_shuffle_epi32(h1, 0x1b);
    h0 = _mm_alignr_epi8(th, h1, 8);
    h1 = _mm_blend_epi16(h1, th, 0xf0);

    /* if enough data to fill one or more whole buffers, process them. */
    while (inputLen >= SHA256_BLOCK_LENGTH) {
        __m128i a, b, c, d;
        __m128i w0, w1;
        a = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)input), shuffle);
        b = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 16)), shuffle);
        c = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 32)), shuffle);
        d = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 48)), shuffle);
        input += SHA256_BLOCK_LENGTH;
        inputLen -= SHA256_BLOCK_LENGTH;

        w0 = h0;
        w1 = h1;

        ROUND(0, a, b, c, d)
        ROUND(1, b, c, d, a)
        ROUND(2, c, d, a, b)
        ROUND(3, d, a, b, c)
        ROUND(4, a, b, c, d)
        ROUND(5, b, c, d, a)
        ROUND(6, c, d, a, b)
        ROUND(7, d, a, b, c)
        ROUND(8, a, b, c, d)
        ROUND(9, b, c, d, a)
        ROUND(10, c, d, a, b)
        ROUND(11, d, a, b, c)
        ROUND(12, a, b, c, d)
        ROUND(13, b, c, d, a)
        ROUND(14, c, d, a, b)
        ROUND(15, d, a, b, c)

        h0 = _mm_add_epi32(h0, w0);
        h1 = _mm_add_epi32(h1, w1);
    }

    // H01234567 -> H01256 and H2367
    th = _mm_shuffle_epi32(h0, 0x1b);
    h1 = _mm_shuffle_epi32(h1, 0xb1);
    h0 = _mm_blend_epi16(th, h1, 0xf0);
    h1 = _mm_alignr_epi8(h1, th, 8);

    _mm_storeu_si128((__m128i *)ctx->h, h0);
    _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);

    /* if data left over, fill it into buffer */
    if (inputLen) {
        memcpy(ctx->u.b, input, inputLen);
    }
}

#endif /* USE_HW_SHA2 */

Coverage Report

Created: 2025-09-17 07:05

Line	Count	Source
1		/* This Source Code Form is subject to the terms of the Mozilla Public
2		* License, v. 2.0. If a copy of the MPL was not distributed with this
3		* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5		#ifdef USE_HW_SHA2
6
7		#include <immintrin.h>
8
9		#ifdef FREEBL_NO_DEPEND
10		#include "stubs.h"
11		#endif
12
13		#include "blapii.h"
14		#include "prcpucfg.h"
15		#include "prtypes.h" /* for PRUintXX */
16		#include "prlong.h"
17		#include "blapi.h"
18		#include "sha256.h"
19
20		/* SHA-256 constants, K256. */
21		pre_align static const PRUint32 K256[64] post_align = {
22		0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
23		0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
24		0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
25		0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
26		0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
27		0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
28		0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
29		0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
30		0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
31		0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
32		0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
33		0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
34		0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
35		0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
36		0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
37		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
38		};
39
40		#define ROUND(n, a, b, c, d) \
41	169M	{ \
42	169M	__m128i t = _mm_add_epi32(a, k##n); \
43	169M	w1 = _mm_sha256rnds2_epu32(w1, w0, t); \
44	169M	t = _mm_shuffle_epi32(t, 0x0e); \
45	169M	w0 = _mm_sha256rnds2_epu32(w0, w1, t); \
46	169M	if (n < 12) { \
47	127M	a = _mm_sha256msg1_epu32(a, b); \
48	127M	a = _mm_add_epi32(a, _mm_alignr_epi8(d, c, 4)); \
49	127M	a = _mm_sha256msg2_epu32(a, d); \
50	127M	} \
51	169M	}
52
53		void
54		SHA256_Compress_Native(SHA256Context *ctx)
55	4.00M	{
56	4.00M	__m128i h0, h1, th;
57	4.00M	__m128i a, b, c, d;
58	4.00M	__m128i w0, w1;
59	4.00M	const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
60
61	4.00M	const __m128i K = (__m128i )K256;
62	4.00M	const __m128i k0 = _mm_load_si128(K);
63	4.00M	const __m128i k1 = _mm_load_si128(K + 1);
64	4.00M	const __m128i k2 = _mm_load_si128(K + 2);
65	4.00M	const __m128i k3 = _mm_load_si128(K + 3);
66	4.00M	const __m128i k4 = _mm_load_si128(K + 4);
67	4.00M	const __m128i k5 = _mm_load_si128(K + 5);
68	4.00M	const __m128i k6 = _mm_load_si128(K + 6);
69	4.00M	const __m128i k7 = _mm_load_si128(K + 7);
70	4.00M	const __m128i k8 = _mm_load_si128(K + 8);
71	4.00M	const __m128i k9 = _mm_load_si128(K + 9);
72	4.00M	const __m128i k10 = _mm_load_si128(K + 10);
73	4.00M	const __m128i k11 = _mm_load_si128(K + 11);
74	4.00M	const __m128i k12 = _mm_load_si128(K + 12);
75	4.00M	const __m128i k13 = _mm_load_si128(K + 13);
76	4.00M	const __m128i k14 = _mm_load_si128(K + 14);
77	4.00M	const __m128i k15 = _mm_load_si128(K + 15);
78
79	4.00M	const __m128i input = (__m128i )ctx->u.b;
80
81	4.00M	h0 = _mm_loadu_si128((__m128i *)(ctx->h));
82	4.00M	h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));
83
84		/* H0123:4567 -> H01256:H2367 */
85	4.00M	th = _mm_shuffle_epi32(h0, 0xb1);
86	4.00M	h1 = _mm_shuffle_epi32(h1, 0x1b);
87	4.00M	h0 = _mm_alignr_epi8(th, h1, 8);
88	4.00M	h1 = _mm_blend_epi16(h1, th, 0xf0);
89
90	4.00M	a = _mm_shuffle_epi8(_mm_loadu_si128(input), shuffle);
91	4.00M	b = _mm_shuffle_epi8(_mm_loadu_si128(input + 1), shuffle);
92	4.00M	c = _mm_shuffle_epi8(_mm_loadu_si128(input + 2), shuffle);
93	4.00M	d = _mm_shuffle_epi8(_mm_loadu_si128(input + 3), shuffle);
94
95	4.00M	w0 = h0;
96	4.00M	w1 = h1;
97
98	4.00M	ROUND(0, a, b, c, d)
99	4.00M	ROUND(1, b, c, d, a)
100	4.00M	ROUND(2, c, d, a, b)
101	4.00M	ROUND(3, d, a, b, c)
102	4.00M	ROUND(4, a, b, c, d)
103	4.00M	ROUND(5, b, c, d, a)
104	4.00M	ROUND(6, c, d, a, b)
105	4.00M	ROUND(7, d, a, b, c)
106	4.00M	ROUND(8, a, b, c, d)
107	4.00M	ROUND(9, b, c, d, a)
108	4.00M	ROUND(10, c, d, a, b)
109	4.00M	ROUND(11, d, a, b, c)
110	4.00M	ROUND(12, a, b, c, d)
111	4.00M	ROUND(13, b, c, d, a)
112	4.00M	ROUND(14, c, d, a, b)
113	4.00M	ROUND(15, d, a, b, c)
114
115	4.00M	h0 = _mm_add_epi32(h0, w0);
116	4.00M	h1 = _mm_add_epi32(h1, w1);
117
118		/* H0145:2367 -> H0123:4567 */
119	4.00M	th = _mm_shuffle_epi32(h0, 0x1b);
120	4.00M	h1 = _mm_shuffle_epi32(h1, 0xb1);
121	4.00M	h0 = _mm_blend_epi16(th, h1, 0xf0);
122	4.00M	h1 = _mm_alignr_epi8(h1, th, 8);
123
124	4.00M	_mm_storeu_si128((__m128i *)ctx->h, h0);
125	4.00M	_mm_storeu_si128((__m128i *)(ctx->h + 4), h1);
126	4.00M	}
127
128		void
129		SHA256_Update_Native(SHA256Context ctx, const unsigned char input,
130		unsigned int inputLen)
131	13.4M	{
132	13.4M	__m128i h0, h1, th;
133	13.4M	const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
134
135	13.4M	const __m128i K = (__m128i )K256;
136	13.4M	const __m128i k0 = _mm_load_si128(K);
137	13.4M	const __m128i k1 = _mm_load_si128(K + 1);
138	13.4M	const __m128i k2 = _mm_load_si128(K + 2);
139	13.4M	const __m128i k3 = _mm_load_si128(K + 3);
140	13.4M	const __m128i k4 = _mm_load_si128(K + 4);
141	13.4M	const __m128i k5 = _mm_load_si128(K + 5);
142	13.4M	const __m128i k6 = _mm_load_si128(K + 6);
143	13.4M	const __m128i k7 = _mm_load_si128(K + 7);
144	13.4M	const __m128i k8 = _mm_load_si128(K + 8);
145	13.4M	const __m128i k9 = _mm_load_si128(K + 9);
146	13.4M	const __m128i k10 = _mm_load_si128(K + 10);
147	13.4M	const __m128i k11 = _mm_load_si128(K + 11);
148	13.4M	const __m128i k12 = _mm_load_si128(K + 12);
149	13.4M	const __m128i k13 = _mm_load_si128(K + 13);
150	13.4M	const __m128i k14 = _mm_load_si128(K + 14);
151	13.4M	const __m128i k15 = _mm_load_si128(K + 15);
152
153	13.4M	unsigned int inBuf = ctx->sizeLo & 0x3f;
154	13.4M	if (!inputLen) {
155	77.1k	return;
156	77.1k	}
157
158		/* Add inputLen into the count of bytes processed, before processing */
159	13.4M	if ((ctx->sizeLo += inputLen) < inputLen) {
160	0	ctx->sizeHi++;
161	0	}
162
163		/* if data already in buffer, attempt to fill rest of buffer */
164	13.4M	if (inBuf) {
165	6.85M	unsigned int todo = SHA256_BLOCK_LENGTH - inBuf;
166	6.85M	if (inputLen < todo) {
167	6.11M	todo = inputLen;
168	6.11M	}
169	6.85M	memcpy(ctx->u.b + inBuf, input, todo);
170	6.85M	input += todo;
171	6.85M	inputLen -= todo;
172	6.85M	if (inBuf + todo == SHA256_BLOCK_LENGTH) {
173	740k	SHA256_Compress_Native(ctx);
174	740k	}
175	6.85M	}
176
177	13.4M	h0 = _mm_loadu_si128((__m128i *)(ctx->h));
178	13.4M	h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));
179
180		/* H0123:4567 -> H01256:H2367 */
181	13.4M	th = _mm_shuffle_epi32(h0, 0xb1);
182	13.4M	h1 = _mm_shuffle_epi32(h1, 0x1b);
183	13.4M	h0 = _mm_alignr_epi8(th, h1, 8);
184	13.4M	h1 = _mm_blend_epi16(h1, th, 0xf0);
185
186		/* if enough data to fill one or more whole buffers, process them. */
187	20.0M	while (inputLen >= SHA256_BLOCK_LENGTH) {
188	6.59M	__m128i a, b, c, d;
189	6.59M	__m128i w0, w1;
190	6.59M	a = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)input), shuffle);
191	6.59M	b = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 16)), shuffle);
192	6.59M	c = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 32)), shuffle);
193	6.59M	d = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 48)), shuffle);
194	6.59M	input += SHA256_BLOCK_LENGTH;
195	6.59M	inputLen -= SHA256_BLOCK_LENGTH;
196
197	6.59M	w0 = h0;
198	6.59M	w1 = h1;
199
200	6.59M	ROUND(0, a, b, c, d)
201	6.59M	ROUND(1, b, c, d, a)
202	6.59M	ROUND(2, c, d, a, b)
203	6.59M	ROUND(3, d, a, b, c)
204	6.59M	ROUND(4, a, b, c, d)
205	6.59M	ROUND(5, b, c, d, a)
206	6.59M	ROUND(6, c, d, a, b)
207	6.59M	ROUND(7, d, a, b, c)
208	6.59M	ROUND(8, a, b, c, d)
209	6.59M	ROUND(9, b, c, d, a)
210	6.59M	ROUND(10, c, d, a, b)
211	6.59M	ROUND(11, d, a, b, c)
212	6.59M	ROUND(12, a, b, c, d)
213	6.59M	ROUND(13, b, c, d, a)
214	6.59M	ROUND(14, c, d, a, b)
215	6.59M	ROUND(15, d, a, b, c)
216
217	6.59M	h0 = _mm_add_epi32(h0, w0);
218	6.59M	h1 = _mm_add_epi32(h1, w1);
219	6.59M	}
220
221		// H01234567 -> H01256 and H2367
222	13.4M	th = _mm_shuffle_epi32(h0, 0x1b);
223	13.4M	h1 = _mm_shuffle_epi32(h1, 0xb1);
224	13.4M	h0 = _mm_blend_epi16(th, h1, 0xf0);
225	13.4M	h1 = _mm_alignr_epi8(h1, th, 8);
226
227	13.4M	_mm_storeu_si128((__m128i *)ctx->h, h0);
228	13.4M	_mm_storeu_si128((__m128i *)(ctx->h + 4), h1);
229
230		/* if data left over, fill it into buffer */
231	13.4M	if (inputLen) {
232	4.09M	memcpy(ctx->u.b, input, inputLen);
233	4.09M	}
234	13.4M	}
235
236		#endif /* USE_HW_SHA2 */