/src/nss/lib/freebl/sha256-x86.c

Source
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifdef USE_HW_SHA2

#include <immintrin.h>

#ifdef FREEBL_NO_DEPEND
#include "stubs.h"
#endif

#include "blapii.h"
#include "prcpucfg.h"
#include "prtypes.h" /* for PRUintXX */
#include "prlong.h"
#include "blapi.h"
#include "sha256.h"

/*
 * SHA-256 round constants K[0..63], one 32-bit word per round, in round
 * order.
 *
 * The functions in this file view the table as sixteen 128-bit vectors
 * (four consecutive constants each) and read them with _mm_load_si128,
 * which requires a 16-byte-aligned address. pre_align/post_align are
 * defined outside this file; presumably they expand to the compiler's
 * alignment attributes that provide this -- confirm in the project headers.
 */
pre_align static const PRUint32 K256[64] post_align = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

/*
 * ROUND(n, a, b, c, d): run four SHA-256 rounds (rounds 4n .. 4n+3) and,
 * while more schedule words are still needed, compute the next four.
 *
 *   n           literal group index 0..15; pasted onto "k" to select the
 *               local vector k<n> holding the four round constants.
 *   a           the four message-schedule words consumed by this group.
 *               For n < 12 it is OVERWRITTEN with the schedule words that
 *               will be consumed four groups later.
 *   b, c, d     the following three groups of schedule words (read only).
 *
 * The macro is not self-contained: it reads and writes the working-state
 * vectors w0 and w1 and reads k<n>, all of which must be in scope at the
 * point of expansion.
 *
 * Steps:
 *   - t = a + k<n> gives message word plus round constant for four rounds.
 *   - _mm_sha256rnds2_epu32 performs two rounds using the low two 32-bit
 *     lanes of t; the result lands in w1.
 *   - _mm_shuffle_epi32(t, 0x0e) moves the upper two lanes of t into the
 *     low positions so the second call (with w0/w1 swapped) can perform
 *     the next two rounds; the result lands in w0.
 *   - For n < 12, sha256msg1 + add of alignr(d, c, 4) + sha256msg2 extend
 *     the message schedule. Groups 12..15 skip this because, with 16
 *     groups in total, nothing consumes words beyond those produced by
 *     group 11.
 *
 * The expansion is a bare brace block, not do { } while (0); call sites
 * are written without a trailing semicolon.
 */
#define ROUND(n, a, b, c, d)                                \
    {                                                       \
        __m128i t = _mm_add_epi32(a, k##n);                 \
        w1 = _mm_sha256rnds2_epu32(w1, w0, t);              \
        t = _mm_shuffle_epi32(t, 0x0e);                     \
        w0 = _mm_sha256rnds2_epu32(w0, w1, t);              \
        if (n < 12) {                                       \
            a = _mm_sha256msg1_epu32(a, b);                 \
            a = _mm_add_epi32(a, _mm_alignr_epi8(d, c, 4)); \
            a = _mm_sha256msg2_epu32(a, d);                 \
        }                                                   \
    }

/*
 * Compress the one 64-byte block held in ctx->u.b into the hash state
 * ctx->h, using the x86 SHA extension intrinsics.
 *
 * ctx->u.b is only read; ctx->h is read and then overwritten with the
 * updated state. The byte counters in ctx are not touched here.
 */
void
SHA256_Compress_Native(SHA256Context *ctx)
{
    __m128i h0, h1, th;
    __m128i a, b, c, d;
    __m128i w0, w1;
    /* Reverses the four bytes inside every 32-bit lane, so that the
     * big-endian message words of the block become native integers. */
    const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);

    /* View the round-constant table as 16 vectors of four constants.
     * K256 is const, so the cast keeps the qualifier. */
    const __m128i *K = (const __m128i *)K256;
    const __m128i k0 = _mm_load_si128(K);
    const __m128i k1 = _mm_load_si128(K + 1);
    const __m128i k2 = _mm_load_si128(K + 2);
    const __m128i k3 = _mm_load_si128(K + 3);
    const __m128i k4 = _mm_load_si128(K + 4);
    const __m128i k5 = _mm_load_si128(K + 5);
    const __m128i k6 = _mm_load_si128(K + 6);
    const __m128i k7 = _mm_load_si128(K + 7);
    const __m128i k8 = _mm_load_si128(K + 8);
    const __m128i k9 = _mm_load_si128(K + 9);
    const __m128i k10 = _mm_load_si128(K + 10);
    const __m128i k11 = _mm_load_si128(K + 11);
    const __m128i k12 = _mm_load_si128(K + 12);
    const __m128i k13 = _mm_load_si128(K + 13);
    const __m128i k14 = _mm_load_si128(K + 14);
    const __m128i k15 = _mm_load_si128(K + 15);

    /* The buffered block is only read from here on. */
    const __m128i *input = (const __m128i *)ctx->u.b;

    /* Unaligned loads: no alignment of ctx->h is assumed. */
    h0 = _mm_loadu_si128((__m128i *)(ctx->h));
    h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));

    /* H0123:4567 -> H0145:2367, the register layout that
     * _mm_sha256rnds2_epu32 operates on. */
    th = _mm_shuffle_epi32(h0, 0xb1);
    h1 = _mm_shuffle_epi32(h1, 0x1b);
    h0 = _mm_alignr_epi8(th, h1, 8);
    h1 = _mm_blend_epi16(h1, th, 0xf0);

    /* Load the 16 message words, four per vector, byte-swapping each. */
    a = _mm_shuffle_epi8(_mm_loadu_si128(input), shuffle);
    b = _mm_shuffle_epi8(_mm_loadu_si128(input + 1), shuffle);
    c = _mm_shuffle_epi8(_mm_loadu_si128(input + 2), shuffle);
    d = _mm_shuffle_epi8(_mm_loadu_si128(input + 3), shuffle);

    /* Working copy of the state; h0/h1 keep the input state for the
     * final addition. */
    w0 = h0;
    w1 = h1;

    /* 16 groups of 4 rounds. The schedule vectors rotate one position
     * per group so that the first argument is always the group of words
     * being consumed. */
    ROUND(0, a, b, c, d)
    ROUND(1, b, c, d, a)
    ROUND(2, c, d, a, b)
    ROUND(3, d, a, b, c)
    ROUND(4, a, b, c, d)
    ROUND(5, b, c, d, a)
    ROUND(6, c, d, a, b)
    ROUND(7, d, a, b, c)
    ROUND(8, a, b, c, d)
    ROUND(9, b, c, d, a)
    ROUND(10, c, d, a, b)
    ROUND(11, d, a, b, c)
    ROUND(12, a, b, c, d)
    ROUND(13, b, c, d, a)
    ROUND(14, c, d, a, b)
    ROUND(15, d, a, b, c)

    /* Add the compressed block into the previous state. */
    h0 = _mm_add_epi32(h0, w0);
    h1 = _mm_add_epi32(h1, w1);

    /* H0145:2367 -> H0123:4567 */
    th = _mm_shuffle_epi32(h0, 0x1b);
    h1 = _mm_shuffle_epi32(h1, 0xb1);
    h0 = _mm_blend_epi16(th, h1, 0xf0);
    h1 = _mm_alignr_epi8(h1, th, 8);

    _mm_storeu_si128((__m128i *)ctx->h, h0);
    _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);
}

/*
 * Feed inputLen bytes from input into the SHA-256 context, using the x86
 * SHA extension intrinsics for every complete 64-byte block.
 *
 *   ctx       hash context; its byte counters (sizeLo/sizeHi), block
 *             buffer (u.b) and state (h) are updated.
 *   input     bytes to hash; only read.
 *   inputLen  number of bytes at input. Zero is a no-op.
 *
 * Bytes that do not complete a block are kept in ctx->u.b for a later
 * call.
 */
void
SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input,
                     unsigned int inputLen)
{
    __m128i h0, h1, th;
    /* Reverses the four bytes inside every 32-bit lane, so that the
     * big-endian message words of a block become native integers. */
    const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);

    /* View the round-constant table as 16 vectors of four constants.
     * K256 is const, so the cast keeps the qualifier. */
    const __m128i *K = (const __m128i *)K256;
    const __m128i k0 = _mm_load_si128(K);
    const __m128i k1 = _mm_load_si128(K + 1);
    const __m128i k2 = _mm_load_si128(K + 2);
    const __m128i k3 = _mm_load_si128(K + 3);
    const __m128i k4 = _mm_load_si128(K + 4);
    const __m128i k5 = _mm_load_si128(K + 5);
    const __m128i k6 = _mm_load_si128(K + 6);
    const __m128i k7 = _mm_load_si128(K + 7);
    const __m128i k8 = _mm_load_si128(K + 8);
    const __m128i k9 = _mm_load_si128(K + 9);
    const __m128i k10 = _mm_load_si128(K + 10);
    const __m128i k11 = _mm_load_si128(K + 11);
    const __m128i k12 = _mm_load_si128(K + 12);
    const __m128i k13 = _mm_load_si128(K + 13);
    const __m128i k14 = _mm_load_si128(K + 14);
    const __m128i k15 = _mm_load_si128(K + 15);

    /* Bytes already sitting in ctx->u.b: the running byte count modulo
     * the 64-byte block size. Must be read before sizeLo is advanced. */
    unsigned int inBuf = ctx->sizeLo & 0x3f;
    if (!inputLen) {
        return;
    }

    /* Add inputLen into the count of bytes processed, before processing;
     * carry into sizeHi when sizeLo wraps around. */
    if ((ctx->sizeLo += inputLen) < inputLen) {
        ctx->sizeHi++;
    }

    /* if data already in buffer, attempt to fill rest of buffer */
    if (inBuf) {
        unsigned int todo = SHA256_BLOCK_LENGTH - inBuf;
        if (inputLen < todo) {
            todo = inputLen;
        }
        memcpy(ctx->u.b + inBuf, input, todo);
        input += todo;
        inputLen -= todo;
        if (inBuf + todo == SHA256_BLOCK_LENGTH) {
            SHA256_Compress_Native(ctx);
        }
    }

    /* if enough data to fill one or more whole buffers, process them.
     * The hash state is only brought into registers (and written back)
     * when there is at least one whole block; otherwise the round trip
     * through the shuffled layout would store back the same values. */
    if (inputLen >= SHA256_BLOCK_LENGTH) {
        h0 = _mm_loadu_si128((__m128i *)(ctx->h));
        h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));

        /* H0123:4567 -> H0145:2367, the register layout that
         * _mm_sha256rnds2_epu32 operates on. */
        th = _mm_shuffle_epi32(h0, 0xb1);
        h1 = _mm_shuffle_epi32(h1, 0x1b);
        h0 = _mm_alignr_epi8(th, h1, 8);
        h1 = _mm_blend_epi16(h1, th, 0xf0);

        do {
            __m128i a, b, c, d;
            __m128i w0, w1;
            /* Blocks are hashed straight from the caller's buffer with
             * unaligned loads; input is const, so the casts keep it. */
            a = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)input), shuffle);
            b = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(input + 16)), shuffle);
            c = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(input + 32)), shuffle);
            d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(input + 48)), shuffle);
            input += SHA256_BLOCK_LENGTH;
            inputLen -= SHA256_BLOCK_LENGTH;

            /* Working copy of the state; h0/h1 keep the state from
             * before this block for the final addition. */
            w0 = h0;
            w1 = h1;

            ROUND(0, a, b, c, d)
            ROUND(1, b, c, d, a)
            ROUND(2, c, d, a, b)
            ROUND(3, d, a, b, c)
            ROUND(4, a, b, c, d)
            ROUND(5, b, c, d, a)
            ROUND(6, c, d, a, b)
            ROUND(7, d, a, b, c)
            ROUND(8, a, b, c, d)
            ROUND(9, b, c, d, a)
            ROUND(10, c, d, a, b)
            ROUND(11, d, a, b, c)
            ROUND(12, a, b, c, d)
            ROUND(13, b, c, d, a)
            ROUND(14, c, d, a, b)
            ROUND(15, d, a, b, c)

            h0 = _mm_add_epi32(h0, w0);
            h1 = _mm_add_epi32(h1, w1);
        } while (inputLen >= SHA256_BLOCK_LENGTH);

        /* H0145:2367 -> H0123:4567 */
        th = _mm_shuffle_epi32(h0, 0x1b);
        h1 = _mm_shuffle_epi32(h1, 0xb1);
        h0 = _mm_blend_epi16(th, h1, 0xf0);
        h1 = _mm_alignr_epi8(h1, th, 8);

        _mm_storeu_si128((__m128i *)ctx->h, h0);
        _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);
    }

    /* if data left over, fill it into buffer */
    if (inputLen) {
        memcpy(ctx->u.b, input, inputLen);
    }
}

#endif /* USE_HW_SHA2 */

Coverage Report

Created: 2026-05-19 06:33

Line	Count	Source
1		/* This Source Code Form is subject to the terms of the Mozilla Public
2		* License, v. 2.0. If a copy of the MPL was not distributed with this
3		* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5		#ifdef USE_HW_SHA2
6
7		#include <immintrin.h>
8
9		#ifdef FREEBL_NO_DEPEND
10		#include "stubs.h"
11		#endif
12
13		#include "blapii.h"
14		#include "prcpucfg.h"
15		#include "prtypes.h" /* for PRUintXX */
16		#include "prlong.h"
17		#include "blapi.h"
18		#include "sha256.h"
19
20		/* SHA-256 constants, K256. */
21		pre_align static const PRUint32 K256[64] post_align = {
22		0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
23		0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
24		0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
25		0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
26		0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
27		0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
28		0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
29		0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
30		0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
31		0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
32		0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
33		0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
34		0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
35		0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
36		0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
37		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
38		};
39
40		#define ROUND(n, a, b, c, d) \
41	351M	{ \
42	351M	__m128i t = _mm_add_epi32(a, k##n); \
43	351M	w1 = _mm_sha256rnds2_epu32(w1, w0, t); \
44	351M	t = _mm_shuffle_epi32(t, 0x0e); \
45	351M	w0 = _mm_sha256rnds2_epu32(w0, w1, t); \
46	351M	if (n < 12) { \
47	263M	a = _mm_sha256msg1_epu32(a, b); \
48	263M	a = _mm_add_epi32(a, _mm_alignr_epi8(d, c, 4)); \
49	263M	a = _mm_sha256msg2_epu32(a, d); \
50	263M	} \
51	351M	}
52
53		void
54		SHA256_Compress_Native(SHA256Context *ctx)
55	9.45M	{
56	9.45M	__m128i h0, h1, th;
57	9.45M	__m128i a, b, c, d;
58	9.45M	__m128i w0, w1;
59	9.45M	const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
60
61	9.45M	const __m128i K = (__m128i )K256;
62	9.45M	const __m128i k0 = _mm_load_si128(K);
63	9.45M	const __m128i k1 = _mm_load_si128(K + 1);
64	9.45M	const __m128i k2 = _mm_load_si128(K + 2);
65	9.45M	const __m128i k3 = _mm_load_si128(K + 3);
66	9.45M	const __m128i k4 = _mm_load_si128(K + 4);
67	9.45M	const __m128i k5 = _mm_load_si128(K + 5);
68	9.45M	const __m128i k6 = _mm_load_si128(K + 6);
69	9.45M	const __m128i k7 = _mm_load_si128(K + 7);
70	9.45M	const __m128i k8 = _mm_load_si128(K + 8);
71	9.45M	const __m128i k9 = _mm_load_si128(K + 9);
72	9.45M	const __m128i k10 = _mm_load_si128(K + 10);
73	9.45M	const __m128i k11 = _mm_load_si128(K + 11);
74	9.45M	const __m128i k12 = _mm_load_si128(K + 12);
75	9.45M	const __m128i k13 = _mm_load_si128(K + 13);
76	9.45M	const __m128i k14 = _mm_load_si128(K + 14);
77	9.45M	const __m128i k15 = _mm_load_si128(K + 15);
78
79	9.45M	const __m128i input = (__m128i )ctx->u.b;
80
81	9.45M	h0 = _mm_loadu_si128((__m128i *)(ctx->h));
82	9.45M	h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));
83
84		/* H0123:4567 -> H01256:H2367 */
85	9.45M	th = _mm_shuffle_epi32(h0, 0xb1);
86	9.45M	h1 = _mm_shuffle_epi32(h1, 0x1b);
87	9.45M	h0 = _mm_alignr_epi8(th, h1, 8);
88	9.45M	h1 = _mm_blend_epi16(h1, th, 0xf0);
89
90	9.45M	a = _mm_shuffle_epi8(_mm_loadu_si128(input), shuffle);
91	9.45M	b = _mm_shuffle_epi8(_mm_loadu_si128(input + 1), shuffle);
92	9.45M	c = _mm_shuffle_epi8(_mm_loadu_si128(input + 2), shuffle);
93	9.45M	d = _mm_shuffle_epi8(_mm_loadu_si128(input + 3), shuffle);
94
95	9.45M	w0 = h0;
96	9.45M	w1 = h1;
97
98	9.45M	ROUND(0, a, b, c, d)
99	9.45M	ROUND(1, b, c, d, a)
100	9.45M	ROUND(2, c, d, a, b)
101	9.45M	ROUND(3, d, a, b, c)
102	9.45M	ROUND(4, a, b, c, d)
103	9.45M	ROUND(5, b, c, d, a)
104	9.45M	ROUND(6, c, d, a, b)
105	9.45M	ROUND(7, d, a, b, c)
106	9.45M	ROUND(8, a, b, c, d)
107	9.45M	ROUND(9, b, c, d, a)
108	9.45M	ROUND(10, c, d, a, b)
109	9.45M	ROUND(11, d, a, b, c)
110	9.45M	ROUND(12, a, b, c, d)
111	9.45M	ROUND(13, b, c, d, a)
112	9.45M	ROUND(14, c, d, a, b)
113	9.45M	ROUND(15, d, a, b, c)
114
115	9.45M	h0 = _mm_add_epi32(h0, w0);
116	9.45M	h1 = _mm_add_epi32(h1, w1);
117
118		/* H0145:2367 -> H0123:4567 */
119	9.45M	th = _mm_shuffle_epi32(h0, 0x1b);
120	9.45M	h1 = _mm_shuffle_epi32(h1, 0xb1);
121	9.45M	h0 = _mm_blend_epi16(th, h1, 0xf0);
122	9.45M	h1 = _mm_alignr_epi8(h1, th, 8);
123
124	9.45M	_mm_storeu_si128((__m128i *)ctx->h, h0);
125	9.45M	_mm_storeu_si128((__m128i *)(ctx->h + 4), h1);
126	9.45M	}
127
128		void
129		SHA256_Update_Native(SHA256Context ctx, const unsigned char input,
130		unsigned int inputLen)
131	28.4M	{
132	28.4M	__m128i h0, h1, th;
133	28.4M	const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
134
135	28.4M	const __m128i K = (__m128i )K256;
136	28.4M	const __m128i k0 = _mm_load_si128(K);
137	28.4M	const __m128i k1 = _mm_load_si128(K + 1);
138	28.4M	const __m128i k2 = _mm_load_si128(K + 2);
139	28.4M	const __m128i k3 = _mm_load_si128(K + 3);
140	28.4M	const __m128i k4 = _mm_load_si128(K + 4);
141	28.4M	const __m128i k5 = _mm_load_si128(K + 5);
142	28.4M	const __m128i k6 = _mm_load_si128(K + 6);
143	28.4M	const __m128i k7 = _mm_load_si128(K + 7);
144	28.4M	const __m128i k8 = _mm_load_si128(K + 8);
145	28.4M	const __m128i k9 = _mm_load_si128(K + 9);
146	28.4M	const __m128i k10 = _mm_load_si128(K + 10);
147	28.4M	const __m128i k11 = _mm_load_si128(K + 11);
148	28.4M	const __m128i k12 = _mm_load_si128(K + 12);
149	28.4M	const __m128i k13 = _mm_load_si128(K + 13);
150	28.4M	const __m128i k14 = _mm_load_si128(K + 14);
151	28.4M	const __m128i k15 = _mm_load_si128(K + 15);
152
153	28.4M	unsigned int inBuf = ctx->sizeLo & 0x3f;
154	28.4M	if (!inputLen) {
155	90.4k	return;
156	90.4k	}
157
158		/* Add inputLen into the count of bytes processed, before processing */
159	28.3M	if ((ctx->sizeLo += inputLen) < inputLen) {
160	0	ctx->sizeHi++;
161	0	}
162
163		/* if data already in buffer, attempt to fill rest of buffer */
164	28.3M	if (inBuf) {
165	11.1M	unsigned int todo = SHA256_BLOCK_LENGTH - inBuf;
166	11.1M	if (inputLen < todo) {
167	10.4M	todo = inputLen;
168	10.4M	}
169	11.1M	memcpy(ctx->u.b + inBuf, input, todo);
170	11.1M	input += todo;
171	11.1M	inputLen -= todo;
172	11.1M	if (inBuf + todo == SHA256_BLOCK_LENGTH) {
173	752k	SHA256_Compress_Native(ctx);
174	752k	}
175	11.1M	}
176
177	28.3M	h0 = _mm_loadu_si128((__m128i *)(ctx->h));
178	28.3M	h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));
179
180		/* H0123:4567 -> H01256:H2367 */
181	28.3M	th = _mm_shuffle_epi32(h0, 0xb1);
182	28.3M	h1 = _mm_shuffle_epi32(h1, 0x1b);
183	28.3M	h0 = _mm_alignr_epi8(th, h1, 8);
184	28.3M	h1 = _mm_blend_epi16(h1, th, 0xf0);
185
186		/* if enough data to fill one or more whole buffers, process them. */
187	40.8M	while (inputLen >= SHA256_BLOCK_LENGTH) {
188	12.4M	__m128i a, b, c, d;
189	12.4M	__m128i w0, w1;
190	12.4M	a = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)input), shuffle);
191	12.4M	b = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 16)), shuffle);
192	12.4M	c = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 32)), shuffle);
193	12.4M	d = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 48)), shuffle);
194	12.4M	input += SHA256_BLOCK_LENGTH;
195	12.4M	inputLen -= SHA256_BLOCK_LENGTH;
196
197	12.4M	w0 = h0;
198	12.4M	w1 = h1;
199
200	12.4M	ROUND(0, a, b, c, d)
201	12.4M	ROUND(1, b, c, d, a)
202	12.4M	ROUND(2, c, d, a, b)
203	12.4M	ROUND(3, d, a, b, c)
204	12.4M	ROUND(4, a, b, c, d)
205	12.4M	ROUND(5, b, c, d, a)
206	12.4M	ROUND(6, c, d, a, b)
207	12.4M	ROUND(7, d, a, b, c)
208	12.4M	ROUND(8, a, b, c, d)
209	12.4M	ROUND(9, b, c, d, a)
210	12.4M	ROUND(10, c, d, a, b)
211	12.4M	ROUND(11, d, a, b, c)
212	12.4M	ROUND(12, a, b, c, d)
213	12.4M	ROUND(13, b, c, d, a)
214	12.4M	ROUND(14, c, d, a, b)
215	12.4M	ROUND(15, d, a, b, c)
216
217	12.4M	h0 = _mm_add_epi32(h0, w0);
218	12.4M	h1 = _mm_add_epi32(h1, w1);
219	12.4M	}
220
221		// H01234567 -> H01256 and H2367
222	28.3M	th = _mm_shuffle_epi32(h0, 0x1b);
223	28.3M	h1 = _mm_shuffle_epi32(h1, 0xb1);
224	28.3M	h0 = _mm_blend_epi16(th, h1, 0xf0);
225	28.3M	h1 = _mm_alignr_epi8(h1, th, 8);
226
227	28.3M	_mm_storeu_si128((__m128i *)ctx->h, h0);
228	28.3M	_mm_storeu_si128((__m128i *)(ctx->h + 4), h1);
229
230		/* if data left over, fill it into buffer */
231	28.3M	if (inputLen) {
232	9.59M	memcpy(ctx->u.b, input, inputLen);
233	9.59M	}
234	28.3M	}
235
236		#endif /* USE_HW_SHA2 */