Coverage Report

Created: 2025-06-13 06:43

/src/php-src/ext/hash/hash_sha_ni.c
Line
Count
Source
1
/*-
2
 * Copyright 2018 Tarsnap Backup Inc.
3
 * All rights reserved.
4
 *
5
 * Redistribution and use in source and binary forms, with or without
6
 * modification, are permitted provided that the following conditions
7
 * are met:
8
 * 1. Redistributions of source code must retain the above copyright
9
 *    notice, this list of conditions and the following disclaimer.
10
 * 2. Redistributions in binary form must reproduce the above copyright
11
 *    notice, this list of conditions and the following disclaimer in the
12
 *    documentation and/or other materials provided with the distribution.
13
 *
14
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24
 * SUCH DAMAGE.
25
 */
26
27
#include "php_hash.h"
28
#include "php_hash_sha.h"
29
30
#if defined(PHP_HASH_INTRIN_SHA_NATIVE) || defined(PHP_HASH_INTRIN_SHA_RESOLVER)
31
32
# include <immintrin.h>
33
34
# if defined(PHP_HASH_INTRIN_SHA_RESOLVER) && defined(HAVE_FUNC_ATTRIBUTE_TARGET)
/*
 * Forward declarations carrying per-function `target` attributes: the byte
 * swap helper needs SSSE3, the transform needs SSSE3 and SHA.
 * NOTE(review): presumably this lets the resolver build compile this file
 * without enabling those instruction sets globally -- confirm against the
 * build configuration.
 */
static __m128i be32dec_128(const uint8_t * src)  __attribute__((target("ssse3")));
void SHA256_Transform_shani(uint32_t state[PHP_STATIC_RESTRICT 8], const uint8_t block[PHP_STATIC_RESTRICT 64])  __attribute__((target("ssse3,sha")));
# endif
38
39
/* Original implementation from libcperciva follows.
40
 *
41
 * Modified to use `PHP_STATIC_RESTRICT` for MSVC compatibility.
42
 */
43
44
/**
45
 * This code uses intrinsics from the following feature sets:
46
 * SHANI: _mm_sha256msg1_epu32, _mm_sha256msg2_epu32, _mm_sha256rnds2_epu32
47
 * SSSE3: _mm_shuffle_epi8, _mm_alignr_epi8
48
 * SSE2: Everything else
49
 *
50
 * The SSSE3 intrinsics could be avoided at a slight cost by using a few SSE2
51
 * instructions in their place; we have not done this since to our knowledge
52
 * there are presently no CPUs which support the SHANI instruction set but do
53
 * not support SSSE3.
54
 */
55
56
/**
 * be32dec_128(src):
 * Load 16 bytes from ${src} and return them as a vector of four 32-bit
 * words, each decoded from big-endian byte order.  The load is unaligned, so
 * ${src} has no alignment requirement.  Uses the SSSE3 byte shuffle.
 */
static __m128i
be32dec_128(const uint8_t * src)
{
  /* Shuffle mask which reverses the four bytes inside every 32-bit lane. */
  const __m128i SHUF = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11,
      4, 5, 6, 7, 0, 1, 2, 3);
  __m128i x;

  /* Load four 32-bit words. */
  x = _mm_loadu_si128((const __m128i *)src);

  /* Reverse the order of the bytes in each word. */
  return (_mm_shuffle_epi8(x, SHUF));
}
70
71
/*
 * Convert an unsigned 32-bit immediate into a signed value.
 *
 * ${a} must be a plain integer literal, because it is pasted into UINT32_C()
 * and INT32_C().  Values of 0x80000000 and above are mapped to the negative
 * number with the same bit pattern using arithmetic that stays inside the
 * range of int32_t, rather than relying on an out-of-range conversion.  The
 * whole expansion is parenthesized so that the macro can be used as an
 * operand in a larger expression.
 */
#define I32(a) (((UINT32_C(a) >= UINT32_C(0x80000000)) ?      \
    -(int32_t)(UINT32_C(0xffffffff) - UINT32_C(a)) - 1 : (int32_t)INT32_C(a)))
74
75
/*
 * Load four unsigned 32-bit immediates into a vector register.  Each literal
 * is converted with I32() and handed to _mm_set_epi32() in the order given,
 * so ${a} is the first (most significant) argument and ${d} the last.
 */
#define IMM4(a, b, c, d) _mm_set_epi32(I32(a), I32(b), I32(c), I32(d))
77
78
/*
 * Run four rounds of SHA256.
 *
 * ${S} is a two-element array of __m128i working state which is updated in
 * place, ${W} is a vector of four message schedule words, and ${K0}..${K3}
 * are the matching round constants written as integer literals (they are
 * pasted into IMM4() and so cannot be arbitrary expressions).
 */
#define RND4(S, W, K0, K1, K2, K3) do { \
  __m128i M; \
  \
  /* Add the next four words of message schedule and round constants. */ \
  M = _mm_add_epi32((W), IMM4(K3, K2, K1, K0)); \
  \
  /* Perform two rounds of SHA256, using the low two words in M. */ \
  (S)[1] = _mm_sha256rnds2_epu32((S)[1], (S)[0], M); \
  \
  /* Shift the two words of M down and perform the next two rounds. */ \
  M = _mm_srli_si128(M, 8); \
  (S)[0] = _mm_sha256rnds2_epu32((S)[0], (S)[1], M); \
} while (0)
92
93
/*
 * Compute the ith set of four words of message schedule.
 *
 * ${W} is a four-element array of __m128i used as a ring: slot (i % 4) is
 * overwritten with the new words, and the other three slots are read as the
 * preceding twelve words of the schedule.  Uses the SHANI message schedule
 * instructions plus the SSSE3 _mm_alignr_epi8.
 */
#define MSG4(W, i) do { \
  (W)[((i) + 0) % 4] = _mm_sha256msg1_epu32((W)[((i) + 0) % 4], \
      (W)[((i) + 1) % 4]); \
  (W)[((i) + 0) % 4] = _mm_add_epi32((W)[((i) + 0) % 4], \
      _mm_alignr_epi8((W)[((i) + 3) % 4], (W)[((i) + 2) % 4], 4)); \
  (W)[((i) + 0) % 4] = _mm_sha256msg2_epu32((W)[((i) + 0) % 4], \
      (W)[((i) + 3) % 4]); \
} while (0)
100
101
/*
 * Perform 4 rounds of SHA256 and generate more message schedule if needed.
 *
 * ${i} is the index of the four-round group.  The rounds consume slot
 * (i % 4) of ${W}; for the first twelve groups (i < 12) that slot is then
 * refilled with the schedule words for group i + 4.  ${K0}..${K3} must be
 * integer literals, as required by RND4().
 */
#define RNDMSG(S, W, i, K0, K1, K2, K3) do { \
  RND4(S, (W)[(i) % 4], K0, K1, K2, K3); \
  if ((i) < 12) { \
    MSG4(W, (i) + 4); \
  } \
} while (0)
107
108
/**
109
 * SHA256_Transform_shani(state, block):
110
 * Compute the SHA256 block compression function, transforming ${state} using
111
 * the data in ${block}.  This implementation uses x86 SHANI and SSSE3
112
 * instructions, and should only be used if CPUSUPPORT_X86_SHANI and _SSSE3
113
 * are defined and cpusupport_x86_shani() and _ssse3() return nonzero.
114
 */
115
void
116
SHA256_Transform_shani(uint32_t state[PHP_STATIC_RESTRICT 8],
117
    const uint8_t block[PHP_STATIC_RESTRICT 64])
118
43.2k
{
119
43.2k
  __m128i S3210, S7654;
120
43.2k
  __m128i S0123, S4567;
121
43.2k
  __m128i S0145, S2367;
122
43.2k
  __m128i W[4];
123
43.2k
  __m128i S[2];
124
125
  /* Load state. */
126
43.2k
  S3210 = _mm_loadu_si128((const __m128i *)&state[0]);
127
43.2k
  S7654 = _mm_loadu_si128((const __m128i *)&state[4]);
128
129
  /* Shuffle the 8 32-bit values into the order we need them. */
130
43.2k
  S0123 = _mm_shuffle_epi32(S3210, 0x1B);
131
43.2k
  S4567 = _mm_shuffle_epi32(S7654, 0x1B);
132
43.2k
  S0145 = _mm_unpackhi_epi64(S4567, S0123);
133
43.2k
  S2367 = _mm_unpacklo_epi64(S4567, S0123);
134
135
  /* Load input block; this is the start of the message schedule. */
136
43.2k
  W[0] = be32dec_128(&block[0]);
137
43.2k
  W[1] = be32dec_128(&block[16]);
138
43.2k
  W[2] = be32dec_128(&block[32]);
139
43.2k
  W[3] = be32dec_128(&block[48]);
140
141
  /* Initialize working variables. */
142
43.2k
  S[0] = S0145;
143
43.2k
  S[1] = S2367;
144
145
  /* Perform 64 rounds, 4 at a time. */
146
43.2k
  RNDMSG(S, W, 0, 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5);
147
43.2k
  RNDMSG(S, W, 1, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5);
148
43.2k
  RNDMSG(S, W, 2, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3);
149
43.2k
  RNDMSG(S, W, 3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174);
150
43.2k
  RNDMSG(S, W, 4, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc);
151
43.2k
  RNDMSG(S, W, 5, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da);
152
43.2k
  RNDMSG(S, W, 6, 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7);
153
43.2k
  RNDMSG(S, W, 7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967);
154
43.2k
  RNDMSG(S, W, 8, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13);
155
43.2k
  RNDMSG(S, W, 9, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85);
156
43.2k
  RNDMSG(S, W, 10, 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3);
157
43.2k
  RNDMSG(S, W, 11, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070);
158
43.2k
  RNDMSG(S, W, 12, 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5);
159
43.2k
  RNDMSG(S, W, 13, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3);
160
43.2k
  RNDMSG(S, W, 14, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208);
161
43.2k
  RNDMSG(S, W, 15, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2);
162
163
  /* Mix local working variables into global state. */
164
43.2k
  S0145 = _mm_add_epi32(S0145, S[0]);
165
43.2k
  S2367 = _mm_add_epi32(S2367, S[1]);
166
167
  /* Shuffle state back to the original word order and store. */
168
43.2k
  S0123 = _mm_unpackhi_epi64(S2367, S0145);
169
43.2k
  S4567 = _mm_unpacklo_epi64(S2367, S0145);
170
43.2k
  S3210 = _mm_shuffle_epi32(S0123, 0x1B);
171
43.2k
  S7654 = _mm_shuffle_epi32(S4567, 0x1B);
172
43.2k
  _mm_storeu_si128((__m128i *)&state[0], S3210);
173
43.2k
  _mm_storeu_si128((__m128i *)&state[4], S7654);
174
43.2k
}
175
176
#endif