Line data Source code
1 :
2 : // Source originally from https://github.com/BLAKE3-team/BLAKE3
3 : // From commit: c0ea395cf91d242f078c23d5f8d87eb9dd5f7b78
4 :
5 : #include "fd_blake3_private.h"
6 : #include "../../util/simd/fd_avx512.h"
7 : #include "../../util/simd/fd_avx.h"
8 :
/* round_fn16 applies one full round of the BLAKE3 compression function
   to 16 independent hash states simultaneously.  The states are held
   in transposed ("structure of arrays") AVX-512 form: v[0..15] is the
   4x4 BLAKE3 state matrix with one wwu_t (16 x 32-bit lanes) per state
   word, so lane i of every vector belongs to message i.  m[0..15] are
   the 16 message words, transposed the same way.  r in [0,7) selects
   this round's message-word permutation via FD_BLAKE3_MSG_SCHEDULE.

   The first half below is the "column" step (quarter-rounds on
   (v0,v4,v8,vc), (v1,v5,v9,vd), (v2,v6,va,ve), (v3,v7,vb,vf)); the
   second half is the "diagonal" step.  Each quarter-round is unrolled
   four-wide so the adds/xors/rotates of independent columns interleave
   for instruction-level parallelism.  Rotation amounts 16/12/8/7 are
   the BLAKE3 constants. */
static inline __attribute__((always_inline)) void
round_fn16( wwu_t v[16],
            wwu_t m[16],
            ulong r ) {
  /* Column step, first half of G: a += m[even]; a += b; d ^= a;
     d >>>= 16; c += d; b ^= c; b >>>= 12. */
  v[0x0] = wwu_add(v[0x0], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][0]]);
  v[0x1] = wwu_add(v[0x1], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][2]]);
  v[0x2] = wwu_add(v[0x2], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][4]]);
  v[0x3] = wwu_add(v[0x3], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][6]]);
  v[0x0] = wwu_add(v[0x0], v[0x4]);
  v[0x1] = wwu_add(v[0x1], v[0x5]);
  v[0x2] = wwu_add(v[0x2], v[0x6]);
  v[0x3] = wwu_add(v[0x3], v[0x7]);
  v[0xc] = wwu_xor(v[0xc], v[0x0]);
  v[0xd] = wwu_xor(v[0xd], v[0x1]);
  v[0xe] = wwu_xor(v[0xe], v[0x2]);
  v[0xf] = wwu_xor(v[0xf], v[0x3]);
  v[0xc] = wwu_ror(v[0xc], 16);
  v[0xd] = wwu_ror(v[0xd], 16);
  v[0xe] = wwu_ror(v[0xe], 16);
  v[0xf] = wwu_ror(v[0xf], 16);
  v[0x8] = wwu_add(v[0x8], v[0xc]);
  v[0x9] = wwu_add(v[0x9], v[0xd]);
  v[0xa] = wwu_add(v[0xa], v[0xe]);
  v[0xb] = wwu_add(v[0xb], v[0xf]);
  v[0x4] = wwu_xor(v[0x4], v[0x8]);
  v[0x5] = wwu_xor(v[0x5], v[0x9]);
  v[0x6] = wwu_xor(v[0x6], v[0xa]);
  v[0x7] = wwu_xor(v[0x7], v[0xb]);
  v[0x4] = wwu_ror(v[0x4], 12);
  v[0x5] = wwu_ror(v[0x5], 12);
  v[0x6] = wwu_ror(v[0x6], 12);
  v[0x7] = wwu_ror(v[0x7], 12);
  /* Column step, second half of G: a += m[odd]; a += b; d ^= a;
     d >>>= 8; c += d; b ^= c; b >>>= 7. */
  v[0x0] = wwu_add(v[0x0], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][1]]);
  v[0x1] = wwu_add(v[0x1], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][3]]);
  v[0x2] = wwu_add(v[0x2], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][5]]);
  v[0x3] = wwu_add(v[0x3], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][7]]);
  v[0x0] = wwu_add(v[0x0], v[0x4]);
  v[0x1] = wwu_add(v[0x1], v[0x5]);
  v[0x2] = wwu_add(v[0x2], v[0x6]);
  v[0x3] = wwu_add(v[0x3], v[0x7]);
  v[0xc] = wwu_xor(v[0xc], v[0x0]);
  v[0xd] = wwu_xor(v[0xd], v[0x1]);
  v[0xe] = wwu_xor(v[0xe], v[0x2]);
  v[0xf] = wwu_xor(v[0xf], v[0x3]);
  v[0xc] = wwu_ror(v[0xc], 8);
  v[0xd] = wwu_ror(v[0xd], 8);
  v[0xe] = wwu_ror(v[0xe], 8);
  v[0xf] = wwu_ror(v[0xf], 8);
  v[0x8] = wwu_add(v[0x8], v[0xc]);
  v[0x9] = wwu_add(v[0x9], v[0xd]);
  v[0xa] = wwu_add(v[0xa], v[0xe]);
  v[0xb] = wwu_add(v[0xb], v[0xf]);
  v[0x4] = wwu_xor(v[0x4], v[0x8]);
  v[0x5] = wwu_xor(v[0x5], v[0x9]);
  v[0x6] = wwu_xor(v[0x6], v[0xa]);
  v[0x7] = wwu_xor(v[0x7], v[0xb]);
  v[0x4] = wwu_ror(v[0x4], 7);
  v[0x5] = wwu_ror(v[0x5], 7);
  v[0x6] = wwu_ror(v[0x6], 7);
  v[0x7] = wwu_ror(v[0x7], 7);

  /* Diagonal step: same two G half-rounds applied to the rotated
     diagonals (v0,v5,va,vf), (v1,v6,vb,vc), (v2,v7,v8,vd),
     (v3,v4,v9,ve), consuming message words 8..15 of this round's
     schedule. */
  v[0x0] = wwu_add(v[0x0], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][8]]);
  v[0x1] = wwu_add(v[0x1], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][10]]);
  v[0x2] = wwu_add(v[0x2], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][12]]);
  v[0x3] = wwu_add(v[0x3], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][14]]);
  v[0x0] = wwu_add(v[0x0], v[0x5]);
  v[0x1] = wwu_add(v[0x1], v[0x6]);
  v[0x2] = wwu_add(v[0x2], v[0x7]);
  v[0x3] = wwu_add(v[0x3], v[0x4]);
  v[0xf] = wwu_xor(v[0xf], v[0x0]);
  v[0xc] = wwu_xor(v[0xc], v[0x1]);
  v[0xd] = wwu_xor(v[0xd], v[0x2]);
  v[0xe] = wwu_xor(v[0xe], v[0x3]);
  v[0xf] = wwu_ror(v[0xf], 16);
  v[0xc] = wwu_ror(v[0xc], 16);
  v[0xd] = wwu_ror(v[0xd], 16);
  v[0xe] = wwu_ror(v[0xe], 16);
  v[0xa] = wwu_add(v[0xa], v[0xf]);
  v[0xb] = wwu_add(v[0xb], v[0xc]);
  v[0x8] = wwu_add(v[0x8], v[0xd]);
  v[0x9] = wwu_add(v[0x9], v[0xe]);
  v[0x5] = wwu_xor(v[0x5], v[0xa]);
  v[0x6] = wwu_xor(v[0x6], v[0xb]);
  v[0x7] = wwu_xor(v[0x7], v[0x8]);
  v[0x4] = wwu_xor(v[0x4], v[0x9]);
  v[0x5] = wwu_ror(v[0x5], 12);
  v[0x6] = wwu_ror(v[0x6], 12);
  v[0x7] = wwu_ror(v[0x7], 12);
  v[0x4] = wwu_ror(v[0x4], 12);
  v[0x0] = wwu_add(v[0x0], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][9]]);
  v[0x1] = wwu_add(v[0x1], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][11]]);
  v[0x2] = wwu_add(v[0x2], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][13]]);
  v[0x3] = wwu_add(v[0x3], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][15]]);
  v[0x0] = wwu_add(v[0x0], v[0x5]);
  v[0x1] = wwu_add(v[0x1], v[0x6]);
  v[0x2] = wwu_add(v[0x2], v[0x7]);
  v[0x3] = wwu_add(v[0x3], v[0x4]);
  v[0xf] = wwu_xor(v[0xf], v[0x0]);
  v[0xc] = wwu_xor(v[0xc], v[0x1]);
  v[0xd] = wwu_xor(v[0xd], v[0x2]);
  v[0xe] = wwu_xor(v[0xe], v[0x3]);
  v[0xf] = wwu_ror(v[0xf], 8);
  v[0xc] = wwu_ror(v[0xc], 8);
  v[0xd] = wwu_ror(v[0xd], 8);
  v[0xe] = wwu_ror(v[0xe], 8);
  v[0xa] = wwu_add(v[0xa], v[0xf]);
  v[0xb] = wwu_add(v[0xb], v[0xc]);
  v[0x8] = wwu_add(v[0x8], v[0xd]);
  v[0x9] = wwu_add(v[0x9], v[0xe]);
  v[0x5] = wwu_xor(v[0x5], v[0xa]);
  v[0x6] = wwu_xor(v[0x6], v[0xb]);
  v[0x7] = wwu_xor(v[0x7], v[0x8]);
  v[0x4] = wwu_xor(v[0x4], v[0x9]);
  v[0x5] = wwu_ror(v[0x5], 7);
  v[0x6] = wwu_ror(v[0x6], 7);
  v[0x7] = wwu_ror(v[0x7], 7);
  v[0x4] = wwu_ror(v[0x4], 7);
}
127 :
/* fd_blake3_avx512_compress16 runs the BLAKE3 compression function over
   up to 16 independent messages in parallel using AVX-512.

   batch_cnt    number of messages, in [1,16] (must be exactly 16 in
                LtHash mode, enforced below)
   _batch_data  array of batch_cnt pointers (stored as ulongs) to the
                message bytes
   batch_sz     per-message byte sizes
   ctr_vec      per-message 64-bit block counters (split into 32-bit
                lo/hi vector halves below)
   batch_flags  per-message BLAKE3 domain flags (CHUNK_START/END, ROOT,
                PARENT, ...)
   _batch_hash  array of batch_cnt output pointers; each receives out_sz
                bytes of digest
   lthash       if non-NULL, enables LtHash mode: after compression, 32
                XOF output blocks (64 bytes each) are produced per lane
                and the 16 lanes are summed with 16-bit lane-wise adds
                into this buffer (written with a 32-ushort stride per
                block).  NOTE(review): exact LtHash accumulation contract
                is defined by the caller; confirm against fd_blake3 docs.
   out_sz       32 (standard digest) or 64 (full compression output,
                used for post-compress XOF); anything else is fatal
   batch_cv     if non-NULL, array of 16 pointers to 32-byte chaining
                values overriding the standard IV */
void
fd_blake3_avx512_compress16( ulong                   batch_cnt,
                             void const *  restrict  _batch_data,
                             uint const *  restrict  batch_sz,
                             ulong const * restrict  ctr_vec,
                             uint const *  restrict  batch_flags,
                             void * const * restrict _batch_hash,
                             ushort *      restrict  lthash,
                             uint                    out_sz,
                             void const *  restrict  batch_cv ) {
  if( FD_UNLIKELY( lthash && batch_cnt!=16 ) ) FD_LOG_ERR(( "Lane masking not supported for fd_blake3_avx512_compress16 in LtHash mode" ));
  if( FD_UNLIKELY( batch_cnt==0 || batch_cnt>16 ) ) FD_LOG_ERR(( "Invalid batch_cnt %lu", batch_cnt ));

  /* We can only process input blocks of 64 bytes, but message data size
     is not necessarily a multiple of 64.  We compute the tail block of
     each message here.  We then process complete blocks of the original
     message in place, switching to these out-of-place tail blocks in
     the same pass toward the end. */

  ulong const * batch_data = (ulong const *)_batch_data;

  /* Single-message batches are cheaper on the SSE path; delegate. */
  if( FD_UNLIKELY( batch_cnt==1 ) ) {
    fd_blake3_sse_compress1( (uchar *)(_batch_hash[0]),
                             (uchar const *)(batch_data[0]),
                             batch_sz[0],
                             ctr_vec[0],
                             batch_flags[0],
                             NULL,
                             NULL );
    return;
  }

#if FD_BLAKE3_TRACING
  /* This log_line buffer is oversized by a fair bit (due to all the
     NULL terminators) but that's fine */
  char log_line[
      sizeof( "fd_blake3_avx512_compress16" )+
      sizeof( "(batch_cnt=" )+21+
      sizeof( ",sz=[" )+(16*11)+sizeof( "]" )+
      sizeof( ",counter=[" )+(16*21)+sizeof( "]" )+
      sizeof( ",flags=[" )+(16* 2)+sizeof( "]" )+
      sizeof( ",custom_cv" )+
      sizeof( ",lthash" )+
      sizeof( ")" ) ];

  char * p = fd_cstr_init( log_line );
  p = fd_cstr_append_text( p, "fd_blake3_avx512_compress16(batch_cnt=", 38UL );
  p = fd_cstr_append_ulong_as_text( p, 0, 0, batch_cnt, fd_uchar_base10_dig_cnt( (uchar)batch_cnt ) );
  p = fd_cstr_append_text( p, ",sz=[", 5UL );
  for( ulong i=0UL; i<batch_cnt; i++ ) {
    p = fd_cstr_append_uint_as_text( p, ' ', 0, batch_sz[ i ], fd_uint_base10_dig_cnt( batch_sz[ i ] ) );
    if( i+1<batch_cnt ) p = fd_cstr_append_char( p, ',' );
  }
  p = fd_cstr_append_text( p, "],counter=[", 11UL );
  for( ulong i=0UL; i<batch_cnt; i++ ) {
    p = fd_cstr_append_ulong_as_text( p, ' ', 0, ctr_vec[ i ], fd_ulong_base10_dig_cnt( ctr_vec[ i ] ) );
    if( i+1<batch_cnt ) p = fd_cstr_append_char( p, ',' );
  }
  p = fd_cstr_append_text( p, "],flags=[", 9UL );
  for( ulong i=0UL; i<batch_cnt; i++ ) {
    static char const hex_lut[ 16 ] = {
      '0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'
    };
    p = fd_cstr_append_char( p, hex_lut[ batch_flags[ i ]&0xf ] );
    if( i+1<batch_cnt ) p = fd_cstr_append_char( p, ',' );
  }
  p = fd_cstr_append_char( p, ']' );
  if( batch_cv ) p = fd_cstr_append_text( p, ",custom_cv", 10UL );
  if( lthash )   p = fd_cstr_append_text( p, ",lthash", 7UL );
  p = fd_cstr_append_char( p, ')' );
  ulong line_len = (ulong)( p-log_line );
  fd_cstr_fini( p );

  FD_BLAKE3_TRACE(( "%.*s", (int)line_len, log_line ));
#endif

  ulong batch_tail_data[ 16 ] __attribute__((aligned(64)));
  ulong batch_tail_rem [ 16 ] __attribute__((aligned(64)));

  uchar scratch[ 16*FD_BLAKE3_BLOCK_SZ ] __attribute__((aligned(128)));
  do {
    ulong scratch_free = (ulong)scratch;

    wwv_t zero = wwv_zero();

    for( ulong batch_idx=0UL; batch_idx<batch_cnt; batch_idx++ ) {

      /* Allocate the tail blocks for this message */

      ulong data = batch_data[ batch_idx ];
      ulong sz   = batch_sz  [ batch_idx ];

      ulong tail_data     = scratch_free;
      ulong tail_data_sz  = sz & (FD_BLAKE3_BLOCK_SZ-1UL);
      ulong tail_data_off = fd_ulong_align_dn( sz, FD_BLAKE3_BLOCK_SZ );

      batch_tail_data[ batch_idx ] = tail_data;
      batch_tail_rem [ batch_idx ] = (!!tail_data_sz) ^ (!sz); /* (hash 1 tail block if 0 sz) */

      scratch_free += FD_BLAKE3_BLOCK_SZ;

      /* Populate the tail blocks.  We first clear the blocks.  Then we
         copy any straggler data bytes into the tail. */

      wwv_st( (ulong *) tail_data, zero );

#     if 1
      /* See fd_sha256_private_batch_avx */
      ulong src = (ulong)data + tail_data_off;
      ulong dst = tail_data;
      ulong rem = tail_data_sz;
      while( rem>=32UL ) { wv_st( (ulong *)dst, wv_ldu( (ulong const *)src ) ); dst += 32UL; src += 32UL; rem -= 32UL; }
      while( rem>= 8UL ) { *(ulong  *)dst = FD_LOAD( ulong,  src ); dst += 8UL; src += 8UL; rem -= 8UL; }
      if   ( rem>= 4UL ) { *(uint   *)dst = FD_LOAD( uint,   src ); dst += 4UL; src += 4UL; rem -= 4UL; }
      if   ( rem>= 2UL ) { *(ushort *)dst = FD_LOAD( ushort, src ); dst += 2UL; src += 2UL; rem -= 2UL; }
      if   ( rem       ) { *(uchar  *)dst = FD_LOAD( uchar,  src ); dst++; }
#     else
      fd_memcpy( (void *)tail_data, (void const *)(data + tail_data_off), tail_data_sz );
#     endif
    }
  } while(0);

  /* Broadcast the standard BLAKE3 IV across all 16 lanes. */

  wwu_t const iv0 = wwu_bcast( FD_BLAKE3_IV[0] );
  wwu_t const iv1 = wwu_bcast( FD_BLAKE3_IV[1] );
  wwu_t const iv2 = wwu_bcast( FD_BLAKE3_IV[2] );
  wwu_t const iv3 = wwu_bcast( FD_BLAKE3_IV[3] );
  wwu_t const iv4 = wwu_bcast( FD_BLAKE3_IV[4] );
  wwu_t const iv5 = wwu_bcast( FD_BLAKE3_IV[5] );
  wwu_t const iv6 = wwu_bcast( FD_BLAKE3_IV[6] );
  wwu_t const iv7 = wwu_bcast( FD_BLAKE3_IV[7] );

  wwu_t h0=iv0; wwu_t h1=iv1; wwu_t h2=iv2; wwu_t h3=iv3;
  wwu_t h4=iv4; wwu_t h5=iv5; wwu_t h6=iv6; wwu_t h7=iv7;
  if( FD_UNLIKELY( batch_cv ) ) {
    /* If the input chaining value is overridden, transpose the input
       to AVX512 representation.  (wwu 16x8 transpose)  FIXME There's
       probably a way to do this using AVX512 instead of AVX. */
    __m256i const ** cv_vec = (__m256i const **)batch_cv;
    wu_t cv_lo[8]; wu_t cv_hi[8];
    cv_lo[ 0 ] = _mm256_loadu_si256( cv_vec[ 0 ] );
    cv_lo[ 1 ] = _mm256_loadu_si256( cv_vec[ 1 ] );
    cv_lo[ 2 ] = _mm256_loadu_si256( cv_vec[ 2 ] );
    cv_lo[ 3 ] = _mm256_loadu_si256( cv_vec[ 3 ] );
    cv_lo[ 4 ] = _mm256_loadu_si256( cv_vec[ 4 ] );
    cv_lo[ 5 ] = _mm256_loadu_si256( cv_vec[ 5 ] );
    cv_lo[ 6 ] = _mm256_loadu_si256( cv_vec[ 6 ] );
    cv_lo[ 7 ] = _mm256_loadu_si256( cv_vec[ 7 ] );
    cv_hi[ 0 ] = _mm256_loadu_si256( cv_vec[ 8 ] );
    cv_hi[ 1 ] = _mm256_loadu_si256( cv_vec[ 9 ] );
    cv_hi[ 2 ] = _mm256_loadu_si256( cv_vec[ 10 ] );
    cv_hi[ 3 ] = _mm256_loadu_si256( cv_vec[ 11 ] );
    cv_hi[ 4 ] = _mm256_loadu_si256( cv_vec[ 12 ] );
    cv_hi[ 5 ] = _mm256_loadu_si256( cv_vec[ 13 ] );
    cv_hi[ 6 ] = _mm256_loadu_si256( cv_vec[ 14 ] );
    cv_hi[ 7 ] = _mm256_loadu_si256( cv_vec[ 15 ] );
    wu_transpose_8x8( cv_lo[0], cv_lo[1], cv_lo[2], cv_lo[3], cv_lo[4], cv_lo[5], cv_lo[6], cv_lo[7],
                      cv_lo[0], cv_lo[1], cv_lo[2], cv_lo[3], cv_lo[4], cv_lo[5], cv_lo[6], cv_lo[7] );
    wu_transpose_8x8( cv_hi[0], cv_hi[1], cv_hi[2], cv_hi[3], cv_hi[4], cv_hi[5], cv_hi[6], cv_hi[7],
                      cv_hi[0], cv_hi[1], cv_hi[2], cv_hi[3], cv_hi[4], cv_hi[5], cv_hi[6], cv_hi[7] );
    /* Stitch the two 8-lane halves into one 16-lane ZMM per state word */
    h0 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 0 ] ), cv_hi[ 0 ], 1 );
    h1 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 1 ] ), cv_hi[ 1 ], 1 );
    h2 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 2 ] ), cv_hi[ 2 ], 1 );
    h3 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 3 ] ), cv_hi[ 3 ], 1 );
    h4 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 4 ] ), cv_hi[ 4 ], 1 );
    h5 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 5 ] ), cv_hi[ 5 ], 1 );
    h6 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 6 ] ), cv_hi[ 6 ], 1 );
    h7 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 7 ] ), cv_hi[ 7 ], 1 );
  }

  /* Split the 64-bit per-lane counters into 32-bit lo/hi vectors
     (the BLAKE3 state holds the counter as two 32-bit words). */

  wwu_t ctr_lo = wwu( ctr_vec[ 0], ctr_vec[ 1], ctr_vec[ 2], ctr_vec[ 3],
                      ctr_vec[ 4], ctr_vec[ 5], ctr_vec[ 6], ctr_vec[ 7],
                      ctr_vec[ 8], ctr_vec[ 9], ctr_vec[10], ctr_vec[11],
                      ctr_vec[12], ctr_vec[13], ctr_vec[14], ctr_vec[15] );
  wwu_t ctr_hi = wwu( ctr_vec[ 0]>>32, ctr_vec[ 1]>>32, ctr_vec[ 2]>>32, ctr_vec[ 3]>>32,
                      ctr_vec[ 4]>>32, ctr_vec[ 5]>>32, ctr_vec[ 6]>>32, ctr_vec[ 7]>>32,
                      ctr_vec[ 8]>>32, ctr_vec[ 9]>>32, ctr_vec[10]>>32, ctr_vec[11]>>32,
                      ctr_vec[12]>>32, ctr_vec[13]>>32, ctr_vec[14]>>32, ctr_vec[15]>>32 );
  wwu_t flags = wwu_ldu( batch_flags );
  wwu_t off   = wwu_zero();           /* per-lane byte offset into its message */
  wwu_t sz    = wwu_ldu( batch_sz );

  wwv_t zero       = wwv_zero();
  wwv_t one        = wwv_one();
  wwu_t wwu_64     = wwu_bcast( FD_BLAKE3_BLOCK_SZ );
  wwv_t wwv_64     = wwv_bcast( FD_BLAKE3_BLOCK_SZ );
  wwv_t W_sentinel = wwv_bcast( (ulong)scratch ); /* safe garbage load target for inactive lanes */
  //wwc_t batch_lane = wc_unpack( (1<<batch_cnt)-1 );

  wwv_t tail_lo = wwv_ld( batch_tail_data );
  wwv_t tail_hi = wwv_ld( batch_tail_data+8 );

  wwv_t tail_rem_lo = wwv_ld( batch_tail_rem );
  wwv_t tail_rem_hi = wwv_ld( batch_tail_rem+8 );

  wwv_t W_lo = wwv_ld( batch_data );
  wwv_t W_hi = wwv_ld( batch_data+8 );

  wwv_t batch_sz_lo = _mm512_cvtepi32_epi64( _mm512_extracti32x8_epi32( sz, 0 ) );
  wwv_t batch_sz_hi = _mm512_cvtepi32_epi64( _mm512_extracti32x8_epi32( sz, 1 ) );

  /* block_rem = number of 64-byte blocks left per lane (full blocks
     plus the out-of-place tail block); lanes past batch_cnt are zeroed
     so they go inactive immediately. */
  wwv_t block_rem_lo = wwv_if( ((1<<batch_cnt)-1) & 0xff,
                               wwv_add( wwv_shr( batch_sz_lo, FD_BLAKE3_BLOCK_LG_SZ ), tail_rem_lo ), zero );
  wwv_t block_rem_hi = wwv_if( ((1<<batch_cnt)-1) >> 8,
                               wwv_add( wwv_shr( batch_sz_hi, FD_BLAKE3_BLOCK_LG_SZ ), tail_rem_hi ), zero );

  /* Upper half of the compression function output.
     Usually thrown away, but kept in the final compression round if
     out_sz==64. */
  wwu_t hu[8] = {0};

  ulong lthash_rem    = lthash ? 32 : 0; /* Number of LtHash (XOF) blocks remaining */
  int   compress_done = 0;
  for(;;) {
    /* Switch lanes that have hit the end of their in-place bulk
       processing to their out-of-place scratch tail regions as
       necessary. */

    W_lo = wwv_if( wwv_eq( block_rem_lo, tail_rem_lo ), tail_lo, W_lo );
    W_hi = wwv_if( wwv_eq( block_rem_hi, tail_rem_hi ), tail_hi, W_hi );

    /* Derive per-block flags and block sizes */

    int block_first = wwu_eq( off, wwu_zero() );
    int block_last  = wwi_le( sz, wwu_add( off, wwu_bcast( FD_BLAKE3_BLOCK_SZ ) ) );

    /* Suppress root flag unless last block */

    wwu_t root_mask   = wwu_if( block_last, wwu_bcast( UINT_MAX ), wwu_bcast( ~FD_BLAKE3_FLAG_ROOT ) );
    wwu_t block_flags = wwu_and( flags, root_mask );

    /* Mask lanes that completed */

    int active_lane_lo;
    int active_lane_hi;
    if( FD_UNLIKELY( lthash ) ) {
      /* Compress until root block */
      wwu_t all_root = wwu_bcast( FD_BLAKE3_FLAG_ROOT );
      int   not_root = wwu_ne( wwu_and( block_flags, all_root ), all_root );
      active_lane_lo = (int)(__mmask8)not_root;
      active_lane_hi = (int)(__mmask8)(not_root>>8);
    } else {
      /* Complete when there is no more input data */
      active_lane_lo = wwv_ne( block_rem_lo, zero );
      active_lane_hi = wwv_ne( block_rem_hi, zero );
    }

    /* Suppress CHUNK_{START,END} flags unless leaf node */

    int   is_parent   = wwu_ne( wwu_and( flags, wwu_bcast( FD_BLAKE3_FLAG_PARENT ) ), wwu_zero() );
    wwu_t chunk_flags = wwu_if( block_last, wwu_bcast( FD_BLAKE3_FLAG_CHUNK_END ), wwu_zero() );
    if( out_sz==32 ) {
      /* Hacky: out_sz==64 is only used for post-compress XOF hashing,
         so use that as a hint when to suppress the 'CHUNK_START' flag. */
      chunk_flags = wwu_or( chunk_flags, wwu_if( block_first, wwu_bcast( FD_BLAKE3_FLAG_CHUNK_START ), wwu_zero() ) );
    }
    wwu_t block_sz = wwu_min( wwu_sub( sz, off ), wwu_64 );
    block_flags = wwu_or( block_flags, wwu_if( is_parent, wwu_zero(), chunk_flags ) );

    /* Check if we are done compressing */

    compress_done |= !(active_lane_lo | active_lane_hi);
    if( FD_UNLIKELY( compress_done ) ) {
      if( FD_UNLIKELY( !lthash_rem ) ) break;
      /* LtHash mode: all lanes re-activate for XOF expansion */
      active_lane_lo = 0xff;
      active_lane_hi = 0xff;
      /* Load the next message block and fall through to XOF expansion */
    }

    /* At this point, we have at least 1 block in this message segment
       pass that has not been processed.  Load the next 64 bytes of
       each unprocessed block.  Inactive lanes (e.g. message segments
       in this pass for which we've already processed all the blocks)
       will load garbage from a sentinel location (and the result of
       the state computations for the inactive lane will be ignored). */

    ulong _W0; ulong _W1; ulong _W2; ulong _W3; ulong _W4; ulong _W5; ulong _W6; ulong _W7;
    ulong _W8; ulong _W9; ulong _Wa; ulong _Wb; ulong _Wc; ulong _Wd; ulong _We; ulong _Wf;
    wwv_unpack( wwv_if( active_lane_lo, W_lo, W_sentinel ), _W0, _W1, _W2, _W3, _W4, _W5, _W6, _W7 );
    wwv_unpack( wwv_if( active_lane_hi, W_hi, W_sentinel ), _W8, _W9, _Wa, _Wb, _Wc, _Wd, _We, _Wf );
    uchar const * W0 = (uchar const *)_W0; uchar const * W1 = (uchar const *)_W1;
    uchar const * W2 = (uchar const *)_W2; uchar const * W3 = (uchar const *)_W3;
    uchar const * W4 = (uchar const *)_W4; uchar const * W5 = (uchar const *)_W5;
    uchar const * W6 = (uchar const *)_W6; uchar const * W7 = (uchar const *)_W7;
    uchar const * W8 = (uchar const *)_W8; uchar const * W9 = (uchar const *)_W9;
    uchar const * Wa = (uchar const *)_Wa; uchar const * Wb = (uchar const *)_Wb;
    uchar const * Wc = (uchar const *)_Wc; uchar const * Wd = (uchar const *)_Wd;
    uchar const * We = (uchar const *)_We; uchar const * Wf = (uchar const *)_Wf;

    wwu_t m[16];
    m[0x0] = wwu_ldu( W0 ); m[0x1] = wwu_ldu( W1 );
    m[0x2] = wwu_ldu( W2 ); m[0x3] = wwu_ldu( W3 );
    m[0x4] = wwu_ldu( W4 ); m[0x5] = wwu_ldu( W5 );
    m[0x6] = wwu_ldu( W6 ); m[0x7] = wwu_ldu( W7 );
    m[0x8] = wwu_ldu( W8 ); m[0x9] = wwu_ldu( W9 );
    m[0xa] = wwu_ldu( Wa ); m[0xb] = wwu_ldu( Wb );
    m[0xc] = wwu_ldu( Wc ); m[0xd] = wwu_ldu( Wd );
    m[0xe] = wwu_ldu( We ); m[0xf] = wwu_ldu( Wf );

    /* Convert the 16 row-major message blocks into lane-per-message
       (transposed) form expected by round_fn16. */
    wwu_transpose_16x16( m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7],
                         m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf],
                         m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7],
                         m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf] );

    /* Compute the BLAKE3 compression function updates.  The LtHash XOF
       loop re-enters here via "goto compress" with the same m but a
       bumped counter. */

compress: (void)0;
    wwu_t v[16] = {
      h0,     h1,     h2,       h3,
      h4,     h5,     h6,       h7,
      iv0,    iv1,    iv2,      iv3,
      ctr_lo, ctr_hi, block_sz, block_flags,
    };

    /* Debug utility */
#define STATE_FMT "[%u] =\n  %08x %08x %08x %08x\n  %08x %08x %08x %08x\n  %08x %08x %08x %08x\n  %08x %08x %08x %08x"
#define STATE_FMT_ARGS(v,i) (uint)i,\
    fd_uint_bswap(wwu_extract(v[0x0],i)),fd_uint_bswap(wwu_extract(v[0x1],i)),fd_uint_bswap(wwu_extract(v[0x2],i)),fd_uint_bswap(wwu_extract(v[0x3],i)),\
    fd_uint_bswap(wwu_extract(v[0x4],i)),fd_uint_bswap(wwu_extract(v[0x5],i)),fd_uint_bswap(wwu_extract(v[0x6],i)),fd_uint_bswap(wwu_extract(v[0x7],i)),\
    fd_uint_bswap(wwu_extract(v[0x8],i)),fd_uint_bswap(wwu_extract(v[0x9],i)),fd_uint_bswap(wwu_extract(v[0xa],i)),fd_uint_bswap(wwu_extract(v[0xb],i)),\
    fd_uint_bswap(wwu_extract(v[0xc],i)),fd_uint_bswap(wwu_extract(v[0xd],i)),fd_uint_bswap(wwu_extract(v[0xe],i)),fd_uint_bswap(wwu_extract(v[0xf],i))

    // FD_LOG_NOTICE(( "pre " STATE_FMT, STATE_FMT_ARGS(v,0) ));
    round_fn16( v, m, 0 );
    round_fn16( v, m, 1 );
    round_fn16( v, m, 2 );
    round_fn16( v, m, 3 );
    round_fn16( v, m, 4 );
    round_fn16( v, m, 5 );
    round_fn16( v, m, 6 );
    // FD_LOG_NOTICE(( "post" STATE_FMT, STATE_FMT_ARGS(v,0) ));

    if( FD_LIKELY( !compress_done ) ) {

      /* Apply the state updates to the active lanes */

      int active_lane = active_lane_lo | (active_lane_hi<<8);
      FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16: compress lanes %02x%02x", active_lane_hi, active_lane_lo ));

      if( FD_UNLIKELY( out_sz==64 ) ) {
        /* FIXME only export in the last iteration */
        /* Second 32 bytes of the compression output: prev CV xor the
           upper state half. */
        hu[0] = wwu_xor_if( active_lane, h0, v[ 8], hu[0] );
        hu[1] = wwu_xor_if( active_lane, h1, v[ 9], hu[1] );
        hu[2] = wwu_xor_if( active_lane, h2, v[10], hu[2] );
        hu[3] = wwu_xor_if( active_lane, h3, v[11], hu[3] );
        hu[4] = wwu_xor_if( active_lane, h4, v[12], hu[4] );
        hu[5] = wwu_xor_if( active_lane, h5, v[13], hu[5] );
        hu[6] = wwu_xor_if( active_lane, h6, v[14], hu[6] );
        hu[7] = wwu_xor_if( active_lane, h7, v[15], hu[7] );
      }
      /* New chaining value: lower state half xor upper state half */
      h0 = wwu_xor_if( active_lane, v[ 0], v[ 8], h0 );
      h1 = wwu_xor_if( active_lane, v[ 1], v[ 9], h1 );
      h2 = wwu_xor_if( active_lane, v[ 2], v[10], h2 );
      h3 = wwu_xor_if( active_lane, v[ 3], v[11], h3 );
      h4 = wwu_xor_if( active_lane, v[ 4], v[12], h4 );
      h5 = wwu_xor_if( active_lane, v[ 5], v[13], h5 );
      h6 = wwu_xor_if( active_lane, v[ 6], v[14], h6 );
      h7 = wwu_xor_if( active_lane, v[ 7], v[15], h7 );

      /* Advance to the next message segment blocks.  In pseudo code,
         the below is:

           W += 64; if( block_rem ) block_rem--;

         Since we do not load anything at W(lane) above unless
         block_rem(lane) is non-zero, we can omit vector conditional
         operations for W(lane) below. */

      W_lo = wwv_add_if( active_lane_lo, W_lo, wwv_64, W_lo );
      W_hi = wwv_add_if( active_lane_hi, W_hi, wwv_64, W_hi );
      off  = wwu_add_if( active_lane,    off,  wwu_64, off  );

      block_rem_lo = wwv_sub_if( active_lane_lo, block_rem_lo, one, block_rem_lo );
      block_rem_hi = wwv_sub_if( active_lane_hi, block_rem_hi, one, block_rem_hi );

    } else { /* LtHash mode */

      /* d[i] contains output_off+(i*4) 32-bit words across output[0..8] */
      FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16: expand lanes" ));
      /* Full 64-byte XOF output block per lane: lower half is
         v[0..7]^v[8..15], upper half is prev CV ^ v[8..15]. */
      wwu_t d[ 16 ] = {
        wwu_xor( v[0x0], v[0x8] ),
        wwu_xor( v[0x1], v[0x9] ),
        wwu_xor( v[0x2], v[0xa] ),
        wwu_xor( v[0x3], v[0xb] ),
        wwu_xor( v[0x4], v[0xc] ),
        wwu_xor( v[0x5], v[0xd] ),
        wwu_xor( v[0x6], v[0xe] ),
        wwu_xor( v[0x7], v[0xf] ),
        wwu_xor( h0, v[0x8] ),
        wwu_xor( h1, v[0x9] ),
        wwu_xor( h2, v[0xa] ),
        wwu_xor( h3, v[0xb] ),
        wwu_xor( h4, v[0xc] ),
        wwu_xor( h5, v[0xd] ),
        wwu_xor( h6, v[0xe] ),
        wwu_xor( h7, v[0xf] )
      };

      /* Transpose each 8x8 block */
      wwu_transpose_16x16( d[0x0], d[0x1], d[0x2], d[0x3], d[0x4], d[0x5], d[0x6], d[0x7],
                           d[0x8], d[0x9], d[0xa], d[0xb], d[0xc], d[0xd], d[0xe], d[0xf],
                           d[0x0], d[0x1], d[0x2], d[0x3], d[0x4], d[0x5], d[0x6], d[0x7],
                           d[0x8], d[0x9], d[0xa], d[0xb], d[0xc], d[0xd], d[0xe], d[0xf] );

      /* Reduce-add into d[0] (16-bit lane-wise adds, per LtHash) */
      d[0x0] = wwh_add( d[0x0], d[0x1] ); /* sum(l[0 1])     */
      d[0x2] = wwh_add( d[0x2], d[0x3] ); /* sum(l[2 3])     */
      d[0x4] = wwh_add( d[0x4], d[0x5] ); /* sum(l[4 5])     */
      d[0x6] = wwh_add( d[0x6], d[0x7] ); /* sum(l[6 7])     */
      d[0x8] = wwh_add( d[0x8], d[0x9] ); /* sum(l[8 9])     */
      d[0xa] = wwh_add( d[0xa], d[0xb] ); /* sum(l[a b])     */
      d[0xc] = wwh_add( d[0xc], d[0xd] ); /* sum(l[c d])     */
      d[0xe] = wwh_add( d[0xe], d[0xf] ); /* sum(l[e f])     */
      d[0x0] = wwh_add( d[0x0], d[0x2] ); /* sum(l[0 1 2 3]) */
      d[0x4] = wwh_add( d[0x4], d[0x6] ); /* sum(l[4 5 6 7]) */
      d[0x8] = wwh_add( d[0x8], d[0xa] ); /* sum(l[8 9 a b]) */
      d[0xc] = wwh_add( d[0xc], d[0xe] ); /* sum(l[c d e f]) */
      d[0x0] = wwh_add( d[0x0], d[0x4] ); /* sum(l[0 1 2 3 4 5 6 7]) */
      d[0x8] = wwh_add( d[0x8], d[0xc] ); /* sum(l[8 9 a b c d e f]) */
      d[0x0] = wwh_add( d[0x0], d[0x8] ); /* sum(l[0 1 2 3 4 5 6 7 8 9 a b c d e f]) */
      wwh_st( lthash, d[0x0] );

      /* Wind up for next iteration: bump the per-lane counter with
         carry into the high word (signed-compare trick emulates
         unsigned lt after biasing by 0x80000000). */
      lthash += 32; /* 64 byte stride */
      lthash_rem--;
      wwu_t ctr_add   = wwu_bcast( 1 );
      /**/  ctr_lo    = wwu_add( ctr_lo, ctr_add );
      int   ctr_carry = wwi_gt ( wwu_xor( ctr_add, wwu_bcast( 0x80000000 ) ),
                                 wwu_xor( ctr_lo,  wwu_bcast( 0x80000000 ) ) );
      /**/  ctr_hi    = wwu_add_if( ctr_carry, ctr_hi, wwu_one(), ctr_hi );
      if( FD_UNLIKELY( !lthash_rem ) ) {
        FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16: done (lthash para)" ));
        return;
      }
      goto compress;

#     undef STATE_FMT
#     undef STATE_FMT_ARGS
    }
  }

  /* Store the results: transpose back from lane-per-message to
     row-major, then store 32 or 64 bytes per requested lane. */

  wwu_t o0; wwu_t o1; wwu_t o2; wwu_t o3; wwu_t o4; wwu_t o5; wwu_t o6; wwu_t o7;
  wwu_t o8; wwu_t o9; wwu_t oA; wwu_t oB; wwu_t oC; wwu_t oD; wwu_t oE; wwu_t oF;

  wwu_transpose_16x16( h0,   h1,   h2,   h3,   h4,   h5,   h6,   h7,
                       hu[0],hu[1],hu[2],hu[3],hu[4],hu[5],hu[6],hu[7],
                       o0,   o1,   o2,   o3,   o4,   o5,   o6,   o7,
                       o8,   o9,   oA,   oB,   oC,   oD,   oE,   oF );

  uint * const * batch_hash = (uint * const *)_batch_hash;
  if( FD_LIKELY( out_sz==32 ) ) {
    switch( batch_cnt ) { /* application dependent prob */
    case 16UL: wu_stu( batch_hash[15], _mm512_castsi512_si256( oF ) ); __attribute__((fallthrough));
    case 15UL: wu_stu( batch_hash[14], _mm512_castsi512_si256( oE ) ); __attribute__((fallthrough));
    case 14UL: wu_stu( batch_hash[13], _mm512_castsi512_si256( oD ) ); __attribute__((fallthrough));
    case 13UL: wu_stu( batch_hash[12], _mm512_castsi512_si256( oC ) ); __attribute__((fallthrough));
    case 12UL: wu_stu( batch_hash[11], _mm512_castsi512_si256( oB ) ); __attribute__((fallthrough));
    case 11UL: wu_stu( batch_hash[10], _mm512_castsi512_si256( oA ) ); __attribute__((fallthrough));
    case 10UL: wu_stu( batch_hash[ 9], _mm512_castsi512_si256( o9 ) ); __attribute__((fallthrough));
    case  9UL: wu_stu( batch_hash[ 8], _mm512_castsi512_si256( o8 ) ); __attribute__((fallthrough));
    case  8UL: wu_stu( batch_hash[ 7], _mm512_castsi512_si256( o7 ) ); __attribute__((fallthrough));
    case  7UL: wu_stu( batch_hash[ 6], _mm512_castsi512_si256( o6 ) ); __attribute__((fallthrough));
    case  6UL: wu_stu( batch_hash[ 5], _mm512_castsi512_si256( o5 ) ); __attribute__((fallthrough));
    case  5UL: wu_stu( batch_hash[ 4], _mm512_castsi512_si256( o4 ) ); __attribute__((fallthrough));
    case  4UL: wu_stu( batch_hash[ 3], _mm512_castsi512_si256( o3 ) ); __attribute__((fallthrough));
    case  3UL: wu_stu( batch_hash[ 2], _mm512_castsi512_si256( o2 ) ); __attribute__((fallthrough));
    case  2UL: wu_stu( batch_hash[ 1], _mm512_castsi512_si256( o1 ) ); __attribute__((fallthrough));
    case  1UL: wu_stu( batch_hash[ 0], _mm512_castsi512_si256( o0 ) ); __attribute__((fallthrough));
    default: break;
    }
    FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16: done" ));
  } else if( out_sz==64 ) {
    switch( batch_cnt ) { /* application dependent prob */
    case 16UL: wwu_stu( batch_hash[15], oF ); __attribute__((fallthrough));
    case 15UL: wwu_stu( batch_hash[14], oE ); __attribute__((fallthrough));
    case 14UL: wwu_stu( batch_hash[13], oD ); __attribute__((fallthrough));
    case 13UL: wwu_stu( batch_hash[12], oC ); __attribute__((fallthrough));
    case 12UL: wwu_stu( batch_hash[11], oB ); __attribute__((fallthrough));
    case 11UL: wwu_stu( batch_hash[10], oA ); __attribute__((fallthrough));
    case 10UL: wwu_stu( batch_hash[ 9], o9 ); __attribute__((fallthrough));
    case  9UL: wwu_stu( batch_hash[ 8], o8 ); __attribute__((fallthrough));
    case  8UL: wwu_stu( batch_hash[ 7], o7 ); __attribute__((fallthrough));
    case  7UL: wwu_stu( batch_hash[ 6], o6 ); __attribute__((fallthrough));
    case  6UL: wwu_stu( batch_hash[ 5], o5 ); __attribute__((fallthrough));
    case  5UL: wwu_stu( batch_hash[ 4], o4 ); __attribute__((fallthrough));
    case  4UL: wwu_stu( batch_hash[ 3], o3 ); __attribute__((fallthrough));
    case  3UL: wwu_stu( batch_hash[ 2], o2 ); __attribute__((fallthrough));
    case  2UL: wwu_stu( batch_hash[ 1], o1 ); __attribute__((fallthrough));
    case  1UL: wwu_stu( batch_hash[ 0], o0 ); __attribute__((fallthrough));
    default: break;
    }
    FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16: done (out_sz=64)" ));
  } else {
    FD_LOG_ERR(( "Invalid out_sz %u", out_sz ));
  }
}
626 :
/* fd_blake3_avx512_compress16_fast hashes 16 equally sized inputs in
   parallel, one input per lane of a set of AVX-512 vectors, and writes
   the 16 resulting chaining values contiguously to out (32 bytes per
   input, spaced 1<<FD_BLAKE3_OUTCHAIN_LG_SZ apart).

   msg     points to the 16 inputs, stored contiguously.  If flags has
           FD_BLAKE3_FLAG_PARENT set, each input is a parent node of
           size 1<<(FD_BLAKE3_OUTCHAIN_LG_SZ+1) bytes (two child
           chaining values); otherwise each input is a full leaf chunk
           of size 1<<FD_BLAKE3_CHUNK_LG_SZ bytes.
   out     receives the 16 output chaining values.
   counter is the BLAKE3 chunk counter of the first input; lane i uses
           counter+i when hashing leaves and 0 is added for parents
           (see ctr_add below).
   flags   BLAKE3 domain flags applied to every block of every input
           (CHUNK_START/CHUNK_END are additionally derived per block
           for leaves). */
void
fd_blake3_avx512_compress16_fast( uchar const * restrict msg,
                                  uchar * restrict out,
                                  ulong counter,
                                  uchar flags ) {
  FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16_fast(msg=%p,out=%p,counter=%lu,flags=%02x)", (void *)msg, (void *)out, counter, flags ));

  int parent = flags & FD_BLAKE3_FLAG_PARENT;
  /* lg_sz is the log2 size of each of the 16 inputs: a parent node is
     two output chaining values, a leaf is a full chunk. */
  int lg_sz = fd_int_if( parent, FD_BLAKE3_OUTCHAIN_LG_SZ+1, FD_BLAKE3_CHUNK_LG_SZ );
  ulong sz = 1UL<<lg_sz;

  /* counters stay the same for each block.  Across chunks, they
     increment if we are hashing leaves.  Otherwise, they are zero. */

  /* ctr_add is {0,1,...,15} for leaves (lane i hashes chunk
     counter+i), all-zero for parents (mask parent?0:UINT_MAX). */
  wwu_t ctr_add = wwu_and( wwu_bcast( parent ? 0 : UINT_MAX ),
                           wwu( 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
                                0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf ) );
  /* 64-bit per-lane counter split into two 32-bit vectors.  wwu_bcast
     of the ulong counter keeps its low 32 bits in ctr_lo's lanes. */
  wwu_t ctr_lo = wwu_add( wwu_bcast( counter ), ctr_add );
  /* Detect 32-bit wraparound of the low word per lane: XORing both
     operands with 0x80000000 biases them so the signed lane compare
     wwi_gt computes the unsigned "ctr_add > ctr_lo", which is true
     exactly when the addition above overflowed. */
  int ctr_carry = wwi_gt ( wwu_xor( ctr_add, wwu_bcast( 0x80000000 ) ),
                           wwu_xor( ctr_lo, wwu_bcast( 0x80000000 ) ) );
  /* High counter word: counter>>32, plus one in lanes that carried. */
  wwu_t ctr_hi = wwu_add_if( ctr_carry, wwu_bcast( counter>>32 ), wwu_one(), wwu_bcast( counter>>32 ) );
  /* Every block compressed here is a full FD_BLAKE3_BLOCK_SZ block
     (inputs are exact multiples of the block size by construction). */
  wwu_t sz_vec = wwu_bcast( FD_BLAKE3_BLOCK_SZ );

  /* BLAKE3 IV, broadcast across all 16 lanes. */
  wwu_t const iv0 = wwu_bcast( FD_BLAKE3_IV[0] );
  wwu_t const iv1 = wwu_bcast( FD_BLAKE3_IV[1] );
  wwu_t const iv2 = wwu_bcast( FD_BLAKE3_IV[2] );
  wwu_t const iv3 = wwu_bcast( FD_BLAKE3_IV[3] );
  wwu_t const iv4 = wwu_bcast( FD_BLAKE3_IV[4] );
  wwu_t const iv5 = wwu_bcast( FD_BLAKE3_IV[5] );
  wwu_t const iv6 = wwu_bcast( FD_BLAKE3_IV[6] );
  wwu_t const iv7 = wwu_bcast( FD_BLAKE3_IV[7] );

  /* h0..h7: running chaining value for each lane, initialized to the
     IV (this fast path always starts a fresh chunk/parent; no key). */
  wwu_t h0=iv0; wwu_t h1=iv1; wwu_t h2=iv2; wwu_t h3=iv3;
  wwu_t h4=iv4; wwu_t h5=iv5; wwu_t h6=iv6; wwu_t h7=iv7;

  /* Compress the inputs one FD_BLAKE3_BLOCK_SZ block at a time
     (sz/FD_BLAKE3_BLOCK_SZ iterations; sz>=FD_BLAKE3_BLOCK_SZ so the
     do/while body runs at least once). */
  ulong off = 0UL;
  do {
    ulong const off_next = off+FD_BLAKE3_BLOCK_SZ;
    /* Leaves get CHUNK_START on the first block of the chunk and
       CHUNK_END on the last; parents use flags unchanged. */
    int chunk_flags =
      ( off ==0UL ? FD_BLAKE3_FLAG_CHUNK_START : 0 ) |
      ( off_next==sz ? FD_BLAKE3_FLAG_CHUNK_END : 0 );
    int flags_ = flags | fd_int_if( parent, 0, chunk_flags );
    wwu_t flags_vec = wwu_bcast( flags_ );

    /* Load the current 64-byte block of each of the 16 inputs: row i
       holds input i's block (inputs are 1<<lg_sz apart in msg). */
    wwu_t m[16];
    m[0x0] = wwu_ldu( msg + (0x0<<lg_sz) + off );
    m[0x1] = wwu_ldu( msg + (0x1<<lg_sz) + off );
    m[0x2] = wwu_ldu( msg + (0x2<<lg_sz) + off );
    m[0x3] = wwu_ldu( msg + (0x3<<lg_sz) + off );
    m[0x4] = wwu_ldu( msg + (0x4<<lg_sz) + off );
    m[0x5] = wwu_ldu( msg + (0x5<<lg_sz) + off );
    m[0x6] = wwu_ldu( msg + (0x6<<lg_sz) + off );
    m[0x7] = wwu_ldu( msg + (0x7<<lg_sz) + off );
    m[0x8] = wwu_ldu( msg + (0x8<<lg_sz) + off );
    m[0x9] = wwu_ldu( msg + (0x9<<lg_sz) + off );
    m[0xa] = wwu_ldu( msg + (0xa<<lg_sz) + off );
    m[0xb] = wwu_ldu( msg + (0xb<<lg_sz) + off );
    m[0xc] = wwu_ldu( msg + (0xc<<lg_sz) + off );
    m[0xd] = wwu_ldu( msg + (0xd<<lg_sz) + off );
    m[0xe] = wwu_ldu( msg + (0xe<<lg_sz) + off );
    m[0xf] = wwu_ldu( msg + (0xf<<lg_sz) + off );

    /* Transpose in place so m[w] holds message word w of all 16 lanes
       (AoS -> SoA, as round_fn16 expects). */
    wwu_transpose_16x16( m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7],
                         m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf],
                         m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7],
                         m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf] );

    /* Initial compression state: chaining value, first half of the IV,
       then counter lo/hi, block size and flags. */
    wwu_t v[16] = {
      h0, h1, h2, h3,
      h4, h5, h6, h7,
      iv0, iv1, iv2, iv3,
      ctr_lo, ctr_hi, sz_vec, flags_vec,
    };

    /* The 7 BLAKE3 rounds (message permutation via
       FD_BLAKE3_MSG_SCHEDULE inside round_fn16). */
    round_fn16( v, m, 0 );
    round_fn16( v, m, 1 );
    round_fn16( v, m, 2 );
    round_fn16( v, m, 3 );
    round_fn16( v, m, 4 );
    round_fn16( v, m, 5 );
    round_fn16( v, m, 6 );

    /* Feed-forward: new chaining value is the XOR of the two state
       halves. */
    h0 = wwu_xor( v[ 0], v[ 8] );
    h1 = wwu_xor( v[ 1], v[ 9] );
    h2 = wwu_xor( v[ 2], v[10] );
    h3 = wwu_xor( v[ 3], v[11] );
    h4 = wwu_xor( v[ 4], v[12] );
    h5 = wwu_xor( v[ 5], v[13] );
    h6 = wwu_xor( v[ 6], v[14] );
    h7 = wwu_xor( v[ 7], v[15] );

    off = off_next;
  } while( off!=sz );

  wwu_t o0; wwu_t o1; wwu_t o2; wwu_t o3; wwu_t o4; wwu_t o5; wwu_t o6; wwu_t o7;
  wwu_t o8; wwu_t o9; wwu_t oA; wwu_t oB; wwu_t oC; wwu_t oD; wwu_t oE; wwu_t oF;

  /* Transpose back to per-lane order: after this, the low 8 words of
     oI are lane I's chaining value h0..h7 (the zero rows only feed the
     upper halves, which are discarded by the 256-bit stores below). */
  wwu_t zero = wwu_zero();
  wwu_transpose_16x16( h0, h1, h2, h3, h4, h5, h6, h7,
                       zero, zero, zero, zero, zero, zero, zero, zero,
                       o0, o1, o2, o3, o4, o5, o6, o7,
                       o8, o9, oA, oB, oC, oD, oE, oF );

  /* Store each lane's 32-byte chaining value (low 256 bits of oI),
     spaced 1<<FD_BLAKE3_OUTCHAIN_LG_SZ apart in out. */
  wb_st( out + (0x0UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o0 ) );
  wb_st( out + (0x1UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o1 ) );
  wb_st( out + (0x2UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o2 ) );
  wb_st( out + (0x3UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o3 ) );
  wb_st( out + (0x4UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o4 ) );
  wb_st( out + (0x5UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o5 ) );
  wb_st( out + (0x6UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o6 ) );
  wb_st( out + (0x7UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o7 ) );
  wb_st( out + (0x8UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o8 ) );
  wb_st( out + (0x9UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o9 ) );
  wb_st( out + (0xaUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oA ) );
  wb_st( out + (0xbUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oB ) );
  wb_st( out + (0xcUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oC ) );
  wb_st( out + (0xdUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oD ) );
  wb_st( out + (0xeUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oE ) );
  wb_st( out + (0xfUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oF ) );
}
|