Line data Source code
1 : #include "fd_wksp_private.h"
2 :
3 : int
4 49970 : fd_wksp_private_lock( fd_wksp_t * wksp ) {
5 : # if FD_WKSP_LOCK_RECLAIM
6 : int warning = 0;
7 : #endif
8 49970 : ulong me = fd_log_group_id();
9 :
10 49970 : ulong * _owner = &wksp->owner;
11 77038 : for(;;) {
12 :
13 : /* Note that we emulate CAS on platforms without FD_HAS_ATOMIC
14 : to minimize the amount of code differences we have to test. On
15 : platforms without FD_HAS_ATOMIC, a workspace should not be used
16 : concurrently though. */
17 :
18 77038 : FD_COMPILER_MFENCE();
19 77038 : # if FD_HAS_ATOMIC
20 77038 : ulong pid = FD_ATOMIC_CAS( _owner, ULONG_MAX, me );
21 : # else
22 : ulong pid = FD_VOLATILE_CONST( *_owner );
23 : if( pid==ULONG_MAX ) FD_VOLATILE( *_owner ) = me;
24 : # endif
25 77038 : FD_COMPILER_MFENCE();
26 :
27 77038 : if( FD_LIKELY( pid==ULONG_MAX ) ) return FD_WKSP_SUCCESS;
28 :
29 : # if FD_WKSP_LOCK_RECLAIM
30 : int status = fd_log_group_id_query( pid );
31 : if( FD_UNLIKELY( status==FD_LOG_GROUP_ID_QUERY_DEAD ) ) { /* A process died while holding the lock, try to recover the lock */
32 :
33 : FD_COMPILER_MFENCE();
34 : # if FD_HAS_ATOMIC
35 : ulong cur = FD_ATOMIC_CAS( _owner, pid, me );
36 : # else
37 : ulong cur = FD_VOLATILE_CONST( *_owner );
38 : if( cur==pid ) FD_VOLATILE( *_owner ) = me;
39 : # endif
40 : FD_COMPILER_MFENCE();
41 :
42 : if( FD_LIKELY( cur==pid ) ) { /* We recovered the lock from the dead pid, try to fix up incomplete ops */
43 :
44 : FD_LOG_WARNING(( "Process %lu died in an operation on wksp %s; verifying", pid, wksp->name ));
45 : if( FD_LIKELY( !fd_wksp_verify( wksp ) ) ) { /* logs details of issues detected */
46 : FD_LOG_NOTICE(( "wksp verified" ));
47 : return FD_WKSP_SUCCESS;
48 : }
49 :
50 : FD_LOG_WARNING(( "Issues detected; rebuilding" ));
51 : if( FD_UNLIKELY( fd_wksp_rebuild( wksp, wksp->seed ) ) ) { /* Rebuild failed (logs details of issues detected) */
52 : /* Return control of the lock to the previous owner */
53 : FD_COMPILER_MFENCE();
54 : FD_VOLATILE( *_owner ) = pid;
55 : FD_COMPILER_MFENCE();
56 : FD_LOG_WARNING(( "corrupt wksp detected" ));
57 : return FD_WKSP_ERR_CORRUPT;
58 : }
59 :
60 : FD_LOG_NOTICE(( "wksp rebuilt" ));
61 : return FD_WKSP_SUCCESS;
62 :
63 : }
64 :
65 : /* Somebody beat us to recovering the lock ... try again */
66 :
67 : } else if( FD_UNLIKELY( status!=FD_LOG_GROUP_ID_QUERY_LIVE ) ) { /* Unclear pid status ... issue a warning and try again */
68 :
69 : if( FD_UNLIKELY( !warning ) ) {
70 : FD_LOG_WARNING(( "wksp %s is owned by unknown pid %li; attempting to recover", wksp->name, pid ));
71 : warning = 1;
72 : }
73 :
74 : }
75 :
76 : /* At this point, either another thread in this process has the
77 : lock, another active thread in another process has the lock,
78 : another unknown status thread in other process has the lock or
79 : another thread beat us to reclaim the lock from a dead process.
80 : In any case, we don't have the lock. Wait a while to limit O/S
81 : contention and try again. */
82 :
83 : FD_YIELD();
84 : # else
85 :
86 : /* If we are running without FD_WKSP_LOCK_RECLAIM then it is assumed
87 : that the contention is caused by a tile pinned to another core,
88 : and that this core is itself pinned so spin locking is best. */
89 26796 : FD_SPIN_PAUSE();
90 :
91 26796 : #endif
92 26796 : }
93 :
94 : /* never get here */
95 49970 : }
96 :
97 : /* Public APIs ********************************************************/
98 :
99 : ulong
100 : fd_wksp_part_max_est( ulong footprint,
101 1 : ulong sz_typical ) {
102 1 : footprint = fd_ulong_align_dn( footprint, FD_WKSP_ALIGN );
103 1 : ulong data_end = footprint - 1UL;
104 1 : ulong pinfo_off = fd_wksp_private_pinfo_off();
105 1 : ulong consumed = sizeof(fd_wksp_private_pinfo_t) + sz_typical;
106 1 : ulong part_max = (data_end - pinfo_off) / (consumed + (ulong)!consumed); /* avoid div-by-zero */
107 1 : if( FD_UNLIKELY( (!footprint) | (!sz_typical) | (sz_typical>consumed) | (pinfo_off>data_end) ) ) return 0UL;
108 1 : return fd_ulong_min( part_max, FD_WKSP_PRIVATE_PINFO_IDX_NULL );
109 1 : }
110 :
111 : ulong
112 : fd_wksp_data_max_est( ulong footprint,
113 0 : ulong part_max ) {
114 0 : footprint = fd_ulong_align_dn( footprint, FD_WKSP_ALIGN );
115 0 : ulong data_end = footprint - 1UL;
116 0 : ulong data_off = fd_wksp_private_data_off( part_max );
117 0 : if( FD_UNLIKELY( (!part_max) | (part_max>FD_WKSP_PRIVATE_PINFO_IDX_NULL) |
118 0 : (part_max > ((ULONG_MAX - fd_wksp_private_pinfo_off())/sizeof(fd_wksp_private_pinfo_t))) | /* covered above */
119 0 : (!footprint) | (data_off>=data_end) ) ) return 0UL;
120 0 : return data_end - data_off;
121 0 : }
122 :
123 : ulong
124 0 : fd_wksp_align( void ) {
125 0 : return FD_WKSP_ALIGN;
126 0 : }
127 :
128 : ulong
129 : fd_wksp_footprint( ulong part_max,
130 4 : ulong data_max ) {
131 4 : ulong data_off = fd_wksp_private_data_off( part_max );
132 4 : if( FD_UNLIKELY( (!part_max) | (part_max>FD_WKSP_PRIVATE_PINFO_IDX_NULL) | (!data_max) |
133 4 : (part_max > ((ULONG_MAX - fd_wksp_private_pinfo_off())/sizeof(fd_wksp_private_pinfo_t))) | /* Covered above */
134 4 : (data_max > (ULONG_MAX - FD_WKSP_ALIGN + 1UL - data_off - 1UL) ) ) ) return 0UL;
135 4 : return fd_ulong_align_up( data_off + data_max + 1UL, FD_WKSP_ALIGN );
136 4 : }
137 :
138 : void *
139 : fd_wksp_new( void * shmem,
140 : char const * name,
141 : uint seed,
142 : ulong part_max,
143 1 : ulong data_max ) {
144 1 : fd_wksp_t * wksp = (fd_wksp_t *)shmem;
145 :
146 1 : if( FD_UNLIKELY( !wksp ) ) {
147 0 : FD_LOG_WARNING(( "NULL shmem" ));
148 0 : return NULL;
149 0 : }
150 :
151 1 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)wksp, FD_WKSP_ALIGN ) ) ) {
152 0 : FD_LOG_WARNING(( "bad align" ));
153 0 : return NULL;
154 0 : }
155 :
156 1 : ulong name_len = fd_shmem_name_len( name );
157 1 : if( FD_UNLIKELY( !name_len ) ) {
158 0 : FD_LOG_WARNING(( "bad name" ));
159 0 : return NULL;
160 0 : }
161 :
162 1 : ulong footprint = fd_wksp_footprint( part_max, data_max );
163 1 : if( FD_UNLIKELY( !footprint ) ) {
164 0 : FD_LOG_WARNING(( "bad part_max and/or data_max" ));
165 0 : return NULL;
166 0 : }
167 :
168 1 : fd_memset( wksp, 0, fd_wksp_footprint( part_max, 1UL ) );
169 :
170 1 : wksp->part_max = part_max;
171 1 : wksp->data_max = data_max;
172 1 : wksp->gaddr_lo = fd_wksp_private_data_off( part_max );
173 1 : wksp->gaddr_hi = wksp->gaddr_lo + data_max;
174 1 : fd_memcpy( wksp->name, name, name_len+1UL );
175 1 : wksp->seed = seed;
176 1 : wksp->idle_top_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
177 1 : wksp->part_head_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
178 1 : wksp->part_tail_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
179 1 : wksp->part_used_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
180 1 : wksp->part_free_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
181 1 : wksp->cycle_tag = 4UL; /* Verify uses tags 0-3 */
182 1 : wksp->owner = 0UL; /* Mark as locked and in construction */
183 :
184 : /* Note that wksp->owner was set to zero above, "locking" the wksp by
185 : group_id 0. And the memset above set all the partition tags to
186 : zero such that there are no allocated partitions. So once we set
187 : magic below, we can finish the initialization by rebuilding and
188 : unlocking. Since fd_log_group_id is non-zero, the zero owner
189 : indicates to any remote observer of the shared memory region that
190 : the wksp is being built for the first time. */
191 :
192 1 : FD_COMPILER_MFENCE();
193 1 : FD_VOLATILE( wksp->magic ) = FD_WKSP_MAGIC;
194 1 : FD_COMPILER_MFENCE();
195 :
196 1 : int err = fd_wksp_rebuild( wksp, seed );
197 1 : if( FD_UNLIKELY( err ) ) { /* Should be impossible at this point */
198 :
199 0 : FD_COMPILER_MFENCE();
200 0 : FD_VOLATILE( wksp->magic ) = 0UL;
201 0 : FD_COMPILER_MFENCE();
202 :
203 0 : FD_LOG_WARNING(( "fd_wksp_rebuild failed (%i-%s)", err, fd_wksp_strerror( err ) ));
204 0 : return NULL;
205 0 : }
206 :
207 : #if FD_HAS_DEEPASAN
208 : /* Poison entire workspace except wksp header and the pinfo array. */
209 : void * wksp_data = (void*)((ulong)wksp + fd_wksp_private_pinfo_off());
210 : fd_asan_poison( wksp_data, footprint - fd_wksp_private_pinfo_off() );
211 : fd_wksp_private_pinfo_t * pinfo = fd_wksp_private_pinfo( wksp );
212 : fd_asan_unpoison( pinfo, FD_WKSP_PRIVATE_PINFO_FOOTPRINT * part_max );
213 : #endif
214 :
215 1 : fd_wksp_private_unlock( wksp );
216 :
217 1 : return wksp;
218 1 : }
219 :
220 : fd_wksp_t *
221 1 : fd_wksp_join( void * shwksp ) {
222 1 : fd_wksp_t * wksp = (fd_wksp_t *)shwksp;
223 :
224 1 : if( FD_UNLIKELY( !wksp ) ) {
225 0 : FD_LOG_WARNING(( "NULL shwksp" ));
226 0 : return NULL;
227 0 : }
228 :
229 1 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)wksp, FD_WKSP_ALIGN ) ) ) {
230 0 : FD_LOG_WARNING(( "bad align" ));
231 0 : return NULL;
232 0 : }
233 :
234 1 : if( FD_UNLIKELY( wksp->magic!=FD_WKSP_MAGIC ) ) {
235 0 : FD_LOG_WARNING(( "bad magic" ));
236 0 : return NULL;
237 0 : }
238 :
239 1 : return wksp;
240 1 : }
241 :
242 : void *
243 1 : fd_wksp_leave( fd_wksp_t * wksp ) {
244 1 : if( FD_UNLIKELY( !wksp ) ) {
245 0 : FD_LOG_WARNING(( "NULL wksp" ));
246 0 : return NULL;
247 0 : }
248 :
249 1 : return (void *)wksp;
250 1 : }
251 :
252 : void *
253 1 : fd_wksp_delete( void * shwksp ) {
254 1 : fd_wksp_t * wksp = (fd_wksp_t *)shwksp;
255 :
256 1 : if( FD_UNLIKELY( !wksp ) ) {
257 0 : FD_LOG_WARNING(( "NULL shwksp" ));
258 0 : return NULL;
259 0 : }
260 :
261 1 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)wksp, FD_WKSP_ALIGN ) ) ) {
262 0 : FD_LOG_WARNING(( "bad align" ));
263 0 : return NULL;
264 0 : }
265 :
266 1 : if( FD_UNLIKELY( wksp->magic!=FD_WKSP_MAGIC ) ) {
267 0 : FD_LOG_WARNING(( "bad magic" ));
268 0 : return NULL;
269 0 : }
270 :
271 : /* TODO: consider testing owner */
272 :
273 1 : FD_COMPILER_MFENCE();
274 1 : FD_VOLATILE( wksp->magic ) = 0UL;
275 1 : FD_COMPILER_MFENCE();
276 :
277 : # if FD_HAS_DEEPASAN
278 : /* Unpoison entire wksp region. */
279 : ulong footprint = fd_wksp_footprint( wksp->part_max, wksp->data_max );
280 : void * wksp_data = (void*)((ulong)wksp + fd_wksp_private_pinfo_off());
281 : fd_asan_unpoison( wksp_data, footprint - fd_wksp_private_pinfo_off());
282 : # endif
283 :
284 1 : return wksp;
285 1 : }
286 :
287 0 : char const * fd_wksp_name ( fd_wksp_t const * wksp ) { return wksp->name; }
288 0 : uint fd_wksp_seed ( fd_wksp_t const * wksp ) { return wksp->seed; }
289 0 : ulong fd_wksp_part_max( fd_wksp_t const * wksp ) { return wksp->part_max; }
290 0 : ulong fd_wksp_data_max( fd_wksp_t const * wksp ) { return wksp->data_max; }
291 :
292 : ulong
293 0 : fd_wksp_owner( fd_wksp_t const * wksp ) {
294 0 : FD_COMPILER_MFENCE();
295 0 : ulong owner = FD_VOLATILE_CONST( wksp->owner );
296 0 : FD_COMPILER_MFENCE();
297 0 : return owner;
298 0 : }
299 :
300 : char const *
301 0 : fd_wksp_strerror( int err ) {
302 0 : switch( err ) {
303 0 : case FD_WKSP_SUCCESS: return "success";
304 0 : case FD_WKSP_ERR_INVAL: return "inval";
305 0 : case FD_WKSP_ERR_FAIL: return "fail";
306 0 : case FD_WKSP_ERR_CORRUPT: return "corrupt";
307 0 : default: break;
308 0 : }
309 0 : return "unknown";
310 0 : }
311 :
312 : int
313 0 : fd_wksp_verify( fd_wksp_t * wksp ) {
314 :
315 0 : # define TEST(c) do { \
316 0 : if( FD_UNLIKELY( !(c) ) ) { FD_LOG_WARNING(( "FAIL: %s", #c )); return FD_WKSP_ERR_CORRUPT; } \
317 0 : } while(0)
318 :
319 : /* Validate metadata */
320 :
321 0 : TEST( wksp );
322 0 : TEST( wksp->magic==FD_WKSP_MAGIC );
323 :
324 0 : ulong part_max = wksp->part_max;
325 0 : ulong data_max = wksp->data_max;
326 0 : TEST( fd_wksp_footprint( part_max, data_max ) );
327 :
328 0 : ulong gaddr_lo = wksp->gaddr_lo; TEST( gaddr_lo==fd_wksp_private_data_off( part_max ) );
329 0 : ulong gaddr_hi = wksp->gaddr_hi; TEST( gaddr_hi==gaddr_lo+data_max );
330 :
331 0 : TEST( fd_shmem_name_len( wksp->name ) );
332 :
333 : /* seed is arbitrary */
334 :
335 0 : TEST( wksp->cycle_tag >= 4UL );
336 :
337 : /* TODO: consider verifying owner */
338 :
339 0 : fd_wksp_private_pinfo_t * pinfo = fd_wksp_private_pinfo( wksp );
340 :
341 : /* Clear out cycle tags */
342 :
343 0 : for( ulong i=0UL; i<part_max; i++ ) pinfo[ i ].cycle_tag = 0UL;
344 :
345 : /* Verify the idle stack */
346 :
347 0 : ulong idle_cnt = 0UL;
348 :
349 0 : do {
350 0 : ulong i = fd_wksp_private_pinfo_idx( wksp->idle_top_cidx );
351 0 : while( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
352 :
353 : /* Visit i. Note that i has not been validated yet. */
354 :
355 0 : TEST( i<part_max ); /* Validate i */
356 0 : TEST( !pinfo[ i ].cycle_tag ); /* Make sure not visited before */
357 0 : pinfo[ i ].cycle_tag = 1UL; /* Mark as visited in idle stack */
358 0 : idle_cnt++; /* Update the idle cnt */
359 :
360 : /* Advance to the next idle */
361 :
362 0 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx );
363 0 : }
364 0 : } while(0);
365 :
366 : /* Idle stack looks intact, verify partitioning */
367 :
368 0 : ulong free_cnt = 0UL;
369 0 : ulong used_cnt = 0UL;
370 :
371 0 : do {
372 0 : ulong j = FD_WKSP_PRIVATE_PINFO_IDX_NULL;
373 0 : ulong i = fd_wksp_private_pinfo_idx( wksp->part_head_cidx );
374 0 : ulong g = gaddr_lo;
375 :
376 0 : int last_free = 0;
377 :
378 0 : while( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
379 :
380 : /* At this point, we last visited j. Visit i. Note that j has
381 : been validated but i has not. */
382 :
383 0 : TEST( i<part_max ); /* Validate i */
384 0 : TEST( pinfo[ i ].gaddr_lo==g ); /* Make sure partition is tightly adjacent to previous */
385 0 : TEST( pinfo[ i ].gaddr_hi> g ); /* Make sure partition size is non-zero */
386 0 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].prev_cidx )==j ); /* Make sure correct prev partition */
387 0 : TEST( !pinfo[ i ].cycle_tag ); /* Make sure not visited before */
388 0 : pinfo[ i ].cycle_tag = 2UL; /* Mark as visited in partitioning */
389 :
390 0 : g = pinfo[ i ].gaddr_hi; /* Extract where the next partition should start */
391 0 : int is_free = !pinfo[ i ].tag; /* Determine if this partition is free or used */
392 0 : TEST( !(last_free & is_free) ); /* Make sure no adjacent free partitions */
393 0 : free_cnt += (ulong) is_free; /* Update the free cnt */
394 0 : used_cnt += (ulong)!is_free; /* Update the used cnt */
395 :
396 : /* Advance to the next partition */
397 :
398 0 : last_free = is_free;
399 :
400 0 : j = i;
401 0 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].next_cidx );
402 0 : }
403 :
404 0 : TEST( fd_wksp_private_pinfo_idx( wksp->part_tail_cidx )==j ); /* Make sure correct partition tail */
405 0 : TEST( g==gaddr_hi ); /* Make sure complete partitioning */
406 0 : TEST( (idle_cnt + free_cnt + used_cnt)==part_max ); /* Make sure no lost idle partitions */
407 0 : } while(0);
408 :
409 : /* Idle stack and partitioning look intact, validate used treap */
410 :
411 0 : do {
412 0 : ulong visit_cnt = 0UL;
413 :
414 0 : ulong i = fd_wksp_private_pinfo_idx( wksp->part_used_cidx );
415 0 : ulong s = FD_WKSP_PRIVATE_PINFO_IDX_NULL;
416 0 : ulong g = gaddr_lo;
417 :
418 0 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
419 0 : TEST( i<part_max ); /* Validate i */
420 0 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==s ); /* Validate parent */
421 0 : }
422 :
423 0 : for(;;) {
424 :
425 : /* At this point i is and everything on stack is validated */
426 :
427 0 : if( fd_wksp_private_pinfo_idx_is_null( i ) ) {
428 0 : if( fd_wksp_private_pinfo_idx_is_null( s ) ) break; /* Done */
429 :
430 : /* Pop stack */
431 :
432 0 : i = s;
433 0 : s = fd_wksp_private_pinfo_idx( pinfo[ i ].stack_cidx );
434 :
435 : /* Visit i */
436 :
437 0 : ulong p = fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx ); /* Extract the parent */
438 :
439 0 : TEST( pinfo[ i ].gaddr_lo>=g ); /* Make sure this starts after last visited */
440 0 : TEST( pinfo[ i ].tag ); /* Make sure tagged as a used partition */
441 0 : TEST( pinfo[ i ].cycle_tag==2UL ); /* Make sure in partitioning and not visited yet this traversal */
442 0 : if( !fd_wksp_private_pinfo_idx_is_null( p ) ) /* Make sure heap property satisfied */
443 0 : TEST( pinfo[ p ].heap_prio >= pinfo[ i ].heap_prio );
444 :
445 0 : TEST( !pinfo[ i ].in_same ); /* Make sure unique */
446 0 : TEST( fd_wksp_private_pinfo_idx_is_null( fd_wksp_private_pinfo_idx( pinfo[ i ].same_cidx ) ) ); /* " */
447 :
448 0 : pinfo[ i ].cycle_tag = 3UL; /* Mark as visited this traversal */
449 0 : visit_cnt++; /* Update the visit cnt */
450 0 : g = pinfo[ i ].gaddr_hi; /* Get minimum start for next partition */
451 :
452 : /* Traverse the right subtree */
453 :
454 0 : p = i;
455 0 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].right_cidx );
456 0 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
457 0 : TEST( i<part_max ); /* Validate i */
458 0 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==p ); /* Validate parent */
459 0 : }
460 :
461 0 : } else {
462 :
463 : /* At this point i and everything on the stack is validated.
464 : Push i to the stack and recurse on the left subtree. */
465 :
466 0 : pinfo[ i ].stack_cidx = fd_wksp_private_pinfo_cidx( s );
467 0 : s = i;
468 0 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].left_cidx );
469 0 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
470 0 : TEST( i<part_max ); /* Validate i */
471 0 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==s ); /* Validate parent */
472 0 : }
473 :
474 0 : }
475 0 : }
476 :
477 0 : TEST( visit_cnt==used_cnt ); /* Make sure all used partitions in used treap */
478 0 : } while(0);
479 :
480 : /* Idle stack, partitioning and used treap look intact, validate the
481 : free treap. */
482 :
483 0 : do {
484 0 : ulong visit_cnt = 0UL;
485 :
486 0 : ulong i = fd_wksp_private_pinfo_idx( wksp->part_free_cidx );
487 0 : ulong s = FD_WKSP_PRIVATE_PINFO_IDX_NULL;
488 0 : ulong sz = 0UL;
489 :
490 0 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
491 0 : TEST( i<part_max ); /* Validate i */
492 0 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==s ); /* Validate parent */
493 0 : }
494 :
495 0 : for(;;) {
496 :
497 : /* At this point i and everything on the stack is validated */
498 :
499 0 : if( fd_wksp_private_pinfo_idx_is_null( i ) ) {
500 0 : if( fd_wksp_private_pinfo_idx_is_null( s ) ) break; /* Done */
501 :
502 : /* Pop stack */
503 :
504 0 : i = s;
505 0 : s = fd_wksp_private_pinfo_idx( pinfo[ i ].stack_cidx );
506 :
507 : /* Visit i */
508 :
509 0 : ulong p = fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx ); /* Extract the parent */
510 0 : ulong isz = fd_wksp_private_pinfo_sz( pinfo + i ); /* Extract the size */
511 :
512 0 : TEST( isz>sz ); /* Make sure this partition i larger than previous */
513 0 : TEST( !pinfo[ i ].tag ); /* Make sure tagged as a free partition */
514 0 : TEST( !pinfo[ i ].in_same ); /* Make sure marked as not in same */
515 :
516 0 : if( !fd_wksp_private_pinfo_idx_is_null( p ) ) { /* Make sure heap property satisfied */
517 0 : TEST( pinfo[ p ].heap_prio >= pinfo[ i ].heap_prio );
518 0 : }
519 :
520 0 : sz = isz; /* Update largest size partition seen so far */
521 :
522 : /* Traverse all same sized partitions */
523 :
524 0 : ulong j = i;
525 0 : for(;;) {
526 :
527 : /* At this point, j is validated */
528 :
529 0 : TEST( pinfo[ j ].cycle_tag==2UL ); /* Make sure in partitioning and not visited yet this traversal */
530 0 : pinfo[ j ].cycle_tag = 3UL; /* Mark as visited this traversal */
531 0 : visit_cnt++;
532 :
533 0 : ulong k = fd_wksp_private_pinfo_idx( pinfo[ j ].same_cidx ); /* Get the next same sized */
534 0 : if( fd_wksp_private_pinfo_idx_is_null( k ) ) break; /* If no more, we are done with this node */
535 0 : TEST( k<part_max ); /* Make sure valid index */
536 0 : TEST( fd_wksp_private_pinfo_sz( pinfo + k )==sz ); /* Make sure same size */
537 0 : TEST( pinfo[ k ].in_same ); /* Make sure marked as in same */
538 0 : TEST( fd_wksp_private_pinfo_idx_is_null( fd_wksp_private_pinfo_idx( pinfo[ k ].left_cidx ) ) );
539 0 : TEST( fd_wksp_private_pinfo_idx_is_null( fd_wksp_private_pinfo_idx( pinfo[ k ].right_cidx ) ) );
540 0 : TEST( fd_wksp_private_pinfo_idx( pinfo[ k ].parent_cidx )==j ); /* Make sure correct parent */
541 0 : j = k;
542 0 : }
543 :
544 : /* Recurse on the right subtree */
545 :
546 0 : p = i;
547 0 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].right_cidx );
548 0 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
549 0 : TEST( i<part_max ); /* Validate i */
550 0 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==p ); /* Validate parent */
551 0 : }
552 :
553 0 : } else {
554 :
555 0 : TEST( i<part_max ); /* Validate i */
556 :
557 : /* At this point i and everything on the stack is validated.
558 : Push i to the stack and recurse on the left subtree. */
559 :
560 0 : pinfo[ i ].stack_cidx = fd_wksp_private_pinfo_cidx( s );
561 0 : s = i;
562 0 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].left_cidx );
563 0 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
564 0 : TEST( i<part_max ); /* Validate i */
565 0 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==s ); /* Validate parent */
566 0 : }
567 0 : }
568 0 : }
569 :
570 0 : TEST( visit_cnt==free_cnt ); /* Make sure all free partitions in free treap */
571 :
572 0 : } while(0);
573 :
574 0 : # undef TEST
575 :
576 0 : return FD_WKSP_SUCCESS;
577 0 : }
578 :
579 : int
580 : fd_wksp_rebuild( fd_wksp_t * wksp,
581 1 : uint seed ) {
582 :
583 : /* Load the wksp metadata, don't rebuild if any of it looks even
584 : slightly off. */
585 :
586 1 : if( FD_UNLIKELY( !wksp ) ) {
587 0 : FD_LOG_WARNING(( "NULL wksp" ));
588 0 : return FD_WKSP_ERR_CORRUPT;
589 0 : }
590 :
591 1 : ulong magic = wksp->magic;
592 1 : ulong part_max = wksp->part_max;
593 1 : ulong data_max = wksp->data_max;
594 1 : ulong gaddr_lo = wksp->gaddr_lo;
595 1 : ulong gaddr_hi = wksp->gaddr_hi;
596 1 : ulong cycle_tag = wksp->cycle_tag;
597 :
598 : /* TODO: consider verifying owner */
599 :
600 1 : ulong footprint = fd_wksp_footprint( part_max, data_max );
601 1 : ulong gaddr_lo_exp = fd_wksp_private_data_off( part_max );
602 1 : ulong gaddr_hi_exp = gaddr_lo_exp + data_max;
603 1 : if( FD_UNLIKELY( (magic!=FD_WKSP_MAGIC) | (!footprint) | (!fd_shmem_name_len( wksp->name )) |
604 1 : (gaddr_lo!=gaddr_lo_exp) | (gaddr_hi!=gaddr_hi_exp) | (cycle_tag<4UL) ) ) {
605 0 : FD_LOG_WARNING(( "bad metadata\n\t"
606 0 : "magic %016lx (exp %016lx)\n\t"
607 0 : "part_max %lu data_max %lu (footprint %lu)\n\t"
608 0 : "gaddr_lo %lu (exp %lu)\n\t"
609 0 : "gaddr_hi %lu (exp %lu)\n\t"
610 0 : "cycle_tag %lu (exp>=4)",
611 0 : magic, FD_WKSP_MAGIC, part_max, data_max, footprint,
612 0 : gaddr_lo, gaddr_lo_exp, gaddr_hi, gaddr_hi_exp, cycle_tag ));
613 0 : return FD_WKSP_ERR_CORRUPT;
614 0 : }
615 :
616 : /* Scan the wksp pinfo and insert any used partitions into the used
617 : treap and put the rest on the idle stack. If there is any sign of
618 : corruption (empty, bad range or overlap between used partitions),
619 : we abort the rebuild (this is almost certainly data corruption of
620 : some form and we don't have enough info to resolve a conflict
621 : without potentially making the situation worse). We do the scan in
622 : reverse order to rebuild the idle stack in forward order.
623 :
624 : Note that we don't ever change the gaddr_lo,gaddr_hi of any tagged
625 : partitions such that operation is guaranteed to never change the
626 : single source of truth. As such, this operation can be interrupted
627 : and restarted arbitrarily safely.*/
628 :
629 1 : fd_wksp_private_pinfo_t * pinfo = fd_wksp_private_pinfo( wksp );
630 :
631 1 : do {
632 1 : wksp->seed = seed;
633 1 : wksp->idle_top_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL ); /* Flush idle stack */
634 1 : wksp->part_used_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL ); /* Flush used treap */
635 1 : wksp->part_free_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL ); /* Flush free treap */
636 :
637 1 : ulong i = part_max;
638 1374914 : while( i ) {
639 1374913 : i--;
640 :
641 : /* Ideally, heap priorities should just be a shuffling of the
642 : integers [0,part_max). fd_uint_hash will generate such a
643 : shuffling for part_max = 2^32. Using the lower 30 bits
644 : (reserving bit 31 for bulk operations) will yield something
645 : very close. We use seed to mix it up some more. */
646 :
647 1374913 : pinfo[ i ].in_same = 0U;
648 1374913 : pinfo[ i ].heap_prio = fd_uint_hash( seed ^ (uint)i ) & ((1U<<30)-1U);
649 1374913 : pinfo[ i ].stack_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
650 1374913 : pinfo[ i ].cycle_tag = 0U;
651 :
652 1374913 : ulong tag = pinfo[ i ].tag;
653 1374913 : if( !tag ) { /* Not used ... make it available for reuse below */
654 1374913 : fd_wksp_private_idle_stack_push( i, wksp, pinfo );
655 1374913 : continue;
656 1374913 : }
657 :
658 0 : pinfo[ i ].prev_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
659 0 : pinfo[ i ].next_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
660 :
661 0 : if( FD_UNLIKELY( fd_wksp_private_used_treap_insert( i, wksp, pinfo ) ) ) return FD_WKSP_ERR_CORRUPT; /* Logs details */
662 0 : }
663 1 : } while(0);
664 :
665 : /* At this point, a partition is either in the idle stack or used
666 : treap. Further, we have:
667 :
668 : | used | idle
669 : ----------+----------------------------+--------
670 : gaddr_* | non-empty range | 0
671 : | no overlap with other used | 0
672 : tag | non-zero | 0
673 : in_same | 0 | 0
674 : heap_prio | randomized | randomized
675 : prev | NULL | NULL
676 : next | NULL | NULL
677 : left | used treap managed | NULL
678 : right | used treap managed | NULL
679 : same | used treap managed (NULL) | NULL
680 : parent | used treap managed | idle stack managed
681 : stack | wksp managed | wksp managed
682 : cycle_tag | wksp managed | wksp managed
683 :
684 : In-order traverse the used treap to rebuild the partitioning and
685 : the free treap. */
686 :
687 1 : do {
688 1 : uint * j_next_cidx_ptr = &wksp->part_head_cidx; /* Location of most recently added partition next link */
689 :
690 1 : ulong j = FD_WKSP_PRIVATE_PINFO_IDX_NULL; /* Most recently added partition */
691 1 : ulong g0 = gaddr_lo; /* Most recently added partition end */
692 :
693 1 : ulong i = fd_wksp_private_pinfo_idx( wksp->part_used_cidx );
694 1 : ulong s = FD_WKSP_PRIVATE_PINFO_IDX_NULL;
695 1 : for(;;) {
696 1 : if( fd_wksp_private_pinfo_idx_is_null( i ) ) {
697 1 : if( fd_wksp_private_pinfo_idx_is_null( s ) ) break; /* Done */
698 :
699 : /* Pop traversal stack */
700 :
701 0 : i = s;
702 0 : s = fd_wksp_private_pinfo_idx( pinfo[ i ].stack_cidx );
703 :
704 : /* Visit i */
705 :
706 0 : ulong g1 = pinfo[ i ].gaddr_lo;
707 0 : if( g1 > g0 ) { /* There's a gap between i and the most recently added partition */
708 :
709 : /* Acquire an idle partition to hold the gap */
710 :
711 0 : if( FD_UNLIKELY( fd_wksp_private_idle_stack_is_empty( wksp ) ) ) {
712 0 : FD_LOG_WARNING(( "part_max (%lu) too small to fill gap before partition %lu (tag %lu gaddr_lo %lu gaddr_hi %lu)",
713 0 : part_max, i, pinfo[i].tag, pinfo[i].gaddr_lo, pinfo[i].gaddr_hi ));
714 0 : return FD_WKSP_ERR_CORRUPT;
715 0 : }
716 0 : ulong k = fd_wksp_private_idle_stack_pop( wksp, pinfo );
717 :
718 : /* Populate the acquired partition with the gap details,
719 : append it to the wksp partitioning and insert it into the
720 : free treap. Note that stack_push/pop reset gaddr_lo,
721 : gaddr_hi, tag, in_same, {prev, next, left, right, same,
722 : parent}_cidx. It preserved heap_prio from its original
723 : assignment and didn't touch stack_cidx or cycle_tag. */
724 :
725 0 : pinfo[ k ].gaddr_lo = g0;
726 0 : pinfo[ k ].gaddr_hi = g1;
727 0 : pinfo[ k ].prev_cidx = fd_wksp_private_pinfo_cidx( j );
728 0 : *j_next_cidx_ptr = fd_wksp_private_pinfo_cidx( k );
729 0 : j_next_cidx_ptr = &pinfo[ k ].next_cidx;
730 0 : j = k;
731 0 : g0 = g1;
732 :
733 0 : fd_wksp_private_free_treap_insert( j, wksp, pinfo );
734 0 : }
735 :
736 : /* Add i to the partitioning. */
737 :
738 0 : pinfo[ i ].prev_cidx = fd_wksp_private_pinfo_cidx( j );
739 0 : *j_next_cidx_ptr = fd_wksp_private_pinfo_cidx( i );
740 0 : j_next_cidx_ptr = &pinfo[ i ].next_cidx;
741 0 : j = i;
742 0 : g0 = pinfo[ i ].gaddr_hi;
743 :
744 : /* Traverse the right subtree */
745 :
746 0 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].right_cidx );
747 :
748 0 : } else {
749 :
750 : /* Push i to the stack and recurse on the left subtree. */
751 :
752 0 : pinfo[ i ].stack_cidx = fd_wksp_private_pinfo_cidx( s );
753 0 : s = i;
754 0 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].left_cidx );
755 :
756 0 : }
757 1 : }
758 :
759 1 : if( g0 < gaddr_hi ) { /* Have final gap to fill */
760 :
761 : /* This works the same as the above */
762 :
763 1 : if( FD_UNLIKELY( fd_wksp_private_idle_stack_is_empty( wksp ) ) ) {
764 0 : FD_LOG_WARNING(( "part_max (%lu) too small to complete partitioning", part_max ));
765 0 : return FD_WKSP_ERR_CORRUPT;
766 0 : }
767 1 : ulong k = fd_wksp_private_idle_stack_pop( wksp, pinfo );
768 :
769 1 : pinfo[ k ].gaddr_lo = g0;
770 1 : pinfo[ k ].gaddr_hi = gaddr_hi;
771 1 : pinfo[ k ].prev_cidx = fd_wksp_private_pinfo_cidx( j );
772 1 : *j_next_cidx_ptr = fd_wksp_private_pinfo_cidx( k );
773 1 : j_next_cidx_ptr = &pinfo[ k ].next_cidx;
774 1 : j = k;
775 : //g0 = gaddr_hi;
776 :
777 1 : fd_wksp_private_free_treap_insert( j, wksp, pinfo );
778 1 : }
779 :
780 1 : wksp->part_tail_cidx = fd_wksp_private_pinfo_cidx( j );
781 :
782 1 : } while(0);
783 :
784 1 : return FD_WKSP_SUCCESS;
785 1 : }
786 :
787 : #if defined(__linux__)
788 :
789 : #include <sys/mman.h>
790 : #include <errno.h>
791 :
792 : void
793 0 : fd_wksp_mprotect( fd_wksp_t * wksp, int flag ) {
794 0 : if( FD_UNLIKELY( !wksp ) ) {
795 0 : FD_LOG_WARNING(( "NULL wksp" ));
796 0 : return;
797 0 : }
798 :
799 0 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)wksp, FD_WKSP_ALIGN ) ) ) {
800 0 : FD_LOG_WARNING(( "bad align" ));
801 0 : return;
802 0 : }
803 :
804 0 : if( FD_UNLIKELY( wksp->magic!=FD_WKSP_MAGIC ) ) {
805 0 : FD_LOG_WARNING(( "bad magic" ));
806 0 : return;
807 0 : }
808 :
809 0 : ulong wksp_footprint = fd_wksp_footprint( wksp->part_max, wksp->data_max );
810 0 : if( FD_UNLIKELY( mprotect( (void*)wksp, wksp_footprint, ( flag ? PROT_READ : (PROT_READ|PROT_WRITE) ) ) ) ) {
811 0 : FD_LOG_WARNING(( "mprotect failed (%i-%s)", errno, fd_io_strerror( errno ) ));
812 0 : return;
813 0 : }
814 0 : }
815 :
816 : #endif
|