Line data Source code
1 : #ifndef HEADER_fd_src_discof_replay_fd_sched_h
2 : #define HEADER_fd_src_discof_replay_fd_sched_h
3 :
4 : #include "fd_rdisp.h"
5 : #include "../../disco/store/fd_store.h" /* for fd_store_fec_t */
6 : #include "../../disco/pack/fd_microblock.h" /* for fd_txn_p_t */
7 :
8 : #include "../../flamenco/accdb/fd_accdb_user.h"
9 : #include "../../util/spad/fd_spad.h" /* for ALUTs */
10 :
11 : /* fd_sched wraps all the smarts and mechanical chores around scheduling
12 : transactions for replay execution. It is built on top of the
13 : dispatcher fd_rdisp. The dispatcher is responsible for high
14 : performance lane-based scheduling of transactions. On top of that,
15 : we add fork-aware management of lanes, and policies regarding which
16 : lanes to prioritize for execution.
17 :
18 : Conceptually, transactions in a block form a DAG. We would like to
19 : make our way through a block with a sufficient degree of parallelism,
20 : such that the execution time of the critical path of the DAG is the
   limiting factor.  The dispatcher does a good job of surfacing the
22 : critical path of the DAG on the fly. Blocks are tracked by the
23 : dispatcher either as a block staged on a lane, or as an unstaged
24 : block. When a block is staged, it will enjoy the most intelligent
25 : online scheduling that the dispatcher has to offer. Lanes have to
26 : consist of linear chains of blocks down a fork. So to map a fork
27 : tree to lanes, we will need multiple lanes. Ideally, every branch in
28 : the fork tree sits on some lane. However, memory footprint limits us
   to a small number of lanes.
30 :
31 : This module implements a state machine for ensuring that blocks enter
   into and exit out of lanes in an orderly fashion.  The public APIs of
33 : this module are invoked to drive state transitions on a small number
34 : of events, such as new transactions arriving, or transactions
35 : completing, or a block being aborted/abandoned. We also implement
36 : policies for deciding which blocks get staged onto lanes, or evicted
37 : from lanes, as well as which lanes to prioritize for execution.
38 :
39 :
40 : The general order in which calls happen under the normal case is:
41 :
   fd_sched_fec_ingest()* ... fd_sched_task_next_ready()* ... fd_sched_task_done()* ...
   more ingest, more ready, more done ...
   ...
   fd_sched_task_next_ready() indicates that the last transaction in the block is being scheduled
   fd_sched_task_done()*
   caller determines that the block is done (NOTE(review): this step was
     originally written as fd_sched_block_is_done(), which is not declared
     in this header -- confirm the intended API)
   end-of-block processing in caller
   fd_sched_task_next_ready() starts returning transactions from the next block
   more ingest, more ready, more done ...
51 : ... */
52 :
53 : #define FD_SCHED_MIN_DEPTH 478
54 : #define FD_SCHED_MAX_DEPTH FD_RDISP_MAX_DEPTH
55 :
56 : struct fd_sched;
57 : typedef struct fd_sched fd_sched_t;
58 :
59 : struct fd_sched_alut_ctx {
60 : fd_accdb_user_t accdb[1];
61 : fd_funk_txn_xid_t xid[1];
62 : ulong els; /* Effective lookup slot. */
63 : };
64 : typedef struct fd_sched_alut_ctx fd_sched_alut_ctx_t;
65 :
66 : struct fd_sched_fec {
67 : ulong bank_idx; /* Index of the block. Assumed to be in [0, block_cnt_max). Caller
68 : is responsible for ensuring that bank idx is in bounds and unique
69 : across equivocated blocks. */
70 : ulong parent_bank_idx; /* Index of the parent block. Assumed to be in [0, block_cnt_max).
71 : Caller is responsible for ensuring that parent bank idx is in
72 : bounds and unique across equivocated blocks. */
73 : ulong slot; /* Slot number of the block. */
74 : ulong parent_slot; /* Slot number of the parent block. */
75 : fd_store_fec_t * fec; /* FEC set data. */
76 : uint shred_cnt; /* Number of shreds in the FEC set. */
77 : uint is_last_in_batch:1; /* Set if this is the last FEC set in the batch; relevant because the
78 : parser should ignore trailing bytes at the end of a batch. */
79 : uint is_last_in_block:1; /* Set if this is the last FEC set in the block. */
80 : uint is_first_in_block:1; /* Set if this is the first FEC set in the block. Bank should increment refcnt for sched if such a FEC set has been ingested by sched. */
81 :
82 : fd_sched_alut_ctx_t alut_ctx[ 1 ];
83 : };
84 : typedef struct fd_sched_fec fd_sched_fec_t;
85 :
86 : /* The state of a transaction. Non mutually exclusive. */
87 0 : #define FD_SCHED_TXN_EXEC_DONE (0x0001UL)
88 0 : #define FD_SCHED_TXN_SIGVERIFY_DONE (0x0002UL)
89 0 : #define FD_SCHED_TXN_IS_COMMITTABLE (0x0004UL)
90 0 : #define FD_SCHED_TXN_IS_FEES_ONLY (0x0008UL)
91 : #define FD_SCHED_TXN_REPLAY_DONE (FD_SCHED_TXN_EXEC_DONE|FD_SCHED_TXN_SIGVERIFY_DONE)
92 :
93 : struct fd_sched_txn_info {
94 : ulong flags;
95 : int txn_err;
96 : long tick_parsed;
97 : long tick_sigverify_disp;
98 : long tick_sigverify_done;
99 : long tick_exec_disp;
100 : long tick_exec_done;
101 : };
102 : typedef struct fd_sched_txn_info fd_sched_txn_info_t;
103 :
104 : /* The scheduler may return one of the following types of tasks for the
105 : replay tile.
106 :
107 : e - passed down to exec tiles.
108 : i - replay completes the task immediately.
109 : q - replay may either do it immediately or queue the task up. */
110 0 : #define FD_SCHED_TT_NULL (0UL)
111 0 : #define FD_SCHED_TT_BLOCK_START (1UL) /* (i) Start-of-block processing. */
112 0 : #define FD_SCHED_TT_BLOCK_END (2UL) /* (q) End-of-block processing. */
113 0 : #define FD_SCHED_TT_TXN_EXEC (3UL) /* (e) Transaction execution. */
114 0 : #define FD_SCHED_TT_TXN_SIGVERIFY (4UL) /* (e) Transaction sigverify. */
115 : #define FD_SCHED_TT_LTHASH (5UL) /* (e) Account lthash. */
116 0 : #define FD_SCHED_TT_POH_HASH (6UL) /* (e) PoH hashing. */
117 0 : #define FD_SCHED_TT_MARK_DEAD (7UL) /* (i) Mark the block dead. */
118 :
119 : struct fd_sched_block_start {
120 : ulong bank_idx; /* Same as in fd_sched_fec_t. */
121 : ulong parent_bank_idx; /* Same as in fd_sched_fec_t. */
122 : ulong slot; /* Slot number of the block. */
123 : };
124 : typedef struct fd_sched_block_start fd_sched_block_start_t;
125 :
126 : struct fd_sched_block_end {
127 : ulong bank_idx;
128 : };
129 : typedef struct fd_sched_block_end fd_sched_block_end_t;
130 :
131 : struct fd_sched_txn_exec {
132 : ulong bank_idx;
133 : ulong slot;
134 : ulong txn_idx;
135 : ulong exec_idx;
136 : };
137 : typedef struct fd_sched_txn_exec fd_sched_txn_exec_t;
138 :
139 : struct fd_sched_txn_sigverify {
140 : ulong bank_idx;
141 : ulong txn_idx;
142 : ulong exec_idx;
143 : };
144 : typedef struct fd_sched_txn_sigverify fd_sched_txn_sigverify_t;
145 :
146 : struct fd_sched_poh_hash {
147 : ulong bank_idx;
148 : ulong mblk_idx;
149 : ulong exec_idx;
150 : ulong hashcnt;
151 : fd_hash_t hash[ 1 ];
152 : };
153 : typedef struct fd_sched_poh_hash fd_sched_poh_hash_t;
154 :
155 : struct fd_sched_mark_dead {
156 : ulong bank_idx;
157 : };
158 : typedef struct fd_sched_mark_dead fd_sched_mark_dead_t;
159 :
160 : struct fd_sched_task {
161 : ulong task_type; /* Set to one of the task types defined above. */
162 : union {
163 : fd_sched_block_start_t block_start[ 1 ];
164 : fd_sched_block_end_t block_end[ 1 ];
165 : fd_sched_txn_exec_t txn_exec[ 1 ];
166 : fd_sched_txn_sigverify_t txn_sigverify[ 1 ];
167 : fd_sched_poh_hash_t poh_hash[ 1 ];
168 : fd_sched_mark_dead_t mark_dead[ 1 ];
169 : };
170 : };
171 : typedef struct fd_sched_task fd_sched_task_t;
172 :
173 : FD_PROTOTYPES_BEGIN
174 :
175 : /* fd_sched_{align,footprint} return the required alignment and
176 : footprint in bytes for a region of memory to be used as a scheduler.
177 : footprint silently returns 0 if params are invalid (thus convenient
178 : to validate params).
179 :
180 : depth controls the reorder buffer transaction count (~1 million
181 : recommended for live replay, ~10k recommended for async replay).
182 : block_cnt_max is the maximum number of blocks that will be tracked by
183 : the scheduler. */
184 :
185 : ulong
186 : fd_sched_align( void );
187 :
188 : ulong
189 : fd_sched_footprint( ulong depth, /* in [FD_SCHED_MIN_DEPTH,FD_SCHED_MAX_DEPTH] */
190 : ulong block_cnt_max ); /* >= 1 */
191 :
192 : /* fd_sched_new creates a sched object backed by the given memory region
193 : (conforming to align() and footprint()). Returns NULL if any
194 : parameter is invalid. */
195 :
196 : void *
197 : fd_sched_new( void * mem,
198 : ulong depth,
199 : ulong block_cnt_max,
200 : ulong exec_cnt );
201 :
202 : fd_sched_t *
203 : fd_sched_join( void * mem );
204 :
/* Add the data in the FEC set to the scheduler.  If fec->is_last_in_block
   is set, then this is the last FEC set in the block.  Transactions may span
207 : FEC set boundaries. The scheduler is responsible for incrementally
208 : parsing transactions from concatenated FEC set data. Assumes that
209 : FEC sets are delivered in replay order. That is, forks form a
210 : partial ordering over FEC sets: in-order per fork, but arbitrary
211 : ordering across forks. The fork tree is implied by the stream of
212 : parent-child relationships delivered in FEC sets. Also assumes that
213 : there is enough space in the scheduler to ingest the FEC set. The
214 : caller should generally call fd_sched_fec_can_ingest() first.
215 :
216 : Returns 1 on success, 0 if the block is bad and should be marked
217 : dead. */
218 : FD_WARN_UNUSED int
219 : fd_sched_fec_ingest( fd_sched_t * sched, fd_sched_fec_t * fec );
220 :
221 : /* Check if there is enough space in the scheduler to ingest the data in
222 : the FEC set. Returns 1 if there is, 0 otherwise. This is a cheap
223 : and conservative check. */
224 : int
225 : fd_sched_fec_can_ingest( fd_sched_t * sched, fd_sched_fec_t * fec );
226 :
227 : /* Returns the number of worst-case FEC sets sched can ingest. This is a
228 : cheap and conservative check. */
229 : ulong
230 : fd_sched_can_ingest_cnt( fd_sched_t * sched );
231 :
232 : /* Returns 1 if sched is drained, 0 otherwise. A drained scheduler will
233 : not return more work. Otherwise, next_ready will return more work,
234 : so long as there are exec tiles available. */
235 : int
236 : fd_sched_is_drained( fd_sched_t * sched );
237 :
238 : /* Obtain a transaction eligible for execution. This implies that all
239 : prior transactions with w-r or w-w conflicts have completed.
240 : Information regarding the scheduled transaction is written to the out
241 : pointer. Returns 1 on success, 0 on failure. Failures are generally
242 : transient and non-fatal, and are simply an indication that no
243 : transaction is ready for execution yet. When in-flight transactions
244 : retire or when more FEC sets are ingested, more transactions may
245 : become ready for execution.
246 :
247 : Transactions on the same fork will be returned in a way that
248 : maintains the serial fiction. That is, reordering can happen, but
249 : only within the constraint that transactions appear to be ready in
250 : the order in which they occur in the block. Transactions from
251 : different forks may interleave, and the caller should be prepared to
252 : switch execution context in response to interleavings. The scheduler
253 : will barrier on block boundaries, in the sense that transactions from
254 : a subsequent block will not be returned for execution until all
255 : transactions from the previous block have completed. This gives the
256 : caller a chance to perform end-of-block processing before
257 : transactions from a subsequent block start executing. In general,
258 : the caller should check if the last transaction in the current block
259 : is done, and if so, do end-of-block processing before calling this
260 : function to start the next block.
261 :
262 : In addition to returning transactions for execution, this function
263 : may also return a sigverify task. Sigverify can be completed
   asynchronously outside the critical path of transaction execution, as
265 : long as every transaction in a block passes sigverify before we
266 : commit the block. The scheduler prioritizes actual execution of
267 : transactions over sigverify, and in general sigverify tasks are only
268 : returned when no real transaction can be dispatched. In other words,
269 : the scheduler tries to exploit idle cycles in the exec tiles during
270 : times of low parallelism critical path progression.
271 :
272 : This function may also return a PoH hashing task. These tasks are
273 : lower priority than transaction execution, but higher priority than
274 : sigverify. This is because sigverify tasks are generally bite-sized,
275 : whereas PoH hashing can be longer, so we would like to get started on
276 : hashing sooner rather than later. */
277 : ulong
278 : fd_sched_task_next_ready( fd_sched_t * sched, fd_sched_task_t * out );
279 :
280 : /* Mark a task as complete. For transaction execution, this means that
281 : the effects of the execution are now visible on any core that could
282 : execute a subsequent transaction. Returns 0 on success, -1 if given
283 : the result of the task, the block turns out to be bad. -1 is only
284 : returned from PoH tasks.
285 :
286 : If a block has been abandoned or marked dead for any reason, it'll be
287 : pruned the moment in-flight task count hits 0 due to the last task
288 : completing. Then, in the immediate ensuing stem run loop,
289 : sched_pruned_next() will return the index for the corresponding bank
290 : so the refcnt can be decremented for sched.
291 :
292 : The transaction at the given index may be freed upon return from this
293 : function. Nonetheless, as long as there is no intervening FEC
294 : ingestion, it would still be safe to query the transaction using
295 : get_txn(). */
296 : int
297 : fd_sched_task_done( fd_sched_t * sched, ulong task_type, ulong txn_idx, ulong exec_idx, void * data );
298 :
299 : /* Abandon a block. This means that we are no longer interested in
300 : executing the block. This also implies that any block which chains
301 : off of the provided block shall be abandoned. This is mainly used
302 : when a block is aborted because we decided that it would be a
303 : dead/invalid block, and so there's no point in spending resources
304 : executing it. The scheduler will no longer return transactions from
305 : abandoned blocks for execution. This should only be invoked on an
306 : actively replayed block, and should only be invoked once on it.
307 :
308 : An abandoned block will be pruned from sched as soon as, and only if,
309 : the block has no more in-flight tasks associated with it. No sooner,
310 : no later. In the immediate ensuing stem run loop,
311 : sched_pruned_next() will return the index for the corresponding bank
312 : so the refcnt can be decremented for sched. After that point, the
313 : bank_idx may be recycled for another block. */
314 : void
315 : fd_sched_block_abandon( fd_sched_t * sched, ulong bank_idx );
316 :
317 : /* Add a block as immediately done to the scheduler. This is useful for
318 : installing the snapshot slot, or for informing the scheduler of a
319 : packed leader block. Parent block should be ULONG_MAX for the
320 : snapshot slot, and otherwise a block that hasn't been pruned. */
321 : void
322 : fd_sched_block_add_done( fd_sched_t * sched, ulong bank_idx, ulong parent_bank_idx, ulong slot );
323 :
324 : /* Advance the root, pruning all blocks across forks that do not descend
325 : from the new root. Assumes the new root is in the fork tree and
326 : connected to the current root. Also assumes that there are no more
327 : in-flight transactions from the soon-to-be-pruned blocks. This
328 : should be called after root_notify() and the caller is responsible
329 : for figuring out the new root to safely prune to. */
330 : void
331 : fd_sched_advance_root( fd_sched_t * sched, ulong root_idx );
332 :
333 : /* Notify the scheduler of a new root. This has the effect of calling
334 : abandon() on all minority forks that do not descend from the new
335 : root. Shortly after a call to this function, in-flight transactions
336 : from these abandoned blocks should retire from the execution
337 : pipeline, and the new root will be safe for pruning. */
338 : void
339 : fd_sched_root_notify( fd_sched_t * sched, ulong root_idx );
340 :
341 : /* Returns the index of a bank whose refcnt should be decremented for
342 : sched. This function should be called in a loop to drain all
343 : outstanding refcnt decrements before any other sched API is called in
344 : a stem run loop. Returns ULONG_MAX when there are no more
   outstanding references from sched and the loop should break. */
346 : ulong
347 : fd_sched_pruned_block_next( fd_sched_t * sched );
348 :
349 : void
350 : fd_sched_set_poh_params( fd_sched_t * sched, ulong bank_idx, ulong tick_height, ulong max_tick_height, ulong hashes_per_tick, fd_hash_t const * start_poh );
351 :
352 : fd_txn_p_t *
353 : fd_sched_get_txn( fd_sched_t * sched, ulong txn_idx );
354 :
355 : fd_sched_txn_info_t *
356 : fd_sched_get_txn_info( fd_sched_t * sched, ulong txn_idx );
357 :
358 : fd_hash_t *
359 : fd_sched_get_poh( fd_sched_t * sched, ulong bank_idx );
360 :
361 : uint
362 : fd_sched_get_shred_cnt( fd_sched_t * sched, ulong bank_idx );
363 :
364 : void
365 : fd_sched_metrics_write( fd_sched_t * sched );
366 :
367 : /* Serialize the current state as a cstr to the returned buffer. Caller
368 : may read from the buffer until the next invocation of any fd_sched
369 : function. */
370 : char *
371 : fd_sched_get_state_cstr( fd_sched_t * sched );
372 :
373 : void *
374 : fd_sched_leave( fd_sched_t * sched );
375 :
376 : void *
377 : fd_sched_delete( void * mem );
378 :
379 : FD_PROTOTYPES_END
380 :
381 : #endif /* HEADER_fd_src_discof_replay_fd_sched_h */
|