Line data Source code
1 : #ifndef HEADER_fd_src_flamenco_stakes_fd_vote_stakes_h
2 : #define HEADER_fd_src_flamenco_stakes_fd_vote_stakes_h
3 :
4 : #include "../../util/fd_util_base.h"
5 : #include "../types/fd_types_custom.h"
6 :
7 : /* fd_vote_stakes_t is a data structure that stores vote account stake
8 : updates across epoch boundaries. It offers a mapping from vote
9 : account pubkeys to their t_1 and t_2 stakes. The structure is
10 : designed to work with a large amount of vote accounts (in the order
11 : of 10s of millions) along with a relatively small number of forks
12 : across epoch boundaries.
13 :
14 : The structure is designed around these operations:
15 : 1. Inserting updated vote account stakes into a given fork.
16 : 2. Querying the stake for a given vote account with a given fork.
17 :
18 : Concurrent queries are allowed but concurrent inserts are not. This
19 : is fine because the structure is only modified during boot and during
20 : the epoch boundary.
21 :
22 : Given a large number of vote accounts (e.g. 2^25 = 33,554,432), we
23 : need to store the pubkey, t_1 stake, and t_2 stake for each vote
24 : account. We also need to store potentially ~32 forks across each
25 : epoch boundary. If done naively, this would require
26 : 2^25 * (32 + 8 + 8) * 32 = 51GB of memory not including any overhead
27 : for maintaining lookups for accounts.
28 :
29 : To avoid this, we can use some important runtime protocol properties.
30 : The most notable is that across forks, we will only have a small
31 : number of differences in vote account stakes: this is because
32 : realistically very few vote accounts will have differences in stakes
33 : that are caused by forks right before the epoch boundary. So we can
34 : set our bound on elements as the total number of vote accounts +
35 : stakes across forks. Let's say this is 2^26 if the max number of
36 : vote accounts we want to support is 2^25. Now to store our index we
37 : only need:
38 : 2^26 * (32 + 8 + 8) = 3GB of memory (not including overhead).
39 : Our index structure will look like this:
40 : pool<pubkey, stake_t_1, stake_t_2>
41 :
42 : What is described above the index of all vote accounts. We need to
43 : account for the stakes across forks. We have to maintain a list of
44 : all of the index entries used by each fork. It is sufficient to use
45 : a uint list of indices into the index. So each fork is just:
46 : pool<uint>.
47 :
48 : 4 (uint pool idx) * 2^25 (# vote accounts) * 32 (# forks) = 4GiB
49 :
50 : For a given fork, when we insert a vote account we check if:
51 : 1. The vote account + stake pair is already in the index. If it is,
52 : we just increment a reference to the pair. Add it to the list of
53 : pool indices that the fork maintains.
54 : 2. The vote account + stake pair is not in the index. If it is not,
55 : we need to insert it into the index and assume a reference of 1.
56 : Add it to the local list of pool indices that the fork maintains.
57 : In order to make both of these cases performant we need a mapping
58 : from pubkey + stake to the index pool element. This is simply
59 : represented as:
60 : map<(pubkey+stake), index_pool_idx>.
61 :
62 : Now for queries, we need a way to query the t_1 and t_2 stake for a
63 : pubkey on a given fork. The problem is the above map requires the
64 : t_1 stake as part of the key and there are potentially multiple
65 : entries for a given pubkey. This is solved with a
66 : multi_map<pubkey, index_pool_idx>. We query for a given pubkey and
67 : iterate through all of the entries: this is an acceptable trade-off
68 : since there will almost always be one element for a given pubkey
69 : (except around the epoch boundary). However, for each index entry
70 : we need a way to make sure that it's the one that is in our fork's
71 : list of indices. This is solved with a map for each fork that is
72 : keyed by the index pool idx.
73 : map<index_pool_idx, fork list entry>.
74 :
75 : Now, we can quickly insert entries for a given fork and also do fast
76 : queries for a given pubkey.
77 :
78 : The only remaining operation is updating the root fork. If we are
79 : updating which fork is the root, we can safely assume that all other
80 : forks are no longer valid:
81 : 1. either the fork was a competing fork that executed the epoch
82 : boundary and is no longer needed
83 : 2. the fork corresponds to a fork for the previous epoch boundary.
84 :
85 : For any fork that's being removed, we need to reset its fork's pool
86 : and remove any references to the index pool entries. If an index
87 : entry has a reference count of 0, we can remove it from the index
88 : entirely. Under the hood, the forks in use are stored in a deque;
89 : when a root is being advanced, all entries from the deque are removed
90 : and each removed fork's entries are released.
91 :
92 : The memory footprint of what is actually described above is larger
93 : because each key of the index needs to be a compound of
94 : the pubkey, stake_t_1, node_account_t_1, and epoch.
95 :
96 : As an important note, the vote stakes object can be used globally
97 : across different threads, but it is not safe to access concurrently.
98 : The caller is responsible for ensuring that reads and writes are
99 : properly synchronized. */
100 :
101 : FD_PROTOTYPES_BEGIN
102 :
103 60595 : #define FD_VOTE_STAKES_ALIGN (128UL)
104 :
105 : struct fd_vote_stakes;
106 : typedef struct fd_vote_stakes fd_vote_stakes_t;
107 :
108 : /* fd_vote_stakes_align returns the minimum alignment required for the
109 : fd_vote_stakes_t struct. */
110 :
111 : ulong
112 : fd_vote_stakes_align( void );
113 :
114 : /* fd_vote_stakes_footprint returns the minimum footprint required for
115 : the fd_vote_stakes_t object given the max number of vote accounts,
116 : the max fork width (number of forks that can cross the epoch
117 : boundary), and the max number of map chains. The map chains should
118 : be a power of 2 that is roughly equal to the expected number of vote
119 : accounts and not the maximum. */
120 :
121 : ulong
122 : fd_vote_stakes_footprint( ulong max_vote_accounts,
123 : ulong expected_vote_accounts,
124 : ulong max_fork_width );
125 :
126 :
127 : /* fd_vote_stakes_new creates a new fd_vote_stakes_t object given a
128 : region of memory sized out according to fd_vote_stakes_footprint. */
129 :
130 : void *
131 : fd_vote_stakes_new( void * shmem,
132 : ulong max_vote_accounts,
133 : ulong expected_vote_accounts,
134 : ulong max_fork_width,
135 : ulong seed );
136 :
137 :
138 : /* fd_vote_stakes_join joins a valid fd_vote_stakes_t object from a
139 : region of memory. */
140 :
141 : fd_vote_stakes_t *
142 : fd_vote_stakes_join( void * shmem );
143 :
144 : /* fd_vote_stakes_root_{insert, update}_key are APIs for
145 : inserting and updating keys for the root fork. These
146 : operations are split out in order to support the snapshot loading
147 : process. The set of stakes from the T-1 epoch are inserted into
148 : the root fork with a call to fd_vote_stakes_root_insert_key. The
149 : set of stakes from the T-2 epoch are updated with a call to
150 : fd_vote_stakes_root_update_meta. The caller is responsible for
151 : ensuring that for a given pubkey, insert_key is called before
152 : update_meta. It is important that these APIs should only be called
153 : while the root fork is the only and current fork in use.
154 :
155 : If update_meta is called on a key that has not had a corresponding
156 : insert_key call, a key is created into the root fork with a t-1 stake
157 : of 0. This usually means the vote account has been deleted, but it
158 : can be possible in the case where the only staker of a vote account
159 : has been marked delinquent in epoch T-1 and needs to be counted
160 : towards clock calculation for the rest of the epoch. */
161 :
162 : void
163 : fd_vote_stakes_root_insert_key( fd_vote_stakes_t * vote_stakes,
164 : fd_pubkey_t const * pubkey,
165 : fd_pubkey_t const * node_account_t_1,
166 : ulong stake_t_1,
167 : ulong epoch );
168 :
169 : void
170 : fd_vote_stakes_root_update_meta( fd_vote_stakes_t * vote_stakes,
171 : fd_pubkey_t const * pubkey,
172 : fd_pubkey_t const * node_account_t_2,
173 : ulong stake_t_2,
174 : ulong epoch );
175 :
176 : /* fd_vote_stakes_insert_{key, update, fini} is API for inserting
177 : entries into a given fork. It reflects the access pattern during
178 : epoch rewards, where the current stake for a vote account is
179 : accumulated by iterating over the set of vote accounts. The caller
180 : is responsible for ensuring that fd_vote_stakes_insert_key is only
181 : called once for each vote account. It is unsafe to call any
182 : other vote_stakes API between calls to insert_key and insert_fini
183 : except other insert_* APIs.
184 :
185 : The calling pattern is as follows:
186 :
187 : for each vote account: call fd_vote_stakes_insert_key() once
188 : for each stake delegation: call fd_vote_stakes_insert_update()
189 :
190 : after all entries are inserted, call fd_vote_stakes_insert_fini()
191 :
192 : Under the hood, insert_key inserts an entry into the fork's map and
193 : into the index. Each call to insert_update increments the stake for
194 : the given vote account. insert_fini will either dedup the entry if
195 : one already exists, or insert a new map entry. */
196 :
197 : void
198 : fd_vote_stakes_insert_key( fd_vote_stakes_t * vote_stakes,
199 : ushort fork_idx,
200 : fd_pubkey_t const * pubkey,
201 : fd_pubkey_t const * node_account_t_1,
202 : fd_pubkey_t const * node_account_t_2,
203 : ulong stake_t_2,
204 : ulong epoch,
205 : uchar exists_curr );
206 :
207 : void
208 : fd_vote_stakes_insert_update( fd_vote_stakes_t * vote_stakes,
209 : ushort fork_idx,
210 : fd_pubkey_t const * pubkey,
211 : ulong stake );
212 :
213 : void
214 : fd_vote_stakes_insert_fini( fd_vote_stakes_t * vote_stakes,
215 : ushort fork_idx );
216 :
217 : /* fd_vote_stakes_genesis_fini finalizes the vote stakes on the genesis
218 : block. Any vote stakes that have been inserted will be updated to
219 : have identical T-1/T-2 stakes and node accounts. This function
220 : assumes that all vote accounts have already been inserted into the
221 : genesis fork. */
222 :
223 : void
224 : fd_vote_stakes_genesis_fini( fd_vote_stakes_t * vote_stakes );
225 :
226 : /* fd_vote_stakes_new_child creates a new child fork and returns the
227 : index identifier for the new fork. */
228 :
229 : ushort
230 : fd_vote_stakes_new_child( fd_vote_stakes_t * vote_stakes );
231 :
232 : /* fd_vote_stakes_advance_root will move the root fork to the new
233 : candidate root fork. If the root_idx is equal to the root, this
234 : function is a no-op. However, if the root is different, all other
235 : child nodes will be removed from the structure. */
236 :
237 : void
238 : fd_vote_stakes_advance_root( fd_vote_stakes_t * vote_stakes,
239 : ushort root_idx );
240 :
241 : /* fd_vote_stakes_query_stake queries the stake for a given vote account
242 : in the given fork. If the element is found returns 1, otherwise
243 : returns 0. If any of the optional fields are set to NULL, then their
244 : corresponding value will not be set. If the stake_t_{1,2}_out_opt is
245 : set to 0UL and the record is found, that means the vote account
246 : either did not exist at the end of the t-{1,2} epoch boundary or had
247 : zero stake: they are treated as the same thing. */
248 :
249 : int
250 : fd_vote_stakes_query( fd_vote_stakes_t const * vote_stakes,
251 : ushort fork_idx,
252 : fd_pubkey_t const * pubkey,
253 : ulong * stake_t_1_out_opt,
254 : ulong * stake_t_2_out_opt,
255 : fd_pubkey_t * node_account_t_1_out_opt,
256 : fd_pubkey_t * node_account_t_2_out_opt );
257 :
258 : int
259 : fd_vote_stakes_query_pubkey( fd_vote_stakes_t const * vote_stakes,
260 : ushort fork_idx,
261 : fd_pubkey_t const * pubkey );
262 :
263 : /* fd_vote_stakes_query_t_1 and fd_vote_stakes_query_t_2 are shortcuts
264 : for querying the t_1 and t_2 stake for a given vote account in the
265 : given fork. 0 is returned if the vote account does not exist for the
266 : epoch or if it has zero stake. If the account is found, stake_out
267 : and node_account_out will be set. */
268 :
269 : int
270 : fd_vote_stakes_query_t_1( fd_vote_stakes_t const * vote_stakes,
271 : ushort fork_idx,
272 : fd_pubkey_t const * pubkey,
273 : ulong * stake_out,
274 : fd_pubkey_t * node_account_out );
275 :
276 : int
277 : fd_vote_stakes_query_t_2( fd_vote_stakes_t const * vote_stakes,
278 : ushort fork_idx,
279 : fd_pubkey_t const * pubkey,
280 : ulong * stake_out,
281 : fd_pubkey_t * node_account_out );
282 :
283 : /* fd_vote_stakes_ele_cnt returns the number of entries for a given
284 : fork. */
285 :
286 : uint
287 : fd_vote_stakes_ele_cnt( fd_vote_stakes_t * vote_stakes,
288 : ushort fork_idx );
289 :
290 : /* fd_vote_stakes_get_root_idx returns the index of the root fork. */
291 :
292 : ushort
293 : fd_vote_stakes_get_root_idx( fd_vote_stakes_t * vote_stakes );
294 :
295 : /* fd_vote_stakes_reset resets the vote stakes object to the initial
296 : state. This is useful for resetting vote stakes if a new snapshot
297 : manifest is being loaded. */
298 :
299 : void
300 : fd_vote_stakes_reset( fd_vote_stakes_t * vote_stakes );
301 :
302 : #define FD_VOTE_STAKES_ITER_FOOTPRINT (16UL)
303 : #define FD_VOTE_STAKES_ITER_ALIGN (8UL)
304 : struct stakes_map_iter_t;
305 : typedef struct stakes_map_iter_t fd_vote_stakes_iter_t;
306 :
307 : /* A caller can iterate through the entries for a given fork. The
308 : iterator is initialized by a call to fd_vote_stakes_fork_iter_init.
309 : The caller is responsible for managing the memory for the iterator.
310 : It is safe to call fd_vote_stakes_fork_iter_next if the result of
311 : fd_vote_stakes_fork_iter_done() == 0. It is safe to call
312 : fd_vote_stakes_fork_iter_ele() to get the current entry if there is
313 : a valid initialized iterator. fd_vote_stakes_fork_iter_next is
314 : called to advance the iterator.
315 :
316 : It is not safe to call any other vote stakes apis while an iteration
317 : is in progress.
318 :
319 : Example use:
320 : uchar __attribute__((aligned(FD_VOTE_STAKES_ITER_ALIGN))) iter_mem[ FD_VOTE_STAKES_ITER_FOOTPRINT ];
321 : for( fd_vote_stakes_iter_t * iter = fd_vote_stakes_fork_iter_init( vote_stakes, fork_idx, iter_mem );
322 : !fd_vote_stakes_fork_iter_done( vote_stakes, fork_idx, iter );
323 : fd_vote_stakes_fork_iter_next( vote_stakes, fork_idx, iter ) ) {
324 : fd_vote_stakes_fork_iter_ele( vote_stakes, fork_idx, iter, &pubkey, &stake_t_1, &stake_t_2, &node_account_t_1, &node_account_t_2 );
325 : }
326 :
327 : Under the hood, the vote stakes iterator is a wrapper of the map
328 : chain iterator.
329 :
330 : TODO: fork_idx can probably get absorbed into the iterator. */
331 :
332 : fd_vote_stakes_iter_t *
333 : fd_vote_stakes_fork_iter_init( fd_vote_stakes_t * vote_stakes,
334 : ushort fork_idx,
335 : uchar iter_mem[ static FD_VOTE_STAKES_ITER_FOOTPRINT ] );
336 :
337 : int
338 : fd_vote_stakes_fork_iter_done( fd_vote_stakes_t * vote_stakes,
339 : ushort fork_idx,
340 : fd_vote_stakes_iter_t * iter );
341 :
342 : void
343 : fd_vote_stakes_fork_iter_next( fd_vote_stakes_t * vote_stakes,
344 : ushort fork_idx,
345 : fd_vote_stakes_iter_t * iter );
346 :
347 : void
348 : fd_vote_stakes_fork_iter_ele( fd_vote_stakes_t * vote_stakes,
349 : ushort fork_idx,
350 : fd_vote_stakes_iter_t * iter,
351 : fd_pubkey_t * pubkey_out,
352 : ulong * stake_t_1_out_opt,
353 : ulong * stake_t_2_out_opt,
354 : fd_pubkey_t * node_account_t_1_out_opt,
355 : fd_pubkey_t * node_account_t_2_out_opt );
356 :
357 : FD_PROTOTYPES_END
358 :
359 : #endif
|