Line data Source code
1 : #ifndef HEADER_fd_discof_restore_fd_snapwm_tile_private_h
2 : #define HEADER_fd_discof_restore_fd_snapwm_tile_private_h
3 :
4 : /* fd_snapwm_tile_private.h contains private APIs for the "snapwm" tile,
5 : which is the tile responsible for directing vinyl database writes. */
6 :
7 : #include "utils/fd_slot_delta_parser.h"
8 : #include "utils/fd_ssparse.h"
9 : #include "utils/fd_vinyl_admin.h"
10 : #include "../../ballet/lthash/fd_lthash.h"
11 : #include "../../ballet/lthash/fd_lthash_adder.h"
12 : #include "../../disco/stem/fd_stem.h"
13 : #include "../../disco/topo/fd_topo.h"
14 : #include "../../vinyl/io/fd_vinyl_io.h"
15 : #include "../../vinyl/meta/fd_vinyl_meta.h"
16 :
17 0 : #define FD_SNAPWM_WR_MTU (16UL<<20) /* 16 MiB (in bytes) */
18 0 : #define FD_SNAPWM_PAIR_BATCH_CNT_MAX (FD_SSPARSE_ACC_BATCH_MAX) /* max pairs per batch; kept equal to the ssparse account batch limit */
19 0 : #define FD_SNAPWM_PAIR_SZ_MAX (fd_vinyl_bstream_pair_sz(FD_RUNTIME_ACC_SZ_MAX)) /* bstream pair size computed for FD_RUNTIME_ACC_SZ_MAX (presumably the largest account -- confirm) */
20 0 : #define FD_SNAPWM_PAIR_BATCH_SZ_MAX (FD_SNAPWM_PAIR_BATCH_CNT_MAX*FD_SNAPWM_PAIR_SZ_MAX) /* upper bound for a full batch: max pair count times max pair size */
21 :
22 0 : #define FD_SNAPWM_DUP_META_BATCH_CNT_MAX (FD_SNAPWM_PAIR_BATCH_CNT_MAX) /* same entry limit as a pair batch */
23 0 : #define FD_SNAPWM_DUP_META_SZ (sizeof(ulong)+sizeof(fd_vinyl_bstream_phdr_t)) /* one ulong plus one pair header per entry; presumably the (seq, phdr) given to fd_snapwm_vinyl_duplicate_accounts_batch_append -- confirm */
24 0 : #define FD_SNAPWM_DUP_META_BATCH_SZ (FD_SNAPWM_DUP_META_BATCH_CNT_MAX*FD_SNAPWM_DUP_META_SZ) /* byte size of a full duplicate-meta batch */
25 :
26 0 : #define FD_SNAPWM_DUP_BATCH_CREDIT_MIN (1UL) /* presumably the min credits needed before starting a duplicate accounts batch -- confirm */
27 0 : #define FD_SNAPWM_DUP_LTHASH_CREDIT_MIN ((sizeof(fd_ssctrl_hash_result_t)+(ctx->hash_out.mtu-1))/ctx->hash_out.mtu) /* ceil( sizeof(fd_ssctrl_hash_result_t) / hash_out.mtu ).  NOTE: not hygienic: expands to an expression that needs a variable named ctx (with a hash_out member) in scope at the use site, reads ctx->hash_out.mtu twice, and divides by it (mtu must be non-zero) */
28 :
29 : struct fd_snapwm_out_link { /* state for one output link; fd_snapwm_tile embeds one as hash_out */
30 : ulong idx; /* presumably the index of this output link -- TODO confirm */
31 : fd_wksp_t * mem; /* workspace pointer (presumably the one backing the link's data region -- TODO confirm) */
32 : ulong chunk0; /* presumably the lowest usable chunk index -- TODO confirm */
33 : ulong wmark; /* presumably the highest usable chunk index (watermark) -- TODO confirm */
34 : ulong chunk; /* presumably the chunk the next message will be written to -- TODO confirm */
35 : ulong mtu; /* size in bytes; FD_SNAPWM_DUP_LTHASH_CREDIT_MIN divides by it to count credits (presumably the link's max message size) */
36 : ulong depth; /* presumably the depth of the link -- TODO confirm */
37 : ulong const * consumer_fseq; /* read-only pointer (presumably the consumer's flow-control sequence -- TODO confirm) */
38 : };
39 : typedef struct fd_snapwm_out_link fd_snapwm_out_link_t;
40 :
41 : struct fd_snapwm_tile { /* private state of the snapwm tile; this is the ctx argument of the fd_snapwm_vinyl_* functions */
42 : int state; /* current tile state; the valid values are not defined in this header */
43 : uint full : 1; /* loading a full snapshot? */
44 : uint lthash_disabled : 1; /* disable lthash checking? */
45 :
46 : ulong seed;
47 : long boot_timestamp;
48 :
49 : fd_sstxncache_entry_t * txncache_entries; /* txncache entries (presumably an array whose length is read via txncache_entries_len_ptr -- confirm) */
50 : ulong const * txncache_entries_len_ptr; /* read-only through this pointer */
51 :
52 : struct {
53 : /* Account counters (full + incremental) */
54 : ulong accounts_loaded;
55 : ulong accounts_replaced;
56 : ulong accounts_ignored;
57 :
58 : /* Account counters (snapshot taken for full snapshot only) */
59 : ulong full_accounts_loaded;
60 : ulong full_accounts_replaced;
61 : ulong full_accounts_ignored;
62 : } metrics;
63 :
64 : struct {
65 : fd_wksp_t * wksp;
66 : ulong chunk0;
67 : ulong wmark;
68 : ulong mtu;
69 : ulong pos;
70 : } in; /* presumably the state of the tile's input link -- TODO confirm */
71 :
72 : ulong out_ct_idx; /* presumably the index of an output link -- TODO confirm which one */
73 : fd_snapwm_out_link_t hash_out; /* output link; its mtu is read by FD_SNAPWM_DUP_LTHASH_CREDIT_MIN */
74 :
75 : struct {
76 : uchar * bstream_mem; /* presumably the mapped bstream region (see fd_snapwm_vinyl_privileged_init) -- confirm */
77 : ulong bstream_sz; /* presumably the size of bstream_mem in bytes -- confirm */
78 :
79 : ulong pair_cnt;
80 : ulong full_pair_cnt; /* presumably pair_cnt as of the full snapshot -- confirm */
81 : ulong pair_cnt_max;
82 :
83 : /* Vinyl in either io_wd or io_mm mode */
84 : fd_vinyl_io_t * io; /* presumably the currently selected backend (io_wd or io_mm) -- confirm */
85 : fd_vinyl_io_t * io_wd; /* "fast dumb direct account insertion" backend (see fd_snapwm_vinyl_wd_init) */
86 : fd_vinyl_io_t * io_mm; /* generic vinyl accessor backend (see fd_snapwm_vinyl_wd_fini) */
87 : ulong io_seed;
88 :
89 : fd_vinyl_meta_t map[1]; /* vinyl meta map, stored inline as a one-element array */
90 :
91 : ulong txn_seq; /* bstream seq of first txn record (in [seq_past,seq_present]) */
92 : uint txn_active : 1; /* presumably set between txn_begin and txn_commit/txn_cancel -- confirm */
93 : uint txn_commit : 1;
94 :
95 : ulong duplicate_accounts_batch_sz; /* presumably bytes accumulated by fd_snapwm_vinyl_duplicate_accounts_batch_* -- confirm */
96 : ulong duplicate_accounts_batch_cnt; /* presumably entries accumulated by fd_snapwm_vinyl_duplicate_accounts_batch_* -- confirm */
97 :
98 : fd_lthash_adder_t adder;
99 : fd_lthash_value_t running_lthash;
100 : ulong running_capitalization; /* stores capitalization of duplicate accounts */
101 :
102 : ulong wr_cnt;
103 : fd_vinyl_admin_t * admin; /* vinyl admin object (see fd_snapwm_vinyl_{init,update}_admin) */
104 :
105 : struct {
106 : ulong seq_ancient;
107 : ulong seq_past;
108 : ulong seq_present;
109 : ulong seq_future;
110 : } recovery; /* presumably the bstream seq values handled by fd_snapwm_vinyl_recovery_seq_{backup,apply} -- confirm */
111 : } vinyl;
112 : };
113 :
114 : typedef struct fd_snapwm_tile fd_snapwm_tile_t;
115 :
116 : FD_PROTOTYPES_BEGIN
117 :
118 0 : #define FD_SNAPWM_IO_SPAD_MAX (64UL<<20) /* 64 MiB of I/O scratch space */
119 :
120 : /* fd_snapwm_vinyl_privileged_init performs administrative tasks, such
121 : as opening and mapping the bstream file descriptor. */
122 :
123 : void
124 : fd_snapwm_vinyl_privileged_init( fd_snapwm_tile_t * ctx,
125 : fd_topo_t * topo,
126 : fd_topo_tile_t * tile );
127 :
128 : /* fd_snapwm_vinyl_unprivileged_init performs setup tasks after being
129 : sandboxed. (anything that might be exposed to untrusted data) */
130 :
131 : void
132 : fd_snapwm_vinyl_unprivileged_init( fd_snapwm_tile_t * ctx,
133 : fd_topo_t * topo,
134 : fd_topo_tile_t * tile,
135 : void * io_mm_mem,
136 : void * io_wd_mem );
137 :
138 : /* fd_snapwm_vinyl_seccomp returns a seccomp sandbox policy suitable
139 : for vinyl operation. */
140 :
141 : ulong
142 : fd_snapwm_vinyl_seccomp( ulong out_cnt,
143 : struct sock_filter * out );
144 :
145 : /* fd_snapwm_vinyl_reset pauses the snapwr tile (waits for the snapwr
146 : tile to ack) and formats a bstream file to be empty. THIS IS A
147 : DESTRUCTIVE ACTION. */
148 :
149 : void
150 : fd_snapwm_vinyl_reset( fd_snapwm_tile_t * ctx );
151 :
152 : /* fd_snapwm_vinyl_txn_begin starts a transactional burst write.
153 : Assumes vinyl uses the io_mm backend. The write can then either be
154 : committed or cancelled. There is no practical limit on the size of
155 : this burst. */
156 :
157 : void
158 : fd_snapwm_vinyl_txn_begin( fd_snapwm_tile_t * ctx );
159 :
160 : /* fd_snapwm_vinyl_txn_commit finishes a transactional burst write.
161 : Assumes vinyl uses the io_mm backend. Reads through bstream records
162 : written since txn_begin was called and updates the vinyl_meta index. */
163 :
164 : void
165 : fd_snapwm_vinyl_txn_commit( fd_snapwm_tile_t * ctx, fd_stem_context_t * stem );
166 :
167 : /* fd_snapwm_vinyl_txn_cancel abandons a transactional burst write.
168 : Assumes vinyl uses the io_mm backend. Reverts the bstream state to
169 : when txn_begin was called. */
170 :
171 : void
172 : fd_snapwm_vinyl_txn_cancel( fd_snapwm_tile_t * ctx );
173 :
174 : /* fd_snapwm_vinyl_wd_init transitions the vinyl backend from generic
175 : vinyl accessor (io_mm) to fast dumb direct account insertion (io_wd).
176 : This must be called before calling fd_snapwm_process_account_*.
177 : Starts the snapwr tile (waits for the snapwr tile to ack). */
178 :
179 : void
180 : fd_snapwm_vinyl_wd_init( fd_snapwm_tile_t * ctx );
181 :
182 : /* fd_snapwm_vinyl_wd_fini transitions the vinyl backend from fast dumb
183 : direct account insertion (io_wd) back to generic mode (io_mm).
184 : Pauses the snapwr tile (waits for the snapwr tile to ack). */
185 :
186 : void
187 : fd_snapwm_vinyl_wd_fini( fd_snapwm_tile_t * ctx );
188 :
189 : /* fd_snapwm_vinyl_shutdown instructs vinyl-related tiles of the loader
190 : to shut down. Blocks until all affected tiles have acknowledged the
191 : shutdown signal. */
192 :
193 : void
194 : fd_snapwm_vinyl_shutdown( fd_snapwm_tile_t * ctx );
195 :
196 : /* fd_snapwm_vinyl_process_account reads a set of pre-generated bstream
197 : pairs and decides whether to actually add them to the vinyl database.
198 : It supports batch mode as well as single account (pair). */
199 :
200 : void
201 : fd_snapwm_vinyl_process_account( fd_snapwm_tile_t * ctx,
202 : ulong chunk,
203 : ulong acc_cnt,
204 : fd_stem_context_t * stem );
205 :
206 : /* fd_snapwm_vinyl_read_account retrieves an account from the vinyl
207 : database. */
208 :
209 : void
210 : fd_snapwm_vinyl_read_account( fd_snapwm_tile_t * ctx,
211 : void const * acct_addr,
212 : fd_account_meta_t * meta,
213 : uchar * data,
214 : ulong data_max );
215 :
216 : /* fd_snapwm_vinyl_duplicate_accounts_batch_{init,append,fini} handle
217 : duplicate accounts batching when lthash computation is enabled.
218 : The batch is needed to minimize the STEM_BURST, and make the stem
219 : credit handling possible. _fini is responsible for sending the
220 : message downstream.
221 :
222 : Typical usage:
223 : fd_snapwm_vinyl_duplicate_accounts_batch_init( ctx, stem );
224 : for(...) {
225 : ...
226 : fd_snapwm_vinyl_duplicate_accounts_batch_append( ctx, phdr, seq );
227 : }
228 : fd_snapwm_vinyl_duplicate_accounts_batch_fini( ctx, stem );
229 :
230 : They all return 1 on success, and 0 otherwise.
231 :
232 : IMPORTANT: there is an fseq check inside init, since every append
233 : modifies the output link's dcache directly. However, there is no
234 : fseq check inside fini. This is a performance optimization, which
235 : requires no fd_stem_publish between init and fini. */
236 : int
237 : fd_snapwm_vinyl_duplicate_accounts_batch_init( fd_snapwm_tile_t * ctx,
238 : fd_stem_context_t * stem );
239 : int
240 : fd_snapwm_vinyl_duplicate_accounts_batch_append( fd_snapwm_tile_t * ctx,
241 : fd_vinyl_bstream_phdr_t * phdr,
242 : ulong seq );
243 : int
244 : fd_snapwm_vinyl_duplicate_accounts_batch_fini( fd_snapwm_tile_t * ctx,
245 : fd_stem_context_t * stem );
246 :
247 : /* fd_snapwm_vinyl_duplicate_accounts_lthash_{init,append,fini} handle
248 : duplicate accounts lthash local calculation when lthash computation
249 : is enabled. This is typically only needed when the account is an
250 : "old" duplicate (meaning that it corresponds to an older slot than
251 : what is currently in the database). _fini is responsible for
252 : sending the message downstream.
253 :
254 : Typical usage:
255 : fd_snapwm_vinyl_duplicate_accounts_lthash_init( ctx, stem );
256 : for(...) {
257 : ...
258 : fd_snapwm_vinyl_duplicate_accounts_lthash_append( ctx, pair );
259 : }
260 : fd_snapwm_vinyl_duplicate_accounts_lthash_fini( ctx, stem );
261 :
262 : They all return 1 on success, and 0 otherwise.
263 :
264 : IMPORTANT: the fseq check happens only inside fini, since append
265 : only operates on internal variables. Therefore, it is safe to have
266 : fd_stem_publish in between init and fini. */
267 : int
268 : fd_snapwm_vinyl_duplicate_accounts_lthash_init( fd_snapwm_tile_t * ctx,
269 : fd_stem_context_t * stem );
270 : int
271 : fd_snapwm_vinyl_duplicate_accounts_lthash_append( fd_snapwm_tile_t * ctx,
272 : uchar * pair );
273 : int
274 : fd_snapwm_vinyl_duplicate_accounts_lthash_fini( fd_snapwm_tile_t * ctx,
275 : fd_stem_context_t * stem );
276 :
277 : /* fd_snapwm_vinyl_{init,update}_admin provide init and update helper
278 : functions on the vinyl admin object. do_rwlock is a flag indicating
279 : whether the lock is required or not. They return 1 on success and
280 : 0 otherwise. */
281 : int
282 : fd_snapwm_vinyl_init_admin( fd_snapwm_tile_t * ctx,
283 : int do_rwlock );
284 :
285 : int
286 : fd_snapwm_vinyl_update_admin( fd_snapwm_tile_t * ctx,
287 : int do_rwlock );
288 :
289 : /* fd_snapwm_vinyl_recovery_seq_{backup,apply} are helper functions
290 : that handle vinyl io bstream seq backup and apply (for recovery).
291 : Both operate on vinyl io_mm seq values, since this is the io that
292 : keeps track of those values. That means that backup must happen
293 : after io init, and apply must happen before io sync. */
294 :
295 : void
296 : fd_snapwm_vinyl_recovery_seq_backup( fd_snapwm_tile_t * ctx );
297 :
298 : void
299 : fd_snapwm_vinyl_recovery_seq_apply( fd_snapwm_tile_t * ctx );
300 :
301 : /* fd_snapwm_vinyl_revert_full provides the mechanism to revert any
302 : changes that happened during a full snapshot load that has been
303 : cancelled. It frees all elements of the vinyl meta map. Finally,
304 : it reverts the bstream seq(s) in vinyl io and syncs the bstream. */
305 :
306 : void
307 : fd_snapwm_vinyl_revert_full( fd_snapwm_tile_t * ctx );
308 :
309 : /* fd_snapwm_vinyl_revert_incr provides the mechanism to revert any
310 : changes that happened during an incr snapshot load that has been
311 : cancelled. To do this, every bstream pair's phdr info, as well
312 : as the corresponding meta map element's phdr info, is modified to
313 : include val_sz (32 bits), recovery_seq (48 bits) and slot (48 bits)
314 : in the info length of 16 bytes (128 bits). When a new account is
315 : written to the bstream, recovery_seq=0UL is assigned (which works
316 : as a sentinel value). When an account is a duplicate update of an
317 : existing account, the update's recovery_seq corresponds to the seq
318 : value of the existing account in the bstream. This is essentially
319 : a reference to the account that is being superseded.
320 :
321 : bstream: [ full | incr | free )
322 : revert: (*)->......)
323 :
324 : To revert the incremental snapshot, the function walks the bstream
325 : from recovery seq_present (*) towards the future, until all pairs
326 : are processed. If the recovery_seq (in the pair's phdr info) is
327 : 0UL (the sentinel) this account was a new account, and the meta map
328 : entry needs to be freed. If the recovery_seq is less than the
329 : recovery seq_present, the phdr of the pair at recovery_seq is read,
330 : and used to update the meta map element. If the recovery_seq is
331 : greater or equal to the recovery seq_present, this means that the
332 : update was a duplicate on the incr snapshot itself, and it can
333 : be discarded altogether.
334 : Note that as the recovery process moves forward, the meta map entry
335 : and an account update on the incr side of the bstream may see
336 : different recovery_seq values (e.g. consider what happens with
337 : chained duplicate updates). This means that the true recovery_seq
338 : is the one in the phdr info of the bstream pair.
339 : Finally, it reverts the bstream seq(s) in vinyl io and syncs the
340 : bstream. */
341 :
342 : void
343 : fd_snapwm_vinyl_revert_incr( fd_snapwm_tile_t * ctx );
344 :
345 : /* fd_snapin_vinyl_pair_info_{from_parts,update_recovery_seq} are
346 : helper functions to update the pair's info.
347 : fd_snapin_vinyl_pair_info_{val_sz,recovery_seq,slot} are helper
348 : functions to retrieve the corresponding values.
349 : In order to facilitate a recovery process, in particular when an
350 : incr snapshot is cancelled, every bstream pair's phdr info, as well
351 : as the corresponding meta map element's phdr info, is modified to
352 : include val_sz (32 bits), recovery_seq (48 bits) and slot (48 bits)
353 : in the info length of 16 bytes (128 bits). */
354 :
355 : void
356 : fd_snapin_vinyl_pair_info_from_parts( fd_vinyl_info_t * info,
357 : ulong val_sz,
358 : ulong recovery_seq,
359 : ulong slot );
360 : void
361 : fd_snapin_vinyl_pair_info_update_recovery_seq( fd_vinyl_info_t * info,
362 : ulong recovery_seq );
363 : ulong fd_snapin_vinyl_pair_info_val_sz ( fd_vinyl_info_t const * info );
364 : ulong fd_snapin_vinyl_pair_info_recovery_seq( fd_vinyl_info_t const * info );
365 : ulong fd_snapin_vinyl_pair_info_slot ( fd_vinyl_info_t const * info );
366 :
367 : FD_PROTOTYPES_END
368 :
369 : #endif /* HEADER_fd_discof_restore_fd_snapwm_tile_private_h */
|