Line data Source code
1 : /* The repair command spawns a smaller topology for profiling the repair
2 : tile. This is a standalone application, and it can be run in mainnet,
3 : testnet and/or a private cluster. */
4 :
5 : #include "../../../disco/net/fd_net_tile.h"
6 : #include "../../../disco/tiles.h"
7 : #include "../../../disco/topo/fd_topob.h"
8 : #include "../../../disco/topo/fd_cpu_topo.h"
9 : #include "../../../util/pod/fd_pod_format.h"
10 : #include "../../../util/tile/fd_tile_private.h"
11 :
12 : #include "../../firedancer/topology.h"
13 : #include "../../firedancer/topology.c"
14 : #include "../../shared/commands/configure/configure.h"
15 : #include "../../shared/commands/run/run.h" /* initialize_workspaces */
16 : #include "../../shared/fd_config.h" /* config_t */
17 : #include "../../shared_dev/commands/dev.h"
18 : #include "../../../disco/tiles.h"
19 : #include "../../../disco/topo/fd_topob.h"
20 : #include "../../../util/pod/fd_pod_format.h"
21 : #include "../../../waltz/resolv/fd_io_readline.h"
22 : #include "../../platform/fd_sys_util.h"
23 : #include "../../shared/commands/monitor/helper.h"
24 : #include "../../../disco/metrics/fd_metrics.h"
25 : #include "../../../discof/repair/fd_repair_tile.c"
26 :
27 : #include "gossip.h"
28 : #include "core_subtopo.h"
29 :
30 : #include <unistd.h> /* pause */
31 : #include <fcntl.h>
32 : #include <stdio.h>
33 : #include <termios.h>
34 : #include <errno.h>
35 :
/* fd_location_info maps an IPv4 address to a human-readable location
   string (loaded from an external iptable file by read_iptable and
   displayed by print_peer_location_latency). */
struct fd_location_info {
  ulong ip4_addr; /* for map key convenience */
  char location[ 128 ]; /* NUL-terminated location description */
};
typedef struct fd_location_info fd_location_info_t;
41 :
/* Instantiate an open-addressed hash map (API prefix fd_location_table_*)
   of fd_location_info_t keyed by ip4_addr, with 2^16 slots and key
   memoization disabled.  See util/tmpl/fd_map.c for the generated API. */
#define MAP_NAME fd_location_table
#define MAP_T fd_location_info_t
#define MAP_KEY ip4_addr
#define MAP_LG_SLOT_CNT 16
#define MAP_MEMOIZE 0
#include "../../../util/tmpl/fd_map.c"
48 :
/* Backing storage for the location table: one fd_location_info_t per slot,
   sized to match MAP_LG_SLOT_CNT (2^16 slots) above. */
uchar __attribute__((aligned(alignof(fd_location_info_t)))) location_table_mem[ sizeof(fd_location_info_t) * (1 << 16 ) ];

/* Terminal settings captured at startup; restored by restore_terminal. */
static struct termios termios_backup;
52 :
53 : static void
54 0 : restore_terminal( void ) {
55 0 : (void)tcsetattr( STDIN_FILENO, TCSANOW, &termios_backup );
56 0 : }
57 :
58 : fd_topo_run_tile_t
59 : fdctl_tile_run( fd_topo_tile_t const * tile );
60 :
61 : /* repair_topo is a subset of "src/app/firedancer/topology.c" at commit
62 : 0d8386f4f305bb15329813cfe4a40c3594249e96, slightly modified to work
63 : as a repair catchup. TODO ideally, one should invoke the firedancer
64 : topology first, and exclude the parts that are not needed, instead of
65 : manually generating new topologies for every command. This would
66 : also guarantee that the catchup is replicating (as close as possible)
67 : the full topology. */
static void
repair_topo( config_t * config ) {
  /* Builds the reduced repair-catchup topology in config->topo:
     net + shred + repair (+ gossip/core subtopos) plus a shred-capture
     ("scap") tile, wired together with the same links the full
     firedancer topology would use.  config is read for layout/tuning
     and written back with the finished topology. */
  resolve_gossip_entrypoints( config );

  ulong net_tile_cnt = config->layout.net_tile_count;
  ulong shred_tile_cnt = config->layout.shred_tile_count;
  ulong quic_tile_cnt = config->layout.quic_tile_count;
  ulong sign_tile_cnt = config->firedancer.layout.sign_tile_count;
  ulong gossvf_tile_cnt = config->firedancer.layout.gossvf_tile_count;

  fd_topo_t * topo = { fd_topob_new( &config->topo, config->name ) };
  topo->max_page_size = fd_cstr_to_shmem_page_sz( config->hugetlbfs.max_page_size );
  topo->gigantic_page_threshold = config->hugetlbfs.gigantic_page_threshold_mib << 20;

  ulong tile_to_cpu[ FD_TILE_MAX ] = {0};
  ushort parsed_tile_to_cpu[ FD_TILE_MAX ];
  /* Unassigned tiles will be floating, unless auto topology is enabled. */
  for( ulong i=0UL; i<FD_TILE_MAX; i++ ) parsed_tile_to_cpu[ i ] = USHORT_MAX;

  int is_auto_affinity = !strcmp( config->layout.affinity, "auto" );
  int is_bench_auto_affinity = !strcmp( config->development.bench.affinity, "auto" );

  if( FD_UNLIKELY( is_auto_affinity != is_bench_auto_affinity ) ) {
    FD_LOG_ERR(( "The CPU affinity string in the configuration file under [layout.affinity] and [development.bench.affinity] must all be set to 'auto' or all be set to a specific CPU affinity string." ));
  }

  fd_topo_cpus_t cpus[1];
  fd_topo_cpus_init( cpus );

  ulong affinity_tile_cnt = 0UL;
  if( FD_LIKELY( !is_auto_affinity ) ) affinity_tile_cnt = fd_tile_private_cpus_parse( config->layout.affinity, parsed_tile_to_cpu );

  /* Validate and translate the parsed affinity string into per-tile CPU
     assignments (ULONG_MAX == floating). */
  for( ulong i=0UL; i<affinity_tile_cnt; i++ ) {
    if( FD_UNLIKELY( parsed_tile_to_cpu[ i ]!=USHORT_MAX && parsed_tile_to_cpu[ i ]>=cpus->cpu_cnt ) )
      FD_LOG_ERR(( "The CPU affinity string in the configuration file under [layout.affinity] specifies a CPU index of %hu, but the system "
                   "only has %lu CPUs. You should either change the CPU allocations in the affinity string, or increase the number of CPUs "
                   "in the system.",
                   parsed_tile_to_cpu[ i ], cpus->cpu_cnt ));
    tile_to_cpu[ i ] = fd_ulong_if( parsed_tile_to_cpu[ i ]==USHORT_MAX, ULONG_MAX, (ulong)parsed_tile_to_cpu[ i ] );
  }

  fd_core_subtopo( config, tile_to_cpu );
  fd_gossip_subtopo( config, tile_to_cpu );

  /* topo, name */
  fd_topob_wksp( topo, "net_shred" );
  fd_topob_wksp( topo, "net_repair" );
  fd_topob_wksp( topo, "net_quic" );

  fd_topob_wksp( topo, "shred_out" );
  fd_topob_wksp( topo, "replay_epoch" );

  fd_topob_wksp( topo, "poh_shred" );

  fd_topob_wksp( topo, "shred_sign" );
  fd_topob_wksp( topo, "sign_shred" );

  fd_topob_wksp( topo, "repair_sign" );
  fd_topob_wksp( topo, "sign_repair" );
  fd_topob_wksp( topo, "rnonce" );
  fd_topob_wksp( topo, "repair_out" );

  fd_topob_wksp( topo, "txsend_out" );

  fd_topob_wksp( topo, "shred" );
  fd_topob_wksp( topo, "repair" );
  fd_topob_wksp( topo, "fec_sets" );
  fd_topob_wksp( topo, "snapin_manif" );

  fd_topob_wksp( topo, "slot_fseqs" ); /* fseqs for marked slots eg. turbine slot */
  fd_topob_wksp( topo, "genesi_out" ); /* mock genesi_out for ipecho */

  fd_topob_wksp( topo, "tower_out" ); /* mock tower_out for confirmation msgs. Not needed for any topo except eqvoc. */

/* Convenience iterator used below; the loop variable i is referenced by
   the statement that follows each FOR(...). */
#define FOR(cnt) for( ulong i=0UL; i<cnt; i++ )

  ulong pending_fec_shreds_depth = fd_ulong_min( fd_ulong_pow2_up( config->tiles.shred.max_pending_shred_sets * FD_REEDSOL_DATA_SHREDS_MAX ), USHORT_MAX + 1 /* dcache max */ );

  /*                                  topo, link_name,      wksp_name,      depth,                                    mtu,                           burst */
  FOR(quic_tile_cnt)   fd_topob_link( topo, "quic_net",     "net_quic",     config->net.ingress_buffer_size,          FD_NET_MTU,                    1UL );
  FOR(shred_tile_cnt)  fd_topob_link( topo, "shred_net",    "net_shred",    config->net.ingress_buffer_size,          FD_NET_MTU,                    1UL );

  /**/                 fd_topob_link( topo, "replay_epoch", "replay_epoch", 128UL,                                    FD_EPOCH_OUT_MTU,              1UL );

  FOR(shred_tile_cnt)  fd_topob_link( topo, "shred_sign",   "shred_sign",   128UL,                                    32UL,                          1UL );
  FOR(shred_tile_cnt)  fd_topob_link( topo, "sign_shred",   "sign_shred",   128UL,                                    64UL,                          1UL );

  /**/                 fd_topob_link( topo, "repair_net",   "net_repair",   config->net.ingress_buffer_size,          FD_NET_MTU,                    1UL );

  FOR(shred_tile_cnt)  fd_topob_link( topo, "shred_out",    "shred_out",    pending_fec_shreds_depth,                 FD_SHRED_OUT_MTU,              2UL /* at most 2 msgs per after_frag */ );
  FOR(sign_tile_cnt-1) fd_topob_link( topo, "repair_sign",  "repair_sign",  256UL,                                    FD_REPAIR_MAX_PREIMAGE_SZ,     1UL );
  FOR(sign_tile_cnt-1) fd_topob_link( topo, "sign_repair",  "sign_repair",  128UL,                                    sizeof(fd_ed25519_sig_t),      1UL );

  /**/                 fd_topob_link( topo, "repair_out",   "repair_out",   128UL,                                    FD_SHRED_OUT_MTU,              1UL );

  /**/                 fd_topob_link( topo, "poh_shred",    "poh_shred",    16384UL,                                  USHORT_MAX,                    1UL );

  /**/                 fd_topob_link( topo, "txsend_out",   "txsend_out",   128UL,                                    FD_TXN_MTU,                    1UL );

  /**/                 fd_topob_link( topo, "snapin_manif", "snapin_manif", 2UL,                                      sizeof(fd_snapshot_manifest_t),1UL );

  /**/                 fd_topob_link( topo, "genesi_out",   "genesi_out",   1UL,                                      FD_GENESIS_TILE_MTU,           1UL );
  /**/                 fd_topob_link( topo, "tower_out",    "tower_out",    1024UL,                                   sizeof(fd_tower_msg_t),        1UL );

  FOR(net_tile_cnt) fd_topos_net_rx_link( topo, "net_repair", i, config->net.ingress_buffer_size );
  FOR(net_tile_cnt) fd_topos_net_rx_link( topo, "net_quic",   i, config->net.ingress_buffer_size );
  FOR(net_tile_cnt) fd_topos_net_rx_link( topo, "net_shred",  i, config->net.ingress_buffer_size );

  /*                                  topo, tile_name, tile_wksp, metrics_wksp, cpu_idx,                       is_agave, uses_id_keyswitch, uses_av_keyswitch */
  FOR(shred_tile_cnt) fd_topob_tile( topo, "shred", "shred", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 1, 0 );
  fd_topo_tile_t * repair_tile = fd_topob_tile( topo, "repair", "repair", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 1, 0 );

  /* Setup a shared wksp object for fec sets. */

  ulong shred_depth = 65536UL; /* from fdctl/topology.c shred_store link. MAKE SURE TO KEEP IN SYNC. */
  ulong fec_set_cnt = 2UL*shred_depth + config->tiles.shred.max_pending_shred_sets + 6UL;
  ulong fec_sets_sz = fec_set_cnt*sizeof(fd_fec_set_t); /* mirrors # of dcache entires in frankendancer */
  fd_topo_obj_t * fec_sets_obj = setup_topo_fec_sets( topo, "fec_sets", shred_tile_cnt*fec_sets_sz );
  for( ulong i=0UL; i<shred_tile_cnt; i++ ) {
    fd_topo_tile_t * shred_tile = &topo->tiles[ fd_topo_find_tile( topo, "shred", i ) ];
    fd_topob_tile_uses( topo, shred_tile, fec_sets_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
  }
  fd_topob_tile_uses( topo, repair_tile, fec_sets_obj, FD_SHMEM_JOIN_MODE_READ_ONLY );
  FD_TEST( fd_pod_insertf_ulong( topo->props, fec_sets_obj->id, "fec_sets" ) );

  /* There's another special fseq that's used to communicate the shred
     version from the Agave boot path to the shred tile. */
  fd_topo_obj_t * poh_shred_obj = fd_topob_obj( topo, "fseq", "poh_shred" );
  /* NOTE(review): the writer of poh_shred here is the gossip tile, not a
     poh tile — this reduced topology has no poh tile.  Confirm intent. */
  fd_topo_tile_t * poh_tile = &topo->tiles[ fd_topo_find_tile( topo, "gossip", 0UL ) ];
  fd_topob_tile_uses( topo, poh_tile, poh_shred_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );

  /* root_slot is an fseq marking the validator's current Tower root. */

  fd_topo_obj_t * root_slot_obj = fd_topob_obj( topo, "fseq", "slot_fseqs" );
  FD_TEST( fd_pod_insertf_ulong( topo->props, root_slot_obj->id, "root_slot" ) );

  for( ulong i=0UL; i<shred_tile_cnt; i++ ) {
    fd_topo_tile_t * shred_tile = &topo->tiles[ fd_topo_find_tile( topo, "shred", i ) ];
    fd_topob_tile_uses( topo, shred_tile, poh_shred_obj, FD_SHMEM_JOIN_MODE_READ_ONLY );
  }
  FD_TEST( fd_pod_insertf_ulong( topo->props, poh_shred_obj->id, "poh_shred" ) );

  /* With an explicit affinity string the tile count must not exceed the
     number of CPUs provided (error), and unused CPUs are only a warning. */
  if( FD_LIKELY( !is_auto_affinity ) ) {
    if( FD_UNLIKELY( affinity_tile_cnt<topo->tile_cnt ) )
      FD_LOG_ERR(( "The topology you are using has %lu tiles, but the CPU affinity specified in the config tile as [layout.affinity] only provides for %lu cores. "
                   "You should either increase the number of cores dedicated to Firedancer in the affinity string, or decrease the number of cores needed by reducing "
                   "the total tile count. You can reduce the tile count by decreasing individual tile counts in the [layout] section of the configuration file.",
                   topo->tile_cnt, affinity_tile_cnt ));
    if( FD_UNLIKELY( affinity_tile_cnt>topo->tile_cnt ) )
      FD_LOG_WARNING(( "The topology you are using has %lu tiles, but the CPU affinity specified in the config tile as [layout.affinity] provides for %lu cores. "
                       "Not all cores in the affinity will be used by Firedancer. You may wish to increase the number of tiles in the system by increasing "
                       "individual tile counts in the [layout] section of the configuration file.",
                       topo->tile_cnt, affinity_tile_cnt ));
  }

  /* topo, tile_name, tile_kind_id, fseq_wksp, link_name, link_kind_id, reliable, polled */
  for( ulong j=0UL; j<shred_tile_cnt; j++ )
    fd_topos_tile_in_net( topo, "metric_in", "shred_net", j, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); /* No reliable consumers of networking fragments, may be dropped or overrun */
  for( ulong j=0UL; j<quic_tile_cnt; j++ )
    {fd_topos_tile_in_net( topo, "metric_in", "quic_net", j, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED );} /* No reliable consumers of networking fragments, may be dropped or overrun */

  /**/ fd_topob_tile_in( topo, "gossip", 0UL, "metric_in", "txsend_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );

  /**/ fd_topos_tile_in_net( topo, "metric_in", "repair_net", 0UL, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); /* No reliable consumers of networking fragments, may be dropped or overrun */

  FOR(shred_tile_cnt) for( ulong j=0UL; j<net_tile_cnt; j++ )
    fd_topob_tile_in( topo, "shred", i, "metric_in", "net_shred", j, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); /* No reliable consumers of networking fragments, may be dropped or overrun */
  FOR(shred_tile_cnt) fd_topob_tile_in( topo, "shred", i, "metric_in", "poh_shred", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
  FOR(shred_tile_cnt) fd_topob_tile_in( topo, "shred", i, "metric_in", "replay_epoch", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
  FOR(shred_tile_cnt) fd_topob_tile_in( topo, "shred", i, "metric_in", "gossip_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
  FOR(shred_tile_cnt) fd_topob_tile_out( topo, "shred", i, "shred_out", i );
  FOR(shred_tile_cnt) fd_topob_tile_out( topo, "shred", i, "shred_net", i );
  FOR(shred_tile_cnt) fd_topob_tile_in ( topo, "shred", i, "metric_in", "ipecho_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );

  /**/ fd_topob_tile_out( topo, "repair", 0UL, "repair_net", 0UL );

  /* Sign links don't need to be reliable because they are synchronous,
     so there's at most one fragment in flight at a time anyway.  The
     sign links are also not polled by the mux, instead the tiles will
     read the sign responses out of band in a dedicated spin loop. */
  for( ulong i=0UL; i<shred_tile_cnt; i++ ) {
    /**/ fd_topob_tile_in( topo, "sign", 0UL, "metric_in", "shred_sign", i, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED );
    /**/ fd_topob_tile_out( topo, "shred", i, "shred_sign", i );
    /**/ fd_topob_tile_in( topo, "shred", i, "metric_in", "sign_shred", i, FD_TOPOB_UNRELIABLE, FD_TOPOB_UNPOLLED );
    /**/ fd_topob_tile_out( topo, "sign", 0UL, "sign_shred", i );
  }
  FOR(gossvf_tile_cnt) fd_topob_tile_in ( topo, "gossvf", i, "metric_in", "replay_epoch", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );

  /**/ fd_topob_tile_in ( topo, "gossip", 0UL, "metric_in", "replay_epoch", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );

  FOR(net_tile_cnt) fd_topob_tile_in( topo, "repair", 0UL, "metric_in", "net_repair", i, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); /* No reliable consumers of networking fragments, may be dropped or overrun */
  /**/ fd_topob_tile_in( topo, "repair", 0UL, "metric_in", "gossip_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
  fd_topob_tile_in( topo, "repair", 0UL, "metric_in", "snapin_manif", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
  FOR(shred_tile_cnt) fd_topob_tile_in( topo, "repair", 0UL, "metric_in", "shred_out", i, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
  /* Repair uses sign tiles 1..sign_tile_cnt-1; sign tile 0 serves shred. */
  FOR(sign_tile_cnt-1) fd_topob_tile_out( topo, "repair", 0UL, "repair_sign", i );
  FOR(sign_tile_cnt-1) fd_topob_tile_in ( topo, "sign", i+1, "metric_in", "repair_sign", i, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
  FOR(sign_tile_cnt-1) fd_topob_tile_out( topo, "sign", i+1, "sign_repair", i );
  FOR(sign_tile_cnt-1) fd_topob_tile_in ( topo, "repair", 0UL, "metric_in", "sign_repair", i, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED );

  /**/ fd_topob_tile_out( topo, "repair", 0UL, "repair_out", 0UL );
  /**/ fd_topob_tile_in ( topo, "gossip", 0UL, "metric_in", "sign_gossip", 0UL, FD_TOPOB_UNRELIABLE, FD_TOPOB_UNPOLLED );
  /**/ fd_topob_tile_in ( topo, "ipecho", 0UL, "metric_in", "genesi_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
  /**/ fd_topob_tile_in ( topo, "repair", 0UL, "metric_in", "tower_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );

  /* Shred-capture tile: taps repair/shred traffic for profiling and
     feeds the repair tile a mock manifest + epoch message. */
  if( 1 ) {
    fd_topob_wksp( topo, "scap" );

    fd_topo_tile_t * scap_tile = fd_topob_tile( topo, "scap", "scap", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 0, 0 );

    fd_topob_tile_in( topo, "scap", 0UL, "metric_in", "repair_net", 0UL, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED );
    for( ulong j=0UL; j<net_tile_cnt; j++ ) {
      fd_topob_tile_in( topo, "scap", 0UL, "metric_in", "net_shred", j, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED );
    }
    for( ulong j=0UL; j<shred_tile_cnt; j++ ) {
      fd_topob_tile_in( topo, "scap", 0UL, "metric_in", "shred_out", j, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED );
    }
    fd_topob_tile_in( topo, "scap", 0UL, "metric_in", "gossip_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );

    fd_topob_tile_uses( topo, scap_tile, root_slot_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
    fd_topob_tile_out( topo, "scap", 0UL, "replay_epoch", 0UL );
    fd_topob_tile_out( topo, "scap", 0UL, "snapin_manif", 0UL );
  }

  /* Repair and shred share a secret they use to generate the nonces.
     It's not super security sensitive, but for good hygiene, we make it
     an object. */
  if( 1 /* just restrict the scope for these variables in this big function */ ) {
    fd_topo_obj_t * rnonce_ss_obj = fd_topob_obj( topo, "rnonce_ss", "rnonce" );
    fd_topo_tile_t * repair_tile = &topo->tiles[ fd_topo_find_tile( topo, "repair", 0UL ) ];
    fd_topob_tile_uses( topo, repair_tile, rnonce_ss_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
    for( ulong i=0UL; i<shred_tile_cnt; i++ ) {
      fd_topo_tile_t * shred_tile = &topo->tiles[ fd_topo_find_tile( topo, "shred", i ) ];
      fd_topob_tile_uses( topo, shred_tile, rnonce_ss_obj, FD_SHMEM_JOIN_MODE_READ_ONLY );
    }
    FD_TEST( fd_pod_insertf_ulong( topo->props, rnonce_ss_obj->id, "rnonce_ss" ) );
  }

  /* Sanity-check the links that intentionally have no producer or no
     consumer in this reduced topology. */
  FD_TEST( fd_link_permit_no_producers( topo, "quic_net" ) == quic_tile_cnt );
  FD_TEST( fd_link_permit_no_producers( topo, "poh_shred" ) == 1UL );
  FD_TEST( fd_link_permit_no_producers( topo, "txsend_out" ) == 1UL );
  FD_TEST( fd_link_permit_no_producers( topo, "genesi_out" ) == 1UL );
  FD_TEST( fd_link_permit_no_producers( topo, "tower_out" ) == 1UL );
  FD_TEST( fd_link_permit_no_consumers( topo, "net_quic" ) == net_tile_cnt );
  FD_TEST( fd_link_permit_no_consumers( topo, "repair_out" ) == 1UL );

  config->tiles.txsend.txsend_src_port = 0; /* disable txsend */

  FOR(net_tile_cnt) fd_topos_net_tile_finish( topo, i );

  for( ulong i=0UL; i<topo->tile_cnt; i++ ) {
    fd_topo_tile_t * tile = &topo->tiles[ i ];
    fd_topo_configure_tile( tile, config );
  }

  if( FD_UNLIKELY( is_auto_affinity ) ) fd_topob_auto_layout( topo, 0 );

  fd_topob_finish( topo, CALLBACKS );

  /* Publish the finished topology back into the config. */
  config->topo = *topo;
}
328 :
329 : extern int * fd_log_private_shared_lock;
330 :
331 : static char *
332 0 : fmt_count( char buf[ static 64 ], ulong count ) {
333 0 : char tmp[ 64 ];
334 0 : if( FD_LIKELY( count<1000UL ) ) FD_TEST( fd_cstr_printf_check( tmp, 64UL, NULL, "%lu", count ) );
335 0 : else if( FD_LIKELY( count<1000000UL ) ) FD_TEST( fd_cstr_printf_check( tmp, 64UL, NULL, "%.1f K", (double)count/1000.0 ) );
336 0 : else if( FD_LIKELY( count<1000000000UL ) ) FD_TEST( fd_cstr_printf_check( tmp, 64UL, NULL, "%.1f M", (double)count/1000000.0 ) );
337 :
338 0 : FD_TEST( fd_cstr_printf_check( buf, 64UL, NULL, "%12s", tmp ) );
339 0 : return buf;
340 0 : }
341 :
342 : static void
343 : print_histogram_buckets( volatile ulong * metrics,
344 : ulong offset,
345 : int converter,
346 : double histmin,
347 : double histmax,
348 0 : char * title ) {
349 0 : fd_histf_t hist[1];
350 :
351 : /* Create histogram structure only to get bucket edges for display */
352 0 : if( FD_LIKELY( converter == FD_METRICS_CONVERTER_SECONDS ) ) {
353 : /* For SLOT_COMPLETE_TIME: min=0.2, max=2.0 seconds */
354 0 : FD_TEST( fd_histf_new( hist, fd_metrics_convert_seconds_to_ticks( histmin ), fd_metrics_convert_seconds_to_ticks( histmax ) ) );
355 0 : } else if( FD_LIKELY( converter == FD_METRICS_CONVERTER_NONE ) ) {
356 : /* For non-time histograms, we'd need the actual min/max values */
357 0 : FD_TEST( fd_histf_new( hist, (ulong)histmin, (ulong)histmax ) );
358 0 : } else {
359 0 : FD_LOG_ERR(( "unknown converter %i", converter ));
360 0 : }
361 :
362 0 : printf( " +---------------------+--------------------+--------------+\n" );
363 0 : printf( " | %-19s | | Count |\n", title );
364 0 : printf( " +---------------------+--------------------+--------------+\n" );
365 :
366 0 : ulong total_count = 0;
367 0 : for( ulong k = 0; k < FD_HISTF_BUCKET_CNT; k++ ) {
368 0 : ulong bucket_count = metrics[ offset + k ];
369 0 : total_count += bucket_count;
370 0 : }
371 :
372 0 : for( ulong k = 0; k < FD_HISTF_BUCKET_CNT; k++ ) {
373 : /* Get individual bucket count directly from metrics array */
374 0 : ulong bucket_count = metrics[ offset + k ];
375 :
376 0 : char * le_str;
377 0 : char le_buf[ 64 ];
378 0 : if( FD_UNLIKELY( k == FD_HISTF_BUCKET_CNT - 1UL ) ) {
379 0 : le_str = "+Inf";
380 0 : } else {
381 0 : ulong edge = fd_histf_right( hist, k );
382 0 : if( FD_LIKELY( converter == FD_METRICS_CONVERTER_SECONDS ) ) {
383 0 : double edgef = fd_metrics_convert_ticks_to_seconds( edge - 1 );
384 0 : FD_TEST( fd_cstr_printf_check( le_buf, sizeof( le_buf ), NULL, "%.3f", edgef ) );
385 0 : } else {
386 0 : FD_TEST( fd_cstr_printf_check( le_buf, sizeof( le_buf ), NULL, "%.3f", (double)(edge - 1) / 1000000.0 ) );
387 0 : }
388 0 : le_str = le_buf;
389 0 : }
390 :
391 0 : char count_buf[ 64 ];
392 0 : fmt_count( count_buf, bucket_count );
393 :
394 : /* Match visual bar length to the %-18s display column width. */
395 0 : char bar_buf[ 19 ];
396 0 : ulong bar_max = sizeof( bar_buf ) - 1UL;
397 0 : if( bucket_count > 0 && total_count > 0 ) {
398 0 : ulong bar_length = (bucket_count * bar_max) / total_count;
399 0 : if( bar_length == 0 ) bar_length = 1;
400 0 : if( bar_length > bar_max ) bar_length = bar_max;
401 0 : for( ulong i = 0; i < bar_length; i++ ) { bar_buf[ i ] = '|'; }
402 0 : bar_buf[ bar_length ] = '\0';
403 0 : } else {
404 0 : bar_buf[ 0 ] = '\0';
405 0 : }
406 :
407 0 : printf( " | %-19s | %-18s | %s |\n", le_str, bar_buf, count_buf );
408 0 : }
409 :
410 : /* Print sum and total count */
411 0 : char sum_buf[ 64 ];
412 0 : char avg_buf[ 64 ];
413 0 : if( FD_LIKELY( converter == FD_METRICS_CONVERTER_SECONDS ) ) {
414 0 : double sumf = fd_metrics_convert_ticks_to_seconds( metrics[ offset + FD_HISTF_BUCKET_CNT ] );
415 0 : FD_TEST( fd_cstr_printf_check( sum_buf, sizeof( sum_buf ), NULL, "%.6f", sumf ) );
416 0 : double avg = sumf / (double)total_count;
417 0 : FD_TEST( fd_cstr_printf_check( avg_buf, sizeof( avg_buf ), NULL, "%.6f", avg ) );
418 0 : } else {
419 0 : FD_TEST( fd_cstr_printf_check( sum_buf, sizeof( sum_buf ), NULL, "%lu", metrics[ offset + FD_HISTF_BUCKET_CNT ] ));
420 0 : }
421 :
422 0 : printf( " +---------------------+--------------------+---------------+\n" );
423 0 : printf( " | Sum: %-14s | Count: %-11lu | Avg: %-8s |\n", sum_buf, total_count, avg_buf );
424 0 : printf( " +---------------------+--------------------+---------------+\n" );
425 0 : }
426 :
/* Scratch array handed to fd_repair_metrics_print_sorted as sort space. */
static fd_slot_metrics_t temp_slots[ FD_CATCHUP_METRICS_MAX ];
428 :
429 : static void
430 0 : print_catchup_slots( fd_wksp_t * repair_tile_wksp, ctx_t * repair_ctx, int verbose, int sort_by_slot ) {
431 0 : fd_repair_metrics_t * catchup = repair_ctx->slot_metrics;
432 0 : ulong catchup_gaddr = fd_wksp_gaddr_fast( repair_ctx->wksp, catchup );
433 0 : fd_repair_metrics_t * catchup_table = (fd_repair_metrics_t *)fd_wksp_laddr( repair_tile_wksp, catchup_gaddr );
434 0 : if( FD_LIKELY( sort_by_slot ) ) {
435 0 : fd_repair_metrics_print_sorted( catchup_table, verbose, temp_slots );
436 0 : } else {
437 0 : fd_repair_metrics_print( catchup_table, verbose );
438 0 : }
439 0 : }
440 :
/* Joined IPv4 -> location map (backed by location_table_mem). */
static fd_location_info_t * location_table;
/* Scratch pubkey array filled and sorted by sort_peers_by_latency. */
static fd_pubkey_t peers_copy[ FD_REPAIR_PEER_MAX];
443 :
444 : static ulong
445 0 : sort_peers_by_latency( fd_policy_peer_map_t * active_table, fd_policy_peer_dlist_t * peers_dlist, fd_policy_peer_dlist_t * peers_wlist, fd_policy_peer_t * peers_arr ) {
446 0 : ulong i = 0;
447 0 : fd_policy_peer_dlist_iter_t iter = fd_policy_peer_dlist_iter_fwd_init( peers_dlist, peers_arr );
448 0 : while( !fd_policy_peer_dlist_iter_done( iter, peers_dlist, peers_arr ) ) {
449 0 : fd_policy_peer_t * peer = fd_policy_peer_dlist_iter_ele( iter, peers_dlist, peers_arr );
450 0 : if( FD_UNLIKELY( !peer ) ) break;
451 0 : peers_copy[ i++ ] = peer->key;
452 0 : if( FD_UNLIKELY( i >= FD_REPAIR_PEER_MAX ) ) break;
453 0 : iter = fd_policy_peer_dlist_iter_fwd_next( iter, peers_dlist, peers_arr );
454 0 : }
455 0 : ulong fast_cnt = i;
456 0 : iter = fd_policy_peer_dlist_iter_fwd_init( peers_wlist, peers_arr );
457 0 : while( !fd_policy_peer_dlist_iter_done( iter, peers_wlist, peers_arr ) ) {
458 0 : fd_policy_peer_t * peer = fd_policy_peer_dlist_iter_ele( iter, peers_wlist, peers_arr );
459 0 : if( FD_UNLIKELY( !peer ) ) break;
460 0 : peers_copy[ i++ ] = peer->key;
461 0 : if( FD_UNLIKELY( i >= FD_REPAIR_PEER_MAX ) ) break;
462 0 : iter = fd_policy_peer_dlist_iter_fwd_next( iter, peers_wlist, peers_arr );
463 0 : }
464 0 : FD_LOG_NOTICE(( "Fast peers cnt: %lu. Slow peers cnt: %lu.", fast_cnt, i - fast_cnt ));
465 :
466 0 : ulong peer_cnt = i;
467 0 : for( uint i = 0; i < peer_cnt - 1; i++ ) {
468 0 : int swapped = 0;
469 0 : for( uint j = 0; j < peer_cnt - 1 - i; j++ ) {
470 0 : fd_policy_peer_t const * active_j = fd_policy_peer_map_ele_query( active_table, &peers_copy[ j ], NULL, peers_arr );
471 0 : fd_policy_peer_t const * active_j1 = fd_policy_peer_map_ele_query( active_table, &peers_copy[ j + 1 ], NULL, peers_arr );
472 :
473 : /* Skip peers with no responses */
474 0 : double latency_j = 10e9;
475 0 : double latency_j1 = 10e9;
476 0 : if( FD_LIKELY( active_j && active_j->res_cnt > 0 ) ) latency_j = ((double)active_j->total_lat / (double)active_j->res_cnt);
477 0 : if( FD_LIKELY( active_j1 && active_j1->res_cnt > 0 ) ) latency_j1 = ((double)active_j1->total_lat / (double)active_j1->res_cnt);
478 :
479 : /* Swap if j has higher latency than j+1 */
480 0 : if( latency_j > latency_j1 ) {
481 0 : fd_pubkey_t temp = peers_copy[ j ];
482 0 : peers_copy[ j ] = peers_copy[ j + 1 ];
483 0 : peers_copy[ j + 1 ] = temp;
484 0 : swapped = 1;
485 0 : }
486 0 : }
487 0 : if( !swapped ) break;
488 0 : }
489 0 : return peer_cnt;
490 0 : }
491 :
static void
print_peer_location_latency( fd_wksp_t * repair_tile_wksp, ctx_t * tile_ctx ) {
  /* Print a per-peer table (sorted by average latency) of request/response
     rates and geolocation.  All policy pointers live in the repair tile's
     address space, so each is translated gaddr->laddr through our join of
     the repair tile's workspace before dereferencing. */
  ulong policy_gaddr = fd_wksp_gaddr_fast( tile_ctx->wksp, tile_ctx->policy );
  fd_policy_t * policy = fd_wksp_laddr ( repair_tile_wksp, policy_gaddr );
  ulong peermap_gaddr = fd_wksp_gaddr_fast( tile_ctx->wksp, policy->peers.map );
  ulong peerarr_gaddr = fd_wksp_gaddr_fast( tile_ctx->wksp, policy->peers.pool );
  ulong peerlst_gaddr = fd_wksp_gaddr_fast( tile_ctx->wksp, policy->peers.fast );
  ulong peerwst_gaddr = fd_wksp_gaddr_fast( tile_ctx->wksp, policy->peers.slow );
  fd_policy_peer_map_t * peers_map = (fd_policy_peer_map_t *) fd_wksp_laddr( repair_tile_wksp, peermap_gaddr );
  fd_policy_peer_dlist_t * peers_dlist = (fd_policy_peer_dlist_t *)fd_wksp_laddr( repair_tile_wksp, peerlst_gaddr );
  fd_policy_peer_dlist_t * peers_wlist = (fd_policy_peer_dlist_t *)fd_wksp_laddr( repair_tile_wksp, peerwst_gaddr );
  fd_policy_peer_t * peers_arr = (fd_policy_peer_t *) fd_wksp_laddr( repair_tile_wksp, peerarr_gaddr );

  /* Fills and sorts the file-scope peers_copy array. */
  ulong peer_cnt = sort_peers_by_latency( peers_map, peers_dlist, peers_wlist, peers_arr );
  printf("\nPeer Location/Latency Information\n");
  printf( "| %-46s | %-7s | %-8s | %-8s | %-7s | %12s | %s\n", "Pubkey", "Req Cnt", "Req B/s", "Rx B/s", "Rx Rate", "Avg Latency", "Location Info" );
  for( uint i = 0; i < peer_cnt; i++ ) {
    fd_policy_peer_t const * active = fd_policy_peer_map_ele_query( peers_map, &peers_copy[ i ], NULL, peers_arr );
    /* Peers without any response are skipped (also avoids div-by-zero
       on res_cnt below). */
    if( FD_LIKELY( active && active->res_cnt > 0 ) ) {
      fd_location_info_t * info = fd_location_table_query( location_table, active->ip4, NULL );
      char * geolocation = info ? info->location : "Unknown";
      /* NOTE(review): if last_*_ts == first_*_ts the interval is 0 and
         these rates divide by zero (inf/nan printed) — confirm the
         timestamps are guaranteed distinct once res_cnt/req_cnt > 0.
         The 202 constant is presumably the repair request wire size in
         bytes — verify against the repair protocol definition. */
      double peer_bps = (double)(active->res_cnt * FD_SHRED_MIN_SZ) / ((double)(active->last_resp_ts - active->first_resp_ts) / 1e9);
      double req_bps = (double)active->req_cnt * 202 / ((double)(active->last_req_ts - active->first_req_ts) / 1e9);
      FD_BASE58_ENCODE_32_BYTES( active->key.key, key_b58 );
      printf( "%-5u | %-46s | %-7lu | %-8.2f | %-8.2f | %-7.2f | %10.3fms | %s\n", i, key_b58, active->req_cnt, req_bps, peer_bps, (double)active->res_cnt / (double)active->req_cnt, ((double)active->total_lat / (double)active->res_cnt) / 1e6, geolocation );
    }
  }
  printf("\n");
  fflush( stdout );
}
522 :
523 : static void
524 0 : read_iptable( char * iptable_path, fd_location_info_t * location_table ) {
525 0 : int iptable_fd = open( iptable_path, O_RDONLY );
526 0 : if( FD_UNLIKELY( iptable_fd<0 ) ) return;
527 :
528 : /* read iptable line by line */
529 0 : if( FD_LIKELY( iptable_fd>=0 ) ) {
530 0 : char line[ 256 ];
531 0 : uchar istream_buf[256];
532 0 : fd_io_buffered_istream_t istream[1];
533 0 : fd_io_buffered_istream_init( istream, iptable_fd, istream_buf, sizeof(istream_buf) );
534 0 : for(;;) {
535 0 : int err;
536 0 : if( !fd_io_fgets( line, sizeof(line), istream, &err ) ) break;
537 0 : fd_location_info_t location_info;
538 0 : sscanf( line, "%lu %[^\n]", &location_info.ip4_addr, location_info.location );
539 0 : fd_location_info_t * info = fd_location_table_insert( location_table, location_info.ip4_addr );
540 0 : if( FD_UNLIKELY( info==NULL ) ) break;
541 0 : memcpy( info->location, location_info.location, sizeof(info->location) );
542 0 : }
543 0 : }
544 0 : }
545 :
/* print_tile_metrics dumps a human-readable snapshot of repair/shred
   progress to stdout: repair request counts by type, the request send
   rate since the last call, overrun/consumed totals for the
   repair->net and net->shred links, latency/slot-completion
   histograms, and the repair tile's regime breakdown.
   repair_metrics_prev holds the previous snapshot so the regime
   percentages are per-interval diffs; it is refreshed at the end.
   last_sent_cnt is similarly updated so the pps rate is per-interval.
   Caller guarantees now > last_print_ts. */
static void
print_tile_metrics( volatile ulong * shred_metrics,
                    volatile ulong * repair_metrics,
                    volatile ulong * repair_metrics_prev, /* for diffing metrics */
                    volatile ulong ** repair_net_links,
                    volatile ulong ** net_shred_links,
                    ulong net_tile_cnt,
                    ulong * last_sent_cnt,
                    long last_print_ts,
                    long now ) {
  char buf2[ 64 ];
  /* Total requests sent is the sum of the three repair request types. */
  ulong rcvd = shred_metrics [ MIDX( COUNTER, SHRED, SHRED_REPAIR_RCV ) ];
  ulong sent = repair_metrics[ MIDX( COUNTER, REPAIR, SENT_PKT_TYPES_NEEDED_WINDOW ) ] +
               repair_metrics[ MIDX( COUNTER, REPAIR, SENT_PKT_TYPES_NEEDED_HIGHEST_WINDOW ) ] +
               repair_metrics[ MIDX( COUNTER, REPAIR, SENT_PKT_TYPES_NEEDED_ORPHAN ) ];
  /* NOTE(review): if sent==0 this prints inf/nan% (double division by
     zero is well-defined but unhelpful) -- harmless for a profiling
     tool, but confirm. */
  printf(" Requests received: (%lu/%lu) %.1f%% \n", rcvd, sent, (double)rcvd / (double)sent * 100.0 );
  printf( "  +---------------+--------------+\n" );
  printf( "  | Request Type  | Count        |\n" );
  printf( "  +---------------+--------------+\n" );
  printf( "  | Orphan        | %s |\n", fmt_count( buf2, repair_metrics[ MIDX( COUNTER, REPAIR, SENT_PKT_TYPES_NEEDED_ORPHAN ) ] ) );
  printf( "  | HighestWindow | %s |\n", fmt_count( buf2, repair_metrics[ MIDX( COUNTER, REPAIR, SENT_PKT_TYPES_NEEDED_HIGHEST_WINDOW ) ] ) );
  printf( "  | Index         | %s |\n", fmt_count( buf2, repair_metrics[ MIDX( COUNTER, REPAIR, SENT_PKT_TYPES_NEEDED_WINDOW ) ] ) );
  printf( "  +---------------+--------------+\n" );
  /* Requests per second over the interval since the last print. */
  printf( " Send Pkt Rate: %s pps\n", fmt_count( buf2, (ulong)((sent - *last_sent_cnt)*1e9L / (now - last_print_ts) ) ) );
  *last_sent_cnt = sent;

  /* Sum overrun across all net tiles connected to repair_net */
  ulong total_overrun  = repair_net_links[0][ MIDX( COUNTER, LINK, OVERRUN_POLLING_FRAG_COUNT ) ]; /* coarse double counting prevention */
  ulong total_consumed = 0UL;
  for( ulong i = 0UL; i < net_tile_cnt; i++ ) {
    volatile ulong * ovar_net_metrics = repair_net_links[i];
    total_overrun  += ovar_net_metrics[ MIDX( COUNTER, LINK, OVERRUN_READING_FRAG_COUNT ) ];
    total_consumed += ovar_net_metrics[ MIDX( COUNTER, LINK, CONSUMED_COUNT ) ]; /* consumed is incremented after after_frag is called */
  }
  printf( " Outgoing requests overrun:  %s\n", fmt_count( buf2, total_overrun ) );
  printf( " Outgoing requests consumed: %s\n", fmt_count( buf2, total_consumed ) );

  /* NOTE(review): unlike the repair_net section above (POLLING seed),
     this seeds with link [0]'s OVERRUN_READING count which the loop
     below adds again -- looks like a double count of link 0; confirm
     whether the seed was meant to be OVERRUN_POLLING_FRAG_COUNT. */
  total_overrun  = net_shred_links[0][ MIDX( COUNTER, LINK, OVERRUN_READING_FRAG_COUNT ) ];
  total_consumed = 0UL;
  for( ulong i = 0UL; i < net_tile_cnt; i++ ) {
    volatile ulong * ovar_net_metrics = net_shred_links[i];
    total_overrun  += ovar_net_metrics[ MIDX( COUNTER, LINK, OVERRUN_READING_FRAG_COUNT ) ];
    total_consumed += ovar_net_metrics[ MIDX( COUNTER, LINK, CONSUMED_COUNT ) ]; /* shred frag filtering happens manually in after_frag, so no need to index every shred_tile. */
  }

  printf( " Incoming shreds overrun:  %s\n", fmt_count( buf2, total_overrun ) );
  printf( " Incoming shreds consumed: %s\n", fmt_count( buf2, total_consumed ) );

  print_histogram_buckets( repair_metrics,
                           MIDX( HISTOGRAM, REPAIR, RESPONSE_LATENCY ),
                           FD_METRICS_CONVERTER_NONE,
                           FD_METRICS_HISTOGRAM_REPAIR_RESPONSE_LATENCY_MIN,
                           FD_METRICS_HISTOGRAM_REPAIR_RESPONSE_LATENCY_MAX,
                           "Response Latency" );

  printf(" Repair Peers: %lu\n", repair_metrics[ MIDX( COUNTER, REPAIR, REQUEST_PEERS ) ] );
  /* NOTE(review): the label says "rejected (no stakes)" but the metric
     read is SHRED_PROCESSED -- confirm the label/metric pairing. */
  printf(" Shreds rejected (no stakes): %lu\n", shred_metrics[ MIDX( COUNTER, SHRED, SHRED_PROCESSED ) ] );
  /* Print histogram buckets similar to Prometheus format */
  print_histogram_buckets( repair_metrics,
                           MIDX( HISTOGRAM, REPAIR, SLOT_COMPLETE_TIME ),
                           FD_METRICS_CONVERTER_SECONDS,
                           FD_METRICS_HISTOGRAM_REPAIR_SLOT_COMPLETE_TIME_MIN,
                           FD_METRICS_HISTOGRAM_REPAIR_SLOT_COMPLETE_TIME_MAX,
                           "Slot Complete Time" );

  /* Regime durations are cumulative counters; diff against the
     previous snapshot to get this interval's breakdown. */
#define DIFFX(METRIC) repair_metrics[ MIDX( COUNTER, TILE, METRIC ) ] - repair_metrics_prev[ MIDX( COUNTER, TILE, METRIC ) ]
  ulong hkeep_ticks        = DIFFX(REGIME_DURATION_NANOS_CAUGHT_UP_HOUSEKEEPING) + DIFFX(REGIME_DURATION_NANOS_PROCESSING_HOUSEKEEPING) + DIFFX(REGIME_DURATION_NANOS_BACKPRESSURE_HOUSEKEEPING);
  ulong busy_ticks         = DIFFX(REGIME_DURATION_NANOS_PROCESSING_PREFRAG) + DIFFX(REGIME_DURATION_NANOS_PROCESSING_POSTFRAG ) + DIFFX(REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG);
  ulong caught_up_ticks    = DIFFX(REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG);
  ulong backpressure_ticks = DIFFX(REGIME_DURATION_NANOS_BACKPRESSURE_PREFRAG);
  ulong total_ticks        = hkeep_ticks + busy_ticks + caught_up_ticks + backpressure_ticks;

  printf( " Repair Hkeep: %.1f %% Busy: %.1f %% Idle: %.1f %% Backp: %0.1f %%\n",
          (double)hkeep_ticks/(double)total_ticks*100.0,
          (double)busy_ticks/(double)total_ticks*100.0,
          (double)caught_up_ticks/(double)total_ticks*100.0,
          (double)backpressure_ticks/(double)total_ticks*100.0 );
#undef DIFFX
  fflush( stdout );
  /* Snapshot the full metrics area for the next interval's diffs. */
  for( ulong i=0UL; i<FD_METRICS_TOTAL_SZ/sizeof(ulong); i++ ) repair_metrics_prev[ i ] = repair_metrics[ i ];
}
627 :
628 : static void
629 : repair_ctx_wksp( args_t * args,
630 : config_t * config,
631 : ctx_t ** repair_ctx,
632 0 : fd_topo_wksp_t ** repair_wksp ) {
633 0 : (void)args;
634 :
635 0 : fd_topo_t * topo = &config->topo;
636 0 : ulong wksp_id = fd_topo_find_wksp( topo, "repair" );
637 0 : if( FD_UNLIKELY( wksp_id==ULONG_MAX ) ) FD_LOG_ERR(( "repair workspace not found" ));
638 :
639 0 : fd_topo_wksp_t * _repair_wksp = &topo->workspaces[ wksp_id ];
640 :
641 0 : ulong tile_id = fd_topo_find_tile( topo, "repair", 0UL );
642 0 : if( FD_UNLIKELY( tile_id==ULONG_MAX ) ) FD_LOG_ERR(( "repair tile not found" ));
643 :
644 0 : fd_topo_join_workspace( topo, _repair_wksp, FD_SHMEM_JOIN_MODE_READ_ONLY, FD_TOPO_CORE_DUMP_LEVEL_DISABLED );
645 :
646 : /* Access the repair tile scratch memory where repair_tile_ctx is stored */
647 0 : fd_topo_tile_t * tile = &topo->tiles[ tile_id ];
648 0 : void * scratch = fd_topo_obj_laddr( &config->topo, tile->tile_obj_id );
649 0 : if( FD_UNLIKELY( !scratch ) ) FD_LOG_ERR(( "Failed to access repair tile scratch memory" ));
650 :
651 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
652 0 : ctx_t * _repair_ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(ctx_t), sizeof(ctx_t) );
653 :
654 0 : *repair_ctx = _repair_ctx;
655 0 : *repair_wksp = _repair_wksp;
656 0 : }
657 :
/* repair_cmd_fn_catchup builds the reduced repair topology, boots it
   in a single process, and polls the shred/repair tile metrics once a
   second until repair has caught up to the turbine tip (or to
   --end-slot).  On completion it prints per-peer latency and per-slot
   catchup stats and exits the whole process via exit_group. */
static void
repair_cmd_fn_catchup( args_t * args,
                       config_t * config ) {

  /* Build a fresh reduced topology from scratch. */
  memset( &config->topo, 0, sizeof(config->topo) );
  repair_topo( config );

  for( ulong i=0UL; i<config->topo.tile_cnt; i++ ) {
    fd_topo_tile_t * tile = &config->topo.tiles[ i ];
    if( FD_UNLIKELY( !strcmp( tile->name, "scap" ) ) ) {
      /* This is not part of the config, and it must be set manually
         on purpose as a safety mechanism. */
      tile->shredcap.enable_publish_stake_weights = 1;
      fd_cstr_ncpy( tile->shredcap.manifest_path, args->repair.manifest_path, PATH_MAX );
    }
    if( FD_UNLIKELY( !strcmp( tile->name, "repair" ) ) ) {
      tile->repair.end_slot = args->repair.end_slot;
    }
  }

  fd_topo_print_log( 1, &config->topo );

  /* Run every configure stage, then initialize workspaces as a normal
     `run` would. */
  args_t configure_args = {
    .configure.command = CONFIGURE_CMD_INIT,
  };
  for( ulong i=0UL; STAGES[ i ]; i++ ) {
    configure_args.configure.stages[ i ] = STAGES[ i ];
  }
  configure_cmd_fn( &configure_args, config );
  if( 0==strcmp( config->net.provider, "xdp" ) ) {
    fd_topo_install_xdp_simple( &config->topo, config->net.bind_address_parsed );
  }
  run_firedancer_init( config, 1, 0 );

  fd_log_private_shared_lock[ 1 ] = 0;
  fd_topo_join_workspaces( &config->topo, FD_SHMEM_JOIN_MODE_READ_WRITE, FD_TOPO_CORE_DUMP_LEVEL_DISABLED );

  fd_topo_fill( &config->topo );

  /* Access repair workspace memory and metrics */

  ulong repair_tile_idx = fd_topo_find_tile( &config->topo, "repair", 0UL );
  ulong shred_tile_idx = fd_topo_find_tile( &config->topo, "shred", 0UL );
  FD_TEST( repair_tile_idx!=ULONG_MAX );
  FD_TEST( shred_tile_idx !=ULONG_MAX );
  fd_topo_tile_t * repair_tile = &config->topo.tiles[ repair_tile_idx ];
  fd_topo_tile_t * shred_tile = &config->topo.tiles[ shred_tile_idx ];

  fd_topo_wksp_t * repair_wksp;
  ctx_t * repair_ctx;
  repair_ctx_wksp( args, config, &repair_ctx, &repair_wksp );

  volatile ulong * shred_metrics = fd_metrics_tile( shred_tile->metrics );
  volatile ulong * repair_metrics = fd_metrics_tile( repair_tile->metrics );
  FD_TEST( repair_metrics );
  /* NOTE(review): FD_METRICS_TOTAL_SZ already looks like a byte size
     (print_tile_metrics copies FD_METRICS_TOTAL_SZ/sizeof(ulong)
     elements), so multiplying by sizeof(ulong) here likely
     over-allocates 8x -- harmless but confirm. */
  ulong * repair_metrics_prev = aligned_alloc( 8UL, sizeof(ulong) * FD_METRICS_TOTAL_SZ );
  FD_TEST( repair_metrics_prev );
  memset( repair_metrics_prev, 0, sizeof(ulong) * FD_METRICS_TOTAL_SZ );

  /* Collect link metrics */

  /* Collect all net tiles and their repair_net link metrics */
  ulong net_cnt = config->layout.net_tile_count;
  volatile ulong ** repair_net_links = aligned_alloc( 8UL, net_cnt * sizeof(volatile ulong*) );
  volatile ulong ** net_shred_links = aligned_alloc( 8UL, net_cnt * sizeof(volatile ulong*) );
  FD_TEST( repair_net_links );
  FD_TEST( net_shred_links );

  for( ulong i = 0UL; i < net_cnt; i++ ) {
    ulong tile_idx = fd_topo_find_tile( &config->topo, "net", i );
    if( FD_UNLIKELY( tile_idx == ULONG_MAX ) ) FD_LOG_ERR(( "net tile %lu not found", i ));
    fd_topo_tile_t * tile = &config->topo.tiles[ tile_idx ];

    ulong repair_net_in_idx = fd_topo_find_tile_in_link( &config->topo, tile, "repair_net", 0UL );
    if( FD_UNLIKELY( repair_net_in_idx == ULONG_MAX ) ) FD_LOG_ERR(( "repair_net link not found for net tile %lu", i ));
    FD_TEST( tile->metrics );
    repair_net_links[i] = fd_metrics_link_in( tile->metrics, repair_net_in_idx );
    FD_TEST( repair_net_links[i] );

    /* process all net_shred links */
    /* NOTE(review): these locals shadow the outer shred_tile_idx /
       shred_tile and re-find shred tile 0 each iteration; only the
       net_shred in-link index varies with i. */
    ulong shred_tile_idx = fd_topo_find_tile( &config->topo, "shred", 0 );
    if( FD_UNLIKELY( shred_tile_idx == ULONG_MAX ) ) FD_LOG_ERR(( "shred tile 0 not found" ));
    fd_topo_tile_t * shred_tile = &config->topo.tiles[ shred_tile_idx ];

    ulong shred_out_in_idx = fd_topo_find_tile_in_link( &config->topo, shred_tile, "net_shred", i );
    if( FD_UNLIKELY( shred_out_in_idx == ULONG_MAX ) ) FD_LOG_ERR(( "net_shred link not found for shred tile 0" ));
    FD_TEST( shred_tile->metrics );
    net_shred_links[i] = fd_metrics_link_in( shred_tile->metrics, shred_out_in_idx );
    FD_TEST( net_shred_links[i] );
  }

  FD_LOG_NOTICE(( "Repair catchup run" ));

  ulong shred_out_link_idx = fd_topo_find_link( &config->topo, "shred_out", 0UL );
  FD_TEST( shred_out_link_idx!=ULONG_MAX );
  fd_topo_link_t * shred_out_link = &config->topo.links[ shred_out_link_idx ];
  fd_frag_meta_t * shred_out_mcache = shred_out_link->mcache;

  /* turbine_slot0 is the catchup target: either --end-slot, or the
     slot of the first shred observed coming off turbine. */
  ulong turbine_slot0 = 0;
  long last_print = fd_log_wallclock();
  ulong last_sent = 0UL;

  if( FD_LIKELY( args->repair.end_slot ) ) turbine_slot0 = args->repair.end_slot;

  fd_topo_run_single_process( &config->topo, 0, config->uid, config->gid, fdctl_tile_run );
  for(;;) {

    if( FD_UNLIKELY( !turbine_slot0 ) ) {
      fd_frag_meta_t * frag = &shred_out_mcache[0]; /* hack to get first frag */
      if ( frag->sz > 0 ) {
        turbine_slot0 = fd_disco_shred_out_shred_sig_slot( frag->sig );
        FD_LOG_NOTICE(("turbine_slot0: %lu", turbine_slot0));
      }
    }

    /* print metrics */

    long now = fd_log_wallclock();
    int catchup_finished = 0;
    if( FD_UNLIKELY( now - last_print > 1e9L ) ) { /* once a second */
      print_tile_metrics( shred_metrics, repair_metrics, repair_metrics_prev, repair_net_links, net_shred_links, net_cnt, &last_sent, last_print, now );
      ulong slots_behind = turbine_slot0 > repair_metrics[ MIDX( COUNTER, REPAIR, REPAIRED_SLOTS ) ] ? turbine_slot0 - repair_metrics[ MIDX( COUNTER, REPAIR, REPAIRED_SLOTS ) ] : 0;
      printf(" Repaired slots: %lu/%lu (slots behind: %lu)\n", repair_metrics[ MIDX( COUNTER, REPAIR, REPAIRED_SLOTS ) ], turbine_slot0, slots_behind );
      /* Done when the target is known, we're not behind, and (when an
         end slot was given) the tile's profiler reports completion. */
      if( turbine_slot0 && !slots_behind && ( !args->repair.end_slot || FD_VOLATILE_CONST( repair_ctx->profiler.complete ) ) ) {
        catchup_finished = 1;
      }
      printf("\n");
      fflush( stdout );
      last_print = now;
    }

    if( FD_UNLIKELY( catchup_finished ) ) {
      /* repair cmd owned memory */
      location_table = fd_location_table_join( fd_location_table_new( location_table_mem ) );
      read_iptable( args->repair.iptable_path, location_table );
      print_peer_location_latency( repair_wksp->wksp, repair_ctx );
      print_catchup_slots( repair_wksp->wksp, repair_ctx, 0, 1 );
      FD_LOG_NOTICE(("Catchup to slot %lu completed successfully", turbine_slot0));
      fd_sys_util_exit_group( 0 );
    }
  }
}
800 :
/* Tests equivocation detection & repair path.  Boots the reduced
   repair topology in-process, then once the repair tile has repaired
   at least one slot, publishes a single slot-confirmed message on the
   tower_out link (which downstream consumers use for equivocation
   handling) and idles forever. */
static void
repair_cmd_fn_eqvoc( args_t * args,
                     config_t * config ) {
  memset( &config->topo, 0, sizeof(config->topo) );
  repair_topo( config );

  for( ulong i=0UL; i<config->topo.tile_cnt; i++ ) {
    fd_topo_tile_t * tile = &config->topo.tiles[ i ];
    if( FD_UNLIKELY( !strcmp( tile->name, "scap" ) ) ) {
      /* This is not part of the config, and it must be set manually
         on purpose as a safety mechanism. */
      tile->shredcap.enable_publish_stake_weights = 1;
      fd_cstr_ncpy( tile->shredcap.manifest_path, args->repair.manifest_path, PATH_MAX );
    }
    if( FD_UNLIKELY( !strcmp( tile->name, "repair" ) ) ) {
      tile->repair.end_slot = args->repair.end_slot;
    }
  }

  FD_LOG_NOTICE(( "Repair eqvoc testing init" ));
  fd_topo_print_log( 1, &config->topo );

  /* Same configure/init sequence as the catchup subcommand. */
  args_t configure_args = { .configure.command = CONFIGURE_CMD_INIT, };
  for( ulong i=0UL; STAGES[ i ]; i++ ) configure_args.configure.stages[ i ] = STAGES[ i ];
  configure_cmd_fn( &configure_args, config );
  if( 0==strcmp( config->net.provider, "xdp" ) ) fd_topo_install_xdp_simple( &config->topo, config->net.bind_address_parsed );

  run_firedancer_init( config, 1, 0 );
  fd_log_private_shared_lock[ 1 ] = 0;
  fd_topo_join_workspaces( &config->topo, FD_SHMEM_JOIN_MODE_READ_WRITE, FD_TOPO_CORE_DUMP_LEVEL_DISABLED );
  fd_topo_fill( &config->topo );
  ulong repair_tile_idx = fd_topo_find_tile( &config->topo, "repair", 0UL );
  fd_topo_tile_t * repair_tile = &config->topo.tiles[ repair_tile_idx ];
  volatile ulong * repair_metrics = fd_metrics_tile( repair_tile->metrics );

  void * scratch = fd_topo_obj_laddr( &config->topo, repair_tile->tile_obj_id );
  if( FD_UNLIKELY( !scratch ) ) FD_LOG_ERR(( "Failed to access repair tile scratch memory" ));
  FD_SCRATCH_ALLOC_INIT( l, scratch );
  ctx_t * repair_ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(ctx_t), sizeof(ctx_t) );
  (void)repair_ctx;

  /* read tower_out mcache dcache */
  ulong tower_out_link_idx = fd_topo_find_link( &config->topo, "tower_out", 0UL );
  FD_TEST( tower_out_link_idx!=ULONG_MAX );
  fd_topo_link_t * tower_out_link = &config->topo.links[ tower_out_link_idx ];
  fd_frag_meta_t * tower_out_mcache = tower_out_link->mcache;
  fd_wksp_t * tower_out_mem = config->topo.workspaces[ config->topo.objs[ tower_out_link->dcache_obj_id ].wksp_id ].wksp;
  ulong tower_out_chunk0 = fd_dcache_compact_chunk0( tower_out_mem, tower_out_link->dcache );
  ulong tower_out_wmark = fd_dcache_compact_wmark( tower_out_mem, tower_out_link->dcache, tower_out_link->mtu );
  ulong tower_out_chunk = tower_out_chunk0;

  fd_topo_run_single_process( &config->topo, 0, config->uid, config->gid, fdctl_tile_run );
  int confirmed = 0;
  for(;;) {
    /* publish a confirmation on tower_out */
    if( FD_UNLIKELY( !confirmed && repair_metrics[ MIDX( COUNTER, REPAIR, REPAIRED_SLOTS ) ] != 0 ) ) {
      fd_tower_slot_confirmed_t * msg = fd_chunk_to_laddr( tower_out_mem, tower_out_chunk );
      /* NOTE(review): msg->slot is logged and the message is published
         without msg->slot ever being written here -- the confirmed
         slot is whatever bytes happen to be in the dcache chunk.
         Confirm whether a slot should be assigned before publishing. */
      FD_LOG_NOTICE(( "publishing confirmation for slot %lu", msg->slot ));
      fd_mcache_publish( tower_out_mcache, tower_out_link->depth, 0, FD_TOWER_SIG_SLOT_CONFIRMED, tower_out_chunk, sizeof(fd_tower_slot_confirmed_t), 0, 0, 0 );
      tower_out_chunk = fd_dcache_compact_next( tower_out_chunk, sizeof(fd_tower_slot_confirmed_t), tower_out_chunk0, tower_out_wmark );
      confirmed = 1;
    }
    sleep( 1 );
  }
}
867 :
/* repair_cmd_fn_metrics attaches read-only to an already-running
   instance and prints the shred/repair metric summary (via
   print_tile_metrics) once a second, forever. */
static void
repair_cmd_fn_metrics( args_t * args,
                       config_t * config ) {
  //memset( &config->topo, 0, sizeof(config->topo) );

  fd_log_private_shared_lock[ 1 ] = 0;
  fd_topo_join_workspaces( &config->topo, FD_SHMEM_JOIN_MODE_READ_ONLY, FD_TOPO_CORE_DUMP_LEVEL_DISABLED );
  fd_topo_fill( &config->topo );

  ctx_t * repair_ctx;
  fd_topo_wksp_t * repair_wksp;
  repair_ctx_wksp( args, config, &repair_ctx, &repair_wksp );

  ulong shred_tile_idx = fd_topo_find_tile( &config->topo, "shred", 0UL );
  ulong repair_tile_idx = fd_topo_find_tile( &config->topo, "repair", 0UL );
  FD_TEST( shred_tile_idx != ULONG_MAX );
  FD_TEST( repair_tile_idx!= ULONG_MAX );
  fd_topo_tile_t * shred_tile = &config->topo.tiles[ shred_tile_idx ];
  fd_topo_tile_t * repair_tile = &config->topo.tiles[ repair_tile_idx ];

  volatile ulong * shred_metrics = fd_metrics_tile( shred_tile->metrics );
  FD_TEST( shred_metrics );

  volatile ulong * repair_metrics = fd_metrics_tile( repair_tile->metrics );
  FD_TEST( repair_metrics );
  /* Zeroed previous-snapshot buffer for interval diffs inside
     print_tile_metrics.  NOTE(review): as in the catchup path,
     sizeof(ulong)*FD_METRICS_TOTAL_SZ likely over-allocates if
     FD_METRICS_TOTAL_SZ is already in bytes -- confirm. */
  ulong * repair_metrics_prev = aligned_alloc( 8UL, sizeof(ulong) * FD_METRICS_TOTAL_SZ );
  FD_TEST( repair_metrics_prev );
  memset( repair_metrics_prev, 0, sizeof(ulong) * FD_METRICS_TOTAL_SZ );


  /* Per-net-tile link metric pointers: repair->net (outgoing requests)
     and net->shred (incoming shreds). */
  ulong net_tile_cnt = config->layout.net_tile_count;
  volatile ulong ** repair_net_links = aligned_alloc( 8UL, net_tile_cnt * sizeof(volatile ulong*) );
  volatile ulong ** net_shred_links = aligned_alloc( 8UL, net_tile_cnt * sizeof(volatile ulong*) );
  FD_TEST( repair_net_links );
  FD_TEST( net_shred_links );

  for( ulong i = 0UL; i < net_tile_cnt; i++ ) {
    /* process all repair_net links */
    ulong tile_idx = fd_topo_find_tile( &config->topo, "net", i );
    if( FD_UNLIKELY( tile_idx == ULONG_MAX ) ) FD_LOG_ERR(( "net tile %lu not found", i ));
    fd_topo_tile_t * tile = &config->topo.tiles[ tile_idx ];

    ulong repair_net_in_idx = fd_topo_find_tile_in_link( &config->topo, tile, "repair_net", 0UL );
    if( FD_UNLIKELY( repair_net_in_idx == ULONG_MAX ) ) FD_LOG_ERR(( "repair_net link not found for net tile %lu", i ));
    repair_net_links[i] = fd_metrics_link_in( tile->metrics, repair_net_in_idx );
    FD_TEST( repair_net_links[i] );

    /* process all net_shred links */
    tile_idx = fd_topo_find_tile( &config->topo, "shred", 0 );
    if( FD_UNLIKELY( tile_idx == ULONG_MAX ) ) FD_LOG_ERR(( "shred tile 0 not found" ));
    fd_topo_tile_t * shred_tile = &config->topo.tiles[ tile_idx ];

    ulong shred_out_in_idx = fd_topo_find_tile_in_link( &config->topo, shred_tile, "net_shred", i );
    if( FD_UNLIKELY( shred_out_in_idx == ULONG_MAX ) ) FD_LOG_ERR(( "net_shred link not found for shred tile 0" ));
    net_shred_links[i] = fd_metrics_link_in( shred_tile->metrics, shred_out_in_idx );
    FD_TEST( net_shred_links[i] );
  }

  /* Busy-poll, printing roughly once a second. */
  long last_print_ts = fd_log_wallclock();
  ulong last_sent = 0UL;
  for(;;) {
    long now = fd_log_wallclock();
    if( FD_UNLIKELY( now - last_print_ts > 1e9L ) ) {
      print_tile_metrics( shred_metrics, repair_metrics, repair_metrics_prev, repair_net_links, net_shred_links, net_tile_cnt, &last_sent, last_print_ts, now );
      last_print_ts = now;
    }
  }
}
936 :
937 : static void
938 : repair_cmd_fn_forest( args_t * args,
939 0 : config_t * config ) {
940 0 : ctx_t * repair_ctx;
941 0 : fd_topo_wksp_t * repair_wksp;
942 0 : repair_ctx_wksp( args, config, &repair_ctx, &repair_wksp );
943 :
944 0 : ulong forest_gaddr = fd_wksp_gaddr_fast( repair_ctx->wksp, repair_ctx->forest );
945 0 : fd_forest_t * forest = (fd_forest_t *)fd_wksp_laddr( repair_wksp->wksp, forest_gaddr );
946 :
947 0 : for( ;; ) {
948 0 : fd_forest_print( forest );
949 0 : sleep( 1 );
950 0 : }
951 0 : }
952 :
/* repair_cmd_fn_inflight prints the outstanding (inflight) repair
   requests once a second, forever.  Pointers read out of the tile ctx
   are local addresses in the repair tile's own mapping, so they are
   translated through workspace gaddrs into this process's mapping. */
static void
repair_cmd_fn_inflight( args_t * args,
                        config_t * config ) {
  ctx_t * repair_ctx;
  fd_topo_wksp_t * repair_wksp;
  repair_ctx_wksp( args, config, &repair_ctx, &repair_wksp );

  ulong inflights_gaddr = fd_wksp_gaddr_fast( repair_ctx->wksp, repair_ctx->inflights );
  fd_inflights_t * inflights = (fd_inflights_t *)fd_wksp_laddr( repair_wksp->wksp, inflights_gaddr );

  /* inflights->pool is a laddr in the tile's address space; recover
     its offset from the inflights struct so we can relocate it here.
     Assumes the pool lives at a higher address than the struct --
     TODO confirm. */
  ulong inflight_pool_off = (ulong)inflights->pool - (ulong)repair_ctx->inflights;
  fd_inflight_t * inflight_pool = (fd_inflight_t *)fd_wksp_laddr( repair_wksp->wksp, inflights_gaddr + inflight_pool_off );

  for( ;; ) {
    fd_inflights_print( inflights->outstanding_dl, inflight_pool );
    sleep( 1 );
  }
}
971 :
972 : static void
973 : repair_cmd_fn_requests( args_t * args,
974 0 : config_t * config ) {
975 0 : ctx_t * repair_ctx;
976 0 : fd_topo_wksp_t * repair_wksp;
977 0 : repair_ctx_wksp( args, config, &repair_ctx, &repair_wksp );
978 :
979 0 : fd_forest_t * forest = fd_forest_join( fd_wksp_laddr( repair_wksp->wksp, fd_wksp_gaddr_fast( repair_ctx->wksp, repair_ctx->forest ) ) );
980 0 : fd_forest_reqslist_t * dlist = fd_forest_reqslist( forest );
981 0 : fd_forest_ref_t * pool = fd_forest_reqspool( forest );
982 :
983 0 : fd_forest_reqslist_t * orphlist = fd_forest_orphlist( forest );
984 :
985 0 : for( ;; ) {
986 0 : printf("%-15s %-12s %-12s %-12s %-20s %-12s\n",
987 0 : "Slot", "Buffered Idx", "Complete Idx", "First Shred ts", "Turbine Cnt", "Repair Cnt");
988 0 : printf("%-15s %-12s %-12s %-12s %-20s %-12s\n",
989 0 : "---------------", "------------", "------------", "------------",
990 0 : "--------------------", "------------");
991 0 : for( fd_forest_reqslist_iter_t iter = fd_forest_reqslist_iter_fwd_init( dlist, pool );
992 0 : !fd_forest_reqslist_iter_done( iter, dlist, pool );
993 0 : iter = fd_forest_reqslist_iter_fwd_next( iter, dlist, pool ) ) {
994 0 : fd_forest_ref_t * req = fd_forest_reqslist_iter_ele( iter, dlist, pool );
995 0 : fd_forest_blk_t * blk = fd_forest_pool_ele( fd_forest_pool( forest ), req->idx );
996 :
997 0 : printf("%-15lu %-12u %-12u %-20ld %-12u %-10u\n",
998 0 : blk->slot,
999 0 : blk->buffered_idx,
1000 0 : blk->complete_idx,
1001 0 : blk->first_shred_ts,
1002 0 : blk->turbine_cnt,
1003 0 : blk->repair_cnt);
1004 0 : }
1005 0 : printf("\n");
1006 :
1007 : /* now lets print the orphreqs */
1008 :
1009 0 : printf("Orphan Requests:\n");
1010 0 : printf("%-15s %-12s %-12s %-12s %-20s %-12s %-10s\n",
1011 0 : "Slot", "Consumed Idx", "Buffered Idx", "Complete Idx",
1012 0 : "First Shred Timestamp", "Turbine Cnt", "Repair Cnt");
1013 0 : printf("%-15s %-12s %-12s %-12s %-20s %-12s %-10s\n",
1014 0 : "---------------", "------------", "------------", "------------",
1015 0 : "--------------------", "------------", "----------");
1016 :
1017 0 : for( fd_forest_reqslist_iter_t iter = fd_forest_reqslist_iter_fwd_init( orphlist, pool );
1018 0 : !fd_forest_reqslist_iter_done( iter, orphlist, pool );
1019 0 : iter = fd_forest_reqslist_iter_fwd_next( iter, orphlist, pool ) ) {
1020 0 : fd_forest_ref_t * req = fd_forest_reqslist_iter_ele( iter, orphlist, pool );
1021 0 : fd_forest_blk_t * blk = fd_forest_pool_ele( fd_forest_pool( forest ), req->idx );
1022 0 : printf("%-15lu %-12u %-12u %-20ld %-12u %-10u\n",
1023 0 : blk->slot,
1024 0 : blk->buffered_idx,
1025 0 : blk->complete_idx,
1026 0 : blk->first_shred_ts,
1027 0 : blk->turbine_cnt,
1028 0 : blk->repair_cnt);
1029 0 : }
1030 0 : sleep( 1 );
1031 0 : }
1032 0 : }
1033 :
1034 : static void
1035 : repair_cmd_fn_waterfall( args_t * args,
1036 0 : config_t * config ) {
1037 :
1038 0 : fd_topo_t * topo = &config->topo;
1039 0 : ulong wksp_id = fd_topo_find_wksp( topo, "repair" );
1040 0 : if( FD_UNLIKELY( wksp_id==ULONG_MAX ) ) FD_LOG_ERR(( "repair workspace not found" ));
1041 0 : fd_topo_wksp_t * repair_wksp = &topo->workspaces[ wksp_id ];
1042 0 : fd_topo_join_workspace( topo, repair_wksp, FD_SHMEM_JOIN_MODE_READ_ONLY, FD_TOPO_CORE_DUMP_LEVEL_DISABLED );
1043 :
1044 : /* Access the repair tile scratch memory where repair_tile_ctx is stored */
1045 0 : ulong tile_id = fd_topo_find_tile( topo, "repair", 0UL );
1046 0 : if( FD_UNLIKELY( tile_id==ULONG_MAX ) ) FD_LOG_ERR(( "repair tile not found" ));
1047 0 : fd_topo_tile_t * tile = &topo->tiles[ tile_id ];
1048 0 : void * scratch = fd_topo_obj_laddr( &config->topo, tile->tile_obj_id );
1049 0 : if( FD_UNLIKELY( !scratch ) ) FD_LOG_ERR(( "Failed to access repair tile scratch memory" ));
1050 :
1051 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
1052 0 : ctx_t * repair_ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(ctx_t), sizeof(ctx_t) );
1053 :
1054 : /* catchup cmd owned memory */
1055 0 : location_table = fd_location_table_join( fd_location_table_new( location_table_mem ) );
1056 0 : read_iptable( args->repair.iptable_path, location_table );
1057 :
1058 : // Add terminal setup here - same as monitor.c
1059 0 : atexit( restore_terminal );
1060 0 : if( FD_UNLIKELY( 0!=tcgetattr( STDIN_FILENO, &termios_backup ) ) ) {
1061 0 : FD_LOG_ERR(( "tcgetattr(STDIN_FILENO) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
1062 0 : }
1063 :
1064 : /* Disable character echo and line buffering */
1065 0 : struct termios term = termios_backup;
1066 0 : term.c_lflag &= (tcflag_t)~(ICANON | ECHO);
1067 0 : if( FD_UNLIKELY( 0!=tcsetattr( STDIN_FILENO, TCSANOW, &term ) ) ) {
1068 0 : FD_LOG_WARNING(( "tcsetattr(STDIN_FILENO) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
1069 0 : }
1070 :
1071 0 : int catchup_verbose = 0;
1072 0 : long last_print = 0;
1073 0 : for( ;; ) {
1074 0 : int c = fd_getchar();
1075 0 : if( FD_UNLIKELY( c=='i' ) ) catchup_verbose = !catchup_verbose;
1076 0 : if( FD_UNLIKELY( c=='\x04' ) ) break; /* Ctrl-D */
1077 :
1078 0 : long now = fd_log_wallclock();
1079 0 : if( FD_UNLIKELY( now - last_print > 1e9L ) ) {
1080 0 : last_print = now;
1081 0 : print_catchup_slots( repair_wksp->wksp, repair_ctx, catchup_verbose, args->repair.sort_by_slot );
1082 0 : printf( "catchup slots | Use 'i' to toggle extra slot information" TEXT_NEWLINE );
1083 0 : fflush( stdout );
1084 :
1085 : /* Peer location latency is not that useful post catchup, and also
1086 : requires some concurrent dlist iteration, so only print it when
1087 : in catchup mode. */
1088 0 : }
1089 0 : }
1090 0 : }
1091 :
1092 : static void
1093 : repair_cmd_fn_peers( args_t * args,
1094 0 : config_t * config ) {
1095 0 : ctx_t * repair_ctx;
1096 0 : fd_topo_wksp_t * repair_wksp;
1097 0 : repair_ctx_wksp( args, config, &repair_ctx, &repair_wksp );
1098 :
1099 0 : fd_policy_t * policy = fd_wksp_laddr( repair_wksp->wksp, fd_wksp_gaddr_fast( repair_ctx->wksp, repair_ctx->policy ) );
1100 :
1101 0 : fd_policy_peer_dlist_t * best_dlist = fd_wksp_laddr( repair_wksp->wksp, fd_wksp_gaddr_fast( repair_ctx->wksp, policy->peers.fast ) );
1102 0 : fd_policy_peer_dlist_t * worst_dlist = fd_wksp_laddr( repair_wksp->wksp, fd_wksp_gaddr_fast( repair_ctx->wksp, policy->peers.slow ) );
1103 0 : fd_policy_peer_t * pool = fd_wksp_laddr( repair_wksp->wksp, fd_wksp_gaddr_fast( repair_ctx->wksp, policy->peers.pool ) );
1104 :
1105 0 : printf("FAST REPAIR PEERS (latency < 80ms)\n");
1106 0 : int i = 1;
1107 0 : for( fd_policy_peer_dlist_iter_t iter = fd_policy_peer_dlist_iter_fwd_init( best_dlist, pool );
1108 0 : !fd_policy_peer_dlist_iter_done( iter, best_dlist, pool );
1109 0 : iter = fd_policy_peer_dlist_iter_fwd_next( iter, best_dlist, pool ) ) {
1110 0 : fd_policy_peer_t * peer = fd_policy_peer_dlist_iter_ele( iter, best_dlist, pool );
1111 0 : FD_BASE58_ENCODE_32_BYTES( peer->key.key, p );
1112 0 : printf(" %d. %s\n", i, p );
1113 0 : i++;
1114 0 : }
1115 :
1116 0 : printf("SLOW REPAIR PEERS (latency > 80ms)\n");
1117 0 : i = 1;
1118 0 : for( fd_policy_peer_dlist_iter_t iter = fd_policy_peer_dlist_iter_fwd_init( worst_dlist, pool );
1119 0 : !fd_policy_peer_dlist_iter_done( iter, worst_dlist, pool );
1120 0 : iter = fd_policy_peer_dlist_iter_fwd_next( iter, worst_dlist, pool ) ) {
1121 0 : fd_policy_peer_t * peer = fd_policy_peer_dlist_iter_ele( iter, worst_dlist, pool );
1122 0 : FD_BASE58_ENCODE_32_BYTES( peer->key.key, p );
1123 0 : printf(" %d. %s\n", i, p);
1124 0 : i++;
1125 0 : }
1126 0 : }
1127 :
1128 :
/* Help text for the repair command and its subcommands.  The usage
   synopsis must list every subcommand dispatched by repair_cmd_help
   (the "eqvoc" subcommand was previously missing from both synopsis
   lines, and INFLIGHT_HELP lacked its trailing newline). */
static const char * HELP =
  "\n\n"
  "usage: repair [-h] {catchup,eqvoc,forest,inflight,requests,waterfall,peers,metrics} ...\n"
  "\n"
  "positional arguments:\n"
  "  {catchup,eqvoc,forest,inflight,requests,waterfall,peers,metrics}\n"
  "    catchup    runs Firedancer with a reduced topology that only repairs slots until catchup\n"
  "    eqvoc      tests equivocation detection & repair path\n"
  "    forest     prints the repair forest\n"
  "    inflight   prints the inflight repairs\n"
  "    requests   prints the queued repair requests\n"
  "    waterfall  prints a waterfall diagram of recent slot completion times and response latencies\n"
  "    peers      prints list of slow and fast repair peers\n"
  "    metrics    prints repair tile metrics in a digestible format\n"
  "\n"
  "optional arguments:\n"
  "  -h, --help  show this help message and exit\n";

static const char * CATCHUP_HELP =
  "\n\n"
  "usage: repair catchup [-h] [--manifest-path MANIFEST_PATH] [--iptable-path IPTABLE_PATH] [--sort-by-slot]\n"
  "\n"
  "required arguments:\n"
  "  --manifest-path MANIFEST_PATH\n"
  "              path to manifest file\n"
  "\n"
  "optional arguments:\n"
  "  -h, --help  show this help message and exit\n"
  "  --end-slot END_SLOT\n"
  "              slot to catchup to (generally should be a rooted slot)\n"
  "  --iptable-path IPTABLE_PATH\n"
  "              path to iptable file\n"
  "  --sort-by-slot\n"
  "              sort results by slot\n";

static const char * EQVOC_HELP =
  "\n\n"
  "usage: repair eqvoc [-h] [--manifest-path MANIFEST_PATH] \n"
  "\n"
  "optional arguments:\n"
  "  -h, --help  show this help message and exit\n";

static const char * FOREST_HELP =
  "\n\n"
  "usage: repair forest [-h]\n"
  "\n"
  "optional arguments:\n"
  "  -h, --help  show this help message and exit\n"
  "  --slot SLOT specific forest slot to drill into\n";

static const char * INFLIGHT_HELP =
  "\n\n"
  "usage: repair inflight [-h]\n"
  "\n"
  "optional arguments:\n"
  "  -h, --help  show this help message and exit\n";

static const char * REQUESTS_HELP =
  "\n\n"
  "usage: repair requests [-h]\n"
  "\n"
  "optional arguments:\n"
  "  -h, --help  show this help message and exit\n";

static const char * WATERFALL_HELP =
  "\n\n"
  "usage: repair waterfall [-h] [--iptable IPTABLE_PATH] [--sort-by-slot]\n"
  "\n"
  "optional arguments:\n"
  "  -h, --help  show this help message and exit\n"
  "  --iptable IPTABLE_PATH\n"
  "              path to iptable file\n"
  "  --sort-by-slot\n"
  "              sort results by slot\n";

static const char * PEERS_HELP =
  "\n\n"
  "usage: repair peers [-h]\n"
  "\n"
  "optional arguments:\n"
  "  -h, --help  show this help message and exit\n";

static const char * METRICS_HELP =
  "\n\n"
  "usage: repair metrics [-h] --config CONFIG_PATH\n"
  "\n"
  "optional arguments:\n"
  "  -h, --help  show this help message and exit\n";
1214 :
1215 : void
1216 0 : repair_cmd_help( char const * arg ) {
1217 0 : if ( FD_LIKELY( !arg ) ) FD_LOG_NOTICE(( "%s", HELP ));
1218 0 : else if ( FD_LIKELY( !strcmp( arg, "catchup" ) ) ) FD_LOG_NOTICE(( "%s", CATCHUP_HELP ));
1219 0 : else if ( FD_LIKELY( !strcmp( arg, "eqvoc" ) ) ) FD_LOG_NOTICE(( "%s", EQVOC_HELP ));
1220 0 : else if ( FD_LIKELY( !strcmp( arg, "forest" ) ) ) FD_LOG_NOTICE(( "%s", FOREST_HELP ));
1221 0 : else if ( FD_LIKELY( !strcmp( arg, "inflight" ) ) ) FD_LOG_NOTICE(( "%s", INFLIGHT_HELP ));
1222 0 : else if ( FD_LIKELY( !strcmp( arg, "requests" ) ) ) FD_LOG_NOTICE(( "%s", REQUESTS_HELP ));
1223 0 : else if ( FD_LIKELY( !strcmp( arg, "waterfall" ) ) ) FD_LOG_NOTICE(( "%s", WATERFALL_HELP ));
1224 0 : else if ( FD_LIKELY( !strcmp( arg, "peers" ) ) ) FD_LOG_NOTICE(( "%s", PEERS_HELP ));
1225 0 : else if ( FD_LIKELY( !strcmp( arg, "metrics" ) ) ) FD_LOG_NOTICE(( "%s", METRICS_HELP ));
1226 0 : else FD_LOG_NOTICE(( "%s", HELP ));
1227 0 : }
1228 :
/* repair_cmd_args parses the command line for the repair action.  The
   fd_env_strip_cmdline_* helpers remove the flags they recognize from
   (*pargc, *pargv) as a side effect, so the call order below matters:
   flags are stripped first, leaving the subcommand name as the first
   remaining positional argument.  On missing/invalid input the function
   sets args->repair.help and returns so the caller prints usage. */
void
repair_cmd_args( int * pargc,
                 char *** pargv,
                 args_t * args ) {

  /* help */

  args->repair.help = fd_env_strip_cmdline_contains( pargc, pargv, "--help" );
  args->repair.help = args->repair.help || fd_env_strip_cmdline_contains( pargc, pargv, "-h" );

  /* positional arg: the subcommand name ("catchup", "forest", ...).
     Points directly into the caller's argv; no copy is made. */

  args->repair.pos_arg = (*pargv)[0];
  if( FD_UNLIKELY( !args->repair.pos_arg ) ) {
    args->repair.help = 1; /* no subcommand given -> show usage */
    return;
  }

  /* required args */

  char const * manifest_path = fd_env_strip_cmdline_cstr ( pargc, pargv, "--manifest-path", NULL, NULL );

  /* optional args (ULONG_MAX / 0 act as "unset" defaults) */

  char const * iptable_path = fd_env_strip_cmdline_cstr ( pargc, pargv, "--iptable", NULL, NULL );
  ulong slot = fd_env_strip_cmdline_ulong ( pargc, pargv, "--slot", NULL, ULONG_MAX );
  int sort_by_slot = fd_env_strip_cmdline_contains( pargc, pargv, "--sort-by-slot" );
  ulong end_slot = fd_env_strip_cmdline_ulong ( pargc, pargv, "--end-slot", NULL, 0 );

  /* Only the "catchup" subcommand requires --manifest-path. */
  if( FD_UNLIKELY( !strcmp( args->repair.pos_arg, "catchup" ) && !manifest_path ) ) {
    args->repair.help = 1;
    return;
  } else {
    /* Consume the positional subcommand from the remaining arg count.
       NOTE(review): *pargv is not advanced past the subcommand here, so
       pargc and pargv are left inconsistent — confirm callers only look
       at the stripped flags and not the residual argv. */
    (*pargc)--;
  }

  /* Copy path strings into the fixed-size buffers in args, truncating
     to leave room for the NUL terminator.  NOTE(review): manifest_path /
     iptable_path may be NULL here — presumably
     fd_cstr_append_cstr_safe treats NULL as an empty append; verify. */
  fd_cstr_fini( fd_cstr_append_cstr_safe( fd_cstr_init( args->repair.manifest_path ), manifest_path, sizeof(args->repair.manifest_path)-1UL ) );
  fd_cstr_fini( fd_cstr_append_cstr_safe( fd_cstr_init( args->repair.iptable_path ), iptable_path, sizeof(args->repair.iptable_path )-1UL ) );
  args->repair.slot = slot;
  args->repair.sort_by_slot = sort_by_slot;
  args->repair.end_slot = end_slot;
}
1271 :
1272 : static void
1273 : repair_cmd_fn( args_t * args,
1274 0 : config_t * config ) {
1275 :
1276 0 : if( args->repair.help ) {
1277 0 : repair_cmd_help( args->repair.pos_arg );
1278 0 : return;
1279 0 : }
1280 :
1281 0 : if ( !strcmp( args->repair.pos_arg, "catchup" ) ) repair_cmd_fn_catchup ( args, config );
1282 0 : else if( !strcmp( args->repair.pos_arg, "eqvoc" ) ) repair_cmd_fn_eqvoc ( args, config );
1283 0 : else if( !strcmp( args->repair.pos_arg, "forest" ) ) repair_cmd_fn_forest ( args, config );
1284 0 : else if( !strcmp( args->repair.pos_arg, "inflight" ) ) repair_cmd_fn_inflight ( args, config );
1285 0 : else if( !strcmp( args->repair.pos_arg, "requests" ) ) repair_cmd_fn_requests ( args, config );
1286 0 : else if( !strcmp( args->repair.pos_arg, "waterfall" ) ) repair_cmd_fn_waterfall( args, config );
1287 0 : else if( !strcmp( args->repair.pos_arg, "peers" ) ) repair_cmd_fn_peers ( args, config );
1288 0 : else if( !strcmp( args->repair.pos_arg, "metrics" ) ) repair_cmd_fn_metrics ( args, config );
1289 0 : else repair_cmd_help( NULL );
1290 0 : }
1291 :
/* Action descriptor wiring the "repair" command into the CLI action
   table: repair_cmd_args parses the command line, repair_cmd_fn
   dispatches the subcommand, and permissions are shared with the dev
   command via dev_cmd_perm. */
action_t fd_action_repair = {
  .name = "repair",
  .args = repair_cmd_args,
  .fn   = repair_cmd_fn,
  .perm = dev_cmd_perm,
};
|