@@ -191,6 +191,16 @@ func RestartNetwork(ctx context.Context, log logging.Logger, dir string) error {
191
191
return network .Restart (ctx )
192
192
}
193
193
194
+ // Restart the provided nodes. Blocks on the nodes accepting API requests but not their health.
195
+ func restartNodes (ctx context.Context , nodes []* Node ) error {
196
+ for _ , node := range nodes {
197
+ if err := node .Restart (ctx ); err != nil {
198
+ return fmt .Errorf ("failed to restart node %s: %w" , node .NodeID , err )
199
+ }
200
+ }
201
+ return nil
202
+ }
203
+
194
204
// Reads a network from the provided directory.
195
205
func ReadNetwork (ctx context.Context , log logging.Logger , dir string ) (* Network , error ) {
196
206
canonicalDir , err := toCanonicalDir (dir )
@@ -441,26 +451,20 @@ func (n *Network) Bootstrap(ctx context.Context, log logging.Logger) error {
441
451
bootstrapNode .Flags [config .SybilProtectionEnabledKey ] = * existingSybilProtectionValue
442
452
}
443
453
454
+ // Ensure the bootstrap node is restarted to pick up subnet and chain configuration
455
+ //
456
+ // TODO(marun) This restart might be unnecessary if:
457
+ // - sybil protection didn't change
458
+ // - the node is not a subnet validator
444
459
log .Info ("restarting bootstrap node" ,
445
460
zap .Stringer ("nodeID" , bootstrapNode .NodeID ),
446
461
)
447
-
448
- if len (n .Nodes ) == 1 {
449
- // Ensure the node is restarted to pick up subnet and chain configuration
450
- return n .RestartNode (ctx , bootstrapNode )
462
+ if err := bootstrapNode .Restart (ctx ); err != nil {
463
+ return err
451
464
}
452
465
453
- // TODO(marun) This last restart of the bootstrap node might be unnecessary if:
454
- // - sybil protection didn't change
455
- // - the node is not a subnet validator
456
-
457
- // Ensure the bootstrap node is restarted to pick up configuration changes. Avoid using
458
- // RestartNode since the node won't be able to report healthy until other nodes are started.
459
- if err := bootstrapNode .Stop (ctx ); err != nil {
460
- return fmt .Errorf ("failed to stop node %s: %w" , bootstrapNode .NodeID , err )
461
- }
462
- if err := n .StartNode (ctx , bootstrapNode ); err != nil {
463
- return fmt .Errorf ("failed to start node %s: %w" , bootstrapNode .NodeID , err )
466
+ if len (n .Nodes ) == 1 {
467
+ return nil
464
468
}
465
469
466
470
log .Info ("starting remaining nodes" )
@@ -486,31 +490,6 @@ func (n *Network) StartNode(ctx context.Context, node *Node) error {
486
490
return nil
487
491
}
488
492
489
- // Restart a single node.
490
- func (n * Network ) RestartNode (ctx context.Context , node * Node ) error {
491
- runtimeConfig := node .getRuntimeConfig ()
492
- if runtimeConfig .Process != nil && runtimeConfig .Process .ReuseDynamicPorts {
493
- // Attempt to save the API port currently being used so the
494
- // restarted node can reuse it. This may result in the node
495
- // failing to start if the operating system allocates the port
496
- // to a different process between node stop and start.
497
- if err := node .SaveAPIPort (); err != nil {
498
- return err
499
- }
500
- }
501
-
502
- if err := node .Stop (ctx ); err != nil {
503
- return fmt .Errorf ("failed to stop node %s: %w" , node .NodeID , err )
504
- }
505
- if err := n .StartNode (ctx , node ); err != nil {
506
- return fmt .Errorf ("failed to start node %s: %w" , node .NodeID , err )
507
- }
508
- n .log .Info ("waiting for node to report healthy" ,
509
- zap .Stringer ("nodeID" , node .NodeID ),
510
- )
511
- return node .WaitForHealthy (ctx )
512
- }
513
-
514
493
// Stops all nodes in the network.
515
494
func (n * Network ) Stop (ctx context.Context ) error {
516
495
// Ensure the node state is up-to-date
@@ -540,11 +519,29 @@ func (n *Network) Stop(ctx context.Context) error {
540
519
return nil
541
520
}
542
521
543
- // Restarts all nodes in the network.
522
+ // Restarts all running nodes in the network.
544
523
func (n * Network ) Restart (ctx context.Context ) error {
545
524
n .log .Info ("restarting network" )
525
+ nodes := make ([]* Node , 0 , len (n .Nodes ))
546
526
for _ , node := range n .Nodes {
547
- if err := n .RestartNode (ctx , node ); err != nil {
527
+ if ! node .IsRunning () {
528
+ continue
529
+ }
530
+ nodes = append (nodes , node )
531
+ }
532
+ if err := restartNodes (ctx , nodes ); err != nil {
533
+ return err
534
+ }
535
+ return WaitForHealthyNodes (ctx , n .log , n .Nodes )
536
+ }
537
+
538
+ // Waits for the provided nodes to become healthy.
539
+ func WaitForHealthyNodes (ctx context.Context , log logging.Logger , nodes []* Node ) error {
540
+ for _ , node := range nodes {
541
+ log .Info ("waiting for node to become healthy" ,
542
+ zap .Stringer ("nodeID" , node .NodeID ),
543
+ )
544
+ if err := node .WaitForHealthy (ctx ); err != nil {
548
545
return err
549
546
}
550
547
}
@@ -669,15 +666,20 @@ func (n *Network) CreateSubnets(ctx context.Context, log logging.Logger, apiURI
669
666
if restartRequired {
670
667
log .Info ("restarting node(s) to enable them to track the new subnet(s)" )
671
668
669
+ runningNodes := make ([]* Node , 0 , len (reconfiguredNodes ))
672
670
for _ , node := range reconfiguredNodes {
673
- if len (node .URI ) == 0 {
674
- // Only running nodes should be restarted
675
- continue
676
- }
677
- if err := n .RestartNode (ctx , node ); err != nil {
678
- return err
671
+ if node .IsRunning () {
672
+ runningNodes = append (runningNodes , node )
679
673
}
680
674
}
675
+
676
+ if err := restartNodes (ctx , runningNodes ); err != nil {
677
+ return err
678
+ }
679
+
680
+ if err := WaitForHealthyNodes (ctx , n .log , runningNodes ); err != nil {
681
+ return err
682
+ }
681
683
}
682
684
683
685
// Add validators for the subnet
@@ -738,15 +740,21 @@ func (n *Network) CreateSubnets(ctx context.Context, log logging.Logger, apiURI
738
740
log .Info ("restarting node(s) to pick up chain configuration" )
739
741
740
742
// Restart nodes to allow configuration for the new chains to take effect
743
+ nodesToRestart := make ([]* Node , 0 , len (n .Nodes ))
741
744
for _ , node := range n .Nodes {
742
- if ! validatorsToRestart .Contains (node .NodeID ) {
743
- continue
744
- }
745
- if err := n .RestartNode (ctx , node ); err != nil {
746
- return err
745
+ if validatorsToRestart .Contains (node .NodeID ) {
746
+ nodesToRestart = append (nodesToRestart , node )
747
747
}
748
748
}
749
749
750
+ if err := restartNodes (ctx , nodesToRestart ); err != nil {
751
+ return err
752
+ }
753
+
754
+ if err := WaitForHealthyNodes (ctx , log , nodesToRestart ); err != nil {
755
+ return err
756
+ }
757
+
750
758
return nil
751
759
}
752
760
0 commit comments