@@ -319,18 +319,19 @@ private boolean doStart(boolean initial) {
319
319
// Initialize the endpoints.
320
320
maybeUpdateEndpoints ();
321
321
322
- logger .info ("[{}/{}] Watching the service, nodes and pods..." , namespace , serviceName );
323
322
watchService (service .getMetadata ().getResourceVersion ());
324
323
watchNode (nodes .getMetadata ().getResourceVersion ());
325
324
watchPod (pods .getMetadata ().getResourceVersion ());
326
325
} catch (Exception e ) {
327
- logger .warn ("[{}/{}] Failed to start {}." , namespace , serviceName , this , e );
326
+ logger .warn ("[{}/{}] Failed to start {}. (initial: {}) " , namespace , serviceName , this , initial , e );
328
327
if (initial ) {
329
328
failInit (e );
329
+ // Do not retry if the initialization fails since the error is likely to be persistent.
330
+ return false ;
330
331
} else {
331
- scheduleRestartWithBackoff (numStartFailures ++);
332
+ scheduleRestartWithBackoff (++numStartFailures );
333
+ return true ;
332
334
}
333
- return false ;
334
335
}
335
336
336
337
if (closed ) {
@@ -345,7 +346,8 @@ private boolean doStart(boolean initial) {
345
346
}
346
347
347
348
private void watchService (String resourceVersion ) {
348
- logger .info ("[{}/{}] Start the service watcher..." , namespace , serviceName );
349
+ logger .info ("[{}/{}] Start the service watcher... (resource version: {})" , namespace , serviceName ,
350
+ resourceVersion );
349
351
serviceWatch = doWatchService (resourceVersion );
350
352
logger .info ("[{}/{}] Service watcher is started." , namespace , serviceName );
351
353
}
@@ -391,7 +393,7 @@ public void onClose(WatcherException cause) {
391
393
logger .warn ("[{}/{}] Service watcher is closed." , namespace , serviceName , cause );
392
394
393
395
// Immediately retry on the first failure.
394
- scheduleRestartWithBackoff (numServiceFailures ++ );
396
+ scheduleRestartWithBackoff (++ numServiceFailures );
395
397
}
396
398
397
399
@ Override
@@ -437,7 +439,8 @@ private boolean updateService(Service service) {
437
439
}
438
440
439
441
private void watchPod (String resourceVersion ) {
440
- logger .info ("[{}/{}] Start the pod watcher..." , namespace , serviceName );
442
+ logger .info ("[{}/{}] Start the pod watcher... (resource version: {})" , namespace , serviceName ,
443
+ resourceVersion );
441
444
podWatch = doWatchPod (resourceVersion );
442
445
logger .info ("[{}/{}] Pod watcher is started." , namespace , serviceName );
443
446
}
@@ -464,7 +467,7 @@ public void onClose(WatcherException cause) {
464
467
}
465
468
466
469
logger .warn ("[{}/{}] Pod watcher is closed." , namespace , serviceName , cause );
467
- scheduleRestartWithBackoff (numPodFailures ++ );
470
+ scheduleRestartWithBackoff (++ numPodFailures );
468
471
}
469
472
470
473
@ Override
@@ -515,7 +518,8 @@ private boolean updatePod(Action action, Pod resource) {
515
518
}
516
519
517
520
private void watchNode (String resourceVersion ) {
518
- logger .info ("[{}/{}] Start the node watcher..." , namespace , serviceName );
521
+ logger .info ("[{}/{}] Start the node watcher... (resource version: {})" , namespace , serviceName ,
522
+ resourceVersion );
519
523
nodeWatch = doWatchNode (resourceVersion );
520
524
logger .info ("[{}/{}] Node watcher is started." , namespace , serviceName );
521
525
}
@@ -544,7 +548,7 @@ public void onClose(WatcherException cause) {
544
548
return ;
545
549
}
546
550
logger .warn ("[{}/{}] Node watcher is closed." , namespace , serviceName , cause );
547
- scheduleRestartWithBackoff (numNodeFailures ++ );
551
+ scheduleRestartWithBackoff (++ numNodeFailures );
548
552
}
549
553
550
554
@ Override
@@ -562,8 +566,8 @@ private boolean updateNode(Action action, Node node) {
562
566
}
563
567
564
568
final String nodeName = node .getMetadata ().getName ();
565
- logger .debug ("[{}/{}] Node event received. action: {}, node: {}" ,
566
- namespace , serviceName , action , nodeName );
569
+ logger .debug ("[{}/{}] Node event received. action: {}, node: {}, resource version: {} " ,
570
+ namespace , serviceName , action , nodeName , node . getMetadata (). getResourceVersion () );
567
571
switch (action ) {
568
572
case ADDED :
569
573
case MODIFIED :
@@ -596,6 +600,8 @@ private ScheduledFuture<?> scheduleJob(Runnable job, long delayMillis) {
596
600
597
601
private void scheduleRestartWithBackoff (int numFailures ) {
598
602
final long delayMillis = delayMillis (numFailures );
603
+ logger .info ("[{}/{}] Reconnecting to the Kubernetes API in {} ms (numFailures: {})" ,
604
+ namespace , serviceName , delayMillis , numFailures );
599
605
scheduleRestart (delayMillis );
600
606
}
601
607
@@ -631,11 +637,12 @@ private Runnable safeRunnable(Runnable job) {
631
637
};
632
638
}
633
639
634
- private static long delayMillis (int numAttempts ) {
635
- if (numAttempts == 0 ) {
640
+ private static long delayMillis (int numFailures ) {
641
+ if (numFailures == 1 ) {
642
+ // Retry immediately on the first failure.
636
643
return 0 ;
637
644
}
638
- return Backoff .ofDefault ().nextDelayMillis (numAttempts );
645
+ return Backoff .ofDefault ().nextDelayMillis (numFailures - 1 );
639
646
}
640
647
641
648
private void maybeUpdateEndpoints () {
0 commit comments