From b7900fce29ebd0505b336e86ab995200e03adcbf Mon Sep 17 00:00:00 2001
From: pikehuang <148873604+pikehuang@users.noreply.github.com>
Date: Wed, 25 Oct 2023 15:41:31 +0800
Subject: [PATCH] fix(health check): check cluster status for multiple times
 (#2311)

in health check to get rid of socket connection failure #2310
---
 pkg/controller/controller_utils.go           | 36 ++++++++++++++++++++++++++++++++++++
 .../controller/cluster/cluster_controller.go |  2 +-
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/pkg/controller/controller_utils.go b/pkg/controller/controller_utils.go
index 3ed1a83c87..372dc9924b 100755
--- a/pkg/controller/controller_utils.go
+++ b/pkg/controller/controller_utils.go
@@ -21,10 +21,13 @@ package controller
 import (
 	"context"
 	"fmt"
+	"math/rand"
+	"time"
 
 	"k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/runtime"
+	"k8s.io/apimachinery/pkg/version"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/tools/cache"
 	"tkestack.io/tke/pkg/util/log"
@@ -109,3 +112,36 @@ func CatchPanic(funcName string, addon string) {
 		runtime.HandleError(fmt.Errorf("recover from %s.%s(), err is %v", addon, funcName, err))
 	}
 }
+
+// CheckClusterHealthStatus probes the cluster's API server version through
+// client, retrying on failure so that a transient error (for example a
+// socket connection failure) does not immediately mark the cluster unhealthy.
+//
+// At most three probes are made. After a failed probe that will be followed
+// by another one, the function sleeps for a random 20-99ms. It returns the
+// server version from the first successful probe, or a nil version and the
+// error of the last probe when all of them fail.
+func CheckClusterHealthStatus(client kubernetes.Interface) (*version.Info, error) {
+	const (
+		totalProbeCount = 3
+		minBackoffMs    = 20
+		backoffJitterMs = 80
+	)
+
+	var lastErr error
+	for i := 0; i < totalProbeCount; i++ {
+		if i > 0 {
+			// Back off only between probes, never after the final failure,
+			// so the caller is not delayed once no retry remains.
+			backoff := rand.Intn(backoffJitterMs) + minBackoffMs
+			time.Sleep(time.Duration(backoff) * time.Millisecond)
+		}
+
+		info, err := client.Discovery().ServerVersion()
+		if err == nil {
+			return info, nil
+		}
+		lastErr = err
+	}
+	return nil, lastErr
+}
diff --git a/pkg/platform/controller/cluster/cluster_controller.go b/pkg/platform/controller/cluster/cluster_controller.go
index 3764ca5e55..7f7095ebe6 100644
--- a/pkg/platform/controller/cluster/cluster_controller.go
+++ b/pkg/platform/controller/cluster/cluster_controller.go
@@ 
-615,7 +615,7 @@ func (c *Controller) checkHealth(ctx context.Context, cluster *typesv1.Cluster) healthCheckCondition.Reason = failedHealthCheckReason healthCheckCondition.Message = err.Error() } else { - version, err := client.Discovery().ServerVersion() + version, err := controllerutil.CheckClusterHealthStatus(client) if err != nil { cluster.Status.Phase = platformv1.ClusterFailed