From 157ed65c352e40c71fe6ab91738321d95bd19b34 Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Mon, 23 Jul 2018 12:21:25 -0700 Subject: [PATCH] Fix healthcheck handler to check errors in local disks only (#6184) The healthcheck handler in the current implementation was performing ListBuckets() to check for liveness of the Minio service. The ListBuckets() implementation, on the other hand, doesn't do quorum-based listing, and if one of the disks returned an error (e.g. an I/O error) it would lead to Kubernetes taking the minio pod down prematurely, even if the disk is not local to that minio server. The reason is that the ListBuckets() call cannot be trusted to provide the information we need; Minio is a clustered application which is designed to handle disk failures. An error on one of the disks doesn't mean the pod should become fully non-operational. This PR attempts to fix this by only checking for alive disks which are local to each server, and by simply performing a Stat() operation. If Stat() returns an error on all disks local to a particular server, then we can let Kubernetes safely take it down; until then we should stay operational. --- cmd/healthcheck-handler.go | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/cmd/healthcheck-handler.go b/cmd/healthcheck-handler.go index 7fa80f2c4..73ce949ec 100644 --- a/cmd/healthcheck-handler.go +++ b/cmd/healthcheck-handler.go @@ -17,10 +17,12 @@ package cmd import ( - "context" "fmt" "net/http" + "os" "runtime" + + "github.com/minio/minio/cmd/logger" ) const ( @@ -40,19 +42,38 @@ func ReadinessCheckHandler(w http.ResponseWriter, r *http.Request) { writeResponse(w, http.StatusOK, nil, mimeNone) } -// LivenessCheckHandler -- checks if server can ListBuckets internally. If not, server is -// considered to have failed and needs to be restarted. +// LivenessCheckHandler -- checks if server can reach its disks internally. 
+// If not, server is considered to have failed and needs to be restarted. // Liveness probes are used to detect situations where application (minio) // has gone into a state where it can not recover except by being restarted. func LivenessCheckHandler(w http.ResponseWriter, r *http.Request) { + ctx := newContext(r, w, "LivenessCheckHandler") + objLayer := newObjectLayerFn() // Service not initialized yet if objLayer == nil { writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone) return } - // List buckets is unsuccessful, means server is having issues, send 503 service unavailable - if _, err := objLayer.ListBuckets(context.Background()); err != nil { + var totalLocalDisks int + var erroredDisks int + for _, endpoint := range globalEndpoints { + // Check only if local disks are accessible, we do not have + // to reach to rest of the other servers in a distributed setup. + if endpoint.IsLocal { + totalLocalDisks++ + // Attempt a stat to backend, any error resulting + // from this Stat() operation is considered as backend + // is not available, count them as errors. + if _, err := os.Stat(endpoint.Path); err != nil { + logger.LogIf(ctx, err) + erroredDisks++ + } + } + } + // If all exported local disks have errored, we simply let kubernetes + // take us down. + if totalLocalDisks == erroredDisks { writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone) return }