Fix healthcheck handler to check errors in local disks only (#6184)

The healthcheck handler in the current implementation was
performing ListBuckets() to check the liveness of the Minio
service. ListBuckets(), however, does not do quorum-based
listing, so if any one of the disks returned an error, such
as an I/O error, Kubernetes would take the Minio pod down
prematurely even if the failing disk is not local to that
Minio server.
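
For context, the behaviour being replaced amounts to the
sketch below (a simplified stand-in, not the actual handler;
listBuckets here is a placeholder for the object-layer call):
any single error, local or not, turns into a 503 and
therefore a restart.

package main

import (
	"errors"
	"net/http"
)

// listBuckets stands in for objLayer.ListBuckets(); it is not
// quorum based, so an I/O error from any one disk can surface here.
func listBuckets() error {
	return errors.New("disk error: input/output error")
}

// oldLivenessHandler mirrors the previous behaviour: one failed
// ListBuckets() call marks the whole pod as dead.
func oldLivenessHandler(w http.ResponseWriter, r *http.Request) {
	if err := listBuckets(); err != nil {
		w.WriteHeader(http.StatusServiceUnavailable)
		return
	}
	w.WriteHeader(http.StatusOK)
}

func main() {
	http.HandleFunc("/health/live", oldLivenessHandler)
	http.ListenAndServe(":8080", nil)
}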

The reason is that the ListBuckets() call cannot be trusted
to provide the information we actually need: Minio is a
clustered application designed to handle disk failures, and
an error on one of the disks does not mean the pod should
become fully non-operational.

This PR fixes this by checking only the disks that are
local to each server, using a simple Stat() operation. Only
if Stat() returns an error on every disk local to a
particular server do we let Kubernetes safely take it down;
until then the server remains operational.
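
As a rough sketch of the idea (not the MinIO code itself; the
disk paths, handler name, and port below are made up for
illustration), the liveness decision boils down to a Stat()
over every locally exported path, answering 503 only when all
of them fail:

package main

import (
	"log"
	"net/http"
	"os"
)

// localDiskPaths stands in for the disks a server exports locally;
// in MinIO these come from the configured endpoints, here they are
// just example paths.
var localDiskPaths = []string{"/export/disk1", "/export/disk2"}

// livenessHandler answers 200 while at least one local disk path is
// reachable via Stat(), and 503 only when every one of them errors.
func livenessHandler(w http.ResponseWriter, r *http.Request) {
	var total, errored int
	for _, path := range localDiskPaths {
		total++
		if _, err := os.Stat(path); err != nil {
			log.Printf("liveness: %s: %v", path, err)
			errored++
		}
	}
	if total > 0 && total == errored {
		w.WriteHeader(http.StatusServiceUnavailable)
		return
	}
	w.WriteHeader(http.StatusOK)
}

func main() {
	http.HandleFunc("/health/live", livenessHandler)
	log.Fatal(http.ListenAndServe(":8080", nil))
}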
Authored by Harshavardhana, committed by kannappanr
parent 869018ad14
commit 157ed65c35
1 changed file, 31 lines changed:
cmd/healthcheck-handler.go

@@ -17,10 +17,12 @@
 package cmd
 
 import (
-	"context"
 	"fmt"
 	"net/http"
+	"os"
 	"runtime"
+
+	"github.com/minio/minio/cmd/logger"
 )
 
 const (
@@ -40,19 +42,38 @@ func ReadinessCheckHandler(w http.ResponseWriter, r *http.Request) {
 	writeResponse(w, http.StatusOK, nil, mimeNone)
 }
 
-// LivenessCheckHandler -- checks if server can ListBuckets internally. If not, server is
-// considered to have failed and needs to be restarted.
+// LivenessCheckHandler -- checks if server can reach its disks internally.
+// If not, server is considered to have failed and needs to be restarted.
 // Liveness probes are used to detect situations where application (minio)
 // has gone into a state where it can not recover except by being restarted.
 func LivenessCheckHandler(w http.ResponseWriter, r *http.Request) {
+	ctx := newContext(r, w, "LivenessCheckHandler")
+
 	objLayer := newObjectLayerFn()
 	// Service not initialized yet
 	if objLayer == nil {
 		writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone)
 		return
 	}
-	// List buckets is unsuccessful, means server is having issues, send 503 service unavailable
-	if _, err := objLayer.ListBuckets(context.Background()); err != nil {
+	var totalLocalDisks int
+	var erroredDisks int
+	for _, endpoint := range globalEndpoints {
+		// Check only if local disks are accessible, we do not have
+		// to reach to rest of the other servers in a distributed setup.
+		if endpoint.IsLocal {
+			totalLocalDisks++
+			// Attempt a stat to backend, any error resulting
+			// from this Stat() operation is considered as backend
+			// is not available, count them as errors.
+			if _, err := os.Stat(endpoint.Path); err != nil {
+				logger.LogIf(ctx, err)
+				erroredDisks++
+			}
+		}
+	}
+	// If all exported local disks have errored, we simply let kubernetes
+	// take us down.
+	if totalLocalDisks == erroredDisks {
 		writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone)
 		return
 	}
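
From the probe's side, the change is visible only in the
status code. A minimal check along these lines (the address
and the /minio/health/live path are assumptions about the
deployment, matching MinIO's defaults at the time) treats 200
as live and 503 as a signal for Kubernetes to restart the pod:

package main

import (
	"fmt"
	"net/http"
	"os"
	"time"
)

func main() {
	// Address and path mirror what a Kubernetes livenessProbe would
	// hit; adjust them to the actual deployment.
	client := &http.Client{Timeout: 3 * time.Second}
	resp, err := client.Get("http://localhost:9000/minio/health/live")
	if err != nil {
		fmt.Println("liveness probe failed:", err)
		os.Exit(1)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// A 503 here now means every local disk errored on Stat(),
		// not that a single remote disk broke a ListBuckets() call.
		fmt.Println("server not live, status:", resp.Status)
		os.Exit(1)
	}
	fmt.Println("server is live")
}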
