Add healthcheck endpoints (#5543)
This PR adds readiness and liveness endpoints to probe Minio server instance health. The endpoints are served without authentication at /minio/health/live (liveness) and /minio/health/ready (readiness). The new liveness endpoint is now used for the Docker healthcheck. Fixes #5357 Fixes #5514 master
parent
d90985b6d8
commit
10b01ac836
@ -0,0 +1,69 @@ |
|||||||
|
/* |
||||||
|
* Minio Cloud Storage, (C) 2018 Minio, Inc. |
||||||
|
* |
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||||
|
* you may not use this file except in compliance with the License. |
||||||
|
* You may obtain a copy of the License at |
||||||
|
* |
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* |
||||||
|
* Unless required by applicable law or agreed to in writing, software |
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
* See the License for the specific language governing permissions and |
||||||
|
* limitations under the License. |
||||||
|
*/ |
||||||
|
|
||||||
|
package cmd |
||||||
|
|
||||||
|
import ( |
||||||
|
"fmt" |
||||||
|
"net/http" |
||||||
|
"runtime" |
||||||
|
) |
||||||
|
|
||||||
|
// minioHealthGoroutineThreshold is the goroutine count above which the
// readiness check reports the server as unavailable.
const minioHealthGoroutineThreshold = 1000
||||||
|
|
||||||
|
// ReadinessCheckHandler -- checks if there are more than threshold number of goroutines running,
|
||||||
|
// returns service unavailable.
|
||||||
|
// Readiness probes are used to detect situations where application is under heavy load
|
||||||
|
// and temporarily unable to serve. In a orchestrated setup like Kubernetes, containers reporting
|
||||||
|
// that they are not ready do not receive traffic through Kubernetes Services.
|
||||||
|
func ReadinessCheckHandler(w http.ResponseWriter, r *http.Request) { |
||||||
|
if err := goroutineCountCheck(minioHealthGoroutineThreshold); err != nil { |
||||||
|
writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone) |
||||||
|
return |
||||||
|
} |
||||||
|
writeResponse(w, http.StatusOK, nil, mimeNone) |
||||||
|
} |
||||||
|
|
||||||
|
// LivenessCheckHandler -- checks if server can ListBuckets internally. If not, server is
|
||||||
|
// considered to have failed and needs to be restarted.
|
||||||
|
// Liveness probes are used to detect situations where application (minio)
|
||||||
|
// has gone into a state where it can not recover except by being restarted.
|
||||||
|
func LivenessCheckHandler(w http.ResponseWriter, r *http.Request) { |
||||||
|
objLayer := newObjectLayerFn() |
||||||
|
// Service not initialized yet
|
||||||
|
if objLayer == nil { |
||||||
|
writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone) |
||||||
|
return |
||||||
|
} |
||||||
|
// List buckets is unsuccessful, means server is having issues, send 503 service unavailable
|
||||||
|
if _, err := objLayer.ListBuckets(); err != nil { |
||||||
|
writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone) |
||||||
|
return |
||||||
|
} |
||||||
|
writeResponse(w, http.StatusOK, nil, mimeNone) |
||||||
|
} |
||||||
|
|
||||||
|
// goroutineCountCheck compares the number of goroutines that currently exist
// (as reported by runtime.NumGoroutine) against threshold. It returns nil
// when the count does not exceed threshold, and a descriptive error when it
// does.
func goroutineCountCheck(threshold int) error {
	running := runtime.NumGoroutine()
	if running <= threshold {
		return nil
	}
	return fmt.Errorf("too many goroutines (%d > %d)", running, threshold)
}
@ -0,0 +1,44 @@ |
|||||||
|
/* |
||||||
|
* Minio Cloud Storage, (C) 2018 Minio, Inc. |
||||||
|
* |
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||||
|
* you may not use this file except in compliance with the License. |
||||||
|
* You may obtain a copy of the License at |
||||||
|
* |
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* |
||||||
|
* Unless required by applicable law or agreed to in writing, software |
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
* See the License for the specific language governing permissions and |
||||||
|
* limitations under the License. |
||||||
|
*/ |
||||||
|
|
||||||
|
package cmd |
||||||
|
|
||||||
|
import ( |
||||||
|
"testing" |
||||||
|
"time" |
||||||
|
) |
||||||
|
|
||||||
|
func TestGoroutineCountCheck(t *testing.T) { |
||||||
|
tests := []struct { |
||||||
|
threshold int |
||||||
|
wantErr bool |
||||||
|
}{ |
||||||
|
{5000, false}, |
||||||
|
{5, true}, |
||||||
|
{6, true}, |
||||||
|
} |
||||||
|
for _, tt := range tests { |
||||||
|
// Make goroutines -- to make sure number of go-routines is higher than threshold
|
||||||
|
if tt.threshold == 5 || tt.threshold == 6 { |
||||||
|
for i := 0; i < 6; i++ { |
||||||
|
go time.Sleep(5) |
||||||
|
} |
||||||
|
} |
||||||
|
if err := goroutineCountCheck(tt.threshold); (err != nil) != tt.wantErr { |
||||||
|
t.Errorf("goroutineCountCheck() error = %v, wantErr %v", err, tt.wantErr) |
||||||
|
} |
||||||
|
} |
||||||
|
} |
@ -0,0 +1,43 @@ |
|||||||
|
/* |
||||||
|
* Minio Cloud Storage, (C) 2018 Minio, Inc. |
||||||
|
* |
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||||
|
* you may not use this file except in compliance with the License. |
||||||
|
* You may obtain a copy of the License at |
||||||
|
* |
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* |
||||||
|
* Unless required by applicable law or agreed to in writing, software |
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
* See the License for the specific language governing permissions and |
||||||
|
* limitations under the License. |
||||||
|
*/ |
||||||
|
|
||||||
|
package cmd |
||||||
|
|
||||||
|
import ( |
||||||
|
"net/http" |
||||||
|
|
||||||
|
router "github.com/gorilla/mux" |
||||||
|
) |
||||||
|
|
||||||
|
const ( |
||||||
|
healthCheckPath = "/health" |
||||||
|
healthCheckLivenessPath = "/live" |
||||||
|
healthCheckReadinessPath = "/ready" |
||||||
|
healthCheckPathPrefix = minioReservedBucketPath + healthCheckPath |
||||||
|
) |
||||||
|
|
||||||
|
// registerHealthCheckRouter - add handler functions for liveness and readiness routes.
|
||||||
|
func registerHealthCheckRouter(mux *router.Router) { |
||||||
|
|
||||||
|
// Healthcheck router
|
||||||
|
healthRouter := mux.NewRoute().PathPrefix(healthCheckPathPrefix).Subrouter() |
||||||
|
|
||||||
|
// Liveness handler
|
||||||
|
healthRouter.Methods(http.MethodGet).Path(healthCheckLivenessPath).HandlerFunc(LivenessCheckHandler) |
||||||
|
|
||||||
|
// Readiness handler
|
||||||
|
healthRouter.Methods(http.MethodGet).Path(healthCheckReadinessPath).HandlerFunc(ReadinessCheckHandler) |
||||||
|
} |
@ -0,0 +1,39 @@ |
|||||||
|
## Minio Healthcheck |
||||||
|
|
||||||
|
Minio server exposes two unauthenticated healthcheck endpoints: a liveness probe at `/minio/health/live` and a readiness probe at `/minio/health/ready`.
||||||
|
|
||||||
|
### Liveness probe |
||||||
|
This probe is used to identify situations where the server is running but may not behave optimally, e.g. sluggish responses or a corrupt backend. Such problems can *only* be fixed by a restart.
||||||
|
|
||||||
|
Internally, Minio liveness probe handler does a ListBuckets call. If successful, the server returns 200 OK, otherwise 503 Service Unavailable. |
||||||
|
|
||||||
|
When the liveness probe fails, Kubernetes-like platforms restart the container.
||||||
|
|
||||||
|
Sample configuration in a Kubernetes `yaml` file. |
||||||
|
|
||||||
|
```yaml
livenessProbe:
  httpGet:
    path: /minio/health/live
    port: 9000
  initialDelaySeconds: 10
  periodSeconds: 20
```
||||||
|
|
||||||
|
### Readiness probe |
||||||
|
This probe is used to identify situations where the server is not ready to accept requests yet. In most cases, such conditions recover in some time. |
||||||
|
|
||||||
|
Internally, the Minio readiness probe handler checks the total number of go-routines. If that number does not exceed 1000 (the threshold), the server returns 200 OK, otherwise 503 Service Unavailable.
||||||
|
|
||||||
|
Platforms like Kubernetes *do not* forward traffic to a pod until its readiness probe is successful. |
||||||
|
|
||||||
|
Sample configuration in a Kubernetes `yaml` file. |
||||||
|
|
||||||
|
```yaml
readinessProbe:
  httpGet:
    path: /minio/health/ready
    port: 9000
  initialDelaySeconds: 10
  periodSeconds: 20
```
Loading…
Reference in new issue