Updated Prometheus metrics (#11141)

* Add metrics for nodes online and offline
* Add cluster capacity metrics
* Introduce v2 metrics
master
Ritesh H Shukla 3 years ago committed by GitHub
parent 3bda8f755c
commit b4add82bb6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 32
      cmd/admin-server-info.go
  2. 8
      cmd/disk-cache-stats.go
  3. 4
      cmd/erasure-server-pool.go
  4. 4
      cmd/fs-v1.go
  5. 2
      cmd/gateway-common.go
  6. 26
      cmd/gateway-metrics.go
  7. 4
      cmd/gateway-unsupported.go
  8. 4
      cmd/gateway/azure/gateway-azure.go
  9. 4
      cmd/gateway/gcs/gateway-gcs.go
  10. 4
      cmd/gateway/s3/gateway-s3.go
  11. 4
      cmd/generic-handlers.go
  12. 14
      cmd/http-stats.go
  13. 13
      cmd/metrics-router.go
  14. 1187
      cmd/metrics-v2.go
  15. 127
      cmd/metrics.go
  16. 54
      cmd/notification-summary.go
  17. 68
      cmd/notification.go
  18. 9
      cmd/object-api-interface.go
  19. 23
      cmd/peer-rest-client.go
  20. 1
      cmd/peer-rest-common.go
  21. 32
      cmd/peer-rest-server.go
  22. 11
      docs/metrics/README.md
  23. 202
      docs/metrics/prometheus/README.md
  24. 47
      docs/metrics/prometheus/list.md
  25. 3
      go.mod
  26. 6
      go.sum
  27. 28
      pkg/madmin/health.go

@ -69,3 +69,35 @@ func getLocalServerProperty(endpointServerPools EndpointServerPools, r *http.Req
Disks: storageInfo.Disks,
}
}
// getLocalDisks returns the disk information of every endpoint in
// endpointServerPools that is flagged as local to this node.
//
// Remote endpoints are ignored. Earlier revisions also probed each remote
// host with isServerResolvable and recorded the outcome in a map that was
// never read; that dead work (one network round trip per remote host) has
// been removed because it cannot influence the returned disks.
func getLocalDisks(endpointServerPools EndpointServerPools) []madmin.Disk {
	var localEndpoints Endpoints
	for _, ep := range endpointServerPools {
		for _, endpoint := range ep.Endpoints {
			if endpoint.IsLocal {
				localEndpoints = append(localEndpoints, endpoint)
			}
		}
	}

	// The error results of both helpers are discarded on purpose: this is a
	// best-effort report and whatever disk information could be gathered is
	// returned as-is.
	localDisks, _ := initStorageDisksWithErrors(localEndpoints)
	defer closeStorageDisks(localDisks)

	storageInfo, _ := getStorageInfo(localDisks, localEndpoints.GetAllStrings())
	return storageInfo.Disks
}

@ -34,6 +34,14 @@ type CacheDiskStats struct {
Dir string
}
// GetUsageLevelString renders the cache usage level as text. UsageState is
// read atomically; a value of 0 maps to "low" and every other value maps
// to "high".
func (c *CacheDiskStats) GetUsageLevelString() (u string) {
	switch atomic.LoadInt32(&c.UsageState) {
	case 0:
		return "low"
	default:
		return "high"
	}
}
// CacheStats - represents bytes served from cache,
// cache hits and cache misses.
type CacheStats struct {

@ -1377,9 +1377,9 @@ func (z *erasureServerPools) HealObject(ctx context.Context, bucket, object, ver
}
// GetMetrics - no op
func (z *erasureServerPools) GetMetrics(ctx context.Context) (*Metrics, error) {
func (z *erasureServerPools) GetMetrics(ctx context.Context) (*BackendMetrics, error) {
logger.LogIf(ctx, NotImplemented{})
return &Metrics{}, NotImplemented{}
return &BackendMetrics{}, NotImplemented{}
}
func (z *erasureServerPools) getZoneAndSet(id string) (int, int, error) {

@ -1554,9 +1554,9 @@ func (fs *FSObjects) HealObjects(ctx context.Context, bucket, prefix string, opt
}
// GetMetrics - no op
func (fs *FSObjects) GetMetrics(ctx context.Context) (*Metrics, error) {
func (fs *FSObjects) GetMetrics(ctx context.Context) (*BackendMetrics, error) {
logger.LogIf(ctx, NotImplemented{})
return &Metrics{}, NotImplemented{}
return &BackendMetrics{}, NotImplemented{}
}
// ListObjectsV2 lists all blobs in bucket filtered by prefix

@ -389,7 +389,7 @@ func shouldMeterRequest(req *http.Request) bool {
// MetricsTransport is a custom wrapper around Transport to track metrics
type MetricsTransport struct {
Transport *http.Transport
Metrics *Metrics
Metrics *BackendMetrics
}
// RoundTrip implements the RoundTrip method for MetricsTransport

@ -29,36 +29,28 @@ type RequestStats struct {
Post uint64 `json:"Post"`
}
// Metrics - represents bytes served from backend
// only implemented for S3 Gateway
type Metrics struct {
bytesReceived uint64
bytesSent uint64
requestStats RequestStats
}
// IncBytesReceived - Increase total bytes received from gateway backend
func (s *Metrics) IncBytesReceived(n uint64) {
func (s *BackendMetrics) IncBytesReceived(n uint64) {
atomic.AddUint64(&s.bytesReceived, n)
}
// GetBytesReceived - Get total bytes received from gateway backend
func (s *Metrics) GetBytesReceived() uint64 {
func (s *BackendMetrics) GetBytesReceived() uint64 {
return atomic.LoadUint64(&s.bytesReceived)
}
// IncBytesSent - Increase total bytes sent to gateway backend
func (s *Metrics) IncBytesSent(n uint64) {
func (s *BackendMetrics) IncBytesSent(n uint64) {
atomic.AddUint64(&s.bytesSent, n)
}
// GetBytesSent - Get total bytes received from gateway backend
func (s *Metrics) GetBytesSent() uint64 {
func (s *BackendMetrics) GetBytesSent() uint64 {
return atomic.LoadUint64(&s.bytesSent)
}
// IncRequests - Increase request count sent to gateway backend by 1
func (s *Metrics) IncRequests(method string) {
func (s *BackendMetrics) IncRequests(method string) {
// Only increment for Head & Get requests, else no op
if method == http.MethodGet {
atomic.AddUint64(&s.requestStats.Get, 1)
@ -72,11 +64,11 @@ func (s *Metrics) IncRequests(method string) {
}
// GetRequests - Get total number of Get & Headrequests sent to gateway backend
func (s *Metrics) GetRequests() RequestStats {
func (s *BackendMetrics) GetRequests() RequestStats {
return s.requestStats
}
// NewMetrics - Prepare new Metrics structure
func NewMetrics() *Metrics {
return &Metrics{}
// NewMetrics - Prepare new BackendMetrics structure
func NewMetrics() *BackendMetrics {
return &BackendMetrics{}
}

@ -202,9 +202,9 @@ func (a GatewayUnsupported) CopyObject(ctx context.Context, srcBucket string, sr
}
// GetMetrics - no op
func (a GatewayUnsupported) GetMetrics(ctx context.Context) (*Metrics, error) {
func (a GatewayUnsupported) GetMetrics(ctx context.Context) (*BackendMetrics, error) {
logger.LogIf(ctx, NotImplemented{})
return &Metrics{}, NotImplemented{}
return &BackendMetrics{}, NotImplemented{}
}
// PutObjectTags - not implemented.

@ -419,7 +419,7 @@ type azureObjects struct {
minio.GatewayUnsupported
endpoint *url.URL
httpClient *http.Client
metrics *minio.Metrics
metrics *minio.BackendMetrics
client azblob.ServiceURL // Azure sdk client
}
@ -533,7 +533,7 @@ func parseAzurePart(metaPartFileName, prefix string) (partID int, err error) {
}
// GetMetrics returns this gateway's metrics
func (a *azureObjects) GetMetrics(ctx context.Context) (*minio.Metrics, error) {
func (a *azureObjects) GetMetrics(ctx context.Context) (*minio.BackendMetrics, error) {
return a.metrics, nil
}

@ -341,7 +341,7 @@ type gcsGateway struct {
minio.GatewayUnsupported
client *storage.Client
httpClient *http.Client
metrics *minio.Metrics
metrics *minio.BackendMetrics
projectID string
}
@ -359,7 +359,7 @@ func gcsParseProjectID(credsFile string) (projectID string, err error) {
}
// GetMetrics returns this gateway's metrics
func (l *gcsGateway) GetMetrics(ctx context.Context) (*minio.Metrics, error) {
func (l *gcsGateway) GetMetrics(ctx context.Context) (*minio.BackendMetrics, error) {
return l.metrics, nil
}

@ -259,11 +259,11 @@ type s3Objects struct {
minio.GatewayUnsupported
Client *miniogo.Core
HTTPClient *http.Client
Metrics *minio.Metrics
Metrics *minio.BackendMetrics
}
// GetMetrics returns this gateway's metrics
func (l *s3Objects) GetMetrics(ctx context.Context) (*minio.Metrics, error) {
func (l *s3Objects) GetMetrics(ctx context.Context) (*minio.BackendMetrics, error) {
return l.Metrics, nil
}

@ -228,7 +228,9 @@ func guessIsMetricsReq(req *http.Request) bool {
}
aType := getRequestAuthType(req)
return (aType == authTypeAnonymous || aType == authTypeJWT) &&
req.URL.Path == minioReservedBucketPath+prometheusMetricsPath
req.URL.Path == minioReservedBucketPath+prometheusMetricsPathLegacy ||
req.URL.Path == minioReservedBucketPath+prometheusMetricsV2ClusterPath ||
req.URL.Path == minioReservedBucketPath+prometheusMetricsV2NodePath
}
// guessIsRPCReq - returns true if the request is for an RPC endpoint.

@ -79,10 +79,10 @@ func (s *ConnStats) getS3OutputBytes() uint64 {
// Return connection stats (total input/output bytes and total s3 input/output bytes)
func (s *ConnStats) toServerConnStats() ServerConnStats {
return ServerConnStats{
TotalInputBytes: s.getTotalInputBytes(),
TotalOutputBytes: s.getTotalOutputBytes(),
S3InputBytes: s.getS3InputBytes(),
S3OutputBytes: s.getS3OutputBytes(),
TotalInputBytes: s.getTotalInputBytes(), // Traffic including reserved bucket
TotalOutputBytes: s.getTotalOutputBytes(), // Traffic including reserved bucket
S3InputBytes: s.getS3InputBytes(), // Traffic for client buckets
S3OutputBytes: s.getS3OutputBytes(), // Traffic for client buckets
}
}
@ -163,9 +163,11 @@ func (st *HTTPStats) toServerHTTPStats() ServerHTTPStats {
// Update statistics from http request and response data
func (st *HTTPStats) updateStats(api string, r *http.Request, w *logger.ResponseWriter) {
// A successful request has a 2xx response code
successReq := (w.StatusCode >= 200 && w.StatusCode < 300)
successReq := w.StatusCode >= 200 && w.StatusCode < 300
if !strings.HasSuffix(r.URL.Path, prometheusMetricsPath) {
if !strings.HasSuffix(r.URL.Path, prometheusMetricsPathLegacy) ||
!strings.HasSuffix(r.URL.Path, prometheusMetricsV2ClusterPath) ||
!strings.HasSuffix(r.URL.Path, prometheusMetricsV2NodePath) {
st.totalS3Requests.Inc(api)
if !successReq && w.StatusCode != 0 {
st.totalS3Errors.Inc(api)

@ -24,7 +24,9 @@ import (
)
const (
prometheusMetricsPath = "/prometheus/metrics"
prometheusMetricsPathLegacy = "/prometheus/metrics"
prometheusMetricsV2ClusterPath = "/v2/metrics/cluster"
prometheusMetricsV2NodePath = "/v2/metrics/node"
)
// Standard env prometheus auth type
@ -43,14 +45,17 @@ const (
func registerMetricsRouter(router *mux.Router) {
// metrics router
metricsRouter := router.NewRoute().PathPrefix(minioReservedBucketPath).Subrouter()
authType := strings.ToLower(os.Getenv(EnvPrometheusAuthType))
switch prometheusAuthType(authType) {
case prometheusPublic:
metricsRouter.Handle(prometheusMetricsPath, metricsHandler())
metricsRouter.Handle(prometheusMetricsPathLegacy, metricsHandler())
metricsRouter.Handle(prometheusMetricsV2ClusterPath, metricsServerHandler())
metricsRouter.Handle(prometheusMetricsV2NodePath, metricsNodeHandler())
case prometheusJWT:
fallthrough
default:
metricsRouter.Handle(prometheusMetricsPath, AuthMiddleware(metricsHandler()))
metricsRouter.Handle(prometheusMetricsPathLegacy, AuthMiddleware(metricsHandler()))
metricsRouter.Handle(prometheusMetricsV2ClusterPath, AuthMiddleware(metricsServerHandler()))
metricsRouter.Handle(prometheusMetricsV2NodePath, AuthMiddleware(metricsNodeHandler()))
}
}

File diff suppressed because it is too large Load Diff

@ -51,6 +51,17 @@ var (
)
)
// Namespace components handed to prometheus.BuildFQName as the leading
// element of the metric names built in this file.
const (
// Namespace of the background healing metrics.
healMetricsNamespace = "self_heal"
// Namespace of the gateway backend traffic and request metrics.
gatewayNamespace = "gateway"
// Namespace of the disk cache metrics.
cacheNamespace = "cache"
// Namespace of the S3 request, error and traffic metrics.
s3Namespace = "s3"
// Namespace of the per-bucket usage metrics.
bucketNamespace = "bucket"
// Namespace of the node, capacity and disk-count metrics.
minioNamespace = "minio"
// Namespace of the per-disk storage metrics.
diskNamespace = "disk"
// Namespace of the traffic metrics between peer nodes.
interNodeNamespace = "internode"
)
func init() {
prometheus.MustRegister(httpRequestsDuration)
prometheus.MustRegister(newMinioCollector())
@ -81,9 +92,10 @@ func (c *minioCollector) Describe(ch chan<- *prometheus.Desc) {
func (c *minioCollector) Collect(ch chan<- prometheus.Metric) {
// Expose MinIO's version information
minioVersionInfo.WithLabelValues(Version, CommitID).Set(float64(1.0))
minioVersionInfo.WithLabelValues(Version, CommitID).Set(1.0)
storageMetricsPrometheus(ch)
nodeHealthMetricsPrometheus(ch)
bucketUsageMetricsPrometheus(ch)
networkMetricsPrometheus(ch)
httpMetricsPrometheus(ch)
@ -92,6 +104,26 @@ func (c *minioCollector) Collect(ch chan<- prometheus.Metric) {
healingMetricsPrometheus(ch)
}
// nodeHealthMetricsPrometheus sends two gauges to ch: the number of nodes
// reported online and the number reported offline by GetPeerOnlineCount.
func nodeHealthMetricsPrometheus(ch chan<- prometheus.Metric) {
	online, offline := GetPeerOnlineCount()

	// Gauge: nodes currently online.
	onlineDesc := prometheus.NewDesc(
		prometheus.BuildFQName(minioNamespace, "nodes", "online"),
		"Total number of MinIO nodes online",
		nil, nil)
	ch <- prometheus.MustNewConstMetric(onlineDesc, prometheus.GaugeValue, float64(online))

	// Gauge: nodes currently offline.
	offlineDesc := prometheus.NewDesc(
		prometheus.BuildFQName(minioNamespace, "nodes", "offline"),
		"Total number of MinIO nodes offline",
		nil, nil)
	ch <- prometheus.MustNewConstMetric(offlineDesc, prometheus.GaugeValue, float64(offline))
}
// collects healing specific metrics for MinIO instance in Prometheus specific format
// and sends to given channel
func healingMetricsPrometheus(ch chan<- prometheus.Metric) {
@ -102,7 +134,6 @@ func healingMetricsPrometheus(ch chan<- prometheus.Metric) {
if !exists {
return
}
healMetricsNamespace := "self_heal"
var dur time.Duration
if !bgSeq.lastHealActivity.IsZero() {
@ -172,7 +203,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("gateway", globalGatewayName, "bytes_received"),
prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "bytes_received"),
"Total number of bytes received by current MinIO Gateway "+globalGatewayName+" backend",
nil, nil),
prometheus.CounterValue,
@ -180,7 +211,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) {
)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("gateway", globalGatewayName, "bytes_sent"),
prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "bytes_sent"),
"Total number of bytes sent by current MinIO Gateway to "+globalGatewayName+" backend",
nil, nil),
prometheus.CounterValue,
@ -189,7 +220,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) {
s := m.GetRequests()
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("gateway", globalGatewayName, "requests"),
prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "requests"),
"Total number of requests made to "+globalGatewayName+" by current MinIO Gateway",
[]string{"method"}, nil),
prometheus.CounterValue,
@ -198,7 +229,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) {
)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("gateway", globalGatewayName, "requests"),
prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "requests"),
"Total number of requests made to "+globalGatewayName+" by current MinIO Gateway",
[]string{"method"}, nil),
prometheus.CounterValue,
@ -207,7 +238,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) {
)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("gateway", globalGatewayName, "requests"),
prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "requests"),
"Total number of requests made to "+globalGatewayName+" by current MinIO Gateway",
[]string{"method"}, nil),
prometheus.CounterValue,
@ -216,7 +247,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) {
)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("gateway", globalGatewayName, "requests"),
prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "requests"),
"Total number of requests made to "+globalGatewayName+" by current MinIO Gateway",
[]string{"method"}, nil),
prometheus.CounterValue,
@ -236,7 +267,7 @@ func cacheMetricsPrometheus(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("cache", "hits", "total"),
prometheus.BuildFQName(cacheNamespace, "hits", "total"),
"Total number of disk cache hits in current MinIO instance",
nil, nil),
prometheus.CounterValue,
@ -244,7 +275,7 @@ func cacheMetricsPrometheus(ch chan<- prometheus.Metric) {
)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("cache", "misses", "total"),
prometheus.BuildFQName(cacheNamespace, "misses", "total"),
"Total number of disk cache misses in current MinIO instance",
nil, nil),
prometheus.CounterValue,
@ -252,7 +283,7 @@ func cacheMetricsPrometheus(ch chan<- prometheus.Metric) {
)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("cache", "data", "served"),
prometheus.BuildFQName(cacheNamespace, "data", "served"),
"Total number of bytes served from cache of current MinIO instance",
nil, nil),
prometheus.CounterValue,
@ -262,7 +293,7 @@ func cacheMetricsPrometheus(ch chan<- prometheus.Metric) {
// Cache disk usage percentage
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("cache", "usage", "percent"),
prometheus.BuildFQName(cacheNamespace, "usage", "percent"),
"Total percentage cache usage",
[]string{"disk"}, nil),
prometheus.GaugeValue,
@ -271,7 +302,7 @@ func cacheMetricsPrometheus(ch chan<- prometheus.Metric) {
)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("cache", "usage", "high"),
prometheus.BuildFQName(cacheNamespace, "usage", "high"),
"Indicates cache usage is high or low, relative to current cache 'quota' settings",
[]string{"disk"}, nil),
prometheus.GaugeValue,
@ -309,7 +340,7 @@ func httpMetricsPrometheus(ch chan<- prometheus.Metric) {
for api, value := range httpStats.CurrentS3Requests.APIStats {
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("s3", "requests", "current"),
prometheus.BuildFQName(s3Namespace, "requests", "current"),
"Total number of running s3 requests in current MinIO server instance",
[]string{"api"}, nil),
prometheus.CounterValue,
@ -321,7 +352,7 @@ func httpMetricsPrometheus(ch chan<- prometheus.Metric) {
for api, value := range httpStats.TotalS3Requests.APIStats {
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("s3", "requests", "total"),
prometheus.BuildFQName(s3Namespace, "requests", "total"),
"Total number of s3 requests in current MinIO server instance",
[]string{"api"}, nil),
prometheus.CounterValue,
@ -333,7 +364,7 @@ func httpMetricsPrometheus(ch chan<- prometheus.Metric) {
for api, value := range httpStats.TotalS3Errors.APIStats {
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("s3", "errors", "total"),
prometheus.BuildFQName(s3Namespace, "errors", "total"),
"Total number of s3 errors in current MinIO server instance",
[]string{"api"}, nil),
prometheus.CounterValue,
@ -351,7 +382,7 @@ func networkMetricsPrometheus(ch chan<- prometheus.Metric) {
// Network Sent/Received Bytes (internode)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("internode", "tx", "bytes_total"),
prometheus.BuildFQName(interNodeNamespace, "tx", "bytes_total"),
"Total number of bytes sent to the other peer nodes by current MinIO server instance",
nil, nil),
prometheus.CounterValue,
@ -360,7 +391,7 @@ func networkMetricsPrometheus(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("internode", "rx", "bytes_total"),
prometheus.BuildFQName(interNodeNamespace, "rx", "bytes_total"),
"Total number of internode bytes received by current MinIO server instance",
nil, nil),
prometheus.CounterValue,
@ -370,7 +401,7 @@ func networkMetricsPrometheus(ch chan<- prometheus.Metric) {
// Network Sent/Received Bytes (Outbound)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("s3", "tx", "bytes_total"),
prometheus.BuildFQName(s3Namespace, "tx", "bytes_total"),
"Total number of s3 bytes sent by current MinIO server instance",
nil, nil),
prometheus.CounterValue,
@ -379,7 +410,7 @@ func networkMetricsPrometheus(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("s3", "rx", "bytes_total"),
prometheus.BuildFQName(s3Namespace, "rx", "bytes_total"),
"Total number of s3 bytes received by current MinIO server instance",
nil, nil),
prometheus.CounterValue,
@ -414,7 +445,7 @@ func bucketUsageMetricsPrometheus(ch chan<- prometheus.Metric) {
// Total space used by bucket
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("bucket", "usage", "size"),
prometheus.BuildFQName(bucketNamespace, "usage", "size"),
"Total bucket size",
[]string{"bucket"}, nil),
prometheus.GaugeValue,
@ -423,7 +454,7 @@ func bucketUsageMetricsPrometheus(ch chan<- prometheus.Metric) {
)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("bucket", "objects", "count"),
prometheus.BuildFQName(bucketNamespace, "objects", "count"),
"Total number of objects in a bucket",
[]string{"bucket"}, nil),
prometheus.GaugeValue,
@ -469,7 +500,7 @@ func bucketUsageMetricsPrometheus(ch chan<- prometheus.Metric) {
for k, v := range usageInfo.ObjectSizesHistogram {
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("bucket", "objects", "histogram"),
prometheus.BuildFQName(bucketNamespace, "objects", "histogram"),
"Total number of objects of different sizes in a bucket",
[]string{"bucket", "object_size"}, nil),
prometheus.GaugeValue,
@ -497,10 +528,50 @@ func storageMetricsPrometheus(ch chan<- prometheus.Metric) {
onlineDisks, offlineDisks := getOnlineOfflineDisksStats(server.Disks)
totalDisks := offlineDisks.Merge(onlineDisks)
// Report total capacity
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(minioNamespace, "capacity_raw", "total"),
"Total capacity online in the cluster",
nil, nil),
prometheus.GaugeValue,
float64(GetTotalCapacity(GlobalContext)),
)
// Report total capacity free
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(minioNamespace, "capacity_raw_free", "total"),
"Total free capacity online in the cluster",
nil, nil),
prometheus.GaugeValue,
float64(GetTotalCapacityFree(GlobalContext)),
)
s, _ := objLayer.StorageInfo(GlobalContext)
// Report total usable capacity
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(minioNamespace, "capacity_usable", "total"),
"Total usable capacity online in the cluster",
nil, nil),
prometheus.GaugeValue,
GetTotalUsableCapacity(GlobalContext, s),
)
// Report total usable capacity free
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(minioNamespace, "capacity_usable_free", "total"),
"Total free usable capacity online in the cluster",
nil, nil),
prometheus.GaugeValue,
GetTotalUsableCapacityFree(GlobalContext, s),
)
// MinIO Offline Disks per node
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("minio", "disks", "offline"),
prometheus.BuildFQName(minioNamespace, "disks", "offline"),
"Total number of offline disks in current MinIO server instance",
nil, nil),
prometheus.GaugeValue,
@ -510,7 +581,7 @@ func storageMetricsPrometheus(ch chan<- prometheus.Metric) {
// MinIO Total Disks per node
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("minio", "disks", "total"),
prometheus.BuildFQName(minioNamespace, "disks", "total"),
"Total number of disks for current MinIO server instance",
nil, nil),
prometheus.GaugeValue,
@ -521,7 +592,7 @@ func storageMetricsPrometheus(ch chan<- prometheus.Metric) {
// Total disk usage by the disk
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("disk", "storage", "used"),
prometheus.BuildFQName(diskNamespace, "storage", "used"),
"Total disk storage used on the disk",
[]string{"disk"}, nil),
prometheus.GaugeValue,
@ -532,7 +603,7 @@ func storageMetricsPrometheus(ch chan<- prometheus.Metric) {
// Total available space in the disk
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("disk", "storage", "available"),
prometheus.BuildFQName(diskNamespace, "storage", "available"),
"Total available space left on the disk",
[]string{"disk"}, nil),
prometheus.GaugeValue,
@ -543,7 +614,7 @@ func storageMetricsPrometheus(ch chan<- prometheus.Metric) {
// Total storage space of the disk
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("disk", "storage", "total"),
prometheus.BuildFQName(diskNamespace, "storage", "total"),
"Total space on the disk",
[]string{"disk"}, nil),
prometheus.GaugeValue,

@ -0,0 +1,54 @@
/*
* MinIO Cloud Storage, (C) 2020 MinIO, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package cmd
import (
"context"
)
// GetTotalCapacity sums the total capacity reported by every entry of the
// disk hardware information gathered through globalNotificationSys.
func GetTotalCapacity(ctx context.Context) (capacity uint64) {
	for _, hwInfo := range globalNotificationSys.DiskHwInfo(ctx) {
		capacity += hwInfo.GetTotalCapacity()
	}
	return capacity
}
// GetTotalUsableCapacity gets the total usable capacity in the cluster.
//
// The raw capacity from GetTotalCapacity is scaled by the share of data
// drives in the standard storage class: data / (data + parity).
//
// When the backend reports neither data nor parity drives the ratio would be
// 0/0, which is NaN in floating point and would be exported as a NaN gauge.
// In that case no parity overhead can be derived, so the raw capacity is
// returned unscaled.
func GetTotalUsableCapacity(ctx context.Context, s StorageInfo) (capacity float64) {
	raw := GetTotalCapacity(ctx)
	drives := s.Backend.StandardSCData + s.Backend.StandardSCParity
	if drives == 0 {
		return float64(raw)
	}
	ratio := float64(s.Backend.StandardSCData) / float64(drives)
	return float64(raw) * ratio
}
// GetTotalCapacityFree sums the free capacity reported by every entry of the
// disk hardware information gathered through globalNotificationSys.
func GetTotalCapacityFree(ctx context.Context) (capacity uint64) {
	for _, hwInfo := range globalNotificationSys.DiskHwInfo(ctx) {
		capacity += hwInfo.GetTotalFreeCapacity()
	}
	return capacity
}
// GetTotalUsableCapacityFree gets the total usable capacity free in the cluster.
//
// The raw free capacity from GetTotalCapacityFree is scaled by the share of
// data drives in the standard storage class: data / (data + parity).
//
// When the backend reports neither data nor parity drives the ratio would be
// 0/0, which is NaN in floating point and would be exported as a NaN gauge.
// In that case no parity overhead can be derived, so the raw free capacity
// is returned unscaled.
func GetTotalUsableCapacityFree(ctx context.Context, s StorageInfo) (capacity float64) {
	raw := GetTotalCapacityFree(ctx)
	drives := s.Backend.StandardSCData + s.Backend.StandardSCParity
	if drives == 0 {
		return float64(raw)
	}
	ratio := float64(s.Backend.StandardSCData) / float64(drives)
	return float64(raw) * ratio
}

@ -51,8 +51,8 @@ type NotificationSys struct {
targetResCh chan event.TargetIDResult
bucketRulesMap map[string]event.RulesMap
bucketRemoteTargetRulesMap map[string]map[event.TargetID]event.RulesMap
peerClients []*peerRESTClient
allPeerClients []*peerRESTClient
peerClients []*peerRESTClient // Excludes self
allPeerClients []*peerRESTClient // Includes nil client for self
}
// GetARNList - returns available ARNs.
@ -1294,6 +1294,21 @@ func NewNotificationSys(endpoints EndpointServerPools) *NotificationSys {
}
}
// GetPeerOnlineCount tallies online and offline nodes. The local node is
// counted as online up front; each entry returned by
// globalNotificationSys.ServerInfo() then counts as online when its State is
// "ok" and as offline otherwise.
func GetPeerOnlineCount() (nodesOnline, nodesOffline int) {
	nodesOnline = 1 // Self is always online.
	for _, peer := range globalNotificationSys.ServerInfo() {
		if peer.State != "ok" {
			nodesOffline++
		} else {
			nodesOnline++
		}
	}
	return nodesOnline, nodesOffline
}
type eventArgs struct {
EventName event.Name
BucketName string
@ -1428,3 +1443,52 @@ func (sys *NotificationSys) GetBandwidthReports(ctx context.Context, buckets ...
}
return consolidatedReport
}
// GetClusterMetrics - gets the cluster metrics from all nodes excluding self.
//
// A metrics stream is requested from every non-nil peer client in parallel.
// Each stream that opened successfully is forwarded onto the returned
// channel; peers whose request failed are logged once and skipped. The
// returned channel is closed after every forwarder has finished, either
// because its peer stream ended or because ctx was cancelled.
func (sys *NotificationSys) GetClusterMetrics(ctx context.Context) chan Metric {
	g := errgroup.WithNErrs(len(sys.peerClients))
	peerChannels := make([]<-chan Metric, len(sys.peerClients))
	for index := range sys.peerClients {
		if sys.peerClients[index] == nil {
			continue
		}
		index := index
		g.Go(func() error {
			var err error
			peerChannels[index], err = sys.peerClients[index].GetPeerMetrics(ctx)
			return err
		}, index)
	}

	ch := make(chan Metric)
	var wg sync.WaitGroup
	for index, err := range g.Wait() {
		// Slots skipped above have neither a client nor a channel; touching
		// them would dereference a nil client.
		if sys.peerClients[index] == nil {
			continue
		}
		peerHost := sys.peerClients[index].host.String()
		reqInfo := (&logger.ReqInfo{}).AppendTags("peerAddress", peerHost)
		peerCtx := logger.SetReqInfo(ctx, reqInfo)
		if err != nil {
			logger.LogOnceIf(peerCtx, err, peerHost)
			continue
		}
		wg.Add(1)
		go func(ctx context.Context, peerChannel <-chan Metric, wg *sync.WaitGroup) {
			defer wg.Done()
			for {
				select {
				case m, ok := <-peerChannel:
					if !ok {
						return
					}
					// Forward the metric, but give up if the consumer has
					// gone away so this goroutine cannot block forever.
					select {
					case ch <- m:
					case <-ctx.Done():
						return
					}
				case <-ctx.Done():
					return
				}
			}
		}(peerCtx, peerChannels[index], &wg)
	}

	// Close the merged channel only after all forwarders are done.
	go func(wg *sync.WaitGroup, ch chan Metric) {
		wg.Wait()
		close(ch)
	}(&wg, ch)
	return ch
}

@ -72,6 +72,13 @@ const (
writeLock
)
// BackendMetrics - represents bytes served from backend
// along with the requests sent to it. The counters are unexported and are
// read and updated through the accessor methods declared on this type.
type BackendMetrics struct {
// Total bytes received from the gateway backend; updated and read
// atomically by IncBytesReceived and GetBytesReceived.
bytesReceived uint64
// Total bytes sent to the gateway backend; updated and read atomically
// by IncBytesSent and GetBytesSent.
bytesSent uint64
// Per-method count of requests sent to the gateway backend; incremented
// by IncRequests and returned by GetRequests.
requestStats RequestStats
}
// ObjectLayer implements primitives for object API layer.
type ObjectLayer interface {
SetDriveCount() int // Only implemented by erasure layer
@ -143,7 +150,7 @@ type ObjectLayer interface {
IsCompressionSupported() bool
// Backend related metrics
GetMetrics(ctx context.Context) (*Metrics, error)
GetMetrics(ctx context.Context) (*BackendMetrics, error)
// Returns health of the backend
Health(ctx context.Context, opts HealthOptions) HealthResult

@ -749,7 +749,7 @@ func (client *peerRESTClient) doListen(listenCh chan interface{}, doneCh <-chan
dec := gob.NewDecoder(respBody)
for {
var ev event.Event
if err = dec.Decode(&ev); err != nil {
if err := dec.Decode(&ev); err != nil {
return
}
if len(ev.EventVersion) > 0 {
@ -906,3 +906,24 @@ func (client *peerRESTClient) MonitorBandwidth(ctx context.Context, buckets []st
err = dec.Decode(&bandwidthReport)
return &bandwidthReport, err
}
// GetPeerMetrics requests the metrics stream of this peer and returns a
// channel delivering the decoded metrics.
//
// The response body is gob-decoded in a background goroutine, one Metric at
// a time. The goroutine stops when decoding fails (including end of stream)
// or when ctx is cancelled; on exit it drains the response body and closes
// the channel. An error is returned only when the initial call fails.
func (client *peerRESTClient) GetPeerMetrics(ctx context.Context) (<-chan Metric, error) {
	respBody, err := client.callWithContext(ctx, peerRESTMethodGetPeerMetrics, nil, nil, -1)
	if err != nil {
		return nil, err
	}
	dec := gob.NewDecoder(respBody)
	ch := make(chan Metric)
	go func(ch chan<- Metric) {
		// Release the response body and signal completion on every exit path.
		defer func() {
			http.DrainBody(respBody)
			close(ch)
		}()
		for {
			var metric Metric
			if err := dec.Decode(&metric); err != nil {
				return
			}
			// Do not block forever if the receiver stopped reading.
			select {
			case ch <- metric:
			case <-ctx.Done():
				return
			}
		}
	}(ch)
	return ch, nil
}

@ -58,6 +58,7 @@ const (
peerRESTMethodGetBandwidth = "/bandwidth"
peerRESTMethodGetMetacacheListing = "/getmetacache"
peerRESTMethodUpdateMetacacheListing = "/updatemetacache"
peerRESTMethodGetPeerMetrics = "/peermetrics"
)
const (

@ -801,7 +801,7 @@ func (s *peerRESTServer) SignalServiceHandler(w http.ResponseWriter, r *http.Req
// ListenHandler sends http trace messages back to peer rest client
func (s *peerRESTServer) ListenHandler(w http.ResponseWriter, r *http.Request) {
if !s.IsValid(w, r) {
s.writeErrorResponse(w, errors.New("Invalid request"))
s.writeErrorResponse(w, errors.New("invalid request"))
return
}
@ -809,7 +809,7 @@ func (s *peerRESTServer) ListenHandler(w http.ResponseWriter, r *http.Request) {
var prefix string
if len(values[peerRESTListenPrefix]) > 1 {
s.writeErrorResponse(w, errors.New("Invalid request"))
s.writeErrorResponse(w, errors.New("invalid request"))
return
}
@ -824,7 +824,7 @@ func (s *peerRESTServer) ListenHandler(w http.ResponseWriter, r *http.Request) {
var suffix string
if len(values[peerRESTListenSuffix]) > 1 {
s.writeErrorResponse(w, errors.New("Invalid request"))
s.writeErrorResponse(w, errors.New("invalid request"))
return
}
@ -1004,7 +1004,7 @@ func (s *peerRESTServer) IsValid(w http.ResponseWriter, r *http.Request) bool {
// GetBandwidth gets the bandwidth for the buckets requested.
func (s *peerRESTServer) GetBandwidth(w http.ResponseWriter, r *http.Request) {
if !s.IsValid(w, r) {
s.writeErrorResponse(w, errors.New("Invalid request"))
s.writeErrorResponse(w, errors.New("invalid request"))
return
}
bucketsString := r.URL.Query().Get("buckets")
@ -1025,6 +1025,29 @@ func (s *peerRESTServer) GetBandwidth(w http.ResponseWriter, r *http.Request) {
w.(http.Flusher).Flush()
}
// GetPeerMetrics gets the metrics to be federated across peers.
//
// After validating the request it answers 200 OK and streams every metric
// produced by ReportMetrics(r.Context(), GetGeneratorsForPeer) to the caller
// as consecutive gob-encoded values, flushing once the stream is complete.
func (s *peerRESTServer) GetPeerMetrics(w http.ResponseWriter, r *http.Request) {
	if !s.IsValid(w, r) {
		s.writeErrorResponse(w, errors.New("invalid request"))
		// Stop here: an invalid request must not receive the metrics stream.
		return
	}
	w.WriteHeader(http.StatusOK)
	w.(http.Flusher).Flush()

	enc := gob.NewEncoder(w)
	for m := range ReportMetrics(r.Context(), GetGeneratorsForPeer) {
		if err := enc.Encode(m); err != nil {
			s.writeErrorResponse(w, errors.New("Encoding metric failed: "+err.Error()))
			return
		}
	}
	w.(http.Flusher).Flush()
}
// registerPeerRESTHandlers - register peer rest router.
func registerPeerRESTHandlers(router *mux.Router) {
server := &peerRESTServer{}
@ -1064,4 +1087,5 @@ func registerPeerRESTHandlers(router *mux.Router) {
subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodGetBandwidth).HandlerFunc(httpTraceHdrs(server.GetBandwidth))
subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodGetMetacacheListing).HandlerFunc(httpTraceHdrs(server.GetMetacacheListingHandler))
subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodUpdateMetacacheListing).HandlerFunc(httpTraceHdrs(server.UpdateMetacacheListingHandler))
subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodGetPeerMetrics).HandlerFunc(httpTraceHdrs(server.GetPeerMetrics))
}

@ -13,8 +13,15 @@ Read more on how to use these endpoints in [MinIO healthcheck guide](https://git
### Prometheus Probe
MinIO server exposes Prometheus compatible data on a single endpoint. By default, the endpoint is authenticated.
MinIO allows reading metrics for the entire cluster from any single node. The cluster-wide metrics can be read at
`<Address for MinIO Service>/minio/v2/metrics/cluster`.
- Prometheus data available at `/minio/prometheus/metrics`
The additional node-specific metrics, which include Go runtime metrics and process metrics, are exposed at
`<Address for MinIO Node>/minio/v2/metrics/node`.
To use this endpoint, set up Prometheus to scrape data from it. Read more on how to configure and use Prometheus to monitor MinIO server in [How to monitor MinIO server with Prometheus](https://github.com/minio/minio/blob/master/docs/metrics/prometheus/README.md).
**Deprecated metrics monitoring**
- Prometheus' data available at `/minio/prometheus/metrics` is deprecated

@ -1,8 +1,13 @@
# How to monitor MinIO server with Prometheus [![Slack](https://slack.min.io/slack?type=svg)](https://slack.min.io)
[Prometheus](https://prometheus.io) is a cloud-native monitoring platform, built originally at SoundCloud. Prometheus offers a multi-dimensional data model with time series data identified by metric name and key/value pairs. The data collection happens via a pull model over HTTP/HTTPS. Targets to pull data from are discovered via service discovery or static configuration.
[Prometheus](https://prometheus.io) is a cloud-native monitoring platform.
MinIO exports Prometheus compatible data by default as an authorized endpoint at `/minio/prometheus/metrics`. Users looking to monitor their MinIO instances can point Prometheus configuration to scrape data from this endpoint.
Prometheus offers a multi-dimensional data model with time series data identified by metric name and key/value pairs.
The data collection happens via a pull model over HTTP/HTTPS.
MinIO exports Prometheus compatible data by default as an authorized endpoint at `/minio/v2/metrics/cluster`.
Users looking to monitor their MinIO instances can point Prometheus configuration to scrape data from this endpoint.
This document explains how to set up Prometheus and configure it to scrape data from MinIO servers.
@ -20,7 +25,8 @@ This document explains how to setup Prometheus and configure it to scrape data f
- [List of metrics exposed by MinIO](#list-of-metrics-exposed-by-minio)
## Prerequisites
To get started with MinIO, refer [MinIO QuickStart Document](https://docs.min.io/docs/minio-quickstart-guide). Follow below steps to get started with MinIO monitoring using Prometheus.
To get started with MinIO, refer to the [MinIO QuickStart Document](https://docs.min.io/docs/minio-quickstart-guide).
Follow the steps below to get started with MinIO monitoring using Prometheus.
### 1. Download Prometheus
@ -68,7 +74,7 @@ The command will generate the `scrape_configs` section of the prometheus.yml as
scrape_configs:
- job_name: minio-job
bearer_token: <secret>
metrics_path: /minio/prometheus/metrics
metrics_path: /minio/v2/metrics/cluster
scheme: http
static_configs:
- targets: ['localhost:9000']
@ -77,16 +83,26 @@ scrape_configs:
#### 3.2 Public Prometheus config
If the Prometheus endpoint authentication type is set to `public`, the following Prometheus config is sufficient to start scraping metrics data from MinIO.
This can be collected from any server once per collection.
##### Cluster
```yaml
scrape_configs:
- job_name: minio-job
metrics_path: /minio/prometheus/metrics
metrics_path: /minio/v2/metrics/cluster
scheme: http
static_configs:
- targets: ['localhost:9000']
```
##### Node
Optionally, you can also collect per-node metrics. This needs to be configured separately for each server instance.
```yaml
scrape_configs:
- job_name: minio-job
metrics_path: /minio/v2/metrics/node
scheme: http
static_configs:
- targets: ['localhost:9000']
```
### 4. Update `scrape_configs` section in prometheus.yml
To authorize every scrape request, copy and paste the generated `scrape_configs` section in the prometheus.yml and restart the Prometheus service.
@ -103,172 +119,16 @@ Here `prometheus.yml` is the name of configuration file. You can now see MinIO m
### 6. Configure Grafana
After Prometheus is configured, you can use Grafana to visualize MinIO metrics. Refer the [document here to setup Grafana with MinIO prometheus metrics](https://github.com/minio/minio/blob/master/docs/metrics/prometheus/grafana/README.md).
After Prometheus is configured, you can use Grafana to visualize MinIO metrics.
Refer to the [document here to set up Grafana with MinIO Prometheus metrics](https://github.com/minio/minio/blob/master/docs/metrics/prometheus/grafana/README.md).
## List of metrics exposed by MinIO
MinIO server exposes the following metrics on `/minio/prometheus/metrics` endpoint. All of these can be accessed via Prometheus dashboard. The full list of exposed metrics along with their definition is available in the demo server at https://play.min.io:9000/minio/prometheus/metrics
These are the new set of metrics which will be in effect after `RELEASE.2019-10-16*`. Some of the key changes in this update are listed below.
- Metrics are bound to their respective nodes and are not cluster-wide. Each and every node in a cluster will expose its own metrics.
- Additional metrics to cover the s3 and internode traffic statistics were added.
- Metrics that record the HTTP statistics and latencies are labeled with their respective APIs (putobject, getobject, etc.).
- Disk usage metrics are distributed and labeled to the respective disk paths.
For more details, please check the `Migration guide for the new set of metrics`.
The list of metrics and its definition are as follows. (NOTE: instance here is one MinIO node)
> NOTES:
> 1. Instance here is one MinIO node.
> 2. `s3 requests` exclude internode requests.
### Default set of information
| name | description |
|:------------|:--------------------------------|
| `go_` | all standard go runtime metrics |
| `process_` | all process level metrics |
| `promhttp_` | all prometheus scrape metrics |
### MinIO node specific information
| name | description |
|:---------------------------|:-------------------------------------------------------------------------------|
| `minio_version_info` | Current MinIO version with its commit-id |
| `minio_disks_offline` | Total number of offline disks on current MinIO instance |
| `minio_disks_total` | Total number of disks on current MinIO instance |
### Disk metrics are labeled by 'disk' which identifies each disk
| name | description |
|:---------------------------|:-------------------------------------------------------------------------------|
| `disk_storage_total` | Total size of the disk |
| `disk_storage_used` | Total disk space used per disk |
| `disk_storage_available` | Total available disk space per disk |
### S3 API metrics are labeled by 'api' which identifies different S3 API requests
| name | description |
|:---------------------------|:-------------------------------------------------------------------------------|
| `s3_requests_total` | Total number of s3 requests in current MinIO instance |
| `s3_errors_total` | Total number of errors in s3 requests in current MinIO instance |
| `s3_requests_current` | Total number of active s3 requests in current MinIO instance |
| `s3_rx_bytes_total` | Total number of s3 bytes received by current MinIO server instance |
| `s3_tx_bytes_total` | Total number of s3 bytes sent by current MinIO server instance |
| `s3_ttfb_seconds` | Histogram that holds the latency information of the requests |
#### Internode metrics only available in a distributed setup
| name | description |
|:---------------------------|:-------------------------------------------------------------------------------|
| `internode_rx_bytes_total` | Total number of internode bytes received by current MinIO server instance |
| `internode_tx_bytes_total` | Total number of bytes sent to the other nodes by current MinIO server instance |
Apart from above metrics, MinIO also exposes below mode specific metrics
### Bucket usage specific metrics
All metrics are labeled by `bucket`, each metric is displayed per bucket. `buckets_objects_histogram` is additionally labeled by `object_size` string which is represented by any of the following values
- *LESS_THAN_1024_B*
- *BETWEEN_1024_B_AND_1_MB*
- *BETWEEN_1_MB_AND_10_MB*
- *BETWEEN_10_MB_AND_64_MB*
- *BETWEEN_64_MB_AND_128_MB*
- *BETWEEN_128_MB_AND_512_MB*
- *GREATER_THAN_512_MB*
Unit definitions:
- 1 MB = 1024 KB
- 1 KB = 1024 B
| name | description |
|:------------------------------------|:----------------------------------------------------|
| `bucket_usage_size` | Total size of the bucket |
| `bucket_objects_count` | Total number of objects in a bucket |
| `bucket_objects_histogram` | Total number of objects filtered by different sizes |
| `bucket_replication_pending_size` | Total capacity not replicated |
| `bucket_replication_failed_size` | Total capacity failed to replicate at least once |
| `bucket_replication_successful_size`| Total capacity successfully replicated |
| `bucket_replication_received_size` | Total capacity received as replicated objects |
### Cache specific metrics
MinIO Gateway instances enabled with Disk-Caching expose caching related metrics.
#### Global cache metrics
| name | description |
|:---------------------|:--------------------------------------------------|
| `cache_hits_total` | Total number of cache hits |
| `cache_misses_total` | Total number of cache misses |
| `cache_data_served` | Total number of bytes served from cache |
#### Per disk cache metrics
| name | description |
|:-----------------------|:---------------------------------------------------------------------------------|
| `cache_usage_size` | Total cache usage in bytes |
| `cache_total_capacity` | Total size of cache disk |
| `cache_usage_percent` | Total percentage cache usage |
| `cache_usage_state` | Indicates cache usage is high or low, relative to current cache 'quota' settings |
`cache_usage_state` holds only two states
- '1' indicates high disk usage
- '0' indicates low disk usage
### Gateway specific metrics
MinIO Gateway instance exposes metrics related to Gateway communication with the cloud backend (S3, Azure & GCS Gateway).
`<gateway_type>` changes based on the gateway in use can be 's3', 'gcs' or 'azure'. Other metrics are labeled with `method` that identifies HTTP GET, HEAD, PUT and POST requests to the backend.
| name | description |
|:----------------------------------------|:---------------------------------------------------------------------------|
| `gateway_<gateway_type>_requests` | Total number of requests made to the gateway backend |
| `gateway_<gateway_type>_bytes_sent` | Total number of bytes sent to cloud backend (in PUT & POST Requests) |
| `gateway_<gateway_type>_bytes_received` | Total number of bytes received from cloud backend (in GET & HEAD Requests) |
Note that this is currently only supported for Azure, S3 and GCS Gateway.
### MinIO self-healing metrics - `self_heal_*`
MinIO exposes self-healing related metrics for erasure-code deployments _only_. These metrics are _not_ available on Gateway or Single Node, Single Drive deployments. Note that these metrics will be exposed _only_ when there is a relevant event happening on MinIO server.
| name | description |
|:-------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `self_heal_time_since_last_activity` | Time elapsed since last self-healing related activity |
| `self_heal_objects_scanned` | Number of objects scanned by self-healing thread in its current run. This will reset when a fresh self-healing run starts. This is labeled with the object type scanned |
| `self_heal_objects_healed` | Number of objects healing by self-healing thread in its current run. This will reset when a fresh self-healing run starts. This is labeled with the object type scanned |
| `self_heal_objects_heal_failed` | Number of objects for which self-healing failed in its current run. This will reset when a fresh self-healing run starts. This is labeled with disk status and its endpoint |
## Migration guide for the new set of metrics
This migration guide applies for older releases or any releases before `RELEASE.2019-10-23*`
### MinIO disk level metrics - `disk_*`
The migrations include
- `minio_total_disks` to `minio_disks_total`
- `minio_offline_disks` to `minio_disks_offline`
### MinIO disk level metrics - `disk_storage_*`
These metrics have one label.
- `disk`: Holds the disk path
The migrations include
- `minio_disk_storage_used_bytes` to `disk_storage_used`
- `minio_disk_storage_available_bytes` to `disk_storage_available`
- `minio_disk_storage_total_bytes` to `disk_storage_total`
### MinIO network level metrics
These metrics are detailed to cover the s3 and internode network statistics.
The migrations include
- `minio_network_sent_bytes_total` to `s3_tx_bytes_total` and `internode_tx_bytes_total`
- `minio_network_received_bytes_total` to `s3_rx_bytes_total` and `internode_rx_bytes_total`
MinIO server exposes the following metrics on the `/minio/v2/metrics/cluster` endpoint.
All of these can be accessed via the Prometheus dashboard.
A sample list of exposed metrics along with their definitions is available on the demo server at
`curl https://play.min.io:9000/minio/v2/metrics/cluster`
Some of the additional metrics added were
### List of metrics reported
- `s3_requests_total`
- `s3_errors_total`
- `s3_ttfb_seconds`
[The list of metrics reported can be found here](https://github.com/minio/minio/blob/master/docs/metrics/prometheus/list.md)

@ -0,0 +1,47 @@
# List of metrics reported cluster wide
Each metric includes a label for the server that calculated the metric.
Each metric has a label for the server that generated the metric.
These metrics can be obtained from any MinIO server once per collection.
| Name | Description |
|:-----------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------|
|`minio_bucket_objects_size_distribution` |Distribution of object sizes in the bucket, includes label for the bucket name. |
|`minio_bucket_replication_failed_bytes` |Total number of bytes failed at least once to replicate. |
|`minio_bucket_replication_pending_bytes` |Total bytes pending to replicate. |
|`minio_bucket_replication_received_bytes` |Total number of bytes replicated to this bucket from another source bucket. |
|`minio_bucket_replication_sent_bytes` |Total number of bytes replicated to the target bucket. |
|`minio_bucket_usage_object_total` |Total number of objects |
|`minio_bucket_usage_total_bytes` |Total bucket size in bytes |
|`minio_cluster_capacity_raw_free_bytes` |Total free capacity online in the cluster. |
|`minio_cluster_capacity_raw_total_bytes` |Total capacity online in the cluster. |
|`minio_cluster_capacity_usable_free_bytes` |Total free usable capacity online in the cluster. |
|`minio_cluster_capacity_usable_total_bytes` |Total usable capacity online in the cluster. |
|`minio_cluster_disk_offline_total` |Total disks offline. |
|`minio_cluster_disk_online_total` |Total disks online. |
|`minio_cluster_nodes_offline_total` |Total number of MinIO nodes offline. |
|`minio_cluster_nodes_online_total` |Total number of MinIO nodes online. |
|`minio_heal_objects_error_total` |Objects for which healing failed in current self healing run |
|`minio_heal_objects_heal_total` |Objects healed in current self healing run |
|`minio_heal_objects_total` |Objects scanned in current self healing run |
|`minio_heal_time_last_activity_nano_seconds` |Time elapsed (in nano seconds) since last self healing activity. This is set to -1 until initial self heal activity |
|`minio_inter_node_traffic_received_bytes` |Total number of bytes received from other peer nodes. |
|`minio_inter_node_traffic_sent_bytes` |Total number of bytes sent to the other peer nodes. |
|`minio_node_disk_free_bytes` |Total storage available on a disk. |
|`minio_node_disk_total_bytes` |Total storage on a disk. |
|`minio_node_disk_used_bytes` |Total storage used on a disk. |
|`minio_s3_requests_error_total` |Total number of S3 requests with errors |
|`minio_s3_requests_inflight_total` |Total number of S3 requests currently in flight. |
|`minio_s3_requests_total` |Total number of S3 requests |
|`minio_s3_time_ttbf_seconds_distribution` |Distribution of the time to first byte across API calls. |
|`minio_s3_traffic_received_bytes` |Total number of s3 bytes received. |
|`minio_s3_traffic_sent_bytes` |Total number of s3 bytes sent |
|`minio_cache_hits_total` |Total number of disk cache hits |
|`minio_cache_missed_total` |Total number of disk cache misses |
|`minio_cache_sent_bytes` |Total number of bytes served from cache |
|`minio_cache_total_bytes` |Total size of cache disk in bytes |
|`minio_cache_usage_info` |Total percentage cache usage, value of 1 indicates high and 0 low, label level is set as well |
|`minio_cache_used_bytes` |Current cache usage in bytes |
|`minio_software_commit_info` |Git commit hash for the MinIO release. |
|`minio_software_version_info` |MinIO Release tag for the server |

@ -65,6 +65,9 @@ require (
github.com/pierrec/lz4 v2.5.2+incompatible
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.8.0
github.com/quasilyte/go-ruleguard v0.2.1 // indirect
github.com/quasilyte/go-ruleguard/dsl/fluent v0.0.0-20201222093424-5d7e62a465d3 // indirect
github.com/prometheus/client_model v0.2.0
github.com/rjeczalik/notify v0.9.2
github.com/rs/cors v1.7.0
github.com/secure-io/sio-go v0.3.0

@ -529,6 +529,9 @@ github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+Gx
github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU=
github.com/prometheus/procfs v0.2.0 h1:wH4vA7pcjKuZzjF7lM8awk4fnuJO6idemZXoKnULUx4=
github.com/prometheus/procfs v0.2.0/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU=
github.com/quasilyte/go-ruleguard v0.2.1 h1:56eRm0daAyny9UhJnmtJW/UyLZQusukBAB8oT8AHKHo=
github.com/quasilyte/go-ruleguard v0.2.1/go.mod h1:hN2rVc/uS4bQhQKTio2XaSJSafJwqBUWWwtssT3cQmc=
github.com/quasilyte/go-ruleguard/dsl/fluent v0.0.0-20201222093424-5d7e62a465d3/go.mod h1:P7JlQWFT7jDcFZMtUPQbtGzzzxva3rBn6oIF+LPwFcM=
github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a h1:9ZKAASQSHhDYGoxY8uLVpewe1GDZ2vu2Tr/vTdVAkFQ=
github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4=
github.com/rcrowley/go-metrics v0.0.0-20200313005456-10cdbea86bc0 h1:MkV+77GLUNo5oJ0jf870itWm3D0Sjh7+Za9gazKc5LQ=
@ -619,6 +622,7 @@ github.com/xdg/stringprep v1.0.0 h1:d9X0esnoa3dFsV0FG35rAT0RIhYFlPq7MiP+DW89La0=
github.com/xdg/stringprep v1.0.0/go.mod h1:Jhud4/sHMO4oL310DaZAKk9ZaJ08SJfe+sJh0HrGL1Y=
github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2 h1:eY9dn8+vbi4tKz5Qo6v2eYzo7kUS51QINcR5jNpbZS8=
github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU=
github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
go.etcd.io/bbolt v1.3.3 h1:MUGmc65QhB3pIlaQ5bB4LwqSj6GIonVJXpZiaKNyaKk=
go.etcd.io/bbolt v1.3.3/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU=
@ -711,6 +715,7 @@ golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9 h1:SQFwaSi55rU7vdNs9Yr0Z324VNlrF+0wMqRXT4St8ck=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@ -782,6 +787,7 @@ golang.org/x/tools v0.0.0-20191029190741-b9c20aec41a5/go.mod h1:b+2E5dAYhXwXZwtn
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191216052735-49a3e744a425/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
golang.org/x/tools v0.0.0-20200103221440-774c71fcf114/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
golang.org/x/tools v0.0.0-20200812195022-5ae4c3c160a0/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA=
golang.org/x/tools v0.0.0-20201105001634-bc3cf281b174/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.0.0-20210115202250-e0d201561e39 h1:BTs2GMGSMWpgtCpv1CE7vkJTv7XcHdcLLnAMu7UbgTY=
golang.org/x/tools v0.0.0-20210115202250-e0d201561e39/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=

@ -158,8 +158,8 @@ type PerfInfo struct {
// ServerDrivesInfo - Drive info about all drives in a single MinIO node
type ServerDrivesInfo struct {
	Addr     string          `json:"addr"`
	Serial   []DrivePerfInfo `json:"serial,omitempty"`   // Drive perf info collected one drive at a time
	Parallel []DrivePerfInfo `json:"parallel,omitempty"` // Drive perf info collected in parallel
	Error    string          `json:"error,omitempty"`    // Left out of the JSON output when empty
}
@ -316,3 +316,27 @@ func (adm *AdminClient) ServerHealthInfo(ctx context.Context, healthDataTypes []
return respChan
}
// GetTotalCapacity returns the sum of the Total field across every entry
// in s.Usage.
func (s *ServerDiskHwInfo) GetTotalCapacity() (capacity uint64) {
	for i := range s.Usage {
		capacity += s.Usage[i].Total
	}
	return capacity
}
// GetTotalFreeCapacity returns the sum of the Free field across every entry
// in s.Usage.
func (s *ServerDiskHwInfo) GetTotalFreeCapacity() (capacity uint64) {
	for i := range s.Usage {
		capacity += s.Usage[i].Free
	}
	return capacity
}
// GetTotalUsedCapacity returns the sum of the Used field across every entry
// in s.Usage.
func (s *ServerDiskHwInfo) GetTotalUsedCapacity() (capacity uint64) {
	for i := range s.Usage {
		capacity += s.Usage[i].Used
	}
	return capacity
}

Loading…
Cancel
Save