Updated Prometheus metrics (#11141)

* Add metrics for nodes online and offline * Add cluster capacity metrics * Introduce v2 metrics
4 years ago · b4add82bb6
parent 3bda8f755c
commit b4add82bb6
27 changed files with 1669 additions and 252 deletions
--- a/cmd/admin-server-info.go
+++ b/cmd/admin-server-info.go
@ -69,3 +69,35 @@ func getLocalServerProperty(endpointServerPools EndpointServerPools, r *http.Req
 		Disks:    storageInfo.Disks,
 	}
 }
+
+// getLocalDisks returns the disk information for every endpoint, across all
+// server pools, that is local to this node.
+//
+// Errors from initStorageDisksWithErrors and getStorageInfo are deliberately
+// ignored: the result is best effort and whatever disk information could be
+// gathered is returned.
+func getLocalDisks(endpointServerPools EndpointServerPools) []madmin.Disk {
+	var localEndpoints Endpoints
+	for _, ep := range endpointServerPools {
+		for _, endpoint := range ep.Endpoints {
+			// Remote endpoints are skipped; their reachability does not
+			// contribute to the returned disks, so they are not probed.
+			if endpoint.IsLocal {
+				localEndpoints = append(localEndpoints, endpoint)
+			}
+		}
+	}
+
+	localDisks, _ := initStorageDisksWithErrors(localEndpoints)
+	defer closeStorageDisks(localDisks)
+	storageInfo, _ := getStorageInfo(localDisks, localEndpoints.GetAllStrings())
+	return storageInfo.Disks
+}
--- a/cmd/disk-cache-stats.go
+++ b/cmd/disk-cache-stats.go
@ -34,6 +34,14 @@ type CacheDiskStats struct {
 	Dir          string
 }

+// GetUsageLevelString reports the cache usage level as text: "low" when
+// UsageState (read atomically) is 0, and "high" for any other value.
+func (c *CacheDiskStats) GetUsageLevelString() (u string) {
+	u = "high"
+	if atomic.LoadInt32(&c.UsageState) == 0 {
+		u = "low"
+	}
+	return u
+}
+
 // CacheStats - represents bytes served from cache,
 // cache hits and cache misses.
 type CacheStats struct {
--- a/cmd/erasure-server-pool.go
+++ b/cmd/erasure-server-pool.go
@ -1377,9 +1377,9 @@ func (z *erasureServerPools) HealObject(ctx context.Context, bucket, object, ver
 }

 // GetMetrics - no op
-func (z *erasureServerPools) GetMetrics(ctx context.Context) (*Metrics, error) {
+func (z *erasureServerPools) GetMetrics(ctx context.Context) (*BackendMetrics, error) {
 	logger.LogIf(ctx, NotImplemented{})
-	return &Metrics{}, NotImplemented{}
+	return &BackendMetrics{}, NotImplemented{}
 }

 func (z *erasureServerPools) getZoneAndSet(id string) (int, int, error) {
--- a/cmd/fs-v1.go
+++ b/cmd/fs-v1.go
@ -1554,9 +1554,9 @@ func (fs *FSObjects) HealObjects(ctx context.Context, bucket, prefix string, opt
 }

 // GetMetrics - no op
-func (fs *FSObjects) GetMetrics(ctx context.Context) (*Metrics, error) {
+func (fs *FSObjects) GetMetrics(ctx context.Context) (*BackendMetrics, error) {
 	logger.LogIf(ctx, NotImplemented{})
-	return &Metrics{}, NotImplemented{}
+	return &BackendMetrics{}, NotImplemented{}
 }

 // ListObjectsV2 lists all blobs in bucket filtered by prefix
--- a/cmd/gateway-common.go
+++ b/cmd/gateway-common.go
@ -389,7 +389,7 @@ func shouldMeterRequest(req *http.Request) bool {
 // MetricsTransport is a custom wrapper around Transport to track metrics
 type MetricsTransport struct {
 	Transport *http.Transport
-	Metrics   *Metrics
+	Metrics   *BackendMetrics
 }

 // RoundTrip implements the RoundTrip method for MetricsTransport
--- a/cmd/gateway-metrics.go
+++ b/cmd/gateway-metrics.go
@ -29,36 +29,28 @@ type RequestStats struct {
 	Post uint64 `json:"Post"`
 }

-// Metrics - represents bytes served from backend
-// only implemented for S3 Gateway
-type Metrics struct {
-	bytesReceived uint64
-	bytesSent     uint64
-	requestStats  RequestStats
-}
-
 // IncBytesReceived - Increase total bytes received from gateway backend
-func (s *Metrics) IncBytesReceived(n uint64) {
+func (s *BackendMetrics) IncBytesReceived(n uint64) {
 	atomic.AddUint64(&s.bytesReceived, n)
 }

 // GetBytesReceived - Get total bytes received from gateway backend
-func (s *Metrics) GetBytesReceived() uint64 {
+func (s *BackendMetrics) GetBytesReceived() uint64 {
 	return atomic.LoadUint64(&s.bytesReceived)
 }

 // IncBytesSent - Increase total bytes sent to gateway backend
-func (s *Metrics) IncBytesSent(n uint64) {
+func (s *BackendMetrics) IncBytesSent(n uint64) {
 	atomic.AddUint64(&s.bytesSent, n)
 }

 // GetBytesSent - Get total bytes received from gateway backend
-func (s *Metrics) GetBytesSent() uint64 {
+func (s *BackendMetrics) GetBytesSent() uint64 {
 	return atomic.LoadUint64(&s.bytesSent)
 }

 // IncRequests - Increase request count sent to gateway backend by 1
-func (s *Metrics) IncRequests(method string) {
+func (s *BackendMetrics) IncRequests(method string) {
 	// Only increment for Head & Get requests, else no op
 	if method == http.MethodGet {
 		atomic.AddUint64(&s.requestStats.Get, 1)
@ -72,11 +64,11 @@ func (s *Metrics) IncRequests(method string) {
 }

 // GetRequests - Get total number of Get & Headrequests sent to gateway backend
-func (s *Metrics) GetRequests() RequestStats {
+func (s *BackendMetrics) GetRequests() RequestStats {
 	return s.requestStats
 }

-// NewMetrics - Prepare new Metrics structure
-func NewMetrics() *Metrics {
-	return &Metrics{}
+// NewMetrics - Prepare new BackendMetrics structure
+func NewMetrics() *BackendMetrics {
+	return &BackendMetrics{}
 }
--- a/cmd/gateway-unsupported.go
+++ b/cmd/gateway-unsupported.go
@ -202,9 +202,9 @@ func (a GatewayUnsupported) CopyObject(ctx context.Context, srcBucket string, sr
 }

 // GetMetrics - no op
-func (a GatewayUnsupported) GetMetrics(ctx context.Context) (*Metrics, error) {
+func (a GatewayUnsupported) GetMetrics(ctx context.Context) (*BackendMetrics, error) {
 	logger.LogIf(ctx, NotImplemented{})
-	return &Metrics{}, NotImplemented{}
+	return &BackendMetrics{}, NotImplemented{}
 }

 // PutObjectTags - not implemented.
--- a/cmd/gateway/azure/gateway-azure.go
+++ b/cmd/gateway/azure/gateway-azure.go
@ -419,7 +419,7 @@ type azureObjects struct {
 	minio.GatewayUnsupported
 	endpoint   *url.URL
 	httpClient *http.Client
-	metrics    *minio.Metrics
+	metrics    *minio.BackendMetrics
 	client     azblob.ServiceURL // Azure sdk client
 }

@ -533,7 +533,7 @@ func parseAzurePart(metaPartFileName, prefix string) (partID int, err error) {
 }

 // GetMetrics returns this gateway's metrics
-func (a *azureObjects) GetMetrics(ctx context.Context) (*minio.Metrics, error) {
+func (a *azureObjects) GetMetrics(ctx context.Context) (*minio.BackendMetrics, error) {
 	return a.metrics, nil
 }

--- a/cmd/gateway/gcs/gateway-gcs.go
+++ b/cmd/gateway/gcs/gateway-gcs.go
@ -341,7 +341,7 @@ type gcsGateway struct {
 	minio.GatewayUnsupported
 	client     *storage.Client
 	httpClient *http.Client
-	metrics    *minio.Metrics
+	metrics    *minio.BackendMetrics
 	projectID  string
 }

@ -359,7 +359,7 @@ func gcsParseProjectID(credsFile string) (projectID string, err error) {
 }

 // GetMetrics returns this gateway's metrics
-func (l *gcsGateway) GetMetrics(ctx context.Context) (*minio.Metrics, error) {
+func (l *gcsGateway) GetMetrics(ctx context.Context) (*minio.BackendMetrics, error) {
 	return l.metrics, nil
 }

--- a/cmd/gateway/s3/gateway-s3.go
+++ b/cmd/gateway/s3/gateway-s3.go
@ -259,11 +259,11 @@ type s3Objects struct {
 	minio.GatewayUnsupported
 	Client     *miniogo.Core
 	HTTPClient *http.Client
-	Metrics    *minio.Metrics
+	Metrics    *minio.BackendMetrics
 }

 // GetMetrics returns this gateway's metrics
-func (l *s3Objects) GetMetrics(ctx context.Context) (*minio.Metrics, error) {
+func (l *s3Objects) GetMetrics(ctx context.Context) (*minio.BackendMetrics, error) {
 	return l.Metrics, nil
 }

--- a/cmd/generic-handlers.go
+++ b/cmd/generic-handlers.go
@ -228,7 +228,9 @@ func guessIsMetricsReq(req *http.Request) bool {
 	}
 	aType := getRequestAuthType(req)
 	return (aType == authTypeAnonymous || aType == authTypeJWT) &&
-		req.URL.Path == minioReservedBucketPath+prometheusMetricsPath
+		// Parenthesised: && binds tighter than ||, and the auth type check
+		// must guard every metrics path, not only the legacy one.
+		(req.URL.Path == minioReservedBucketPath+prometheusMetricsPathLegacy ||
+			req.URL.Path == minioReservedBucketPath+prometheusMetricsV2ClusterPath ||
+			req.URL.Path == minioReservedBucketPath+prometheusMetricsV2NodePath)
 }

 // guessIsRPCReq - returns true if the request is for an RPC endpoint.
--- a/cmd/http-stats.go
+++ b/cmd/http-stats.go
@ -79,10 +79,10 @@ func (s *ConnStats) getS3OutputBytes() uint64 {
 // Return connection stats (total input/output bytes and total s3 input/output bytes)
 func (s *ConnStats) toServerConnStats() ServerConnStats {
 	return ServerConnStats{
-		TotalInputBytes:  s.getTotalInputBytes(),
-		TotalOutputBytes: s.getTotalOutputBytes(),
-		S3InputBytes:     s.getS3InputBytes(),
-		S3OutputBytes:    s.getS3OutputBytes(),
+		TotalInputBytes:  s.getTotalInputBytes(),  // Traffic including reserved bucket
+		TotalOutputBytes: s.getTotalOutputBytes(), // Traffic including reserved bucket
+		S3InputBytes:     s.getS3InputBytes(),     // Traffic for client buckets
+		S3OutputBytes:    s.getS3OutputBytes(),    // Traffic for client buckets
 	}
 }

@ -163,9 +163,11 @@ func (st *HTTPStats) toServerHTTPStats() ServerHTTPStats {
 // Update statistics from http request and response data
 func (st *HTTPStats) updateStats(api string, r *http.Request, w *logger.ResponseWriter) {
 	// A successful request has a 2xx response code
-	successReq := (w.StatusCode >= 200 && w.StatusCode < 300)
+	successReq := w.StatusCode >= 200 && w.StatusCode < 300

-	if !strings.HasSuffix(r.URL.Path, prometheusMetricsPath) {
+	// Count the request as S3 traffic only when it targets none of the
+	// Prometheus metrics endpoints. The checks must be AND-ed: a path can
+	// never end with all three suffixes, so OR-ing them is always true.
+	if !strings.HasSuffix(r.URL.Path, prometheusMetricsPathLegacy) &&
+		!strings.HasSuffix(r.URL.Path, prometheusMetricsV2ClusterPath) &&
+		!strings.HasSuffix(r.URL.Path, prometheusMetricsV2NodePath) {
 		st.totalS3Requests.Inc(api)
 		if !successReq && w.StatusCode != 0 {
 			st.totalS3Errors.Inc(api)
--- a/cmd/metrics-router.go
+++ b/cmd/metrics-router.go
@ -24,7 +24,9 @@ import (
 )

 const (
-	prometheusMetricsPath = "/prometheus/metrics"
+	prometheusMetricsPathLegacy    = "/prometheus/metrics"
+	prometheusMetricsV2ClusterPath = "/v2/metrics/cluster"
+	prometheusMetricsV2NodePath    = "/v2/metrics/node"
 )

 // Standard env prometheus auth type
@ -43,14 +45,17 @@ const (
 func registerMetricsRouter(router *mux.Router) {
 	// metrics router
 	metricsRouter := router.NewRoute().PathPrefix(minioReservedBucketPath).Subrouter()
-
 	authType := strings.ToLower(os.Getenv(EnvPrometheusAuthType))
 	switch prometheusAuthType(authType) {
 	case prometheusPublic:
-		metricsRouter.Handle(prometheusMetricsPath, metricsHandler())
+		metricsRouter.Handle(prometheusMetricsPathLegacy, metricsHandler())
+		metricsRouter.Handle(prometheusMetricsV2ClusterPath, metricsServerHandler())
+		metricsRouter.Handle(prometheusMetricsV2NodePath, metricsNodeHandler())
 	case prometheusJWT:
 		fallthrough
 	default:
-		metricsRouter.Handle(prometheusMetricsPath, AuthMiddleware(metricsHandler()))
+		metricsRouter.Handle(prometheusMetricsPathLegacy, AuthMiddleware(metricsHandler()))
+		metricsRouter.Handle(prometheusMetricsV2ClusterPath, AuthMiddleware(metricsServerHandler()))
+		metricsRouter.Handle(prometheusMetricsV2NodePath, AuthMiddleware(metricsNodeHandler()))
 	}
 }
--- a/cmd/metrics-v2.go
+++ b/cmd/metrics-v2.go
--- a/cmd/metrics.go
+++ b/cmd/metrics.go
@ -51,6 +51,17 @@ var (
 	)
 )

+// Metric namespaces: the leading component handed to prometheus.BuildFQName
+// when the collectors in this file build their metric names.
+const (
+	healMetricsNamespace = "self_heal"
+	gatewayNamespace     = "gateway"
+	cacheNamespace       = "cache"
+	s3Namespace          = "s3"
+	bucketNamespace      = "bucket"
+	minioNamespace       = "minio"
+	diskNamespace        = "disk"
+	interNodeNamespace   = "internode"
+)
+
 func init() {
 	prometheus.MustRegister(httpRequestsDuration)
 	prometheus.MustRegister(newMinioCollector())
@ -81,9 +92,10 @@ func (c *minioCollector) Describe(ch chan<- *prometheus.Desc) {
 func (c *minioCollector) Collect(ch chan<- prometheus.Metric) {

 	// Expose MinIO's version information
-	minioVersionInfo.WithLabelValues(Version, CommitID).Set(float64(1.0))
+	minioVersionInfo.WithLabelValues(Version, CommitID).Set(1.0)

 	storageMetricsPrometheus(ch)
+	nodeHealthMetricsPrometheus(ch)
 	bucketUsageMetricsPrometheus(ch)
 	networkMetricsPrometheus(ch)
 	httpMetricsPrometheus(ch)
@ -92,6 +104,26 @@ func (c *minioCollector) Collect(ch chan<- prometheus.Metric) {
 	healingMetricsPrometheus(ch)
 }

+// nodeHealthMetricsPrometheus sends two gauges to the given channel: the
+// number of MinIO nodes online and the number offline, both taken from
+// GetPeerOnlineCount.
+func nodeHealthMetricsPrometheus(ch chan<- prometheus.Metric) {
+	online, offline := GetPeerOnlineCount()
+
+	onlineDesc := prometheus.NewDesc(
+		prometheus.BuildFQName(minioNamespace, "nodes", "online"),
+		"Total number of MinIO nodes online",
+		nil, nil)
+	ch <- prometheus.MustNewConstMetric(onlineDesc, prometheus.GaugeValue, float64(online))
+
+	offlineDesc := prometheus.NewDesc(
+		prometheus.BuildFQName(minioNamespace, "nodes", "offline"),
+		"Total number of MinIO nodes offline",
+		nil, nil)
+	ch <- prometheus.MustNewConstMetric(offlineDesc, prometheus.GaugeValue, float64(offline))
+}
+
 // collects healing specific metrics for MinIO instance in Prometheus specific format
 // and sends to given channel
 func healingMetricsPrometheus(ch chan<- prometheus.Metric) {
@ -102,7 +134,6 @@ func healingMetricsPrometheus(ch chan<- prometheus.Metric) {
 	if !exists {
 		return
 	}
-	healMetricsNamespace := "self_heal"

 	var dur time.Duration
 	if !bgSeq.lastHealActivity.IsZero() {
@ -172,7 +203,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) {

 	ch <- prometheus.MustNewConstMetric(
 		prometheus.NewDesc(
-			prometheus.BuildFQName("gateway", globalGatewayName, "bytes_received"),
+			prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "bytes_received"),
 			"Total number of bytes received by current MinIO Gateway "+globalGatewayName+" backend",
 			nil, nil),
 		prometheus.CounterValue,
@ -180,7 +211,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) {
 	)
 	ch <- prometheus.MustNewConstMetric(
 		prometheus.NewDesc(
-			prometheus.BuildFQName("gateway", globalGatewayName, "bytes_sent"),
+			prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "bytes_sent"),
 			"Total number of bytes sent by current MinIO Gateway to "+globalGatewayName+" backend",
 			nil, nil),
 		prometheus.CounterValue,
@ -189,7 +220,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) {
 	s := m.GetRequests()
 	ch <- prometheus.MustNewConstMetric(
 		prometheus.NewDesc(
-			prometheus.BuildFQName("gateway", globalGatewayName, "requests"),
+			prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "requests"),
 			"Total number of requests made to "+globalGatewayName+" by current MinIO Gateway",
 			[]string{"method"}, nil),
 		prometheus.CounterValue,
@ -198,7 +229,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) {
 	)
 	ch <- prometheus.MustNewConstMetric(
 		prometheus.NewDesc(
-			prometheus.BuildFQName("gateway", globalGatewayName, "requests"),
+			prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "requests"),
 			"Total number of requests made to "+globalGatewayName+" by current MinIO Gateway",
 			[]string{"method"}, nil),
 		prometheus.CounterValue,
@ -207,7 +238,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) {
 	)
 	ch <- prometheus.MustNewConstMetric(
 		prometheus.NewDesc(
-			prometheus.BuildFQName("gateway", globalGatewayName, "requests"),
+			prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "requests"),
 			"Total number of requests made to "+globalGatewayName+" by current MinIO Gateway",
 			[]string{"method"}, nil),
 		prometheus.CounterValue,
@ -216,7 +247,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) {
 	)
 	ch <- prometheus.MustNewConstMetric(
 		prometheus.NewDesc(
-			prometheus.BuildFQName("gateway", globalGatewayName, "requests"),
+			prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "requests"),
 			"Total number of requests made to "+globalGatewayName+" by current MinIO Gateway",
 			[]string{"method"}, nil),
 		prometheus.CounterValue,
@ -236,7 +267,7 @@ func cacheMetricsPrometheus(ch chan<- prometheus.Metric) {

 	ch <- prometheus.MustNewConstMetric(
 		prometheus.NewDesc(
-			prometheus.BuildFQName("cache", "hits", "total"),
+			prometheus.BuildFQName(cacheNamespace, "hits", "total"),
 			"Total number of disk cache hits in current MinIO instance",
 			nil, nil),
 		prometheus.CounterValue,
@ -244,7 +275,7 @@ func cacheMetricsPrometheus(ch chan<- prometheus.Metric) {
 	)
 	ch <- prometheus.MustNewConstMetric(
 		prometheus.NewDesc(
-			prometheus.BuildFQName("cache", "misses", "total"),
+			prometheus.BuildFQName(cacheNamespace, "misses", "total"),
 			"Total number of disk cache misses in current MinIO instance",
 			nil, nil),
 		prometheus.CounterValue,
@ -252,7 +283,7 @@ func cacheMetricsPrometheus(ch chan<- prometheus.Metric) {
 	)
 	ch <- prometheus.MustNewConstMetric(
 		prometheus.NewDesc(
-			prometheus.BuildFQName("cache", "data", "served"),
+			prometheus.BuildFQName(cacheNamespace, "data", "served"),
 			"Total number of bytes served from cache of current MinIO instance",
 			nil, nil),
 		prometheus.CounterValue,
@ -262,7 +293,7 @@ func cacheMetricsPrometheus(ch chan<- prometheus.Metric) {
 		// Cache disk usage percentage
 		ch <- prometheus.MustNewConstMetric(
 			prometheus.NewDesc(
-				prometheus.BuildFQName("cache", "usage", "percent"),
+				prometheus.BuildFQName(cacheNamespace, "usage", "percent"),
 				"Total percentage cache usage",
 				[]string{"disk"}, nil),
 			prometheus.GaugeValue,
@ -271,7 +302,7 @@ func cacheMetricsPrometheus(ch chan<- prometheus.Metric) {
 		)
 		ch <- prometheus.MustNewConstMetric(
 			prometheus.NewDesc(
-				prometheus.BuildFQName("cache", "usage", "high"),
+				prometheus.BuildFQName(cacheNamespace, "usage", "high"),
 				"Indicates cache usage is high or low, relative to current cache 'quota' settings",
 				[]string{"disk"}, nil),
 			prometheus.GaugeValue,
@ -309,7 +340,7 @@ func httpMetricsPrometheus(ch chan<- prometheus.Metric) {
 	for api, value := range httpStats.CurrentS3Requests.APIStats {
 		ch <- prometheus.MustNewConstMetric(
 			prometheus.NewDesc(
-				prometheus.BuildFQName("s3", "requests", "current"),
+				prometheus.BuildFQName(s3Namespace, "requests", "current"),
 				"Total number of running s3 requests in current MinIO server instance",
 				[]string{"api"}, nil),
 			prometheus.CounterValue,
@ -321,7 +352,7 @@ func httpMetricsPrometheus(ch chan<- prometheus.Metric) {
 	for api, value := range httpStats.TotalS3Requests.APIStats {
 		ch <- prometheus.MustNewConstMetric(
 			prometheus.NewDesc(
-				prometheus.BuildFQName("s3", "requests", "total"),
+				prometheus.BuildFQName(s3Namespace, "requests", "total"),
 				"Total number of s3 requests in current MinIO server instance",
 				[]string{"api"}, nil),
 			prometheus.CounterValue,
@ -333,7 +364,7 @@ func httpMetricsPrometheus(ch chan<- prometheus.Metric) {
 	for api, value := range httpStats.TotalS3Errors.APIStats {
 		ch <- prometheus.MustNewConstMetric(
 			prometheus.NewDesc(
-				prometheus.BuildFQName("s3", "errors", "total"),
+				prometheus.BuildFQName(s3Namespace, "errors", "total"),
 				"Total number of s3 errors in current MinIO server instance",
 				[]string{"api"}, nil),
 			prometheus.CounterValue,
@ -351,7 +382,7 @@ func networkMetricsPrometheus(ch chan<- prometheus.Metric) {
 	// Network Sent/Received Bytes (internode)
 	ch <- prometheus.MustNewConstMetric(
 		prometheus.NewDesc(
-			prometheus.BuildFQName("internode", "tx", "bytes_total"),
+			prometheus.BuildFQName(interNodeNamespace, "tx", "bytes_total"),
 			"Total number of bytes sent to the other peer nodes by current MinIO server instance",
 			nil, nil),
 		prometheus.CounterValue,
@ -360,7 +391,7 @@ func networkMetricsPrometheus(ch chan<- prometheus.Metric) {

 	ch <- prometheus.MustNewConstMetric(
 		prometheus.NewDesc(
-			prometheus.BuildFQName("internode", "rx", "bytes_total"),
+			prometheus.BuildFQName(interNodeNamespace, "rx", "bytes_total"),
 			"Total number of internode bytes received by current MinIO server instance",
 			nil, nil),
 		prometheus.CounterValue,
@ -370,7 +401,7 @@ func networkMetricsPrometheus(ch chan<- prometheus.Metric) {
 	// Network Sent/Received Bytes (Outbound)
 	ch <- prometheus.MustNewConstMetric(
 		prometheus.NewDesc(
-			prometheus.BuildFQName("s3", "tx", "bytes_total"),
+			prometheus.BuildFQName(s3Namespace, "tx", "bytes_total"),
 			"Total number of s3 bytes sent by current MinIO server instance",
 			nil, nil),
 		prometheus.CounterValue,
@ -379,7 +410,7 @@ func networkMetricsPrometheus(ch chan<- prometheus.Metric) {

 	ch <- prometheus.MustNewConstMetric(
 		prometheus.NewDesc(
-			prometheus.BuildFQName("s3", "rx", "bytes_total"),
+			prometheus.BuildFQName(s3Namespace, "rx", "bytes_total"),
 			"Total number of s3 bytes received by current MinIO server instance",
 			nil, nil),
 		prometheus.CounterValue,
@ -414,7 +445,7 @@ func bucketUsageMetricsPrometheus(ch chan<- prometheus.Metric) {
 		// Total space used by bucket
 		ch <- prometheus.MustNewConstMetric(
 			prometheus.NewDesc(
-				prometheus.BuildFQName("bucket", "usage", "size"),
+				prometheus.BuildFQName(bucketNamespace, "usage", "size"),
 				"Total bucket size",
 				[]string{"bucket"}, nil),
 			prometheus.GaugeValue,
@ -423,7 +454,7 @@ func bucketUsageMetricsPrometheus(ch chan<- prometheus.Metric) {
 		)
 		ch <- prometheus.MustNewConstMetric(
 			prometheus.NewDesc(
-				prometheus.BuildFQName("bucket", "objects", "count"),
+				prometheus.BuildFQName(bucketNamespace, "objects", "count"),
 				"Total number of objects in a bucket",
 				[]string{"bucket"}, nil),
 			prometheus.GaugeValue,
@ -469,7 +500,7 @@ func bucketUsageMetricsPrometheus(ch chan<- prometheus.Metric) {
 		for k, v := range usageInfo.ObjectSizesHistogram {
 			ch <- prometheus.MustNewConstMetric(
 				prometheus.NewDesc(
-					prometheus.BuildFQName("bucket", "objects", "histogram"),
+					prometheus.BuildFQName(bucketNamespace, "objects", "histogram"),
 					"Total number of objects of different sizes in a bucket",
 					[]string{"bucket", "object_size"}, nil),
 				prometheus.GaugeValue,
@ -497,10 +528,50 @@ func storageMetricsPrometheus(ch chan<- prometheus.Metric) {
 	onlineDisks, offlineDisks := getOnlineOfflineDisksStats(server.Disks)
 	totalDisks := offlineDisks.Merge(onlineDisks)

+	// Report total capacity
+	ch <- prometheus.MustNewConstMetric(
+		prometheus.NewDesc(
+			prometheus.BuildFQName(minioNamespace, "capacity_raw", "total"),
+			"Total capacity online in the cluster",
+			nil, nil),
+		prometheus.GaugeValue,
+		float64(GetTotalCapacity(GlobalContext)),
+	)
+
+	// Report total capacity free
+	ch <- prometheus.MustNewConstMetric(
+		prometheus.NewDesc(
+			prometheus.BuildFQName(minioNamespace, "capacity_raw_free", "total"),
+			"Total free capacity online in the cluster",
+			nil, nil),
+		prometheus.GaugeValue,
+		float64(GetTotalCapacityFree(GlobalContext)),
+	)
+
+	s, _ := objLayer.StorageInfo(GlobalContext)
+	// Report total usable capacity
+	ch <- prometheus.MustNewConstMetric(
+		prometheus.NewDesc(
+			prometheus.BuildFQName(minioNamespace, "capacity_usable", "total"),
+			"Total usable capacity online in the cluster",
+			nil, nil),
+		prometheus.GaugeValue,
+		GetTotalUsableCapacity(GlobalContext, s),
+	)
+	// Report total usable capacity free
+	ch <- prometheus.MustNewConstMetric(
+		prometheus.NewDesc(
+			prometheus.BuildFQName(minioNamespace, "capacity_usable_free", "total"),
+			"Total free usable capacity online in the cluster",
+			nil, nil),
+		prometheus.GaugeValue,
+		GetTotalUsableCapacityFree(GlobalContext, s),
+	)
+
 	// MinIO Offline Disks per node
 	ch <- prometheus.MustNewConstMetric(
 		prometheus.NewDesc(
-			prometheus.BuildFQName("minio", "disks", "offline"),
+			prometheus.BuildFQName(minioNamespace, "disks", "offline"),
 			"Total number of offline disks in current MinIO server instance",
 			nil, nil),
 		prometheus.GaugeValue,
@ -510,7 +581,7 @@ func storageMetricsPrometheus(ch chan<- prometheus.Metric) {
 	// MinIO Total Disks per node
 	ch <- prometheus.MustNewConstMetric(
 		prometheus.NewDesc(
-			prometheus.BuildFQName("minio", "disks", "total"),
+			prometheus.BuildFQName(minioNamespace, "disks", "total"),
 			"Total number of disks for current MinIO server instance",
 			nil, nil),
 		prometheus.GaugeValue,
@ -521,7 +592,7 @@ func storageMetricsPrometheus(ch chan<- prometheus.Metric) {
 		// Total disk usage by the disk
 		ch <- prometheus.MustNewConstMetric(
 			prometheus.NewDesc(
-				prometheus.BuildFQName("disk", "storage", "used"),
+				prometheus.BuildFQName(diskNamespace, "storage", "used"),
 				"Total disk storage used on the disk",
 				[]string{"disk"}, nil),
 			prometheus.GaugeValue,
@ -532,7 +603,7 @@ func storageMetricsPrometheus(ch chan<- prometheus.Metric) {
 		// Total available space in the disk
 		ch <- prometheus.MustNewConstMetric(
 			prometheus.NewDesc(
-				prometheus.BuildFQName("disk", "storage", "available"),
+				prometheus.BuildFQName(diskNamespace, "storage", "available"),
 				"Total available space left on the disk",
 				[]string{"disk"}, nil),
 			prometheus.GaugeValue,
@ -543,7 +614,7 @@ func storageMetricsPrometheus(ch chan<- prometheus.Metric) {
 		// Total storage space of the disk
 		ch <- prometheus.MustNewConstMetric(
 			prometheus.NewDesc(
-				prometheus.BuildFQName("disk", "storage", "total"),
+				prometheus.BuildFQName(diskNamespace, "storage", "total"),
 				"Total space on the disk",
 				[]string{"disk"}, nil),
 			prometheus.GaugeValue,
--- a/cmd/notification-summary.go
+++ b/cmd/notification-summary.go
@ -0,0 +1,54 @@
+/*
+ * MinIO Cloud Storage, (C) 2020 MinIO, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package cmd
+
+import (
+	"context"
+)
+
+// GetTotalCapacity sums the total capacity reported by every entry that
+// globalNotificationSys.DiskHwInfo returns for the cluster.
+func GetTotalCapacity(ctx context.Context) (capacity uint64) {
+	for _, info := range globalNotificationSys.DiskHwInfo(ctx) {
+		capacity += info.GetTotalCapacity()
+	}
+	return capacity
+}
+
+// GetTotalUsableCapacity returns the raw cluster capacity scaled by
+// StandardSCData / (StandardSCData + StandardSCParity) from the given
+// storage info. When both counts are zero the ratio is 0/0 and the result
+// is NaN.
+func GetTotalUsableCapacity(ctx context.Context, s StorageInfo) (capacity float64) {
+	total := float64(GetTotalCapacity(ctx))
+	data := float64(s.Backend.StandardSCData)
+	all := float64(s.Backend.StandardSCData + s.Backend.StandardSCParity)
+	capacity = total * (data / all)
+	return capacity
+}
+
+// GetTotalCapacityFree sums the free capacity reported by every entry that
+// globalNotificationSys.DiskHwInfo returns for the cluster.
+func GetTotalCapacityFree(ctx context.Context) (capacity uint64) {
+	for _, info := range globalNotificationSys.DiskHwInfo(ctx) {
+		capacity += info.GetTotalFreeCapacity()
+	}
+	return capacity
+}
+
+// GetTotalUsableCapacityFree returns the raw free cluster capacity scaled by
+// StandardSCData / (StandardSCData + StandardSCParity) from the given
+// storage info. When both counts are zero the ratio is 0/0 and the result
+// is NaN.
+func GetTotalUsableCapacityFree(ctx context.Context, s StorageInfo) (capacity float64) {
+	free := float64(GetTotalCapacityFree(ctx))
+	data := float64(s.Backend.StandardSCData)
+	all := float64(s.Backend.StandardSCData + s.Backend.StandardSCParity)
+	capacity = free * (data / all)
+	return capacity
+}
--- a/cmd/notification.go
+++ b/cmd/notification.go
@ -51,8 +51,8 @@ type NotificationSys struct {
 	targetResCh                chan event.TargetIDResult
 	bucketRulesMap             map[string]event.RulesMap
 	bucketRemoteTargetRulesMap map[string]map[event.TargetID]event.RulesMap
-	peerClients                []*peerRESTClient
-	allPeerClients             []*peerRESTClient
+	peerClients                []*peerRESTClient // Excludes self
+	allPeerClients             []*peerRESTClient // Includes nil client for self
 }

 // GetARNList - returns available ARNs.
@ -1294,6 +1294,21 @@ func NewNotificationSys(endpoints EndpointServerPools) *NotificationSys {
 	}
 }

+// GetPeerOnlineCount returns how many nodes are online and how many are
+// offline. A server whose State is "ok" counts as online; any other state
+// counts as offline.
+func GetPeerOnlineCount() (nodesOnline, nodesOffline int) {
+	// The local node is counted up front as online.
+	// NOTE(review): presumably ServerInfo reports peers only - confirm.
+	nodesOnline = 1
+	for _, peer := range globalNotificationSys.ServerInfo() {
+		if peer.State != "ok" {
+			nodesOffline++
+		} else {
+			nodesOnline++
+		}
+	}
+	return nodesOnline, nodesOffline
+}
+
 type eventArgs struct {
 	EventName    event.Name
 	BucketName   string
@ -1428,3 +1443,52 @@ func (sys *NotificationSys) GetBandwidthReports(ctx context.Context, buckets ...
 	}
 	return consolidatedReport
 }
+
+// GetClusterMetrics - gets the cluster metrics from all nodes excluding self.
+//
+// Metrics from every peer whose request succeeded are merged into the
+// returned channel, which is closed once all peer streams have ended or ctx
+// is done. Peers whose request failed are logged once and skipped.
+func (sys *NotificationSys) GetClusterMetrics(ctx context.Context) chan Metric {
+	g := errgroup.WithNErrs(len(sys.peerClients))
+	peerChannels := make([]<-chan Metric, len(sys.peerClients))
+	for index := range sys.peerClients {
+		if sys.peerClients[index] == nil {
+			continue
+		}
+		index := index
+		g.Go(func() error {
+			var err error
+			peerChannels[index], err = sys.peerClients[index].GetPeerMetrics(ctx)
+			return err
+		}, index)
+	}
+
+	ch := make(chan Metric)
+	var wg sync.WaitGroup
+	for index, err := range g.Wait() {
+		// Entries skipped above have neither a client nor a channel; using
+		// them would dereference nil or wait on a nil channel.
+		if sys.peerClients[index] == nil {
+			continue
+		}
+		peerAddress := sys.peerClients[index].host.String()
+		reqInfo := (&logger.ReqInfo{}).AppendTags("peerAddress", peerAddress)
+		ctx := logger.SetReqInfo(ctx, reqInfo)
+		if err != nil {
+			logger.LogOnceIf(ctx, err, peerAddress)
+			continue
+		}
+		wg.Add(1)
+		go func(ctx context.Context, peerChannel <-chan Metric, wg *sync.WaitGroup) {
+			defer wg.Done()
+			for {
+				select {
+				case m, ok := <-peerChannel:
+					if !ok {
+						return
+					}
+					// Do not block forever on a consumer that stopped reading.
+					select {
+					case ch <- m:
+					case <-ctx.Done():
+						return
+					}
+				case <-ctx.Done():
+					return
+				}
+			}
+		}(ctx, peerChannels[index], &wg)
+	}
+	// Close the merged channel only after every forwarder has finished.
+	go func(wg *sync.WaitGroup, ch chan Metric) {
+		wg.Wait()
+		close(ch)
+	}(&wg, ch)
+	return ch
+}
--- a/cmd/object-api-interface.go
+++ b/cmd/object-api-interface.go
@ -72,6 +72,13 @@ const (
 	writeLock
 )

+// BackendMetrics - represents bytes served from backend
+// together with the requests sent to it. The byte counters are read and
+// updated through the IncBytes*/GetBytes* methods using sync/atomic.
+type BackendMetrics struct {
+	// Total bytes received from the gateway backend.
+	bytesReceived uint64
+	// Total bytes sent to the gateway backend.
+	bytesSent     uint64
+	// Request counts, incremented per HTTP method by IncRequests.
+	requestStats  RequestStats
+}
+
 // ObjectLayer implements primitives for object API layer.
 type ObjectLayer interface {
 	SetDriveCount() int // Only implemented by erasure layer
@ -143,7 +150,7 @@ type ObjectLayer interface {
 	IsCompressionSupported() bool

 	// Backend related metrics
-	GetMetrics(ctx context.Context) (*Metrics, error)
+	GetMetrics(ctx context.Context) (*BackendMetrics, error)

 	// Returns health of the backend
 	Health(ctx context.Context, opts HealthOptions) HealthResult
--- a/cmd/peer-rest-client.go
+++ b/cmd/peer-rest-client.go
@ -749,7 +749,7 @@ func (client *peerRESTClient) doListen(listenCh chan interface{}, doneCh <-chan
 	dec := gob.NewDecoder(respBody)
 	for {
 		var ev event.Event
-		if err = dec.Decode(&ev); err != nil {
+		if err := dec.Decode(&ev); err != nil {
 			return
 		}
 		if len(ev.EventVersion) > 0 {
@ -906,3 +906,24 @@ func (client *peerRESTClient) MonitorBandwidth(ctx context.Context, buckets []st
 	err = dec.Decode(&bandwidthReport)
 	return &bandwidthReport, err
 }
+
+// GetPeerMetrics requests the peer's metrics and streams them back.
+//
+// On success it returns a channel carrying every Metric gob-decoded from the
+// response. The channel is closed, and the response body drained, when
+// decoding fails (including at end of stream) or when ctx is done. If the
+// request itself fails, a nil channel and the error are returned.
+func (client *peerRESTClient) GetPeerMetrics(ctx context.Context) (<-chan Metric, error) {
+	respBody, err := client.callWithContext(ctx, peerRESTMethodGetPeerMetrics, nil, nil, -1)
+	if err != nil {
+		return nil, err
+	}
+	dec := gob.NewDecoder(respBody)
+	ch := make(chan Metric)
+	go func(ch chan<- Metric) {
+		// Release the body and signal the receiver on every exit path.
+		defer func() {
+			http.DrainBody(respBody)
+			close(ch)
+		}()
+		for {
+			var metric Metric
+			if err := dec.Decode(&metric); err != nil {
+				return
+			}
+			// Give up when the caller is gone instead of blocking forever
+			// on a channel nobody reads.
+			select {
+			case ch <- metric:
+			case <-ctx.Done():
+				return
+			}
+		}
+	}(ch)
+	return ch, nil
+}
--- a/cmd/peer-rest-common.go
+++ b/cmd/peer-rest-common.go
@ -58,6 +58,7 @@ const (
 	peerRESTMethodGetBandwidth           = "/bandwidth"
 	peerRESTMethodGetMetacacheListing    = "/getmetacache"
 	peerRESTMethodUpdateMetacacheListing = "/updatemetacache"
+	peerRESTMethodGetPeerMetrics         = "/peermetrics"
 )

 const (
--- a/cmd/peer-rest-server.go
+++ b/cmd/peer-rest-server.go
@ -801,7 +801,7 @@ func (s *peerRESTServer) SignalServiceHandler(w http.ResponseWriter, r *http.Req
 // ListenHandler sends http trace messages back to peer rest client
 func (s *peerRESTServer) ListenHandler(w http.ResponseWriter, r *http.Request) {
 	if !s.IsValid(w, r) {
-		s.writeErrorResponse(w, errors.New("Invalid request"))
+		s.writeErrorResponse(w, errors.New("invalid request"))
 		return
 	}

@ -809,7 +809,7 @@ func (s *peerRESTServer) ListenHandler(w http.ResponseWriter, r *http.Request) {

 	var prefix string
 	if len(values[peerRESTListenPrefix]) > 1 {
-		s.writeErrorResponse(w, errors.New("Invalid request"))
+		s.writeErrorResponse(w, errors.New("invalid request"))
 		return
 	}

@ -824,7 +824,7 @@ func (s *peerRESTServer) ListenHandler(w http.ResponseWriter, r *http.Request) {

 	var suffix string
 	if len(values[peerRESTListenSuffix]) > 1 {
-		s.writeErrorResponse(w, errors.New("Invalid request"))
+		s.writeErrorResponse(w, errors.New("invalid request"))
 		return
 	}

@ -1004,7 +1004,7 @@ func (s *peerRESTServer) IsValid(w http.ResponseWriter, r *http.Request) bool {
 // GetBandwidth gets the bandwidth for the buckets requested.
 func (s *peerRESTServer) GetBandwidth(w http.ResponseWriter, r *http.Request) {
 	if !s.IsValid(w, r) {
-		s.writeErrorResponse(w, errors.New("Invalid request"))
+		s.writeErrorResponse(w, errors.New("invalid request"))
 		return
 	}
 	bucketsString := r.URL.Query().Get("buckets")
@ -1025,6 +1025,29 @@ func (s *peerRESTServer) GetBandwidth(w http.ResponseWriter, r *http.Request) {
 	w.(http.Flusher).Flush()
 }

+// GetPeerMetrics gets the metrics to be federated across peers.
+//
+// After validating the request it writes a 200 status, then gob-encodes every
+// metric received from ReportMetrics onto the response and flushes once the
+// metric channel is closed.
+func (s *peerRESTServer) GetPeerMetrics(w http.ResponseWriter, r *http.Request) {
+	if !s.IsValid(w, r) {
+		s.writeErrorResponse(w, errors.New("invalid request"))
+		// Stop here: an invalid request must not be served any metrics.
+		return
+	}
+	w.WriteHeader(http.StatusOK)
+	w.(http.Flusher).Flush()
+
+	enc := gob.NewEncoder(w)
+
+	ch := ReportMetrics(r.Context(), GetGeneratorsForPeer)
+	for m := range ch {
+		if err := enc.Encode(m); err != nil {
+			s.writeErrorResponse(w, errors.New("Encoding metric failed: "+err.Error()))
+			return
+		}
+	}
+	w.(http.Flusher).Flush()
+}
+
 // registerPeerRESTHandlers - register peer rest router.
 func registerPeerRESTHandlers(router *mux.Router) {
 	server := &peerRESTServer{}
@ -1064,4 +1087,5 @@ func registerPeerRESTHandlers(router *mux.Router) {
 	subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodGetBandwidth).HandlerFunc(httpTraceHdrs(server.GetBandwidth))
 	subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodGetMetacacheListing).HandlerFunc(httpTraceHdrs(server.GetMetacacheListingHandler))
 	subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodUpdateMetacacheListing).HandlerFunc(httpTraceHdrs(server.UpdateMetacacheListingHandler))
+	subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodGetPeerMetrics).HandlerFunc(httpTraceHdrs(server.GetPeerMetrics))
 }
--- a/docs/metrics/README.md
+++ b/docs/metrics/README.md
@ -13,8 +13,15 @@ Read more on how to use these endpoints in [MinIO healthcheck guide](https://git

 ### Prometheus Probe

-MinIO server exposes Prometheus compatible data on a single endpoint. By default, the endpoint is authenticated.
+MinIO allows reading metrics for the entire cluster from any single node. The cluster wide metrics can be read at
+`<Address for MinIO Service>/minio/v2/metrics/cluster`.

- Prometheus data available at `/minio/prometheus/metrics`
+The additional node specific metrics which include go metrics or process metrics are exposed at
+`<Address for MinIO Node>/minio/v2/metrics/node`.

 To use this endpoint, setup Prometheus to scrape data from this endpoint. Read more on how to configure and use Prometheus to monitor MinIO server in [How to monitor MinIO server with Prometheus](https://github.com/minio/minio/blob/master/docs/metrics/prometheus/README.md).
+
+**Deprecated metrics monitoring**
+
+- Prometheus data available at `/minio/prometheus/metrics` is deprecated.
+
--- a/docs/metrics/prometheus/README.md
+++ b/docs/metrics/prometheus/README.md
@ -1,8 +1,13 @@
 # How to monitor MinIO server with Prometheus [![Slack](https://slack.min.io/slack?type=svg)](https://slack.min.io)

-[Prometheus](https://prometheus.io) is a cloud-native monitoring platform, built originally at SoundCloud. Prometheus offers a multi-dimensional data model with time series data identified by metric name and key/value pairs. The data collection happens via a pull model over HTTP/HTTPS. Targets to pull data from are discovered via service discovery or static configuration.
+[Prometheus](https://prometheus.io) is a cloud-native monitoring platform. 

-MinIO exports Prometheus compatible data by default as an authorized endpoint at `/minio/prometheus/metrics`. Users looking to monitor their MinIO instances can point Prometheus configuration to scrape data from this endpoint.
+Prometheus offers a multi-dimensional data model with time series data identified by metric name and key/value pairs. 
+The data collection happens via a pull model over HTTP/HTTPS.
+
+MinIO exports Prometheus compatible data by default as an authorized endpoint at `/minio/v2/metrics/cluster`. 
+
+Users looking to monitor their MinIO instances can point Prometheus configuration to scrape data from this endpoint.

 This document explains how to setup Prometheus and configure it to scrape data from MinIO servers.

@ -20,7 +25,8 @@ This document explains how to setup Prometheus and configure it to scrape data f
 - [List of metrics exposed by MinIO](#list-of-metrics-exposed-by-minio)

 ## Prerequisites
-To get started with MinIO, refer [MinIO QuickStart Document](https://docs.min.io/docs/minio-quickstart-guide). Follow below steps to get started with MinIO monitoring using Prometheus.
+To get started with MinIO, refer [MinIO QuickStart Document](https://docs.min.io/docs/minio-quickstart-guide). 
+Follow below steps to get started with MinIO monitoring using Prometheus.

 ### 1. Download Prometheus

@ -68,7 +74,7 @@ The command will generate the `scrape_configs` section of the prometheus.yml as
 scrape_configs:
 - job_name: minio-job
  bearer_token: <secret>
-  metrics_path: /minio/prometheus/metrics
+  metrics_path: /minio/v2/metrics/cluster
  scheme: http
  static_configs:
  - targets: ['localhost:9000']
@ -77,16 +83,26 @@ scrape_configs:
 #### 3.2 Public Prometheus config

 If Prometheus endpoint authentication type is set to `public`. Following prometheus config is sufficient to start scraping metrics data from MinIO.
-
+This can be collected from any server once per collection.
+##### Cluster
 ```yaml
 scrape_configs:
 - job_name: minio-job
-  metrics_path: /minio/prometheus/metrics
+  metrics_path: /minio/v2/metrics/cluster
+  scheme: http
+  static_configs:
+  - targets: ['localhost:9000']
+```
+##### Node
+Optionally, you can also collect per-node metrics. This needs to be configured separately for each server instance.
+```yaml
+scrape_configs:
+- job_name: minio-job
+  metrics_path: /minio/v2/metrics/node
  scheme: http
  static_configs:
  - targets: ['localhost:9000']
 ```
-
 ### 4. Update `scrape_configs` section in prometheus.yml

 To authorize every scrape request, copy and paste the generated `scrape_configs` section in the prometheus.yml and restart the Prometheus service.
@ -103,172 +119,16 @@ Here `prometheus.yml` is the name of configuration file. You can now see MinIO m

 ### 6. Configure Grafana

-After Prometheus is configured, you can use Grafana to visualize MinIO metrics. Refer the [document here to setup Grafana with MinIO prometheus metrics](https://github.com/minio/minio/blob/master/docs/metrics/prometheus/grafana/README.md).
+After Prometheus is configured, you can use Grafana to visualize MinIO metrics. 
+Refer the [document here to setup Grafana with MinIO prometheus metrics](https://github.com/minio/minio/blob/master/docs/metrics/prometheus/grafana/README.md).

 ## List of metrics exposed by MinIO

-MinIO server exposes the following metrics on `/minio/prometheus/metrics` endpoint. All of these can be accessed via Prometheus dashboard. The full list of exposed metrics along with their definition is available in the demo server at https://play.min.io:9000/minio/prometheus/metrics
-
-These are the new set of metrics which will be in effect after `RELEASE.2019-10-16*`. Some of the key changes in this update are listed below.
-    - Metrics are bound the respective nodes and is not cluster-wide. Each and every node in a cluster will expose its own metrics.
-    - Additional metrics to cover the s3 and internode traffic statistics were added.
-    - Metrics that records the http statistics and latencies are labeled to their respective APIs (putobject,getobject etc).
-    - Disk usage metrics are distributed and labeled to the respective disk paths.
-
-For more details, please check the `Migration guide for the new set of metrics`.
-
-The list of metrics and its definition are as follows. (NOTE: instance here is one MinIO node)
-
-> NOTES:
-    > 1. Instance here is one MinIO node.
-    > 2. `s3 requests` exclude internode requests.
-
-### Default set of information
-| name        | description                     |
-|:------------|:--------------------------------|
-| `go_`       | all standard go runtime metrics |
-| `process_`  | all process level metrics       |
-| `promhttp_` | all prometheus scrape metrics   |
-
-### MinIO node specific information
-| name                       | description                                                                    |
-|:---------------------------|:-------------------------------------------------------------------------------|
-| `minio_version_info`       | Current MinIO version with its commit-id                                       |
-| `minio_disks_offline`      | Total number of offline disks on current MinIO instance                        |
-| `minio_disks_total`        | Total number of disks on current MinIO instance                                |
-
-### Disk metrics are labeled by 'disk' which indentifies each disk
-| name                       | description                                                                    |
-|:---------------------------|:-------------------------------------------------------------------------------|
-| `disk_storage_total`       | Total size of the disk                                                         |
-| `disk_storage_used`        | Total disk space used per disk                                                 |
-| `disk_storage_available`   | Total available disk space per disk                                            |
-
-### S3 API metrics are labeled by 'api' which identifies different S3 API requests
-| name                       | description                                                                    |
-|:---------------------------|:-------------------------------------------------------------------------------|
-| `s3_requests_total`        | Total number of s3 requests in current MinIO instance                          |
-| `s3_errors_total`          | Total number of errors in s3 requests in current MinIO instance                |
-| `s3_requests_current`      | Total number of active s3 requests in current MinIO instance                   |
-| `s3_rx_bytes_total`        | Total number of s3 bytes received by current MinIO server instance             |
-| `s3_tx_bytes_total`        | Total number of s3 bytes sent by current MinIO server instance                 |
-| `s3_ttfb_seconds`          | Histogram that holds the latency information of the requests                   |
-
-#### Internode metrics only available in a distributed setup
-| name                       | description                                                                    |
-|:---------------------------|:-------------------------------------------------------------------------------|
-| `internode_rx_bytes_total` | Total number of internode bytes received by current MinIO server instance      |
-| `internode_tx_bytes_total` | Total number of bytes sent to the other nodes by current MinIO server instance |
-
-Apart from above metrics, MinIO also exposes below mode specific metrics
-
-### Bucket usage specific metrics
-All metrics are labeled by `bucket`, each metric is displayed per bucket. `buckets_objects_histogram` is additionally labeled by `object_size` string which is represented by any of the following values
-
- *LESS_THAN_1024_B*
- *BETWEEN_1024_B_AND_1_MB*
- *BETWEEN_1_MB_AND_10_MB*
- *BETWEEN_10_MB_AND_64_MB*
- *BETWEEN_64_MB_AND_128_MB*
- *BETWEEN_128_MB_AND_512_MB*
- *GREATER_THAN_512_MB*
-
-Units defintions:
- 1 MB = 1024 KB
- 1 KB = 1024 B
-
-| name                                | description                                         |
-|:------------------------------------|:----------------------------------------------------|
-| `bucket_usage_size`                 | Total size of the bucket                            |
-| `bucket_objects_count`              | Total number of objects in a bucket                 |
-| `bucket_objects_histogram`          | Total number of objects filtered by different sizes |
-| `bucket_replication_pending_size`   | Total capacity not replicated                       |
-| `bucket_replication_failed_size`    | Total capacity failed to replicate at least once    |
-| `bucket_replication_successful_size`| Total capacity successfully replicated              |
-| `bucket_replication_received_size`  | Total capacity received as replicated objects       |
-
-### Cache specific metrics
-
-MinIO Gateway instances enabled with Disk-Caching expose caching related metrics.
-
-#### Global cache metrics
-| name                 | description                                       |
-|:---------------------|:--------------------------------------------------|
-| `cache_hits_total`   | Total number of cache hits                        |
-| `cache_misses_total` | Total number of cache misses                      |
-| `cache_data_served`  | Total number of bytes served from cache           |
-
-#### Per disk cache metrics
-| name                   | description                                                                      |
-|:-----------------------|:---------------------------------------------------------------------------------|
-| `cache_usage_size`     | Total cache usage in bytes                                                       |
-| `cache_total_capacity` | Total size of cache disk                                                         |
-| `cache_usage_percent`  | Total percentage cache usage                                                     |
-| `cache_usage_state`    | Indicates cache usage is high or low, relative to current cache 'quota' settings |
-
-`cache_usage_state` holds only two states
-
- '1' indicates high disk usage
- '0' indicates low disk usage
-
-### Gateway specific metrics
-MinIO Gateway instance exposes metrics related to Gateway communication with the cloud backend (S3, Azure & GCS Gateway).
-
-`<gateway_type>` changes based on the gateway in use can be 's3', 'gcs' or 'azure'. Other metrics are labeled with `method` that identifies HTTP GET, HEAD, PUT and POST requests to the backend.
-
-| name                                    | description                                                                |
-|:----------------------------------------|:---------------------------------------------------------------------------|
-| `gateway_<gateway_type>_requests`       | Total number of requests made to the gateway backend                       |
-| `gateway_<gateway_type>_bytes_sent`     | Total number of bytes sent to cloud backend (in PUT & POST Requests)       |
-| `gateway_<gateway_type>_bytes_received` | Total number of bytes received from cloud backend (in GET & HEAD Requests) |
-
-Note that this is currently only support for Azure, S3 and GCS Gateway.
-
-### MinIO self-healing metrics - `self_heal_*`
-
-MinIO exposes self-healing related metrics for erasure-code deployments _only_. These metrics are _not_ available on Gateway or Single Node, Single Drive deployments. Note that these metrics will be exposed _only_ when there is a relevant event happening on MinIO server.
-
-| name                                 | description                                                                                                                                                                 |
-|:-------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `self_heal_time_since_last_activity` | Time elapsed since last self-healing related activity                                                                                                                       |
-| `self_heal_objects_scanned`          | Number of objects scanned by self-healing thread in its current run. This will reset when a fresh self-healing run starts. This is labeled with the object type scanned     |
-| `self_heal_objects_healed`           | Number of objects healing by self-healing thread in its current run. This will reset when a fresh self-healing run starts. This is labeled with the object type scanned     |
-| `self_heal_objects_heal_failed`      | Number of objects for which self-healing failed in its current run. This will reset when a fresh self-healing run starts. This is labeled with disk status and its endpoint |
-
-## Migration guide for the new set of metrics
-
-This migration guide applies for older releases or any releases before `RELEASE.2019-10-23*`
-
-### MinIO disk level metrics - `disk_*`
-
-The migrations include
-
- `minio_total_disks` to `minio_disks_total`
- `minio_offline_disks` to `minio_disks_offline`
-
-### MinIO disk level metrics - `disk_storage_*`
-
-These metrics have one label.
-
- `disk`: Holds the disk path
-
-The migrations include
-
- `minio_disk_storage_used_bytes` to `disk_storage_used`
- `minio_disk_storage_available_bytes` to `disk_storage_available`
- `minio_disk_storage_total_bytes` to `disk_storage_total`
-
-### MinIO network level metrics
-
-These metrics are detailed to cover the s3 and internode network statistics.
-
-The migrations include
-
- `minio_network_sent_bytes_total` to `s3_tx_bytes_total` and `internode_tx_bytes_total`
- `minio_network_received_bytes_total` to `s3_rx_bytes_total` and `internode_rx_bytes_total`
+MinIO server exposes the following metrics on the `/minio/v2/metrics/cluster` endpoint. 
+All of these can be accessed via the Prometheus dashboard. 
+A sample list of exposed metrics along with their definitions is available from the demo server at 
+`curl https://play.min.io:9000/minio/v2/metrics/cluster`

-Some of the additional metrics added were
+### List of metrics reported 

- `s3_requests_total`
- `s3_errors_total`
- `s3_ttfb_seconds`
+[The list of metrics reported can be found here](https://github.com/minio/minio/blob/master/docs/metrics/prometheus/list.md)
--- a/docs/metrics/prometheus/list.md
+++ b/docs/metrics/prometheus/list.md
@ -0,0 +1,47 @@
+# List of metrics reported cluster wide
+
+Each metric includes a label for the server that calculated the metric.
+Each metric has a label for the server that generated the metric.
+
+These metrics can be obtained from any MinIO server once per collection.
+
+| Name                                           | Description                                                                                                                 |
+|:-----------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------|
+|`minio_bucket_objects_size_distribution`        |Distribution of object sizes in the bucket, includes label for the bucket name.                                              |
+|`minio_bucket_replication_failed_bytes`         |Total number of bytes failed at least once to replicate.                                                                     |
+|`minio_bucket_replication_pending_bytes`        |Total bytes pending to replicate.                                                                                            |
+|`minio_bucket_replication_received_bytes`       |Total number of bytes replicated to this bucket from another source bucket.                                                  |
+|`minio_bucket_replication_sent_bytes`           |Total number of bytes replicated to the target bucket.                                                                       |
+|`minio_bucket_usage_object_total`               |Total number of objects                                                                                                      |
+|`minio_bucket_usage_total_bytes`                |Total bucket size in bytes                                                                                                   |
+|`minio_cluster_capacity_raw_free_bytes`         |Total free capacity online in the cluster.                                                                                   |
+|`minio_cluster_capacity_raw_total_bytes`        |Total capacity online in the cluster.                                                                                        |
+|`minio_cluster_capacity_usable_free_bytes`      |Total free usable capacity online in the cluster.                                                                            |
+|`minio_cluster_capacity_usable_total_bytes`     |Total usable capacity online in the cluster.                                                                                 |
+|`minio_cluster_disk_offline_total`              |Total disks offline.                                                                                                         |
+|`minio_cluster_disk_online_total`               |Total disks online.                                                                                                          |
+|`minio_cluster_nodes_offline_total`             |Total number of MinIO nodes offline.                                                                                         |
+|`minio_cluster_nodes_online_total`              |Total number of MinIO nodes online.                                                                                          |
+|`minio_heal_objects_error_total`                |Objects for which healing failed in current self healing run                                                                 |
+|`minio_heal_objects_heal_total`                 |Objects healed in current self healing run                                                                                   |
+|`minio_heal_objects_total`                      |Objects scanned in current self healing run                                                                                  |
+|`minio_heal_time_last_activity_nano_seconds`    |Time elapsed (in nano seconds) since last self healing activity. This is set to -1 until initial self heal activity          |
+|`minio_inter_node_traffic_received_bytes`       |Total number of bytes received from other peer nodes.                                                                        |
+|`minio_inter_node_traffic_sent_bytes`           |Total number of bytes sent to the other peer nodes.                                                                          |
+|`minio_node_disk_free_bytes`                    |Total storage available on a disk.                                                                                           |
+|`minio_node_disk_total_bytes`                   |Total storage on a disk.                                                                                                     |
+|`minio_node_disk_used_bytes`                    |Total storage used on a disk.                                                                                                |
+|`minio_s3_requests_error_total`                 |Total number of S3 requests with errors                                                                                      |
+|`minio_s3_requests_inflight_total`              |Total number of S3 requests currently in flight.                                                                             |
+|`minio_s3_requests_total`                       |Total number of S3 requests                                                                                                  |
+|`minio_s3_time_ttbf_seconds_distribution`       |Distribution of the time to first byte across API calls.                                                                     |
+|`minio_s3_traffic_received_bytes`               |Total number of s3 bytes received.                                                                                           |
+|`minio_s3_traffic_sent_bytes`                   |Total number of s3 bytes sent                                                                                                |
+|`minio_cache_hits_total`                        |Total number of disk cache hits                                                                                              |
+|`minio_cache_missed_total`                      |Total number of disk cache misses                                                                                            |
+|`minio_cache_sent_bytes`                        |Total number of bytes served from cache                                                                                      |
+|`minio_cache_total_bytes`                       |Total size of cache disk in bytes                                                                                            |
+|`minio_cache_usage_info`                        |Total percentage cache usage, value of 1 indicates high and 0 low, label level is set as well                                |
+|`minio_cache_used_bytes`                        |Current cache usage in bytes                                                                                                 |
+|`minio_software_commit_info`                    |Git commit hash for the MinIO release.                                                                                       |
+|`minio_software_version_info`                   |MinIO Release tag for the server                                                                                             |
--- a/go.mod
+++ b/go.mod
@ -65,6 +65,9 @@ require (
 	github.com/pierrec/lz4 v2.5.2+incompatible
 	github.com/pkg/errors v0.9.1
 	github.com/prometheus/client_golang v1.8.0
+	github.com/quasilyte/go-ruleguard v0.2.1 // indirect
+	github.com/quasilyte/go-ruleguard/dsl/fluent v0.0.0-20201222093424-5d7e62a465d3 // indirect
+	github.com/prometheus/client_model v0.2.0
 	github.com/rjeczalik/notify v0.9.2
 	github.com/rs/cors v1.7.0
 	github.com/secure-io/sio-go v0.3.0
--- a/go.sum
+++ b/go.sum
@ -529,6 +529,9 @@ github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+Gx
 github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU=
 github.com/prometheus/procfs v0.2.0 h1:wH4vA7pcjKuZzjF7lM8awk4fnuJO6idemZXoKnULUx4=
 github.com/prometheus/procfs v0.2.0/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU=
+github.com/quasilyte/go-ruleguard v0.2.1 h1:56eRm0daAyny9UhJnmtJW/UyLZQusukBAB8oT8AHKHo=
+github.com/quasilyte/go-ruleguard v0.2.1/go.mod h1:hN2rVc/uS4bQhQKTio2XaSJSafJwqBUWWwtssT3cQmc=
+github.com/quasilyte/go-ruleguard/dsl/fluent v0.0.0-20201222093424-5d7e62a465d3/go.mod h1:P7JlQWFT7jDcFZMtUPQbtGzzzxva3rBn6oIF+LPwFcM=
 github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a h1:9ZKAASQSHhDYGoxY8uLVpewe1GDZ2vu2Tr/vTdVAkFQ=
 github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4=
 github.com/rcrowley/go-metrics v0.0.0-20200313005456-10cdbea86bc0 h1:MkV+77GLUNo5oJ0jf870itWm3D0Sjh7+Za9gazKc5LQ=
@ -619,6 +622,7 @@ github.com/xdg/stringprep v1.0.0 h1:d9X0esnoa3dFsV0FG35rAT0RIhYFlPq7MiP+DW89La0=
 github.com/xdg/stringprep v1.0.0/go.mod h1:Jhud4/sHMO4oL310DaZAKk9ZaJ08SJfe+sJh0HrGL1Y=
 github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2 h1:eY9dn8+vbi4tKz5Qo6v2eYzo7kUS51QINcR5jNpbZS8=
 github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU=
+github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
 github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
 go.etcd.io/bbolt v1.3.3 h1:MUGmc65QhB3pIlaQ5bB4LwqSj6GIonVJXpZiaKNyaKk=
 go.etcd.io/bbolt v1.3.3/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU=
@ -711,6 +715,7 @@ golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJ
 golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9 h1:SQFwaSi55rU7vdNs9Yr0Z324VNlrF+0wMqRXT4St8ck=
 golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@ -782,6 +787,7 @@ golang.org/x/tools v0.0.0-20191029190741-b9c20aec41a5/go.mod h1:b+2E5dAYhXwXZwtn
 golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
 golang.org/x/tools v0.0.0-20191216052735-49a3e744a425/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
 golang.org/x/tools v0.0.0-20200103221440-774c71fcf114/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
+golang.org/x/tools v0.0.0-20200812195022-5ae4c3c160a0/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA=
 golang.org/x/tools v0.0.0-20201105001634-bc3cf281b174/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
 golang.org/x/tools v0.0.0-20210115202250-e0d201561e39 h1:BTs2GMGSMWpgtCpv1CE7vkJTv7XcHdcLLnAMu7UbgTY=
 golang.org/x/tools v0.0.0-20210115202250-e0d201561e39/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
--- a/pkg/madmin/health.go
+++ b/pkg/madmin/health.go
@ -158,8 +158,8 @@ type PerfInfo struct {
 // ServerDrivesInfo - Drive info about all drives in a single MinIO node
 type ServerDrivesInfo struct {
 	Addr     string          `json:"addr"`
-	Serial   []DrivePerfInfo `json:"serial,omitempty"`
-	Parallel []DrivePerfInfo `json:"parallel,omitempty"`
+	Serial   []DrivePerfInfo `json:"serial,omitempty"`   // Drive perf info collected one drive at a time
+	Parallel []DrivePerfInfo `json:"parallel,omitempty"` // Drive perf info collected in parallel
 	Error    string          `json:"error,omitempty"`
 }

@ -316,3 +316,27 @@ func (adm *AdminClient) ServerHealthInfo(ctx context.Context, healthDataTypes []
 	return respChan

 }
+
+// GetTotalCapacity returns the sum of the Total field over every entry in
+// s.Usage.
+func (s *ServerDiskHwInfo) GetTotalCapacity() (capacity uint64) {
+	var sum uint64
+	for _, stat := range s.Usage {
+		sum += stat.Total
+	}
+	return sum
+}
+
+// GetTotalFreeCapacity returns the sum of the Free field over every entry in
+// s.Usage.
+func (s *ServerDiskHwInfo) GetTotalFreeCapacity() (capacity uint64) {
+	var sum uint64
+	for _, stat := range s.Usage {
+		sum += stat.Free
+	}
+	return sum
+}
+
+// GetTotalUsedCapacity returns the sum of the Used field over every entry in
+// s.Usage.
+func (s *ServerDiskHwInfo) GetTotalUsedCapacity() (capacity uint64) {
+	var sum uint64
+	for _, stat := range s.Usage {
+		sum += stat.Used
+	}
+	return sum
+}