Add support for self-healing related metrics in Prometheus (#9079)

Fixes #8988

Co-authored-by: Anis Elleuch <vadmeste@users.noreply.github.com>
Co-authored-by: Harshavardhana <harsha@minio.io>
Nitish Tiwari
parent 813e0fc1a8
commit 6b984410d5
  1. cmd/admin-heal-ops.go (109 changes)
  2. cmd/global-heal.go (19 changes)
  3. cmd/metrics.go (341 changes)
  4. docs/metrics/prometheus/README.md (9 changes)

@@ -318,9 +318,7 @@ type healSequence struct {
// List of entities (format, buckets, objects) to heal
sourceCh chan string
// Report healing progress, false if this is a background
// healing since currently there is no entity which will
// receive realtime healing status
// Report healing progress
reportProgress bool
// time at which heal sequence was started
@@ -352,14 +350,23 @@ type healSequence struct {
// the last result index sent to client
lastSentResultIndex int64
// Number of total items scanned
scannedItemsCount int64
// Number of total items scanned against item type
scannedItemsMap map[madmin.HealItemType]int64
// Number of total items healed against item type
healedItemsMap map[madmin.HealItemType]int64
// Number of total items where healing failed against endpoint and drive state
healFailedItemsMap map[string]int64
// The time of the last scan/heal activity
lastHealActivity time.Time
// Holds the request-info for logging
ctx context.Context
// used to lock this structure as it is concurrently accessed
mutex sync.RWMutex
}
// NewHealSequence - creates healSettings, assumes bucket and
@@ -390,7 +397,81 @@ func newHealSequence(bucket, objPrefix, clientAddr string,
traverseAndHealDoneCh: make(chan error),
stopSignalCh: make(chan struct{}),
ctx: ctx,
scannedItemsMap: make(map[madmin.HealItemType]int64),
healedItemsMap: make(map[madmin.HealItemType]int64),
healFailedItemsMap: make(map[string]int64),
}
}
// resetHealStatusCounters - reset the healSequence status counters between
// each monthly background heal scanning activity.
// This is used only in case of Background healing scenario, where
// we use a single long running healSequence which reactively heals
// objects passed to the SourceCh.
func (h *healSequence) resetHealStatusCounters() {
h.mutex.Lock()
defer h.mutex.Unlock()
h.currentStatus.Items = []madmin.HealResultItem{}
h.lastSentResultIndex = 0
h.scannedItemsMap = make(map[madmin.HealItemType]int64)
h.healedItemsMap = make(map[madmin.HealItemType]int64)
h.healFailedItemsMap = make(map[string]int64)
}
// getScannedItemsCount - returns a count of all scanned items
func (h *healSequence) getScannedItemsCount() int64 {
var count int64
h.mutex.RLock()
defer h.mutex.RUnlock()
for _, v := range h.scannedItemsMap {
count = count + v
}
return count
}
// getScannedItemsMap - returns map of all scanned items against type
func (h *healSequence) getScannedItemsMap() map[madmin.HealItemType]int64 {
h.mutex.RLock()
defer h.mutex.RUnlock()
// Make a copy before returning the value
retMap := make(map[madmin.HealItemType]int64, len(h.scannedItemsMap))
for k, v := range h.scannedItemsMap {
retMap[k] = v
}
return retMap
}
// getHealedItemsMap - returns the map of all healed items against type
func (h *healSequence) getHealedItemsMap() map[madmin.HealItemType]int64 {
h.mutex.RLock()
defer h.mutex.RUnlock()
// Make a copy before returning the value
retMap := make(map[madmin.HealItemType]int64, len(h.healedItemsMap))
for k, v := range h.healedItemsMap {
retMap[k] = v
}
return retMap
}
// gethealFailedItemsMap - returns map of all items where heal failed against
// drive endpoint and status
func (h *healSequence) gethealFailedItemsMap() map[string]int64 {
h.mutex.RLock()
defer h.mutex.RUnlock()
// Make a copy before returning the value
retMap := make(map[string]int64, len(h.healFailedItemsMap))
for k, v := range h.healFailedItemsMap {
retMap[k] = v
}
return retMap
}
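The three getters above all follow the same pattern: take the read lock, copy the map, and hand the copy to the caller so the Prometheus collector never iterates over the live map while a heal worker is updating it. A minimal self-contained sketch of that pattern, using hypothetical names rather than the actual healSequence fields:

```go
package main

import (
	"fmt"
	"sync"
)

// counters is an illustrative stand-in for the healSequence counter maps:
// writes go through the write lock, reads return a defensive copy.
type counters struct {
	mu     sync.RWMutex
	counts map[string]int64
}

// inc bumps one counter under the write lock.
func (c *counters) inc(key string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.counts[key]++
}

// snapshot copies the map under the read lock so callers can iterate freely.
func (c *counters) snapshot() map[string]int64 {
	c.mu.RLock()
	defer c.mu.RUnlock()
	out := make(map[string]int64, len(c.counts))
	for k, v := range c.counts {
		out[k] = v
	}
	return out
}

func main() {
	c := &counters{counts: make(map[string]int64)}
	c.inc("object")
	c.inc("object")
	fmt.Println(c.snapshot()) // map[object:2]
}
```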
// isQuitting - determines if the heal sequence is quitting (due to an
@@ -556,6 +637,22 @@ func (h *healSequence) queueHealTask(path string, healType madmin.HealItemType)
// Wait for answer and push result to the client
res := <-respCh
if !h.reportProgress {
h.mutex.Lock()
defer h.mutex.Unlock()
// Progress is not reported in case of background heal processing.
// Instead we increment relevant counter based on the heal result
// for prometheus reporting.
if res.err != nil && !isErrObjectNotFound(res.err) {
for _, d := range res.result.After.Drives {
// For failed items we report the endpoint and drive state
// This will help users take corrective actions for drives
h.healFailedItemsMap[d.Endpoint+","+d.State]++
}
} else {
// Only object type reported for successful healing
h.healedItemsMap[res.result.Type]++
}
return nil
}
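In the background-heal path above, each failure is keyed by the drive endpoint and its state joined with a comma, and the Prometheus collector later splits that key back into the two label values. A small hedged sketch of that round trip; the helper names are hypothetical, the actual change inlines this logic:

```go
package main

import (
	"fmt"
	"strings"
)

// failedKey joins a drive endpoint and its state the same way the background
// heal path keys its failure counters ("endpoint,state").
func failedKey(endpoint, state string) string {
	return endpoint + "," + state
}

// splitFailedKey recovers the two Prometheus label values from a stored key.
func splitFailedKey(key string) (endpoint, state string) {
	parts := strings.SplitN(key, ",", 2)
	return parts[0], parts[1]
}

func main() {
	k := failedKey("http://node2:9000/data1", "offline")
	ep, st := splitFailedKey(k)
	fmt.Println(ep, st) // http://node2:9000/data1 offline
}
```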
res.result.Type = healType
@@ -599,7 +696,7 @@ func (h *healSequence) healItemsFromSourceCh() error {
logger.LogIf(h.ctx, err)
}
h.scannedItemsCount++
h.scannedItemsMap[itemType]++
h.lastHealActivity = UTCNow()
case <-h.traverseAndHealDoneCh:
return nil

@@ -61,6 +61,9 @@ func newBgHealSequence(numDisks int) *healSequence {
stopSignalCh: make(chan struct{}),
ctx: ctx,
reportProgress: false,
scannedItemsMap: make(map[madmin.HealItemType]int64),
healedItemsMap: make(map[madmin.HealItemType]int64),
healFailedItemsMap: make(map[string]int64),
}
}
@@ -71,7 +74,7 @@ func getLocalBackgroundHealStatus() madmin.BgHealState {
}
return madmin.BgHealState{
ScannedItemsCount: bgSeq.scannedItemsCount,
ScannedItemsCount: bgSeq.getScannedItemsCount(),
LastHealActivity: bgSeq.lastHealActivity,
NextHealRound: UTCNow().Add(durationToNextHealRound(bgSeq.lastHealActivity)),
}
@@ -126,12 +129,24 @@ func durationToNextHealRound(lastHeal time.Time) time.Duration {
// Healing leader will take the charge of healing all erasure sets
func execLeaderTasks(ctx context.Context, z *xlZones) {
lastScanTime := UTCNow() // So that we don't heal immediately, but after one month.
// So that we don't heal immediately, but after one month.
lastScanTime := UTCNow()
// Get background heal sequence to send elements to heal
var bgSeq *healSequence
var ok bool
for {
bgSeq, ok = globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID)
if ok {
break
}
time.Sleep(time.Second)
}
for {
select {
case <-ctx.Done():
return
case <-time.NewTimer(durationToNextHealRound(lastScanTime)).C:
bgSeq.resetHealStatusCounters()
for _, zone := range z.zones {
// Heal set by set
for i, set := range zone.sets {

@@ -19,6 +19,8 @@ package cmd
import (
"context"
"net/http"
"strings"
"time"
"github.com/minio/minio/cmd/logger"
"github.com/prometheus/client_golang/prometheus"
@@ -81,117 +83,166 @@ func (c *minioCollector) Collect(ch chan<- prometheus.Metric) {
// Expose MinIO's version information
minioVersionInfo.WithLabelValues(Version, CommitID).Set(float64(1.0))
// Fetch disk space info
objLayer := newObjectLayerFn()
// Service not initialized yet
if objLayer == nil {
storageMetricsPrometheus(ch)
networkMetricsPrometheus(ch)
httpMetricsPrometheus(ch)
gatewayMetricsPrometheus(ch)
healingMetricsPrometheus(ch)
}
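Collect now simply fans out to per-subsystem helpers (storage, network, HTTP, gateway, healing). For context, this is roughly how such a custom prometheus.Collector gets registered and served; the type name, port, and endpoint below are illustrative and not taken from this change:

```go
package main

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// demoCollector stands in for minioCollector: implementing Describe and
// Collect makes it a valid prometheus.Collector that emits const metrics
// freshly on every scrape.
type demoCollector struct {
	desc *prometheus.Desc
}

func (c *demoCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- c.desc
}

func (c *demoCollector) Collect(ch chan<- prometheus.Metric) {
	// Metrics are built on the fly per scrape, mirroring how the MinIO
	// collector calls helpers such as healingMetricsPrometheus(ch).
	ch <- prometheus.MustNewConstMetric(c.desc, prometheus.GaugeValue, 1)
}

func main() {
	reg := prometheus.NewRegistry()
	reg.MustRegister(&demoCollector{
		desc: prometheus.NewDesc("demo_up", "Illustrative gauge", nil, nil),
	})
	http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
	_ = http.ListenAndServe(":2112", nil)
}
```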
// collects healing specific metrics for MinIO instance in Prometheus specific format
// and sends to given channel
func healingMetricsPrometheus(ch chan<- prometheus.Metric) {
if !globalIsXL {
return
}
bgSeq, exists := globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID)
if !exists {
return
}
healMetricsNamespace := "self_heal"
storageInfo := objLayer.StorageInfo(context.Background(), true)
offlineDisks := storageInfo.Backend.OfflineDisks
onlineDisks := storageInfo.Backend.OnlineDisks
totalDisks := offlineDisks.Merge(onlineDisks)
// MinIO Offline Disks per node
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("minio", "disks", "offline"),
"Total number of offline disks in current MinIO server instance",
nil, nil),
prometheus.GaugeValue,
float64(offlineDisks.Sum()),
)
dur := time.Duration(-1)
if !bgSeq.lastHealActivity.IsZero() {
dur = time.Since(bgSeq.lastHealActivity)
}
// MinIO Total Disks per node
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("minio", "disks", "total"),
"Total number of disks for current MinIO server instance",
prometheus.BuildFQName(healMetricsNamespace, "time", "since_last_activity"),
"Time elapsed (in nano seconds) since last self healing activity. This is set to -1 until initial self heal activity",
nil, nil),
prometheus.GaugeValue,
float64(totalDisks.Sum()),
float64(dur),
)
for i := 0; i < len(storageInfo.Total); i++ {
mountPath, total, free := storageInfo.MountPaths[i], storageInfo.Total[i],
storageInfo.Available[i]
// Total disk usage by the disk
for k, v := range bgSeq.getScannedItemsMap() {
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("disk", "storage", "used"),
"Total disk storage used on the disk",
[]string{"disk"}, nil),
prometheus.BuildFQName(healMetricsNamespace, "objects", "scanned"),
"Objects scanned in current self healing run",
[]string{"type"}, nil),
prometheus.GaugeValue,
float64(total-free),
mountPath,
float64(v), string(k),
)
// Total available space in the disk
}
for k, v := range bgSeq.getHealedItemsMap() {
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("disk", "storage", "available"),
"Total available space left on the disk",
[]string{"disk"}, nil),
prometheus.BuildFQName(healMetricsNamespace, "objects", "healed"),
"Objects healed in current self healing run",
[]string{"type"}, nil),
prometheus.GaugeValue,
float64(free),
mountPath,
float64(v), string(k),
)
// Total storage space of the disk
}
for k, v := range bgSeq.gethealFailedItemsMap() {
// healFailedItemsMap stores the endpoint and volume state separated by comma,
// split the fields and pass to channel at correct index
s := strings.Split(k, ",")
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("disk", "storage", "total"),
"Total space on the disk",
[]string{"disk"}, nil),
prometheus.BuildFQName(healMetricsNamespace, "objects", "heal_failed"),
"Objects for which healing failed in current self healing run",
[]string{"mount_path", "volume_status"}, nil),
prometheus.GaugeValue,
float64(total),
mountPath,
float64(v), string(s[0]), string(s[1]),
)
}
}
// collects gateway specific metrics for MinIO instance in Prometheus specific format
// and sends to given channel
func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) {
if !globalIsGateway || (globalGatewayName != "s3" && globalGatewayName != "azure" && globalGatewayName != "gcs") {
return
}
connStats := globalConnStats.toServerConnStats()
objLayer := newObjectLayerWithoutSafeModeFn()
// Service not initialized yet
if objLayer == nil {
return
}
m, err := objLayer.GetMetrics(context.Background())
if err != nil {
return
}
// Network Sent/Received Bytes (internode)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("internode", "tx", "bytes_total"),
"Total number of bytes sent to the other peer nodes by current MinIO server instance",
prometheus.BuildFQName("gateway", globalGatewayName, "bytes_received"),
"Total number of bytes received by current MinIO Gateway "+globalGatewayName+" backend",
nil, nil),
prometheus.CounterValue,
float64(connStats.TotalOutputBytes),
float64(m.GetBytesReceived()),
)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("internode", "rx", "bytes_total"),
"Total number of internode bytes received by current MinIO server instance",
prometheus.BuildFQName("gateway", globalGatewayName, "bytes_sent"),
"Total number of bytes sent by current MinIO Gateway to "+globalGatewayName+" backend",
nil, nil),
prometheus.CounterValue,
float64(connStats.TotalInputBytes),
float64(m.GetBytesSent()),
)
s := m.GetRequests()
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("gateway", globalGatewayName, "requests"),
"Total number of requests made to "+globalGatewayName+" by current MinIO Gateway",
[]string{"method"}, nil),
prometheus.CounterValue,
float64(s.Get.Load()),
http.MethodGet,
)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("gateway", globalGatewayName, "requests"),
"Total number of requests made to "+globalGatewayName+" by current MinIO Gateway",
[]string{"method"}, nil),
prometheus.CounterValue,
float64(s.Head.Load()),
http.MethodHead,
)
}
// collects cache metrics for MinIO server in Prometheus specific format
// and sends to given channel
func cacheMetricsPrometheus(ch chan<- prometheus.Metric) {
cacheObjLayer := newCachedObjectLayerFn()
// Service not initialized yet
if cacheObjLayer == nil {
return
}
// Network Sent/Received Bytes (Outbound)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("s3", "tx", "bytes_total"),
"Total number of s3 bytes sent by current MinIO server instance",
prometheus.BuildFQName("cache", "hits", "total"),
"Total number of disk cache hits in current MinIO instance",
nil, nil),
prometheus.CounterValue,
float64(connStats.S3OutputBytes),
float64(cacheObjLayer.CacheStats().getHits()),
)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("s3", "rx", "bytes_total"),
"Total number of s3 bytes received by current MinIO server instance",
prometheus.BuildFQName("cache", "misses", "total"),
"Total number of disk cache misses in current MinIO instance",
nil, nil),
prometheus.CounterValue,
float64(connStats.S3InputBytes),
float64(cacheObjLayer.CacheStats().getMisses()),
)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("cache", "data", "served"),
"Total number of bytes served from cache of current MinIO instance",
nil, nil),
prometheus.CounterValue,
float64(cacheObjLayer.CacheStats().getBytesServed()),
)
}
// collects http metrics for MinIO server in Prometheus specific format
// and sends to given channel
func httpMetricsPrometheus(ch chan<- prometheus.Metric) {
httpStats := globalHTTPStats.toServerHTTPStats()
for api, value := range httpStats.CurrentS3Requests.APIStats {
@@ -229,71 +280,123 @@ func (c *minioCollector) Collect(ch chan<- prometheus.Metric) {
api,
)
}
}
// Cache related metrics
if globalCacheConfig.Enabled {
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("cache", "hits", "total"),
"Total number of disk cache hits in current MinIO instance",
nil, nil),
prometheus.CounterValue,
float64(newCachedObjectLayerFn().CacheStats().getHits()),
)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("cache", "misses", "total"),
"Total number of disk cache misses in current MinIO instance",
nil, nil),
prometheus.CounterValue,
float64(newCachedObjectLayerFn().CacheStats().getMisses()),
)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("cache", "data", "served"),
"Total number of bytes served from cache of current MinIO instance",
nil, nil),
prometheus.CounterValue,
float64(newCachedObjectLayerFn().CacheStats().getBytesServed()),
)
// collects network metrics for MinIO server in Prometheus specific format
// and sends to given channel
func networkMetricsPrometheus(ch chan<- prometheus.Metric) {
connStats := globalConnStats.toServerConnStats()
// Network Sent/Received Bytes (internode)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("internode", "tx", "bytes_total"),
"Total number of bytes sent to the other peer nodes by current MinIO server instance",
nil, nil),
prometheus.CounterValue,
float64(connStats.TotalOutputBytes),
)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("internode", "rx", "bytes_total"),
"Total number of internode bytes received by current MinIO server instance",
nil, nil),
prometheus.CounterValue,
float64(connStats.TotalInputBytes),
)
// Network Sent/Received Bytes (Outbound)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("s3", "tx", "bytes_total"),
"Total number of s3 bytes sent by current MinIO server instance",
nil, nil),
prometheus.CounterValue,
float64(connStats.S3OutputBytes),
)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("s3", "rx", "bytes_total"),
"Total number of s3 bytes received by current MinIO server instance",
nil, nil),
prometheus.CounterValue,
float64(connStats.S3InputBytes),
)
}
// collects storage metrics for MinIO server in Prometheus specific format
// and sends to given channel
func storageMetricsPrometheus(ch chan<- prometheus.Metric) {
objLayer := newObjectLayerWithoutSafeModeFn()
// Service not initialized yet
if objLayer == nil {
return
}
if globalIsGateway && (globalGatewayName == "s3" || globalGatewayName == "azure" || globalGatewayName == "gcs") {
m, _ := globalObjectAPI.GetMetrics(context.Background())
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("gateway", globalGatewayName, "bytes_received"),
"Total number of bytes received by current MinIO Gateway "+globalGatewayName+" backend",
nil, nil),
prometheus.CounterValue,
float64(m.GetBytesReceived()),
)
// Fetch disk space info
storageInfo := objLayer.StorageInfo(context.Background(), true)
offlineDisks := storageInfo.Backend.OfflineDisks
onlineDisks := storageInfo.Backend.OnlineDisks
totalDisks := offlineDisks.Merge(onlineDisks)
// MinIO Offline Disks per node
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("minio", "disks", "offline"),
"Total number of offline disks in current MinIO server instance",
nil, nil),
prometheus.GaugeValue,
float64(offlineDisks.Sum()),
)
// MinIO Total Disks per node
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("minio", "disks", "total"),
"Total number of disks for current MinIO server instance",
nil, nil),
prometheus.GaugeValue,
float64(totalDisks.Sum()),
)
for i := 0; i < len(storageInfo.Total); i++ {
mountPath, total, free := storageInfo.MountPaths[i], storageInfo.Total[i],
storageInfo.Available[i]
// Total disk usage by the disk
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("gateway", globalGatewayName, "bytes_sent"),
"Total number of bytes sent by current MinIO Gateway to "+globalGatewayName+" backend",
nil, nil),
prometheus.CounterValue,
float64(m.GetBytesSent()),
prometheus.BuildFQName("disk", "storage", "used"),
"Total disk storage used on the disk",
[]string{"disk"}, nil),
prometheus.GaugeValue,
float64(total-free),
mountPath,
)
s := m.GetRequests()
// Total available space in the disk
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("gateway", globalGatewayName, "requests"),
"Total number of requests made to "+globalGatewayName+" by current MinIO Gateway",
[]string{"method"}, nil),
prometheus.CounterValue,
float64(s.Get.Load()),
http.MethodGet,
prometheus.BuildFQName("disk", "storage", "available"),
"Total available space left on the disk",
[]string{"disk"}, nil),
prometheus.GaugeValue,
float64(free),
mountPath,
)
// Total storage space of the disk
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName("gateway", globalGatewayName, "requests"),
"Total number of requests made to "+globalGatewayName+" by current MinIO Gateway",
[]string{"method"}, nil),
prometheus.CounterValue,
float64(s.Head.Load()),
http.MethodHead,
prometheus.BuildFQName("disk", "storage", "total"),
"Total space on the disk",
[]string{"disk"}, nil),
prometheus.GaugeValue,
float64(total),
mountPath,
)
}
}

@@ -157,6 +157,15 @@ MinIO Gateway instance exposes metrics related to Gateway communication with the
Note that this is currently only supported for Azure, S3 and GCS Gateway.
### MinIO self-healing metrics - `self_heal_*`
MinIO exposes self-healing related metrics for erasure-code deployments _only_. These metrics are _not_ available on Gateway or Single Node, Single Drive deployments. Note that these metrics will be exposed _only_ when there is a relevant event happening on the MinIO server.
- `self_heal_time_since_last_activity`: Time elapsed since the last self-healing related activity.
- `self_heal_objects_scanned`: Number of objects scanned by the self-healing thread in its current run. This will reset when a fresh self-healing run starts. This is labeled with the object type scanned.
- `self_heal_objects_healed`: Number of objects healed by the self-healing thread in its current run. This will reset when a fresh self-healing run starts. This is labeled with the object type healed.
- `self_heal_objects_heal_failed`: Number of objects for which self-healing failed in its current run. This will reset when a fresh self-healing run starts. This is labeled with the drive endpoint and its status, as illustrated in the sample scrape below.
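For reference, a scrape of an erasure-coded setup with background healing activity might include lines like the following; the metric and label names come from this change, while the values and label contents are purely illustrative:

```
self_heal_time_since_last_activity 2.1307e+10
self_heal_objects_scanned{type="object"} 412
self_heal_objects_healed{type="object"} 5
self_heal_objects_heal_failed{mount_path="http://node2:9000/data1",volume_status="offline"} 2
```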
## Migration guide for the new set of metrics
This migration guide applies to older releases, or any release before `RELEASE.2019-10-23*`
