Add metrics for nodes online and offline (#11050)

master
Ritesh H Shukla 4 years ago committed by GitHub
parent 8c79f87f02
commit f60bbdf86b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 85
      cmd/metrics.go
  2. 19
      cmd/notification.go
  3. 13
      cmd/rest/client.go
  4. 2
      docs/metrics/prometheus/README.md

@ -53,6 +53,17 @@ var (
) )
) )
// Namespace prefixes passed as the first argument to prometheus.BuildFQName
// when assembling fully-qualified metric names.
const (
	// Server-wide metrics (disk and node counts).
	minioNamespace = "minio"
	// Per-disk storage metrics.
	diskNamespace = "disk"
	// Per-bucket usage metrics.
	bucketNamespace = "bucket"

	// S3 request, error and traffic metrics.
	s3Namespace = "s3"
	// Traffic exchanged with peer nodes.
	interNodeNamespace = "internode"

	// Disk cache metrics.
	cacheNamespace = "cache"
	// Gateway backend metrics.
	gatewayNamespace = "gateway"
	// Background healing metrics.
	healMetricsNamespace = "self_heal"
)
func init() { func init() {
prometheus.MustRegister(httpRequestsDuration) prometheus.MustRegister(httpRequestsDuration)
prometheus.MustRegister(newMinioCollector()) prometheus.MustRegister(newMinioCollector())
@ -86,6 +97,7 @@ func (c *minioCollector) Collect(ch chan<- prometheus.Metric) {
minioVersionInfo.WithLabelValues(Version, CommitID).Set(float64(1.0)) minioVersionInfo.WithLabelValues(Version, CommitID).Set(float64(1.0))
storageMetricsPrometheus(ch) storageMetricsPrometheus(ch)
nodeHealthMetricsPrometheus(ch)
bucketUsageMetricsPrometheus(ch) bucketUsageMetricsPrometheus(ch)
networkMetricsPrometheus(ch) networkMetricsPrometheus(ch)
httpMetricsPrometheus(ch) httpMetricsPrometheus(ch)
@ -94,6 +106,26 @@ func (c *minioCollector) Collect(ch chan<- prometheus.Metric) {
healingMetricsPrometheus(ch) healingMetricsPrometheus(ch)
} }
// nodeHealthMetricsPrometheus collects the number of online and offline
// MinIO nodes, as reported by GetPeerOnlineCount, and sends them to the
// given channel as two gauge metrics: the online count first, then the
// offline count.
func nodeHealthMetricsPrometheus(ch chan<- prometheus.Metric) {
	// sendGauge publishes a single label-less gauge under the "nodes"
	// subsystem of minioNamespace.
	sendGauge := func(state, help string, count int) {
		fqName := prometheus.BuildFQName(minioNamespace, "nodes", state)
		desc := prometheus.NewDesc(fqName, help, nil, nil)
		ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(count))
	}

	online, offline := GetPeerOnlineCount()
	sendGauge("online", "Total number of MinIO nodes online", online)
	sendGauge("offline", "Total number of MinIO nodes offline", offline)
}
// collects healing specific metrics for MinIO instance in Prometheus specific format // collects healing specific metrics for MinIO instance in Prometheus specific format
// and sends to given channel // and sends to given channel
func healingMetricsPrometheus(ch chan<- prometheus.Metric) { func healingMetricsPrometheus(ch chan<- prometheus.Metric) {
@ -104,7 +136,6 @@ func healingMetricsPrometheus(ch chan<- prometheus.Metric) {
if !exists { if !exists {
return return
} }
healMetricsNamespace := "self_heal"
var dur time.Duration var dur time.Duration
if !bgSeq.lastHealActivity.IsZero() { if !bgSeq.lastHealActivity.IsZero() {
@ -174,7 +205,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("gateway", globalGatewayName, "bytes_received"), prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "bytes_received"),
"Total number of bytes received by current MinIO Gateway "+globalGatewayName+" backend", "Total number of bytes received by current MinIO Gateway "+globalGatewayName+" backend",
nil, nil), nil, nil),
prometheus.CounterValue, prometheus.CounterValue,
@ -182,7 +213,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) {
) )
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("gateway", globalGatewayName, "bytes_sent"), prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "bytes_sent"),
"Total number of bytes sent by current MinIO Gateway to "+globalGatewayName+" backend", "Total number of bytes sent by current MinIO Gateway to "+globalGatewayName+" backend",
nil, nil), nil, nil),
prometheus.CounterValue, prometheus.CounterValue,
@ -191,7 +222,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) {
s := m.GetRequests() s := m.GetRequests()
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("gateway", globalGatewayName, "requests"), prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "requests"),
"Total number of requests made to "+globalGatewayName+" by current MinIO Gateway", "Total number of requests made to "+globalGatewayName+" by current MinIO Gateway",
[]string{"method"}, nil), []string{"method"}, nil),
prometheus.CounterValue, prometheus.CounterValue,
@ -200,7 +231,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) {
) )
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("gateway", globalGatewayName, "requests"), prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "requests"),
"Total number of requests made to "+globalGatewayName+" by current MinIO Gateway", "Total number of requests made to "+globalGatewayName+" by current MinIO Gateway",
[]string{"method"}, nil), []string{"method"}, nil),
prometheus.CounterValue, prometheus.CounterValue,
@ -209,7 +240,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) {
) )
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("gateway", globalGatewayName, "requests"), prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "requests"),
"Total number of requests made to "+globalGatewayName+" by current MinIO Gateway", "Total number of requests made to "+globalGatewayName+" by current MinIO Gateway",
[]string{"method"}, nil), []string{"method"}, nil),
prometheus.CounterValue, prometheus.CounterValue,
@ -218,7 +249,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) {
) )
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("gateway", globalGatewayName, "requests"), prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "requests"),
"Total number of requests made to "+globalGatewayName+" by current MinIO Gateway", "Total number of requests made to "+globalGatewayName+" by current MinIO Gateway",
[]string{"method"}, nil), []string{"method"}, nil),
prometheus.CounterValue, prometheus.CounterValue,
@ -238,7 +269,7 @@ func cacheMetricsPrometheus(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("cache", "hits", "total"), prometheus.BuildFQName(cacheNamespace, "hits", "total"),
"Total number of disk cache hits in current MinIO instance", "Total number of disk cache hits in current MinIO instance",
nil, nil), nil, nil),
prometheus.CounterValue, prometheus.CounterValue,
@ -246,7 +277,7 @@ func cacheMetricsPrometheus(ch chan<- prometheus.Metric) {
) )
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("cache", "misses", "total"), prometheus.BuildFQName(cacheNamespace, "misses", "total"),
"Total number of disk cache misses in current MinIO instance", "Total number of disk cache misses in current MinIO instance",
nil, nil), nil, nil),
prometheus.CounterValue, prometheus.CounterValue,
@ -254,7 +285,7 @@ func cacheMetricsPrometheus(ch chan<- prometheus.Metric) {
) )
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("cache", "data", "served"), prometheus.BuildFQName(cacheNamespace, "data", "served"),
"Total number of bytes served from cache of current MinIO instance", "Total number of bytes served from cache of current MinIO instance",
nil, nil), nil, nil),
prometheus.CounterValue, prometheus.CounterValue,
@ -264,7 +295,7 @@ func cacheMetricsPrometheus(ch chan<- prometheus.Metric) {
// Cache disk usage percentage // Cache disk usage percentage
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("cache", "usage", "percent"), prometheus.BuildFQName(cacheNamespace, "usage", "percent"),
"Total percentage cache usage", "Total percentage cache usage",
[]string{"disk"}, nil), []string{"disk"}, nil),
prometheus.GaugeValue, prometheus.GaugeValue,
@ -273,7 +304,7 @@ func cacheMetricsPrometheus(ch chan<- prometheus.Metric) {
) )
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("cache", "usage", "high"), prometheus.BuildFQName(cacheNamespace, "usage", "high"),
"Indicates cache usage is high or low, relative to current cache 'quota' settings", "Indicates cache usage is high or low, relative to current cache 'quota' settings",
[]string{"disk"}, nil), []string{"disk"}, nil),
prometheus.GaugeValue, prometheus.GaugeValue,
@ -311,7 +342,7 @@ func httpMetricsPrometheus(ch chan<- prometheus.Metric) {
for api, value := range httpStats.CurrentS3Requests.APIStats { for api, value := range httpStats.CurrentS3Requests.APIStats {
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("s3", "requests", "current"), prometheus.BuildFQName(s3Namespace, "requests", "current"),
"Total number of running s3 requests in current MinIO server instance", "Total number of running s3 requests in current MinIO server instance",
[]string{"api"}, nil), []string{"api"}, nil),
prometheus.CounterValue, prometheus.CounterValue,
@ -323,7 +354,7 @@ func httpMetricsPrometheus(ch chan<- prometheus.Metric) {
for api, value := range httpStats.TotalS3Requests.APIStats { for api, value := range httpStats.TotalS3Requests.APIStats {
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("s3", "requests", "total"), prometheus.BuildFQName(s3Namespace, "requests", "total"),
"Total number of s3 requests in current MinIO server instance", "Total number of s3 requests in current MinIO server instance",
[]string{"api"}, nil), []string{"api"}, nil),
prometheus.CounterValue, prometheus.CounterValue,
@ -335,7 +366,7 @@ func httpMetricsPrometheus(ch chan<- prometheus.Metric) {
for api, value := range httpStats.TotalS3Errors.APIStats { for api, value := range httpStats.TotalS3Errors.APIStats {
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("s3", "errors", "total"), prometheus.BuildFQName(s3Namespace, "errors", "total"),
"Total number of s3 errors in current MinIO server instance", "Total number of s3 errors in current MinIO server instance",
[]string{"api"}, nil), []string{"api"}, nil),
prometheus.CounterValue, prometheus.CounterValue,
@ -353,7 +384,7 @@ func networkMetricsPrometheus(ch chan<- prometheus.Metric) {
// Network Sent/Received Bytes (internode) // Network Sent/Received Bytes (internode)
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("internode", "tx", "bytes_total"), prometheus.BuildFQName(interNodeNamespace, "tx", "bytes_total"),
"Total number of bytes sent to the other peer nodes by current MinIO server instance", "Total number of bytes sent to the other peer nodes by current MinIO server instance",
nil, nil), nil, nil),
prometheus.CounterValue, prometheus.CounterValue,
@ -362,7 +393,7 @@ func networkMetricsPrometheus(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("internode", "rx", "bytes_total"), prometheus.BuildFQName(interNodeNamespace, "rx", "bytes_total"),
"Total number of internode bytes received by current MinIO server instance", "Total number of internode bytes received by current MinIO server instance",
nil, nil), nil, nil),
prometheus.CounterValue, prometheus.CounterValue,
@ -372,7 +403,7 @@ func networkMetricsPrometheus(ch chan<- prometheus.Metric) {
// Network Sent/Received Bytes (Outbound) // Network Sent/Received Bytes (Outbound)
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("s3", "tx", "bytes_total"), prometheus.BuildFQName(s3Namespace, "tx", "bytes_total"),
"Total number of s3 bytes sent by current MinIO server instance", "Total number of s3 bytes sent by current MinIO server instance",
nil, nil), nil, nil),
prometheus.CounterValue, prometheus.CounterValue,
@ -381,7 +412,7 @@ func networkMetricsPrometheus(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("s3", "rx", "bytes_total"), prometheus.BuildFQName(s3Namespace, "rx", "bytes_total"),
"Total number of s3 bytes received by current MinIO server instance", "Total number of s3 bytes received by current MinIO server instance",
nil, nil), nil, nil),
prometheus.CounterValue, prometheus.CounterValue,
@ -421,7 +452,7 @@ func bucketUsageMetricsPrometheus(ch chan<- prometheus.Metric) {
// Total space used by bucket // Total space used by bucket
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("bucket", "usage", "size"), prometheus.BuildFQName(bucketNamespace, "usage", "size"),
"Total bucket size", "Total bucket size",
[]string{"bucket"}, nil), []string{"bucket"}, nil),
prometheus.GaugeValue, prometheus.GaugeValue,
@ -430,7 +461,7 @@ func bucketUsageMetricsPrometheus(ch chan<- prometheus.Metric) {
) )
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("bucket", "objects", "count"), prometheus.BuildFQName(bucketNamespace, "objects", "count"),
"Total number of objects in a bucket", "Total number of objects in a bucket",
[]string{"bucket"}, nil), []string{"bucket"}, nil),
prometheus.GaugeValue, prometheus.GaugeValue,
@ -476,7 +507,7 @@ func bucketUsageMetricsPrometheus(ch chan<- prometheus.Metric) {
for k, v := range usageInfo.ObjectSizesHistogram { for k, v := range usageInfo.ObjectSizesHistogram {
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("bucket", "objects", "histogram"), prometheus.BuildFQName(bucketNamespace, "objects", "histogram"),
"Total number of objects of different sizes in a bucket", "Total number of objects of different sizes in a bucket",
[]string{"bucket", "object_size"}, nil), []string{"bucket", "object_size"}, nil),
prometheus.GaugeValue, prometheus.GaugeValue,
@ -507,7 +538,7 @@ func storageMetricsPrometheus(ch chan<- prometheus.Metric) {
// MinIO Offline Disks per node // MinIO Offline Disks per node
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("minio", "disks", "offline"), prometheus.BuildFQName(minioNamespace, "disks", "offline"),
"Total number of offline disks in current MinIO server instance", "Total number of offline disks in current MinIO server instance",
nil, nil), nil, nil),
prometheus.GaugeValue, prometheus.GaugeValue,
@ -517,7 +548,7 @@ func storageMetricsPrometheus(ch chan<- prometheus.Metric) {
// MinIO Total Disks per node // MinIO Total Disks per node
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("minio", "disks", "total"), prometheus.BuildFQName(minioNamespace, "disks", "total"),
"Total number of disks for current MinIO server instance", "Total number of disks for current MinIO server instance",
nil, nil), nil, nil),
prometheus.GaugeValue, prometheus.GaugeValue,
@ -528,7 +559,7 @@ func storageMetricsPrometheus(ch chan<- prometheus.Metric) {
// Total disk usage by the disk // Total disk usage by the disk
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("disk", "storage", "used"), prometheus.BuildFQName(diskNamespace, "storage", "used"),
"Total disk storage used on the disk", "Total disk storage used on the disk",
[]string{"disk"}, nil), []string{"disk"}, nil),
prometheus.GaugeValue, prometheus.GaugeValue,
@ -539,7 +570,7 @@ func storageMetricsPrometheus(ch chan<- prometheus.Metric) {
// Total available space in the disk // Total available space in the disk
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("disk", "storage", "available"), prometheus.BuildFQName(diskNamespace, "storage", "available"),
"Total available space left on the disk", "Total available space left on the disk",
[]string{"disk"}, nil), []string{"disk"}, nil),
prometheus.GaugeValue, prometheus.GaugeValue,
@ -550,7 +581,7 @@ func storageMetricsPrometheus(ch chan<- prometheus.Metric) {
// Total storage space of the disk // Total storage space of the disk
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc( prometheus.NewDesc(
prometheus.BuildFQName("disk", "storage", "total"), prometheus.BuildFQName(diskNamespace, "storage", "total"),
"Total space on the disk", "Total space on the disk",
[]string{"disk"}, nil), []string{"disk"}, nil),
prometheus.GaugeValue, prometheus.GaugeValue,

@ -51,8 +51,8 @@ type NotificationSys struct {
targetResCh chan event.TargetIDResult targetResCh chan event.TargetIDResult
bucketRulesMap map[string]event.RulesMap bucketRulesMap map[string]event.RulesMap
bucketRemoteTargetRulesMap map[string]map[event.TargetID]event.RulesMap bucketRemoteTargetRulesMap map[string]map[event.TargetID]event.RulesMap
peerClients []*peerRESTClient peerClients []*peerRESTClient // Excludes self
allPeerClients []*peerRESTClient allPeerClients []*peerRESTClient // Includes nil client for self
} }
// GetARNList - returns available ARNs. // GetARNList - returns available ARNs.
@ -1288,6 +1288,21 @@ func NewNotificationSys(endpoints EndpointServerPools) *NotificationSys {
} }
} }
// GetPeerOnlineCount gets the count of online and offline nodes.
//
// The online tally starts at one before any server entry is inspected
// (presumably to account for the local node; confirm against ServerInfo).
// Each entry returned by globalNotificationSys.ServerInfo() whose State is
// exactly "online" adds to the online tally; every other State adds to the
// offline tally.
func GetPeerOnlineCount() (nodesOnline, nodesOffline int) {
	nodesOnline = 1
	for _, peer := range globalNotificationSys.ServerInfo() {
		switch peer.State {
		case "online":
			nodesOnline++
		default:
			nodesOffline++
		}
	}
	return nodesOnline, nodesOffline
}
type eventArgs struct { type eventArgs struct {
EventName event.Name EventName event.Name
BucketName string BucketName string

@ -200,13 +200,22 @@ func (c *Client) MarkOffline() {
if atomic.LoadInt32(&c.connected) == closed { if atomic.LoadInt32(&c.connected) == closed {
return return
} }
if c.HealthCheckFn() { if c.CheckOnlineStatus() {
atomic.CompareAndSwapInt32(&c.connected, offline, online) atomic.CompareAndSwapInt32(&c.connected, offline, online)
logger.Info("Client %s online", c.url.String()) logger.Info("Client %s online", c.url.String())
return
} }
time.Sleep(time.Duration(r.Float64() * float64(c.HealthCheckInterval))) time.Sleep(time.Duration(r.Float64() * float64(c.HealthCheckInterval)))
} }
}() }()
} }
} }
// CheckOnlineStatus checks if a client is online.
// It returns false when no HealthCheckFn is configured; otherwise it
// returns whatever HealthCheckFn reports.
func (c *Client) CheckOnlineStatus() bool {
	if c.HealthCheckFn == nil {
		return false
	}
	return c.HealthCheckFn()
}

@ -136,6 +136,8 @@ The list of metrics and its definition are as follows. (NOTE: instance here is o
| `minio_version_info` | Current MinIO version with its commit-id | | `minio_version_info` | Current MinIO version with its commit-id |
| `minio_disks_offline` | Total number of offline disks on current MinIO instance | | `minio_disks_offline` | Total number of offline disks on current MinIO instance |
| `minio_disks_total` | Total number of disks on current MinIO instance | | `minio_disks_total` | Total number of disks on current MinIO instance |
| `minio_nodes_online` | Total number of MinIO nodes online |
| `minio_nodes_offline` | Total number of MinIO nodes offline |
### Disk metrics are labeled by 'disk' which identifies each disk ### Disk metrics are labeled by 'disk' which identifies each disk
| name | description | | name | description |

Loading…
Cancel
Save