Add Historic CPU and memory stats (#7136)

Collect historic CPU and memory stats. Also, return actual numeric values to the client instead of formatted strings. String formatting prevents the values from being processed by the server or by the client without parsing them first. This change allows the values to be processed (e.g. to compute a rolling average over the lifetime of the minio server) and offloads the formatting to the client.
6 years ago · 34e7259f95
parent d0015b4d66
commit 34e7259f95
6 changed files with 126 additions and 33 deletions
--- a/cmd/admin-handlers.go
+++ b/cmd/admin-handlers.go
@ -313,18 +313,20 @@ type ServerDrivesPerfInfo struct {
 // of one minio node. It also reports any errors if encountered
 // while trying to reach this server.
 type ServerCPULoadInfo struct {
-	Addr  string     `json:"addr"`
-	Error string     `json:"error,omitempty"`
-	Load  []cpu.Load `json:"load"`
+	Addr         string     `json:"addr"`
+	Error        string     `json:"error,omitempty"`
+	Load         []cpu.Load `json:"load"`
+	HistoricLoad []cpu.Load `json:"historicLoad"`
 }

 // ServerMemUsageInfo holds information about memory utilization
 // of one minio node. It also reports any errors if encountered
 // while trying to reach this server.
 type ServerMemUsageInfo struct {
-	Addr  string      `json:"addr"`
-	Error string      `json:"error,omitempty"`
-	Usage []mem.Usage `json:"usage"`
+	Addr          string      `json:"addr"`
+	Error         string      `json:"error,omitempty"`
+	Usage         []mem.Usage `json:"usage"`
+	HistoricUsage []mem.Usage `json:"historicUsage"`
 }

 // PerfInfoHandler - GET /minio/admin/v1/performance?perfType={perfType}
--- a/cmd/endpoint.go
+++ b/cmd/endpoint.go
@ -204,6 +204,7 @@ func (endpoints EndpointList) GetString(i int) string {
 // local endpoints from given list of endpoints
 func localEndpointsMemUsage(endpoints EndpointList) ServerMemUsageInfo {
 	var memUsages []mem.Usage
+	var historicUsages []mem.Usage
 	var addr string
 	scratchSpace := map[string]bool{}
 	for _, endpoint := range endpoints {
@ -215,12 +216,15 @@ func localEndpointsMemUsage(endpoints EndpointList) ServerMemUsageInfo {
 			addr = GetLocalPeer(endpoints)
 			memUsage := mem.GetUsage()
 			memUsages = append(memUsages, memUsage)
+			historicUsage := mem.GetHistoricUsage()
+			historicUsages = append(historicUsages, historicUsage)
 			scratchSpace[endpoint.Host] = true
 		}
 	}
 	return ServerMemUsageInfo{
-		Addr:  addr,
-		Usage: memUsages,
+		Addr:          addr,
+		Usage:         memUsages,
+		HistoricUsage: historicUsages,
 	}
 }

@ -228,6 +232,7 @@ func localEndpointsMemUsage(endpoints EndpointList) ServerMemUsageInfo {
 // local endpoints from given list of endpoints
 func localEndpointsCPULoad(endpoints EndpointList) ServerCPULoadInfo {
 	var cpuLoads []cpu.Load
+	var historicLoads []cpu.Load
 	var addr string
 	scratchSpace := map[string]bool{}
 	for _, endpoint := range endpoints {
@ -239,12 +244,15 @@ func localEndpointsCPULoad(endpoints EndpointList) ServerCPULoadInfo {
 			addr = GetLocalPeer(endpoints)
 			cpuLoad := cpu.GetLoad()
 			cpuLoads = append(cpuLoads, cpuLoad)
+			historicLoad := cpu.GetHistoricLoad()
+			historicLoads = append(historicLoads, historicLoad)
 			scratchSpace[endpoint.Host] = true
 		}
 	}
 	return ServerCPULoadInfo{
-		Addr: addr,
-		Load: cpuLoads,
+		Addr:         addr,
+		Load:         cpuLoads,
+		HistoricLoad: historicLoads,
 	}
 }

--- a/pkg/cpu/cpu.go
+++ b/pkg/cpu/cpu.go
@ -17,11 +17,44 @@
 package cpu

 import (
-	"fmt"
 	"sync"
 	"time"
 )

+// rollingAvg holds the rolling average of the cpu load on the minio
+// server over its lifetime
+var rollingAvg *Load
+
+// cpuLoadMeasureInterval is the interval of time between two
+// measurements of CPU load
+const cpuLoadMeasureInterval = 5 * time.Second
+
+// triggers the average load computation at server spawn
+func init() {
+	rollingAvg = &Load{
+		Min: float64(0),
+		Max: float64(0),
+		Avg: float64(0),
+	}
+	var rollingSum float64
+	var cycles float64
+	go func() {
+		for {
+			time.Sleep(cpuLoadMeasureInterval)
+			cycles = cycles + 1
+			currLoad := GetLoad()
+			if rollingAvg.Max < currLoad.Max || rollingAvg.Max == 0 {
+				rollingAvg.Max = currLoad.Max
+			}
+			if rollingAvg.Min > currLoad.Min || rollingAvg.Min == 0 {
+				rollingAvg.Min = currLoad.Min
+			}
+			rollingSum = rollingSum + currLoad.Avg
+			rollingAvg.Avg = rollingSum / cycles
+		}
+	}()
+}
+
 const (
 	// cpuLoadWindow is the interval of time for which the
 	// cpu utilization is measured
@ -37,15 +70,34 @@ const (

 // Load holds CPU utilization % measured in three intervals of 200ms each
 type Load struct {
-	Avg   string `json:"avg"`
-	Max   string `json:"max"`
-	Min   string `json:"min"`
-	Error string `json:"error,omitempty"`
+	Avg   float64 `json:"avg"`
+	Max   float64 `json:"max"`
+	Min   float64 `json:"min"`
+	Error string  `json:"error,omitempty"`
 }

 type counter struct{}

-// GetLoad returns the CPU utilization % of the current process
+// GetHistoricLoad returns the historic CPU utilization of the current process
+func GetHistoricLoad() Load {
+	return *rollingAvg
+}
+
+// GetLoad returns the CPU utilization of the current process.
+// This function works by calculating the amount of CPU clock
+// cycles the current process used in a given time window.
+//
+// This corresponds to the CPU utilization calculation done by
+// tools like top. Here, we use clock_gettime with the
+// CLOCK_PROCESS_CPUTIME_ID parameter to obtain the total number of
+// clock ticks used by the process so far. Then we sleep for
+// 200ms and obtain the total number of clock ticks again. The
+// difference between the two counts provides us the number of
+// clock ticks used by the process in the 200ms interval.
+//
+// The ratio of clock ticks used (measured in nanoseconds) to number
+// of nanoseconds in 200 milliseconds provides us the current CPU
+// usage for the process
 func GetLoad() Load {
 	vals := make(chan time.Duration, 3)
 	wg := sync.WaitGroup{}
@ -83,9 +135,9 @@ func GetLoad() Load {
 	close(vals)
 	avg := sum / 3
 	return Load{
-		Avg:   fmt.Sprintf("%.2f%%", toFixed4(float64(avg)/float64(200*time.Millisecond))*100),
-		Max:   fmt.Sprintf("%.2f%%", toFixed4(float64(max)/float64(200*time.Millisecond))*100),
-		Min:   fmt.Sprintf("%.2f%%", toFixed4(float64(min)/float64(200*time.Millisecond))*100),
+		Avg:   toFixed4(float64(avg)/float64(200*time.Millisecond)) * 100,
+		Max:   toFixed4(float64(max)/float64(200*time.Millisecond)) * 100,
+		Min:   toFixed4(float64(min)/float64(200*time.Millisecond)) * 100,
 		Error: "",
 	}
 }
--- a/pkg/madmin/API.md
+++ b/pkg/madmin/API.md
@ -235,9 +235,9 @@ Fetches CPU utilization for all cluster nodes. Returned value is in Bytes.

 | Param | Type | Description |
 |-------|------|-------------|
-|`cpu.Load.Avg` | _string_ | The average utilization % of the CPU measured in a 200ms interval |
-|`cpu.Load.Min` | _string_ | The minimum utilization % of the CPU measured in a 200ms interval |
-|`cpu.Load.Max` | _string_ | The maximum utilization % of the CPU measured in a 200ms interval |
+|`cpu.Load.Avg` | _float64_ | The average utilization (in percent) of the CPU measured in a 200ms interval |
+|`cpu.Load.Min` | _float64_ | The minimum utilization (in percent) of the CPU measured in a 200ms interval |
+|`cpu.Load.Max` | _float64_ | The maximum utilization (in percent) of the CPU measured in a 200ms interval |
 |`cpu.Load.Error` | _string_ | Error (if any) encountered while accessing the CPU info |

 <a name="ServerMemUsageInfo"></a>
@ -253,7 +253,7 @@ Fetches Mem utilization for all cluster nodes. Returned value is in Bytes.

 | Param | Type | Description |
 |-------|------|-------------|
-|`mem.Usage.Mem` | _string_ | The total number of bytes obtained from the OS |
+|`mem.Usage.Mem` | _uint64_ | The total number of bytes obtained from the OS |
 |`mem.Usage.Error` | _string_ | Error (if any) encountered while accessing the memory info |

 ## 6. Heal operations
--- a/pkg/madmin/info-commands.go
+++ b/pkg/madmin/info-commands.go
@ -199,9 +199,10 @@ func (adm *AdminClient) ServerDrivesPerfInfo() ([]ServerDrivesPerfInfo, error) {
 // ServerCPULoadInfo holds information about address and cpu load of
 // a single server node
 type ServerCPULoadInfo struct {
-	Addr  string     `json:"addr"`
-	Error string     `json:"error,omitempty"`
-	Load  []cpu.Load `json:"load"`
+	Addr         string     `json:"addr"`
+	Error        string     `json:"error,omitempty"`
+	Load         []cpu.Load `json:"load"`
+	HistoricLoad []cpu.Load `json:"historicLoad"`
 }

 // ServerCPULoadInfo - Returns cpu utilization information
@ -242,9 +243,10 @@ func (adm *AdminClient) ServerCPULoadInfo() ([]ServerCPULoadInfo, error) {
 // ServerMemUsageInfo holds information about address and memory utilization of
 // a single server node
 type ServerMemUsageInfo struct {
-	Addr  string      `json:"addr"`
-	Error string      `json:"error,omitempty"`
-	Usage []mem.Usage `json:"usage"`
+	Addr          string      `json:"addr"`
+	Error         string      `json:"error,omitempty"`
+	Usage         []mem.Usage `json:"usage"`
+	HistoricUsage []mem.Usage `json:"historicUsage"`
 }

 // ServerMemUsageInfo - Returns mem utilization information
--- a/pkg/mem/mem.go
+++ b/pkg/mem/mem.go
@ -18,22 +18,51 @@ package mem

 import (
 	"runtime"
-
-	humanize "github.com/dustin/go-humanize"
+	"time"
 )

+// historicUsage holds the rolling average of memory used by
+// minio server
+var historicUsage *Usage
+
+// memUsageMeasureInterval is the window of time between
+// two measurements of memory usage
+const memUsageMeasureInterval = 5 * time.Second
+
+// triggers the collection of historic stats about the memory
+// utilized by minio server
+func init() {
+	historicUsage = &Usage{}
+	var cycles uint64
+	go func() {
+		for {
+			time.Sleep(memUsageMeasureInterval)
+			currUsage := GetUsage()
+			currSum := cycles * historicUsage.Mem
+			cycles = cycles + 1
+			historicUsage.Mem = (currSum + currUsage.Mem) / cycles
+		}
+	}()
+}
+
 // Usage holds memory utilization information; Mem is a raw byte count
 type Usage struct {
-	Mem   string `json:"mem"`
+	Mem   uint64 `json:"mem"`
 	Error string `json:"error,omitempty"`
 }

+// GetHistoricUsage returns the historic average of memory utilized by
+// the current process
+func GetHistoricUsage() Usage {
+	return *historicUsage
+}
+
 // GetUsage measures the total memory provisioned for the current process
 // from the OS
 func GetUsage() Usage {
 	memStats := new(runtime.MemStats)
 	runtime.ReadMemStats(memStats)
 	return Usage{
-		Mem: humanize.IBytes(memStats.Sys),
+		Mem: memStats.Sys,
 	}
 }