diff --git a/cmd/data-usage-cache.go b/cmd/data-usage-cache.go index 129cabf3f..09df4742a 100644 --- a/cmd/data-usage-cache.go +++ b/cmd/data-usage-cache.go @@ -29,7 +29,6 @@ import ( "github.com/cespare/xxhash/v2" "github.com/minio/minio/cmd/logger" - "github.com/minio/minio/pkg/color" "github.com/minio/minio/pkg/hash" "github.com/tinylib/msgp/msgp" ) @@ -114,7 +113,7 @@ func (d *dataUsageCache) find(path string) *dataUsageEntry { func (d *dataUsageCache) dui(path string, buckets []BucketInfo) DataUsageInfo { e := d.find(path) if e == nil { - return DataUsageInfo{LastUpdate: time.Now()} + return DataUsageInfo{LastUpdate: UTCNow()} } flat := d.flatten(*e) return DataUsageInfo{ @@ -213,9 +212,6 @@ func (d *dataUsageCache) pathSizes(buckets []BucketInfo) map[string]uint64 { for _, bucket := range buckets { e := d.find(bucket.Name) if e == nil { - if dataUsageDebug { - logger.Info(color.Green("data-usage:")+" Bucket not found in cache: %v", bucket.Name) - } continue } flat := d.flatten(*e) diff --git a/cmd/data-usage.go b/cmd/data-usage.go index 54fe843d7..c18860b78 100644 --- a/cmd/data-usage.go +++ b/cmd/data-usage.go @@ -35,12 +35,11 @@ import ( ) const ( - dataUsageObjName = "usage.json" - dataUsageCacheName = "usage-cache.bin" - dataUsageBucketCacheDir = "usage-caches" - dataUsageCrawlConf = "MINIO_DISK_USAGE_CRAWL" - dataUsageCrawlDelay = "MINIO_DISK_USAGE_CRAWL_DELAY" - dataUsageDebug = true + dataUsageObjName = ".usage.json" + dataUsageCacheName = ".usage-cache.bin" + envDataUsageCrawlConf = "MINIO_DISK_USAGE_CRAWL_ENABLE" + envDataUsageCrawlDelay = "MINIO_DISK_USAGE_CRAWL_DELAY" + envDataUsageCrawlDebug = "MINIO_DISK_USAGE_CRAWL_DEBUG" dataUsageSleepPerFolder = 1 * time.Millisecond dataUsageSleepDefMult = 10.0 dataUsageUpdateDirCycles = 16 @@ -51,11 +50,9 @@ const ( // initDataUsageStats will start the crawler unless disabled. func initDataUsageStats() { - dataUsageEnabled, err := config.ParseBool(env.Get(dataUsageCrawlConf, config.EnableOn)) - if err == nil && !dataUsageEnabled { - return + if env.Get(envDataUsageCrawlConf, config.EnableOn) == config.EnableOn { + go runDataUsageInfoUpdateRoutine() } - go runDataUsageInfoUpdateRoutine() } // runDataUsageInfoUpdateRoutine will contain the main crawler. @@ -89,9 +86,6 @@ func runDataUsageInfo(ctx context.Context, objAPI ObjectLayer) { // data usage calculator role for its lifetime. break } - if dataUsageDebug { - logger.Info(color.Green("runDataUsageInfo:") + " Starting crawler master") - } for { select { case <-ctx.Done(): @@ -111,14 +105,11 @@ func runDataUsageInfo(ctx context.Context, objAPI ObjectLayer) { // storeDataUsageInBackend will store all objects sent on the gui channel until closed. func storeDataUsageInBackend(ctx context.Context, objAPI ObjectLayer, gui <-chan DataUsageInfo) { for dataUsageInfo := range gui { - dataUsageJSON, err := json.MarshalIndent(dataUsageInfo, "", " ") + dataUsageJSON, err := json.Marshal(dataUsageInfo) if err != nil { logger.LogIf(ctx, err) continue } - if dataUsageDebug { - logger.Info(color.Green("data-usage:")+" Received update: %s", string(dataUsageJSON)) - } size := int64(len(dataUsageJSON)) r, err := hash.NewReader(bytes.NewReader(dataUsageJSON), size, "", "", size, false) if err != nil { @@ -172,6 +163,9 @@ type folderScanner struct { newCache dataUsageCache waitForLowActiveIO func() + dataUsageCrawlMult float64 + dataUsageCrawlDebug bool + newFolders []cachedFolder existingFolders []cachedFolder } @@ -194,12 +188,6 @@ func sleepDuration(d time.Duration, x float64) { // If final is not provided the folders found are returned from the function. func (f *folderScanner) scanQueuedLevels(ctx context.Context, folders []cachedFolder, final bool) ([]cachedFolder, error) { var nextFolders []cachedFolder - delayMult := dataUsageSleepDefMult - if mult := os.Getenv(dataUsageCrawlDelay); mult != "" { - if d, err := strconv.ParseFloat(mult, 64); err == nil { - delayMult = d - } - } done := ctx.Done() for _, folder := range folders { select { @@ -207,8 +195,9 @@ func (f *folderScanner) scanQueuedLevels(ctx context.Context, folders []cachedFo return nil, ctx.Err() default: } + f.waitForLowActiveIO() - sleepDuration(dataUsageSleepPerFolder, delayMult) + sleepDuration(dataUsageSleepPerFolder, f.dataUsageCrawlMult) cache := dataUsageEntry{} thisHash := hashPath(folder.name) @@ -218,14 +207,14 @@ func (f *folderScanner) scanQueuedLevels(ctx context.Context, folders []cachedFo entName = path.Clean(path.Join(folder.name, entName)) bucket, _ := path2BucketObjectWithBasePath(f.root, entName) if bucket == "" { - if dataUsageDebug { + if f.dataUsageCrawlDebug { logger.Info(color.Green("data-usage:")+" no bucket (%s,%s)", f.root, entName) } return nil } if isReservedOrInvalidBucket(bucket, false) { - if dataUsageDebug { + if f.dataUsageCrawlDebug { logger.Info(color.Green("data-usage:")+" invalid bucket: %v, entry: %v", bucket, entName) } return nil @@ -257,12 +246,12 @@ func (f *folderScanner) scanQueuedLevels(ctx context.Context, folders []cachedFo } f.waitForLowActiveIO() // Dynamic time delay. - t := time.Now() + t := UTCNow() // Get file size, ignore errors. size, err := f.getSize(Item{Path: path.Join(f.root, entName), Typ: typ}) - sleepDuration(time.Since(t), delayMult) + sleepDuration(time.Since(t), f.dataUsageCrawlMult) if err == errSkipFile { return nil } @@ -284,12 +273,7 @@ func (f *folderScanner) scanQueuedLevels(ctx context.Context, folders []cachedFo // deepScanFolder will deep scan a folder and return the size if no error occurs. func (f *folderScanner) deepScanFolder(ctx context.Context, folder string) (*dataUsageEntry, error) { var cache dataUsageEntry - delayMult := dataUsageSleepDefMult - if mult := os.Getenv(dataUsageCrawlDelay); mult != "" { - if d, err := strconv.ParseFloat(mult, 64); err == nil { - delayMult = d - } - } + done := ctx.Done() var addDir func(entName string, typ os.FileMode) error @@ -307,11 +291,12 @@ func (f *folderScanner) deepScanFolder(ctx context.Context, folder string) (*dat dirStack = append(dirStack, entName) err := readDirFn(path.Join(dirStack...), addDir) dirStack = dirStack[:len(dirStack)-1] - sleepDuration(dataUsageSleepPerFolder, delayMult) + sleepDuration(dataUsageSleepPerFolder, f.dataUsageCrawlMult) return err } + // Dynamic time delay. - t := time.Now() + t := UTCNow() // Get file size, ignore errors. dirStack = append(dirStack, entName) @@ -321,7 +306,7 @@ func (f *folderScanner) deepScanFolder(ctx context.Context, folder string) (*dat size, err := f.getSize(Item{Path: fileName, Typ: typ}) // Don't sleep for really small amount of time - sleepDuration(time.Since(t), delayMult) + sleepDuration(time.Since(t), f.dataUsageCrawlMult) if err == errSkipFile { return nil @@ -344,22 +329,39 @@ func (f *folderScanner) deepScanFolder(ctx context.Context, folder string) (*dat // Before each operation waitForLowActiveIO is called which can be used to temporarily halt the crawler. // If the supplied context is canceled the function will return at the first chance. func updateUsage(ctx context.Context, basePath string, cache dataUsageCache, waitForLowActiveIO func(), getSize getSizeFn) (dataUsageCache, error) { + t := UTCNow() + + dataUsageDebug := env.Get(envDataUsageCrawlDebug, config.EnableOff) == config.EnableOn + defer func() { + if dataUsageDebug { + logger.Info(color.Green("updateUsage")+" Crawl time at %s: %v", basePath, time.Since(t)) + } + }() + if cache.Info.Name == "" { cache.Info.Name = dataUsageRoot } - var logPrefix, logSuffix string - if dataUsageDebug { - logPrefix = color.Green("data-usage: ") - logSuffix = color.Blue(" - %v + %v", basePath, cache.Info.Name) + + delayMult, err := strconv.ParseFloat(env.Get(envDataUsageCrawlDelay, "10.0"), 64) + if err != nil { + logger.LogIf(ctx, err) + delayMult = dataUsageSleepDefMult } + s := folderScanner{ - root: basePath, - getSize: getSize, - oldCache: cache, - newCache: dataUsageCache{Info: cache.Info}, - waitForLowActiveIO: waitForLowActiveIO, - newFolders: nil, - existingFolders: nil, + root: basePath, + getSize: getSize, + oldCache: cache, + newCache: dataUsageCache{Info: cache.Info}, + waitForLowActiveIO: waitForLowActiveIO, + newFolders: nil, + existingFolders: nil, + dataUsageCrawlMult: delayMult, + dataUsageCrawlDebug: dataUsageDebug, + } + + if s.dataUsageCrawlDebug { + logger.Info(color.Green("runDataUsageInfo:") + " Starting crawler master") } done := ctx.Done() @@ -369,14 +371,21 @@ func updateUsage(ctx context.Context, basePath string, cache dataUsageCache, wai if cache.Info.Name != dataUsageRoot { flattenLevels-- } - if dataUsageDebug { + + var logPrefix, logSuffix string + if s.dataUsageCrawlDebug { + logPrefix = color.Green("data-usage: ") + logSuffix = color.Blue(" - %v + %v", basePath, cache.Info.Name) + } + + if s.dataUsageCrawlDebug { logger.Info(logPrefix+"Cycle: %v"+logSuffix, cache.Info.NextCycle) } // Always scan flattenLevels deep. Cache root is level 0. todo := []cachedFolder{{name: cache.Info.Name}} for i := 0; i < flattenLevels; i++ { - if dataUsageDebug { + if s.dataUsageCrawlDebug { logger.Info(logPrefix+"Level %v, scanning %v directories."+logSuffix, i, len(todo)) } select { @@ -392,9 +401,10 @@ func updateUsage(ctx context.Context, basePath string, cache dataUsageCache, wai } } - if dataUsageDebug { + if s.dataUsageCrawlDebug { logger.Info(logPrefix+"New folders: %v"+logSuffix, s.newFolders) } + // Add new folders first for _, folder := range s.newFolders { select { @@ -419,9 +429,10 @@ func updateUsage(ctx context.Context, basePath string, cache dataUsageCache, wai } } - if dataUsageDebug { + if s.dataUsageCrawlDebug { logger.Info(logPrefix+"Existing folders: %v"+logSuffix, len(s.existingFolders)) } + // Do selective scanning of existing folders. for _, folder := range s.existingFolders { select { @@ -448,7 +459,7 @@ func updateUsage(ctx context.Context, basePath string, cache dataUsageCache, wai s.newCache.replaceHashed(h, folder.parent, *du) } - s.newCache.Info.LastUpdate = time.Now() + s.newCache.Info.LastUpdate = UTCNow() s.newCache.Info.NextCycle++ return s.newCache, nil } diff --git a/cmd/data-usage_test.go b/cmd/data-usage_test.go index 53305e7c4..483a126de 100644 --- a/cmd/data-usage_test.go +++ b/cmd/data-usage_test.go @@ -18,7 +18,6 @@ package cmd import ( "context" - "fmt" "io/ioutil" "os" "path/filepath" @@ -30,8 +29,8 @@ type usageTestFile struct { size int } -func Test_updateUsage(t *testing.T) { - base, err := ioutil.TempDir("", "Test_updateUsage") +func TestDataUsageUpdate(t *testing.T) { + base, err := ioutil.TempDir("", "TestDataUsageUpdate") if err != nil { t.Skip(err) } @@ -58,6 +57,7 @@ func Test_updateUsage(t *testing.T) { } return 0, nil } + got, err := updateUsage(context.Background(), base, dataUsageCache{}, func() {}, getSize) if err != nil { t.Fatal(err) @@ -316,17 +316,10 @@ func Test_updateUsage(t *testing.T) { } }) } - - t.Log(got.StringAll()) - - t.Logf("Root, flat: %+v", got.flatten(*got.root())) - t.Logf("Root: %+v", *got.root()) - t.Logf("/dir1/dira: %+v", *got.find("/dir1/dira")) - } -func Test_updateUsagePrefix(t *testing.T) { - base, err := ioutil.TempDir("", "Test_updateUsagePrefix") +func TestDataUsageUpdatePrefix(t *testing.T) { + base, err := ioutil.TempDir("", "TestDataUpdateUsagePrefix") if err != nil { t.Skip(err) } @@ -593,12 +586,6 @@ func Test_updateUsagePrefix(t *testing.T) { } }) } - - t.Log(got.StringAll()) - - t.Logf("Root, flat: %+v", got.flatten(*got.root())) - t.Logf("Root: %+v", *got.root()) - t.Logf("bucket/dir1/dira: %+v", *got.find("bucket/dir1/dira")) } func createUsageTestFiles(t *testing.T, base string, files []usageTestFile) { @@ -614,8 +601,8 @@ func createUsageTestFiles(t *testing.T, base string, files []usageTestFile) { } } -func Test_dataUsageCacheSerialize(t *testing.T) { - base, err := ioutil.TempDir("", "Test_dataUsageCacheSerialize") +func TestDataUsageCacheSerialize(t *testing.T) { + base, err := ioutil.TempDir("", "TestDataUsageCacheSerialize") if err != nil { t.Skip(err) } @@ -646,19 +633,19 @@ func Test_dataUsageCacheSerialize(t *testing.T) { if err != nil { t.Fatal(err) } - b := want.serialize() - t.Log("serialize -> ", len(b), "bytes") + b := want.serialize() var got dataUsageCache err = got.deserialize(b) if err != nil { t.Fatal(err) } + if got.Info.LastUpdate.IsZero() { t.Error("lastupdate not set") } - if fmt.Sprint(want) == fmt.Sprint(got) { + if !want.Info.LastUpdate.Equal(got.Info.LastUpdate) { t.Fatalf("deserialize mismatch\nwant: %+v\ngot: %+v", want, got) } } diff --git a/cmd/fs-v1.go b/cmd/fs-v1.go index d47b28180..386c3ca1c 100644 --- a/cmd/fs-v1.go +++ b/cmd/fs-v1.go @@ -41,7 +41,6 @@ import ( "github.com/minio/minio/pkg/bucket/lifecycle" "github.com/minio/minio/pkg/bucket/object/tagging" "github.com/minio/minio/pkg/bucket/policy" - "github.com/minio/minio/pkg/color" "github.com/minio/minio/pkg/lock" "github.com/minio/minio/pkg/madmin" "github.com/minio/minio/pkg/mimedb" @@ -244,14 +243,10 @@ func (fs *FSObjects) CrawlAndGetDataUsage(ctx context.Context, updates chan<- Da if oldCache.Info.Name == "" { oldCache.Info.Name = dataUsageRoot } - if dataUsageDebug { - logger.Info(color.Green("FSObjects.CrawlAndGetDataUsage:") + " Start crawl cycle") - } buckets, err := fs.ListBuckets(ctx) if err != nil { return err } - t := time.Now() cache, err := updateUsage(ctx, fs.fsPath, oldCache, fs.waitForLowActiveIO, func(item Item) (int64, error) { // Get file size, symlinks which cannot be // followed are automatically filtered by fastwalk. @@ -261,9 +256,7 @@ func (fs *FSObjects) CrawlAndGetDataUsage(ctx context.Context, updates chan<- Da } return fi.Size(), nil }) - if dataUsageDebug { - logger.Info(color.Green("FSObjects.CrawlAndGetDataUsage:")+" Crawl time: %v", time.Since(t)) - } + // Even if there was an error, the new cache may have better info. if cache.Info.LastUpdate.After(oldCache.Info.LastUpdate) { logger.LogIf(ctx, cache.save(ctx, fs, dataUsageCacheName)) diff --git a/cmd/xl-v1.go b/cmd/xl-v1.go index d23c04e2b..a853c42ed 100644 --- a/cmd/xl-v1.go +++ b/cmd/xl-v1.go @@ -19,14 +19,12 @@ package cmd import ( "context" "fmt" - "path" "sort" "sync" "time" "github.com/minio/minio/cmd/logger" "github.com/minio/minio/pkg/bpool" - "github.com/minio/minio/pkg/color" "github.com/minio/minio/pkg/dsync" "github.com/minio/minio/pkg/madmin" "github.com/minio/minio/pkg/sync/errgroup" @@ -321,11 +319,9 @@ func (xl xlObjects) crawlAndGetDataUsage(ctx context.Context, buckets []BucketIn return default: } - if dataUsageDebug { - logger.Info(color.Green("crawlAndGetDataUsage:")+" Scanning bucket %v.", bucket.Name) - } + // Load cache for bucket - cacheName := path.Join(dataUsageBucketCacheDir, bucket.Name+".bin") + cacheName := pathJoin(bucket.Name, dataUsageCacheName) cache := dataUsageCache{} logger.LogIf(ctx, cache.load(ctx, xl, cacheName)) if cache.Info.Name == "" { diff --git a/cmd/xl-zones.go b/cmd/xl-zones.go index 2370d73fd..d4ef39686 100644 --- a/cmd/xl-zones.go +++ b/cmd/xl-zones.go @@ -32,7 +32,6 @@ import ( "github.com/minio/minio/pkg/bucket/lifecycle" "github.com/minio/minio/pkg/bucket/object/tagging" "github.com/minio/minio/pkg/bucket/policy" - "github.com/minio/minio/pkg/color" "github.com/minio/minio/pkg/madmin" "github.com/minio/minio/pkg/sync/errgroup" ) @@ -227,10 +226,6 @@ func (z *xlZones) CrawlAndGetDataUsage(ctx context.Context, updates chan<- DataU var knownBuckets = make(map[string]struct{}) // used to deduplicate buckets. var allBuckets []BucketInfo - t := time.Now() - if dataUsageDebug { - logger.Info(color.Green("xlZones.CrawlAndGetDataUsage:") + " Start crawl cycle") - } // Collect for each set in zones. for _, z := range z.zones { for _, xlObj := range z.sets { @@ -314,9 +309,6 @@ func (z *xlZones) CrawlAndGetDataUsage(ctx context.Context, updates chan<- DataU }() wg.Wait() - if dataUsageDebug { - logger.Info(color.Green("xlZones.CrawlAndGetDataUsage:")+" Cycle scan time: %v", time.Since(t)) - } ch := make(chan struct{}) updateCloser <- ch <-ch diff --git a/docs/config/README.md b/docs/config/README.md index acecbce7a..d9689cbd8 100644 --- a/docs/config/README.md +++ b/docs/config/README.md @@ -235,8 +235,22 @@ This behavior is consistent across all keys, each key self documents itself with ## Environment only settings (not in config) -#### Worm -Enable this to turn on Write-Once-Read-Many. By default it is set to `off`. Set ``MINIO_WORM=on`` environment variable to enable WORM mode. +#### Usage crawler +Data usage crawler is enabled by default, following ENVs allow for more staggered delay in terms of usage calculation. + +The crawler adapts to the system speed and completely pauses when the system is under load. It is possible to adjust the speed of the crawler and thereby the latency of updates being reflected. The delays between each operation of the crawl can be adjusted by the `MINIO_DISK_USAGE_CRAWL_DELAY` environment variable. By default the value is `10`. This means the crawler will sleep *10x* the time each operation takes. + +This will in most setups make the crawler slow enough to not impact overall system performance. Setting `MINIO_DISK_USAGE_CRAWL_DELAY` to a *lower* value will make the crawler faster and setting it to 0 will make the crawler run at full speed (not recommended). Setting it to a higher value will make the crawler slower, further consume less resources. + +Example: Following setting will decrease the crawler speed by a factor of 3, reducing the system resource use, but increasing the latency of updates being reflected. + +```sh +export MINIO_DISK_USAGE_CRAWL_DELAY=30 +minio server /data +``` + +#### Worm (deprecated) +Enable this to turn on Write-Once-Read-Many. By default it is set to `off`. Set ``MINIO_WORM=on`` environment variable to enable WORM mode. This ENV setting is not recommended anymore, please use Object Locking and Object Retention APIs documented [here](https://github.com/minio/minio/tree/master/docs/retention). Example: