From c606c76323656944933a5c8d2e6d81d4b3a617c1 Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Tue, 15 Dec 2020 17:34:54 -0800 Subject: [PATCH] fix: prioritized latest buckets for crawler to finish the scans faster (#11115) The crawler should call ListBuckets only once, not once for each serverPool: buckets are the same across all pools and across sets, and ListBuckets always returns a unified view. Once ListBuckets returns, sort the result by creation time so that the latest buckets are scanned first, on the assumption that the latest buckets hold less content than older buckets; this allows them to be scanned faster and lets the crawler provide a view that is closer to the latest state. --- cmd/admin-heal-ops.go | 6 ++++++ cmd/background-newdisks-heal-ops.go | 7 +++++++ cmd/erasure-server-sets.go | 27 ++++++++++++--------------- cmd/erasure-sets.go | 6 +++++- cmd/fs-v1.go | 4 +++- cmd/object-api-utils.go | 7 ------- 6 files changed, 33 insertions(+), 24 deletions(-) diff --git a/cmd/admin-heal-ops.go b/cmd/admin-heal-ops.go index 298d78dad..b9fd84461 100644 --- a/cmd/admin-heal-ops.go +++ b/cmd/admin-heal-ops.go @@ -21,6 +21,7 @@ import ( "encoding/json" "fmt" "net/http" + "sort" "strings" "sync" "time" @@ -873,6 +874,11 @@ func (h *healSequence) healBuckets(objAPI ObjectLayer, bucketsOnly bool) error { return errFnHealFromAPIErr(h.ctx, err) } + // Heal latest buckets first. + sort.Slice(buckets, func(i, j int) bool { + return buckets[i].Created.After(buckets[j].Created) + }) + for _, bucket := range buckets { if err = h.healBucket(objAPI, bucket.Name, bucketsOnly); err != nil { return err diff --git a/cmd/background-newdisks-heal-ops.go b/cmd/background-newdisks-heal-ops.go index f28039001..445e3e307 100644 --- a/cmd/background-newdisks-heal-ops.go +++ b/cmd/background-newdisks-heal-ops.go @@ -20,6 +20,7 @@ import ( "context" "errors" "fmt" + "sort" "time" "github.com/dustin/go-humanize" @@ -161,6 +162,12 @@ wait: } buckets, _ := z.ListBuckets(ctx) + + // Heal latest buckets first. 
+ sort.Slice(buckets, func(i, j int) bool { + return buckets[i].Created.After(buckets[j].Created) + }) + for i, setMap := range erasureSetInZoneDisksToHeal { for setIndex, disks := range setMap { for _, disk := range disks { diff --git a/cmd/erasure-server-sets.go b/cmd/erasure-server-sets.go index 740acbbf5..5a070da02 100644 --- a/cmd/erasure-server-sets.go +++ b/cmd/erasure-server-sets.go @@ -23,6 +23,7 @@ import ( "io" "math/rand" "net/http" + "sort" "strconv" "strings" "sync" @@ -322,23 +323,19 @@ func (z *erasureServerPools) CrawlAndGetDataUsage(ctx context.Context, bf *bloom var mu sync.Mutex var results []dataUsageCache var firstErr error - var knownBuckets = make(map[string]struct{}) // used to deduplicate buckets. - var allBuckets []BucketInfo + + allBuckets, err := z.ListBuckets(ctx) + if err != nil { + return err + } + + // Crawl latest allBuckets first. + sort.Slice(allBuckets, func(i, j int) bool { + return allBuckets[i].Created.After(allBuckets[j].Created) + }) // Collect for each set in serverPools. for _, z := range z.serverPools { - buckets, err := z.ListBuckets(ctx) - if err != nil { - return err - } - // Add new buckets. - for _, b := range buckets { - if _, ok := knownBuckets[b.Name]; ok { - continue - } - allBuckets = append(allBuckets, b) - knownBuckets[b.Name] = struct{}{} - } for _, erObj := range z.sets { wg.Add(1) results = append(results, dataUsageCache{}) @@ -355,7 +352,7 @@ func (z *erasureServerPools) CrawlAndGetDataUsage(ctx context.Context, bf *bloom } }() // Start crawler. Blocks until done. 
- err := erObj.crawlAndGetDataUsage(ctx, buckets, bf, updates) + err := erObj.crawlAndGetDataUsage(ctx, allBuckets, bf, updates) if err != nil { logger.LogIf(ctx, err) mu.Lock() diff --git a/cmd/erasure-sets.go b/cmd/erasure-sets.go index 5cf87bf93..3db81f756 100644 --- a/cmd/erasure-sets.go +++ b/cmd/erasure-sets.go @@ -722,10 +722,14 @@ func (s *erasureSets) ListBuckets(ctx context.Context) (buckets []BucketInfo, er return nil, err } } + for _, v := range healBuckets { listBuckets = append(listBuckets, BucketInfo(v)) } - sort.Sort(byBucketName(listBuckets)) + + sort.Slice(listBuckets, func(i, j int) bool { + return listBuckets[i].Name < listBuckets[j].Name + }) return listBuckets, nil } diff --git a/cmd/fs-v1.go b/cmd/fs-v1.go index d239393e4..70dc9ab56 100644 --- a/cmd/fs-v1.go +++ b/cmd/fs-v1.go @@ -531,7 +531,9 @@ func (fs *FSObjects) ListBuckets(ctx context.Context) ([]BucketInfo, error) { } // Sort bucket infos by bucket name. - sort.Sort(byBucketName(bucketInfos)) + sort.Slice(bucketInfos, func(i, j int) bool { + return bucketInfos[i].Name < bucketInfos[j].Name + }) // Succes. return bucketInfos, nil diff --git a/cmd/object-api-utils.go b/cmd/object-api-utils.go index 2b0d2a5ed..33ef61add 100644 --- a/cmd/object-api-utils.go +++ b/cmd/object-api-utils.go @@ -536,13 +536,6 @@ func getCompressedOffsets(objectInfo ObjectInfo, offset int64) (int64, int64) { return compressedOffset, offset - skipLength } -// byBucketName is a collection satisfying sort.Interface. -type byBucketName []BucketInfo - -func (d byBucketName) Len() int { return len(d) } -func (d byBucketName) Swap(i, j int) { d[i], d[j] = d[j], d[i] } -func (d byBucketName) Less(i, j int) bool { return d[i].Name < d[j].Name } - // GetObjectReader is a type that wraps a reader with a lock to // provide a ReadCloser interface that unlocks on Close() type GetObjectReader struct {