From b3ca304c010cb1fb4e54f9f1f510548073857366 Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Mon, 19 Aug 2019 08:22:32 -1000 Subject: [PATCH] Avoid excessive listing attempts in the daily sweep (#8081) Add better dynamic timeouts for locks, and add jitter before launching the daily sweep to ensure that the servers in a distributed setup are not all trying to hold locks at the same time to begin the sweep round. Also, add enough delay for incoming requests based on totalSetCount*totalDriveCount. A possible fix for #8071 --- cmd/background-heal-ops.go | 8 ++++---- cmd/daily-lifecycle-ops.go | 8 +++----- cmd/daily-sweeper.go | 22 ++++++++++++++++++---- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/cmd/background-heal-ops.go b/cmd/background-heal-ops.go index 3624284b2..ac3b4722e 100644 --- a/cmd/background-heal-ops.go +++ b/cmd/background-heal-ops.go @@ -62,11 +62,11 @@ func (h *healRoutine) run() { break } if globalHTTPServer != nil { - // Wait at max 1 minute for an inprogress request - // before proceeding to heal - waitCount := 60 + // Wait at max 10 minute for an inprogress request before proceeding to heal + waitCount := 600 // Any requests in progress, delay the heal. 
- for globalHTTPServer.GetRequestCount() > 2 && waitCount > 0 { + for (globalHTTPServer.GetRequestCount() >= int32(globalXLSetCount*globalXLSetDriveCount)) && + waitCount > 0 { waitCount-- time.Sleep(1 * time.Second) } diff --git a/cmd/daily-lifecycle-ops.go b/cmd/daily-lifecycle-ops.go index 9e653b9c6..eacfe0466 100644 --- a/cmd/daily-lifecycle-ops.go +++ b/cmd/daily-lifecycle-ops.go @@ -103,14 +103,12 @@ func startDailyLifecycle() { } } -func lifecycleRound(ctx context.Context, objAPI ObjectLayer) error { - - zeroDuration := time.Millisecond - zeroDynamicTimeout := newDynamicTimeout(zeroDuration, zeroDuration) +var lifecycleTimeout = newDynamicTimeout(60*time.Second, time.Second) +func lifecycleRound(ctx context.Context, objAPI ObjectLayer) error { // Lock to avoid concurrent lifecycle ops from other nodes sweepLock := globalNSMutex.NewNSLock(ctx, "system", "daily-lifecycle-ops") - if err := sweepLock.GetLock(zeroDynamicTimeout); err != nil { + if err := sweepLock.GetLock(lifecycleTimeout); err != nil { return err } defer sweepLock.Unlock() diff --git a/cmd/daily-sweeper.go b/cmd/daily-sweeper.go index 7bd4b8312..99e65404a 100644 --- a/cmd/daily-sweeper.go +++ b/cmd/daily-sweeper.go @@ -18,6 +18,7 @@ package cmd import ( "context" + "math/rand" "sync" "time" @@ -48,15 +49,14 @@ func copyDailySweepListeners() []chan string { return listenersCopy } +var sweepTimeout = newDynamicTimeout(60*time.Second, time.Second) + // sweepRound will list all objects, having read quorum or not and // feeds to all listeners, such as the background healing func sweepRound(ctx context.Context, objAPI ObjectLayer) error { - zeroDuration := time.Millisecond - zeroDynamicTimeout := newDynamicTimeout(zeroDuration, zeroDuration) - // General lock so we avoid parallel daily sweep by different instances. 
sweepLock := globalNSMutex.NewNSLock(ctx, "system", "daily-sweep") - if err := sweepLock.GetLock(zeroDynamicTimeout); err != nil { + if err := sweepLock.GetLock(sweepTimeout); err != nil { return err } defer sweepLock.Unlock() @@ -76,6 +76,17 @@ func sweepRound(ctx context.Context, objAPI ObjectLayer) error { marker := "" for { + if globalHTTPServer != nil { + // Wait at max 10 minute for an inprogress request before proceeding to heal + waitCount := 600 + // Any requests in progress, delay the heal. + for (globalHTTPServer.GetRequestCount() >= int32(globalXLSetCount*globalXLSetDriveCount)) && + waitCount > 0 { + waitCount-- + time.Sleep(1 * time.Second) + } + } + res, err := objAPI.ListObjectsHeal(ctx, bucket.Name, "", marker, "", 1000) if err != nil { continue @@ -119,6 +130,9 @@ func dailySweeper() { break } + // Start with random sleep time, so as to avoid "synchronous checks" between servers + time.Sleep(time.Duration(rand.Float64() * float64(time.Hour))) + // Perform a sweep round each month for { if time.Since(lastSweepTime) < 30*24*time.Hour {