From 187c3f62df508619c176f45a50131aaa849d855c Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Fri, 17 Jul 2020 10:08:04 -0700 Subject: [PATCH] fix: heal replaced drives properly (#10069) Healing was not working properly when drives were replaced, due to the error check in the root disk calculation; this PR fixes this behavior. This PR also adds a fix for missing metadata entries from .minio.sys as part of disk healing. Added code to ignore certain errors and print more context-sensitive errors for better debugging. This PR is a continuation of the fix in 7b14e9b660ce2d93cfc2f481c89c67d0484c40ea --- cmd/admin-heal-ops.go | 7 +++++- cmd/background-newdisks-heal-ops.go | 6 +++--- cmd/erasure-healing.go | 4 +++- cmd/erasure-sets.go | 8 ++----- cmd/format-erasure.go | 3 +++ cmd/global-heal.go | 6 ++++++ cmd/prepare-storage.go | 33 ++++++++++++++++++++--------- 7 files changed, 46 insertions(+), 21 deletions(-) diff --git a/cmd/admin-heal-ops.go b/cmd/admin-heal-ops.go index 16d86de05..5ee76aac6 100644 --- a/cmd/admin-heal-ops.go +++ b/cmd/admin-heal-ops.go @@ -704,7 +704,12 @@ func (h *healSequence) healItemsFromSourceCh() error { } if err := h.queueHealTask(source, itemType); err != nil { - logger.LogIf(h.ctx, err) + switch err.(type) { + case ObjectExistsAsDirectory: + default: + logger.LogIf(h.ctx, fmt.Errorf("Heal attempt failed for %s: %w", + pathJoin(source.bucket, source.object), err)) + } } h.scannedItemsMap[itemType]++ diff --git a/cmd/background-newdisks-heal-ops.go b/cmd/background-newdisks-heal-ops.go index c0132c7f8..bbd2c7f94 100644 --- a/cmd/background-newdisks-heal-ops.go +++ b/cmd/background-newdisks-heal-ops.go @@ -24,7 +24,7 @@ import ( "github.com/minio/minio/cmd/logger" ) -const defaultMonitorNewDiskInterval = time.Minute * 5 +const defaultMonitorNewDiskInterval = time.Minute * 3 func initLocalDisksAutoHeal(ctx context.Context, objAPI ObjectLayer) { go monitorLocalDisksAndHeal(ctx, objAPI) @@ -105,13 +105,13 @@ func monitorLocalDisksAndHeal(ctx
context.Context, objAPI ObjectLayer) { // Load the new format of this passed endpoint _, format, err := connectEndpoint(endpoint) if err != nil { - logger.LogIf(ctx, err) + printEndpointError(endpoint, err, true) continue } // Calculate the set index where the current endpoint belongs setIndex, _, err := findDiskIndex(z.zones[i].format, format) if err != nil { - logger.LogIf(ctx, err) + printEndpointError(endpoint, err, false) continue } diff --git a/cmd/erasure-healing.go b/cmd/erasure-healing.go index e6cd871b6..dc612d693 100644 --- a/cmd/erasure-healing.go +++ b/cmd/erasure-healing.go @@ -459,7 +459,9 @@ func (er erasureObjects) healObject(ctx context.Context, bucket string, object s // Attempt a rename now from healed data to final location. if err = disk.RenameData(minioMetaTmpBucket, tmpID, latestMeta.DataDir, bucket, object); err != nil { - logger.LogIf(ctx, err) + if err != errIsNotRegular && err != errFileNotFound { + logger.LogIf(ctx, err) + } return result, toObjectErr(err, bucket, object) } diff --git a/cmd/erasure-sets.go b/cmd/erasure-sets.go index 3841d411a..774f4c463 100644 --- a/cmd/erasure-sets.go +++ b/cmd/erasure-sets.go @@ -204,14 +204,14 @@ func (s *erasureSets) connectDisks() { defer wg.Done() disk, format, err := connectEndpoint(endpoint) if err != nil { - printEndpointError(endpoint, err) + printEndpointError(endpoint, err, true) return } setIndex, diskIndex, err := findDiskIndex(s.format, format) if err != nil { // Close the internal connection to avoid connection leaks. disk.Close() - printEndpointError(endpoint, err) + printEndpointError(endpoint, err, false) return } disk.SetDiskID(format.Erasure.This) @@ -1296,10 +1296,6 @@ func markRootDisksAsDown(storageDisks []StorageAPI) { return } for i := range storageDisks { - if errs[i] != nil { - storageDisks[i] = nil - continue - } if infos[i].RootDisk { // We should not heal on root disk. 
i.e in a situation where the minio-administrator has unmounted a // defective drive we should not heal a path on the root disk. diff --git a/cmd/format-erasure.go b/cmd/format-erasure.go index 3c3a6d1b9..990cb047a 100644 --- a/cmd/format-erasure.go +++ b/cmd/format-erasure.go @@ -709,6 +709,9 @@ func saveFormatErasureAll(ctx context.Context, storageDisks []StorageAPI, format for index := range storageDisks { index := index g.Go(func() error { + if formats[index] == nil { + return errDiskNotFound + } return saveFormatErasure(storageDisks[index], formats[index], formats[index].Erasure.This) }, index) } diff --git a/cmd/global-heal.go b/cmd/global-heal.go index 6e465c152..2659bc9cf 100644 --- a/cmd/global-heal.go +++ b/cmd/global-heal.go @@ -103,6 +103,12 @@ func healErasureSet(ctx context.Context, setIndex int, xlObj *erasureObjects, dr } } + buckets = append(buckets, BucketInfo{ + Name: pathJoin(minioMetaBucket, minioConfigPrefix), + }, BucketInfo{ + Name: pathJoin(minioMetaBucket, bucketConfigPrefix), + }) // add metadata .minio.sys/ bucket prefixes to heal + // Heal all buckets with all objects for _, bucket := range buckets { // Heal current bucket diff --git a/cmd/prepare-storage.go b/cmd/prepare-storage.go index 51eef1116..21aa2b68f 100644 --- a/cmd/prepare-storage.go +++ b/cmd/prepare-storage.go @@ -33,28 +33,41 @@ import ( "github.com/minio/minio/pkg/sync/errgroup" ) -var printEndpointError = func() func(Endpoint, error) { +var printEndpointError = func() func(Endpoint, error, bool) { var mutex sync.Mutex - printOnce := make(map[Endpoint]map[string]bool) + printOnce := make(map[Endpoint]map[string]int) - return func(endpoint Endpoint, err error) { + return func(endpoint Endpoint, err error, once bool) { reqInfo := (&logger.ReqInfo{}).AppendTags("endpoint", endpoint.String()) ctx := logger.SetReqInfo(GlobalContext, reqInfo) mutex.Lock() defer mutex.Unlock() + m, ok := printOnce[endpoint] if !ok { - m = make(map[string]bool) - m[err.Error()] = true + m = 
make(map[string]int) + m[err.Error()]++ printOnce[endpoint] = m - logger.LogAlwaysIf(ctx, err) - return + if once { + logger.LogAlwaysIf(ctx, err) + return + } } - if m[err.Error()] { + // Once is set and we are here means error was already + // printed once. + if once { return } - m[err.Error()] = true - logger.LogAlwaysIf(ctx, err) + // once not set, check if same error occurred 3 times in + // a row, then make sure we print it to call attention. + if m[err.Error()] > 2 { + logger.LogAlwaysIf(ctx, fmt.Errorf("Following error has been printed %d times.. %w", m[err.Error()], err)) + // Reduce the count to introduce further delay in printing + // but let it print again after the 2nd attempt + m[err.Error()]-- + m[err.Error()]-- + } + m[err.Error()]++ } }()