From facbd653bad5e9814c896d55cae08ae7b6e77918 Mon Sep 17 00:00:00 2001 From: Anis Elleuch Date: Thu, 14 Mar 2019 21:08:51 +0100 Subject: [PATCH] Add normal/deep type of heal scanning (#7251) The healing scan used to read every part of every object to verify its bitrot checksum. This commit adds a quicker scan mode that only checks whether the parts are actually present on the disks. --- cmd/admin-heal-ops.go | 4 +-- cmd/fs-v1.go | 2 +- cmd/fs-v1_test.go | 4 ++- cmd/gateway-unsupported.go | 2 +- cmd/object-api-interface.go | 2 +- cmd/xl-sets.go | 4 +-- cmd/xl-v1-healing-common.go | 52 ++++++++++++++++++++------------ cmd/xl-v1-healing-common_test.go | 8 +++-- cmd/xl-v1-healing.go | 9 +++--- cmd/xl-v1-healing_test.go | 6 ++-- cmd/xl-v1-object_test.go | 5 +-- pkg/madmin/heal-commands.go | 17 +++++++++-- 12 files changed, 73 insertions(+), 42 deletions(-) diff --git a/cmd/admin-heal-ops.go b/cmd/admin-heal-ops.go index 3a48b8250..a20a43a5f 100644 --- a/cmd/admin-heal-ops.go +++ b/cmd/admin-heal-ops.go @@ -575,7 +575,7 @@ func (h *healSequence) healMinioSysMeta(metaPrefix string) func() error { if h.isQuitting() { return errHealStopSignalled } - res, herr := objectAPI.HealObject(h.ctx, bucket, object, h.settings.DryRun, h.settings.Remove) + res, herr := objectAPI.HealObject(h.ctx, bucket, object, h.settings.DryRun, h.settings.Remove, h.settings.ScanMode) // Object might have been deleted, by the time heal // was attempted we ignore this object an move on. 
if isErrObjectNotFound(herr) { @@ -718,7 +718,7 @@ func (h *healSequence) healObject(bucket, object string) error { return errServerNotInitialized } - hri, err := objectAPI.HealObject(h.ctx, bucket, object, h.settings.DryRun, h.settings.Remove) + hri, err := objectAPI.HealObject(h.ctx, bucket, object, h.settings.DryRun, h.settings.Remove, h.settings.ScanMode) if isErrObjectNotFound(err) { return nil } diff --git a/cmd/fs-v1.go b/cmd/fs-v1.go index 6051b340c..0374013ef 100644 --- a/cmd/fs-v1.go +++ b/cmd/fs-v1.go @@ -1240,7 +1240,7 @@ func (fs *FSObjects) HealFormat(ctx context.Context, dryRun bool) (madmin.HealRe } // HealObject - no-op for fs. Valid only for XL. -func (fs *FSObjects) HealObject(ctx context.Context, bucket, object string, dryRun, remove bool) ( +func (fs *FSObjects) HealObject(ctx context.Context, bucket, object string, dryRun, remove bool, scanMode madmin.HealScanMode) ( res madmin.HealResultItem, err error) { logger.LogIf(ctx, NotImplemented{}) return res, NotImplemented{} diff --git a/cmd/fs-v1_test.go b/cmd/fs-v1_test.go index c746b8513..b590774bf 100644 --- a/cmd/fs-v1_test.go +++ b/cmd/fs-v1_test.go @@ -22,6 +22,8 @@ import ( "os" "path/filepath" "testing" + + "github.com/minio/minio/pkg/madmin" ) // Tests for if parent directory is object @@ -390,7 +392,7 @@ func TestFSHealObject(t *testing.T) { defer os.RemoveAll(disk) obj := initFSObjects(disk, t) - _, err := obj.HealObject(context.Background(), "bucket", "object", false, false) + _, err := obj.HealObject(context.Background(), "bucket", "object", false, false, madmin.HealDeepScan) if err == nil || !isSameType(err, NotImplemented{}) { t.Fatalf("Heal Object should return NotImplemented error ") } diff --git a/cmd/gateway-unsupported.go b/cmd/gateway-unsupported.go index 64cdb6bb1..5ed3d747e 100644 --- a/cmd/gateway-unsupported.go +++ b/cmd/gateway-unsupported.go @@ -102,7 +102,7 @@ func (a GatewayUnsupported) ListBucketsHeal(ctx context.Context) (buckets []Buck } // HealObject - Not 
implemented stub -func (a GatewayUnsupported) HealObject(ctx context.Context, bucket, object string, dryRun, remove bool) (h madmin.HealResultItem, e error) { +func (a GatewayUnsupported) HealObject(ctx context.Context, bucket, object string, dryRun, remove bool, scanMode madmin.HealScanMode) (h madmin.HealResultItem, e error) { return h, NotImplemented{} } diff --git a/cmd/object-api-interface.go b/cmd/object-api-interface.go index 36e78b546..61a452aef 100644 --- a/cmd/object-api-interface.go +++ b/cmd/object-api-interface.go @@ -88,7 +88,7 @@ type ObjectLayer interface { ReloadFormat(ctx context.Context, dryRun bool) error HealFormat(ctx context.Context, dryRun bool) (madmin.HealResultItem, error) HealBucket(ctx context.Context, bucket string, dryRun, remove bool) (madmin.HealResultItem, error) - HealObject(ctx context.Context, bucket, object string, dryRun, remove bool) (madmin.HealResultItem, error) + HealObject(ctx context.Context, bucket, object string, dryRun, remove bool, scanMode madmin.HealScanMode) (madmin.HealResultItem, error) ListBucketsHeal(ctx context.Context) (buckets []BucketInfo, err error) HealObjects(ctx context.Context, bucket, prefix string, healObjectFn func(string, string) error) error diff --git a/cmd/xl-sets.go b/cmd/xl-sets.go index aaad84964..8d41b489d 100644 --- a/cmd/xl-sets.go +++ b/cmd/xl-sets.go @@ -1296,8 +1296,8 @@ func (s *xlSets) HealBucket(ctx context.Context, bucket string, dryRun, remove b } // HealObject - heals inconsistent object on a hashedSet based on object name. 
-func (s *xlSets) HealObject(ctx context.Context, bucket, object string, dryRun, remove bool) (madmin.HealResultItem, error) { - return s.getHashedSet(object).HealObject(ctx, bucket, object, dryRun, remove) +func (s *xlSets) HealObject(ctx context.Context, bucket, object string, dryRun, remove bool, scanMode madmin.HealScanMode) (madmin.HealResultItem, error) { + return s.getHashedSet(object).HealObject(ctx, bucket, object, dryRun, remove, scanMode) } // Lists all buckets which need healing. diff --git a/cmd/xl-v1-healing-common.go b/cmd/xl-v1-healing-common.go index 5e4d04a71..9e9dab4c5 100644 --- a/cmd/xl-v1-healing-common.go +++ b/cmd/xl-v1-healing-common.go @@ -22,6 +22,7 @@ import ( "time" "github.com/minio/minio/cmd/logger" + "github.com/minio/minio/pkg/madmin" ) // commonTime returns a maximally occurring time from a list of time. @@ -158,7 +159,7 @@ func getLatestXLMeta(ctx context.Context, partsMetadata []xlMetaV1, errs []error // - slice of errors about the state of data files on disk - can have // a not-found error or a hash-mismatch error. func disksWithAllParts(ctx context.Context, onlineDisks []StorageAPI, partsMetadata []xlMetaV1, errs []error, bucket, - object string) ([]StorageAPI, []error) { + object string, scanMode madmin.HealScanMode) ([]StorageAPI, []error) { availableDisks := make([]StorageAPI, len(onlineDisks)) dataErrs := make([]error, len(onlineDisks)) @@ -168,27 +169,38 @@ func disksWithAllParts(ctx context.Context, onlineDisks []StorageAPI, partsMetad continue } - erasureInfo := partsMetadata[i].Erasure - erasure, err := NewErasure(ctx, erasureInfo.DataBlocks, erasureInfo.ParityBlocks, erasureInfo.BlockSize) - if err != nil { - dataErrs[i] = err - continue - } - - // disk has a valid xl.json but may not have all the - // parts. This is considered an outdated disk, since - // it needs healing too. 
- for _, part := range partsMetadata[i].Parts { - checksumInfo := erasureInfo.GetChecksumInfo(part.Name) - tillOffset := erasure.ShardFileTillOffset(0, part.Size, part.Size) - err = bitrotCheckFile(onlineDisk, bucket, pathJoin(object, part.Name), tillOffset, checksumInfo.Algorithm, checksumInfo.Hash, erasure.ShardSize()) + switch scanMode { + case madmin.HealDeepScan: + erasureInfo := partsMetadata[i].Erasure + erasure, err := NewErasure(ctx, erasureInfo.DataBlocks, erasureInfo.ParityBlocks, erasureInfo.BlockSize) if err != nil { - isCorrupt := strings.HasPrefix(err.Error(), "Bitrot verification mismatch - expected ") - if !isCorrupt && err != errFileNotFound && err != errVolumeNotFound { - logger.LogIf(ctx, err) - } dataErrs[i] = err - break + continue + } + + // disk has a valid xl.json but may not have all the + // parts. This is considered an outdated disk, since + // it needs healing too. + for _, part := range partsMetadata[i].Parts { + checksumInfo := erasureInfo.GetChecksumInfo(part.Name) + tillOffset := erasure.ShardFileTillOffset(0, part.Size, part.Size) + err = bitrotCheckFile(onlineDisk, bucket, pathJoin(object, part.Name), tillOffset, checksumInfo.Algorithm, checksumInfo.Hash, erasure.ShardSize()) + if err != nil { + isCorrupt := strings.HasPrefix(err.Error(), "Bitrot verification mismatch - expected ") + if !isCorrupt && err != errFileNotFound && err != errVolumeNotFound { + logger.LogIf(ctx, err) + } + dataErrs[i] = err + break + } + } + case madmin.HealNormalScan: + for _, part := range partsMetadata[i].Parts { + _, err := onlineDisk.StatFile(bucket, pathJoin(object, part.Name)) + if err != nil { + dataErrs[i] = err + break + } } } diff --git a/cmd/xl-v1-healing-common_test.go b/cmd/xl-v1-healing-common_test.go index cd8292e11..21a88ed76 100644 --- a/cmd/xl-v1-healing-common_test.go +++ b/cmd/xl-v1-healing-common_test.go @@ -23,6 +23,8 @@ import ( "path/filepath" "testing" "time" + + "github.com/minio/minio/pkg/madmin" ) // validates functionality 
provided to find most common @@ -239,7 +241,7 @@ func TestListOnlineDisks(t *testing.T) { i+1, test.expectedTime, modTime) } - availableDisks, newErrs := disksWithAllParts(context.Background(), onlineDisks, partsMetadata, test.errs, bucket, object) + availableDisks, newErrs := disksWithAllParts(context.Background(), onlineDisks, partsMetadata, test.errs, bucket, object, madmin.HealDeepScan) test.errs = newErrs if test._tamperBackend != noTamper { @@ -291,7 +293,7 @@ func TestDisksWithAllParts(t *testing.T) { t.Fatalf("Failed to read xl meta data %v", err) } - filteredDisks, errs := disksWithAllParts(ctx, xlDisks, partsMetadata, errs, bucket, object) + filteredDisks, errs := disksWithAllParts(ctx, xlDisks, partsMetadata, errs, bucket, object, madmin.HealDeepScan) if len(filteredDisks) != len(xlDisks) { t.Errorf("Unexpected number of disks: %d", len(filteredDisks)) @@ -328,7 +330,7 @@ func TestDisksWithAllParts(t *testing.T) { } errs = make([]error, len(xlDisks)) - filteredDisks, errs = disksWithAllParts(ctx, xlDisks, partsMetadata, errs, bucket, object) + filteredDisks, errs = disksWithAllParts(ctx, xlDisks, partsMetadata, errs, bucket, object, madmin.HealDeepScan) if len(filteredDisks) != len(xlDisks) { t.Errorf("Unexpected number of disks: %d", len(filteredDisks)) diff --git a/cmd/xl-v1-healing.go b/cmd/xl-v1-healing.go index bf3908cf0..8ea568ee4 100644 --- a/cmd/xl-v1-healing.go +++ b/cmd/xl-v1-healing.go @@ -208,7 +208,8 @@ func shouldHealObjectOnDisk(xlErr, dataErr error, meta xlMetaV1, quorumModTime t // Heals an object by re-writing corrupt/missing erasure blocks. 
func healObject(ctx context.Context, storageDisks []StorageAPI, bucket string, object string, - quorum int, dryRun bool) (result madmin.HealResultItem, err error) { + quorum int, dryRun bool, scanMode madmin.HealScanMode) (result madmin.HealResultItem, err error) { + partsMetadata, errs := readAllXLMetadata(ctx, storageDisks, bucket, object) errCount := 0 @@ -232,7 +233,7 @@ func healObject(ctx context.Context, storageDisks []StorageAPI, bucket string, o latestDisks, modTime := listOnlineDisks(storageDisks, partsMetadata, errs) // List of disks having all parts as per latest xl.json. - availableDisks, dataErrs := disksWithAllParts(ctx, latestDisks, partsMetadata, errs, bucket, object) + availableDisks, dataErrs := disksWithAllParts(ctx, latestDisks, partsMetadata, errs, bucket, object, scanMode) // Initialize heal result object result = madmin.HealResultItem{ @@ -621,7 +622,7 @@ func (xl xlObjects) isObjectDangling(metaArr []xlMetaV1, errs []error) (validMet // FIXME: If an object object was deleted and one disk was down, // and later the disk comes back up again, heal on the object // should delete it. -func (xl xlObjects) HealObject(ctx context.Context, bucket, object string, dryRun bool, remove bool) (hr madmin.HealResultItem, err error) { +func (xl xlObjects) HealObject(ctx context.Context, bucket, object string, dryRun bool, remove bool, scanMode madmin.HealScanMode) (hr madmin.HealResultItem, err error) { // Create context that also contains information about the object and bucket. // The top level handler might not have this information. reqInfo := logger.GetReqInfo(ctx) @@ -670,5 +671,5 @@ func (xl xlObjects) HealObject(ctx context.Context, bucket, object string, dryRu defer objectLock.RUnlock() // Heal the object. 
- return healObject(healCtx, xl.getDisks(), bucket, object, latestXLMeta.Erasure.DataBlocks, dryRun) + return healObject(healCtx, xl.getDisks(), bucket, object, latestXLMeta.Erasure.DataBlocks, dryRun, scanMode) } diff --git a/cmd/xl-v1-healing_test.go b/cmd/xl-v1-healing_test.go index d1f7c6313..b31acd1b6 100644 --- a/cmd/xl-v1-healing_test.go +++ b/cmd/xl-v1-healing_test.go @@ -21,6 +21,8 @@ import ( "context" "path/filepath" "testing" + + "github.com/minio/minio/pkg/madmin" ) // Tests undoes and validates if the undoing completes successfully. @@ -114,7 +116,7 @@ func TestHealObjectXL(t *testing.T) { t.Fatalf("Failed to delete a file - %v", err) } - _, err = obj.HealObject(context.Background(), bucket, object, false, false) + _, err = obj.HealObject(context.Background(), bucket, object, false, false, madmin.HealNormalScan) if err != nil { t.Fatalf("Failed to heal object - %v", err) } @@ -130,7 +132,7 @@ func TestHealObjectXL(t *testing.T) { } // Try healing now, expect to receive errDiskNotFound. 
- _, err = obj.HealObject(context.Background(), bucket, object, false, false) + _, err = obj.HealObject(context.Background(), bucket, object, false, false, madmin.HealDeepScan) // since majority of xl.jsons are not available, object quorum can't be read properly and error will be errXLReadQuorum if _, ok := err.(InsufficientReadQuorum); !ok { t.Errorf("Expected %v but received %v", InsufficientReadQuorum{}, err) diff --git a/cmd/xl-v1-object_test.go b/cmd/xl-v1-object_test.go index 329cfd516..822a0ef2e 100644 --- a/cmd/xl-v1-object_test.go +++ b/cmd/xl-v1-object_test.go @@ -28,6 +28,7 @@ import ( "time" humanize "github.com/dustin/go-humanize" + "github.com/minio/minio/pkg/madmin" ) func TestRepeatPutObjectPart(t *testing.T) { @@ -308,7 +309,7 @@ func TestHealing(t *testing.T) { t.Fatal(err) } - _, err = xl.HealObject(context.Background(), bucket, object, false, false) + _, err = xl.HealObject(context.Background(), bucket, object, false, false, madmin.HealNormalScan) if err != nil { t.Fatal(err) } @@ -337,7 +338,7 @@ func TestHealing(t *testing.T) { t.Fatal(err) } - _, err = xl.HealObject(context.Background(), bucket, object, false, false) + _, err = xl.HealObject(context.Background(), bucket, object, false, false, madmin.HealDeepScan) if err != nil { t.Fatal(err) } diff --git a/pkg/madmin/heal-commands.go b/pkg/madmin/heal-commands.go index c348b93fb..4aa743233 100644 --- a/pkg/madmin/heal-commands.go +++ b/pkg/madmin/heal-commands.go @@ -26,11 +26,22 @@ import ( "time" ) +// HealScanMode represents the type of healing scan +type HealScanMode int + +const ( + // HealNormalScan checks if parts are present and not outdated + HealNormalScan HealScanMode = iota + // HealDeepScan checks for parts bitrot checksums + HealDeepScan +) + // HealOpts - collection of options for a heal sequence type HealOpts struct { - Recursive bool `json:"recursive"` - DryRun bool `json:"dryRun"` - Remove bool `json:"remove"` + Recursive bool `json:"recursive"` + DryRun bool `json:"dryRun"` 
+ Remove bool `json:"remove"` + ScanMode HealScanMode `json:"scanMode"` } // HealStartSuccess - holds information about a successfully started