You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 
minio/cmd/xl-v1-healing.go

634 lines
19 KiB

/*
* Minio Cloud Storage, (C) 2016 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cmd
import (
"fmt"
"path"
"sort"
"sync"
"github.com/minio/minio/pkg/errors"
"github.com/minio/minio/pkg/madmin"
)
// healFormatXL - heals missing `format.json` on freshly or corrupted
// disks (missing format.json but does have erasure coded data in it).
func healFormatXL(storageDisks []StorageAPI, dryRun bool) (res madmin.HealResultItem,
err error) {
// Attempt to load all `format.json`.
formatConfigs, sErrs := loadAllFormats(storageDisks)
// Generic format check.
// - if (no quorum) return error
// - if (disks not recognized) // Always error.
if err = genericFormatCheckXL(formatConfigs, sErrs); err != nil {
return res, err
}
// Prepare heal-result
res = madmin.HealResultItem{
Type: madmin.HealItemMetadata,
Detail: "disk-format",
DiskCount: len(storageDisks),
}
res.InitDrives()
// Existing formats are available (i.e. ok), so save it in
// result, also populate disks to be healed.
for i, format := range formatConfigs {
drive := globalEndpoints.GetString(i)
switch {
case format != nil:
res.DriveInfo.Before[drive] = madmin.DriveStateOk
case sErrs[i] == errCorruptedFormat:
res.DriveInfo.Before[drive] = madmin.DriveStateCorrupt
case sErrs[i] == errUnformattedDisk:
res.DriveInfo.Before[drive] = madmin.DriveStateMissing
default:
res.DriveInfo.Before[drive] = madmin.DriveStateOffline
}
}
// Copy "after" drive state too
for k, v := range res.DriveInfo.Before {
res.DriveInfo.After[k] = v
}
numDisks := len(storageDisks)
_, unformattedDiskCount, diskNotFoundCount,
corruptedFormatCount, otherErrCount := formatErrsSummary(sErrs)
switch {
case unformattedDiskCount == numDisks:
// all unformatted.
if !dryRun {
err = initFormatXL(storageDisks)
if err != nil {
return res, err
}
for i := 0; i < len(storageDisks); i++ {
drive := globalEndpoints.GetString(i)
res.DriveInfo.After[drive] = madmin.DriveStateOk
}
}
return res, nil
case diskNotFoundCount > 0:
return res, fmt.Errorf("cannot proceed with heal as %s",
errSomeDiskOffline)
case otherErrCount > 0:
return res, fmt.Errorf("cannot proceed with heal as some disks had unhandled errors")
case corruptedFormatCount > 0:
// heal corrupted disks
err = healFormatXLCorruptedDisks(storageDisks, formatConfigs,
dryRun)
if err != nil {
return res, err
}
// success
if !dryRun {
for i := 0; i < len(storageDisks); i++ {
drive := globalEndpoints.GetString(i)
res.DriveInfo.After[drive] = madmin.DriveStateOk
}
}
return res, nil
case unformattedDiskCount > 0:
// heal unformatted disks
err = healFormatXLFreshDisks(storageDisks, formatConfigs,
dryRun)
if err != nil {
return res, err
}
// success
if !dryRun {
for i := 0; i < len(storageDisks); i++ {
drive := globalEndpoints.GetString(i)
res.DriveInfo.After[drive] = madmin.DriveStateOk
}
}
return res, nil
}
return res, nil
}
// Heals a bucket if it doesn't exist on one of the disks, additionally
// also heals the missing entries for bucket metadata files
// `policy.json, notification.xml, listeners.json`.
func (xl xlObjects) HealBucket(bucket string, dryRun bool) (
results []madmin.HealResultItem, err error) {
if err = checkBucketExist(bucket, xl); err != nil {
return nil, err
}
// get write quorum for an object
writeQuorum := len(xl.storageDisks)/2 + 1
bucketLock := xl.nsMutex.NewNSLock(bucket, "")
if err = bucketLock.GetLock(globalHealingTimeout); err != nil {
return nil, err
}
defer bucketLock.Unlock()
// Heal bucket.
result, err := healBucket(xl.storageDisks, bucket, writeQuorum, dryRun)
if err != nil {
return results, err
}
results = append(results, result)
// Proceed to heal bucket metadata.
metaResults, err := healBucketMetadata(xl, bucket, dryRun)
results = append(results, metaResults...)
return results, err
}
// Heal bucket - create buckets on disks where it does not exist.
func healBucket(storageDisks []StorageAPI, bucket string, writeQuorum int,
dryRun bool) (res madmin.HealResultItem, err error) {
// Initialize sync waitgroup.
var wg = &sync.WaitGroup{}
// Initialize list of errors.
var dErrs = make([]error, len(storageDisks))
// Disk states slices
beforeState := make([]string, len(storageDisks))
afterState := make([]string, len(storageDisks))
// Make a volume entry on all underlying storage disks.
for index, disk := range storageDisks {
if disk == nil {
dErrs[index] = errors.Trace(errDiskNotFound)
beforeState[index] = madmin.DriveStateOffline
afterState[index] = madmin.DriveStateOffline
continue
}
wg.Add(1)
// Make a volume inside a go-routine.
go func(index int, disk StorageAPI) {
defer wg.Done()
if _, err := disk.StatVol(bucket); err != nil {
if errors.Cause(err) != errVolumeNotFound {
beforeState[index] = madmin.DriveStateCorrupt
afterState[index] = madmin.DriveStateCorrupt
dErrs[index] = errors.Trace(err)
return
}
beforeState[index] = madmin.DriveStateMissing
afterState[index] = madmin.DriveStateMissing
// mutate only if not a dry-run
if dryRun {
return
}
makeErr := disk.MakeVol(bucket)
dErrs[index] = errors.Trace(makeErr)
if makeErr == nil {
afterState[index] = madmin.DriveStateOk
}
} else {
beforeState[index] = madmin.DriveStateOk
afterState[index] = madmin.DriveStateOk
}
}(index, disk)
}
// Wait for all make vol to finish.
wg.Wait()
// Initialize heal result info
res = madmin.HealResultItem{
Type: madmin.HealItemBucket,
Bucket: bucket,
DiskCount: len(storageDisks),
}
res.InitDrives()
for i, before := range beforeState {
drive := globalEndpoints.GetString(i)
res.DriveInfo.Before[drive] = before
res.DriveInfo.After[drive] = afterState[i]
}
reducedErr := reduceWriteQuorumErrs(dErrs, bucketOpIgnoredErrs, writeQuorum)
if errors.Cause(reducedErr) == errXLWriteQuorum {
// Purge successfully created buckets if we don't have writeQuorum.
undoMakeBucket(storageDisks, bucket)
}
return res, reducedErr
}
// Heals all the metadata associated for a given bucket, this function
// heals `policy.json`, `notification.xml` and `listeners.json`.
func healBucketMetadata(xl xlObjects, bucket string, dryRun bool) (
results []madmin.HealResultItem, err error) {
healBucketMetaFn := func(metaPath string) error {
result, healErr := xl.HealObject(minioMetaBucket, metaPath, dryRun)
// If object is not found, no result to add.
if isErrObjectNotFound(healErr) {
return nil
}
if healErr != nil {
return healErr
}
result.Type = madmin.HealItemBucketMetadata
results = append(results, result)
return nil
}
// Heal `policy.json` for missing entries, ignores if
// `policy.json` is not found.
policyPath := pathJoin(bucketConfigPrefix, bucket, bucketPolicyConfig)
err = healBucketMetaFn(policyPath)
if err != nil {
return results, err
}
// Heal `notification.xml` for missing entries, ignores if
// `notification.xml` is not found.
nConfigPath := path.Join(bucketConfigPrefix, bucket,
bucketNotificationConfig)
err = healBucketMetaFn(nConfigPath)
if err != nil {
return results, err
}
// Heal `listeners.json` for missing entries, ignores if
// `listeners.json` is not found.
lConfigPath := path.Join(bucketConfigPrefix, bucket, bucketListenerConfig)
err = healBucketMetaFn(lConfigPath)
return results, err
}
// listAllBuckets lists all buckets from all disks. It also
// returns the occurrence of each buckets in all disks
func listAllBuckets(storageDisks []StorageAPI) (buckets map[string]VolInfo,
bucketsOcc map[string]int, err error) {
buckets = make(map[string]VolInfo)
bucketsOcc = make(map[string]int)
for _, disk := range storageDisks {
if disk == nil {
continue
}
var volsInfo []VolInfo
volsInfo, err = disk.ListVols()
if err != nil {
if errors.IsErrIgnored(err, bucketMetadataOpIgnoredErrs...) {
continue
}
return nil, nil, err
}
for _, volInfo := range volsInfo {
// StorageAPI can send volume names which are
// incompatible with buckets - these are
// skipped, like the meta-bucket.
if !IsValidBucketName(volInfo.Name) ||
isMinioMetaBucketName(volInfo.Name) {
continue
}
// Increase counter per bucket name
bucketsOcc[volInfo.Name]++
// Save volume info under bucket name
buckets[volInfo.Name] = volInfo
}
}
return buckets, bucketsOcc, nil
}
// ListBucketsHeal - Find all buckets that need to be healed
func (xl xlObjects) ListBucketsHeal() ([]BucketInfo, error) {
listBuckets := []BucketInfo{}
// List all buckets that can be found in all disks
buckets, _, err := listAllBuckets(xl.storageDisks)
if err != nil {
return listBuckets, err
}
// Iterate over all buckets
for _, currBucket := range buckets {
listBuckets = append(listBuckets,
BucketInfo{currBucket.Name, currBucket.Created})
}
// Sort found buckets
sort.Sort(byBucketName(listBuckets))
return listBuckets, nil
}
// This function is meant for all the healing that needs to be done
// during startup i.e healing of buckets, bucket metadata (policy.json,
// notification.xml, listeners.json) etc. Currently this function
// supports quick healing of buckets, bucket metadata.
func quickHeal(xlObj xlObjects, writeQuorum int, readQuorum int) error {
// List all bucket name occurrence from all disks.
_, bucketOcc, err := listAllBuckets(xlObj.storageDisks)
if err != nil {
return err
}
// All bucket names and bucket metadata that should be healed.
for bucketName, occCount := range bucketOcc {
// Heal bucket only if healing is needed.
if occCount != len(xlObj.storageDisks) {
bucketLock := xlObj.nsMutex.NewNSLock(bucketName, "")
if perr := bucketLock.GetLock(globalHealingTimeout); perr != nil {
return perr
}
defer bucketLock.Unlock()
// Heal bucket and then proceed to heal bucket metadata if any.
if _, err = healBucket(xlObj.storageDisks, bucketName, writeQuorum, false); err == nil {
if _, err = healBucketMetadata(xlObj, bucketName, false); err == nil {
continue
}
return err
}
return err
}
}
// Success.
return nil
}
// Heals an object by re-writing corrupt/missing erasure blocks.
func healObject(storageDisks []StorageAPI, bucket string, object string,
quorum int, dryRun bool) (result madmin.HealResultItem, err error) {
partsMetadata, errs := readAllXLMetadata(storageDisks, bucket, object)
// readQuorum suffices for xl.json since we use monotonic
// system time to break the tie when a split-brain situation
// arises.
if reducedErr := reduceReadQuorumErrs(errs, nil, quorum); reducedErr != nil {
return result, toObjectErr(reducedErr, bucket, object)
}
// List of disks having latest version of the object xl.json
// (by modtime).
latestDisks, modTime := listOnlineDisks(storageDisks, partsMetadata, errs)
// List of disks having all parts as per latest xl.json.
availableDisks, dataErrs, aErr := disksWithAllParts(latestDisks, partsMetadata, errs, bucket, object)
if aErr != nil {
return result, toObjectErr(aErr, bucket, object)
}
// Initialize heal result object
result = madmin.HealResultItem{
Type: madmin.HealItemObject,
Bucket: bucket,
Object: object,
DiskCount: len(storageDisks),
// Initialize object size to -1, so we can detect if we are
// unable to reliably find the object size.
ObjectSize: -1,
}
result.InitDrives()
// Loop to find number of disks with valid data, per-drive
// data state and a list of outdated disks on which data needs
// to be healed.
outDatedDisks := make([]StorageAPI, len(storageDisks))
numAvailableDisks := 0
disksToHealCount := 0
for i, v := range availableDisks {
driveState := ""
switch {
case v != nil:
driveState = madmin.DriveStateOk
numAvailableDisks++
// If data is sane on any one disk, we can
// extract the correct object size.
result.ObjectSize = partsMetadata[i].Stat.Size
result.ParityBlocks = partsMetadata[i].Erasure.ParityBlocks
result.DataBlocks = partsMetadata[i].Erasure.DataBlocks
case errors.Cause(errs[i]) == errDiskNotFound:
driveState = madmin.DriveStateOffline
case errors.Cause(errs[i]) == errFileNotFound, errors.Cause(errs[i]) == errVolumeNotFound:
fallthrough
case errors.Cause(dataErrs[i]) == errFileNotFound, errors.Cause(dataErrs[i]) == errVolumeNotFound:
driveState = madmin.DriveStateMissing
default:
// all remaining cases imply corrupt data/metadata
driveState = madmin.DriveStateCorrupt
}
drive := globalEndpoints.GetString(i)
result.DriveInfo.Before[drive] = driveState
// copy for 'after' state
result.DriveInfo.After[drive] = driveState
// an online disk without valid data/metadata is
// outdated and can be healed.
if errs[i] != errDiskNotFound && v == nil {
outDatedDisks[i] = storageDisks[i]
disksToHealCount++
}
}
// If less than read quorum number of disks have all the parts
// of the data, we can't reconstruct the erasure-coded data.
if numAvailableDisks < quorum {
return result, toObjectErr(errXLReadQuorum, bucket, object)
}
if disksToHealCount == 0 {
// Nothing to heal!
return result, nil
}
// After this point, only have to repair data on disk - so
// return if it is a dry-run
if dryRun {
return result, nil
}
// Latest xlMetaV1 for reference. If a valid metadata is not
// present, it is as good as object not found.
latestMeta, pErr := pickValidXLMeta(partsMetadata, modTime)
if pErr != nil {
return result, toObjectErr(pErr, bucket, object)
}
// Clear data files of the object on outdated disks
for _, disk := range outDatedDisks {
// Before healing outdated disks, we need to remove
// xl.json and part files from "bucket/object/" so
// that rename(minioMetaBucket, "tmp/tmpuuid/",
// "bucket", "object/") succeeds.
if disk == nil {
// Not an outdated disk.
continue
}
// List and delete the object directory, ignoring
// errors.
files, derr := disk.ListDir(bucket, object)
if derr == nil {
for _, entry := range files {
_ = disk.DeleteFile(bucket,
pathJoin(object, entry))
}
}
}
// Reorder so that we have data disks first and parity disks next.
latestDisks = shuffleDisks(latestDisks, latestMeta.Erasure.Distribution)
outDatedDisks = shuffleDisks(outDatedDisks, latestMeta.Erasure.Distribution)
partsMetadata = shufflePartsMetadata(partsMetadata, latestMeta.Erasure.Distribution)
// We write at temporary location and then rename to final location.
tmpID := mustGetUUID()
// Checksum of the part files. checkSumInfos[index] will
// contain checksums of all the part files in the
// outDatedDisks[index]
checksumInfos := make([][]ChecksumInfo, len(outDatedDisks))
// Heal each part. erasureHealFile() will write the healed
// part to .minio/tmp/uuid/ which needs to be renamed later to
// the final location.
storage, err := NewErasureStorage(latestDisks, latestMeta.Erasure.DataBlocks,
latestMeta.Erasure.ParityBlocks, latestMeta.Erasure.BlockSize)
if err != nil {
return result, toObjectErr(err, bucket, object)
}
checksums := make([][]byte, len(latestDisks))
for partIndex := 0; partIndex < len(latestMeta.Parts); partIndex++ {
partName := latestMeta.Parts[partIndex].Name
partSize := latestMeta.Parts[partIndex].Size
erasure := latestMeta.Erasure
var algorithm BitrotAlgorithm
for i, disk := range storage.disks {
if disk != OfflineDisk {
info := partsMetadata[i].Erasure.GetChecksumInfo(partName)
algorithm = info.Algorithm
checksums[i] = info.Hash
}
}
// Heal the part file.
file, hErr := storage.HealFile(outDatedDisks, bucket, pathJoin(object, partName),
erasure.BlockSize, minioMetaTmpBucket, pathJoin(tmpID, partName), partSize,
algorithm, checksums)
if hErr != nil {
return result, toObjectErr(hErr, bucket, object)
}
// outDatedDisks that had write errors should not be
// written to for remaining parts, so we nil it out.
for i, disk := range outDatedDisks {
if disk == nil {
continue
}
// A non-nil stale disk which did not receive
// a healed part checksum had a write error.
if file.Checksums[i] == nil {
outDatedDisks[i] = nil
disksToHealCount--
continue
}
// append part checksums
checksumInfos[i] = append(checksumInfos[i],
ChecksumInfo{partName, file.Algorithm, file.Checksums[i]})
}
// If all disks are having errors, we give up.
if disksToHealCount == 0 {
return result, fmt.Errorf("all disks without up-to-date data had write errors")
}
}
// xl.json should be written to all the healed disks.
for index, disk := range outDatedDisks {
if disk == nil {
continue
}
partsMetadata[index] = latestMeta
partsMetadata[index].Erasure.Checksums = checksumInfos[index]
}
// Generate and write `xl.json` generated from other disks.
outDatedDisks, aErr = writeUniqueXLMetadata(outDatedDisks, minioMetaTmpBucket, tmpID,
partsMetadata, diskCount(outDatedDisks))
if aErr != nil {
return result, toObjectErr(aErr, bucket, object)
}
// Rename from tmp location to the actual location.
for diskIndex, disk := range outDatedDisks {
if disk == nil {
continue
}
// Attempt a rename now from healed data to final location.
aErr = disk.RenameFile(minioMetaTmpBucket, retainSlash(tmpID), bucket,
retainSlash(object))
if aErr != nil {
return result, toObjectErr(errors.Trace(aErr), bucket, object)
}
realDiskIdx := unshuffleIndex(diskIndex,
latestMeta.Erasure.Distribution)
drive := globalEndpoints.GetString(realDiskIdx)
result.DriveInfo.After[drive] = madmin.DriveStateOk
}
// Set the size of the object in the heal result
result.ObjectSize = latestMeta.Stat.Size
return result, nil
}
// HealObject - heal the given object.
//
// FIXME: If an object object was deleted and one disk was down,
// and later the disk comes back up again, heal on the object
// should delete it.
func (xl xlObjects) HealObject(bucket, object string, dryRun bool) (
hr madmin.HealResultItem, err error) {
// FIXME: Metadata is read again in the healObject() call below.
// Read metadata files from all the disks
partsMetadata, errs := readAllXLMetadata(xl.storageDisks, bucket, object)
// get read quorum for this object
var readQuorum int
readQuorum, _, err = objectQuorumFromMeta(xl, partsMetadata, errs)
if err != nil {
return hr, err
}
// Lock the object before healing.
objectLock := xl.nsMutex.NewNSLock(bucket, object)
if lerr := objectLock.GetRLock(globalHealingTimeout); lerr != nil {
return hr, lerr
}
defer objectLock.RUnlock()
// Heal the object.
return healObject(xl.storageDisks, bucket, object, readQuorum, dryRun)
}