You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
300 lines
9.3 KiB
300 lines
9.3 KiB
/*
|
|
* Minio Cloud Storage, (C) 2016, 2017 Minio, Inc.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
package cmd
|
|
|
|
import (
|
|
"crypto/subtle"
|
|
"path/filepath"
|
|
"sync"
|
|
"time"
|
|
|
|
"io"
|
|
)
|
|
|
|
// healBufferPool is a pool of reusable buffers used to verify a stream
|
|
// while healing.
|
|
var healBufferPool = sync.Pool{
|
|
New: func() interface{} {
|
|
b := make([]byte, readSizeV1)
|
|
return &b
|
|
},
|
|
}
|
|
|
|
// commonTime returns the modTime that occurs most often in modTimes
// together with the number of times it occurs.
//
// Entries that represent the same instant as timeSentinel are ignored.
// When several modTimes share the highest count, the latest of them is
// returned, so the result does not depend on map iteration order. If
// nothing but sentinels is supplied (or modTimes is empty) the zero
// time.Time and a count of 0 are returned.
func commonTime(modTimes []time.Time) (modTime time.Time, count int) {
	// Number of occurrences of every distinct, non-sentinel modTime.
	occurrences := make(map[time.Time]int, len(modTimes))
	for _, t := range modTimes {
		// Equal compares instants. Using == would compare the struct
		// fields, so a sentinel carrying a different Location would be
		// mistaken for a real modTime.
		if t.Equal(timeSentinel) {
			continue
		}
		occurrences[t]++
	}

	// Pick the most frequent modTime; ties go to the later one.
	for t, n := range occurrences {
		if n > count || (n == count && t.After(modTime)) {
			modTime, count = t, n
		}
	}
	return modTime, count
}

// timeSentinel marks a slot for which no modTime is known. The
// beginning of unix time (in UTC) is used as the sentinel value.
var timeSentinel = time.Unix(0, 0).UTC()

// bootModtimes returns a slice of diskCount modTimes, each of them
// initialised to timeSentinel.
func bootModtimes(diskCount int) []time.Time {
	modTimes := make([]time.Time, diskCount)
	for i := range modTimes {
		modTimes[i] = timeSentinel
	}
	return modTimes
}
|
|
|
|
// Extracts list of times from xlMetaV1 slice and returns, skips
|
|
// slice elements which have errors.
|
|
func listObjectModtimes(partsMetadata []xlMetaV1, errs []error) (modTimes []time.Time) {
|
|
modTimes = bootModtimes(len(partsMetadata))
|
|
for index, metadata := range partsMetadata {
|
|
if errs[index] != nil {
|
|
continue
|
|
}
|
|
// Once the file is found, save the uuid saved on disk.
|
|
modTimes[index] = metadata.Stat.ModTime
|
|
}
|
|
return modTimes
|
|
}
|
|
|
|
// Notes:
|
|
// There are 5 possible states a disk could be in,
|
|
// 1. __online__ - has the latest copy of xl.json - returned by listOnlineDisks
|
|
//
|
|
// 2. __offline__ - err == errDiskNotFound
|
|
//
|
|
// 3. __availableWithParts__ - has the latest copy of xl.json and has all
|
|
// parts with checksums matching; returned by disksWithAllParts
|
|
//
|
|
// 4. __outdated__ - returned by outDatedDisks, provided []StorageAPI
// returned by disksWithAllParts is passed for latestDisks.
|
|
// - has an old copy of xl.json
|
|
// - doesn't have xl.json (errFileNotFound)
|
|
// - has the latest xl.json but one or more parts are corrupt
|
|
//
|
|
// 5. __missingParts__ - has the latest copy of xl.json but has some parts
|
|
// missing. This is identified separately since this may need manual
|
|
// inspection to understand the root cause. E.g, this could be due to
|
|
// backend filesystem corruption.
|
|
|
|
// listOnlineDisks - returns
|
|
// - a slice of disks where disk having 'older' xl.json (or nothing)
|
|
// are set to nil.
|
|
// - latest (in time) of the maximally occurring modTime(s).
|
|
func listOnlineDisks(disks []StorageAPI, partsMetadata []xlMetaV1, errs []error) (onlineDisks []StorageAPI, modTime time.Time) {
|
|
onlineDisks = make([]StorageAPI, len(disks))
|
|
|
|
// List all the file commit ids from parts metadata.
|
|
modTimes := listObjectModtimes(partsMetadata, errs)
|
|
|
|
// Reduce list of UUIDs to a single common value.
|
|
modTime, _ = commonTime(modTimes)
|
|
|
|
// Create a new online disks slice, which have common uuid.
|
|
for index, t := range modTimes {
|
|
if t == modTime {
|
|
onlineDisks[index] = disks[index]
|
|
} else {
|
|
onlineDisks[index] = nil
|
|
}
|
|
}
|
|
return onlineDisks, modTime
|
|
}
|
|
|
|
// outDatedDisks - return disks which don't have the latest object (i.e xl.json).
|
|
// disks that are offline are not 'marked' outdated.
|
|
func outDatedDisks(disks, latestDisks []StorageAPI, errs []error, partsMetadata []xlMetaV1,
|
|
bucket, object string) (outDatedDisks []StorageAPI) {
|
|
|
|
outDatedDisks = make([]StorageAPI, len(disks))
|
|
for index, latestDisk := range latestDisks {
|
|
if latestDisk != nil {
|
|
continue
|
|
}
|
|
// disk either has an older xl.json or doesn't have one.
|
|
switch errorCause(errs[index]) {
|
|
case nil, errFileNotFound:
|
|
outDatedDisks[index] = disks[index]
|
|
}
|
|
}
|
|
|
|
return outDatedDisks
|
|
}
|
|
|
|
// Returns if the object should be healed.
|
|
func xlShouldHeal(disks []StorageAPI, partsMetadata []xlMetaV1, errs []error, bucket, object string) bool {
|
|
onlineDisks, _ := listOnlineDisks(disks, partsMetadata,
|
|
errs)
|
|
// Return true even if one of the disks have stale data.
|
|
for _, disk := range onlineDisks {
|
|
if disk == nil {
|
|
return true
|
|
}
|
|
}
|
|
|
|
// Check if all parts of an object are available and their
|
|
// checksums are valid.
|
|
availableDisks, _, err := disksWithAllParts(onlineDisks, partsMetadata,
|
|
errs, bucket, object)
|
|
if err != nil {
|
|
// Note: This error is due to failure of blake2b
|
|
// checksum computation of a part. It doesn't clearly
|
|
// indicate if the object needs healing. At this
|
|
// juncture healing could fail with the same
|
|
// error. So, we choose to return that there is no
|
|
// need to heal.
|
|
return false
|
|
}
|
|
|
|
// Return true even if one disk has xl.json or one or more
|
|
// parts missing.
|
|
for _, disk := range availableDisks {
|
|
if disk == nil {
|
|
return true
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
// xlHealStat - returns a structure which describes how many data,
|
|
// parity erasure blocks are missing and if it is possible to heal
|
|
// with the blocks present.
|
|
func xlHealStat(xl xlObjects, partsMetadata []xlMetaV1, errs []error) HealObjectInfo {
|
|
// Less than quorum erasure coded blocks of the object have the same create time.
|
|
// This object can't be healed with the information we have.
|
|
modTime, count := commonTime(listObjectModtimes(partsMetadata, errs))
|
|
if count < xl.readQuorum {
|
|
return HealObjectInfo{
|
|
Status: quorumUnavailable,
|
|
MissingDataCount: 0,
|
|
MissingParityCount: 0,
|
|
}
|
|
}
|
|
|
|
// If there isn't a valid xlMeta then we can't heal the object.
|
|
xlMeta, err := pickValidXLMeta(partsMetadata, modTime)
|
|
if err != nil {
|
|
return HealObjectInfo{
|
|
Status: corrupted,
|
|
MissingDataCount: 0,
|
|
MissingParityCount: 0,
|
|
}
|
|
}
|
|
|
|
// Compute heal statistics like bytes to be healed, missing
|
|
// data and missing parity count.
|
|
missingDataCount := 0
|
|
missingParityCount := 0
|
|
|
|
disksMissing := false
|
|
for i, err := range errs {
|
|
// xl.json is not found, which implies the erasure
|
|
// coded blocks are unavailable in the corresponding disk.
|
|
// First half of the disks are data and the rest are parity.
|
|
switch realErr := errorCause(err); realErr {
|
|
case errDiskNotFound:
|
|
disksMissing = true
|
|
fallthrough
|
|
case errFileNotFound:
|
|
if xlMeta.Erasure.Distribution[i]-1 < xl.dataBlocks {
|
|
missingDataCount++
|
|
} else {
|
|
missingParityCount++
|
|
}
|
|
}
|
|
}
|
|
|
|
// The object may not be healed completely, since some of the
|
|
// disks needing healing are unavailable.
|
|
if disksMissing {
|
|
return HealObjectInfo{
|
|
Status: canPartiallyHeal,
|
|
MissingDataCount: missingDataCount,
|
|
MissingParityCount: missingParityCount,
|
|
}
|
|
}
|
|
|
|
// This object can be healed. We have enough object metadata
|
|
// to reconstruct missing erasure coded blocks.
|
|
return HealObjectInfo{
|
|
Status: canHeal,
|
|
MissingDataCount: missingDataCount,
|
|
MissingParityCount: missingParityCount,
|
|
}
|
|
}
|
|
|
|
// disksWithAllParts - This function needs to be called with
|
|
// []StorageAPI returned by listOnlineDisks. Returns,
|
|
// - disks which have all parts specified in the latest xl.json.
|
|
// - errs updated to have errFileNotFound in place of disks that had
|
|
// missing parts.
|
|
// - non-nil error if any of the online disks failed during
|
|
// calculating blake2b checksum.
|
|
func disksWithAllParts(onlineDisks []StorageAPI, partsMetadata []xlMetaV1, errs []error, bucket, object string) ([]StorageAPI, []error, error) {
|
|
availableDisks := make([]StorageAPI, len(onlineDisks))
|
|
buffer := healBufferPool.Get().(*[]byte)
|
|
defer healBufferPool.Put(buffer)
|
|
|
|
for diskIndex, onlineDisk := range onlineDisks {
|
|
if onlineDisk == OfflineDisk {
|
|
continue
|
|
}
|
|
// disk has a valid xl.json but may not have all the
|
|
// parts. This is considered an outdated disk, since
|
|
// it needs healing too.
|
|
for _, part := range partsMetadata[diskIndex].Parts {
|
|
partPath := filepath.Join(object, part.Name)
|
|
checkSumInfo := partsMetadata[diskIndex].Erasure.GetChecksumInfo(part.Name)
|
|
hash := checkSumInfo.Algorithm.New()
|
|
_, hErr := io.CopyBuffer(hash, StorageReader(onlineDisk, bucket, partPath, 0), *buffer)
|
|
if hErr == errFileNotFound {
|
|
errs[diskIndex] = errFileNotFound
|
|
availableDisks[diskIndex] = OfflineDisk
|
|
break
|
|
}
|
|
if hErr != nil && hErr != errFileNotFound {
|
|
return nil, nil, traceError(hErr)
|
|
}
|
|
if subtle.ConstantTimeCompare(hash.Sum(nil), checkSumInfo.Hash) != 1 {
|
|
errs[diskIndex] = errFileNotFound
|
|
availableDisks[diskIndex] = OfflineDisk
|
|
break
|
|
}
|
|
availableDisks[diskIndex] = onlineDisk
|
|
}
|
|
}
|
|
|
|
return availableDisks, errs, nil
|
|
}
|
|
|