|
|
|
/*
|
|
|
|
* Minio Cloud Storage, (C) 2016 Minio, Inc.
|
|
|
|
*
|
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
* you may not use this file except in compliance with the License.
|
|
|
|
* You may obtain a copy of the License at
|
|
|
|
*
|
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
*
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
* See the License for the specific language governing permissions and
|
|
|
|
* limitations under the License.
|
|
|
|
*/
|
|
|
|
|
|
|
|
package cmd
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
"hash"
|
|
|
|
"strings"
|
|
|
|
|
|
|
|
"github.com/minio/minio/pkg/errors"
|
|
|
|
)
|
|
|
|
|
|
|
|
// HealFile tries to reconstruct an erasure-coded file spread over all
|
|
|
|
// available disks. HealFile will read the valid parts of the file,
|
|
|
|
// reconstruct the missing data and write the reconstructed parts back
|
|
|
|
// to `staleDisks` at the destination `dstVol/dstPath/`. Parts are
|
|
|
|
// verified against the given BitrotAlgorithm and checksums.
|
|
|
|
//
|
|
|
|
// `staleDisks` is a slice of disks where each non-nil entry has stale
|
|
|
|
// or no data, and so will be healed.
|
|
|
|
//
|
|
|
|
// It is required that `s.disks` have a (read-quorum) majority of
|
|
|
|
// disks with valid data for healing to work.
|
|
|
|
//
|
|
|
|
// In addition, `staleDisks` and `s.disks` must have the same ordering
|
|
|
|
// of disks w.r.t. erasure coding of the object.
|
|
|
|
//
|
|
|
|
// Errors when writing to `staleDisks` are not propagated as long as
|
|
|
|
// writes succeed for at least one disk. This allows partial healing
|
|
|
|
// despite stale disks being faulty.
|
|
|
|
//
|
|
|
|
// It returns bitrot checksums for the non-nil staleDisks on which
|
|
|
|
// healing succeeded.
|
|
|
|
func (s ErasureStorage) HealFile(staleDisks []StorageAPI, volume, path string, blocksize int64,
|
|
|
|
dstVol, dstPath string, size int64, alg BitrotAlgorithm, checksums [][]byte) (
|
|
|
|
f ErasureFileInfo, err error) {
|
|
|
|
|
|
|
|
if !alg.Available() {
|
|
|
|
return f, errors.Trace(errBitrotHashAlgoInvalid)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Initialization
|
|
|
|
f.Checksums = make([][]byte, len(s.disks))
|
|
|
|
hashers := make([]hash.Hash, len(s.disks))
|
|
|
|
verifiers := make([]*BitrotVerifier, len(s.disks))
|
|
|
|
for i, disk := range s.disks {
|
|
|
|
switch {
|
|
|
|
case staleDisks[i] != nil:
|
|
|
|
hashers[i] = alg.New()
|
|
|
|
case disk == nil:
|
|
|
|
// disregard unavailable disk
|
|
|
|
continue
|
|
|
|
default:
|
|
|
|
verifiers[i] = NewBitrotVerifier(alg, checksums[i])
|
|
|
|
}
|
|
|
|
}
|
|
|
|
writeErrors := make([]error, len(s.disks))
|
|
|
|
|
|
|
|
// Read part file data on each disk
|
|
|
|
chunksize := ceilFrac(blocksize, int64(s.dataBlocks))
|
|
|
|
numBlocks := ceilFrac(size, blocksize)
|
|
|
|
|
|
|
|
readLen := chunksize * (numBlocks - 1)
|
|
|
|
|
|
|
|
lastChunkSize := chunksize
|
|
|
|
hasSmallerLastBlock := size%blocksize != 0
|
|
|
|
if hasSmallerLastBlock {
|
|
|
|
lastBlockLen := size % blocksize
|
|
|
|
lastChunkSize = ceilFrac(lastBlockLen, int64(s.dataBlocks))
|
|
|
|
}
|
|
|
|
readLen += lastChunkSize
|
|
|
|
var buffers [][]byte
|
|
|
|
buffers, _, err = s.readConcurrent(volume, path, 0, readLen, verifiers)
|
|
|
|
if err != nil {
|
|
|
|
return f, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Scan part files on disk, block-by-block reconstruct it and
|
|
|
|
// write to stale disks.
|
|
|
|
blocks := make([][]byte, len(s.disks))
|
|
|
|
|
|
|
|
if numBlocks > 1 {
|
|
|
|
// Allocate once for all the equal length blocks. The
|
|
|
|
// last block may have a different length - allocation
|
|
|
|
// for this happens inside the for loop below.
|
|
|
|
for i := range blocks {
|
|
|
|
if len(buffers[i]) == 0 {
|
|
|
|
blocks[i] = make([]byte, chunksize)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
var buffOffset int64
|
|
|
|
for blockNumber := int64(0); blockNumber < numBlocks; blockNumber++ {
|
|
|
|
if blockNumber == numBlocks-1 && lastChunkSize != chunksize {
|
|
|
|
for i := range blocks {
|
|
|
|
if len(buffers[i]) == 0 {
|
|
|
|
blocks[i] = make([]byte, lastChunkSize)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for i := range blocks {
|
|
|
|
if len(buffers[i]) == 0 {
|
|
|
|
blocks[i] = blocks[i][0:0]
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
csize := chunksize
|
|
|
|
if blockNumber == numBlocks-1 {
|
|
|
|
csize = lastChunkSize
|
|
|
|
}
|
|
|
|
for i := range blocks {
|
|
|
|
if len(buffers[i]) != 0 {
|
|
|
|
blocks[i] = buffers[i][buffOffset : buffOffset+csize]
|
|
|
|
}
|
|
|
|
}
|
|
|
|
buffOffset += csize
|
|
|
|
|
|
|
|
if err = s.ErasureDecodeDataAndParityBlocks(blocks); err != nil {
|
|
|
|
return f, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// write computed shards as chunks on file in each
|
|
|
|
// stale disk
|
|
|
|
writeSucceeded := false
|
|
|
|
for i, disk := range staleDisks {
|
|
|
|
// skip nil disk or disk that had error on
|
|
|
|
// previous write
|
|
|
|
if disk == nil || writeErrors[i] != nil {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
writeErrors[i] = disk.AppendFile(dstVol, dstPath, blocks[i])
|
|
|
|
if writeErrors[i] == nil {
|
|
|
|
hashers[i].Write(blocks[i])
|
|
|
|
writeSucceeded = true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// If all disks had write errors we quit.
|
|
|
|
if !writeSucceeded {
|
|
|
|
// build error from all write errors
|
|
|
|
return f, errors.Trace(joinWriteErrors(writeErrors))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// copy computed file hashes into output variable
|
|
|
|
f.Size = size
|
|
|
|
f.Algorithm = alg
|
|
|
|
for i, disk := range staleDisks {
|
|
|
|
if disk == nil || writeErrors[i] != nil {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
f.Checksums[i] = hashers[i].Sum(nil)
|
|
|
|
}
|
|
|
|
return f, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// joinWriteErrors collapses the non-nil entries of errs into a single
// error. Each entry is reported as "disk N: <error>", where N is the
// entry's 1-based position in errs, and the entries are separated by
// ", ". A non-nil error is returned even when errs has no non-nil
// entries.
func joinWriteErrors(errs []error) error {
	parts := make([]string, 0, len(errs))
	for idx := range errs {
		if e := errs[idx]; e != nil {
			parts = append(parts, fmt.Sprintf("disk %d: %v", idx+1, e))
		}
	}
	detail := strings.Join(parts, ", ")
	return fmt.Errorf("all stale disks had write errors during healing: %s", detail)
}
|