diff --git a/cmd/erasure-healfile.go b/cmd/erasure-healfile.go
index 6518378c6..79d598535 100644
--- a/cmd/erasure-healfile.go
+++ b/cmd/erasure-healfile.go
@@ -16,72 +16,140 @@
 
 package cmd
 
-import "hash"
+import (
+	"hash"
+)
+
+// HealFile tries to reconstruct an erasure-coded file spread over all
+// available disks. HealFile will read the valid parts of the file,
+// reconstruct the missing data and write the reconstructed parts back
+// to `staleDisks`.
+//
+// `staleDisks` is a slice of disks where each non-nil entry has stale
+// or no data, and so will be healed.
+//
+// It is required that `s.disks` have a (read-quorum) majority of
+// disks with valid data for healing to work.
+//
+// In addition, `staleDisks` and `s.disks` must have the same ordering
+// of disks w.r.t. erasure coding of the object.
+//
+// The function will try to read the valid parts from the file under
+// the given volume and path, and to reconstruct the file under
+// the given healVolume and healPath (on staleDisks). The given
+// algorithm will be used to verify the valid parts and to protect the
+// reconstructed file.
+//
+// It returns bitrot checksums for the non-nil staleDisks.
+func (s ErasureStorage) HealFile(staleDisks []StorageAPI, volume, path string,
+	blocksize int64, healVolume, healPath string, size int64,
+	algorithm BitrotAlgorithm, checksums [][]byte) (f ErasureFileInfo,
+	err error) {
 
-// HealFile tries to reconstruct a bitrot encoded file spread over all available disks. HealFile will read the valid parts of the file,
-// reconstruct the missing data and write the reconstructed parts back to the disks.
-// It will try to read the valid parts from the file under the given volume and path and tries to reconstruct the file under the given
-// healVolume and healPath. The given algorithm will be used to verify the valid parts and to protect the reconstructed file.
-func (s ErasureStorage) HealFile(offlineDisks []StorageAPI, volume, path string, blocksize int64, healVolume, healPath string, size int64, algorithm BitrotAlgorithm, checksums [][]byte) (f ErasureFileInfo, err error) {
 	if !algorithm.Available() {
 		return f, traceError(errBitrotHashAlgoInvalid)
 	}
+
+	// Initialization
 	f.Checksums = make([][]byte, len(s.disks))
-	hashers, verifiers := make([]hash.Hash, len(s.disks)), make([]*BitrotVerifier, len(s.disks))
+	hashers := make([]hash.Hash, len(s.disks))
+	verifiers := make([]*BitrotVerifier, len(s.disks))
 	for i, disk := range s.disks {
-		if disk == OfflineDisk {
+		switch {
+		case staleDisks[i] != nil:
 			hashers[i] = algorithm.New()
-		} else {
+		case disk == nil:
+			// disregard unavailable disk
+			continue
+		default:
 			verifiers[i] = NewBitrotVerifier(algorithm, checksums[i])
 			f.Checksums[i] = checksums[i]
 		}
 	}
-	blocks := make([][]byte, len(s.disks))
+
+	// Scan part files on disk, block-by-block reconstruct it and
+	// write to stale disks.
 	chunksize := getChunkSize(blocksize, s.dataBlocks)
-	for offset := int64(0); offset < size; offset += blocksize {
-		if size < blocksize {
-			blocksize = size
+	var chunkOffset, blockOffset int64
+	for ; blockOffset < size; blockOffset += blocksize {
+		// last iteration may have less than blocksize data
+		// left, so chunksize needs to be recomputed.
+		if size < blockOffset+blocksize {
+			blocksize = size - blockOffset
 			chunksize = getChunkSize(blocksize, s.dataBlocks)
 		}
+
+		// read a chunk from each disk, until we have
+		// `s.dataBlocks` number of chunks set to non-nil in
+		// `blocks`
+		blocks := make([][]byte, len(s.disks))
+		var buffer []byte
 		numReads := 0
 		for i, disk := range s.disks {
-			if disk != OfflineDisk {
-				if blocks[i] == nil {
-					blocks[i] = make([]byte, chunksize)
-				}
-				blocks[i] = blocks[i][:chunksize]
-				if !verifiers[i].IsVerified() {
-					_, err = disk.ReadFileWithVerify(volume, path, offset, blocks[i], verifiers[i])
-				} else {
-					_, err = disk.ReadFile(volume, path, offset, blocks[i])
-				}
-				if err != nil {
-					blocks[i] = nil
-				} else {
-					numReads++
-				}
-				if numReads == s.dataBlocks { // we have enough data to reconstruct
-					break
-				}
+			// skip reading from unavailable or stale disks
+			if disk == nil || staleDisks[i] != nil {
+				continue
+			}
+			// allocate buffer only when needed - when
+			// reads fail, the buffer can be reused
+			if int64(len(buffer)) != chunksize {
+				buffer = make([]byte, chunksize)
+			}
+			if !verifiers[i].IsVerified() {
+				_, err = disk.ReadFileWithVerify(volume, path,
+					chunkOffset, buffer, verifiers[i])
+			} else {
+				_, err = disk.ReadFile(volume, path,
+					chunkOffset, buffer)
+			}
+			if err != nil {
+				// LOG FIXME: add a conditional log
+				// for read failures, once per-disk
+				// per-function-invocation.
+				continue
+			}
+
+			// read was successful, so set the buffer as
+			// blocks[i], and reset buffer to nil to force
+			// allocation on next iteration
+			blocks[i], buffer = buffer, nil
+
+			numReads++
+			if numReads == s.dataBlocks {
+				// we have enough data to reconstruct
+				break
 			}
 		}
+
+		// advance the chunk offset to prepare for next loop
+		// iteration
+		chunkOffset += chunksize
+
+		// reconstruct data - this computes all data and parity shards
 		if err = s.ErasureDecodeDataAndParityBlocks(blocks); err != nil {
 			return f, err
 		}
-		for i, disk := range s.disks {
-			if disk != OfflineDisk {
+
+		// write computed shards as chunks on file in each
+		// stale disk
+		for i, disk := range staleDisks {
+			if disk == nil {
 				continue
 			}
-			if err = offlineDisks[i].AppendFile(healVolume, healPath, blocks[i]); err != nil {
+
+			err = disk.AppendFile(healVolume, healPath, blocks[i])
+			if err != nil {
 				return f, traceError(err)
 			}
 			hashers[i].Write(blocks[i])
 		}
 	}
+
+	// copy computed file hashes into output variable
 	f.Size = size
 	f.Algorithm = algorithm
-	for i, disk := range s.disks {
-		if disk != OfflineDisk {
+	for i, disk := range staleDisks {
+		if disk == nil {
 			continue
 		}
 		f.Checksums[i] = hashers[i].Sum(nil)
diff --git a/cmd/erasure-healfile_test.go b/cmd/erasure-healfile_test.go
index 65e23dc74..c5bc4d39b 100644
--- a/cmd/erasure-healfile_test.go
+++ b/cmd/erasure-healfile_test.go
@@ -25,38 +25,51 @@ import (
 )
 
 var erasureHealFileTests = []struct {
-	dataBlocks                             int
-	disks, offDisks, badDisks, badOffDisks int
-	blocksize, size                        int64
-	algorithm                              BitrotAlgorithm
-	shouldFail                             bool
-	shouldFailQuorum                       bool
+	dataBlocks, disks int
+
+	// number of offline disks is also number of staleDisks for
+	// erasure reconstruction in this test
+	offDisks int
+
+	// bad disks are online disks which return errors
+	badDisks, badStaleDisks int
+
+	blocksize, size int64
+	algorithm       BitrotAlgorithm
+	shouldFail      bool
 }{
-	{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: SHA256, shouldFail: false, shouldFailQuorum: false},                   // 0
-	{dataBlocks: 3, disks: 6, offDisks: 2, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false},               // 1
-	{dataBlocks: 4, disks: 8, offDisks: 2, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false},               // 2
-	{dataBlocks: 5, disks: 10, offDisks: 3, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false},  // 3
-	{dataBlocks: 6, disks: 12, offDisks: 2, badDisks: 3, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: SHA256, shouldFail: false, shouldFailQuorum: false},                  // 4
-	{dataBlocks: 7, disks: 14, offDisks: 4, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false},  // 5
-	{dataBlocks: 8, disks: 16, offDisks: 6, badDisks: 1, badOffDisks: 1, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true},   // 6
-	{dataBlocks: 7, disks: 14, offDisks: 2, badDisks: 3, badOffDisks: 0, blocksize: int64(oneMiByte / 2), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: true, shouldFailQuorum: false},             // 7
-	{dataBlocks: 6, disks: 12, offDisks: 1, badDisks: 0, badOffDisks: 1, blocksize: int64(oneMiByte - 1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true, shouldFailQuorum: false}, // 8
-	{dataBlocks: 5, disks: 10, offDisks: 3, badDisks: 0, badOffDisks: 3, blocksize: int64(oneMiByte / 2), size: oneMiByte, algorithm: SHA256, shouldFail: true, shouldFailQuorum: false},                 // 9
-	{dataBlocks: 4, disks: 8, offDisks: 1, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false},   // 10
-	{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badOffDisks: 1, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true},    // 11
-	{dataBlocks: 6, disks: 12, offDisks: 8, badDisks: 3, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true},   // 12
-	{dataBlocks: 7, disks: 14, offDisks: 3, badDisks: 4, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false},              // 13
-	{dataBlocks: 7, disks: 14, offDisks: 6, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false},  // 14
-	{dataBlocks: 8, disks: 16, offDisks: 4, badDisks: 5, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true},   // 15
-	{dataBlocks: 2, disks: 4, offDisks: 0, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false},   // 16
-	{dataBlocks: 2, disks: 4, offDisks: 0, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: 0, shouldFail: true, shouldFailQuorum: false},                         // 17
-	{dataBlocks: 12, disks: 16, offDisks: 2, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 18
-	{dataBlocks: 6, disks: 8, offDisks: 1, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false},               // 19
-	{dataBlocks: 7, disks: 10, offDisks: 1, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: 0, shouldFail: true, shouldFailQuorum: false},                        // 20
+	{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: SHA256, shouldFail: false},                   // 0
+	{dataBlocks: 3, disks: 6, offDisks: 2, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false},               // 1
+	{dataBlocks: 4, disks: 8, offDisks: 2, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false},               // 2
+	{dataBlocks: 5, disks: 10, offDisks: 3, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false},  // 3
+	{dataBlocks: 6, disks: 12, offDisks: 2, badDisks: 3, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: SHA256, shouldFail: false},                  // 4
+	{dataBlocks: 7, disks: 14, offDisks: 4, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false},  // 5
+	{dataBlocks: 8, disks: 16, offDisks: 6, badDisks: 1, badStaleDisks: 1, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true},   // 6
+	{dataBlocks: 7, disks: 14, offDisks: 2, badDisks: 3, badStaleDisks: 0, blocksize: int64(oneMiByte / 2), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false},            // 7
+	{dataBlocks: 6, disks: 12, offDisks: 1, badDisks: 0, badStaleDisks: 1, blocksize: int64(oneMiByte - 1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true}, // 8
+	{dataBlocks: 5, disks: 10, offDisks: 3, badDisks: 0, badStaleDisks: 3, blocksize: int64(oneMiByte / 2), size: oneMiByte, algorithm: SHA256, shouldFail: true},                 // 9
+	{dataBlocks: 4, disks: 8, offDisks: 1, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false},   // 10
+	{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badStaleDisks: 1, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true},    // 11
+	{dataBlocks: 6, disks: 12, offDisks: 8, badDisks: 3, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true},   // 12
+	{dataBlocks: 7, disks: 14, offDisks: 3, badDisks: 4, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false},              // 13
+	{dataBlocks: 7, disks: 14, offDisks: 6, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false},  // 14
+	{dataBlocks: 8, disks: 16, offDisks: 4, badDisks: 5, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true},   // 15
+	{dataBlocks: 2, disks: 4, offDisks: 0, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false},   // 16
+	{dataBlocks: 2, disks: 4, offDisks: 0, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: 0, shouldFail: true},                         // 17
+	{dataBlocks: 12, disks: 16, offDisks: 2, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false}, // 18
+	{dataBlocks: 6, disks: 8, offDisks: 1, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false},               // 19
+	{dataBlocks: 7, disks: 10, offDisks: 1, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: 0, shouldFail: true},                        // 20
+	{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte * 64, algorithm: SHA256, shouldFail: false},              // 21
 }
 
 func TestErasureHealFile(t *testing.T) {
 	for i, test := range erasureHealFileTests {
+		if test.offDisks < test.badStaleDisks {
+			// test case sanity check
+			t.Fatalf("Test %d: Bad test case - number of stale disks cannot be less than number of badstale disks", i)
+		}
+
+		// create some test data
 		setup, err := newErasureTestSetup(test.dataBlocks, test.disks-test.dataBlocks, test.blocksize)
 		if err != nil {
 			t.Fatalf("Test %d: failed to setup XL environment: %v", i, err)
@@ -66,15 +79,11 @@ func TestErasureHealFile(t *testing.T) {
 			setup.Remove()
 			t.Fatalf("Test %d: failed to create ErasureStorage: %v", i, err)
 		}
-		offline := make([]StorageAPI, len(storage.disks))
-		copy(offline, storage.disks)
-
 		data := make([]byte, test.size)
 		if _, err = io.ReadFull(rand.Reader, data); err != nil {
 			setup.Remove()
 			t.Fatalf("Test %d: failed to create random test data: %v", i, err)
 		}
-
 		algorithm := test.algorithm
 		if !algorithm.Available() {
 			algorithm = DefaultBitrotAlgorithm
@@ -86,7 +95,25 @@ func TestErasureHealFile(t *testing.T) {
 			t.Fatalf("Test %d: failed to create random test data: %v", i, err)
 		}
 
-		info, err := storage.HealFile(offline, "testbucket", "testobject", test.blocksize, "testbucket", "healedobject", test.size, test.algorithm, file.Checksums)
+		// setup stale disks for the test case
+		staleDisks := make([]StorageAPI, len(storage.disks))
+		copy(staleDisks, storage.disks)
+		for j := 0; j < len(storage.disks); j++ {
+			if j < test.offDisks {
+				storage.disks[j] = OfflineDisk
+			} else {
+				staleDisks[j] = nil
+			}
+		}
+		for j := 0; j < test.badDisks; j++ {
+			storage.disks[test.offDisks+j] = badDisk{nil}
+		}
+		for j := 0; j < test.badStaleDisks; j++ {
+			staleDisks[j] = badDisk{nil}
+		}
+
+		// test case setup is complete - now call HealFile()
+		info, err := storage.HealFile(staleDisks, "testbucket", "testobject", test.blocksize, "testbucket", "healedobject", test.size, test.algorithm, file.Checksums)
 		if err != nil && !test.shouldFail {
 			t.Errorf("Test %d: should pass but it failed with: %v", i, err)
 		}
@@ -100,39 +127,13 @@ func TestErasureHealFile(t *testing.T) {
 			if info.Algorithm != test.algorithm {
 				t.Errorf("Test %d: healed with wrong algorithm: got: %v want: %v", i, info.Algorithm, test.algorithm)
 			}
-			if !reflect.DeepEqual(info.Checksums, file.Checksums) {
-				t.Errorf("Test %d: heal returned different bitrot keys", i)
-			}
-		}
-		if err == nil && !test.shouldFail {
-			for j := 0; j < len(storage.disks); j++ {
-				if j < test.offDisks {
-					storage.disks[j] = OfflineDisk
-				} else {
-					offline[j] = OfflineDisk
-				}
-			}
-			for j := 0; j < test.badDisks; j++ {
-				storage.disks[test.offDisks+j] = badDisk{nil}
-			}
-			for j := 0; j < test.badOffDisks; j++ {
-				offline[j] = badDisk{nil}
-			}
-			info, err := storage.HealFile(offline, "testbucket", "testobject", test.blocksize, "testbucket", "healedobject", test.size, test.algorithm, file.Checksums)
-			if err != nil && !test.shouldFailQuorum {
-				t.Errorf("Test %d: should pass but it failed with: %v", i, err)
-			}
-			if err == nil && test.shouldFailQuorum {
-				t.Errorf("Test %d: should fail but it passed", i)
-			}
-			if err == nil {
-				if info.Size != test.size {
-					t.Errorf("Test %d: healed wrong number of bytes: got: #%d want: #%d", i, info.Size, test.size)
-				}
-				if info.Algorithm != test.algorithm {
-					t.Errorf("Test %d: healed with wrong algorithm: got: %v want: %v", i, info.Algorithm, test.algorithm)
+			// Verify that checksums of staleDisks match expected
+			// values; j is the disk index, i stays the test index.
+			for j, disk := range staleDisks {
+				if disk == nil {
+					continue
 				}
-				if !reflect.DeepEqual(info.Checksums, file.Checksums) {
+				if !reflect.DeepEqual(info.Checksums[j], file.Checksums[j]) {
 					t.Errorf("Test %d: heal returned different bitrot checksums", i)
 				}
 			}