/*
 * MinIO Cloud Storage, (C) 2017 MinIO, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cmd

import (
	"context"
	"sync"

	"github.com/klauspost/reedsolomon"
	"github.com/minio/minio/cmd/logger"
)

// Erasure - erasure encoding details.
type Erasure struct {
	encoder                  func() reedsolomon.Encoder
	dataBlocks, parityBlocks int
	blockSize                int64
}

// NewErasure creates a new Erasure.
func NewErasure(ctx context.Context, dataBlocks, parityBlocks int, blockSize int64) (e Erasure, err error) {
	e = Erasure{
		dataBlocks:   dataBlocks,
		parityBlocks: parityBlocks,
		blockSize:    blockSize,
	}

	// Check the parameters for sanity now.
	if dataBlocks <= 0 || parityBlocks <= 0 {
		return e, reedsolomon.ErrInvShardNum
	}

	if dataBlocks+parityBlocks > 256 {
		return e, reedsolomon.ErrMaxShardNum
	}

	// Create the encoder lazily, at most once, on first use.
	var enc reedsolomon.Encoder
	var once sync.Once
	e.encoder = func() reedsolomon.Encoder {
		once.Do(func() {
			rs, err := reedsolomon.New(dataBlocks, parityBlocks, reedsolomon.WithAutoGoroutines(int(e.ShardSize())))
			if err != nil {
				// The parameters were validated above, so this
				// should never happen.
				panic(err)
			}
			enc = rs
		})
		return enc
	}
	return
}
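
// A minimal usage sketch (illustrative only; the 8+8 layout and 10 MiB
// block size are arbitrary values, not defaults taken from this file):
//
//	e, err := NewErasure(context.Background(), 8, 8, 10*1024*1024)
//	if err != nil {
//		// dataBlocks/parityBlocks failed the sanity checks above.
//	}
//	enc := e.encoder() // the reedsolomon encoder is built here, on first use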

// EncodeData encodes the given data and returns the erasure-coded data.
// It returns an error if the erasure coding failed.
func (e *Erasure) EncodeData(ctx context.Context, data []byte) ([][]byte, error) {
	if len(data) == 0 {
		return make([][]byte, e.dataBlocks+e.parityBlocks), nil
	}
	encoded, err := e.encoder().Split(data)
	if err != nil {
		logger.LogIf(ctx, err)
		return nil, err
	}
	if err = e.encoder().Encode(encoded); err != nil {
		logger.LogIf(ctx, err)
		return nil, err
	}
	return encoded, nil
}
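
// Shape of the result, assuming the hypothetical 8+8 Erasure sketched above:
//
//	shards, err := e.EncodeData(ctx, payload)
//	// On success len(shards) == 16: shards[:8] hold the split payload
//	// and shards[8:] hold the computed parity.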

// DecodeDataBlocks decodes the given erasure-coded data.
// It only decodes the data blocks but does not verify them.
// It returns an error if the decoding failed.
func (e *Erasure) DecodeDataBlocks(data [][]byte) error {
	// Count the zero-length blocks so that a zero-byte payload
	// (all blocks empty) is not mistaken for missing shards.
	var isZero = 0
	for _, b := range data {
		if len(b) == 0 {
			isZero++
		}
	}
	if isZero == 0 || isZero == len(data) {
		// Either nothing is missing, or every block is zero:
		// the payload is 0 bytes and there is nothing to reconstruct.
		return nil
	}
	return e.encoder().ReconstructData(data)
}
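
// Reconstruction sketch (hypothetical shard slice; the lost indices are
// arbitrary):
//
//	shards[3] = nil // a lost data shard
//	shards[9] = nil // a lost parity shard
//	if err := e.DecodeDataBlocks(shards); err != nil {
//		// more shards were lost than parityBlocks can repair
//	}
//	// shards[3] is restored; shards[9] stays nil, because only the
//	// data blocks are reconstructed.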

// DecodeDataAndParityBlocks decodes the given erasure-coded data and
// reconstructs any missing data and parity blocks.
// It does not verify the blocks, and it returns an error if the
// reconstruction failed.
func (e *Erasure) DecodeDataAndParityBlocks(ctx context.Context, data [][]byte) error {
	needsReconstruction := false
	for _, b := range data {
		if b == nil {
			needsReconstruction = true
			break
		}
	}
	if !needsReconstruction {
		return nil
	}
	if err := e.encoder().Reconstruct(data); err != nil {
		logger.LogIf(ctx, err)
		return err
	}
	return nil
}
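
// Unlike DecodeDataBlocks, this also restores missing parity blocks, so
// the repaired shards can be written back to disk in full. Continuing the
// hypothetical sketch above:
//
//	if err := e.DecodeDataAndParityBlocks(ctx, shards); err != nil {
//		// too many shards were lost
//	}
//	// both shards[3] and shards[9] are repopulated here.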

// ShardSize - returns the actual shard size from the erasure blockSize.
func (e *Erasure) ShardSize() int64 {
	return ceilFrac(e.blockSize, int64(e.dataBlocks))
}
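
// For example, with blockSize = 10 MiB and dataBlocks = 8 (hypothetical
// values) this yields ceilFrac(10485760, 8) = 1310720 bytes per shard.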

// ShardFileSize - returns the final erasure size from the original size.
func (e *Erasure) ShardFileSize(totalLength int64) int64 {
	if totalLength == 0 {
		return 0
	}
	if totalLength == -1 {
		return -1
	}
	numShards := totalLength / e.blockSize
	lastBlockSize := totalLength % e.blockSize
	lastShardSize := ceilFrac(lastBlockSize, int64(e.dataBlocks))
	return numShards*e.ShardSize() + lastShardSize
}
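
// Worked example, using the hypothetical 8-data-block, 10 MiB layout and
// a 25 MiB object:
//
//	numShards     = 26214400 / 10485760  = 2 full blocks
//	lastBlockSize = 26214400 % 10485760  = 5242880 bytes
//	lastShardSize = ceilFrac(5242880, 8) = 655360 bytes
//	ShardFileSize = 2*1310720 + 655360   = 3276800 bytes per disk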

// ShardFileTillOffset - returns the effective offset until which erasure
// reading should be done.
func (e *Erasure) ShardFileTillOffset(startOffset, length, totalLength int64) int64 {
	shardSize := e.ShardSize()
	shardFileSize := e.ShardFileSize(totalLength)
	endShard := (startOffset + length) / e.blockSize
	tillOffset := endShard*shardSize + shardSize
	if tillOffset > shardFileSize {
		tillOffset = shardFileSize
	}
	return tillOffset
}
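
// Worked example, reading 1 MiB starting at offset 15 MiB of the 25 MiB
// object above:
//
//	endShard   = (15728640 + 1048576) / 10485760 = 1
//	tillOffset = 1*1310720 + 1310720             = 2621440 bytes
//
// which stays below ShardFileSize (3276800), so it is not capped.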