Allow proper setCount SLAs across zones (#8752)

Fixes a scenario where zones were not handled
appropriately, and adds support for overriding the
set count. The new fix also ensures that we handle
the various setup types properly.

Update documentation to properly indicate the
behavior.

Fixes #8750

Co-authored-by: Nitish Tiwari <nitish@minio.io>
master
Harshavardhana 5 years ago committed by GitHub
parent b123be5612
commit 60813bef29
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 83
      cmd/endpoint-ellipses.go
  2. 23
      cmd/endpoint.go
  3. 2
      cmd/endpoint_test.go
  4. 9
      cmd/object-api-errors.go
  5. 7
      cmd/setup-type.go
  6. 4
      cmd/test-utils_test.go
  7. 4
      docs/distributed/DESIGN.md
  8. 28
      docs/distributed/README.md
  9. 4
      pkg/iam/policy/error.go
  10. 4
      pkg/policy/error.go

@ -76,7 +76,8 @@ func getSetIndexes(args []string, totalSizes []uint64, customSetDriveCount uint6
for _, totalSize := range totalSizes { for _, totalSize := range totalSizes {
// Check if totalSize has minimum range upto setSize // Check if totalSize has minimum range upto setSize
if totalSize < setSizes[0] || totalSize < customSetDriveCount { if totalSize < setSizes[0] || totalSize < customSetDriveCount {
return nil, config.ErrInvalidNumberOfErasureEndpoints(nil) msg := fmt.Sprintf("Incorrect number of endpoints provided %s", args)
return nil, config.ErrInvalidNumberOfErasureEndpoints(nil).Msg(msg)
} }
} }
@ -120,7 +121,8 @@ func getSetIndexes(args []string, totalSizes []uint64, customSetDriveCount uint6
// Check whether setSize is with the supported range. // Check whether setSize is with the supported range.
if !isValidSetSize(setSize) { if !isValidSetSize(setSize) {
return nil, config.ErrInvalidNumberOfErasureEndpoints(nil) msg := fmt.Sprintf("Incorrect number of endpoints provided %s", args)
return nil, config.ErrInvalidNumberOfErasureEndpoints(nil).Msg(msg)
} }
for i := range totalSizes { for i := range totalSizes {
@ -203,18 +205,7 @@ func parseEndpointSet(customSetDriveCount uint64, args ...string) (ep endpointSe
// specific set size. // specific set size.
// For example: {1...64} is divided into 4 sets each of size 16. // For example: {1...64} is divided into 4 sets each of size 16.
// This applies to even distributed setup syntax as well. // This applies to even distributed setup syntax as well.
func GetAllSets(args ...string) ([][]string, error) { func GetAllSets(customSetDriveCount uint64, args ...string) ([][]string, error) {
var customSetDriveCount uint64
if v := env.Get("MINIO_ERASURE_SET_DRIVE_COUNT", ""); v != "" {
customSetDriveCount, err := strconv.ParseUint(v, 10, 64)
if err != nil {
return nil, config.ErrInvalidErasureSetSize(err)
}
if !isValidSetSize(customSetDriveCount) {
return nil, config.ErrInvalidErasureSetSize(nil)
}
}
var setArgs [][]string var setArgs [][]string
if !ellipses.HasEllipses(args...) { if !ellipses.HasEllipses(args...) {
var setIndexes [][]uint64 var setIndexes [][]uint64
@ -257,16 +248,26 @@ func GetAllSets(args ...string) ([][]string, error) {
// CreateServerEndpoints - validates and creates new endpoints from input args, supports // CreateServerEndpoints - validates and creates new endpoints from input args, supports
// both ellipses and without ellipses transparently. // both ellipses and without ellipses transparently.
func createServerEndpoints(serverAddr string, args ...string) (EndpointZones, int, SetupType, error) { func createServerEndpoints(serverAddr string, args ...string) (
endpointZones EndpointZones, setDriveCount int,
setupType SetupType, err error) {
if len(args) == 0 { if len(args) == 0 {
return nil, -1, -1, errInvalidArgument return nil, -1, -1, errInvalidArgument
} }
var endpointZones EndpointZones if v := env.Get("MINIO_ERASURE_SET_DRIVE_COUNT", ""); v != "" {
var setupType SetupType setDriveCount, err = strconv.Atoi(v)
var drivesPerSet int if err != nil {
return nil, -1, -1, config.ErrInvalidErasureSetSize(err)
}
if !isValidSetSize(uint64(setDriveCount)) {
return nil, -1, -1, config.ErrInvalidErasureSetSize(nil)
}
}
if !ellipses.HasEllipses(args...) { if !ellipses.HasEllipses(args...) {
setArgs, err := GetAllSets(args...) setArgs, err := GetAllSets(uint64(setDriveCount), args...)
if err != nil { if err != nil {
return nil, -1, -1, err return nil, -1, -1, err
} }
@ -283,43 +284,37 @@ func createServerEndpoints(serverAddr string, args ...string) (EndpointZones, in
return endpointZones, len(setArgs[0]), setupType, nil return endpointZones, len(setArgs[0]), setupType, nil
} }
var prevSetupType SetupType
var foundPrevLocal bool var foundPrevLocal bool
// Verify the args setup-type appropriately.
{
setArgs, err := GetAllSets(args...)
if err != nil {
return nil, -1, -1, err
}
var endpoints Endpoints
endpoints, setupType, err = CreateEndpoints(serverAddr, foundPrevLocal, setArgs...)
if err != nil {
return nil, -1, -1, err
}
foundPrevLocal = endpoints.atleastOneEndpointLocal()
}
for _, arg := range args { for _, arg := range args {
setArgs, err := GetAllSets(arg) setArgs, err := GetAllSets(uint64(setDriveCount), arg)
if err != nil { if err != nil {
return nil, -1, -1, err return nil, -1, -1, err
} }
endpointList, _, err := CreateEndpoints(serverAddr, foundPrevLocal, setArgs...) var endpointList Endpoints
endpointList, setupType, err = CreateEndpoints(serverAddr, foundPrevLocal, setArgs...)
if err != nil { if err != nil {
return nil, -1, -1, err return nil, -1, -1, err
} }
if drivesPerSet != 0 && drivesPerSet != len(setArgs[0]) { if setDriveCount != 0 && setDriveCount != len(setArgs[0]) {
return nil, -1, -1, fmt.Errorf("All zones should have same drive per set ratio - expected %d, got %d", drivesPerSet, len(setArgs[0])) return nil, -1, -1, fmt.Errorf("All zones should have same drive per set ratio - expected %d, got %d", setDriveCount, len(setArgs[0]))
} }
endpointZones = append(endpointZones, ZoneEndpoints{ if prevSetupType != UnknownSetupType && prevSetupType != setupType {
return nil, -1, -1, fmt.Errorf("All zones should be of the same setup-type to maintain the original SLA expectations - expected %s, got %s", prevSetupType, setupType)
}
if err = endpointZones.Add(ZoneEndpoints{
SetCount: len(setArgs), SetCount: len(setArgs),
DrivesPerSet: len(setArgs[0]), DrivesPerSet: len(setArgs[0]),
Endpoints: endpointList, Endpoints: endpointList,
}) }); err != nil {
drivesPerSet = len(setArgs[0]) return nil, -1, -1, err
}
foundPrevLocal = endpointList.atleastOneEndpointLocal()
if setDriveCount == 0 {
setDriveCount = len(setArgs[0])
}
prevSetupType = setupType
} }
return endpointZones, drivesPerSet, setupType, nil return endpointZones, setDriveCount, setupType, nil
} }

@ -194,6 +194,24 @@ type ZoneEndpoints struct {
// EndpointZones - list of list of endpoints // EndpointZones - list of list of endpoints
type EndpointZones []ZoneEndpoints type EndpointZones []ZoneEndpoints
// Add appends the given zone endpoints to the list of zones.
//
// Before appending, each endpoint in zeps is compared, by its String()
// form, against the endpoints of every zone already in the list. If any
// endpoint is already present, an error is returned and the list is left
// unchanged. Only collisions with existing zones are detected here; the
// endpoints inside zeps are not compared with each other.
func (l *EndpointZones) Add(zeps ZoneEndpoints) error {
// Collect the string form of every endpoint of every existing zone.
existSet := set.NewStringSet()
for _, zep := range *l {
for _, ep := range zep.Endpoints {
existSet.Add(ep.String())
}
}
// Reject the new zone if any of its endpoints already belongs to an
// existing zone.
for _, ep := range zeps.Endpoints {
if existSet.Contains(ep.String()) {
return fmt.Errorf("duplicate endpoints found")
}
}
// No overlap with existing zones: record the new zone.
*l = append(*l, zeps)
return nil
}
// FirstLocal returns true if the first endpoint is local. // FirstLocal returns true if the first endpoint is local.
func (l EndpointZones) FirstLocal() bool { func (l EndpointZones) FirstLocal() bool {
return l[0].Endpoints[0].IsLocal return l[0].Endpoints[0].IsLocal
@ -608,13 +626,14 @@ func CreateEndpoints(serverAddr string, foundLocal bool, args ...[]string) (Endp
// All endpoints are pointing to local host // All endpoints are pointing to local host
if len(endpoints) == localEndpointCount { if len(endpoints) == localEndpointCount {
// If all endpoints have same port number, then this is XL setup using URL style endpoints. // If all endpoints have same port number, Just treat it as distXL setup
// using URL style endpoints.
if len(localPortSet) == 1 { if len(localPortSet) == 1 {
if len(localServerHostSet) > 1 { if len(localServerHostSet) > 1 {
return endpoints, setupType, return endpoints, setupType,
config.ErrInvalidErasureEndpoints(nil).Msg("all local endpoints should not have different hostnames/ips") config.ErrInvalidErasureEndpoints(nil).Msg("all local endpoints should not have different hostnames/ips")
} }
return endpoints, XLSetupType, nil return endpoints, DistXLSetupType, nil
} }
// Even though all endpoints are local, but those endpoints use different ports. // Even though all endpoints are local, but those endpoints use different ports.

@ -285,7 +285,7 @@ func TestCreateEndpoints(t *testing.T) {
Endpoint{URL: &url.URL{Scheme: "http", Host: "localhost", Path: "/d2"}, IsLocal: true}, Endpoint{URL: &url.URL{Scheme: "http", Host: "localhost", Path: "/d2"}, IsLocal: true},
Endpoint{URL: &url.URL{Scheme: "http", Host: "localhost", Path: "/d3"}, IsLocal: true}, Endpoint{URL: &url.URL{Scheme: "http", Host: "localhost", Path: "/d3"}, IsLocal: true},
Endpoint{URL: &url.URL{Scheme: "http", Host: "localhost", Path: "/d4"}, IsLocal: true}, Endpoint{URL: &url.URL{Scheme: "http", Host: "localhost", Path: "/d4"}, IsLocal: true},
}, XLSetupType, nil}, }, DistXLSetupType, nil},
// DistXL Setup with URLEndpointType having mixed naming to local host. // DistXL Setup with URLEndpointType having mixed naming to local host.
{"127.0.0.1:10000", [][]string{{"http://localhost/d1", "http://localhost/d2", "http://127.0.0.1/d3", "http://127.0.0.1/d4"}}, "", Endpoints{}, -1, fmt.Errorf("all local endpoints should not have different hostnames/ips")}, {"127.0.0.1:10000", [][]string{{"http://localhost/d1", "http://localhost/d2", "http://127.0.0.1/d3", "http://127.0.0.1/d4"}}, "", Endpoints{}, -1, fmt.Errorf("all local endpoints should not have different hostnames/ips")},

@ -17,6 +17,7 @@
package cmd package cmd
import ( import (
"errors"
"fmt" "fmt"
"io" "io"
"path" "path"
@ -429,14 +430,14 @@ func (e BackendDown) Error() string {
// isErrBucketNotFound - Check if error type is BucketNotFound. // isErrBucketNotFound - Check if error type is BucketNotFound.
func isErrBucketNotFound(err error) bool { func isErrBucketNotFound(err error) bool {
_, ok := err.(BucketNotFound) var bkNotFound BucketNotFound
return ok return errors.As(err, &bkNotFound)
} }
// isErrObjectNotFound - Check if error type is ObjectNotFound. // isErrObjectNotFound - Check if error type is ObjectNotFound.
func isErrObjectNotFound(err error) bool { func isErrObjectNotFound(err error) bool {
_, ok := err.(ObjectNotFound) var objNotFound ObjectNotFound
return ok return errors.As(err, &objNotFound)
} }
// PreConditionFailed - Check if copy precondition failed // PreConditionFailed - Check if copy precondition failed

@ -20,8 +20,11 @@ package cmd
type SetupType int type SetupType int
const ( const (
// UnknownSetupType - starts with unknown setup type.
UnknownSetupType SetupType = iota
// FSSetupType - FS setup type enum. // FSSetupType - FS setup type enum.
FSSetupType SetupType = iota + 1 FSSetupType
// XLSetupType - XL setup type enum. // XLSetupType - XL setup type enum.
XLSetupType XLSetupType
@ -45,5 +48,5 @@ func (setupType SetupType) String() string {
return globalMinioModeGatewayPrefix return globalMinioModeGatewayPrefix
} }
return "" return "unknown"
} }

@ -1894,7 +1894,8 @@ func ExecObjectLayerAPITest(t *testing.T, objAPITest objAPITestType, endpoints [
t.Fatalf("Unable to initialize server config. %s", err) t.Fatalf("Unable to initialize server config. %s", err)
} }
globalIAMSys = NewIAMSys() newAllSubsystems()
globalIAMSys.Init(objLayer) globalIAMSys.Init(objLayer)
buckets, err := objLayer.ListBuckets(context.Background()) buckets, err := objLayer.ListBuckets(context.Background())
@ -1902,7 +1903,6 @@ func ExecObjectLayerAPITest(t *testing.T, objAPITest objAPITestType, endpoints [
t.Fatalf("Unable to list buckets on backend %s", err) t.Fatalf("Unable to list buckets on backend %s", err)
} }
globalPolicySys = NewPolicySys()
globalPolicySys.Init(buckets, objLayer) globalPolicySys.Init(buckets, objLayer)
credentials := globalActiveCred credentials := globalActiveCred

@ -90,7 +90,7 @@ Input for the key is the object name specified in `PutObject()`, returns a uniqu
- MinIO also supports expansion of existing clusters in zones. Each zone is a self contained entity with same SLA's (read/write quorum) for each object as original cluster. By using the existing namespace for lookup validation MinIO ensures conflicting objects are not created. When no such object exists then MinIO simply uses the least used zone. - MinIO also supports expansion of existing clusters in zones. Each zone is a self contained entity with same SLA's (read/write quorum) for each object as original cluster. By using the existing namespace for lookup validation MinIO ensures conflicting objects are not created. When no such object exists then MinIO simply uses the least used zone.
*There are no limits on how many zones can be combined* __There are no limits on how many zones can be combined__
``` ```
minio server http://host{1...32}/export{1...32} http://host{5...6}/export{1...8} minio server http://host{1...32}/export{1...32} http://host{5...6}/export{1...8}
@ -103,7 +103,7 @@ In above example there are two zones
> Notice the requirement of common SLA here original cluster had 1024 drives with 16 drives per erasure set, second zone is expected to have a minimum of 16 drives to match the original cluster SLA or it should be in multiples of 16. > Notice the requirement of common SLA here original cluster had 1024 drives with 16 drives per erasure set, second zone is expected to have a minimum of 16 drives to match the original cluster SLA or it should be in multiples of 16.
Following pseudo code returns the correct least used zone index to upload an object. MinIO places new objects in zones based on proportionate free space, per zone. Following pseudo code demonstrates this behavior.
```go ```go
func getAvailableZoneIdx(ctx context.Context) int { func getAvailableZoneIdx(ctx context.Context) int {
zones := z.getZonesAvailableSpace(ctx) zones := z.getZonesAvailableSpace(ctx)

@ -34,20 +34,21 @@ Install MinIO - [MinIO Quickstart Guide](https://docs.min.io/docs/minio-quicksta
To start a distributed MinIO instance, you just need to pass drive locations as parameters to the minio server command. Then, you’ll need to run the same command on all the participating nodes. To start a distributed MinIO instance, you just need to pass drive locations as parameters to the minio server command. Then, you’ll need to run the same command on all the participating nodes.
*Note* __NOTE:__
- All the nodes running distributed MinIO need to have same access key and secret key for the nodes to connect. To achieve this, it is **mandatory** to export access key and secret key as environment variables, `MINIO_ACCESS_KEY` and `MINIO_SECRET_KEY`, on all the nodes before executing MinIO server command. - All the nodes running distributed MinIO need to have same access key and secret key for the nodes to connect. To achieve this, it is __recommended__ to export access key and secret key as environment variables, `MINIO_ACCESS_KEY` and `MINIO_SECRET_KEY`, on all the nodes before executing MinIO server command.
- All the nodes running distributed MinIO setup are recommended to be in homogeneous environment, i.e. same operating system, same number of disks and same network interconnects. - __MinIO creates erasure-coding sets of 4, 6, 8, 10, 12, 14 or 16 drives. The number of drives you provide must be a multiple of one of those numbers.__
- MinIO distributed mode requires fresh directories. If required, the drives can be shared with other applications. You can do this by using a sub-directory exclusive to MinIO. For example, if you have mounted your volume under `/export`, pass `/export/data` as arguments to MinIO server. - __MinIO chooses the largest EC set size which divides into the total number of drives given. For example, 8 drives will be used as a single EC set of size 8, not two sets of size 4.__
- __Each object is written to a single EC set, and therefore is spread over no more than 16 drives.__
- __All the nodes running distributed MinIO setup are recommended to be homogeneous, i.e. same operating system, same number of disks and same network interconnects.__
- MinIO distributed mode requires __fresh directories__. If required, the drives can be shared with other applications. You can do this by using a sub-directory exclusive to MinIO. For example, if you have mounted your volume under `/export`, pass `/export/data` as arguments to MinIO server.
- The IP addresses and drive paths below are for demonstration purposes only, you need to replace these with the actual IP addresses and drive paths/folders. - The IP addresses and drive paths below are for demonstration purposes only, you need to replace these with the actual IP addresses and drive paths/folders.
- Servers running distributed MinIO instances should be less than 15 minutes apart. You can enable [NTP](http://www.ntp.org/) service as a best practice to ensure same times across servers. - Servers running distributed MinIO instances should be less than 15 minutes apart. You can enable [NTP](http://www.ntp.org/) service as a best practice to ensure same times across servers.
- Running Distributed MinIO on Windows operating system is experimental. Please proceed with caution. - `MINIO_DOMAIN` environment variable should be defined and exported for bucket DNS style support.
- `MINIO_DOMAIN` environment variable should be defined and exported if domain is needed to be set. - Running Distributed MinIO on __Windows__ operating system is experimental. Please proceed with caution.
- MinIO creates erasure-coding sets of 4, 6, 8, 10, 12, 14 or 16 drives. The number of drives you provide must be a multiple of one of those numbers.
- MinIO chooses the largest EC set size which divides into the total number of drives given. For example, 8 drives will be used as a single EC set of size 8, not two sets of size 4.
- Each object is written to a single EC set, and therefore is spread over no more than 16 drives.
Example 1: Start distributed MinIO instance on 32 nodes with 32 drives each mounted at `/export1` to `/export32` (pictured below), by running this command on all the 32 nodes: Example 1: Start distributed MinIO instance on 32 nodes with 32 drives each mounted at `/export1` to `/export32` (pictured below), by running this command on all the 32 nodes:
![Distributed MinIO, 32 nodes with 32 drives each](https://github.com/minio/minio/blob/master/docs/screenshots/Architecture-diagram_distributed_32.png?raw=true) ![Distributed MinIO, 32 nodes with 32 drives each](https://github.com/minio/minio/blob/master/docs/screenshots/Architecture-diagram_distributed_32.png?raw=true)
#### GNU/Linux and macOS #### GNU/Linux and macOS
@ -71,11 +72,8 @@ minio server http://host{1...32}/export{1...32} http://host{33...64}/export{1...
Now the server has expanded storage of *1024* more disks in total of *2048* disks, new object upload requests automatically start using the least used cluster. This expansion strategy works endlessly, so you can perpetually expand your clusters as needed. When you restart, it is immediate and non-disruptive to the applications. Each group of servers in the command-line is called a zone. There are 2 zones in this example. New objects are placed in zones in proportion to the amount of free space in each zone. Within each zone, the location of the erasure-set of drives is determined based on a deterministic hashing algorithm. Now the server has expanded storage of *1024* more disks in total of *2048* disks, new object upload requests automatically start using the least used cluster. This expansion strategy works endlessly, so you can perpetually expand your clusters as needed. When you restart, it is immediate and non-disruptive to the applications. Each group of servers in the command-line is called a zone. There are 2 zones in this example. New objects are placed in zones in proportion to the amount of free space in each zone. Within each zone, the location of the erasure-set of drives is determined based on a deterministic hashing algorithm.
*Note* > __NOTE:__ __Each zone you add must have the same erasure coding set size as the original zone, so the same data redundancy SLA is maintained.__
> For example, if your first zone was 8 drives, you could add further zones of 16, 32 or 1024 drives each. All you have to make sure is that the deployment SLA is a multiple of that of the original zone, i.e. 8.
Each zone you add must have the same erasure coding set size as the original zone, so the same data redundancy SLA is maintained.
For example, if your first zone was 8 drives, you could add further zones of 8 drives each, but not a zone of 16 drives. That's because 16 drives are treated as a single EC set of 16, not two sets of 8.
## 3. Test your setup ## 3. Test your setup
To test this setup, access the MinIO server via browser or [`mc`](https://docs.min.io/docs/minio-client-quickstart-guide). To test this setup, access the MinIO server via browser or [`mc`](https://docs.min.io/docs/minio-client-quickstart-guide).

@ -16,7 +16,9 @@
package iampolicy package iampolicy
import "fmt" import (
"fmt"
)
// Error is the generic type for any error happening during policy // Error is the generic type for any error happening during policy
// parsing. // parsing.

@ -16,7 +16,9 @@
package policy package policy
import "fmt" import (
"fmt"
)
// Error is the generic type for any error happening during policy // Error is the generic type for any error happening during policy
// parsing. // parsing.

Loading…
Cancel
Save