diff --git a/cmd/auth-rpc-client.go b/cmd/auth-rpc-client.go index 2e15ed8f1..2dec0f58f 100644 --- a/cmd/auth-rpc-client.go +++ b/cmd/auth-rpc-client.go @@ -22,6 +22,10 @@ import ( "time" ) +// Attempt to retry only this many number of times before +// giving up on the remote RPC entirely. +const globalAuthRPCRetryThreshold = 1 + // GenericReply represents any generic RPC reply. type GenericReply struct{} @@ -165,7 +169,7 @@ func (authClient *AuthRPCClient) Call(serviceMethod string, args interface { authClient.Close() // No need to return error until the retry count threshold has reached. - if i < globalMaxAuthRPCRetryThreshold { + if i < globalAuthRPCRetryThreshold { continue } } diff --git a/cmd/fs-v1_test.go b/cmd/fs-v1_test.go index 9b82d862e..7624e2106 100644 --- a/cmd/fs-v1_test.go +++ b/cmd/fs-v1_test.go @@ -20,6 +20,7 @@ import ( "bytes" "path/filepath" "testing" + "time" ) // TestNewFS - tests initialization of all input disks @@ -86,7 +87,12 @@ func TestNewFS(t *testing.T) { if err != errInvalidArgument { t.Errorf("Expecting error invalid argument, got %s", err) } - _, err = newFSObjects(&retryStorage{xlStorageDisks[0]}) + _, err = newFSObjects(&retryStorage{ + remoteStorage: xlStorageDisks[0], + maxRetryAttempts: 1, + retryUnit: time.Millisecond, + retryCap: time.Millisecond * 10, + }) if err != nil { errMsg := "Unable to recognize backend format, Disk is not in FS format." if err.Error() == errMsg { diff --git a/cmd/globals.go b/cmd/globals.go index fad83d694..3eadd1902 100644 --- a/cmd/globals.go +++ b/cmd/globals.go @@ -89,14 +89,6 @@ var ( // List of admin peers. globalAdminPeers = adminPeers{} - // Attempt to retry only this many number of times before - // giving up on the remote disk entirely. - globalMaxStorageRetryThreshold = 3 - - // Attempt to retry only this many number of times before - // giving up on the remote RPC entirely. - globalMaxAuthRPCRetryThreshold = 1 - // Add new variable global values here. ) diff --git a/cmd/prepare-storage.go b/cmd/prepare-storage.go index aebf8bc0a..d922db647 100644 --- a/cmd/prepare-storage.go +++ b/cmd/prepare-storage.go @@ -311,16 +311,40 @@ func waitForFormatDisks(firstDisk bool, endpoints []*url.URL, storageDisks []Sto if storageDisks == nil { return nil, errInvalidArgument } + + // Retryable disks before formatting, we need to have a larger + // retry window so that we wait enough amount of time before + // the disks come online. + retryDisks := make([]StorageAPI, len(storageDisks)) + for i, storage := range storageDisks { + retryDisks[i] = &retryStorage{ + remoteStorage: storage, + maxRetryAttempts: globalStorageInitRetryThreshold, + retryUnit: time.Second, + retryCap: time.Second * 30, // 30 seconds. + } + } + // Start retry loop retrying until disks are formatted properly, until we have reached // a conditional quorum of formatted disks. - err = retryFormattingDisks(firstDisk, endpoints, storageDisks) + err = retryFormattingDisks(firstDisk, endpoints, retryDisks) if err != nil { return nil, err } + // Initialize the disk into a formatted disks wrapper. formattedDisks = make([]StorageAPI, len(storageDisks)) for i, storage := range storageDisks { - formattedDisks[i] = &retryStorage{storage} + // After formatting is done we need a smaller time + // window and lower retry value before formatting. + formattedDisks[i] = &retryStorage{ + remoteStorage: storage, + maxRetryAttempts: globalStorageRetryThreshold, + retryUnit: time.Millisecond, + retryCap: time.Millisecond * 5, // 5 milliseconds. + } } + + // Success. return formattedDisks, nil } diff --git a/cmd/retry-storage.go b/cmd/retry-storage.go index 9bc8ab1a1..c30c152a3 100644 --- a/cmd/retry-storage.go +++ b/cmd/retry-storage.go @@ -22,12 +22,25 @@ import ( "github.com/minio/minio/pkg/disk" ) +const ( + // Attempt to retry only this many number of times before + // giving up on the remote disk entirely during initialization. + globalStorageInitRetryThreshold = 4 + + // Attempt to retry only this many number of times before + // giving up on the remote disk entirely after initialization. + globalStorageRetryThreshold = 1 +) + // Retry storage is an instance of StorageAPI which // additionally verifies upon network shutdown if the // underlying storage is available and is really // formatted. type retryStorage struct { - remoteStorage StorageAPI + remoteStorage StorageAPI + maxRetryAttempts int + retryUnit time.Duration + retryCap time.Duration } // String representation of remoteStorage. @@ -209,13 +222,13 @@ func (f retryStorage) reInit() (err error) { doneCh := make(chan struct{}) defer close(doneCh) - for i := range newRetryTimer(time.Second, time.Second*30, MaxJitter, doneCh) { + for i := range newRetryTimer(f.retryUnit, f.retryCap, MaxJitter, doneCh) { // Initialize and make a new login attempt. err = f.remoteStorage.Init() if err != nil { // No need to return error until the retry count // threshold has reached. - if i < globalMaxStorageRetryThreshold { + if i < f.maxRetryAttempts { continue } return err @@ -226,7 +239,7 @@ func (f retryStorage) reInit() (err error) { if err != nil { // No need to return error until the retry count // threshold has reached. - if i < globalMaxStorageRetryThreshold { + if i < f.maxRetryAttempts { continue } return err diff --git a/cmd/retry-storage_test.go b/cmd/retry-storage_test.go index 67315a8f2..04556323b 100644 --- a/cmd/retry-storage_test.go +++ b/cmd/retry-storage_test.go @@ -20,6 +20,7 @@ import ( "bytes" "reflect" "testing" + "time" ) // Tests retry storage. @@ -39,9 +40,14 @@ func TestRetryStorage(t *testing.T) { if !ok { t.Fatal("storage disk is not *retryStorage type") } - storageDisks[i] = &retryStorage{newNaughtyDisk(retryDisk, map[int]error{ - 1: errDiskNotFound, - }, nil)} + storageDisks[i] = &retryStorage{ + remoteStorage: newNaughtyDisk(retryDisk, map[int]error{ + 1: errDiskNotFound, + }, nil), + maxRetryAttempts: 1, + retryUnit: time.Millisecond, + retryCap: time.Millisecond * 10, + } } // Validate all the conditions for retrying calls. @@ -52,9 +58,14 @@ func TestRetryStorage(t *testing.T) { if !ok { t.Fatal("storage disk is not *retryStorage type") } - storageDisks[i] = &retryStorage{newNaughtyDisk(retryDisk, map[int]error{ - 1: errDiskNotFound, - }, nil)} + storageDisks[i] = &retryStorage{ + remoteStorage: newNaughtyDisk(retryDisk, map[int]error{ + 1: errDiskNotFound, + }, nil), + maxRetryAttempts: 1, + retryUnit: time.Millisecond, + retryCap: time.Millisecond * 10, + } } for _, disk := range storageDisks { @@ -77,9 +88,14 @@ func TestRetryStorage(t *testing.T) { if !ok { t.Fatal("storage disk is not *retryStorage type") } - storageDisks[i] = &retryStorage{newNaughtyDisk(retryDisk, map[int]error{ - 1: errDiskNotFound, - }, nil)} + storageDisks[i] = &retryStorage{ + remoteStorage: newNaughtyDisk(retryDisk, map[int]error{ + 1: errDiskNotFound, + }, nil), + maxRetryAttempts: 1, + retryUnit: time.Millisecond, + retryCap: time.Millisecond * 10, + } } for _, disk := range storageDisks { @@ -97,9 +113,14 @@ func TestRetryStorage(t *testing.T) { if !ok { t.Fatal("storage disk is not *retryStorage type") } - storageDisks[i] = &retryStorage{newNaughtyDisk(retryDisk, map[int]error{ - 1: errDiskNotFound, - }, nil)} + storageDisks[i] = &retryStorage{ + remoteStorage: newNaughtyDisk(retryDisk, map[int]error{ + 1: errDiskNotFound, + }, nil), + maxRetryAttempts: 1, + retryUnit: time.Millisecond, + retryCap: time.Millisecond * 10, + } } for _, disk := range storageDisks { @@ -114,9 +135,14 @@ func TestRetryStorage(t *testing.T) { if !ok { t.Fatal("storage disk is not *retryStorage type") } - storageDisks[i] = &retryStorage{newNaughtyDisk(retryDisk, map[int]error{ - 1: errDiskNotFound, - }, nil)} + storageDisks[i] = &retryStorage{ + remoteStorage: newNaughtyDisk(retryDisk, map[int]error{ + 1: errDiskNotFound, + }, nil), + maxRetryAttempts: 1, + retryUnit: time.Millisecond, + retryCap: time.Millisecond * 10, + } } for _, disk := range storageDisks { @@ -131,9 +157,14 @@ func TestRetryStorage(t *testing.T) { if !ok { t.Fatal("storage disk is not *retryStorage type") } - storageDisks[i] = &retryStorage{newNaughtyDisk(retryDisk, map[int]error{ - 1: errDiskNotFound, - }, nil)} + storageDisks[i] = &retryStorage{ + remoteStorage: newNaughtyDisk(retryDisk, map[int]error{ + 1: errDiskNotFound, + }, nil), + maxRetryAttempts: 1, + retryUnit: time.Millisecond, + retryCap: time.Millisecond * 10, + } } for _, disk := range storageDisks { @@ -151,9 +182,14 @@ func TestRetryStorage(t *testing.T) { if !ok { t.Fatal("storage disk is not *retryStorage type") } - storageDisks[i] = &retryStorage{newNaughtyDisk(retryDisk, map[int]error{ - 1: errDiskNotFound, - }, nil)} + storageDisks[i] = &retryStorage{ + remoteStorage: newNaughtyDisk(retryDisk, map[int]error{ + 1: errDiskNotFound, + }, nil), + maxRetryAttempts: 1, + retryUnit: time.Millisecond, + retryCap: time.Millisecond * 10, + } } for _, disk := range storageDisks { @@ -168,9 +204,14 @@ func TestRetryStorage(t *testing.T) { if !ok { t.Fatal("storage disk is not *retryStorage type") } - storageDisks[i] = &retryStorage{newNaughtyDisk(retryDisk, map[int]error{ - 1: errDiskNotFound, - }, nil)} + storageDisks[i] = &retryStorage{ + remoteStorage: newNaughtyDisk(retryDisk, map[int]error{ + 1: errDiskNotFound, + }, nil), + maxRetryAttempts: 1, + retryUnit: time.Millisecond, + retryCap: time.Millisecond * 10, + } } for _, disk := range storageDisks { @@ -185,9 +226,14 @@ func TestRetryStorage(t *testing.T) { if !ok { t.Fatal("storage disk is not *retryStorage type") } - storageDisks[i] = &retryStorage{newNaughtyDisk(retryDisk, map[int]error{ - 1: errDiskNotFound, - }, nil)} + storageDisks[i] = &retryStorage{ + remoteStorage: newNaughtyDisk(retryDisk, map[int]error{ + 1: errDiskNotFound, + }, nil), + maxRetryAttempts: 1, + retryUnit: time.Millisecond, + retryCap: time.Millisecond * 10, + } } for _, disk := range storageDisks { @@ -202,9 +248,14 @@ func TestRetryStorage(t *testing.T) { if !ok { t.Fatal("storage disk is not *retryStorage type") } - storageDisks[i] = &retryStorage{newNaughtyDisk(retryDisk, map[int]error{ - 1: errDiskNotFound, - }, nil)} + storageDisks[i] = &retryStorage{ + remoteStorage: newNaughtyDisk(retryDisk, map[int]error{ + 1: errDiskNotFound, + }, nil), + maxRetryAttempts: 1, + retryUnit: time.Millisecond, + retryCap: time.Millisecond * 10, + } } for _, disk := range storageDisks { @@ -223,9 +274,14 @@ func TestRetryStorage(t *testing.T) { if !ok { t.Fatal("storage disk is not *retryStorage type") } - storageDisks[i] = &retryStorage{newNaughtyDisk(retryDisk, map[int]error{ - 1: errDiskNotFound, - }, nil)} + storageDisks[i] = &retryStorage{ + remoteStorage: newNaughtyDisk(retryDisk, map[int]error{ + 1: errDiskNotFound, + }, nil), + maxRetryAttempts: 1, + retryUnit: time.Millisecond, + retryCap: time.Millisecond * 10, + } } for _, disk := range storageDisks { @@ -248,9 +304,14 @@ func TestRetryStorage(t *testing.T) { if !ok { t.Fatal("storage disk is not *retryStorage type") } - storageDisks[i] = &retryStorage{newNaughtyDisk(retryDisk, map[int]error{ - 1: errDiskNotFound, - }, nil)} + storageDisks[i] = &retryStorage{ + remoteStorage: newNaughtyDisk(retryDisk, map[int]error{ + 1: errDiskNotFound, + }, nil), + maxRetryAttempts: 1, + retryUnit: time.Millisecond, + retryCap: time.Millisecond * 10, + } } for _, disk := range storageDisks { @@ -268,9 +329,14 @@ func TestRetryStorage(t *testing.T) { if !ok { t.Fatal("storage disk is not *retryStorage type") } - storageDisks[i] = &retryStorage{newNaughtyDisk(retryDisk, map[int]error{ - 1: errDiskNotFound, - }, nil)} + storageDisks[i] = &retryStorage{ + remoteStorage: newNaughtyDisk(retryDisk, map[int]error{ + 1: errDiskNotFound, + }, nil), + maxRetryAttempts: 1, + retryUnit: time.Millisecond, + retryCap: time.Millisecond * 10, + } } for _, disk := range storageDisks { @@ -285,9 +351,14 @@ func TestRetryStorage(t *testing.T) { if !ok { t.Fatal("storage disk is not *retryStorage type") } - storageDisks[i] = &retryStorage{newNaughtyDisk(retryDisk, map[int]error{ - 1: errDiskNotFound, - }, nil)} + storageDisks[i] = &retryStorage{ + remoteStorage: newNaughtyDisk(retryDisk, map[int]error{ + 1: errDiskNotFound, + }, nil), + maxRetryAttempts: 1, + retryUnit: time.Millisecond, + retryCap: time.Millisecond * 10, + } } for _, disk := range storageDisks { @@ -306,9 +377,14 @@ func TestRetryStorage(t *testing.T) { if !ok { t.Fatal("storage disk is not *retryStorage type") } - storageDisks[i] = &retryStorage{newNaughtyDisk(retryDisk, map[int]error{ - 1: errDiskNotFound, - }, nil)} + storageDisks[i] = &retryStorage{ + remoteStorage: newNaughtyDisk(retryDisk, map[int]error{ + 1: errDiskNotFound, + }, nil), + maxRetryAttempts: 1, + retryUnit: time.Millisecond, + retryCap: time.Millisecond * 10, + } } for _, disk := range storageDisks { diff --git a/cmd/retry.go b/cmd/retry.go index 364212af1..d3ad59e20 100644 --- a/cmd/retry.go +++ b/cmd/retry.go @@ -90,7 +90,7 @@ func newRetryTimer(unit time.Duration, cap time.Duration, jitter float64, doneCh go func() { defer close(attemptCh) - nextBackoff := 1 + nextBackoff := 0 for { select { // Attempts starts. diff --git a/cmd/storage-rpc-client.go b/cmd/storage-rpc-client.go index 4fd0c85f9..ebe639dd1 100644 --- a/cmd/storage-rpc-client.go +++ b/cmd/storage-rpc-client.go @@ -241,8 +241,14 @@ func (n *networkStorage) String() string { return n.netAddr + ":" + n.netPath } -// maximum allowed network IOError. -const maxAllowedNetworkIOError = 1024 +// Network IO error count is kept at 256 with some simple +// math. Before we reject the disk completely. The combination +// of retry logic and total error count roughly comes around +// 2.5secs ( 2 * 5 * time.Millisecond * 256) which is when we +// basically take the disk offline completely. This is considered +// sufficient time tradeoff to avoid large delays in-terms of +// incoming i/o. +const maxAllowedNetworkIOError = 256 // maximum allowed network IOError. // Initializes the remote RPC connection by attempting a login attempt. func (n *networkStorage) Init() (err error) { diff --git a/cmd/test-utils_test.go b/cmd/test-utils_test.go index 39aeb9cc7..ba783bcbe 100644 --- a/cmd/test-utils_test.go +++ b/cmd/test-utils_test.go @@ -65,9 +65,6 @@ func init() { // Enable caching. setMaxMemory() - - // Tests don't need to retry. - globalMaxStorageRetryThreshold = 1 } func prepareFS() (ObjectLayer, string, error) { @@ -1656,7 +1653,12 @@ func prepareNErroredDisks(storageDisks []StorageAPI, offline int, err error, t * } for i := 0; i < offline; i++ { - storageDisks[i] = &naughtyDisk{disk: &retryStorage{storageDisks[i]}, defaultErr: err} + storageDisks[i] = &naughtyDisk{disk: &retryStorage{ + remoteStorage: storageDisks[i], + maxRetryAttempts: 1, + retryUnit: time.Millisecond, + retryCap: time.Millisecond * 10, + }, defaultErr: err} } return storageDisks }