diff --git a/cmd/admin-handlers.go b/cmd/admin-handlers.go index cc25a7cc4..2c76a7ffc 100644 --- a/cmd/admin-handlers.go +++ b/cmd/admin-handlers.go @@ -842,7 +842,7 @@ func (adminAPI adminAPIHandlers) HealFormatHandler(w http.ResponseWriter, r *htt // Wrap into retrying disks retryingDisks := initRetryableStorageDisks(bootstrapDisks, - time.Millisecond, time.Millisecond*5) + time.Millisecond, time.Millisecond*5, globalStorageHealthCheckInterval, globalStorageRetryThreshold) // Heal format.json on available storage. err = healFormatXL(retryingDisks) diff --git a/cmd/prepare-storage.go b/cmd/prepare-storage.go index 523114da1..a30ba0199 100644 --- a/cmd/prepare-storage.go +++ b/cmd/prepare-storage.go @@ -318,13 +318,14 @@ func initStorageDisks(endpoints EndpointList) ([]StorageAPI, error) { } // Wrap disks into retryable disks. -func initRetryableStorageDisks(disks []StorageAPI, retryUnit, retryCap time.Duration) (outDisks []StorageAPI) { +func initRetryableStorageDisks(disks []StorageAPI, retryUnit, retryCap, retryInterval time.Duration, retryThreshold int) (outDisks []StorageAPI) { // Initialize the disk into a retryable-disks wrapper. outDisks = make([]StorageAPI, len(disks)) for i, disk := range disks { outDisks[i] = &retryStorage{ remoteStorage: disk, - maxRetryAttempts: globalStorageRetryThreshold, + retryInterval: retryInterval, + maxRetryAttempts: retryThreshold, retryUnit: retryUnit, retryCap: retryCap, offlineTimestamp: UTCNow(), // Set timestamp to prevent immediate marking as offline @@ -346,19 +347,20 @@ func waitForFormatXLDisks(firstDisk bool, endpoints EndpointList, storageDisks [ // retry window (30 seconds, with once-per-second retries) so // that we wait enough amount of time before the disks come // online. 
- retryDisks := initRetryableStorageDisks(storageDisks, time.Second, time.Second*30) + retryDisks := initRetryableStorageDisks(storageDisks, time.Second, time.Second*30, + globalStorageInitHealthCheckInterval, globalStorageInitRetryThreshold) // Start retry loop retrying until disks are formatted // properly, until we have reached a conditional quorum of // formatted disks. - err = retryFormattingXLDisks(firstDisk, endpoints, retryDisks) - if err != nil { + if err = retryFormattingXLDisks(firstDisk, endpoints, retryDisks); err != nil { return nil, err } // Initialize the disk into a formatted disks wrapper. This // uses a shorter retry window (5ms with once-per-ms retries) - formattedDisks = initRetryableStorageDisks(storageDisks, time.Millisecond, time.Millisecond*5) + formattedDisks = initRetryableStorageDisks(storageDisks, time.Millisecond, time.Millisecond*5, + globalStorageHealthCheckInterval, globalStorageRetryThreshold) // Success. return formattedDisks, nil diff --git a/cmd/retry-storage.go b/cmd/retry-storage.go index a61952016..da68b9119 100644 --- a/cmd/retry-storage.go +++ b/cmd/retry-storage.go @@ -23,12 +23,24 @@ import ( ) const ( + // NOTE: Values indicated here are based on manual testing and + // for best case scenarios under a wide array of setups. If you + // encounter changes in the future, feel free to change these values. + + // Attempt to retry only this many times before + // giving up on the remote disk entirely during initialization. + globalStorageInitRetryThreshold = 2 + // Attempt to retry only this many times before + // giving up on the remote disk entirely after initialization. + globalStorageRetryThreshold = 1 + // Interval to check health status of a node whether it has - // come back up online + // come back up online during initialization. + globalStorageInitHealthCheckInterval = 15 * time.Minute + + // Interval to check health status of a node whether it has + // come back up online. 
globalStorageHealthCheckInterval = 5 * time.Minute ) @@ -52,6 +64,7 @@ func retryToStorageErr(err error) error { type retryStorage struct { remoteStorage StorageAPI maxRetryAttempts int + retryInterval time.Duration retryUnit time.Duration retryCap time.Duration offline bool // Mark whether node is offline @@ -78,7 +91,7 @@ func (f *retryStorage) Close() (err error) { // restore the connection func (f *retryStorage) IsOffline() bool { // Check if offline and whether enough time has lapsed since most recent check - if f.offline && UTCNow().Sub(f.offlineTimestamp) >= globalStorageHealthCheckInterval { + if f.offline && UTCNow().Sub(f.offlineTimestamp) >= f.retryInterval { f.offlineTimestamp = UTCNow() // reset timestamp if e := f.reInit(nil); e == nil { @@ -260,15 +273,13 @@ func (f *retryStorage) reInitUponDiskNotFound(err error) bool { return false } -// Connect and attempt to load the format from a disconnected node, -// attempts three times before giving up. +// Connect and attempt to load the format from a disconnected node. +// Additionally upon failure, we retry maxRetryAttempts times before +// giving up. Essentially as a whole it would mean we are in fact +// performing 1 + maxRetryAttempts times reInit. func (f *retryStorage) reInit(e error) (err error) { - - // Only after initialization and minimum of one interval - // has passed (to prevent marking a node as offline right - // after initialization), check whether node has gone offline - if f.maxRetryAttempts == globalStorageRetryThreshold && - UTCNow().Sub(f.offlineTimestamp) >= globalStorageHealthCheckInterval { + // Check whether node has gone offline. 
+ if UTCNow().Sub(f.offlineTimestamp) >= f.retryInterval { if e == errDiskNotFoundFromNetError { // Make node offline due to network error f.offline = true // Marking node offline f.offlineTimestamp = UTCNow() @@ -299,8 +310,7 @@ func (f *retryStorage) reInit(e error) (err error) { // Attempt to load format to see if the disk is really // a formatted disk and part of the cluster. - _, err = loadFormat(f.remoteStorage) - if err != nil { + if _, err = loadFormat(f.remoteStorage); err != nil { // No need to return error until the retry count // threshold has reached. if i < f.maxRetryAttempts {