Add more delays on distributed startup for slow network (#5240)

Refer #5237
Branch: master
Author: Harshavardhana (7 years ago), committed by Dee Koder
Parent: ffdf115bf2
Commit: 819d1e80c6
Changed files (lines added + removed):
  1. cmd/admin-handlers.go (2)
  2. cmd/prepare-storage.go (14)
  3. cmd/retry-storage.go (34)

--- a/cmd/admin-handlers.go
+++ b/cmd/admin-handlers.go
@@ -842,7 +842,7 @@ func (adminAPI adminAPIHandlers) HealFormatHandler(w http.ResponseWriter, r *htt
     // Wrap into retrying disks
     retryingDisks := initRetryableStorageDisks(bootstrapDisks,
-        time.Millisecond, time.Millisecond*5)
+        time.Millisecond, time.Millisecond*5, globalStorageHealthCheckInterval, globalStorageRetryThreshold)
     // Heal format.json on available storage.
     err = healFormatXL(retryingDisks)

--- a/cmd/prepare-storage.go
+++ b/cmd/prepare-storage.go
@@ -318,13 +318,14 @@ func initStorageDisks(endpoints EndpointList) ([]StorageAPI, error) {
 }

 // Wrap disks into retryable disks.
-func initRetryableStorageDisks(disks []StorageAPI, retryUnit, retryCap time.Duration) (outDisks []StorageAPI) {
+func initRetryableStorageDisks(disks []StorageAPI, retryUnit, retryCap, retryInterval time.Duration, retryThreshold int) (outDisks []StorageAPI) {
     // Initialize the disk into a retryable-disks wrapper.
     outDisks = make([]StorageAPI, len(disks))
     for i, disk := range disks {
         outDisks[i] = &retryStorage{
             remoteStorage:    disk,
-            maxRetryAttempts: globalStorageRetryThreshold,
+            retryInterval:    retryInterval,
+            maxRetryAttempts: retryThreshold,
             retryUnit:        retryUnit,
             retryCap:         retryCap,
             offlineTimestamp: UTCNow(), // Set timestamp to prevent immediate marking as offline
@@ -346,19 +347,20 @@ func waitForFormatXLDisks(firstDisk bool, endpoints EndpointList, storageDisks [
     // retry window (30 seconds, with once-per-second retries) so
     // that we wait enough amount of time before the disks come
     // online.
-    retryDisks := initRetryableStorageDisks(storageDisks, time.Second, time.Second*30)
+    retryDisks := initRetryableStorageDisks(storageDisks, time.Second, time.Second*30,
+        globalStorageInitHealthCheckInterval, globalStorageInitRetryThreshold)

     // Start retry loop retrying until disks are formatted
     // properly, until we have reached a conditional quorum of
     // formatted disks.
-    err = retryFormattingXLDisks(firstDisk, endpoints, retryDisks)
-    if err != nil {
+    if err = retryFormattingXLDisks(firstDisk, endpoints, retryDisks); err != nil {
         return nil, err
     }

     // Initialize the disk into a formatted disks wrapper. This
     // uses a shorter retry window (5ms with once-per-ms retries)
-    formattedDisks = initRetryableStorageDisks(storageDisks, time.Millisecond, time.Millisecond*5)
+    formattedDisks = initRetryableStorageDisks(storageDisks, time.Millisecond, time.Millisecond*5,
+        globalStorageHealthCheckInterval, globalStorageRetryThreshold)

     // Success.
     return formattedDisks, nil
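For reference, the two call sites above now use different tuning profiles: a patient one while waiting for all distributed nodes to come up and format (1s retry unit, 30s cap, 15-minute health-check interval, threshold 2), and a tight one once format.json is in place (1ms unit, 5ms cap, 5-minute interval, threshold 1). A minimal sketch of those two profiles; the retryProfile struct is hypothetical and only the values are taken from this commit:

package main

import (
	"fmt"
	"time"
)

// retryProfile is a hypothetical grouping of the four knobs that
// initRetryableStorageDisks now accepts.
type retryProfile struct {
	retryUnit      time.Duration // base unit for backoff between attempts
	retryCap       time.Duration // upper bound on backoff
	retryInterval  time.Duration // how often an offline node is re-checked
	retryThreshold int           // maxRetryAttempts before giving up
}

var (
	// While formatting disks at startup: be patient with slow networks.
	initProfile = retryProfile{time.Second, 30 * time.Second, 15 * time.Minute, 2}

	// After formatting is done: fail fast and re-check less often.
	steadyProfile = retryProfile{time.Millisecond, 5 * time.Millisecond, 5 * time.Minute, 1}
)

func main() {
	fmt.Printf("init:   %+v\n", initProfile)
	fmt.Printf("steady: %+v\n", steadyProfile)
}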

--- a/cmd/retry-storage.go
+++ b/cmd/retry-storage.go
@@ -23,12 +23,24 @@ import (
 )

 const (
+    // NOTE: Values indicated here are based on manual testing and
+    // for best case scenarios under wide array of setups. If you
+    // encounter changes in future feel free to change these values.
+
+    // Attempt to retry only this many number of times before
+    // giving up on the remote disk entirely during initialization.
+    globalStorageInitRetryThreshold = 2
+
     // Attempt to retry only this many number of times before
     // giving up on the remote disk entirely after initialization.
     globalStorageRetryThreshold = 1

     // Interval to check health status of a node whether it has
-    // come back up online
+    // come back up online during initialization.
+    globalStorageInitHealthCheckInterval = 15 * time.Minute
+
+    // Interval to check health status of a node whether it has
+    // come back up online.
     globalStorageHealthCheckInterval = 5 * time.Minute
 )
@@ -52,6 +64,7 @@ func retryToStorageErr(err error) error {
 type retryStorage struct {
     remoteStorage    StorageAPI
     maxRetryAttempts int
+    retryInterval    time.Duration
     retryUnit        time.Duration
     retryCap         time.Duration
     offline          bool // Mark whether node is offline
@@ -78,7 +91,7 @@ func (f *retryStorage) Close() (err error) {
 // restore the connection
 func (f *retryStorage) IsOffline() bool {
     // Check if offline and whether enough time has lapsed since most recent check
-    if f.offline && UTCNow().Sub(f.offlineTimestamp) >= globalStorageHealthCheckInterval {
+    if f.offline && UTCNow().Sub(f.offlineTimestamp) >= f.retryInterval {
         f.offlineTimestamp = UTCNow() // reset timestamp
         if e := f.reInit(nil); e == nil {
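The IsOffline change above swaps the package-level globalStorageHealthCheckInterval for the per-instance retryInterval, so init-time and steady-state wrappers can re-check an offline node at different rates. A standalone sketch of that gating pattern, with hypothetical names (offlineChecker and its reInit func field) standing in for retryStorage:

package main

import (
	"errors"
	"fmt"
	"time"
)

// offlineChecker is a hypothetical, stripped-down stand-in for retryStorage:
// it only models the offline flag, the timestamp of the last check, and the
// configurable retryInterval introduced by this commit.
type offlineChecker struct {
	offline          bool
	offlineTimestamp time.Time
	retryInterval    time.Duration
	reInit           func() error // stands in for (*retryStorage).reInit
}

// IsOffline mirrors the gating logic: while offline, a reconnect is attempted
// at most once per retryInterval, and the timestamp is reset on every attempt.
func (c *offlineChecker) IsOffline() bool {
	if c.offline && time.Now().UTC().Sub(c.offlineTimestamp) >= c.retryInterval {
		c.offlineTimestamp = time.Now().UTC() // reset timestamp
		if err := c.reInit(); err == nil {
			c.offline = false // node is reachable again
		}
	}
	return c.offline
}

func main() {
	c := &offlineChecker{
		offline:          true,
		offlineTimestamp: time.Now().UTC().Add(-10 * time.Minute), // pretend last check was 10m ago
		retryInterval:    5 * time.Minute,                         // post-init interval from the diff
		reInit:           func() error { return errors.New("still unreachable") },
	}
	fmt.Println("offline:", c.IsOffline()) // true: a reconnect was attempted but failed
}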
@@ -260,15 +273,13 @@ func (f *retryStorage) reInitUponDiskNotFound(err error) bool {
     return false
 }

-// Connect and attempt to load the format from a disconnected node,
-// attempts three times before giving up.
+// Connect and attempt to load the format from a disconnected node.
+// Additionally upon failure, we retry maxRetryAttempts times before
+// giving up. Essentially as a whole it would mean we are infact
+// performing 1 + maxRetryAttempts times reInit.
 func (f *retryStorage) reInit(e error) (err error) {
-    // Only after initialization and minimum of one interval
-    // has passed (to prevent marking a node as offline right
-    // after initialization), check whether node has gone offline
-    if f.maxRetryAttempts == globalStorageRetryThreshold &&
-        UTCNow().Sub(f.offlineTimestamp) >= globalStorageHealthCheckInterval {
+    // Check whether node has gone offline.
+    if UTCNow().Sub(f.offlineTimestamp) >= f.retryInterval {
         if e == errDiskNotFoundFromNetError { // Make node offline due to network error
             f.offline = true // Marking node offline
             f.offlineTimestamp = UTCNow()
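The rewritten doc comment above states that reInit as a whole performs 1 + maxRetryAttempts attempts. A tiny illustrative loop with that arithmetic; the helper name is hypothetical and this is not the actual reInit body:

package main

import (
	"errors"
	"fmt"
)

// attemptWithRetries is a hypothetical helper showing the
// "1 + maxRetryAttempts" counting described in the new comment:
// one initial attempt plus maxRetryAttempts retries.
func attemptWithRetries(attempt func() error, maxRetryAttempts int) (tries int, err error) {
	for i := 0; i <= maxRetryAttempts; i++ { // i = 0 is the initial attempt
		tries++
		if err = attempt(); err == nil {
			return tries, nil
		}
	}
	return tries, err
}

func main() {
	failing := func() error { return errors.New("format not found") }
	tries, err := attemptWithRetries(failing, 1) // globalStorageRetryThreshold = 1
	fmt.Printf("tries=%d err=%v\n", tries, err)  // tries=2 (1 + 1), still failing
}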
@@ -299,8 +310,7 @@ func (f *retryStorage) reInit(e error) (err error) {
         // Attempt to load format to see if the disk is really
         // a formatted disk and part of the cluster.
-        _, err = loadFormat(f.remoteStorage)
-        if err != nil {
+        if _, err = loadFormat(f.remoteStorage); err != nil {
             // No need to return error until the retry count
             // threshold has reached.
             if i < f.maxRetryAttempts {
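The last hunk is purely a style cleanup: the separate assignment and error check are folded into one if statement. A small example of the idiom with a hypothetical loadFormat stand-in, showing that the semantics are unchanged and err is still assigned in the enclosing scope:

package main

import (
	"errors"
	"fmt"
)

// loadFormat is a hypothetical stand-in returning (value, error),
// mirroring the shape of the call in the hunk above.
func loadFormat() (string, error) { return "", errors.New("disk not formatted yet") }

func main() {
	var err error

	// Before: assignment on its own line, then the error check.
	_, err = loadFormat()
	if err != nil {
		fmt.Println("old form:", err)
	}

	// After: assignment folded into the if statement; err is the same
	// outer variable, so behaviour is identical.
	if _, err = loadFormat(); err != nil {
		fmt.Println("new form:", err)
	}
}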
