diff --git a/cmd/lock-rest-server.go b/cmd/lock-rest-server.go
index 9764780aa..df92e8144 100644
--- a/cmd/lock-rest-server.go
+++ b/cmd/lock-rest-server.go
@@ -225,8 +225,6 @@ func getLongLivedLocks(interval time.Duration) map[Endpoint][]nameLockRequesterI
 	return nlripMap
 }
 
-var lockMaintenanceTimeout = newDynamicTimeout(60*time.Second, time.Second)
-
 // lockMaintenance loops over locks that have been active for some time and checks back
 // with the original server whether it is still alive or not
 //
@@ -235,28 +233,19 @@ var lockMaintenanceTimeout = newDynamicTimeout(60*time.Second, time.Second)
 // - some network error (and server is up normally)
 //
 // We will ignore the error, and we will retry later to get a resolve on this lock
-func lockMaintenance(ctx context.Context, interval time.Duration, objAPI ObjectLayer) error {
-	// Lock to avoid concurrent lock maintenance loops
-	maintenanceLock := objAPI.NewNSLock(ctx, "system", "lock-maintenance-ops")
-	if err := maintenanceLock.GetLock(lockMaintenanceTimeout); err != nil {
-		return err
-	}
-	defer maintenanceLock.Unlock()
-
+func lockMaintenance(ctx context.Context, interval time.Duration) error {
 	// Validate if long lived locks are indeed clean.
 	// Get list of long lived locks to check for staleness.
 	for lendpoint, nlrips := range getLongLivedLocks(interval) {
+		nlripsMap := make(map[string]int, len(nlrips))
 		for _, nlrip := range nlrips {
 			// Locks are only held on first zone, make sure that
 			// we only look for ownership of locks from endpoints
 			// on first zone.
 			for _, endpoint := range globalEndpoints[0].Endpoints {
-				if endpoint.String() == lendpoint.String() {
-					continue
-				}
-
 				c := newLockAPI(endpoint)
 				if !c.IsOnline() {
+					nlripsMap[nlrip.name]++
 					continue
 				}
 
@@ -266,25 +255,37 @@ func lockMaintenance(ctx context.Context, interval time.Duration, objAPI ObjectL
 					UID:       nlrip.lri.UID,
 					Resources: []string{nlrip.name},
 				})
-
 				if err != nil {
+					nlripsMap[nlrip.name]++
 					c.Close()
 					continue
 				}
 
-				// For successful response, verify if lock was indeed active or stale.
-				if expired {
-					// The lock is no longer active at server that originated
-					// the lock, attempt to remove the lock.
-					globalLockServers[lendpoint].mutex.Lock()
-					// Purge the stale entry if it exists.
-					globalLockServers[lendpoint].removeEntryIfExists(nlrip)
-					globalLockServers[lendpoint].mutex.Unlock()
+				if !expired {
+					nlripsMap[nlrip.name]++
 				}
 
 				// Close the connection regardless of the call response.
 				c.Close()
 			}
+
+			// Read locks we assume quorum for be N/2 success
+			quorum := globalXLSetDriveCount / 2
+			if nlrip.lri.Writer {
+				// For write locks we need N/2+1 success
+				quorum = globalXLSetDriveCount/2 + 1
+			}
+
+			// less than the quorum, we have locks expired.
+			if nlripsMap[nlrip.name] < quorum {
+				// The lock is no longer active at server that originated
+				// the lock, attempt to remove the lock.
+				globalLockServers[lendpoint].mutex.Lock()
+				// Purge the stale entry if it exists.
+				globalLockServers[lendpoint].removeEntryIfExists(nlrip)
+				globalLockServers[lendpoint].mutex.Unlock()
+			}
 		}
 	}
@@ -293,12 +294,13 @@ func lockMaintenance(ctx context.Context, interval time.Duration, objAPI ObjectL
 // Start lock maintenance from all lock servers.
 func startLockMaintenance() {
-	var objAPI ObjectLayer
 	var ctx = context.Background()
 
 	// Wait until the object API is ready
+	// no need to start the lock maintenance
+	// if ObjectAPI is not initialized.
 	for {
-		objAPI = newObjectLayerWithoutSafeModeFn()
+		objAPI := newObjectLayerWithoutSafeModeFn()
 		if objAPI == nil {
 			time.Sleep(time.Second)
 			continue
 		}
@@ -322,7 +324,7 @@ func startLockMaintenance() {
 		// "synchronous checks" between servers
 		duration := time.Duration(r.Float64() * float64(lockMaintenanceInterval))
 		time.Sleep(duration)
-		if err := lockMaintenance(ctx, lockValidityCheckInterval, objAPI); err != nil {
+		if err := lockMaintenance(ctx, lockValidityCheckInterval); err != nil {
 			// Sleep right after an error.
 			duration := time.Duration(r.Float64() * float64(lockMaintenanceInterval))
 			time.Sleep(duration)
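
Note on the quorum rule the patch introduces: for each lock name, the maintenance loop now counts every endpoint that is offline, returns an error, or reports the lock as not expired, and purges the entry only when that count falls below quorum, so unreachable servers bias the check toward keeping the lock rather than dropping it. Below is a minimal standalone sketch of that rule; the helper name `staleLockQuorum` and the drive count of 8 are illustrative assumptions, standing in for `globalXLSetDriveCount` and `nlrip.lri.Writer` in the diff itself.

```go
package main

import "fmt"

// staleLockQuorum mirrors the rule added in the patch: a read lock is treated
// as still held with N/2 "active" responses, a write lock needs N/2+1.
// The helper name and signature are illustrative, not part of the diff.
func staleLockQuorum(driveCount int, isWriteLock bool) int {
	quorum := driveCount / 2
	if isWriteLock {
		quorum = driveCount/2 + 1
	}
	return quorum
}

func main() {
	const driveCount = 8 // assumed stand-in for globalXLSetDriveCount

	fmt.Println(staleLockQuorum(driveCount, false)) // 4: read lock quorum
	fmt.Println(staleLockQuorum(driveCount, true))  // 5: write lock quorum

	// In lockMaintenance above, nlripsMap[nlrip.name] is the per-lock tally of
	// offline endpoints, failed calls, and "not expired" responses; an entry is
	// purged only when that tally drops below the quorum computed here.
}
```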