@ -17,6 +17,7 @@
package cmd
import (
"math"
"sync"
"sync/atomic"
"time"
@ -26,7 +27,8 @@ const (
dynamicTimeoutIncreaseThresholdPct = 0.33 // Upper threshold for failures in order to increase timeout
dynamicTimeoutDecreaseThresholdPct = 0.10 // Lower threshold for failures in order to decrease timeout
dynamicTimeoutLogSize = 16
maxDuration = 1 << 63 - 1
maxDuration = math . MaxInt64
maxDynamicTimeout = 24 * time . Hour // Never set timeout bigger than this.
)
// timeouts that are dynamically adapted based on actual usage results
@ -40,6 +42,12 @@ type dynamicTimeout struct {
// newDynamicTimeout returns a new dynamic timeout initialized with timeout value
func newDynamicTimeout ( timeout , minimum time . Duration ) * dynamicTimeout {
if timeout <= 0 || minimum <= 0 {
panic ( "newDynamicTimeout: negative or zero timeout" )
}
if minimum > timeout {
minimum = timeout
}
return & dynamicTimeout { timeout : int64 ( timeout ) , minimum : int64 ( minimum ) }
}
@ -61,60 +69,73 @@ func (dt *dynamicTimeout) LogFailure() {
// logEntry stores a log entry
func ( dt * dynamicTimeout ) logEntry ( duration time . Duration ) {
if duration < 0 {
return
}
entries := int ( atomic . AddInt64 ( & dt . entries , 1 ) )
index := entries - 1
if index < dynamicTimeoutLogSize {
dt . mutex . Lock ( )
dt . log [ index ] = duration
dt . mutex . Unlock ( )
}
if entries == dynamicTimeoutLogSize {
dt . mutex . Lock ( )
// Make copy on stack in order to call adjust()
logCopy := [ dynamicTimeoutLogSize ] time . Duration { }
copy ( logCopy [ : ] , dt . log [ : ] )
// We leak entries while we copy
if entries == dynamicTimeoutLogSize {
// reset log entries
atomic . StoreInt64 ( & dt . entries , 0 )
// Make copy on stack in order to call adjust()
logCopy := [ dynamicTimeoutLogSize ] time . Duration { }
copy ( logCopy [ : ] , dt . log [ : ] )
dt . mutex . Unlock ( )
// reset log entries
atomic . StoreInt64 ( & dt . entries , 0 )
dt . mutex . Unlock ( )
dt . adjust ( logCopy )
dt . adjust ( logCopy )
return
}
dt . mutex . Unlock ( )
}
}
// adjust changes the value of the dynamic timeout based on the
// previous results
func ( dt * dynamicTimeout ) adjust ( entries [ dynamicTimeoutLogSize ] time . Duration ) {
failures , average := 0 , int64 ( 0 )
for i := 0 ; i < len ( entries ) ; i ++ {
if entries [ i ] == maxDuration {
failures , max := 0 , time . Duration ( 0 )
for _ , dur := range entries [ : ] {
if dur == maxDuration {
failures ++
} else {
average += int64 ( entries [ i ] )
} else if dur > max {
max = dur
}
}
if failures < len ( entries ) {
average /= int64 ( len ( entries ) - failures )
}
timeOutHit Pct := float64 ( failures ) / float64 ( len ( entries ) )
failPct := float64 ( failures ) / float64 ( len ( entries ) )
if timeOutHit Pct > dynamicTimeoutIncreaseThresholdPct {
if fail Pct > dynamicTimeoutIncreaseThresholdPct {
// We are hitting the timeout too often, so increase the timeout by 25%
timeout := atomic . LoadInt64 ( & dt . timeout ) * 125 / 100
atomic . StoreInt64 ( & dt . timeout , timeout )
} else if timeOutHitPct < dynamicTimeoutDecreaseThresholdPct {
// We are hitting the timeout relatively few times, so decrease the timeout
average = average * 125 / 100 // Add buffer of 25% on top of average
timeout := ( atomic . LoadInt64 ( & dt . timeout ) + average ) / 2 // Middle between current timeout and average success
// Set upper cap.
if timeout > int64 ( maxDynamicTimeout ) {
timeout = int64 ( maxDynamicTimeout )
}
// Safety, shouldn't happen
if timeout < dt . minimum {
timeout = dt . minimum
}
atomic . StoreInt64 ( & dt . timeout , timeout )
} else if failPct < dynamicTimeoutDecreaseThresholdPct {
// We are hitting the timeout relatively few times,
// so decrease the timeout towards 25 % of maximum time spent.
max = max * 125 / 100
timeout := atomic . LoadInt64 ( & dt . timeout )
if max < time . Duration ( timeout ) {
// Move 50% toward the max.
timeout = ( int64 ( max ) + timeout ) / 2
}
if timeout < dt . minimum {
timeout = dt . minimum
}
atomic . StoreInt64 ( & dt . timeout , timeout )
}
}