fix: fix lockup in merge-walk pool (#10098)

Fixes two different types of problems

- continuation of the problem seen in FS #9992
  as not fixed for erasure coded deployments,
  reproduced this issue with spark and its fixed now

- another issue was leaking walk go-routines which
  would lead to high memory usage and crash the system
  this is simply because all the walks which were purged
  at the top limit had leaking end walkers which would
  consume memory endlessly.

closes #9966
closes #10088
master
Harshavardhana 4 years ago committed by GitHub
parent 11d21d5d1b
commit 0c4be55936
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 130
      cmd/merge-walk-pool.go
  2. 38
      cmd/merge-walk-pool_test.go
  3. 40
      cmd/tree-walk-pool.go

@ -1,5 +1,5 @@
/* /*
* MinIO Cloud Storage, (C) 2019 MinIO, Inc. * MinIO Cloud Storage, (C) 2019, 2020 MinIO, Inc.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
@ -28,6 +28,7 @@ const (
// mergeWalkVersions - represents the go routine that does the merge walk versions. // mergeWalkVersions - represents the go routine that does the merge walk versions.
type mergeWalkVersions struct { type mergeWalkVersions struct {
added time.Time
entryChs []FileInfoVersionsCh entryChs []FileInfoVersionsCh
endWalkCh chan struct{} // To signal when mergeWalk go-routine should end. endWalkCh chan struct{} // To signal when mergeWalk go-routine should end.
endTimerCh chan<- struct{} // To signal when timer go-routine should end. endTimerCh chan<- struct{} // To signal when timer go-routine should end.
@ -88,17 +89,70 @@ func (t *MergeWalkVersionsPool) Set(params listParams, resultChs []FileInfoVersi
t.Lock() t.Lock()
defer t.Unlock() defer t.Unlock()
// If we are above the limit delete at least one entry from the pool.
if len(t.pool) > treeWalkEntryLimit {
age := time.Now()
var oldest listParams
for k, v := range t.pool {
if len(v) == 0 {
delete(t.pool, k)
continue
}
// The first element is the oldest, so we only check that.
e := v[0]
if e.added.Before(age) {
oldest = k
age = e.added
}
}
// Invalidate and delete oldest.
if walks, ok := t.pool[oldest]; ok && len(walks) > 0 {
endCh := walks[0].endTimerCh
endWalkCh := walks[0].endWalkCh
if len(walks) > 1 {
// Move walks forward
copy(walks, walks[1:])
walks = walks[:len(walks)-1]
t.pool[oldest] = walks
} else {
// Only entry, just delete.
delete(t.pool, oldest)
}
select {
case endCh <- struct{}{}:
close(endWalkCh)
default:
}
} else {
// Shouldn't happen, but just in case.
delete(t.pool, oldest)
}
}
// Should be a buffered channel so that Release() never blocks. // Should be a buffered channel so that Release() never blocks.
endTimerCh := make(chan struct{}, 1) endTimerCh := make(chan struct{}, 1)
walkInfo := mergeWalkVersions{ walkInfo := mergeWalkVersions{
added: UTCNow(),
entryChs: resultChs, entryChs: resultChs,
endWalkCh: endWalkCh, endWalkCh: endWalkCh,
endTimerCh: endTimerCh, endTimerCh: endTimerCh,
} }
// Append new walk info. // Append new walk info.
t.pool[params] = append(t.pool[params], walkInfo) walks := t.pool[params]
if len(walks) < treeWalkSameEntryLimit {
t.pool[params] = append(walks, walkInfo)
} else {
// We are at limit, invalidate oldest, move list down and add new as last.
select {
case walks[0].endTimerCh <- struct{}{}:
close(walks[0].endWalkCh)
default:
}
copy(walks, walks[1:])
walks[len(walks)-1] = walkInfo
}
// Timer go-routine which times out after t.timeOut seconds. // Timer go-routine which times out after t.timeOut seconds.
go func(endTimerCh <-chan struct{}, walkInfo mergeWalkVersions) { go func(endTimerCh <-chan struct{}, walkInfo mergeWalkVersions) {
@ -108,6 +162,7 @@ func (t *MergeWalkVersionsPool) Set(params listParams, resultChs []FileInfoVersi
// Timeout has expired. Remove the mergeWalk from mergeWalkPool and // Timeout has expired. Remove the mergeWalk from mergeWalkPool and
// end the mergeWalk go-routine. // end the mergeWalk go-routine.
t.Lock() t.Lock()
defer t.Unlock()
walks, ok := t.pool[params] walks, ok := t.pool[params]
if ok { if ok {
// Trick of filtering without allocating // Trick of filtering without allocating
@ -131,7 +186,6 @@ func (t *MergeWalkVersionsPool) Set(params listParams, resultChs []FileInfoVersi
} }
// Signal the mergeWalk go-routine to die. // Signal the mergeWalk go-routine to die.
close(endWalkCh) close(endWalkCh)
t.Unlock()
case <-endTimerCh: case <-endTimerCh:
return return
} }
@ -166,23 +220,22 @@ func (t *MergeWalkPool) Release(params listParams) ([]FileInfoCh, chan struct{})
t.Lock() t.Lock()
defer t.Unlock() defer t.Unlock()
walks, ok := t.pool[params] // Pick the valid walks. walks, ok := t.pool[params] // Pick the valid walks.
if ok { if !ok || len(walks) == 0 {
if len(walks) > 0 { // Release return nil if params not found.
// Pop out the first valid walk entry. return nil, nil
walk := walks[0] }
walks[0] = mergeWalk{} // clear references.
walks = walks[1:] // Pop out the first valid walk entry.
if len(walks) > 0 { walk := walks[0]
t.pool[params] = walks walks[0] = mergeWalk{} // clear references.
} else { walks = walks[1:]
delete(t.pool, params) if len(walks) > 0 {
} t.pool[params] = walks
walk.endTimerCh <- struct{}{} } else {
return walk.entryChs, walk.endWalkCh delete(t.pool, params)
}
} }
// Release return nil if params not found. walk.endTimerCh <- struct{}{}
return nil, nil return walk.entryChs, walk.endWalkCh
} }
// Set - adds a mergeWalk to the mergeWalkPool. // Set - adds a mergeWalk to the mergeWalkPool.
@ -207,27 +260,38 @@ func (t *MergeWalkPool) Set(params listParams, resultChs []FileInfoCh, endWalkCh
continue continue
} }
// The first element is the oldest, so we only check that. // The first element is the oldest, so we only check that.
if v[0].added.Before(age) { e := v[0]
if e.added.Before(age) {
oldest = k oldest = k
age = e.added
} }
} }
// Invalidate and delete oldest. // Invalidate and delete oldest.
if walks, ok := t.pool[oldest]; ok { if walks, ok := t.pool[oldest]; ok && len(walks) > 0 {
walk := walks[0] endCh := walks[0].endTimerCh
walks[0] = mergeWalk{} // clear references. endWalkCh := walks[0].endWalkCh
walks = walks[1:] if len(walks) > 1 {
if len(walks) > 0 { // Move walks forward
t.pool[params] = walks copy(walks, walks[1:])
walks = walks[:len(walks)-1]
t.pool[oldest] = walks
} else { } else {
delete(t.pool, params) // Only entry, just delete.
delete(t.pool, oldest)
}
select {
case endCh <- struct{}{}:
close(endWalkCh)
default:
} }
walk.endTimerCh <- struct{}{} } else {
// Shouldn't happen, but just in case.
delete(t.pool, oldest)
} }
} }
// Should be a buffered channel so that Release() never blocks. // Should be a buffered channel so that Release() never blocks.
endTimerCh := make(chan struct{}, 1) endTimerCh := make(chan struct{}, 1)
walkInfo := mergeWalk{ walkInfo := mergeWalk{
added: UTCNow(), added: UTCNow(),
entryChs: resultChs, entryChs: resultChs,
@ -241,7 +305,11 @@ func (t *MergeWalkPool) Set(params listParams, resultChs []FileInfoCh, endWalkCh
t.pool[params] = append(walks, walkInfo) t.pool[params] = append(walks, walkInfo)
} else { } else {
// We are at limit, invalidate oldest, move list down and add new as last. // We are at limit, invalidate oldest, move list down and add new as last.
walks[0].endTimerCh <- struct{}{} select {
case walks[0].endTimerCh <- struct{}{}:
close(walks[0].endWalkCh)
default:
}
copy(walks, walks[1:]) copy(walks, walks[1:])
walks[len(walks)-1] = walkInfo walks[len(walks)-1] = walkInfo
} }
@ -254,6 +322,7 @@ func (t *MergeWalkPool) Set(params listParams, resultChs []FileInfoCh, endWalkCh
// Timeout has expired. Remove the mergeWalk from mergeWalkPool and // Timeout has expired. Remove the mergeWalk from mergeWalkPool and
// end the mergeWalk go-routine. // end the mergeWalk go-routine.
t.Lock() t.Lock()
defer t.Unlock()
walks, ok := t.pool[params] walks, ok := t.pool[params]
if ok { if ok {
// Trick of filtering without allocating // Trick of filtering without allocating
@ -277,7 +346,6 @@ func (t *MergeWalkPool) Set(params listParams, resultChs []FileInfoCh, endWalkCh
} }
// Signal the mergeWalk go-routine to die. // Signal the mergeWalk go-routine to die.
close(endWalkCh) close(endWalkCh)
t.Unlock()
case <-endTimerCh: case <-endTimerCh:
return return
} }

@ -1,5 +1,5 @@
/* /*
* MinIO Cloud Storage, (C) 2019 MinIO, Inc. * MinIO Cloud Storage, (C) 2019,2020 MinIO, Inc.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
@ -21,6 +21,42 @@ import (
"time" "time"
) )
// Test if tree walker go-routine is removed from the pool after timeout
// and that is available in the pool before the timeout.
func TestMergeWalkPoolVersionsBasic(t *testing.T) {
// Create a treeWalkPool
tw := NewMergeWalkVersionsPool(1 * time.Second)
// Create sample params
params := listParams{
bucket: "test-bucket",
}
endWalkCh := make(chan struct{})
// Add a treeWalk to the pool
tw.Set(params, []FileInfoVersionsCh{}, endWalkCh)
// Wait for treeWalkPool timeout to happen
<-time.After(2 * time.Second)
if c1, _ := tw.Release(params); c1 != nil {
t.Error("treeWalk go-routine must have been freed")
}
// Add the treeWalk back to the pool
endWalkCh = make(chan struct{})
tw.Set(params, []FileInfoVersionsCh{}, endWalkCh)
// Release the treeWalk before timeout
select {
case <-time.After(1 * time.Second):
break
default:
if c1, _ := tw.Release(params); c1 == nil {
t.Error("treeWalk go-routine got freed before timeout")
}
}
}
// Test if tree walker go-routine is removed from the pool after timeout // Test if tree walker go-routine is removed from the pool after timeout
// and that is available in the pool before the timeout. // and that is available in the pool before the timeout.
func TestMergeWalkPoolBasic(t *testing.T) { func TestMergeWalkPoolBasic(t *testing.T) {

@ -1,5 +1,5 @@
/* /*
* MinIO Cloud Storage, (C) 2016 MinIO, Inc. * MinIO Cloud Storage, (C) 2016-2020 MinIO, Inc.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
@ -80,23 +80,21 @@ func (t *TreeWalkPool) Release(params listParams) (resultCh chan TreeWalkResult,
t.mu.Lock() t.mu.Lock()
defer t.mu.Unlock() defer t.mu.Unlock()
walks, ok := t.pool[params] // Pick the valid walks. walks, ok := t.pool[params] // Pick the valid walks.
if ok { if !ok || len(walks) == 0 {
if len(walks) > 0 { // Release return nil if params not found.
// Pop out the first valid walk entry. return nil, nil
walk := walks[0]
walks[0] = treeWalk{} // clear references.
walks = walks[1:]
if len(walks) > 0 {
t.pool[params] = walks
} else {
delete(t.pool, params)
}
walk.endTimerCh <- struct{}{}
return walk.resultCh, walk.endWalkCh
}
} }
// Release return nil if params not found.
return nil, nil // Pop out the first valid walk entry.
walk := walks[0]
walks = walks[1:]
if len(walks) > 0 {
t.pool[params] = walks
} else {
delete(t.pool, params)
}
walk.endTimerCh <- struct{}{}
return walk.resultCh, walk.endWalkCh
} }
// Set - adds a treeWalk to the treeWalkPool. // Set - adds a treeWalk to the treeWalkPool.
@ -129,6 +127,7 @@ func (t *TreeWalkPool) Set(params listParams, resultCh chan TreeWalkResult, endW
// Invalidate and delete oldest. // Invalidate and delete oldest.
if walks, ok := t.pool[oldest]; ok && len(walks) > 0 { if walks, ok := t.pool[oldest]; ok && len(walks) > 0 {
endCh := walks[0].endTimerCh endCh := walks[0].endTimerCh
endWalkCh := walks[0].endWalkCh
if len(walks) > 1 { if len(walks) > 1 {
// Move walks forward // Move walks forward
copy(walks, walks[1:]) copy(walks, walks[1:])
@ -140,6 +139,7 @@ func (t *TreeWalkPool) Set(params listParams, resultCh chan TreeWalkResult, endW
} }
select { select {
case endCh <- struct{}{}: case endCh <- struct{}{}:
close(endWalkCh)
default: default:
} }
} else { } else {
@ -156,6 +156,7 @@ func (t *TreeWalkPool) Set(params listParams, resultCh chan TreeWalkResult, endW
endWalkCh: endWalkCh, endWalkCh: endWalkCh,
endTimerCh: endTimerCh, endTimerCh: endTimerCh,
} }
// Append new walk info. // Append new walk info.
walks := t.pool[params] walks := t.pool[params]
if len(walks) < treeWalkSameEntryLimit { if len(walks) < treeWalkSameEntryLimit {
@ -164,6 +165,7 @@ func (t *TreeWalkPool) Set(params listParams, resultCh chan TreeWalkResult, endW
// We are at limit, invalidate oldest, move list down and add new as last. // We are at limit, invalidate oldest, move list down and add new as last.
select { select {
case walks[0].endTimerCh <- struct{}{}: case walks[0].endTimerCh <- struct{}{}:
close(walks[0].endWalkCh)
default: default:
} }
copy(walks, walks[1:]) copy(walks, walks[1:])
@ -171,7 +173,7 @@ func (t *TreeWalkPool) Set(params listParams, resultCh chan TreeWalkResult, endW
} }
// Timer go-routine which times out after t.timeOut seconds. // Timer go-routine which times out after t.timeOut seconds.
go func(endTimerCh <-chan struct{}) { go func(endTimerCh <-chan struct{}, walkInfo treeWalk) {
select { select {
// Wait until timeOut // Wait until timeOut
case <-time.After(t.timeOut): case <-time.After(t.timeOut):
@ -205,5 +207,5 @@ func (t *TreeWalkPool) Set(params listParams, resultCh chan TreeWalkResult, endW
case <-endTimerCh: case <-endTimerCh:
return return
} }
}(endTimerCh) }(endTimerCh, walkInfo)
} }

Loading…
Cancel
Save