From b3f22eac565b2d5077cb0ec21f488d8f4cd1f4b8 Mon Sep 17 00:00:00 2001
From: Harshavardhana
Date: Tue, 14 May 2019 13:49:10 -0700
Subject: [PATCH] Offload listing to posix layer (#7611)

This PR adds one API WalkCh which sorts and sends the list over the network.
Each disk walks independently in a sorted manner.

---
 cmd/merge-walk-pool.go      | 143 +++++++++++++++++++++
 cmd/merge-walk-pool_test.go | 103 +++++++++++++++
 cmd/naughty-disk_test.go    |   7 +
 cmd/posix.go                |  86 ++++++++++++-
 cmd/storage-datatypes.go    |  15 +++
 cmd/storage-interface.go    |   4 +
 cmd/storage-rest-client.go  |  37 ++++++
 cmd/storage-rest-common.go  |   5 +-
 cmd/storage-rest-server.go  |  54 ++++++++
 cmd/tree-walk-pool.go       |  14 +-
 cmd/xl-sets.go              | 249 +++++++++++++++++++++++++++++++++---
 11 files changed, 692 insertions(+), 25 deletions(-)
 create mode 100644 cmd/merge-walk-pool.go
 create mode 100644 cmd/merge-walk-pool_test.go

diff --git a/cmd/merge-walk-pool.go b/cmd/merge-walk-pool.go
new file mode 100644
index 000000000..b09fd6740
--- /dev/null
+++ b/cmd/merge-walk-pool.go
@@ -0,0 +1,143 @@
+/*
+ * MinIO Cloud Storage, (C) 2019 MinIO, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package cmd
+
+import (
+	"reflect"
+	"sync"
+	"time"
+)
+
+const (
+	globalMergeLookupTimeout = time.Minute * 1 // 1 minute.
+)
+
+// mergeWalk - represents the go-routine that does the merge walk.
+type mergeWalk struct {
+	entryChs   []FileInfoCh
+	endWalkCh  chan struct{}   // To signal when mergeWalk go-routine should end.
+	endTimerCh chan<- struct{} // To signal when timer go-routine should end.
+}
+
+// MergeWalkPool - pool of mergeWalk go-routines.
+// A mergeWalk is added to the pool by Set() and removed either by
+// doing a Release() or if the concerned timer goes off.
+// mergeWalkPool's purpose is to maintain active mergeWalk go-routines in a map so that
+// they can be looked up across related list calls.
+type MergeWalkPool struct {
+	pool    map[listParams][]mergeWalk
+	timeOut time.Duration
+	lock    *sync.Mutex
+}
+
+// NewMergeWalkPool - initialize new merge walk pool.
+func NewMergeWalkPool(timeout time.Duration) *MergeWalkPool {
+	tPool := &MergeWalkPool{
+		pool:    make(map[listParams][]mergeWalk),
+		timeOut: timeout,
+		lock:    &sync.Mutex{},
+	}
+	return tPool
+}
+
+// Release - selects a mergeWalk from the pool based on the input
+// listParams, removes it from the pool, and returns its FileInfo
+// channels and end-walk channel.
+// Returns nil if listParams does not have an associated mergeWalk.
+func (t MergeWalkPool) Release(params listParams) ([]FileInfoCh, chan struct{}) {
+	t.lock.Lock()
+	defer t.lock.Unlock()
+	walks, ok := t.pool[params] // Pick the valid walks.
+	if ok {
+		if len(walks) > 0 {
+			// Pop out the first valid walk entry.
+			walk := walks[0]
+			walks = walks[1:]
+			if len(walks) > 0 {
+				t.pool[params] = walks
+			} else {
+				delete(t.pool, params)
+			}
+			walk.endTimerCh <- struct{}{}
+			return walk.entryChs, walk.endWalkCh
+		}
+	}
+	// Release returns nil if params is not found.
+	return nil, nil
+}
+
+// Set - adds a mergeWalk to the mergeWalkPool.
+// Also starts a timer go-routine that ends when:
+// 1) time.After() expires after t.timeOut seconds.
+// The expiration is needed so that the mergeWalk go-routine resources are freed after a timeout
+// if the S3 client does only partial listing of objects.
+// 2) Release() signals the timer go-routine to end on endTimerCh.
+// During listing the timer should not time out and end the mergeWalk go-routine, hence the
+// timer go-routine should be ended.
+func (t MergeWalkPool) Set(params listParams, resultChs []FileInfoCh, endWalkCh chan struct{}) {
+	t.lock.Lock()
+	defer t.lock.Unlock()
+
+	// Should be a buffered channel so that Release() never blocks.
+	endTimerCh := make(chan struct{}, 1)
+
+	walkInfo := mergeWalk{
+		entryChs:   resultChs,
+		endWalkCh:  endWalkCh,
+		endTimerCh: endTimerCh,
+	}
+
+	// Append new walk info.
+	t.pool[params] = append(t.pool[params], walkInfo)
+
+	// Timer go-routine which times out after t.timeOut seconds.
+	go func(endTimerCh <-chan struct{}, walkInfo mergeWalk) {
+		select {
+		// Wait until timeOut
+		case <-time.After(t.timeOut):
+			// Timeout has expired. Remove the mergeWalk from mergeWalkPool and
+			// end the mergeWalk go-routine.
+			t.lock.Lock()
+			walks, ok := t.pool[params]
+			if ok {
+				// Trick of filtering without allocating
+				// https://github.com/golang/go/wiki/SliceTricks#filtering-without-allocating
+				nwalks := walks[:0]
+				// Look for walkInfo, remove it from the walks list.
+				for _, walk := range walks {
+					if !reflect.DeepEqual(walk, walkInfo) {
+						nwalks = append(nwalks, walk)
+					}
+				}
+				if len(nwalks) == 0 {
+					// No more mergeWalk go-routines associated with listParams
+					// hence remove map entry.
+					delete(t.pool, params)
+				} else {
+					// There are more mergeWalk go-routines associated with listParams
+					// hence save the list in the map.
+					t.pool[params] = nwalks
+				}
+			}
+			// Signal the mergeWalk go-routine to die.
+			close(endWalkCh)
+			t.lock.Unlock()
+		case <-endTimerCh:
+			return
+		}
+	}(endTimerCh, walkInfo)
+}
diff --git a/cmd/merge-walk-pool_test.go b/cmd/merge-walk-pool_test.go
new file mode 100644
index 000000000..def12291a
--- /dev/null
+++ b/cmd/merge-walk-pool_test.go
@@ -0,0 +1,103 @@
+/*
+ * MinIO Cloud Storage, (C) 2019 MinIO, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package cmd
+
+import (
+	"testing"
+	"time"
+)
+
+// Test if the merge walk go-routine is removed from the pool after timeout
+// and that it is available in the pool before the timeout.
+func TestMergeWalkPoolBasic(t *testing.T) { + // Create a treeWalkPool + tw := NewMergeWalkPool(1 * time.Second) + + // Create sample params + params := listParams{ + bucket: "test-bucket", + } + + endWalkCh := make(chan struct{}) + // Add a treeWalk to the pool + tw.Set(params, []FileInfoCh{}, endWalkCh) + + // Wait for treeWalkPool timeout to happen + <-time.After(2 * time.Second) + if c1, _ := tw.Release(params); c1 != nil { + t.Error("treeWalk go-routine must have been freed") + } + + // Add the treeWalk back to the pool + endWalkCh = make(chan struct{}) + tw.Set(params, []FileInfoCh{}, endWalkCh) + + // Release the treeWalk before timeout + select { + case <-time.After(1 * time.Second): + break + default: + if c1, _ := tw.Release(params); c1 == nil { + t.Error("treeWalk go-routine got freed before timeout") + } + } +} + +// Test if multiple merge walkers for the same listParams are managed as expected by the pool. +func TestManyMergeWalksSameParam(t *testing.T) { + // Create a treeWalkPool. + tw := NewMergeWalkPool(5 * time.Second) + + // Create sample params. + params := listParams{ + bucket: "test-bucket", + } + + select { + // This timeout is an upper-bound. This is started + // before the first treeWalk go-routine's timeout period starts. + case <-time.After(5 * time.Second): + break + default: + // Create many treeWalk go-routines for the same params. + for i := 0; i < 10; i++ { + endWalkCh := make(chan struct{}) + walkChs := make([]FileInfoCh, 0) + tw.Set(params, walkChs, endWalkCh) + } + + tw.lock.Lock() + if walks, ok := tw.pool[params]; ok { + if len(walks) != 10 { + t.Error("There aren't as many walks as were Set") + } + } + tw.lock.Unlock() + for i := 0; i < 10; i++ { + tw.lock.Lock() + if walks, ok := tw.pool[params]; ok { + // Before ith Release we should have 10-i treeWalk go-routines. + if 10-i != len(walks) { + t.Error("There aren't as many walks as were Set") + } + } + tw.lock.Unlock() + tw.Release(params) + } + } + +} diff --git a/cmd/naughty-disk_test.go b/cmd/naughty-disk_test.go index 6b20eba15..4a48c9791 100644 --- a/cmd/naughty-disk_test.go +++ b/cmd/naughty-disk_test.go @@ -111,6 +111,13 @@ func (d *naughtyDisk) DeleteVol(volume string) (err error) { return d.disk.DeleteVol(volume) } +func (d *naughtyDisk) Walk(volume, path, marker string, recursive bool, leafFile string, readMetadataFn readMetadataFunc, endWalkCh chan struct{}) (chan FileInfo, error) { + if err := d.calcError(); err != nil { + return nil, err + } + return d.disk.Walk(volume, path, marker, recursive, leafFile, readMetadataFn, endWalkCh) +} + func (d *naughtyDisk) ListDir(volume, path string, count int, leafFile string) (entries []string, err error) { if err := d.calcError(); err != nil { return []string{}, err diff --git a/cmd/posix.go b/cmd/posix.go index 5ee157a2c..4be0d1447 100644 --- a/cmd/posix.go +++ b/cmd/posix.go @@ -1,5 +1,5 @@ /* - * MinIO Cloud Storage, (C) 2016, 2017, 2018 MinIO, Inc. + * MinIO Cloud Storage, (C) 2016-2019 MinIO, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ import ( slashpath "path" "path/filepath" "runtime" + "sort" "strings" "sync" "sync/atomic" @@ -45,7 +46,9 @@ const ( diskMinTotalSpace = diskMinFreeSpace // Min 900MiB total space. maxAllowedIOError = 5 posixWriteBlockSize = 4 * humanize.MiByte - directioAlignSize = 4096 // DirectIO alignment needs to be 4K. Defined here as directio.AlignSize is defined as 0 in MacOS causing divide by 0 error. 
+	// DirectIO alignment needs to be 4K. Defined here as
+	// directio.AlignSize is defined as 0 in MacOS causing divide by 0 error.
+	directioAlignSize = 4096
 )
 
 // isValidVolname verifies a volname name in accordance with object
@@ -642,6 +645,85 @@ func (s *posix) DeleteVol(volume string) (err error) {
 	return nil
 }
 
+// Walk - is a sorted walker which returns file entries in lexically
+// sorted order, along with metadata about each of those entries.
+func (s *posix) Walk(volume, dirPath, marker string, recursive bool, leafFile string,
+	readMetadataFn readMetadataFunc, endWalkCh chan struct{}) (ch chan FileInfo, err error) {
+
+	defer func() {
+		if err == errFaultyDisk {
+			atomic.AddInt32(&s.ioErrCount, 1)
+		}
+	}()
+
+	if atomic.LoadInt32(&s.ioErrCount) > maxAllowedIOError {
+		return nil, errFaultyDisk
+	}
+
+	if err = s.checkDiskFound(); err != nil {
+		return nil, err
+	}
+
+	// Verify if volume is valid and it exists.
+	volumeDir, err := s.getVolDir(volume)
+	if err != nil {
+		return nil, err
+	}
+
+	// Stat a volume entry.
+	_, err = os.Stat(volumeDir)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return nil, errVolumeNotFound
+		} else if isSysErrIO(err) {
+			return nil, errFaultyDisk
+		}
+		return nil, err
+	}
+
+	ch = make(chan FileInfo)
+	go func() {
+		defer close(ch)
+		listDir := func(volume, dirPath, dirEntry string) (entries []string) {
+			entries, err := s.ListDir(volume, dirPath, -1, leafFile)
+			if err != nil {
+				return
+			}
+			sort.Strings(entries)
+			return filterMatchingPrefix(entries, dirEntry)
+		}
+
+		walkResultCh := startTreeWalk(context.Background(), volume, dirPath, marker, recursive, listDir, endWalkCh)
+		for {
+			walkResult, ok := <-walkResultCh
+			if !ok {
+				return
+			}
+			var fi FileInfo
+			if hasSuffix(walkResult.entry, slashSeparator) {
+				fi = FileInfo{
+					Volume: volume,
+					Name:   walkResult.entry,
+					Mode:   os.ModeDir,
+				}
+			} else {
+				buf, err := s.ReadAll(volume, pathJoin(walkResult.entry, leafFile))
+				if err != nil {
+					continue
+				}
+				fi = readMetadataFn(buf, volume, walkResult.entry)
+			}
+			select {
+			case ch <- fi:
+			case <-endWalkCh:
+				return
+			}
+		}
+	}()
+
+	return ch, nil
+}
+
 // ListDir - return all the entries at the given directory path.
 // If an entry is a directory it will be returned with a trailing "/".
 func (s *posix) ListDir(volume, dirPath string, count int, leafFile string) (entries []string, err error) {
diff --git a/cmd/storage-datatypes.go b/cmd/storage-datatypes.go
index a2bdb760e..cfa326a28 100644
--- a/cmd/storage-datatypes.go
+++ b/cmd/storage-datatypes.go
@@ -30,6 +30,13 @@ type VolInfo struct {
 	Created time.Time
 }
 
+// FilesInfo represents a list of files; additionally it
+// indicates whether the listing is truncated.
+type FilesInfo struct {
+	Files       []FileInfo
+	IsTruncated bool
+}
+
 // FileInfo - represents file stat information.
 type FileInfo struct {
 	// Name of the volume.
@@ -46,4 +53,12 @@ type FileInfo struct {
 
 	// File mode bits.
 	Mode os.FileMode
+
+	// File metadata
+	Metadata map[string]string
+
+	// All the parts per object.
+	Parts []ObjectPartInfo
+
+	Quorum int
 }
diff --git a/cmd/storage-interface.go b/cmd/storage-interface.go
index 55dc66362..0502c6952 100644
--- a/cmd/storage-interface.go
+++ b/cmd/storage-interface.go
@@ -38,6 +38,10 @@ type StorageAPI interface {
 	StatVol(volume string) (vol VolInfo, err error)
 	DeleteVol(volume string) (err error)
 
+	// Walk in sorted order directly on disk.
+ Walk(volume, dirPath string, marker string, recursive bool, leafFile string, + readMetadataFn readMetadataFunc, endWalkCh chan struct{}) (chan FileInfo, error) + // File operations. ListDir(volume, dirPath string, count int, leafFile string) ([]string, error) ReadFile(volume string, path string, offset int64, buf []byte, verifier *BitrotVerifier) (n int64, err error) diff --git a/cmd/storage-rest-client.go b/cmd/storage-rest-client.go index 48d6721ff..ede29de94 100644 --- a/cmd/storage-rest-client.go +++ b/cmd/storage-rest-client.go @@ -330,6 +330,43 @@ func (client *storageRESTClient) ReadFile(volume, path string, offset int64, buf return int64(n), err } +func (client *storageRESTClient) Walk(volume, dirPath, marker string, recursive bool, leafFile string, + readMetadataFn readMetadataFunc, endWalkCh chan struct{}) (chan FileInfo, error) { + values := make(url.Values) + values.Set(storageRESTVolume, volume) + values.Set(storageRESTDirPath, dirPath) + values.Set(storageRESTMarkerPath, marker) + values.Set(storageRESTRecursive, strconv.FormatBool(recursive)) + values.Set(storageRESTLeafFile, leafFile) + respBody, err := client.call(storageRESTMethodWalk, values, nil, -1) + if err != nil { + return nil, err + } + + ch := make(chan FileInfo) + go func() { + defer close(ch) + defer http.DrainBody(respBody) + + decoder := gob.NewDecoder(respBody) + for { + var fi FileInfo + if gerr := decoder.Decode(&fi); gerr != nil { + // Upon error return + return + } + select { + case ch <- fi: + case <-endWalkCh: + return + } + + } + }() + + return ch, nil +} + // ListDir - lists a directory. func (client *storageRESTClient) ListDir(volume, dirPath string, count int, leafFile string) (entries []string, err error) { values := make(url.Values) diff --git a/cmd/storage-rest-common.go b/cmd/storage-rest-common.go index 7e087f42b..ca64d4a93 100644 --- a/cmd/storage-rest-common.go +++ b/cmd/storage-rest-common.go @@ -16,7 +16,7 @@ package cmd -const storageRESTVersion = "v5" +const storageRESTVersion = "v6" const storageRESTPath = minioReservedBucketPath + "/storage/" + storageRESTVersion + "/" const ( @@ -34,6 +34,7 @@ const ( storageRESTMethodReadFile = "readfile" storageRESTMethodReadFileStream = "readfilestream" storageRESTMethodListDir = "listdir" + storageRESTMethodWalk = "walk" storageRESTMethodDeleteFile = "deletefile" storageRESTMethodDeleteFileBulk = "deletefilebulk" storageRESTMethodRenameFile = "renamefile" @@ -51,7 +52,9 @@ const ( storageRESTOffset = "offset" storageRESTLength = "length" storageRESTCount = "count" + storageRESTMarkerPath = "marker" storageRESTLeafFile = "leaf-file" + storageRESTRecursive = "recursive" storageRESTBitrotAlgo = "bitrot-algo" storageRESTBitrotHash = "bitrot-hash" storageRESTInstanceID = "instance-id" diff --git a/cmd/storage-rest-server.go b/cmd/storage-rest-server.go index 702166ca8..797c34875 100644 --- a/cmd/storage-rest-server.go +++ b/cmd/storage-rest-server.go @@ -17,6 +17,7 @@ package cmd import ( + "context" "encoding/gob" "encoding/hex" "errors" @@ -369,6 +370,57 @@ func (s *storageRESTServer) ReadFileStreamHandler(w http.ResponseWriter, r *http w.(http.Flusher).Flush() } +// readMetadata func provides the function types for reading leaf metadata. 
+type readMetadataFunc func(buf []byte, volume, entry string) FileInfo + +func readMetadata(buf []byte, volume, entry string) FileInfo { + m, err := xlMetaV1UnmarshalJSON(context.Background(), buf) + if err != nil { + return FileInfo{} + } + return FileInfo{ + Volume: volume, + Name: entry, + ModTime: m.Stat.ModTime, + Size: m.Stat.Size, + Metadata: m.Meta, + Parts: m.Parts, + Quorum: m.Erasure.DataBlocks, + } +} + +// WalkHandler - remote caller to start walking at a requested directory path. +func (s *storageRESTServer) WalkHandler(w http.ResponseWriter, r *http.Request) { + if !s.IsValid(w, r) { + return + } + vars := mux.Vars(r) + volume := vars[storageRESTVolume] + dirPath := vars[storageRESTDirPath] + markerPath := vars[storageRESTMarkerPath] + recursive, err := strconv.ParseBool(vars[storageRESTRecursive]) + if err != nil { + s.writeErrorResponse(w, err) + return + } + leafFile := vars[storageRESTLeafFile] + + endWalkCh := make(chan struct{}) + defer close(endWalkCh) + + fch, err := s.storage.Walk(volume, dirPath, markerPath, recursive, leafFile, readMetadata, endWalkCh) + if err != nil { + s.writeErrorResponse(w, err) + return + } + defer w.(http.Flusher).Flush() + + encoder := gob.NewEncoder(w) + for fi := range fch { + encoder.Encode(&fi) + } +} + // ListDirHandler - list a directory. func (s *storageRESTServer) ListDirHandler(w http.ResponseWriter, r *http.Request) { if !s.IsValid(w, r) { @@ -479,6 +531,8 @@ func registerStorageRESTHandlers(router *mux.Router, endpoints EndpointList) { Queries(restQueries(storageRESTVolume, storageRESTFilePath, storageRESTOffset, storageRESTLength)...) subrouter.Methods(http.MethodPost).Path("/" + storageRESTMethodListDir).HandlerFunc(httpTraceHdrs(server.ListDirHandler)). Queries(restQueries(storageRESTVolume, storageRESTDirPath, storageRESTCount, storageRESTLeafFile)...) + subrouter.Methods(http.MethodPost).Path("/" + storageRESTMethodWalk).HandlerFunc(httpTraceHdrs(server.WalkHandler)). + Queries(restQueries(storageRESTVolume, storageRESTDirPath, storageRESTMarkerPath, storageRESTRecursive, storageRESTLeafFile)...) subrouter.Methods(http.MethodPost).Path("/" + storageRESTMethodDeleteFile).HandlerFunc(httpTraceHdrs(server.DeleteFileHandler)). Queries(restQueries(storageRESTVolume, storageRESTFilePath)...) subrouter.Methods(http.MethodPost).Path("/" + storageRESTMethodDeleteFileBulk).HandlerFunc(httpTraceHdrs(server.DeleteFileBulkHandler)). diff --git a/cmd/tree-walk-pool.go b/cmd/tree-walk-pool.go index 7ed4cc821..df09a5dec 100644 --- a/cmd/tree-walk-pool.go +++ b/cmd/tree-walk-pool.go @@ -18,6 +18,7 @@ package cmd import ( "errors" + "reflect" "sync" "time" ) @@ -127,20 +128,23 @@ func (t TreeWalkPool) Set(params listParams, resultCh chan TreeWalkResult, endWa t.lock.Lock() walks, ok := t.pool[params] if ok { + // Trick of filtering without allocating + // https://github.com/golang/go/wiki/SliceTricks#filtering-without-allocating + nwalks := walks[:0] // Look for walkInfo, remove it from the walks list. - for i, walk := range walks { - if walk == walkInfo { - walks = append(walks[:i], walks[i+1:]...) + for _, walk := range walks { + if !reflect.DeepEqual(walk, walkInfo) { + nwalks = append(nwalks, walk) } } - if len(walks) == 0 { + if len(nwalks) == 0 { // No more treeWalk go-routines associated with listParams // hence remove map entry. delete(t.pool, params) } else { // There are more treeWalk go-routines associated with listParams // hence save the list in the map. 
-			t.pool[params] = walks
+			t.pool[params] = nwalks
 		}
 	}
 	// Signal the treeWalk go-routine to die.
diff --git a/cmd/xl-sets.go b/cmd/xl-sets.go
index b45cc13e0..886269d87 100644
--- a/cmd/xl-sets.go
+++ b/cmd/xl-sets.go
@@ -23,7 +23,6 @@ import (
 	"io"
 	"net/http"
 	"sort"
-	"strings"
 	"sync"
 	"time"
 
@@ -76,8 +75,8 @@ type xlSets struct {
 	// Distribution algorithm of choice.
 	distributionAlgo string
 
-	// Pack level listObjects pool management.
-	listPool *TreeWalkPool
+	// Merge tree walk
+	pool *MergeWalkPool
 }
 
 // isConnected - checks if the endpoint is connected or not.
@@ -270,7 +269,7 @@ func newXLSets(endpoints EndpointList, format *formatXLV3, setCount int, drivesP
 		format:             format,
 		disksConnectDoneCh: make(chan struct{}),
 		distributionAlgo:   format.XL.DistributionAlgo,
-		listPool:           NewTreeWalkPool(globalLookupTimeout),
+		pool:               NewMergeWalkPool(globalMergeLookupTimeout),
 	}
 
 	mutex := newNSLock(globalIsDistXL)
@@ -698,7 +697,6 @@ func (s *xlSets) CopyObject(ctx context.Context, srcBucket, srcObject, destBucke
 }
 
 // Returns function "listDir" of the type listDirFunc.
-// isLeaf - is used by listDir function to check if an entry is a leaf or non-leaf entry.
 // disks - used for doing disk.ListDir(). Sets passes set of disks.
 func listDirSetsFactory(ctx context.Context, sets ...*xlObjects) ListDirFunc {
 	listDirInternal := func(bucket, prefixDir, prefixEntry string, disks []StorageAPI) (mergedEntries []string) {
@@ -765,23 +763,240 @@ func listDirSetsFactory(ctx context.Context, sets ...*xlObjects) ListDirFunc {
 	return listDir
 }
 
-// ListObjects - implements listing of objects across sets, each set is independently
-// listed and subsequently merge lexically sorted inside listDirSetsFactory(). Resulting
-// value through the walk channel receives the data properly lexically sorted.
-func (s *xlSets) ListObjects(ctx context.Context, bucket, prefix, marker, delimiter string, maxKeys int) (ListObjectsInfo, error) {
-	listDir := listDirSetsFactory(ctx, s.sets...)
+// FileInfoCh - file info channel
+type FileInfoCh struct {
+	Ch    chan FileInfo
+	Prev  FileInfo
+	Valid bool
+}
+
+// Pop - pops the cached entry if any, otherwise reads one from the channel.
+func (f *FileInfoCh) Pop() (fi FileInfo, ok bool) {
+	if f.Valid {
+		f.Valid = false
+		return f.Prev, true
+	} // No cached entries found, read from channel
+	f.Prev, ok = <-f.Ch
+	return f.Prev, ok
+}
+
+// Push - cache an entry, for Pop() later.
+func (f *FileInfoCh) Push(fi FileInfo) {
+	f.Prev = fi
+	f.Valid = true
+}
+
+// Calculate the least entry across multiple FileInfo channels; additionally
+// returns a boolean to indicate if the caller needs to call again.
+func leastEntry(entriesCh []FileInfoCh, readQuorum int) (FileInfo, bool) {
+	var entriesValid = make([]bool, len(entriesCh))
+	var entries = make([]FileInfo, len(entriesCh))
+	for i := range entriesCh {
+		entries[i], entriesValid[i] = entriesCh[i].Pop()
+	}
+
+	var isTruncated = false
+	for _, valid := range entriesValid {
+		if !valid {
+			continue
+		}
+		isTruncated = true
+		break
+	}
+
+	var lentry FileInfo
+	var found bool
+	for i, valid := range entriesValid {
+		if !valid {
+			continue
+		}
+		if !found {
+			lentry = entries[i]
+			found = true
+			continue
+		}
+		if entries[i].Name < lentry.Name {
+			lentry = entries[i]
+		}
+	}
+
+	// We haven't been able to find any least entry,
+	// this would mean that we don't have any valid entries left.
+	if !found {
+		return lentry, isTruncated
+	}
+
+	leastEntryCount := 0
+	for i, valid := range entriesValid {
+		if !valid {
+			continue
+		}
+
+		// Entries are duplicated across disks,
+		// we should simply skip such entries.
+		if lentry.Name == entries[i].Name && lentry.ModTime.Equal(entries[i].ModTime) {
+			leastEntryCount++
+			continue
+		}
+
+		// Push all entries which are lexically higher
+		// and will be returned later in Pop()
+		entriesCh[i].Push(entries[i])
+	}
+
+	quorum := lentry.Quorum
+	if quorum == 0 {
+		quorum = readQuorum
+	}
+
+	if leastEntryCount >= quorum {
+		return lentry, isTruncated
+	}
+
+	return leastEntry(entriesCh, readQuorum)
+}
+
+// mergeEntriesCh - merges FileInfo channels into entries, up to maxKeys.
+func mergeEntriesCh(entriesCh []FileInfoCh, maxKeys int, readQuorum int) (entries FilesInfo) {
+	for i := 0; i < maxKeys; {
+		var fi FileInfo
+		fi, entries.IsTruncated = leastEntry(entriesCh, readQuorum)
+		if !entries.IsTruncated {
+			break
+		}
+		entries.Files = append(entries.Files, fi)
+		i++
+	}
+	return entries
+}
 
-	var getObjectInfoDirs []func(context.Context, string, string) (ObjectInfo, error)
-	// Verify prefixes in all sets.
+// Starts a walk channel across all disks and returns a slice.
+func (s *xlSets) startMergeWalks(ctx context.Context, bucket, prefix, marker string, recursive bool, endWalkCh chan struct{}) []FileInfoCh {
+	var entryChs []FileInfoCh
 	for _, set := range s.sets {
-		getObjectInfoDirs = append(getObjectInfoDirs, set.getObjectInfoDir)
+		for _, disk := range set.getDisks() {
+			if disk == nil {
+				// Disk can be offline
+				continue
+			}
+			entryCh, err := disk.Walk(bucket, prefix, marker, recursive, xlMetaJSONFile, readMetadata, endWalkCh)
+			if err != nil {
+				// Disk walk returned error, ignore it.
+				continue
+			}
+			entryChs = append(entryChs, FileInfoCh{
+				Ch: entryCh,
+			})
+		}
+	}
+	return entryChs
+}
+
+// ListObjects - implements listing of objects across disks, each disk is independently
+// walked and merged at this layer. The resulting value sent through the merge process
+// is in lexically sorted order.
+func (s *xlSets) ListObjects(ctx context.Context, bucket, prefix, marker, delimiter string, maxKeys int) (loi ListObjectsInfo, err error) {
+	if err = checkListObjsArgs(ctx, bucket, prefix, marker, delimiter, s); err != nil {
+		return loi, err
+	}
+
+	// Marker is set, validate pre-condition.
+	if marker != "" {
+		// Marker not common with prefix is not implemented. Send an empty response
+		if !hasPrefix(marker, prefix) {
+			return loi, nil
+		}
+	}
+
+	// With max keys of zero we have reached eof, return right here.
+	if maxKeys == 0 {
+		return loi, nil
+	}
+
+	// For delimiter and prefix as '/' we do not list anything at all
+	// since according to s3 spec we stop at the 'delimiter'
+	// along with the prefix. On a flat namespace with 'prefix'
+	// as '/' we don't have any entries, since all the keys are
+	// of form 'keyName/...'
+	if delimiter == slashSeparator && prefix == slashSeparator {
+		return loi, nil
+	}
+
+	// Overflowing count - reset to maxObjectList.
+	if maxKeys < 0 || maxKeys > maxObjectList {
+		maxKeys = maxObjectList
 	}
 
-	var getObjectInfo = func(ctx context.Context, bucket string, entry string) (ObjectInfo, error) {
-		return s.getHashedSet(entry).getObjectInfo(ctx, bucket, entry)
+	// Default is recursive; if delimiter is set then list non-recursive.
+	recursive := true
+	if delimiter == slashSeparator {
+		recursive = false
+	}
+
+	entryChs, endWalkCh := s.pool.Release(listParams{bucket, recursive, marker, prefix})
+	if entryChs == nil {
+		endWalkCh = make(chan struct{})
+		entryChs = s.startMergeWalks(context.Background(), bucket, prefix, marker, recursive, endWalkCh)
+	}
+
+	entries := mergeEntriesCh(entryChs, maxKeys, s.drivesPerSet/2)
+	if len(entries.Files) == 0 {
+		return loi, nil
 	}
 
-	return listObjects(ctx, s, bucket, prefix, marker, delimiter, maxKeys, s.listPool, listDir, getObjectInfo, getObjectInfoDirs...)
+	loi.Objects = make([]ObjectInfo, 0, len(entries.Files))
+	loi.IsTruncated = entries.IsTruncated
+	if loi.IsTruncated {
+		loi.NextMarker = entries.Files[len(entries.Files)-1].Name
+	}
+
+	for _, entry := range entries.Files {
+		var objInfo ObjectInfo
+		if hasSuffix(entry.Name, slashSeparator) {
+			if !recursive {
+				loi.Prefixes = append(loi.Prefixes, entry.Name)
+				continue
+			}
+			objInfo = ObjectInfo{
+				Bucket: bucket,
+				Name:   entry.Name,
+				IsDir:  true,
+			}
+		} else {
+			objInfo = ObjectInfo{
+				IsDir:           false,
+				Bucket:          bucket,
+				Name:            entry.Name,
+				ModTime:         entry.ModTime,
+				Size:            entry.Size,
+				ContentType:     entry.Metadata["content-type"],
+				ContentEncoding: entry.Metadata["content-encoding"],
+			}
+
+			// Extract etag from metadata.
+			objInfo.ETag = extractETag(entry.Metadata)
+
+			// All the parts per object.
+			objInfo.Parts = entry.Parts
+
+			// etag/md5Sum has already been extracted. We need to
+			// remove it to avoid it from appearing as part of
+			// response headers. e.g, X-Minio-* or X-Amz-*.
+			objInfo.UserDefined = cleanMetadata(entry.Metadata)
+
+			// Update storage class
+			if sc, ok := entry.Metadata[amzStorageClass]; ok {
+				objInfo.StorageClass = sc
+			} else {
+				objInfo.StorageClass = globalMinioDefaultStorageClass
+			}
+		}
+		loi.Objects = append(loi.Objects, objInfo)
+	}
+	if loi.IsTruncated {
+		s.pool.Set(listParams{bucket, recursive, loi.NextMarker, prefix}, entryChs, endWalkCh)
+	}
+	return loi, nil
 }
 
 func (s *xlSets) ListMultipartUploads(ctx context.Context, bucket, prefix, keyMarker, uploadIDMarker, delimiter string, maxUploads int) (result ListMultipartsInfo, err error) {
@@ -1301,7 +1516,7 @@ func (s *xlSets) HealObjects(ctx context.Context, bucket, prefix string, healObj
 			if !ok {
 				break
 			}
-			if err := healObjectFn(bucket, strings.TrimSuffix(walkResult.entry, slashSeparator+xlMetaJSONFile)); err != nil {
+			if err := healObjectFn(bucket, walkResult.entry); err != nil {
 				return toObjectErr(err, bucket, walkResult.entry)
 			}
 			if walkResult.end {
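For illustration, a minimal, self-contained sketch of the merge idea used by leastEntry/mergeEntriesCh above: each disk produces a lexically sorted stream of names on its own channel, and the reader repeatedly emits the least entry across all streams while skipping duplicates seen on other disks. The names entryStream and mergeSorted below are hypothetical simplifications; the actual code works on FileInfo values and additionally requires a read quorum before emitting an entry.

package main

import "fmt"

// entryStream buffers one pending entry popped from its channel so the
// merge can "peek" at it without losing it (same idea as FileInfoCh.Push/Pop).
type entryStream struct {
	ch    chan string
	prev  string
	valid bool
}

// pop returns the buffered entry if present, otherwise reads from the channel.
func (s *entryStream) pop() (string, bool) {
	if s.valid {
		s.valid = false
		return s.prev, true
	}
	v, ok := <-s.ch
	s.prev = v
	return v, ok
}

// push buffers an entry so the next pop returns it again.
func (s *entryStream) push(v string) {
	s.prev = v
	s.valid = true
}

// mergeSorted peeks at the head of every stream, emits the lexically least
// entry, drops duplicates of it held by other streams, and repeats until
// max entries are collected or every stream is drained.
func mergeSorted(streams []*entryStream, max int) []string {
	var out []string
	for len(out) < max {
		var least string
		found := false
		for _, s := range streams {
			v, ok := s.pop()
			if !ok {
				continue
			}
			s.push(v) // keep it buffered; we are only peeking
			if !found || v < least {
				least, found = v, true
			}
		}
		if !found {
			break // all streams drained
		}
		// Consume the least entry from every stream that currently holds it,
		// so the same object reported by several disks is emitted only once.
		for _, s := range streams {
			if s.valid && s.prev == least {
				s.valid = false
			}
		}
		out = append(out, least)
	}
	return out
}

func main() {
	mk := func(names ...string) *entryStream {
		ch := make(chan string, len(names))
		for _, n := range names {
			ch <- n
		}
		close(ch)
		return &entryStream{ch: ch}
	}
	// Two "disks" holding overlapping, already sorted listings.
	d1 := mk("a.txt", "b.txt", "d.txt")
	d2 := mk("a.txt", "c.txt", "d.txt")
	fmt.Println(mergeSorted([]*entryStream{d1, d2}, 10))
	// Prints: [a.txt b.txt c.txt d.txt]
}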