From 289e1d8b2a7e7470a8d957a1023e809ec50a3f89 Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Wed, 17 Feb 2021 15:34:42 -0800 Subject: [PATCH] fix: reduce crawler memory usage by orders of magnitude (#11556) Currently the crawler waits for an entire readdir call to return before it processes usage, lifecycle, replication and healing - instead we should pass the applicator all the way down to avoid building any special stack for all the contents in a single directory. This allows for - no need to remember the entire list of entries per directory before applying the required functions - no need to wait for the entire readdir() call to finish before applying the required functions --- cmd/data-scanner.go | 8 ++++---- cmd/disk-cache-backend.go | 9 ++------- cmd/fastwalk.go | 38 ------------------------------------ cmd/metacache-server-pool.go | 2 +- cmd/os-readdir_other.go | 13 +++++++++--- cmd/os-readdir_unix.go | 16 +++++++++------ cmd/os-readdir_windows.go | 15 ++++++++++---- cmd/storage-errors.go | 9 +++++++++ 8 files changed, 47 insertions(+), 63 deletions(-) delete mode 100644 cmd/fastwalk.go diff --git a/cmd/data-scanner.go b/cmd/data-scanner.go index b8727e093..a2ff06ec4 100644 --- a/cmd/data-scanner.go +++ b/cmd/data-scanner.go @@ -407,19 +407,19 @@ func (f *folderScanner) scanQueuedLevels(ctx context.Context, folders []cachedFo if f.dataUsageCrawlDebug { console.Debugf(scannerLogPrefix+" no bucket (%s,%s)\n", f.root, entName) } - return nil + return errDoneForNow } if isReservedOrInvalidBucket(bucket, false) { if f.dataUsageCrawlDebug { console.Debugf(scannerLogPrefix+" invalid bucket: %v, entry: %v\n", bucket, entName) } - return nil + return errDoneForNow } select { case <-done: - return ctx.Err() + return errDoneForNow default: } @@ -682,7 +682,7 @@ func (f *folderScanner) deepScanFolder(ctx context.Context, folder cachedFolder, addDir = func(entName string, typ os.FileMode) error { select { case <-done: - return ctx.Err() + return errDoneForNow default: } diff 
--git a/cmd/disk-cache-backend.go b/cmd/disk-cache-backend.go index 6886c791c..5496bbd26 100644 --- a/cmd/disk-cache-backend.go +++ b/cmd/disk-cache-backend.go @@ -23,7 +23,6 @@ import ( "crypto/rand" "encoding/base64" "encoding/hex" - "errors" "fmt" "io" "io/ioutil" @@ -269,10 +268,6 @@ func (c *diskCache) toClear() uint64 { return bytesToClear(int64(di.Total), int64(di.Free), uint64(c.quotaPct), uint64(c.lowWatermark), uint64(c.highWatermark)) } -var ( - errDoneForNow = errors.New("done for now") -) - func (c *diskCache) purgeWait(ctx context.Context) { for { select { @@ -382,7 +377,7 @@ func (c *diskCache) purge(ctx context.Context) { return nil } - if err := readDirFilterFn(c.dir, filterFn); err != nil { + if err := readDirFn(c.dir, filterFn); err != nil { logger.LogIf(ctx, err) return } @@ -1025,7 +1020,7 @@ func (c *diskCache) scanCacheWritebackFailures(ctx context.Context) { return nil } - if err := readDirFilterFn(c.dir, filterFn); err != nil { + if err := readDirFn(c.dir, filterFn); err != nil { logger.LogIf(ctx, err) return } diff --git a/cmd/fastwalk.go b/cmd/fastwalk.go deleted file mode 100644 index ba1806e55..000000000 --- a/cmd/fastwalk.go +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// This code is imported from "golang.org/x/tools/internal/fastwalk", -// only fastwalk.go is imported since we already implement readDir() -// with some little tweaks. 
- -package cmd - -import ( - "errors" - "os" - "strings" -) - -var errSkipFile = errors.New("fastwalk: skip this file") - -func readDirFn(dirName string, fn func(entName string, typ os.FileMode) error) error { - fis, err := readDir(dirName) - if err != nil { - if osIsNotExist(err) || err == errFileNotFound { - return nil - } - return err - } - for _, fi := range fis { - var mode os.FileMode - if strings.HasSuffix(fi, SlashSeparator) { - mode |= os.ModeDir - } - - if err = fn(fi, mode); err != nil { - return err - } - } - return nil -} diff --git a/cmd/metacache-server-pool.go b/cmd/metacache-server-pool.go index a29330e2b..6dea435c7 100644 --- a/cmd/metacache-server-pool.go +++ b/cmd/metacache-server-pool.go @@ -33,7 +33,7 @@ import ( func renameAllBucketMetacache(epPath string) error { // Rename all previous `.minio.sys/buckets//.metacache` to // to `.minio.sys/tmp/` for deletion. - return readDirFilterFn(pathJoin(epPath, minioMetaBucket, bucketMetaPrefix), func(name string, typ os.FileMode) error { + return readDirFn(pathJoin(epPath, minioMetaBucket, bucketMetaPrefix), func(name string, typ os.FileMode) error { if typ == os.ModeDir { tmpMetacacheOld := pathJoin(epPath, minioMetaTmpBucket+"-old", mustGetUUID()) if err := renameAll(pathJoin(epPath, minioMetaBucket, metacachePrefixForID(name, slashSeparator)), diff --git a/cmd/os-readdir_other.go b/cmd/os-readdir_other.go index 60e1fd86d..79f761127 100644 --- a/cmd/os-readdir_other.go +++ b/cmd/os-readdir_other.go @@ -29,11 +29,15 @@ func readDir(dirPath string) (entries []string, err error) { return readDirN(dirPath, -1) } -// readDir applies the filter function on each entries at dirPath, doesn't recurse into -// the directory itself. -func readDirFilterFn(dirPath string, filter func(name string, typ os.FileMode) error) error { +// readDirFn applies the fn() function on each entries at dirPath, doesn't recurse into +// the directory itself, if the dirPath doesn't exist this function doesn't return +// an error. 
+func readDirFn(dirPath string, filter func(name string, typ os.FileMode) error) error { d, err := os.Open(dirPath) if err != nil { + if osErrToFileErr(err) == errFileNotFound { + return nil + } return osErrToFileErr(err) } defer d.Close() @@ -46,6 +50,9 @@ func readDirFilterFn(dirPath string, filter func(name string, typ os.FileMode) e if err == io.EOF { break } + if osErrToFileErr(err) == errFileNotFound { + return nil + } return osErrToFileErr(err) } for _, fi := range fis { diff --git a/cmd/os-readdir_unix.go b/cmd/os-readdir_unix.go index aca99911e..76ecf193c 100644 --- a/cmd/os-readdir_unix.go +++ b/cmd/os-readdir_unix.go @@ -84,11 +84,15 @@ func readDir(dirPath string) (entries []string, err error) { return readDirN(dirPath, -1) } -// readDir applies the filter function on each entries at dirPath, doesn't recurse into -// the directory itself. -func readDirFilterFn(dirPath string, filter func(name string, typ os.FileMode) error) error { +// readDirFn applies the fn() function on each entries at dirPath, doesn't recurse into +// the directory itself, if the dirPath doesn't exist this function doesn't return +// an error. +func readDirFn(dirPath string, fn func(name string, typ os.FileMode) error) error { f, err := os.Open(dirPath) if err != nil { + if osErrToFileErr(err) == errFileNotFound { + return nil + } return osErrToFileErr(err) } defer f.Close() @@ -103,7 +107,7 @@ func readDirFilterFn(dirPath string, filter func(name string, typ os.FileMode) e nbuf, err = syscall.ReadDirent(int(f.Fd()), buf) if err != nil { if isSysErrNotDir(err) { - return errFileNotFound + return nil } return err } @@ -122,8 +126,8 @@ func readDirFilterFn(dirPath string, filter func(name string, typ os.FileMode) e if typ&os.ModeSymlink == os.ModeSymlink { continue } - if err = filter(string(name), typ); err == errDoneForNow { - // filtering requested to return by caller. + if err = fn(string(name), typ); err == errDoneForNow { + // fn() requested to return by caller. 
return nil } } diff --git a/cmd/os-readdir_windows.go b/cmd/os-readdir_windows.go index 74e98a734..cc79d1d13 100644 --- a/cmd/os-readdir_windows.go +++ b/cmd/os-readdir_windows.go @@ -29,11 +29,15 @@ func readDir(dirPath string) (entries []string, err error) { return readDirN(dirPath, -1) } -// readDir applies the filter function on each entries at dirPath, doesn't recurse into -// the directory itself. -func readDirFilterFn(dirPath string, filter func(name string, typ os.FileMode) error) error { +// readDirFn applies the fn() function on each entries at dirPath, doesn't recurse into +// the directory itself, if the dirPath doesn't exist this function doesn't return +// an error. +func readDirFn(dirPath string, filter func(name string, typ os.FileMode) error) error { f, err := os.Open(dirPath) if err != nil { + if osErrToFileErr(err) == errFileNotFound { + return nil + } return osErrToFileErr(err) } defer f.Close() @@ -45,6 +49,9 @@ func readDirFilterFn(dirPath string, filter func(name string, typ os.FileMode) e if e == syscall.ERROR_NO_MORE_FILES { break } else { + if isSysErrPathNotFound(e) { + return nil + } return osErrToFileErr(&os.PathError{ Op: "FindNextFile", Path: dirPath, @@ -69,7 +76,7 @@ func readDirFilterFn(dirPath string, filter func(name string, typ os.FileMode) e } } - return err + return nil } // Return N entries at the directory dirPath. If count is -1, return all entries diff --git a/cmd/storage-errors.go b/cmd/storage-errors.go index 86e1216cf..38a75e5ee 100644 --- a/cmd/storage-errors.go +++ b/cmd/storage-errors.go @@ -16,6 +16,8 @@ package cmd +import "errors" + // errUnexpected - unexpected error, requires manual intervention. var errUnexpected = StorageErr("unexpected error, please report this issue at https://github.com/minio/minio/issues") @@ -104,6 +106,13 @@ var errLessData = StorageErr("less data available than what was requested") // errMoreData = returned when more data was sent by the caller than what it was supposed to. 
var errMoreData = StorageErr("more data was sent than what was advertised") +// indicates readDirFn to return without further applying the fn() +var errDoneForNow = errors.New("done for now") + +// errSkipFile returned by the fn() for readDirFn() when it needs +// to proceed to next entry. +var errSkipFile = errors.New("skip this file") + // StorageErr represents error generated by xlStorage call. type StorageErr string