fix: add support for O_DIRECT reads for erasure backends (#10718)

4 years ago · 3a2f89b3c0
parent 6135f072d2
commit 3a2f89b3c0
3 changed files with 171 additions and 46 deletions
--- a/cmd/config/storageclass/storage-class.go
+++ b/cmd/config/storageclass/storage-class.go
@ -18,6 +18,7 @@ package storageclass

 import (
 	"encoding/json"
+	"errors"
 	"fmt"
 	"strconv"
 	"strings"
@ -32,17 +33,26 @@ const (
 	RRS = "REDUCED_REDUNDANCY"
 	// Standard storage class
 	STANDARD = "STANDARD"
+	// DMA storage class
+	DMA = "DMA"
+
+	// Valid values are "write" and "read-write"
+	DMAWrite     = "write"
+	DMAReadWrite = "read-write"
 )

 // Standard constats for config info storage class
 const (
 	ClassStandard = "standard"
 	ClassRRS      = "rrs"
+	ClassDMA      = "dma"

 	// Reduced redundancy storage class environment variable
 	RRSEnv = "MINIO_STORAGE_CLASS_RRS"
 	// Standard storage class environment variable
 	StandardEnv = "MINIO_STORAGE_CLASS_STANDARD"
+	// DMA storage class environment variable
+	DMAEnv = "MINIO_STORAGE_CLASS_DMA"

 	// Supported storage class scheme is EC
 	schemePrefix = "EC"
@ -52,6 +62,9 @@ const (

 	// Default RRS parity is always minimum parity.
 	defaultRRSParity = minParityDisks
+
+	// Default DMA value
+	defaultDMA = DMAWrite
 )

 // DefaultKVS - default storage class config
@ -65,18 +78,24 @@ var (
 			Key:   ClassRRS,
 			Value: "EC:2",
 		},
+		config.KV{
+			Key:   ClassDMA,
+			Value: defaultDMA,
+		},
 	}
 )

 // StorageClass - holds storage class information
 type StorageClass struct {
 	Parity int
+	DMA    string
 }

 // Config storage class configuration
 type Config struct {
 	Standard StorageClass `json:"standard"`
 	RRS      StorageClass `json:"rrs"`
+	DMA      StorageClass `json:"dma"`
 }

 // UnmarshalJSON - Validate SS and RRS parity when unmarshalling JSON.
@ -93,7 +112,7 @@ func (sCfg *Config) UnmarshalJSON(data []byte) error {
 // IsValid - returns true if input string is a valid
 // storage class kind supported.
 func IsValid(sc string) bool {
-	return sc == RRS || sc == STANDARD
+	return sc == RRS || sc == STANDARD || sc == DMA
 }

 // UnmarshalText unmarshals storage class from its textual form into
@ -103,6 +122,14 @@ func (sc *StorageClass) UnmarshalText(b []byte) error {
 	if scStr == "" {
 		return nil
 	}
+	if scStr == DMAWrite {
+		sc.DMA = DMAWrite
+		return nil
+	}
+	if scStr == DMAReadWrite {
+		sc.DMA = DMAReadWrite
+		return nil
+	}
 	s, err := parseStorageClass(scStr)
 	if err != nil {
 		return err
@ -116,14 +143,14 @@ func (sc *StorageClass) MarshalText() ([]byte, error) {
 	if sc.Parity != 0 {
 		return []byte(fmt.Sprintf("%s:%d", schemePrefix, sc.Parity)), nil
 	}
-	return []byte(""), nil
+	return []byte(sc.DMA), nil
 }

 func (sc *StorageClass) String() string {
 	if sc.Parity != 0 {
 		return fmt.Sprintf("%s:%d", schemePrefix, sc.Parity)
 	}
-	return ""
+	return sc.DMA
 }

 // Parses given storageClassEnv and returns a storageClass structure.
@ -212,6 +239,11 @@ func (sCfg Config) GetParityForSC(sc string) (parity int) {
 	}
 }

+// GetDMA - returns DMA configuration.
+func (sCfg Config) GetDMA() string {
+	return sCfg.DMA.DMA
+}
+
 // Enabled returns if etcd is enabled.
 func Enabled(kvs config.KVS) bool {
 	ssc := kvs.Get(ClassStandard)
@ -231,6 +263,7 @@ func LookupConfig(kvs config.KVS, setDriveCount int) (cfg Config, err error) {

 	ssc := env.Get(StandardEnv, kvs.Get(ClassStandard))
 	rrsc := env.Get(RRSEnv, kvs.Get(ClassRRS))
+	dma := env.Get(DMAEnv, kvs.Get(ClassDMA))
 	// Check for environment variables and parse into storageClass struct
 	if ssc != "" {
 		cfg.Standard, err = parseStorageClass(ssc)
@ -252,6 +285,14 @@ func LookupConfig(kvs config.KVS, setDriveCount int) (cfg Config, err error) {
 		cfg.RRS.Parity = defaultRRSParity
 	}

+	if dma == "" {
+		dma = defaultDMA
+	}
+	if dma != DMAReadWrite && dma != DMAWrite {
+		return Config{}, errors.New(`valid dma values are "read-write" and "write"`)
+	}
+	cfg.DMA.DMA = dma
+
 	// Validation is done after parsing both the storage classes. This is needed because we need one
 	// storage class value to deduce the correct value of the other storage class.
 	if err = validateParity(cfg.Standard.Parity, cfg.RRS.Parity, setDriveCount); err != nil {
--- a/cmd/xl-storage.go
+++ b/cmd/xl-storage.go
@ -42,6 +42,7 @@ import (
 	jsoniter "github.com/json-iterator/go"
 	"github.com/klauspost/readahead"
 	"github.com/minio/minio/cmd/config"
+	"github.com/minio/minio/cmd/config/storageclass"
 	"github.com/minio/minio/cmd/logger"
 	"github.com/minio/minio/pkg/disk"
 	"github.com/minio/minio/pkg/env"
@ -1584,6 +1585,56 @@ func (s *xlStorage) openFile(volume, path string, mode int) (f *os.File, err err
 	return w, nil
 }

+// To support O_DIRECT reads for erasure backends.
+type odirectreader struct {
+	f         *os.File
+	buf       []byte
+	bufp      *[]byte
+	freshRead bool
+	s         *xlStorage
+	err       error
+}
+
+// Read - Implements Reader interface.
+func (o *odirectreader) Read(buf []byte) (n int, err error) {
+	if o.err != nil {
+		return 0, o.err
+	}
+	if o.buf == nil {
+		o.bufp = o.s.pool.Get().(*[]byte)
+	}
+	if o.freshRead {
+		o.buf = *o.bufp
+		n, err = o.f.Read(o.buf)
+		if err != nil && err != io.EOF {
+			o.err = err
+			return n, err
+		}
+		if n == 0 {
+			// err is io.EOF
+			o.err = err
+			return n, err
+		}
+		o.buf = o.buf[:n]
+		o.freshRead = false
+	}
+	if len(buf) >= len(o.buf) {
+		n = copy(buf, o.buf)
+		o.freshRead = true
+		return n, nil
+	}
+	n = copy(buf, o.buf)
+	o.buf = o.buf[n:]
+	return n, nil
+}
+
+// Close - Release the buffer and close the file.
+func (o *odirectreader) Close() error {
+	o.s.pool.Put(o.bufp)
+	atomic.AddInt32(&o.s.activeIOCount, -1)
+	return o.f.Close()
+}
+
 // ReadFileStream - Returns the read stream of the file.
 func (s *xlStorage) ReadFileStream(ctx context.Context, volume, path string, offset, length int64) (io.ReadCloser, error) {
 	if offset < 0 {
@ -1611,6 +1662,29 @@ func (s *xlStorage) ReadFileStream(ctx context.Context, volume, path string, off
 		return nil, err
 	}

+	if offset == 0 && globalStorageClass.GetDMA() == storageclass.DMAReadWrite {
+		file, err := disk.OpenFileDirectIO(filePath, os.O_RDONLY, 0666)
+		if err != nil {
+			switch {
+			case os.IsNotExist(err):
+				return nil, errFileNotFound
+			case os.IsPermission(err):
+				return nil, errFileAccessDenied
+			case isSysErrNotDir(err):
+				return nil, errFileAccessDenied
+			case isSysErrIO(err):
+				return nil, errFaultyDisk
+			case isSysErrTooManyFiles(err):
+				return nil, errTooManyOpenFiles
+			default:
+				return nil, err
+			}
+		}
+
+		atomic.AddInt32(&s.activeIOCount, 1)
+		return &odirectreader{file, nil, nil, true, s, nil}, nil
+	}
+
 	// Open the file for reading.
 	file, err := os.Open(filePath)
 	if err != nil {
--- a/cmd/xl-storage_test.go
+++ b/cmd/xl-storage_test.go
@ -30,6 +30,7 @@ import (
 	"syscall"
 	"testing"

+	"github.com/minio/minio/cmd/config/storageclass"
 	"github.com/minio/minio/pkg/disk"
 )

@ -1063,62 +1064,71 @@ func TestXLStorageReadFile(t *testing.T) {
 		}
 	}

-	// Following block validates all ReadFile test cases.
-	for i, testCase := range testCases {
-		var n int64
-		// Common read buffer.
-		var buf = make([]byte, testCase.bufSize)
-		n, err = xlStorage.ReadFile(context.Background(), testCase.volume, testCase.fileName, testCase.offset, buf, v)
-		if err != nil && testCase.expectedErr != nil {
-			// Validate if the type string of the errors are an exact match.
-			if err.Error() != testCase.expectedErr.Error() {
-				if runtime.GOOS != globalWindowsOSName {
-					t.Errorf("Case: %d %#v, expected: %s, got: %s", i+1, testCase, testCase.expectedErr, err)
-				} else {
-					var resultErrno, expectErrno uintptr
-					if pathErr, ok := err.(*os.PathError); ok {
-						if errno, pok := pathErr.Err.(syscall.Errno); pok {
-							resultErrno = uintptr(errno)
+	for l := 0; l < 2; l++ {
+		// 1st loop tests with dma=write, 2nd loop tests with dma=read-write.
+		if l == 1 {
+			globalStorageClass.DMA.DMA = storageclass.DMAReadWrite
+		}
+		// Following block validates all ReadFile test cases.
+		for i, testCase := range testCases {
+			var n int64
+			// Common read buffer.
+			var buf = make([]byte, testCase.bufSize)
+			n, err = xlStorage.ReadFile(context.Background(), testCase.volume, testCase.fileName, testCase.offset, buf, v)
+			if err != nil && testCase.expectedErr != nil {
+				// Validate if the type string of the errors are an exact match.
+				if err.Error() != testCase.expectedErr.Error() {
+					if runtime.GOOS != globalWindowsOSName {
+						t.Errorf("Case: %d %#v, expected: %s, got: %s", i+1, testCase, testCase.expectedErr, err)
+					} else {
+						var resultErrno, expectErrno uintptr
+						if pathErr, ok := err.(*os.PathError); ok {
+							if errno, pok := pathErr.Err.(syscall.Errno); pok {
+								resultErrno = uintptr(errno)
+							}
 						}
-					}
-					if pathErr, ok := testCase.expectedErr.(*os.PathError); ok {
-						if errno, pok := pathErr.Err.(syscall.Errno); pok {
-							expectErrno = uintptr(errno)
+						if pathErr, ok := testCase.expectedErr.(*os.PathError); ok {
+							if errno, pok := pathErr.Err.(syscall.Errno); pok {
+								expectErrno = uintptr(errno)
+							}
+						}
+						if !(expectErrno != 0 && resultErrno != 0 && expectErrno == resultErrno) {
+							t.Errorf("Case: %d %#v, expected: %s, got: %s", i+1, testCase, testCase.expectedErr, err)
 						}
 					}
-					if !(expectErrno != 0 && resultErrno != 0 && expectErrno == resultErrno) {
-						t.Errorf("Case: %d %#v, expected: %s, got: %s", i+1, testCase, testCase.expectedErr, err)
+				}
+				// Err unexpected EOF special case, where we verify we have provided a larger
+				// buffer than the data itself, but the results are in-fact valid. So we validate
+				// this error condition specifically treating it as a good condition with valid
+				// results. In this scenario return 'n' is always lesser than the input buffer.
+				if err == io.ErrUnexpectedEOF {
+					if !bytes.Equal(testCase.expectedBuf, buf[:n]) {
+						t.Errorf("Case: %d %#v, expected: \"%s\", got: \"%s\"", i+1, testCase, string(testCase.expectedBuf), string(buf[:n]))
+					}
+					if n > int64(len(buf)) {
+						t.Errorf("Case: %d %#v, expected: %d, got: %d", i+1, testCase, testCase.bufSize, n)
 					}
 				}
 			}
-			// Err unexpected EOF special case, where we verify we have provided a larger
-			// buffer than the data itself, but the results are in-fact valid. So we validate
-			// this error condition specifically treating it as a good condition with valid
-			// results. In this scenario return 'n' is always lesser than the input buffer.
-			if err == io.ErrUnexpectedEOF {
-				if !bytes.Equal(testCase.expectedBuf, buf[:n]) {
-					t.Errorf("Case: %d %#v, expected: \"%s\", got: \"%s\"", i+1, testCase, string(testCase.expectedBuf), string(buf[:n]))
+			// ReadFile has returned success, but our expected error is non 'nil'.
+			if err == nil && err != testCase.expectedErr {
+				t.Errorf("Case: %d %#v, expected: %s, got :%s", i+1, testCase, testCase.expectedErr, err)
+			}
+			// Expected error retured, proceed further to validate the returned results.
+			if err == nil && err == testCase.expectedErr {
+				if !bytes.Equal(testCase.expectedBuf, buf) {
+					t.Errorf("Case: %d %#v, expected: \"%s\", got: \"%s\"", i+1, testCase, string(testCase.expectedBuf), string(buf[:testCase.bufSize]))
 				}
-				if n > int64(len(buf)) {
+				if n != int64(testCase.bufSize) {
 					t.Errorf("Case: %d %#v, expected: %d, got: %d", i+1, testCase, testCase.bufSize, n)
 				}
 			}
 		}
-		// ReadFile has returned success, but our expected error is non 'nil'.
-		if err == nil && err != testCase.expectedErr {
-			t.Errorf("Case: %d %#v, expected: %s, got :%s", i+1, testCase, testCase.expectedErr, err)
-		}
-		// Expected error retured, proceed further to validate the returned results.
-		if err == nil && err == testCase.expectedErr {
-			if !bytes.Equal(testCase.expectedBuf, buf) {
-				t.Errorf("Case: %d %#v, expected: \"%s\", got: \"%s\"", i+1, testCase, string(testCase.expectedBuf), string(buf[:testCase.bufSize]))
-			}
-			if n != int64(testCase.bufSize) {
-				t.Errorf("Case: %d %#v, expected: %d, got: %d", i+1, testCase, testCase.bufSize, n)
-			}
-		}
 	}

+	// Reset the flag.
+	globalStorageClass.DMA.DMA = storageclass.DMAWrite
+
 	// TestXLStorage for permission denied.
 	if runtime.GOOS != globalWindowsOSName {
 		permDeniedDir := createPermDeniedFile(t)