/* * Minio Cloud Storage, (C) 2016 Minio, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package main import ( "fmt" "math/rand" "os" slashpath "path" "strings" "path" "sync" "github.com/klauspost/reedsolomon" ) const ( // XL erasure metadata file. xlMetaV1File = "file.json" // Maximum erasure blocks. maxErasureBlocks = 16 // Minimum erasure blocks. minErasureBlocks = 8 ) // XL layer structure. type XL struct { ReedSolomon reedsolomon.Encoder // Erasure encoder/decoder. DataBlocks int ParityBlocks int storageDisks []StorageAPI readQuorum int writeQuorum int } // newXL instantiate a new XL. func newXL(disks ...string) (StorageAPI, error) { // Initialize XL. xl := &XL{} // Verify total number of disks. totalDisks := len(disks) if totalDisks > maxErasureBlocks { return nil, errMaxDisks } if totalDisks < minErasureBlocks { return nil, errMinDisks } // isEven function to verify if a given number if even. isEven := func(number int) bool { return number%2 == 0 } // Verify if we have even number of disks. // only combination of 8, 10, 12, 14, 16 are supported. if !isEven(totalDisks) { return nil, errNumDisks } // Calculate data and parity blocks. dataBlocks, parityBlocks := totalDisks/2, totalDisks/2 // Initialize reed solomon encoding. rs, err := reedsolomon.New(dataBlocks, parityBlocks) if err != nil { return nil, err } // Save the reedsolomon. xl.DataBlocks = dataBlocks xl.ParityBlocks = parityBlocks xl.ReedSolomon = rs // Initialize all storage disks. storageDisks := make([]StorageAPI, len(disks)) for index, disk := range disks { var err error // Intentionally ignore disk not found errors while // initializing POSIX, so that we have successfully // initialized posix Storage. // Subsequent calls to XL/Erasure will manage any errors // related to disks. storageDisks[index], err = newPosix(disk) if err != nil && err != errDiskNotFound { return nil, err } } // Save all the initialized storage disks. xl.storageDisks = storageDisks // Figure out read and write quorum based on number of storage disks. // Read quorum should be always N/2 + 1 (due to Vandermonde matrix // erasure requirements) xl.readQuorum = len(xl.storageDisks)/2 + 1 // Write quorum is assumed if we have total disks + 3 // parity. (Need to discuss this again) xl.writeQuorum = len(xl.storageDisks)/2 + 3 if xl.writeQuorum > len(xl.storageDisks) { xl.writeQuorum = len(xl.storageDisks) } // Return successfully initialized. return xl, nil } // MakeVol - make a volume. func (xl XL) MakeVol(volume string) error { if !isValidVolname(volume) { return errInvalidArgument } // Hold a write lock before creating a volume. nsMutex.Lock(volume, "") defer nsMutex.Unlock(volume, "") // Err counters. createVolErr := 0 // Count generic create vol errs. volumeExistsErrCnt := 0 // Count all errVolumeExists errs. // Initialize sync waitgroup. var wg = &sync.WaitGroup{} // Initialize list of errors. var dErrs = make([]error, len(xl.storageDisks)) // Make a volume entry on all underlying storage disks. for index, disk := range xl.storageDisks { wg.Add(1) // Make a volume inside a go-routine. go func(index int, disk StorageAPI) { defer wg.Done() if disk == nil { return } dErrs[index] = disk.MakeVol(volume) }(index, disk) } // Wait for all make vol to finish. wg.Wait() // Loop through all the concocted errors. for _, err := range dErrs { if err == nil { continue } // if volume already exists, count them. if err == errVolumeExists { volumeExistsErrCnt++ continue } // Update error counter separately. createVolErr++ } // Return err if all disks report volume exists. if volumeExistsErrCnt == len(xl.storageDisks) { return errVolumeExists } else if createVolErr > len(xl.storageDisks)-xl.writeQuorum { // Return errWriteQuorum if errors were more than // allowed write quorum. return errWriteQuorum } return nil } // DeleteVol - delete a volume. func (xl XL) DeleteVol(volume string) error { if !isValidVolname(volume) { return errInvalidArgument } // Hold a write lock for Delete volume. nsMutex.Lock(volume, "") defer nsMutex.Unlock(volume, "") // Collect if all disks report volume not found. var volumeNotFoundErrCnt int var wg = &sync.WaitGroup{} var dErrs = make([]error, len(xl.storageDisks)) // Remove a volume entry on all underlying storage disks. for index, disk := range xl.storageDisks { wg.Add(1) // Delete volume inside a go-routine. go func(index int, disk StorageAPI) { defer wg.Done() dErrs[index] = disk.DeleteVol(volume) }(index, disk) } // Wait for all the delete vols to finish. wg.Wait() // Loop through concocted errors and return anything unusual. for _, err := range dErrs { if err != nil { // We ignore error if errVolumeNotFound or errDiskNotFound if err == errVolumeNotFound || err == errDiskNotFound { volumeNotFoundErrCnt++ continue } return err } } // Return err if all disks report volume not found. if volumeNotFoundErrCnt == len(xl.storageDisks) { return errVolumeNotFound } return nil } // ListVols - list volumes. func (xl XL) ListVols() (volsInfo []VolInfo, err error) { emptyCount := 0 // Initialize sync waitgroup. var wg = &sync.WaitGroup{} // Success vols map carries successful results of ListVols from each disks. var successVols = make([][]VolInfo, len(xl.storageDisks)) for index, disk := range xl.storageDisks { wg.Add(1) // Add each go-routine to wait for. go func(index int, disk StorageAPI) { // Indicate wait group as finished. defer wg.Done() // Initiate listing. vlsInfo, lErr := disk.ListVols() if lErr == nil { if len(vlsInfo) == 0 { emptyCount++ // Calculate empty count specially. } else { successVols[index] = vlsInfo } } }(index, disk) } // For all the list volumes running in parallel to finish. wg.Wait() // If all list operations resulted in an empty count which is same // as your total storage disks, then it is a valid case return // success with empty vols. if emptyCount == len(xl.storageDisks) { return []VolInfo{}, nil } else if len(successVols) < xl.readQuorum { // If there is data and not empty, then we attempt quorum verification. // Verify if we have enough quorum to list vols. return nil, errReadQuorum } var total, free int64 // Loop through success vols and get aggregated usage values. for _, vlsInfo := range successVols { free += vlsInfo[0].Free total += vlsInfo[0].Total } // Save the updated usage values back into the vols. for _, volInfo := range successVols[0] { volInfo.Free = free volInfo.Total = total volsInfo = append(volsInfo, volInfo) } // NOTE: The assumption here is that volumes across all disks in // readQuorum have consistent view i.e they all have same number // of buckets. This is essentially not verified since healing // should take care of this. return volsInfo, nil } // getAllVolInfo - list bucket volume info from all disks. // Returns error slice indicating the failed volume stat operations. func (xl XL) getAllVolInfo(volume string) (volsInfo []VolInfo, errs []error) { // Create errs and volInfo slices of storageDisks size. errs = make([]error, len(xl.storageDisks)) volsInfo = make([]VolInfo, len(xl.storageDisks)) // Allocate a new waitgroup. var wg = &sync.WaitGroup{} for index, disk := range xl.storageDisks { wg.Add(1) // Stat volume on all the disks in a routine. go func(index int, disk StorageAPI) { defer wg.Done() volInfo, err := disk.StatVol(volume) if err != nil { errs[index] = err return } volsInfo[index] = volInfo }(index, disk) } // Wait for all the Stat operations to finish. wg.Wait() // Return the concocted values. return volsInfo, errs } // listAllVolInfo - list all stat volume info from all disks. // Returns // - stat volume info for all online disks. // - boolean to indicate if healing is necessary. // - error if any. func (xl XL) listAllVolInfo(volume string) ([]VolInfo, bool, error) { volsInfo, errs := xl.getAllVolInfo(volume) volsInfo = removeDuplicateVols(volsInfo) notFoundCount := 0 for _, err := range errs { if err == errVolumeNotFound { notFoundCount++ // If we have errors with file not found greater than allowed read // quorum we return err as errFileNotFound. if notFoundCount > len(xl.storageDisks)-xl.readQuorum { return nil, false, errVolumeNotFound } } } // Calculate online disk count. onlineDiskCount := 0 for index := range errs { if errs[index] == nil { onlineDiskCount++ } } var heal bool // If online disks count is lesser than configured disks, most // probably we need to heal the file, additionally verify if the // count is lesser than readQuorum, if not we throw an error. if onlineDiskCount < len(xl.storageDisks) { // Online disks lesser than total storage disks, needs to be // healed. unless we do not have readQuorum. heal = true // Verify if online disks count are lesser than readQuorum // threshold, return an error if yes. if onlineDiskCount < xl.readQuorum { return nil, false, errReadQuorum } } // Return success. return volsInfo, heal, nil } // healVolume - heals any missing volumes. func (xl XL) healVolume(volume string) error { // Acquire a read lock. nsMutex.RLock(volume, "") defer nsMutex.RUnlock(volume, "") // Lists volume info for all online disks. volsInfo, heal, err := xl.listAllVolInfo(volume) if err != nil { return err } if !heal { return nil } // Create volume if missing on online disks. for index, volInfo := range volsInfo { if volInfo.Name != "" { continue } // Volinfo name would be an empty string, create it. if err = xl.storageDisks[index].MakeVol(volume); err != nil { continue } } return nil } // Removes any duplicate vols. func removeDuplicateVols(volsInfo []VolInfo) []VolInfo { // Use map to record duplicates as we find them. result := []VolInfo{} m := make(map[string]VolInfo) for _, v := range volsInfo { if _, found := m[v.Name]; !found { m[v.Name] = v } } result = make([]VolInfo, 0, len(m)) for _, v := range m { result = append(result, v) } // Return the new slice. return result } // StatVol - get volume stat info. func (xl XL) StatVol(volume string) (volInfo VolInfo, err error) { if !isValidVolname(volume) { return VolInfo{}, errInvalidArgument } // Acquire a read lock before reading. nsMutex.RLock(volume, "") volsInfo, heal, err := xl.listAllVolInfo(volume) nsMutex.RUnlock(volume, "") if err != nil { return VolInfo{}, err } if heal { go func() { hErr := xl.healVolume(volume) errorIf(hErr, "Unable to heal volume "+volume+".") }() } // Loop through all statVols, calculate the actual usage values. var total, free int64 for _, volInfo := range volsInfo { free += volInfo.Free total += volInfo.Total } // Filter volsInfo and update the volInfo. volInfo = volsInfo[0] volInfo.Free = free volInfo.Total = total return volInfo, nil } // isLeafDirectoryXL - check if a given path is leaf directory. i.e // if it contains file xlMetaV1File func isLeafDirectoryXL(disk StorageAPI, volume, leafPath string) (isLeaf bool) { _, err := disk.StatFile(volume, path.Join(leafPath, xlMetaV1File)) return err == nil } // ListDir - return all the entries at the given directory path. // If an entry is a directory it will be returned with a trailing "/". func (xl XL) ListDir(volume, dirPath string) (entries []string, err error) { if !isValidVolname(volume) { return nil, errInvalidArgument } // Count for list errors encountered. var listErrCount = 0 // Loop through and return the first success entry based on the // selected random disk. for listErrCount < len(xl.storageDisks) { // Choose a random disk on each attempt, do not hit the same disk all the time. randIndex := rand.Intn(len(xl.storageDisks) - 1) disk := xl.storageDisks[randIndex] // Pick a random disk. // Initiate a list operation, if successful filter and return quickly. if entries, err = disk.ListDir(volume, dirPath); err == nil { for i, entry := range entries { isLeaf := isLeafDirectoryXL(disk, volume, path.Join(dirPath, entry)) isDir := strings.HasSuffix(entry, slashSeparator) if isDir && isLeaf { entries[i] = strings.TrimSuffix(entry, slashSeparator) } } // We got the entries successfully return. return entries, nil } listErrCount++ // Update list error count. } // Return error at the end. return nil, err } // Object API. // StatFile - stat a file func (xl XL) StatFile(volume, path string) (FileInfo, error) { if !isValidVolname(volume) { return FileInfo{}, errInvalidArgument } if !isValidPath(path) { return FileInfo{}, errInvalidArgument } // Acquire read lock. nsMutex.RLock(volume, path) _, metadata, heal, err := xl.listOnlineDisks(volume, path) nsMutex.RUnlock(volume, path) if err != nil { return FileInfo{}, err } if heal { // Heal in background safely, since we already have read quorum disks. go func() { hErr := xl.healFile(volume, path) errorIf(hErr, "Unable to heal file "+volume+"/"+path+".") }() } // Return file info. return FileInfo{ Volume: volume, Name: path, Size: metadata.Stat.Size, ModTime: metadata.Stat.ModTime, Mode: os.FileMode(0644), }, nil } // DeleteFile - delete a file func (xl XL) DeleteFile(volume, path string) error { if !isValidVolname(volume) { return errInvalidArgument } if !isValidPath(path) { return errInvalidArgument } nsMutex.Lock(volume, path) defer nsMutex.Unlock(volume, path) errCount := 0 // Update meta data file and remove part file for index, disk := range xl.storageDisks { erasureFilePart := slashpath.Join(path, fmt.Sprintf("file.%d", index)) err := disk.DeleteFile(volume, erasureFilePart) if err != nil { errCount++ // We can safely allow DeleteFile errors up to len(xl.storageDisks) - xl.writeQuorum // otherwise return failure. if errCount <= len(xl.storageDisks)-xl.writeQuorum { continue } return err } xlMetaV1FilePath := slashpath.Join(path, "file.json") err = disk.DeleteFile(volume, xlMetaV1FilePath) if err != nil { errCount++ // We can safely allow DeleteFile errors up to len(xl.storageDisks) - xl.writeQuorum // otherwise return failure. if errCount <= len(xl.storageDisks)-xl.writeQuorum { continue } return err } } // Return success. return nil } // RenameFile - rename file. func (xl XL) RenameFile(srcVolume, srcPath, dstVolume, dstPath string) error { // Validate inputs. if !isValidVolname(srcVolume) { return errInvalidArgument } if !isValidPath(srcPath) { return errInvalidArgument } if !isValidVolname(dstVolume) { return errInvalidArgument } if !isValidPath(dstPath) { return errInvalidArgument } // Hold read lock at source before rename. nsMutex.RLock(srcVolume, srcPath) defer nsMutex.RUnlock(srcVolume, srcPath) // Hold write lock at destination before rename. nsMutex.Lock(dstVolume, dstPath) defer nsMutex.Unlock(dstVolume, dstPath) errCount := 0 for _, disk := range xl.storageDisks { // Append "/" as srcPath and dstPath are either leaf-dirs or non-leaf-dris. // If srcPath is an object instead of prefix we just rename the leaf-dir and // not rename the part and metadata files separately. err := disk.RenameFile(srcVolume, retainSlash(srcPath), dstVolume, retainSlash(dstPath)) if err != nil { errCount++ // We can safely allow RenameFile errors up to len(xl.storageDisks) - xl.writeQuorum // otherwise return failure. if errCount <= len(xl.storageDisks)-xl.writeQuorum { continue } return err } } return nil }