From 75715820007a8832f4bc86c338d2ce1cb11eff2f Mon Sep 17 00:00:00 2001 From: Anis Elleuch Date: Tue, 11 Sep 2018 00:21:59 +0100 Subject: [PATCH] Print storage errors during distributed initialization (#6441) This commit will print connection failures to other disks in other nodes after 5 retries. It is useful for users to understand why the distributed cluster fails to boot up. --- cmd/naughty-disk_test.go | 4 ++++ cmd/posix.go | 4 ++++ cmd/prepare-storage.go | 18 +++++++++++++++--- cmd/storage-interface.go | 2 ++ cmd/storage-rpc-client.go | 18 +++++++++++++++++- 5 files changed, 42 insertions(+), 4 deletions(-) diff --git a/cmd/naughty-disk_test.go b/cmd/naughty-disk_test.go index df0811933..f8feffdeb 100644 --- a/cmd/naughty-disk_test.go +++ b/cmd/naughty-disk_test.go @@ -52,6 +52,10 @@ func (d *naughtyDisk) IsOnline() bool { return d.disk.IsOnline() } +func (d *naughtyDisk) LastError() (err error) { + return nil +} + func (d *naughtyDisk) Close() (err error) { if err = d.calcError(); err != nil { return err diff --git a/cmd/posix.go b/cmd/posix.go index 772a2a6e5..ad083a390 100644 --- a/cmd/posix.go +++ b/cmd/posix.go @@ -274,6 +274,10 @@ func (s *posix) String() string { return s.diskPath } +func (s *posix) LastError() error { + return nil +} + func (s *posix) Close() error { close(s.stopUsageCh) s.connected = false diff --git a/cmd/prepare-storage.go b/cmd/prepare-storage.go index edc7d5f8c..21016e7de 100644 --- a/cmd/prepare-storage.go +++ b/cmd/prepare-storage.go @@ -120,13 +120,25 @@ var errXLV3ThisEmpty = fmt.Errorf("XL format version 3 has This field empty") // connect to list of endpoints and load all XL disk formats, validate the formats are correct // and are in quorum, if no formats are found attempt to initialize all of them for the first // time. additionally make sure to close all the disks used in this attempt. 
-func connectLoadInitFormats(firstDisk bool, endpoints EndpointList, setCount, drivesPerSet int) (*formatXLV3, error) { +func connectLoadInitFormats(retryCount int, firstDisk bool, endpoints EndpointList, setCount, drivesPerSet int) (*formatXLV3, error) { + // Initialize all storage disks storageDisks, err := initStorageDisks(endpoints) if err != nil { return nil, err } defer closeStorageDisks(storageDisks) + // Connect to all storage disks, a connection failure will be + // only logged after some retries. + for _, disk := range storageDisks { + if disk != nil { + connectErr := disk.LastError() + if connectErr != nil && retryCount >= 5 { + logger.Info("Unable to connect to %s: %v\n", disk.String(), connectErr.Error()) + } + } + } + // Attempt to load all `format.json` from all disks. formatConfigs, sErrs := loadFormatXLAll(storageDisks) // Check if we have @@ -238,8 +250,8 @@ func waitForFormatXL(ctx context.Context, firstDisk bool, endpoints EndpointList retryTimerCh := newRetryTimerSimple(doneCh) for { select { - case _ = <-retryTimerCh: - format, err := connectLoadInitFormats(firstDisk, endpoints, setCount, disksPerSet) + case retryCount := <-retryTimerCh: + format, err := connectLoadInitFormats(retryCount, firstDisk, endpoints, setCount, disksPerSet) if err != nil { switch err { case errNotFirstDisk: diff --git a/cmd/storage-interface.go b/cmd/storage-interface.go index 936a8d8d4..109b1fa74 100644 --- a/cmd/storage-interface.go +++ b/cmd/storage-interface.go @@ -27,7 +27,9 @@ type StorageAPI interface { // Storage operations. IsOnline() bool // Returns true if disk is online. + LastError() error Close() error + DiskInfo() (info DiskInfo, err error) // Volume operations. 
diff --git a/cmd/storage-rpc-client.go b/cmd/storage-rpc-client.go index daa8abbfd..cece042c5 100644 --- a/cmd/storage-rpc-client.go +++ b/cmd/storage-rpc-client.go @@ -104,6 +104,8 @@ func toStorageErr(err error) error { type StorageRPCClient struct { *RPCClient connected bool + // Plain error of the last RPC call + lastRPCError error } // Stringer provides a canonicalized representation of network device. @@ -114,6 +116,11 @@ func (client *StorageRPCClient) String() string { return url.String() } +// LastError - returns the last RPC call result, nil or error if any +func (client *StorageRPCClient) LastError() error { + return client.lastRPCError +} + // Close - closes underneath RPC client. func (client *StorageRPCClient) Close() error { client.connected = false @@ -125,14 +132,22 @@ func (client *StorageRPCClient) IsOnline() bool { return client.connected } +func (client *StorageRPCClient) connect() { + err := client.Call(storageServiceName+".Connect", &AuthArgs{}, &VoidReply{}) + client.lastRPCError = err + client.connected = err == nil +} + func (client *StorageRPCClient) call(handler string, args interface { SetAuthArgs(args AuthArgs) }, reply interface{}) error { + if !client.connected { return errDiskNotFound } err := client.Call(handler, args, reply) + client.lastRPCError = err if err == nil { return nil } @@ -318,6 +333,7 @@ func newStorageRPC(endpoint Endpoint) *StorageRPCClient { logger.FatalIf(err, "Unable to parse storage RPC Host", context.Background()) rpcClient, err := NewStorageRPCClient(host, endpoint.Path) logger.FatalIf(err, "Unable to initialize storage RPC client", context.Background()) - rpcClient.connected = rpcClient.Call(storageServiceName+".Connect", &AuthArgs{}, &VoidReply{}) == nil + // Attempt first try connection and save error if any. + rpcClient.connect() return rpcClient }