Print storage errors during distributed initialization (#6441)

This commit prints connection failures to disks on other nodes
after 5 retries. This helps users understand why the
distributed cluster fails to boot up.
master
Anis Elleuch 6 years ago committed by Dee Koder
parent 12b4971b70
commit 7571582000
  1. 4
      cmd/naughty-disk_test.go
  2. 4
      cmd/posix.go
  3. 18
      cmd/prepare-storage.go
  4. 2
      cmd/storage-interface.go
  5. 18
      cmd/storage-rpc-client.go

@ -52,6 +52,10 @@ func (d *naughtyDisk) IsOnline() bool {
return d.disk.IsOnline() return d.disk.IsOnline()
} }
// LastError always reports nil; it does not consult the wrapped disk
// or the programmed error sequence.
func (d *naughtyDisk) LastError() (err error) {
return nil
}
func (d *naughtyDisk) Close() (err error) { func (d *naughtyDisk) Close() (err error) {
if err = d.calcError(); err != nil { if err = d.calcError(); err != nil {
return err return err

@ -274,6 +274,10 @@ func (s *posix) String() string {
return s.diskPath return s.diskPath
} }
// LastError always returns nil for posix; no error is tracked here.
func (s *posix) LastError() error {
return nil
}
func (s *posix) Close() error { func (s *posix) Close() error {
close(s.stopUsageCh) close(s.stopUsageCh)
s.connected = false s.connected = false

@ -120,13 +120,25 @@ var errXLV3ThisEmpty = fmt.Errorf("XL format version 3 has This field empty")
// connect to list of endpoints and load all XL disk formats, validate the formats are correct // connect to list of endpoints and load all XL disk formats, validate the formats are correct
// and are in quorum, if no formats are found attempt to initialize all of them for the first // and are in quorum, if no formats are found attempt to initialize all of them for the first
// time. additionally make sure to close all the disks used in this attempt. // time. additionally make sure to close all the disks used in this attempt.
func connectLoadInitFormats(firstDisk bool, endpoints EndpointList, setCount, drivesPerSet int) (*formatXLV3, error) { func connectLoadInitFormats(retryCount int, firstDisk bool, endpoints EndpointList, setCount, drivesPerSet int) (*formatXLV3, error) {
// Initialize all storage disks
storageDisks, err := initStorageDisks(endpoints) storageDisks, err := initStorageDisks(endpoints)
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer closeStorageDisks(storageDisks) defer closeStorageDisks(storageDisks)
// Connect to all storage disks, a connection failure will be
// only logged after some retries.
for _, disk := range storageDisks {
if disk != nil {
connectErr := disk.LastError()
if connectErr != nil && retryCount >= 5 {
logger.Info("Unable to connect to %s: %v\n", disk.String(), connectErr.Error())
}
}
}
// Attempt to load all `format.json` from all disks. // Attempt to load all `format.json` from all disks.
formatConfigs, sErrs := loadFormatXLAll(storageDisks) formatConfigs, sErrs := loadFormatXLAll(storageDisks)
// Check if we have // Check if we have
@ -238,8 +250,8 @@ func waitForFormatXL(ctx context.Context, firstDisk bool, endpoints EndpointList
retryTimerCh := newRetryTimerSimple(doneCh) retryTimerCh := newRetryTimerSimple(doneCh)
for { for {
select { select {
case _ = <-retryTimerCh: case retryCount := <-retryTimerCh:
format, err := connectLoadInitFormats(firstDisk, endpoints, setCount, disksPerSet) format, err := connectLoadInitFormats(retryCount, firstDisk, endpoints, setCount, disksPerSet)
if err != nil { if err != nil {
switch err { switch err {
case errNotFirstDisk: case errNotFirstDisk:

@ -27,7 +27,9 @@ type StorageAPI interface {
// Storage operations. // Storage operations.
IsOnline() bool // Returns true if disk is online. IsOnline() bool // Returns true if disk is online.
LastError() error
Close() error Close() error
DiskInfo() (info DiskInfo, err error) DiskInfo() (info DiskInfo, err error)
// Volume operations. // Volume operations.

@ -104,6 +104,8 @@ func toStorageErr(err error) error {
type StorageRPCClient struct { type StorageRPCClient struct {
*RPCClient *RPCClient
connected bool connected bool
// Plain error of the last RPC call
lastRPCError error
} }
// Stringer provides a canonicalized representation of network device. // Stringer provides a canonicalized representation of network device.
@ -114,6 +116,11 @@ func (client *StorageRPCClient) String() string {
return url.String() return url.String()
} }
// LastError - returns the value stored in lastRPCError, i.e. the plain
// result (nil or error) of the most recent RPC issued through connect
// or call.
func (client *StorageRPCClient) LastError() error {
return client.lastRPCError
}
// Close - closes underneath RPC client. // Close - closes underneath RPC client.
func (client *StorageRPCClient) Close() error { func (client *StorageRPCClient) Close() error {
client.connected = false client.connected = false
@ -125,14 +132,22 @@ func (client *StorageRPCClient) IsOnline() bool {
return client.connected return client.connected
} }
// connect issues the storage service Connect RPC with empty auth
// arguments, stores the outcome as the last RPC error and marks the
// client as connected only when that outcome is nil.
func (client *StorageRPCClient) connect() {
	client.lastRPCError = client.Call(storageServiceName+".Connect", &AuthArgs{}, &VoidReply{})
	client.connected = client.lastRPCError == nil
}
func (client *StorageRPCClient) call(handler string, args interface { func (client *StorageRPCClient) call(handler string, args interface {
SetAuthArgs(args AuthArgs) SetAuthArgs(args AuthArgs)
}, reply interface{}) error { }, reply interface{}) error {
if !client.connected { if !client.connected {
return errDiskNotFound return errDiskNotFound
} }
err := client.Call(handler, args, reply) err := client.Call(handler, args, reply)
client.lastRPCError = err
if err == nil { if err == nil {
return nil return nil
} }
@ -318,6 +333,7 @@ func newStorageRPC(endpoint Endpoint) *StorageRPCClient {
logger.FatalIf(err, "Unable to parse storage RPC Host", context.Background()) logger.FatalIf(err, "Unable to parse storage RPC Host", context.Background())
rpcClient, err := NewStorageRPCClient(host, endpoint.Path) rpcClient, err := NewStorageRPCClient(host, endpoint.Path)
logger.FatalIf(err, "Unable to initialize storage RPC client", context.Background()) logger.FatalIf(err, "Unable to initialize storage RPC client", context.Background())
rpcClient.connected = rpcClient.Call(storageServiceName+".Connect", &AuthArgs{}, &VoidReply{}) == nil // Attempt first try connection and save error if any.
rpcClient.connect()
return rpcClient return rpcClient
} }

Loading…
Cancel
Save