From d4b107adf48c8dbc819f8e74d0aeb9bc9854f559 Mon Sep 17 00:00:00 2001 From: Nitish Tiwari Date: Mon, 14 Aug 2017 01:33:06 +0530 Subject: [PATCH] Retry name lookup for kubernetes and docker swarm environment (#4800) Wait for remote hosts to resolve instead of failing on first host resolution error, when running in Kubernetes or Docker environment. Note that - Waiting is based on exponential back-off mechanism - If run as a binary, server fails if remote host is not resolvable This is needed because in orchestration platforms like Kubernetes, remote hosts are started sequentially and all the hosts are not up initially, though they are expected to come up in a short time frame It is difficult to identify a cap on the waiting time due to non-deterministic nature of infrastructure platforms, so the server waits infinitely for the hosts to come up, while logging the error messages to the console. Fixes: https://github.com/minio/minio/issues/4669 --- buildscripts/docker-entrypoint.sh | 38 ---------------------------- cmd/net.go | 42 ++++++++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 42 deletions(-) diff --git a/buildscripts/docker-entrypoint.sh b/buildscripts/docker-entrypoint.sh index 52ca80972..fbb813ea1 100755 --- a/buildscripts/docker-entrypoint.sh +++ b/buildscripts/docker-entrypoint.sh @@ -22,41 +22,6 @@ if [ "${1}" != "minio" ]; then fi fi -# Wait for all the hosts to come online and have -# their DNS entries populated properly. -docker_wait_hosts() { - hosts="$@" - num_hosts=0 - # Count number of hosts in arguments. - for host in $hosts; do - [ $(echo "$host" | grep -E "^http") ] || continue - num_hosts=$((num_hosts+1)) - done - if [ $num_hosts -gt 0 ]; then - echo -n "Waiting for all hosts to resolve..." - while true; do - x=0 - for host in $hosts; do - [ $(echo "$host" | grep -E "^http") ] || continue - # Extract the domain. - host=$(echo $host | sed -e 's/^http[s]\?:\/\/\([^\/]\+\).*/\1/') - echo -n . 
- val=$(ping -c 1 $host 2>/dev/null) - if [ $? != 0 ]; then - echo "Failed to lookup $host" - continue - fi - x=$((x+1)) - done - # Provided hosts same as successful hosts, should break out. - test $x -eq $num_hosts && break - echo "Failed to resolve hosts.. retrying after 1 second." - sleep 1 - done - echo "All hosts are resolving proceeding to initialize Minio." - fi -} - ## Look for docker secrets in default documented location. docker_secrets_env() { local MINIO_ACCESS_KEY_FILE="/run/secrets/access_key" @@ -75,7 +40,4 @@ docker_secrets_env() { ## Set access env from secrets if necessary. docker_secrets_env -## Wait for all the hosts to come online. -docker_wait_hosts "$@" - exec "$@" diff --git a/cmd/net.go b/cmd/net.go index b2ed5b311..8cccc2373 100644 --- a/cmd/net.go +++ b/cmd/net.go @@ -26,7 +26,9 @@ import ( "strconv" "strings" "syscall" + "time" + humanize "github.com/dustin/go-humanize" "github.com/minio/minio-go/pkg/set" ) @@ -65,12 +67,44 @@ func mustGetLocalIP4() (ipList set.StringSet) { // getHostIP4 returns IPv4 address of given host. func getHostIP4(host string) (ipList set.StringSet, err error) { - ipList = set.NewStringSet() - ips, err := net.LookupIP(host) - if err != nil { - return ipList, err + var ips []net.IP + + if ips, err = net.LookupIP(host); err != nil { + // return err if not Docker or Kubernetes + // We use IsDocker() method to check for Docker Swarm environment + // as there is no reliable way to clearly identify Swarm from + // Docker environment. + if !IsDocker() && !IsKubernetes() { + return ipList, err + } + + // channel to indicate completion of host resolution + doneCh := make(chan struct{}) + // Indicate retry routine to exit cleanly, upon this function return. + defer close(doneCh) + // Mark the starting time + startTime := time.Now() + // wait for hosts to resolve in exponential backoff manner + for _ = range newRetryTimerSimple(doneCh) { + // Retry infinitely on Kubernetes and Docker swarm. 
+ // This is needed as the remote hosts are sometimes + // not available immediately. + if ips, err = net.LookupIP(host); err == nil { + break + } + // time elapsed + timeElapsed := time.Since(startTime) + // log error only if more than 1s elapsed + if timeElapsed > time.Second { + // log the message to console about the host not being + // resolvable. + errorIf(err, "Unable to resolve host %s (%s)", host, + humanize.RelTime(startTime, startTime.Add(timeElapsed), "elapsed", "")) + } + } } + ipList = set.NewStringSet() for _, ip := range ips { if ip.To4() != nil { ipList.Add(ip.String())