fix: heal multiple buckets in bulk (#11029)

Makes server startup orders of magnitude faster when there are a large number of buckets.
master
Harshavardhana authored 4 years ago; committed by GitHub
parent 3514e89eb3
commit 9c53cc1b83
  1. buildscripts/verify-build.sh (32 lines changed)
  2. buildscripts/verify-healing.sh (58 lines changed)
  3. cmd/erasure-bucket.go (28 lines changed)
  4. cmd/erasure-server-sets.go (22 lines changed)
  5. cmd/erasure-sets.go (25 lines changed)
  6. cmd/gateway-unsupported.go (5 lines changed)
  7. cmd/iam.go (30 lines changed)
  8. cmd/metacache-set.go (3 lines changed)
  9. cmd/object-api-interface.go (1 line changed)
  10. cmd/server-main.go (20 lines changed)
  11. pkg/madmin/examples/heal-buckets-list.go (61 lines changed)
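The speedup claimed in the commit message comes from batching: startup healing used to issue one volume-create call per bucket per disk, while the new MakeVolBulk path visits each disk once with the full bucket list. A toy back-of-the-envelope sketch of the call-count difference (numbers invented for illustration, not taken from this PR):

package main

import "fmt"

// Toy illustration: per-bucket healing costs one MakeVol call per
// (bucket, disk) pair, while bulk healing costs one MakeVolBulk call
// per disk regardless of how many buckets exist.
func main() {
	buckets, disks := 10000, 16
	perBucket := buckets * disks // one MakeVol per bucket per disk
	bulk := disks                // one MakeVolBulk per disk
	fmt.Printf("per-bucket: %d calls, bulk: %d calls (%dx fewer)\n",
		perBucket, bulk, perBucket/bulk)
}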

@@ -60,24 +60,24 @@ function start_minio_erasure_sets()
sleep 15
}
function start_minio_zone_erasure_sets()
function start_minio_pool_erasure_sets()
{
export MINIO_ACCESS_KEY=$ACCESS_KEY
export MINIO_SECRET_KEY=$SECRET_KEY
"${MINIO[@]}" server --address=:9000 "http://127.0.0.1:9000${WORK_DIR}/zone-disk-sets{1...4}" "http://127.0.0.1:9001${WORK_DIR}/zone-disk-sets{5...8}" >"$WORK_DIR/zone-minio-9000.log" 2>&1 &
"${MINIO[@]}" server --address=:9001 "http://127.0.0.1:9000${WORK_DIR}/zone-disk-sets{1...4}" "http://127.0.0.1:9001${WORK_DIR}/zone-disk-sets{5...8}" >"$WORK_DIR/zone-minio-9001.log" 2>&1 &
"${MINIO[@]}" server --address=:9000 "http://127.0.0.1:9000${WORK_DIR}/pool-disk-sets{1...4}" "http://127.0.0.1:9001${WORK_DIR}/pool-disk-sets{5...8}" >"$WORK_DIR/pool-minio-9000.log" 2>&1 &
"${MINIO[@]}" server --address=:9001 "http://127.0.0.1:9000${WORK_DIR}/pool-disk-sets{1...4}" "http://127.0.0.1:9001${WORK_DIR}/pool-disk-sets{5...8}" >"$WORK_DIR/pool-minio-9001.log" 2>&1 &
sleep 40
}
function start_minio_zone_erasure_sets_ipv6()
function start_minio_pool_erasure_sets_ipv6()
{
export MINIO_ACCESS_KEY=$ACCESS_KEY
export MINIO_SECRET_KEY=$SECRET_KEY
"${MINIO[@]}" server --address="[::1]:9000" "http://[::1]:9000${WORK_DIR}/zone-disk-sets{1...4}" "http://[::1]:9001${WORK_DIR}/zone-disk-sets{5...8}" >"$WORK_DIR/zone-minio-ipv6-9000.log" 2>&1 &
"${MINIO[@]}" server --address="[::1]:9001" "http://[::1]:9000${WORK_DIR}/zone-disk-sets{1...4}" "http://[::1]:9001${WORK_DIR}/zone-disk-sets{5...8}" >"$WORK_DIR/zone-minio-ipv6-9001.log" 2>&1 &
"${MINIO[@]}" server --address="[::1]:9000" "http://[::1]:9000${WORK_DIR}/pool-disk-sets{1...4}" "http://[::1]:9001${WORK_DIR}/pool-disk-sets{5...8}" >"$WORK_DIR/pool-minio-ipv6-9000.log" 2>&1 &
"${MINIO[@]}" server --address="[::1]:9001" "http://[::1]:9000${WORK_DIR}/pool-disk-sets{1...4}" "http://[::1]:9001${WORK_DIR}/pool-disk-sets{5...8}" >"$WORK_DIR/pool-minio-ipv6-9001.log" 2>&1 &
sleep 40
}
@@ -129,9 +129,9 @@ function run_test_erasure_sets() {
return "$rv"
}
function run_test_zone_erasure_sets()
function run_test_pool_erasure_sets()
{
start_minio_zone_erasure_sets
start_minio_pool_erasure_sets
(cd "$WORK_DIR" && "$FUNCTIONAL_TESTS")
rv=$?
@@ -142,20 +142,20 @@ function run_test_zone_erasure_sets()
if [ "$rv" -ne 0 ]; then
for i in $(seq 0 1); do
echo "server$i log:"
cat "$WORK_DIR/zone-minio-900$i.log"
cat "$WORK_DIR/pool-minio-900$i.log"
done
fi
for i in $(seq 0 1); do
rm -f "$WORK_DIR/zone-minio-900$i.log"
rm -f "$WORK_DIR/pool-minio-900$i.log"
done
return "$rv"
}
function run_test_zone_erasure_sets_ipv6()
function run_test_pool_erasure_sets_ipv6()
{
start_minio_zone_erasure_sets_ipv6
start_minio_pool_erasure_sets_ipv6
export SERVER_ENDPOINT="[::1]:9000"
@@ -168,12 +168,12 @@ function run_test_zone_erasure_sets_ipv6()
if [ "$rv" -ne 0 ]; then
for i in $(seq 0 1); do
echo "server$i log:"
cat "$WORK_DIR/zone-minio-ipv6-900$i.log"
cat "$WORK_DIR/pool-minio-ipv6-900$i.log"
done
fi
for i in $(seq 0 1); do
rm -f "$WORK_DIR/zone-minio-ipv6-900$i.log"
rm -f "$WORK_DIR/pool-minio-ipv6-900$i.log"
done
return "$rv"
@@ -293,14 +293,14 @@ function main()
fi
echo "Testing in Distributed Eraure expanded setup"
if ! run_test_zone_erasure_sets; then
if ! run_test_pool_erasure_sets; then
echo "FAILED"
purge "$WORK_DIR"
exit 1
fi
echo "Testing in Distributed Erasure expanded setup with ipv6"
if ! run_test_zone_erasure_sets_ipv6; then
if ! run_test_pool_erasure_sets_ipv6; then
echo "FAILED"
purge "$WORK_DIR"
exit 1

@@ -29,44 +29,52 @@ MINIO_CONFIG_DIR="$WORK_DIR/.minio"
MINIO=( "$PWD/minio" --config-dir "$MINIO_CONFIG_DIR" server )
function start_minio_3_node() {
declare -a minio_pids
declare -a ARGS
export MINIO_ACCESS_KEY=minio
export MINIO_SECRET_KEY=minio123
export MINIO_ERASURE_SET_DRIVE_COUNT=6
start_port=$(shuf -i 10000-65000 -n 1)
args=""
for i in $(seq 1 3); do
ARGS+=("http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/1/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/2/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/3/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/4/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/5/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/6/")
args="$args http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/1/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/2/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/3/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/4/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/5/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/6/"
done
"${MINIO[@]}" --address ":$[$start_port+1]" ${ARGS[@]} > "${WORK_DIR}/dist-minio-server1.log" 2>&1 &
minio_pids[0]=$!
disown "${minio_pids[0]}"
"${MINIO[@]}" --address ":$[$start_port+1]" $args > "${WORK_DIR}/dist-minio-server1.log" 2>&1 &
disown $!
"${MINIO[@]}" --address ":$[$start_port+2]" ${ARGS[@]} > "${WORK_DIR}/dist-minio-server2.log" 2>&1 &
minio_pids[1]=$!
disown "${minio_pids[1]}"
"${MINIO[@]}" --address ":$[$start_port+2]" $args > "${WORK_DIR}/dist-minio-server2.log" 2>&1 &
disown $!
"${MINIO[@]}" --address ":$[$start_port+3]" ${ARGS[@]} > "${WORK_DIR}/dist-minio-server3.log" 2>&1 &
minio_pids[2]=$!
disown "${minio_pids[2]}"
"${MINIO[@]}" --address ":$[$start_port+3]" $args > "${WORK_DIR}/dist-minio-server3.log" 2>&1 &
disown $!
sleep "$1"
for pid in "${minio_pids[@]}"; do
if ! kill "$pid"; then
for i in $(seq 1 3); do
echo "server$i log:"
cat "${WORK_DIR}/dist-minio-server$i.log"
done
echo "FAILED"
purge "$WORK_DIR"
exit 1
fi
if [ "$(pgrep -c minio)" -ne 3 ]; then
for i in $(seq 1 3); do
echo "server$i log:"
cat "${WORK_DIR}/dist-minio-server$i.log"
done
echo "FAILED"
purge "$WORK_DIR"
exit 1
fi
if ! pkill minio; then
for i in $(seq 1 3); do
echo "server$i log:"
cat "${WORK_DIR}/dist-minio-server$i.log"
done
echo "FAILED"
purge "$WORK_DIR"
exit 1
fi
sleep 1;
if pgrep minio; then
# forcibly killing, to proceed further properly.
kill -9 "$pid"
sleep 1 # wait 1sec per pid
done
if ! pkill -9 minio; then
echo "no minio process running anymore, proceed."
fi
fi
}
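Net effect of the rewrite above: rather than recording each server's PID and killing them one by one, the script now checks that all three minio processes are still alive with pgrep -c, stops them with pkill, and escalates to pkill -9 only if something survives the first signal.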

@@ -31,6 +31,34 @@ var bucketOpIgnoredErrs = append(baseIgnoredErrs, errDiskAccessDenied, errUnform
// list all errors that can be ignored in a bucket metadata operation.
var bucketMetadataOpIgnoredErrs = append(bucketOpIgnoredErrs, errVolumeNotFound)
// MakeMultipleBuckets - create a list of buckets
func (er erasureObjects) MakeMultipleBuckets(ctx context.Context, buckets ...string) error {
storageDisks := er.getDisks()
g := errgroup.WithNErrs(len(storageDisks))
// Make a volume entry on all underlying storage disks.
for index := range storageDisks {
index := index
g.Go(func() error {
if storageDisks[index] != nil {
if err := storageDisks[index].MakeVolBulk(ctx, buckets...); err != nil {
if !errors.Is(err, errVolumeExists) {
logger.LogIf(ctx, err)
}
return err
}
return nil
}
return errDiskNotFound
}, index)
}
writeQuorum := getWriteQuorum(len(storageDisks))
err := reduceWriteQuorumErrs(ctx, g.Wait(), bucketOpIgnoredErrs, writeQuorum)
return toObjectErr(err)
}
/// Bucket operations
// MakeBucket - make a bucket.
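The errgroup.WithNErrs helper above is MinIO-internal. As a rough model of the same fan-out-then-quorum shape, here is a self-contained sketch using only the standard library; the names (disk, fanOut) are hypothetical and the quorum rule is simplified to "at least half the disks succeeded":

package main

import (
	"errors"
	"fmt"
	"sync"
)

var errDiskNotFound = errors.New("disk not found")

// disk stands in for a storage endpoint's bulk volume-create call.
type disk func(buckets ...string) error

// fanOut runs the bulk call on every disk concurrently and records one
// error slot per disk, mirroring the errgroup pattern in the diff above.
func fanOut(disks []disk, buckets ...string) []error {
	errs := make([]error, len(disks))
	var wg sync.WaitGroup
	for i, d := range disks {
		if d == nil {
			errs[i] = errDiskNotFound
			continue
		}
		wg.Add(1)
		go func(i int, d disk) {
			defer wg.Done()
			errs[i] = d(buckets...)
		}(i, d)
	}
	wg.Wait()
	return errs
}

func main() {
	ok := func(buckets ...string) error { return nil }
	disks := []disk{ok, ok, nil, ok} // one disk offline
	errs := fanOut(disks, "bucket-a", "bucket-b")

	// Simplified write quorum: succeed if at least half the disks did.
	failed := 0
	for _, err := range errs {
		if err != nil {
			failed++
		}
	}
	if failed <= len(disks)/2 {
		fmt.Println("write quorum met, buckets created")
	} else {
		fmt.Println("write quorum lost:", errs)
	}
}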

@@ -424,6 +424,28 @@ func (z *erasureServerPools) CrawlAndGetDataUsage(ctx context.Context, bf *bloom
return firstErr
}
func (z *erasureServerPools) MakeMultipleBuckets(ctx context.Context, buckets ...string) error {
g := errgroup.WithNErrs(len(z.serverPools))
// Create buckets in parallel across all pools.
for index := range z.serverPools {
index := index
g.Go(func() error {
return z.serverPools[index].MakeMultipleBuckets(ctx, buckets...)
}, index)
}
errs := g.Wait()
// Return the first encountered error
for _, err := range errs {
if err != nil {
return err
}
}
return nil
}
// MakeBucketWithLocation - creates a new bucket across all serverPools simultaneously;
// even if one of the sets fails to create buckets, we proceed with all the
// successful operations.

@@ -531,6 +531,31 @@ func (s *erasureSets) Shutdown(ctx context.Context) error {
return nil
}
// MakeMultipleBuckets - make many buckets at once.
func (s *erasureSets) MakeMultipleBuckets(ctx context.Context, buckets ...string) error {
g := errgroup.WithNErrs(len(s.sets))
// Create buckets in parallel across all sets.
for index := range s.sets {
index := index
g.Go(func() error {
return s.sets[index].MakeMultipleBuckets(ctx, buckets...)
}, index)
}
errs := g.Wait()
// Return the first encountered error
for _, err := range errs {
if err != nil {
return err
}
}
// Success.
return nil
}
// MakeBucketWithLocation - creates a new bucket across all sets simultaneously,
// then returns the first encountered error
func (s *erasureSets) MakeBucketWithLocation(ctx context.Context, bucket string, opts BucketOptions) error {

@@ -51,6 +51,11 @@ func (a GatewayUnsupported) SetDriveCount() int {
return 0
}
// MakeMultipleBuckets is a dummy stub for gateway.
func (a GatewayUnsupported) MakeMultipleBuckets(ctx context.Context, buckets ...string) error {
return NotImplemented{}
}
// ListMultipartUploads lists all multipart uploads.
func (a GatewayUnsupported) ListMultipartUploads(ctx context.Context, bucket string, prefix string, keyMarker string, uploadIDMarker string, delimiter string, maxUploads int) (lmi ListMultipartsInfo, err error) {
return lmi, NotImplemented{}

@@ -466,11 +466,10 @@ func (sys *IAMSys) Init(ctx context.Context, objAPI ObjectLayer) {
r := rand.New(rand.NewSource(time.Now().UnixNano()))
var err error
for {
// let one of the servers acquire the lock; if not, let it time out,
// to be retried again by this loop.
if err = txnLk.GetLock(retryCtx, iamLockTimeout); err != nil {
if err := txnLk.GetLock(retryCtx, iamLockTimeout); err != nil {
logger.Info("Waiting for all MinIO IAM sub-system to be initialized.. trying to acquire lock")
time.Sleep(time.Duration(r.Float64() * float64(5*time.Second)))
continue
@@ -480,7 +479,7 @@ func (sys *IAMSys) Init(ctx context.Context, objAPI ObjectLayer) {
// **** WARNING ****
// Migrating to encrypted backend on etcd should happen before initialization of
// IAM sub-system, make sure that we do not move the above codeblock elsewhere.
if err = migrateIAMConfigsEtcdToEncrypted(ctx, globalEtcdClient); err != nil {
if err := migrateIAMConfigsEtcdToEncrypted(ctx, globalEtcdClient); err != nil {
txnLk.Unlock()
logger.LogIf(ctx, fmt.Errorf("Unable to decrypt an encrypted ETCD backend for IAM users and policies: %w", err))
logger.LogIf(ctx, errors.New("IAM sub-system is partially initialized, some users may not be available"))
@@ -494,7 +493,7 @@ func (sys *IAMSys) Init(ctx context.Context, objAPI ObjectLayer) {
}
// Migrate IAM configuration, if necessary.
if err = sys.doIAMConfigMigration(ctx); err != nil {
if err := sys.doIAMConfigMigration(ctx); err != nil {
txnLk.Unlock()
if errors.Is(err, errDiskNotFound) ||
errors.Is(err, errConfigNotFound) ||
@@ -515,14 +514,27 @@ func (sys *IAMSys) Init(ctx context.Context, objAPI ObjectLayer) {
break
}
err = sys.store.loadAll(ctx, sys)
for {
if err := sys.store.loadAll(ctx, sys); err != nil {
if errors.Is(err, errDiskNotFound) ||
errors.Is(err, errConfigNotFound) ||
errors.Is(err, context.DeadlineExceeded) ||
errors.As(err, &rquorum) ||
errors.As(err, &wquorum) ||
isErrBucketNotFound(err) {
logger.Info("Waiting for all MinIO IAM sub-system to be initialized.. possible cause (%v)", err)
time.Sleep(time.Duration(r.Float64() * float64(5*time.Second)))
continue
}
if err != nil {
logger.LogIf(ctx, fmt.Errorf("Unable to initialize IAM sub-system, some users may not be available %w", err))
}
}
break
}
// Invalidate the old cred always, even upon error to avoid any leakage.
globalOldCred = auth.Credentials{}
if err != nil {
logger.LogIf(ctx, fmt.Errorf("Unable to initialize IAM sub-system, some users may not be available %w", err))
}
go sys.store.watch(ctx, sys)
}
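The rewritten loop above keeps retrying loadAll with a randomized sleep so that multiple servers do not hammer the backend in lockstep. A minimal sketch of that retry shape; loadAll here is a stand-in that fails twice before succeeding, and the back-off ceiling is scaled down from the server's 5s so the demo runs quickly:

package main

import (
	"errors"
	"fmt"
	"math/rand"
	"time"
)

// errNotReady stands in for the retryable errors checked above
// (errDiskNotFound, errConfigNotFound, quorum errors, ...).
var errNotReady = errors.New("sub-system not ready")

var attempts int

// loadAll is a stand-in for sys.store.loadAll: fail twice, then succeed.
func loadAll() error {
	attempts++
	if attempts < 3 {
		return errNotReady
	}
	return nil
}

func main() {
	r := rand.New(rand.NewSource(time.Now().UnixNano()))
	for {
		if err := loadAll(); err != nil {
			if errors.Is(err, errNotReady) {
				// Randomized back-off in [0, 250ms) here; the server
				// uses up to 5s so peers don't retry in lockstep.
				time.Sleep(time.Duration(r.Float64() * float64(250*time.Millisecond)))
				continue
			}
			fmt.Println("non-retryable:", err) // logged, retrying stops
		}
		break
	}
	fmt.Println("initialized after", attempts, "attempts")
}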

@@ -614,9 +614,6 @@ func (er *erasureObjects) listPath(ctx context.Context, o listPathOptions) (entr
if len(disks) < askDisks {
err = InsufficientReadQuorum{}
if debugPrint {
console.Errorf("listPath: Insufficient disks, %d of %d needed are available", len(disks), askDisks)
}
logger.LogIf(ctx, fmt.Errorf("listPath: Insufficient disks, %d of %d needed are available", len(disks), askDisks))
cancel()
return

@@ -82,6 +82,7 @@ type ObjectLayer interface {
StorageInfo(ctx context.Context, local bool) (StorageInfo, []error) // local queries only local disks
// Bucket operations.
MakeMultipleBuckets(ctx context.Context, buckets ...string) error
MakeBucketWithLocation(ctx context.Context, bucket string, opts BucketOptions) error
GetBucketInfo(ctx context.Context, bucket string) (bucketInfo BucketInfo, err error)
ListBuckets(ctx context.Context) (buckets []BucketInfo, err error)
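Because MakeMultipleBuckets is now part of the ObjectLayer interface, every implementation must provide it; that is why cmd/gateway-unsupported.go above gains a stub returning NotImplemented for gateway backends.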

@@ -304,18 +304,14 @@ func initAllSubsystems(ctx context.Context, newObject ObjectLayer) (err error) {
if err != nil {
return fmt.Errorf("Unable to list buckets to heal: %w", err)
}
for _, bucket := range buckets {
if err = newObject.MakeBucketWithLocation(ctx, bucket.Name, BucketOptions{}); err != nil {
if errors.As(err, &wquorum) || errors.As(err, &rquorum) {
// Return the error upwards for the caller to retry.
return fmt.Errorf("Unable to heal bucket: %w", err)
}
if _, ok := err.(BucketExists); !ok {
// ignore any other error and log for investigation.
logger.LogIf(ctx, err)
continue
}
// Bucket already exists, nothing that needs to be done.
bucketNames := make([]string, len(buckets))
for i := range buckets {
bucketNames[i] = buckets[i].Name
}
if err = newObject.MakeMultipleBuckets(ctx, bucketNames...); err != nil {
if errors.As(err, &wquorum) || errors.As(err, &rquorum) {
// Return the error upwards for the caller to retry.
return fmt.Errorf("Unable to heal buckets: %w", err)
}
}
} else {
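With this change the startup path heals all listed buckets in a single object-layer call. Only read/write quorum errors propagate for the caller to retry; per-disk "volume exists" conditions are tolerated silently (see the errVolumeExists check in cmd/erasure-bucket.go above), which is what lets the old per-bucket BucketExists handling disappear.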

@@ -1,61 +0,0 @@
// +build ignore
package main
/*
* MinIO Cloud Storage, (C) 2017 MinIO, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
import (
"context"
"fmt"
"log"
"github.com/minio/minio/pkg/madmin"
)
func main() {
// Note: YOUR-ACCESSKEYID, YOUR-SECRETACCESSKEY are
// dummy values, please replace them with original values.
// API requests are secure (HTTPS) if secure=true and insecure (HTTP) otherwise.
// New returns a MinIO Admin client object.
madmClnt, err := madmin.New("your-minio.example.com:9000", "YOUR-ACCESSKEYID", "YOUR-SECRETACCESSKEY", true)
if err != nil {
log.Fatalln(err)
}
// List buckets that need healing
healBucketsList, err := madmClnt.ListBucketsHeal(context.Background())
if err != nil {
log.Fatalln(err)
}
for _, bucket := range healBucketsList {
if bucket.HealBucketInfo != nil {
switch healInfo := *bucket.HealBucketInfo; healInfo.Status {
case madmin.CanHeal:
fmt.Println(bucket.Name, " can be healed.")
case madmin.QuorumUnavailable:
fmt.Println(bucket.Name, " can't be healed until quorum is available.")
case madmin.Corrupted:
fmt.Println(bucket.Name, " can't be healed, not enough information.")
}
}
fmt.Println("bucket: ", bucket)
}
}