/* * MinIO Cloud Storage, (C) 2020 MinIO, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package cmd import ( "context" "fmt" "net/http" "runtime" "strings" "time" minio "github.com/minio/minio-go/v7" miniogo "github.com/minio/minio-go/v7" "github.com/minio/minio-go/v7/pkg/encrypt" "github.com/minio/minio-go/v7/pkg/tags" "github.com/minio/minio/cmd/crypto" xhttp "github.com/minio/minio/cmd/http" "github.com/minio/minio/cmd/logger" "github.com/minio/minio/pkg/bucket/bandwidth" "github.com/minio/minio/pkg/bucket/replication" "github.com/minio/minio/pkg/event" iampolicy "github.com/minio/minio/pkg/iam/policy" "github.com/minio/minio/pkg/madmin" ) // gets replication config associated to a given bucket name. func getReplicationConfig(ctx context.Context, bucketName string) (rc *replication.Config, err error) { if globalIsGateway { objAPI := newObjectLayerFn() if objAPI == nil { return nil, errServerNotInitialized } return nil, BucketReplicationConfigNotFound{Bucket: bucketName} } return globalBucketMetadataSys.GetReplicationConfig(ctx, bucketName) } // validateReplicationDestination returns error if replication destination bucket missing or not configured // It also returns true if replication destination is same as this server. func validateReplicationDestination(ctx context.Context, bucket string, rCfg *replication.Config) (bool, error) { arn, err := madmin.ParseARN(rCfg.RoleArn) if err != nil { return false, BucketRemoteArnInvalid{} } if arn.Type != madmin.ReplicationService { return false, BucketRemoteArnTypeInvalid{} } clnt := globalBucketTargetSys.GetRemoteTargetClient(ctx, rCfg.RoleArn) if clnt == nil { return false, BucketRemoteTargetNotFound{Bucket: bucket} } if found, _ := clnt.BucketExists(ctx, rCfg.GetDestination().Bucket); !found { return false, BucketRemoteDestinationNotFound{Bucket: rCfg.GetDestination().Bucket} } if ret, err := globalBucketObjectLockSys.Get(bucket); err == nil { if ret.LockEnabled { lock, _, _, _, err := clnt.GetObjectLockConfig(ctx, rCfg.GetDestination().Bucket) if err != nil || lock != "Enabled" { return false, BucketReplicationDestinationMissingLock{Bucket: rCfg.GetDestination().Bucket} } } } // validate replication ARN against target endpoint c, ok := globalBucketTargetSys.arnRemotesMap[rCfg.RoleArn] if ok { if c.EndpointURL().String() == clnt.EndpointURL().String() { sameTarget, _ := isLocalHost(clnt.EndpointURL().Hostname(), clnt.EndpointURL().Port(), globalMinioPort) return sameTarget, nil } } return false, BucketRemoteTargetNotFound{Bucket: bucket} } func mustReplicateWeb(ctx context.Context, r *http.Request, bucket, object string, meta map[string]string, replStatus string, permErr APIErrorCode) bool { if permErr != ErrNone { return false } return mustReplicater(ctx, bucket, object, meta, replStatus) } // mustReplicate returns true if object meets replication criteria. func mustReplicate(ctx context.Context, r *http.Request, bucket, object string, meta map[string]string, replStatus string) bool { if s3Err := isPutActionAllowed(ctx, getRequestAuthType(r), bucket, "", r, iampolicy.GetReplicationConfigurationAction); s3Err != ErrNone { return false } return mustReplicater(ctx, bucket, object, meta, replStatus) } // mustReplicater returns true if object meets replication criteria. func mustReplicater(ctx context.Context, bucket, object string, meta map[string]string, replStatus string) bool { if globalIsGateway { return false } if rs, ok := meta[xhttp.AmzBucketReplicationStatus]; ok { replStatus = rs } if replication.StatusType(replStatus) == replication.Replica { return false } cfg, err := getReplicationConfig(ctx, bucket) if err != nil { return false } opts := replication.ObjectOpts{ Name: object, SSEC: crypto.SSEC.IsEncrypted(meta), } tagStr, ok := meta[xhttp.AmzObjectTagging] if ok { opts.UserTags = tagStr } return cfg.Replicate(opts) } // returns true if any of the objects being deleted qualifies for replication. func hasReplicationRules(ctx context.Context, bucket string, objects []ObjectToDelete) bool { c, err := getReplicationConfig(ctx, bucket) if err != nil || c == nil { return false } for _, obj := range objects { if c.HasActiveRules(obj.ObjectName, true) { return true } } return false } // returns whether object version is a deletemarker and if object qualifies for replication func checkReplicateDelete(ctx context.Context, bucket string, dobj ObjectToDelete, oi ObjectInfo, gerr error) (dm, replicate bool) { rcfg, err := getReplicationConfig(ctx, bucket) if err != nil || rcfg == nil { return false, false } // when incoming delete is removal of a delete marker( a.k.a versioned delete), // GetObjectInfo returns extra information even though it returns errFileNotFound if gerr != nil { validReplStatus := false switch oi.ReplicationStatus { case replication.Pending, replication.Complete, replication.Failed: validReplStatus = true } if oi.DeleteMarker && validReplStatus { return oi.DeleteMarker, true } return oi.DeleteMarker, false } opts := replication.ObjectOpts{ Name: dobj.ObjectName, SSEC: crypto.SSEC.IsEncrypted(oi.UserDefined), UserTags: oi.UserTags, DeleteMarker: oi.DeleteMarker, VersionID: dobj.VersionID, } return oi.DeleteMarker, rcfg.Replicate(opts) } // replicate deletes to the designated replication target if replication configuration // has delete marker replication or delete replication (MinIO extension to allow deletes where version id // is specified) enabled. // Similar to bucket replication for PUT operation, soft delete (a.k.a setting delete marker) and // permanent deletes (by specifying a version ID in the delete operation) have three states "Pending", "Complete" // and "Failed" to mark the status of the replication of "DELETE" operation. All failed operations can // then be retried by healing. In the case of permanent deletes, until the replication is completed on the // target cluster, the object version is marked deleted on the source and hidden from listing. It is permanently // deleted from the source when the VersionPurgeStatus changes to "Complete", i.e after replication succeeds // on target. func replicateDelete(ctx context.Context, dobj DeletedObjectVersionInfo, objectAPI ObjectLayer) { bucket := dobj.Bucket rcfg, err := getReplicationConfig(ctx, bucket) if err != nil || rcfg == nil { return } tgt := globalBucketTargetSys.GetRemoteTargetClient(ctx, rcfg.RoleArn) if tgt == nil { return } versionID := dobj.DeleteMarkerVersionID if versionID == "" { versionID = dobj.VersionID } rmErr := tgt.RemoveObject(ctx, rcfg.GetDestination().Bucket, dobj.ObjectName, miniogo.RemoveObjectOptions{ VersionID: versionID, Internal: miniogo.AdvancedRemoveOptions{ ReplicationDeleteMarker: dobj.DeleteMarkerVersionID != "", ReplicationMTime: dobj.DeleteMarkerMTime.Time, ReplicationStatus: miniogo.ReplicationStatusReplica, }, }) replicationStatus := dobj.DeleteMarkerReplicationStatus versionPurgeStatus := dobj.VersionPurgeStatus if rmErr != nil { if dobj.VersionID == "" { replicationStatus = string(replication.Failed) } else { versionPurgeStatus = Failed } } else { if dobj.VersionID == "" { replicationStatus = string(replication.Complete) } else { versionPurgeStatus = Complete } } var eventName = event.ObjectReplicationComplete if replicationStatus == string(replication.Failed) || versionPurgeStatus == Failed { eventName = event.ObjectReplicationFailed } objInfo := ObjectInfo{ Name: dobj.ObjectName, DeleteMarker: dobj.DeleteMarker, VersionID: versionID, ReplicationStatus: replication.StatusType(dobj.DeleteMarkerReplicationStatus), VersionPurgeStatus: versionPurgeStatus, } eventArg := &eventArgs{ BucketName: bucket, Object: objInfo, Host: "Internal: [Replication]", EventName: eventName, } sendEvent(*eventArg) // Update metadata on the delete marker or purge permanent delete if replication success. if _, err = objectAPI.DeleteObject(ctx, bucket, dobj.ObjectName, ObjectOptions{ VersionID: versionID, DeleteMarker: dobj.DeleteMarker, DeleteMarkerReplicationStatus: replicationStatus, Versioned: globalBucketVersioningSys.Enabled(bucket), VersionPurgeStatus: versionPurgeStatus, VersionSuspended: globalBucketVersioningSys.Suspended(bucket), }); err != nil { logger.LogIf(ctx, fmt.Errorf("Unable to update replication metadata for %s/%s %s: %w", bucket, dobj.ObjectName, dobj.VersionID, err)) } } func getCopyObjMetadata(oi ObjectInfo, dest replication.Destination) map[string]string { meta := make(map[string]string, len(oi.UserDefined)) for k, v := range oi.UserDefined { if k == xhttp.AmzBucketReplicationStatus { continue } if strings.HasPrefix(strings.ToLower(k), ReservedMetadataPrefixLower) { continue } meta[k] = v } if oi.ContentEncoding != "" { meta[xhttp.ContentEncoding] = oi.ContentEncoding } if oi.ContentType != "" { meta[xhttp.ContentType] = oi.ContentType } tag, err := tags.ParseObjectTags(oi.UserTags) if err != nil { return nil } if tag != nil { meta[xhttp.AmzObjectTagging] = tag.String() meta[xhttp.AmzTagDirective] = "REPLACE" } sc := dest.StorageClass if sc == "" { sc = oi.StorageClass } meta[xhttp.AmzStorageClass] = sc if oi.UserTags != "" { meta[xhttp.AmzObjectTagging] = oi.UserTags } meta[xhttp.MinIOSourceMTime] = oi.ModTime.Format(time.RFC3339Nano) meta[xhttp.MinIOSourceETag] = oi.ETag meta[xhttp.AmzBucketReplicationStatus] = replication.Replica.String() return meta } func putReplicationOpts(ctx context.Context, dest replication.Destination, objInfo ObjectInfo) (putOpts miniogo.PutObjectOptions) { meta := make(map[string]string) for k, v := range objInfo.UserDefined { if k == xhttp.AmzBucketReplicationStatus { continue } if strings.HasPrefix(strings.ToLower(k), ReservedMetadataPrefixLower) { continue } meta[k] = v } tag, err := tags.ParseObjectTags(objInfo.UserTags) if err != nil { return } sc := dest.StorageClass if sc == "" { sc = objInfo.StorageClass } putOpts = miniogo.PutObjectOptions{ UserMetadata: meta, UserTags: tag.ToMap(), ContentType: objInfo.ContentType, ContentEncoding: objInfo.ContentEncoding, StorageClass: sc, Internal: miniogo.AdvancedPutOptions{ SourceVersionID: objInfo.VersionID, ReplicationStatus: miniogo.ReplicationStatusReplica, SourceMTime: objInfo.ModTime, SourceETag: objInfo.ETag, }, } if mode, ok := objInfo.UserDefined[xhttp.AmzObjectLockMode]; ok { rmode := miniogo.RetentionMode(mode) putOpts.Mode = rmode } if retainDateStr, ok := objInfo.UserDefined[xhttp.AmzObjectLockRetainUntilDate]; ok { rdate, err := time.Parse(time.RFC3339Nano, retainDateStr) if err != nil { return } putOpts.RetainUntilDate = rdate } if lhold, ok := objInfo.UserDefined[xhttp.AmzObjectLockLegalHold]; ok { putOpts.LegalHold = miniogo.LegalHoldStatus(lhold) } if crypto.S3.IsEncrypted(objInfo.UserDefined) { putOpts.ServerSideEncryption = encrypt.NewSSE() } return } type replicationAction string const ( replicateMetadata replicationAction = "metadata" replicateNone replicationAction = "none" replicateAll replicationAction = "all" ) // returns replicationAction by comparing metadata between source and target func getReplicationAction(oi1 ObjectInfo, oi2 minio.ObjectInfo) replicationAction { // needs full replication if oi1.ETag != oi2.ETag || oi1.VersionID != oi2.VersionID || oi1.Size != oi2.Size || oi1.DeleteMarker != oi2.IsDeleteMarker { return replicateAll } if !oi1.ModTime.Equal(oi2.LastModified) || oi1.ContentType != oi2.ContentType || oi1.StorageClass != oi2.StorageClass { return replicateMetadata } if oi1.ContentEncoding != "" { enc, ok := oi2.UserMetadata[xhttp.ContentEncoding] if !ok || enc != oi1.ContentEncoding { return replicateMetadata } } for k, v := range oi2.UserMetadata { oi2.Metadata[k] = []string{v} } if len(oi2.Metadata) != len(oi1.UserDefined) { return replicateMetadata } for k1, v1 := range oi1.UserDefined { if v2, ok := oi2.Metadata[k1]; !ok || v1 != strings.Join(v2, "") { return replicateMetadata } } t, _ := tags.MapToObjectTags(oi2.UserTags) if t.String() != oi1.UserTags { return replicateMetadata } return replicateNone } // replicateObject replicates the specified version of the object to destination bucket // The source object is then updated to reflect the replication status. func replicateObject(ctx context.Context, objInfo ObjectInfo, objectAPI ObjectLayer) { bucket := objInfo.Bucket object := objInfo.Name cfg, err := getReplicationConfig(ctx, bucket) if err != nil { logger.LogIf(ctx, err) return } tgt := globalBucketTargetSys.GetRemoteTargetClient(ctx, cfg.RoleArn) if tgt == nil { logger.LogIf(ctx, fmt.Errorf("failed to get target for bucket:%s arn:%s", bucket, cfg.RoleArn)) return } gr, err := objectAPI.GetObjectNInfo(ctx, bucket, object, nil, http.Header{}, readLock, ObjectOptions{ VersionID: objInfo.VersionID, }) if err != nil { return } objInfo = gr.ObjInfo size, err := objInfo.GetActualSize() if err != nil { logger.LogIf(ctx, err) gr.Close() return } dest := cfg.GetDestination() if dest.Bucket == "" { gr.Close() return } rtype := replicateAll oi, err := tgt.StatObject(ctx, dest.Bucket, object, miniogo.StatObjectOptions{VersionID: objInfo.VersionID}) if err == nil { rtype = getReplicationAction(objInfo, oi) if rtype == replicateNone { gr.Close() // object with same VersionID already exists, replication kicked off by // PutObject might have completed. return } } replicationStatus := replication.Complete if rtype != replicateAll { gr.Close() // replicate metadata for object tagging/copy with metadata replacement dstOpts := miniogo.PutObjectOptions{Internal: miniogo.AdvancedPutOptions{SourceVersionID: objInfo.VersionID}} _, err = tgt.CopyObject(ctx, dest.Bucket, object, dest.Bucket, object, getCopyObjMetadata(objInfo, dest), dstOpts) if err != nil { replicationStatus = replication.Failed } } else { target, err := globalBucketMetadataSys.GetBucketTarget(bucket, cfg.RoleArn) if err != nil { logger.LogIf(ctx, fmt.Errorf("failed to get target for replication bucket:%s cfg:%s err:%s", bucket, cfg.RoleArn, err)) gr.Close() return } putOpts := putReplicationOpts(ctx, dest, objInfo) // Setup bandwidth throttling peers, _ := globalEndpoints.peers() totalNodesCount := len(peers) if totalNodesCount == 0 { totalNodesCount = 1 // For standalone erasure coding } b := target.BandwidthLimit / int64(totalNodesCount) var headerSize int for k, v := range putOpts.Header() { headerSize += len(k) + len(v) } // r takes over closing gr. r := bandwidth.NewMonitoredReader(ctx, globalBucketMonitor, objInfo.Bucket, objInfo.Name, gr, headerSize, b, target.BandwidthLimit) _, err = tgt.PutObject(ctx, dest.Bucket, object, r, size, "", "", putOpts) if err != nil { replicationStatus = replication.Failed } r.Close() } objInfo.UserDefined[xhttp.AmzBucketReplicationStatus] = replicationStatus.String() if objInfo.UserTags != "" { objInfo.UserDefined[xhttp.AmzObjectTagging] = objInfo.UserTags } // FIXME: add support for missing replication events // - event.ObjectReplicationNotTracked // - event.ObjectReplicationMissedThreshold // - event.ObjectReplicationReplicatedAfterThreshold var eventName = event.ObjectReplicationComplete if replicationStatus == replication.Failed { eventName = event.ObjectReplicationFailed } sendEvent(eventArgs{ EventName: eventName, BucketName: bucket, Object: objInfo, Host: "Internal: [Replication]", }) objInfo.metadataOnly = true // Perform only metadata updates. if _, err = objectAPI.CopyObject(ctx, bucket, object, bucket, object, objInfo, ObjectOptions{ VersionID: objInfo.VersionID, }, ObjectOptions{ VersionID: objInfo.VersionID, }); err != nil { logger.LogIf(ctx, fmt.Errorf("Unable to update replication metadata for %s: %s", objInfo.VersionID, err)) } } // filterReplicationStatusMetadata filters replication status metadata for COPY func filterReplicationStatusMetadata(metadata map[string]string) map[string]string { // Copy on write dst := metadata var copied bool delKey := func(key string) { if _, ok := metadata[key]; !ok { return } if !copied { dst = make(map[string]string, len(metadata)) for k, v := range metadata { dst[k] = v } copied = true } delete(dst, key) } delKey(xhttp.AmzBucketReplicationStatus) return dst } // DeletedObjectVersionInfo has info on deleted object type DeletedObjectVersionInfo struct { DeletedObject Bucket string } type replicationState struct { // add future metrics here replicaCh chan ObjectInfo replicaDeleteCh chan DeletedObjectVersionInfo } func (r *replicationState) queueReplicaTask(oi ObjectInfo) { if r == nil { return } select { case r.replicaCh <- oi: default: } } func (r *replicationState) queueReplicaDeleteTask(doi DeletedObjectVersionInfo) { if r == nil { return } select { case r.replicaDeleteCh <- doi: default: } } var ( globalReplicationState *replicationState // TODO: currently keeping it conservative // but eventually can be tuned in future, // take only half the CPUs for replication // conservatively. globalReplicationConcurrent = runtime.GOMAXPROCS(0) / 2 ) func newReplicationState() *replicationState { // fix minimum concurrent replication to 1 for single CPU setup if globalReplicationConcurrent == 0 { globalReplicationConcurrent = 1 } rs := &replicationState{ replicaCh: make(chan ObjectInfo, 10000), replicaDeleteCh: make(chan DeletedObjectVersionInfo, 10000), } go func() { <-GlobalContext.Done() close(rs.replicaCh) close(rs.replicaDeleteCh) }() return rs } // addWorker creates a new worker to process tasks func (r *replicationState) addWorker(ctx context.Context, objectAPI ObjectLayer) { // Add a new worker. go func() { for { select { case <-ctx.Done(): return case oi, ok := <-r.replicaCh: if !ok { return } replicateObject(ctx, oi, objectAPI) case doi, ok := <-r.replicaDeleteCh: if !ok { return } replicateDelete(ctx, doi, objectAPI) } } }() } func initBackgroundReplication(ctx context.Context, objectAPI ObjectLayer) { if globalReplicationState == nil { return } // Start with globalReplicationConcurrent. for i := 0; i < globalReplicationConcurrent; i++ { globalReplicationState.addWorker(ctx, objectAPI) } }