diff --git a/.mise.toml b/.mise.toml new file mode 100644 index 00000000..74c75e85 --- /dev/null +++ b/.mise.toml @@ -0,0 +1,6 @@ +[tools] +go = "1.25.1" + +# Keep Go from auto switching toolchains +[env] +GOTOOLCHAIN = "local" \ No newline at end of file diff --git a/RELEASE_VERSION b/RELEASE_VERSION index 34cde569..406ebcbd 100644 --- a/RELEASE_VERSION +++ b/RELEASE_VERSION @@ -1 +1 @@ -3.2.6 +3.2.7 diff --git a/build.sh b/build.sh index 0d79d8b5..f55464ac 100755 --- a/build.sh +++ b/build.sh @@ -74,11 +74,6 @@ function precheck() { fi fi - if [[ -z "$GOPATH" ]]; then - echo "GOPATH not set" - ok=1 - fi - if [[ ! -x "$( which go )" ]]; then echo "go binary not found in PATH" ok=1 @@ -214,18 +209,18 @@ package_linux() { [ $do_tar -eq 1 ] && tar -C $build_path/orchestrator -czf $release_base_path/orchestrator-"${RELEASE_VERSION}"-$target-$arch.tar.gz ./ debug "Creating Distro full packages" - [ $do_rpm -eq 1 ] && fpm -v "${RELEASE_VERSION}" --epoch 1 -f -s dir -n orchestrator -m shlomi-noach --description "MySQL replication topology management and HA" --url "https://github.com/openark/orchestrator" --vendor "GitHub" --license "Apache 2.0" -C $build_path/orchestrator --prefix=/ --config-files /usr/local/orchestrator/resources/public/css/custom.css --config-files /usr/local/orchestrator/resources/public/js/custom.js --depends 'jq >= 1.5' -t rpm . - [ $do_deb -eq 1 ] && fpm -v "${RELEASE_VERSION}" --epoch 1 -f -s dir -n orchestrator -m shlomi-noach --description "MySQL replication topology management and HA" --url "https://github.com/openark/orchestrator" --vendor "GitHub" --license "Apache 2.0" -C $build_path/orchestrator --prefix=/ --config-files /usr/local/orchestrator/resources/public/css/custom.css --config-files /usr/local/orchestrator/resources/public/js/custom.js --depends 'jq >= 1.5' -t deb --deb-no-default-config-files . + [ $do_rpm -eq 1 ] && fpm -a "$arch" -v "${RELEASE_VERSION}" --epoch 1 -f -s dir -n orchestrator -m shlomi-noach --description "MySQL replication topology management and HA" --url "https://github.com/openark/orchestrator" --vendor "GitHub" --license "Apache 2.0" -C $build_path/orchestrator --prefix=/ --config-files /usr/local/orchestrator/resources/public/css/custom.css --config-files /usr/local/orchestrator/resources/public/js/custom.js --depends 'jq >= 1.5' -t rpm . + [ $do_deb -eq 1 ] && fpm -a "$arch" -v "${RELEASE_VERSION}" --epoch 1 -f -s dir -n orchestrator -m shlomi-noach --description "MySQL replication topology management and HA" --url "https://github.com/openark/orchestrator" --vendor "GitHub" --license "Apache 2.0" -C $build_path/orchestrator --prefix=/ --config-files /usr/local/orchestrator/resources/public/css/custom.css --config-files /usr/local/orchestrator/resources/public/js/custom.js --depends 'jq >= 1.5' -t deb --deb-no-default-config-files . debug "Creating Distro cli packages" # orchestrator-cli packaging -- executable only - [ $do_rpm -eq 1 ] && fpm -v "${RELEASE_VERSION}" --epoch 1 -f -s dir -n orchestrator-cli -m shlomi-noach --description "MySQL replication topology management and HA: binary only" --url "https://github.com/openark/orchestrator" --vendor "GitHub" --license "Apache 2.0" -C $build_path/orchestrator-cli --prefix=/ --depends 'jq >= 1.5' -t rpm . 
- [ $do_deb -eq 1 ] && fpm -v "${RELEASE_VERSION}" --epoch 1 -f -s dir -n orchestrator-cli -m shlomi-noach --description "MySQL replication topology management and HA: binary only" --url "https://github.com/openark/orchestrator" --vendor "GitHub" --license "Apache 2.0" -C $build_path/orchestrator-cli --prefix=/ --depends 'jq >= 1.5' -t deb --deb-no-default-config-files . + [ $do_rpm -eq 1 ] && fpm -a "$arch" -v "${RELEASE_VERSION}" --epoch 1 -f -s dir -n orchestrator-cli -m shlomi-noach --description "MySQL replication topology management and HA: binary only" --url "https://github.com/openark/orchestrator" --vendor "GitHub" --license "Apache 2.0" -C $build_path/orchestrator-cli --prefix=/ --depends 'jq >= 1.5' -t rpm . + [ $do_deb -eq 1 ] && fpm -a "$arch" -v "${RELEASE_VERSION}" --epoch 1 -f -s dir -n orchestrator-cli -m shlomi-noach --description "MySQL replication topology management and HA: binary only" --url "https://github.com/openark/orchestrator" --vendor "GitHub" --license "Apache 2.0" -C $build_path/orchestrator-cli --prefix=/ --depends 'jq >= 1.5' -t deb --deb-no-default-config-files . debug "Creating Distro orchestrator-client packages" # orchestrator-client packaging -- shell script only - [ $do_rpm -eq 1 ] && fpm -v "${RELEASE_VERSION}" --epoch 1 -f -s dir -n orchestrator-client -m shlomi-noach --description "MySQL replication topology management and HA: client script" --url "https://github.com/openark/orchestrator" --vendor "GitHub" --license "Apache 2.0" -C $build_path/orchestrator-client --prefix=/ --depends 'jq >= 1.5' -t rpm . - [ $do_deb -eq 1 ] && fpm -v "${RELEASE_VERSION}" --epoch 1 -f -s dir -n orchestrator-client -m shlomi-noach --description "MySQL replication topology management and HA: client script" --url "https://github.com/openark/orchestrator" --vendor "GitHub" --license "Apache 2.0" -C $build_path/orchestrator-client --prefix=/ --depends 'jq >= 1.5' -t deb --deb-no-default-config-files . + [ $do_rpm -eq 1 ] && fpm -a "$arch" -v "${RELEASE_VERSION}" --epoch 1 -f -s dir -n orchestrator-client -m shlomi-noach --description "MySQL replication topology management and HA: client script" --url "https://github.com/openark/orchestrator" --vendor "GitHub" --license "Apache 2.0" -C $build_path/orchestrator-client --prefix=/ --depends 'jq >= 1.5' -t rpm . + [ $do_deb -eq 1 ] && fpm -a "$arch" -v "${RELEASE_VERSION}" --epoch 1 -f -s dir -n orchestrator-client -m shlomi-noach --description "MySQL replication topology management and HA: client script" --url "https://github.com/openark/orchestrator" --vendor "GitHub" --license "Apache 2.0" -C $build_path/orchestrator-client --prefix=/ --depends 'jq >= 1.5' -t deb --deb-no-default-config-files . if [ -n "$package_name_extra" ] ; then # Strip version core out of sting like "3.2.6-pre123+g1234567" to "3.2.6". 
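A note on the build changes above: `.mise.toml` pins Go 1.25.1 and sets `GOTOOLCHAIN=local`, which stops the `go` command from auto-downloading and switching to a different toolchain than the one mise installed; the `GOPATH` precheck is dropped because module-mode builds no longer need it, and the added `-a "$arch"` makes fpm stamp the rpm/deb packages with the target architecture rather than the build host's. The snippet below is a hypothetical sanity check, not part of this diff, of the kind a CI step could run to confirm the pinned toolchain is the one in effect:

```go
// toolchain_check.go — hypothetical helper, not part of this PR.
// With GOTOOLCHAIN=local the go command never auto-switches toolchains,
// so the reported runtime version should match the mise-pinned 1.25.x.
package main

import (
	"fmt"
	"runtime"
	"strings"
)

func main() {
	v := runtime.Version() // e.g. "go1.25.1"
	fmt.Println("building with", v)
	if !strings.HasPrefix(v, "go1.25") {
		panic("unexpected Go toolchain: " + v)
	}
}
```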
diff --git a/go/app/cli.go b/go/app/cli.go index 71f3ecbb..c6812308 100644 --- a/go/app/cli.go +++ b/go/app/cli.go @@ -267,7 +267,7 @@ func Cli(command string, strict bool, instance string, destination string, owner } validateInstanceIsFound(instanceKey) - lostReplicas, equalReplicas, aheadReplicas, cannotReplicateReplicas, promotedReplica, err := inst.RegroupReplicas(instanceKey, false, func(candidateReplica *inst.Instance) { fmt.Println(candidateReplica.Key.DisplayString()) }, postponedFunctionsContainer) + lostReplicas, equalReplicas, aheadReplicas, cannotReplicateReplicas, promotedReplica, err := inst.RegroupReplicas(instanceKey, false, true, func(candidateReplica *inst.Instance) { fmt.Println(candidateReplica.Key.DisplayString()) }, postponedFunctionsContainer) lostReplicas = append(lostReplicas, cannotReplicateReplicas...) postponedFunctionsContainer.Wait() @@ -387,7 +387,7 @@ func Cli(command string, strict bool, instance string, destination string, owner log.Fatal("Cannot deduce instance:", instance) } - instance, _, _, _, _, err := inst.GetCandidateReplica(instanceKey, false) + instance, _, _, _, _, err := inst.GetCandidateReplica(instanceKey, false, false) if err != nil { log.Fatale(err) } else { @@ -450,7 +450,7 @@ func Cli(command string, strict bool, instance string, destination string, owner } validateInstanceIsFound(instanceKey) - lostReplicas, movedReplicas, cannotReplicateReplicas, promotedReplica, err := inst.RegroupReplicasGTID(instanceKey, false, true, func(candidateReplica *inst.Instance) { fmt.Println(candidateReplica.Key.DisplayString()) }, postponedFunctionsContainer, nil) + lostReplicas, movedReplicas, cannotReplicateReplicas, promotedReplica, err := inst.RegroupReplicasGTID(instanceKey, false, true, true, func(candidateReplica *inst.Instance) { fmt.Println(candidateReplica.Key.DisplayString()) }, postponedFunctionsContainer, nil) lostReplicas = append(lostReplicas, cannotReplicateReplicas...) if promotedReplica == nil { @@ -1474,7 +1474,7 @@ func Cli(command string, strict bool, instance string, destination string, owner log.Fatal("Cannot deduce instance:", instance) } - recoveryAttempted, promotedInstanceKey, err := logic.CheckAndRecover(instanceKey, destinationKey, (command == "recover-lite")) + recoveryAttempted, promotedInstanceKey, err := logic.CheckAndRecover(instanceKey, destinationKey, (command == "recover-lite"), true) if err != nil { log.Fatale(err) } diff --git a/go/config/config.go b/go/config/config.go index dbd5567a..58eff5fc 100644 --- a/go/config/config.go +++ b/go/config/config.go @@ -232,6 +232,7 @@ type Configuration struct { SkipBinlogEventsContaining []string // When scanning/comparing binlogs for Pseudo-GTID, skip entries containing given texts. These are NOT regular expressions (would consume too much CPU while scanning binlogs), just substrings to find. ReduceReplicationAnalysisCount bool // When true, replication analysis will only report instances where possibility of handled problems is possible in the first place (e.g. will not report most leaf nodes, that are mostly uninteresting). When false, provides an entry for every known instance FailureDetectionPeriodBlockMinutes int // The time for which an instance's failure discovery is kept "active", so as to avoid concurrent "discoveries" of the instance's failure; this precedes any recovery process, if any. + RecoveryBlockCrossDatacenterFailovers bool // When true, recovery of a master in one datacenter will not result in failover to a replica in another datacenter. 
Such failovers must be triggered manually via the UI or API RecoveryPeriodBlockMinutes int // (supported for backwards compatibility but please use newer `RecoveryPeriodBlockSeconds` instead) The time for which an instance's recovery is kept "active", so as to avoid concurrent recoveries on same instance as well as flapping RecoveryPeriodBlockSeconds int // (overrides `RecoveryPeriodBlockMinutes`) The time for which an instance's recovery is kept "active", so as to avoid concurrent recoveries on same instance as well as flapping RecoveryIgnoreHostnameFilters []string // Recovery analysis will completely ignore hosts matching given patterns diff --git a/go/http/api.go b/go/http/api.go index e329705e..008146de 100644 --- a/go/http/api.go +++ b/go/http/api.go @@ -1159,7 +1159,7 @@ func (this *HttpAPI) RegroupReplicas(params martini.Params, r render.Render, req return } - lostReplicas, equalReplicas, aheadReplicas, cannotReplicateReplicas, promotedReplica, err := inst.RegroupReplicas(&instanceKey, false, nil, nil) + lostReplicas, equalReplicas, aheadReplicas, cannotReplicateReplicas, promotedReplica, err := inst.RegroupReplicas(&instanceKey, false, true, nil, nil) lostReplicas = append(lostReplicas, cannotReplicateReplicas...) if err != nil { Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) @@ -1207,7 +1207,7 @@ func (this *HttpAPI) RegroupReplicasGTID(params martini.Params, r render.Render, return } - lostReplicas, movedReplicas, cannotReplicateReplicas, promotedReplica, err := inst.RegroupReplicasGTID(&instanceKey, false, true, nil, nil, nil) + lostReplicas, movedReplicas, cannotReplicateReplicas, promotedReplica, err := inst.RegroupReplicasGTID(&instanceKey, false, true, true, nil, nil, nil) lostReplicas = append(lostReplicas, cannotReplicateReplicas...) 
if err != nil { @@ -3237,7 +3237,7 @@ func (this *HttpAPI) Recover(params martini.Params, r render.Render, req *http.R } skipProcesses := (req.URL.Query().Get("skipProcesses") == "true") || (params["skipProcesses"] == "true") - recoveryAttempted, promotedInstanceKey, err := logic.CheckAndRecover(&instanceKey, candidateKey, skipProcesses) + recoveryAttempted, promotedInstanceKey, err := logic.CheckAndRecover(&instanceKey, candidateKey, skipProcesses, true) if err != nil { Respond(r, &APIResponse{Code: ERROR, Message: err.Error(), Details: instanceKey}) return diff --git a/go/inst/analysis_dao.go b/go/inst/analysis_dao.go index e7b85a20..5c144777 100644 --- a/go/inst/analysis_dao.go +++ b/go/inst/analysis_dao.go @@ -500,7 +500,7 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) a.ClusterDetails.ClusterName, a.IsMaster, a.LastCheckValid, a.LastCheckPartialSuccess, a.CountReplicas, a.CountValidReplicas, a.CountValidReplicatingReplicas, a.CountLaggingReplicas, a.CountDelayedReplicas, a.CountReplicasFailingToConnectToMaster, ) if util.ClearToLog("analysis_dao", analysisMessage) { - log.Debugf(analysisMessage) + log.Debug(analysisMessage) } } if !a.IsReplicationGroupMember /* Traditional Async/Semi-sync replication issue detection */ { diff --git a/go/inst/audit_dao.go b/go/inst/audit_dao.go index 629b6e8d..13c65c1f 100644 --- a/go/inst/audit_dao.go +++ b/go/inst/audit_dao.go @@ -101,7 +101,7 @@ func AuditOperation(auditType string, instanceKey *InstanceKey, message string) }() } if !auditWrittenToFile { - log.Infof(logMessage) + log.Info(logMessage) } auditOperationCounter.Inc(1) diff --git a/go/inst/instance_dao.go b/go/inst/instance_dao.go index 11745d2b..27020566 100644 --- a/go/inst/instance_dao.go +++ b/go/inst/instance_dao.go @@ -219,7 +219,7 @@ func logReadTopologyInstanceError(instanceKey *InstanceKey, hint string, err err strings.Replace(hint, "%", "%%", -1), // escape % err) } - return log.Errorf(msg) + return log.Error(msg) } // ReadTopologyInstance collects information on the state of a MySQL diff --git a/go/inst/instance_topology.go b/go/inst/instance_topology.go index 91b78ece..eea6421f 100644 --- a/go/inst/instance_topology.go +++ b/go/inst/instance_topology.go @@ -269,7 +269,7 @@ Cleanup: if err == nil { message := fmt.Sprintf("moved %+v via equivalence coordinates below %+v", *instanceKey, *otherKey) - log.Debugf(message) + log.Debug(message) AuditOperation("move-equivalent", instanceKey, message) } return instance, err @@ -2264,7 +2264,7 @@ func chooseCandidateReplica(replicas [](*Instance)) (candidateReplica *Instance, } // GetCandidateReplica chooses the best replica to promote given a (possibly dead) master -func GetCandidateReplica(masterKey *InstanceKey, forRematchPurposes bool) (*Instance, [](*Instance), [](*Instance), [](*Instance), [](*Instance), error) { +func GetCandidateReplica(masterKey *InstanceKey, forRematchPurposes bool, isGraceful bool) (*Instance, [](*Instance), [](*Instance), [](*Instance), [](*Instance), error) { var candidateReplica *Instance aheadReplicas := [](*Instance){} equalReplicas := [](*Instance){} @@ -2292,6 +2292,12 @@ func GetCandidateReplica(masterKey *InstanceKey, forRematchPurposes bool) (*Inst return candidateReplica, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err } if candidateReplica != nil { + AuditOperation("get-candidate-replica", masterKey, fmt.Sprintf("Graceful: %v, should block cross DC failovers: %v in DC: %v", isGraceful, 
config.Config.RecoveryBlockCrossDatacenterFailovers, dataCenterHint)) + // In automatic failover cases, respect cross-datacenter failover configuration + if !isGraceful && config.Config.RecoveryBlockCrossDatacenterFailovers && dataCenterHint != "" && candidateReplica.DataCenter != dataCenterHint { + AuditOperation("get-candidate-replica", masterKey, fmt.Sprintf("Candidate replica %+v is in different data center (%v) than master %+v (%v), automatic failover blocked", candidateReplica.Key, candidateReplica.DataCenter, *masterKey, dataCenterHint)) + return nil, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, fmt.Errorf("candidate replica %+v is in different data center (%v) than master %+v (%v), automatic failover blocked", candidateReplica.Key, candidateReplica.DataCenter, *masterKey, dataCenterHint) + } mostUpToDateReplica := replicas[0] if candidateReplica.ExecBinlogCoordinates.SmallerThan(&mostUpToDateReplica.ExecBinlogCoordinates) { log.Warningf("GetCandidateReplica: chosen replica: %+v is behind most-up-to-date replica: %+v", candidateReplica.Key, mostUpToDateReplica.Key) @@ -2344,7 +2350,7 @@ func RegroupReplicasPseudoGTID( candidateReplica *Instance, err error, ) { - candidateReplica, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err = GetCandidateReplica(masterKey, true) + candidateReplica, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err = GetCandidateReplica(masterKey, true, false) if err != nil { if !returnReplicaEvenOnFailureToRegroup { candidateReplica = nil @@ -2461,7 +2467,7 @@ func RegroupReplicasPseudoGTIDIncludingSubReplicasOfBinlogServers( log.Debugf("RegroupReplicasIncludingSubReplicasOfBinlogServers: most up to date binlog server of %+v: %+v", *masterKey, mostUpToDateBinlogServer.Key) // Find the most up to date candidate replica: - candidateReplica, _, _, _, _, err := GetCandidateReplica(masterKey, true) + candidateReplica, _, _, _, _, err := GetCandidateReplica(masterKey, true, false) if err != nil { return log.Errore(err) } @@ -2515,6 +2521,7 @@ func RegroupReplicasGTID( masterKey *InstanceKey, returnReplicaEvenOnFailureToRegroup bool, startReplicationOnCandidate bool, + isGraceful bool, onCandidateReplicaChosen func(*Instance), postponedFunctionsContainer *PostponedFunctionsContainer, postponeAllMatchOperations func(*Instance, bool) bool, @@ -2527,7 +2534,7 @@ func RegroupReplicasGTID( ) { var emptyReplicas [](*Instance) var unmovedReplicas [](*Instance) - candidateReplica, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err := GetCandidateReplica(masterKey, true) + candidateReplica, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err := GetCandidateReplica(masterKey, true, isGraceful) if err != nil { if !returnReplicaEvenOnFailureToRegroup { candidateReplica = nil @@ -2597,7 +2604,7 @@ func RegroupReplicasBinlogServers(masterKey *InstanceKey, returnReplicaEvenOnFai // RegroupReplicas is a "smart" method of promoting one replica over the others ("promoting" it on top of its siblings) // This method decides which strategy to use: GTID, Pseudo-GTID, Binlog Servers. 
-func RegroupReplicas(masterKey *InstanceKey, returnReplicaEvenOnFailureToRegroup bool, +func RegroupReplicas(masterKey *InstanceKey, returnReplicaEvenOnFailureToRegroup bool, isGraceful bool, onCandidateReplicaChosen func(*Instance), postponedFunctionsContainer *PostponedFunctionsContainer) ( @@ -2637,7 +2644,7 @@ func RegroupReplicas(masterKey *InstanceKey, returnReplicaEvenOnFailureToRegroup } if allGTID { log.Debugf("RegroupReplicas: using GTID to regroup replicas of %+v", *masterKey) - unmovedReplicas, movedReplicas, cannotReplicateReplicas, candidateReplica, err := RegroupReplicasGTID(masterKey, returnReplicaEvenOnFailureToRegroup, true, onCandidateReplicaChosen, nil, nil) + unmovedReplicas, movedReplicas, cannotReplicateReplicas, candidateReplica, err := RegroupReplicasGTID(masterKey, returnReplicaEvenOnFailureToRegroup, true, isGraceful, onCandidateReplicaChosen, nil, nil) return unmovedReplicas, emptyReplicas, movedReplicas, cannotReplicateReplicas, candidateReplica, err } if allBinlogServers { diff --git a/go/inst/instance_topology_test.go b/go/inst/instance_topology_test.go index 615dd37b..3bacd925 100644 --- a/go/inst/instance_topology_test.go +++ b/go/inst/instance_topology_test.go @@ -3,10 +3,11 @@ package inst import ( "math/rand" + "testing" + "github.com/openark/golib/log" test "github.com/openark/golib/tests" "github.com/openark/orchestrator/go/config" - "testing" ) var ( diff --git a/go/logic/orchestrator.go b/go/logic/orchestrator.go index bfa1afaa..856ee7ed 100644 --- a/go/logic/orchestrator.go +++ b/go/logic/orchestrator.go @@ -681,7 +681,7 @@ func ContinuousDiscovery() { return } if runCheckAndRecoverOperationsTimeRipe() { - CheckAndRecover(nil, nil, false) + CheckAndRecover(nil, nil, false, false) } else { log.Debugf("Waiting for %+v seconds to pass before running failure detection/recovery", checkAndRecoverWaitPeriod.Seconds()) } diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index adb281f6..7a7a94ba 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -512,7 +512,7 @@ func GetMasterRecoveryType(analysisEntry *inst.ReplicationAnalysis) (masterRecov } // recoverDeadMaster recovers a dead master, complete logic inside -func recoverDeadMaster(topologyRecovery *TopologyRecovery, candidateInstanceKey *inst.InstanceKey, skipProcesses bool) (recoveryAttempted bool, promotedReplica *inst.Instance, lostReplicas [](*inst.Instance), err error) { +func recoverDeadMaster(topologyRecovery *TopologyRecovery, candidateInstanceKey *inst.InstanceKey, skipProcesses bool, isGraceful bool) (recoveryAttempted bool, promotedReplica *inst.Instance, lostReplicas [](*inst.Instance), err error) { topologyRecovery.Type = MasterRecovery analysisEntry := &topologyRecovery.AnalysisEntry failedInstanceKey := &analysisEntry.AnalyzedInstanceKey @@ -554,7 +554,7 @@ func recoverDeadMaster(topologyRecovery *TopologyRecovery, candidateInstanceKey case MasterRecoveryGTID: { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("RecoverDeadMaster: regrouping replicas via GTID")) - lostReplicas, _, cannotReplicateReplicas, promotedReplica, err = inst.RegroupReplicasGTID(failedInstanceKey, true, false, nil, &topologyRecovery.PostponedFunctionsContainer, promotedReplicaIsIdeal) + lostReplicas, _, cannotReplicateReplicas, promotedReplica, err = inst.RegroupReplicasGTID(failedInstanceKey, true, false, isGraceful, nil, &topologyRecovery.PostponedFunctionsContainer, promotedReplicaIsIdeal) } case MasterRecoveryPseudoGTID: { @@ -839,7 +839,7 @@ func 
replacePromotedReplicaWithCandidate(topologyRecovery *TopologyRecovery, dea // checkAndRecoverDeadMaster checks a given analysis, decides whether to take action, and possibly takes action // Returns true when action was taken. -func checkAndRecoverDeadMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func checkAndRecoverDeadMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool, isGraceful bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { if !(forceInstanceRecovery || analysisEntry.ClusterDetails.HasAutomatedMasterRecovery) { return false, nil, nil } @@ -852,7 +852,7 @@ func checkAndRecoverDeadMaster(analysisEntry inst.ReplicationAnalysis, candidate // That's it! We must do recovery! AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("will handle DeadMaster event on %+v", analysisEntry.ClusterDetails.ClusterName)) recoverDeadMasterCounter.Inc(1) - recoveryAttempted, promotedReplica, lostReplicas, err := recoverDeadMaster(topologyRecovery, candidateInstanceKey, skipProcesses) + recoveryAttempted, promotedReplica, lostReplicas, err := recoverDeadMaster(topologyRecovery, candidateInstanceKey, skipProcesses, isGraceful) if err != nil { AuditTopologyRecovery(topologyRecovery, err.Error()) } @@ -1118,7 +1118,7 @@ func GetCandidateSiblingOfIntermediateMaster(topologyRecovery *TopologyRecovery, } // RecoverDeadIntermediateMaster performs intermediate master recovery; complete logic inside -func RecoverDeadIntermediateMaster(topologyRecovery *TopologyRecovery, skipProcesses bool) (successorInstance *inst.Instance, err error) { +func RecoverDeadIntermediateMaster(topologyRecovery *TopologyRecovery, skipProcesses bool, isGraceful bool) (successorInstance *inst.Instance, err error) { topologyRecovery.Type = IntermediateMasterRecovery analysisEntry := &topologyRecovery.AnalysisEntry failedInstanceKey := &analysisEntry.AnalyzedInstanceKey @@ -1169,7 +1169,7 @@ func RecoverDeadIntermediateMaster(topologyRecovery *TopologyRecovery, skipProce if !recoveryResolved { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadIntermediateMaster: will next attempt regrouping of replicas")) // Plan B: regroup (we wish to reduce cross-DC replication streams) - lostReplicas, _, _, _, regroupPromotedReplica, regroupError := inst.RegroupReplicas(failedInstanceKey, true, nil, nil) + lostReplicas, _, _, _, regroupPromotedReplica, regroupError := inst.RegroupReplicas(failedInstanceKey, true, isGraceful, nil, nil) if regroupError != nil { topologyRecovery.AddError(regroupError) AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadIntermediateMaster: regroup failed on: %+v", regroupError)) @@ -1223,7 +1223,7 @@ func RecoverDeadIntermediateMaster(topologyRecovery *TopologyRecovery, skipProce // RecoverDeadReplicationGroupMemberWithReplicas performs dead group member recovery. It does so by finding members of // the same replication group of the one of the failed instance, picking a random one and relocating replicas to it. 
-func RecoverDeadReplicationGroupMemberWithReplicas(topologyRecovery *TopologyRecovery, skipProcesses bool) (successorInstance *inst.Instance, err error) { +func RecoverDeadReplicationGroupMemberWithReplicas(topologyRecovery *TopologyRecovery, skipProcesses bool, isGraceful bool) (successorInstance *inst.Instance, err error) { topologyRecovery.Type = ReplicationGroupMemberRecovery analysisEntry := &topologyRecovery.AnalysisEntry failedGroupMemberInstanceKey := &analysisEntry.AnalyzedInstanceKey @@ -1259,7 +1259,7 @@ func RecoverDeadReplicationGroupMemberWithReplicas(topologyRecovery *TopologyRec // checkAndRecoverDeadIntermediateMaster checks a given analysis, decides whether to take action, and possibly takes action // Returns true when action was taken. -func checkAndRecoverDeadIntermediateMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) { +func checkAndRecoverDeadIntermediateMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool, isGraceful bool) (bool, *TopologyRecovery, error) { if !(forceInstanceRecovery || analysisEntry.ClusterDetails.HasAutomatedIntermediateMasterRecovery) { return false, nil, nil } @@ -1271,7 +1271,7 @@ func checkAndRecoverDeadIntermediateMaster(analysisEntry inst.ReplicationAnalysi // That's it! We must do recovery! recoverDeadIntermediateMasterCounter.Inc(1) - promotedReplica, err := RecoverDeadIntermediateMaster(topologyRecovery, skipProcesses) + promotedReplica, err := RecoverDeadIntermediateMaster(topologyRecovery, skipProcesses, isGraceful) if promotedReplica != nil { // success recoverDeadIntermediateMasterSuccessCounter.Inc(1) @@ -1289,7 +1289,7 @@ func checkAndRecoverDeadIntermediateMaster(analysisEntry inst.ReplicationAnalysi } // RecoverDeadCoMaster recovers a dead co-master, complete logic inside -func RecoverDeadCoMaster(topologyRecovery *TopologyRecovery, skipProcesses bool) (promotedReplica *inst.Instance, lostReplicas [](*inst.Instance), err error) { +func RecoverDeadCoMaster(topologyRecovery *TopologyRecovery, skipProcesses bool, isGraceful bool) (promotedReplica *inst.Instance, lostReplicas [](*inst.Instance), err error) { topologyRecovery.Type = CoMasterRecovery analysisEntry := &topologyRecovery.AnalysisEntry failedInstanceKey := &analysisEntry.AnalyzedInstanceKey @@ -1318,7 +1318,7 @@ func RecoverDeadCoMaster(topologyRecovery *TopologyRecovery, skipProcesses bool) switch coMasterRecoveryType { case MasterRecoveryGTID: { - lostReplicas, _, cannotReplicateReplicas, promotedReplica, err = inst.RegroupReplicasGTID(failedInstanceKey, true, false, nil, &topologyRecovery.PostponedFunctionsContainer, nil) + lostReplicas, _, cannotReplicateReplicas, promotedReplica, err = inst.RegroupReplicasGTID(failedInstanceKey, true, false, isGraceful, nil, &topologyRecovery.PostponedFunctionsContainer, nil) } case MasterRecoveryPseudoGTID: { @@ -1409,7 +1409,7 @@ func RecoverDeadCoMaster(topologyRecovery *TopologyRecovery, skipProcesses bool) // checkAndRecoverDeadCoMaster checks a given analysis, decides whether to take action, and possibly takes action // Returns true when action was taken. 
-func checkAndRecoverDeadCoMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) { +func checkAndRecoverDeadCoMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool, isGraceful bool) (bool, *TopologyRecovery, error) { failedInstanceKey := &analysisEntry.AnalyzedInstanceKey if !(forceInstanceRecovery || analysisEntry.ClusterDetails.HasAutomatedMasterRecovery) { return false, nil, nil @@ -1422,7 +1422,7 @@ func checkAndRecoverDeadCoMaster(analysisEntry inst.ReplicationAnalysis, candida // That's it! We must do recovery! recoverDeadCoMasterCounter.Inc(1) - promotedReplica, lostReplicas, err := RecoverDeadCoMaster(topologyRecovery, skipProcesses) + promotedReplica, lostReplicas, err := RecoverDeadCoMaster(topologyRecovery, skipProcesses, isGraceful) resolveRecovery(topologyRecovery, promotedReplica) if promotedReplica == nil { inst.AuditOperation("recover-dead-co-master", failedInstanceKey, "Failure: no replica promoted.") @@ -1455,7 +1455,7 @@ func checkAndRecoverDeadCoMaster(analysisEntry inst.ReplicationAnalysis, candida // checkAndRecoverNonWriteableMaster attempts to recover from a read only master by turning it writeable. // This behavior is feature protected, see config.Config.RecoverNonWriteableMaster -func checkAndRecoverNonWriteableMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func checkAndRecoverNonWriteableMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool, isGraceful bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { if !config.Config.RecoverNonWriteableMaster { return false, nil, nil } @@ -1481,7 +1481,7 @@ func checkAndRecoverNonWriteableMaster(analysisEntry inst.ReplicationAnalysis, c } // checkAndRecoverLockedSemiSyncMaster -func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool, isGraceful bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, true, true) if topologyRecovery == nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. 
Will not issue another RecoverLockedSemiSyncMaster.", analysisEntry.AnalyzedInstanceKey)) @@ -1498,7 +1498,7 @@ func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, } // checkAndRecoverMasterWithTooManySemiSyncReplicas registers and performs a recovery for MasterWithTooManySemiSyncReplicas -func checkAndRecoverMasterWithTooManySemiSyncReplicas(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func checkAndRecoverMasterWithTooManySemiSyncReplicas(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool, isGraceful bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, true, true) if topologyRecovery == nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another RecoverMasterWithTooManySemiSyncReplicas.", analysisEntry.AnalyzedInstanceKey)) @@ -1551,7 +1551,7 @@ func recoverSemiSyncReplicas(topologyRecovery *TopologyRecovery, analysisEntry i } // checkAndRecoverGenericProblem is a general-purpose recovery function -func checkAndRecoverGenericProblem(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) { +func checkAndRecoverGenericProblem(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool, isGraceful bool) (bool, *TopologyRecovery, error) { return false, nil, nil } @@ -1561,7 +1561,7 @@ func checkAndRecoverGenericProblem(analysisEntry inst.ReplicationAnalysis, candi // members are akin to intermediate masters. Considering also that a failed group member can always be considered as a // secondary (even if it was primary, the group should have detected its failure and elected a new primary), then // failure of a group member with replicas is akin to failure of an intermediate master. -func checkAndRecoverDeadGroupMemberWithReplicas(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) { +func checkAndRecoverDeadGroupMemberWithReplicas(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool, isGraceful bool) (bool, *TopologyRecovery, error) { // Don't proceed with recovery unless it was forced or automatic intermediate source recovery is enabled. // We consider failed group members akin to failed intermediate masters, so we re-use the configuration for // intermediates. 
@@ -1576,7 +1576,7 @@ func checkAndRecoverDeadGroupMemberWithReplicas(analysisEntry inst.ReplicationAn // Proceed with recovery recoverDeadReplicationGroupMemberCounter.Inc(1) - recoveredToGroupMember, err := RecoverDeadReplicationGroupMemberWithReplicas(topologyRecovery, skipProcesses) + recoveredToGroupMember, err := RecoverDeadReplicationGroupMemberWithReplicas(topologyRecovery, skipProcesses, isGraceful) if recoveredToGroupMember != nil { // success @@ -1703,7 +1703,7 @@ func checkAndExecuteFailureDetectionProcesses(analysisEntry inst.ReplicationAnal } func getCheckAndRecoverFunction(analysisCode inst.AnalysisCode, analyzedInstanceKey *inst.InstanceKey) ( - checkAndRecoverFunction func(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error), + checkAndRecoverFunction func(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool, isGraceful bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error), isActionableRecovery bool, ) { switch analysisCode { @@ -1793,7 +1793,7 @@ func runEmergentOperations(analysisEntry *inst.ReplicationAnalysis, allowInstanc // executeCheckAndRecoverFunction will choose the correct check & recovery function based on analysis. // It executes the function synchronuously -func executeCheckAndRecoverFunction(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func executeCheckAndRecoverFunction(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool, isGraceful bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { atomic.AddInt64(&countPendingRecoveries, 1) defer atomic.AddInt64(&countPendingRecoveries, -1) @@ -1870,7 +1870,7 @@ func executeCheckAndRecoverFunction(analysisEntry inst.ReplicationAnalysis, cand if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: recovery", analysisEntry.AnalyzedInstanceKey.StringCode()) { log.Infof("executeCheckAndRecoverFunction: proceeding with %+v recovery on %+v; isRecoverable?: %+v; skipProcesses: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, isActionableRecovery, skipProcesses) } - recoveryAttempted, topologyRecovery, err = checkAndRecoverFunction(analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) + recoveryAttempted, topologyRecovery, err = checkAndRecoverFunction(analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses, isGraceful) if !recoveryAttempted { return recoveryAttempted, topologyRecovery, err } @@ -1902,7 +1902,8 @@ func executeCheckAndRecoverFunction(analysisEntry inst.ReplicationAnalysis, cand } // CheckAndRecover is the main entry point for the recovery mechanism -func CheckAndRecover(specificInstance *inst.InstanceKey, candidateInstanceKey *inst.InstanceKey, skipProcesses bool) (recoveryAttempted bool, promotedReplicaKey *inst.InstanceKey, err error) { +// isGraceful is true when called in response to a user request of some sort (as opposed to being triggered automatically) +func CheckAndRecover(specificInstance *inst.InstanceKey, candidateInstanceKey *inst.InstanceKey, skipProcesses bool, isGraceful bool) (recoveryAttempted bool, promotedReplicaKey 
*inst.InstanceKey, err error) { // Allow the analysis to run even if we don't want to recover replicationAnalysis, err := inst.GetReplicationAnalysis("", &inst.ReplicationAnalysisHints{IncludeDowntimed: true, AuditAnalysis: true}) if err != nil { @@ -1929,14 +1930,14 @@ func CheckAndRecover(specificInstance *inst.InstanceKey, candidateInstanceKey *i if specificInstance != nil { // force mode. Keep it synchronous var topologyRecovery *TopologyRecovery - recoveryAttempted, topologyRecovery, err = executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, true, skipProcesses) + recoveryAttempted, topologyRecovery, err = executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, true, skipProcesses, isGraceful) log.Errore(err) if topologyRecovery != nil { promotedReplicaKey = topologyRecovery.SuccessorKey } } else { go func() { - _, _, err := executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, false, skipProcesses) + _, _, err := executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, false, skipProcesses, isGraceful) log.Errore(err) }() } @@ -1972,7 +1973,7 @@ func forceAnalysisEntry(clusterName string, analysisCode inst.AnalysisCode, comm // The caller of this function injects the type of analysis it wishes the function to assume. // By calling this function one takes responsibility for one's actions. func ForceExecuteRecovery(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { - return executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, true, skipProcesses) + return executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, true, skipProcesses, true) } // ForceMasterFailover *trusts* master of given cluster is dead and initiates a failover @@ -2055,7 +2056,7 @@ func getGracefulMasterTakeoverDesignatedInstance(clusterMasterKey *inst.Instance return nil, fmt.Errorf("GracefulMasterTakeover: target instance not indicated, auto=false, and master %+v has %+v replicas. orchestrator cannot choose where to failover to. Aborting", *clusterMasterKey, len(clusterMasterDirectReplicas)) } log.Debugf("GracefulMasterTakeover: request takeover for master %+v, no designated replica indicated. orchestrator will attempt to auto deduce replica.", *clusterMasterKey) - designatedInstance, _, _, _, _, err = inst.GetCandidateReplica(clusterMasterKey, false) + designatedInstance, _, _, _, _, err = inst.GetCandidateReplica(clusterMasterKey, false, true) if err != nil || designatedInstance == nil { return nil, fmt.Errorf("GracefulMasterTakeover: no target instance indicated, failed to auto-detect candidate replica for master %+v. Aborting", *clusterMasterKey) }
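The functional heart of the change is the new `RecoveryBlockCrossDatacenterFailovers` option together with the guard added to `GetCandidateReplica`: when the recovery is automatic rather than graceful (user-initiated), the option is enabled, the failed master's datacenter is known, and the best candidate sits in a different datacenter, candidate selection returns an error and the failover is aborted (and audited). The diff does not set an explicit default, so the option presumably falls back to Go's zero value, `false`. A minimal sketch of the predicate with illustrative datacenter names (it mirrors the added condition but is not the production code):

```go
package main

import "fmt"

// blockCrossDCFailover mirrors the condition added to GetCandidateReplica:
// refuse an automatic promotion when the chosen candidate lives in a
// different datacenter than the failed master.
func blockCrossDCFailover(isGraceful, blockCrossDC bool, masterDC, candidateDC string) bool {
	// Graceful (user-initiated) operations are never blocked, nor are
	// clusters whose master datacenter is unknown.
	return !isGraceful && blockCrossDC && masterDC != "" && candidateDC != masterDC
}

func main() {
	fmt.Println(blockCrossDCFailover(false, true, "dc1", "dc2"))  // true: automatic cross-DC failover blocked
	fmt.Println(blockCrossDCFailover(true, true, "dc1", "dc2"))   // false: graceful takeover still allowed
	fmt.Println(blockCrossDCFailover(false, true, "dc1", "dc1"))  // false: same-DC promotion allowed
	fmt.Println(blockCrossDCFailover(false, false, "dc1", "dc2")) // false: feature disabled
}
```

The `isGraceful` flag that feeds this predicate is threaded as a new trailing parameter down the recovery chain (`CheckAndRecover` → `executeCheckAndRecoverFunction` → the `checkAndRecover*`/`Recover*` functions → `RegroupReplicas`/`RegroupReplicasGTID` → `GetCandidateReplica`). User-driven entry points pass `true` — the `recover`/`recover-lite` CLI commands, the HTTP `Recover` and regroup endpoints, `ForceExecuteRecovery`, and graceful master takeover — while the automatic path in `ContinuousDiscovery` passes `false`, so only unattended failovers are subject to the cross-datacenter block; as the config comment notes, such failovers must then be triggered manually via the UI or API.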
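Separately, the diff swaps several `log.Debugf`/`log.Infof`/`log.Errorf` calls for their non-formatting counterparts where the message string has already been built. This is presumably to avoid feeding an already-formatted message back through a printf-style formatter, where any stray `%` would be interpreted as a verb (note that `logReadTopologyInstanceError` already escapes `%` in the hint for exactly this reason). A minimal, generic illustration of the hazard using the standard library rather than the golib logger:

```go
package main

import "fmt"

func main() {
	msg := "query contained a literal %s placeholder"

	// Passing a pre-built message as the format string mangles the output
	// (go vet also warns about non-constant format strings):
	fmt.Printf(msg) // query contained a literal %!s(MISSING) placeholder
	fmt.Println()

	// The non-formatting variant prints the message verbatim:
	fmt.Print(msg)
	fmt.Println()
}
```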