diff --git a/internal/cluster/common.go b/internal/cluster/common.go index bfe847960332..0c3a635901ad 100644 --- a/internal/cluster/common.go +++ b/internal/cluster/common.go @@ -188,53 +188,6 @@ func getKnownMastersNodesIds(c *common.Cluster, db *gorm.DB) ([]*strfmt.UUID, er return masterNodesIds, nil } -func HostsInStatus(c *common.Cluster, statuses []string) (int, int, int) { - mappedMastersByRole := MapMasterHostsByStatus(c) - mappedArbitersByRole := MapArbiterHostsByStatus(c) - mappedWorkersByRole := MapWorkersHostsByStatus(c) - mastersInSomeInstallingStatus := 0 - arbitersInSomeInstallingStatus := 0 - workersInSomeInstallingStatus := 0 - - for _, status := range statuses { - mastersInSomeInstallingStatus += len(mappedMastersByRole[status]) - arbitersInSomeInstallingStatus += len(mappedArbitersByRole[status]) - workersInSomeInstallingStatus += len(mappedWorkersByRole[status]) - } - return mastersInSomeInstallingStatus, arbitersInSomeInstallingStatus, workersInSomeInstallingStatus -} - -func MapMasterHostsByStatus(c *common.Cluster) map[string][]*models.Host { - return mapHostsByStatus(c, models.HostRoleMaster) -} - -func MapArbiterHostsByStatus(c *common.Cluster) map[string][]*models.Host { - return mapHostsByStatus(c, models.HostRoleArbiter) -} - -func MapWorkersHostsByStatus(c *common.Cluster) map[string][]*models.Host { - return mapHostsByStatus(c, models.HostRoleWorker) -} - -func mapHostsByStatus(c *common.Cluster, role models.HostRole) map[string][]*models.Host { - hostMap := make(map[string][]*models.Host) - for _, host := range c.Hosts { - if role != "" && common.GetEffectiveRole(host) != role { - continue - } - if _, ok := hostMap[swag.StringValue(host.Status)]; ok { - hostMap[swag.StringValue(host.Status)] = append(hostMap[swag.StringValue(host.Status)], host) - } else { - hostMap[swag.StringValue(host.Status)] = []*models.Host{host} - } - } - return hostMap -} - -func MapHostsByStatus(c *common.Cluster) map[string][]*models.Host { - return mapHostsByStatus(c, "") -} - func UpdateMachineNetwork(db *gorm.DB, cluster *common.Cluster, machineNetwork []string) error { if len(machineNetwork) > 2 { return common.NewApiError(http.StatusInternalServerError, diff --git a/internal/cluster/transition.go b/internal/cluster/transition.go index f9398be9d219..e2d6b0771442 100644 --- a/internal/cluster/transition.go +++ b/internal/cluster/transition.go @@ -257,7 +257,7 @@ func (th *transitionHandler) createClusterCompletionStatusInfo(ctx context.Conte statusInfo = StatusInfoDegraded statusInfo += ". 
Failed OLM operators: " + strings.Join(statuses[models.OperatorTypeOlm][models.OperatorStatusFailed], ", ") } else { - _, _, installedWorkers := HostsInStatus(cluster, []string{models.HostStatusInstalled}) + _, _, installedWorkers := common.HostsInStatus(cluster, []string{models.HostStatusInstalled}) if installedWorkers < common.NumberOfWorkers(cluster) { statusInfo = StatusInfoNotAllWorkersInstalled } @@ -418,7 +418,7 @@ func (th *transitionHandler) IsFinalizing(sw stateswitch.StateSwitch, args state sCluster, ok := sw.(*stateCluster) installedStatus := []string{models.HostStatusInstalled} - if ok && th.enoughMastersAndWorkers(sCluster, installedStatus) { + if ok && common.HasEnoughMastersAndWorkers(sCluster.cluster, installedStatus) { th.log.Infof("Cluster %s has at least required number of installed hosts, "+ "cluster is finalizing.", sCluster.cluster.ID) return true, nil @@ -431,7 +431,7 @@ func (th *transitionHandler) IsInstalling(sw stateswitch.StateSwitch, args state sCluster, _ := sw.(*stateCluster) installingStatuses := []string{models.HostStatusInstalling, models.HostStatusInstallingInProgress, models.HostStatusInstalled, models.HostStatusInstallingPendingUserAction, models.HostStatusPreparingSuccessful} - return th.enoughMastersAndWorkers(sCluster, installingStatuses), nil + return common.HasEnoughMastersAndWorkers(sCluster.cluster, installingStatuses), nil } // check if we should stay in installing state @@ -507,49 +507,6 @@ func (th *transitionHandler) PostUpdateFinalizingAMSConsoleUrl(sw stateswitch.St return nil } -// enoughMastersAndWorkers returns whether the number of master and worker nodes in the specified cluster with the given status -// meets the required criteria. The conditions are as follows: -// - For SNO (Single Node OpenShift), there must be exactly one master node and zero worker nodes. -// - For High Availability cluster, the number of master nodes should match the user's request, and not less than the minimum. The worker node requirement depends on this request: -// If the user requested at least two workers, there must be at least two, indicating non-schedulable masters were intended. -// If the user requested fewer than two workers, any number of workers is acceptable. -// - For TNA Clusters the same conditions apply as for High Availability Clusters, but we also need to check that at least one arbiter node is in the correct status. 
-func (th *transitionHandler) enoughMastersAndWorkers(sCluster *stateCluster, statuses []string) bool { - mastersInSomeInstallingStatus, arbitersInSomeInstallingStatus, workersInSomeInstallingStatus := HostsInStatus(sCluster.cluster, statuses) - - if sCluster.cluster.ControlPlaneCount == 1 { - return mastersInSomeInstallingStatus == common.AllowedNumberOfMasterHostsInNoneHaMode && - workersInSomeInstallingStatus == common.AllowedNumberOfWorkersInNoneHaMode - } - - // hosts roles are known at this stage - masters, arbiters, workers, _ := common.GetHostsByEachRole(&sCluster.cluster.Cluster, false) - numberOfExpectedMasters := len(masters) - numberOfExpectedArbiters := len(arbiters) - - minMasterHostsNeeded := common.MinMasterHostsNeededForInstallationInHaMode - if numberOfExpectedArbiters != 0 { - minMasterHostsNeeded = common.MinMasterHostsNeededForInstallationInHaArbiterMode - // validate arbiters - if arbitersInSomeInstallingStatus == 0 { - return false - } - } - - // validate masters - if numberOfExpectedMasters < minMasterHostsNeeded || - mastersInSomeInstallingStatus < numberOfExpectedMasters { - return false - } - - numberOfExpectedWorkers := len(workers) - - // validate workers - return numberOfExpectedWorkers < common.MinimumNumberOfWorkersForNonSchedulableMastersClusterInHaMode || - numberOfExpectedWorkers >= common.MinimumNumberOfWorkersForNonSchedulableMastersClusterInHaMode && - workersInSomeInstallingStatus >= common.MinimumNumberOfWorkersForNonSchedulableMastersClusterInHaMode -} - // check if installation reach to timeout func (th *transitionHandler) IsInstallationTimedOut(sw stateswitch.StateSwitch, args stateswitch.TransitionArgs) (bool, error) { sCluster, ok := sw.(*stateCluster) diff --git a/internal/common/common.go b/internal/common/common.go index bdec5140383c..8432b21f9064 100644 --- a/internal/common/common.go +++ b/internal/common/common.go @@ -781,3 +781,62 @@ func GetDefaultHighAvailabilityAndMasterCountParams(highAvailabilityMode *string func IsClusterTopologyHighlyAvailableArbiter(cluster *Cluster) bool { return funk.NotEmpty(GetHostsByRole(cluster, models.HostRoleArbiter)) } + +func HostsInStatus(c *Cluster, statuses []string) (masters, arbiters, workers int) { + for _, host := range c.Hosts { + if funk.ContainsString(statuses, swag.StringValue(host.Status)) { + switch GetEffectiveRole(host) { + case models.HostRoleMaster, models.HostRoleBootstrap: + masters++ + case models.HostRoleArbiter: + arbiters++ + case models.HostRoleWorker: + workers++ + } + } + } + return +} + +// HasEnoughMastersAndWorkers returns whether the number of master and worker nodes in the specified cluster with the given status +// meets the required criteria. The conditions are as follows: +// - For SNO (Single Node OpenShift), there must be exactly one master node and zero worker nodes. +// - For High Availability cluster, the number of master nodes should match the user's request, and not less than the minimum. The worker node requirement depends on this request: +// If the user requested at least two workers, there must be at least two, indicating non-schedulable masters were intended. +// If the user requested fewer than two workers, any number of workers is acceptable. +// - For TNA Clusters the same conditions apply as for High Availability Clusters, but we also need to check that at least one arbiter node is in the correct status. 
+func HasEnoughMastersAndWorkers(c *Cluster, statuses []string) bool { + mastersInStatus, arbitersInStatus, workersInStatus := HostsInStatus(c, statuses) + + if c.ControlPlaneCount == 1 { + return mastersInStatus == AllowedNumberOfMasterHostsInNoneHaMode && + workersInStatus == AllowedNumberOfWorkersInNoneHaMode + } + + // hosts roles are known at this stage + masters, arbiters, workers, _ := GetHostsByEachRole(&c.Cluster, false) + numberOfExpectedMasters := len(masters) + numberOfExpectedArbiters := len(arbiters) + + minMasterHostsNeeded := MinMasterHostsNeededForInstallationInHaMode + if numberOfExpectedArbiters != 0 { + minMasterHostsNeeded = MinMasterHostsNeededForInstallationInHaArbiterMode + // validate arbiters + if arbitersInStatus == 0 { + return false + } + } + + // validate masters + if numberOfExpectedMasters < minMasterHostsNeeded || + mastersInStatus < numberOfExpectedMasters { + return false + } + + numberOfExpectedWorkers := len(workers) + + // validate workers + return numberOfExpectedWorkers < MinimumNumberOfWorkersForNonSchedulableMastersClusterInHaMode || + numberOfExpectedWorkers >= MinimumNumberOfWorkersForNonSchedulableMastersClusterInHaMode && + workersInStatus >= MinimumNumberOfWorkersForNonSchedulableMastersClusterInHaMode +} diff --git a/internal/common/common_test.go b/internal/common/common_test.go index f997e96d3731..38899833b02b 100644 --- a/internal/common/common_test.go +++ b/internal/common/common_test.go @@ -666,3 +666,360 @@ func createClusterFromHosts(hosts []*models.Host) Cluster { }, } } + +var _ = Describe("HostsInStatus", func() { + It("should return zero counts for empty cluster", func() { + cluster := createClusterFromHosts([]*models.Host{}) + masters, arbiters, workers := HostsInStatus(&cluster, []string{models.HostStatusKnown}) + + Expect(masters).To(Equal(0)) + Expect(arbiters).To(Equal(0)) + Expect(workers).To(Equal(0)) + }) + + It("should count masters in specified status", func() { + hosts := []*models.Host{ + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusInsufficient), + } + cluster := createClusterFromHosts(hosts) + masters, arbiters, workers := HostsInStatus(&cluster, []string{models.HostStatusKnown}) + + Expect(masters).To(Equal(2)) + Expect(arbiters).To(Equal(0)) + Expect(workers).To(Equal(0)) + }) + + It("should count arbiters in specified status", func() { + hosts := []*models.Host{ + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleArbiter, models.HostStatusKnown), + createHost(models.HostRoleArbiter, models.HostStatusInsufficient), + } + cluster := createClusterFromHosts(hosts) + masters, arbiters, workers := HostsInStatus(&cluster, []string{models.HostStatusKnown}) + + Expect(masters).To(Equal(2)) + Expect(arbiters).To(Equal(1)) + Expect(workers).To(Equal(0)) + }) + + It("should count workers in specified status", func() { + hosts := []*models.Host{ + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleWorker, models.HostStatusKnown), + createHost(models.HostRoleWorker, models.HostStatusKnown), + createHost(models.HostRoleWorker, models.HostStatusInsufficient), + } + cluster := createClusterFromHosts(hosts) + masters, arbiters, workers := HostsInStatus(&cluster, []string{models.HostStatusKnown}) + + Expect(masters).To(Equal(1)) + Expect(arbiters).To(Equal(0)) + 
Expect(workers).To(Equal(2)) + }) + + It("should count hosts with multiple statuses", func() { + hosts := []*models.Host{ + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusInstalling), + createHost(models.HostRoleMaster, models.HostStatusInsufficient), + createHost(models.HostRoleWorker, models.HostStatusKnown), + createHost(models.HostRoleWorker, models.HostStatusInstalling), + } + cluster := createClusterFromHosts(hosts) + masters, arbiters, workers := HostsInStatus(&cluster, []string{models.HostStatusKnown, models.HostStatusInstalling}) + + Expect(masters).To(Equal(2)) + Expect(arbiters).To(Equal(0)) + Expect(workers).To(Equal(2)) + }) + + It("should handle bootstrap hosts as masters", func() { + hosts := []*models.Host{ + createHost(models.HostRoleBootstrap, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusKnown), + } + cluster := createClusterFromHosts(hosts) + masters, arbiters, workers := HostsInStatus(&cluster, []string{models.HostStatusKnown}) + + Expect(masters).To(Equal(2)) + Expect(arbiters).To(Equal(0)) + Expect(workers).To(Equal(0)) + }) + + It("should use effective role for auto-assigned hosts", func() { + hostId1 := strfmt.UUID(uuid.New().String()) + clusterId1 := strfmt.UUID(uuid.New().String()) + hostId2 := strfmt.UUID(uuid.New().String()) + clusterId2 := strfmt.UUID(uuid.New().String()) + hostId3 := strfmt.UUID(uuid.New().String()) + clusterId3 := strfmt.UUID(uuid.New().String()) + + hosts := []*models.Host{ + { + ID: &hostId1, + InfraEnvID: strfmt.UUID(uuid.New().String()), + ClusterID: &clusterId1, + Role: models.HostRoleAutoAssign, + SuggestedRole: models.HostRoleMaster, + Status: swag.String(models.HostStatusKnown), + }, + { + ID: &hostId2, + InfraEnvID: strfmt.UUID(uuid.New().String()), + ClusterID: &clusterId2, + Role: models.HostRoleAutoAssign, + SuggestedRole: models.HostRoleWorker, + Status: swag.String(models.HostStatusKnown), + }, + { + ID: &hostId3, + InfraEnvID: strfmt.UUID(uuid.New().String()), + ClusterID: &clusterId3, + Role: models.HostRoleAutoAssign, + SuggestedRole: models.HostRoleArbiter, + Status: swag.String(models.HostStatusKnown), + }, + } + cluster := createClusterFromHosts(hosts) + masters, arbiters, workers := HostsInStatus(&cluster, []string{models.HostStatusKnown}) + + Expect(masters).To(Equal(1)) + Expect(arbiters).To(Equal(1)) + Expect(workers).To(Equal(1)) + }) + + It("should return zero when no hosts match status", func() { + hosts := []*models.Host{ + createHost(models.HostRoleMaster, models.HostStatusInsufficient), + createHost(models.HostRoleWorker, models.HostStatusPendingForInput), + } + cluster := createClusterFromHosts(hosts) + masters, arbiters, workers := HostsInStatus(&cluster, []string{models.HostStatusKnown}) + + Expect(masters).To(Equal(0)) + Expect(arbiters).To(Equal(0)) + Expect(workers).To(Equal(0)) + }) +}) + +var _ = Describe("HasEnoughMastersAndWorkers", func() { + Context("single node", func() { + It("should return true with exactly 1 master and 0 workers", func() { + hosts := []*models.Host{ + createHost(models.HostRoleMaster, models.HostStatusKnown), + } + cluster := createClusterFromHosts(hosts) + cluster.ControlPlaneCount = 1 + + result := HasEnoughMastersAndWorkers(&cluster, []string{models.HostStatusKnown}) + Expect(result).To(BeTrue()) + }) + + It("should return false with 1 master and 1 worker", func() { + hosts := []*models.Host{ + createHost(models.HostRoleMaster, models.HostStatusKnown), + 
createHost(models.HostRoleWorker, models.HostStatusKnown), + } + cluster := createClusterFromHosts(hosts) + cluster.ControlPlaneCount = 1 + + result := HasEnoughMastersAndWorkers(&cluster, []string{models.HostStatusKnown}) + Expect(result).To(BeFalse()) + }) + + It("should return false with 0 masters", func() { + cluster := createClusterFromHosts([]*models.Host{}) + cluster.ControlPlaneCount = 1 + + result := HasEnoughMastersAndWorkers(&cluster, []string{models.HostStatusKnown}) + Expect(result).To(BeFalse()) + }) + + It("should return false with 2 masters", func() { + hosts := []*models.Host{ + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusKnown), + } + cluster := createClusterFromHosts(hosts) + cluster.ControlPlaneCount = 1 + + result := HasEnoughMastersAndWorkers(&cluster, []string{models.HostStatusKnown}) + Expect(result).To(BeFalse()) + }) + }) + + Context("highly available", func() { + It("should return true with 3 masters and 0 workers", func() { + hosts := []*models.Host{ + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusKnown), + } + cluster := createClusterFromHosts(hosts) + cluster.ControlPlaneCount = 3 + + result := HasEnoughMastersAndWorkers(&cluster, []string{models.HostStatusKnown}) + Expect(result).To(BeTrue()) + }) + + It("should return true with 3 masters and 1 worker", func() { + hosts := []*models.Host{ + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleWorker, models.HostStatusKnown), + } + cluster := createClusterFromHosts(hosts) + cluster.ControlPlaneCount = 3 + + result := HasEnoughMastersAndWorkers(&cluster, []string{models.HostStatusKnown}) + Expect(result).To(BeTrue()) + }) + + It("should return true with 3 masters and 2+ workers when all are in status", func() { + hosts := []*models.Host{ + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleWorker, models.HostStatusKnown), + createHost(models.HostRoleWorker, models.HostStatusKnown), + } + cluster := createClusterFromHosts(hosts) + cluster.ControlPlaneCount = 3 + + result := HasEnoughMastersAndWorkers(&cluster, []string{models.HostStatusKnown}) + Expect(result).To(BeTrue()) + }) + + It("should return false with 3 masters expected but only 2 in status", func() { + hosts := []*models.Host{ + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusInsufficient), + } + cluster := createClusterFromHosts(hosts) + cluster.ControlPlaneCount = 3 + + result := HasEnoughMastersAndWorkers(&cluster, []string{models.HostStatusKnown}) + Expect(result).To(BeFalse()) + }) + + It("should return false when expecting 2+ workers but only 1 in status", func() { + hosts := []*models.Host{ + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleWorker, models.HostStatusKnown), + createHost(models.HostRoleWorker, models.HostStatusInsufficient), + } + cluster := 
createClusterFromHosts(hosts) + cluster.ControlPlaneCount = 3 + + result := HasEnoughMastersAndWorkers(&cluster, []string{models.HostStatusKnown}) + Expect(result).To(BeFalse()) + }) + + It("should return true with 5 masters", func() { + hosts := []*models.Host{ + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusKnown), + } + cluster := createClusterFromHosts(hosts) + cluster.ControlPlaneCount = 5 + + result := HasEnoughMastersAndWorkers(&cluster, []string{models.HostStatusKnown}) + Expect(result).To(BeTrue()) + }) + + It("should return false with less than minimum masters", func() { + hosts := []*models.Host{ + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusKnown), + } + cluster := createClusterFromHosts(hosts) + cluster.ControlPlaneCount = 3 + + result := HasEnoughMastersAndWorkers(&cluster, []string{models.HostStatusKnown}) + Expect(result).To(BeFalse()) + }) + }) + + Context("two node with arbiter", func() { + It("should return true with 2 masters, 1 arbiter", func() { + hosts := []*models.Host{ + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleArbiter, models.HostStatusKnown), + } + cluster := createClusterFromHosts(hosts) + cluster.ControlPlaneCount = 2 + + result := HasEnoughMastersAndWorkers(&cluster, []string{models.HostStatusKnown}) + Expect(result).To(BeTrue()) + }) + + It("should return false with 2 masters but 0 arbiters in status", func() { + hosts := []*models.Host{ + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleArbiter, models.HostStatusInsufficient), + } + cluster := createClusterFromHosts(hosts) + cluster.ControlPlaneCount = 2 + + result := HasEnoughMastersAndWorkers(&cluster, []string{models.HostStatusKnown}) + Expect(result).To(BeFalse()) + }) + + It("should return false with only 1 master in status", func() { + hosts := []*models.Host{ + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusInsufficient), + createHost(models.HostRoleArbiter, models.HostStatusKnown), + } + cluster := createClusterFromHosts(hosts) + cluster.ControlPlaneCount = 2 + + result := HasEnoughMastersAndWorkers(&cluster, []string{models.HostStatusKnown}) + Expect(result).To(BeFalse()) + }) + + It("should return true with 2 masters, 1 arbiter, and workers", func() { + hosts := []*models.Host{ + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleArbiter, models.HostStatusKnown), + createHost(models.HostRoleWorker, models.HostStatusKnown), + } + cluster := createClusterFromHosts(hosts) + cluster.ControlPlaneCount = 2 + + result := HasEnoughMastersAndWorkers(&cluster, []string{models.HostStatusKnown}) + Expect(result).To(BeTrue()) + }) + }) + + It("should count hosts in any of the specified statuses with multiple status values", func() { + hosts := []*models.Host{ + createHost(models.HostRoleMaster, models.HostStatusKnown), + createHost(models.HostRoleMaster, models.HostStatusInstalling), + createHost(models.HostRoleMaster, 
models.HostStatusInstalled), + } + cluster := createClusterFromHosts(hosts) + cluster.ControlPlaneCount = 3 + + result := HasEnoughMastersAndWorkers(&cluster, []string{ + models.HostStatusKnown, + models.HostStatusInstalling, + models.HostStatusInstalled, + }) + Expect(result).To(BeTrue()) + }) +}) diff --git a/internal/host/common.go b/internal/host/common.go index 0029e88120bc..2e05c6c7dac9 100644 --- a/internal/host/common.go +++ b/internal/host/common.go @@ -46,6 +46,7 @@ const ( statusInfoHostReadyToBeBound = "Host is ready to be bound to a cluster" statusInfoBinding = "Host is waiting to be bound to the cluster" statusRebootTimeout = "Host timed out when pulling the configuration files. Verify in the host console that the host boots from the OpenShift installation disk $INSTALLATION_DISK and has network access to the cluster API. The installation will resume after the host successfully boots and can access the cluster API" + statusInfoPendingUserActionTimeout = "Host failed to boot from the installation disk within the expected time." statusInfoUnbinding = "Host is waiting to be unbound from the cluster" statusInfoRebootingDay2 = "Host has rebooted and no further updates will be posted. Please check console for progress and to possibly approve pending CSRs" statusInfoRebootingForReclaim = "Host is rebooting into the discovery image" diff --git a/internal/host/config.go b/internal/host/config.go index 2640587c6d64..3df8209914c8 100644 --- a/internal/host/config.go +++ b/internal/host/config.go @@ -17,13 +17,14 @@ type PrepareConfig struct { type Config struct { PrepareConfig PrepareConfig LogTimeoutConfig - EnableAutoAssign bool `envconfig:"ENABLE_AUTO_ASSIGN" default:"true"` - ResetTimeout time.Duration `envconfig:"RESET_CLUSTER_TIMEOUT" default:"3m"` - MonitorBatchSize int `envconfig:"HOST_MONITOR_BATCH_SIZE" default:"100"` - DisabledHostvalidations DisabledHostValidations `envconfig:"DISABLED_HOST_VALIDATIONS" default:""` // Which host validations to disable (should not run in preprocess) - BootstrapHostMAC string `envconfig:"BOOTSTRAP_HOST_MAC" default:""` // For ephemeral installer to ensure the bootstrap for the (single) cluster lands on the same host as assisted-service - MaxHostDisconnectionTime time.Duration `envconfig:"HOST_MAX_DISCONNECTION_TIME" default:"3m"` - EnableVirtualInterfaces bool `envconfig:"ENABLE_VIRTUAL_INTERFACES" default:"false"` + EnableAutoAssign bool `envconfig:"ENABLE_AUTO_ASSIGN" default:"true"` + ResetTimeout time.Duration `envconfig:"RESET_CLUSTER_TIMEOUT" default:"3m"` + MonitorBatchSize int `envconfig:"HOST_MONITOR_BATCH_SIZE" default:"100"` + DisabledHostvalidations DisabledHostValidations `envconfig:"DISABLED_HOST_VALIDATIONS" default:""` // Which host validations to disable (should not run in preprocess) + BootstrapHostMAC string `envconfig:"BOOTSTRAP_HOST_MAC" default:""` // For ephemeral installer to ensure the bootstrap for the (single) cluster lands on the same host as assisted-service + MaxHostDisconnectionTime time.Duration `envconfig:"HOST_MAX_DISCONNECTION_TIME" default:"3m"` + EnableVirtualInterfaces bool `envconfig:"ENABLE_VIRTUAL_INTERFACES" default:"false"` + InstallingPendingUserActionTimeout time.Duration `envconfig:"HOST_INSTALLING_PENDING_USER_ACTION_TIMEOUT" default:"60m"` // hostStageTimeouts contains the values of the host stage timeouts. Don't use this // directly, use the HostStageTimeout method instead. 
diff --git a/internal/host/host_test.go b/internal/host/host_test.go index 4de5f2b53ad3..f6e399af2904 100644 --- a/internal/host/host_test.go +++ b/internal/host/host_test.go @@ -50,11 +50,12 @@ var ( var _ = BeforeEach(func() { defaultConfig = &Config{ - ResetTimeout: 3 * time.Minute, - EnableAutoAssign: true, - MonitorBatchSize: 100, - DisabledHostvalidations: defaultDisabledHostValidations, - MaxHostDisconnectionTime: MaxHostDisconnectionTime, + ResetTimeout: 3 * time.Minute, + EnableAutoAssign: true, + MonitorBatchSize: 100, + DisabledHostvalidations: defaultDisabledHostValidations, + MaxHostDisconnectionTime: MaxHostDisconnectionTime, + InstallingPendingUserActionTimeout: 60 * time.Minute, } err := defaultConfig.Complete() Expect(err).ToNot(HaveOccurred()) diff --git a/internal/host/mock_transition.go b/internal/host/mock_transition.go index f8721a16bb79..7fd2a5e91dee 100644 --- a/internal/host/mock_transition.go +++ b/internal/host/mock_transition.go @@ -35,6 +35,21 @@ func (m *MockTransitionHandler) EXPECT() *MockTransitionHandlerMockRecorder { return m.recorder } +// ClusterWouldSucceedWithoutHost mocks base method. +func (m *MockTransitionHandler) ClusterWouldSucceedWithoutHost(sw stateswitch.StateSwitch, arg1 stateswitch.TransitionArgs) (bool, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "ClusterWouldSucceedWithoutHost", sw, arg1) + ret0, _ := ret[0].(bool) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// ClusterWouldSucceedWithoutHost indicates an expected call of ClusterWouldSucceedWithoutHost. +func (mr *MockTransitionHandlerMockRecorder) ClusterWouldSucceedWithoutHost(sw, arg1 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ClusterWouldSucceedWithoutHost", reflect.TypeOf((*MockTransitionHandler)(nil).ClusterWouldSucceedWithoutHost), sw, arg1) +} + // HasClusterError mocks base method. func (m *MockTransitionHandler) HasClusterError(sw stateswitch.StateSwitch, args stateswitch.TransitionArgs) (bool, error) { m.ctrl.T.Helper() @@ -65,6 +80,21 @@ func (mr *MockTransitionHandlerMockRecorder) HasInstallationInProgressTimedOut(s return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "HasInstallationInProgressTimedOut", reflect.TypeOf((*MockTransitionHandler)(nil).HasInstallationInProgressTimedOut), sw, arg1) } +// HasPendingUserActionTimedOut mocks base method. +func (m *MockTransitionHandler) HasPendingUserActionTimedOut(sw stateswitch.StateSwitch, arg1 stateswitch.TransitionArgs) (bool, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "HasPendingUserActionTimedOut", sw, arg1) + ret0, _ := ret[0].(bool) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// HasPendingUserActionTimedOut indicates an expected call of HasPendingUserActionTimedOut. +func (mr *MockTransitionHandlerMockRecorder) HasPendingUserActionTimedOut(sw, arg1 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "HasPendingUserActionTimedOut", reflect.TypeOf((*MockTransitionHandler)(nil).HasPendingUserActionTimedOut), sw, arg1) +} + // HasStatusTimedOut mocks base method. 
func (m *MockTransitionHandler) HasStatusTimedOut(timeout time.Duration) stateswitch.Condition { m.ctrl.T.Helper() diff --git a/internal/host/statemachine.go b/internal/host/statemachine.go index 4e8df9456309..7db96679337f 100644 --- a/internal/host/statemachine.go +++ b/internal/host/statemachine.go @@ -884,6 +884,22 @@ func NewHostStateMachine(sm stateswitch.StateMachine, th TransitionHandler) stat }, }) + sm.AddTransitionRule(stateswitch.TransitionRule{ + TransitionType: TransitionTypeRefresh, + SourceStates: []stateswitch.State{ + stateswitch.State(models.HostStatusInstallingPendingUserAction)}, + Condition: stateswitch.And( + th.HasPendingUserActionTimedOut, + th.ClusterWouldSucceedWithoutHost, + ), + DestinationState: stateswitch.State(models.HostStatusError), + PostTransition: th.PostRefreshHost(statusInfoPendingUserActionTimeout), + Documentation: stateswitch.TransitionRuleDoc{ + Name: "Host pending user action timeout with cluster viability check", + Description: "When a host is in installing-pending-user-action state for too long without recovery, transition to error state ONLY if the cluster can still succeed without this host. This prevents timing out hosts that are required for cluster success (e.g., masters in a 3-node cluster), giving users more time to fix boot order issues for critical hosts.", + }, + }) + // Noop transitions for cluster error for _, state := range []stateswitch.State{ stateswitch.State(models.HostStatusInstalling), diff --git a/internal/host/transition.go b/internal/host/transition.go index 6a57b38a1e9d..424eb6880ee4 100644 --- a/internal/host/transition.go +++ b/internal/host/transition.go @@ -36,6 +36,8 @@ type transitionHandler struct { type TransitionHandler interface { HasClusterError(sw stateswitch.StateSwitch, args stateswitch.TransitionArgs) (bool, error) HasInstallationInProgressTimedOut(sw stateswitch.StateSwitch, _ stateswitch.TransitionArgs) (bool, error) + HasPendingUserActionTimedOut(sw stateswitch.StateSwitch, _ stateswitch.TransitionArgs) (bool, error) + ClusterWouldSucceedWithoutHost(sw stateswitch.StateSwitch, _ stateswitch.TransitionArgs) (bool, error) HasStatusTimedOut(timeout time.Duration) stateswitch.Condition HostNotResponsiveWhilePreparingInstallation(sw stateswitch.StateSwitch, args stateswitch.TransitionArgs) (bool, error) IsDay2Host(sw stateswitch.StateSwitch, args stateswitch.TransitionArgs) (bool, error) @@ -671,6 +673,37 @@ func (th *transitionHandler) HasInstallationInProgressTimedOut(sw stateswitch.St return time.Since(time.Time(sHost.host.Progress.StageUpdatedAt)) > maxDuration, nil } +func (th *transitionHandler) HasPendingUserActionTimedOut(sw stateswitch.StateSwitch, _ stateswitch.TransitionArgs) (bool, error) { + sHost, ok := sw.(*stateHost) + if !ok { + return false, errors.New("HasPendingUserActionTimedOut incompatible type of StateSwitch") + } + return time.Since(time.Time(sHost.host.StatusUpdatedAt)) > th.config.InstallingPendingUserActionTimeout, nil +} + +func (th *transitionHandler) ClusterWouldSucceedWithoutHost(sw stateswitch.StateSwitch, args stateswitch.TransitionArgs) (bool, error) { + sHost, ok := sw.(*stateHost) + if !ok { + return false, errors.New("ClusterWouldSucceedWithoutHost incompatible type of StateSwitch") + } + params, ok := args.(*TransitionArgsRefreshHost) + if !ok { + return false, errors.New("ClusterWouldSucceedWithoutHost invalid argument") + } + + // Get the cluster with all hosts + if sHost.host.ClusterID == nil { + return false, errors.New("cluster ID must not be nil") + } + cluster, 
err := common.GetClusterFromDBWithHosts(params.db, *sHost.host.ClusterID) + if err != nil { + th.log.WithError(err).Errorf("failed to get cluster %s", sHost.host.ClusterID) + return false, err + } + + return common.HasEnoughMastersAndWorkers(cluster, []string{models.HostStatusInstalled}), nil +} + func (th *transitionHandler) PostHostPreparationTimeout() stateswitch.PostTransition { ret := func(sw stateswitch.StateSwitch, args stateswitch.TransitionArgs) error { sHost, ok := sw.(*stateHost) diff --git a/internal/host/transition_test.go b/internal/host/transition_test.go index 18f68fe1856f..ea2c739a0131 100644 --- a/internal/host/transition_test.go +++ b/internal/host/transition_test.go @@ -2161,6 +2161,74 @@ var _ = Describe("Refresh Host", func() { }) + Context("host installing-pending-user-action timeout", func() { + BeforeEach(func() { + pr.EXPECT().IsHostSupported(commontesting.EqPlatformType(models.PlatformTypeVsphere), gomock.Any()).Return(false, nil).AnyTimes() + mockDefaultClusterHostRequirements(mockHwValidator) + + host = hostutil.GenerateTestHost(hostId, infraEnvId, clusterId, models.HostStatusInstallingPendingUserAction) + cluster = hostutil.GenerateTestCluster(clusterId) + Expect(db.Create(&cluster).Error).ToNot(HaveOccurred()) + }) + + It("times out when host is in state for more than the timeout value and is not required for success", func() { + // Create 3 master hosts in installed status so the cluster can succeed without the timed-out worker + for range 3 { + masterId := strfmt.UUID(uuid.New().String()) + master := hostutil.GenerateTestHostByKind(masterId, infraEnvId, &clusterId, models.HostStatusInstalled, models.HostKindHost, models.HostRoleMaster) + Expect(db.Create(&master).Error).ShouldNot(HaveOccurred()) + } + + host.StatusUpdatedAt = strfmt.DateTime(time.Now().Add(-90 * time.Minute)) + Expect(db.Create(&host).Error).ShouldNot(HaveOccurred()) + + mockEvents.EXPECT().SendHostEvent(gomock.Any(), eventstest.NewEventMatcher( + eventstest.WithNameMatcher(eventgen.HostStatusUpdatedEventName), + eventstest.WithHostIdMatcher(hostId.String()), + eventstest.WithInfraEnvIdMatcher(host.InfraEnvID.String()), + eventstest.WithClusterIdMatcher(host.ClusterID.String()), + eventstest.WithSeverityMatcher(hostutil.GetEventSeverityFromHostStatus(models.HostStatusError)))) + + Expect(hapi.RefreshStatus(ctx, &host, db)).To(Succeed()) + + var resultHost models.Host + Expect(db.Take(&resultHost, "id = ? and cluster_id = ?", hostId.String(), clusterId.String()).Error).ToNot(HaveOccurred()) + Expect(swag.StringValue(resultHost.Status)).To(Equal(models.HostStatusError)) + Expect(swag.StringValue(resultHost.StatusInfo)).To(Equal(statusInfoPendingUserActionTimeout)) + }) + + It("remains when host is required for success", func() { + // Create 2 master hosts in installed status + for range 2 { + masterId := strfmt.UUID(uuid.New().String()) + master := hostutil.GenerateTestHostByKind(masterId, infraEnvId, &clusterId, models.HostStatusInstalled, models.HostKindHost, models.HostRoleMaster) + Expect(db.Create(&master).Error).ShouldNot(HaveOccurred()) + } + + // third master in pending-user-action should remain there even after 90 minutes + host.StatusUpdatedAt = strfmt.DateTime(time.Now().Add(-90 * time.Minute)) + host.Role = models.HostRoleMaster + Expect(db.Create(&host).Error).ShouldNot(HaveOccurred()) + + Expect(hapi.RefreshStatus(ctx, &host, db)).To(Succeed()) + + var resultHost models.Host + Expect(db.Take(&resultHost, "id = ? 
and cluster_id = ?", hostId.String(), clusterId.String()).Error).ToNot(HaveOccurred()) + Expect(swag.StringValue(resultHost.Status)).To(Equal(models.HostStatusInstallingPendingUserAction)) + }) + + It("remains when host is in state for less than the timeout value", func() { + host.StatusUpdatedAt = strfmt.DateTime(time.Now().Add(-30 * time.Minute)) + Expect(db.Create(&host).Error).ShouldNot(HaveOccurred()) + + Expect(hapi.RefreshStatus(ctx, &host, db)).To(Succeed()) + + var resultHost models.Host + Expect(db.Take(&resultHost, "id = ? and cluster_id = ?", hostId.String(), clusterId.String()).Error).ToNot(HaveOccurred()) + Expect(swag.StringValue(resultHost.Status)).To(Equal(models.HostStatusInstallingPendingUserAction)) + }) + }) + Context("Installation disk error handling in status info", func() { BeforeEach(func() { pr.EXPECT().IsHostSupported(commontesting.EqPlatformType(models.PlatformTypeVsphere), gomock.Any()).Return(false, nil).AnyTimes()