diff --git a/internal/bminventory/inventory.go b/internal/bminventory/inventory.go
index 483744bc56e..d9b617fe466 100644
--- a/internal/bminventory/inventory.go
+++ b/internal/bminventory/inventory.go
@@ -1450,10 +1450,10 @@ func (b *bareMetalInventory) InstallClusterInternal(ctx context.Context, params
 	// auto select hosts roles if not selected yet.
 	err = b.db.Transaction(func(tx *gorm.DB) error {
 		var updated bool
-		sortedHosts, canRefreshRoles := host.SortHosts(cluster.Hosts)
+		sortedHosts, canRefreshRoles := host.SortHosts(cluster.Hosts, cluster.ControlPlaneCount)
 		if canRefreshRoles {
 			for i := range sortedHosts {
-				updated, err = b.hostApi.AutoAssignRole(ctx, cluster.Hosts[i], tx)
+				updated, err = b.hostApi.AutoAssignRole(ctx, sortedHosts[i], tx)
 				if err != nil {
 					return err
 				}
diff --git a/internal/host/host_test.go b/internal/host/host_test.go
index d62576ad848..e7b94a340fd 100644
--- a/internal/host/host_test.go
+++ b/internal/host/host_test.go
@@ -3170,6 +3170,49 @@ var _ = Describe("AutoAssignRole", func() {
 		verifyAutoAssignRole(&h, true, true)
 		Expect(hostutil.GetHostFromDB(*h.ID, infraEnvId, db).Role).Should(Equal(models.HostRoleArbiter))
 	})
+
+	It("TNA cluster with day-0 workers having similar resources to arbiter", func() {
+		cluster.ControlPlaneCount = common.MinMasterHostsNeededForInstallationInHaArbiterMode
+		db.Save(cluster)
+
+		hosts := []*models.Host{
+			// 2 intended masters: high resources
+			generateAutoAssignHost(strfmt.UUID(uuid.New().String()), 16, 64, false, "master-capable-1"),
+			generateAutoAssignHost(strfmt.UUID(uuid.New().String()), 16, 64, false, "master-capable-2"),
+			// 1 intended arbiter: lower resources
+			generateAutoAssignHost(strfmt.UUID(uuid.New().String()), 8, 32, false, "arbiter-capable"),
+			// 2 intended workers: similar resources to arbiter
+			generateAutoAssignHost(strfmt.UUID(uuid.New().String()), 8, 32, false, "worker-1"),
+			generateAutoAssignHost(strfmt.UUID(uuid.New().String()), 8, 32, false, "worker-2"),
+		}
+
+		cluster.Hosts = hosts
+
+		sortedHosts, _ := SortHosts(hosts, cluster.ControlPlaneCount)
+
+		var masterCount, arbiterCount, workerCount int
+
+		for _, host := range sortedHosts {
+			Expect(db.Create(host).Error).ShouldNot(HaveOccurred())
+			verifyAutoAssignRole(host, true, true)
+			role := hostutil.GetHostFromDB(*host.ID, infraEnvId, db).Role
+			switch role {
+			case models.HostRoleMaster:
+				// Masters should be the most capable hosts
+				Expect(host.RequestedHostname).To(HavePrefix("master-capable"))
+				masterCount++
+			case models.HostRoleArbiter:
+				arbiterCount++
+			case models.HostRoleWorker:
+				workerCount++
+			}
+		}
+
+		Expect(masterCount).To(Equal(2), "Should have exactly 2 masters")
+		Expect(arbiterCount).To(Equal(1), "Should have exactly 1 arbiter")
+		Expect(workerCount).To(Equal(2), "Should have exactly 2 workers")
+	})
+
 	It("should assign roles based on hardware with GPU weight affecting priority", func() {
 		cluster.ControlPlaneCount = common.MinMasterHostsNeededForInstallationInHaMode
 		hosts := []*models.Host{
@@ -3185,7 +3228,7 @@ var _ = Describe("AutoAssignRole", func() {
 		cluster.Hosts = hosts
 
 		// Sort hosts first (like the real auto-assign logic does)
-		sortedHosts, _ := SortHosts(hosts)
+		sortedHosts, _ := SortHosts(hosts, common.MinMasterHostsNeededForInstallationInHaMode)
 
 		var masterCount, workerCount int
 
@@ -3218,7 +3261,7 @@ var _ = Describe("AutoAssignRole", func() {
 		cluster.Hosts = hosts
 
 		// Sort hosts first (like the real auto-assign logic does)
-		sortedHosts, _ := SortHosts(hosts)
+		sortedHosts, _ := SortHosts(hosts, common.MinMasterHostsNeededForInstallationInHaMode)
 
 		var masterCount, workerCount int
 
@@ -4410,7 +4453,7 @@ var _ = Describe("sortHost by hardware", func() {
 	}
 
 	It("verify host order", func() {
-		sorted, _ := SortHosts(generateHosts())
+		sorted, _ := SortHosts(generateHosts(), common.MinMasterHostsNeededForInstallationInHaMode)
 		expected := []string{
 			"insufficient for both master and worker",
 			"minimal worker with 3 disks (total of 120 GB)",
@@ -4428,6 +4471,26 @@ var _ = Describe("sortHost by hardware", func() {
 			Expect(h.RequestedHostname).To(Equal(expected[i]))
 		}
 	})
+	It("verify host order for two-node topology", func() {
+		sorted, _ := SortHosts(generateHosts(), common.MinMasterHostsNeededForInstallationInHaArbiterMode)
+		expected := []string{
+			"odf worker with 3 disks (total of 120 GB)",
+			"odf worker with 3 disks (total of 80 GB)",
+			"insufficient for both master and worker",
+			"odf worker with 1 disk of 40 GB",
+			"odf master with 3 disks (total of 120 GB)",
+			"sno master with 3 disks (total of 120 GB)",
+			"minimal master with 3 disks (total of 120 GB)",
+			"minimal master with 3 disks (total of 80 GB)",
+			"minimal master with no disks",
+			"minimal worker with 3 disks (total of 120 GB)",
+			// GPU hosts still last
+			"host with minimal hardware to be either master/worker, with GPU",
+		}
+		for i, h := range sorted {
+			Expect(h.RequestedHostname).To(Equal(expected[i]))
+		}
+	})
 })
 
 var _ = Describe("update node labels", func() {
diff --git a/internal/host/monitor.go b/internal/host/monitor.go
index 079826ff63a..d630d71ad25 100644
--- a/internal/host/monitor.go
+++ b/internal/host/monitor.go
@@ -68,7 +68,7 @@ func (m *Manager) initMonitoringQueryGenerator() {
 	}
 }
 
-func SortHosts(hosts []*models.Host) ([]*models.Host, bool) {
+func SortHosts(hosts []*models.Host, controlPlaneCount int64) ([]*models.Host, bool) {
 	diskCapacityGiB := func(disks []*models.Disk) int64 {
 		return funk.Reduce(disks, func(acc int64, d *models.Disk) int64 {
 			if d.InstallationEligibility.Eligible {
@@ -116,6 +116,9 @@ func SortHosts(hosts []*models.Host) ([]*models.Host, bool) {
 		}
 	}
 
+	isTwoNodeTopology := controlPlaneCount >= common.MinMasterHostsNeededForInstallationInHaArbiterMode &&
+		controlPlaneCount < common.MinMasterHostsNeededForInstallationInHaMode
+
 	sortByWeight := func(hostList []*models.Host) {
 		sort.SliceStable(hostList, func(i, j int) bool {
 			inventory_i, _ := common.UnmarshalInventory(hostList[i].Inventory)
@@ -137,6 +140,10 @@
 				HostWeightMemWeight*(float64(memInGib(inventory_j))-HostWeightMinimumMemGib) +
 				HostWeightDiskWeight*(float64(diskCapacityGiB(inventory_j.Disks))-HostWeightMinimumDiskCapacityGib)
 
+			if isTwoNodeTopology {
+				return wi > wj
+			}
+
 			return wi < wj
 		})
 	}
@@ -150,6 +157,17 @@
 	result = append(result, hostsWithoutGPU...)
 	result = append(result, hostsWithGPU...)
 
+	// for TNA, move the least capable non-GPU host (last in the non-GPU
+	// descending section) to the arbiter position so that the greedy algorithm
+	// in selectRole assigns it as an arbiter instead of a more capable worker.
+	if isTwoNodeTopology && len(hostsWithoutGPU) > int(controlPlaneCount) {
+		lastNonGPUIdx := len(hostsWithoutGPU) - 1
+		leastCapable := result[lastNonGPUIdx]
+		arbiterPos := int(controlPlaneCount)
+		copy(result[arbiterPos+1:], result[arbiterPos:lastNonGPUIdx])
+		result[arbiterPos] = leastCapable
+	}
+
 	return result, allHostsHasInventory
 }
 
@@ -202,7 +220,7 @@ func (m *Manager) clusterHostMonitoring() {
 
 	for _, c := range clusters {
 		inventoryCache := make(InventoryCache)
-		sortedHosts, canRefreshRoles := SortHosts(c.Hosts)
+		sortedHosts, canRefreshRoles := SortHosts(c.Hosts, c.ControlPlaneCount)
 		log = log.WithField("cluster", c.ID.String())
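For illustration only (not part of the patch): a minimal, self-contained Go sketch of the reordering the TNA branch of SortHosts performs. The fakeHost type, the numeric weights, and the hostnames are invented stand-ins for the real inventory-based CPU/memory/disk weight formula; only the rotation via copy mirrors the patched code.

```go
package main

import (
	"fmt"
	"sort"
)

// fakeHost is a hypothetical stand-in for models.Host plus its computed weight.
type fakeHost struct {
	name   string
	weight float64 // stand-in for the CPU/memory/disk weight formula
	hasGPU bool
}

func sortForTNA(hosts []fakeHost, controlPlaneCount int) []fakeHost {
	withoutGPU := make([]fakeHost, 0, len(hosts))
	withGPU := make([]fakeHost, 0, len(hosts))
	for _, h := range hosts {
		if h.hasGPU {
			withGPU = append(withGPU, h)
		} else {
			withoutGPU = append(withoutGPU, h)
		}
	}

	// Two-node (arbiter) topology sorts the most capable hosts first.
	sort.SliceStable(withoutGPU, func(i, j int) bool { return withoutGPU[i].weight > withoutGPU[j].weight })
	sort.SliceStable(withGPU, func(i, j int) bool { return withGPU[i].weight > withGPU[j].weight })

	// Non-GPU hosts first, GPU hosts last.
	result := append(append([]fakeHost{}, withoutGPU...), withGPU...)

	// Rotate the least capable non-GPU host into the arbiter slot
	// (index == controlPlaneCount), shifting the hosts in between right.
	if len(withoutGPU) > controlPlaneCount {
		last := len(withoutGPU) - 1
		leastCapable := result[last]
		copy(result[controlPlaneCount+1:], result[controlPlaneCount:last])
		result[controlPlaneCount] = leastCapable
	}
	return result
}

func main() {
	hosts := []fakeHost{
		{"worker-1", 40, false},
		{"master-capable-1", 100, false},
		{"arbiter-capable", 39, false},
		{"master-capable-2", 100, false},
		{"worker-2", 40, false},
	}
	for _, h := range sortForTNA(hosts, 2) {
		fmt.Println(h.name)
	}
	// Prints: master-capable-1, master-capable-2, arbiter-capable, worker-1, worker-2
}
```

Sorting descending for the two-node-arbiter case puts the most capable hosts in the master slots, and the rotation ensures the weakest non-GPU host, rather than an equally ranked worker, ends up at the index the greedy role assignment treats as the arbiter.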