From 453b76383ad69a01c6b92a560d2dc531ca92a811 Mon Sep 17 00:00:00 2001 From: Patryk Diak Date: Wed, 30 Nov 2022 18:41:55 +0100 Subject: [PATCH 1/8] Rename LinkRoutesAddOrUpdateSrcOrMTU to LinkRoutesApply Signed-off-by: Patryk Diak --- go-controller/pkg/node/gateway_init.go | 2 +- go-controller/pkg/node/gateway_shared_intf.go | 2 +- go-controller/pkg/node/management-port_linux.go | 2 +- go-controller/pkg/node/node.go | 2 +- go-controller/pkg/util/net_linux.go | 2 +- go-controller/pkg/util/net_linux_unit_test.go | 4 ++-- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/go-controller/pkg/node/gateway_init.go b/go-controller/pkg/node/gateway_init.go index 629df29c03..c025763675 100644 --- a/go-controller/pkg/node/gateway_init.go +++ b/go-controller/pkg/node/gateway_init.go @@ -247,7 +247,7 @@ func configureSvcRouteViaInterface(iface string, gwIPs []net.IP) error { mtu = config.Default.RoutableMTU } - err = util.LinkRoutesAddOrUpdateSourceOrMTU(link, gwIP[0], []*net.IPNet{subnet}, mtu, nil) + err = util.LinkRoutesApply(link, gwIP[0], []*net.IPNet{subnet}, mtu, nil) if err != nil { return fmt.Errorf("unable to add/update route for service via %s for gwIP %s, error: %v", iface, gwIP[0].String(), err) } diff --git a/go-controller/pkg/node/gateway_shared_intf.go b/go-controller/pkg/node/gateway_shared_intf.go index 03d30d920b..39263c9916 100644 --- a/go-controller/pkg/node/gateway_shared_intf.go +++ b/go-controller/pkg/node/gateway_shared_intf.go @@ -1763,7 +1763,7 @@ func addMasqueradeRoute(netIfaceName, nodeName string, watchFactory factory.Node } if masqIPNet != nil { klog.Infof("Setting OVN Masquerade route with source: %s", nodeIP) - err = util.LinkRoutesAddOrUpdateSourceOrMTU(netIfaceLink, nil, []*net.IPNet{masqIPNet}, + err = util.LinkRoutesApply(netIfaceLink, nil, []*net.IPNet{masqIPNet}, mtu, nodeIP) if err != nil { return fmt.Errorf("unable to add OVN masquerade route to host, error: %v", err) diff --git 
a/go-controller/pkg/node/management-port_linux.go b/go-controller/pkg/node/management-port_linux.go index df62858f3a..d35f9d83db 100644 --- a/go-controller/pkg/node/management-port_linux.go +++ b/go-controller/pkg/node/management-port_linux.go @@ -187,7 +187,7 @@ func setupManagementPortIPFamilyConfig(mpcfg *managementPortConfig, cfg *managem return warnings, err } - err = util.LinkRoutesAddOrUpdateSourceOrMTU(mpcfg.link, cfg.gwIP, []*net.IPNet{subnet}, config.Default.RoutableMTU, nil) + err = util.LinkRoutesApply(mpcfg.link, cfg.gwIP, []*net.IPNet{subnet}, config.Default.RoutableMTU, nil) if err != nil { return warnings, err } diff --git a/go-controller/pkg/node/node.go b/go-controller/pkg/node/node.go index e7ca7306a5..5d25df85e7 100644 --- a/go-controller/pkg/node/node.go +++ b/go-controller/pkg/node/node.go @@ -541,7 +541,7 @@ func (n *OvnNode) Start(ctx context.Context, wg *sync.WaitGroup) error { } else { gwIP = mgmtPortConfig.ipv6.gwIP } - err := util.LinkRoutesAddOrUpdateSourceOrMTU(link, gwIP, []*net.IPNet{subnet}, config.Default.RoutableMTU, nil) + err := util.LinkRoutesApply(link, gwIP, []*net.IPNet{subnet}, config.Default.RoutableMTU, nil) if err != nil { return fmt.Errorf("unable to add legacy route for services via mp0, error: %v", err) } diff --git a/go-controller/pkg/util/net_linux.go b/go-controller/pkg/util/net_linux.go index c0c452d482..ab1dd647a0 100644 --- a/go-controller/pkg/util/net_linux.go +++ b/go-controller/pkg/util/net_linux.go @@ -302,7 +302,7 @@ func LinkRoutesAdd(link netlink.Link, gwIP net.IP, subnets []*net.IPNet, mtu int return nil } -func LinkRoutesAddOrUpdateSourceOrMTU(link netlink.Link, gwIP net.IP, subnets []*net.IPNet, mtu int, src net.IP) error { +func LinkRoutesApply(link netlink.Link, gwIP net.IP, subnets []*net.IPNet, mtu int, src net.IP) error { for _, subnet := range subnets { route, err := LinkRouteGetFilteredRoute(filterRouteByDst(link, subnet)) if err != nil { diff --git 
a/go-controller/pkg/util/net_linux_unit_test.go b/go-controller/pkg/util/net_linux_unit_test.go index 060d38adf8..226d3548aa 100644 --- a/go-controller/pkg/util/net_linux_unit_test.go +++ b/go-controller/pkg/util/net_linux_unit_test.go @@ -671,7 +671,7 @@ func TestLinkRoutesAddOrUpdateMTU(t *testing.T) { }, }, { - desc: "LinkRoutesAddOrUpdateSrcOrMTU() returns NO error when subnets input list is empty", + desc: "LinkRoutesApply() returns NO error when subnets input list is empty", }, } for i, tc := range tests { @@ -680,7 +680,7 @@ func TestLinkRoutesAddOrUpdateMTU(t *testing.T) { ovntest.ProcessMockFnList(&mockNetLinkOps.Mock, tc.onRetArgsNetLinkLibOpers) ovntest.ProcessMockFnList(&mockLink.Mock, tc.onRetArgsLinkIfaceOpers) - err := LinkRoutesAddOrUpdateSourceOrMTU(tc.inputLink, tc.inputGwIP, tc.inputSubnets, tc.inputMTU, tc.inputSrc) + err := LinkRoutesApply(tc.inputLink, tc.inputGwIP, tc.inputSubnets, tc.inputMTU, tc.inputSrc) t.Log(err) if tc.errExp { assert.Error(t, err) From b0c423da7ce95d5377f8773a794e784205de684c Mon Sep 17 00:00:00 2001 From: Patryk Diak Date: Wed, 30 Nov 2022 18:44:04 +0100 Subject: [PATCH 2/8] LinkRoutesApply: update routes GW if it was changed Signed-off-by: Patryk Diak --- go-controller/pkg/util/net_linux.go | 14 ++++----- go-controller/pkg/util/net_linux_unit_test.go | 29 +++++++++++++++++-- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/go-controller/pkg/util/net_linux.go b/go-controller/pkg/util/net_linux.go index ab1dd647a0..0be93b25f8 100644 --- a/go-controller/pkg/util/net_linux.go +++ b/go-controller/pkg/util/net_linux.go @@ -302,6 +302,10 @@ func LinkRoutesAdd(link netlink.Link, gwIP net.IP, subnets []*net.IPNet, mtu int return nil } +// LinkRoutesApply applies routes for given subnets. 
+// For each subnet it searches for an existing route by destination(subnet) on link: +// * if found and gwIP, mtu or src changed the route will be updated +// * if not found it adds a new route func LinkRoutesApply(link netlink.Link, gwIP net.IP, subnets []*net.IPNet, mtu int, src net.IP) error { for _, subnet := range subnets { route, err := LinkRouteGetFilteredRoute(filterRouteByDst(link, subnet)) @@ -309,17 +313,11 @@ func LinkRoutesApply(link netlink.Link, gwIP net.IP, subnets []*net.IPNet, mtu i return err } if route != nil { - var changed bool - if route.MTU != mtu { + if route.MTU != mtu || !src.Equal(route.Src) || !gwIP.Equal(route.Gw) { route.MTU = mtu - changed = true - } - if !src.Equal(route.Src) { route.Src = src - changed = true - } + route.Gw = gwIP - if changed { err = netLinkOps.RouteReplace(route) if err != nil { return fmt.Errorf("failed to replace route for subnet %s via gateway %s with mtu %d: %v", diff --git a/go-controller/pkg/util/net_linux_unit_test.go b/go-controller/pkg/util/net_linux_unit_test.go index 226d3548aa..1720a34a0e 100644 --- a/go-controller/pkg/util/net_linux_unit_test.go +++ b/go-controller/pkg/util/net_linux_unit_test.go @@ -636,7 +636,7 @@ func TestLinkRoutesAddOrUpdateMTU(t *testing.T) { onRetArgsNetLinkLibOpers: []ovntest.TestifyMockHelper{ {OnCallMethodName: "RouteListFiltered", OnCallMethodArgType: []string{"int", "*netlink.Route", "uint64"}, RetArgList: []interface{}{[]netlink.Route{ { - Gw: ovntest.MustParseIP("192.168.0.1"), + Gw: nil, Dst: ovntest.MustParseIPNet("10.18.20.0/24"), MTU: 1400, Src: ovntest.MustParseIP("192.168.0.10"), @@ -649,9 +649,32 @@ func TestLinkRoutesAddOrUpdateMTU(t *testing.T) { }, }, { - desc: "Route exists, has the same (mtu and source) and is not updated", + desc: "Route exists, has different gw and is updated", inputLink: mockLink, - inputGwIP: nil, + inputGwIP: ovntest.MustParseIP("192.168.0.1"), + inputSubnets: ovntest.MustParseIPNets("10.18.20.0/24"), + inputMTU: 1400, + inputSrc: 
ovntest.MustParseIP("192.168.0.8"), + errExp: false, + onRetArgsNetLinkLibOpers: []ovntest.TestifyMockHelper{ + {OnCallMethodName: "RouteListFiltered", OnCallMethodArgType: []string{"int", "*netlink.Route", "uint64"}, RetArgList: []interface{}{[]netlink.Route{ + { + Gw: ovntest.MustParseIP("192.168.0.2"), + Dst: ovntest.MustParseIPNet("10.18.20.0/24"), + MTU: 1400, + Src: ovntest.MustParseIP("192.168.0.8"), + }, + }, nil}}, + {OnCallMethodName: "RouteReplace", OnCallMethodArgType: []string{"*netlink.Route"}, RetArgList: []interface{}{nil}}, + }, + onRetArgsLinkIfaceOpers: []ovntest.TestifyMockHelper{ + {OnCallMethodName: "Attrs", OnCallMethodArgType: []string{}, RetArgList: []interface{}{&netlink.LinkAttrs{Name: "testIfaceName", Index: 1}}}, + }, + }, + { + desc: "Route exists, has the same (mtu, source and gw) and is not updated", + inputLink: mockLink, + inputGwIP: ovntest.MustParseIP("192.168.0.1"), inputSubnets: ovntest.MustParseIPNets("10.18.20.0/24"), inputMTU: 1400, inputSrc: ovntest.MustParseIP("192.168.0.10"), From f620c6c6adbb2a2367eabb5c86b6fd2b0e831272 Mon Sep 17 00:00:00 2001 From: Yun Zhou Date: Tue, 1 Nov 2022 08:35:26 -0700 Subject: [PATCH 3/8] make code shareable for secondary network controller Existing OVN code can be shared by the default network controller and the upcoming secondary layer 3 network controller. Update it to get ready for the secondary layer 3 network support.
Signed-off-by: Yun Zhou --- .../network_controller_manager.go | 21 +- .../pkg/ovn/base_network_controller.go | 605 +++++++++++++++ .../pkg/ovn/base_network_controller_pods.go | 729 ++++++++++++++++++ .../pkg/ovn/default_network_controller.go | 74 +- go-controller/pkg/ovn/master.go | 330 +------- go-controller/pkg/ovn/master_test.go | 1 - go-controller/pkg/ovn/namespace.go | 113 --- go-controller/pkg/ovn/ovn.go | 57 +- go-controller/pkg/ovn/ovn_test.go | 5 +- go-controller/pkg/ovn/pods.go | 705 ++--------------- go-controller/pkg/ovn/topology_version.go | 41 +- go-controller/pkg/util/node_annotations.go | 5 +- 12 files changed, 1460 insertions(+), 1226 deletions(-) create mode 100644 go-controller/pkg/ovn/base_network_controller.go create mode 100644 go-controller/pkg/ovn/base_network_controller_pods.go diff --git a/go-controller/pkg/network-controller-manager/network_controller_manager.go b/go-controller/pkg/network-controller-manager/network_controller_manager.go index ebc6a4eb24..5a064416f3 100644 --- a/go-controller/pkg/network-controller-manager/network_controller_manager.go +++ b/go-controller/pkg/network-controller-manager/network_controller_manager.go @@ -59,6 +59,8 @@ type NetworkControllerManager struct { sbClient libovsdbclient.Client // has SCTP support SCTPSupport bool + // Supports multicast? 
+ multicastSupport bool stopChan chan struct{} wg *sync.WaitGroup @@ -194,6 +196,8 @@ func (cm *NetworkControllerManager) Init() error { return err } + cm.configureMulticastSupport() + err = cm.enableOVNLogicalDataPathGroups() if err != nil { return err @@ -218,6 +222,17 @@ func (cm *NetworkControllerManager) configureSCTPSupport() error { return nil } +func (cm *NetworkControllerManager) configureMulticastSupport() { + cm.multicastSupport = config.EnableMulticast + if cm.multicastSupport { + if _, _, err := util.RunOVNSbctl("--columns=_uuid", "list", "IGMP_Group"); err != nil { + klog.Warningf("Multicast support enabled, however version of OVN in use does not support IGMP Group. " + + "Disabling Multicast Support") + cm.multicastSupport = false + } + } +} + // enableOVNLogicalDataPathGroups sets an OVN flag to enable logical datapath // groups on OVN 20.12 and later. The option is ignored if OVN doesn't // understand it. Logical datapath groups reduce the size of the southbound @@ -242,9 +257,9 @@ func (cm *NetworkControllerManager) configureMetrics(stopChan <-chan struct{}) { // NewDefaultNetworkController creates and returns the controller for default network func (cm *NetworkControllerManager) NewDefaultNetworkController() { - bnc := ovn.NewBaseNetworkController(cm.client, cm.kube, cm.watchFactory, cm.recorder, cm.nbClient, - cm.sbClient, cm.podRecorder, cm.SCTPSupport) - defaultController := ovn.NewDefaultNetworkController(bnc) + cnci := ovn.NewCommonNetworkControllerInfo(cm.client, cm.kube, cm.watchFactory, cm.recorder, cm.nbClient, + cm.sbClient, cm.podRecorder, cm.SCTPSupport, cm.multicastSupport) + defaultController := ovn.NewDefaultNetworkController(cnci) cm.ovnControllers[ovntypes.DefaultNetworkName] = defaultController } diff --git a/go-controller/pkg/ovn/base_network_controller.go b/go-controller/pkg/ovn/base_network_controller.go new file mode 100644 index 0000000000..440cb038d4 --- /dev/null +++ b/go-controller/pkg/ovn/base_network_controller.go @@ 
-0,0 +1,605 @@ +package ovn + +import ( + "context" + "fmt" + "math" + "net" + "strconv" + "sync" + "time" + + libovsdbclient "github.com/ovn-org/libovsdb/client" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/factory" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/kube" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdbops" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/metrics" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb" + addressset "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/address_set" + ovnlb "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/loadbalancer" + lsm "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/logical_switch_manager" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/subnetallocator" + ovnretry "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/retry" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" + + kapi "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/fields" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/sets" + clientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/record" + "k8s.io/client-go/util/retry" + "k8s.io/klog/v2" + utilnet "k8s.io/utils/net" +) + +// CommonNetworkControllerInfo structure is place holder for all fields shared among controllers. +type CommonNetworkControllerInfo struct { + client clientset.Interface + kube kube.Interface + watchFactory *factory.WatchFactory + podRecorder *metrics.PodRecorder + + // event recorder used to post events to k8s + recorder record.EventRecorder + + // libovsdb northbound client interface + nbClient libovsdbclient.Client + + // libovsdb southbound client interface + sbClient libovsdbclient.Client + + // has SCTP support + SCTPSupport bool + + // Supports multicast? 
+ multicastSupport bool +} + +// BaseNetworkController structure holds per-network fields and network specific configuration +// Note that all the methods with NetworkControllerInfo pointer receivers will be called +// by more than one type of network controllers. +type BaseNetworkController struct { + CommonNetworkControllerInfo + + // retry framework for pods + retryPods *ovnretry.RetryFramework + // retry framework for nodes + retryNodes *ovnretry.RetryFramework + + // pod events factory handler + podHandler *factory.Handler + // node events factory handler + nodeHandler *factory.Handler + + // A cache of all logical switches seen by the watcher and their subnets + lsManager *lsm.LogicalSwitchManager + + // A cache of all logical ports known to the controller + logicalPortCache *portCache + + // Info about known namespaces. You must use oc.getNamespaceLocked() or + // oc.waitForNamespaceLocked() to read this map, and oc.createNamespaceLocked() + // or oc.deleteNamespaceLocked() to modify it. namespacesMutex is only held + // from inside those functions. 
+ namespaces map[string]*namespaceInfo + namespacesMutex sync.Mutex + + // An address set factory that creates address sets + addressSetFactory addressset.AddressSetFactory + + // stopChan per controller + stopChan chan struct{} +} + +// NewCommonNetworkControllerInfo creates CommonNetworkControllerInfo shared by controllers +func NewCommonNetworkControllerInfo(client clientset.Interface, kube kube.Interface, wf *factory.WatchFactory, + recorder record.EventRecorder, nbClient libovsdbclient.Client, sbClient libovsdbclient.Client, + podRecorder *metrics.PodRecorder, SCTPSupport, multicastSupport bool) *CommonNetworkControllerInfo { + return &CommonNetworkControllerInfo{ + client: client, + kube: kube, + watchFactory: wf, + recorder: recorder, + nbClient: nbClient, + sbClient: sbClient, + podRecorder: podRecorder, + SCTPSupport: SCTPSupport, + multicastSupport: multicastSupport, + } +} + +// createOvnClusterRouter creates the central router for the network +func (bnc *BaseNetworkController) createOvnClusterRouter() (*nbdb.LogicalRouter, error) { + // Create default Control Plane Protection (COPP) entry for routers + defaultCOPPUUID, err := EnsureDefaultCOPP(bnc.nbClient) + if err != nil { + return nil, fmt.Errorf("unable to create router control plane protection: %w", err) + } + + // Create a single common distributed router for the cluster. 
+ logicalRouterName := types.OVNClusterRouter + logicalRouter := nbdb.LogicalRouter{ + Name: logicalRouterName, + ExternalIDs: map[string]string{ + "k8s-cluster-router": "yes", + }, + Options: map[string]string{ + "always_learn_from_arp_request": "false", + }, + Copp: &defaultCOPPUUID, + } + if bnc.multicastSupport { + logicalRouter.Options = map[string]string{ + "mcast_relay": "true", + } + } + + err = libovsdbops.CreateOrUpdateLogicalRouter(bnc.nbClient, &logicalRouter) + if err != nil { + return nil, fmt.Errorf("failed to create distributed router %s, error: %v", + logicalRouterName, err) + } + + return &logicalRouter, nil +} + +// syncNodeClusterRouterPort ensures a node's LS to the cluster router's LRP is created. +// NOTE: We could have created the router port in ensureNodeLogicalNetwork() instead of here, +// but chassis ID is not available at that moment. We need the chassis ID to set the +// gateway-chassis, which in effect pins the logical switch to the current node in OVN. +// Otherwise, ovn-controller will flood-fill unrelated datapaths unnecessarily, causing scale +// problems. 
+func (bnc *BaseNetworkController) syncNodeClusterRouterPort(node *kapi.Node, hostSubnets []*net.IPNet) error { + chassisID, err := util.ParseNodeChassisIDAnnotation(node) + if err != nil { + return err + } + + if len(hostSubnets) == 0 { + hostSubnets, err = util.ParseNodeHostSubnetAnnotation(node, types.DefaultNetworkName) + if err != nil { + return err + } + } + + // logical router port MAC is based on IPv4 subnet if there is one, else IPv6 + var nodeLRPMAC net.HardwareAddr + for _, hostSubnet := range hostSubnets { + gwIfAddr := util.GetNodeGatewayIfAddr(hostSubnet) + nodeLRPMAC = util.IPAddrToHWAddr(gwIfAddr.IP) + if !utilnet.IsIPv6CIDR(hostSubnet) { + break + } + } + + switchName := node.Name + logicalRouterName := types.OVNClusterRouter + lrpName := types.RouterToSwitchPrefix + switchName + lrpNetworks := []string{} + for _, hostSubnet := range hostSubnets { + gwIfAddr := util.GetNodeGatewayIfAddr(hostSubnet) + lrpNetworks = append(lrpNetworks, gwIfAddr.String()) + } + logicalRouterPort := nbdb.LogicalRouterPort{ + Name: lrpName, + MAC: nodeLRPMAC.String(), + Networks: lrpNetworks, + } + logicalRouter := nbdb.LogicalRouter{Name: logicalRouterName} + + err = libovsdbops.CreateOrUpdateLogicalRouterPorts(bnc.nbClient, &logicalRouter, + []*nbdb.LogicalRouterPort{&logicalRouterPort}, &logicalRouterPort.MAC, &logicalRouterPort.Networks) + if err != nil { + klog.Errorf("Failed to add logical router port %+v to router %s: %v", logicalRouterPort, logicalRouterName, err) + return err + } + + gatewayChassisName := lrpName + "-" + chassisID + gatewayChassis := nbdb.GatewayChassis{ + Name: gatewayChassisName, + ChassisName: chassisID, + Priority: 1, + } + + err = libovsdbops.CreateOrUpdateGatewayChassis(bnc.nbClient, &logicalRouterPort, &gatewayChassis, + &gatewayChassis.Name, &gatewayChassis.ChassisName, &gatewayChassis.Priority) + if err != nil { + klog.Errorf("Failed to add gateway chassis %s to logical router port %s, error: %v", chassisID, lrpName, err) + return err 
+ } + + return nil +} + +func (bnc *BaseNetworkController) createNodeLogicalSwitch(nodeName string, hostSubnets []*net.IPNet, + loadBalancerGroupUUID string) error { + // logical router port MAC is based on IPv4 subnet if there is one, else IPv6 + var nodeLRPMAC net.HardwareAddr + switchName := nodeName + for _, hostSubnet := range hostSubnets { + gwIfAddr := util.GetNodeGatewayIfAddr(hostSubnet) + nodeLRPMAC = util.IPAddrToHWAddr(gwIfAddr.IP) + if !utilnet.IsIPv6CIDR(hostSubnet) { + break + } + } + + logicalSwitch := nbdb.LogicalSwitch{ + Name: switchName, + } + + var v4Gateway, v6Gateway net.IP + logicalRouterPortNetwork := []string{} + logicalSwitch.OtherConfig = map[string]string{} + for _, hostSubnet := range hostSubnets { + gwIfAddr := util.GetNodeGatewayIfAddr(hostSubnet) + mgmtIfAddr := util.GetNodeManagementIfAddr(hostSubnet) + logicalRouterPortNetwork = append(logicalRouterPortNetwork, gwIfAddr.String()) + + if utilnet.IsIPv6CIDR(hostSubnet) { + v6Gateway = gwIfAddr.IP + + logicalSwitch.OtherConfig["ipv6_prefix"] = + hostSubnet.IP.String() + } else { + v4Gateway = gwIfAddr.IP + excludeIPs := mgmtIfAddr.IP.String() + if config.HybridOverlay.Enabled { + hybridOverlayIfAddr := util.GetNodeHybridOverlayIfAddr(hostSubnet) + excludeIPs += ".." 
+ hybridOverlayIfAddr.IP.String() + } + logicalSwitch.OtherConfig["subnet"] = hostSubnet.String() + logicalSwitch.OtherConfig["exclude_ips"] = excludeIPs + } + } + + if loadBalancerGroupUUID != "" { + logicalSwitch.LoadBalancerGroup = []string{loadBalancerGroupUUID} + } + + logicalRouterPortName := types.RouterToSwitchPrefix + switchName + logicalRouterPort := nbdb.LogicalRouterPort{ + Name: logicalRouterPortName, + MAC: nodeLRPMAC.String(), + Networks: logicalRouterPortNetwork, + } + logicalRouterName := types.OVNClusterRouter + logicalRouter := nbdb.LogicalRouter{Name: logicalRouterName} + + err := libovsdbops.CreateOrUpdateLogicalRouterPorts(bnc.nbClient, &logicalRouter, + []*nbdb.LogicalRouterPort{&logicalRouterPort}, &logicalRouterPort.Networks, &logicalRouterPort.MAC) + if err != nil { + return fmt.Errorf("failed to add logical router port %+v to router %s: %v", logicalRouterPort, logicalRouterName, err) + } + + // If supported, enable IGMP/MLD snooping and querier on the node. + if bnc.multicastSupport { + logicalSwitch.OtherConfig["mcast_snoop"] = "true" + + // Configure IGMP/MLD querier if the gateway IP address is known. + // Otherwise disable it. + if v4Gateway != nil || v6Gateway != nil { + logicalSwitch.OtherConfig["mcast_querier"] = "true" + logicalSwitch.OtherConfig["mcast_eth_src"] = nodeLRPMAC.String() + if v4Gateway != nil { + logicalSwitch.OtherConfig["mcast_ip4_src"] = v4Gateway.String() + } + if v6Gateway != nil { + logicalSwitch.OtherConfig["mcast_ip6_src"] = util.HWAddrToIPv6LLA(nodeLRPMAC).String() + } + } else { + logicalSwitch.OtherConfig["mcast_querier"] = "false" + } + } + + err = libovsdbops.CreateOrUpdateLogicalSwitch(bnc.nbClient, &logicalSwitch, &logicalSwitch.OtherConfig, + &logicalSwitch.LoadBalancerGroup) + if err != nil { + return fmt.Errorf("failed to add logical switch %+v: %v", logicalSwitch, err) + } + + // Connect the switch to the router. 
+ logicalSwitchPort := nbdb.LogicalSwitchPort{ + Name: types.SwitchToRouterPrefix + switchName, + Type: "router", + Addresses: []string{"router"}, + Options: map[string]string{"router-port": types.RouterToSwitchPrefix + switchName}, + } + sw := nbdb.LogicalSwitch{Name: switchName} + err = libovsdbops.CreateOrUpdateLogicalSwitchPortsOnSwitch(bnc.nbClient, &sw, &logicalSwitchPort) + if err != nil { + klog.Errorf("Failed to add logical port %+v to switch %s: %v", logicalSwitchPort, switchName, err) + return err + } + + // multicast is only supported in default network for now + if bnc.multicastSupport { + err = libovsdbops.AddPortsToPortGroup(bnc.nbClient, types.ClusterRtrPortGroupName, logicalSwitchPort.UUID) + if err != nil { + klog.Errorf(err.Error()) + return err + } + } + + // Add the switch to the logical switch cache + return bnc.lsManager.AddSwitch(logicalSwitch.Name, logicalSwitch.UUID, hostSubnets) +} + +func (bnc *BaseNetworkController) allocateNodeSubnets(node *kapi.Node, + masterSubnetAllocator *subnetallocator.HostSubnetAllocator) ([]*net.IPNet, error) { + existingSubnets, err := util.ParseNodeHostSubnetAnnotation(node, types.DefaultNetworkName) + if err != nil && !util.IsAnnotationNotSetError(err) { + // Log the error and try to allocate new subnets + klog.Infof("Failed to get node %s host subnets annotations: %v", node.Name, err) + } + + hostSubnets, allocatedSubnets, err := masterSubnetAllocator.AllocateNodeSubnets(node.Name, existingSubnets, config.IPv4Mode, config.IPv6Mode) + if err != nil { + return nil, err + } + // Release the allocation on error + defer func() { + if err != nil { + if errR := masterSubnetAllocator.ReleaseNodeSubnets(node.Name, allocatedSubnets...); errR != nil { + klog.Warningf("Error releasing node %s subnets: %v", node.Name, errR) + } + } + }() + + return hostSubnets, nil +} + +// UpdateNodeAnnotationWithRetry update node's hostSubnet annotation (possibly for multiple networks) and the +// other given node annotations +func 
(bnc *BaseNetworkController) UpdateNodeAnnotationWithRetry(nodeName string, hostSubnetsMap map[string][]*net.IPNet, + otherUpdatedNodeAnnotation map[string]string) error { + // Retry if it fails because of potential conflict which is transient. Return error in the + // case of other errors (say temporary API server down), and it will be taken care of by the + // retry mechanism. + resultErr := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + // Informer cache should not be mutated, so get a copy of the object + node, err := bnc.watchFactory.GetNode(nodeName) + if err != nil { + return err + } + + cnode := node.DeepCopy() + for netName, hostSubnets := range hostSubnetsMap { + cnode.Annotations, err = util.UpdateNodeHostSubnetAnnotation(cnode.Annotations, hostSubnets, netName) + if err != nil { + return fmt.Errorf("failed to update node %q annotation subnet %s", + node.Name, util.JoinIPNets(hostSubnets, ",")) + } + } + for k, v := range otherUpdatedNodeAnnotation { + cnode.Annotations[k] = v + } + return bnc.kube.UpdateNode(cnode) + }) + if resultErr != nil { + return fmt.Errorf("failed to update node %s annotation", nodeName) + } + return nil +} + +// deleteNodeLogicalNetwork removes the logical switch and logical router port associated with the node +func (bnc *BaseNetworkController) deleteNodeLogicalNetwork(nodeName string) error { + switchName := nodeName + // Remove switch to lb associations from the LBCache before removing the switch + lbCache, err := ovnlb.GetLBCache(bnc.nbClient) + if err != nil { + return fmt.Errorf("failed to get load_balancer cache for node %s: %v", nodeName, err) + } + lbCache.RemoveSwitch(switchName) + + // Remove the logical switch associated with the node + err = libovsdbops.DeleteLogicalSwitch(bnc.nbClient, switchName) + if err != nil { + return fmt.Errorf("failed to delete logical switch %s: %v", switchName, err) + } + + logicalRouterName := types.OVNClusterRouter + logicalRouter := nbdb.LogicalRouter{Name: 
logicalRouterName} + logicalRouterPort := nbdb.LogicalRouterPort{ + Name: types.RouterToSwitchPrefix + switchName, + } + err = libovsdbops.DeleteLogicalRouterPorts(bnc.nbClient, &logicalRouter, &logicalRouterPort) + if err != nil { + return fmt.Errorf("failed to delete router port %s: %v", logicalRouterPort.Name, err) + } + + return nil +} + +// updates the list of nodes if the given node manages its hostSubnets; returns its hostSubnets if any +func (bnc *BaseNetworkController) updateNodesManageHostSubnets(node *kapi.Node, + masterSubnetAllocator *subnetallocator.HostSubnetAllocator, foundNodes sets.String) []*net.IPNet { + if noHostSubnet(node) { + return []*net.IPNet{} + } + hostSubnets, _ := util.ParseNodeHostSubnetAnnotation(node, types.DefaultNetworkName) + foundNodes.Insert(node.Name) + + klog.V(5).Infof("Node %s contains subnets: %v", node.Name, hostSubnets) + if err := masterSubnetAllocator.MarkSubnetsAllocated(node.Name, hostSubnets...); err != nil { + utilruntime.HandleError(err) + } + return hostSubnets +} + +func (bnc *BaseNetworkController) addAllPodsOnNode(nodeName string) []error { + errs := []error{} + options := metav1.ListOptions{ + FieldSelector: fields.OneTermEqualSelector("spec.nodeName", nodeName).String(), + ResourceVersion: "0", + } + pods, err := bnc.client.CoreV1().Pods(metav1.NamespaceAll).List(context.TODO(), options) + if err != nil { + errs = append(errs, err) + klog.Errorf("Unable to list existing pods on node: %s, existing pods on this node may not function", + nodeName) + } else { + klog.V(5).Infof("When adding node %s, found %d pods to add to retryPods", nodeName, len(pods.Items)) + for _, pod := range pods.Items { + pod := pod + if util.PodCompleted(&pod) { + continue + } + klog.V(5).Infof("Adding pod %s/%s to retryPods", pod.Namespace, pod.Name) + err = bnc.retryPods.AddRetryObjWithAddNoBackoff(&pod) + if err != nil { + errs = append(errs, err) + klog.Errorf("Failed to add pod %s/%s to retryPods: %v", pod.Namespace, pod.Name, 
err) + } + } + } + bnc.retryPods.RequestRetryObjs() + return errs +} + +func (bnc *BaseNetworkController) updateL3TopologyVersion() error { + currentTopologyVersion := strconv.Itoa(types.OvnCurrentTopologyVersion) + clusterRouterName := types.OVNClusterRouter + logicalRouter := nbdb.LogicalRouter{ + Name: clusterRouterName, + ExternalIDs: map[string]string{"k8s-ovn-topo-version": currentTopologyVersion}, + } + err := libovsdbops.UpdateLogicalRouterSetExternalIDs(bnc.nbClient, &logicalRouter) + if err != nil { + return fmt.Errorf("failed to generate set topology version, err: %v", err) + } + klog.Infof("Updated Logical_Router %s topology version to %s", clusterRouterName, currentTopologyVersion) + return nil +} + +// determineOVNTopoVersionFromOVN determines what OVN Topology version is being used +// If "k8s-ovn-topo-version" key in external_ids column does not exist, it is prior to OVN topology versioning +// and therefore set version number to OvnCurrentTopologyVersion +func (bnc *BaseNetworkController) determineOVNTopoVersionFromOVN() (int, error) { + clusterRouterName := types.OVNClusterRouter + logicalRouter := &nbdb.LogicalRouter{Name: clusterRouterName} + logicalRouter, err := libovsdbops.GetLogicalRouter(bnc.nbClient, logicalRouter) + if err != nil && err != libovsdbclient.ErrNotFound { + return 0, fmt.Errorf("error getting router %s: %v", clusterRouterName, err) + } + if err == libovsdbclient.ErrNotFound { + // no OVNClusterRouter exists, DB is empty, nothing to upgrade + return math.MaxInt32, nil + } + v, exists := logicalRouter.ExternalIDs["k8s-ovn-topo-version"] + if !exists { + klog.Infof("No version string found. The OVN topology is before versioning is introduced. 
Upgrade needed") + return 0, nil + } + ver, err := strconv.Atoi(v) + if err != nil { + return 0, fmt.Errorf("invalid OVN topology version string for the cluster, err: %v", err) + } + return ver, nil +} + +// getNamespaceLocked locks namespacesMutex, looks up ns, and (if found), returns it with +// its mutex locked. If ns is not known, nil will be returned +func (bnc *BaseNetworkController) getNamespaceLocked(ns string, readOnly bool) (*namespaceInfo, func()) { + // Only hold namespacesMutex while reading/modifying oc.namespaces. In particular, + // we drop namespacesMutex while trying to claim nsInfo.Mutex, because something + // else might have locked the nsInfo and be doing something slow with it, and we + // don't want to block all access to oc.namespaces while that's happening. + bnc.namespacesMutex.Lock() + nsInfo := bnc.namespaces[ns] + bnc.namespacesMutex.Unlock() + + if nsInfo == nil { + return nil, nil + } + var unlockFunc func() + if readOnly { + unlockFunc = func() { nsInfo.RUnlock() } + nsInfo.RLock() + } else { + unlockFunc = func() { nsInfo.Unlock() } + nsInfo.Lock() + } + // Check that the namespace wasn't deleted while we were waiting for the lock + bnc.namespacesMutex.Lock() + defer bnc.namespacesMutex.Unlock() + if nsInfo != bnc.namespaces[ns] { + unlockFunc() + return nil, nil + } + return nsInfo, unlockFunc +} + +// deleteNamespaceLocked locks namespacesMutex, finds and deletes ns, and returns the +// namespace, locked. +func (bnc *BaseNetworkController) deleteNamespaceLocked(ns string) *namespaceInfo { + // The locking here is the same as in getNamespaceLocked + + bnc.namespacesMutex.Lock() + nsInfo := bnc.namespaces[ns] + bnc.namespacesMutex.Unlock() + + if nsInfo == nil { + return nil + } + nsInfo.Lock() + + bnc.namespacesMutex.Lock() + defer bnc.namespacesMutex.Unlock() + if nsInfo != bnc.namespaces[ns] { + nsInfo.Unlock() + return nil + } + if nsInfo.addressSet != nil { + // Empty the address set, then delete it after an interval. 
+ if err := nsInfo.addressSet.SetIPs(nil); err != nil { + klog.Errorf("Warning: failed to empty address set for deleted NS %s: %v", ns, err) + } + + // Delete the address set after a short delay. + // This is so NetworkPolicy handlers can converge and stop referencing it. + addressSet := nsInfo.addressSet + go func() { + select { + case <-bnc.stopChan: + return + case <-time.After(20 * time.Second): + // Check to see if the NS was re-added in the meanwhile. If so, + // only delete if the new NS's AddressSet shouldn't exist. + nsInfo, nsUnlock := bnc.getNamespaceLocked(ns, true) + if nsInfo != nil { + defer nsUnlock() + if nsInfo.addressSet != nil { + klog.V(5).Infof("Skipping deferred deletion of AddressSet for NS %s: re-created", ns) + return + } + } + + klog.V(5).Infof("Finishing deferred deletion of AddressSet for NS %s", ns) + if err := addressSet.Destroy(); err != nil { + klog.Errorf("Failed to delete AddressSet for NS %s: %v", ns, err.Error()) + } + } + }() + } + delete(bnc.namespaces, ns) + + return nsInfo +} + +// WatchNodes starts the watching of the nodes resource and calls back the appropriate handler logic +func (bnc *BaseNetworkController) WatchNodes() error { + if bnc.nodeHandler != nil { + return nil + } + + handler, err := bnc.retryNodes.WatchResource() + if err == nil { + bnc.nodeHandler = handler + } + return err +} diff --git a/go-controller/pkg/ovn/base_network_controller_pods.go b/go-controller/pkg/ovn/base_network_controller_pods.go new file mode 100644 index 0000000000..2908a8168e --- /dev/null +++ b/go-controller/pkg/ovn/base_network_controller_pods.go @@ -0,0 +1,729 @@ +package ovn + +import ( + "fmt" + "net" + "time" + + networkattachmentdefinitionapi "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/apis/k8s.cni.cncf.io/v1" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/metrics" + 
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/ipallocator" + logicalswitchmanager "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/logical_switch_manager" + ovntypes "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" + "github.com/pkg/errors" + kapi "k8s.io/api/core/v1" + kerrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/util/retry" + "k8s.io/klog/v2" + utilnet "k8s.io/utils/net" + + libovsdbclient "github.com/ovn-org/libovsdb/client" + "github.com/ovn-org/libovsdb/ovsdb" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdbops" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb" +) + +func (bnc *BaseNetworkController) allocatePodIPs(pod *kapi.Pod, + annotations *util.PodAnnotation) (expectedLogicalPortName string, err error) { + switchName := pod.Spec.NodeName + if !util.PodScheduled(pod) || !util.PodWantsNetwork(pod) || util.PodCompleted(pod) { + return "", nil + } + // skip nodes that are not running ovnk (inferred from host subnets) + if bnc.lsManager.IsNonHostSubnetSwitch(switchName) { + return "", nil + } + expectedLogicalPortName = util.GetLogicalPortName(pod.Namespace, pod.Name) + // it is possible to try to add a pod here that has no node. For example if a pod was deleted with + // a finalizer, and then the node was removed. In this case the pod will still exist in a running state. 
+ // Terminating pods should still have network connectivity for pre-stop hooks or termination grace period + if _, err := bnc.watchFactory.GetNode(pod.Spec.NodeName); kerrors.IsNotFound(err) && + bnc.lsManager.GetSwitchSubnets(switchName) == nil { + if util.PodTerminating(pod) { + klog.Infof("Ignoring IP allocation for terminating pod: %s/%s, on deleted "+ + "node: %s", pod.Namespace, pod.Name, pod.Spec.NodeName) + return expectedLogicalPortName, nil + } else { + // unknown condition how we are getting a non-terminating pod without a node here + klog.Errorf("Pod IP allocation found for a non-existent node in API with unknown "+ + "condition. Pod: %s/%s, node: %s", pod.Namespace, pod.Name, pod.Spec.NodeName) + } + } + if err := bnc.waitForNodeLogicalSwitchInCache(switchName); err != nil { + return expectedLogicalPortName, fmt.Errorf("failed to wait for switch %s to be added to cache. IP allocation may fail!", + switchName) + } + if err = bnc.lsManager.AllocateIPs(switchName, annotations.IPs); err != nil { + if err == ipallocator.ErrAllocated { + // already allocated: log an error but not stop syncPod from continuing + klog.Errorf("Already allocated IPs: %s for pod: %s on switchName: %s", + util.JoinIPNetIPs(annotations.IPs, " "), expectedLogicalPortName, + switchName) + } else { + return expectedLogicalPortName, fmt.Errorf("couldn't allocate IPs: %s for pod: %s on switch: %s"+ + " error: %v", util.JoinIPNetIPs(annotations.IPs, " "), expectedLogicalPortName, + switchName, err) + } + } + return expectedLogicalPortName, nil +} + +func (bnc *BaseNetworkController) deleteStaleLogicalSwitchPorts(expectedLogicalPorts map[string]bool) error { + // get all the nodes from the watchFactory + nodes, err := bnc.watchFactory.GetNodes() + if err != nil { + return fmt.Errorf("failed to get nodes: %v", err) + } + + var ops []ovsdb.Operation + for _, n := range nodes { + // skip nodes that are not running ovnk (inferred from host subnets) + switchName := n.Name + if 
bnc.lsManager.IsNonHostSubnetSwitch(switchName) { + continue + } + p := func(item *nbdb.LogicalSwitchPort) bool { + return item.ExternalIDs["pod"] == "true" && !expectedLogicalPorts[item.Name] + } + sw := nbdb.LogicalSwitch{ + Name: switchName, + } + sw.UUID, _ = bnc.lsManager.GetUUID(switchName) + + ops, err = libovsdbops.DeleteLogicalSwitchPortsWithPredicateOps(bnc.nbClient, ops, &sw, p) + if err != nil { + return fmt.Errorf("could not generate ops to delete stale ports from logical switch %s (%+v)", switchName, err) + } + } + + _, err = libovsdbops.TransactAndCheck(bnc.nbClient, ops) + if err != nil { + return fmt.Errorf("could not remove stale logicalPorts from switches (%+v)", err) + } + return nil +} + +// lookupPortUUIDAndSwitchName will use libovsdb to locate the logical switch port uuid as well as the logical switch +// that owns such port (aka nodeName), based on the logical port name. +func (bnc *BaseNetworkController) lookupPortUUIDAndSwitchName(logicalPort string) (portUUID string, logicalSwitch string, err error) { + lsp := &nbdb.LogicalSwitchPort{Name: logicalPort} + lsp, err = libovsdbops.GetLogicalSwitchPort(bnc.nbClient, lsp) + if err != nil { + return "", "", err + } + p := func(item *nbdb.LogicalSwitch) bool { + for _, currPortUUID := range item.Ports { + if currPortUUID == lsp.UUID { + return true + } + } + return false + } + nodeSwitches, err := libovsdbops.FindLogicalSwitchesWithPredicate(bnc.nbClient, p) + if err != nil { + return "", "", fmt.Errorf("failed to get node logical switch for logical port %s (%s): %w", logicalPort, lsp.UUID, err) + } + if len(nodeSwitches) != 1 { + return "", "", fmt.Errorf("found %d node logical switch for logical port %s (%s)", len(nodeSwitches), logicalPort, lsp.UUID) + } + return lsp.UUID, nodeSwitches[0].Name, nil +} + +func (bnc *BaseNetworkController) deletePodLogicalPort(pod *kapi.Pod, portInfo *lpInfo) (*lpInfo, error) { + var portUUID, switchName string + var podIfAddrs []*net.IPNet + var err error + + 
// get the logical switch name that the pod's logical port is expected to be on + expectedSwitchName := pod.Spec.NodeName + podDesc := fmt.Sprintf("pod %s/%s", pod.Namespace, pod.Name) + logicalPort := util.GetLogicalPortName(pod.Namespace, pod.Name) + if portInfo == nil { + // If ovnkube-master restarts, it is also possible the Pod's logical switch port + // is not re-added into the cache. Delete logical switch port anyway. + annotation, err := util.UnmarshalPodAnnotation(pod.Annotations, ovntypes.DefaultNetworkName) + if err != nil { + if util.IsAnnotationNotSetError(err) { + // if the annotation doesn’t exist, that’s not an error. It means logical port does not need to be deleted. + klog.V(5).Infof("No annotations on %s, no need to delete its logical port: %s", podDesc, logicalPort) + return nil, nil + } + return nil, fmt.Errorf("unable to unmarshal pod annotations for %s: %w", podDesc, err) + } + + // Since portInfo is not available, use ovn to locate the logical switch (named after the node name) for the logical port. + portUUID, switchName, err = bnc.lookupPortUUIDAndSwitchName(logicalPort) + if err != nil { + if err != libovsdbclient.ErrNotFound { + return nil, fmt.Errorf("unable to locate portUUID+switchName for %s: %w", podDesc, err) + } + // The logical port no longer exists in OVN. The caller expects this function to be idempotent, + // so the proper action to take is to use an empty uuid and extract the node name from the pod spec. + portUUID = "" + switchName = expectedSwitchName + } + podIfAddrs = annotation.IPs + + klog.Warningf("No cached port info for deleting %s. 
Using logical switch %s port uuid %s and addrs %v", + podDesc, switchName, portUUID, podIfAddrs) + } else { + portUUID = portInfo.uuid + switchName = portInfo.logicalSwitch + podIfAddrs = portInfo.ips + } + + // Sanity check + if switchName != expectedSwitchName { + klog.Errorf("Deleting %s expecting switch name: %s, OVN DB has switch name %s for port uuid %s", + podDesc, expectedSwitchName, switchName, portUUID) + } + + shouldRelease := true + // check to make sure no other pods are using this IP before we try to release it if this is a completed pod. + if util.PodCompleted(pod) { + if shouldRelease, err = bnc.lsManager.ConditionalIPRelease(switchName, podIfAddrs, func() (bool, error) { + pods, err := bnc.watchFactory.GetAllPods() + if err != nil { + return false, fmt.Errorf("unable to get pods to determine if completed pod IP is in use by another pod. "+ + "Will not release pod %s/%s IP: %#v from allocator", pod.Namespace, pod.Name, podIfAddrs) + } + // iterate through all pods, ignore pods on other switches + for _, p := range pods { + if util.PodCompleted(p) || !util.PodWantsNetwork(p) || !util.PodScheduled(p) || expectedSwitchName != switchName { + continue + } + // check if the pod addresses match in the OVN annotation + pAddrs, err := util.GetAllPodIPs(p) + if err != nil { + continue + } + + for _, pAddr := range pAddrs { + for _, podAddr := range podIfAddrs { + if pAddr.Equal(podAddr.IP) { + klog.Infof("Will not release IP address: %s for %s. 
Detected another pod"+ + " using this IP: %s/%s", pAddr.String(), podDesc, p.Namespace, p.Name) + return false, nil + } + } + } + } + klog.Infof("Releasing IPs for Completed pod: %s/%s, ips: %s", pod.Namespace, pod.Name, + util.JoinIPNetIPs(podIfAddrs, " ")) + return true, nil + }); err != nil { + return nil, fmt.Errorf("cannot determine if IPs are safe to release for completed pod: %s: %w", podDesc, err) + } + } + + var allOps, ops []ovsdb.Operation + + // if the ip is in use by another pod we should not try to remove it from the address set + if shouldRelease { + if ops, err = bnc.deletePodFromNamespace(pod.Namespace, + podIfAddrs, portUUID); err != nil { + return nil, fmt.Errorf("unable to delete pod %s from namespace: %w", podDesc, err) + } + allOps = append(allOps, ops...) + } + ops, err = bnc.delLSPOps(logicalPort, switchName, portUUID) + // Tolerate cases where logical switch of the logical port no longer exist in OVN. + if err != nil && !errors.Is(err, libovsdbclient.ErrNotFound) { + return nil, fmt.Errorf("failed to create delete ops for the lsp: %s: %s", logicalPort, err) + } + allOps = append(allOps, ops...) + + recordOps, txOkCallBack, _, err := metrics.GetConfigDurationRecorder().AddOVN(bnc.nbClient, "pod", pod.Namespace, + pod.Name) + if err != nil { + klog.Errorf("Failed to record config duration: %v", err) + } + allOps = append(allOps, recordOps...) 
+ + _, err = libovsdbops.TransactAndCheck(bnc.nbClient, allOps) + if err != nil { + return nil, fmt.Errorf("cannot delete logical switch port %s, %v", logicalPort, err) + } + txOkCallBack() + + // do not remove SNATs/GW routes/IPAM for an IP address unless we have validated no other pod is using it + if !shouldRelease { + return nil, nil + } + + pInfo := lpInfo{ + name: logicalPort, + uuid: portUUID, + logicalSwitch: switchName, + ips: podIfAddrs, + } + return &pInfo, nil +} + +func (bnc *BaseNetworkController) releasePodIPs(pInfo *lpInfo) error { + if err := bnc.lsManager.ReleaseIPs(pInfo.logicalSwitch, pInfo.ips); err != nil { + if !errors.Is(err, logicalswitchmanager.SwitchNotFound) { + return fmt.Errorf("cannot release IPs of port %s on switch %s: %w", pInfo.name, pInfo.logicalSwitch, err) + } + klog.Warningf("Ignoring release IPs failure of port %s on switch %s: %v", pInfo.name, pInfo.logicalSwitch, err) + } + return nil +} + +func (bnc *BaseNetworkController) waitForNodeLogicalSwitch(switchName string) (*nbdb.LogicalSwitch, error) { + // Wait for the node logical switch to be created by the ClusterController and be present + // in libovsdb's cache. The node switch will be created when the node's logical network infrastructure + // is created by the node watch + ls := &nbdb.LogicalSwitch{Name: switchName} + if err := wait.PollImmediate(30*time.Millisecond, 30*time.Second, func() (bool, error) { + if lsUUID, ok := bnc.lsManager.GetUUID(switchName); !ok { + return false, fmt.Errorf("error getting logical switch %s: %s", switchName, "switch not in logical switch cache") + } else { + ls.UUID = lsUUID + return true, nil + } + }); err != nil { + return nil, fmt.Errorf("timed out waiting for logical switch in logical switch cache %q subnet: %v", switchName, err) + } + return ls, nil +} + +func (bnc *BaseNetworkController) waitForNodeLogicalSwitchInCache(switchName string) error { + // Wait for the node logical switch to be created by the ClusterController. 
+ // The node switch will be created when the node's logical network infrastructure + // is created by the node watch. + var subnets []*net.IPNet + if err := wait.PollImmediate(30*time.Millisecond, 30*time.Second, func() (bool, error) { + subnets = bnc.lsManager.GetSwitchSubnets(switchName) + return subnets != nil, nil + }); err != nil { + return fmt.Errorf("timed out waiting for logical switch %q subnet: %v", switchName, err) + } + return nil +} + +func (bnc *BaseNetworkController) addRoutesGatewayIP(pod *kapi.Pod, podAnnotation *util.PodAnnotation, + nodeSubnets []*net.IPNet) error { + // if there are other network attachments for the pod, then check if those network-attachment's + // annotation has default-route key. If present, then we need to skip adding default route for + // OVN interface + networks, err := util.GetK8sPodAllNetworks(pod) + if err != nil { + return fmt.Errorf("error while getting network attachment definition for [%s/%s]: %v", + pod.Namespace, pod.Name, err) + } + otherDefaultRouteV4 := false + otherDefaultRouteV6 := false + for _, network := range networks { + for _, gatewayRequest := range network.GatewayRequest { + if utilnet.IsIPv6(gatewayRequest) { + otherDefaultRouteV6 = true + } else { + otherDefaultRouteV4 = true + } + } + } + + for _, podIfAddr := range podAnnotation.IPs { + isIPv6 := utilnet.IsIPv6CIDR(podIfAddr) + nodeSubnet, err := util.MatchIPNetFamily(isIPv6, nodeSubnets) + if err != nil { + return err + } + + gatewayIPnet := util.GetNodeGatewayIfAddr(nodeSubnet) + + otherDefaultRoute := otherDefaultRouteV4 + if isIPv6 { + otherDefaultRoute = otherDefaultRouteV6 + } + var gatewayIP net.IP + if otherDefaultRoute { + for _, clusterSubnet := range config.Default.ClusterSubnets { + if isIPv6 == utilnet.IsIPv6CIDR(clusterSubnet.CIDR) { + podAnnotation.Routes = append(podAnnotation.Routes, util.PodRoute{ + Dest: clusterSubnet.CIDR, + NextHop: gatewayIPnet.IP, + }) + } + } + for _, serviceSubnet := range config.Kubernetes.ServiceCIDRs 
{ + if isIPv6 == utilnet.IsIPv6CIDR(serviceSubnet) { + podAnnotation.Routes = append(podAnnotation.Routes, util.PodRoute{ + Dest: serviceSubnet, + NextHop: gatewayIPnet.IP, + }) + } + } + } else { + gatewayIP = gatewayIPnet.IP + } + + if gatewayIP != nil { + podAnnotation.Gateways = append(podAnnotation.Gateways, gatewayIP) + } + } + return nil +} + +// podExpectedInLogicalCache returns true if pod should be added to oc.logicalPortCache. +// For some pods, like hostNetwork pods, overlay node pods, or completed pods waiting for them to be added +// to oc.logicalPortCache will never succeed. +func (bnc *BaseNetworkController) podExpectedInLogicalCache(pod *kapi.Pod) bool { + switchName := pod.Spec.NodeName + return util.PodWantsNetwork(pod) && !bnc.lsManager.IsNonHostSubnetSwitch(switchName) && !util.PodCompleted(pod) +} + +func (bnc *BaseNetworkController) addLogicalPortToNetwork(pod *kapi.Pod, + network *networkattachmentdefinitionapi.NetworkSelectionElement) (ops []ovsdb.Operation, + lsp *nbdb.LogicalSwitchPort, podAnnotation *util.PodAnnotation, newlyCreatedPort bool, err error) { + var ls *nbdb.LogicalSwitch + + podDesc := fmt.Sprintf("%s/%s", pod.Namespace, pod.Name) + switchName := pod.Spec.NodeName + + // it is possible to try to add a pod here that has no node. For example if a pod was deleted with + // a finalizer, and then the node was removed. In this case the pod will still exist in a running state. 
+ // Terminating pods should still have network connectivity for pre-stop hooks or termination grace period + // We cannot wire a pod that has no node/switch, so retry again later + if _, err := bnc.watchFactory.GetNode(pod.Spec.NodeName); kerrors.IsNotFound(err) && + bnc.lsManager.GetSwitchSubnets(switchName) == nil { + podState := "unknown" + if util.PodTerminating(pod) { + podState = "terminating" + } + return nil, nil, nil, false, fmt.Errorf("[%s/%s] Non-existent node: %s in API for pod with %s state", + pod.Namespace, pod.Name, pod.Spec.NodeName, podState) + } + + ls, err = bnc.waitForNodeLogicalSwitch(switchName) + if err != nil { + return nil, nil, nil, false, err + } + + portName := util.GetLogicalPortName(pod.Namespace, pod.Name) + klog.Infof("[%s] creating logical port %s for pod on switch %s", podDesc, portName, switchName) + + var podMac net.HardwareAddr + var podIfAddrs []*net.IPNet + var addresses []string + var releaseIPs bool + lspExist := false + needsIP := true + + // Check if the pod's logical switch port already exists. If it + // does don't re-add the port to OVN as this will change its + // UUID and the port cache, address sets, and port groups + // will still have the old UUID. + lsp = &nbdb.LogicalSwitchPort{Name: portName} + existingLSP, err := libovsdbops.GetLogicalSwitchPort(bnc.nbClient, lsp) + if err != nil && err != libovsdbclient.ErrNotFound { + return nil, nil, nil, false, fmt.Errorf("unable to get the lsp %s from the nbdb: %s", portName, err) + } + lspExist = err != libovsdbclient.ErrNotFound + + // Sanity check. If port exists, it should be in the logical switch obtained from the pod spec. 
+ if lspExist { + portFound := false + ls, err = libovsdbops.GetLogicalSwitch(bnc.nbClient, ls) + if err != nil { + return nil, nil, nil, false, fmt.Errorf("[%s] unable to find logical switch %s in NBDB", + podDesc, switchName) + } + for _, currPortUUID := range ls.Ports { + if currPortUUID == existingLSP.UUID { + portFound = true + break + } + } + if !portFound { + // This should never happen and indicates we failed to clean up an LSP for a pod that was recreated + return nil, nil, nil, false, fmt.Errorf("[%s] failed to locate existing logical port %s (%s) in logical switch %s", + podDesc, existingLSP.Name, existingLSP.UUID, switchName) + } + } + + lsp.Options = make(map[string]string) + // Unique identifier to distinguish interfaces for recreated pods, also set by ovnkube-node + // ovn-controller will claim the OVS interface only if external_ids:iface-id + // matches with the Port_Binding.logical_port and external_ids:iface-id-ver matches + // with the Port_Binding.options:iface-id-ver. This is not mandatory. + // If Port_binding.options:iface-id-ver is not set, then OVS + // Interface.external_ids:iface-id-ver if set is ignored. + // Don't set iface-id-ver for already existing LSP if it wasn't set before, + // because the corresponding OVS port may not have it set + // (then ovn-controller won't bind the interface). + // May happen on upgrade, because ovnkube-node doesn't update + // existing OVS interfaces with new iface-id-ver option. + if !lspExist || len(existingLSP.Options["iface-id-ver"]) != 0 { + lsp.Options["iface-id-ver"] = string(pod.UID) + } + // Bind the port to the node's chassis; prevents ping-ponging between + // chassis if ovnkube-node isn't running correctly and hasn't cleared + // out iface-id for an old instance of this pod, and the pod got + // rescheduled. 
+ lsp.Options["requested-chassis"] = pod.Spec.NodeName + + podAnnotation, err = util.UnmarshalPodAnnotation(pod.Annotations, ovntypes.DefaultNetworkName) + + // the IPs we allocate in this function need to be released back to the + // IPAM pool if there is some error in any step of addLogicalPort past + // the point the IPs were assigned via the IPAM manager. + // this needs to be done only when releaseIPs is set to true (the case where + // we truly have assigned podIPs in this call) AND when there is no error in + // the rest of the functionality of addLogicalPort. It is important to use a + // named return variable for defer to work correctly. + + defer func() { + if releaseIPs && err != nil { + if relErr := bnc.lsManager.ReleaseIPs(switchName, podIfAddrs); relErr != nil { + klog.Errorf("Error when releasing IPs %s for switch: %s, err: %q", + util.JoinIPNetIPs(podIfAddrs, " "), switchName, relErr) + } else { + klog.Infof("Released IPs: %s for node: %s", util.JoinIPNetIPs(podIfAddrs, " "), switchName) + } + } + }() + + if err == nil { + podMac = podAnnotation.MAC + podIfAddrs = podAnnotation.IPs + + // If the pod already has annotations use the existing static + // IP/MAC from the annotation. 
+ lsp.DynamicAddresses = nil + + // ensure we have reserved the IPs in the annotation + if err = bnc.lsManager.AllocateIPs(switchName, podIfAddrs); err != nil && err != ipallocator.ErrAllocated { + return nil, nil, nil, false, fmt.Errorf("unable to ensure IPs allocated for already annotated pod: %s, IPs: %s, error: %v", + podDesc, util.JoinIPNetIPs(podIfAddrs, " "), err) + } else { + needsIP = false + } + } + + if needsIP { + if existingLSP != nil { + // try to get the MAC and IPs from existing OVN port first + podMac, podIfAddrs, err = bnc.getPortAddresses(switchName, existingLSP) + if err != nil { + return nil, nil, nil, false, fmt.Errorf("failed to get pod addresses for pod %s on node: %s, err: %v", + podDesc, switchName, err) + } + } + needsNewAllocation := false + + // ensure we have reserved the IPs found in OVN + if len(podIfAddrs) == 0 { + needsNewAllocation = true + } else if err = bnc.lsManager.AllocateIPs(switchName, podIfAddrs); err != nil && err != ipallocator.ErrAllocated { + klog.Warningf("Unable to allocate IPs %s found on existing OVN port: %s, for pod %s on switch: %s"+ + " error: %v", util.JoinIPNetIPs(podIfAddrs, " "), portName, podDesc, switchName, err) + + needsNewAllocation = true + } + if needsNewAllocation { + // Previous attempts to use already configured IPs failed, need to assign new + podMac, podIfAddrs, err = bnc.assignPodAddresses(switchName) + if err != nil { + return nil, nil, nil, false, fmt.Errorf("failed to assign pod addresses for pod %s on switch: %s, err: %v", + podDesc, switchName, err) + } + } + + releaseIPs = true + // handle error cases separately first to ensure binding to err, otherwise the + // defer will fail + if network != nil && network.MacRequest != "" { + klog.V(5).Infof("Pod %s requested custom MAC: %s", podDesc, network.MacRequest) + podMac, err = net.ParseMAC(network.MacRequest) + if err != nil { + return nil, nil, nil, false, fmt.Errorf("failed to parse mac %s requested in annotation for pod %s: Error %v", + 
network.MacRequest, podDesc, err) + } + } + podAnnotation = &util.PodAnnotation{ + IPs: podIfAddrs, + MAC: podMac, + } + var nodeSubnets []*net.IPNet + if nodeSubnets = bnc.lsManager.GetSwitchSubnets(switchName); nodeSubnets == nil { + return nil, nil, nil, false, fmt.Errorf("cannot retrieve subnet for assigning gateway routes for pod %s, switch: %s", + podDesc, switchName) + } + err = bnc.addRoutesGatewayIP(pod, podAnnotation, nodeSubnets) + if err != nil { + return nil, nil, nil, false, err + } + + klog.V(5).Infof("Annotation values: ip=%v ; mac=%s ; gw=%s", + podIfAddrs, podMac, podAnnotation.Gateways) + annoStart := time.Now() + err = bnc.updatePodAnnotationWithRetry(pod, podAnnotation, ovntypes.DefaultNetworkName) + podAnnoTime := time.Since(annoStart) + klog.Infof("[%s] addLogicalPort annotation time took %v", podDesc, podAnnoTime) + if err != nil { + return nil, nil, nil, false, err + } + releaseIPs = false + } + + // set addresses on the port + // LSP addresses in OVN are a single space-separated value + addresses = []string{podMac.String()} + for _, podIfAddr := range podIfAddrs { + addresses[0] = addresses[0] + " " + podIfAddr.IP.String() + } + + lsp.Addresses = addresses + + // add external ids + lsp.ExternalIDs = map[string]string{"namespace": pod.Namespace, "pod": "true"} + + // CNI depends on the flows from port security, delay setting it until end + lsp.PortSecurity = addresses + + ops, err = libovsdbops.CreateOrUpdateLogicalSwitchPortsOnSwitchOps(bnc.nbClient, nil, ls, lsp) + if err != nil { + return nil, nil, nil, false, + fmt.Errorf("error creating logical switch port %+v on switch %+v: %+v", *lsp, *ls, err) + } + + return ops, lsp, podAnnotation, needsIP && !lspExist, nil +} + +func (bnc *BaseNetworkController) updatePodAnnotationWithRetry(origPod *kapi.Pod, podInfo *util.PodAnnotation, nadName string) error { + resultErr := retry.RetryOnConflict(util.OvnConflictBackoff, func() error { + // Informer cache should not be mutated, so get a copy of 
the object + pod, err := bnc.watchFactory.GetPod(origPod.Namespace, origPod.Name) + if err != nil { + return err + } + + cpod := pod.DeepCopy() + cpod.Annotations, err = util.MarshalPodAnnotation(cpod.Annotations, podInfo, nadName) + if err != nil { + return err + } + return bnc.kube.UpdatePod(cpod) + }) + if resultErr != nil { + return fmt.Errorf("failed to update annotation on pod %s/%s: %v", origPod.Namespace, origPod.Name, resultErr) + } + return nil +} + +// Given a switch, gets the next set of addresses (from the IPAM) for each of the node's +// subnets to assign to the new pod +func (bnc *BaseNetworkController) assignPodAddresses(switchName string) (net.HardwareAddr, []*net.IPNet, error) { + var ( + podMAC net.HardwareAddr + podCIDRs []*net.IPNet + err error + ) + podCIDRs, err = bnc.lsManager.AllocateNextIPs(switchName) + if err != nil { + return nil, nil, err + } + if len(podCIDRs) > 0 { + podMAC = util.IPAddrToHWAddr(podCIDRs[0].IP) + } + return podMAC, podCIDRs, nil +} + +// Given a logical switch port and the switch on which it is scheduled, get all +// addresses currently assigned to it including subnet masks. 
+func (bnc *BaseNetworkController) getPortAddresses(switchName string, existingLSP *nbdb.LogicalSwitchPort) (net.HardwareAddr, []*net.IPNet, error) { + podMac, podIPs, err := util.ExtractPortAddresses(existingLSP) + if err != nil { + return nil, nil, err + } else if podMac == nil || len(podIPs) == 0 { + return nil, nil, nil + } + + var podIPNets []*net.IPNet + + nodeSubnets := bnc.lsManager.GetSwitchSubnets(switchName) + + for _, ip := range podIPs { + for _, subnet := range nodeSubnets { + if subnet.Contains(ip) { + podIPNets = append(podIPNets, + &net.IPNet{ + IP: ip, + Mask: subnet.Mask, + }) + break + } + } + } + return podMac, podIPNets, nil +} + +// delLSPOps returns the ovsdb operations required to delete the given logical switch port (LSP) +func (bnc *BaseNetworkController) delLSPOps(logicalPort, switchName, + lspUUID string) ([]ovsdb.Operation, error) { + lsUUID, _ := bnc.lsManager.GetUUID(switchName) + lsw := nbdb.LogicalSwitch{ + UUID: lsUUID, + Name: switchName, + } + lsp := nbdb.LogicalSwitchPort{ + UUID: lspUUID, + Name: logicalPort, + } + ops, err := libovsdbops.DeleteLogicalSwitchPortsOps(bnc.nbClient, nil, &lsw, &lsp) + if err != nil { + return nil, fmt.Errorf("error deleting logical switch port %+v from switch %+v: %w", lsp, lsw, err) + } + + return ops, nil +} + +func (bnc *BaseNetworkController) deletePodFromNamespace(ns string, podIfAddrs []*net.IPNet, portUUID string) ([]ovsdb.Operation, error) { + // for secondary network, namespace may be not managed + nsInfo, nsUnlock := bnc.getNamespaceLocked(ns, true) + if nsInfo == nil { + return nil, nil + } + defer nsUnlock() + var ops []ovsdb.Operation + var err error + if nsInfo.addressSet != nil { + if ops, err = nsInfo.addressSet.DeleteIPsReturnOps(createIPAddressSlice(podIfAddrs)); err != nil { + return nil, err + } + } + + // Remove the port from the multicast allow policy. 
+ if bnc.multicastSupport && nsInfo.multicastEnabled && len(portUUID) > 0 { + if err = podDeleteAllowMulticastPolicy(bnc.nbClient, ns, portUUID); err != nil { + return nil, err + } + } + + return ops, nil +} + +func (bnc *BaseNetworkController) getPortInfo(pod *kapi.Pod) *lpInfo { + var portInfo *lpInfo + key := util.GetLogicalPortName(pod.Namespace, pod.Name) + portInfo, _ = bnc.logicalPortCache.get(key) + return portInfo +} + +// WatchPods starts the watching of the Pod resource and calls back the appropriate handler logic +func (bnc *BaseNetworkController) WatchPods() error { + if bnc.podHandler != nil { + return nil + } + + handler, err := bnc.retryPods.WatchResource() + if err == nil { + bnc.podHandler = handler + } + return err +} diff --git a/go-controller/pkg/ovn/default_network_controller.go b/go-controller/pkg/ovn/default_network_controller.go index c8918c36f3..67f3c77753 100644 --- a/go-controller/pkg/ovn/default_network_controller.go +++ b/go-controller/pkg/ovn/default_network_controller.go @@ -42,9 +42,8 @@ import ( type DefaultNetworkController struct { BaseNetworkController - // wg and stopChan per-Controller - wg *sync.WaitGroup - stopChan chan struct{} + // waitGroup per-Controller + wg *sync.WaitGroup // FIXME DUAL-STACK - Make IP Allocators more dual-stack friendly masterSubnetAllocator *subnetallocator.HostSubnetAllocator @@ -54,19 +53,6 @@ type DefaultNetworkController struct { // cluster's east-west traffic. loadbalancerClusterCache map[kapi.Protocol]string - // A cache of all logical switches seen by the watcher and their subnets - lsManager *lsm.LogicalSwitchManager - - // A cache of all logical ports known to the controller - logicalPortCache *portCache - - // Info about known namespaces. You must use oc.getNamespaceLocked() or - // oc.waitForNamespaceLocked() to read this map, and oc.createNamespaceLocked() - // or oc.deleteNamespaceLocked() to modify it. namespacesMutex is only held - // from inside those functions. 
- namespaces map[string]*namespaceInfo - namespacesMutex sync.Mutex - externalGWCache map[ktypes.NamespacedName]*externalRouteInfo exGWCacheMutex sync.RWMutex @@ -87,9 +73,6 @@ type DefaultNetworkController struct { egressQoSNodeSynced cache.InformerSynced egressQoSNodeQueue workqueue.RateLimitingInterface - // An address set factory that creates address sets - addressSetFactory addressset.AddressSetFactory - // network policies map, key should be retrieved with getPolicyKey(policy *knet.NetworkPolicy). // network policies that failed to be created will also be added here, and can be retried or cleaned up later. // network policy is only deleted from this map after successful cleanup. @@ -104,9 +87,6 @@ type DefaultNetworkController struct { // make sure to keep this order to avoid deadlocks sharedNetpolPortGroups *syncmap.SyncMap[*defaultDenyPortGroups] - // Supports multicast? - multicastSupport bool - // Cluster wide Load_Balancer_Group UUID. loadBalancerGroupUUID string @@ -130,9 +110,6 @@ type DefaultNetworkController struct { joinSwIPManager *lsm.JoinSwitchIPManager - // retry framework for pods - retryPods *retry.RetryFramework - // retry framework for network policies retryNetworkPolicies *retry.RetryFramework @@ -150,8 +127,6 @@ type DefaultNetworkController struct { // EgressIP Node-specific syncMap used by egressip node event handler addEgressNodeFailed sync.Map - // retry framework for nodes - retryNodes *retry.RetryFramework // Node-specific syncMaps used by node event handler gatewaysFailed sync.Map mgmtPortFailed sync.Map @@ -172,38 +147,40 @@ type DefaultNetworkController struct { // NewDefaultNetworkController creates a new OVN controller for creating logical network // infrastructure and policy for default l3 network -func NewDefaultNetworkController(bnc *BaseNetworkController) *DefaultNetworkController { +func NewDefaultNetworkController(cnci *CommonNetworkControllerInfo) *DefaultNetworkController { stopChan := make(chan struct{}) wg := 
&sync.WaitGroup{} - return newDefaultNetworkControllerCommon(bnc, stopChan, wg, nil) + return newDefaultNetworkControllerCommon(cnci, stopChan, wg, nil) } -func newDefaultNetworkControllerCommon(bnc *BaseNetworkController, +func newDefaultNetworkControllerCommon(cnci *CommonNetworkControllerInfo, defaultStopChan chan struct{}, defaultWg *sync.WaitGroup, addressSetFactory addressset.AddressSetFactory) *DefaultNetworkController { if addressSetFactory == nil { - addressSetFactory = addressset.NewOvnAddressSetFactory(bnc.nbClient) + addressSetFactory = addressset.NewOvnAddressSetFactory(cnci.nbClient) } - svcController, svcFactory := newServiceController(bnc.client, bnc.nbClient, bnc.recorder) - egressSvcController := newEgressServiceController(bnc.client, bnc.nbClient, svcFactory, defaultStopChan) + svcController, svcFactory := newServiceController(cnci.client, cnci.nbClient, cnci.recorder) + egressSvcController := newEgressServiceController(cnci.client, cnci.nbClient, svcFactory, defaultStopChan) var hybridOverlaySubnetAllocator *subnetallocator.HostSubnetAllocator if config.HybridOverlay.Enabled { hybridOverlaySubnetAllocator = subnetallocator.NewHostSubnetAllocator() } oc := &DefaultNetworkController{ - BaseNetworkController: *bnc, - stopChan: defaultStopChan, + BaseNetworkController: BaseNetworkController{ + CommonNetworkControllerInfo: *cnci, + lsManager: lsm.NewLogicalSwitchManager(), + logicalPortCache: newPortCache(defaultStopChan), + namespaces: make(map[string]*namespaceInfo), + namespacesMutex: sync.Mutex{}, + addressSetFactory: addressSetFactory, + stopChan: defaultStopChan, + }, wg: defaultWg, masterSubnetAllocator: subnetallocator.NewHostSubnetAllocator(), hybridOverlaySubnetAllocator: hybridOverlaySubnetAllocator, - lsManager: lsm.NewLogicalSwitchManager(), - logicalPortCache: newPortCache(defaultStopChan), - namespaces: make(map[string]*namespaceInfo), - namespacesMutex: sync.Mutex{}, externalGWCache: make(map[ktypes.NamespacedName]*externalRouteInfo), 
exGWCacheMutex: sync.RWMutex{}, - addressSetFactory: addressSetFactory, networkPolicies: syncmap.NewSyncMap[*networkPolicy](), sharedNetpolPortGroups: syncmap.NewSyncMap[*defaultDenyPortGroups](), eIPC: egressIPController{ @@ -213,14 +190,13 @@ func newDefaultNetworkControllerCommon(bnc *BaseNetworkController, pendingCloudPrivateIPConfigsMutex: &sync.Mutex{}, pendingCloudPrivateIPConfigsOps: make(map[string]map[string]*cloudPrivateIPConfigOp), allocator: allocator{&sync.Mutex{}, make(map[string]*egressNode)}, - nbClient: bnc.nbClient, - watchFactory: bnc.watchFactory, + nbClient: cnci.nbClient, + watchFactory: cnci.watchFactory, egressIPTotalTimeout: config.OVNKubernetesFeature.EgressIPReachabiltyTotalTimeout, reachabilityCheckInterval: egressIPReachabilityCheckInterval, egressIPNodeHealthCheckPort: config.OVNKubernetesFeature.EgressIPNodeHealthCheckPort, }, loadbalancerClusterCache: make(map[kapi.Protocol]string), - multicastSupport: config.EnableMulticast, loadBalancerGroupUUID: "", aclLoggingEnabled: true, joinSwIPManager: nil, @@ -303,7 +279,7 @@ func (oc *DefaultNetworkController) Stop() { oc.wg.Wait() } -// Init runs a subnet IPAM and the network controller that watches arrival/departure +// Init runs a subnet IPAM and a controller that watches arrival/departure // of nodes in the cluster // On an addition to the cluster (node create), a new subnet is created for it that will translate // to creation of a logical switch (done by the node, but could be created here at the master process too) @@ -337,14 +313,6 @@ func (oc *DefaultNetworkController) Init() error { } } - if oc.multicastSupport { - if _, _, err := util.RunOVNSbctl("--columns=_uuid", "list", "IGMP_Group"); err != nil { - klog.Warningf("Multicast support enabled, however version of OVN in use does not support IGMP Group. 
" + - "Disabling Multicast Support") - oc.multicastSupport = false - } - } - err = oc.createACLLoggingMeter() if err != nil { klog.Warningf("ACL logging support enabled, however acl-logging meter could not be created: %v. "+ @@ -634,7 +602,7 @@ func (h *defaultNetworkControllerEventHandler) IsResourceScheduled(obj interface // AddResource adds the specified object to the cluster according to its type and returns the error, // if any, yielded during object creation. -// Given an object to add and a boolean specifying if the function was executed from iterateRetryResources, +// Given an object to add and a boolean specifying if the function was executed from iterateRetryResources func (h *defaultNetworkControllerEventHandler) AddResource(obj interface{}, fromRetryLoop bool) error { var err error diff --git a/go-controller/pkg/ovn/master.go b/go-controller/pkg/ovn/master.go index e521abbd98..7583d6a33c 100644 --- a/go-controller/pkg/ovn/master.go +++ b/go-controller/pkg/ovn/master.go @@ -1,7 +1,6 @@ package ovn import ( - "context" "fmt" "net" "strings" @@ -9,7 +8,6 @@ import ( kapi "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/fields" kerrors "k8s.io/apimachinery/pkg/util/errors" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/sets" @@ -27,7 +25,6 @@ import ( hotypes "github.com/ovn-org/ovn-kubernetes/go-controller/hybrid-overlay/pkg/types" houtil "github.com/ovn-org/ovn-kubernetes/go-controller/hybrid-overlay/pkg/util" - ovnlb "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/loadbalancer" lsm "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/logical_switch_manager" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" @@ -117,33 +114,11 @@ func (oc *DefaultNetworkController) upgradeOVNTopology(existingNodes *kapi.NodeL // SetupMaster creates the central router and load-balancers for the network func (oc 
*DefaultNetworkController) SetupMaster(existingNodeNames []string) error { // Create default Control Plane Protection (COPP) entry for routers - var err error - oc.defaultCOPPUUID, err = EnsureDefaultCOPP(oc.nbClient) - if err != nil { - return fmt.Errorf("unable to create router control plane protection: %w", err) - } - - // Create a single common distributed router for the cluster. - logicalRouter := nbdb.LogicalRouter{ - Name: types.OVNClusterRouter, - ExternalIDs: map[string]string{ - "k8s-cluster-router": "yes", - }, - Options: map[string]string{ - "always_learn_from_arp_request": "false", - }, - Copp: &oc.defaultCOPPUUID, - } - if oc.multicastSupport { - logicalRouter.Options = map[string]string{ - "mcast_relay": "true", - } - } - - err = libovsdbops.CreateOrUpdateLogicalRouter(oc.nbClient, &logicalRouter) + logicalRouter, err := oc.createOvnClusterRouter() if err != nil { - return fmt.Errorf("failed to create a single common distributed router for the cluster, error: %v", err) + return err } + oc.defaultCOPPUUID = *(logicalRouter.Copp) // Create a cluster-wide port group that all logical switch ports are part of pg := libovsdbops.BuildPortGroup(types.ClusterPortGroupName, types.ClusterPortGroupName, nil, nil) @@ -220,10 +195,10 @@ func (oc *DefaultNetworkController) SetupMaster(existingNodeNames []string) erro Networks: gwLRPNetworks, } - err = libovsdbops.CreateOrUpdateLogicalRouterPorts(oc.nbClient, &logicalRouter, + err = libovsdbops.CreateOrUpdateLogicalRouterPorts(oc.nbClient, logicalRouter, []*nbdb.LogicalRouterPort{&logicalRouterPort}, &logicalRouterPort.MAC, &logicalRouterPort.Networks) if err != nil { - return fmt.Errorf("failed to add logical router port %+v on router %+v: %v", logicalRouterPort, logicalRouter, err) + return fmt.Errorf("failed to add logical router port %+v on router %s: %v", logicalRouterPort, logicalRouter.Name, err) } // Create OVNJoinSwitch that will be used to connect gateway routers to the @@ -356,157 +331,13 @@ func (oc 
*DefaultNetworkController) syncGatewayLogicalNetwork(node *kapi.Node, l return err } -// syncNodeClusterRouterPort ensures a node's LS to the cluster router's LRP is created. -// NOTE: We could have created the router port in ensureNodeLogicalNetwork() instead of here, -// but chassis ID is not available at that moment. We need the chassis ID to set the -// gateway-chassis, which in effect pins the logical switch to the current node in OVN. -// Otherwise, ovn-controller will flood-fill unrelated datapaths unnecessarily, causing scale -// problems. -func (oc *DefaultNetworkController) syncNodeClusterRouterPort(node *kapi.Node, hostSubnets []*net.IPNet) error { - chassisID, err := util.ParseNodeChassisIDAnnotation(node) - if err != nil { - return err - } - - if hostSubnets == nil { - hostSubnets, err = util.ParseNodeHostSubnetAnnotation(node, types.DefaultNetworkName) - if err != nil { - return err - } - } - - // logical router port MAC is based on IPv4 subnet if there is one, else IPv6 - var nodeLRPMAC net.HardwareAddr - for _, hostSubnet := range hostSubnets { - gwIfAddr := util.GetNodeGatewayIfAddr(hostSubnet) - nodeLRPMAC = util.IPAddrToHWAddr(gwIfAddr.IP) - if !utilnet.IsIPv6CIDR(hostSubnet) { - break - } - } - - lrpName := types.RouterToSwitchPrefix + node.Name - lrpNetworks := []string{} - for _, hostSubnet := range hostSubnets { - gwIfAddr := util.GetNodeGatewayIfAddr(hostSubnet) - lrpNetworks = append(lrpNetworks, gwIfAddr.String()) - } - logicalRouterPort := nbdb.LogicalRouterPort{ - Name: lrpName, - MAC: nodeLRPMAC.String(), - Networks: lrpNetworks, - } - logicalRouter := nbdb.LogicalRouter{Name: types.OVNClusterRouter} - - err = libovsdbops.CreateOrUpdateLogicalRouterPorts(oc.nbClient, &logicalRouter, - []*nbdb.LogicalRouterPort{&logicalRouterPort}, &logicalRouterPort.MAC, &logicalRouterPort.Networks) - if err != nil { - klog.Errorf("Failed to add logical router port %+v to router %s: %v", logicalRouterPort, types.OVNClusterRouter, err) - return err - } - 
- gatewayChassisName := lrpName + "-" + chassisID - gatewayChassis := nbdb.GatewayChassis{ - Name: gatewayChassisName, - ChassisName: chassisID, - Priority: 1, - } - - err = libovsdbops.CreateOrUpdateGatewayChassis(oc.nbClient, &logicalRouterPort, &gatewayChassis, - &gatewayChassis.Name, &gatewayChassis.ChassisName, &gatewayChassis.Priority) - if err != nil { - klog.Errorf("Failed to add gateway chassis %s to logical router port %s, error: %v", chassisID, lrpName, err) - return err - } - - return nil -} - func (oc *DefaultNetworkController) ensureNodeLogicalNetwork(node *kapi.Node, hostSubnets []*net.IPNet) error { - // logical router port MAC is based on IPv4 subnet if there is one, else IPv6 - var nodeLRPMAC net.HardwareAddr - switchName := node.Name - for _, hostSubnet := range hostSubnets { - gwIfAddr := util.GetNodeGatewayIfAddr(hostSubnet) - nodeLRPMAC = util.IPAddrToHWAddr(gwIfAddr.IP) - if !utilnet.IsIPv6CIDR(hostSubnet) { - break - } - } - - logicalSwitch := nbdb.LogicalSwitch{ - Name: switchName, - } - - var v4Gateway, v6Gateway net.IP var hostNetworkPolicyIPs []net.IP - logicalRouterPortNetwork := []string{} - logicalSwitch.OtherConfig = map[string]string{} + + switchName := node.Name for _, hostSubnet := range hostSubnets { - gwIfAddr := util.GetNodeGatewayIfAddr(hostSubnet) mgmtIfAddr := util.GetNodeManagementIfAddr(hostSubnet) - logicalRouterPortNetwork = append(logicalRouterPortNetwork, gwIfAddr.String()) hostNetworkPolicyIPs = append(hostNetworkPolicyIPs, mgmtIfAddr.IP) - - if utilnet.IsIPv6CIDR(hostSubnet) { - v6Gateway = gwIfAddr.IP - - logicalSwitch.OtherConfig["ipv6_prefix"] = - hostSubnet.IP.String() - } else { - v4Gateway = gwIfAddr.IP - excludeIPs := mgmtIfAddr.IP.String() - if config.HybridOverlay.Enabled { - hybridOverlayIfAddr := util.GetNodeHybridOverlayIfAddr(hostSubnet) - excludeIPs += ".." 
+ hybridOverlayIfAddr.IP.String() - } - logicalSwitch.OtherConfig["subnet"] = hostSubnet.String() - logicalSwitch.OtherConfig["exclude_ips"] = excludeIPs - } - } - - if oc.loadBalancerGroupUUID != "" { - logicalSwitch.LoadBalancerGroup = []string{oc.loadBalancerGroupUUID} - } - - logicalRouterPortName := types.RouterToSwitchPrefix + switchName - logicalRouterPort := nbdb.LogicalRouterPort{ - Name: logicalRouterPortName, - MAC: nodeLRPMAC.String(), - Networks: logicalRouterPortNetwork, - } - logicalRouter := nbdb.LogicalRouter{Name: types.OVNClusterRouter} - - err := libovsdbops.CreateOrUpdateLogicalRouterPorts(oc.nbClient, &logicalRouter, - []*nbdb.LogicalRouterPort{&logicalRouterPort}, &logicalRouterPort.Networks, &logicalRouterPort.MAC) - if err != nil { - return fmt.Errorf("failed to add logical router port %+v to router %s: %v", logicalRouterPort, types.OVNClusterRouter, err) - } - - // If supported, enable IGMP/MLD snooping and querier on the node. - if oc.multicastSupport { - logicalSwitch.OtherConfig["mcast_snoop"] = "true" - - // Configure IGMP/MLD querier if the gateway IP address is known. - // Otherwise disable it. 
- if v4Gateway != nil || v6Gateway != nil { - logicalSwitch.OtherConfig["mcast_querier"] = "true" - logicalSwitch.OtherConfig["mcast_eth_src"] = nodeLRPMAC.String() - if v4Gateway != nil { - logicalSwitch.OtherConfig["mcast_ip4_src"] = v4Gateway.String() - } - if v6Gateway != nil { - logicalSwitch.OtherConfig["mcast_ip6_src"] = util.HWAddrToIPv6LLA(nodeLRPMAC).String() - } - } else { - logicalSwitch.OtherConfig["mcast_querier"] = "false" - } - } - - err = libovsdbops.CreateOrUpdateLogicalSwitch(oc.nbClient, &logicalSwitch, &logicalSwitch.OtherConfig, - &logicalSwitch.LoadBalancerGroup) - if err != nil { - return fmt.Errorf("failed to add logical switch %+v: %v", logicalSwitch, err) } // also add the join switch IPs for this node - needed in shared gateway mode @@ -537,34 +368,13 @@ func (oc *DefaultNetworkController) ensureNodeLogicalNetwork(node *kapi.Node, ho return err } - // Connect the switch to the router. - logicalSwitchPort := nbdb.LogicalSwitchPort{ - Name: types.SwitchToRouterPrefix + switchName, - Type: "router", - Addresses: []string{"router"}, - Options: map[string]string{"router-port": types.RouterToSwitchPrefix + switchName}, - } - sw := nbdb.LogicalSwitch{Name: switchName} - err = libovsdbops.CreateOrUpdateLogicalSwitchPortsOnSwitch(oc.nbClient, &sw, &logicalSwitchPort) - if err != nil { - klog.Errorf("Failed to add logical port %+v to switch %s: %v", logicalSwitchPort, switchName, err) - return err - } - - err = libovsdbops.AddPortsToPortGroup(oc.nbClient, types.ClusterRtrPortGroupName, logicalSwitchPort.UUID) - if err != nil { - klog.Errorf(err.Error()) - return err - } - - // Add the switch to the logical switch cache - return oc.lsManager.AddSwitch(switchName, logicalSwitch.UUID, hostSubnets) + return oc.createNodeLogicalSwitch(node.Name, hostSubnets, oc.loadBalancerGroupUUID) } -func (oc *DefaultNetworkController) updateNodeAnnotationWithRetry(nodeName string, hostSubnets []*net.IPNet) error { - gwLRPIPs, err := 
oc.joinSwIPManager.EnsureJoinLRPIPs(nodeName) +func (oc *DefaultNetworkController) addNode(node *kapi.Node) ([]*net.IPNet, error) { + gwLRPIPs, err := oc.joinSwIPManager.EnsureJoinLRPIPs(node.Name) if err != nil { - return fmt.Errorf("failed to allocate join switch port IP address for node %s: %v", nodeName, err) + return nil, fmt.Errorf("failed to allocate join switch port IP address for node %s: %v", node.Name, err) } var v4Addr, v6Addr *net.IPNet for _, ip := range gwLRPIPs { @@ -574,60 +384,19 @@ func (oc *DefaultNetworkController) updateNodeAnnotationWithRetry(nodeName strin v6Addr = ip } } - - // Retry if it fails because of potential conflict which is transient. Return error in the - // case of other errors (say temporary API server down), and it will be taken care of by the - // retry mechanism. - resultErr := retry.RetryOnConflict(retry.DefaultBackoff, func() error { - // Informer cache should not be mutated, so get a copy of the object - node, err := oc.watchFactory.GetNode(nodeName) - if err != nil { - return err - } - - cnode := node.DeepCopy() - cnode.Annotations, err = util.CreateNodeGateRouterLRPAddrAnnotation(cnode.Annotations, v4Addr, v6Addr) - if err != nil { - return fmt.Errorf("failed to marshal node %q annotation for Gateway LRP IP %v", - node.Name, gwLRPIPs) - } - cnode.Annotations, err = util.UpdateNodeHostSubnetAnnotation(cnode.Annotations, hostSubnets, types.DefaultNetworkName) - if err != nil { - return fmt.Errorf("failed to update node %q annotation subnet %s", - node.Name, util.JoinIPNets(hostSubnets, ",")) - } - return oc.kube.UpdateNode(cnode) - }) - if resultErr != nil { - return fmt.Errorf("failed to update node %s annotation", nodeName) - } - return nil -} - -func (oc *DefaultNetworkController) addNode(node *kapi.Node) ([]*net.IPNet, error) { - existingSubnets, err := util.ParseNodeHostSubnetAnnotation(node, types.DefaultNetworkName) - if err != nil && !util.IsAnnotationNotSetError(err) { - // Log the error and try to allocate new 
subnets - klog.Infof("Failed to get node %s host subnets annotations: %v", node.Name, err) + updatedNodeAnnotation, err := util.CreateNodeGatewayRouterLRPAddrAnnotation(nil, v4Addr, v6Addr) + if err != nil { + return nil, fmt.Errorf("failed to marshal node %q annotation for Gateway LRP IP %v", + node.Name, gwLRPIPs) } - hostSubnets, allocatedSubnets, err := oc.masterSubnetAllocator.AllocateNodeSubnets(node.Name, existingSubnets, config.IPv4Mode, config.IPv6Mode) + hostSubnets, err := oc.allocateNodeSubnets(node, oc.masterSubnetAllocator) if err != nil { return nil, err } - // Release the allocation on error - defer func() { - if err != nil { - if errR := oc.masterSubnetAllocator.ReleaseNodeSubnets(node.Name, allocatedSubnets...); errR != nil { - klog.Warningf("Error releasing node %s subnets: %v", node.Name, errR) - } - } - }() - // Set the HostSubnet annotation on the node object to signal - // to nodes that their logical infrastructure is set up and they can - // proceed with their initialization - err = oc.updateNodeAnnotationWithRetry(node.Name, hostSubnets) + hostSubnetsMap := map[string][]*net.IPNet{types.DefaultNetworkName: hostSubnets} + err = oc.UpdateNodeAnnotationWithRetry(node.Name, hostSubnetsMap, updatedNodeAnnotation) if err != nil { return nil, err } @@ -690,33 +459,6 @@ func (oc *DefaultNetworkController) deleteStaleNodeChassis(node *kapi.Node) erro return nil } -// deleteNodeLogicalNetwork removes the logical switch and logical router port associated with the node -func (oc *DefaultNetworkController) deleteNodeLogicalNetwork(nodeName string) error { - // Remove switch to lb associations from the LBCache before removing the switch - lbCache, err := ovnlb.GetLBCache(oc.nbClient) - if err != nil { - return fmt.Errorf("failed to get load_balancer cache for node %s: %v", nodeName, err) - } - lbCache.RemoveSwitch(nodeName) - - // Remove the logical switch associated with the node - err = libovsdbops.DeleteLogicalSwitch(oc.nbClient, nodeName) - if err != 
nil { - return fmt.Errorf("failed to delete logical switch %s: %v", nodeName, err) - } - - logiccalRouter := nbdb.LogicalRouter{Name: types.OVNClusterRouter} - logicalRouterPort := nbdb.LogicalRouterPort{ - Name: types.RouterToSwitchPrefix + nodeName, - } - err = libovsdbops.DeleteLogicalRouterPorts(oc.nbClient, &logiccalRouter, &logicalRouterPort) - if err != nil { - return fmt.Errorf("failed to delete router port %s: %v", logicalRouterPort.Name, err) - } - - return nil -} - func (oc *DefaultNetworkController) deleteNode(nodeName string) error { oc.masterSubnetAllocator.ReleaseAllNodeSubnets(nodeName) @@ -843,7 +585,7 @@ func (oc *DefaultNetworkController) syncNodes(nodes []interface{}) error { if !ok { return fmt.Errorf("spurious object in syncNodes: %v", tmp) } - hostSubnets, _ := util.ParseNodeHostSubnetAnnotation(node, types.DefaultNetworkName) + hostSubnets := oc.updateNodesManageHostSubnets(node, oc.masterSubnetAllocator, foundNodes) if config.HybridOverlay.Enabled && len(hostSubnets) == 0 && houtil.IsHybridOverlayNode(node) { // this is a hybrid overlay node so mark as allocated from the hybrid overlay subnet allocator hostSubnet, err := houtil.ParseHybridOverlayHostSubnet(node) @@ -858,13 +600,6 @@ func (oc *DefaultNetworkController) syncNodes(nodes []interface{}) error { // there is nothing left to be done if this is a hybrid overlay node continue } - - foundNodes.Insert(node.Name) - klog.V(5).Infof("Node %s contains subnets: %v", node.Name, hostSubnets) - if err := oc.masterSubnetAllocator.MarkSubnetsAllocated(node.Name, hostSubnets...); err != nil { - utilruntime.HandleError(err) - } - // For each existing node, reserve its joinSwitch LRP IPs if they already exist. if _, err := oc.joinSwIPManager.EnsureJoinLRPIPs(node.Name); err != nil { // TODO (flaviof): keep going even if EnsureJoinLRPIPs returned an error. Maybe we should not. 
@@ -1038,31 +773,8 @@ func (oc *DefaultNetworkController) addUpdateNodeEvent(node *kapi.Node, nSyncs * // if per pod SNAT is being used, then l3 gateway config is required to be able to add pods if _, gwFailed := oc.gatewaysFailed.Load(node.Name); !gwFailed || !config.Gateway.DisableSNATMultipleGWs { if nSyncs.syncNode || nSyncs.syncGw { // do this only if it is a new node add or a gateway sync happened - options := metav1.ListOptions{ - FieldSelector: fields.OneTermEqualSelector("spec.nodeName", node.Name).String(), - ResourceVersion: "0", - } - pods, err := oc.client.CoreV1().Pods(metav1.NamespaceAll).List(context.TODO(), options) - if err != nil { - errs = append(errs, err) - klog.Errorf("Unable to list existing pods on node: %s, existing pods on this node may not function", - node.Name) - } else { - klog.V(5).Infof("When adding node %s, found %d pods to add to retryPods", node.Name, len(pods.Items)) - for _, pod := range pods.Items { - pod := pod - if util.PodCompleted(&pod) { - continue - } - klog.V(5).Infof("Adding pod %s/%s to retryPods", pod.Namespace, pod.Name) - err = oc.retryPods.AddRetryObjWithAddNoBackoff(&pod) - if err != nil { - errs = append(errs, err) - klog.Errorf("Failed to add pod %s/%s to retryPods: %v", pod.Namespace, pod.Name, err) - } - } - } - oc.retryPods.RequestRetryObjs() + errors := oc.addAllPodsOnNode(node.Name) + errs = append(errs, errors...) 
} } diff --git a/go-controller/pkg/ovn/master_test.go b/go-controller/pkg/ovn/master_test.go index 32417e8166..808e218f13 100644 --- a/go-controller/pkg/ovn/master_test.go +++ b/go-controller/pkg/ovn/master_test.go @@ -317,7 +317,6 @@ func addNodeLogicalFlows(testData []libovsdbtest.TestData, expectedOVNClusterRou Addresses: []string{"router"}, }) expectedNodeSwitch.Ports = append(expectedNodeSwitch.Ports, types.SwitchToRouterPrefix+node.Name+"-UUID") - expectedClusterRouterPortGroup.Ports = []string{types.SwitchToRouterPrefix + node.Name + "-UUID"} testData = append(testData, &nbdb.LogicalSwitchPort{ Name: types.K8sPrefix + node.Name, diff --git a/go-controller/pkg/ovn/namespace.go b/go-controller/pkg/ovn/namespace.go index b9dc627c63..dd9552b2f1 100644 --- a/go-controller/pkg/ovn/namespace.go +++ b/go-controller/pkg/ovn/namespace.go @@ -140,30 +140,6 @@ func (oc *DefaultNetworkController) addPodToNamespace(ns string, ips []*net.IPNe return oc.getRoutingExternalGWs(nsInfo), oc.getRoutingPodGWs(nsInfo), ops, nil } -func (oc *DefaultNetworkController) deletePodFromNamespace(ns string, podIfAddrs []*net.IPNet, portUUID string) ([]ovsdb.Operation, error) { - nsInfo, nsUnlock := oc.getNamespaceLocked(ns, true) - if nsInfo == nil { - return nil, nil - } - defer nsUnlock() - var ops []ovsdb.Operation - var err error - if nsInfo.addressSet != nil { - if ops, err = nsInfo.addressSet.DeleteIPsReturnOps(createIPAddressSlice(podIfAddrs)); err != nil { - return nil, err - } - } - - // Remove the port from the multicast allow policy. 
- if oc.multicastSupport && nsInfo.multicastEnabled && len(portUUID) > 0 { - if err = podDeleteAllowMulticastPolicy(oc.nbClient, ns, portUUID); err != nil { - return nil, err - } - } - - return ops, nil -} - func createIPAddressSlice(ips []*net.IPNet) []net.IP { ipAddrs := make([]net.IP, 0) for _, ip := range ips { @@ -408,38 +384,6 @@ func (oc *DefaultNetworkController) deleteNamespace(ns *kapi.Namespace) error { return nil } -// getNamespaceLocked locks namespacesMutex, looks up ns, and (if found), returns it with -// its mutex locked. If ns is not known, nil will be returned -func (oc *DefaultNetworkController) getNamespaceLocked(ns string, readOnly bool) (*namespaceInfo, func()) { - // Only hold namespacesMutex while reading/modifying oc.namespaces. In particular, - // we drop namespacesMutex while trying to claim nsInfo.Mutex, because something - // else might have locked the nsInfo and be doing something slow with it, and we - // don't want to block all access to oc.namespaces while that's happening. - oc.namespacesMutex.Lock() - nsInfo := oc.namespaces[ns] - oc.namespacesMutex.Unlock() - - if nsInfo == nil { - return nil, nil - } - var unlockFunc func() - if readOnly { - unlockFunc = func() { nsInfo.RUnlock() } - nsInfo.RLock() - } else { - unlockFunc = func() { nsInfo.Unlock() } - nsInfo.Lock() - } - // Check that the namespace wasn't deleted while we were waiting for the lock - oc.namespacesMutex.Lock() - defer oc.namespacesMutex.Unlock() - if nsInfo != oc.namespaces[ns] { - unlockFunc() - return nil, nil - } - return nsInfo, unlockFunc -} - // ensureNamespaceLocked locks namespacesMutex, gets/creates an entry for ns, configures OVN nsInfo, and returns it // with its mutex locked. 
// ns is the name of the namespace, while namespace is the optional k8s namespace object @@ -515,63 +459,6 @@ func (oc *DefaultNetworkController) ensureNamespaceLocked(ns string, readOnly bo return nsInfo, unlockFunc, nil } -// deleteNamespaceLocked locks namespacesMutex, finds and deletes ns, and returns the -// namespace, locked. -func (oc *DefaultNetworkController) deleteNamespaceLocked(ns string) *namespaceInfo { - // The locking here is the same as in getNamespaceLocked - - oc.namespacesMutex.Lock() - nsInfo := oc.namespaces[ns] - oc.namespacesMutex.Unlock() - - if nsInfo == nil { - return nil - } - nsInfo.Lock() - - oc.namespacesMutex.Lock() - defer oc.namespacesMutex.Unlock() - if nsInfo != oc.namespaces[ns] { - nsInfo.Unlock() - return nil - } - if nsInfo.addressSet != nil { - // Empty the address set, then delete it after an interval. - if err := nsInfo.addressSet.SetIPs(nil); err != nil { - klog.Errorf("Warning: failed to empty address set for deleted NS %s: %v", ns, err) - } - - // Delete the address set after a short delay. - // This is so NetworkPolicy handlers can converge and stop referencing it. - addressSet := nsInfo.addressSet - go func() { - select { - case <-oc.stopChan: - return - case <-time.After(20 * time.Second): - // Check to see if the NS was re-added in the meanwhile. If so, - // only delete if the new NS's AddressSet shouldn't exist. 
- nsInfo, nsUnlock := oc.getNamespaceLocked(ns, true) - if nsInfo != nil { - defer nsUnlock() - if nsInfo.addressSet != nil { - klog.V(5).Infof("Skipping deferred deletion of AddressSet for NS %s: re-created", ns) - return - } - } - - klog.V(5).Infof("Finishing deferred deletion of AddressSet for NS %s", ns) - if err := addressSet.Destroy(); err != nil { - klog.Errorf("Failed to delete AddressSet for NS %s: %v", ns, err.Error()) - } - } - }() - } - delete(oc.namespaces, ns) - - return nsInfo -} - func (oc *DefaultNetworkController) createNamespaceAddrSetAllPods(ns string) (addressset.AddressSet, error) { var ips []net.IP // special handling of host network namespace diff --git a/go-controller/pkg/ovn/ovn.go b/go-controller/pkg/ovn/ovn.go index d66dfb19d4..3da7f79ac7 100644 --- a/go-controller/pkg/ovn/ovn.go +++ b/go-controller/pkg/ovn/ovn.go @@ -9,15 +9,10 @@ import ( "sync" "time" - ref "k8s.io/client-go/tools/reference" - nettypes "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/apis/k8s.cni.cncf.io/v1" libovsdbclient "github.com/ovn-org/libovsdb/client" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config" - "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/factory" - "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/kube" - "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/metrics" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb" egresssvc "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/controller/egress_services" svccontroller "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/controller/services" @@ -35,6 +30,7 @@ import ( clientset "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/record" + ref "k8s.io/client-go/tools/reference" "k8s.io/klog/v2" ) @@ -49,26 +45,6 @@ type ACLLoggingLevels struct { Deny string `json:"deny,omitempty"` } -// BaseNetworkController structure is place holder for all fields shared among controllers. 
-type BaseNetworkController struct { - client clientset.Interface - kube kube.Interface - watchFactory *factory.WatchFactory - podRecorder *metrics.PodRecorder - - // event recorder used to post events to k8s - recorder record.EventRecorder - - // libovsdb northbound client interface - nbClient libovsdbclient.Client - - // libovsdb southbound client interface - sbClient libovsdbclient.Client - - // has SCTP support - SCTPSupport bool -} - const ( // TCP is the constant string for the string "TCP" TCP = "TCP" @@ -85,22 +61,6 @@ func getPodNamespacedName(pod *kapi.Pod) string { return util.GetLogicalPortName(pod.Namespace, pod.Name) } -// NewBaseNetworkController creates BaseNetworkController shared by controllers -func NewBaseNetworkController(client clientset.Interface, kube kube.Interface, wf *factory.WatchFactory, - recorder record.EventRecorder, nbClient libovsdbclient.Client, - sbClient libovsdbclient.Client, podRecorder *metrics.PodRecorder, SCTPSupport bool) *BaseNetworkController { - return &BaseNetworkController{ - client: client, - kube: kube, - watchFactory: wf, - recorder: recorder, - nbClient: nbClient, - sbClient: sbClient, - podRecorder: podRecorder, - SCTPSupport: SCTPSupport, - } -} - // syncPeriodic adds a goroutine that periodically does some work // right now there is only one ticker registered // for syncNodesPeriodic which deletes chassis records from the sbdb @@ -134,7 +94,7 @@ func (oc *DefaultNetworkController) getPortInfo(pod *kapi.Pod) *lpInfo { mac: mac, } } else { - portInfo, _ = oc.logicalPortCache.get(key) + portInfo = oc.BaseNetworkController.getPortInfo(pod) } return portInfo } @@ -210,12 +170,6 @@ func (oc *DefaultNetworkController) removePod(pod *kapi.Pod, portInfo *lpInfo) e return nil } -// WatchPods starts the watching of the Pod resource and calls back the appropriate handler logic -func (oc *DefaultNetworkController) WatchPods() error { - _, err := oc.retryPods.WatchResource() - return err -} - // WatchNetworkPolicy starts the 
watching of the network policy resource and calls // back the appropriate handler logic func (oc *DefaultNetworkController) WatchNetworkPolicy() error { @@ -305,13 +259,6 @@ func (oc *DefaultNetworkController) syncNodeGateway(node *kapi.Node, hostSubnets return nil } -// WatchNodes starts the watching of node resource and calls -// back the appropriate handler logic -func (oc *DefaultNetworkController) WatchNodes() error { - _, err := oc.retryNodes.WatchResource() - return err -} - // aclLoggingUpdateNsInfo parses the provided annotation values and sets nsInfo.aclLogging.Deny and // nsInfo.aclLogging.Allow. If errors are encountered parsing the annotation, disable logging completely. If either // value contains invalid input, disable logging for the respective key. This is needed to ensure idempotency. diff --git a/go-controller/pkg/ovn/ovn_test.go b/go-controller/pkg/ovn/ovn_test.go index 8387a2a074..cfee2030f2 100644 --- a/go-controller/pkg/ovn/ovn_test.go +++ b/go-controller/pkg/ovn/ovn_test.go @@ -152,7 +152,7 @@ func NewOvnController(ovnClient *util.OVNClientset, wf *factory.WatchFactory, st } podRecorder := metrics.NewPodRecorder() - bnc := NewBaseNetworkController( + cnci := NewCommonNetworkControllerInfo( ovnClient.KubeClient, &kube.Kube{ KClient: ovnClient.KubeClient, @@ -166,7 +166,8 @@ func NewOvnController(ovnClient *util.OVNClientset, wf *factory.WatchFactory, st libovsdbOvnSBClient, &podRecorder, false, + false, ) - return newDefaultNetworkControllerCommon(bnc, stopChan, wg, addressSetFactory) + return newDefaultNetworkControllerCommon(cnci, stopChan, wg, addressSetFactory) } diff --git a/go-controller/pkg/ovn/pods.go b/go-controller/pkg/ovn/pods.go index 8e6efc16d7..75758263ec 100644 --- a/go-controller/pkg/ovn/pods.go +++ b/go-controller/pkg/ovn/pods.go @@ -6,25 +6,16 @@ import ( "sync/atomic" "time" + "github.com/ovn-org/libovsdb/ovsdb" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config" + 
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdbops" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/metrics" - "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/ipallocator" - logicalswitchmanager "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/logical_switch_manager" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb" ovntypes "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" - util "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" - "github.com/pkg/errors" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" kapi "k8s.io/api/core/v1" - kerrors "k8s.io/apimachinery/pkg/api/errors" ktypes "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/util/wait" - "k8s.io/client-go/util/retry" "k8s.io/klog/v2" - utilnet "k8s.io/utils/net" - - libovsdbclient "github.com/ovn-org/libovsdb/client" - "github.com/ovn-org/libovsdb/ovsdb" - "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdbops" - "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb" ) func (oc *DefaultNetworkController) syncPods(pods []interface{}) error { @@ -38,130 +29,43 @@ func (oc *DefaultNetworkController) syncPods(pods []interface{}) error { if !ok { return fmt.Errorf("spurious object in syncPods: %v", podInterface) } - switchName := pod.Spec.NodeName annotations, err := util.UnmarshalPodAnnotation(pod.Annotations, ovntypes.DefaultNetworkName) - if util.PodScheduled(pod) && util.PodWantsNetwork(pod) && !util.PodCompleted(pod) && err == nil { - // skip nodes that are not running ovnk (inferred from host subnets) - if oc.lsManager.IsNonHostSubnetSwitch(switchName) { - continue - } - logicalPort := util.GetLogicalPortName(pod.Namespace, pod.Name) - expectedLogicalPorts[logicalPort] = true - // it is possible to try to add a pod here that has no node. For example if a pod was deleted with - // a finalizer, and then the node was removed. In this case the pod will still exist in a running state. 
- // Terminating pods should still have network connectivity for pre-stop hooks or termination grace period - if _, err := oc.watchFactory.GetNode(pod.Spec.NodeName); kerrors.IsNotFound(err) && - oc.lsManager.GetSwitchSubnets(switchName) == nil { - if util.PodTerminating(pod) { - klog.Infof("Ignoring IP allocation for terminating pod: %s/%s, on deleted "+ - "node: %s", pod.Namespace, pod.Name, pod.Spec.NodeName) - continue - } else { - // unknown condition how we are getting a non-terminating pod without a node here - klog.Errorf("Pod IP allocation found for a non-existent node in API with unknown "+ - "condition. Pod: %s/%s, node: %s", pod.Namespace, pod.Name, pod.Spec.NodeName) - } - } - if err = oc.waitForNodeLogicalSwitchInCache(switchName); err != nil { - return fmt.Errorf("failed to wait for switch %s to be added to cache. IP allocation may fail", - switchName) - } - if err = oc.lsManager.AllocateIPs(switchName, annotations.IPs); err != nil { - if err == ipallocator.ErrAllocated { - // already allocated: log an error but not stop syncPod from continuing - klog.Errorf("Already allocated IPs: %s for pod: %s on switchName: %s", - util.JoinIPNetIPs(annotations.IPs, " "), logicalPort, - switchName) - } else { - return fmt.Errorf("couldn't allocate IPs: %s for pod: %s on switch: %s"+ - " error: %v", util.JoinIPNetIPs(annotations.IPs, " "), logicalPort, - switchName, err) - } - } + if err != nil { + continue + } + expectedLogicalPortName, err := oc.allocatePodIPs(pod, annotations) + if err != nil { + return err } + if expectedLogicalPortName != "" { + expectedLogicalPorts[expectedLogicalPortName] = true + } + // delete the outdated hybrid overlay subnet route if it exists - if annotations != nil { - newRoutes := []util.PodRoute{} - for _, subnet := range oc.lsManager.GetSwitchSubnets(switchName) { - hybridOverlayIFAddr := util.GetNodeHybridOverlayIfAddr(subnet).IP - for _, route := range annotations.Routes { - if !route.NextHop.Equal(hybridOverlayIFAddr) { - 
newRoutes = append(newRoutes, route) - } + newRoutes := []util.PodRoute{} + switchName := pod.Spec.NodeName + for _, subnet := range oc.lsManager.GetSwitchSubnets(switchName) { + hybridOverlayIFAddr := util.GetNodeHybridOverlayIfAddr(subnet).IP + for _, route := range annotations.Routes { + if !route.NextHop.Equal(hybridOverlayIFAddr) { + newRoutes = append(newRoutes, route) } } - // checking the length because cannot compare the slices directly and if routes are removed - // the length will be different - if len(annotations.Routes) != len(newRoutes) { - annotations.Routes = newRoutes - err = oc.updatePodAnnotationWithRetry(pod, annotations) - if err != nil { - return fmt.Errorf("failed to set annotation on pod %s: %v", pod.Name, err) - } + } + // checking the length because cannot compare the slices directly and if routes are removed + // the length will be different + if len(annotations.Routes) != len(newRoutes) { + annotations.Routes = newRoutes + err = oc.updatePodAnnotationWithRetry(pod, annotations, ovntypes.DefaultNetworkName) + if err != nil { + return fmt.Errorf("failed to set annotation on pod %s: %v", pod.Name, err) } } } // all pods present before ovn-kube startup have been processed atomic.StoreUint32(&oc.allInitialPodsProcessed, 1) - // get all the nodes from the watchFactory - nodes, err := oc.watchFactory.GetNodes() - if err != nil { - return fmt.Errorf("failed to get nodes: %v", err) - } - - var ops []ovsdb.Operation - for _, n := range nodes { - // skip nodes that are not running ovnk (inferred from host subnets) - switchName := n.Name - if oc.lsManager.IsNonHostSubnetSwitch(switchName) { - continue - } - p := func(item *nbdb.LogicalSwitchPort) bool { - return item.ExternalIDs["pod"] == "true" && !expectedLogicalPorts[item.Name] - } - sw := nbdb.LogicalSwitch{ - Name: switchName, - } - sw.UUID, _ = oc.lsManager.GetUUID(switchName) - - ops, err = libovsdbops.DeleteLogicalSwitchPortsWithPredicateOps(oc.nbClient, ops, &sw, p) - if err != nil { - 
return fmt.Errorf("could not generate ops to delete stale ports from logical switch %s (%+v)", n.Name, err) - } - } - - _, err = libovsdbops.TransactAndCheck(oc.nbClient, ops) - if err != nil { - return fmt.Errorf("could not remove stale logicalPorts from switches (%+v)", err) - } - return nil -} - -// lookupPortUUIDAndSwitchName will use libovsdb to locate the logical switch port uuid as well as the logical switch -// that owns such port (aka nodeName), based on the logical port name. -func (oc *DefaultNetworkController) lookupPortUUIDAndSwitchName(logicalPort string) (portUUID string, switchName string, err error) { - lsp := &nbdb.LogicalSwitchPort{Name: logicalPort} - lsp, err = libovsdbops.GetLogicalSwitchPort(oc.nbClient, lsp) - if err != nil { - return "", "", err - } - p := func(item *nbdb.LogicalSwitch) bool { - for _, currPortUUID := range item.Ports { - if currPortUUID == lsp.UUID { - return true - } - } - return false - } - nodeSwitches, err := libovsdbops.FindLogicalSwitchesWithPredicate(oc.nbClient, p) - if err != nil { - return "", "", fmt.Errorf("failed to get node logical switch for logical port %s (%s): %w", logicalPort, lsp.UUID, err) - } - if len(nodeSwitches) != 1 { - return "", "", fmt.Errorf("found %d node logical switch for logical port %s (%s)", len(nodeSwitches), logicalPort, lsp.UUID) - } - return lsp.UUID, nodeSwitches[0].Name, nil + return oc.deleteStaleLogicalSwitchPorts(expectedLogicalPorts) } func (oc *DefaultNetworkController) deleteLogicalPort(pod *kapi.Pod, portInfo *lpInfo) (err error) { @@ -178,128 +82,23 @@ func (oc *DefaultNetworkController) deleteLogicalPort(pod *kapi.Pod, portInfo *l return nil } - logicalPort := util.GetLogicalPortName(pod.Namespace, pod.Name) - var portUUID string - var switchName string - var podIfAddrs []*net.IPNet - if portInfo == nil { - // If ovnkube-master restarts, it is also possible the Pod's logical switch port - // is not re-added into the cache. Delete logical switch port anyway. 
- annotation, err := util.UnmarshalPodAnnotation(pod.Annotations, ovntypes.DefaultNetworkName) - if err != nil { - if util.IsAnnotationNotSetError(err) { - // if the annotation doesn’t exist, that’s not an error. It means logical port does not need to be deleted. - klog.V(5).Infof("No annotations on pod %s/%s, no need to delete its logical port: %s", pod.Namespace, pod.Name, logicalPort) - return nil - } - return fmt.Errorf("unable to unmarshal pod annotations for pod %s/%s: %w", pod.Namespace, pod.Name, err) - } - - // Since portInfo is not available, use ovn to locate the logical switch (named after the node name) for the logical port. - portUUID, switchName, err = oc.lookupPortUUIDAndSwitchName(logicalPort) - if err != nil { - if err != libovsdbclient.ErrNotFound { - return fmt.Errorf("unable to locate portUUID+switchName for pod %s/%s: %w", pod.Namespace, pod.Name, err) - } - // The logical port no longer exists in OVN. The caller expects this function to be idem-potent, - // so the proper action to take is to use an empty uuid and extract the node name from the pod spec. - portUUID = "" - switchName = pod.Spec.NodeName - } - podIfAddrs = annotation.IPs - - klog.Warningf("No cached port info for deleting pod: %s. Using logical switch %s port uuid %s and addrs %v", - podDesc, switchName, portUUID, podIfAddrs) - } else { - portUUID = portInfo.uuid - switchName = portInfo.logicalSwitch // ls <==> nodeName - podIfAddrs = portInfo.ips - } - - // Sanity check. The nodeName from pod spec is expected to be the same as the logical switch obtained from the port. - if switchName != pod.Spec.NodeName { - klog.Errorf("Deleting pod %s has an unexpected switch name in spec: %s, ovn expects it to be %s for port uuid %s", - podDesc, pod.Spec.NodeName, switchName, portUUID) - } - - shouldRelease := true - // check to make sure no other pods are using this IP before we try to release it if this is a completed pod. 
- if util.PodCompleted(pod) { - if shouldRelease, err = oc.lsManager.ConditionalIPRelease(switchName, podIfAddrs, func() (bool, error) { - pods, err := oc.watchFactory.GetAllPods() - if err != nil { - return false, fmt.Errorf("unable to get pods to determine if completed pod IP is in use by another pod. "+ - "Will not release pod %s/%s IP: %#v from allocator", pod.Namespace, pod.Name, podIfAddrs) - } - // iterate through all pods, ignore pods on other switches - for _, p := range pods { - if util.PodCompleted(p) || !util.PodWantsNetwork(p) || !util.PodScheduled(p) || p.Spec.NodeName != switchName { - continue - } - // check if the pod addresses match in the OVN annotation - pAddrs, err := util.GetAllPodIPs(p) - if err != nil { - continue - } - - for _, pAddr := range pAddrs { - for _, podAddr := range podIfAddrs { - if pAddr.Equal(podAddr.IP) { - klog.Infof("Will not release IP address: %s for pod %s/%s. Detected another pod"+ - " using this IP: %s/%s", pAddr.String(), pod.Namespace, pod.Name, p.Namespace, p.Name) - return false, nil - } - } - } - } - klog.Infof("Releasing IPs for Completed pod: %s/%s, ips: %s", pod.Namespace, pod.Name, - util.JoinIPNetIPs(podIfAddrs, " ")) - return true, nil - }); err != nil { - return fmt.Errorf("cannot determine if IPs are safe to release for completed pod: %s: %w", podDesc, err) - } - } - - var allOps, ops []ovsdb.Operation - - // if the ip is in use by another pod we should not try to remove it from the address set - if shouldRelease { - if ops, err = oc.deletePodFromNamespace(pod.Namespace, podIfAddrs, portUUID); err != nil { - return fmt.Errorf("unable to delete pod %s from namespace: %w", podDesc, err) - } - allOps = append(allOps, ops...) - } - ops, err = oc.delLSPOps(logicalPort, switchName, portUUID) - // Tolerate cases where logical switch of the logical port no longer exist in OVN. 
- if err != nil && !errors.Is(err, libovsdbclient.ErrNotFound) { - return fmt.Errorf("failed to create delete ops for the lsp: %s: %s", logicalPort, err) - } - allOps = append(allOps, ops...) - - recordOps, txOkCallBack, _, err := metrics.GetConfigDurationRecorder().AddOVN(oc.nbClient, "pod", pod.Namespace, - pod.Name) - if err != nil { - klog.Errorf("Failed to record config duration: %v", err) - } - allOps = append(allOps, recordOps...) - _, err = libovsdbops.TransactAndCheck(oc.nbClient, allOps) + pInfo, err := oc.deletePodLogicalPort(pod, portInfo) if err != nil { - return fmt.Errorf("cannot delete logical switch port %s, %v", logicalPort, err) + return err } - txOkCallBack() // do not remove SNATs/GW routes/IPAM for an IP address unless we have validated no other pod is using it - if !shouldRelease { + if pInfo == nil { return nil } if config.Gateway.DisableSNATMultipleGWs { - if err := deletePodSNAT(oc.nbClient, switchName, []*net.IPNet{}, podIfAddrs); err != nil { + if err := deletePodSNAT(oc.nbClient, pInfo.logicalSwitch, []*net.IPNet{}, pInfo.ips); err != nil { return fmt.Errorf("cannot delete GR SNAT for pod %s: %w", podDesc, err) } } podNsName := ktypes.NamespacedName{Namespace: pod.Namespace, Name: pod.Name} - if err := oc.deleteGWRoutesForPod(podNsName, podIfAddrs); err != nil { + if err := oc.deleteGWRoutesForPod(podNsName, pInfo.ips); err != nil { return fmt.Errorf("cannot delete GW Routes for pod %s: %w", podDesc, err) } @@ -309,118 +108,8 @@ func (oc *DefaultNetworkController) deleteLogicalPort(pod *kapi.Pod, portInfo *l // while it is now on another pod. Releasing IPs may fail at this point if cache knows nothing about it, // which is okay since node may have been deleted. 
klog.Infof("Attempting to release IPs for pod: %s/%s, ips: %s", pod.Namespace, pod.Name, - util.JoinIPNetIPs(podIfAddrs, " ")) - if err := oc.lsManager.ReleaseIPs(switchName, podIfAddrs); err != nil { - if !errors.Is(err, logicalswitchmanager.SwitchNotFound) { - return fmt.Errorf("cannot release IPs for pod %s on node %s: %w", podDesc, switchName, err) - } - klog.Warningf("Ignoring release IPs failure for pod %s on node %s: %w", podDesc, switchName, err) - } - - return nil -} - -func (oc *DefaultNetworkController) waitForNodeLogicalSwitch(switchName string) (*nbdb.LogicalSwitch, error) { - // Wait for the node logical switch to be created by the ClusterController and be present - // in libovsdb's cache. The node switch will be created when the node's logical network infrastructure - // is created by the node watch - ls := &nbdb.LogicalSwitch{Name: switchName} - if err := wait.PollImmediate(30*time.Millisecond, 30*time.Second, func() (bool, error) { - if lsUUID, ok := oc.lsManager.GetUUID(switchName); !ok { - return false, fmt.Errorf("error getting logical switch %s: %s", switchName, "switch not in logical switch cache") - } else { - ls.UUID = lsUUID - return true, nil - } - }); err != nil { - return nil, fmt.Errorf("timed out waiting for logical switch in logical switch cache %q subnet: %v", switchName, err) - } - return ls, nil -} - -func (oc *DefaultNetworkController) waitForNodeLogicalSwitchInCache(switchName string) error { - // Wait for the node logical switch to be created by the ClusterController. - // The node switch will be created when the node's logical network infrastructure - // is created by the node watch. 
- var subnets []*net.IPNet - if err := wait.PollImmediate(30*time.Millisecond, 30*time.Second, func() (bool, error) { - subnets = oc.lsManager.GetSwitchSubnets(switchName) - return subnets != nil, nil - }); err != nil { - return fmt.Errorf("timed out waiting for logical switch %q subnet: %v", switchName, err) - } - return nil -} - -func (oc *DefaultNetworkController) addRoutesGatewayIP(pod *kapi.Pod, podAnnotation *util.PodAnnotation, nodeSubnets []*net.IPNet) error { - // if there are other network attachments for the pod, then check if those network-attachment's - // annotation has default-route key. If present, then we need to skip adding default route for - // OVN interface - networks, err := util.GetK8sPodAllNetworks(pod) - if err != nil { - return fmt.Errorf("error while getting network attachment definition for [%s/%s]: %v", - pod.Namespace, pod.Name, err) - } - otherDefaultRouteV4 := false - otherDefaultRouteV6 := false - for _, network := range networks { - for _, gatewayRequest := range network.GatewayRequest { - if utilnet.IsIPv6(gatewayRequest) { - otherDefaultRouteV6 = true - } else { - otherDefaultRouteV4 = true - } - } - } - - for _, podIfAddr := range podAnnotation.IPs { - isIPv6 := utilnet.IsIPv6CIDR(podIfAddr) - nodeSubnet, err := util.MatchIPNetFamily(isIPv6, nodeSubnets) - if err != nil { - return err - } - - gatewayIPnet := util.GetNodeGatewayIfAddr(nodeSubnet) - - otherDefaultRoute := otherDefaultRouteV4 - if isIPv6 { - otherDefaultRoute = otherDefaultRouteV6 - } - var gatewayIP net.IP - if otherDefaultRoute { - for _, clusterSubnet := range config.Default.ClusterSubnets { - if isIPv6 == utilnet.IsIPv6CIDR(clusterSubnet.CIDR) { - podAnnotation.Routes = append(podAnnotation.Routes, util.PodRoute{ - Dest: clusterSubnet.CIDR, - NextHop: gatewayIPnet.IP, - }) - } - } - for _, serviceSubnet := range config.Kubernetes.ServiceCIDRs { - if isIPv6 == utilnet.IsIPv6CIDR(serviceSubnet) { - podAnnotation.Routes = append(podAnnotation.Routes, 
util.PodRoute{ - Dest: serviceSubnet, - NextHop: gatewayIPnet.IP, - }) - } - } - } else { - gatewayIP = gatewayIPnet.IP - } - - if gatewayIP != nil { - podAnnotation.Gateways = append(podAnnotation.Gateways, gatewayIP) - } - } - return nil -} - -// podExpectedInLogicalCache returns true if pod should be added to oc.logicalPortCache. -// For some pods, like hostNetwork pods, overlay node pods, or completed pods waiting for them to be added -// to oc.logicalPortCache will never succeed. -func (oc *DefaultNetworkController) podExpectedInLogicalCache(pod *kapi.Pod) bool { - switchName := pod.Spec.NodeName - return util.PodWantsNetwork(pod) && !oc.lsManager.IsNonHostSubnetSwitch(switchName) && !util.PodCompleted(pod) + util.JoinIPNetIPs(pInfo.ips, " ")) + return oc.releasePodIPs(pInfo) } func (oc *DefaultNetworkController) addLogicalPort(pod *kapi.Pod) (err error) { @@ -430,211 +119,34 @@ func (oc *DefaultNetworkController) addLogicalPort(pod *kapi.Pod) (err error) { return nil } + network, err := util.GetK8sPodDefaultNetwork(pod) + if err != nil { + return fmt.Errorf("error getting default-network's network-attachment: %v", err) + } + var libovsdbExecuteTime time.Duration - var podAnnoTime time.Duration + var lsp *nbdb.LogicalSwitchPort + var ops []ovsdb.Operation + var podAnnotation *util.PodAnnotation + var newlyCreatedPort bool // Keep track of how long syncs take. start := time.Now() defer func() { - klog.Infof("[%s/%s] addLogicalPort took %v, libovsdb time %v, annotation time: %v", - pod.Namespace, pod.Name, time.Since(start), libovsdbExecuteTime, podAnnoTime) + klog.Infof("[%s/%s] addLogicalPort took %v, libovsdb time %v: %v", + pod.Namespace, pod.Name, time.Since(start), libovsdbExecuteTime) }() - // it is possible to try to add a pod here that has no node. For example if a pod was deleted with - // a finalizer, and then the node was removed. In this case the pod will still exist in a running state. 
- // Terminating pods should still have network connectivity for pre-stop hooks or termination grace period - // We cannot wire a pod that has no node/switch, so retry again later - if _, err := oc.watchFactory.GetNode(pod.Spec.NodeName); kerrors.IsNotFound(err) && - oc.lsManager.GetSwitchSubnets(switchName) == nil { - podState := "unknown" - if util.PodTerminating(pod) { - podState = "terminating" - } - return fmt.Errorf("[%s/%s] Non-existent node: %s in API for pod with %s state", - pod.Namespace, pod.Name, pod.Spec.NodeName, podState) - } - - ls, err := oc.waitForNodeLogicalSwitch(switchName) + ops, lsp, podAnnotation, newlyCreatedPort, err = oc.addLogicalPortToNetwork(pod, network) if err != nil { return err } - portName := util.GetLogicalPortName(pod.Namespace, pod.Name) - klog.Infof("[%s/%s] creating logical port for pod on switch %s", pod.Namespace, pod.Name, switchName) - - var podMac net.HardwareAddr - var podIfAddrs []*net.IPNet - var addresses []string - var releaseIPs bool - lspExist := false - needsIP := true - - // Check if the pod's logical switch port already exists. If it - // does don't re-add the port to OVN as this will change its - // UUID and and the port cache, address sets, and port groups - // will still have the old UUID. - lsp := &nbdb.LogicalSwitchPort{Name: portName} - existingLSP, err := libovsdbops.GetLogicalSwitchPort(oc.nbClient, lsp) - if err != nil && err != libovsdbclient.ErrNotFound { - return fmt.Errorf("unable to get the lsp %s from the nbdb: %s", portName, err) - } - lspExist = err != libovsdbclient.ErrNotFound - - // Sanity check. If port exists, it should be in the logical switch obtained from the pod spec. 
- if lspExist { - portFound := false - ls, err = libovsdbops.GetLogicalSwitch(oc.nbClient, ls) - if err != nil { - return fmt.Errorf("[%s/%s] unable to find logical switch %s in NBDB", pod.Namespace, pod.Name, - switchName) - } - for _, currPortUUID := range ls.Ports { - if currPortUUID == existingLSP.UUID { - portFound = true - break - } - } - if !portFound { - // This should never happen and indicates we failed to clean up an LSP for a pod that was recreated - return fmt.Errorf("[%s/%s] failed to locate existing logical port %s (%s) in logical switch %s", - pod.Namespace, pod.Name, existingLSP.Name, existingLSP.UUID, switchName) - } - } - - lsp.Options = make(map[string]string) - // Unique identifier to distinguish interfaces for recreated pods, also set by ovnkube-node - // ovn-controller will claim the OVS interface only if external_ids:iface-id - // matches with the Port_Binding.logical_port and external_ids:iface-id-ver matches - // with the Port_Binding.options:iface-id-ver. This is not mandatory. - // If Port_binding.options:iface-id-ver is not set, then OVS - // Interface.external_ids:iface-id-ver if set is ignored. - // Don't set iface-id-ver for already existing LSP if it wasn't set before, - // because the corresponding OVS port may not have it set - // (then ovn-controller won't bind the interface). - // May happen on upgrade, because ovnkube-node doesn't update - // existing OVS interfaces with new iface-id-ver option. - if !lspExist || len(existingLSP.Options["iface-id-ver"]) != 0 { - lsp.Options["iface-id-ver"] = string(pod.UID) - } - // Bind the port to the node's chassis; prevents ping-ponging between - // chassis if ovnkube-node isn't running correctly and hasn't cleared - // out iface-id for an old instance of this pod, and the pod got - // rescheduled. 
- lsp.Options["requested-chassis"] = pod.Spec.NodeName - - annotation, err := util.UnmarshalPodAnnotation(pod.Annotations, ovntypes.DefaultNetworkName) - - // the IPs we allocate in this function need to be released back to the - // IPAM pool if there is some error in any step of addLogicalPort past - // the point the IPs were assigned via the IPAM manager. - // this needs to be done only when releaseIPs is set to true (the case where - // we truly have assigned podIPs in this call) AND when there is no error in - // the rest of the functionality of addLogicalPort. It is important to use a - // named return variable for defer to work correctly. - - defer func() { - if releaseIPs && err != nil { - if relErr := oc.lsManager.ReleaseIPs(switchName, podIfAddrs); relErr != nil { - klog.Errorf("Error when releasing IPs for switch: %s, err: %q", - switchName, relErr) - } else { - klog.Infof("Released IPs: %s for switch: %s", util.JoinIPNetIPs(podIfAddrs, " "), switchName) - } - } - }() - - if err == nil { - podMac = annotation.MAC - podIfAddrs = annotation.IPs - - // If the pod already has annotations use the existing static - // IP/MAC from the annotation. 
- lsp.DynamicAddresses = nil - - // ensure we have reserved the IPs in the annotation - if err = oc.lsManager.AllocateIPs(switchName, podIfAddrs); err != nil && err != ipallocator.ErrAllocated { - return fmt.Errorf("unable to ensure IPs allocated for already annotated pod: %s, IPs: %s, error: %v", - pod.Name, util.JoinIPNetIPs(podIfAddrs, " "), err) - } else { - needsIP = false - } - } - - if needsIP { - if existingLSP != nil { - // try to get the MAC and IPs from existing OVN port first - podMac, podIfAddrs, err = oc.getPortAddresses(switchName, existingLSP) - if err != nil { - return fmt.Errorf("failed to get pod addresses for pod %s on switch: %s, err: %v", - portName, switchName, err) - } - } - needsNewAllocation := false - - // ensure we have reserved the IPs found in OVN - if len(podIfAddrs) == 0 { - needsNewAllocation = true - } else if err = oc.lsManager.AllocateIPs(switchName, podIfAddrs); err != nil && err != ipallocator.ErrAllocated { - klog.Warningf("Unable to allocate IPs found on existing OVN port: %s, for pod %s on switch: %s"+ - " error: %v", util.JoinIPNetIPs(podIfAddrs, " "), portName, switchName, err) - - needsNewAllocation = true - } - if needsNewAllocation { - // Previous attempts to use already configured IPs failed, need to assign new - podMac, podIfAddrs, err = oc.assignPodAddresses(switchName) - if err != nil { - return fmt.Errorf("failed to assign pod addresses for pod %s on switch: %s, err: %v", - portName, switchName, err) - } - } - - releaseIPs = true - network, err := util.GetK8sPodDefaultNetwork(pod) - // handle error cases separately first to ensure binding to err, otherwise the - // defer will fail - if err != nil { - return fmt.Errorf("error while getting custom MAC config for port %q from "+ - "default-network's network-attachment: %v", portName, err) - } - - if network != nil && network.MacRequest != "" { - klog.V(5).Infof("Pod %s/%s requested custom MAC: %s", pod.Namespace, pod.Name, network.MacRequest) - podMac, err = 
net.ParseMAC(network.MacRequest) - if err != nil { - return fmt.Errorf("failed to parse mac %s requested in annotation for pod %s: Error %v", - network.MacRequest, pod.Name, err) - } - } - podAnnotation := util.PodAnnotation{ - IPs: podIfAddrs, - MAC: podMac, - } - var nodeSubnets []*net.IPNet - if nodeSubnets = oc.lsManager.GetSwitchSubnets(switchName); nodeSubnets == nil { - return fmt.Errorf("cannot retrieve subnet for assigning gateway routes for pod %s, node: %s", - pod.Name, switchName) - } - err = oc.addRoutesGatewayIP(pod, &podAnnotation, nodeSubnets) - if err != nil { - return err - } - - klog.V(5).Infof("Annotation values: ip=%v ; mac=%s ; gw=%s", - podIfAddrs, podMac, podAnnotation.Gateways) - annoStart := time.Now() - err = oc.updatePodAnnotationWithRetry(pod, &podAnnotation) - podAnnoTime = time.Since(annoStart) - if err != nil { - return err - } - releaseIPs = false - } - // Ensure the namespace/nsInfo exists - routingExternalGWs, routingPodGWs, ops, err := oc.addPodToNamespace(pod.Namespace, podIfAddrs) + routingExternalGWs, routingPodGWs, addOps, err := oc.addPodToNamespace(pod.Namespace, podAnnotation.IPs) if err != nil { return err } + ops = append(ops, addOps...) 
// if we have any external or pod Gateways, add routes gateways := make([]*gatewayInfo, 0, len(routingExternalGWs.gws)+len(routingPodGWs)) @@ -656,7 +168,7 @@ func (oc *DefaultNetworkController) addLogicalPort(pod *kapi.Pod) (err error) { if len(gateways) > 0 { podNsName := ktypes.NamespacedName{Namespace: pod.Namespace, Name: pod.Name} - err = oc.addGWRoutesForPod(gateways, podIfAddrs, podNsName, pod.Spec.NodeName) + err = oc.addGWRoutesForPod(gateways, podAnnotation.IPs, podNsName, pod.Spec.NodeName) if err != nil { return err } @@ -665,31 +177,11 @@ func (oc *DefaultNetworkController) addLogicalPort(pod *kapi.Pod) (err error) { // namespace annotations to go through external egress router if extIPs, err := getExternalIPsGR(oc.watchFactory, pod.Spec.NodeName); err != nil { return err - } else if ops, err = oc.addOrUpdatePodSNATReturnOps(pod.Spec.NodeName, extIPs, podIfAddrs, ops); err != nil { + } else if ops, err = oc.addOrUpdatePodSNATReturnOps(pod.Spec.NodeName, extIPs, podAnnotation.IPs, ops); err != nil { return err } } - // set addresses on the port - // LSP addresses in OVN are a single space-separated value - addresses = []string{podMac.String()} - for _, podIfAddr := range podIfAddrs { - addresses[0] = addresses[0] + " " + podIfAddr.IP.String() - } - - lsp.Addresses = addresses - - // add external ids - lsp.ExternalIDs = map[string]string{"namespace": pod.Namespace, "pod": "true"} - - // CNI depends on the flows from port security, delay setting it until end - lsp.PortSecurity = addresses - - ops, err = libovsdbops.CreateOrUpdateLogicalSwitchPortsOnSwitchOps(oc.nbClient, ops, ls, lsp) - if err != nil { - return fmt.Errorf("error creating logical switch port %+v on switch %+v: %+v", *lsp, *ls, err) - } - recordOps, txOkCallBack, _, err := metrics.GetConfigDurationRecorder().AddOVN(oc.nbClient, "pod", pod.Namespace, pod.Name) if err != nil { @@ -718,7 +210,7 @@ func (oc *DefaultNetworkController) addLogicalPort(pod *kapi.Pod) (err error) { } // Add the 
pod's logical switch port to the port cache - portInfo := oc.logicalPortCache.add(switchName, portName, lsp.UUID, podMac, podIfAddrs) + portInfo := oc.logicalPortCache.add(switchName, lsp.Name, lsp.UUID, podAnnotation.MAC, podAnnotation.IPs) // If multicast is allowed and enabled for the namespace, add the port to the allow policy. // FIXME: there's a race here with the Namespace multicastUpdateNamespace() handler, but @@ -733,95 +225,8 @@ func (oc *DefaultNetworkController) addLogicalPort(pod *kapi.Pod) (err error) { } } //observe the pod creation latency metric for newly created pods only - if needsIP && !lspExist { + if newlyCreatedPort { metrics.RecordPodCreated(pod) } return nil } - -func (oc *DefaultNetworkController) updatePodAnnotationWithRetry(origPod *kapi.Pod, podInfo *util.PodAnnotation) error { - resultErr := retry.RetryOnConflict(util.OvnConflictBackoff, func() error { - // Informer cache should not be mutated, so get a copy of the object - pod, err := oc.watchFactory.GetPod(origPod.Namespace, origPod.Name) - if err != nil { - return err - } - - cpod := pod.DeepCopy() - cpod.Annotations, err = util.MarshalPodAnnotation(cpod.Annotations, podInfo, ovntypes.DefaultNetworkName) - if err != nil { - return err - } - return oc.kube.UpdatePod(cpod) - }) - if resultErr != nil { - return fmt.Errorf("failed to update annotation on pod %s/%s: %v", origPod.Namespace, origPod.Name, resultErr) - } - return nil -} - -// Given a switch, gets the next set of addresses (from the IPAM) for each of the node's -// subnets to assign to the new pod -func (oc *DefaultNetworkController) assignPodAddresses(switchName string) (net.HardwareAddr, []*net.IPNet, error) { - var ( - podMAC net.HardwareAddr - podCIDRs []*net.IPNet - err error - ) - podCIDRs, err = oc.lsManager.AllocateNextIPs(switchName) - if err != nil { - return nil, nil, err - } - if len(podCIDRs) > 0 { - podMAC = util.IPAddrToHWAddr(podCIDRs[0].IP) - } - return podMAC, podCIDRs, nil -} - -// Given a logical switch 
port and the switch on which it is scheduled, get all -// addresses currently assigned to it including subnet masks. -func (oc *DefaultNetworkController) getPortAddresses(switchName string, existingLSP *nbdb.LogicalSwitchPort) (net.HardwareAddr, []*net.IPNet, error) { - podMac, podIPs, err := util.ExtractPortAddresses(existingLSP) - if err != nil { - return nil, nil, err - } else if podMac == nil || len(podIPs) == 0 { - return nil, nil, nil - } - - var podIPNets []*net.IPNet - - nodeSubnets := oc.lsManager.GetSwitchSubnets(switchName) - - for _, ip := range podIPs { - for _, subnet := range nodeSubnets { - if subnet.Contains(ip) { - podIPNets = append(podIPNets, - &net.IPNet{ - IP: ip, - Mask: subnet.Mask, - }) - break - } - } - } - return podMac, podIPNets, nil -} - -// delLSPOps returns the ovsdb operations required to delete the given logical switch port (LSP) -func (oc *DefaultNetworkController) delLSPOps(logicalPort, switchName, lspUUID string) ([]ovsdb.Operation, error) { - lsUUID, _ := oc.lsManager.GetUUID(switchName) - lsw := nbdb.LogicalSwitch{ - UUID: lsUUID, - Name: switchName, - } - lsp := nbdb.LogicalSwitchPort{ - UUID: lspUUID, - Name: logicalPort, - } - ops, err := libovsdbops.DeleteLogicalSwitchPortsOps(oc.nbClient, nil, &lsw, &lsp) - if err != nil { - return nil, fmt.Errorf("error deleting logical switch port %+v from switch %+v: %w", lsp, lsw, err) - } - - return ops, nil -} diff --git a/go-controller/pkg/ovn/topology_version.go b/go-controller/pkg/ovn/topology_version.go index 93205868e8..cfc0103322 100644 --- a/go-controller/pkg/ovn/topology_version.go +++ b/go-controller/pkg/ovn/topology_version.go @@ -2,14 +2,9 @@ package ovn import ( "context" - "fmt" - "math" "strconv" - libovsdbclient "github.com/ovn-org/libovsdb/client" globalconfig "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config" - "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdbops" - "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb" addressset 
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/address_set" ovntypes "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" @@ -37,17 +32,12 @@ func (oc *DefaultNetworkController) ovnTopologyCleanup() error { // - an ExternalID on the ovn_cluster_router LogicalRouter in nbdb // - a ConfigMap. This is used by nodes to determine the cluster's topology func (oc *DefaultNetworkController) reportTopologyVersion(ctx context.Context) error { - currentTopologyVersion := strconv.Itoa(ovntypes.OvnCurrentTopologyVersion) - logicalRouter := nbdb.LogicalRouter{ - Name: ovntypes.OVNClusterRouter, - ExternalIDs: map[string]string{"k8s-ovn-topo-version": currentTopologyVersion}, - } - err := libovsdbops.UpdateLogicalRouterSetExternalIDs(oc.nbClient, &logicalRouter) + err := oc.updateL3TopologyVersion() if err != nil { - return fmt.Errorf("failed to generate set topology version in OVN, err: %v", err) + return err } - klog.Infof("Updated Logical_Router %s topology version to %s", ovntypes.OVNClusterRouter, currentTopologyVersion) + currentTopologyVersion := strconv.Itoa(ovntypes.OvnCurrentTopologyVersion) // Report topology version in a ConfigMap // (we used to report this via annotations on our Node) cm := corev1apply.ConfigMap(ovntypes.OvnK8sStatusCMName, globalconfig.Kubernetes.OVNConfigNamespace) @@ -99,28 +89,3 @@ func (oc *DefaultNetworkController) cleanTopologyAnnotation() error { return nil } - -// determineOVNTopoVersionFromOVN determines what OVN Topology version is being used -// If "k8s-ovn-topo-version" key in external_ids column does not exist, it is prior to OVN topology versioning -// and therefore set version number to OvnCurrentTopologyVersion -func (oc *DefaultNetworkController) determineOVNTopoVersionFromOVN() (int, error) { - logicalRouter := &nbdb.LogicalRouter{Name: ovntypes.OVNClusterRouter} - logicalRouter, err := libovsdbops.GetLogicalRouter(oc.nbClient, logicalRouter) - if err != nil && err != libovsdbclient.ErrNotFound { - return 0, 
fmt.Errorf("error getting router %s: %v", ovntypes.OVNClusterRouter, err) - } - if err == libovsdbclient.ErrNotFound { - // no OVNClusterRouter exists, DB is empty, nothing to upgrade - return math.MaxInt32, nil - } - v, exists := logicalRouter.ExternalIDs["k8s-ovn-topo-version"] - if !exists { - klog.Infof("No version string found. The OVN topology is before versioning is introduced. Upgrade needed") - return 0, nil - } - ver, err := strconv.Atoi(v) - if err != nil { - return 0, fmt.Errorf("invalid OVN topology version string for the cluster, err: %v", err) - } - return ver, nil -} diff --git a/go-controller/pkg/util/node_annotations.go b/go-controller/pkg/util/node_annotations.go index 60a5ae7164..a245788d88 100644 --- a/go-controller/pkg/util/node_annotations.go +++ b/go-controller/pkg/util/node_annotations.go @@ -338,8 +338,9 @@ func SetNodePrimaryIfAddr(nodeAnnotator kube.Annotator, nodeIPNetv4, nodeIPNetv6 return nodeAnnotator.Set(ovnNodeIfAddr, primaryIfAddrAnnotation) } -// CreateNodeGateRouterLRPAddrAnnotation sets the IPv4 / IPv6 values of the node's Gatewary Router LRP to join switch. -func CreateNodeGateRouterLRPAddrAnnotation(nodeAnnotation map[string]string, nodeIPNetv4, nodeIPNetv6 *net.IPNet) (map[string]string, error) { +// CreateNodeGatewayRouterLRPAddrAnnotation sets the IPv4 / IPv6 values of the node's Gatewary Router LRP to join switch. +func CreateNodeGatewayRouterLRPAddrAnnotation(nodeAnnotation map[string]string, nodeIPNetv4, + nodeIPNetv6 *net.IPNet) (map[string]string, error) { if nodeAnnotation == nil { nodeAnnotation = map[string]string{} } From 1299ed447bbf9b1b385eff9c390bb91a8c3184d1 Mon Sep 17 00:00:00 2001 From: Nadia Pinaeva Date: Wed, 7 Dec 2022 22:20:15 +0100 Subject: [PATCH 4/8] Fix address set cleanup: only delete address sets owned by given object. Fix FakeAddressSetFactory.ProcessEachAddressSet locking to allow calling other FakeAddressSetFactory in iteratorFn. 
Signed-off-by: Nadia Pinaeva --- .../pkg/ovn/address_set/address_set.go | 20 +++------- .../ovn/address_set/address_set_cleanup.go | 2 +- .../pkg/ovn/address_set/address_set_test.go | 34 +++++----------- .../pkg/ovn/address_set/fake_address_set.go | 20 ++++------ go-controller/pkg/ovn/namespace.go | 40 ++++++++++++++++--- go-controller/pkg/ovn/namespace_test.go | 39 ++++++++++++++++++ go-controller/pkg/ovn/policy.go | 33 ++++++++++----- go-controller/pkg/ovn/policy_test.go | 37 +++++++++++++++++ 8 files changed, 157 insertions(+), 68 deletions(-) diff --git a/go-controller/pkg/ovn/address_set/address_set.go b/go-controller/pkg/ovn/address_set/address_set.go index bfb93149bb..fa30a09244 100644 --- a/go-controller/pkg/ovn/address_set/address_set.go +++ b/go-controller/pkg/ovn/address_set/address_set.go @@ -24,7 +24,7 @@ const ( ipv6AddressSetSuffix = "_v6" ) -type AddressSetIterFunc func(hashedName, namespace, suffix string) error +type AddressSetIterFunc func(hashedName, name string) error type AddressSetDoFunc func(as AddressSet) error // AddressSetFactory is an interface for managing address set objects @@ -135,7 +135,7 @@ func (asf *ovnAddressSetFactory) EnsureAddressSet(name string) (AddressSet, erro return &ovnAddressSets{nbClient: asf.nbClient, name: name, ipv4: v4set, ipv6: v6set}, nil } -func forEachAddressSet(nbClient libovsdbclient.Client, do func(string) error) error { +func forEachAddressSet(nbClient libovsdbclient.Client, do func(string, string) error) error { p := func(addrSet *nbdb.AddressSet) bool { _, exists := addrSet.ExternalIDs["name"] return exists @@ -147,7 +147,7 @@ func forEachAddressSet(nbClient libovsdbclient.Client, do func(string) error) er var errors []error for _, addrSet := range addrSetList { - if err := do(addrSet.ExternalIDs["name"]); err != nil { + if err := do(addrSet.Name, addrSet.ExternalIDs["name"]); err != nil { errors = append(errors, err) } } @@ -159,12 +159,10 @@ func forEachAddressSet(nbClient libovsdbclient.Client, do 
func(string) error) er return nil } -// ProcessEachAddressSet will pass the unhashed address set name, namespace name -// and the first suffix in the name to the 'iteratorFn' for every address_set in -// OVN. (Unhashed address set names are of the form namespaceName[.suffix1.suffix2. .suffixN]) +// ProcessEachAddressSet will pass the hashed and unhashed address set name to iteratorFn for every address set. func (asf *ovnAddressSetFactory) ProcessEachAddressSet(iteratorFn AddressSetIterFunc) error { processedAddressSets := sets.String{} - return forEachAddressSet(asf.nbClient, func(name string) error { + return forEachAddressSet(asf.nbClient, func(hashedName, name string) error { // Remove the suffix from the address set name and normalize addrSetName := truncateSuffixFromAddressSet(name) if processedAddressSets.Has(addrSetName) { @@ -174,13 +172,7 @@ func (asf *ovnAddressSetFactory) ProcessEachAddressSet(iteratorFn AddressSetIter return nil } processedAddressSets.Insert(addrSetName) - names := strings.Split(addrSetName, ".") - addrSetNamespace := names[0] - nameSuffix := "" - if len(names) >= 2 { - nameSuffix = names[1] - } - return iteratorFn(addrSetName, addrSetNamespace, nameSuffix) + return iteratorFn(hashedName, addrSetName) }) } diff --git a/go-controller/pkg/ovn/address_set/address_set_cleanup.go b/go-controller/pkg/ovn/address_set/address_set_cleanup.go index e45b9e9e95..7858a8cd8e 100644 --- a/go-controller/pkg/ovn/address_set/address_set_cleanup.go +++ b/go-controller/pkg/ovn/address_set/address_set_cleanup.go @@ -16,7 +16,7 @@ func NonDualStackAddressSetCleanup(nbClient libovsdbclient.Client) error { const old = 0 const new = 1 addressSets := map[string][2]bool{} - err := forEachAddressSet(nbClient, func(name string) error { + err := forEachAddressSet(nbClient, func(hashedName, name string) error { shortName := truncateSuffixFromAddressSet(name) spec, found := addressSets[shortName] if !found { diff --git 
a/go-controller/pkg/ovn/address_set/address_set_test.go b/go-controller/pkg/ovn/address_set/address_set_test.go index 089ef9ee00..42c0a1e52b 100644 --- a/go-controller/pkg/ovn/address_set/address_set_test.go +++ b/go-controller/pkg/ovn/address_set/address_set_test.go @@ -64,7 +64,7 @@ var _ = ginkgo.Describe("OVN Address Set operations", func() { }) ginkgo.Context("when iterating address sets", func() { - ginkgo.It("calls the iterator function for each address set with the given prefix", func() { + ginkgo.It("calls the iterator function for each address set", func() { app.Action = func(ctx *cli.Context) error { dbSetup := libovsdbtest.TestSetup{ NBData: []libovsdbtest.TestData{ @@ -91,33 +91,17 @@ var _ = ginkgo.Describe("OVN Address Set operations", func() { _, err = config.InitConfig(ctx, nil, nil) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - namespaces := []testAddressSetName{ - { - namespace: "ns1", - suffix: []string{"foo", "bar"}, - }, - { - namespace: "ns2", - suffix: []string{"test", "test2"}, - }, - { - namespace: "ns3", - }, + namespaces := map[string]bool{ + "ns1.foo.bar": true, + "ns2.test.test2": true, + "ns3": true, } - - err = asFactory.ProcessEachAddressSet(func(addrSetName, namespaceName, nameSuffix string) error { - found := false - for _, n := range namespaces { - name := n.makeNames() - if addrSetName == name { - found = true - gomega.Expect(namespaceName).To(gomega.Equal(n.namespace)) - } - } - gomega.Expect(found).To(gomega.BeTrue()) + err = asFactory.ProcessEachAddressSet(func(hashedName, addrSetName string) error { + gomega.Expect(namespaces[addrSetName]).To(gomega.BeTrue()) + delete(namespaces, addrSetName) return nil }) + gomega.Expect(len(namespaces)).To(gomega.Equal(0)) gomega.Expect(err).NotTo(gomega.HaveOccurred()) return nil } diff --git a/go-controller/pkg/ovn/address_set/fake_address_set.go b/go-controller/pkg/ovn/address_set/fake_address_set.go index b209eb45a6..e0f5185236 100644 --- 
a/go-controller/pkg/ovn/address_set/fake_address_set.go +++ b/go-controller/pkg/ovn/address_set/fake_address_set.go @@ -3,14 +3,12 @@ package addressset import ( "k8s.io/klog/v2" "net" - "strings" "sync" "sync/atomic" "github.com/ovn-org/libovsdb/ovsdb" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config" - "k8s.io/apimachinery/pkg/util/sets" utilnet "k8s.io/utils/net" "github.com/onsi/gomega" @@ -73,21 +71,17 @@ func (f *FakeAddressSetFactory) EnsureAddressSet(name string) (AddressSet, error func (f *FakeAddressSetFactory) ProcessEachAddressSet(iteratorFn AddressSetIterFunc) error { f.Lock() - defer f.Unlock() - asNames := sets.String{} + asNames := map[string]string{} for _, set := range f.sets { asName := truncateSuffixFromAddressSet(set.getName()) - if asNames.Has(asName) { + if _, ok := asNames[asName]; ok { continue } - asNames.Insert(asName) - parts := strings.Split(asName, ".") - addrSetNamespace := parts[0] - nameSuffix := "" - if len(parts) >= 2 { - nameSuffix = parts[1] - } - if err := iteratorFn(asName, addrSetNamespace, nameSuffix); err != nil { + asNames[asName] = set.hashName + } + f.Unlock() + for asName, hashName := range asNames { + if err := iteratorFn(hashName, asName); err != nil { return err } } diff --git a/go-controller/pkg/ovn/namespace.go b/go-controller/pkg/ovn/namespace.go index dd9552b2f1..186e8dcb70 100644 --- a/go-controller/pkg/ovn/namespace.go +++ b/go-controller/pkg/ovn/namespace.go @@ -4,11 +4,14 @@ import ( "context" "fmt" "net" + "strings" "sync" "time" "github.com/ovn-org/libovsdb/ovsdb" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdbops" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb" addressset "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/address_set" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" @@ -67,11 +70,38 @@ func (oc 
*DefaultNetworkController) syncNamespaces(namespaces []interface{}) err expectedNs[ns.Name] = true } - err := oc.addressSetFactory.ProcessEachAddressSet(func(addrSetName, namespaceName, nameSuffix string) error { - if nameSuffix == "" && !expectedNs[namespaceName] { - if err := oc.addressSetFactory.DestroyAddressSetInBackingStore(addrSetName); err != nil { - klog.Errorf(err.Error()) - return err + err := oc.addressSetFactory.ProcessEachAddressSet(func(hashedName, addrSetName string) error { + // filter out address sets owned by HybridRoutePolicy and EgressQoS by prefix, and + // owned by network policy by dot in the name (namespace can't have dots in its name). + // the only left address sets may be owned by egress firewall dns or namespace + if !strings.HasPrefix(addrSetName, types.HybridRoutePolicyPrefix) && + !strings.HasPrefix(addrSetName, types.EgressQoSRulePrefix) && + !strings.Contains(addrSetName, ".") { + // make sure address set is not owned by egress firewall dns + // find ACLs referencing given address set (by hashName) + aclPred := func(acl *nbdb.ACL) bool { + return strings.Contains(acl.Match, "$"+hashedName) + } + acls, err := libovsdbops.FindACLsWithPredicate(oc.nbClient, aclPred) + if err != nil { + return fmt.Errorf("failed to find referencing acls for address set %s: %v", addrSetName, err) + } + if len(acls) > 0 { + // if given address set is owned by egress firewall, all ACLs will be owned by the same object + acl := acls[0] + // check if egress firewall dns is the owner + // the only address set that may be referenced in egress firewall destination is dns address set + if acl.ExternalIDs[egressFirewallACLExtIdKey] != "" && strings.Contains(acl.Match, ".dst == $"+hashedName) { + // address set is owned by egress firewall, skip + return nil + } + } + // address set is owned by namespace, namespace name = address set name + if !expectedNs[addrSetName] { + if err := oc.addressSetFactory.DestroyAddressSetInBackingStore(addrSetName); err != nil { + 
klog.Errorf(err.Error()) + return err + } } } return nil diff --git a/go-controller/pkg/ovn/namespace_test.go b/go-controller/pkg/ovn/namespace_test.go index 5fb4b3a214..c1262537de 100644 --- a/go-controller/pkg/ovn/namespace_test.go +++ b/go-controller/pkg/ovn/namespace_test.go @@ -2,6 +2,7 @@ package ovn import ( "context" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb" "net" "sync" @@ -88,6 +89,44 @@ var _ = ginkgo.Describe("OVN Namespace Operations", func() { }) ginkgo.Context("on startup", func() { + ginkgo.It("only cleans up address sets owned by namespace", func() { + namespace1 := newNamespace(namespaceName) + // namespace-owned address set for existing namespace, should stay + fakeOvn.asf.NewAddressSet(namespaceName, []net.IP{net.ParseIP("1.1.1.1")}) + // namespace-owned address set for stale namespace, should be deleted + fakeOvn.asf.NewAddressSet("namespace2", []net.IP{net.ParseIP("1.1.1.2")}) + // netpol-owned address set for existing netpol, should stay + fakeOvn.asf.NewAddressSet("namespace1.netpol1.egress.0", []net.IP{net.ParseIP("1.1.1.3")}) + // egressQoS-owned address set, should stay + fakeOvn.asf.NewAddressSet(ovntypes.EgressQoSRulePrefix+"namespace", []net.IP{net.ParseIP("1.1.1.4")}) + // hybridNode-owned address set, should stay + fakeOvn.asf.NewAddressSet(ovntypes.HybridRoutePolicyPrefix+"node", []net.IP{net.ParseIP("1.1.1.5")}) + // egress firewall-owned address set, should stay + // needs exiting ACL to distinguish from namespace-owned + dnsAS, err := fakeOvn.asf.NewAddressSet("dnsname", []net.IP{net.ParseIP("1.1.1.6")}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + dnsHashName, _ := dnsAS.GetASHashNames() + egressFirewallACL := BuildACL( + "aclName", + 1, + "ip4.dst == $"+dnsHashName, + nbdb.ACLActionAllow, + nil, + lportIngress, + map[string]string{egressFirewallACLExtIdKey: "egressfirewall1"}, + ) + + fakeOvn.startWithDBSetup(libovsdbtest.TestSetup{NBData: []libovsdbtest.TestData{egressFirewallACL}}) + err = 
fakeOvn.controller.syncNamespaces([]interface{}{namespace1}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + fakeOvn.asf.ExpectAddressSetWithIPs(namespaceName, []string{"1.1.1.1"}) + fakeOvn.asf.EventuallyExpectNoAddressSet("namespace2") + fakeOvn.asf.ExpectAddressSetWithIPs("namespace1.netpol1.egress.0", []string{"1.1.1.3"}) + fakeOvn.asf.ExpectAddressSetWithIPs(ovntypes.EgressQoSRulePrefix+"namespace", []string{"1.1.1.4"}) + fakeOvn.asf.ExpectAddressSetWithIPs(ovntypes.HybridRoutePolicyPrefix+"node", []string{"1.1.1.5"}) + fakeOvn.asf.ExpectAddressSetWithIPs("dnsname", []string{"1.1.1.6"}) + }) ginkgo.It("reconciles an existing namespace with pods", func() { namespaceT := *newNamespace(namespaceName) diff --git a/go-controller/pkg/ovn/policy.go b/go-controller/pkg/ovn/policy.go index b0cf2eefdd..92b8fc9d3a 100644 --- a/go-controller/pkg/ovn/policy.go +++ b/go-controller/pkg/ovn/policy.go @@ -3,6 +3,7 @@ package ovn import ( "fmt" "net" + "strconv" "strings" "sync" @@ -300,16 +301,28 @@ func (oc *DefaultNetworkController) syncNetworkPolicies(networkPolicies []interf } stalePGs := []string{} - err := oc.addressSetFactory.ProcessEachAddressSet(func(addrSetName, namespaceName, policyName string) error { - if policyName != "" && !expectedPolicies[namespaceName][policyName] { - // policy doesn't exist on k8s. 
Delete the port group - portGroupName := fmt.Sprintf("%s_%s", namespaceName, policyName) - hashedLocalPortGroup := hashedPortGroup(portGroupName) - stalePGs = append(stalePGs, hashedLocalPortGroup) - // delete the address sets for this old policy from OVN - if err := oc.addressSetFactory.DestroyAddressSetInBackingStore(addrSetName); err != nil { - klog.Errorf(err.Error()) - return err + err := oc.addressSetFactory.ProcessEachAddressSet(func(hashedName, addrSetName string) error { + // netpol name has format fmt.Sprintf("%s.%s.%s.%d", gp.policyNamespace, gp.policyName, direction, gp.idx) + s := strings.Split(addrSetName, ".") + sLen := len(s) + // priority should be a number + _, numErr := strconv.Atoi(s[sLen-1]) + if sLen >= 4 && (s[sLen-2] == "ingress" || s[sLen-2] == "egress") && numErr == nil { + // address set is owned by network policy + // namespace doesn't have dots + namespaceName := s[0] + // policyName may have dots, join in that case + policyName := strings.Join(s[1:sLen-2], ".") + if !expectedPolicies[namespaceName][policyName] { + // policy doesn't exist on k8s. 
Delete the port group + portGroupName := fmt.Sprintf("%s_%s", namespaceName, policyName) + hashedLocalPortGroup := hashedPortGroup(portGroupName) + stalePGs = append(stalePGs, hashedLocalPortGroup) + // delete the address sets for this old policy from OVN + if err := oc.addressSetFactory.DestroyAddressSetInBackingStore(addrSetName); err != nil { + klog.Errorf(err.Error()) + return err + } } } return nil diff --git a/go-controller/pkg/ovn/policy_test.go b/go-controller/pkg/ovn/policy_test.go index 443377641e..144faa3a53 100644 --- a/go-controller/pkg/ovn/policy_test.go +++ b/go-controller/pkg/ovn/policy_test.go @@ -3,6 +3,7 @@ package ovn import ( "context" "fmt" + "net" "sort" "strconv" "strings" @@ -513,6 +514,42 @@ var _ = ginkgo.Describe("OVN NetworkPolicy Operations", func() { } ginkgo.Context("on startup", func() { + ginkgo.It("only cleans up address sets owned by network policy", func() { + app.Action = func(ctx *cli.Context) error { + namespace1 := *newNamespace(namespaceName1) + namespace2 := *newNamespace(namespaceName2) + networkPolicy := getMatchLabelsNetworkPolicy(netPolicyName1, namespace1.Name, + namespace2.Name, "", true, true) + // namespace-owned address set, should stay + fakeOvn.asf.NewAddressSet("namespace", []net.IP{net.ParseIP("1.1.1.1")}) + // netpol-owned address set for existing netpol, should stay + fakeOvn.asf.NewAddressSet(fmt.Sprintf("namespace1.%s.egress.0", netPolicyName1), []net.IP{net.ParseIP("1.1.1.2")}) + // netpol-owned address set for non-exisitng netpol, should be deleted + fakeOvn.asf.NewAddressSet("namespace1.not-existing.egress.0", []net.IP{net.ParseIP("1.1.1.3")}) + // egressQoS-owned address set, should stay + fakeOvn.asf.NewAddressSet(types.EgressQoSRulePrefix+"namespace", []net.IP{net.ParseIP("1.1.1.4")}) + // hybridNode-owned address set, should stay + fakeOvn.asf.NewAddressSet(types.HybridRoutePolicyPrefix+"node", []net.IP{net.ParseIP("1.1.1.5")}) + // egress firewall-owned address set, should stay + 
fakeOvn.asf.NewAddressSet("test.dns.name", []net.IP{net.ParseIP("1.1.1.6")}) + + fakeOvn.start() + err := fakeOvn.controller.syncNetworkPolicies([]interface{}{networkPolicy}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + fakeOvn.asf.ExpectAddressSetWithIPs("namespace", []string{"1.1.1.1"}) + fakeOvn.asf.ExpectAddressSetWithIPs(fmt.Sprintf("namespace1.%s.egress.0", netPolicyName1), []string{"1.1.1.2"}) + fakeOvn.asf.EventuallyExpectNoAddressSet("namespace1.not-existing.egress.0") + fakeOvn.asf.ExpectAddressSetWithIPs(types.EgressQoSRulePrefix+"namespace", []string{"1.1.1.4"}) + fakeOvn.asf.ExpectAddressSetWithIPs(types.HybridRoutePolicyPrefix+"node", []string{"1.1.1.5"}) + fakeOvn.asf.ExpectAddressSetWithIPs("test.dns.name", []string{"1.1.1.6"}) + return nil + } + + err := app.Run([]string{app.Name}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }) + ginkgo.It("reconciles an existing networkPolicy with empty db", func() { app.Action = func(ctx *cli.Context) error { namespace1 := *newNamespace(namespaceName1) From b5488fb1a3ef74d25ccf8a37d0d3cf0cbd09c95a Mon Sep 17 00:00:00 2001 From: Tim Rozet Date: Thu, 8 Dec 2022 14:58:06 -0500 Subject: [PATCH 5/8] Avoid duplicate address set transactions When pod updates happen, handlers may be invoked that add pods IPs to address sets. A prime example of this is network policy handlePeerPods. Each time the pod is updated, the address will be added to the address set with a mutate-insert operation. The cost of this txn when there are many network policies that apply to this pod is high. For example, with a pod that has 730 network policies, and 7 pod updates that would result in 5110 transactions instead of only the 730 needed to add this pod in the first place. 
Signed-off-by: Tim Rozet --- .../pkg/ovn/address_set/address_set.go | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/go-controller/pkg/ovn/address_set/address_set.go b/go-controller/pkg/ovn/address_set/address_set.go index bfb93149bb..20b2037eb8 100644 --- a/go-controller/pkg/ovn/address_set/address_set.go +++ b/go-controller/pkg/ovn/address_set/address_set.go @@ -496,6 +496,10 @@ func (as *ovnAddressSet) addIPs(ips []net.IP) ([]ovsdb.Operation, error) { uniqIPs := ipsToStringUnique(ips) + if as.hasIPs(uniqIPs...) { + return nil, nil + } + klog.V(5).Infof("(%s) adding IPs (%s) to address set", asDetail(as), uniqIPs) addrset := nbdb.AddressSet{ @@ -510,6 +514,20 @@ func (as *ovnAddressSet) addIPs(ips []net.IP) ([]ovsdb.Operation, error) { return ops, nil } +// hasIPs returns true if an address set contains all given IPs +func (as *ovnAddressSet) hasIPs(ips ...string) bool { + existingIPs, err := as.getIPs() + if err != nil { + return false + } + + if len(existingIPs) == 0 { + return false + } + + return sets.NewString(existingIPs...).HasAll(ips...) +} + // deleteIPs removes selected IPs from the existing address_set func (as *ovnAddressSet) deleteIPs(ips []net.IP) ([]ovsdb.Operation, error) { if len(ips) == 0 { From ad43995f1d3eea1bea7c1b3148650d999d847d57 Mon Sep 17 00:00:00 2001 From: Tim Rozet Date: Thu, 8 Dec 2022 15:53:31 -0500 Subject: [PATCH 6/8] Avoid duplicate add port to port group txns Checks the cache in order to decide whether or not we actually need to send a transaction to add the port. This results in a perf gain of not wasting OVSDB transactions. 
Signed-off-by: Tim Rozet --- go-controller/pkg/libovsdbops/portgroup.go | 19 +++++++++++++++++++ go-controller/pkg/ovn/policy.go | 22 +++++++++++++++++++--- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/go-controller/pkg/libovsdbops/portgroup.go b/go-controller/pkg/libovsdbops/portgroup.go index 238d7b1c17..97540065e6 100644 --- a/go-controller/pkg/libovsdbops/portgroup.go +++ b/go-controller/pkg/libovsdbops/portgroup.go @@ -75,6 +75,25 @@ func CreateOrUpdatePortGroups(nbClient libovsdbclient.Client, pgs ...*nbdb.PortG return err } +// GetPortGroup looks up a port group from the cache +func GetPortGroup(nbClient libovsdbclient.Client, pg *nbdb.PortGroup) (*nbdb.PortGroup, error) { + found := []*nbdb.PortGroup{} + opModel := operationModel{ + Model: pg, + ExistingResult: &found, + ErrNotFound: true, + BulkOp: false, + } + + m := newModelClient(nbClient) + err := m.Lookup(opModel) + if err != nil { + return nil, err + } + + return found[0], nil +} + func AddPortsToPortGroupOps(nbClient libovsdbclient.Client, ops []libovsdb.Operation, name string, ports ...string) ([]libovsdb.Operation, error) { if len(ports) == 0 { return ops, nil diff --git a/go-controller/pkg/ovn/policy.go b/go-controller/pkg/ovn/policy.go index b0cf2eefdd..955825d8cf 100644 --- a/go-controller/pkg/ovn/policy.go +++ b/go-controller/pkg/ovn/policy.go @@ -723,6 +723,7 @@ func (oc *DefaultNetworkController) getExistingLocalPolicyPorts(np *networkPolic // denyPGAddPorts adds ports to default deny port groups. // It also can take existing ops e.g. to add port to network policy port group and transact it. +// It only adds new ports that do not already exist in the deny port groups. 
func (oc *DefaultNetworkController) denyPGAddPorts(np *networkPolicy, portNamesToUUIDs map[string]string, ops []ovsdb.Operation) error { var err error ingressDenyPGName := defaultDenyPortGroupName(np.namespace, ingressDefaultDenySuffix) @@ -838,9 +839,11 @@ func (oc *DefaultNetworkController) handleLocalPodSelectorAddFunc(np *networkPol var err error // add pods to policy port group var ops []ovsdb.Operation - ops, err = libovsdbops.AddPortsToPortGroupOps(oc.nbClient, nil, np.portGroupName, policyPortUUIDs...) - if err != nil { - return fmt.Errorf("unable to get ops to add new pod to policy port group: %v", err) + if !PortGroupHasPorts(oc.nbClient, np.portGroupName, policyPortUUIDs) { + ops, err = libovsdbops.AddPortsToPortGroupOps(oc.nbClient, nil, np.portGroupName, policyPortUUIDs...) + if err != nil { + return fmt.Errorf("unable to get ops to add new pod to policy port group: %v", err) + } } // add pods to default deny port group // make sure to only pass newly added pods @@ -1704,3 +1707,16 @@ func getPolicyKey(policy *knet.NetworkPolicy) string { func (np *networkPolicy) getKey() string { return fmt.Sprintf("%v/%v", np.namespace, np.name) } + +// PortGroupHasPorts returns true if a port group contains all given ports +func PortGroupHasPorts(nbClient libovsdbclient.Client, pgName string, portUUIDs []string) bool { + pg := &nbdb.PortGroup{ + Name: pgName, + } + pg, err := libovsdbops.GetPortGroup(nbClient, pg) + if err != nil { + return false + } + + return sets.NewString(pg.Ports...).HasAll(portUUIDs...) +} From 66bb8fe1b3632f9d06b1afee6d3a0874d3bee1d0 Mon Sep 17 00:00:00 2001 From: Nadia Pinaeva Date: Thu, 8 Dec 2022 16:09:09 +0100 Subject: [PATCH 7/8] Delete egress firewall acls before deleting referenced address sets to avoid ovn-controller syntax error. 
Signed-off-by: Nadia Pinaeva --- .../pkg/ovn/address_set/address_set.go | 2 + .../pkg/ovn/address_set/address_set_test.go | 20 +++--- go-controller/pkg/ovn/egressfirewall.go | 8 +-- go-controller/pkg/ovn/namespace.go | 61 ++++++++++--------- go-controller/pkg/ovn/namespace_test.go | 4 +- go-controller/pkg/ovn/policy.go | 36 +++++------ 6 files changed, 69 insertions(+), 62 deletions(-) diff --git a/go-controller/pkg/ovn/address_set/address_set.go b/go-controller/pkg/ovn/address_set/address_set.go index fa30a09244..3cf1e75741 100644 --- a/go-controller/pkg/ovn/address_set/address_set.go +++ b/go-controller/pkg/ovn/address_set/address_set.go @@ -135,6 +135,8 @@ func (asf *ovnAddressSetFactory) EnsureAddressSet(name string) (AddressSet, erro return &ovnAddressSets{nbClient: asf.nbClient, name: name, ipv4: v4set, ipv6: v6set}, nil } +// forEachAddressSet executes a do function on each address set found to have ExternalIDs["name"]. +// do function should take parameters: hashed addr set name, real name func forEachAddressSet(nbClient libovsdbclient.Client, do func(string, string) error) error { p := func(addrSet *nbdb.AddressSet) bool { _, exists := addrSet.ExternalIDs["name"] diff --git a/go-controller/pkg/ovn/address_set/address_set_test.go b/go-controller/pkg/ovn/address_set/address_set_test.go index 42c0a1e52b..fc13618344 100644 --- a/go-controller/pkg/ovn/address_set/address_set_test.go +++ b/go-controller/pkg/ovn/address_set/address_set_test.go @@ -70,16 +70,16 @@ var _ = ginkgo.Describe("OVN Address Set operations", func() { NBData: []libovsdbtest.TestData{ &nbdb.AddressSet{ Name: "1", - ExternalIDs: map[string]string{"name": "ns1.foo.bar"}, + ExternalIDs: map[string]string{"name": "foo.bar"}, }, &nbdb.AddressSet{ Name: "2", - ExternalIDs: map[string]string{"name": "ns2.test.test2"}, + ExternalIDs: map[string]string{"name": "test.test2"}, }, &nbdb.AddressSet{ Name: "3", - ExternalIDs: map[string]string{"name": "ns3"}, + ExternalIDs: map[string]string{"name": 
"test3"}, }, }, } @@ -91,17 +91,17 @@ var _ = ginkgo.Describe("OVN Address Set operations", func() { _, err = config.InitConfig(ctx, nil, nil) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - namespaces := map[string]bool{ - "ns1.foo.bar": true, - "ns2.test.test2": true, - "ns3": true, + expectedAddressSets := map[string]bool{ + "foo.bar": true, + "test.test2": true, + "test3": true, } err = asFactory.ProcessEachAddressSet(func(hashedName, addrSetName string) error { - gomega.Expect(namespaces[addrSetName]).To(gomega.BeTrue()) - delete(namespaces, addrSetName) + gomega.Expect(expectedAddressSets[addrSetName]).To(gomega.BeTrue()) + delete(expectedAddressSets, addrSetName) return nil }) - gomega.Expect(len(namespaces)).To(gomega.Equal(0)) + gomega.Expect(len(expectedAddressSets)).To(gomega.Equal(0)) gomega.Expect(err).NotTo(gomega.HaveOccurred()) return nil } diff --git a/go-controller/pkg/ovn/egressfirewall.go b/go-controller/pkg/ovn/egressfirewall.go index a2d77b31be..84519e5588 100644 --- a/go-controller/pkg/ovn/egressfirewall.go +++ b/go-controller/pkg/ovn/egressfirewall.go @@ -275,15 +275,15 @@ func (oc *DefaultNetworkController) deleteEgressFirewall(egressFirewallObj *egre break } } + // delete acls first, then dns address set that is referenced in these acls + if err := oc.deleteEgressFirewallRules(egressFirewallObj.Namespace); err != nil { + return err + } if deleteDNS { if err := oc.egressFirewallDNS.Delete(egressFirewallObj.Namespace); err != nil { return err } } - - if err := oc.deleteEgressFirewallRules(egressFirewallObj.Namespace); err != nil { - return err - } oc.egressFirewalls.Delete(egressFirewallObj.Namespace) return nil } diff --git a/go-controller/pkg/ovn/namespace.go b/go-controller/pkg/ovn/namespace.go index 186e8dcb70..e6150cc162 100644 --- a/go-controller/pkg/ovn/namespace.go +++ b/go-controller/pkg/ovn/namespace.go @@ -71,39 +71,42 @@ func (oc *DefaultNetworkController) syncNamespaces(namespaces []interface{}) err } err := 
oc.addressSetFactory.ProcessEachAddressSet(func(hashedName, addrSetName string) error { - // filter out address sets owned by HybridRoutePolicy and EgressQoS by prefix, and - // owned by network policy by dot in the name (namespace can't have dots in its name). + // filter out address sets owned by HybridRoutePolicy and EgressQoS by prefix. + // network policy-owned address set would have a dot in the address set name due to the format + // (namespace can't have dots in its name, and their address sets too). // the only left address sets may be owned by egress firewall dns or namespace - if !strings.HasPrefix(addrSetName, types.HybridRoutePolicyPrefix) && - !strings.HasPrefix(addrSetName, types.EgressQoSRulePrefix) && - !strings.Contains(addrSetName, ".") { - // make sure address set is not owned by egress firewall dns - // find ACLs referencing given address set (by hashName) - aclPred := func(acl *nbdb.ACL) bool { - return strings.Contains(acl.Match, "$"+hashedName) - } - acls, err := libovsdbops.FindACLsWithPredicate(oc.nbClient, aclPred) - if err != nil { - return fmt.Errorf("failed to find referencing acls for address set %s: %v", addrSetName, err) - } - if len(acls) > 0 { - // if given address set is owned by egress firewall, all ACLs will be owned by the same object - acl := acls[0] - // check if egress firewall dns is the owner - // the only address set that may be referenced in egress firewall destination is dns address set - if acl.ExternalIDs[egressFirewallACLExtIdKey] != "" && strings.Contains(acl.Match, ".dst == $"+hashedName) { - // address set is owned by egress firewall, skip - return nil - } + if strings.HasPrefix(addrSetName, types.HybridRoutePolicyPrefix) || + strings.HasPrefix(addrSetName, types.EgressQoSRulePrefix) || + strings.Contains(addrSetName, ".") { + return nil + } + + // make sure address set is not owned by egress firewall dns + // find ACLs referencing given address set (by hashName) + aclPred := func(acl *nbdb.ACL) bool { + return 
strings.Contains(acl.Match, "$"+hashedName) + } + acls, err := libovsdbops.FindACLsWithPredicate(oc.nbClient, aclPred) + if err != nil { + return fmt.Errorf("failed to find referencing acls for address set %s: %v", addrSetName, err) + } + if len(acls) > 0 { + // if given address set is owned by egress firewall, all ACLs will be owned by the same object + acl := acls[0] + // check if egress firewall dns is the owner + // the only address set that may be referenced in egress firewall destination is dns address set + if acl.ExternalIDs[egressFirewallACLExtIdKey] != "" && strings.Contains(acl.Match, ".dst == $"+hashedName) { + // address set is owned by egress firewall, skip + return nil } - // address set is owned by namespace, namespace name = address set name - if !expectedNs[addrSetName] { - if err := oc.addressSetFactory.DestroyAddressSetInBackingStore(addrSetName); err != nil { - klog.Errorf(err.Error()) - return err - } + } + // address set is owned by namespace, namespace name = address set name + if !expectedNs[addrSetName] { + if err = oc.addressSetFactory.DestroyAddressSetInBackingStore(addrSetName); err != nil { + return err } } + return nil }) if err != nil { diff --git a/go-controller/pkg/ovn/namespace_test.go b/go-controller/pkg/ovn/namespace_test.go index c1262537de..e9a5042899 100644 --- a/go-controller/pkg/ovn/namespace_test.go +++ b/go-controller/pkg/ovn/namespace_test.go @@ -2,7 +2,6 @@ package ovn import ( "context" - "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb" "net" "sync" @@ -11,6 +10,7 @@ import ( "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/kube" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb" lsm "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/logical_switch_manager" ovntest "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/testing" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/testing/libovsdb" @@ -102,7 +102,7 @@ var _ = 
ginkgo.Describe("OVN Namespace Operations", func() { // hybridNode-owned address set, should stay fakeOvn.asf.NewAddressSet(ovntypes.HybridRoutePolicyPrefix+"node", []net.IP{net.ParseIP("1.1.1.5")}) // egress firewall-owned address set, should stay - // needs exiting ACL to distinguish from namespace-owned + // needs existing ACL to distinguish from namespace-owned dnsAS, err := fakeOvn.asf.NewAddressSet("dnsname", []net.IP{net.ParseIP("1.1.1.6")}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) dnsHashName, _ := dnsAS.GetASHashNames() diff --git a/go-controller/pkg/ovn/policy.go b/go-controller/pkg/ovn/policy.go index 92b8fc9d3a..4887125bf1 100644 --- a/go-controller/pkg/ovn/policy.go +++ b/go-controller/pkg/ovn/policy.go @@ -301,28 +301,30 @@ func (oc *DefaultNetworkController) syncNetworkPolicies(networkPolicies []interf } stalePGs := []string{} - err := oc.addressSetFactory.ProcessEachAddressSet(func(hashedName, addrSetName string) error { + err := oc.addressSetFactory.ProcessEachAddressSet(func(_, addrSetName string) error { // netpol name has format fmt.Sprintf("%s.%s.%s.%d", gp.policyNamespace, gp.policyName, direction, gp.idx) s := strings.Split(addrSetName, ".") sLen := len(s) // priority should be a number _, numErr := strconv.Atoi(s[sLen-1]) - if sLen >= 4 && (s[sLen-2] == "ingress" || s[sLen-2] == "egress") && numErr == nil { - // address set is owned by network policy - // namespace doesn't have dots - namespaceName := s[0] - // policyName may have dots, join in that case - policyName := strings.Join(s[1:sLen-2], ".") - if !expectedPolicies[namespaceName][policyName] { - // policy doesn't exist on k8s. 
Delete the port group - portGroupName := fmt.Sprintf("%s_%s", namespaceName, policyName) - hashedLocalPortGroup := hashedPortGroup(portGroupName) - stalePGs = append(stalePGs, hashedLocalPortGroup) - // delete the address sets for this old policy from OVN - if err := oc.addressSetFactory.DestroyAddressSetInBackingStore(addrSetName); err != nil { - klog.Errorf(err.Error()) - return err - } + if sLen < 4 || (s[sLen-2] != "ingress" && s[sLen-2] != "egress") || numErr != nil { + // address set name is not formatted by network policy + return nil + } + + // address set is owned by network policy + // namespace doesn't have dots + namespaceName := s[0] + // policyName may have dots, join in that case + policyName := strings.Join(s[1:sLen-2], ".") + if !expectedPolicies[namespaceName][policyName] { + // policy doesn't exist on k8s. Delete the port group + portGroupName := fmt.Sprintf("%s_%s", namespaceName, policyName) + hashedLocalPortGroup := hashedPortGroup(portGroupName) + stalePGs = append(stalePGs, hashedLocalPortGroup) + // delete the address sets for this old policy from OVN + if err := oc.addressSetFactory.DestroyAddressSetInBackingStore(addrSetName); err != nil { + return err } } return nil From b825a293fa0a73dbba8245fe0e782f93c1913767 Mon Sep 17 00:00:00 2001 From: Nadia Pinaeva Date: Mon, 12 Dec 2022 16:38:52 +0100 Subject: [PATCH 8/8] Use PeerPodHandler if namespaceAndPod selector has empty namespace selector to reduce the number of per-namespace pod handlers. We don't need per-namespace handlers in case of empty namespace selector, just use pod selector with empty namespace. 
Signed-off-by: Nadia Pinaeva --- go-controller/pkg/ovn/policy.go | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/go-controller/pkg/ovn/policy.go b/go-controller/pkg/ovn/policy.go index b0cf2eefdd..b67abab809 100644 --- a/go-controller/pkg/ovn/policy.go +++ b/go-controller/pkg/ovn/policy.go @@ -1139,7 +1139,13 @@ func (oc *DefaultNetworkController) createNetworkPolicy(policy *knet.NetworkPoli // For each rule that contains both peer namespace selector and // peer pod selector, we create a watcher for each matching namespace // that populates the addressSet - err = oc.addPeerNamespaceAndPodHandler(handler.namespaceSelector, handler.podSelector, handler.gress, np) + nsSel, _ := metav1.LabelSelectorAsSelector(handler.namespaceSelector) + if nsSel.Empty() { + // namespace is not limited by a selector, just use pod selector with empty namespace + err = oc.addPeerPodHandler(handler.podSelector, handler.gress, np, "") + } else { + err = oc.addPeerNamespaceAndPodHandler(handler.namespaceSelector, handler.podSelector, handler.gress, np) + } } else if handler.namespaceSelector != nil { // For each peer namespace selector, we create a watcher that // populates ingress.peerAddressSets @@ -1147,7 +1153,7 @@ func (oc *DefaultNetworkController) createNetworkPolicy(policy *knet.NetworkPoli } else if handler.podSelector != nil { // For each peer pod selector, we create a watcher that // populates the addressSet - err = oc.addPeerPodHandler(handler.podSelector, handler.gress, np) + err = oc.addPeerPodHandler(handler.podSelector, handler.gress, np, np.namespace) } if err != nil { return fmt.Errorf("failed to start peer handler: %v", err) @@ -1422,7 +1428,7 @@ type NetworkPolicyExtraParameters struct { // PeerPodSelectorType uses handlePeerPodSelectorAddUpdate on Add and Update, // and handlePeerPodSelectorDelete on Delete. 
func (oc *DefaultNetworkController) addPeerPodHandler(podSelector *metav1.LabelSelector, - gp *gressPolicy, np *networkPolicy) error { + gp *gressPolicy, np *networkPolicy, namespace string) error { // NetworkPolicy is validated by the apiserver; this can't fail. sel, _ := metav1.LabelSelectorAsSelector(podSelector) @@ -1442,7 +1448,7 @@ func (oc *DefaultNetworkController) addPeerPodHandler(podSelector *metav1.LabelS gp: gp, }) - podHandler, err := retryPeerPods.WatchResourceFiltered(np.namespace, sel) + podHandler, err := retryPeerPods.WatchResourceFiltered(namespace, sel) if err != nil { klog.Errorf("Failed WatchResource for addPeerPodHandler: %v", err) return err