Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: [NPM] Reduce/Refactor Noisy NPM Logs #3468

Open
wants to merge 35 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
221479e
removed all logs from npm dataplane (except error/warning logs)
rayaisaiah Mar 3, 2025
d603453
removed all logs from npm controller (except error/warning logs)
rayaisaiah Mar 3, 2025
36eb2d6
restored logs that are ununused by current npm (v2)
rayaisaiah Mar 3, 2025
13501a7
removed test files
rayaisaiah Mar 3, 2025
6a2eda5
resolved comments
rayaisaiah Mar 4, 2025
a373499
keep log related to reconciling chain placement when the chain is not…
rayaisaiah Mar 4, 2025
2004053
added bootup logs back
rayaisaiah Mar 4, 2025
8a8eff1
Removed two more noisy logs
rayaisaiah Mar 4, 2025
ba84f91
Add loglevel config option when printing application insight logs
rayaisaiah Mar 5, 2025
b72bec7
Updated all non-error/warning logs to commented out and with a vap TODO
rayaisaiah Mar 5, 2025
821c9ef
fixed typo
rayaisaiah Mar 5, 2025
207dadb
small typo fix
rayaisaiah Mar 5, 2025
f208a60
updated configmap with loglevel
rayaisaiah Mar 6, 2025
3004ec6
updated default value
rayaisaiah Mar 6, 2025
d8ec874
added a default value for loglevel
rayaisaiah Mar 6, 2025
28c5322
fixed typo in json
rayaisaiah Mar 6, 2025
1449775
removed comma
rayaisaiah Mar 6, 2025
38cbdd9
changed loglevel to info in configmap
rayaisaiah Mar 6, 2025
cf04343
add a short sleep in TestNetPolInBackgroundSkipAddAfterRemove
rayaisaiah Mar 6, 2025
08697eb
test remove dataplane changes to see if race condition fixes
rayaisaiah Mar 6, 2025
394dd0b
Revert "test remove dataplane changes to see if race condition fixes"
rayaisaiah Mar 6, 2025
449c2af
test
rayaisaiah Mar 6, 2025
a69d632
Revert "test"
rayaisaiah Mar 6, 2025
1aef051
test
rayaisaiah Mar 6, 2025
dd0869f
update dataplane to test if changes are flagged in race
rayaisaiah Mar 6, 2025
34c3aea
added stop channels to unit tests to avoid race condtiions
rayaisaiah Mar 7, 2025
52b8247
add non noisy logs back
rayaisaiah Mar 7, 2025
0e91284
increased time
rayaisaiah Mar 7, 2025
b416f07
revert time change after RunPeriodicTasks
rayaisaiah Mar 7, 2025
f92231b
test with 1000 seconds
rayaisaiah Mar 7, 2025
1e41f52
5000 milliseconds
rayaisaiah Mar 7, 2025
0a2d1ba
tweaked the delay
rayaisaiah Mar 7, 2025
117c5d8
update to 1500 for defer
rayaisaiah Mar 7, 2025
cabf807
increased to 1500
rayaisaiah Mar 7, 2025
4e88cd2
increase to 2000
rayaisaiah Mar 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 0 additions & 39 deletions npm/pkg/dataplane/dataplane.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ type DataPlane struct {
func NewDataPlane(nodeName string, ioShim *common.IOShim, cfg *Config, stopChannel <-chan struct{}) (*DataPlane, error) {
metrics.InitializeAll()
if util.IsWindowsDP() {
klog.Infof("[DataPlane] enabling AddEmptySetToLists for Windows")
cfg.IPSetManagerCfg.AddEmptySetToLists = true
}

Expand All @@ -106,13 +105,11 @@ func NewDataPlane(nodeName string, ioShim *common.IOShim, cfg *Config, stopChann
// do not let Linux apply in background
dp.applyInBackground = cfg.ApplyInBackground && util.IsWindowsDP()
if dp.applyInBackground {
klog.Infof("[DataPlane] dataplane configured to apply in background every %v or every %d calls to ApplyDataPlane()", dp.ApplyInterval, dp.ApplyMaxBatches)
dp.updatePodCache = newUpdatePodCache(cfg.ApplyMaxBatches)
if dp.ApplyMaxBatches <= 0 || dp.ApplyInterval == 0 {
return nil, ErrInvalidApplyConfig
}
} else {
klog.Info("[DataPlane] dataplane configured to NOT apply in background")
dp.updatePodCache = newUpdatePodCache(1)
}

Expand Down Expand Up @@ -150,7 +147,6 @@ func (dp *DataPlane) FinishBootupPhase() {
dp.applyInfo.Lock()
defer dp.applyInfo.Unlock()

klog.Infof("[DataPlane] finished bootup phase")
dp.applyInfo.inBootupPhase = false
}

Expand Down Expand Up @@ -257,8 +253,6 @@ func (dp *DataPlane) AddToSets(setNames []*ipsets.IPSetMetadata, podMetadata *Po
}

if dp.shouldUpdatePod() && podMetadata.NodeName == dp.nodeName {
klog.Infof("[DataPlane] Updating Sets to Add for pod key %s", podMetadata.PodKey)

// lock updatePodCache while reading/modifying or setting the updatePod in the cache
dp.updatePodCache.Lock()
defer dp.updatePodCache.Unlock()
Expand All @@ -279,8 +273,6 @@ func (dp *DataPlane) RemoveFromSets(setNames []*ipsets.IPSetMetadata, podMetadat
}

if dp.shouldUpdatePod() && podMetadata.NodeName == dp.nodeName {
klog.Infof("[DataPlane] Updating Sets to Remove for pod key %s", podMetadata.PodKey)

// lock updatePodCache while reading/modifying or setting the updatePod in the cache
dp.updatePodCache.Lock()
defer dp.updatePodCache.Unlock()
Expand Down Expand Up @@ -328,23 +320,18 @@ func (dp *DataPlane) ApplyDataPlane() error {
newCount := dp.applyInfo.numBatches
dp.applyInfo.Unlock()

klog.Infof("[DataPlane] [%s] new batch count: %d", contextApplyDP, newCount)

if newCount >= dp.ApplyMaxBatches {
klog.Infof("[DataPlane] [%s] applying now since reached maximum batch count: %d", contextApplyDP, newCount)
return dp.applyDataPlaneNow(contextApplyDP)
}

return nil
}

func (dp *DataPlane) applyDataPlaneNow(context string) error {
klog.Infof("[DataPlane] [ApplyDataPlane] [%s] starting to apply ipsets", context)
err := dp.ipsetMgr.ApplyIPSets()
if err != nil {
return fmt.Errorf("[DataPlane] [%s] error while applying IPSets: %w", context, err)
}
klog.Infof("[DataPlane] [ApplyDataPlane] [%s] finished applying ipsets", context)

// see comment in RemovePolicy() for why this is here
dp.setRemovePolicyFailure(false)
Expand All @@ -365,23 +352,18 @@ func (dp *DataPlane) applyDataPlaneNow(context string) error {
}
dp.updatePodCache.Unlock()

klog.Infof("[DataPlane] [ApplyDataPlane] [%s] refreshing endpoints before updating pods", context)

err := dp.refreshPodEndpoints()
if err != nil {
metrics.SendErrorLogAndMetric(util.DaemonDataplaneID, "[DataPlane] failed to refresh endpoints while updating pods. err: [%s]", err.Error())
// return as success since this can be retried irrespective of other operations
return nil
}

klog.Infof("[DataPlane] [ApplyDataPlane] [%s] refreshed endpoints", context)

// lock updatePodCache while driving goal state to kernel
// prevents another ApplyDataplane call from updating the same pods
dp.updatePodCache.Lock()
defer dp.updatePodCache.Unlock()

klog.Infof("[DataPlane] [ApplyDataPlane] [%s] starting to update pods", context)
for !dp.updatePodCache.isEmpty() {
pod := dp.updatePodCache.dequeue()
if pod == nil {
Expand All @@ -398,16 +380,12 @@ func (dp *DataPlane) applyDataPlaneNow(context string) error {
continue
}
}

klog.Infof("[DataPlane] [ApplyDataPlane] [%s] finished updating pods", context)
}
return nil
}

// AddPolicy takes in a translated NPMNetworkPolicy object and applies on dataplane
func (dp *DataPlane) AddPolicy(policy *policies.NPMNetworkPolicy) error {
klog.Infof("[DataPlane] Add Policy called for %s", policy.PolicyKey)

if !dp.netPolInBackground {
return dp.addPolicies([]*policies.NPMNetworkPolicy{policy})
}
Expand All @@ -420,10 +398,7 @@ func (dp *DataPlane) AddPolicy(policy *policies.NPMNetworkPolicy) error {
dp.netPolQueue.enqueue(policy)
newCount := dp.netPolQueue.len()

klog.Infof("[DataPlane] [%s] new pending netpol count: %d", contextAddNetPol, newCount)

if newCount >= dp.MaxPendingNetPols {
klog.Infof("[DataPlane] [%s] applying now since reached maximum batch count: %d", contextAddNetPol, newCount)
dp.addPoliciesWithRetry(contextAddNetPol)
}
return nil
Expand All @@ -433,25 +408,21 @@ func (dp *DataPlane) AddPolicy(policy *policies.NPMNetworkPolicy) error {
// The caller must lock netPolQueue.
func (dp *DataPlane) addPoliciesWithRetry(context string) {
netPols := dp.netPolQueue.dump()
klog.Infof("[DataPlane] adding policies %+v", netPols)

err := dp.addPolicies(netPols)
if err == nil {
// clear queue and return on success
klog.Infof("[DataPlane] [%s] added policies successfully", context)
dp.netPolQueue.clear()
return
}

klog.Errorf("[DataPlane] [%s] failed to add policies. will retry one policy at a time. err: %s", context, err.Error())
metrics.SendErrorLogAndMetric(util.DaemonDataplaneID, "[DataPlane] [%s] failed to add policies. err: %s", context, err.Error())

// retry one policy at a time
for _, netPol := range netPols {
err = dp.addPolicies([]*policies.NPMNetworkPolicy{netPol})
if err == nil {
// remove from queue on success
klog.Infof("[DataPlane] [%s] added policy successfully one at a time. policyKey: %s", context, netPol.PolicyKey)
dp.netPolQueue.delete(netPol.PolicyKey)
} else {
// keep in queue on failure
Expand All @@ -469,7 +440,6 @@ func (dp *DataPlane) addPolicies(netPols []*policies.NPMNetworkPolicy) error {
}

if len(netPols) == 0 {
klog.Infof("[DataPlane] expected to have at least one NetPol in dp.addPolicies()")
return nil
}

Expand Down Expand Up @@ -506,14 +476,12 @@ func (dp *DataPlane) addPolicies(netPols []*policies.NPMNetworkPolicy) error {
// Create and add references for Selector IPSets first
err := dp.createIPSetsAndReferences(netPol.AllPodSelectorIPSets(), netPol.PolicyKey, ipsets.SelectorType)
if err != nil {
klog.Infof("[DataPlane] error while adding Selector IPSet references: %s", err.Error())
return fmt.Errorf("[DataPlane] error while adding Selector IPSet references: %w", err)
}

// Create and add references for Rule IPSets
err = dp.createIPSetsAndReferences(netPol.RuleIPSets, netPol.PolicyKey, ipsets.NetPolType)
if err != nil {
klog.Infof("[DataPlane] error while adding Rule IPSet references: %s", err.Error())
return fmt.Errorf("[DataPlane] error while adding Rule IPSet references: %w", err)
}

Expand All @@ -526,15 +494,11 @@ func (dp *DataPlane) addPolicies(netPols []*policies.NPMNetworkPolicy) error {
// increment batch and apply IPSets if needed
dp.applyInfo.numBatches++
newCount := dp.applyInfo.numBatches
klog.Infof("[DataPlane] [%s] new batch count: %d", contextAddNetPolBootup, newCount)
if newCount >= dp.ApplyMaxBatches {
klog.Infof("[DataPlane] [%s] applying now since reached maximum batch count: %d", contextAddNetPolBootup, newCount)
klog.Infof("[DataPlane] [%s] starting to apply ipsets", contextAddNetPolBootup)
err = dp.ipsetMgr.ApplyIPSets()
if err != nil {
return fmt.Errorf("[DataPlane] [%s] error while applying IPSets: %w", contextAddNetPolBootup, err)
}
klog.Infof("[DataPlane] [%s] finished applying ipsets", contextAddNetPolBootup)

// see comment in RemovePolicy() for why this is here
dp.setRemovePolicyFailure(false)
Expand Down Expand Up @@ -574,8 +538,6 @@ func (dp *DataPlane) addPolicies(netPols []*policies.NPMNetworkPolicy) error {

// RemovePolicy takes in network policyKey (namespace/name of network policy) and removes it from dataplane and cache
func (dp *DataPlane) RemovePolicy(policyKey string) error {
klog.Infof("[DataPlane] Remove Policy called for %s", policyKey)

if dp.netPolInBackground {
// make sure to not add this NetPol if we're deleting it
// hold the lock for the rest of this function so that we don't contend or have races with the background NetPol thread
Expand Down Expand Up @@ -650,7 +612,6 @@ func (dp *DataPlane) RemovePolicy(policyKey string) error {
// UpdatePolicy takes in updated policy object, calculates the delta and applies changes
// onto dataplane accordingly
func (dp *DataPlane) UpdatePolicy(policy *policies.NPMNetworkPolicy) error {
klog.Infof("[DataPlane] Update Policy called for %s", policy.PolicyKey)
ok := dp.policyMgr.PolicyExists(policy.PolicyKey)
if !ok {
klog.Infof("[DataPlane] Policy %s is not found.", policy.PolicyKey)
Expand Down
11 changes: 0 additions & 11 deletions npm/pkg/dataplane/dpshim/dirtycache.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
package dpshim

import "k8s.io/klog"

type dirtyCache struct {
toAddorUpdateSets map[string]struct{}
toDeleteSets map[string]struct{}
Expand All @@ -19,12 +17,10 @@ func newDirtyCache() *dirtyCache {
}

func (dc *dirtyCache) clearCache() {
klog.Infof("Clearing dirty cache")
dc.toAddorUpdateSets = make(map[string]struct{})
dc.toDeleteSets = make(map[string]struct{})
dc.toAddorUpdatePolicies = make(map[string]struct{})
dc.toDeletePolicies = make(map[string]struct{})
dc.printContents()
}

func (dc *dirtyCache) modifyAddorUpdateSets(setName string) {
Expand All @@ -51,10 +47,3 @@ func (dc *dirtyCache) hasContents() bool {
return len(dc.toAddorUpdateSets) > 0 || len(dc.toDeleteSets) > 0 ||
len(dc.toAddorUpdatePolicies) > 0 || len(dc.toDeletePolicies) > 0
}

func (dc *dirtyCache) printContents() {
klog.Infof("toAddorUpdateSets: %v", dc.toAddorUpdateSets)
klog.Infof("toDeleteSets: %v", dc.toDeleteSets)
klog.Infof("toAddorUpdatePolicies: %v", dc.toAddorUpdatePolicies)
klog.Infof("toDeletePolicies: %v", dc.toDeletePolicies)
}
13 changes: 0 additions & 13 deletions npm/pkg/dataplane/dpshim/dpshim.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ func (dp *DPShim) HydrateClients() (*protos.Events, error) {
defer dp.unlock()

if len(dp.setCache) == 0 && len(dp.policyCache) == 0 {
klog.Infof("HydrateClients: No local cache objects to hydrate daemon client")
return nil, nil
}

Expand All @@ -81,7 +80,6 @@ func (dp *DPShim) HydrateClients() (*protos.Events, error) {
}

if len(goalStates) == 0 {
klog.Info("HydrateClients: No changes to apply")
return nil, nil
}

Expand Down Expand Up @@ -137,14 +135,12 @@ func (dp *DPShim) DeleteIPSet(setMetadata *ipsets.IPSetMetadata, _ util.DeleteOp

func (dp *DPShim) deleteIPSet(setMetadata *ipsets.IPSetMetadata) {
setName := setMetadata.GetPrefixName()
klog.Infof("deleteIPSet: cleaning up %s", setName)
set, ok := dp.setCache[setName]
if !ok {
return
}

if set.HasReferences() {
klog.Infof("deleteIPSet: ignore delete since set: %s has references", setName)
return
}

Expand All @@ -161,7 +157,6 @@ func (dp *DPShim) AddToSets(setMetadatas []*ipsets.IPSetMetadata, podMetadata *d
defer dp.unlock()

for _, set := range setMetadatas {
klog.Infof("AddToSets: Adding pod IP: %s, Key: %s, to set %s", podMetadata.PodIP, podMetadata.PodKey, set.GetPrefixName())
prefixedSetName := set.GetPrefixName()
if !dp.setExists(prefixedSetName) {
dp.createIPSet(set)
Expand Down Expand Up @@ -192,7 +187,6 @@ func (dp *DPShim) RemoveFromSets(setMetadatas []*ipsets.IPSetMetadata, podMetada
defer dp.unlock()

for _, set := range setMetadatas {
klog.Infof("RemoveFromSets: removing pod ip: %s, podkey: %s, from set %s ", podMetadata.PodIP, podMetadata.PodKey, set.GetPrefixName())
prefixedSetName := set.GetPrefixName()
if !dp.setExists(prefixedSetName) {
continue
Expand All @@ -209,8 +203,6 @@ func (dp *DPShim) RemoveFromSets(setMetadatas []*ipsets.IPSetMetadata, podMetada
continue
}
if cachedPod.PodKey != podMetadata.PodKey {
klog.Infof("DeleteFromSet: PodOwner has changed for Ip: %s, setName:%s, Old podKey: %s, new podKey: %s. Ignore the delete as this is stale update",
cachedPod.PodIP, prefixedSetName, cachedPod.PodKey, podMetadata.PodKey)
continue
}

Expand Down Expand Up @@ -400,12 +392,9 @@ func (dp *DPShim) ApplyDataPlane() error {

// check dirty cache contents
if !dp.dirtyCache.hasContents() {
klog.Info("ApplyDataPlane: No changes to apply")
return nil
}

dp.dirtyCache.printContents()

goalStates := make(map[string]*protos.GoalState)

toApplySets, err := dp.processIPSetsApply()
Expand Down Expand Up @@ -441,7 +430,6 @@ func (dp *DPShim) ApplyDataPlane() error {
}

if len(goalStates) == 0 {
klog.Info("ApplyDataPlane: No changes to apply")
return nil
}

Expand Down Expand Up @@ -629,7 +617,6 @@ func (dp *DPShim) deleteUnusedSets(stopChannel <-chan struct{}) {
case <-stopChannel:
return
case <-ticker.C:
klog.Info("deleteUnusedSets: cleaning up unused sets")
dp.checkSetReferences()
err := dp.ApplyDataPlane()
if err != nil {
Expand Down
11 changes: 0 additions & 11 deletions npm/pkg/dataplane/ipsets/ipsetmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ func (iMgr *IPSetManager) Reconcile() {
}
numRemovedSets := originalNumSets - len(iMgr.setMap)
if numRemovedSets > 0 {
klog.Infof("[IPSetManager] removed %d empty/unreferenced ipsets, updating toDeleteCache to: %+v", numRemovedSets, iMgr.dirtyCache.printDeleteCache())
}
}

Expand Down Expand Up @@ -308,10 +307,6 @@ func (iMgr *IPSetManager) RemoveFromSets(removeFromSets []*IPSetMetadata, ip, po
}
// in case the IP belongs to a new Pod, then ignore this Delete call as this might be stale
if cachedPodKey != podKey {
klog.Infof(
"[IPSetManager] DeleteFromSet: PodOwner has changed for Ip: %s, setName:%s, Old podKey: %s, new podKey: %s. Ignore the delete as this is stale update",
ip, prefixedName, cachedPodKey, podKey,
)
continue
}

Expand Down Expand Up @@ -453,14 +448,8 @@ func (iMgr *IPSetManager) ApplyIPSets() error {
defer iMgr.Unlock()

if iMgr.dirtyCache.numSetsToAddOrUpdate() == 0 && iMgr.dirtyCache.numSetsToDelete() == 0 {
klog.Info("[IPSetManager] No IPSets to apply")
return nil
}

klog.Infof(
"[IPSetManager] dirty caches. toAddUpdateCache: %s, toDeleteCache: %s",
iMgr.dirtyCache.printAddOrUpdateCache(), iMgr.dirtyCache.printDeleteCache(),
)
iMgr.sanitizeDirtyCache()

// Call the appropriate apply ipsets
Expand Down
Loading
Loading