Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Observability] Intra cloud connectivity check #507

Open
wants to merge 52 commits into
base: feature/observability-azure
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
5755161
initial version for check tag
J-467 Oct 7, 2024
a1d0463
check tags: orchestrator
J-467 Oct 7, 2024
d79e8cb
client and cli integration
J-467 Oct 7, 2024
510ca65
fixing orchestrator setup
J-467 Oct 7, 2024
52facee
cli, orchestrator, and client fixes
J-467 Oct 10, 2024
0f01728
getTagWithName function
J-467 Oct 10, 2024
f86c0b5
Check resource setup
J-467 Oct 22, 2024
9f82904
Modularizing check Resource in orchestrator
J-467 Oct 22, 2024
8088f6b
Fix resource (orchestrator, client, cli)
J-467 Oct 22, 2024
a85e4d9
Cleaning up check and fix; plugin handles fix
J-467 Nov 3, 2024
83b0103
isolating fixReq and fixResp protobuf
J-467 Nov 13, 2024
5487dba
Check and fix for resource
J-467 Nov 17, 2024
cc87aac
check and fix for resource_exist
J-467 Nov 17, 2024
932edbf
Deleting unused functions; fixing lint
J-467 Nov 18, 2024
74249eb
Renaming fix variable to convey attempt
J-467 Nov 18, 2024
3786c8b
Renaming check tests to signify infra check
J-467 Nov 18, 2024
20950b6
initial version for check tag
J-467 Oct 7, 2024
40e6372
check tags: orchestrator
J-467 Oct 7, 2024
49beee2
client and cli integration
J-467 Oct 7, 2024
a42cdb0
fixing orchestrator setup
J-467 Oct 7, 2024
958473e
cli, orchestrator, and client fixes
J-467 Oct 10, 2024
ffa9fa9
getTagWithName function
J-467 Oct 10, 2024
5b3afc7
Check resource setup
J-467 Oct 22, 2024
d5f0219
Modularizing check Resource in orchestrator
J-467 Oct 22, 2024
3ebb98e
Fix resource (orchestrator, client, cli)
J-467 Oct 22, 2024
4d64715
Cleaning up check and fix; plugin handles fix
J-467 Nov 3, 2024
3fd3e8d
isolating fixReq and fixResp protobuf
J-467 Nov 13, 2024
4bfb36b
Check and fix for resource
J-467 Nov 17, 2024
70f7d84
check and fix for resource_exist
J-467 Nov 17, 2024
43db5ed
Permit list check for resources
J-467 Nov 17, 2024
777b64e
Deleting unused functions; fixing lint
J-467 Nov 18, 2024
1620e3c
Renaming fix variable to convey attempt
J-467 Nov 18, 2024
76fb774
Renaming check tests to signify infra check
J-467 Nov 18, 2024
5949e67
Permit list check for resources
J-467 Nov 17, 2024
3823ffd
Renaming fix variable
J-467 Nov 18, 2024
590e7dd
Merge branch 'juliantk/check-tags' into juliantk/check-permit-lists
J-467 Nov 18, 2024
f574069
Deleting unused proto message
J-467 Nov 19, 2024
2bee8d6
Permit list targets check
J-467 Nov 18, 2024
92db514
Modifying check response to use maps
J-467 Nov 19, 2024
5255d67
Merge branch 'juliantk/check-tags' into juliantk/check-permit-lists
J-467 Nov 20, 2024
dbecda2
Refactoring code to use maps
J-467 Nov 20, 2024
c0df4f9
Adding messages to check results
J-467 Nov 21, 2024
ad6047f
Merge branch 'juliantk/check-permit-lists' into juliantk/permit-list-…
J-467 Nov 21, 2024
5a5ee22
Permit list target refactor
J-467 Nov 21, 2024
0ae42bf
Adding returns for public cloud and multi cloud
J-467 Nov 23, 2024
9961010
Checking for NAT in public conns
J-467 Nov 23, 2024
fd9d537
Public connections and fixing resource check
J-467 Dec 12, 2024
37f48e4
Permit list target refactor
J-467 Nov 21, 2024
fe47e89
Removing dump
J-467 Dec 27, 2024
882918b
Merge branch 'feature/observability-azure' into juliantk/permit-list-…
J-467 Dec 27, 2024
88e45f8
Merge branch 'juliantk/permit-list-targets' into juliantk/public-config
J-467 Dec 27, 2024
edf0960
Intra cloud check
J-467 Jan 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
314 changes: 295 additions & 19 deletions pkg/azure/plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,21 +85,13 @@ func (s *azurePluginServer) GetPermitList(ctx context.Context, req *paragliderpb
}
nsg := netInfo.NSG

// initialize a list of permit list rules
rules := []*paragliderpb.PermitListRule{}

// get the NSG rules
for _, rule := range nsg.Properties.SecurityRules {
if !strings.HasPrefix(*rule.Name, denyAllNsgRulePrefix) && strings.HasPrefix(*rule.Name, paragliderPrefix) {
plRule, err := azureHandler.GetPermitListRuleFromNSGRule(rule)
if err != nil {
utils.Log.Printf("An error occured while getting Paraglider rule from NSG rule: %+v", err)
return nil, err
}
plRule.Name = getRuleNameFromNSGRuleName(plRule.Name)
rules = append(rules, plRule)
}
rules, err := getPermitListsFromRules(azureHandler, nsg.Properties.SecurityRules, true)
if err != nil {
utils.Log.Printf("An error occured while getting permit list rules:%+v", err)
return nil, err
}

return &paragliderpb.GetPermitListResponse{Rules: rules}, nil
}

Expand Down Expand Up @@ -847,30 +839,50 @@ func (s *azurePluginServer) AttachResource(ctx context.Context, attachResourceRe
return &paragliderpb.AttachResourceResponse{Name: *resource.Name, Uri: *resource.ID, Ip: networkInfo.Address}, nil
}

// ValidateResource reports whether the resource identified by req.Uri can be
// found in Azure.
//
// An error is returned only when the URI cannot be parsed or the Azure handler
// cannot be set up. Any error from the existence lookup itself is reported as
// Validated: false with a nil error.
func (s *azurePluginServer) ValidateResource(ctx context.Context, req *paragliderpb.ValidateResourceRequest) (*paragliderpb.ValidateResourceResponse, error) {
	idInfo, parseErr := getResourceIDInfo(req.Uri)
	if parseErr != nil {
		utils.Log.Printf("An error occured while getting resource id info:%+v", parseErr)
		return nil, parseErr
	}

	// An empty namespace is passed: the handler is only used to look the
	// resource up, and the namespace doesn't matter for that.
	azureHandler, setupErr := s.setupAzureHandler(idInfo, "")
	if setupErr != nil {
		return nil, setupErr
	}

	// The looked-up resource is discarded; only success or failure matters.
	_, lookupErr := ValidateResourceExists(ctx, azureHandler, req.Uri)
	return &paragliderpb.ValidateResourceResponse{Validated: lookupErr == nil}, nil
}

func (s *azurePluginServer) CheckResource(ctx context.Context, checkReq *paragliderpb.CheckResourceRequest) (*paragliderpb.CheckResourceResponse, error) {
resp := &paragliderpb.CheckResourceResponse{}
resourceId := checkReq.GetResource()
namespace := checkReq.GetNamespace()
attemptFix := checkReq.GetAttemptFix()

checks := make(map[int32]*paragliderpb.CheckResult)

resourceIdInfo, err := getResourceIDInfo(resourceId)
if err != nil {
return resp, err
}

handler, err := s.setupAzureHandler(resourceIdInfo, namespace)
if err != nil {
return resp, err
}


// Resource Exists Check
_, err = ValidateResourceExists(ctx, handler, resourceId)
if err != nil {
// todo: Do this check in a different way
if strings.Contains(err.Error(), "ResourceNotFound") {
// todo: Do this check in a different way. This format is based on return text in err msg
errorText := textBetween(err.Error(), "ERROR CODE", "\n")
if strings.Contains(errorText, "NotFound") {
checks[int32(paragliderpb.CheckCode_Resource_Exists)] = &paragliderpb.CheckResult{
Status: paragliderpb.CheckStatus_FAIL,
}
Expand All @@ -885,7 +897,8 @@ func (s *azurePluginServer) CheckResource(ctx context.Context, checkReq *paragli
// Get Network Information
networkInfo, err := GetNetworkInfoFromResource(ctx, handler, resourceId)
if err != nil {
if strings.Contains(err.Error(), "ResourceNotFound") {
errorText := textBetween(err.Error(), "ERROR CODE", "\n")
if strings.Contains(errorText, "NotFound") {
if strings.HasPrefix(err.Error(), "NIC") {
checks[int32(paragliderpb.CheckCode_Network_Exists)] = &paragliderpb.CheckResult{
Status: paragliderpb.CheckStatus_FAIL,
Expand Down Expand Up @@ -956,10 +969,273 @@ func (s *azurePluginServer) CheckResource(ctx context.Context, checkReq *paragli
}

// Permit List Targets Check
status, msgs, publicCloudConn, _, peerVnets, targets, err := s.CheckPermitLists(ctx, handler, resourceId, networkInfo, checkReq.Namespace, attemptFix)
if err != nil {
// todo: should we return an error here?
utils.Log.Printf("An error occured while checking permit lists:%+v", err)
return nil, err
}
checks[int32(paragliderpb.CheckCode_PermitListTargets)] = &paragliderpb.CheckResult{Status: status, Messages: msgs}

// Public Cloud Connection Configuration Check
// todo: Maybe we need to check for more than NAT?
hasNAT := false
createdNAT := false
if publicCloudConn {
if attemptFix {
_, err = getOrCreateNatGateway(ctx, handler, namespace, *vnet.Location)
createdNAT = (err == nil)
} else {
natGatewayName := getNatGatewayName(namespace, *vnet.Location)
_, err = handler.GetNatGateway(ctx, natGatewayName)
hasNAT = (err == nil)
}
}

if !publicCloudConn || publicCloudConn && hasNAT {
checks[int32(paragliderpb.CheckCode_PublicConnectionsConfigured)] = &paragliderpb.CheckResult{
Status: paragliderpb.CheckStatus_OK,
}
} else if createdNAT {
checks[int32(paragliderpb.CheckCode_PublicConnectionsConfigured)] = &paragliderpb.CheckResult{
Status: paragliderpb.CheckStatus_FIXED,
}
} else {
// Resource should have a NAT if it references a public IP
checks[int32(paragliderpb.CheckCode_PublicConnectionsConfigured)] = &paragliderpb.CheckResult{
Status: paragliderpb.CheckStatus_FAIL,
Messages: []string{"Error with Network Address Translator:\n", err.Error()},
}
}

// Intra-cloud Check
peerings := vnet.Properties.VirtualNetworkPeerings
checks[int32(paragliderpb.CheckCode_IntraCloudConnectionsConfigured)] = &paragliderpb.CheckResult{
Messages: make([]string, 0),
}
for _, peering := range peerings {
remoteId := *peering.Properties.RemoteVirtualNetwork.ID
brokenPeering := false
target := ""
peeringCloudInfo, ok := peerVnets[remoteId]
if ok {
if *peering.Properties.PeeringState != armnetwork.VirtualNetworkPeeringStateConnected {
brokenPeering = true
target = targets[peeringCloudInfo]
} else {
// Remove the peer vnet from the map to signify that it is connected
delete(peerVnets, remoteId)
}
}

if brokenPeering {
if attemptFix {
err = s.createPeering(ctx, *handler, resourceIdInfo, *vnet.Name, peeringCloudInfo, target)
if err == nil {
delete(peerVnets, remoteId)
} else {
checks[int32(paragliderpb.CheckCode_IntraCloudConnectionsConfigured)].Messages = append(checks[int32(paragliderpb.CheckCode_IntraCloudConnectionsConfigured)].Messages, "Error creating peering to vnet")
}
} else {
checks[int32(paragliderpb.CheckCode_IntraCloudConnectionsConfigured)].Messages = append(checks[int32(paragliderpb.CheckCode_IntraCloudConnectionsConfigured)].Messages, "Peering to vnet not connected")
}
}
}

// All existing and connected peer vnets should be deleted from the peer vnets map
if len(peerVnets) > 0 {
checks[int32(paragliderpb.CheckCode_IntraCloudConnectionsConfigured)].Status = paragliderpb.CheckStatus_FAIL
} else if attemptFix {
checks[int32(paragliderpb.CheckCode_IntraCloudConnectionsConfigured)].Status = paragliderpb.CheckStatus_FIXED
} else {
checks[int32(paragliderpb.CheckCode_IntraCloudConnectionsConfigured)].Status = paragliderpb.CheckStatus_OK
}

resp.Checks = checks
return resp, nil
}

// CheckPermitLists checks that the targets of the resource's permit list rules
// exist and reports whether the resource has any public cloud or multicloud
// connection.
//
// The rules are read from networkInfo.NSG. Each target that has a matching tag
// is resolved to a URI through the orchestrator. Azure targets are validated
// here; targets in other clouds are validated by the orchestrator. When
// attemptFix is true, rules whose target no longer exists are deleted from the
// NSG, including the rule for the same IP in the opposite direction.
//
// Returns:
//   - status: OK when no missing target was found, FIXED when every rule with a
//     missing target was deleted successfully, FAIL otherwise. Once a failure
//     is recorded, a later successful deletion does not reset it to FIXED.
//   - messages: one entry per missing target or failed deletion.
//   - publicCloud: true when some target maps to no peering cloud info (a
//     public IP).
//   - multiCloud: true when a validated target lives in another cloud.
//   - peerVnets: peer vnet ID -> peering cloud info, for validated Azure
//     targets outside the local vnet address spaces.
//   - targets: peering cloud info -> target IP for the same set of peers.
//   - err: non-nil when a lookup needed by the check itself failed; status is
//     FAIL and messages is nil in that case.
func (s *azurePluginServer) CheckPermitLists(
	ctx context.Context,
	handler *AzureSDKHandler,
	resourceID string,
	networkInfo *resourceNetworkInfo,
	namespace string,
	attemptFix bool,
) (status paragliderpb.CheckStatus, messages []string, publicCloud bool, multiCloud bool, peerVnets map[string]*utils.PeeringCloudInfo, targets map[*utils.PeeringCloudInfo]string, err error) {
	vnetName := getVnetFromSubnetId(networkInfo.SubnetID)
	vnet, err := handler.GetVirtualNetwork(ctx, vnetName)
	if err != nil {
		utils.Log.Printf("An error occured while getting vnet:%+v", err)
		return paragliderpb.CheckStatus_FAIL, nil, false, false, nil, nil, err
	}

	// Collect the address spaces of the local vnet; they are used below to
	// decide whether a peer lives in the same vnet.
	localVnetAddressSpaces := make([]string, 0, len(vnet.Properties.AddressSpace.AddressPrefixes))
	for _, addressSpace := range vnet.Properties.AddressSpace.AddressPrefixes {
		localVnetAddressSpaces = append(localVnetAddressSpaces, *addressSpace)
	}
	if len(localVnetAddressSpaces) == 0 {
		return paragliderpb.CheckStatus_FAIL, nil, false, false, nil, nil, fmt.Errorf("unable to get subnet address prefix for vnet")
	}

	// Used when fixing. Tracks IPs associated to any deleted rule
	deletedIps := map[string]bool{}
	visitedIps := map[string]bool{}
	publicCloud = false

	// Map of peer vnet ID to peering cloud info, and of peering cloud info to target IP
	peerVnets = make(map[string]*utils.PeeringCloudInfo)
	targets = make(map[*utils.PeeringCloudInfo]string)

	// Get used address spaces of all clouds
	orchestratorConn, err := grpc.NewClient(s.orchestratorServerAddr, grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		return paragliderpb.CheckStatus_FAIL, nil, false, false, peerVnets, targets, fmt.Errorf("unable to establish connection with orchestrator: %w", err)
	}
	defer orchestratorConn.Close()
	orchestratorClient := paragliderpb.NewControllerClient(orchestratorConn)
	// Use the caller's context so cancellation and deadlines propagate to the orchestrator call.
	getUsedAddressSpacesResp, err := orchestratorClient.GetUsedAddressSpaces(ctx, &emptypb.Empty{})
	if err != nil {
		return paragliderpb.CheckStatus_FAIL, nil, false, false, peerVnets, targets, fmt.Errorf("unable to get used address spaces: %w", err)
	}

	// Get permit lists for resource
	rules, err := getPermitListsFromRules(handler, networkInfo.NSG.Properties.SecurityRules, false)
	if err != nil {
		return paragliderpb.CheckStatus_FAIL, nil, false, false, peerVnets, targets, fmt.Errorf("unable to get permit lists from rules: %w", err)
	}

	status = paragliderpb.CheckStatus_OK
	messages = []string{}

	// markFixed records a successful fix without hiding a failure that was
	// already recorded for another rule.
	markFixed := func() {
		if status != paragliderpb.CheckStatus_FAIL {
			status = paragliderpb.CheckStatus_FIXED
		}
	}

	for _, rule := range rules {
		peeringCloudInfos, err := utils.GetPermitListRulePeeringCloudInfo(rule, getUsedAddressSpacesResp.AddressSpaceMappings)
		if err != nil {
			return paragliderpb.CheckStatus_FAIL, nil, publicCloud, multiCloud, peerVnets, targets, fmt.Errorf("unable to get peering cloud infos: %w", err)
		}

		for i, peeringCloudInfo := range peeringCloudInfos {
			// If the rule has no tag for the target, skip
			// May be because the resource was attached and Paraglider does not know about targets
			if i >= len(rule.Tags) {
				continue
			}

			peerTag := rule.Tags[i]
			peerIp := rule.Targets[i]
			// For check, no need to check the same IP twice if both inbound and outbound rules exist
			// For fix, both inbound and outbound rules need to be deleted
			if visitedIps[peerIp] && !attemptFix {
				continue // Skip if the IP has already been visited and checked
			}

			if peeringCloudInfo == nil {
				// Public IP
				publicCloud = true
			} else {
				// If a deleted IP is seen in another rule, it means the rule is
				// in the opposite direction(in vs outbound) and should also be deleted
				if deletedIps[peerIp] && attemptFix {
					if delErr := handler.DeleteSecurityRule(ctx, *networkInfo.NSG.Name, rule.Name); delErr == nil {
						markFixed()
					} else {
						status = paragliderpb.CheckStatus_FAIL
						messages = append(messages, fmt.Sprintf("Failed to delete permit list to resource: %s", peerTag))
					}
					continue
				}

				// Get the URI for the peered resource
				var uriResp *paragliderpb.RetrieveUriResponse
				if peeringCloudInfo.Cloud == utils.AZURE {
					// Azure will handle validation of azure resources
					uriReq := &paragliderpb.RetrieveUriRequest{TagName: peerTag, Cloud: utils.AZURE, ShouldValidate: false}
					uriResp, err = orchestratorClient.RetrieveUriFromTag(ctx, uriReq)
					if err != nil {
						return paragliderpb.CheckStatus_FAIL, nil, publicCloud, multiCloud, peerVnets, targets, fmt.Errorf("unable to get uri from ip: %w", err)
					}
					peerUri := uriResp.Uri
					peerInfo, err := getResourceIDInfo(peerUri)
					if err != nil {
						utils.Log.Printf("An error occured while getting resource id info:%+v", err)
						return paragliderpb.CheckStatus_FAIL, nil, publicCloud, multiCloud, peerVnets, targets, err
					}
					// The namespace doesn't matter for this peer handler setup
					// because this handler is setup to only validate if the resource exists on the cloud
					peerHandler, err := s.setupAzureHandler(peerInfo, namespace)
					if err != nil {
						return paragliderpb.CheckStatus_FAIL, nil, publicCloud, multiCloud, peerVnets, targets, err
					}

					// Any lookup error leaves the peer unvalidated
					if _, existsErr := ValidateResourceExists(ctx, peerHandler, peerUri); existsErr == nil {
						uriResp.Validated = true
					}

					// Get the Peer Vnet for peering validation if peer is validated
					if uriResp.Validated {
						isLocal, err := utils.IsPermitListRuleTagInAddressSpace(peerIp, localVnetAddressSpaces)
						if err != nil {
							return paragliderpb.CheckStatus_FAIL, nil, publicCloud, multiCloud, peerVnets, targets, err
						}
						if !isLocal {
							// Get peer Vnet
							peerNetworkInfo, err := GetNetworkInfoFromResource(ctx, peerHandler, peerUri)
							if err != nil {
								return paragliderpb.CheckStatus_FAIL, nil, publicCloud, multiCloud, peerVnets, targets, err
							}
							peerVnetName := getVnetFromSubnetId(peerNetworkInfo.SubnetID)
							peerVnet, err := peerHandler.GetVirtualNetwork(ctx, peerVnetName)
							if err != nil {
								return paragliderpb.CheckStatus_FAIL, nil, publicCloud, multiCloud, peerVnets, targets, err
							}
							peerVnets[*peerVnet.ID] = peeringCloudInfo
							targets[peeringCloudInfo] = peerIp
						}
					}
				} else {
					// External cloud should validate that resource exists
					// NOTE(review): Cloud is utils.AZURE here even though the peer is in
					// another cloud — confirm this field is meant to be the requesting cloud.
					uriReq := &paragliderpb.RetrieveUriRequest{TagName: peerTag, Cloud: utils.AZURE, ShouldValidate: true}
					uriResp, err = orchestratorClient.RetrieveUriFromTag(ctx, uriReq)
					if err != nil {
						return paragliderpb.CheckStatus_FAIL, nil, publicCloud, multiCloud, peerVnets, targets, fmt.Errorf("unable to get uri from ip: %w", err)
					}
					// Multi connection exists if cross-cloud resource is validated/exists
					multiCloud = multiCloud || uriResp.Validated
				}

				if !uriResp.Validated {
					// The peered resource doesn't exist
					if attemptFix {
						// Attempt fixing by deleting the rule
						if delErr := handler.DeleteSecurityRule(ctx, *networkInfo.NSG.Name, rule.Name); delErr == nil {
							markFixed()
							deletedIps[peerIp] = true
						} else {
							// Add error message if fix failed
							status = paragliderpb.CheckStatus_FAIL
							messages = append(messages, fmt.Sprintf("Failed to delete permit list to resource: %s", peerTag))
						}
					} else {
						status = paragliderpb.CheckStatus_FAIL
						messages = append(messages, fmt.Sprintf("Peered resource doesn't exist: %s", peerTag))
					}
				}
			}

			visitedIps[peerIp] = true
		}
	}

	return status, messages, publicCloud, multiCloud, peerVnets, targets, nil
}

func Setup(port int, orchestratorServerAddr string) *azurePluginServer {
lis, err := net.Listen("tcp", fmt.Sprintf("localhost:%d", port))
if err != nil {
Expand Down
Loading