From f86025b7cd6a15fd6926a7c0df8f877382e286d2 Mon Sep 17 00:00:00 2001 From: Praveen Raghav Date: Tue, 24 Nov 2020 12:32:10 +0530 Subject: [PATCH 01/13] added cli var for ecs instance draining --- cloudlift/config/environment_configuration.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/cloudlift/config/environment_configuration.py b/cloudlift/config/environment_configuration.py index 9dda5c61..c52abc7b 100644 --- a/cloudlift/config/environment_configuration.py +++ b/cloudlift/config/environment_configuration.py @@ -128,6 +128,9 @@ def _create_config(self): cluster_min_instances = prompt("Min instances in cluster", default=1) cluster_max_instances = prompt("Max instances in cluster", default=5) cluster_instance_type = prompt("Instance type", default='m5.xlarge') + topic_name = prompt("SNS Topic Name") + heartbeat_timeout = prompt("Timeout for Lifecycle Hook", default=300) + lifecycle_hook_name = prompt("Lifecycle Hook name") key_name = prompt("SSH key name") notifications_arn = prompt("Notification SNS ARN") ssl_certificate_arn = prompt("SSL certificate ARN") @@ -167,6 +170,11 @@ def _create_config(self): "environment": { "notifications_arn": notifications_arn, "ssl_certificate_arn": ssl_certificate_arn + }, + "draining":{ + "topic_name": topic_name, + "heartbeat_timeout": heartbeat_timeout, + "lifecycle_hook_name": lifecycle_hook_name } } } @@ -268,6 +276,19 @@ def _validate_changes(self, configuration): "ssl_certificate_arn" ] }, + "draining": { + "type": "object", + "properties": { + "topic_name": {"type": "string"}, + "heartbeat_timeout": {"type": "integer"}, + "lifecycle_hook_name": {"type": "string"}, + }, + "required": [ + "topic_name", + "heartbeat_timeout", + "lifecycle_hook_name" + ] + }, "region": {"type": "string"}, "vpc": { "type": "object", From faaa5174acda1bc27af1f768666068dde786d836 Mon Sep 17 00:00:00 2001 From: Praveen Raghav Date: Tue, 24 Nov 2020 12:35:56 +0530 Subject: [PATCH 02/13] Lambda code for ecs instance draining --- .../ecs_instance_draining_lambda.py | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 cloudlift/deployment/ecs_instance_draining_lambda.py diff --git a/cloudlift/deployment/ecs_instance_draining_lambda.py b/cloudlift/deployment/ecs_instance_draining_lambda.py new file mode 100644 index 00000000..5a745fd5 --- /dev/null +++ b/cloudlift/deployment/ecs_instance_draining_lambda.py @@ -0,0 +1,61 @@ +import json +import time +import boto3 +import os + +ECS = boto3.client('ecs') +ASG = boto3.client('autoscaling') +SNS = boto3.client('sns') + +def find_ecs_instance_info(instance_id,cluster_name): + paginator = ECS.get_paginator('list_container_instances') + for list_resp in paginator.paginate(cluster=cluster_name): + arns = list_resp['containerInstanceArns'] + desc_resp = ECS.describe_container_instances(cluster=cluster_name, + containerInstances=arns) + for container_instance in desc_resp['containerInstances']: + if container_instance['ec2InstanceId'] != instance_id: + continue + print('Found instance: id=%s, arn=%s, status=%s, runningTasksCount=%s' % + (instance_id, container_instance['containerInstanceArn'], + container_instance['status'], container_instance['runningTasksCount'])) + return (container_instance['containerInstanceArn'], + container_instance['status'], container_instance['runningTasksCount']) + return None, None, 0 + +def instance_has_running_tasks(instance_id,cluster_name): + (instance_arn, container_status, running_tasks) = find_ecs_instance_info(instance_id,cluster_name) + if instance_arn is None: + print('Could not find instance ID %s. Letting autoscaling kill the instance.' % + (instance_id)) + return False + if container_status != 'DRAINING': + print('Setting container instance %s (%s) to DRAINING' % + (instance_id, instance_arn)) + ECS.update_container_instances_state(cluster=cluster_name, + containerInstances=[instance_arn], + status='DRAINING') + return running_tasks > 0 + +def lambda_handler(event, context): + msg = json.loads(event['Records'][0]['Sns']['Message']) + print("Event: ", msg) + if 'LifecycleTransition' not in msg.keys() or \ + msg['LifecycleTransition'].find('autoscaling:EC2_INSTANCE_TERMINATING') == -1: + print('Exiting since the lifecycle transition is not EC2_INSTANCE_TERMINATING.') + return + if instance_has_running_tasks(msg['EC2InstanceId'], msg['NotificationMetadata']): + print('Tasks are still running on instance %s; posting msg to SNS topic %s' % + (msg['EC2InstanceId'], event['Records'][0]['Sns']['TopicArn'])) + time.sleep(5) + sns_resp = SNS.publish(TopicArn=event['Records'][0]['Sns']['TopicArn'], + Message=json.dumps(msg), + Subject='Publishing SNS msg to invoke Lambda again.') + print('Posted msg %s to SNS topic.' % (sns_resp['MessageId'])) + else: + print('No tasks are running on instance %s; setting lifecycle to complete' % + (msg['EC2InstanceId'])) + ASG.complete_lifecycle_action(LifecycleHookName=msg['LifecycleHookName'], + AutoScalingGroupName=msg['AutoScalingGroupName'], + LifecycleActionResult='CONTINUE', + InstanceId=msg['EC2InstanceId']) \ No newline at end of file From 94518d023653167b03fbc4c1a42b3cc6c63d944a Mon Sep 17 00:00:00 2001 From: Praveen Raghav Date: Tue, 24 Nov 2020 12:40:26 +0530 Subject: [PATCH 03/13] ECS instance draining --- .../deployment/cluster_template_generator.py | 144 +++- .../expected_environment_template.yml | 648 ++++++++++++++++++ 2 files changed, 787 insertions(+), 5 deletions(-) create mode 100644 test/templates/expected_environment_template.yml diff --git a/cloudlift/deployment/cluster_template_generator.py b/cloudlift/deployment/cluster_template_generator.py index 327f93d0..8a027ed7 100644 --- a/cloudlift/deployment/cluster_template_generator.py +++ b/cloudlift/deployment/cluster_template_generator.py @@ -1,28 +1,32 @@ import json import re +import pathlib from cfn_flip import to_yaml from stringcase import camelcase, pascalcase from troposphere import (Base64, FindInMap, Output, Parameter, Ref, Sub, - cloudformation) + cloudformation, GetAtt, Join) from troposphere.autoscaling import (AutoScalingGroup, LaunchConfiguration, - ScalingPolicy) + ScalingPolicy, LifecycleHook) from troposphere.cloudwatch import Alarm, MetricDimension from troposphere.ec2 import (VPC, InternetGateway, NatGateway, Route, RouteTable, SecurityGroup, Subnet, SubnetRouteTableAssociation, VPCGatewayAttachment) from troposphere.ecs import Cluster from troposphere.elasticache import SubnetGroup as ElastiCacheSubnetGroup -from troposphere.iam import InstanceProfile, Role +from troposphere.iam import InstanceProfile, Role, PolicyType, Policy from troposphere.logs import LogGroup from troposphere.policies import (AutoScalingRollingUpdate, CreationPolicy, ResourceSignal) from troposphere.rds import DBSubnetGroup - +from troposphere.awslambda import Function, Code, MEMORY_VALUES, Permission from cloudlift.config import DecimalEncoder from cloudlift.config import get_client_for, get_region_for_environment from cloudlift.deployment.template_generator import TemplateGenerator from cloudlift.version import VERSION +from troposphere.sns import Subscription, Topic, SubscriptionResource +from awacs.aws import Allow, Statement, Principal, PolicyDocument +from awacs.sts import AssumeRole class ClusterTemplateGenerator(TemplateGenerator): @@ -305,6 +309,7 @@ def _add_cluster(self): cluster = Cluster('Cluster', ClusterName=Ref('AWS::StackName')) self.template.add_resource(cluster) self._add_ec2_auto_scaling() + self._add_instance_draining(cluster) self._add_cluster_alarms(cluster) return cluster @@ -540,6 +545,107 @@ def _add_ec2_auto_scaling(self): ScalingAdjustment=1 ) self.template.add_resource(self.cluster_scaling_policy) + return self.auto_scaling_group + + def _add_instance_draining(self, cluster): + self.sns_asg_role = Role( + "SNSASGRole", + AssumeRolePolicyDocument=PolicyDocument( + Statement=[ + Statement( + Effect=Allow, + Action=[AssumeRole], + Principal=Principal("Service", ["autoscaling.amazonaws.com"]) + ) + ] + ), + ManagedPolicyArns=["arn:aws:iam::aws:policy/service-role/AutoScalingNotificationAccessRole"] + ) + self.template.add_resource(self.sns_asg_role) + self.lambda_execution_role = Role( + "LambdaExecutionRole", + Policies=[Policy( + PolicyName="lambda-inline", + PolicyDocument={ + "Version": "2012-10-17", + "Statement": [{ + "Effect": "Allow", + "Action": [ + "autoscaling:CompleteLifecycleAction", + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:PutLogEvents", + "ecs:ListContainerInstances", + "ecs:DescribeContainerInstances", + "ecs:UpdateContainerInstancesState", + "sns:Publish" + ], + "Resource": "*" + }], + } + )], + AssumeRolePolicyDocument=PolicyDocument( + Statement=[ + Statement( + Effect=Allow, + Action=[AssumeRole], + Principal=Principal("Service", ["lambda.amazonaws.com"]) + ) + ] + ), + ManagedPolicyArns=["arn:aws:iam::aws:policy/service-role/AutoScalingNotificationAccessRole"] + ) + self.template.add_resource(self.lambda_execution_role) + with open (str(pathlib.Path(__file__).parent.absolute())+"/ecs_instance_draining_lambda.py", "r") as ecs_instance_draining_lambda: + lambda_code=ecs_instance_draining_lambda.readlines() + self.lambda_function_for_asg = Function( + "LambdaFunctionForASG", + Handler="index.lambda_handler", + Role=GetAtt(self.lambda_execution_role, "Arn"), + Runtime="python3.6", + MemorySize=128, + Timeout=60, + Code=Code( + ZipFile=Join("", lambda_code) + ) + ) + self.template.add_resource(self.lambda_function_for_asg) + self.asg_sns_topic = Topic( + "ASGSNSTopic", + TopicName=Ref('TopicName'), + Subscription=[Subscription( + Protocol="lambda", + Endpoint=GetAtt(self.lambda_function_for_asg, "Arn") + )] + ) + self.template.add_resource(self.asg_sns_topic) + self.lambda_invoke_permission = Permission( + "LambdaInvokePermission", + FunctionName=Ref(self.lambda_function_for_asg), + Action="lambda:InvokeFunction", + Principal="sns.amazonaws.com", + SourceArn=Ref(self.asg_sns_topic) + ) + self.template.add_resource(self.lambda_invoke_permission) + self.lambda_subscription_to_sns_topic = SubscriptionResource( + "LambdaSubscriptionToSNSTopic", + Protocol="lambda", + Endpoint=GetAtt(self.lambda_function_for_asg, "Arn"), + TopicArn=Ref(self.asg_sns_topic) + ) + self.template.add_resource(self.lambda_subscription_to_sns_topic) + self.asg_lifecycle_hook=LifecycleHook( + "ASGLifecycleHook", + AutoScalingGroupName=Ref(self.auto_scaling_group), + DefaultResult="ABANDON", + HeartbeatTimeout=Ref('HeartbeatTimeout'), + LifecycleHookName=Ref('LifecycleHookName'), + LifecycleTransition="autoscaling:EC2_INSTANCE_TERMINATING", + NotificationMetadata=Ref(cluster), + NotificationTargetARN=Ref(self.asg_sns_topic), + RoleARN=GetAtt(self.sns_asg_role, "Arn"), + ) + self.template.add_resource(self.asg_lifecycle_hook) def _add_cluster_parameters(self): self.template.add_parameter(Parameter( @@ -551,6 +657,12 @@ def _add_cluster_parameters(self): self.key_pair = Parameter( "KeyPair", Description='', Type="AWS::EC2::KeyPair::KeyName", Default="") self.template.add_parameter(self.key_pair) + self.template.add_parameter(Parameter( + "TopicName", Description='', Type="String", Default=str(self.configuration['draining']['topic_name']))) + self.template.add_parameter(Parameter( + "HeartbeatTimeout", Description='', Type="Number", Default=str(self.configuration['draining']['heartbeat_timeout']))) + self.template.add_parameter(Parameter( + "LifecycleHookName", Description='', Type="String", Default=str(self.configuration['draining']['lifecycle_hook_name']))) self.template.add_parameter(Parameter( "MinSize", Description='', Type="Number", Default=str(self.configuration['cluster']['min_instances']))) self.template.add_parameter(Parameter( @@ -582,6 +694,9 @@ def _add_cluster_outputs(self): 'max_instances': str(self.configuration['cluster']['max_instances']), 'instance_type': self.configuration['cluster']['instance_type'], 'key_name': self.configuration['cluster']['key_name'], + 'topic_name': self.configuration['draining']['topic_name'], + 'heartbeat_timeout': str(self.configuration['draining']['heartbeat_timeout']), + 'lifecycle_hook_name': self.configuration['draining']['lifecycle_hook_name'], 'cloudlift_version': VERSION } self.template.add_output(Output( @@ -667,6 +782,16 @@ def _add_metadata(self): 'NotificationSnsArn' ] }, + { + 'Label': { + 'default': 'ECS Draining Configuration' + }, + 'Parameters': [ + 'TopicName', + 'HeartbeatTimeout', + 'LifecycleHookName' + ] + } ], 'ParameterLabels': { 'Environment': { @@ -684,7 +809,7 @@ def _add_metadata(self): 'default': 'Min. no. of instances in cluster' }, 'NotificationSnsArn': { - 'default': 'The SNS topic to which notifactions has to be triggered' + 'default': 'The SNS topic to which notifications has to be triggered' }, 'Subnet1': { 'default': 'Enter the ID of the 1st subnet' @@ -695,6 +820,15 @@ def _add_metadata(self): 'VPC': { 'default': 'Enter the VPC in which you want the environment to be setup' }, + 'LifecycleHookName': { + 'default': 'Enter the name for Auto Scaling Group Lifecycle Hook' + }, + 'HeartbeatTimeout': { + 'default': 'Enter the maximum timeout in sec for lifecycle hook' + }, + 'TopicName': { + 'default': 'Enter the name for Lambda SNS topic' + } } } }) diff --git a/test/templates/expected_environment_template.yml b/test/templates/expected_environment_template.yml new file mode 100644 index 00000000..29d8ccf5 --- /dev/null +++ b/test/templates/expected_environment_template.yml @@ -0,0 +1,648 @@ +Metadata: + AWS::CloudFormation::Interface: + ParameterGroups: + - Label: + default: Cluster Configuration + Parameters: + - KeyPair + - Environment + - MinSize + - MaxSize + - InstanceType + - VPC + - Subnet1 + - Subnet2 + - NotificationSnsArn + - Label: + default: ECS Draining Configuration + Parameters: + - TopicName + - HeartbeatTimeout + - LifecycleHookName + ParameterLabels: + Environment: + default: Enter the environment e.g. dev or staging or sandbox or production + InstanceType: + default: Type of instance + KeyPair: + default: Select the key with which you want to login to the ec2 instances + MaxSize: + default: Max. no. of instances in cluster + MinSize: + default: Min. no. of instances in cluster + NotificationSnsArn: + default: The SNS topic to which notifications has to be triggered + Subnet1: + default: Enter the ID of the 1st subnet + Subnet2: + default: Enter the ID of the 2nd subnet + VPC: + default: Enter the VPC in which you want the environment to be setup + LifecycleHookName: + default: Enter the name for Auto Scaling Group Lifecycle Hook + HeartbeatTimeout: + default: Enter the maximum timeout in sec for lifecycle hook + TopicName: + default: Enter the name for Lambda SNS topic +Mappings: + AWSRegionToAMI: + ap-south-1: + AMI: ami-0c42adb42b71cacfc +Outputs: + StackId: + Description: The unique ID of the stack. To be supplied to circle CI environment + variables to validate during deployment. + Value: !Ref 'AWS::StackId' + StackName: + Description: The name of the stack + Value: !Ref 'AWS::StackName' + CloudliftOptions: + Description: Options used with cloudlift when building this cluster + Value: '{"env": "demo", "min_instances": "1", "max_instances": "2", "instance_type": + "t3a.micro", "key_name": "praveen-test", "topic_name": "DemoTest", "heartbeat_timeout": + "300", "lifecycle_hook_name": "DemoTest", "cloudlift_version": "1.4.4"}' + VPC: + Description: VPC in which environment is setup + Value: !Ref 'demoVpc' + PrivateSubnet1: + Description: ID of the 1st subnet + Value: !Ref 'demoPrivateSubnet2' + PrivateSubnet2: + Description: ID of the 2nd subnet + Value: !Ref 'demoPrivateSubnet1' + PublicSubnet1: + Description: ID of the 1st subnet + Value: !Ref 'demoPublicSubnet2' + PublicSubnet2: + Description: ID of the 2nd subnet + Value: !Ref 'demoPublicSubnet1' + AutoScalingGroup: + Description: AutoScaling group for ECS container instances + Value: !Ref 'AutoScalingGroup' + SecurityGroupAlb: + Description: Security group ID for ALB + Value: !Ref 'SecurityGroupAlb' + MinInstances: + Description: Minimum instances in cluster + Value: '1' + MaxInstances: + Description: Maximum instances in cluster + Value: '2' + InstanceType: + Description: EC2 instance type + Value: t3a.micro + KeyName: + Description: Key Pair name for accessing the instances + Value: praveen-test +Parameters: + Environment: + Description: '' + Type: String + Default: '' + KeyPair: + Description: '' + Type: AWS::EC2::KeyPair::KeyName + Default: '' + TopicName: + Description: '' + Type: String + Default: DemoTest + HeartbeatTimeout: + Description: '' + Type: Number + Default: '300' + LifecycleHookName: + Description: '' + Type: String + Default: DemoTest + MinSize: + Description: '' + Type: Number + Default: '1' + MaxSize: + Description: '' + Type: Number + Default: '2' + NotificationSnsArn: + Description: '' + Type: String + Default: arn:aws:sns:ap-south-1:259042324395:Praveen + InstanceType: + Description: '' + Type: String + Default: t3a.micro +Resources: + demoVpc: + Properties: + CidrBlock: 10.7.0.0/16 + EnableDnsSupport: 'true' + EnableDnsHostnames: 'true' + InstanceTenancy: default + Tags: + - Key: category + Value: services + - Key: environment + Value: demo + - Key: Name + Value: demo-vpc + Type: AWS::EC2::VPC + demoIg: + Properties: + Tags: + - Key: Name + Value: demo-internet-gateway + - Key: environment + Value: demo + Type: AWS::EC2::InternetGateway + demoAttachment: + Properties: + InternetGatewayId: !Ref 'demoIg' + VpcId: !Ref 'demoVpc' + Type: AWS::EC2::VPCGatewayAttachment + demoPublic: + Properties: + VpcId: !Ref 'demoVpc' + Tags: + - Key: Name + Value: demo-public + - Key: environment + Value: demo + Type: AWS::EC2::RouteTable + DependsOn: demoVpc + demoPublicSubnet1: + Properties: + AvailabilityZone: ap-south-1b + CidrBlock: 10.7.0.0/22 + VpcId: !Ref 'demoVpc' + MapPublicIpOnLaunch: 'true' + Tags: + - Key: Name + Value: demo-public-1 + - Key: environment + Value: demo + Type: AWS::EC2::Subnet + demoPublicSubnet1Assoc: + Properties: + RouteTableId: !Ref 'demoPublic' + SubnetId: !Ref 'demoPublicSubnet1' + Type: AWS::EC2::SubnetRouteTableAssociation + demoPublicSubnet2: + Properties: + AvailabilityZone: ap-south-1a + CidrBlock: 10.7.4.0/22 + VpcId: !Ref 'demoVpc' + MapPublicIpOnLaunch: 'true' + Tags: + - Key: Name + Value: demo-public-2 + - Key: environment + Value: demo + Type: AWS::EC2::Subnet + demoPublicSubnet2Assoc: + Properties: + RouteTableId: !Ref 'demoPublic' + SubnetId: !Ref 'demoPublicSubnet2' + Type: AWS::EC2::SubnetRouteTableAssociation + demoIgRoute: + Properties: + DestinationCidrBlock: '0.0.0.0/0' + GatewayId: !Ref 'demoIg' + RouteTableId: !Ref 'demoPublic' + Type: AWS::EC2::Route + demoPrivate: + Properties: + VpcId: !Ref 'demoVpc' + Tags: + - Key: Name + Value: demo-private + - Key: environment + Value: demo + Type: AWS::EC2::RouteTable + demoPrivateSubnet1: + Properties: + AvailabilityZone: ap-south-1b + CidrBlock: 10.7.8.0/22 + VpcId: !Ref 'demoVpc' + MapPublicIpOnLaunch: 'false' + Tags: + - Key: Name + Value: demo-private-1 + - Key: environment + Value: demo + Type: AWS::EC2::Subnet + demoPrivateSubnet1Assoc: + Properties: + RouteTableId: !Ref 'demoPrivate' + SubnetId: !Ref 'demoPrivateSubnet1' + Type: AWS::EC2::SubnetRouteTableAssociation + demoPrivateSubnet2: + Properties: + AvailabilityZone: ap-south-1a + CidrBlock: 10.7.12.0/22 + VpcId: !Ref 'demoVpc' + MapPublicIpOnLaunch: 'false' + Tags: + - Key: Name + Value: demo-private-2 + - Key: environment + Value: demo + Type: AWS::EC2::Subnet + demoPrivateSubnet2Assoc: + Properties: + RouteTableId: !Ref 'demoPrivate' + SubnetId: !Ref 'demoPrivateSubnet2' + Type: AWS::EC2::SubnetRouteTableAssociation + demoNat: + Properties: + AllocationId: eipalloc-0103733acf336d725 + SubnetId: !Ref 'demoPublicSubnet1' + Tags: + - Key: Name + Value: demo-nat-gateway + - Key: environment + Value: demo + Type: AWS::EC2::NatGateway + demoNatRoute: + Properties: + DestinationCidrBlock: '0.0.0.0/0' + NatGatewayId: !Ref 'demoNat' + RouteTableId: !Ref 'demoPrivate' + Type: AWS::EC2::Route + DBSubnetGroup: + Properties: + DBSubnetGroupName: demo-subnet + Tags: + - Key: category + Value: services + - Key: environment + Value: demo + DBSubnetGroupDescription: demo subnet group + SubnetIds: + - !Ref 'demoPrivateSubnet1' + - !Ref 'demoPrivateSubnet2' + Type: AWS::RDS::DBSubnetGroup + ElasticacheSubnetGroup: + Properties: + CacheSubnetGroupName: demo-subnet + Description: demo subnet group + SubnetIds: + - !Ref 'demoPrivateSubnet1' + - !Ref 'demoPrivateSubnet2' + Type: AWS::ElastiCache::SubnetGroup + demoLogGroup: + Properties: + LogGroupName: demo-logs + RetentionInDays: 365 + Type: AWS::Logs::LogGroup + Cluster: + Properties: + ClusterName: !Ref 'AWS::StackName' + Type: AWS::ECS::Cluster + ECSRole: + Properties: + Path: / + ManagedPolicyArns: + - arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role + - arn:aws:iam::aws:policy/AmazonDynamoDBReadOnlyAccess + - arn:aws:iam::aws:policy/service-role/AmazonEC2RoleforSSM + RoleName: !Sub 'ecs-${AWS::StackName}-${AWS::Region}' + AssumeRolePolicyDocument: + Statement: + - Action: + - sts:AssumeRole + Effect: Allow + Principal: + Service: + - ec2.amazonaws.com + Type: AWS::IAM::Role + InstanceProfile: + Properties: + Path: / + Roles: + - !Ref 'ECSRole' + Type: AWS::IAM::InstanceProfile + SecurityGroupAlb: + Properties: + VpcId: !Ref 'demoVpc' + GroupDescription: !Sub '${AWS::StackName}-alb' + Type: AWS::EC2::SecurityGroup + SecurityGroupEc2Hosts: + Properties: + SecurityGroupIngress: + - SourceSecurityGroupId: !Ref 'SecurityGroupAlb' + IpProtocol: -1 + VpcId: !Ref 'demoVpc' + GroupDescription: !Sub '${AWS::StackName}-hosts' + Type: AWS::EC2::SecurityGroup + SecurityGroupDatabases: + Properties: + SecurityGroupIngress: + - SourceSecurityGroupId: !Ref 'SecurityGroupEc2Hosts' + IpProtocol: -1 + VpcId: !Ref 'demoVpc' + GroupDescription: !Sub '${AWS::StackName}-databases' + Type: AWS::EC2::SecurityGroup + LaunchConfiguration: + Properties: + UserData: !Base64 + Fn::Sub: "#!/bin/bash\nyum update -y\nyum install -y aws-cfn-bootstrap\n/opt/aws/bin/cfn-init\ + \ -v --region ${AWS::Region} --stack ${AWS::StackName} --resource LaunchConfiguration\n\ + /opt/aws/bin/cfn-signal -e $? --region ${AWS::Region} --stack ${AWS::StackName}\ + \ --resource AutoScalingGroup\nyum install -y https://s3.amazonaws.com/ec2-downloads-windows/SSMAgent/latest/linux_amd64/amazon-ssm-agent.rpm\n\ + systemctl enable amazon-ssm-agent\nsystemctl start amazon-ssm-agent\n" + IamInstanceProfile: !Ref 'InstanceProfile' + SecurityGroups: + - !Ref 'SecurityGroupEc2Hosts' + InstanceType: !Ref 'InstanceType' + ImageId: !FindInMap + - AWSRegionToAMI + - !Ref 'AWS::Region' + - AMI + KeyName: !Ref 'KeyPair' + Type: AWS::AutoScaling::LaunchConfiguration + Metadata: + AWS::CloudFormation::Init: + config: + files: + /etc/cfn/cfn-hup.conf: + content: !Sub "[main]\nstack=${AWS::StackId}\nregion=${AWS::Region}\n" + mode: '256' + owner: root + group: root + /etc/cfn/hooks.d/cfn-auto-reloader.conf: + content: !Sub "[cfn-auto-reloader-hook]\ntriggers=post.update\npath=Resources.ContainerInstances.Metadata.AWS::CloudFormation::Init\n\ + action=/opt/aws/bin/cfn-init -v --region ${AWS::Region} --stack ${AWS::StackName}\ + \ --resource LaunchConfiguration\n" + services: + sysvinit: + cfn-hup: + enabled: 'true' + ensureRunning: 'true' + files: + - /etc/cfn/cfn-hup.conf + - /etc/cfn/hooks.d/cfn-auto-reloader.conf + commands: + '01_add_instance_to_cluster': + command: !Sub "echo \"ECS_CLUSTER=${Cluster}\nECS_RESERVED_MEMORY=256\"\ + \ > /etc/ecs/ecs.config" + AutoScalingGroup: + Properties: + DesiredCapacity: 1 + Tags: + - PropagateAtLaunch: true + Value: !Sub '${AWS::StackName} - ECS Host' + Key: Name + MinSize: !Ref 'MinSize' + MaxSize: !Ref 'MaxSize' + VPCZoneIdentifier: + - !Ref 'demoPrivateSubnet2' + - !Ref 'demoPrivateSubnet1' + LaunchConfigurationName: !Ref 'LaunchConfiguration' + Type: AWS::AutoScaling::AutoScalingGroup + UpdatePolicy: {} + CreationPolicy: + ResourceSignal: + Timeout: PT15M + AutoScalingPolicy: + Properties: + AdjustmentType: ChangeInCapacity + AutoScalingGroupName: !Ref 'AutoScalingGroup' + Cooldown: 300 + PolicyType: SimpleScaling + ScalingAdjustment: 1 + Type: AWS::AutoScaling::ScalingPolicy + SNSASGRole: + Properties: + AssumeRolePolicyDocument: + Statement: + - Effect: Allow + Action: + - sts:AssumeRole + Principal: + Service: + - autoscaling.amazonaws.com + ManagedPolicyArns: + - arn:aws:iam::aws:policy/service-role/AutoScalingNotificationAccessRole + Type: AWS::IAM::Role + LambdaExecutionRole: + Properties: + Policies: + - PolicyName: lambda-inline + PolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Action: + - autoscaling:CompleteLifecycleAction + - logs:CreateLogGroup + - logs:CreateLogStream + - logs:PutLogEvents + - ecs:ListContainerInstances + - ecs:DescribeContainerInstances + - ecs:UpdateContainerInstancesState + - sns:Publish + Resource: '*' + AssumeRolePolicyDocument: + Statement: + - Effect: Allow + Action: + - sts:AssumeRole + Principal: + Service: + - lambda.amazonaws.com + ManagedPolicyArns: + - arn:aws:iam::aws:policy/service-role/AutoScalingNotificationAccessRole + Type: AWS::IAM::Role + LambdaFunctionForASG: + Properties: + Handler: index.lambda_handler + Role: !GetAtt 'LambdaExecutionRole.Arn' + Runtime: python3.6 + MemorySize: 128 + Timeout: 60 + Code: + ZipFile: !Join + - '' + - - "import json\n" + - "import time\n" + - "import boto3\n" + - "import os\n" + - "\n" + - "ECS = boto3.client('ecs')\n" + - "ASG = boto3.client('autoscaling')\n" + - "SNS = boto3.client('sns')\n" + - "\n" + - "def find_ecs_instance_info(instance_id,cluster_name):\n" + - " paginator = ECS.get_paginator('list_container_instances')\n" + - " for list_resp in paginator.paginate(cluster=cluster_name):\n" + - " arns = list_resp['containerInstanceArns']\n" + - " desc_resp = ECS.describe_container_instances(cluster=cluster_name,\n" + - " containerInstances=arns)\n" + - " for container_instance in desc_resp['containerInstances']:\n" + - " if container_instance['ec2InstanceId'] != instance_id:\n" + - " continue\n" + - " print('Found instance: id=%s, arn=%s, status=%s, runningTasksCount=%s'\ + \ %\n" + - " (instance_id, container_instance['containerInstanceArn'],\n" + - " container_instance['status'], container_instance['runningTasksCount']))\n" + - " return (container_instance['containerInstanceArn'],\n" + - " container_instance['status'], container_instance['runningTasksCount'])\n" + - " return None, None, 0\n" + - "\n" + - "def instance_has_running_tasks(instance_id,cluster_name):\n" + - " (instance_arn, container_status, running_tasks) = find_ecs_instance_info(instance_id,cluster_name)\n" + - " if instance_arn is None:\n" + - " print('Could not find instance ID %s. Letting autoscaling kill\ + \ the instance.' %\n" + - " (instance_id))\n" + - " return False\n" + - " if container_status != 'DRAINING':\n" + - " print('Setting container instance %s (%s) to DRAINING' %\n" + - " (instance_id, instance_arn))\n" + - " ECS.update_container_instances_state(cluster=cluster_name,\n" + - " containerInstances=[instance_arn],\n" + - " status='DRAINING')\n" + - " return running_tasks > 0\n" + - " \n" + - "def lambda_handler(event, context):\n" + - " msg = json.loads(event['Records'][0]['Sns']['Message'])\n" + - " print(\"Event: \", msg)\n" + - " if 'LifecycleTransition' not in msg.keys() or \\\n" + - " msg['LifecycleTransition'].find('autoscaling:EC2_INSTANCE_TERMINATING')\ + \ == -1:\n" + - " print('Exiting since the lifecycle transition is not EC2_INSTANCE_TERMINATING.')\n" + - " return\n" + - " if instance_has_running_tasks(msg['EC2InstanceId'], msg['NotificationMetadata']):\n" + - " print('Tasks are still running on instance %s; posting msg\ + \ to SNS topic %s' %\n" + - " (msg['EC2InstanceId'], event['Records'][0]['Sns']['TopicArn']))\n" + - " time.sleep(5)\n" + - " sns_resp = SNS.publish(TopicArn=event['Records'][0]['Sns']['TopicArn'],\n" + - " Message=json.dumps(msg),\n" + - " Subject='Publishing SNS msg to invoke\ + \ Lambda again.')\n" + - " print('Posted msg %s to SNS topic.' % (sns_resp['MessageId']))\n" + - " else:\n" + - " print('No tasks are running on instance %s; setting lifecycle\ + \ to complete' %\n" + - " (msg['EC2InstanceId']))\n" + - " ASG.complete_lifecycle_action(LifecycleHookName=msg['LifecycleHookName'],\n" + - " AutoScalingGroupName=msg['AutoScalingGroupName'],\n" + - " LifecycleActionResult='CONTINUE',\n" + - ' InstanceId=msg[''EC2InstanceId''])' + Type: AWS::Lambda::Function + ASGSNSTopic: + Properties: + TopicName: !Ref 'TopicName' + Subscription: + - Protocol: lambda + Endpoint: !GetAtt 'LambdaFunctionForASG.Arn' + Type: AWS::SNS::Topic + LambdaInvokePermission: + Properties: + FunctionName: !Ref 'LambdaFunctionForASG' + Action: lambda:InvokeFunction + Principal: sns.amazonaws.com + SourceArn: !Ref 'ASGSNSTopic' + Type: AWS::Lambda::Permission + LambdaSubscriptionToSNSTopic: + Properties: + Protocol: lambda + Endpoint: !GetAtt 'LambdaFunctionForASG.Arn' + TopicArn: !Ref 'ASGSNSTopic' + Type: AWS::SNS::Subscription + ASGLifecycleHook: + Properties: + AutoScalingGroupName: !Ref 'AutoScalingGroup' + DefaultResult: ABANDON + HeartbeatTimeout: !Ref 'HeartbeatTimeout' + LifecycleHookName: !Ref 'LifecycleHookName' + LifecycleTransition: autoscaling:EC2_INSTANCE_TERMINATING + NotificationMetadata: !Ref 'Cluster' + NotificationTargetARN: !Ref 'ASGSNSTopic' + RoleARN: !GetAtt 'SNSASGRole.Arn' + Type: AWS::AutoScaling::LifecycleHook + Ec2HostsHighCPUAlarm: + Properties: + EvaluationPeriods: 1 + Dimensions: + - Name: AutoScalingGroupName + Value: !Ref 'AutoScalingGroup' + AlarmActions: + - !Ref 'NotificationSnsArn' + AlarmDescription: Alarm if CPU too high or metric disappears indicating instance + is down + Namespace: AWS/EC2 + Period: 60 + ComparisonOperator: GreaterThanThreshold + Statistic: Average + Threshold: '60' + MetricName: CPUUtilization + Type: AWS::CloudWatch::Alarm + ClusterHighCPUAlarm: + Properties: + EvaluationPeriods: 1 + Dimensions: + - Name: ClusterName + Value: !Ref 'Cluster' + AlarmActions: + - !Ref 'NotificationSnsArn' + AlarmDescription: Alarm if CPU is too high for cluster. + Namespace: AWS/ECS + Period: 300 + ComparisonOperator: GreaterThanThreshold + Statistic: Average + Threshold: '60' + MetricName: CPUUtilization + Type: AWS::CloudWatch::Alarm + ClusterHighMemoryAlarm: + Properties: + EvaluationPeriods: 1 + Dimensions: + - Name: ClusterName + Value: !Ref 'Cluster' + AlarmActions: + - !Ref 'NotificationSnsArn' + AlarmDescription: Alarm if memory is too high for cluster. + Namespace: AWS/ECS + Period: 300 + ComparisonOperator: GreaterThanThreshold + Statistic: Average + Threshold: '60' + MetricName: MemoryUtilization + Type: AWS::CloudWatch::Alarm + ClusterHighMemoryReservationAlarm: + Properties: + EvaluationPeriods: 1 + Dimensions: + - Name: ClusterName + Value: !Ref 'Cluster' + AlarmActions: + - !Ref 'AutoScalingPolicy' + AlarmDescription: Alarm if memory reservation is over 75% for cluster. + Namespace: AWS/ECS + Period: 300 + ComparisonOperator: GreaterThanThreshold + Statistic: Average + Threshold: '75' + MetricName: MemoryReservation + Type: AWS::CloudWatch::Alarm + ClusterHighMemoryReservationUserNotifcationAlarm: + Properties: + EvaluationPeriods: 3 + Dimensions: + - Name: ClusterName + Value: !Ref 'Cluster' + AlarmActions: + - !Ref 'NotificationSnsArn' + OKActions: + - !Ref 'NotificationSnsArn' + AlarmDescription: Alarm if memory reservation is over 75% for cluster for 15 + minutes. + Namespace: AWS/ECS + Period: 300 + ComparisonOperator: GreaterThanThreshold + Statistic: Average + Threshold: '75' + MetricName: MemoryReservation + Type: AWS::CloudWatch::Alarm \ No newline at end of file From 70bfc6bff7bbc9041945c683f757ce6ee09adc3a Mon Sep 17 00:00:00 2001 From: Praveen Raghav Date: Tue, 24 Nov 2020 13:16:09 +0530 Subject: [PATCH 04/13] remove unwanted return --- cloudlift/deployment/cluster_template_generator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cloudlift/deployment/cluster_template_generator.py b/cloudlift/deployment/cluster_template_generator.py index 8a027ed7..355045a5 100644 --- a/cloudlift/deployment/cluster_template_generator.py +++ b/cloudlift/deployment/cluster_template_generator.py @@ -545,7 +545,6 @@ def _add_ec2_auto_scaling(self): ScalingAdjustment=1 ) self.template.add_resource(self.cluster_scaling_policy) - return self.auto_scaling_group def _add_instance_draining(self, cluster): self.sns_asg_role = Role( From 94c963728aa6b4182b87eab4220654622cedc73d Mon Sep 17 00:00:00 2001 From: Praveen Raghav Date: Mon, 30 Nov 2020 17:13:23 +0530 Subject: [PATCH 05/13] remove unwanted var --- cloudlift/config/environment_configuration.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/cloudlift/config/environment_configuration.py b/cloudlift/config/environment_configuration.py index c52abc7b..2b9422aa 100644 --- a/cloudlift/config/environment_configuration.py +++ b/cloudlift/config/environment_configuration.py @@ -170,11 +170,6 @@ def _create_config(self): "environment": { "notifications_arn": notifications_arn, "ssl_certificate_arn": ssl_certificate_arn - }, - "draining":{ - "topic_name": topic_name, - "heartbeat_timeout": heartbeat_timeout, - "lifecycle_hook_name": lifecycle_hook_name } } } @@ -276,19 +271,6 @@ def _validate_changes(self, configuration): "ssl_certificate_arn" ] }, - "draining": { - "type": "object", - "properties": { - "topic_name": {"type": "string"}, - "heartbeat_timeout": {"type": "integer"}, - "lifecycle_hook_name": {"type": "string"}, - }, - "required": [ - "topic_name", - "heartbeat_timeout", - "lifecycle_hook_name" - ] - }, "region": {"type": "string"}, "vpc": { "type": "object", From f949c584aee4258f1915786e675a3c489c3f3e6f Mon Sep 17 00:00:00 2001 From: Praveen Raghav Date: Mon, 30 Nov 2020 17:13:44 +0530 Subject: [PATCH 06/13] remove unwanted var --- .../deployment/cluster_template_generator.py | 34 ++----------------- 1 file changed, 3 insertions(+), 31 deletions(-) diff --git a/cloudlift/deployment/cluster_template_generator.py b/cloudlift/deployment/cluster_template_generator.py index 355045a5..1ef5c18e 100644 --- a/cloudlift/deployment/cluster_template_generator.py +++ b/cloudlift/deployment/cluster_template_generator.py @@ -611,7 +611,7 @@ def _add_instance_draining(self, cluster): self.template.add_resource(self.lambda_function_for_asg) self.asg_sns_topic = Topic( "ASGSNSTopic", - TopicName=Ref('TopicName'), + TopicName=Join("", [Ref(cluster),"Topic"]), Subscription=[Subscription( Protocol="lambda", Endpoint=GetAtt(self.lambda_function_for_asg, "Arn") @@ -637,8 +637,8 @@ def _add_instance_draining(self, cluster): "ASGLifecycleHook", AutoScalingGroupName=Ref(self.auto_scaling_group), DefaultResult="ABANDON", - HeartbeatTimeout=Ref('HeartbeatTimeout'), - LifecycleHookName=Ref('LifecycleHookName'), + HeartbeatTimeout=300, + LifecycleHookName=Join("", [Ref(cluster),"ASGHook"]), LifecycleTransition="autoscaling:EC2_INSTANCE_TERMINATING", NotificationMetadata=Ref(cluster), NotificationTargetARN=Ref(self.asg_sns_topic), @@ -656,12 +656,6 @@ def _add_cluster_parameters(self): self.key_pair = Parameter( "KeyPair", Description='', Type="AWS::EC2::KeyPair::KeyName", Default="") self.template.add_parameter(self.key_pair) - self.template.add_parameter(Parameter( - "TopicName", Description='', Type="String", Default=str(self.configuration['draining']['topic_name']))) - self.template.add_parameter(Parameter( - "HeartbeatTimeout", Description='', Type="Number", Default=str(self.configuration['draining']['heartbeat_timeout']))) - self.template.add_parameter(Parameter( - "LifecycleHookName", Description='', Type="String", Default=str(self.configuration['draining']['lifecycle_hook_name']))) self.template.add_parameter(Parameter( "MinSize", Description='', Type="Number", Default=str(self.configuration['cluster']['min_instances']))) self.template.add_parameter(Parameter( @@ -693,9 +687,6 @@ def _add_cluster_outputs(self): 'max_instances': str(self.configuration['cluster']['max_instances']), 'instance_type': self.configuration['cluster']['instance_type'], 'key_name': self.configuration['cluster']['key_name'], - 'topic_name': self.configuration['draining']['topic_name'], - 'heartbeat_timeout': str(self.configuration['draining']['heartbeat_timeout']), - 'lifecycle_hook_name': self.configuration['draining']['lifecycle_hook_name'], 'cloudlift_version': VERSION } self.template.add_output(Output( @@ -780,16 +771,6 @@ def _add_metadata(self): 'Subnet2', 'NotificationSnsArn' ] - }, - { - 'Label': { - 'default': 'ECS Draining Configuration' - }, - 'Parameters': [ - 'TopicName', - 'HeartbeatTimeout', - 'LifecycleHookName' - ] } ], 'ParameterLabels': { @@ -818,15 +799,6 @@ def _add_metadata(self): }, 'VPC': { 'default': 'Enter the VPC in which you want the environment to be setup' - }, - 'LifecycleHookName': { - 'default': 'Enter the name for Auto Scaling Group Lifecycle Hook' - }, - 'HeartbeatTimeout': { - 'default': 'Enter the maximum timeout in sec for lifecycle hook' - }, - 'TopicName': { - 'default': 'Enter the name for Lambda SNS topic' } } } From 500f9a06ae856dde1ccebd4da6d51cd8053acfcf Mon Sep 17 00:00:00 2001 From: Praveen Raghav Date: Mon, 30 Nov 2020 17:15:16 +0530 Subject: [PATCH 07/13] remove template --- .../expected_environment_template.yml | 648 ------------------ 1 file changed, 648 deletions(-) delete mode 100644 test/templates/expected_environment_template.yml diff --git a/test/templates/expected_environment_template.yml b/test/templates/expected_environment_template.yml deleted file mode 100644 index 29d8ccf5..00000000 --- a/test/templates/expected_environment_template.yml +++ /dev/null @@ -1,648 +0,0 @@ -Metadata: - AWS::CloudFormation::Interface: - ParameterGroups: - - Label: - default: Cluster Configuration - Parameters: - - KeyPair - - Environment - - MinSize - - MaxSize - - InstanceType - - VPC - - Subnet1 - - Subnet2 - - NotificationSnsArn - - Label: - default: ECS Draining Configuration - Parameters: - - TopicName - - HeartbeatTimeout - - LifecycleHookName - ParameterLabels: - Environment: - default: Enter the environment e.g. dev or staging or sandbox or production - InstanceType: - default: Type of instance - KeyPair: - default: Select the key with which you want to login to the ec2 instances - MaxSize: - default: Max. no. of instances in cluster - MinSize: - default: Min. no. of instances in cluster - NotificationSnsArn: - default: The SNS topic to which notifications has to be triggered - Subnet1: - default: Enter the ID of the 1st subnet - Subnet2: - default: Enter the ID of the 2nd subnet - VPC: - default: Enter the VPC in which you want the environment to be setup - LifecycleHookName: - default: Enter the name for Auto Scaling Group Lifecycle Hook - HeartbeatTimeout: - default: Enter the maximum timeout in sec for lifecycle hook - TopicName: - default: Enter the name for Lambda SNS topic -Mappings: - AWSRegionToAMI: - ap-south-1: - AMI: ami-0c42adb42b71cacfc -Outputs: - StackId: - Description: The unique ID of the stack. To be supplied to circle CI environment - variables to validate during deployment. - Value: !Ref 'AWS::StackId' - StackName: - Description: The name of the stack - Value: !Ref 'AWS::StackName' - CloudliftOptions: - Description: Options used with cloudlift when building this cluster - Value: '{"env": "demo", "min_instances": "1", "max_instances": "2", "instance_type": - "t3a.micro", "key_name": "praveen-test", "topic_name": "DemoTest", "heartbeat_timeout": - "300", "lifecycle_hook_name": "DemoTest", "cloudlift_version": "1.4.4"}' - VPC: - Description: VPC in which environment is setup - Value: !Ref 'demoVpc' - PrivateSubnet1: - Description: ID of the 1st subnet - Value: !Ref 'demoPrivateSubnet2' - PrivateSubnet2: - Description: ID of the 2nd subnet - Value: !Ref 'demoPrivateSubnet1' - PublicSubnet1: - Description: ID of the 1st subnet - Value: !Ref 'demoPublicSubnet2' - PublicSubnet2: - Description: ID of the 2nd subnet - Value: !Ref 'demoPublicSubnet1' - AutoScalingGroup: - Description: AutoScaling group for ECS container instances - Value: !Ref 'AutoScalingGroup' - SecurityGroupAlb: - Description: Security group ID for ALB - Value: !Ref 'SecurityGroupAlb' - MinInstances: - Description: Minimum instances in cluster - Value: '1' - MaxInstances: - Description: Maximum instances in cluster - Value: '2' - InstanceType: - Description: EC2 instance type - Value: t3a.micro - KeyName: - Description: Key Pair name for accessing the instances - Value: praveen-test -Parameters: - Environment: - Description: '' - Type: String - Default: '' - KeyPair: - Description: '' - Type: AWS::EC2::KeyPair::KeyName - Default: '' - TopicName: - Description: '' - Type: String - Default: DemoTest - HeartbeatTimeout: - Description: '' - Type: Number - Default: '300' - LifecycleHookName: - Description: '' - Type: String - Default: DemoTest - MinSize: - Description: '' - Type: Number - Default: '1' - MaxSize: - Description: '' - Type: Number - Default: '2' - NotificationSnsArn: - Description: '' - Type: String - Default: arn:aws:sns:ap-south-1:259042324395:Praveen - InstanceType: - Description: '' - Type: String - Default: t3a.micro -Resources: - demoVpc: - Properties: - CidrBlock: 10.7.0.0/16 - EnableDnsSupport: 'true' - EnableDnsHostnames: 'true' - InstanceTenancy: default - Tags: - - Key: category - Value: services - - Key: environment - Value: demo - - Key: Name - Value: demo-vpc - Type: AWS::EC2::VPC - demoIg: - Properties: - Tags: - - Key: Name - Value: demo-internet-gateway - - Key: environment - Value: demo - Type: AWS::EC2::InternetGateway - demoAttachment: - Properties: - InternetGatewayId: !Ref 'demoIg' - VpcId: !Ref 'demoVpc' - Type: AWS::EC2::VPCGatewayAttachment - demoPublic: - Properties: - VpcId: !Ref 'demoVpc' - Tags: - - Key: Name - Value: demo-public - - Key: environment - Value: demo - Type: AWS::EC2::RouteTable - DependsOn: demoVpc - demoPublicSubnet1: - Properties: - AvailabilityZone: ap-south-1b - CidrBlock: 10.7.0.0/22 - VpcId: !Ref 'demoVpc' - MapPublicIpOnLaunch: 'true' - Tags: - - Key: Name - Value: demo-public-1 - - Key: environment - Value: demo - Type: AWS::EC2::Subnet - demoPublicSubnet1Assoc: - Properties: - RouteTableId: !Ref 'demoPublic' - SubnetId: !Ref 'demoPublicSubnet1' - Type: AWS::EC2::SubnetRouteTableAssociation - demoPublicSubnet2: - Properties: - AvailabilityZone: ap-south-1a - CidrBlock: 10.7.4.0/22 - VpcId: !Ref 'demoVpc' - MapPublicIpOnLaunch: 'true' - Tags: - - Key: Name - Value: demo-public-2 - - Key: environment - Value: demo - Type: AWS::EC2::Subnet - demoPublicSubnet2Assoc: - Properties: - RouteTableId: !Ref 'demoPublic' - SubnetId: !Ref 'demoPublicSubnet2' - Type: AWS::EC2::SubnetRouteTableAssociation - demoIgRoute: - Properties: - DestinationCidrBlock: '0.0.0.0/0' - GatewayId: !Ref 'demoIg' - RouteTableId: !Ref 'demoPublic' - Type: AWS::EC2::Route - demoPrivate: - Properties: - VpcId: !Ref 'demoVpc' - Tags: - - Key: Name - Value: demo-private - - Key: environment - Value: demo - Type: AWS::EC2::RouteTable - demoPrivateSubnet1: - Properties: - AvailabilityZone: ap-south-1b - CidrBlock: 10.7.8.0/22 - VpcId: !Ref 'demoVpc' - MapPublicIpOnLaunch: 'false' - Tags: - - Key: Name - Value: demo-private-1 - - Key: environment - Value: demo - Type: AWS::EC2::Subnet - demoPrivateSubnet1Assoc: - Properties: - RouteTableId: !Ref 'demoPrivate' - SubnetId: !Ref 'demoPrivateSubnet1' - Type: AWS::EC2::SubnetRouteTableAssociation - demoPrivateSubnet2: - Properties: - AvailabilityZone: ap-south-1a - CidrBlock: 10.7.12.0/22 - VpcId: !Ref 'demoVpc' - MapPublicIpOnLaunch: 'false' - Tags: - - Key: Name - Value: demo-private-2 - - Key: environment - Value: demo - Type: AWS::EC2::Subnet - demoPrivateSubnet2Assoc: - Properties: - RouteTableId: !Ref 'demoPrivate' - SubnetId: !Ref 'demoPrivateSubnet2' - Type: AWS::EC2::SubnetRouteTableAssociation - demoNat: - Properties: - AllocationId: eipalloc-0103733acf336d725 - SubnetId: !Ref 'demoPublicSubnet1' - Tags: - - Key: Name - Value: demo-nat-gateway - - Key: environment - Value: demo - Type: AWS::EC2::NatGateway - demoNatRoute: - Properties: - DestinationCidrBlock: '0.0.0.0/0' - NatGatewayId: !Ref 'demoNat' - RouteTableId: !Ref 'demoPrivate' - Type: AWS::EC2::Route - DBSubnetGroup: - Properties: - DBSubnetGroupName: demo-subnet - Tags: - - Key: category - Value: services - - Key: environment - Value: demo - DBSubnetGroupDescription: demo subnet group - SubnetIds: - - !Ref 'demoPrivateSubnet1' - - !Ref 'demoPrivateSubnet2' - Type: AWS::RDS::DBSubnetGroup - ElasticacheSubnetGroup: - Properties: - CacheSubnetGroupName: demo-subnet - Description: demo subnet group - SubnetIds: - - !Ref 'demoPrivateSubnet1' - - !Ref 'demoPrivateSubnet2' - Type: AWS::ElastiCache::SubnetGroup - demoLogGroup: - Properties: - LogGroupName: demo-logs - RetentionInDays: 365 - Type: AWS::Logs::LogGroup - Cluster: - Properties: - ClusterName: !Ref 'AWS::StackName' - Type: AWS::ECS::Cluster - ECSRole: - Properties: - Path: / - ManagedPolicyArns: - - arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role - - arn:aws:iam::aws:policy/AmazonDynamoDBReadOnlyAccess - - arn:aws:iam::aws:policy/service-role/AmazonEC2RoleforSSM - RoleName: !Sub 'ecs-${AWS::StackName}-${AWS::Region}' - AssumeRolePolicyDocument: - Statement: - - Action: - - sts:AssumeRole - Effect: Allow - Principal: - Service: - - ec2.amazonaws.com - Type: AWS::IAM::Role - InstanceProfile: - Properties: - Path: / - Roles: - - !Ref 'ECSRole' - Type: AWS::IAM::InstanceProfile - SecurityGroupAlb: - Properties: - VpcId: !Ref 'demoVpc' - GroupDescription: !Sub '${AWS::StackName}-alb' - Type: AWS::EC2::SecurityGroup - SecurityGroupEc2Hosts: - Properties: - SecurityGroupIngress: - - SourceSecurityGroupId: !Ref 'SecurityGroupAlb' - IpProtocol: -1 - VpcId: !Ref 'demoVpc' - GroupDescription: !Sub '${AWS::StackName}-hosts' - Type: AWS::EC2::SecurityGroup - SecurityGroupDatabases: - Properties: - SecurityGroupIngress: - - SourceSecurityGroupId: !Ref 'SecurityGroupEc2Hosts' - IpProtocol: -1 - VpcId: !Ref 'demoVpc' - GroupDescription: !Sub '${AWS::StackName}-databases' - Type: AWS::EC2::SecurityGroup - LaunchConfiguration: - Properties: - UserData: !Base64 - Fn::Sub: "#!/bin/bash\nyum update -y\nyum install -y aws-cfn-bootstrap\n/opt/aws/bin/cfn-init\ - \ -v --region ${AWS::Region} --stack ${AWS::StackName} --resource LaunchConfiguration\n\ - /opt/aws/bin/cfn-signal -e $? --region ${AWS::Region} --stack ${AWS::StackName}\ - \ --resource AutoScalingGroup\nyum install -y https://s3.amazonaws.com/ec2-downloads-windows/SSMAgent/latest/linux_amd64/amazon-ssm-agent.rpm\n\ - systemctl enable amazon-ssm-agent\nsystemctl start amazon-ssm-agent\n" - IamInstanceProfile: !Ref 'InstanceProfile' - SecurityGroups: - - !Ref 'SecurityGroupEc2Hosts' - InstanceType: !Ref 'InstanceType' - ImageId: !FindInMap - - AWSRegionToAMI - - !Ref 'AWS::Region' - - AMI - KeyName: !Ref 'KeyPair' - Type: AWS::AutoScaling::LaunchConfiguration - Metadata: - AWS::CloudFormation::Init: - config: - files: - /etc/cfn/cfn-hup.conf: - content: !Sub "[main]\nstack=${AWS::StackId}\nregion=${AWS::Region}\n" - mode: '256' - owner: root - group: root - /etc/cfn/hooks.d/cfn-auto-reloader.conf: - content: !Sub "[cfn-auto-reloader-hook]\ntriggers=post.update\npath=Resources.ContainerInstances.Metadata.AWS::CloudFormation::Init\n\ - action=/opt/aws/bin/cfn-init -v --region ${AWS::Region} --stack ${AWS::StackName}\ - \ --resource LaunchConfiguration\n" - services: - sysvinit: - cfn-hup: - enabled: 'true' - ensureRunning: 'true' - files: - - /etc/cfn/cfn-hup.conf - - /etc/cfn/hooks.d/cfn-auto-reloader.conf - commands: - '01_add_instance_to_cluster': - command: !Sub "echo \"ECS_CLUSTER=${Cluster}\nECS_RESERVED_MEMORY=256\"\ - \ > /etc/ecs/ecs.config" - AutoScalingGroup: - Properties: - DesiredCapacity: 1 - Tags: - - PropagateAtLaunch: true - Value: !Sub '${AWS::StackName} - ECS Host' - Key: Name - MinSize: !Ref 'MinSize' - MaxSize: !Ref 'MaxSize' - VPCZoneIdentifier: - - !Ref 'demoPrivateSubnet2' - - !Ref 'demoPrivateSubnet1' - LaunchConfigurationName: !Ref 'LaunchConfiguration' - Type: AWS::AutoScaling::AutoScalingGroup - UpdatePolicy: {} - CreationPolicy: - ResourceSignal: - Timeout: PT15M - AutoScalingPolicy: - Properties: - AdjustmentType: ChangeInCapacity - AutoScalingGroupName: !Ref 'AutoScalingGroup' - Cooldown: 300 - PolicyType: SimpleScaling - ScalingAdjustment: 1 - Type: AWS::AutoScaling::ScalingPolicy - SNSASGRole: - Properties: - AssumeRolePolicyDocument: - Statement: - - Effect: Allow - Action: - - sts:AssumeRole - Principal: - Service: - - autoscaling.amazonaws.com - ManagedPolicyArns: - - arn:aws:iam::aws:policy/service-role/AutoScalingNotificationAccessRole - Type: AWS::IAM::Role - LambdaExecutionRole: - Properties: - Policies: - - PolicyName: lambda-inline - PolicyDocument: - Version: '2012-10-17' - Statement: - - Effect: Allow - Action: - - autoscaling:CompleteLifecycleAction - - logs:CreateLogGroup - - logs:CreateLogStream - - logs:PutLogEvents - - ecs:ListContainerInstances - - ecs:DescribeContainerInstances - - ecs:UpdateContainerInstancesState - - sns:Publish - Resource: '*' - AssumeRolePolicyDocument: - Statement: - - Effect: Allow - Action: - - sts:AssumeRole - Principal: - Service: - - lambda.amazonaws.com - ManagedPolicyArns: - - arn:aws:iam::aws:policy/service-role/AutoScalingNotificationAccessRole - Type: AWS::IAM::Role - LambdaFunctionForASG: - Properties: - Handler: index.lambda_handler - Role: !GetAtt 'LambdaExecutionRole.Arn' - Runtime: python3.6 - MemorySize: 128 - Timeout: 60 - Code: - ZipFile: !Join - - '' - - - "import json\n" - - "import time\n" - - "import boto3\n" - - "import os\n" - - "\n" - - "ECS = boto3.client('ecs')\n" - - "ASG = boto3.client('autoscaling')\n" - - "SNS = boto3.client('sns')\n" - - "\n" - - "def find_ecs_instance_info(instance_id,cluster_name):\n" - - " paginator = ECS.get_paginator('list_container_instances')\n" - - " for list_resp in paginator.paginate(cluster=cluster_name):\n" - - " arns = list_resp['containerInstanceArns']\n" - - " desc_resp = ECS.describe_container_instances(cluster=cluster_name,\n" - - " containerInstances=arns)\n" - - " for container_instance in desc_resp['containerInstances']:\n" - - " if container_instance['ec2InstanceId'] != instance_id:\n" - - " continue\n" - - " print('Found instance: id=%s, arn=%s, status=%s, runningTasksCount=%s'\ - \ %\n" - - " (instance_id, container_instance['containerInstanceArn'],\n" - - " container_instance['status'], container_instance['runningTasksCount']))\n" - - " return (container_instance['containerInstanceArn'],\n" - - " container_instance['status'], container_instance['runningTasksCount'])\n" - - " return None, None, 0\n" - - "\n" - - "def instance_has_running_tasks(instance_id,cluster_name):\n" - - " (instance_arn, container_status, running_tasks) = find_ecs_instance_info(instance_id,cluster_name)\n" - - " if instance_arn is None:\n" - - " print('Could not find instance ID %s. Letting autoscaling kill\ - \ the instance.' %\n" - - " (instance_id))\n" - - " return False\n" - - " if container_status != 'DRAINING':\n" - - " print('Setting container instance %s (%s) to DRAINING' %\n" - - " (instance_id, instance_arn))\n" - - " ECS.update_container_instances_state(cluster=cluster_name,\n" - - " containerInstances=[instance_arn],\n" - - " status='DRAINING')\n" - - " return running_tasks > 0\n" - - " \n" - - "def lambda_handler(event, context):\n" - - " msg = json.loads(event['Records'][0]['Sns']['Message'])\n" - - " print(\"Event: \", msg)\n" - - " if 'LifecycleTransition' not in msg.keys() or \\\n" - - " msg['LifecycleTransition'].find('autoscaling:EC2_INSTANCE_TERMINATING')\ - \ == -1:\n" - - " print('Exiting since the lifecycle transition is not EC2_INSTANCE_TERMINATING.')\n" - - " return\n" - - " if instance_has_running_tasks(msg['EC2InstanceId'], msg['NotificationMetadata']):\n" - - " print('Tasks are still running on instance %s; posting msg\ - \ to SNS topic %s' %\n" - - " (msg['EC2InstanceId'], event['Records'][0]['Sns']['TopicArn']))\n" - - " time.sleep(5)\n" - - " sns_resp = SNS.publish(TopicArn=event['Records'][0]['Sns']['TopicArn'],\n" - - " Message=json.dumps(msg),\n" - - " Subject='Publishing SNS msg to invoke\ - \ Lambda again.')\n" - - " print('Posted msg %s to SNS topic.' % (sns_resp['MessageId']))\n" - - " else:\n" - - " print('No tasks are running on instance %s; setting lifecycle\ - \ to complete' %\n" - - " (msg['EC2InstanceId']))\n" - - " ASG.complete_lifecycle_action(LifecycleHookName=msg['LifecycleHookName'],\n" - - " AutoScalingGroupName=msg['AutoScalingGroupName'],\n" - - " LifecycleActionResult='CONTINUE',\n" - - ' InstanceId=msg[''EC2InstanceId''])' - Type: AWS::Lambda::Function - ASGSNSTopic: - Properties: - TopicName: !Ref 'TopicName' - Subscription: - - Protocol: lambda - Endpoint: !GetAtt 'LambdaFunctionForASG.Arn' - Type: AWS::SNS::Topic - LambdaInvokePermission: - Properties: - FunctionName: !Ref 'LambdaFunctionForASG' - Action: lambda:InvokeFunction - Principal: sns.amazonaws.com - SourceArn: !Ref 'ASGSNSTopic' - Type: AWS::Lambda::Permission - LambdaSubscriptionToSNSTopic: - Properties: - Protocol: lambda - Endpoint: !GetAtt 'LambdaFunctionForASG.Arn' - TopicArn: !Ref 'ASGSNSTopic' - Type: AWS::SNS::Subscription - ASGLifecycleHook: - Properties: - AutoScalingGroupName: !Ref 'AutoScalingGroup' - DefaultResult: ABANDON - HeartbeatTimeout: !Ref 'HeartbeatTimeout' - LifecycleHookName: !Ref 'LifecycleHookName' - LifecycleTransition: autoscaling:EC2_INSTANCE_TERMINATING - NotificationMetadata: !Ref 'Cluster' - NotificationTargetARN: !Ref 'ASGSNSTopic' - RoleARN: !GetAtt 'SNSASGRole.Arn' - Type: AWS::AutoScaling::LifecycleHook - Ec2HostsHighCPUAlarm: - Properties: - EvaluationPeriods: 1 - Dimensions: - - Name: AutoScalingGroupName - Value: !Ref 'AutoScalingGroup' - AlarmActions: - - !Ref 'NotificationSnsArn' - AlarmDescription: Alarm if CPU too high or metric disappears indicating instance - is down - Namespace: AWS/EC2 - Period: 60 - ComparisonOperator: GreaterThanThreshold - Statistic: Average - Threshold: '60' - MetricName: CPUUtilization - Type: AWS::CloudWatch::Alarm - ClusterHighCPUAlarm: - Properties: - EvaluationPeriods: 1 - Dimensions: - - Name: ClusterName - Value: !Ref 'Cluster' - AlarmActions: - - !Ref 'NotificationSnsArn' - AlarmDescription: Alarm if CPU is too high for cluster. - Namespace: AWS/ECS - Period: 300 - ComparisonOperator: GreaterThanThreshold - Statistic: Average - Threshold: '60' - MetricName: CPUUtilization - Type: AWS::CloudWatch::Alarm - ClusterHighMemoryAlarm: - Properties: - EvaluationPeriods: 1 - Dimensions: - - Name: ClusterName - Value: !Ref 'Cluster' - AlarmActions: - - !Ref 'NotificationSnsArn' - AlarmDescription: Alarm if memory is too high for cluster. - Namespace: AWS/ECS - Period: 300 - ComparisonOperator: GreaterThanThreshold - Statistic: Average - Threshold: '60' - MetricName: MemoryUtilization - Type: AWS::CloudWatch::Alarm - ClusterHighMemoryReservationAlarm: - Properties: - EvaluationPeriods: 1 - Dimensions: - - Name: ClusterName - Value: !Ref 'Cluster' - AlarmActions: - - !Ref 'AutoScalingPolicy' - AlarmDescription: Alarm if memory reservation is over 75% for cluster. - Namespace: AWS/ECS - Period: 300 - ComparisonOperator: GreaterThanThreshold - Statistic: Average - Threshold: '75' - MetricName: MemoryReservation - Type: AWS::CloudWatch::Alarm - ClusterHighMemoryReservationUserNotifcationAlarm: - Properties: - EvaluationPeriods: 3 - Dimensions: - - Name: ClusterName - Value: !Ref 'Cluster' - AlarmActions: - - !Ref 'NotificationSnsArn' - OKActions: - - !Ref 'NotificationSnsArn' - AlarmDescription: Alarm if memory reservation is over 75% for cluster for 15 - minutes. - Namespace: AWS/ECS - Period: 300 - ComparisonOperator: GreaterThanThreshold - Statistic: Average - Threshold: '75' - MetricName: MemoryReservation - Type: AWS::CloudWatch::Alarm \ No newline at end of file From 089cc41ce21682a9c6294962775b17c4c1ebbaf5 Mon Sep 17 00:00:00 2001 From: Praveen Raghav Date: Mon, 30 Nov 2020 17:16:10 +0530 Subject: [PATCH 08/13] remove unwanted var --- cloudlift/config/environment_configuration.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/cloudlift/config/environment_configuration.py b/cloudlift/config/environment_configuration.py index 2b9422aa..9dda5c61 100644 --- a/cloudlift/config/environment_configuration.py +++ b/cloudlift/config/environment_configuration.py @@ -128,9 +128,6 @@ def _create_config(self): cluster_min_instances = prompt("Min instances in cluster", default=1) cluster_max_instances = prompt("Max instances in cluster", default=5) cluster_instance_type = prompt("Instance type", default='m5.xlarge') - topic_name = prompt("SNS Topic Name") - heartbeat_timeout = prompt("Timeout for Lifecycle Hook", default=300) - lifecycle_hook_name = prompt("Lifecycle Hook name") key_name = prompt("SSH key name") notifications_arn = prompt("Notification SNS ARN") ssl_certificate_arn = prompt("SSL certificate ARN") From fb7fd448cf9a6fa52988436291cad9ac7a1bf538 Mon Sep 17 00:00:00 2001 From: Praveen Raghav Date: Mon, 30 Nov 2020 17:25:16 +0530 Subject: [PATCH 09/13] update var name --- .../deployment/cluster_template_generator.py | 4 +- test/deployment/environment_creation_test.py | 59 +++++++++++++++++++ 2 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 test/deployment/environment_creation_test.py diff --git a/cloudlift/deployment/cluster_template_generator.py b/cloudlift/deployment/cluster_template_generator.py index 1ef5c18e..b58d0f20 100644 --- a/cloudlift/deployment/cluster_template_generator.py +++ b/cloudlift/deployment/cluster_template_generator.py @@ -611,7 +611,7 @@ def _add_instance_draining(self, cluster): self.template.add_resource(self.lambda_function_for_asg) self.asg_sns_topic = Topic( "ASGSNSTopic", - TopicName=Join("", [Ref(cluster),"Topic"]), + TopicName=Join("-", [Ref(cluster),"Topic"]), Subscription=[Subscription( Protocol="lambda", Endpoint=GetAtt(self.lambda_function_for_asg, "Arn") @@ -638,7 +638,7 @@ def _add_instance_draining(self, cluster): AutoScalingGroupName=Ref(self.auto_scaling_group), DefaultResult="ABANDON", HeartbeatTimeout=300, - LifecycleHookName=Join("", [Ref(cluster),"ASGHook"]), + LifecycleHookName=Join("-", [Ref(cluster),"ASG-Hook"]), LifecycleTransition="autoscaling:EC2_INSTANCE_TERMINATING", NotificationMetadata=Ref(cluster), NotificationTargetARN=Ref(self.asg_sns_topic), diff --git a/test/deployment/environment_creation_test.py b/test/deployment/environment_creation_test.py new file mode 100644 index 00000000..683b60e4 --- /dev/null +++ b/test/deployment/environment_creation_test.py @@ -0,0 +1,59 @@ +import datetime + +from cfn_flip import to_json +from mock import patch + +from cloudlift.config import ServiceConfiguration +from cloudlift.deployment.cluster_template_generator import ClusterTemplateGenerator + + + +def mocked_environment_config(cls, *args, **kwargs): + return { + "demo": { + "cluster": { + "instance_type": "t3a.micro", + "key_name": "praveen-test", + "max_instances": 2, + "min_instances": 1 + }, + "draining": { + "heartbeat_timeout": 300, + "lifecycle_hook_name": "DemoTest", + "topic_name": "DemoTest" + }, + "environment": { + "notifications_arn": "arn:aws:sns:ap-south-1:259042324395:Praveen", + "ssl_certificate_arn": "arn:aws:acm:ap-south-1:259042324395:certificate/09d771d0-24d3-45d2-8e40-2237f12bea6a" + }, + "region": "ap-south-1", + "vpc": { + "cidr": "10.7.0.0/16", + "nat-gateway": { + "elastic-ip-allocation-id": "eipalloc-0103733acf336d725" + }, + "subnets": { + "private": { + "subnet-1": { + "cidr": "10.7.8.0/22" + }, + "subnet-2": { + "cidr": "10.7.12.0/22" + } + }, + "public": { + "subnet-1": { + "cidr": "10.7.0.0/22" + }, + "subnet-2": { + "cidr": "10.7.4.0/22" + } + } + } + } + } + } + +class TestServiceTemplateGenerator(object): + def test_environment_creation(self): + en = ClusterTemplateGenerator("demo", mocked_environment_config) \ No newline at end of file From 02168e4abb555a192bf0ac21328ababd3364e0bc Mon Sep 17 00:00:00 2001 From: Praveen Raghav Date: Mon, 30 Nov 2020 17:31:55 +0530 Subject: [PATCH 10/13] remove unwanted test --- test/deployment/environment_creation_test.py | 59 -------------------- 1 file changed, 59 deletions(-) delete mode 100644 test/deployment/environment_creation_test.py diff --git a/test/deployment/environment_creation_test.py b/test/deployment/environment_creation_test.py deleted file mode 100644 index 683b60e4..00000000 --- a/test/deployment/environment_creation_test.py +++ /dev/null @@ -1,59 +0,0 @@ -import datetime - -from cfn_flip import to_json -from mock import patch - -from cloudlift.config import ServiceConfiguration -from cloudlift.deployment.cluster_template_generator import ClusterTemplateGenerator - - - -def mocked_environment_config(cls, *args, **kwargs): - return { - "demo": { - "cluster": { - "instance_type": "t3a.micro", - "key_name": "praveen-test", - "max_instances": 2, - "min_instances": 1 - }, - "draining": { - "heartbeat_timeout": 300, - "lifecycle_hook_name": "DemoTest", - "topic_name": "DemoTest" - }, - "environment": { - "notifications_arn": "arn:aws:sns:ap-south-1:259042324395:Praveen", - "ssl_certificate_arn": "arn:aws:acm:ap-south-1:259042324395:certificate/09d771d0-24d3-45d2-8e40-2237f12bea6a" - }, - "region": "ap-south-1", - "vpc": { - "cidr": "10.7.0.0/16", - "nat-gateway": { - "elastic-ip-allocation-id": "eipalloc-0103733acf336d725" - }, - "subnets": { - "private": { - "subnet-1": { - "cidr": "10.7.8.0/22" - }, - "subnet-2": { - "cidr": "10.7.12.0/22" - } - }, - "public": { - "subnet-1": { - "cidr": "10.7.0.0/22" - }, - "subnet-2": { - "cidr": "10.7.4.0/22" - } - } - } - } - } - } - -class TestServiceTemplateGenerator(object): - def test_environment_creation(self): - en = ClusterTemplateGenerator("demo", mocked_environment_config) \ No newline at end of file From 7e383e708f16274aa1cf8916bb1551e72a7b7ca6 Mon Sep 17 00:00:00 2001 From: Praveen Raghav Date: Tue, 1 Dec 2020 10:23:58 +0530 Subject: [PATCH 11/13] remove heartbeat timeout --- cloudlift/deployment/cluster_template_generator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cloudlift/deployment/cluster_template_generator.py b/cloudlift/deployment/cluster_template_generator.py index b58d0f20..e497715c 100644 --- a/cloudlift/deployment/cluster_template_generator.py +++ b/cloudlift/deployment/cluster_template_generator.py @@ -637,7 +637,6 @@ def _add_instance_draining(self, cluster): "ASGLifecycleHook", AutoScalingGroupName=Ref(self.auto_scaling_group), DefaultResult="ABANDON", - HeartbeatTimeout=300, LifecycleHookName=Join("-", [Ref(cluster),"ASG-Hook"]), LifecycleTransition="autoscaling:EC2_INSTANCE_TERMINATING", NotificationMetadata=Ref(cluster), From d5a975c63eace7888ed58abe8a00b510072b6e42 Mon Sep 17 00:00:00 2001 From: praveenraghav01 Date: Tue, 29 Mar 2022 12:59:19 +0530 Subject: [PATCH 12/13] added down scaling policy --- .../deployment/cluster_template_generator.py | 39 +++++++++++++-- .../ecs_instance_draining_lambda.py | 47 +++++++++++-------- cloudlift/version/__init__.py | 2 +- 3 files changed, 64 insertions(+), 24 deletions(-) diff --git a/cloudlift/deployment/cluster_template_generator.py b/cloudlift/deployment/cluster_template_generator.py index 11edca71..a381db96 100644 --- a/cloudlift/deployment/cluster_template_generator.py +++ b/cloudlift/deployment/cluster_template_generator.py @@ -19,7 +19,7 @@ from troposphere.policies import (AutoScalingRollingUpdate, CreationPolicy, ResourceSignal) from troposphere.rds import DBSubnetGroup -from troposphere.awslambda import Function, Code, MEMORY_VALUES, Permission +from troposphere.awslambda import Function, Code, Permission from troposphere.servicediscovery import PrivateDnsNamespace from cloudlift.config import DecimalEncoder from cloudlift.config import get_client_for, get_region_for_environment @@ -404,6 +404,27 @@ def _add_cluster_alarms(self, cluster): ) self.template.add_resource( self.cluster_high_memory_reservation_autoscale_alarm) + self.cluster_low_memory_reservation_autoscale_alarm = Alarm( + 'ClusterLowMemoryReservationAlarm', + EvaluationPeriods=1, + Dimensions=[ + MetricDimension(Name='ClusterName', Value=Ref(cluster)) + ], + AlarmActions=[ + Ref(self.cluster_down_scaling_policy) + ], + AlarmDescription='Alarm if memory reservation is below 60% \ +for cluster.', + Namespace='AWS/ECS', + Period=300, + ComparisonOperator='LessThanThreshold', + Statistic='Average', + Threshold='60', + MetricName='MemoryReservation' + ) + self.template.add_resource( + self.cluster_low_memory_reservation_autoscale_alarm) + self.cluster_high_memory_reservation_user_notification_alarm = Alarm( 'ClusterHighMemoryReservationUserNotifcationAlarm', EvaluationPeriods=3, @@ -572,6 +593,15 @@ def _add_ec2_auto_scaling(self): ScalingAdjustment=1 ) self.template.add_resource(self.cluster_scaling_policy) + self.cluster_down_scaling_policy = ScalingPolicy( + 'AutoDownScalingPolicy', + AdjustmentType='ChangeInCapacity', + AutoScalingGroupName=Ref(self.auto_scaling_group), + Cooldown=300, + PolicyType='SimpleScaling', + ScalingAdjustment=-1 + ) + self.template.add_resource(self.cluster_down_scaling_policy) def _add_instance_draining(self, cluster): self.sns_asg_role = Role( @@ -625,12 +655,13 @@ def _add_instance_draining(self, cluster): with open (str(pathlib.Path(__file__).parent.absolute())+"/ecs_instance_draining_lambda.py", "r") as ecs_instance_draining_lambda: lambda_code=ecs_instance_draining_lambda.readlines() self.lambda_function_for_asg = Function( - "LambdaFunctionForASG", + "ECSInstanceDraining", Handler="index.lambda_handler", + Description="Drain ECS instance", Role=GetAtt(self.lambda_execution_role, "Arn"), Runtime="python3.6", MemorySize=128, - Timeout=60, + Timeout=300, Code=Code( ZipFile=Join("", lambda_code) ) @@ -792,7 +823,7 @@ def _add_cluster_outputs(self): def _add_metadata(self): - self.template.add_metadata({ + self.template.set_metadata({ 'AWS::CloudFormation::Interface': { 'ParameterGroups': [ { diff --git a/cloudlift/deployment/ecs_instance_draining_lambda.py b/cloudlift/deployment/ecs_instance_draining_lambda.py index 5a745fd5..4197ed5a 100644 --- a/cloudlift/deployment/ecs_instance_draining_lambda.py +++ b/cloudlift/deployment/ecs_instance_draining_lambda.py @@ -6,56 +6,65 @@ ECS = boto3.client('ecs') ASG = boto3.client('autoscaling') SNS = boto3.client('sns') +CW = boto3.client('cloudwatch') -def find_ecs_instance_info(instance_id,cluster_name): + +def find_ecs_instance_info(instance_id, cluster_name): paginator = ECS.get_paginator('list_container_instances') for list_resp in paginator.paginate(cluster=cluster_name): arns = list_resp['containerInstanceArns'] desc_resp = ECS.describe_container_instances(cluster=cluster_name, - containerInstances=arns) + containerInstances=arns) for container_instance in desc_resp['containerInstances']: if container_instance['ec2InstanceId'] != instance_id: continue print('Found instance: id=%s, arn=%s, status=%s, runningTasksCount=%s' % - (instance_id, container_instance['containerInstanceArn'], - container_instance['status'], container_instance['runningTasksCount'])) + (instance_id, container_instance['containerInstanceArn'], + container_instance['status'], container_instance['runningTasksCount'])) return (container_instance['containerInstanceArn'], container_instance['status'], container_instance['runningTasksCount']) return None, None, 0 -def instance_has_running_tasks(instance_id,cluster_name): - (instance_arn, container_status, running_tasks) = find_ecs_instance_info(instance_id,cluster_name) + +def instance_has_running_tasks(instance_id, cluster_name): + (instance_arn, container_status, running_tasks) = find_ecs_instance_info( + instance_id, cluster_name) if instance_arn is None: print('Could not find instance ID %s. Letting autoscaling kill the instance.' % - (instance_id)) + (instance_id)) return False if container_status != 'DRAINING': print('Setting container instance %s (%s) to DRAINING' % - (instance_id, instance_arn)) + (instance_id, instance_arn)) ECS.update_container_instances_state(cluster=cluster_name, - containerInstances=[instance_arn], - status='DRAINING') + containerInstances=[instance_arn], + status='DRAINING') return running_tasks > 0 - + + def lambda_handler(event, context): msg = json.loads(event['Records'][0]['Sns']['Message']) print("Event: ", msg) if 'LifecycleTransition' not in msg.keys() or \ - msg['LifecycleTransition'].find('autoscaling:EC2_INSTANCE_TERMINATING') == -1: + msg['LifecycleTransition'].find('autoscaling:EC2_INSTANCE_TERMINATING') == -1: print('Exiting since the lifecycle transition is not EC2_INSTANCE_TERMINATING.') return if instance_has_running_tasks(msg['EC2InstanceId'], msg['NotificationMetadata']): print('Tasks are still running on instance %s; posting msg to SNS topic %s' % - (msg['EC2InstanceId'], event['Records'][0]['Sns']['TopicArn'])) + (msg['EC2InstanceId'], event['Records'][0]['Sns']['TopicArn'])) time.sleep(5) sns_resp = SNS.publish(TopicArn=event['Records'][0]['Sns']['TopicArn'], - Message=json.dumps(msg), - Subject='Publishing SNS msg to invoke Lambda again.') + Message=json.dumps(msg), + Subject='Publishing SNS msg to invoke Lambda again.') print('Posted msg %s to SNS topic.' % (sns_resp['MessageId'])) else: print('No tasks are running on instance %s; setting lifecycle to complete' % - (msg['EC2InstanceId'])) + (msg['EC2InstanceId'])) ASG.complete_lifecycle_action(LifecycleHookName=msg['LifecycleHookName'], - AutoScalingGroupName=msg['AutoScalingGroupName'], - LifecycleActionResult='CONTINUE', - InstanceId=msg['EC2InstanceId']) \ No newline at end of file + AutoScalingGroupName=msg['AutoScalingGroupName'], + LifecycleActionResult='CONTINUE', + InstanceId=msg['EC2InstanceId']) + if msg['NotificationMetadata'] == 'cluster-production': + alarm_name = 'ecs_agent_alarm_' + msg['EC2InstanceId'] + response = CW.delete_alarms(AlarmNames=[alarm_name]) + print('Alarm %s deleted' % alarm_name) diff --git a/cloudlift/version/__init__.py b/cloudlift/version/__init__.py index a8c40b90..18350283 100644 --- a/cloudlift/version/__init__.py +++ b/cloudlift/version/__init__.py @@ -1,2 +1,2 @@ -VERSION = '1.5.1' +VERSION = '1.5.2' From 34a2637521e67eab981abe6aef160a8719e929c2 Mon Sep 17 00:00:00 2001 From: praveenraghav01 Date: Tue, 29 Mar 2022 12:59:36 +0530 Subject: [PATCH 13/13] added awscli v2 --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index c5b29575..e62e7337 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ boto3>=1.9.89 -awscli>=1.19.105 +awscliv2>=2.1.1 certifi==2017.7.27.1 cfn-flip==1.0.3 chardet==3.0.4 @@ -19,5 +19,5 @@ requests>=2.20.0 six==1.10.0 stringcase==1.0.6 terminaltables==3.1.0 -troposphere>=2.6.4 +troposphere>=2.7.1 awacs==2.0.1