diff --git a/cloudlift/deployment/cluster_template_generator.py b/cloudlift/deployment/cluster_template_generator.py index d72dc1ef..a381db96 100644 --- a/cloudlift/deployment/cluster_template_generator.py +++ b/cloudlift/deployment/cluster_template_generator.py @@ -1,29 +1,33 @@ import json import re +import pathlib from cfn_flip import to_yaml from stringcase import camelcase, pascalcase from troposphere import (Base64, FindInMap, Output, Parameter, Ref, Sub, - cloudformation, Export, GetAtt, Tags) + cloudformation, Export, GetAtt, Join, Tags) from troposphere.autoscaling import (AutoScalingGroup, LaunchConfiguration, - ScalingPolicy) + ScalingPolicy, LifecycleHook) from troposphere.cloudwatch import Alarm, MetricDimension from troposphere.ec2 import (VPC, InternetGateway, NatGateway, Route, RouteTable, SecurityGroup, Subnet, SecurityGroupIngress, SubnetRouteTableAssociation, VPCGatewayAttachment) from troposphere.ecs import Cluster from troposphere.elasticache import SubnetGroup as ElastiCacheSubnetGroup -from troposphere.iam import InstanceProfile, Role +from troposphere.iam import InstanceProfile, Role, PolicyType, Policy from troposphere.logs import LogGroup from troposphere.policies import (AutoScalingRollingUpdate, CreationPolicy, ResourceSignal) from troposphere.rds import DBSubnetGroup +from troposphere.awslambda import Function, Code, Permission from troposphere.servicediscovery import PrivateDnsNamespace - from cloudlift.config import DecimalEncoder from cloudlift.config import get_client_for, get_region_for_environment from cloudlift.deployment.template_generator import TemplateGenerator from cloudlift.version import VERSION +from troposphere.sns import Subscription, Topic, SubscriptionResource +from awacs.aws import Allow, Statement, Principal, PolicyDocument +from awacs.sts import AssumeRole class ClusterTemplateGenerator(TemplateGenerator): @@ -321,6 +325,7 @@ def _add_cluster(self): cluster = Cluster('Cluster', ClusterName=Ref('AWS::StackName')) self.template.add_resource(cluster) self._add_ec2_auto_scaling() + self._add_instance_draining(cluster) self._add_cluster_alarms(cluster) return cluster @@ -399,6 +404,27 @@ def _add_cluster_alarms(self, cluster): ) self.template.add_resource( self.cluster_high_memory_reservation_autoscale_alarm) + self.cluster_low_memory_reservation_autoscale_alarm = Alarm( + 'ClusterLowMemoryReservationAlarm', + EvaluationPeriods=1, + Dimensions=[ + MetricDimension(Name='ClusterName', Value=Ref(cluster)) + ], + AlarmActions=[ + Ref(self.cluster_down_scaling_policy) + ], + AlarmDescription='Alarm if memory reservation is below 60% \ +for cluster.', + Namespace='AWS/ECS', + Period=300, + ComparisonOperator='LessThanThreshold', + Statistic='Average', + Threshold='60', + MetricName='MemoryReservation' + ) + self.template.add_resource( + self.cluster_low_memory_reservation_autoscale_alarm) + self.cluster_high_memory_reservation_user_notification_alarm = Alarm( 'ClusterHighMemoryReservationUserNotifcationAlarm', EvaluationPeriods=3, @@ -567,6 +593,115 @@ def _add_ec2_auto_scaling(self): ScalingAdjustment=1 ) self.template.add_resource(self.cluster_scaling_policy) + self.cluster_down_scaling_policy = ScalingPolicy( + 'AutoDownScalingPolicy', + AdjustmentType='ChangeInCapacity', + AutoScalingGroupName=Ref(self.auto_scaling_group), + Cooldown=300, + PolicyType='SimpleScaling', + ScalingAdjustment=-1 + ) + self.template.add_resource(self.cluster_down_scaling_policy) + + def _add_instance_draining(self, cluster): + self.sns_asg_role = Role( + "SNSASGRole", + AssumeRolePolicyDocument=PolicyDocument( + Statement=[ + Statement( + Effect=Allow, + Action=[AssumeRole], + Principal=Principal("Service", ["autoscaling.amazonaws.com"]) + ) + ] + ), + ManagedPolicyArns=["arn:aws:iam::aws:policy/service-role/AutoScalingNotificationAccessRole"] + ) + self.template.add_resource(self.sns_asg_role) + self.lambda_execution_role = Role( + "LambdaExecutionRole", + Policies=[Policy( + PolicyName="lambda-inline", + PolicyDocument={ + "Version": "2012-10-17", + "Statement": [{ + "Effect": "Allow", + "Action": [ + "autoscaling:CompleteLifecycleAction", + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:PutLogEvents", + "ecs:ListContainerInstances", + "ecs:DescribeContainerInstances", + "ecs:UpdateContainerInstancesState", + "sns:Publish" + ], + "Resource": "*" + }], + } + )], + AssumeRolePolicyDocument=PolicyDocument( + Statement=[ + Statement( + Effect=Allow, + Action=[AssumeRole], + Principal=Principal("Service", ["lambda.amazonaws.com"]) + ) + ] + ), + ManagedPolicyArns=["arn:aws:iam::aws:policy/service-role/AutoScalingNotificationAccessRole"] + ) + self.template.add_resource(self.lambda_execution_role) + with open (str(pathlib.Path(__file__).parent.absolute())+"/ecs_instance_draining_lambda.py", "r") as ecs_instance_draining_lambda: + lambda_code=ecs_instance_draining_lambda.readlines() + self.lambda_function_for_asg = Function( + "ECSInstanceDraining", + Handler="index.lambda_handler", + Description="Drain ECS instance", + Role=GetAtt(self.lambda_execution_role, "Arn"), + Runtime="python3.6", + MemorySize=128, + Timeout=300, + Code=Code( + ZipFile=Join("", lambda_code) + ) + ) + self.template.add_resource(self.lambda_function_for_asg) + self.asg_sns_topic = Topic( + "ASGSNSTopic", + TopicName=Join("-", [Ref(cluster),"Scaling"]), + Subscription=[Subscription( + Protocol="lambda", + Endpoint=GetAtt(self.lambda_function_for_asg, "Arn") + )] + ) + self.template.add_resource(self.asg_sns_topic) + self.lambda_invoke_permission = Permission( + "LambdaInvokePermission", + FunctionName=Ref(self.lambda_function_for_asg), + Action="lambda:InvokeFunction", + Principal="sns.amazonaws.com", + SourceArn=Ref(self.asg_sns_topic) + ) + self.template.add_resource(self.lambda_invoke_permission) + self.lambda_subscription_to_sns_topic = SubscriptionResource( + "LambdaSubscriptionToSNSTopic", + Protocol="lambda", + Endpoint=GetAtt(self.lambda_function_for_asg, "Arn"), + TopicArn=Ref(self.asg_sns_topic) + ) + self.template.add_resource(self.lambda_subscription_to_sns_topic) + self.asg_lifecycle_hook=LifecycleHook( + "ASGLifecycleHook", + AutoScalingGroupName=Ref(self.auto_scaling_group), + DefaultResult="ABANDON", + LifecycleHookName=Join("-", [Ref(cluster),"ASG-Hook"]), + LifecycleTransition="autoscaling:EC2_INSTANCE_TERMINATING", + NotificationMetadata=Ref(cluster), + NotificationTargetARN=Ref(self.asg_sns_topic), + RoleARN=GetAtt(self.sns_asg_role, "Arn"), + ) + self.template.add_resource(self.asg_lifecycle_hook) def _add_cluster_parameters(self): self.template.add_parameter(Parameter( @@ -688,7 +823,7 @@ def _add_cluster_outputs(self): def _add_metadata(self): - self.template.add_metadata({ + self.template.set_metadata({ 'AWS::CloudFormation::Interface': { 'ParameterGroups': [ { @@ -706,7 +841,7 @@ def _add_metadata(self): 'Subnet2', 'NotificationSnsArn' ] - }, + } ], 'ParameterLabels': { 'Environment': { @@ -724,7 +859,7 @@ def _add_metadata(self): 'default': 'Min. no. of instances in cluster' }, 'NotificationSnsArn': { - 'default': 'The SNS topic to which notifactions has to be triggered' + 'default': 'The SNS topic to which notifications has to be triggered' }, 'Subnet1': { 'default': 'Enter the ID of the 1st subnet' @@ -734,7 +869,7 @@ def _add_metadata(self): }, 'VPC': { 'default': 'Enter the VPC in which you want the environment to be setup' - }, + } } } }) diff --git a/cloudlift/deployment/ecs_instance_draining_lambda.py b/cloudlift/deployment/ecs_instance_draining_lambda.py new file mode 100644 index 00000000..4197ed5a --- /dev/null +++ b/cloudlift/deployment/ecs_instance_draining_lambda.py @@ -0,0 +1,70 @@ +import json +import time +import boto3 +import os + +ECS = boto3.client('ecs') +ASG = boto3.client('autoscaling') +SNS = boto3.client('sns') +CW = boto3.client('cloudwatch') + + +def find_ecs_instance_info(instance_id, cluster_name): + paginator = ECS.get_paginator('list_container_instances') + for list_resp in paginator.paginate(cluster=cluster_name): + arns = list_resp['containerInstanceArns'] + desc_resp = ECS.describe_container_instances(cluster=cluster_name, + containerInstances=arns) + for container_instance in desc_resp['containerInstances']: + if container_instance['ec2InstanceId'] != instance_id: + continue + print('Found instance: id=%s, arn=%s, status=%s, runningTasksCount=%s' % + (instance_id, container_instance['containerInstanceArn'], + container_instance['status'], container_instance['runningTasksCount'])) + return (container_instance['containerInstanceArn'], + container_instance['status'], container_instance['runningTasksCount']) + return None, None, 0 + + +def instance_has_running_tasks(instance_id, cluster_name): + (instance_arn, container_status, running_tasks) = find_ecs_instance_info( + instance_id, cluster_name) + if instance_arn is None: + print('Could not find instance ID %s. Letting autoscaling kill the instance.' % + (instance_id)) + return False + if container_status != 'DRAINING': + print('Setting container instance %s (%s) to DRAINING' % + (instance_id, instance_arn)) + ECS.update_container_instances_state(cluster=cluster_name, + containerInstances=[instance_arn], + status='DRAINING') + return running_tasks > 0 + + +def lambda_handler(event, context): + msg = json.loads(event['Records'][0]['Sns']['Message']) + print("Event: ", msg) + if 'LifecycleTransition' not in msg.keys() or \ + msg['LifecycleTransition'].find('autoscaling:EC2_INSTANCE_TERMINATING') == -1: + print('Exiting since the lifecycle transition is not EC2_INSTANCE_TERMINATING.') + return + if instance_has_running_tasks(msg['EC2InstanceId'], msg['NotificationMetadata']): + print('Tasks are still running on instance %s; posting msg to SNS topic %s' % + (msg['EC2InstanceId'], event['Records'][0]['Sns']['TopicArn'])) + time.sleep(5) + sns_resp = SNS.publish(TopicArn=event['Records'][0]['Sns']['TopicArn'], + Message=json.dumps(msg), + Subject='Publishing SNS msg to invoke Lambda again.') + print('Posted msg %s to SNS topic.' % (sns_resp['MessageId'])) + else: + print('No tasks are running on instance %s; setting lifecycle to complete' % + (msg['EC2InstanceId'])) + ASG.complete_lifecycle_action(LifecycleHookName=msg['LifecycleHookName'], + AutoScalingGroupName=msg['AutoScalingGroupName'], + LifecycleActionResult='CONTINUE', + InstanceId=msg['EC2InstanceId']) + if msg['NotificationMetadata'] == 'cluster-production': + alarm_name = 'ecs_agent_alarm_' + msg['EC2InstanceId'] + response = CW.delete_alarms(AlarmNames=[alarm_name]) + print('Alarm %s deleted' % alarm_name) diff --git a/cloudlift/version/__init__.py b/cloudlift/version/__init__.py index a8c40b90..18350283 100644 --- a/cloudlift/version/__init__.py +++ b/cloudlift/version/__init__.py @@ -1,2 +1,2 @@ -VERSION = '1.5.1' +VERSION = '1.5.2' diff --git a/requirements.txt b/requirements.txt index c5b29575..e62e7337 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ boto3>=1.9.89 -awscli>=1.19.105 +awscliv2>=2.1.1 certifi==2017.7.27.1 cfn-flip==1.0.3 chardet==3.0.4 @@ -19,5 +19,5 @@ requests>=2.20.0 six==1.10.0 stringcase==1.0.6 terminaltables==3.1.0 -troposphere>=2.6.4 +troposphere>=2.7.1 awacs==2.0.1