Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 143 additions & 8 deletions cloudlift/deployment/cluster_template_generator.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,33 @@
import json
import re
import pathlib

from cfn_flip import to_yaml
from stringcase import camelcase, pascalcase
from troposphere import (Base64, FindInMap, Output, Parameter, Ref, Sub,
cloudformation, Export, GetAtt, Tags)
cloudformation, Export, GetAtt, Join, Tags)
from troposphere.autoscaling import (AutoScalingGroup, LaunchConfiguration,
ScalingPolicy)
ScalingPolicy, LifecycleHook)
from troposphere.cloudwatch import Alarm, MetricDimension
from troposphere.ec2 import (VPC, InternetGateway, NatGateway, Route,
RouteTable, SecurityGroup, Subnet, SecurityGroupIngress,
SubnetRouteTableAssociation, VPCGatewayAttachment)
from troposphere.ecs import Cluster
from troposphere.elasticache import SubnetGroup as ElastiCacheSubnetGroup
from troposphere.iam import InstanceProfile, Role
from troposphere.iam import InstanceProfile, Role, PolicyType, Policy
from troposphere.logs import LogGroup
from troposphere.policies import (AutoScalingRollingUpdate, CreationPolicy,
ResourceSignal)
from troposphere.rds import DBSubnetGroup
from troposphere.awslambda import Function, Code, Permission
from troposphere.servicediscovery import PrivateDnsNamespace

from cloudlift.config import DecimalEncoder
from cloudlift.config import get_client_for, get_region_for_environment
from cloudlift.deployment.template_generator import TemplateGenerator
from cloudlift.version import VERSION
from troposphere.sns import Subscription, Topic, SubscriptionResource
from awacs.aws import Allow, Statement, Principal, PolicyDocument
from awacs.sts import AssumeRole


class ClusterTemplateGenerator(TemplateGenerator):
Expand Down Expand Up @@ -321,6 +325,7 @@ def _add_cluster(self):
cluster = Cluster('Cluster', ClusterName=Ref('AWS::StackName'))
self.template.add_resource(cluster)
self._add_ec2_auto_scaling()
self._add_instance_draining(cluster)
self._add_cluster_alarms(cluster)
return cluster

Expand Down Expand Up @@ -399,6 +404,27 @@ def _add_cluster_alarms(self, cluster):
)
self.template.add_resource(
self.cluster_high_memory_reservation_autoscale_alarm)
self.cluster_low_memory_reservation_autoscale_alarm = Alarm(
'ClusterLowMemoryReservationAlarm',
EvaluationPeriods=1,
Dimensions=[
MetricDimension(Name='ClusterName', Value=Ref(cluster))
],
AlarmActions=[
Ref(self.cluster_down_scaling_policy)
],
AlarmDescription='Alarm if memory reservation is below 60% \
for cluster.',
Namespace='AWS/ECS',
Period=300,
ComparisonOperator='LessThanThreshold',
Statistic='Average',
Threshold='60',
MetricName='MemoryReservation'
)
self.template.add_resource(
self.cluster_low_memory_reservation_autoscale_alarm)

self.cluster_high_memory_reservation_user_notification_alarm = Alarm(
'ClusterHighMemoryReservationUserNotifcationAlarm',
EvaluationPeriods=3,
Expand Down Expand Up @@ -567,6 +593,115 @@ def _add_ec2_auto_scaling(self):
ScalingAdjustment=1
)
self.template.add_resource(self.cluster_scaling_policy)
self.cluster_down_scaling_policy = ScalingPolicy(
'AutoDownScalingPolicy',
AdjustmentType='ChangeInCapacity',
AutoScalingGroupName=Ref(self.auto_scaling_group),
Cooldown=300,
PolicyType='SimpleScaling',
ScalingAdjustment=-1
)
self.template.add_resource(self.cluster_down_scaling_policy)

def _add_instance_draining(self, cluster):
self.sns_asg_role = Role(
"SNSASGRole",
AssumeRolePolicyDocument=PolicyDocument(
Statement=[
Statement(
Effect=Allow,
Action=[AssumeRole],
Principal=Principal("Service", ["autoscaling.amazonaws.com"])
)
]
),
ManagedPolicyArns=["arn:aws:iam::aws:policy/service-role/AutoScalingNotificationAccessRole"]
)
self.template.add_resource(self.sns_asg_role)
self.lambda_execution_role = Role(
"LambdaExecutionRole",
Policies=[Policy(
PolicyName="lambda-inline",
PolicyDocument={
"Version": "2012-10-17",
"Statement": [{
"Effect": "Allow",
"Action": [
"autoscaling:CompleteLifecycleAction",
"logs:CreateLogGroup",
"logs:CreateLogStream",
"logs:PutLogEvents",
"ecs:ListContainerInstances",
"ecs:DescribeContainerInstances",
"ecs:UpdateContainerInstancesState",
"sns:Publish"
],
"Resource": "*"
}],
}
)],
AssumeRolePolicyDocument=PolicyDocument(
Statement=[
Statement(
Effect=Allow,
Action=[AssumeRole],
Principal=Principal("Service", ["lambda.amazonaws.com"])
)
]
),
ManagedPolicyArns=["arn:aws:iam::aws:policy/service-role/AutoScalingNotificationAccessRole"]
)
self.template.add_resource(self.lambda_execution_role)
with open (str(pathlib.Path(__file__).parent.absolute())+"/ecs_instance_draining_lambda.py", "r") as ecs_instance_draining_lambda:
lambda_code=ecs_instance_draining_lambda.readlines()
self.lambda_function_for_asg = Function(
"ECSInstanceDraining",
Handler="index.lambda_handler",
Description="Drain ECS instance",
Role=GetAtt(self.lambda_execution_role, "Arn"),
Runtime="python3.6",
MemorySize=128,
Timeout=300,
Code=Code(
ZipFile=Join("", lambda_code)
)
)
self.template.add_resource(self.lambda_function_for_asg)
self.asg_sns_topic = Topic(
"ASGSNSTopic",
TopicName=Join("-", [Ref(cluster),"Scaling"]),
Subscription=[Subscription(
Protocol="lambda",
Endpoint=GetAtt(self.lambda_function_for_asg, "Arn")
)]
)
self.template.add_resource(self.asg_sns_topic)
self.lambda_invoke_permission = Permission(
"LambdaInvokePermission",
FunctionName=Ref(self.lambda_function_for_asg),
Action="lambda:InvokeFunction",
Principal="sns.amazonaws.com",
SourceArn=Ref(self.asg_sns_topic)
)
self.template.add_resource(self.lambda_invoke_permission)
self.lambda_subscription_to_sns_topic = SubscriptionResource(
"LambdaSubscriptionToSNSTopic",
Protocol="lambda",
Endpoint=GetAtt(self.lambda_function_for_asg, "Arn"),
TopicArn=Ref(self.asg_sns_topic)
)
self.template.add_resource(self.lambda_subscription_to_sns_topic)
self.asg_lifecycle_hook=LifecycleHook(
"ASGLifecycleHook",
AutoScalingGroupName=Ref(self.auto_scaling_group),
DefaultResult="ABANDON",
LifecycleHookName=Join("-", [Ref(cluster),"ASG-Hook"]),
LifecycleTransition="autoscaling:EC2_INSTANCE_TERMINATING",
NotificationMetadata=Ref(cluster),
NotificationTargetARN=Ref(self.asg_sns_topic),
RoleARN=GetAtt(self.sns_asg_role, "Arn"),
)
self.template.add_resource(self.asg_lifecycle_hook)

def _add_cluster_parameters(self):
self.template.add_parameter(Parameter(
Expand Down Expand Up @@ -688,7 +823,7 @@ def _add_cluster_outputs(self):


def _add_metadata(self):
self.template.add_metadata({
self.template.set_metadata({
'AWS::CloudFormation::Interface': {
'ParameterGroups': [
{
Expand All @@ -706,7 +841,7 @@ def _add_metadata(self):
'Subnet2',
'NotificationSnsArn'
]
},
}
],
'ParameterLabels': {
'Environment': {
Expand All @@ -724,7 +859,7 @@ def _add_metadata(self):
'default': 'Min. no. of instances in cluster'
},
'NotificationSnsArn': {
'default': 'The SNS topic to which notifactions has to be triggered'
'default': 'The SNS topic to which notifications has to be triggered'
},
'Subnet1': {
'default': 'Enter the ID of the 1st subnet'
Expand All @@ -734,7 +869,7 @@ def _add_metadata(self):
},
'VPC': {
'default': 'Enter the VPC in which you want the environment to be setup'
},
}
}
}
})
70 changes: 70 additions & 0 deletions cloudlift/deployment/ecs_instance_draining_lambda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import json
import time
import boto3
import os

ECS = boto3.client('ecs')
ASG = boto3.client('autoscaling')
SNS = boto3.client('sns')
CW = boto3.client('cloudwatch')


def find_ecs_instance_info(instance_id, cluster_name):
paginator = ECS.get_paginator('list_container_instances')
for list_resp in paginator.paginate(cluster=cluster_name):
arns = list_resp['containerInstanceArns']
desc_resp = ECS.describe_container_instances(cluster=cluster_name,
containerInstances=arns)
for container_instance in desc_resp['containerInstances']:
if container_instance['ec2InstanceId'] != instance_id:
continue
print('Found instance: id=%s, arn=%s, status=%s, runningTasksCount=%s' %
(instance_id, container_instance['containerInstanceArn'],
container_instance['status'], container_instance['runningTasksCount']))
return (container_instance['containerInstanceArn'],
container_instance['status'], container_instance['runningTasksCount'])
return None, None, 0


def instance_has_running_tasks(instance_id, cluster_name):
(instance_arn, container_status, running_tasks) = find_ecs_instance_info(
instance_id, cluster_name)
if instance_arn is None:
print('Could not find instance ID %s. Letting autoscaling kill the instance.' %
(instance_id))
return False
if container_status != 'DRAINING':
print('Setting container instance %s (%s) to DRAINING' %
(instance_id, instance_arn))
ECS.update_container_instances_state(cluster=cluster_name,
containerInstances=[instance_arn],
status='DRAINING')
return running_tasks > 0


def lambda_handler(event, context):
msg = json.loads(event['Records'][0]['Sns']['Message'])
print("Event: ", msg)
if 'LifecycleTransition' not in msg.keys() or \
msg['LifecycleTransition'].find('autoscaling:EC2_INSTANCE_TERMINATING') == -1:
print('Exiting since the lifecycle transition is not EC2_INSTANCE_TERMINATING.')
return
if instance_has_running_tasks(msg['EC2InstanceId'], msg['NotificationMetadata']):
print('Tasks are still running on instance %s; posting msg to SNS topic %s' %
(msg['EC2InstanceId'], event['Records'][0]['Sns']['TopicArn']))
time.sleep(5)
sns_resp = SNS.publish(TopicArn=event['Records'][0]['Sns']['TopicArn'],
Message=json.dumps(msg),
Subject='Publishing SNS msg to invoke Lambda again.')
print('Posted msg %s to SNS topic.' % (sns_resp['MessageId']))
else:
print('No tasks are running on instance %s; setting lifecycle to complete' %
(msg['EC2InstanceId']))
ASG.complete_lifecycle_action(LifecycleHookName=msg['LifecycleHookName'],
AutoScalingGroupName=msg['AutoScalingGroupName'],
LifecycleActionResult='CONTINUE',
InstanceId=msg['EC2InstanceId'])
if msg['NotificationMetadata'] == 'cluster-production':
alarm_name = 'ecs_agent_alarm_' + msg['EC2InstanceId']
response = CW.delete_alarms(AlarmNames=[alarm_name])
print('Alarm %s deleted' % alarm_name)
2 changes: 1 addition & 1 deletion cloudlift/version/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
VERSION = '1.5.1'
VERSION = '1.5.2'

4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
boto3>=1.9.89
awscli>=1.19.105
awscliv2>=2.1.1
certifi==2017.7.27.1
cfn-flip==1.0.3
chardet==3.0.4
Expand All @@ -19,5 +19,5 @@ requests>=2.20.0
six==1.10.0
stringcase==1.0.6
terminaltables==3.1.0
troposphere>=2.6.4
troposphere>=2.7.1
awacs==2.0.1