AWS官方计划内的硬件维护操作通知往往会提前几周发布出来,从而能够让客户有余地选择合适的时间做重启或者迁移等预防性措施,避免硬切换带来的稳定性损失
美中不足的是,如此重要的消息却只是轻描淡写地以红点提示的形式显示在令人眼花缭乱的web控制台上,总有点"偷偷摸摸"的感觉;尽管AWS官方还会发送邮件通知作为兜底,但是这种"上古时代"的非即时消息通知方式很容易被人遗忘和忽略,从而引发服务宕机;因此,希望能够将维护消息接入公司告警系统,广而告之,及时地告知维护人员
大致逻辑:
1.使用CloudWatch Events定期触发Lambda函数,获取EC2/RDS中待维护的实例,将消息发送给SNS
2.内部报警服务订阅sns topic,从而将维护通知通过企业微信发送给相应人员
lambda函数逻辑
import json
import boto3
import re
from functools import wraps
from datetime import datetime
# ARN of the SNS topic that receives the maintenance notices.
TOPICARN = 'arn:aws:sns:us-west-2:88888888:CloudWatchAlarm'

# Timestamp with millisecond precision followed by a literal "+0000" suffix.
# NOTE: this is evaluated once, when the module is imported, not per call.
DATE_NOW = datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + "+0000"

# EC2 scheduled-event codes that lead to an instance restart; used as the
# values of the `event.code` filter.
EC2_EVENT_TYPE_LIST = [
    'instance-reboot',
    'system-reboot',
    'system-maintenance',
    'instance-retirement',
    'instance-stop',
]

# Custom message format consumed by the WeCom (enterprise WeChat) alerting
# service. The collectors fill in the description, reason and dimension value.
ALARM_MSG = {
    "AlarmName": "AWS EC2/RDS 官方维护预告",
    "AWSAccountId": "8888888888",
    "NewStateValue": "ALARM",
    "NewStateReason": "",
    "StateChangeTime": DATE_NOW,
    "Region": "US West (Oregon)",
    "OldStateValue": "OK",
    "Trigger": {
        "MetricName": "aws ec2 maintenance",
        "Namespace": "AWS/Maintenance",
        "StatisticType": "Statistic",
        "Statistic": "",
        "Unit": "",
        "Dimensions": [
            {
                "value": "",
                "name": "InstanceId",
            },
        ],
        "Period": "",
        "EvaluationPeriods": "",
        "ComparisonOperator": "",
        "Threshold": "",
        "TreatMissingData": "",
        "EvaluateLowSampleCountPercentile": "",
    },
}
def send_to_sns(msg):
    """Publish one alarm message to the SNS topic ``TOPICARN``.

    Args:
        msg: Message body, passed unchanged as ``Message`` to ``publish``.

    Returns:
        A status string containing the SNS message id on success, or the
        caught exception object when publishing fails. Returning the
        exception (instead of raising) is the original contract and is kept
        so that one failed publish does not abort the remaining notices.
    """
    import logging  # local import: the module header does not import logging

    # Pin the client to the topic's own region (4th ARN field), consistent
    # with the EC2/RDS clients in this file which also pin their region.
    sns_client = boto3.client('sns', region_name=TOPICARN.split(':')[3])
    try:
        sns_response = sns_client.publish(
            TopicArn=TOPICARN,
            Message=msg,
            Subject='aws maintenance notice',
        )
        return 'Publish to SNS Channel Message Id:{}'.format(sns_response['MessageId'])
    except Exception as e:
        # Callers discard the return value, so without this log line a
        # failed publish would disappear without a trace.
        logging.getLogger(__name__).exception(
            'Failed to publish maintenance notice to %s', TOPICARN
        )
        return e
def alarm_send(func):
    """Decorator that publishes every message produced by ``func`` to SNS.

    ``func`` is called without arguments. When its result is truthy, each
    item of the result is handed to ``send_to_sns``; a falsy result (``None``
    or an empty list) sends nothing. The wrapper itself returns ``None``.
    """
    @wraps(func)
    def _wrapper():
        # `or ()` maps a falsy result onto "nothing to iterate".
        for message in func() or ():
            send_to_sns(message)

    return _wrapper
@alarm_send
def get_ec2_maintenance_notices():
    """Build alarm messages for EC2 instances with pending scheduled events.

    Queries ``describe_instance_status`` in us-west-2, filtered on the event
    codes in ``EC2_EVENT_TYPE_LIST``, and creates one JSON alarm message for
    every event whose description does not contain "Completed".

    Returns:
        A list of JSON-encoded alarm messages, empty when nothing is pending.
        Because of the ``alarm_send`` decorator, calling the decorated name
        publishes these messages and returns ``None``.
    """
    from copy import deepcopy  # local import: not imported at module level

    client = boto3.client('ec2', 'us-west-2')
    # Paginate so results beyond the first response page are not dropped.
    pages = client.get_paginator('describe_instance_status').paginate(
        Filters=[
            {
                'Name': 'event.code',
                'Values': EC2_EVENT_TYPE_LIST,
            },
        ],
    )

    # The module-level DATE_NOW is frozen at import time; take a fresh
    # timestamp in the same format for this invocation.
    state_change_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + "+0000"

    alarm_msg_list = []
    for page in pages:
        for status in page['InstanceStatuses']:
            instance_id = status['InstanceId']
            # Check every event of the instance, not only the first one: a
            # completed event may be listed ahead of a still-pending one.
            for event in status.get('Events', []):
                description = event.get('Description', '')
                # Skip maintenance events that have already been carried out.
                if 'Completed' in description:
                    continue
                # Work on a private copy so the shared template is never
                # mutated between messages or between invocations.
                message = deepcopy(ALARM_MSG)
                message["StateChangeTime"] = state_change_time
                message["AlarmDescription"] = instance_id + " will under maintenance"
                message["NewStateReason"] = description
                message["Trigger"]["Dimensions"][0]['value'] = instance_id
                alarm_msg_list.append(json.dumps(message))
    return alarm_msg_list
@alarm_send
def get_rds_maintenance_notices():
    """Build alarm messages for RDS resources with pending maintenance.

    Queries ``describe_pending_maintenance_actions`` in us-west-2 and creates
    one JSON alarm message for every pending action whose ``Action`` value
    contains "maintenance".

    Returns:
        A list of JSON-encoded alarm messages, empty when nothing is pending.
        Because of the ``alarm_send`` decorator, calling the decorated name
        publishes these messages and returns ``None``.
    """
    from copy import deepcopy  # local import: not imported at module level

    client = boto3.client('rds', 'us-west-2')
    # Paginate so results beyond the first response page are not dropped.
    pages = client.get_paginator('describe_pending_maintenance_actions').paginate()

    # The module-level DATE_NOW is frozen at import time; take a fresh
    # timestamp in the same format for this invocation.
    state_change_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + "+0000"

    alarm_msg_list = []
    for page in pages:
        for resource in page['PendingMaintenanceActions']:
            resource_id = resource['ResourceIdentifier']
            # Check every pending action of the resource, not only the first.
            for detail in resource.get('PendingMaintenanceActionDetails', []):
                if 'maintenance' not in detail.get('Action', ''):
                    continue
                # Work on a private copy: writing the RDS alarm name into the
                # shared template would leak it into later EC2 notices when
                # the module stays loaded between invocations.
                message = deepcopy(ALARM_MSG)
                message["AlarmName"] = "AWS RDS 维护通知"
                message["StateChangeTime"] = state_change_time
                message["AlarmDescription"] = resource_id + " will under maintenance"
                # Read defensively so a detail without a description does
                # not abort the whole run with a KeyError.
                message["NewStateReason"] = detail.get('Description', '')
                message["Trigger"]["Dimensions"][0]['value'] = resource_id
                alarm_msg_list.append(json.dumps(message))
    return alarm_msg_list
def lambda_handler(event, context):
    """Lambda entry point: run the EC2 collector, then the RDS collector.

    ``event`` and ``context`` are accepted to match the handler signature
    but are not used. Returns ``None``.
    """
    collectors = (get_ec2_maintenance_notices, get_rds_maintenance_notices)
    for collect in collectors:
        collect()
文章均为原创,关注公众号获取更多知识