X-Git-Url: http://git.squeep.com/?p=awsible;a=blobdiff_plain;f=sqs-action.py;h=7cad07910214cb4f3fb0cf85420a29fa55f939ee;hp=a71d01d003620f8817793737bf6372d61758728c;hb=HEAD;hpb=93faadc586acb82d42c7af9ea12951074c33dd87 diff --git a/sqs-action.py b/sqs-action.py index a71d01d..7cad079 100755 --- a/sqs-action.py +++ b/sqs-action.py @@ -7,9 +7,11 @@ and run the appropriate Ansible playbook against the host. import argparse import logging import boto3 +import botocore.exceptions import json import sys import os +import time import errno from subprocess import Popen, PIPE from tempfile import gettempdir @@ -29,9 +31,10 @@ def notify(subj, msg): def handleEvent(message, event, ASGName, InstanceId): notice = [' '.join([ASGName, InstanceId, event])] + postnotice = [] if os.path.isfile(os.path.join(args.playbooks, ASGName + '.yml')): message.change_visibility(VisibilityTimeout=(60 * 15)) # hope config doesn't take more than 15m - cmd = [ ANSIBLE_PLAYBOOK_CMD, '-i', 'inventory', '--limit', InstanceId, ASGName + '.yml'] + cmd = [ANSIBLE_PLAYBOOK_CMD, '--limit', InstanceId, ASGName + '.yml'] p = Popen(cmd, cwd=args.playbooks, stdout=PIPE, stderr=PIPE) (stdoutdata, stderrdata) = p.communicate() retval = p.returncode @@ -41,10 +44,23 @@ def handleEvent(message, event, ASGName, InstanceId): else: notice += ['SUCCESS'] message.delete() + if os.path.isfile(os.path.join(args.playbooks, ASGName + '-post.yml')): + postnotice = [' '.join([ASGName, 'post', event])] + cmd = [ANSIBLE_PLAYBOOK_CMD, ASGName + '-post.yml'] + p = Popen(cmd, cwd=args.playbooks, stdout=PIPE, stderr=PIPE) + (stdoutdata, stderrdata) = p.communicate() + retval = p.returncode + if retval: + postnotice += ['FAILURE CODE {}'.format(retval), stderrdata, stdoutdata] + else: + postnotice += ['SUCCESS'] + else: notice += ['no action taken: no playbook for this ASG'] message.delete() notify(notice[0], '\n'.join(notice)) + if len(postnotice): + notify(postnotice[0], '\n'.join(postnotice)) def processMessage(message): @@ -55,23 +71,23 @@ def processMessage(message): ASGName = data['AutoScalingGroupName'] InstanceId = data['EC2InstanceId'] except: - logging.debug('unparsable message %r', message.body) + logging.warning('unparsable message %r', message.body) message.delete() else: if event == 'autoscaling:EC2_INSTANCE_LAUNCH': try: instanceState = ec2r.Instance(InstanceId).state['Name'] except: - logging.debug('instance %s does not exist', InstanceId) + logging.warning('instance %s does not exist', InstanceId) message.change_visibility(VisibilityTimeout=60 * 2) else: if instanceState == 'running': handleEvent(message, event, ASGName, InstanceId) else: - logging.debug('instance %s is in state %s, will try again', InstanceId, instanceState) + logging.warning('instance %s is in state %s, will try again', InstanceId, instanceState) message.change_visibility(VisibilityTimeout=60 * 2) else: - logging.debug('nothing to do for event %r', data) + logging.warning('nothing to do for event %r', data) message.delete() @@ -104,10 +120,23 @@ args = parser.parse_args() pidfile = PidFileSingleton() -session = boto3.session.Session(**{k:v for k,v in vars(args).items() if k in ('profile_name', 'region_name')}) -queue = session.resource('sqs').get_queue_by_name(QueueName=args.queue) -topic = session.resource('sns').Topic(args.arn) if args.arn else None -ec2r = session.resource('ec2') +# occasionally, small instances seem to briefly lose their iam credentials +sessionTriesRemaining = 3 +while sessionTriesRemaining: + try: + session = boto3.session.Session(**{k:v for k,v in vars(args).items() if k in ('profile_name', 'region_name')}) + queue = session.resource('sqs').get_queue_by_name(QueueName=args.queue) + topic = session.resource('sns').Topic(args.arn) if args.arn else None + ec2r = session.resource('ec2') + except botocore.exceptions.NoCredentialsError as e: + logging.debug('Trouble with credentials, will retry %s more times.', sessionTriesRemaining) + sessionTriesRemaining -= 1 + time.sleep(5) + continue + break +if sessionTriesRemaining == 0: + logging.error('Failed trying to use IAM credentials.') + sys.exit(1) while True: # long poll until there are no more messages