add a basic cw alarm
[awsible] / sqs-action.py
index f092fd76b99dcb8d8c43abcf647933fee61bea5d..7cad07910214cb4f3fb0cf85420a29fa55f939ee 100755 (executable)
@@ -7,9 +7,11 @@ and run the appropriate Ansible playbook against the host.
 import argparse
 import logging
 import boto3
+import botocore.exceptions
 import json
 import sys
 import os
+import time
 import errno
 from subprocess import Popen, PIPE
 from tempfile import gettempdir
@@ -29,9 +31,10 @@ def notify(subj, msg):
 
 def handleEvent(message, event, ASGName, InstanceId):
     notice = [' '.join([ASGName, InstanceId, event])]
+    postnotice = []
     if os.path.isfile(os.path.join(args.playbooks, ASGName + '.yml')):
         message.change_visibility(VisibilityTimeout=(60 * 15)) # hope config doesn't take more than 15m
-        cmd = [ ANSIBLE_PLAYBOOK_CMD, '-i', 'inventory', '--limit', InstanceId, ASGName + '.yml']
+        cmd = [ANSIBLE_PLAYBOOK_CMD, '--limit', InstanceId, ASGName + '.yml']
         p = Popen(cmd, cwd=args.playbooks, stdout=PIPE, stderr=PIPE)
         (stdoutdata, stderrdata) = p.communicate()
         retval = p.returncode
@@ -41,9 +44,23 @@ def handleEvent(message, event, ASGName, InstanceId):
         else:
             notice += ['SUCCESS']
             message.delete()
+            if os.path.isfile(os.path.join(args.playbooks, ASGName + '-post.yml')):
+                postnotice = [' '.join([ASGName, 'post', event])]
+                cmd = [ANSIBLE_PLAYBOOK_CMD, ASGName + '-post.yml']
+                p = Popen(cmd, cwd=args.playbooks, stdout=PIPE, stderr=PIPE)
+                (stdoutdata, stderrdata) = p.communicate()
+                retval = p.returncode
+                if retval:
+                    postnotice += ['FAILURE CODE {}'.format(retval), stderrdata, stdoutdata]
+                else:
+                    postnotice += ['SUCCESS']
+
     else:
         notice += ['no action taken: no playbook for this ASG']
+        message.delete()
     notify(notice[0], '\n'.join(notice))
+    if len(postnotice):
+        notify(postnotice[0], '\n'.join(postnotice))
 
 
 def processMessage(message):
@@ -54,23 +71,23 @@ def processMessage(message):
         ASGName = data['AutoScalingGroupName']
         InstanceId = data['EC2InstanceId']
     except:
-        logging.debug('unparsable message %r', message.body)
+        logging.warning('unparsable message %r', message.body)
         message.delete()
     else:
         if event == 'autoscaling:EC2_INSTANCE_LAUNCH':
             try:
                 instanceState = ec2r.Instance(InstanceId).state['Name']
             except:
-                logging.debug('instance %s does not exist', InstanceId)
+                logging.warning('instance %s does not exist', InstanceId)
                 message.change_visibility(VisibilityTimeout=60 * 2)
             else:
                 if instanceState == 'running':
                     handleEvent(message, event, ASGName, InstanceId)
                 else:
-                    logging.debug('instance %s is in state %s, will try again', InstanceId, instanceState)
+                    logging.warning('instance %s is in state %s, will try again', InstanceId, instanceState)
                     message.change_visibility(VisibilityTimeout=60 * 2)
         else:
-            logging.debug('nothing to do for event %r', data)
+            logging.warning('nothing to do for event %r', data)
             message.delete()
 
 
@@ -103,10 +120,23 @@ args = parser.parse_args()
 
 pidfile = PidFileSingleton()
 
-session = boto3.session.Session(**{k:v for k,v in vars(args).items() if k in ('profile_name', 'region_name')})
-queue = session.resource('sqs').get_queue_by_name(QueueName=args.queue)
-topic = session.resource('sns').Topic(args.arn) if args.arn else None
-ec2r = session.resource('ec2')
+# Occasionally, small instances seem to briefly lose their IAM credentials, so session setup is retried (3 attempts, 5s apart) before giving up.
+sessionTriesRemaining = 3
+while sessionTriesRemaining:
+    try:
+        session = boto3.session.Session(**{k:v for k,v in vars(args).items() if k in ('profile_name', 'region_name')})
+        queue = session.resource('sqs').get_queue_by_name(QueueName=args.queue)
+        topic = session.resource('sns').Topic(args.arn) if args.arn else None
+        ec2r = session.resource('ec2')
+    except botocore.exceptions.NoCredentialsError:
+        sessionTriesRemaining -= 1  # decrement before logging so the message reports the retries actually left
+        logging.debug('Trouble with credentials, will retry %s more times.', sessionTriesRemaining)
+        time.sleep(5)
+        continue
+    break  # every resource was created without error; stop retrying
+if sessionTriesRemaining == 0:  # only reachable when every attempt raised NoCredentialsError
+    logging.error('Failed trying to use IAM credentials.')
+    sys.exit(1)
 
 while True:
     # long poll until there are no more messages