diff --git a/rattail_tempmon/server.py b/rattail_tempmon/server.py index 64daa78..82851ac 100644 --- a/rattail_tempmon/server.py +++ b/rattail_tempmon/server.py @@ -30,7 +30,9 @@ import time import datetime import logging +import six import humanize +from sqlalchemy.exc import OperationalError from rattail.db import Session, api from rattail_tempmon.db import Session as TempmonSession, model as tempmon @@ -53,13 +55,16 @@ class TempmonServerDaemon(Daemon): Keeps an eye on tempmon readings and sends alerts as needed. """ self.extra_emails = self.config.getlist('rattail.tempmon', 'extra_emails', default=[]) + delay = self.config.getint('rattail.tempmon', 'server.delay', default=60) + self.failed_checks = 0 + while True: self.check_readings() - - # TODO: make this configurable - time.sleep(60) + time.sleep(delay) def check_readings(self): + + # log.debug("checking readings") self.now = make_utc() session = TempmonSession() @@ -69,11 +74,40 @@ class TempmonServerDaemon(Daemon): .filter(tempmon.Client.archived == False) for client in clients: self.check_readings_for_client(session, client) - except: - log.exception("Failed to check client probe readings (but will keep trying)") + session.flush() + + except Exception as error: + log_error = True + self.failed_checks += 1 session.rollback() - else: + + # our goal here is to suppress logging when we see connection + # errors which are due to a simple postgres restart. but if they + # keep coming then we'll go ahead and log them (sending email) + if isinstance(error, OperationalError): + + # this first test works upon first DB restart, as well as the + # first time after DB stop. but in the case of DB stop, + # subsequent errors will instead match the second test + if error.connection_invalidated or ( + 'could not connect to server: Connection refused' in six.text_type(error)): + + # only suppress logging for 3 failures, after that we let them go + # TODO: should make the max attempts configurable + if self.failed_checks < 4: + log_error = False + log.debug("database connection failure #%s: %s", + self.failed_checks, + six.text_type(error)) + + # send error email unless we're suppressing it for now + if log_error: + log.exception("Failed to check client probe readings (but will keep trying)") + + else: # checks were successful + self.failed_checks = 0 session.commit() + finally: session.close()