From b4c52319c6f84c2c870a3077ca0136a328978586 Mon Sep 17 00:00:00 2001 From: Lance Edgar Date: Sun, 7 Oct 2018 18:47:02 -0500 Subject: [PATCH] Make client more tolerant of database restart note that a retry is *not* attempted within a given "take readings" run. rather, client will consider that full readings take to have failed, if any part of it fails. but then we keep track of type/amount of some (database connection) failures, and will suppress logging the full error for first 3 attempts. in practice this lets us recover from simple database restarts, and if database becomes truly unavailable we'll hear about it shortly. any other type of error is immediately logged on first failure. --- rattail_tempmon/client.py | 51 ++++++++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/rattail_tempmon/client.py b/rattail_tempmon/client.py index c3d8b63..2b154cb 100644 --- a/rattail_tempmon/client.py +++ b/rattail_tempmon/client.py @@ -1,8 +1,8 @@ -# -*- coding: utf-8 -*- +# -*- coding: utf-8; -*- ################################################################################ # # Rattail -- Retail Software Framework -# Copyright © 2010-2017 Lance Edgar +# Copyright © 2010-2018 Lance Edgar # # This file is part of Rattail. # @@ -32,6 +32,8 @@ import random import socket import logging +import six +from sqlalchemy.exc import OperationalError from sqlalchemy.orm.exc import NoResultFound from rattail.daemon import Daemon @@ -71,6 +73,7 @@ class TempmonClient(Daemon): session.close() # main loop: take readings, pause, repeat + self.failed_checks = 0 while True: self.take_readings(client_uuid) time.sleep(self.delay) @@ -79,6 +82,7 @@ class TempmonClient(Daemon): """ Take new readings for all enabled probes on this client. """ + # log.debug("taking readings") session = Session() try: @@ -87,20 +91,43 @@ class TempmonClient(Daemon): if client.enabled: for probe in client.enabled_probes(): self.take_reading(session, probe) + session.flush() - except: - log.exception("Failed to read/record temperature data (but will keep trying)") - session.rollback() - - else: - # make sure we show as being online + # one more thing, make sure our client appears "online" if not client.online: client.online = True - try: - session.commit() - except: + + except Exception as error: + log_error = True + self.failed_checks += 1 + session.rollback() + + # our goal here is to suppress logging when we see connection + # errors which are due to a simple postgres restart. but if they + # keep coming then we'll go ahead and log them (sending email) + if isinstance(error, OperationalError): + + # this first test works upon first DB restart, as well as the + # first time after DB stop. but in the case of DB stop, + # subsequent errors will instead match the second test + if error.connection_invalidated or ( + 'could not connect to server: Connection refused' in six.text_type(error)): + + # only suppress logging for 3 failures, after that we let them go + # TODO: should make the max attempts configurable + if self.failed_checks < 4: + log_error = False + log.debug("database connection failure #%s: %s", + self.failed_checks, + six.text_type(error)) + + # send error email unless we're suppressing it for now + if log_error: log.exception("Failed to read/record temperature data (but will keep trying)") - session.rollback() + + else: # taking readings was successful + self.failed_checks = 0 + session.commit() finally: session.close()