Make server more tolerant of database restart

Note that a retry is *not* attempted within a given "check readings" run. Rather, the server considers the full readings check to have failed if any part of it fails. We then keep track of the number of consecutive failed checks and, for certain (database connection) failures, suppress logging the full error for the first 3 attempts. In practice this lets us recover from simple database restarts, and if the database becomes truly unavailable we'll hear about it shortly. Any other type of error is logged immediately on first failure.
2018-10-07 18:16:18 -05:00 · 2018-10-07 18:16:18 -05:00 · 2f7fa3430a
commit 2f7fa3430a
parent b4fa6a17c5
1 changed files with 40 additions and 6 deletions
--- a/rattail_tempmon/server.py
+++ b/rattail_tempmon/server.py
@ -30,7 +30,9 @@ import time
 import datetime
 import logging

+import six
 import humanize
+from sqlalchemy.exc import OperationalError

 from rattail.db import Session, api
 from rattail_tempmon.db import Session as TempmonSession, model as tempmon
@ -53,13 +55,16 @@ class TempmonServerDaemon(Daemon):
        Keeps an eye on tempmon readings and sends alerts as needed.
        """
        self.extra_emails = self.config.getlist('rattail.tempmon', 'extra_emails', default=[])
+        delay = self.config.getint('rattail.tempmon', 'server.delay', default=60)
+        self.failed_checks = 0
+
        while True:
            self.check_readings()
-
-            # TODO: make this configurable
-            time.sleep(60)
+            time.sleep(delay)

    def check_readings(self):
+
+        # log.debug("checking readings")
        self.now = make_utc()
        session = TempmonSession()

@ -69,11 +74,40 @@ class TempmonServerDaemon(Daemon):
                             .filter(tempmon.Client.archived == False)
            for client in clients:
                self.check_readings_for_client(session, client)
-        except:
-            log.exception("Failed to check client probe readings (but will keep trying)")
+            session.flush()
+
+        except Exception as error:
+            log_error = True
+            self.failed_checks += 1
            session.rollback()
-        else:
+
+            # our goal here is to suppress logging when we see connection
+            # errors which are due to a simple postgres restart.  but if they
+            # keep coming then we'll go ahead and log them (sending email)
+            if isinstance(error, OperationalError):
+
+                # this first test works upon first DB restart, as well as the
+                # first time after DB stop.  but in the case of DB stop,
+                # subsequent errors will instead match the second test
+                if error.connection_invalidated or (
+                        'could not connect to server: Connection refused' in six.text_type(error)):
+
+                    # only suppress logging for 3 failures, after that we let them go
+                    # TODO: should make the max attempts configurable
+                    if self.failed_checks < 4:
+                        log_error = False
+                        log.debug("database connection failure #%s: %s",
+                                  self.failed_checks,
+                                  six.text_type(error))
+
+            # send error email unless we're suppressing it for now
+            if log_error:
+                log.exception("Failed to check client probe readings (but will keep trying)")
+
+        else: # checks were successful
+            self.failed_checks = 0
            session.commit()
+
        finally:
            session.close()