From 2f7fa3430aae4affbec3dbb11a3d2f00dfa2d44e Mon Sep 17 00:00:00 2001
From: Lance Edgar
Date: Sun, 7 Oct 2018 18:16:18 -0500
Subject: [PATCH] Make server more tolerant of database restart

Note that a retry is *not* attempted within a given "check readings"
run; rather, the server considers the full readings check to have
failed if any part of it fails.  However, we keep track of the type and
number of certain failures (database connection errors), and suppress
logging of the full error for the first 3 attempts.  In practice this
lets us recover from a simple database restart, and if the database
becomes truly unavailable we'll hear about it shortly.  Any other type
of error is logged immediately, on the first failure.
---
 rattail_tempmon/server.py | 46 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 40 insertions(+), 6 deletions(-)

diff --git a/rattail_tempmon/server.py b/rattail_tempmon/server.py
index 64daa78..82851ac 100644
--- a/rattail_tempmon/server.py
+++ b/rattail_tempmon/server.py
@@ -30,7 +30,9 @@
 import time
 import datetime
 import logging
+import six
 import humanize
+from sqlalchemy.exc import OperationalError
 
 from rattail.db import Session, api
 from rattail_tempmon.db import Session as TempmonSession, model as tempmon
@@ -53,13 +55,16 @@ class TempmonServerDaemon(Daemon):
         Keeps an eye on tempmon readings and sends alerts as needed.
         """
         self.extra_emails = self.config.getlist('rattail.tempmon', 'extra_emails', default=[])
+        delay = self.config.getint('rattail.tempmon', 'server.delay', default=60)
+        self.failed_checks = 0
+
         while True:
             self.check_readings()
-
-            # TODO: make this configurable
-            time.sleep(60)
+            time.sleep(delay)
 
     def check_readings(self):
+
+        # log.debug("checking readings")
         self.now = make_utc()
         session = TempmonSession()
 
@@ -69,11 +74,40 @@ class TempmonServerDaemon(Daemon):
                             .filter(tempmon.Client.archived == False)
             for client in clients:
                 self.check_readings_for_client(session, client)
-        except:
-            log.exception("Failed to check client probe readings (but will keep trying)")
+            session.flush()
+
+        except Exception as error:
+            log_error = True
+            self.failed_checks += 1
             session.rollback()
-        else:
+
+            # our goal here is to suppress logging when we see connection
+            # errors which are due to a simple postgres restart. but if they
+            # keep coming then we'll go ahead and log them (sending email)
+            if isinstance(error, OperationalError):
+
+                # this first test works upon first DB restart, as well as the
+                # first time after DB stop. but in the case of DB stop,
+                # subsequent errors will instead match the second test
+                if error.connection_invalidated or (
+                        'could not connect to server: Connection refused' in six.text_type(error)):
+
+                    # only suppress logging for 3 failures, after that we let them go
+                    # TODO: should make the max attempts configurable
+                    if self.failed_checks < 4:
+                        log_error = False
+                        log.debug("database connection failure #%s: %s",
+                                  self.failed_checks,
+                                  six.text_type(error))
+
+            # send error email unless we're suppressing it for now
+            if log_error:
+                log.exception("Failed to check client probe readings (but will keep trying)")
+
+        else: # checks were successful
+            self.failed_checks = 0
             session.commit()
+
         finally:
             session.close()
 
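
For reference, the check-readings loop interval now comes from the server.delay
option in the rattail.tempmon config section (seconds, default 60), while the
"first 3 attempts" threshold stays hard-coded as the failed_checks < 4 test
above.  What follows is a condensed, standalone sketch of the suppression
pattern the patch introduces; the run_one_check helper and MAX_QUIET_FAILURES
constant are illustrative names only (not part of the patch), and the sketch
collapses the connection_invalidated / error-message test into a bare
isinstance check.

    import logging

    from sqlalchemy.exc import OperationalError

    log = logging.getLogger(__name__)

    MAX_QUIET_FAILURES = 3   # the patch hard-codes this as "failed_checks < 4"


    def run_one_check(check_readings, failed_checks):
        """
        Run one readings check; return the updated consecutive-failure count.
        """
        try:
            check_readings()
        except Exception as error:
            failed_checks += 1
            if isinstance(error, OperationalError) and failed_checks <= MAX_QUIET_FAILURES:
                # probably just a database restart; stay quiet for now
                log.debug("database connection failure #%s: %s", failed_checks, error)
            else:
                # any other error, or too many connection failures in a row,
                # gets the full traceback (which also triggers the error email)
                log.exception("Failed to check client probe readings (but will keep trying)")
            return failed_checks
        return 0   # a successful check resets the counter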