Make server more tolerant of database restart

Note that a retry is *not* attempted within a given "check readings" run.
Rather, the server will consider the full readings check to have failed if
any part of it fails.

However, we now keep a count of consecutive failed checks, and for certain
database connection failures we suppress logging the full error for the first
3 attempts.  In practice this lets us recover from simple database restarts,
and if the database becomes truly unavailable we'll hear about it shortly.

Any other type of error is logged immediately, on the first failure.
This commit is contained in:
Lance Edgar 2018-10-07 18:16:18 -05:00
parent b4fa6a17c5
commit 2f7fa3430a

View file

@ -30,7 +30,9 @@ import time
import datetime
import logging
import six
import humanize
from sqlalchemy.exc import OperationalError
from rattail.db import Session, api
from rattail_tempmon.db import Session as TempmonSession, model as tempmon
@ -53,13 +55,16 @@ class TempmonServerDaemon(Daemon):
Keeps an eye on tempmon readings and sends alerts as needed.
"""
self.extra_emails = self.config.getlist('rattail.tempmon', 'extra_emails', default=[])
delay = self.config.getint('rattail.tempmon', 'server.delay', default=60)
self.failed_checks = 0
while True:
self.check_readings()
# TODO: make this configurable
time.sleep(60)
time.sleep(delay)
def check_readings(self):
# log.debug("checking readings")
self.now = make_utc()
session = TempmonSession()
@ -69,11 +74,40 @@ class TempmonServerDaemon(Daemon):
.filter(tempmon.Client.archived == False)
for client in clients:
self.check_readings_for_client(session, client)
except:
log.exception("Failed to check client probe readings (but will keep trying)")
session.flush()
except Exception as error:
log_error = True
self.failed_checks += 1
session.rollback()
else:
# our goal here is to suppress logging when we see connection
# errors which are due to a simple postgres restart. but if they
# keep coming then we'll go ahead and log them (sending email)
if isinstance(error, OperationalError):
# this first test works upon first DB restart, as well as the
# first time after DB stop. but in the case of DB stop,
# subsequent errors will instead match the second test
if error.connection_invalidated or (
'could not connect to server: Connection refused' in six.text_type(error)):
# only suppress logging for 3 failures, after that we let them go
# TODO: should make the max attempts configurable
if self.failed_checks < 4:
log_error = False
log.debug("database connection failure #%s: %s",
self.failed_checks,
six.text_type(error))
# send error email unless we're suppressing it for now
if log_error:
log.exception("Failed to check client probe readings (but will keep trying)")
else: # checks were successful
self.failed_checks = 0
session.commit()
finally:
session.close()