Make server more tolerant of database restart
Note that a retry is *not* attempted within a given "check readings" run; rather, the server will consider the full readings check to have failed if any part of it fails. However, we keep track of the type and count of certain failures (database connection errors), and will suppress logging of the full error for the first 3 attempts. In practice this lets us recover from a simple database restart, while if the database becomes truly unavailable we'll still hear about it shortly. Any other type of error is logged immediately on the first failure.
This commit is contained in:
parent
b4fa6a17c5
commit
2f7fa3430a
|
@ -30,7 +30,9 @@ import time
|
|||
import datetime
|
||||
import logging
|
||||
|
||||
import six
|
||||
import humanize
|
||||
from sqlalchemy.exc import OperationalError
|
||||
|
||||
from rattail.db import Session, api
|
||||
from rattail_tempmon.db import Session as TempmonSession, model as tempmon
|
||||
|
@ -53,13 +55,16 @@ class TempmonServerDaemon(Daemon):
|
|||
Keeps an eye on tempmon readings and sends alerts as needed.
|
||||
"""
|
||||
self.extra_emails = self.config.getlist('rattail.tempmon', 'extra_emails', default=[])
|
||||
delay = self.config.getint('rattail.tempmon', 'server.delay', default=60)
|
||||
self.failed_checks = 0
|
||||
|
||||
while True:
|
||||
self.check_readings()
|
||||
|
||||
# TODO: make this configurable
|
||||
time.sleep(60)
|
||||
time.sleep(delay)
|
||||
|
||||
def check_readings(self):
|
||||
|
||||
# log.debug("checking readings")
|
||||
self.now = make_utc()
|
||||
session = TempmonSession()
|
||||
|
||||
|
@ -69,11 +74,40 @@ class TempmonServerDaemon(Daemon):
|
|||
.filter(tempmon.Client.archived == False)
|
||||
for client in clients:
|
||||
self.check_readings_for_client(session, client)
|
||||
except:
|
||||
log.exception("Failed to check client probe readings (but will keep trying)")
|
||||
session.flush()
|
||||
|
||||
except Exception as error:
|
||||
log_error = True
|
||||
self.failed_checks += 1
|
||||
session.rollback()
|
||||
else:
|
||||
|
||||
# our goal here is to suppress logging when we see connection
|
||||
# errors which are due to a simple postgres restart. but if they
|
||||
# keep coming then we'll go ahead and log them (sending email)
|
||||
if isinstance(error, OperationalError):
|
||||
|
||||
# this first test works upon first DB restart, as well as the
|
||||
# first time after DB stop. but in the case of DB stop,
|
||||
# subsequent errors will instead match the second test
|
||||
if error.connection_invalidated or (
|
||||
'could not connect to server: Connection refused' in six.text_type(error)):
|
||||
|
||||
# only suppress logging for 3 failures, after that we let them go
|
||||
# TODO: should make the max attempts configurable
|
||||
if self.failed_checks < 4:
|
||||
log_error = False
|
||||
log.debug("database connection failure #%s: %s",
|
||||
self.failed_checks,
|
||||
six.text_type(error))
|
||||
|
||||
# send error email unless we're suppressing it for now
|
||||
if log_error:
|
||||
log.exception("Failed to check client probe readings (but will keep trying)")
|
||||
|
||||
else: # checks were successful
|
||||
self.failed_checks = 0
|
||||
session.commit()
|
||||
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
|
|
Loading…
Reference in a new issue