Make client more tolerant of database restart

Note that a retry is *not* attempted within a given "take readings" run.
Rather, the client will consider the full readings run to have failed if any
part of it fails.

However, we keep track of the type and number of certain (database connection)
failures, and will suppress logging the full error for the first 3 attempts.
In practice this lets us recover from simple database restarts; if the database
becomes truly unavailable, we'll hear about it shortly.

Any other type of error is immediately logged on the first failure.
This commit is contained in:
Lance Edgar 2018-10-07 18:47:02 -05:00
parent 2f7fa3430a
commit b4c52319c6

View file

@ -1,8 +1,8 @@
# -*- coding: utf-8 -*-
# -*- coding: utf-8; -*-
################################################################################
#
# Rattail -- Retail Software Framework
# Copyright © 2010-2017 Lance Edgar
# Copyright © 2010-2018 Lance Edgar
#
# This file is part of Rattail.
#
@ -32,6 +32,8 @@ import random
import socket
import logging
import six
from sqlalchemy.exc import OperationalError
from sqlalchemy.orm.exc import NoResultFound
from rattail.daemon import Daemon
@ -71,6 +73,7 @@ class TempmonClient(Daemon):
session.close()
# main loop: take readings, pause, repeat
self.failed_checks = 0
while True:
self.take_readings(client_uuid)
time.sleep(self.delay)
@ -79,6 +82,7 @@ class TempmonClient(Daemon):
"""
Take new readings for all enabled probes on this client.
"""
# log.debug("taking readings")
session = Session()
try:
@ -87,21 +91,44 @@ class TempmonClient(Daemon):
if client.enabled:
for probe in client.enabled_probes():
self.take_reading(session, probe)
session.flush()
except:
log.exception("Failed to read/record temperature data (but will keep trying)")
session.rollback()
else:
# make sure we show as being online
# one more thing, make sure our client appears "online"
if not client.online:
client.online = True
try:
session.commit()
except:
log.exception("Failed to read/record temperature data (but will keep trying)")
except Exception as error:
log_error = True
self.failed_checks += 1
session.rollback()
# our goal here is to suppress logging when we see connection
# errors which are due to a simple postgres restart. but if they
# keep coming then we'll go ahead and log them (sending email)
if isinstance(error, OperationalError):
# this first test works upon first DB restart, as well as the
# first time after DB stop. but in the case of DB stop,
# subsequent errors will instead match the second test
if error.connection_invalidated or (
'could not connect to server: Connection refused' in six.text_type(error)):
# only suppress logging for 3 failures, after that we let them go
# TODO: should make the max attempts configurable
if self.failed_checks < 4:
log_error = False
log.debug("database connection failure #%s: %s",
self.failed_checks,
six.text_type(error))
# send error email unless we're suppressing it for now
if log_error:
log.exception("Failed to read/record temperature data (but will keep trying)")
else: # taking readings was successful
self.failed_checks = 0
session.commit()
finally:
session.close()