rattail-tempmon/rattail_tempmon/server.py
Lance Edgar 3aa4185de9 Leverage common "now" value when sending emails from server
i.e. don't generate new "now" when sending an email, just use the "now" we
established when starting to check the readings
2018-10-17 19:18:34 -05:00

267 lines
11 KiB
Python

# -*- coding: utf-8; -*-
################################################################################
#
# Rattail -- Retail Software Framework
# Copyright © 2010-2018 Lance Edgar
#
# This file is part of Rattail.
#
# Rattail is free software: you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# Rattail is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# Rattail. If not, see <http://www.gnu.org/licenses/>.
#
################################################################################
"""
Tempmon server daemon
"""
from __future__ import unicode_literals, absolute_import
import time
import datetime
import logging
import six
import humanize
from sqlalchemy import orm
from sqlalchemy.exc import OperationalError
from rattail.db import Session, api
from rattail_tempmon.db import Session as TempmonSession, model as tempmon
from rattail.daemon import Daemon
from rattail.time import localtime, make_utc
from rattail.mail import send_email
log = logging.getLogger(__name__)
class TempmonServerDaemon(Daemon):
"""
Linux daemon implementation of tempmon server.
"""
timefmt = '%Y-%m-%d %H:%M:%S'
def run(self):
"""
Keeps an eye on tempmon readings and sends alerts as needed.
"""
self.extra_emails = self.config.getlist('rattail.tempmon', 'extra_emails', default=[])
delay = self.config.getint('rattail.tempmon', 'server.delay', default=60)
self.failed_checks = 0
while True:
self.check_readings()
time.sleep(delay)
def check_readings(self):
# log.debug("checking readings")
self.now = make_utc()
session = TempmonSession()
try:
clients = session.query(tempmon.Client)\
.filter(tempmon.Client.enabled == True)\
.filter(tempmon.Client.archived == False)
for client in clients:
self.check_readings_for_client(session, client)
session.flush()
except Exception as error:
log_error = True
self.failed_checks += 1
session.rollback()
# our goal here is to suppress logging when we see connection
# errors which are due to a simple postgres restart. but if they
# keep coming then we'll go ahead and log them (sending email)
if isinstance(error, OperationalError):
# this first test works upon first DB restart, as well as the
# first time after DB stop. but in the case of DB stop,
# subsequent errors will instead match the second test
if error.connection_invalidated or (
'could not connect to server: Connection refused' in six.text_type(error)):
# only suppress logging for 3 failures, after that we let them go
# TODO: should make the max attempts configurable
if self.failed_checks < 4:
log_error = False
log.debug("database connection failure #%s: %s",
self.failed_checks,
six.text_type(error))
# send error email unless we're suppressing it for now
if log_error:
log.exception("Failed to check client probe readings (but will keep trying)")
else: # checks were successful
self.failed_checks = 0
session.commit()
finally:
session.close()
def check_readings_for_client(self, session, client):
"""
Check readings for all (enabled) probes for the given client.
"""
# cutoff is calculated as the client delay (i.e. how often it takes
# readings) plus one minute. we "should" have a reading for each probe
# within that time window. if no readings are found we will consider
# the client to be (possibly) offline.
delay = client.delay or 60
cutoff = self.now - datetime.timedelta(seconds=delay + 60)
online = False
for probe in client.enabled_probes():
online = online or bool(self.check_readings_for_probe(session, probe, cutoff))
# if client was previously marked online, but we have no "new"
# readings, then let's look closer to see if it's been long enough to
# mark it offline
if client.online and not online:
# we consider client offline if it has failed to take readings for
# 3 times in a row. allow a one minute buffer for good measure.
cutoff = self.now - datetime.timedelta(seconds=(delay * 3) + 60)
reading = session.query(tempmon.Reading)\
.filter(tempmon.Reading.client == client)\
.filter(tempmon.Reading.taken >= cutoff)\
.first()
if not reading:
log.info("marking client as OFFLINE: {}".format(client))
client.online = False
send_email(self.config, 'tempmon_client_offline', {
'client': client,
'now': localtime(self.config, self.now, from_utc=True),
})
def check_readings_for_probe(self, session, probe, cutoff):
"""
Check readings for the given probe, within the time window defined by
the given cutoff.
"""
# we really only care about the latest reading
reading = session.query(tempmon.Reading)\
.filter(tempmon.Reading.probe == probe)\
.filter(tempmon.Reading.taken >= cutoff)\
.order_by(tempmon.Reading.taken.desc())\
.first()
if reading:
# is reading below critical min, or above critical max?
if (reading.degrees_f <= probe.critical_temp_min or
reading.degrees_f >= probe.critical_temp_max):
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP, reading)
# is reading below "good" min?
elif reading.degrees_f < probe.good_temp_min:
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_LOW_TEMP, reading)
# is reading above "good" max?
elif reading.degrees_f > probe.good_temp_max:
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP, reading)
else: # temp is good
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_GOOD_TEMP, reading)
return True
else: # no current readings for probe
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_ERROR)
return False
def update_status(self, probe, status, reading=None):
data = {
'probe': probe,
'status': self.enum.TEMPMON_PROBE_STATUS[status],
'reading': reading,
'taken': localtime(self.config, reading.taken, from_utc=True) if reading else None,
'now': localtime(self.config, self.now, from_utc=True),
}
prev_status = probe.status
prev_alert_sent = probe.status_alert_sent
if probe.status != status:
probe.status = status
probe.status_changed = self.now
probe.status_alert_sent = None
# send email when things go back to normal, after being bad
if status == self.enum.TEMPMON_PROBE_STATUS_GOOD_TEMP and prev_alert_sent:
send_email(self.config, 'tempmon_good_temp', data)
probe.status_alert_sent = self.now
# no (more) email if status is good
if status == self.enum.TEMPMON_PROBE_STATUS_GOOD_TEMP:
return
# no email if we already sent one...until timeout is reached
if probe.status_alert_sent:
timeout = datetime.timedelta(minutes=probe.status_alert_timeout)
if (self.now - probe.status_alert_sent) <= timeout:
return
# delay even the first email, until configured threshold is reached
# unless we have a critical status
if status != self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP:
timeout = datetime.timedelta(minutes=probe.therm_status_timeout)
if (self.now - probe.status_changed) <= timeout:
return
# determine URL for probe, if possible
url = self.config.get('tailbone', 'url.tempmon.probe', default='#')
data['probe_url'] = url.format(uuid=probe.uuid)
since = localtime(self.config, probe.status_changed, from_utc=True)
data['status_since'] = since.strftime('%I:%M %p')
data['status_since_delta'] = humanize.naturaltime(self.now - probe.status_changed)
# fetch last 90 minutes of readings
session = orm.object_session(probe)
recent_minutes = 90 # TODO: make configurable
cutoff = self.now - datetime.timedelta(seconds=(60 * recent_minutes))
readings = session.query(tempmon.Reading)\
.filter(tempmon.Reading.probe == probe)\
.filter(tempmon.Reading.taken >= cutoff)\
.order_by(tempmon.Reading.taken.desc())
data['recent_minutes'] = recent_minutes
data['recent_readings'] = readings
data['pretty_time'] = lambda dt: localtime(self.config, dt, from_utc=True).strftime('%Y-%m-%d %I:%M %p')
msgtypes = {
self.enum.TEMPMON_PROBE_STATUS_LOW_TEMP : 'tempmon_low_temp',
self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP : 'tempmon_high_temp',
self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP : 'tempmon_critical_temp',
self.enum.TEMPMON_PROBE_STATUS_ERROR : 'tempmon_error',
}
send_email(self.config, msgtypes[status], data)
# maybe send more emails if config said so
for msgtype in self.extra_emails:
send_email(self.config, msgtype, data)
probe.status_alert_sent = self.now
def make_daemon(config, pidfile=None):
"""
Returns a tempmon server daemon instance.
"""
if not pidfile:
pidfile = config.get('rattail.tempmon', 'server.pid_path',
default='/var/run/rattail/tempmon-server.pid')
return TempmonServerDaemon(pidfile, config=config)