Add per-status timeouts and tracking for probe status

i.e. this lets us keep track of when a probe becomes "high temp" and then later
if it becomes "critical high temp" we can still know how long it's been high
This commit is contained in:
Lance Edgar 2018-10-19 14:58:30 -05:00
parent 8be64c0580
commit 19553edda6
6 changed files with 324 additions and 41 deletions

View file

@ -0,0 +1,51 @@
# -*- coding: utf-8; -*-
"""add more timeouts
Revision ID: b02c531caca5
Revises: 5f2b87474433
Create Date: 2018-10-19 13:51:54.422490
"""
from __future__ import unicode_literals, absolute_import
# revision identifiers, used by Alembic.
# this migration's own revision ID (matches "Revision ID" in the docstring)
revision = 'b02c531caca5'
# the revision which this migration revises (matches "Revises" in the docstring)
down_revision = u'5f2b87474433'
branch_labels = None
depends_on = None
from alembic import op
import sqlalchemy as sa
import rattail.db.types
def upgrade():
    """
    Add the per-status ``*_started`` (timestamp) and ``*_timeout`` (integer)
    columns to the ``probe`` table.  All new columns are nullable.
    """
    # probe
    prefixes = ('critical_max', 'critical_min', 'error', 'good_max', 'good_min')
    for prefix in prefixes:
        started = sa.Column(prefix + '_started', sa.DateTime(), nullable=True)
        op.add_column('probe', started)
        timeout = sa.Column(prefix + '_timeout', sa.Integer(), nullable=True)
        op.add_column('probe', timeout)
def downgrade():
    """
    Drop the per-status ``*_timeout`` and ``*_started`` columns from the
    ``probe`` table, in the reverse of the order in which they were added.
    """
    # probe
    prefixes = ('good_min', 'good_max', 'error', 'critical_min', 'critical_max')
    for prefix in prefixes:
        for suffix in ('_timeout', '_started'):
            op.drop_column('probe', prefix + suffix)

View file

@ -2,7 +2,7 @@
################################################################################
#
# Rattail -- Retail Software Framework
# Copyright © 2010-2017 Lance Edgar
# Copyright © 2010-2018 Lance Edgar
#
# This file is part of Rattail.
#
@ -33,6 +33,7 @@ import sqlalchemy as sa
from sqlalchemy import orm
from sqlalchemy.ext.declarative import declarative_base
from rattail import enum
from rattail.db.model import uuid_column
from rattail.db.model.core import ModelBase
@ -130,11 +131,88 @@ class Probe(Base):
device_path = sa.Column(sa.String(length=255), nullable=True)
enabled = sa.Column(sa.Boolean(), nullable=False, default=True)
good_temp_min = sa.Column(sa.Integer(), nullable=False)
good_temp_max = sa.Column(sa.Integer(), nullable=False)
critical_temp_min = sa.Column(sa.Integer(), nullable=False)
critical_temp_max = sa.Column(sa.Integer(), nullable=False)
critical_temp_max = sa.Column(sa.Integer(), nullable=False, doc="""
Maximum high temperature; when a reading is greater than or equal to this
value, the probe's status becomes "critical high temp".
""")
critical_max_started = sa.Column(sa.DateTime(), nullable=True, doc="""
Timestamp when the probe readings started to indicate "critical high temp"
status. This should be null unless the probe currently has that status.
""")
critical_max_timeout = sa.Column(sa.Integer(), nullable=True, doc="""
Number of minutes the probe is allowed to have "critical high temp" status,
before the first email alert is sent for that. If empty, there will be no
delay and the first email will go out as soon as that status is reached.
If set, should probably be a *low* number.
""")
good_temp_max = sa.Column(sa.Integer(), nullable=False, doc="""
Maximum good temperature; when a reading is greater than or equal to this
value, the probe's status becomes "high temp" (unless the reading also
breaches the :attr:`critical_temp_max` threshold).
""")
good_max_timeout = sa.Column(sa.Integer(), nullable=True, doc="""
Number of minutes the probe is allowed to have "high temp" status, before
the first email alert is sent for that. This is typically meant to account
for the length of the defrost cycle, so may be a rather large number.
""")
good_max_started = sa.Column(sa.DateTime(), nullable=True, doc="""
Timestamp when the probe readings started to indicate "high temp" status.
This should be null unless the probe currently has either "high temp" or
"critical high temp" status.
""")
good_temp_min = sa.Column(sa.Integer(), nullable=False, doc="""
Minimum good temperature; when a reading is less than or equal to this
value, the probe's status becomes "low temp" (unless the reading also
breaches the :attr:`critical_temp_min` threshold).
""")
good_min_timeout = sa.Column(sa.Integer(), nullable=True, doc="""
Number of minutes the probe is allowed to have "low temp" status, before
the first email alert is sent for that.
""")
good_min_started = sa.Column(sa.DateTime(), nullable=True, doc="""
Timestamp when the probe readings started to indicate "low temp" status.
This should be null unless the probe currently has either "low temp" or
"critical low temp" status.
""")
critical_temp_min = sa.Column(sa.Integer(), nullable=False, doc="""
Minimum low temperature; when a reading is less than or equal to this
value, the probe's status becomes "critical low temp".
""")
critical_min_started = sa.Column(sa.DateTime(), nullable=True, doc="""
Timestamp when the probe readings started to indicate "critical low temp"
status. This should be null unless the probe currently has that status.
""")
critical_min_timeout = sa.Column(sa.Integer(), nullable=True, doc="""
Number of minutes the probe is allowed to have "critical low temp" status,
before the first email alert is sent for that. If empty, there will be no
delay and the first email will go out as soon as that status is reached.
""")
error_started = sa.Column(sa.DateTime(), nullable=True, doc="""
Timestamp when the probe readings started to indicate "error" status. This
should be null unless the probe currently has that status.
""")
error_timeout = sa.Column(sa.Integer(), nullable=True, doc="""
Number of minutes the probe is allowed to have "error" status, before the
first email alert is sent for that. If empty, there will be no delay and
the first email will go out as soon as that status is reached.
""")
# TODO: deprecate / remove this
therm_status_timeout = sa.Column(sa.Integer(), nullable=False, doc="""
Number of minutes the temperature is allowed to be "high" before the first
"high temp" email alert is sent. This is typically meant to account for
@ -159,6 +237,90 @@ class Probe(Base):
def __str__(self):
return self.description
def start_status(self, status, time):
    """
    Update the "started" timestamp field(s) for the given status.  This is
    used to track e.g. when we cross the "high temp" threshold, as a
    separate event from when the "critical high temp" threshold is reached.

    Note that in addition to setting the appropriate timestamp field, this
    also will clear out other timestamp fields, according to the specific
    (new) status.  The rules are:

    * "critical high" / "critical low" also count as "high" / "low", so the
      corresponding ``good_*_started`` field is kept if already set, and is
      set to ``time`` otherwise.
    * Dropping from a critical status back to plain "high" / "low" keeps
      the existing ``good_*_started`` value, so we still know how long the
      probe has been out of the good range overall.
    * "error" does not touch any of the temperature timestamps.
    * "good temp" clears every timestamp, so stale values from a previous
      episode cannot leak into the next one.

    :param status: New status for the probe; should be one of the
       ``enum.TEMPMON_PROBE_STATUS_*`` values.  Any other value is ignored.

    :param time: Timestamp at which the new status took effect.
    """
    if status in (enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP,
                  enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP):
        self.critical_max_started = time
        # note, we don't clear out "high temp" time; but we do make sure
        # it is set, in case we skipped straight past "high temp"
        if self.good_max_started is None:
            self.good_max_started = time
        self.good_min_started = None
        self.critical_min_started = None
        self.error_started = None

    elif status == enum.TEMPMON_PROBE_STATUS_HIGH_TEMP:
        self.critical_max_started = None
        # preserve the original "high temp" time if we are just coming
        # back down from "critical high temp"
        if self.good_max_started is None:
            self.good_max_started = time
        self.good_min_started = None
        self.critical_min_started = None
        self.error_started = None

    elif status == enum.TEMPMON_PROBE_STATUS_LOW_TEMP:
        self.critical_max_started = None
        self.good_max_started = None
        # preserve the original "low temp" time if we are just coming
        # back up from "critical low temp"
        if self.good_min_started is None:
            self.good_min_started = time
        self.critical_min_started = None
        self.error_started = None

    elif status == enum.TEMPMON_PROBE_STATUS_CRITICAL_LOW_TEMP:
        self.critical_max_started = None
        self.good_max_started = None
        # note, we don't clear out "low temp" time; but we do make sure
        # it is set, in case we skipped straight past "low temp"
        if self.good_min_started is None:
            self.good_min_started = time
        self.critical_min_started = time
        self.error_started = None

    elif status == enum.TEMPMON_PROBE_STATUS_ERROR:
        # note, we don't clear out any other status times
        self.error_started = time

    elif status == enum.TEMPMON_PROBE_STATUS_GOOD_TEMP:
        # everything is back to normal; no status is "in progress"
        self.critical_max_started = None
        self.good_max_started = None
        self.good_min_started = None
        self.critical_min_started = None
        self.error_started = None
def status_started(self, status):
    """
    Look up the "started" timestamp which corresponds to the given status.

    :param status: One of the ``enum.TEMPMON_PROBE_STATUS_*`` values.

    :returns: Value of the matching ``*_started`` attribute, or ``None`` if
       the status has no such attribute.
    """
    # checked in order; first match wins
    started_fields = (
        ((enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP,
          enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP), 'critical_max_started'),
        ((enum.TEMPMON_PROBE_STATUS_HIGH_TEMP,), 'good_max_started'),
        ((enum.TEMPMON_PROBE_STATUS_LOW_TEMP,), 'good_min_started'),
        ((enum.TEMPMON_PROBE_STATUS_CRITICAL_LOW_TEMP,), 'critical_min_started'),
        ((enum.TEMPMON_PROBE_STATUS_ERROR,), 'error_started'),
    )
    for statuses, field in started_fields:
        if status in statuses:
            return getattr(self, field)
    return None
def timeout_for_status(self, status):
    """
    Return the timeout value for the given status.  This is the number of
    minutes by which the initial email for that status should be delayed.

    For "high temp" and "low temp", an unset (or zero) status-specific
    timeout falls back to :attr:`therm_status_timeout`.

    :param status: One of the ``enum.TEMPMON_PROBE_STATUS_*`` values.

    :returns: Value of the matching timeout attribute (which may itself be
       ``None``), or ``None`` if the status has no timeout attribute.
    """
    critical_high = (enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP,
                     enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP)
    if status in critical_high:
        return self.critical_max_timeout

    if status == enum.TEMPMON_PROBE_STATUS_HIGH_TEMP:
        specific = self.good_max_timeout
        return specific if specific else self.therm_status_timeout

    if status == enum.TEMPMON_PROBE_STATUS_LOW_TEMP:
        specific = self.good_min_timeout
        return specific if specific else self.therm_status_timeout

    if status == enum.TEMPMON_PROBE_STATUS_CRITICAL_LOW_TEMP:
        return self.critical_min_timeout

    if status == enum.TEMPMON_PROBE_STATUS_ERROR:
        return self.error_timeout

    return None
@six.python_2_unicode_compatible
class Reading(Base):

View file

@ -54,6 +54,30 @@ class TempmonBase(object):
}
class tempmon_critical_high_temp(TempmonBase, Email):
    """
    Sent when a tempmon probe takes a "critical high" temperature reading.
    """
    default_subject = "CRITICAL HIGH Temperature"

    def sample_data(self, request):
        # start from the shared tempmon sample data, then fill in the
        # status entry which corresponds to "critical high temp"
        sample = super(tempmon_critical_high_temp, self).sample_data(request)
        statuses = self.enum.TEMPMON_PROBE_STATUS
        sample['status'] = statuses[self.enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP]
        return sample
class tempmon_critical_low_temp(TempmonBase, Email):
    """
    Sent when a tempmon probe takes a "critical low" temperature reading.
    """
    default_subject = "CRITICAL LOW Temperature"

    def sample_data(self, request):
        # start from the shared tempmon sample data, then fill in the
        # status entry which corresponds to "critical low temp"
        sample = super(tempmon_critical_low_temp, self).sample_data(request)
        statuses = self.enum.TEMPMON_PROBE_STATUS
        sample['status'] = statuses[self.enum.TEMPMON_PROBE_STATUS_CRITICAL_LOW_TEMP]
        return sample
class tempmon_critical_temp(TempmonBase, Email):
"""
Sent when a tempmon probe takes a reading which is "critical" in either the
@ -92,7 +116,7 @@ class tempmon_good_temp(TempmonBase, Email):
Sent whenever a tempmon probe first takes a "good temp" reading, after
having previously had some bad reading(s).
"""
default_subject = "Good temperature detected"
default_subject = "OK Temperature"
def sample_data(self, request):
data = super(tempmon_good_temp, self).sample_data(request)
@ -105,7 +129,7 @@ class tempmon_high_temp(TempmonBase, Email):
Sent when a tempmon probe takes a reading which is above the "maximum good
temp" range, but still below the "critically high temp" threshold.
"""
default_subject = "High temperature detected"
default_subject = "HIGH Temperature"
def sample_data(self, request):
data = super(tempmon_high_temp, self).sample_data(request)
@ -118,7 +142,7 @@ class tempmon_low_temp(TempmonBase, Email):
Sent when a tempmon probe takes a reading which is below the "minimum good
temp" range, but still above the "critically low temp" threshold.
"""
default_subject = "Low temperature detected"
default_subject = "LOW Temperature"
def sample_data(self, request):
data = super(tempmon_low_temp, self).sample_data(request)

View file

@ -124,7 +124,7 @@ class TempmonServerDaemon(Daemon):
cutoff = self.now - datetime.timedelta(seconds=delay + 60)
online = False
for probe in client.enabled_probes():
if self.check_readings_for_probe(session, probe, cutoff) and not online:
if self.check_readings_for_probe(session, probe, cutoff):
online = True
# if client was previously marked online, but we have no "new"
@ -147,7 +147,6 @@ class TempmonServerDaemon(Daemon):
'now': localtime(self.config, self.now, from_utc=True),
})
def check_readings_for_probe(self, session, probe, cutoff):
"""
Check readings for the given probe, within the time window defined by
@ -161,18 +160,21 @@ class TempmonServerDaemon(Daemon):
.first()
if reading:
# is reading below critical min, or above critical max?
if (reading.degrees_f <= probe.critical_temp_min or
reading.degrees_f >= probe.critical_temp_max):
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP, reading)
# is reading above critical max?
if reading.degrees_f >= probe.critical_temp_max:
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP, reading)
# is reading below "good" min?
elif reading.degrees_f < probe.good_temp_min:
# is reading above good max?
elif reading.degrees_f >= probe.good_temp_max:
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP, reading)
# is reading below good min?
elif reading.degrees_f <= probe.good_temp_min:
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_LOW_TEMP, reading)
# is reading above "good" max?
elif reading.degrees_f > probe.good_temp_max:
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP, reading)
# is reading below critical min?
elif reading.degrees_f <= probe.critical_temp_min:
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_CRITICAL_LOW_TEMP, reading)
else: # temp is good
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_GOOD_TEMP, reading)
@ -196,13 +198,25 @@ class TempmonServerDaemon(Daemon):
prev_alert_sent = probe.status_alert_sent
if probe.status != status:
probe.status = status
probe.start_status(status, self.now)
probe.status_changed = self.now
probe.status_alert_sent = None
# send email when things go back to normal, after being bad
# send "high temp" email if previous status was critical, even if
# we haven't been high for that long overall
if (status == self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP
and prev_status in (self.enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP,
self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP)
and prev_alert_sent):
send_email(self.config, 'tempmon_high_temp', data)
probe.status_alert_sent = self.now
return
# send email when things go back to normal (i.e. from any other status)
if status == self.enum.TEMPMON_PROBE_STATUS_GOOD_TEMP and prev_alert_sent:
send_email(self.config, 'tempmon_good_temp', data)
probe.status_alert_sent = self.now
return
# no (more) email if status is good
if status == self.enum.TEMPMON_PROBE_STATUS_GOOD_TEMP:
@ -215,19 +229,39 @@ class TempmonServerDaemon(Daemon):
return
# delay even the first email, until configured threshold is reached
# unless we have a critical status
if status != self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP:
timeout = datetime.timedelta(minutes=probe.therm_status_timeout)
if (self.now - probe.status_changed) <= timeout:
return
timeout = probe.timeout_for_status(status) or 0
timeout = datetime.timedelta(minutes=timeout)
started = probe.status_started(status) or probe.status_changed
if (self.now - started) <= timeout:
return
msgtypes = {
self.enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP : 'tempmon_critical_high_temp',
self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP : 'tempmon_high_temp',
self.enum.TEMPMON_PROBE_STATUS_LOW_TEMP : 'tempmon_low_temp',
self.enum.TEMPMON_PROBE_STATUS_CRITICAL_LOW_TEMP : 'tempmon_critical_low_temp',
self.enum.TEMPMON_PROBE_STATUS_ERROR : 'tempmon_error',
}
self.send_email(status, msgtypes[status], data)
# maybe send more emails if config said so
for msgtype in self.extra_emails:
self.send_email(status, msgtype, data)
probe.status_alert_sent = self.now
def send_email(self, status, template, data):
probe = data['probe']
started = probe.status_started(status) or probe.status_changed
# determine URL for probe, if possible
url = self.config.get('tailbone', 'url.tempmon.probe', default='#')
data['probe_url'] = url.format(uuid=probe.uuid)
since = localtime(self.config, probe.status_changed, from_utc=True)
since = localtime(self.config, started, from_utc=True)
data['status_since'] = since.strftime('%I:%M %p')
data['status_since_delta'] = humanize.naturaltime(self.now - probe.status_changed)
data['status_since_delta'] = humanize.naturaltime(self.now - started)
# fetch last 90 minutes of readings
session = orm.object_session(probe)
@ -241,20 +275,7 @@ class TempmonServerDaemon(Daemon):
data['recent_readings'] = readings
data['pretty_time'] = lambda dt: localtime(self.config, dt, from_utc=True).strftime('%Y-%m-%d %I:%M %p')
msgtypes = {
self.enum.TEMPMON_PROBE_STATUS_LOW_TEMP : 'tempmon_low_temp',
self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP : 'tempmon_high_temp',
self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP : 'tempmon_critical_temp',
self.enum.TEMPMON_PROBE_STATUS_ERROR : 'tempmon_error',
}
send_email(self.config, msgtypes[status], data)
# maybe send more emails if config said so
for msgtype in self.extra_emails:
send_email(self.config, msgtype, data)
probe.status_alert_sent = self.now
send_email(self.config, template, data)
def make_daemon(config, pidfile=None):

View file

@ -0,0 +1,25 @@
## -*- coding: utf-8 -*-
<html>
<body>
<p>
<b>This is an alert from ${probe}!</b><br>
The status of ${probe} is: ${status}.<br>
The current temperature is: ${reading.degrees_f}.<br>
The temperature should never be this high.
Investigate Immediately!<br>
</p>
<p>
Notes: <br>
Frozen food that is above 40 degrees needs to be thrown away<br>
if it remains at that temperature for two hours or more.<br>
</p>
<p>
Check out <a href="http://www.fsis.usda.gov/wps/portal/fsis/topics/food-safety-education/get-answers/food-safety-fact-sheets/safe-food-handling/freezing-and-food-safety/CT_Index/!ut/p/a1/jZFRT8IwEIB_DY9dbw7J8G1ZYtiUTYJK2Qsp7NYt2dqlrU759RZ8UQJK-9LefV-ud6UFZbSQ_L0R3DZK8vZwLyYbWMDEn8aQ5lP_HpLsdZE_xDGEy1sHrP8AsuBK_8KK4D8_vaLAjZ7Hc0GLntuaNLJSlAm0hEszoDaUVUqVxPAK7Sep-M4SUyNalzjEyDFbc1m2jRQO1oh7d3J6SX6YlMXPm0SW-EFXtPj9KvDdTrJgOZ6lWQD5-BQ4M7Zv4PJcXOOiVdvjH60juQ1C16HGCjVq7027cG1tb-5GMIJhGDyhlGjR26nunFArYyk74fruhe0foxk0T90qNNEXiOIqAA!!/#16">this USDA link</a> for useful information
</p>
<p>
This email will repeat every ${probe.status_alert_timeout} minutes until the issue<br>
has been resolved.
</p>
</body>
</html>