Add per-status timeouts and tracking for probe status

i.e. this lets us keep track of when a probe first becomes "high temp", so that if it later becomes "critical high temp" we still know how long it has been high overall
2018-10-19 14:58:30 -05:00 · 2018-10-19 14:58:30 -05:00 · 19553edda6
commit 19553edda6
parent 8be64c0580
6 changed files with 324 additions and 41 deletions
--- a/rattail_tempmon/db/alembic/versions/b02c531caca5_add_more_timeouts.py
+++ b/rattail_tempmon/db/alembic/versions/b02c531caca5_add_more_timeouts.py
@ -0,0 +1,51 @@
 # -*- coding: utf-8; -*-
 """add more timeouts
 Revision ID: b02c531caca5
 Revises: 5f2b87474433
 Create Date: 2018-10-19 13:51:54.422490
 """
 from __future__ import unicode_literals, absolute_import
 # revision identifiers, used by Alembic.
 revision = 'b02c531caca5'
 down_revision = u'5f2b87474433'
 branch_labels = None
 depends_on = None
 from alembic import op
 import sqlalchemy as sa
 import rattail.db.types
 def upgrade():
    """
    Add the per-status "started" and "timeout" columns to the probe table.
    """
    # probe
    # every status gets a nullable "<status>_started" timestamp column,
    # immediately followed by a nullable "<status>_timeout" integer column
    for status in ('critical_max', 'critical_min', 'error', 'good_max', 'good_min'):
        started = sa.Column('{}_started'.format(status), sa.DateTime(), nullable=True)
        op.add_column('probe', started)
        timeout = sa.Column('{}_timeout'.format(status), sa.Integer(), nullable=True)
        op.add_column('probe', timeout)
 def downgrade():
    """
    Drop the per-status "started" and "timeout" columns from the probe table,
    in the reverse of the order in which the upgrade adds them.
    """
    # probe
    for status in ('good_min', 'good_max', 'error', 'critical_min', 'critical_max'):
        # the timeout column goes first, then its companion timestamp
        for suffix in ('timeout', 'started'):
            op.drop_column('probe', '{}_{}'.format(status, suffix))
--- a/rattail_tempmon/db/model.py
+++ b/rattail_tempmon/db/model.py
@ -2,7 +2,7 @@
 ################################################################################
 #
 #  Rattail -- Retail Software Framework
-#  Copyright © 2010-2017 Lance Edgar
+#  Copyright © 2010-2018 Lance Edgar
 #
 #  This file is part of Rattail.
 #
@ -33,6 +33,7 @@ import sqlalchemy as sa
 from sqlalchemy import orm
 from sqlalchemy.ext.declarative import declarative_base
 from rattail import enum
 from rattail.db.model import uuid_column
 from rattail.db.model.core import ModelBase
@ -130,11 +131,88 @@ class Probe(Base):
    device_path = sa.Column(sa.String(length=255), nullable=True)
    enabled = sa.Column(sa.Boolean(), nullable=False, default=True)
-    good_temp_min = sa.Column(sa.Integer(), nullable=False)
+    critical_temp_max = sa.Column(sa.Integer(), nullable=False, doc="""
-    good_temp_max = sa.Column(sa.Integer(), nullable=False)
+    Maximum high temperature; when a reading is greater than or equal to this
-    critical_temp_min = sa.Column(sa.Integer(), nullable=False)
+    value, the probe's status becomes "critical high temp".
-    critical_temp_max = sa.Column(sa.Integer(), nullable=False)
+    """)
    critical_max_started = sa.Column(sa.DateTime(), nullable=True, doc="""
    Timestamp when the probe readings started to indicate "critical high temp"
    status.  This should be null unless the probe currently has that status.
    """)
    critical_max_timeout = sa.Column(sa.Integer(), nullable=True, doc="""
    Number of minutes the probe is allowed to have "critical high temp" status,
    before the first email alert is sent for that.  If empty, there will be no
    delay and the first email will go out as soon as that status is reached.
    If set, should probably be a *low* number.
    """)
    good_temp_max = sa.Column(sa.Integer(), nullable=False, doc="""
    Maximum good temperature; when a reading is greater than or equal to this
    value, the probe's status becomes "high temp" (unless the reading also
    breaches the :attr:`critical_temp_max` threshold).
    """)
    good_max_timeout = sa.Column(sa.Integer(), nullable=True, doc="""
    Number of minutes the probe is allowed to have "high temp" status, before
    the first email alert is sent for that.  This is typically meant to account
    for the length of the defrost cycle, so may be a rather large number.
    """)
    good_max_started = sa.Column(sa.DateTime(), nullable=True, doc="""
    Timestamp when the probe readings started to indicate "high temp" status.
    This should be null unless the probe currently has either "high temp" or
    "critical high temp" status.
    """)
    good_temp_min = sa.Column(sa.Integer(), nullable=False, doc="""
    Minimum good temperature; when a reading is less than or equal to this
    value, the probe's status becomes "low temp" (unless the reading also
    breaches the :attr:`critical_temp_min` threshold).
    """)
    good_min_timeout = sa.Column(sa.Integer(), nullable=True, doc="""
    Number of minutes the probe is allowed to have "low temp" status, before
    the first email alert is sent for that.
    """)
    good_min_started = sa.Column(sa.DateTime(), nullable=True, doc="""
    Timestamp when the probe readings started to indicate "low temp" status.
    This should be null unless the probe currently has either "low temp" or
    "critical low temp" status.
    """)
    critical_temp_min = sa.Column(sa.Integer(), nullable=False, doc="""
    Minimum low temperature; when a reading is less than or equal to this
    value, the probe's status becomes "critical low temp".
    """)
    critical_min_started = sa.Column(sa.DateTime(), nullable=True, doc="""
    Timestamp when the probe readings started to indicate "critical low temp"
    status.  This should be null unless the probe currently has that status.
    """)
    critical_min_timeout = sa.Column(sa.Integer(), nullable=True, doc="""
    Number of minutes the probe is allowed to have "critical low temp" status,
    before the first email alert is sent for that.  If empty, there will be no
    delay and the first email will go out as soon as that status is reached.
    """)
    error_started = sa.Column(sa.DateTime(), nullable=True, doc="""
    Timestamp when the probe readings started to indicate "error" status.  This
    should be null unless the probe currently has that status.
    """)
    error_timeout = sa.Column(sa.Integer(), nullable=True, doc="""
    Number of minutes the probe is allowed to have "error" status, before the
    first email alert is sent for that.  If empty, there will be no delay and
    the first email will go out as soon as that status is reached.
    """)
    # TODO: deprecate / remove this
    therm_status_timeout = sa.Column(sa.Integer(), nullable=False, doc="""
    Number of minutes the temperature is allowed to be "high" before the first
    "high temp" email alert is sent.  This is typically meant to account for
@ -159,6 +237,90 @@ class Probe(Base):
    def __str__(self):
        return self.description
    def start_status(self, status, time):
        """
        Update the "started" timestamp field for the given status.  This is
        used to track e.g. when we cross the "high temp" threshold, as a
        separate event from when the "critical high temp" threshold is reached.

        Note that in addition to setting the appropriate timestamp field, this
        also will clear out other timestamp fields, according to the specific
        (new) status.  When the new status is "good temp", all of the
        timestamp fields are cleared, since each of them is only meant to be
        set while its status is in effect.

        :param status: The new status for the probe; one of the
           ``enum.TEMPMON_PROBE_STATUS_*`` values.  Any status not handled
           below leaves all timestamp fields untouched.

        :param time: Timestamp to record as the start of the new status.
        """
        if status in (enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP,
                      enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP):
            self.critical_max_started = time
            # note, we don't clear out "high temp" time
            self.good_min_started = None
            self.critical_min_started = None
            self.error_started = None

        elif status == enum.TEMPMON_PROBE_STATUS_HIGH_TEMP:
            self.critical_max_started = None
            self.good_max_started = time
            self.good_min_started = None
            self.critical_min_started = None
            self.error_started = None

        elif status == enum.TEMPMON_PROBE_STATUS_LOW_TEMP:
            self.critical_max_started = None
            self.good_max_started = None
            self.good_min_started = time
            self.critical_min_started = None
            self.error_started = None

        elif status == enum.TEMPMON_PROBE_STATUS_CRITICAL_LOW_TEMP:
            self.critical_max_started = None
            self.good_max_started = None
            # note, we don't clear out "low temp" time
            self.critical_min_started = time
            self.error_started = None

        elif status == enum.TEMPMON_PROBE_STATUS_ERROR:
            # note, we don't clear out any other status times
            self.error_started = time

        elif status == enum.TEMPMON_PROBE_STATUS_GOOD_TEMP:
            # readings are back to normal, so no "bad" status is in effect
            # anymore; without this the old timestamps would linger until
            # some later status happened to overwrite them
            self.critical_max_started = None
            self.good_max_started = None
            self.good_min_started = None
            self.critical_min_started = None
            self.error_started = None
    def status_started(self, status):
        """
        Look up when the given status began, according to the probe's
        "started" timestamp fields.

        Returns the value of the field which corresponds to the status (this
        may be null), or ``None`` if the status has no such field.
        """
        # each entry pairs the status value(s) with the field tracking them
        started_fields = (
            ((enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP,
              enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP), 'critical_max_started'),
            ((enum.TEMPMON_PROBE_STATUS_HIGH_TEMP,), 'good_max_started'),
            ((enum.TEMPMON_PROBE_STATUS_LOW_TEMP,), 'good_min_started'),
            ((enum.TEMPMON_PROBE_STATUS_CRITICAL_LOW_TEMP,), 'critical_min_started'),
            ((enum.TEMPMON_PROBE_STATUS_ERROR,), 'error_started'),
        )
        for statuses, field in started_fields:
            if status in statuses:
                return getattr(self, field)
        return None
    def timeout_for_status(self, status):
        """
        Return the timeout value for the given status, i.e. the number of
        minutes by which the initial email for that status should be delayed.

        For the "high temp" and "low temp" statuses, the value of
        ``therm_status_timeout`` is used whenever the more specific timeout
        is empty (or zero).  Returns ``None`` for any other status which has
        no timeout field.
        """
        critical_high = (enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP,
                         enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP)
        if status in critical_high:
            return self.critical_max_timeout

        if status == enum.TEMPMON_PROBE_STATUS_HIGH_TEMP:
            specific = self.good_max_timeout
            return specific or self.therm_status_timeout

        if status == enum.TEMPMON_PROBE_STATUS_LOW_TEMP:
            specific = self.good_min_timeout
            return specific or self.therm_status_timeout

        if status == enum.TEMPMON_PROBE_STATUS_CRITICAL_LOW_TEMP:
            return self.critical_min_timeout

        if status == enum.TEMPMON_PROBE_STATUS_ERROR:
            return self.error_timeout

        return None
@six.python_2_unicode_compatible
 class Reading(Base):
--- a/rattail_tempmon/emails.py
+++ b/rattail_tempmon/emails.py
@ -54,6 +54,30 @@ class TempmonBase(object):
        }
 class tempmon_critical_high_temp(TempmonBase, Email):
    """
    Email alert for a tempmon probe whose reading is "critical high".
    """
    default_subject = "CRITICAL HIGH Temperature"

    def sample_data(self, request):
        """
        Return the base sample data, with ``status`` set to the display
        value for the "critical high temp" probe status.
        """
        sample = super(tempmon_critical_high_temp, self).sample_data(request)
        status = self.enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP
        sample['status'] = self.enum.TEMPMON_PROBE_STATUS[status]
        return sample
 class tempmon_critical_low_temp(TempmonBase, Email):
    """
    Email alert for a tempmon probe whose reading is "critical low".
    """
    default_subject = "CRITICAL LOW Temperature"

    def sample_data(self, request):
        """
        Return the base sample data, with ``status`` set to the display
        value for the "critical low temp" probe status.
        """
        sample = super(tempmon_critical_low_temp, self).sample_data(request)
        status = self.enum.TEMPMON_PROBE_STATUS_CRITICAL_LOW_TEMP
        sample['status'] = self.enum.TEMPMON_PROBE_STATUS[status]
        return sample
 class tempmon_critical_temp(TempmonBase, Email):
    """
    Sent when a tempmon probe takes a reading which is "critical" in either the
@ -92,7 +116,7 @@ class tempmon_good_temp(TempmonBase, Email):
    Sent whenever a tempmon probe first takes a "good temp" reading, after
    having previously had some bad reading(s).
    """
-    default_subject = "Good temperature detected"
+    default_subject = "OK Temperature"
    def sample_data(self, request):
        data = super(tempmon_good_temp, self).sample_data(request)
@ -105,7 +129,7 @@ class tempmon_high_temp(TempmonBase, Email):
    Sent when a tempmon probe takes a reading which is above the "maximum good
    temp" range, but still below the "critically high temp" threshold.
    """
-    default_subject = "High temperature detected"
+    default_subject = "HIGH Temperature"
    def sample_data(self, request):
        data = super(tempmon_high_temp, self).sample_data(request)
@ -118,7 +142,7 @@ class tempmon_low_temp(TempmonBase, Email):
    Sent when a tempmon probe takes a reading which is below the "minimum good
    temp" range, but still above the "critically low temp" threshold.
    """
-    default_subject = "Low temperature detected"
+    default_subject = "LOW Temperature"
    def sample_data(self, request):
        data = super(tempmon_low_temp, self).sample_data(request)
--- a/rattail_tempmon/server.py
+++ b/rattail_tempmon/server.py
@ -124,7 +124,7 @@ class TempmonServerDaemon(Daemon):
        cutoff = self.now - datetime.timedelta(seconds=delay + 60)
        online = False
        for probe in client.enabled_probes():
-            if self.check_readings_for_probe(session, probe, cutoff) and not online:
+            if self.check_readings_for_probe(session, probe, cutoff):
                online = True
        # if client was previously marked online, but we have no "new"
@ -147,7 +147,6 @@ class TempmonServerDaemon(Daemon):
                    'now': localtime(self.config, self.now, from_utc=True),
                })
    def check_readings_for_probe(self, session, probe, cutoff):
        """
        Check readings for the given probe, within the time window defined by
@ -161,18 +160,21 @@ class TempmonServerDaemon(Daemon):
                         .first()
        if reading:
-            # is reading below critical min, or above critical max?
+            # is reading above critical max?
-            if (reading.degrees_f <= probe.critical_temp_min or
+            if reading.degrees_f >= probe.critical_temp_max:
-                  reading.degrees_f >= probe.critical_temp_max):
+                self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP, reading)
                self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP, reading)
-            # is reading below "good" min?
+            # is reading above good max?
-            elif reading.degrees_f < probe.good_temp_min:
+            elif reading.degrees_f >= probe.good_temp_max:
                self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP, reading)
            # is reading below good min?
            elif reading.degrees_f <= probe.good_temp_min:
                self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_LOW_TEMP, reading)
-            # is reading above "good" max?
+            # is reading below critical min?
-            elif reading.degrees_f > probe.good_temp_max:
+            elif reading.degrees_f <= probe.critical_temp_min:
-                self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP, reading)
+                self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_CRITICAL_LOW_TEMP, reading)
            else: # temp is good
                self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_GOOD_TEMP, reading)
@ -196,13 +198,25 @@ class TempmonServerDaemon(Daemon):
        prev_alert_sent = probe.status_alert_sent
        if probe.status != status:
            probe.status = status
            probe.start_status(status, self.now)
            probe.status_changed = self.now
            probe.status_alert_sent = None
-            # send email when things go back to normal, after being bad
+            # send "high temp" email if previous status was critical, even if
            # we haven't been high for that long overall
            if (status == self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP
                and prev_status in (self.enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP,
                                    self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP)
                and prev_alert_sent):
                send_email(self.config, 'tempmon_high_temp', data)
                probe.status_alert_sent = self.now
                return
            # send email when things go back to normal (i.e. from any other status)
            if status == self.enum.TEMPMON_PROBE_STATUS_GOOD_TEMP and prev_alert_sent:
                send_email(self.config, 'tempmon_good_temp', data)
                probe.status_alert_sent = self.now
                return
        # no (more) email if status is good
        if status == self.enum.TEMPMON_PROBE_STATUS_GOOD_TEMP:
@ -215,19 +229,39 @@ class TempmonServerDaemon(Daemon):
                return
        # delay even the first email, until configured threshold is reached
-        # unless we have a critical status
+        timeout = probe.timeout_for_status(status) or 0
-        if status != self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP:
+        timeout = datetime.timedelta(minutes=timeout)
-            timeout = datetime.timedelta(minutes=probe.therm_status_timeout)
+        started = probe.status_started(status) or probe.status_changed
-            if (self.now - probe.status_changed) <= timeout:
+        if (self.now - started) <= timeout:
-                return
+            return
        msgtypes = {
            self.enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP   : 'tempmon_critical_high_temp',
            self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP            : 'tempmon_high_temp',
            self.enum.TEMPMON_PROBE_STATUS_LOW_TEMP             : 'tempmon_low_temp',
            self.enum.TEMPMON_PROBE_STATUS_CRITICAL_LOW_TEMP    : 'tempmon_critical_low_temp',
            self.enum.TEMPMON_PROBE_STATUS_ERROR                : 'tempmon_error',
        }
        self.send_email(status, msgtypes[status], data)
        # maybe send more emails if config said so
        for msgtype in self.extra_emails:
            self.send_email(status, msgtype, data)
        probe.status_alert_sent = self.now
    def send_email(self, status, template, data):
        probe = data['probe']
        started = probe.status_started(status) or probe.status_changed
        # determine URL for probe, if possible
        url = self.config.get('tailbone', 'url.tempmon.probe', default='#')
        data['probe_url'] = url.format(uuid=probe.uuid)
-        since = localtime(self.config, probe.status_changed, from_utc=True)
+        since = localtime(self.config, started, from_utc=True)
        data['status_since'] = since.strftime('%I:%M %p')
-        data['status_since_delta'] = humanize.naturaltime(self.now - probe.status_changed)
+        data['status_since_delta'] = humanize.naturaltime(self.now - started)
        # fetch last 90 minutes of readings
        session = orm.object_session(probe)
@ -241,20 +275,7 @@ class TempmonServerDaemon(Daemon):
        data['recent_readings'] = readings
        data['pretty_time'] = lambda dt: localtime(self.config, dt, from_utc=True).strftime('%Y-%m-%d %I:%M %p')
-        msgtypes = {
+        send_email(self.config, template, data)
            self.enum.TEMPMON_PROBE_STATUS_LOW_TEMP             : 'tempmon_low_temp',
            self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP            : 'tempmon_high_temp',
            self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP        : 'tempmon_critical_temp',
            self.enum.TEMPMON_PROBE_STATUS_ERROR                : 'tempmon_error',
        }
        send_email(self.config, msgtypes[status], data)
        # maybe send more emails if config said so
        for msgtype in self.extra_emails:
            send_email(self.config, msgtype, data)
        probe.status_alert_sent = self.now
 def make_daemon(config, pidfile=None):
--- a/rattail_tempmon/templates/mail/tempmon_critical_high_temp.html.mako
+++ b/rattail_tempmon/templates/mail/tempmon_critical_high_temp.html.mako
--- a/rattail_tempmon/templates/mail/tempmon_critical_low_temp.html.mako
+++ b/rattail_tempmon/templates/mail/tempmon_critical_low_temp.html.mako
@ -0,0 +1,25 @@
 ## -*- coding: utf-8 -*-
 <html>
  <body>
    <p>
      <b>This is an alert from ${probe}!</b><br>
      The status of ${probe} is: ${status}.<br>
      The current temperature is: ${reading.degrees_f}.<br>
      The temperature should never be this low.
      Investigate Immediately!<br>
    </p>
    <p>
      Notes: <br>
      Frozen food that is above 40 degrees needs to be thrown away<br>
      if it remains at that temperature for two hours or more.<br>
    </p>
    <p>
      Check out <a href="http://www.fsis.usda.gov/wps/portal/fsis/topics/food-safety-education/get-answers/food-safety-fact-sheets/safe-food-handling/freezing-and-food-safety/CT_Index/!ut/p/a1/jZFRT8IwEIB_DY9dbw7J8G1ZYtiUTYJK2Qsp7NYt2dqlrU759RZ8UQJK-9LefV-ud6UFZbSQ_L0R3DZK8vZwLyYbWMDEn8aQ5lP_HpLsdZE_xDGEy1sHrP8AsuBK_8KK4D8_vaLAjZ7Hc0GLntuaNLJSlAm0hEszoDaUVUqVxPAK7Sep-M4SUyNalzjEyDFbc1m2jRQO1oh7d3J6SX6YlMXPm0SW-EFXtPj9KvDdTrJgOZ6lWQD5-BQ4M7Zv4PJcXOOiVdvjH60juQ1C16HGCjVq7027cG1tb-5GMIJhGDyhlGjR26nunFArYyk74fruhe0foxk0T90qNNEXiOIqAA!!/#16">this USDA link</a> for useful information
    </p>
    <p>
      This email will repeat every ${probe.status_alert_timeout} minutes until the issue<br>
      has been resolved.
    </p>
  </body>
 </html>