Add per-status timeouts and tracking for probe status

I.e., this lets us record when a probe first becomes "high temp", so that if it later becomes "critical high temp" we still know how long it has been high overall.
2018-10-19 14:58:30 -05:00 · 2018-10-19 14:58:30 -05:00 · 19553edda6
commit 19553edda6
parent 8be64c0580
6 changed files with 324 additions and 41 deletions
--- a/rattail_tempmon/server.py
+++ b/rattail_tempmon/server.py
@ -124,7 +124,7 @@ class TempmonServerDaemon(Daemon):
        cutoff = self.now - datetime.timedelta(seconds=delay + 60)
        online = False
        for probe in client.enabled_probes():
-            if self.check_readings_for_probe(session, probe, cutoff) and not online:
+            if self.check_readings_for_probe(session, probe, cutoff):
                online = True

        # if client was previously marked online, but we have no "new"
@ -147,7 +147,6 @@ class TempmonServerDaemon(Daemon):
                    'now': localtime(self.config, self.now, from_utc=True),
                })

-
    def check_readings_for_probe(self, session, probe, cutoff):
        """
        Check readings for the given probe, within the time window defined by
@ -161,18 +160,21 @@ class TempmonServerDaemon(Daemon):
                         .first()
        if reading:

-            # is reading below critical min, or above critical max?
-            if (reading.degrees_f <= probe.critical_temp_min or
-                  reading.degrees_f >= probe.critical_temp_max):
-                self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP, reading)
+            # is reading above critical max?
+            if reading.degrees_f >= probe.critical_temp_max:
+                self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP, reading)

-            # is reading below "good" min?
-            elif reading.degrees_f < probe.good_temp_min:
+            # is reading above good max?
+            elif reading.degrees_f >= probe.good_temp_max:
+                self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP, reading)
+
+            # is reading below good min?
+            elif reading.degrees_f <= probe.good_temp_min:
                self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_LOW_TEMP, reading)

-            # is reading above "good" max?
-            elif reading.degrees_f > probe.good_temp_max:
-                self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP, reading)
+            # is reading below critical min?
+            elif reading.degrees_f <= probe.critical_temp_min:
+                self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_CRITICAL_LOW_TEMP, reading)

            else: # temp is good
                self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_GOOD_TEMP, reading)
@ -196,13 +198,25 @@ class TempmonServerDaemon(Daemon):
        prev_alert_sent = probe.status_alert_sent
        if probe.status != status:
            probe.status = status
+            probe.start_status(status, self.now)
            probe.status_changed = self.now
            probe.status_alert_sent = None

-            # send email when things go back to normal, after being bad
+            # send "high temp" email if previous status was critical, even if
+            # we haven't been high for that long overall
+            if (status == self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP
+                and prev_status in (self.enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP,
+                                    self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP)
+                and prev_alert_sent):
+                send_email(self.config, 'tempmon_high_temp', data)
+                probe.status_alert_sent = self.now
+                return
+
+            # send email when things go back to normal (i.e. from any other status)
            if status == self.enum.TEMPMON_PROBE_STATUS_GOOD_TEMP and prev_alert_sent:
                send_email(self.config, 'tempmon_good_temp', data)
                probe.status_alert_sent = self.now
+                return

        # no (more) email if status is good
        if status == self.enum.TEMPMON_PROBE_STATUS_GOOD_TEMP:
@ -215,19 +229,39 @@ class TempmonServerDaemon(Daemon):
                return

        # delay even the first email, until configured threshold is reached
-        # unless we have a critical status
-        if status != self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP:
-            timeout = datetime.timedelta(minutes=probe.therm_status_timeout)
-            if (self.now - probe.status_changed) <= timeout:
-                return
+        timeout = probe.timeout_for_status(status) or 0
+        timeout = datetime.timedelta(minutes=timeout)
+        started = probe.status_started(status) or probe.status_changed
+        if (self.now - started) <= timeout:
+            return
+
+        msgtypes = {
+            self.enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP   : 'tempmon_critical_high_temp',
+            self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP            : 'tempmon_high_temp',
+            self.enum.TEMPMON_PROBE_STATUS_LOW_TEMP             : 'tempmon_low_temp',
+            self.enum.TEMPMON_PROBE_STATUS_CRITICAL_LOW_TEMP    : 'tempmon_critical_low_temp',
+            self.enum.TEMPMON_PROBE_STATUS_ERROR                : 'tempmon_error',
+        }
+
+        self.send_email(status, msgtypes[status], data)
+
+        # maybe send more emails if config said so
+        for msgtype in self.extra_emails:
+            self.send_email(status, msgtype, data)
+
+        probe.status_alert_sent = self.now
+
+    def send_email(self, status, template, data):
+        probe = data['probe']
+        started = probe.status_started(status) or probe.status_changed

        # determine URL for probe, if possible
        url = self.config.get('tailbone', 'url.tempmon.probe', default='#')
        data['probe_url'] = url.format(uuid=probe.uuid)

-        since = localtime(self.config, probe.status_changed, from_utc=True)
+        since = localtime(self.config, started, from_utc=True)
        data['status_since'] = since.strftime('%I:%M %p')
-        data['status_since_delta'] = humanize.naturaltime(self.now - probe.status_changed)
+        data['status_since_delta'] = humanize.naturaltime(self.now - started)

        # fetch last 90 minutes of readings
        session = orm.object_session(probe)
@ -241,20 +275,7 @@ class TempmonServerDaemon(Daemon):
        data['recent_readings'] = readings
        data['pretty_time'] = lambda dt: localtime(self.config, dt, from_utc=True).strftime('%Y-%m-%d %I:%M %p')

-        msgtypes = {
-            self.enum.TEMPMON_PROBE_STATUS_LOW_TEMP             : 'tempmon_low_temp',
-            self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP            : 'tempmon_high_temp',
-            self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP        : 'tempmon_critical_temp',
-            self.enum.TEMPMON_PROBE_STATUS_ERROR                : 'tempmon_error',
-        }
-
-        send_email(self.config, msgtypes[status], data)
-
-        # maybe send more emails if config said so
-        for msgtype in self.extra_emails:
-            send_email(self.config, msgtype, data)
-
-        probe.status_alert_sent = self.now
+        send_email(self.config, template, data)


 def make_daemon(config, pidfile=None):