Add per-status timeouts and tracking for probe status

i.e. this lets us keep track of when a probe first becomes "high temp", so that
if it later becomes "critical high temp" we still know how long it has been high.
This commit is contained in:
Lance Edgar 2018-10-19 14:58:30 -05:00
parent 8be64c0580
commit 19553edda6
6 changed files with 324 additions and 41 deletions

View file

@ -124,7 +124,7 @@ class TempmonServerDaemon(Daemon):
cutoff = self.now - datetime.timedelta(seconds=delay + 60)
online = False
for probe in client.enabled_probes():
if self.check_readings_for_probe(session, probe, cutoff) and not online:
if self.check_readings_for_probe(session, probe, cutoff):
online = True
# if client was previously marked online, but we have no "new"
@ -147,7 +147,6 @@ class TempmonServerDaemon(Daemon):
'now': localtime(self.config, self.now, from_utc=True),
})
def check_readings_for_probe(self, session, probe, cutoff):
"""
Check readings for the given probe, within the time window defined by
@ -161,18 +160,21 @@ class TempmonServerDaemon(Daemon):
.first()
if reading:
# is reading below critical min, or above critical max?
if (reading.degrees_f <= probe.critical_temp_min or
reading.degrees_f >= probe.critical_temp_max):
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP, reading)
# is reading above critical max?
if reading.degrees_f >= probe.critical_temp_max:
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP, reading)
# is reading below "good" min?
elif reading.degrees_f < probe.good_temp_min:
# is reading above good max?
elif reading.degrees_f >= probe.good_temp_max:
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP, reading)
# is reading below good min?
elif reading.degrees_f <= probe.good_temp_min:
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_LOW_TEMP, reading)
# is reading above "good" max?
elif reading.degrees_f > probe.good_temp_max:
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP, reading)
# is reading below critical min?
elif reading.degrees_f <= probe.critical_temp_min:
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_CRITICAL_LOW_TEMP, reading)
else: # temp is good
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_GOOD_TEMP, reading)
@ -196,13 +198,25 @@ class TempmonServerDaemon(Daemon):
prev_alert_sent = probe.status_alert_sent
if probe.status != status:
probe.status = status
probe.start_status(status, self.now)
probe.status_changed = self.now
probe.status_alert_sent = None
# send email when things go back to normal, after being bad
# send "high temp" email if previous status was critical, even if
# we haven't been high for that long overall
if (status == self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP
and prev_status in (self.enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP,
self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP)
and prev_alert_sent):
send_email(self.config, 'tempmon_high_temp', data)
probe.status_alert_sent = self.now
return
# send email when things go back to normal (i.e. from any other status)
if status == self.enum.TEMPMON_PROBE_STATUS_GOOD_TEMP and prev_alert_sent:
send_email(self.config, 'tempmon_good_temp', data)
probe.status_alert_sent = self.now
return
# no (more) email if status is good
if status == self.enum.TEMPMON_PROBE_STATUS_GOOD_TEMP:
@ -215,19 +229,39 @@ class TempmonServerDaemon(Daemon):
return
# delay even the first email, until configured threshold is reached
# unless we have a critical status
if status != self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP:
timeout = datetime.timedelta(minutes=probe.therm_status_timeout)
if (self.now - probe.status_changed) <= timeout:
return
timeout = probe.timeout_for_status(status) or 0
timeout = datetime.timedelta(minutes=timeout)
started = probe.status_started(status) or probe.status_changed
if (self.now - started) <= timeout:
return
msgtypes = {
self.enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP : 'tempmon_critical_high_temp',
self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP : 'tempmon_high_temp',
self.enum.TEMPMON_PROBE_STATUS_LOW_TEMP : 'tempmon_low_temp',
self.enum.TEMPMON_PROBE_STATUS_CRITICAL_LOW_TEMP : 'tempmon_critical_low_temp',
self.enum.TEMPMON_PROBE_STATUS_ERROR : 'tempmon_error',
}
self.send_email(status, msgtypes[status], data)
# maybe send more emails if config said so
for msgtype in self.extra_emails:
self.send_email(status, msgtype, data)
probe.status_alert_sent = self.now
def send_email(self, status, template, data):
probe = data['probe']
started = probe.status_started(status) or probe.status_changed
# determine URL for probe, if possible
url = self.config.get('tailbone', 'url.tempmon.probe', default='#')
data['probe_url'] = url.format(uuid=probe.uuid)
since = localtime(self.config, probe.status_changed, from_utc=True)
since = localtime(self.config, started, from_utc=True)
data['status_since'] = since.strftime('%I:%M %p')
data['status_since_delta'] = humanize.naturaltime(self.now - probe.status_changed)
data['status_since_delta'] = humanize.naturaltime(self.now - started)
# fetch last 90 minutes of readings
session = orm.object_session(probe)
@ -241,20 +275,7 @@ class TempmonServerDaemon(Daemon):
data['recent_readings'] = readings
data['pretty_time'] = lambda dt: localtime(self.config, dt, from_utc=True).strftime('%Y-%m-%d %I:%M %p')
msgtypes = {
self.enum.TEMPMON_PROBE_STATUS_LOW_TEMP : 'tempmon_low_temp',
self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP : 'tempmon_high_temp',
self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP : 'tempmon_critical_temp',
self.enum.TEMPMON_PROBE_STATUS_ERROR : 'tempmon_error',
}
send_email(self.config, msgtypes[status], data)
# maybe send more emails if config said so
for msgtype in self.extra_emails:
send_email(self.config, msgtype, data)
probe.status_alert_sent = self.now
send_email(self.config, template, data)
def make_daemon(config, pidfile=None):