Add per-status timeouts and tracking for probe status
I.e., this lets us keep track of when a probe becomes "high temp"; if it later becomes "critical high temp", we can still know how long it has been high.
This commit is contained in:
parent
8be64c0580
commit
19553edda6
6 changed files with 324 additions and 41 deletions
|
@ -124,7 +124,7 @@ class TempmonServerDaemon(Daemon):
|
|||
cutoff = self.now - datetime.timedelta(seconds=delay + 60)
|
||||
online = False
|
||||
for probe in client.enabled_probes():
|
||||
if self.check_readings_for_probe(session, probe, cutoff) and not online:
|
||||
if self.check_readings_for_probe(session, probe, cutoff):
|
||||
online = True
|
||||
|
||||
# if client was previously marked online, but we have no "new"
|
||||
|
@ -147,7 +147,6 @@ class TempmonServerDaemon(Daemon):
|
|||
'now': localtime(self.config, self.now, from_utc=True),
|
||||
})
|
||||
|
||||
|
||||
def check_readings_for_probe(self, session, probe, cutoff):
|
||||
"""
|
||||
Check readings for the given probe, within the time window defined by
|
||||
|
@ -161,18 +160,21 @@ class TempmonServerDaemon(Daemon):
|
|||
.first()
|
||||
if reading:
|
||||
|
||||
# is reading below critical min, or above critical max?
|
||||
if (reading.degrees_f <= probe.critical_temp_min or
|
||||
reading.degrees_f >= probe.critical_temp_max):
|
||||
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP, reading)
|
||||
# is reading above critical max?
|
||||
if reading.degrees_f >= probe.critical_temp_max:
|
||||
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP, reading)
|
||||
|
||||
# is reading below "good" min?
|
||||
elif reading.degrees_f < probe.good_temp_min:
|
||||
# is reading above good max?
|
||||
elif reading.degrees_f >= probe.good_temp_max:
|
||||
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP, reading)
|
||||
|
||||
# is reading below good min?
|
||||
elif reading.degrees_f <= probe.good_temp_min:
|
||||
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_LOW_TEMP, reading)
|
||||
|
||||
# is reading above "good" max?
|
||||
elif reading.degrees_f > probe.good_temp_max:
|
||||
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP, reading)
|
||||
# is reading below critical min?
|
||||
elif reading.degrees_f <= probe.critical_temp_min:
|
||||
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_CRITICAL_LOW_TEMP, reading)
|
||||
|
||||
else: # temp is good
|
||||
self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_GOOD_TEMP, reading)
|
||||
|
@ -196,13 +198,25 @@ class TempmonServerDaemon(Daemon):
|
|||
prev_alert_sent = probe.status_alert_sent
|
||||
if probe.status != status:
|
||||
probe.status = status
|
||||
probe.start_status(status, self.now)
|
||||
probe.status_changed = self.now
|
||||
probe.status_alert_sent = None
|
||||
|
||||
# send email when things go back to normal, after being bad
|
||||
# send "high temp" email if previous status was critical, even if
|
||||
# we haven't been high for that long overall
|
||||
if (status == self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP
|
||||
and prev_status in (self.enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP,
|
||||
self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP)
|
||||
and prev_alert_sent):
|
||||
send_email(self.config, 'tempmon_high_temp', data)
|
||||
probe.status_alert_sent = self.now
|
||||
return
|
||||
|
||||
# send email when things go back to normal (i.e. from any other status)
|
||||
if status == self.enum.TEMPMON_PROBE_STATUS_GOOD_TEMP and prev_alert_sent:
|
||||
send_email(self.config, 'tempmon_good_temp', data)
|
||||
probe.status_alert_sent = self.now
|
||||
return
|
||||
|
||||
# no (more) email if status is good
|
||||
if status == self.enum.TEMPMON_PROBE_STATUS_GOOD_TEMP:
|
||||
|
@ -215,19 +229,39 @@ class TempmonServerDaemon(Daemon):
|
|||
return
|
||||
|
||||
# delay even the first email, until configured threshold is reached
|
||||
# unless we have a critical status
|
||||
if status != self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP:
|
||||
timeout = datetime.timedelta(minutes=probe.therm_status_timeout)
|
||||
if (self.now - probe.status_changed) <= timeout:
|
||||
return
|
||||
timeout = probe.timeout_for_status(status) or 0
|
||||
timeout = datetime.timedelta(minutes=timeout)
|
||||
started = probe.status_started(status) or probe.status_changed
|
||||
if (self.now - started) <= timeout:
|
||||
return
|
||||
|
||||
msgtypes = {
|
||||
self.enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP : 'tempmon_critical_high_temp',
|
||||
self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP : 'tempmon_high_temp',
|
||||
self.enum.TEMPMON_PROBE_STATUS_LOW_TEMP : 'tempmon_low_temp',
|
||||
self.enum.TEMPMON_PROBE_STATUS_CRITICAL_LOW_TEMP : 'tempmon_critical_low_temp',
|
||||
self.enum.TEMPMON_PROBE_STATUS_ERROR : 'tempmon_error',
|
||||
}
|
||||
|
||||
self.send_email(status, msgtypes[status], data)
|
||||
|
||||
# maybe send more emails if config said so
|
||||
for msgtype in self.extra_emails:
|
||||
self.send_email(status, msgtype, data)
|
||||
|
||||
probe.status_alert_sent = self.now
|
||||
|
||||
def send_email(self, status, template, data):
|
||||
probe = data['probe']
|
||||
started = probe.status_started(status) or probe.status_changed
|
||||
|
||||
# determine URL for probe, if possible
|
||||
url = self.config.get('tailbone', 'url.tempmon.probe', default='#')
|
||||
data['probe_url'] = url.format(uuid=probe.uuid)
|
||||
|
||||
since = localtime(self.config, probe.status_changed, from_utc=True)
|
||||
since = localtime(self.config, started, from_utc=True)
|
||||
data['status_since'] = since.strftime('%I:%M %p')
|
||||
data['status_since_delta'] = humanize.naturaltime(self.now - probe.status_changed)
|
||||
data['status_since_delta'] = humanize.naturaltime(self.now - started)
|
||||
|
||||
# fetch last 90 minutes of readings
|
||||
session = orm.object_session(probe)
|
||||
|
@ -241,20 +275,7 @@ class TempmonServerDaemon(Daemon):
|
|||
data['recent_readings'] = readings
|
||||
data['pretty_time'] = lambda dt: localtime(self.config, dt, from_utc=True).strftime('%Y-%m-%d %I:%M %p')
|
||||
|
||||
msgtypes = {
|
||||
self.enum.TEMPMON_PROBE_STATUS_LOW_TEMP : 'tempmon_low_temp',
|
||||
self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP : 'tempmon_high_temp',
|
||||
self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP : 'tempmon_critical_temp',
|
||||
self.enum.TEMPMON_PROBE_STATUS_ERROR : 'tempmon_error',
|
||||
}
|
||||
|
||||
send_email(self.config, msgtypes[status], data)
|
||||
|
||||
# maybe send more emails if config said so
|
||||
for msgtype in self.extra_emails:
|
||||
send_email(self.config, msgtype, data)
|
||||
|
||||
probe.status_alert_sent = self.now
|
||||
send_email(self.config, template, data)
|
||||
|
||||
|
||||
def make_daemon(config, pidfile=None):
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue