Add per-status timeouts and tracking for probe status
This lets us keep track of when a probe first becomes "high temp", so that if it later becomes "critical high temp" we still know how long it has been high overall.
This commit is contained in:
		
							parent
							
								
									8be64c0580
								
							
						
					
					
						commit
						19553edda6
					
				
					 6 changed files with 324 additions and 41 deletions
				
			
		| 
						 | 
				
			
			@ -124,7 +124,7 @@ class TempmonServerDaemon(Daemon):
 | 
			
		|||
        cutoff = self.now - datetime.timedelta(seconds=delay + 60)
 | 
			
		||||
        online = False
 | 
			
		||||
        for probe in client.enabled_probes():
 | 
			
		||||
            if self.check_readings_for_probe(session, probe, cutoff) and not online:
 | 
			
		||||
            if self.check_readings_for_probe(session, probe, cutoff):
 | 
			
		||||
                online = True
 | 
			
		||||
 | 
			
		||||
        # if client was previously marked online, but we have no "new"
 | 
			
		||||
| 
						 | 
				
			
			@ -147,7 +147,6 @@ class TempmonServerDaemon(Daemon):
 | 
			
		|||
                    'now': localtime(self.config, self.now, from_utc=True),
 | 
			
		||||
                })
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    def check_readings_for_probe(self, session, probe, cutoff):
 | 
			
		||||
        """
 | 
			
		||||
        Check readings for the given probe, within the time window defined by
 | 
			
		||||
| 
						 | 
				
			
			@ -161,18 +160,21 @@ class TempmonServerDaemon(Daemon):
 | 
			
		|||
                         .first()
 | 
			
		||||
        if reading:
 | 
			
		||||
 | 
			
		||||
            # is reading below critical min, or above critical max?
 | 
			
		||||
            if (reading.degrees_f <= probe.critical_temp_min or
 | 
			
		||||
                  reading.degrees_f >= probe.critical_temp_max):
 | 
			
		||||
                self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP, reading)
 | 
			
		||||
            # is reading above critical max?
 | 
			
		||||
            if reading.degrees_f >= probe.critical_temp_max:
 | 
			
		||||
                self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP, reading)
 | 
			
		||||
 | 
			
		||||
            # is reading below "good" min?
 | 
			
		||||
            elif reading.degrees_f < probe.good_temp_min:
 | 
			
		||||
            # is reading above good max?
 | 
			
		||||
            elif reading.degrees_f >= probe.good_temp_max:
 | 
			
		||||
                self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP, reading)
 | 
			
		||||
 | 
			
		||||
            # is reading below good min?
 | 
			
		||||
            elif reading.degrees_f <= probe.good_temp_min:
 | 
			
		||||
                self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_LOW_TEMP, reading)
 | 
			
		||||
 | 
			
		||||
            # is reading above "good" max?
 | 
			
		||||
            elif reading.degrees_f > probe.good_temp_max:
 | 
			
		||||
                self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP, reading)
 | 
			
		||||
            # is reading below critical min?
 | 
			
		||||
            elif reading.degrees_f <= probe.critical_temp_min:
 | 
			
		||||
                self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_CRITICAL_LOW_TEMP, reading)
 | 
			
		||||
 | 
			
		||||
            else: # temp is good
 | 
			
		||||
                self.update_status(probe, self.enum.TEMPMON_PROBE_STATUS_GOOD_TEMP, reading)
 | 
			
		||||
| 
						 | 
				
			
			@ -196,13 +198,25 @@ class TempmonServerDaemon(Daemon):
 | 
			
		|||
        prev_alert_sent = probe.status_alert_sent
 | 
			
		||||
        if probe.status != status:
 | 
			
		||||
            probe.status = status
 | 
			
		||||
            probe.start_status(status, self.now)
 | 
			
		||||
            probe.status_changed = self.now
 | 
			
		||||
            probe.status_alert_sent = None
 | 
			
		||||
 | 
			
		||||
            # send email when things go back to normal, after being bad
 | 
			
		||||
            # send "high temp" email if previous status was critical, even if
 | 
			
		||||
            # we haven't been high for that long overall
 | 
			
		||||
            if (status == self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP
 | 
			
		||||
                and prev_status in (self.enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP,
 | 
			
		||||
                                    self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP)
 | 
			
		||||
                and prev_alert_sent):
 | 
			
		||||
                send_email(self.config, 'tempmon_high_temp', data)
 | 
			
		||||
                probe.status_alert_sent = self.now
 | 
			
		||||
                return
 | 
			
		||||
 | 
			
		||||
            # send email when things go back to normal (i.e. from any other status)
 | 
			
		||||
            if status == self.enum.TEMPMON_PROBE_STATUS_GOOD_TEMP and prev_alert_sent:
 | 
			
		||||
                send_email(self.config, 'tempmon_good_temp', data)
 | 
			
		||||
                probe.status_alert_sent = self.now
 | 
			
		||||
                return
 | 
			
		||||
 | 
			
		||||
        # no (more) email if status is good
 | 
			
		||||
        if status == self.enum.TEMPMON_PROBE_STATUS_GOOD_TEMP:
 | 
			
		||||
| 
						 | 
				
			
			@ -215,19 +229,39 @@ class TempmonServerDaemon(Daemon):
 | 
			
		|||
                return
 | 
			
		||||
 | 
			
		||||
        # delay even the first email, until configured threshold is reached
 | 
			
		||||
        # unless we have a critical status
 | 
			
		||||
        if status != self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP:
 | 
			
		||||
            timeout = datetime.timedelta(minutes=probe.therm_status_timeout)
 | 
			
		||||
            if (self.now - probe.status_changed) <= timeout:
 | 
			
		||||
                return
 | 
			
		||||
        timeout = probe.timeout_for_status(status) or 0
 | 
			
		||||
        timeout = datetime.timedelta(minutes=timeout)
 | 
			
		||||
        started = probe.status_started(status) or probe.status_changed
 | 
			
		||||
        if (self.now - started) <= timeout:
 | 
			
		||||
            return
 | 
			
		||||
 | 
			
		||||
        msgtypes = {
 | 
			
		||||
            self.enum.TEMPMON_PROBE_STATUS_CRITICAL_HIGH_TEMP   : 'tempmon_critical_high_temp',
 | 
			
		||||
            self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP            : 'tempmon_high_temp',
 | 
			
		||||
            self.enum.TEMPMON_PROBE_STATUS_LOW_TEMP             : 'tempmon_low_temp',
 | 
			
		||||
            self.enum.TEMPMON_PROBE_STATUS_CRITICAL_LOW_TEMP    : 'tempmon_critical_low_temp',
 | 
			
		||||
            self.enum.TEMPMON_PROBE_STATUS_ERROR                : 'tempmon_error',
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        self.send_email(status, msgtypes[status], data)
 | 
			
		||||
 | 
			
		||||
        # maybe send more emails if config said so
 | 
			
		||||
        for msgtype in self.extra_emails:
 | 
			
		||||
            self.send_email(status, msgtype, data)
 | 
			
		||||
 | 
			
		||||
        probe.status_alert_sent = self.now
 | 
			
		||||
 | 
			
		||||
    def send_email(self, status, template, data):
 | 
			
		||||
        probe = data['probe']
 | 
			
		||||
        started = probe.status_started(status) or probe.status_changed
 | 
			
		||||
 | 
			
		||||
        # determine URL for probe, if possible
 | 
			
		||||
        url = self.config.get('tailbone', 'url.tempmon.probe', default='#')
 | 
			
		||||
        data['probe_url'] = url.format(uuid=probe.uuid)
 | 
			
		||||
 | 
			
		||||
        since = localtime(self.config, probe.status_changed, from_utc=True)
 | 
			
		||||
        since = localtime(self.config, started, from_utc=True)
 | 
			
		||||
        data['status_since'] = since.strftime('%I:%M %p')
 | 
			
		||||
        data['status_since_delta'] = humanize.naturaltime(self.now - probe.status_changed)
 | 
			
		||||
        data['status_since_delta'] = humanize.naturaltime(self.now - started)
 | 
			
		||||
 | 
			
		||||
        # fetch last 90 minutes of readings
 | 
			
		||||
        session = orm.object_session(probe)
 | 
			
		||||
| 
						 | 
				
			
			@ -241,20 +275,7 @@ class TempmonServerDaemon(Daemon):
 | 
			
		|||
        data['recent_readings'] = readings
 | 
			
		||||
        data['pretty_time'] = lambda dt: localtime(self.config, dt, from_utc=True).strftime('%Y-%m-%d %I:%M %p')
 | 
			
		||||
 | 
			
		||||
        msgtypes = {
 | 
			
		||||
            self.enum.TEMPMON_PROBE_STATUS_LOW_TEMP             : 'tempmon_low_temp',
 | 
			
		||||
            self.enum.TEMPMON_PROBE_STATUS_HIGH_TEMP            : 'tempmon_high_temp',
 | 
			
		||||
            self.enum.TEMPMON_PROBE_STATUS_CRITICAL_TEMP        : 'tempmon_critical_temp',
 | 
			
		||||
            self.enum.TEMPMON_PROBE_STATUS_ERROR                : 'tempmon_error',
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        send_email(self.config, msgtypes[status], data)
 | 
			
		||||
 | 
			
		||||
        # maybe send more emails if config said so
 | 
			
		||||
        for msgtype in self.extra_emails:
 | 
			
		||||
            send_email(self.config, msgtype, data)
 | 
			
		||||
 | 
			
		||||
        probe.status_alert_sent = self.now
 | 
			
		||||
        send_email(self.config, template, data)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def make_daemon(config, pidfile=None):
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue