#!/bin/bash
#
# healthd -- 	This is a simple daemon which can be used to alert you in the
#		event of a hardware health monitoring alarm by sending an 
#		email to the value of ADMIN_EMAIL (not defined in this script;
#		expected to come from /etc/sysconfig/healthd).
#
# To Use  --	Simply start the daemon from a shell (may be backgrounded)
#
# Other details -- Checks status every 60 seconds.  Sends warning emails every
#		   ten minutes during alarm until the alarm is cleared.
#		   It won't start up if there is a pending alarm on startup.
#		   Very low loading on the machine (sleeps almost all the time).
#		   This is just an example.  It works, but hopefully we can
#		   get something better written. :')
#
# Requirements -- mail, wall, sensors, rrdtool, bash, sleep
#
# Written & Copyrighten by Philip Edelbrock, 1999.
#
# Version: 1.1
#

# Site-specific settings are sourced from /etc/sysconfig/healthd.  This script
# reads CPU_TEMP_LIMIT, DISK_TEMP_LIMIT, HEALTHDIR and ADMIN_EMAIL without
# assigning initial values itself, so they are expected to come from there.
if [ ! -f /etc/sysconfig/healthd ]; then
	# Diagnostics belong on stderr so they are not mixed into normal output.
	echo "Missing /etc/sysconfig/healthd" >&2
	exit 1
fi
. /etc/sysconfig/healthd

# Refuse to start unless both temperature thresholds have a non-empty value.
[ -n "$CPU_TEMP_LIMIT" ] || {
	echo "Error: CPU_TEMP_LIMIT is not set" >&2
	exit 1
}

[ -n "$DISK_TEMP_LIMIT" ] || {
	echo "Error: DISK_TEMP_LIMIT is not set" >&2
	exit 1
}

# Seconds to sleep between checks; the RRD is treated as out of date once its
# last update is more than three of these intervals old.
DELAY=60
# Seconds to sleep after a temperature alarm has been raised.
ALARM_DELAY=600
# Round-robin database queried with `rrdtool lastupdate`.
RRDFILE="${HEALTHDIR}/health.rrd"

# alert FIELD...
#
# Raise a temperature alarm.  Writes a short notice to stderr, broadcasts the
# given fields through wall and, when ADMIN_EMAIL is non-empty, mails a report
# containing the configured limits, the current date, the cached LABELS line
# and the fields themselves.
#
# Arguments: the fields of the offending sample; they are joined with "$*".
# Globals read: ADMIN_EMAIL, CPU_TEMP_LIMIT, DISK_TEMP_LIMIT, LABELS
alert()
{
  echo "Temperature alarm" >&2
  echo "healthd-alert: Temperature alarm: $*" | wall
  if [ -n "$ADMIN_EMAIL" ]; then
	  (
		  echo "Temperature Alarm"
		  # Report the limits that are actually compared against; the
		  # script never sets CPU_LIMIT / DISK_LIMIT.
		  echo "  CPU Limit = $CPU_TEMP_LIMIT"
		  echo "  Disk Limit = $DISK_TEMP_LIMIT"
		  echo "Date = $(date)  // $(date '+%s')"
		  echo "Timestamp $LABELS"
		  echo "$*"
	  # ADMIN_EMAIL is left unquoted so that a space-separated list of
	  # addresses is passed to mail as separate arguments.
	  # shellcheck disable=SC2086
	  ) | mail -s "**** Hardware Health Warning on $(hostname) ****" $ADMIN_EMAIL
  fi
}

# alert_too_old
#
# Report that the RRD has not been updated recently.  Writes a notice to
# stderr and, when ADMIN_EMAIL is non-empty, mails the current date together
# with the output of `rrdtool lastupdate` for RRDFILE.
#
# Globals read: ADMIN_EMAIL, RRDFILE
alert_too_old()
{
  echo "RRD is too old. Something went wrong" >&2

  if [ -n "$ADMIN_EMAIL" ]; then
	  (
		  echo "Date = $(date)  // $(date '+%s')"
		  echo "Last RRD Update:"
		  rrdtool lastupdate "$RRDFILE"
	  # The subject must be double-quoted: inside single quotes $(hostname)
	  # is sent literally instead of being replaced by the host name.
	  # ADMIN_EMAIL is left unquoted so that a space-separated list of
	  # addresses is passed to mail as separate arguments.
	  # shellcheck disable=SC2086
	  ) | mail -s "**** Hardware Health Warning on $(hostname) - RRD Out-of-date ****" $ADMIN_EMAIL
  fi

}
# If bash can load "sleep" as a builtin from sleep.so, use it, so that the
# pauses in the main loop do not have to spawn a new process each time.
# Any failure is silenced and ignored.
enable -f sleep.so sleep &>/dev/null


# Cache the first line of `rrdtool lastupdate` output once at startup and
# split it into words.  The main loop uses array[i] as the name of the i-th
# reading when choosing which limit applies to it.
# RRDFILE is quoted so a path containing spaces or glob characters is passed
# to rrdtool as a single argument.
LABELS=$(rrdtool lastupdate "$RRDFILE" | head -n 1)
read -r -a array <<< "$LABELS"

# Print the lowest "high" temperature threshold (whole degrees) found on the
# "Core" lines of `sensors` output, or nothing if no such line can be parsed.
_sensors_min_high_temp()
{
	# sed -n ... p prints only the lines on which the substitution matched.
	# Without it, a Core line lacking a "high = " field passes through
	# unchanged, sorts ahead of the numbers and is picked by head, so the
	# valid thresholds of the other cores are thrown away.
	sensors \
		| grep Core \
		| sed -n -e 's/.*high = +\?\([0-9]\+\)\.[0-9]\+°C.*/\1/p' \
		| sort -n -u \
		| head -n 1
}

# Try to get the CPU temperature limit from sensors
SENSORS_HIGH_TEMP=$(_sensors_min_high_temp)
case $SENSORS_HIGH_TEMP in
	''|*[!0-9]*)
		# Empty or not a plain integer: keep the sysconfig value.
		:
		;;
	*)
		# Good number. Override sysconfig default with it
		CPU_TEMP_LIMIT=$SENSORS_HIGH_TEMP
		echo "Using CPU specific value for temperature alert: ${SENSORS_HIGH_TEMP}"
		;;
esac

# Main loop.  On every pass, read the newest RRD sample and compare each
# reading with the limit selected by its label (array[], filled at startup):
#   - sample unreadable, or older than 3 * DELAY seconds -> alert_too_old
#   - any reading >= its limit -> alert, then sleep ALARM_DELAY
# Every pass ends with a sleep of DELAY seconds.
while true
do
	# The last output line has the form "<timestamp>: <reading> <reading> ...";
	# drop the colon so the line splits into whitespace-separated fields.
	temps=$(rrdtool lastupdate "$RRDFILE" | tail -n 1 | sed -e 's/://g')
	read -r -a fields <<< "$temps"
	ts=${fields[0]}

	# An empty or non-numeric timestamp means rrdtool produced nothing usable.
	# Treat that like an out-of-date RRD instead of silently skipping the
	# check, which would leave the machine unmonitored without any notice.
	stale=0
	case $ts in
		''|*[!0-9]*)
			stale=1
			;;
		*)
			# 10# forces base 10 even if the value has leading zeros.
			if [ "$((10#$ts + 3 * DELAY))" -lt "$(date '+%s')" ]; then
				stale=1
			fi
			;;
	esac
	if [ "$stale" -eq 1 ]; then
		alert_too_old
		sleep "$DELAY"
		continue
	fi

	# fields[0] is the timestamp; fields[idx] pairs with label array[idx-1].
	for (( idx = 1; idx < ${#fields[@]}; idx++ )); do
		label=${array[idx-1]}

		# Limit applied to readings whose label matches none of the patterns.
		LIMIT=99
		if [[ $label == *CPU* || $label == *Core* ]]; then
			LIMIT=$CPU_TEMP_LIMIT
		elif [[ $label == *sd* || $label == *HDD* ]]; then
			LIMIT=$DISK_TEMP_LIMIT
		fi

		# Keep only the integer part of the reading.
		sane_temp=${fields[idx]%%.*}

		# Skip readings that are not integers; they cannot be compared and
		# would only make the test below print an error on every pass.
		[[ $sane_temp =~ ^-?[0-9]+$ ]] || continue

		if [ "$sane_temp" -ge "$LIMIT" ]; then
			# Pass the whole sample (timestamp and all readings) to alert.
			alert "${fields[@]}"
			sleep "$ALARM_DELAY"
			break
		fi
	done
	sleep "$DELAY"
done
