#! /bin/bash
# file: /usr/sbin/check-split-brain.sh
# auth: jro@suse.de
# modified: mguertler@novell.com
# date: 21.01.08
# copyright: Novell Deutschland GmbH
# licence: GPL
# desc: check the local cluster status; if we detect a split brain problem
#       (neither the ping node nor the other cluster node can be reached)
#       we reboot this machine as soon as possible
#

# set -x

# Command lookup is limited to the system binary directories.
PATH="/sbin:/usr/sbin:/usr/bin:/bin"

# Node name of this machine, as printed by "uname -n".
THISHOST="$(uname -n)"

# Any non-empty value enables detailed output to syslog.
DEBUG="1"

# Seconds to sleep before the script terminates.
SLEEPONEXIT="20"

# Seconds to sleep before control returns to the main loop.
SLEEPONRETURN="20"

##################################################################


#
# Function DoCheckSplitBrain (running in MainLoop, see below)
#
# Performs one split-brain check and sleeps before returning, so the caller
# can invoke it in an endless loop without spinning.
#
# Sequence:
#   1. Verify that heartbeat is running locally (cl_status needs it).
#   2. Fetch the cluster node list and the ping node list from cl_status.
#   3. Ping the ping node; a single answer means everything is fine.
#   4. If the ping node stays silent for the whole observation window, ping
#      the other cluster node(s); if one answers, only the ping node is down.
#   5. Otherwise sync, remount read-only and reboot via /proc/sysrq-trigger.
#
# Globals read:    THISHOST, DEBUG, SLEEPONRETURN, SLEEPONEXIT
# Globals written: NODELIST, PINGNODES, PINGTIMEOUT, FAULTYPINGS
# Returns: 1 when heartbeat is stopped on this machine, 0 when the check
#          ended without detecting a split brain.
# Exits:   2 when the node list cannot be read, 1 when the ping node list
#          cannot be read, 255 after the emergency reboot was triggered.
#
function DoCheckSplitBrain {
	local machine machine2 attempt_start elapsed

	# check the current local heartbeat status
	# we need an up and running heartbeat for cl_status
	if ! cl_status nodestatus "$THISHOST" >/dev/null 2>&1
	then
		logger "$0: heartbeat is stopped on this machine."
		sleep "$SLEEPONRETURN"
		return 1
	fi

	# get the names of the cluster nodes; they are needed later to find
	# out whether the other node is still reachable, because we do not
	# want to take this machine down while only the ping node is gone
	if ! NODELIST=$(cl_status listnodes -n)
	then
		logger "$0: cannot get list of nodenames"
		sleep "$SLEEPONEXIT"
		exit 2
	fi

	if [ -n "$DEBUG" ]
	then
		logger "$0: check if we can reach the ping node ..."
	fi

	if ! PINGNODES=$(cl_status listnodes -p 2>/dev/null)
	then
		logger "$0: unexpected error, cannot get list of ping nodes"
		sleep "$SLEEPONEXIT"
		exit 1
	fi

	# timeout for pings in seconds
	PINGTIMEOUT=5

	# actual count of faulty ping packets
	FAULTYPINGS=0

	# NOTE(review): every path through the loop body returns or exits, so
	# only the first entry of PINGNODES is ever evaluated -- confirm that
	# this is intended for setups with more than one ping node.
	# PINGNODES and NODELIST are left unquoted on purpose: they are
	# whitespace separated lists and have to be split into words.
	for machine in $PINGNODES
	do
		# If ping fails, let us see if the failure persists at least
		#   65 seconds:
		while [ "$FAULTYPINGS" -le $((65 / PINGTIMEOUT)) ]
		do
			attempt_start=$SECONDS
			if ping -c 1 -W "$PINGTIMEOUT" "$machine" >/dev/null 2>&1
			then
				# Got a successful ping, all OK!
				if [ -n "$DEBUG" ]
				then
					logger "$0: OK: all ping nodes can be reached."
				fi
				sleep "$SLEEPONRETURN"
				return 0
			fi
			FAULTYPINGS=$((FAULTYPINGS + 1))

			# ping may fail at once (e.g. network unreachable or
			# name not resolvable) instead of waiting for its
			# timeout. Pad such attempts to PINGTIMEOUT seconds,
			# otherwise all retries are used up within moments
			# and the observation window above is not honoured.
			elapsed=$((SECONDS - attempt_start))
			if [ "$elapsed" -lt "$PINGTIMEOUT" ]
			then
				sleep $((PINGTIMEOUT - elapsed))
			fi
		done

		# find hostname of the other node and try to reach the other node
		# if reachable everything is ok and only the ping node is down
		for machine2 in $NODELIST
		do
			# this is our machine, no test necessary ;)
			if [ "$THISHOST" = "$machine2" ]
			then
				continue
			fi
			if ping -c 1 -W "$PINGTIMEOUT" "$machine2" >/dev/null 2>&1
			then
				# Got a successful ping, all OK!
				if [ -n "$DEBUG" ]
				then
					logger "$0: OK: other node can be reached."
				fi
				sleep "$SLEEPONRETURN"
				return 0
			fi
		done

		# neither the ping node nor another cluster node answered
		logger "$0: Warning: cannot reach ping_node $machine!!!"
		logger "$0: emergency shutdown this machine"
		sleep 5
		echo "Emergency reboot triggered by $0" | wall
		# safety net: force the reboot ('b') after 10 seconds even if
		# one of the steps below blocks
		{ sleep 10; echo "b" > /proc/sysrq-trigger; } &
		# magic SysRq: 's' syncs the filesystems, 'u' remounts them
		# read-only, 'b' reboots immediately
		echo "s" > /proc/sysrq-trigger
		sleep 8
		echo "u" > /proc/sysrq-trigger
		echo "b" > /proc/sysrq-trigger
		exit 255
	done

	# Only reached when the list of ping nodes is empty. Sleep here as
	# well, otherwise the main loop would call this function again
	# immediately and keep the machine busy.
	logger "$0: no ping nodes found, nothing to check"
	sleep "$SLEEPONRETURN"
	return 0
}

#
# MainLoop
#
if [ -n "$DEBUG" ]
then
	logger "$0: Enter main loop"
fi

# Run the split-brain check forever; the loop has no exit condition of its
# own, the script only ends through an exit inside DoCheckSplitBrain.
while true
do
	DoCheckSplitBrain
done
