#!/bin/bash
#
# Warewulf Node Health Check Script
#
# Michael Jennings <mej@lbl.gov>
# 13 December 2010
#
# $Id: nhc 1103 2012-09-07 21:53:16Z mej $
#

# This is the driver program for the node health check script
# subsystem.  The include directory (/etc/nhc/scripts by default)
# contains a series of bash scripts which, when sourced, should define
# bash functions which will later be invoked to check node health.
#
# The configuration file (/etc/nhc/nhc.conf by default) is then read a
# line at a time.  Any lines beginning with a mask that matches the
# current hostname will invoke the specified check (usually one of the
# bash functions loaded above, but could also be an external command
# or script).  Failure of any check will result in the node being
# flagged as "unhealthy" and the termination of further checks.

### Library functions

# Report a failed health check and terminate the script.
#
# Arguments: $1     - exit status to terminate with
#            $2...  - failure message (joined into one string via "$*")
# Globals:   MARK_OFFLINE, OFFLINE_NODE, HOSTNAME, LOGFILE, NHC_DETACHED,
#            RESULTFILE (all read)
# Outputs:   the message goes to log() and syslog(); in attached mode an
#            "ERROR ..." line is written to stdout; in detached mode
#            "<status> <message>" is written to $RESULTFILE instead.
# Returns:   never; exits with status $1.
function die() {
    local RET=$1
    shift
    log "Health check failed:  $*"
    syslog "Health check failed:  $*"
    syslog_flush
    if [[ "$MARK_OFFLINE" = "1" ]]; then
        # $LOGFILE is deliberately left unquoted and passed through eval so
        # that redirection text stored in it is re-parsed as a redirection.
        # The helper is started in the background and is not waited for.
        eval '$OFFLINE_NODE "$HOSTNAME" "$*" </dev/null >/dev/null' $LOGFILE '2>&1 &'
    fi
    if [[ -n "$NHC_DETACHED" ]]; then
        # Quoted so that a result path containing whitespace does not turn
        # into an "ambiguous redirect" and lose the failure record.
        echo "$RET $*" > "$RESULTFILE"
    else
        echo "ERROR Health check failed:  $*"
    fi
    kill_watchdog
    exit $RET
}

# Emit a "DEBUG:  ..." line unless debugging is switched off.
#
# Arguments: $@ - message words (joined via "$*")
# Globals:   DEBUG (read)   - output is suppressed only when this is "0"
#            LOGFILE (read) - expanded unquoted and re-parsed by eval, so
#                             redirection text stored in it takes effect
function dbg() {
    [[ "$DEBUG" == "0" ]] && return 0
    eval echo '"DEBUG:  $*"' $LOGFILE
}

# Emit a log line when the script is not in silent mode.
#
# Arguments: $@ - message words, handed to echo as separate arguments
# Globals:   SILENT (read)  - output happens only when this is exactly "0"
#            LOGFILE (read) - expanded unquoted and re-parsed by eval, so
#                             redirection text stored in it takes effect
function log() {
    [[ "$SILENT" = "0" ]] || return 0
    eval echo '"$@"' $LOGFILE
}

# Queue a message for syslog; nothing is sent until syslog_flush runs.
#
# Arguments: $@ - message words (joined via "$*")
# Globals:   LOGGER_TEXT (read/written) - pending messages, one per line
function syslog() {
    local NEWLINE=$'\n'

    # Prefix the new message with the existing text plus a newline only when
    # something is already queued.
    LOGGER_TEXT="${LOGGER_TEXT:+${LOGGER_TEXT}${NEWLINE}}$*"
}

# Send every queued syslog message through logger(1), then empty the queue.
#
# Globals: LOGGER_TEXT (read, then cleared)
#          NAME (read) - used together with the shell PID as the logger tag
function syslog_flush() {
    # Skip the logger call entirely when nothing has been queued.
    [[ -z "$LOGGER_TEXT" ]] || echo "$LOGGER_TEXT" | logger -p daemon.err -t "$NAME[$$]"
    LOGGER_TEXT=""
}

# Stop the watchdog process, if one has been recorded.
#
# Globals: WATCHDOG_PID (read) - only a value greater than 0 is signalled
# Returns: always 0, whether or not anything was killed.
function kill_watchdog() {
    dbg "$FUNCNAME:  Watchdog PID is $WATCHDOG_PID."
    if [[ $WATCHDOG_PID -gt 0 ]]; then
        # Errors from kill (e.g. the process is already gone) are discarded.
        kill -9 $WATCHDOG_PID >/dev/null 2>&1
    fi
    return 0
}

#########################

# Establish the baseline environment before any site configuration is read.
#
# Globals written: PATH, SYSCONFIGDIR, LIBEXECDIR, RESULTFILE, HOSTNAME,
#                  HOSTNAME_S, RET, LOGGER_TEXT, NHC_PID, WATCHDOG_PID (all
#                  exported), NAME, and DETACHED_MODE (detached runs only).
# Globals unset:   CONFFILE INCDIR HELPERDIR ONLINE_NODE OFFLINE_NODE LOGFILE
#                  DEBUG SILENT TIMEOUT MAX_SYS_UID
function nhcmain_init_env() {
    ### Variable declarations

    # Fixed paths and bookkeeping state.
    PATH="/sbin:/usr/sbin:/bin:/usr/bin"
    SYSCONFIGDIR="/etc/sysconfig"
    LIBEXECDIR="/usr/libexec"
    RESULTFILE="/var/run/nhc.status"
    RET=0
    LOGGER_TEXT=""
    NHC_PID=$$
    WATCHDOG_PID=0

    # Prefer the kernel's idea of the hostname; otherwise keep whatever
    # $HOSTNAME already holds, falling back to "localhost" if it is empty.
    if [[ -r /proc/sys/kernel/hostname ]]; then
        read HOSTNAME < /proc/sys/kernel/hostname
    elif [[ -z "$HOSTNAME" ]]; then
        HOSTNAME="localhost"
    fi
    # Short hostname:  everything before the first dot.
    HOSTNAME_S=${HOSTNAME%%.*}

    export PATH SYSCONFIGDIR LIBEXECDIR RESULTFILE
    export HOSTNAME HOSTNAME_S
    export RET LOGGER_TEXT NHC_PID WATCHDOG_PID

    # Basename of the running script.  Users may override this in
    # /etc/sysconfig/nhc.
    NAME=${0##*/}

    # Don't allow previous environment to leak in.  Must be done from /etc/sysconfig/nhc only.
    unset CONFFILE INCDIR HELPERDIR ONLINE_NODE OFFLINE_NODE LOGFILE DEBUG SILENT TIMEOUT MAX_SYS_UID

    # Nothing more to do unless we are running detached.
    [[ -n "$NHC_DETACHED" ]] || return 0
    export NHC_DETACHED
    DETACHED_MODE=1
}

# Source the site-wide settings file $SYSCONFIGDIR/$NAME if it exists.
#
# Globals: SYSCONFIGDIR, NAME (read); anything the sourced file assigns.
# A missing file is not an error.
function nhcmain_load_sysconfig() {
    # Load settings from system-wide location.  NOTE:  To change value of $NAME
    # here, the driver script must be renamed to something other than "nhc."
    # The path is quoted so that the file that was tested is the file that
    # gets sourced, even when the path contains whitespace.
    if [[ -f "$SYSCONFIGDIR/$NAME" ]]; then
        . "$SYSCONFIGDIR/$NAME"
    fi
}

# Fill in a default for every setting that is still unset or empty after the
# sysconfig file has been read, then either hand off to detached mode or
# export the final settings.
#
# Globals read:    NAME, LIBEXECDIR, NHC_DETACHED
# Globals written: CONFDIR CONFFILE INCDIR HELPERDIR ONLINE_NODE OFFLINE_NODE
#                  LOGFILE DEBUG SILENT MARK_OFFLINE DETACHED_MODE TIMEOUT
#                  MAX_SYS_UID
function nhcmain_finalize_env() {
    # Later defaults are built from earlier ones, so the order matters.
    : "${CONFDIR:=/etc/$NAME}"
    : "${CONFFILE:=$CONFDIR/$NAME.conf}"
    : "${INCDIR:=$CONFDIR/scripts}"
    : "${HELPERDIR:=$LIBEXECDIR/$NAME}"
    : "${ONLINE_NODE:=$HELPERDIR/node-mark-online}"
    : "${OFFLINE_NODE:=$HELPERDIR/node-mark-offline}"
    # LOGFILE holds redirection text, not just a file name.
    : "${LOGFILE:=>>/var/log/$NAME.log}"
    : "${DEBUG:=0}"
    : "${SILENT:=0}"
    : "${MARK_OFFLINE:=1}"
    : "${DETACHED_MODE:=0}"
    : "${TIMEOUT:=10}"
    : "${MAX_SYS_UID:=99}"

    if [[ -n "$NHC_DETACHED" ]]; then
        dbg "This session is running detached from $NHC_DETACHED."
    elif [[ $DETACHED_MODE -eq 1 ]]; then
        # Detached mode was requested but this is not the detached copy:
        # pass control to nhcmain_detach and skip the export below.
        dbg "Activating detached mode."
        nhcmain_detach
        return
    fi
    export NAME CONFDIR CONFFILE INCDIR HELPERDIR ONLINE_NODE OFFLINE_NODE LOGFILE DEBUG SILENT TIMEOUT MAX_SYS_UID
}

# Check for the config file before doing any real work.
#
# Globals: CONFFILE (read)
# Returns: 0 when $CONFFILE is an existing regular file, 1 otherwise.
function nhcmain_check_conffile() {
    [[ -f "$CONFFILE" ]] && return 0

    # Missing config means no checks.  No checks means no failures.
    return 1
}

# Announce the start of the run and source every readable regular file
# found directly inside $INCDIR.
#
# Globals: INCDIR (read); SCRIPT (written - left global, as before, and set
#          to the path of the file currently being sourced)
function nhcmain_load_scripts() {
    log "Node Health Check starting."

    # Load all include scripts.
    dbg "Loading scripts from $INCDIR..."
    for SCRIPT in "$INCDIR"/* ; do
        # Skip anything that cannot be sourced:  subdirectories, unreadable
        # files, and the literal "$INCDIR/*" that the glob leaves behind
        # when the directory is empty or missing.
        [[ -f "$SCRIPT" && -r "$SCRIPT" ]] || continue
        dbg "Loading ${SCRIPT##*/}"
        . "$SCRIPT"
    done
}

# Start a background timer that signals the run once $TIMEOUT seconds pass.
#
# Globals: TIMEOUT (read)  - a watchdog is started only when this is > 0
#          NHC_PID (read)  - negated in the kill calls so that the signal
#                            addresses the process group with that ID
#          WATCHDOG_PID (written, exported) - PID of the background subshell
function nhcmain_set_watchdog() {
    if ! [[ $TIMEOUT -gt 0 ]]; then
        dbg "No watchdog, NHC PID is $NHC_PID"
        return $?
    fi

    # Send ALRM when the timeout expires, then follow up with TERM one
    # second later.
    (
        sleep $TIMEOUT
        kill -ALRM -$NHC_PID
        sleep 1
        kill -TERM -$NHC_PID
    ) &
    export WATCHDOG_PID=$!
    dbg "Watchdog PID is $WATCHDOG_PID, NHC PID is $NHC_PID"
}

# Replace the current process with a fresh copy of this script running in
# detached mode.  nhcmain_detach calls this in the background.
#
# Globals: RESULTFILE (read)  - any previous result file is removed first
#          NHC_DETACHED (written, exported) - set to this shell's PID
function nhcmain_spawn_detached() {
    rm -f "$RESULTFILE" >/dev/null 2>&1
    export NHC_DETACHED=$$
    # "$0" is quoted so that a script path containing whitespace is passed
    # to exec as a single word.  The new copy runs with all three standard
    # streams redirected to /dev/null.
    exec -a nhc-detached "$0" </dev/null >/dev/null 2>&1
}

# Foreground half of detached mode:  report whatever $RESULTFILE currently
# holds and start a new detached run in the background.
#
# Globals: RESULTFILE (read)      - "<status> <message>" if it is readable
#          MARK_OFFLINE (written) - forced to 0 in this copy
# Ends by calling die (non-zero recorded status) or nhcmain_finish.
function nhcmain_detach() {
    local RC=0 MSG=""

    # Pick up the recorded result, if there is one, before the background
    # copy is launched.
    [[ -r "$RESULTFILE" ]] && read RC MSG < "$RESULTFILE"

    # Launch detached process
    nhcmain_spawn_detached &

    # Only mark offline/online in detached copy.
    MARK_OFFLINE=0

    if [[ $RC == 0 ]]; then
        nhcmain_finish
    else
        die $RC "$MSG"
        return 1
    fi
}

# Run every configured check in order; the first failure is passed to die.
#
# Globals: CONFFILE (read)
#          CHECKS (written) - reset to an empty array, then presumably filled
#                             by nhc_load_conf, which is defined outside
#                             this file (TODO confirm)
#          CNUM, CHECK, RET (written) - index, text and exit status of the
#                                       check currently being run
function nhcmain_run_checks() {
    CHECKS=( )
    nhc_load_conf "$CONFFILE"
    for ((CNUM=0; CNUM<${#CHECKS[@]}; CNUM++)); do
        CHECK="${CHECKS[$CNUM]}"

        # Run the check.  The line is quoted so that eval parses it exactly
        # once:  unquoted, it would be word-split and glob-expanded before
        # eval saw it, collapsing whitespace inside quoted arguments.
        log "Running check:  \"$CHECK\""
        eval "$CHECK"
        RET=$?

        # Check for failure.
        if [[ $RET -ne 0 ]]; then
            log "Node Health Check failed.  Check $CHECK returned $RET"
            die $RET "Check $CHECK returned $RET"
        fi
    done
}

# Launch the "mark node online" helper in the background when online/offline
# marking is enabled.
#
# Globals: MARK_OFFLINE (read) - the helper runs only when this is exactly "1"
#          ONLINE_NODE, HOSTNAME (read) - helper command and its one argument
#          LOGFILE (read) - expanded unquoted and re-parsed by eval, so
#                           redirection text stored in it takes effect
function nhcmain_mark_online() {
    [[ "$MARK_OFFLINE" = "1" ]] || return 0

    # eval joins its arguments with single spaces, so the pieces below form
    # one command line; the helper is not waited for.
    eval '$ONLINE_NODE "$HOSTNAME"' '</dev/null' '>/dev/null' $LOGFILE '2>&1' '&'
}

# Successful end of the run:  send queued syslog text, log a summary line,
# stop the watchdog and exit with status 0.
#
# Returns: never; always exits 0.
function nhcmain_finish() {
    # Subshell suffix for the summary line, built from $BASH_SUBSHELL.
    local SUBSHELL_NOTE="${BASH_SUBSHELL:+, $BASH_SUBSHELL subshells}"

    syslog_flush
    log "Node Health Check completed successfully (${SECONDS}s${SUBSHELL_NOTE})."
    kill_watchdog
    exit 0
}

### Script guts begin here.
if [[ -n "$NHC_LOAD_ONLY" ]]; then
    # We're only supposed to define functions, not actually run anything.
    # "return" succeeds when this file is being sourced; when the file is
    # executed directly, "return" fails and "exit 0" ends the script instead.
    return 0 || exit 0
fi

# Signals 1, 2 and 15 (HUP, INT, TERM) end the run through die with status 128.
trap 'die 128 Terminated by signal.' 1 2 15
# Signal 14 (ALRM) is what the watchdog started by nhcmain_set_watchdog
# sends when $TIMEOUT expires; it ends the run through die with status 127.
trap 'die 127 Script timed out.' 14

# Main sequence.  The order matters:  defaults are set, then the sysconfig
# file may override them, then the remaining values are derived from the
# result.
nhcmain_init_env
nhcmain_load_sysconfig
nhcmain_finalize_env
# No config file means there is nothing to check, which counts as success.
nhcmain_check_conffile || exit 0
nhcmain_load_scripts
nhcmain_set_watchdog
# A failing check does not return here:  nhcmain_run_checks calls die, which exits.
nhcmain_run_checks
nhcmain_mark_online
nhcmain_finish
