#!/bin/bash

###########################################################################
# Wait for ping and ssh responses.  Needs ping and jq, plus tm-manifest when the node list is "all".

# Preflight: NETNAME must be set in the environment and jq must be runnable.
# Both complaints go to stderr so they are still seen when stdout is captured.
# ${NETNAME:-} keeps this check valid even if nounset is already active.
if [ -z "${NETNAME:-}" ]; then
    echo "export NETNAME=something" >&2
    exit 1
fi

if ! jq --version >/dev/null 2>&1; then
    echo 'First "apt-get install jq"' >&2
    exit 1
fi

set -u		# From here on an unset variable is an error

###########################################################################
# Logging

# Per-network log file; every message passed to log() is appended here.
LOG="/tmp/waitnodes.$NETNAME.log"

# log MESSAGE...
#   Print the arguments (joined by single spaces, with backslash escapes such
#   as \n interpreted by "echo -e") on stdout and append the same text to $LOG.
#   A caller may redirect stdout (e.g. ">&2"); the log file still gets its copy.
function log() {
    echo -e "$@"
    # Quoted so that a log path containing whitespace or glob characters stays
    # one redirection target instead of failing with "ambiguous redirect".
    echo -e "$@" >> "$LOG"
}

###########################################################################
# Turn "all" into a list of integers

# expand_nodelist all | NODE...
#   Print the node numbers to wait for on stdout, separated by single spaces.
#   - Explicit arguments are passed through unchanged (no validation).
#   - "all" asks tm-manifest for every node and keeps those whose status is
#     "ready"; progress is reported on stderr.
#   With no arguments, prints a complaint on stderr, prints nothing on stdout
#   and returns 1.
#   Also assigns the result to NODES (only visible to the caller when the
#   function is not run inside a command substitution).
function expand_nodelist() {
    local coords coord info status node_id

    # Without this guard an empty argument list made the unquoted test below
    # a syntax error, which silently selected the "all" branch.
    if [ $# -eq 0 ]; then
        echo 'expand_nodelist: expected "all" or one or more node numbers' >&2
        return 1
    fi

    if [ "$1" != all ]; then
        NODES="$*"	# These aren't qualified in any way
    else # Use manifesting service to walk through all nodes ready to run.
        coords=$(tm-manifest listnodes | jq -r '."200".nodes[]')

        echo -n 'Building list of "ready" nodes:' >&2
        NODES=
        for coord in $coords; do
            info=$(tm-manifest getnode "$coord")
            status=$(printf '%s\n' "$info" | jq -r '."200".status')
            [ "$status" != ready ] && continue
            node_id=$(printf '%s\n' "$info" | jq -r '."200".node_id')
            echo -n " $node_id" >&2
            NODES="$NODES $node_id"
        done
        echo >&2
    fi
    # Deliberately unquoted: word splitting drops the leading blank that the
    # accumulation above leaves behind and collapses runs of whitespace.
    echo $NODES
}

###########################################################################
# Count QEMUs

# all_running [wait]
#   Compare the number of qemu-system-aar* processes whose pgrep listing
#   matches $NETNAME (used as a grep pattern) against $N_WANTED.
#   Returns 0 as soon as the two counts are equal.  Otherwise:
#     - with any argument: log the shortfall, sleep 2 seconds and re-check;
#     - with no argument:  log the shortfall to stderr and exit the script
#       with status 1.
#   NOTE(review): the pgrep pattern is 15 characters long, presumably to fit
#   the kernel's truncated process name -- confirm.
#   NOTE(review): if more processes match than N_WANTED the difference is
#   negative and is handled exactly like a shortfall.
function all_running() {	# Optional $1 means wait, else die
    local n_running dead

    while true; do
        # -e keeps a NETNAME that starts with "-" from being read as an
        # option; quoting keeps it a single pattern.
        n_running=$(pgrep -a qemu-system-aar | grep -c -e "$NETNAME")
        dead=$((N_WANTED - n_running))
        [ "$dead" -eq 0 ] && return 0

        if [ $# -eq 0 ]; then
            # Exit regardless of whether the log write succeeded; chaining
            # the exit behind "log &&" could turn "die" into "wait forever".
            log "$dead qemu(s) are not running" >&2
            exit 1
        fi

        log "Waiting for $dead qemu(s) to start..."
        sleep 2
    done
}

###########################################################################
# Pretty-print the time elapsed.  Reset ELAPSED_START as needed.

# Epoch second from which elapsed() measures.  Assign a fresh "date +%s"
# value to restart the clock.
ELAPSED_START=$(date +%s)

# elapsed
#   Print the time since ELAPSED_START as H:MM:SS (hours padded with a blank
#   to two columns), without a trailing newline.
function elapsed() {
    local now total hours minutes seconds

    now=$(date +%s)
    total=$((now - ELAPSED_START))
    hours=$((total / 3600))
    # Minutes within the current hour.  Without the "% 60" a wait of
    # 1h 2m 5s was printed as " 1:62:05".
    minutes=$((total / 60 % 60))
    seconds=$((total % 60))
    printf "%2d:%02d:%02d" "$hours" "$minutes" "$seconds"
}

###########################################################################
# Common function to eventually give up.  Count and sleeps combine for
# a minute per unique WAITLIST.

# State carried between waitlimit() calls: how many consecutive times the
# same non-empty wait list has been seen, and what that list was.
LAST_COUNT=0
LAST_WAITLIST=

# waitlimit TOPIC WAITLIST
#   Log progress for TOPIC, then decide whether to keep waiting:
#     - WAITLIST empty: reset the state and return at once (no sleep);
#     - WAITLIST differs from the previous call: reset the counter, remember
#       the list, sleep 10 seconds;
#     - WAITLIST unchanged: bump the counter and sleep 10 seconds, up to five
#       repeats; on the sixth repeat log a "STUCK" report and exit 1.
function waitlimit() {	# $1: topic; $2: current wait list
    local topic=$1 pending=$2

    log "$(elapsed) elapsed for $topic, count = $LAST_COUNT; waiting on $pending"

    if [[ -n $pending && $pending == "$LAST_WAITLIST" ]]; then
        # Same stragglers as last time round.
        LAST_COUNT=$((LAST_COUNT + 1))
        if (( LAST_COUNT <= 5 )); then
            sleep 10 && return
        fi
        log "\n$topic STUCK ON $pending\n\n$(elapsed) total ($topic)"
        exit 1
    fi

    # Empty list or a different set of stragglers: start counting afresh.
    LAST_WAITLIST="$pending"
    LAST_COUNT=0
    [[ -n $pending ]] && sleep 10
    return
}

###########################################################################
# Main

log "\n-----------------------\nSTARTING NEW RUN AT $(date)"

# Turn the command line ("all" or explicit node numbers) into node numbers.
# Stop here if that fails instead of carrying on with an empty list.
# "$@" is quoted so the arguments are passed on exactly as given.
if ! NODES=$(expand_nodelist "$@"); then
    log "Could not build the node list from: $*" >&2
    exit 1
fi
HOSTNAMES=$(for N in $NODES; do printf "node%02d " "$N"; done)

N_WANTED=$(echo $HOSTNAMES | wc -w)	# Needed by...
all_running wait			# settling time after startnodes

# Phase 1: repeat until every host answers a single ping.  waitlimit() ends
# the script if the same hosts stay silent for too long, and all_running
# (die mode) ends it if a qemu has gone away.
WAITLIST=$HOSTNAMES
LAST_WAITLIST=				# Used by waitlimit()
while [ "$WAITLIST" ]; do
    all_running
    WAITLIST=
    echo -en "\nPing"
    for H in $HOSTNAMES; do		# Yes do them all each time
        echo -n " $H"
        if ! ping -c1 "$H" >/dev/null 2>&1; then
            WAITLIST="$WAITLIST $H"
            echo -n " (no ping)"
        fi
        echo -n ";"
    done
    echo; echo
    waitlimit "Ping" "$WAITLIST"
done

# Phase 2: repeat until every host answers a ping AND accepts a TCP
# connection on port 22.
WAITLIST=$HOSTNAMES
LAST_WAITLIST=				# Used by waitlimit()
while [ "$WAITLIST" ]; do
    all_running
    WAITLIST=
    echo -en "\nRe-ping and test sshd to"
    for H in $HOSTNAMES; do	# yeah do them all each time
        # Either kernel panic (maybe a dropped TFTP packet?) or systemd
        # hang at "bring up network interface"
        echo -n " $H"
        if ! ping -c1 "$H" >/dev/null 2>&1; then
            echo -n " (no ping)"
            WAITLIST="$WAITLIST $H"
        # From the Google.  First time, the timeout is about 20 seconds.
        # Second time it's about 3.  The subshell keeps bash's own
        # "connection refused" complaint out of the output.
        elif ! (echo > "/dev/tcp/$H/22") >/dev/null 2>&1; then
            echo -n " (no ssh)"
            WAITLIST="$WAITLIST $H"
        fi
        echo -n ";"
    done
    echo; echo
    waitlimit "ping + ssh" "$WAITLIST"
done

exit 0
