#!/bin/bash
#
# cs_show_error_patterns
#
# (c) 2011-2016 SUSE Linux GmbH, Germany. Author: L.Pinne.
# GNU General Public License v2. No warranty.
# http://www.gnu.org/licenses/gpl.html
#
# Version: 2016-08-03 SLES11
#

EXE="$0"

CFG="/etc/ClusterTools2/cs_show_error_patterns"
# Optional configuration file; it may preset any variable named in CFGVAR.
test -s "$CFG" && source "$CFG"

# Scratch files for the dmesg and journalctl dumps. They are created with
# mktemp so that the names are not predictable, and removed again on every
# exit path by the EXIT trap.
JCTL=""
DMSG=$(mktemp /tmp/dmesg.XXXXXX) || {
	echo "$EXE: cannot create temporary file" >&2
	exit 1
}
trap 'rm -f -- "${DMSG}" ${JCTL:+"${JCTL}"}' EXIT

# Names of the variables printed by --writecfg.
CFGVAR="
ERROR_PATTERN
CLUERR_PATTERN
APPERR_PATTERN
CLUSTER_LOG
SYSTEM_LOG
ZIPPED_LOG
"

# TODO SLES12 bond ...?
# With journalctl available the journal dump is searched as well and the
# boot.msg/boot.omsg files are left out of the default list. This default has
# to be chosen before the generic one below, otherwise it could never apply.
# The output of the lookup is discarded so it does not pollute stdout.
if command -v journalctl >/dev/null 2>&1; then
	JCTL=$(mktemp /tmp/journalctl.XXXXXX) || {
		echo "$EXE: cannot create temporary file" >&2
		exit 1
	}
	test -z "${SYSTEM_LOG}" &&\
		SYSTEM_LOG="/var/log/messages /var/spool/mail/root /proc/mdstat /proc/net/bonding/bond0"
fi

test -z "${SYSTEM_LOG}" &&\
	 SYSTEM_LOG="/var/log/messages /var/log/boot.msg /var/log/boot.omsg /var/spool/mail/root /proc/mdstat /proc/net/bonding/bond0"

SYSTEM_LOG="${SYSTEM_LOG} ${DMSG} ${JCTL}"

# Take the cluster log file from the corosync configuration unless the config
# file already set CLUSTER_LOG. Only un-commented "logfile: /path" lines are
# used; blanks (spaces and tabs) are stripped before the path is cut out.
if test -z "${CLUSTER_LOG}" && test -r /etc/corosync/corosync.conf; then
	CLUSTER_LOG=$(grep "^[[:space:]]*logfile:.*/" /etc/corosync/corosync.conf |\
		 tr -d ' \t' | cut -d":" -f2)
fi

test -z "${ZIPPED_LOG}" &&\
	ZIPPED_LOG="/var/log/messages*bz2"
# Rotated cluster logs are always added, but only if a cluster log is known;
# otherwise a bare "*bz2" would match arbitrary files in the working directory.
test -n "${CLUSTER_LOG}" &&\
	ZIPPED_LOG="${ZIPPED_LOG} ${CLUSTER_LOG}*bz2"

# TODO add more HBA errors: hpsa cciss
# Default system/kernel error patterns, one regular expression per line,
# used unless ERROR_PATTERN has been set before (e.g. by the config file).
# The list is single-quoted so that every pattern is stored literally; in
# particular the backslash of "U_\|_U" (degraded md array, [U_] or [_U])
# must survive, otherwise the alternation turns into a literal "|".
test -z "${ERROR_PATTERN}" &&\
	 ERROR_PATTERN='
taint.flag
no.device.*found
be2net.*Error
mlx4_en.*error
bfa.*err
lpfc.*err
qla.*err
qla.*failed
qlge.*error
Ramping.down.queue
hfcldd.*Err
unsupported.SFP.*module
NMI.received
NMI.*error
irq.*nobody.cared
Error.DRAM
mce:.CPU.supports.0.MCE.banks
SMP.disabled
kernel.*task.*abort
kernel.*ACPI.Error
kernel.*page.allocation.failure
double.fault:
target.failure
reservation.conflict
SCSI.*reset
LUN.larger
I/O.error
Broken.pipe
Dropped.frame
disabling.*barrier
doesn.*support.*DPO.*FUA
Unable.*SMART.enabled
BBU.disabled
multipathd.*timed.out
multipathd.*mpath.*unable
multipathd.*reinstated
multi.*path.*down
[Ff]ail.*/dev/md
U_\|_U
found.0.matching.devices
/dev/md.*failed
duplicate.VG
duplicate.PV
not.found
dlm.*link.*down
ocfs2.*has.evicted
ocfs2.*ERR
ocfs2.*not.unmounted.cleanly
OCFS2.*kernel.interface.loaded
o2dlm_eviction_cb
fsck.*recommended
EXT.-fs.error
Starting.XFS.recovery
xfs_log_force.*error
sdrServ:.*err
server.*not.responding
iSCSI.connection.*error
Sense.*error
bond.*link.*down
bond.*duplicate.address
bond.*not.detect.*failures
bond.*permanent.HWaddr.*in.use
link.*not.ready
link.failure
Link.Failure.Count
RPC.*multiple.fragments
martian.source.*dev
unexpectedly.shrunk.window
TCP:.Possible.SYN.flooding.on.port
net_ratelimit:
Host.*is.not.allowed.to.talk.to.us!
COMPLAIN.ACCEPT.*MAC
POSSIBLE.BREAK-IN.ATTEMPT
ntpd.*failed
ntpd.*time.reset
time.sync.status.changed
Frequency.format.error
failed.to.bind.to.LDAP
error:.PAM:.User.not.known
conversation.failed
refused.mount.request
nfs..server.*not.responding
logrotate.*error
Failed.to.read
exits.with.status.*[1-9]
Failed.services.*runlevel
shutdown.*shutting.down
syslog-ng.shutting.down
kernel.*segfault.at
Out.of.memory
invoked.oom-killer
Call.Trace
soft.lockup
kernel.*deprecated
annot.open.*file
nable.to.open.*file
nable.to.load
Potential.spurious.wakeup
annot.create..ocket
end.*failed
ead.*failed
GAB.*failure
Symantec.Technical.Support
error.*VRT
VCS.CRITICAL
vxvm.*volume.*not
Hangcheck.*clock
pagecache.limit.set
'
# TODO: separate variable for application patterns ("annot.open..rofile" ... "Hangcheck.*clock")
# TODO: raid.set.*active.with.2.out.of.2.mirrors ?
# TODO: check for un-matched user-friendly names (mpath[a-z])
# TODO: pattern for failed module (could not load)
# TODO: async IO from grep kio /proc/slabinfo ?
# TODO: direct IO from grep ...,DIR, /proc/slabinfo ?
# TODO: verify cluster log msgs. maybe more generic possible ?

# Default cluster stack error patterns (corosync/TOTEM, pacemaker, stonith,
# sbd), one regular expression per line, used unless CLUERR_PATTERN has been
# set before (e.g. by the config file). Single-quoted so that every pattern
# is stored literally.
test -z "${CLUERR_PATTERN}" &&\
	 CLUERR_PATTERN='
TOTEM.*FAILED.*RECEIVE
TOTEM.*processor.failed
TOTEM.*Incrementing.problem
TOTEM.*Retransmit.List
quorum.lost
fenced.because.*.un-expectedly.down
stonith.*Device.*not.found
Node.*unclean
pcmk.*Child.process.*exited
pcmk.*message.*local.cib.failed
pengine.*fenced.*resource.failure
pengine.*Forcing.*away.*failures
crm_resource.*Error
crmd.*ERROR:
crmd.*Updating.failcount.*stop
crmd.*Updating.failcount.*start
sbd.*mbox.read.failed
sbd.*Latency
SoftDog.*not
'

# Default application error patterns, one regular expression per line.
# Only applied when APPERR_PATTERN is still empty at this point.
if [ -z "${APPERR_PATTERN}" ]; then
	APPERR_PATTERN='
vasd.will.not.synchronize.time
vascache.*error
'
fi

# help
#
# Print the usage summary and the list of supported options to stdout.
# The program name is derived from $0; it is quoted so that a script path
# containing blanks is handled, and computed only once.
function help() {
	local prog
	prog=$(basename "$0")
	echo "usage:	${prog}"
	echo "	${prog} [OPTION]"
	echo
	echo " --help		show help."
	echo " --version	show version."
	echo " --writecfg	show some internal variables."
	echo " --zip		search compressed logs, too."
}


# writecfg
#
# Print the current value of every variable named in CFGVAR to stdout, in a
# form that can be saved as the config file $CFG:
#   NAME="
#   <one word of the value per line>
#   "
#   #
# The value is split into words with "read -a", which splits on whitespace
# but does not perform pathname expansion. Regular expressions and log file
# wildcards are therefore written out unchanged instead of being replaced by
# whatever file names happen to match them.
function writecfg(){
	local c
	local -a words
	echo -e "# $CFG \n# For SLES11.\n#"
	for c in $CFGVAR; do
		echo "${c}=\""
		words=()
		# read returns non-zero at end of input; the array is filled anyway.
		read -r -d '' -a words <<<"${!c}"
		# An empty value still yields one empty line.
		printf '%s\n' "${words[@]}"
		echo "\""
		echo "#"
	done
}


# run_grep
#
# For every pattern in ERROR_PATTERN, CLUERR_PATTERN and APPERR_PATTERN,
# count the matching lines (case-insensitive) in all readable files listed
# in LOG and print one "<pattern> = <count>" line to stdout.
#
# Globals read: LOG, DMSG, JCTL, ERROR_PATTERN, CLUERR_PATTERN, APPERR_PATTERN
# Side effects: dumps dmesg into $DMSG and, if JCTL is set, the journal into
#               $JCTL; both files are removed at the end. The list of logs
#               is reported on stderr.
function run_grep() {
	local e f
	local -a patterns

	echo "logs: $LOG" >&2

	dmesg >"${DMSG}"
	# JCTL is only non-empty when journalctl is available.
	if test -n "${JCTL}"; then
		journalctl --no-pager >"${JCTL}"
	fi

	# Split the pattern lists on whitespace WITHOUT pathname expansion, so a
	# pattern like "qla.*err" is never replaced by a matching file name.
	# read returns non-zero at end of input; the array is filled anyway.
	read -r -d '' -a patterns \
		<<<"${ERROR_PATTERN} ${CLUERR_PATTERN} ${APPERR_PATTERN}"

	# TODO: more efficient loop
	for e in "${patterns[@]}"; do
		printf '%s = ' "$e"
		# LOG is expanded unquoted on purpose: it is a blank separated
		# list that may contain wildcards.
		for f in ${LOG}; do
			test -r "$f" || continue
			# Compressed logs are unpacked per file, chosen by suffix,
			# since the files are concatenated into one stream.
			case "$f" in
				*.bz2)	bzip2 -cd -- "$f" ;;
				*.gz)	gzip -cd -- "$f" ;;
				*.xz)	xz -cd -- "$f" ;;
				*)	cat -- "$f" ;;
			esac
		done | grep -ic -e "$e"
	done

	rm -f -- "${DMSG}" ${JCTL:+"${JCTL}"}
}


# main()
# Dispatch on the first command line argument:
#   -v|--version   print the program name and the "# Version:" header line
#   -h|--help      print the usage
#   -w|--writecfg  print the configuration variables
#   -z|--zip       search the logs including the compressed ones
#   anything else  search the uncompressed logs only
# Every branch ends the script with the status of its last command.
case "$1" in
	-v|--version)
		echo -n "$(basename "$EXE") "
		head -11 "$EXE" | grep "^# Version: "
		exit
	;;
	-h|--help)
		help
		exit
	;;
	-w|--writecfg)
		writecfg
		exit
	;;
	*)
		LOG="${SYSTEM_LOG}"
		# Add each non-empty cluster log. Looping over the words avoids
		# the unquoted "test -s", which is true for an empty CLUSTER_LOG
		# and fails with an error if it holds more than one path.
		for c in ${CLUSTER_LOG}; do
			test -s "$c" && LOG="${LOG} ${c}"
		done
		case "$1" in
			-z|--zip)
				# ZIPPED_LOG is expanded unquoted on purpose so
				# that its wildcards are resolved to file names.
				for z in ${ZIPPED_LOG}; do
					test -s "$z" && LOG="${LOG} ${z}"
				done
			;;
		esac
		run_grep
		exit
	;;
esac
#
