#!/bin/bash
# hlbx - backup and archive with a copy of the original tree and
#	 hardlinks to save space for the archive of backups. Makes use of
#	 rsync.
#
#
# author: Roman Drahtmueller <draht@suse.de> 20051010
#
#
#

#  /---------\  run_rsync   /-------------\
#  | $SOURCE |      ->      | $DEST/$BASE |
#  \---------/              \-------------/
#      .                                      /---------------------------\
#       .......->          hardlink copy \->  | $DEST/$BASE-YYYYMMDD-HHMM |
#       (no rsync)                            \---------------------------/
#
#                    removal of NONDAILY and move to DAILY |
#                                                          V
#                                             /----------------------\
#                                             | $DEST/$BASE-YYYYMMDD |
#                                             \----------------------/
#
# the removal of old DAILY and NONDAILY directories happens _before_
# a cp or rsync takes place to ensure that there is enough space.


# hlb - hard link backup
# create an archive of a directory like snapshots in userspace
# by using hardlinks to save diskspace.
# relies on nobody editing files that exist already. By consequence,
# all files that need an update must be transferred first, the target
# file must be unlinked, and at last the transferred new file must be
# rename(2)d in place.
# rsync does this correctly.


# configurables:

# USE_RSYNC	- If set to "yes", rsync first mirrors $SOURCE into
#		  $DEST/`basename $SOURCE`, and the hardlink copies are made
#		  from that mirror.
#		  If not set to "yes", no rsync is run and the hardlink copies
#		  are made directly from $SOURCE. As a consequence, the admin
#		  must make sure that no file in $SOURCE is ever changed in
#		  place: a changed file must be written as a new file (a new
#		  inode) that replaces the old one. Otherwise the change
#		  shows up in all hardlinks to that file, which defeats the
#		  archive built from the hardlink copy trees.
#		  Without rsync, $SOURCE and $DEST must reside on the same
#		  filesystem. This is implicit if $SOURCE and
#		  $DEST/`basename $SOURCE` are the same.
USE_RSYNC=no


# SOURCE 	- the tree that is the source for the backup. Must be an
#		  existing, readable directory, given as an absolute path
#		  without trailing slash and with shell-sane characters only.
SOURCE=/your/directory/to_backup_and_archive

# DEST		- the destination for the backup. The target filesystem
#		  should be big enough to hold $SOURCE. Must be a writeable
#		  directory, given as an absolute path without trailing slash
#		  and with shell-sane characters only. It will contain
#		  `basename $SOURCE` and its hardlink copies.
DEST=/your_destination/directory


# HIGHWATER	- Megabytes of free space wanted on the $DEST filesystem.
#		  While less than $HIGHWATER MB are free, $0 removes the
#		  oldest DAILY copies (see MAXNONDAILY), but keeps at least
#		  $MINDAILY of them. Should be more than what you expect for
#		  the changes between two runs of $0.
HIGHWATER=20000

# MAXNONDAILY	- $0 creates copies of $SOURCE named
#		  $DEST/`basename $SOURCE`-YYYYMMDD-HHMM (NONDAILY) and turns
#		  old ones into $DEST/`basename $SOURCE`-YYYYMMDD (DAILY).
#		  Only $MAXNONDAILY copies of the NONDAILY form are kept.
#		  If more are present, the oldest is renamed to its DAILY
#		  name (unless that already exists) and the others of that
#		  day are removed.
#		  Example: If set to 18 and $0 runs three times a day, then
#		  you will have about 6 days with three copies of the tree
#		  per day; older days are reduced to one DAILY copy each.
MAXNONDAILY=18

# MINDAILY	- If $MINDAILY or fewer DAILY copies are present, then no
#		  old trees will be removed, even if less than $HIGHWATER MB
#		  are free. This is a safety net.
MINDAILY=2


### end of configurables.
##############################################################################

# Use a fixed search path so that the external tools used below (rsync, df,
# fuser, ...) are looked up in the same places no matter how $0 is started.
PATH=/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/sbin:/usr/local/bin

# Uncomment for a dry run: my_do() then only prints the commands it is given.
#export DEBUG=1

# LOCK		- name of the lock file; it is opened after the cd to $DEST.
# DATESTRING	- YYYYMMDD-HHMM suffix of the copy created by this run.
# The expansions are quoted so that a script path or a $SOURCE containing
# blanks is handed to basename as one single operand.
LOCK=".$(basename "$0")-lock"
DATESTRING=$(date +"%Y%m%d-%H%M")

# BASE		- last path element of $SOURCE; name of the mirror below $DEST
#		  and prefix of all copies.
BASE=$(basename "$SOURCE")

export BASE SOURCE DEST HIGHWATER MAXNONDAILY MINDAILY DATESTRING LOCK
export USE_RSYNC

# my_do - run a command, or only print it when in dry-run mode.
# Globals:   DEBUG (read) - if non-empty, the command is echoed instead of
#            being executed.
# Arguments: the command followed by its arguments.
# Returns:   the exit status of the command (of echo in dry-run mode).
function my_do() {
if [ -z "$DEBUG" ]; then
  # "$@" hands every argument to the command exactly as the caller passed
  # it. An unquoted $* would split arguments at blanks and expand glob
  # characters a second time - dangerous for the rm -rf and mv callers.
  "$@"
else
  echo "$*"
fi

}

# diskfree - print the free space, in megabytes, of the filesystem that
#            holds the current directory.
# Outputs:   the "Available" column of df as one integer on stdout.
function diskfree() {
  # -P selects the POSIX output format: exactly one header line and one
  # line per filesystem, never wrapped. The value can therefore be taken
  # from a fixed position, whatever the device name looks like (tmpfs,
  # pool/dataset, host:/export, /dev/...).
  df -Pm . | awk 'NR == 2 { print $4 }'
}

# toofull - tell whether the filesystem of the current directory has less
#           than $HIGHWATER megabytes free.
# Globals:   HIGHWATER (read)
# Outputs:   "1" on stdout when too full, "0" otherwise.
# Returns:   the same number as exit status. Note that this is the reverse
#            of the usual shell convention: 1 = too full, 0 = enough space.
function toofull() {
  local free_mb verdict=0

  free_mb=$(diskfree)
  if [ "$free_mb" -lt $HIGHWATER ]; then
    verdict=1
  fi

  echo "$verdict"
  return $verdict
}

# run_rsync - mirror $SOURCE into $BASE with rsync.
# It is assumed that the current directory is $DEST, so that the relative
# target $BASE is $DEST/$BASE.
# Globals:   SOURCE, BASE (read)
# Returns:   the exit status of my_do, i.e. of rsync unless in dry-run mode.
function run_rsync() {

	# The trailing slash makes rsync copy the contents of $SOURCE into
	# $BASE rather than creating $BASE/<name of SOURCE>. Both paths are
	# quoted so that each reaches rsync as exactly one argument.
	my_do rsync -aHS --delete "$SOURCE/" "$BASE"

}

# Preflight checks. Each failure has its own exit status (1-5).

# Both directories must be configured at all.
if [ -z "$SOURCE" ]; then
	echo "SOURCE is empty. Fatal, please fix."
	exit 1
fi

if [ -z "$DEST" ]; then
	echo "DEST is empty. Fatal, please fix."
	exit 2
fi

# SOURCE must be a directory we can enter.
cd "$SOURCE" || {
	echo "Cannot access SOURCE: $SOURCE. Fatal, bailing out."
	exit 3
}

# The script stays in DEST from here on; the relative names used later
# ($BASE, $BASE-..., $LOCK) refer to entries of this directory.
cd "$DEST" || {
	echo "Cannot access DEST: $DEST. Fatal, bailing out."
	exit 4
}

# Probe write access with a throw-away file. The name is built once and
# quoted, so a script path containing blanks cannot turn it into several
# touch/rm operands.
writetest=".$(basename "$0")-writetest-$$"
touch "$writetest" || {
	echo "Cannot write to DEST: $DEST. Fatal, bailing out."
	exit 5
}
rm -f "$writetest"


# Sanity check: $SOURCE and $DEST/$BASE can be the very same directory. This
# is allowed, but we do not rsync in that case.
# Both sides are compared as physical paths (pwd -P), so symlinks do not hide
# the identity. $SOURCE is quoted to survive blanks, and the two conditions
# are joined with && instead of the obsolescent -a operator of test.
if [ "$USE_RSYNC" = "yes" ] \
   && [ "$(cd "$SOURCE" && pwd -P)" = "$(pwd -P)/$BASE" ]; then
	echo "SOURCE: $SOURCE and DEST/<last_element_of_SOURCE>: $DEST/$BASE are the same"
	echo "directory. Not using rsync. Consider setting USE_RSYNC=no"
	USE_RSYNC=no
fi




# A copy carrying this run's timestamp must not exist yet; stop with exit
# status 6 rather than copying into an existing tree.
[ ! -d "$BASE-$DATESTRING" ] || {
  echo "$BASE-$DATESTRING already exists. Fatal error"
  exit 6
}




# Lock handling: another run holds the lock if some process has $LOCK open.
# fuser reports this through its exit status (zero only if at least one
# process accesses the file), which is more reliable than counting the lines
# it prints, because that depends on what fuser sends to stdout and what to
# stderr.
if fuser "$LOCK" >/dev/null 2>&1; then
    # lock file is busy. hmmm.
    if [ -t 0 ]; then
        # Interactive run: let the user decide. Only an explicit "y" kills
        # the other process, which is what the prompt now says.
        echo "the LOCK $LOCK is busy."
        echo -n "Shall I kill the process? (y/[n])?"
        read -r ans < /dev/tty
        case "$ans" in
            y*|Y*)
                fuser -v -k -TERM "$LOCK"
            ;;
            *)
		echo "not killed. Aborting."
		exit 7
            ;;
        esac
    else
        # Non-interactive run: find prints the lock file only if it was
        # modified within the last 60 minutes.
        if [ -z "$(find "$LOCK" -mmin -60)" ]; then
            # older than 1h
            echo "the LOCK $LOCK has been busy for more than 1 hour."
            echo "Killing the process..."
            fuser -v -k -TERM "$LOCK"
	    # sometimes, there are BIG rsync processes that need time to die.
            sleep 4	
            fuser -v -k -KILL "$LOCK"
        else
            # A younger lock belongs to a run that is still working in
            # $DEST. Going on would run both at the same time on the same
            # trees, so leave the field to the other process.
            echo "the LOCK $LOCK is busy. Aborting."
            exit 7
        fi
    fi
fi

### work begins here. ########################################################
# Hold the lock by keeping descriptor 42 open on it until the end of the run.
exec 42> "$LOCK"


# rotate_nondaily - consolidate surplus NONDAILY copies
#                   (dirs like $BASE-20030815-0815).
# As long as more than $MAXNONDAILY of them exist, the oldest one is renamed
# to its DAILY name ($BASE-YYYYMMDD) unless that exists already, and all
# remaining NONDAILY copies of that day are removed.
# Works on the current directory.
# Globals:   BASE, MAXNONDAILY, DEBUG (read)
function rotate_nondaily() {
    local -a nondaily
    local oldest oldestdaysname

    while :; do
        # Let the shell expand the pattern instead of parsing ls output.
        # The result is sorted, so element 0 is the oldest copy.
        nondaily=( "$BASE"-????????-???? )
        # A pattern without match is left as the literal string.
        [ -e "${nondaily[0]}" ] || break
        [ "${#nondaily[@]}" -gt "$MAXNONDAILY" ] || break

        # grab the oldest and rename it to a DAILY directory. Only the
        # trailing "-HHMM" is cut off, so the name stays correct when $BASE
        # itself contains hyphens.
        oldest=${nondaily[0]}
        oldestdaysname=${oldest%-????}

        if [ ! -d "$oldestdaysname" ]; then
            # Without a DAILY copy the removal below would destroy the
            # whole day, so stop if the rename fails.
            my_do mv "$oldest" "$oldestdaysname" || break
        fi

        # then, remove all others from that day.
        my_do rm -rf "$oldestdaysname"-????

        # In dry-run mode nothing has changed on disk; another pass would
        # print the same commands forever.
        [ -z "$DEBUG" ] || break
    done
}

rotate_nondaily


# purge_old_daily - remove the oldest DAILY copies ($BASE-YYYYMMDD) while the
#                   filesystem is too full, but only as long as more than
#                   $MINDAILY of them exist.
# Works on the current directory.
# Globals:   BASE, MINDAILY, DEBUG (read)
function purge_old_daily() {
    local -a daily

    while [ "$(toofull)" = 1 ]; do
        # The shell expands the pattern in sorted order, so element 0 is
        # the oldest copy; no ls output has to be parsed or word-split.
        daily=( "$BASE"-???????? )
        # A pattern without match is left as the literal string.
        [ -e "${daily[0]}" ] || break
        [ "${#daily[@]}" -gt "$MINDAILY" ] || break

        # remove the oldest of them. If that fails, trying again cannot
        # help; the check after this function reports the full disk.
        my_do rm -rf "${daily[0]}" || break

        # In dry-run mode nothing has changed on disk; another pass would
        # print the same command forever.
        [ -z "$DEBUG" ] || break
    done
}

purge_old_daily

# for the case that we have removed until only $MINDAILY are left, and 
# we're still full:
if [ "$(toofull)" = 1 ]; then
  echo "Tried to remove trees, but DEST $DEST"
  echo "filesystem is still too full. Please correct this! Fatal."
  rm -f "$LOCK"
  exit 8
fi



# make_snapshot - create the hardlink copy $BASE-$DATESTRING in the current
#                 directory.
# With USE_RSYNC=yes the mirror $BASE is synced from $SOURCE first and then
# copied; otherwise $SOURCE itself is copied, which requires $SOURCE and
# $DEST to be on the same filesystem.
# Globals:   USE_RSYNC, SOURCE, DEST, BASE, DATESTRING (read)
#            retval, FS_SOURCE, FS_DEST (written)
# Result:    retval is 0 on success, the status of the failed cp or rsync,
#            100/101 if no more than 100MB are free (with/without rsync),
#            9 if hardlinks are impossible across filesystems.
function make_snapshot() {
    local rsync_status=0

    if [ "$USE_RSYNC" = "yes" ]; then
	run_rsync
	rsync_status=$?
	if [ "$(diskfree)" -gt 100 ]; then
	    my_do cp -arpl "$BASE" "$BASE-$DATESTRING"
	    retval=$?
	else
	    echo "not even 100MB available. Not copying."
	    retval=100
	fi
	# A copy of an incompletely synced mirror must not be reported as
	# a success.
	if [ "$retval" = 0 ] && [ "$rsync_status" != 0 ]; then
	    echo "rsync failed with status $rsync_status."
	    retval=$rsync_status
	fi
    else
	if [ "$(diskfree)" -gt 100 ]; then
		# need to find out if $SOURCE and $DEST are on the same fs.
		# use df... -P gives one unwrapped line per filesystem, so the
		# device is field 1 of line 2 whatever it is called (it need
		# not start with a slash: tmpfs, pool/dataset, host:/export).
	    FS_SOURCE=$(df -P "$SOURCE" | awk 'NR == 2 { print $1 }')
	    FS_DEST=$(df -P "$DEST" | awk 'NR == 2 { print $1 }')
	    if [ "$FS_SOURCE" != "$FS_DEST" ]; then
		echo "SOURCE $SOURCE and DEST $DEST"
		echo "are not on the same filesystem. Cannot create hardlinks."
		echo "Fatal."
		retval=9
	    else
	        my_do cp -arpl "$SOURCE" "$BASE-$DATESTRING"
	        retval=$?
	    fi
	else
	    echo "not even 100MB available. Not copying."
	    retval=101
	fi
    fi
}

make_snapshot

# Release the lock: close descriptor 42, which was opened on the lock file,
# then remove the file. $LOCK is quoted so that a name containing blanks is
# removed as one file.
exec 42>&-
rm -f "$LOCK"
# Report a failure and hand the status of the copy step to the caller.
if [ "$retval" != 0 ]; then
	echo "return value: $retval"
fi
exit "$retval"

