#!/bin/bash
#
# Generic release agent for SLURM cgroup usage
#
# Manage cgroup hierarchy like :
#
# /cgroup/subsystem/uid_%/job_%/step_%/task_%
#
# Automatically sync uid_% cgroups to be coherent
# with the remaining job children when one of them is removed
# by a call to this release agent.
# The synchronisation is done under a flock on the root cgroup
# to ensure coherency of the cgroup contents.
#

# Name this script was invoked under (directory part stripped). Done with
# parameter expansion so that a path containing whitespace is handled
# correctly and no external process is needed.
progname=${0##*/}
# The cgroup subsystem to manage is whatever follows the last underscore
# of the script name (the whole name when it contains no underscore).
subsystem=${progname##*_}

#
# Print the mount point of the cgroup subsystem named by ${subsystem}.
#
# Globals:   subsystem (read)
# Outputs:   the mount directory on stdout, resolved in this order:
#              1. second column of `lssubsys -m <subsystem>` when lssubsys
#                 is found in PATH,
#              2. <CgroupMountpoint>/<subsystem> using the first
#                 CgroupMountpoint= line of /etc/slurm/cgroup.conf,
#              3. /cgroup/<subsystem> as a last resort.
#
get_mount_dir()
{
    # declared separately from the assignments so that the status of the
    # command substitutions is not masked by `local`
    local lssubsys conf_line mountpoint

    lssubsys=$(type -p lssubsys)
    if [[ -n ${lssubsys} ]]; then
        "${lssubsys}" -m "${subsystem}" | awk '{print $2}'
    elif conf_line=$(grep -si -m1 "^CgroupMountpoint=" /etc/slurm/cgroup.conf); then
        # keep what follows the last '=' of the matched line
        mountpoint=${conf_line##*=}
        # drop stray whitespace around the configured value
        mountpoint=${mountpoint#"${mountpoint%%[![:space:]]*}"}
        mountpoint=${mountpoint%"${mountpoint##*[![:space:]]}"}
        printf '%s\n' "${mountpoint}/${subsystem}"
    else
        printf '%s\n' "/cgroup/${subsystem}"
    fi
}

# Resolve the mount point of the managed subsystem once; every cgroup
# path handled below is built by prefixing it.
mountdir=$(get_mount_dir)

# At least the cgroup path must be given on the command line.
if [[ $# -eq 0 ]]
then
    # ${0##*/} instead of $(basename $0): safe with whitespace in the path
    echo "Usage: ${0##*/} [sync] cgroup"
    exit 1
fi

# Build the absolute path of the cgroup to remove. The cgroup is the only
# argument of a direct call, or the second one when the first argument is
# the "sync" keyword.
if [[ $# -eq 1 ]]
then
    rmcg="${mountdir}$1"
else
    rmcg="${mountdir}$2"
fi

# Everything before the last /uid_* component is taken as the slurm cgroup.
slurmcg="${rmcg%/uid_*}"
# The right-hand side is quoted so that it is compared as a literal string
# and not interpreted as a glob pattern.
if [[ "${slurmcg}" == "${rmcg}" ]]
then
    # not a slurm job pattern, perhaps the slurmcg, just remove
    # the dir with a lock and exit.
    # The command is handed to flock as separate arguments (no -c string)
    # so the path is never re-parsed by a shell.
    flock -x "${mountdir}" rmdir -- "${rmcg}"
    exit $?
fi

# cgroup that collects processes left behind in a uid cgroup
orphancg="${slurmcg}/orphan"

# Make sure the orphan cgroup exists; it is created on first use.
if [[ ! -d "${orphancg}" ]]
then
    mkdir "${orphancg}"
    # Only the cpuset subsystem needs extra setup: seed the new cgroup with
    # the cpus and memory nodes of the subsystem root (presumably required
    # before any task can be attached to it).
    if [[ "${subsystem}" == "cpuset" ]]
    then
        cat "${mountdir}/cpuset.cpus" > "${orphancg}/cpuset.cpus"
        cat "${mountdir}/cpuset.mems" > "${orphancg}/cpuset.mems"
    fi
fi

# try to extract the uid cgroup from the input one
# ( extract /uid_% from /uid_%/job_%... )
uidcg="${rmcg%/job_*}"

# kernel call: a single argument, the path of the released cgroup
if [[ $# -eq 1 ]]
then

    # quoted right-hand side: literal comparison, not a glob match
    if [[ "${uidcg}" == "${rmcg}" ]]
    then
        # not a slurm job pattern, perhaps the uidcg, just remove
        # the dir with a lock and exit.
        # The command is passed to flock as separate arguments so the
        # path is never re-parsed by a shell.
        flock -x "${mountdir}" rmdir -- "${rmcg}"
        exit $?
    fi

    # Re-run this script in "sync" mode while holding an exclusive lock on
    # the subsystem mount directory. When that directory does not exist
    # nothing is done and the exit status is 0.
    status=0
    if [[ -d "${mountdir}" ]]
    then
        flock -x "${mountdir}" "$0" sync "$@"
        status=$?
    fi

    exit ${status}

# sync subcall (called using flock by the kernel hook to be sure
# that no one is manipulating the hierarchy, i.e. PAM, SLURM, ...)
elif [[ $# -eq 2 ]] && [[ $1 == "sync" ]]
then

    # remove this cgroup
    if [[ -d "${rmcg}" ]]
    then
        if [[ "${subsystem}" == "memory" ]]
        then
            # help to correctly remove lazy cleaning memcg
            # but still not perfect
            sleep 1
        fi
        rmdir -- "${rmcg}"
    fi

    if [[ "${uidcg}" == "${rmcg}" ]]
    then
        ## not a slurm job pattern exit now do not sync
        exit 0
    fi

    # sync the user cgroup based on targeted subsystem
    # and the remaining job (only cpuset needs any work)
    if [[ -d "${uidcg}" ]] && [[ "${subsystem}" == "cpuset" ]]
    then
        # cpus still assigned to the remaining job cgroups, one entry per
        # job; empty when no job_* cgroup is left (cat errors are silenced)
        cpus=$(cat "${uidcg}"/job_*/cpuset.cpus 2>/dev/null)
        if [[ -n "${cpus}" ]]
        then
            # ${cpus} is deliberately left unquoted: word splitting turns
            # the newline separated entries into one space separated line,
            # which tr then converts to a comma separated list
            rawlist=$(echo ${cpus} | tr ' ' ',')
            merged=$(/usr/bin/scontrol show hostnames "${rawlist}")
            merged=$(echo ${merged} | tr ' ' ',')
            # If scontrol is unavailable or printed nothing, fall back to
            # the raw comma separated list instead of writing an empty
            # value to the uid cpuset while jobs are still present.
            if [[ -z "${merged}" ]]
            then
                merged=${rawlist}
            fi
            echo "${merged}" > "${uidcg}/cpuset.cpus"
        else
            # first move the remaining processes to
            # a cgroup reserved for orphaned processes
            # (ids are written one at a time)
            for task in $(cat "${uidcg}/tasks")
            do
                echo "${task}" > "${orphancg}/tasks"
            done
            # then remove the remaining cpus from the cgroup
            echo "" > "${uidcg}/cpuset.cpus"
        fi
    fi

# error: any other argument combination
else
    echo "Usage: ${0##*/} [sync] cgroup"
    exit 1
fi

exit 0
