#!/bin/bash
#
# Generic release agent for SLURM cgroup usage
#
# Manages a cgroup hierarchy laid out like:
#
# /cgroup/subsystem/uid_%/job_%/step_%/task_%
#
# Automatically syncs uid_% cgroups so that they stay coherent
# with their remaining job children when one of them is removed
# by a call to this release agent.
# The synchronisation is done under a flock on the root cgroup
# to ensure coherency of the cgroup contents.
#
# Usage: <progname> [sync] cgroup
#
#   <progname> cgroup       kernel call: takes an exclusive lock on the
#                           subsystem mount point and re-executes itself
#                           in "sync" mode
#   <progname> sync cgroup  removes the cgroup and, for cpuset, resyncs
#                           the owning uid_% cgroup
#
# The subsystem handled is the part of the script name that follows
# its last underscore.
#

progname=$(basename -- "$0")
subsystem=${progname##*_}

# Print the usage message on stderr and exit with status 1.
usage() {
    echo "Usage: ${progname} [sync] cgroup" >&2
    exit 1
}

# Print the mount point of ${subsystem}: the second field reported by
# "lssubsys -m" when lssubsys is available, /cgroup/<subsystem> otherwise.
get_mount_dir() {
    local lssubsys
    lssubsys=$(type -p lssubsys)
    if [[ -n ${lssubsys} ]]; then
        "${lssubsys}" -m "${subsystem}" | awk '{print $2}'
    else
        echo "/cgroup/${subsystem}"
    fi
}

# Print the whitespace-separated words of $1 joined with commas.
join_with_commas() {
    local -a words
    local IFS=$' \t\n'
    words=()
    # -d '' makes read consume the whole input, newlines included
    read -r -d '' -a words <<< "$1"
    IFS=,
    printf '%s\n' "${words[*]}"
}

# Remove the cgroup directory $1 while holding an exclusive lock on the
# mount point. The command is handed to flock as an argument vector so
# that the path is never re-parsed by a shell.
locked_rmdir() {
    flock -x "${mountdir}" rmdir -- "$1"
}

# Resync ${uidcg}/cpuset.cpus with the cpus of its remaining job_* children.
sync_uid_cpuset() {
    local cpus expanded task
    local -a tasks

    # the glob must stay outside the quotes to reach every remaining job
    cpus=$(cat "${uidcg}"/job_*/cpuset.cpus 2>/dev/null)
    if [[ -n ${cpus} ]]; then
        cpus=$(join_with_commas "${cpus}")
        # scontrol is used to expand the ranges of the merged list
        expanded=$(scontrol show hostnames "${cpus}")
        expanded=$(join_with_commas "${expanded}")
        # when scontrol is unavailable or fails, fall back to the
        # unexpanded list (as read from the jobs' cpuset.cpus files)
        # rather than writing an empty value
        echo "${expanded:-${cpus}}" > "${uidcg}/cpuset.cpus"
    else
        # first move the remaining processes to
        # a cgroup reserved for orphaned processes
        # (the task list is read entirely before anything is moved)
        tasks=()
        read -r -d '' -a tasks < "${uidcg}/tasks"
        for task in "${tasks[@]}"; do
            echo "${task}" > "${orphancg}/tasks"
        done
        # then remove the remaining cpus from the cgroup
        echo "" > "${uidcg}/cpuset.cpus"
    fi
}

# validate the command line before touching the hierarchy
case $# in
    1)
        mode=release
        cgpath=$1
        ;;
    2)
        [[ $1 == "sync" ]] || usage
        mode=sync
        cgpath=$2
        ;;
    *)
        usage
        ;;
esac

mountdir=$(get_mount_dir)
if [[ -z ${mountdir} ]]; then
    # with an empty mount point every path below would be resolved
    # from the root of the filesystem
    echo "${progname}: no mount point found for cgroup subsystem '${subsystem}'" >&2
    exit 1
fi

rmcg=${mountdir}${cgpath}

# build orphan cg path
slurmcg=${rmcg%/uid_*}
if [[ ${slurmcg} == "${rmcg}" ]]; then
    # not a slurm job pattern, perhaps the slurmcg, just remove
    # the dir with a lock and exit
    locked_rmdir "${rmcg}"
    exit $?
fi
orphancg=${slurmcg}/orphan

# make sure orphan cgroup is existing
if [[ ! -d ${orphancg} ]]; then
    mkdir "${orphancg}"
    case ${subsystem} in
        cpuset)
            cat "${mountdir}/cpuset.cpus" > "${orphancg}/cpuset.cpus"
            cat "${mountdir}/cpuset.mems" > "${orphancg}/cpuset.mems"
            ;;
        *)
            ;;
    esac
fi

# try to extract the uid cgroup from the input one
# ( extract /uid_% from /uid_%/job_*...)
uidcg=${rmcg%/job_*}

# kernel call
if [[ ${mode} == "release" ]]; then
    if [[ ${uidcg} == "${rmcg}" ]]; then
        # not a slurm job pattern, perhaps the uidcg, just remove
        # the dir with a lock and exit
        locked_rmdir "${rmcg}"
        exit $?
    fi

    if [[ -d ${mountdir} ]]; then
        flock -x "${mountdir}" "$0" sync "${cgpath}"
    fi
    exit $?
fi

# sync subcall (called using flock by the kernel hook to be sure
# that no one is manipulating the hierarchy, i.e. PAM, SLURM, ...)

# remove this cgroup
if [[ -d ${rmcg} ]]; then
    case ${subsystem} in
        memory)
            # help to correctly remove lazy cleaning memcg
            # but still not perfect
            sleep 1
            ;;
        *)
            ;;
    esac
    rmdir "${rmcg}"
fi

if [[ ${uidcg} == "${rmcg}" ]]; then
    ## not a slurm job pattern exit now do not sync
    exit 0
fi

# sync the user cgroup based on targeted subsystem
# and the remaining job
if [[ -d ${uidcg} ]]; then
    case ${subsystem} in
        cpuset)
            sync_uid_cpuset
            ;;
        *)
            ;;
    esac
fi

exit 0