## slurm.conf: main configuration file for SLURM
## $Id: slurm.conf,v 1.74 2015/03/04 14:49:38 bhm Exp $

###
### Cluster
###
ClusterName=abel
#default: AuthType=auth/munge
#default: CryptoType=crypto/munge
SlurmctldPort=6817
SlurmdPort=6818
TmpFs=/work ## FIXME: /scratch?
#default: TreeWidth=50
## FIXME: try ceil(sqrt(#nodes))
TreeWidth=26

## Timers:
#default: MessageTimeout=10
SlurmdTimeout=600 # Increased after a ~10-minute network outage (2015-01-23)
WaitTime=0
BatchStartTimeout=300 # Temporary, due to gold delays (2013-10-28)

###
### Slurmctld
###
ControlMachine=nielshenrik
#default: MinJobAge=300
SlurmUser=slurm
StateSaveLocation=/state/partition1/slurm/slurmstate

###
### Nodes
###
FastSchedule=2
HealthCheckInterval=300
HealthCheckProgram=/hpc/sbin/healthcheck
ReturnToService=1
GresTypes=gpu,phi,localtmp
## The default is 61.5 GB (62976 MB) of allocatable RAM.
## This is "free -m" / 1024 - 1.5
Nodename=DEFAULT CPUs=16 Sockets=2 CoresPerSocket=8 ThreadsPerCore=2 RealMemory=62976 Gres=localtmp:90 State=unknown
PartitionName=DEFAULT State=up Shared=NO
Include /etc/slurm/slurmnodes.conf
TopologyPlugin=topology/tree

###
### Jobs
###
PropagateResourceLimits=STACK
DefMemPerCPU=1000
EnforcePartLimits=yes
#default: InactiveLimit=0
JobFileAppend=1
#default: JobRequeue=1
JobSubmitPlugins=lua
#default: MaxJobCount=10000
#default: MpiDefault=none #FIXME: openmpi?
#default: OverTimeLimit=0
#default: VSizeFactor=0
TaskPlugin=task/cgroup
# Note that the max array size will also be limited by max jobs per user, currently 400:
MaxArraySize=1001

## Prologs/Epilogs
# run by slurmctld as SlurmUser on ControlMachine before granting a job allocation:
PrologSlurmctld=/hpc/sbin/prolog_slurmctld
# run by slurmd on each node prior to the first job step on the node:
Prolog=/hpc/sbin/prolog_slurmd
# run by srun on the node running srun, prior to the launch of a job step:
#SrunProlog=
# run as the user for each task, prior to initiating the task:
TaskProlog=/hpc/sbin/prolog_task
# run as the user for each task after the task finishes:
#TaskEpilog=
# run by srun on the node running srun, after a job step finishes:
#SrunEpilog=
# run as root on each node when the job has completed:
Epilog=/hpc/sbin/epilog_slurmd
# run as SlurmUser on ControlMachine after the allocation is released:
EpilogSlurmctld=/hpc/sbin/epilog_slurmctld

###
### Job Priority
###
PriorityType=priority/multifactor
#default: PriorityCalcPeriod=5
#default: PriorityDecayHalfLife=7-0 #(7 days)
#default: PriorityUsageResetPeriod=NONE
#To reset: PriorityUsageResetPeriod=NOW
#default: PriorityMaxAge=7-0 #(7 days)
#default: PriorityFavorSmall=no
PriorityFlags=SMALL_RELATIVE_TO_TIME
PriorityWeightAge=3000
PriorityWeightFairshare=5000
PriorityWeightJobSize=1000
#default: PriorityWeightPartition=0
PriorityWeightQOS=30000

###
### Scheduling
###
SchedulerType=sched/backfill
#default: SchedulerParameters=default_queue_depth=100,!defer,!bf_continue,bf_interval=30,bf_max_job_part=0,bf_max_job_start=0,bf_max_job_user=0,bf_resolution=60,bf_window=1440,max_job_bf=50,max_depend_depth=10,max_switch_wait=300
SchedulerParameters=default_queue_depth=50,partition_job_depth=60,defer,bf_continue,bf_max_job_user=10,bf_max_job_start=50,bf_resolution=7200,max_switch_wait=86400
SelectType=select/cons_res
SelectTypeParameters=CR_CPU_Memory # FIXME: perhaps CR_Core_Memory?
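# Note on the FIXME above: CR_CPU_Memory schedules at the granularity of the
# configured CPUs= count and treats memory as a consumable resource, while
# CR_Core_Memory would schedule whole cores based on the
# Sockets/CoresPerSocket/ThreadsPerCore layout. A minimal sketch of the
# core-based alternative (illustrative only, not enabled here):
#SelectTypeParameters=CR_Core_Memory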
PreemptMode=requeue
PreemptType=preempt/qos
CompleteWait=32 # KillWait + 2
#default: KillWait=30

###
### Checkpointing
###
# ************** WARNING ***********************
# ***  ENABLING/DISABLING THIS KILLS ALL JOBS ***
# **********************************************
## Checkpointing is currently not implemented on abel

###
### Logging
###
SlurmctldDebug=debug
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmSchedLogLevel=1
SlurmSchedLogFile=/var/log/slurm/sched.log
SlurmdDebug=debug2
SlurmdLogFile=/var/log/slurm/slurmd.log
DebugFlags=backfill

###
### Accounting (Slurmdbd)
###
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=nielshenrik
JobAcctGatherType=jobacct_gather/linux
#default: JobAcctGatherFrequency=30
ProctrackType=proctrack/cgroup
AccountingStorageEnforce=limits,qos # combination of: associations < limits < wckeys, plus qos
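# TaskPlugin=task/cgroup and ProctrackType=proctrack/cgroup read their settings
# from a separate cgroup.conf in the same directory. That file is not part of
# slurm.conf; the lines below are only an illustrative sketch of typical
# contents, not the actual cgroup configuration used on abel:
#   CgroupAutomount=yes
#   ConstrainCores=yes
#   ConstrainRAMSpace=yes
#   ConstrainDevices=yes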