Slurm Setup
OS is CentOS 7
Prerequisites
useradd slurm
yum group install "Development Tools" -y
yum install openssl-devel.x86_64 -y
yum install munge.x86_64 munge-devel.x86_64 munge-libs.x86_64 -y
yum install -y mariadb.x86_64 mariadb-server.x86_64 mariadb-libs.x86_64 mariadb-devel.x86_64
yum install openssl openssl-devel pam-devel numactl numactl-devel hwloc hwloc-devel lua lua-devel readline-devel rrdtool-devel ncurses-devel man2html libibmad libibumad -y
yum install rpm-build -y
yum install perl-Tk-devel.x86_64 -y
Munge and Mysql
dd if=/dev/urandom bs=1 count=1024 >/etc/munge/munge.key
chmod 400 /etc/munge/munge.key
chmod 711 /var/lib/munge/
chown munge /etc/munge/munge.key
systemctl start munge
systemctl status munge
systemctl enable munge
systemctl start mariadb
systemctl status mariadb
systemctl enable mariadb
mysql> create user 'slurm'@'localhost' identified by 'password';
Query OK, 0 rows affected (0.00 sec)
# The password created for the slurm user needs to match "StoragePass" in "/etc/slurm/slurmdbd.conf".
mysql> grant all on slurm_acct_db.* TO 'slurm'@'localhost';
Query OK, 0 rows affected (0.00 sec)
mysql> create database slurm_acct_db;
SLURM
rpmbuild --tb --with mysql slurm-17.02.7.tar.bz2
#NOTE: The tarball must keep the dotted name format slurm-17.02.7.tar.bz2. A renamed file such as slurm-17-02-7.tar.bz2 will cause rpmbuild to fail.
cd ~/rpmbuild/RPMS/
yum install * -y
cd /etc/slurm/
cp slurm.conf.example slurm.conf
mkdir -p /var/spool/slurm/d
chown slurm: /var/spool/slurm/d
mkdir /var/spool/slurm/ctld
chown slurm: /var/spool/slurm/ctld
systemctl start slurmd.service
systemctl status slurmd.service
systemctl enable slurmd.service
systemctl start slurmctld.service
systemctl status slurmctld.service
systemctl enable slurmctld.service
#Slurm DBD
#Uncomment AccountingStorageType in /etc/slurm/slurm.conf
mkdir -p /var/log/slurm/archive
chown slurm: /var/log/slurm/archive
systemctl start slurmdbd.service
systemctl status slurmdbd.service
systemctl enable slurmdbd.service
Populate the MySQL database from a previous dump
mysql -u slurm -p slurm_acct_db < slurm_acct_db.sql
List the processes running in the MySQL database, refreshing every second
mysqladmin -u root -i 1 processlist
slurm.conf
#
# Example slurm.conf file. Please run configurator.html
# (in doc/html) to build a configuration file customized
# for your environment.
#
#
# slurm.conf file generated by configurator.html.
#
# See the slurm.conf man page for more information.
#
ClusterName=linux
ControlMachine=comsol-centos
#ControlAddr=
#BackupController=
#BackupAddr=
#
SlurmUser=slurm
#SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=/var/spool/slurm/ctld
SlurmdSpoolDir=/var/spool/slurm/d
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdPidFile=/var/run/slurmd.pid
ProctrackType=proctrack/pgid
#PluginDir=
#FirstJobId=
ReturnToService=0
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
#TaskPlugin=
#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
#
# TIMERS
SlurmctldTimeout=300
SlurmdTimeout=300
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0
#
# SCHEDULING
SchedulerType=sched/backfill
#SchedulerAuth=
#SelectType=select/linear
FastSchedule=1
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=14-0
#PriorityUsageResetPeriod=14-0
#PriorityWeightFairshare=100000
#PriorityWeightAge=1000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=1000
#PriorityMaxAge=1-0
#
# LOGGING
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=3
SlurmdLogFile=/var/log/slurmd.log
JobCompType=jobcomp/none
#JobCompLoc=
#
# ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherFrequency=30
#
#AccountingStorageType=accounting_storage/slurmdbd
#AccountingStorageHost=
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
#
# COMPUTE NODES
NodeName=comsol-centos Procs=8 State=UNKNOWN
PartitionName=batch Nodes=ALL Default=YES MaxTime=INFINITE State=UP
Take the NodeName value from the output of the `hostname` command.