################ # SLURM SERVER ################ # cat commands_slurm_1.txt rpm -ivh http://dl.fedoraproject.org/pub/epel/6/x86_64/epel-release-6-8.noarch.rpm wget http://gimo2.pd.infn.it/rpms/slurm-2.6.2/slurm-2.6.2_sl6_x86_64.repo -O /etc/yum.repos.d/slurm-2.6.2_sl6_x86_64.repo yum install -y slurm slurm-munge slurm-slurmdbd mysql-server /usr/sbin/create-munge-key service munge start && chkconfig munge on useradd -p "XXXXXXX" -c "SLURM user" slurm mkdir /var/lib/slurmctld && chown slurm.slurm /var/lib/slurmctld mkdir /var/lib/slurmd && chown slurm.slurm /var/lib/slurmd mkdir /var/run/slurm/ && chown slurm.slurm /var/run/slurm mkdir /var/log/slurm && chown slurm.slurm /var/log/slurm service mysqld start mysql -u root -e "create database slurm_acct_db;" mysql -u root -e "grant all on slurm_acct_db.* TO 'slurm'@'localhost' identified by 'XXXXXX' with grant option;" mysql -u root -e "grant all on slurm_acct_db.* TO 'slurm'@'`hostname -f`' identified by 'XXXXXX' with grant option;" echo "HostbasedAuthentication yes" >> /etc/ssh/sshd_config touch /etc/ssh/shosts.equiv && chmod 644 /etc/ssh/shosts.equiv for i in `cat nodes.txt`; do ssh-keyscan -t rsa $i >> /etc/ssh/ssh_known_hosts; done chmod 644 /etc/ssh/ssh_known_hosts service sshd restart cp /etc/slurm/slurm.conf.example /etc/slurm/slurm.conf cp /etc/slurm/slurmdbd.conf.example /etc/slurm/slurmdbd.conf 1. edit commands_slurm_1.txt for slurm user passwd 2. create nodes.txt with FQDN WNs 3. wget https://raw.github.com/italiangrid/test_tools/master/script-wrapper.py 3. script-wrapper.py commands_slurm_1.txt SLURM_INSTALL_1 4. 
edit /etc/slurm/slurm.conf & /etc/slurm/slurmdbd.conf # cat commands_slurm_2.txt chmod 400 /etc/slurm/slurmdbd.conf && chown slurm.slurm /etc/slurm/slurmdbd.conf sacctmgr add cluster cluster_on_`hostname -s` service slurm start && chkconfig slurm on service slurmdbd start && chkconfig slurmdbd on if [ -s /etc/exports ] ; then grep -v "/home" /etc/exports > /etc/exports.tmp ; else cp /etc/exports /etc/exports.tmp ; fi echo "/home cert-*.pn.pd.infn.it(rw,sync,no_root_squash)" >> /etc/exports.tmp mv -f /etc/exports.tmp /etc/exports exportfs -r /etc/init.d/nfs start 5. script-wrapper.py commands_slurm_2.txt SLURM_INSTALL_2 6. cp /usr/local/nfs/slurm/setup_slurm_acct.sh in 7. adapt VOLIST to the one used in #./setup_slurm_acct.sh 8. adapt siteinfo/site-info.def, siteinfo/rtc-wn-list.conf, etc # mkdir -p /etc/grid-security 9. install certificate from passtore # cat siteinfo/commands_cream.txt wget http://repository.egi.eu/sw/production/cas/1/current/repo-files/EGI-trustanchors.repo -P /etc/yum.repos.d/ rpm --import http://emisoft.web.cern.ch/emisoft/dist/EMI/3/RPM-GPG-KEY-emi wget http://emisoft.web.cern.ch/emisoft/dist/EMI/3/sl6/x86_64/base/emi-release-3.0.0-2.el6.noarch.rpm yum -y localinstall emi-release-3.0.0-2.el6.noarch.rpm yum clean all yum install -y ca-policy-egi-core yum -y install emi-cream-ce yum -y install emi-slurm-utils yum -y install ca-policy-egi-core yum -y install fetch-crl wget --no-check-certificate https://ci-01.cnaf.infn.it/igi-mw/repos/sl6/igi-3-testing-sl6.repo -P /etc/yum.repos.d/ yum clean all yum -y update /sbin/service fetch-crl-cron start /sbin/chkconfig fetch-crl-cron on /usr/sbin/fetch-crl -r 20 -a 24 /opt/glite/yaim/bin/yaim -c -s rtc-emi-site-info.def -n creamCE -n SLURM_utils 10. 
python script-wrapper.py commands_cream.txt CREAM_CONFIG ######################## # SLURM CLIENT /HOME on NFS ######################## [root@cert-wn64-05 ~]# wget https://raw.github.com/italiangrid/test_tools/master/script-wrapper.py # cat commands_slurm_client_1.txt rpm -ivh http://dl.fedoraproject.org/pub/epel/6/x86_64/epel-release-6-8.noarch.rpm wget http://gimo2.pd.infn.it/rpms/slurm-2.6.2/slurm-2.6.2_sl6_x86_64.repo -O /etc/yum.repos.d/slurm-2.6.2_sl6_x86_64.repo yum install -y slurm slurm-munge slurm-slurmdbd mkdir /var/lib/slurmd && chown slurm.slurm /var/lib/slurmd mkdir /var/run/slurm/ && chown slurm.slurm /var/run/slurm mkdir /var/log/slurm && chown slurm.slurm /var/log/slurm # python script-wrapper.py commands_slurm_client_1.txt SLURM_CLIENT_1 [root@cert-08 ~]# scp /etc/munge/munge.key cert-wn64-05.pn:/etc/munge/ root@cert-wn64-05.pn's password: munge.key [root@cert-08 ~]# scp /etc/slurm/slurm.conf cert-wn64-05.pn:/etc/slurm/slurm.conf root@cert-wn64-05.pn's password: slurm.conf # python script-wrapper.py commands_slurm_client_2.txt SLURM_CLIENT_2 [root@cert-08 ~]# scp -r siteinfo/ cert-wn64-06.pn:. 
# cat siteinfo/commands_cream_wn.txt wget http://repository.egi.eu/sw/production/cas/1/current/repo-files/EGI-trustanchors.repo -P /etc/yum.repos.d/ rpm --import http://emisoft.web.cern.ch/emisoft/dist/EMI/3/RPM-GPG-KEY-emi wget http://emisoft.web.cern.ch/emisoft/dist/EMI/3/sl6/x86_64/base/emi-release-3.0.0-2.el6.noarch.rpm yum -y localinstall emi-release-3.0.0-2.el6.noarch.rpm yum clean all yum install -y ca-policy-egi-core yum install -y emi-wn /opt/glite/yaim/bin/yaim -c -s rtc-emi-site-info.def -n WN # python script-wrapper.py commands_cream_wn.txt WN_INSTALL ################### 537 sinfo 538 scontrol show partition 539 showmount 540 export PS1="PROMPT: " 541 /usr//bin/srun -l -n 1 -N 1 printenv SLURM_NODELIST 542 /usr//bin/srun -l -n1 -N1 /usr//bin/scontrol show hostnames cert-wn64-[05-06] 543 /usr//bin/srun -l -n 1 -N 1 -w cert-wn64-05 printenv SLURMD_NODENAME 544 /usr//bin/srun -l printenv SLURMD_NODENAME 545 /usr//bin/srun -l -N1 -n1 --exclude=cert-wn64-05 printenv SLURMD_NODENAME 546 /usr//bin/srun -l -N1 -n1 --exclude=cert-wn64-06 printenv SLURMD_NODENAME 547 /usr//bin/srun -l -N1 -n1 --nodelist=cert-wn64-05 printenv SLURMD_NODENAME 548 /usr//bin/srun -l -N1 -n1 --nodelist=cert-wn64-06 printenv SLURMD_NODENAME 549 /usr//bin/srun -l -N1 -n1 --nodelist=cert-wn64-05 --exclude=cert-wn64-05 printenv SLURMD_NODENAME 550 /usr//bin/srun -l -N3 -n3 -O printenv SLURMD_NODENAME 551 /usr//bin/srun -l -N1-1 -n2 -O printenv SLURMD_NODENAME 552 /usr//bin/srun -l -c1 printenv SLURMD_NODENAME 553 /usr//bin/srun -l -n 5 printenv SLURMD_NODENAME 554 exit 555 /usr//bin/srun -l -c1 ./test1.92.bash | sort -n 556 /usr//bin/srun -l --mpi=lam printenv SLURMD_NODENAME 619 man srun 620 srun -n8 -l hostname 621 srun -n2 -l hostname 671 scp -r siteinfo/ gw-master:/opt/nfs_install/slurm/ 672 sacctmgr show associations 673 sacctmgr -Pn show associations 674 sacctmgr -Pn show associations 
format=Account,User,Partition,Fairshare,MaxJobs,MaxSubmitJobs,MaxWall,MaxCPUMins,MaxCPUs,ID,ParentID 675 scontrol show partitions 726 scontrol show partitions 727 less /etc/slurm/slurm.conf 728 sacctmgr all 729 sacctmgr -a all 730 sacctmgr -help 731 sacctmgr list 732 sacctmgr list User 733 sacctmgr list User -v 734 sacctmgr list --all 735 sacctmgr all User 736 sacctmgr all User -v 737 sacctmgr all User --verbose 761 scontrol show configuration 762 scontrol 763 sinfo --Node 804 squeue 805 squeue -a 806 squeue -h 807 squeue 764 glue-validator -H localhost -p 2170 -b o=glue -k -v 3 959 sinfo -V 960 scontrol show partition 961 scontrol show node cert-wn64-05 962 scontrol show job 963 tail -f /var/log/slurm/slurmctld.log 964 scontrol show job 350 965 tail -f /var/log/slurm/slurmctld.log 966 scontrol show step 967 scontrol show step 351 968 sjobexitmod -l 127 969 tail -f /var/log/slurm/slurmctld.log 970 scontrol show step 353 971 scontrol show job 353 972 scontrol show job 354 973 scontrol show job 355 974 tail -f /var/log/slurm/slurmctld.log 975 /etc/init.d/slurm restart 976 tail -f /var/log/slurm/slurmctld.log 977 cat /etc/resolv.conf 978 /etc/init.d/nfs restart 979 exportfs -r 980 showmount -a 1026 sed -i "s/ENABLED=.*/ENABLED="false"/" /etc/sysconfig/yum-autoupdate [root@cert-08 siteinfo]# sinfo PARTITION AVAIL TIMELIMIT NODES STATE NODELIST creamtes* up 1:00:00 2 idle cert-wn64-[05-06] [root@cert-08 siteinfo]# [root@cert-08 siteinfo]# [root@cert-08 siteinfo]# [root@cert-08 siteinfo]# srun -n2 -l hostname 1: cert-wn64-05.pn.pd.infn.it 0: cert-wn64-05.pn.pd.infn.it [root@cert-08 siteinfo]# sinfo PARTITION AVAIL TIMELIMIT NODES STATE NODELIST creamtes* up 1:00:00 2 idle cert-wn64-[05-06] [root@cert-08 siteinfo]# scontrol show job JobId=467 Name=hostname UserId=root(0) GroupId=root(0) Priority=4294901616 Account=root QOS=normal JobState=COMPLETED Reason=None Dependency=(null) Requeue=1 Restarts=0 BatchFlag=0 ExitCode=0:0 RunTime=00:00:00 TimeLimit=01:00:00 TimeMin=N/A 
SubmitTime=2013-11-26T09:55:18 EligibleTime=2013-11-26T09:55:18 StartTime=2013-11-26T09:55:18 EndTime=2013-11-26T09:55:18 PreemptTime=None SuspendTime=None SecsPreSuspend=0 Partition=creamtest AllocNode:Sid=cert-08:7559 ReqNodeList=(null) ExcNodeList=(null) NodeList=cert-wn64-05 BatchHost=cert-wn64-05 NumNodes=1 NumCPUs=2 CPUs/Task=1 ReqS:C:T=*:*:* MinCPUsNode=1 MinMemoryNode=0 MinTmpDiskNode=0 Features=(null) Gres=(null) Reservation=(null) Shared=OK Contiguous=0 Licenses=(null) Network=(null) Command=/bin/hostname WorkDir=/root/siteinfo [root@cert-08 siteinfo]# [root@cert-08 siteinfo]# [root@cert-08 siteinfo]# scontrol show job JobId=467 Name=hostname UserId=root(0) GroupId=root(0) Priority=4294901616 Account=root QOS=normal JobState=COMPLETED Reason=None Dependency=(null) Requeue=1 Restarts=0 BatchFlag=0 ExitCode=0:0 RunTime=00:00:00 TimeLimit=01:00:00 TimeMin=N/A SubmitTime=2013-11-26T09:55:18 EligibleTime=2013-11-26T09:55:18 StartTime=2013-11-26T09:55:18 EndTime=2013-11-26T09:55:18 PreemptTime=None SuspendTime=None SecsPreSuspend=0 Partition=creamtest AllocNode:Sid=cert-08:7559 ReqNodeList=(null) ExcNodeList=(null) NodeList=cert-wn64-05 BatchHost=cert-wn64-05 NumNodes=1 NumCPUs=2 CPUs/Task=1 ReqS:C:T=*:*:* MinCPUsNode=1 MinMemoryNode=0 MinTmpDiskNode=0 Features=(null) Gres=(null) Reservation=(null) Shared=OK Contiguous=0 Licenses=(null) Network=(null) Command=/bin/hostname WorkDir=/root/siteinfo [root@cert-08 siteinfo]# srun -n8 -l hostname srun: error: Unable to allocate resources: More processors requested than permitted [root@cert-08 siteinfo]# srun -n4 -l hostname 2: cert-wn64-06.pn.pd.infn.it 3: cert-wn64-06.pn.pd.infn.it 0: cert-wn64-05.pn.pd.infn.it 1: cert-wn64-05.pn.pd.infn.it [root@cert-08 siteinfo]# scontrol show job JobId=467 Name=hostname UserId=root(0) GroupId=root(0) Priority=4294901616 Account=root QOS=normal JobState=COMPLETED Reason=None Dependency=(null) Requeue=1 Restarts=0 BatchFlag=0 ExitCode=0:0 RunTime=00:00:00 TimeLimit=01:00:00 
TimeMin=N/A SubmitTime=2013-11-26T09:55:18 EligibleTime=2013-11-26T09:55:18 StartTime=2013-11-26T09:55:18 EndTime=2013-11-26T09:55:18 PreemptTime=None SuspendTime=None SecsPreSuspend=0 Partition=creamtest AllocNode:Sid=cert-08:7559 ReqNodeList=(null) ExcNodeList=(null) NodeList=cert-wn64-05 BatchHost=cert-wn64-05 NumNodes=1 NumCPUs=2 CPUs/Task=1 ReqS:C:T=*:*:* MinCPUsNode=1 MinMemoryNode=0 MinTmpDiskNode=0 Features=(null) Gres=(null) Reservation=(null) Shared=OK Contiguous=0 Licenses=(null) Network=(null) Command=/bin/hostname WorkDir=/root/siteinfo JobId=468 Name=hostname UserId=root(0) GroupId=root(0) Priority=4294901615 Account=root QOS=normal JobState=COMPLETED Reason=None Dependency=(null) Requeue=1 Restarts=0 BatchFlag=0 ExitCode=0:0 RunTime=00:00:00 TimeLimit=01:00:00 TimeMin=N/A SubmitTime=2013-11-26T09:56:22 EligibleTime=2013-11-26T09:56:22 StartTime=2013-11-26T09:56:22 EndTime=2013-11-26T09:56:22 PreemptTime=None SuspendTime=None SecsPreSuspend=0 Partition=creamtest AllocNode:Sid=cert-08:7559 ReqNodeList=(null) ExcNodeList=(null) NodeList=cert-wn64-[05-06] BatchHost=cert-wn64-05 NumNodes=2 NumCPUs=4 CPUs/Task=1 ReqS:C:T=*:*:* MinCPUsNode=1 MinMemoryNode=0 MinTmpDiskNode=0 Features=(null) Gres=(null) Reservation=(null) Shared=OK Contiguous=0 Licenses=(null) Network=(null) Command=/bin/hostname WorkDir=/root/siteinfo