Changeset 1620


Ignore:
Timestamp:
Jul 17, 2015 11:37:41 AM (10 years ago)
Author:
heinze
Message:

adjustments for Mistral at DKRZ Hamburg (lcbullhh)

Location:
palm/trunk/SCRIPTS
Files:
1 added
3 edited

Legend:

Unmodified
Added
Removed
  • palm/trunk/SCRIPTS/mbuild

    r1614 r1620  
    2222# Current revisions:
    2323# ------------------
    24 #
     24# adjustments for Mistral at DKRZ Hamburg (lcbullhh)
    2525#
    2626# Former revisions:
     
    516516       # DETERMINE IP-ADDRESS OF THE REMOTE-HOST
    517517    case  $remote_host  in
     518        (lcbullhh)       remote_address=136.172.50.13;;
    518519        (lccrayb)        remote_address=130.73.233.1;;
    519520        (lccrayh)        remote_address=130.75.4.1;;
  • palm/trunk/SCRIPTS/mrun

    r1610 r1620  
    2222# Current revisions:
    2323# ------------------
    24 #
     24# adjustments for Mistral at DKRZ Hamburg (lcbullhh)
    2525#
    2626# Former revisions:
     
    324324 typeset -i  cputime i ii iia iii iio icycle inode ival jobges jobsek last_char_int maxcycle minuten nodes pes remaining_pes sekunden tp1
    325325
    326 
    327 
    328326    # ERROR HANDLING IN CASE OF EXIT
    329327 trap 'rm -rf  $working_directory/tmp_mrun
     
    610608    do_remote=true
    611609    case  $host  in
    612         (ibm|ibmh|ibmkisti|ibmku|ibms|nech|lccrayb|lccrayh|lccrayf|lcflow|lckyoto|unics|lcxe6|lcxt5m|lck|lckiaps|lckordi|lckyuh|lckyut|lcsb)  true;;
     610        (ibm|ibmh|ibmkisti|ibmku|ibms|nech|lcbullhh|lccrayb|lccrayh|lccrayf|lcflow|lckyoto|unics|lcxe6|lcxt5m|lck|lckiaps|lckordi|lckyuh|lckyut|lcsb)  true;;
    613611        (*)  printf "\n"
    614612             printf "\n  +++ sorry: execution of batch jobs on remote host \"$host\""
     
    669667 fi
    670668
    671 
    672669    # CHECK, IF FILE-ARCHIVING HAS FAILED IN PREVIOUS JOB (OF A JOB-CHAIN)
    673670 if [[ -f ~/job_queue/ARCHIVE_ERROR_$fname ]]
     
    697694 if [[ "$read_from_config" = false ]]
    698695 then
    699 
    700696    [[ $silent = false ]]  &&  printf "\n    Reading the configuration file... "
    701697
     
    769765                   do_remote=true
    770766                   case  $host  in
    771                        (ibm|ibmh|ibmkisti|ibmku|ibms|lccrayb|lccrayh|lccrayf|lcflow|lckyoto|nech|unics|lcxe6|lcxt5m|lck|lckiaps|lckordi|lckyuh|lckyut|lcsb)  true;;
     767                       (ibm|ibmh|ibmkisti|ibmku|ibms|lcbullhh|lccrayb|lccrayh|lccrayf|lcflow|lckyoto|nech|unics|lcxe6|lcxt5m|lck|lckiaps|lckordi|lckyuh|lckyut|lcsb)  true;;
    772768                       (*)  printf "\n  +++ sorry: execution of batch jobs on remote host \"$host\""
    773769                            printf "\n      is not available"
     
    887883 else
    888884
    889 
    890885       # EVALUATE THE CONFIGURATION-FILE BY FORTRAN-PROGRAM
    891886    [[ $silent = false ]]  &&  printf "..."
     
    10341029    do_remote=true
    10351030    case  $host  in
    1036         (ibm|ibmh|ibmkisti|ibmku|ibms|lccrayb|lccrayh|lccrayf|lcflow|lckyoto|nech|unics|lcxe6|lcxt5m|lck|lckiaps|lckordi|lckyuh|lckyut|lcsb)  true;;
     1031        (ibm|ibmh|ibmkisti|ibmku|ibms|lcbullhh|lccrayb|lccrayh|lccrayf|lcflow|lckyoto|nech|unics|lcxe6|lcxt5m|lck|lckiaps|lckordi|lckyuh|lckyut|lcsb)  true;;
    10371032        (*)  printf "\n"
    10381033             printf "\n  +++ sorry: execution of batch jobs on remote host \"$host\""
     
    10451040
    10461041
     1042
    10471043    # IN CASE OF PARALLEL EXECUTION, CHECK SOME SPECIFICATIONS CONCERNING PROCESSOR NUMBERS
    10481044 if [[ "$cond1" = parallel  ||  "$cond2" = parallel ]]
     
    11601156        (ibmh)       queue=cluster;;
    11611157        (ibmkisti)   queue=class.32plus;;
     1158        (lcbullhh)   queue=compute;;
    11621159        (lccrayb)    queue=mpp1q;;
    11631160        (lccrayh)    queue=mpp1q;;
     
    21702167 fi
    21712168
    2172 
    21732169    # QUERY FOR CONTINUE (ON LOCAL MACHINES ONLY)
    21742170 if [[ $remotecall = false  &&  $silent = false  &&  $jobmo != BATCH ]]
     
    26342630 rm -rf  $check_sources
    26352631 cd $working_directory
    2636 
    26372632
    26382633    # DETERMINE PATH FOR MAKE DEPOSITORY
     
    28252820             [[ "$check_for_file" = "" ]]  &&  compile_error=true
    28262821             continue   # STATUS=1, IF a.out EXISTS
    2827           elif [[ $localhost = lccrayb  ||  $localhost = lccrayf ||  $localhost = lccrayh ]]
     2822          elif [[ $localhost = lcbullhh  ||  $localhost = lccrayb  ||  $localhost = lccrayf ||  $localhost = lccrayh ]]
    28282823          then
    28292824             make $mopts -f Makefile PROG=a.out  F90=$compiler_name  COPT="$cpp_options"  F90FLAGS="$fopts"  LDFLAGS="$lopts"
     
    34973492                # COPY HOSTFILE FROM SOURCE DIRECTORY OR CREATE IT, IF IT
    34983493                # DOES NOT EXIST
    3499              if [[  $host != lccrayb  &&$host != lccrayf  && $host != lccrayh  &&  $host != lckyuh  &&  $host != lckyut ]]
     3494             if [[  $host != lccrayb  && $host != lccrayf  && $host != lccrayh  &&  $host != lckyuh  &&  $host != lckyut ]]
    35003495             then
    35013496                if [[ -f $hostfile ]]
     
    36663661             echo "mpirun  -np $ii  -hostfile $PBS_NODEFILE ./a.out  $ROPTS"
    36673662             mpirun  -np $ii  -hostfile $PBS_NODEFILE ./a.out  $ROPTS
     3663
     3664          elif [[ $host = lcbullhh ]]
     3665          then
     3666             export OMPI_MCA_pml=cm
     3667             export OMPI_MCA_mtl=mxm
     3668             export OMPI_MCA_coll=^ghc
     3669             export OMPI_MCA_mtl_mxm_np=0
     3670             export MXM_RDMA_PORTS=mlx5_0:1
     3671             export MXM_LOG_LEVEL=ERROR
     3672             export OMP_NUM_THREADS=$threads_per_task
     3673             export KMP_AFFINITY=verbose,granularity=core,compact,1
     3674             export KMP_STACKSIZE=64m
     3675
     3676             srun  --nodes=$nodes --ntasks-per-node=$tasks_per_node ./a.out 
    36683677          else
    36693678             mpprun  -n $numprocs  a.out  $ROPTS
    36703679          fi
    3671        else
     3680      else
    36723681          a.out  $ROPTS
    36733682       fi
     
    38923901                   then
    38933902                      ssh $usern@hlogin1 ". \\$HOME/.profile; cd $TEMPDIR; batch_scp $PORTOPT $cps -b -m -u $return_username $return_address  ${localout[$i]} \"${pathout[$i]}\" ${localhost}_${fname}${endout[$i]}  ${extout[$i]}"
     3903                   elif [[ $localhost = lcbullhh ]]
     3904                   then
     3905                      ssh $usern@mlogin101 ". \\$HOME/.profile; cd $TEMPDIR; batch_scp $PORTOPT $cps -b -m -u $return_username $return_address  ${localout[$i]} \"${pathout[$i]}\" ${localhost}_${fname}${endout[$i]}  ${extout[$i]}"
    38943906                   elif [[ $localhost = lcxe6 ]]
    38953907                   then
     
    44544466          then
    44554467
    4456              if [[ $localhost = lccrayb  || $localhost = lccrayh  ||  $localhost = nech  ||  $localhost = ibmh  ||  $localhost = ibmkisti  ||  $localhost = ibmku  ||  $localhost = ibms  ||  $localhost = lcflow  ||  $localhost = lckyu* || $localhost = lcxe6 ]]
     4468             if [[ $localhost = lcbullhh  ||  $localhost = lccrayb  ||  $localhost = lccrayh  ||  $localhost = nech  ||  $localhost = ibmh  ||  $localhost = ibmkisti  ||  $localhost = ibmku  ||  $localhost = ibms  ||  $localhost = lcflow  ||  $localhost = lckyu*  || $localhost = lcxe6 ]]
    44574469             then
    44584470                echo "*** ssh will be used to initiate restart-runs!"
     
    44644476                   then
    44654477                      ssh $SSH_PORTOPT $usern@136.172.40.15 "ssh $SSH_PORTOPT $return_address -l $return_username \". \\\$HOME/.profile; module load intel-compiler hdf5 netcdf; PATH=\\\$PATH:$LOCAL_MRUN_PATH;export PALM_BIN=$LOCAL_MRUN_PATH;cd $LOCAL_PWD; $mc\" "
     4478                   elif [[ $localhost = lcbullhh ]]
     4479                   then
     4480                      ssh $SSH_PORTOPT $usern@mlogin101 "ssh $SSH_PORTOPT $return_address -l $return_username \". \\\$HOME/.profile; module load intel-compiler hdf5 netcdf; PATH=\\\$PATH:$LOCAL_MRUN_PATH;export PALM_BIN=$LOCAL_MRUN_PATH;cd $LOCAL_PWD; $mc\" "
    44664481                   elif [[ $localhost = lccrayb ]]
    44674482                   then
     
    45824597 else
    45834598
    4584 
    45854599       # PREPARING ACTIONS,
    45864600       # IF A BATCH-JOB IS TO BE GENERATED AND STARTED ON A REMOTE-MACHINE
     
    46964710       #              THIS MAY CREATE A QUITE LARGE JOB-FILE, WHICH CAN CAUSE PROBLEMS WITH SOME
    46974711       #              QUEUEING-SYSTEMS
    4698     if [[ $host = ibmkisti  ||  $host = lccrayb  ||  $host = lccrayf  ||  $host = lccrayh ]]
     4712    if [[ $host = ibmkisti  ||  $host = lcbullhh  ||  $host = lccrayb  ||  $host = lccrayf  ||  $host = lccrayh ]]
    46994713    then
    47004714
     
    47114725       if [[ $host = $localhost ]]
    47124726       then
    4713 
    47144727             # DUE TO UNKNOWN REASONS, COPY WITH cp COMMAND CREATES CORRUPT
    47154728             # FILES ON CRAY XC30 SYSTEMS (CSC HELSINKI), rsync IS USED INSTEAD
     
    47274740             echo  "localdir=\`pwd\`"                            >>  $jobfile
    47284741             echo  "ssh $SSH_PORTOPT $remote_username@gaiad \"cd \$localdir; scp $PORTOPT -r  $return_username@$return_address:$working_directory/SOURCES_FOR_RUN_$fname .\" "  >>  $jobfile
     4742         elif [[ $host = lcbullhh ]]
     4743          then
     4744             echo  "localdir=\`pwd\`"                            >>  $jobfile
     4745             echo  "ssh $SSH_PORTOPT $remote_username@mlogin101 \"cd \$localdir; scp $PORTOPT -r  $return_username@$return_address:$working_directory/SOURCES_FOR_RUN_$fname .\" "  >>  $jobfile
     4746
    47294747          elif [[ $host = lccrayb ]]
    47304748          then
  • palm/trunk/SCRIPTS/subjob

    r1576 r1620  
    2323# Current revisions:
    2424# ------------------
    25 #
     25# adjustments for Mistral at DKRZ Hamburg (lcbullhh)
    2626#
    2727# Former revisions:
     
    174174
    175175
    176 
    177 
    178176    # DETERMINE NAME OF LOCAL HOST
    179177 local_host=$(hostname)
    180 
    181 
    182178
    183179    # SET HOST-SPECIFIC VARIABLES (CHECK, IF LOCAL HOST
     
    223219     (meller)                local_address=134.106.74.155; local_host=lcfor;;
    224220     (meteo-login*)          local_address=193.166.211.144;local_host=lcxt5m;;
     221     (mlogin1*|m1*)          local_address=136.172.50.13;  local_host=lcbullhh;;
    225222     (hexagon*)              local_address=129.177.20.113; local_host=lcxe6;;
    226223     (nobel*)                local_address=150.183.5.101;  local_host=ibms;;
     
    367364        (ibmku)   queue=s4; remote_address=133.5.4.129; submcom=/usr/local/bin/llsubmit;;
    368365        (ibms)    queue=p_normal; remote_address=150.183.5.101; submcom=/usr/lpp/LoadL/full/bin/llsubmit;;
     366        (lcbullhh)    queue=compute; remote_address=136.172.50.13; submcom=/usr/bin/sbatch;;
    369367        (lccrayb) queue=mpp1testq; remote_address=130.73.233.1; submcom=/opt/moab/default/bin/msub;;
    370368        (lccrayh) queue=mpp1testq; remote_address=130.75.4.1; submcom=/opt/moab/default/bin/msub;;
     
    413411        (ibms)   case  $ndq  in
    414412                     (express|normal|p_express|p_normal|p_normal_1.3|p_normal_1.7|grand)     error=false;;
     413                     (*)                                     error=true;;
     414                 esac;;
     415        (lcbullhh) case  $ndq  in
     416                     (compute|shared)  error=false;;
    415417                     (*)                                     error=true;;
    416418                 esac;;
     
    617619 fi
    618620
    619 
    620 
    621621    # GENERATE RANDOM IDENTIFIER, AND DETERMINE THE JOBNAME ON THE TARGET HOST
    622622 identifier=$RANDOM
     
    630630    remote_dayfile=/dev/null
    631631 fi
    632 
    633632
    634633
     
    780779%%END%%
    781780
     781 elif [[ $remote_host = lcbullhh ]]
     782 then
     783    if [[ $numprocs != 0 ]]
     784    then
     785       cat > $job_to_send << %%END%%
     786#!/bin/bash -l
     787#SBATCH -J $job_name
     788#SBATCH -t $timestring
     789#SBATCH -N $nodes
     790#SBATCH --ntasks-per-node=$processes_per_node
     791#SBATCH -p $queue
     792#SBATCH -o $remote_dayfile
     793#SBATCH -e $remote_dayfile
     794#SBATCH -A $project_account
     795
     796$init_cmds
     797$module_calls
     798
     799%%END%%
     800
     801    else
     802       cat > $job_to_send << %%END%%
     803#!/bin/bash -l
     804#SBATCH -J $job_name
     805#SBATCH -t $timestring
     806#SBATCH -l ncpus=1
     807#SBATCH -l pmem=${memory}mb
     808#SBATCH -m abe
     809#SBATCH -o $remote_dayfile
     810#SBATCH -e $remote_dayfile
     811#SBATCH -A $project_account
     812
     813$init_cmds
     814$module_calls
     815
     816%%END%%
     817
     818    fi
     819
    782820 elif [[ $remote_host = lccrayb || $remote_host = lccrayh ]]
    783821 then
     
    11961234    echo "trap '"                               >>  $job_to_send
    11971235    echo "set +vx"                              >>  $job_to_send
    1198     if [[ $(echo $remote_host | cut -c1-3) = ibm  ||  $remote_host = lccrayb  ||  $remote_host = lccrayh  ||  $(echo $remote_host | cut -c1-3) = nec  ||  $remote_host = lcflow  ||  $remote_host = lckiaps  ||  $remote_host = lckyu* || $remote_host = lcxe6 ]]
     1236    if [[ $(echo $remote_host | cut -c1-3) = ibm  ||  $remote_host = lcbullhh  ||  $remote_host = lccrayb  ||  $remote_host = lccrayh  ||  $(echo $remote_host | cut -c1-3) = nec  ||  $remote_host = lcflow  ||  $remote_host = lckiaps  ||  $remote_host = lckyu* || $remote_host = lcxe6 ]]
    11991237    then
    12001238       if [[ $remote_host = ibmh ]]
     
    12101248       then
    12111249          return_queue=p_normal
     1250       elif [[ $remote_host = lcbullhh ]]
     1251       then
     1252          return_queue=shared
    12121253       elif [[ $remote_host = lccrayb || $remote_host = lccrayh ]]
    12131254       then
     
    12761317          echo "[[ \"\$for_subjob_to_do\" != \"\" ]]  &&  eval \$for_subjob_to_do"  >>  $job_to_send
    12771318          echo "%%END%%"                           >>  $job_to_send
     1319
     1320       elif [[ $remote_host = lcbullhh ]]
     1321       then
     1322          echo "cat > scpjob.$identifier << %%END%%"        >>  $job_to_send
     1323          echo "#!/bin/bash"                             >>  $job_to_send
     1324          echo "#SBATCH --job-name=job_protocol_transfer" >>  $job_to_send
     1325          echo "#SBATCH -t 00:20:00"                     >>  $job_to_send
     1326          echo "#SBATCH -N 1"                            >>  $job_to_send
     1327          echo "#SBATCH -n 1"                            >>  $job_to_send
     1328          echo "#SBATCH -o \$HOME/job_queue/last_job_transfer_protocol"      >>  $job_to_send
     1329          echo "#SBATCH -o $remote_dayfile"              >>  $job_to_send
     1330          echo "#SBATCH -e $remote_dayfile"              >>  $job_to_send
     1331          echo "#SBATCH -A $project_account"             >>  $job_to_send
     1332          echo "#SBATCH -p $return_queue"                >>  $job_to_send
     1333          echo " "                                       >>  $job_to_send
     1334          echo "set -x"                                  >>  $job_to_send
     1335          echo "batch_scp  $PORTOPT  -d  -w 10  -u $local_user $local_address  ${job_catalog}/$remote_dayfile  \"$job_catalog\"  $local_dayfile"  >>  $job_to_send
     1336          echo "[[ \"\$for_subjob_to_do\" != \"\" ]]  &&  eval \$for_subjob_to_do"  >>  $job_to_send
     1337          echo "%%END%%"                                 >>  $job_to_send
    12781338
    12791339       elif [[ $remote_host = lckyuh ]]
     
    13871447          echo "[[ \"\$for_subjob_to_do\" != \"\" ]]  &&  eval \$for_subjob_to_do"  >>  $job_to_send
    13881448          echo "%%END%%"                           >>  $job_to_send
     1449
    13891450       fi
    13901451
     
    13921453       then
    13931454          echo "llsubmit  scpjob.$identifier"      >>  $job_to_send
     1455       elif [[ $remote_host = lcbullhh ]]
     1456       then
     1457          echo "sbatch  scpjob.$identifier"               >>  $job_to_send
    13941458       elif [[ $remote_host = lccrayb || $remote_host = lccrayh ]]
    13951459       then
     
    14361500
    14371501
    1438 
    14391502    # APPEND THE JOB-FILE (CREATED BY mrun) TO THE JOB-DIRECTIVES GENERATED ABOVE
    14401503 cat  $file_to_send  >>  $job_to_send
     
    15251588             eval  $submcom  -q $queue  $job_on_remhost
    15261589          fi
     1590       elif [[ $local_host = lcbullhh ]]
     1591       then
     1592          if [[ $queue = default ]]
     1593          then
     1594             eval  $submcom  $job_on_remhost
     1595          fi
    15271596       else
    15281597          qsub  $job_on_remhost
     
    15391608 fi
    15401609
    1541 
    1542 
    15431610    # FINAL ACTIONS
    15441611 if [[ $no_submit = false ]]
Note: See TracChangeset for help on using the changeset viewer.