Changeset 1468 for palm/trunk/SOURCE


Ignore:
Timestamp:
Sep 24, 2014 2:06:57 PM (10 years ago)
Author:
maronga
Message:

New flag files allow to force unscheduled termination/restarts of batch jobs, progress output is made for batch runs, small adjustments for lxce6 and lccrayh/lccrayb

Location:
palm/trunk/SOURCE
Files:
8 edited

Legend:

Unmodified
Added
Removed
  • palm/trunk/SOURCE/check_for_restart.f90

    r1354 r1468  
    2020! Current revisions:
    2121! -----------------
    22 !
     22! Added support for unscheduled job termination using the flag files
     23! DO_STOP_NOW and DO_RESTART_NOW
    2324!
    2425! Former revisions:
     
    6566
    6667
    67     LOGICAL :: terminate_run_l  !:
     68    LOGICAL :: terminate_run_l           !:
     69    LOGICAL :: do_stop_now = .FALSE.     !:
     70    LOGICAL :: do_restart_now = .FALSE.  !:
    6871
    6972    REAL(wp) ::  remaining_time !:
     
    113116
    114117       terminate_coupled = 3
     118
    115119#if defined( __parallel )
    116120       IF ( myid == 0 ) THEN
     
    126130    ENDIF
    127131
    128 !
    129 !-- Set the stop flag also, if restart is forced by user
    130     IF ( time_restart /= 9999999.9_wp  .AND.                                      &
     132
     133!
     134!-- Check if a flag file exists that forces a termination of the model
     135    terminate_run_l = .FALSE.
     136    IF ( myid == 0 )  THEN
     137       INQUIRE(FILE="DO_STOP_NOW", EXIST=do_stop_now)
     138       INQUIRE(FILE="DO_RESTART_NOW", EXIST=do_restart_now)
     139
     140       IF ( do_stop_now .OR. do_restart_now )  THEN
     141
     142          terminate_run_l = .TRUE.
     143
     144          WRITE( message_string, * ) 'run will be terminated because user ',   &
     145                                  'forced a job finialization using a flag',   &
     146                                  'file:',                                     &
     147                                  '&DO_STOP_NOW: ', do_stop_now,               &
     148                                  '&DO_RESTART_NOW: ', do_restart_now
     149          CALL message( 'check_for_restart', 'PA0398', 0, 0, 0, 6, 0 )
     150
     151       ENDIF
     152    ENDIF
     153
     154
     155#if defined( __parallel )
     156!
     157!-- Make a logical OR for all processes. Stop the model run if at least
     158!-- one processor has reached the time limit.
     159    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
     160    CALL MPI_ALLREDUCE( terminate_run_l, terminate_run, 1, MPI_LOGICAL,        &
     161                        MPI_LOR, comm2d, ierr )
     162#else
     163    terminate_run = terminate_run_l
     164#endif
     165
     166!
     167!-- In case of coupled runs inform the remote model of the termination
     168!-- and its reason, provided the remote model has not already been
     169!-- informed of another termination reason (terminate_coupled > 0) before,
     170!-- or vice versa (terminate_coupled_remote > 0).
     171    IF ( terminate_run .AND. coupling_mode /= 'uncoupled' .AND.                &
     172         terminate_coupled == 0 .AND.  terminate_coupled_remote == 0 )  THEN
     173
     174       terminate_coupled = 6
     175
     176#if defined( __parallel )
     177       IF ( myid == 0 ) THEN
     178          CALL MPI_SENDRECV( terminate_coupled,        1, MPI_INTEGER,      &
     179                             target_id,  0,                                 &
     180                             terminate_coupled_remote, 1, MPI_INTEGER,      &
     181                             target_id,  0,                                 &
     182                             comm_inter, status, ierr )   
     183       ENDIF
     184       CALL MPI_BCAST( terminate_coupled_remote, 1, MPI_INTEGER, 0,         &
     185                       comm2d, ierr ) 
     186#endif
     187
     188    ENDIF
     189
     190!
     191!-- Set the stop flag also, if restart is forced by user settings
     192    IF ( time_restart /= 9999999.9_wp  .AND.                                   &
    131193         time_restart < time_since_reference_point )  THEN
    132194
     
    183245!
    184246!-- If the run is stopped, set a flag file which is necessary to initiate
    185 !-- the start of a continuation run
    186     IF ( terminate_run  .AND.  myid == 0 )  THEN
     247!-- the start of a continuation run, except if the user forced to stop the
     248!-- run without restart
     249    IF ( terminate_run  .AND.  myid == 0 .AND. .NOT. do_stop_now)  THEN
    187250
    188251       OPEN ( 90, FILE='CONTINUE_RUN', FORM='FORMATTED' )
  • palm/trunk/SOURCE/check_open.f90

    r1360 r1468  
    2020! Current revisions:
    2121! -----------------
    22 !
     22! Adapted for use on up to 6-digit processor cores
     23! Added file unit 117 (PROGRESS)
    2324!
    2425! Former revisions:
     
    187188    IF ( openfile(file_id)%opened_before )  THEN
    188189       SELECT CASE ( file_id )
    189           CASE ( 13, 14, 21, 22, 23, 80:85 )
     190          CASE ( 13, 14, 21, 22, 23, 80:85, 117 )
    190191             IF ( file_id == 14 .AND. openfile(file_id)%opened_before )  THEN
    191192                message_string = 're-open of unit ' //                         &
     
    209210    SELECT CASE ( file_id )
    210211
    211        CASE ( 15, 16, 17, 18, 19, 50:59, 81:84, 104:105, 107, 109 )
     212       CASE ( 15, 16, 17, 18, 19, 50:59, 81:84, 104:105, 107, 109, 117 )
    212213             
    213214          IF ( myid /= 0 )  THEN
     
    280281          ELSE
    281282!
    282 !--          First opening of unit 13 openes file _0000 on all PEs because only
    283 !--          this file contains the global variables
     283!--          First opening of unit 13 openes file _000000 on all PEs because
     284!--          only this file contains the global variables
    284285             IF ( .NOT. openfile(file_id)%opened_before )  THEN
    285                 OPEN ( 13, FILE='BININ'//TRIM( coupling_char )//'/_0000',      &
     286                OPEN ( 13, FILE='BININ'//TRIM( coupling_char )//'/_000000',      &
    286287                           FORM='UNFORMATTED', STATUS='OLD' )
    287288             ELSE
     
    336337          ENDIF
    337338          IF ( myid_char == '' )  THEN
    338              OPEN ( 20, FILE='DATA_LOG'//TRIM( coupling_char )//'/_0000',      &
     339             OPEN ( 20, FILE='DATA_LOG'//TRIM( coupling_char )//'/_000000',      &
    339340                        FORM='UNFORMATTED', POSITION='APPEND' )
    340341          ELSE
     
    10531054
    10541055          ENDIF
     1056
     1057
     1058!
     1059!--    Progress file that is used by the PALM watchdog
     1060       CASE ( 117 )
     1061
     1062          OPEN ( 117, FILE='PROGRESS'//coupling_char, STATUS='REPLACE', FORM='FORMATTED' )
     1063
    10551064
    10561065       CASE ( 201:200+2*max_masks )
  • palm/trunk/SOURCE/header.f90

    r1430 r1468  
    2020! Current revisions:
    2121! -----------------
    22 !
     22! Adapted for use on up to 6-digit processor cores
    2323!
    2424! Former revisions:
     
    16941694
    16951695 99 FORMAT (1X,78('-'))
    1696 100 FORMAT (/1X,'******************************',6X,42('-')/        &
    1697             1X,'* ',A,' *',6X,A/                               &
    1698             1X,'******************************',6X,42('-'))
    1699 101 FORMAT (37X,'coupled run using MPI-',I1,': ',A/ &
    1700             37X,42('-'))
    1701 102 FORMAT (/' Date:                 ',A8,6X,'Run:       ',A20/      &
    1702             ' Time:                 ',A8,6X,'Run-No.:   ',I2.2/     &
     1696100 FORMAT (/1X,'******************************',4X,44('-')/        &
     1697            1X,'* ',A,' *',4X,A/                               &
     1698            1X,'******************************',4X,44('-'))
     1699101 FORMAT (35X,'coupled run using MPI-',I1,': ',A/ &
     1700            35X,42('-'))
     1701102 FORMAT (/' Date:                 ',A8,4X,'Run:       ',A20/      &
     1702            ' Time:                 ',A8,4X,'Run-No.:   ',I2.2/     &
    17031703            ' Run on host:        ',A10)
    17041704#if defined( __parallel )
    1705 103 FORMAT (' Number of PEs:',10X,I6,6X,'Processor grid (x,y): (',I3,',',I3, &
     1705103 FORMAT (' Number of PEs:',10X,I6,4X,'Processor grid (x,y): (',I4,',',I4, &
    17061706              ')',1X,A)
    1707 104 FORMAT (' Number of PEs:',8X,I5,9X,'Tasks:',I4,'   threads per task:',I4/ &
    1708               37X,'Processor grid (x,y): (',I3,',',I3,')',1X,A)
    1709 105 FORMAT (37X,'One additional PE is used to handle'/37X,'the dvrp output!')
    1710 106 FORMAT (37X,'A 1d-decomposition along x is forced'/ &
    1711             37X,'because the job is running on an SMP-cluster')
    1712 107 FORMAT (37X,'A 1d-decomposition along ',A,' is used')
    1713 108 FORMAT (37X,'Max. # of parallel I/O streams is ',I5)
    1714 109 FORMAT (37X,'Precursor run for coupled atmos-ocean run'/ &
    1715             37X,42('-'))
    1716 114 FORMAT (37X,'Coupled atmosphere-ocean run following'/ &
    1717             37X,'independent precursor runs'/             &
    1718             37X,42('-'))
     1707104 FORMAT (' Number of PEs:',10X,I6,4X,'Tasks:',I4,'   threads per task:',I4/ &
     1708              35X,'Processor grid (x,y): (',I4,',',I4,')',1X,A)
     1709105 FORMAT (35X,'One additional PE is used to handle'/37X,'the dvrp output!')
     1710106 FORMAT (35X,'A 1d-decomposition along x is forced'/ &
     1711            35X,'because the job is running on an SMP-cluster')
     1712107 FORMAT (35X,'A 1d-decomposition along ',A,' is used')
     1713108 FORMAT (35X,'Max. # of parallel I/O streams is ',I5)
     1714109 FORMAT (35X,'Precursor run for coupled atmos-ocean run'/ &
     1715            35X,42('-'))
     1716114 FORMAT (35X,'Coupled atmosphere-ocean run following'/ &
     1717            35X,'independent precursor runs'/             &
     1718            35X,42('-'))
    17191719117 FORMAT (' Accelerator boards / node:  ',I2)
    17201720#endif
  • palm/trunk/SOURCE/init_pegrid.f90

    r1436 r1468  
    2020! Current revisions:
    2121! ------------------
    22 !
     22! Adapted for use on up to 6-digit processor cores
    2323!
    2424! Former revisions:
     
    242242                          comm2d, ierr )
    243243    CALL MPI_COMM_RANK( comm2d, myid, ierr )
    244     WRITE (myid_char,'(''_'',I4.4)')  myid
     244    WRITE (myid_char,'(''_'',I6.6)')  myid
    245245
    246246    CALL MPI_CART_COORDS( comm2d, myid, ndim, pcoord, ierr )
  • palm/trunk/SOURCE/modules.f90

    r1451 r1468  
    2020! Current revisions:
    2121! ------------------
    22 !
     22! Adapted for use on up to 6-digit processor cores.
     23! Increased identifier string length for user-defined quantities to 20.
    2324!
    2425! Former revisions:
     
    548549    CHARACTER (LEN=1000) ::  message_string = ' '
    549550
    550     CHARACTER (LEN=11), DIMENSION(100) ::  data_output = ' ',    &
     551    CHARACTER (LEN=20), DIMENSION(100) ::  data_output = ' ',    &
    551552                                           data_output_user = ' ', doav = ' '
    552     CHARACTER (LEN=10), DIMENSION(max_masks,100) ::  &
     553    CHARACTER (LEN=20), DIMENSION(max_masks,100) ::  &
    553554         data_output_masks = ' ', data_output_masks_user = ' '
    554555
    555     CHARACTER (LEN=10), DIMENSION(300) ::  data_output_pr = ' '
    556     CHARACTER (LEN=10), DIMENSION(200) ::  data_output_pr_user = ' '
     556    CHARACTER (LEN=20), DIMENSION(300) ::  data_output_pr = ' '
     557    CHARACTER (LEN=20), DIMENSION(200) ::  data_output_pr_user = ' '
    557558    CHARACTER (LEN=20), DIMENSION(11)  ::  netcdf_precision = ' '
    558559
     
    12411242#endif
    12421243    CHARACTER(LEN=2) ::  send_receive = 'al'
    1243     CHARACTER(LEN=5) ::  myid_char = ''
     1244    CHARACTER(LEN=7) ::  myid_char = ''
    12441245    INTEGER(iwp)          ::  acc_rank, comm1dx, comm1dy, comm2d, comm_inter,       &
    12451246                              comm_palm, id_inflow = 0, id_recycling = 0, ierr,     &
  • palm/trunk/SOURCE/palm.f90

    r1403 r1468  
    2020! Current revisions:
    2121! -----------------
    22 !
     22! Adapted for use on up to 6-digit processor cores
    2323!
    2424! Former revisions:
     
    169169!
    170170!-- Test output (to be removed later)
    171     WRITE (*,'(A,I4,A,I3,A,I3,A,I3)') '*** Connect MPI-Task ', myid,' to CPU ',&
     171    WRITE (*,'(A,I6,A,I3,A,I3,A,I3)') '*** Connect MPI-Task ', myid,' to CPU ',&
    172172                                      acc_rank, ' Devices: ', num_acc_per_node,&
    173173                                      ' connected to:',                        &
     
    191191!
    192192!-- Open a file for debug output
    193     WRITE (myid_char,'(''_'',I4.4)')  myid
     193    WRITE (myid_char,'(''_'',I6.6)')  myid
    194194    OPEN( 9, FILE='DEBUG'//TRIM( coupling_char )//myid_char, FORM='FORMATTED' )
    195195
  • palm/trunk/SOURCE/progress_bar.f90

    r1402 r1468  
    2020! Current revisions:
    2121! -----------------
    22 !
     22! Added support for progress file PROGRESS which is used in case of batch jobs
    2323!
    2424! Former revisions:
     
    2828! Description:
    2929! ------------
    30 ! CPU-time measurements for any program part whatever. Results of the
     30! This routine prints either a progress bar on the standard output in case of
     31! interactive runs, or it prints the progress in a separate file called
     32! PROGRESS.
    3133!------------------------------------------------------------------------------!
    3234
    3335    USE control_parameters,                                                    &
    34         ONLY : end_time, simulated_time, simulated_time_at_begin, time_restart
     36        ONLY : end_time, run_identifier, simulated_time,                       &
     37               simulated_time_at_begin, time_restart
    3538
    36     USE, INTRINSIC ::  ISO_FORTRAN_ENV,                                     &
     39    USE, INTRINSIC ::  ISO_FORTRAN_ENV,                                        &
    3740        ONLY :  OUTPUT_UNIT
    3841
     
    6063
    6164    SUBROUTINE init_progress_bar
     65!------------------------------------------------------------------------------!
     66! Description:
     67! ------------
     68! Initialize the progress bar/file
     69!------------------------------------------------------------------------------!
    6270
    6371       IMPLICIT NONE
     
    7482       ENDIF
    7583
    76        bar = '____________________________________________________________'
    77        crosses = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
     84       IF ( batch_job )  THEN
     85
     86          CALL check_open ( 117 )
     87          WRITE ( 117, FMT='(A20,/)' ) run_identifier
     88
     89       ELSE
     90          bar = '____________________________________________________________'
     91          crosses = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
    7892!
    79 !--    Line feed on stdout to seperate the progress bar from previous messages
    80        WRITE ( OUTPUT_UNIT, '(1X)' )
     93!--       Line feed on stdout to seperate the progress bar from previous messages
     94          WRITE ( OUTPUT_UNIT, '(1X)' )
    8195#if defined( __intel_compiler )
    8296!
    83 !--    The Intel compiler does not allow to immediately flush the output buffer
    84 !--    in case that option ADVANCE='NO' is used in the write statement.
    85 !--    A workaround is to set a special carriage control feature and use "+" as
    86 !--    first output character, but this non-standard and only available with the
    87 !--    Intel compiler
    88        OPEN ( OUTPUT_UNIT, CARRIAGECONTROL='FORTRAN' )
     97!--       The Intel compiler does not allow to immediately flush the output buffer
     98!--       in case that option ADVANCE='NO' is used in the write statement.
     99!--       A workaround is to set a special carriage control feature and use "+" as
     100!--       first output character, but this non-standard and only available with the
     101!--       Intel compiler
     102          OPEN ( OUTPUT_UNIT, CARRIAGECONTROL='FORTRAN' )
    89103#endif
     104
     105       ENDIF
     106
    90107       initialized = .TRUE.
    91108
     
    97114! Description:
    98115! ------------
    99 !
     116! Print progress data to standard output (interactive) or to file (batch jobs)
    100117!------------------------------------------------------------------------------!
    101118
     
    104121       REAL(wp) ::  remaining_time_in_percent  !: remaining time to be simulated
    105122                                               !: in the job
     123       REAL(wp) ::  remaining_time_in_percent_total !: total remaining time of
     124                                                    !: the job chain
     125
     126       IF ( .NOT. initialized )  CALL init_progress_bar
     127
     128
     129       remaining_time_in_percent =                                             &
     130          ( simulated_time - simulated_time_at_begin ) / time_to_be_simulated
     131
     132       remaining_time_in_percent_total = ( simulated_time / end_time )
    106133
    107134!
    108 !--    Porgress bar does not make sense in batch mode (and also ADVANCE=no does
    109 !--    not properly work in batch mode on Cray XC30)
    110        IF ( batch_job )  RETURN
     135!--    In batch mode, use a file (PROGRESS), otherwise use progress bar
     136       IF ( batch_job )  THEN
    111137
    112        IF ( .NOT. initialized )  CALL init_progress_bar
     138          BACKSPACE ( 117 )
     139          WRITE ( 117, FMT='(F5.2,1X,F5.2)' ) remaining_time_in_percent,       &
     140                                              remaining_time_in_percent_total
     141          CALL local_flush( 117 )
     142
     143       ELSE
     144
    113145!
    114 !--    Calculate length of progress bar
    115        remaining_time_in_percent =                                             &
    116              ( simulated_time - simulated_time_at_begin ) / time_to_be_simulated
     146!--       Calculate length of progress bar
     147          ilength = remaining_time_in_percent * 60.0_wp
     148          ilength = MIN( ilength, 60 )
    117149
    118        ilength = remaining_time_in_percent * 60.0_wp
    119        ilength = MIN( ilength, 60 )
    120 
    121        bar(1:ilength) = crosses(1:ilength)
     150          bar(1:ilength) = crosses(1:ilength)
    122151
    123152#if defined( __intel_compiler )
    124        WRITE ( OUTPUT_UNIT, '(A,6X,''['',A,''] '',F5.1,'' left'')' )           &
    125                '+', bar,                                                       &
    126                 MAX( 0.0_wp, ( 1.0_wp - remaining_time_in_percent ) * 100.0_wp )
     153          WRITE ( OUTPUT_UNIT, '(A,6X,''['',A,''] '',F5.1,'' left'')' )        &
     154                  '+', bar,                                                    &
     155                   MAX( 0.0_wp, ( 1.0_wp - remaining_time_in_percent ) *       &
     156                                  100.0_wp )
    127157#else
    128        WRITE ( OUTPUT_UNIT, '(A,6X,''['',A,''] '',F5.1,'' left'')',            &
    129                ADVANCE='NO' )  CHAR( 13 ), bar,                                &
    130                 MAX( 0.0_wp, ( 1.0_wp - remaining_time_in_percent ) * 100.0_wp )
     158          WRITE ( OUTPUT_UNIT, '(A,6X,''['',A,''] '',F5.1,'' left'')',         &
     159                  ADVANCE='NO' )  CHAR( 13 ), bar,                             &
     160                   MAX( 0.0_wp, ( 1.0_wp - remaining_time_in_percent ) *       &
     161                                  100.0_wp )
    131162#endif
    132        CALL local_flush( OUTPUT_UNIT )
     163          CALL local_flush( OUTPUT_UNIT )
     164
     165       ENDIF
    133166
    134167    END SUBROUTINE output_progress_bar
    135168
    136169    SUBROUTINE finish_progress_bar
     170!------------------------------------------------------------------------------!
     171! Description:
     172! ------------
     173! Finalization of the progress bar/file
     174!------------------------------------------------------------------------------!
    137175
    138176       IMPLICIT NONE
    139177
    140        IF ( batch_job )  RETURN
     178       IF ( batch_job )  THEN
    141179
     180          CALL close_file ( 117 )
     181
     182       ELSE
     183       
    142184#if defined( __intel_compiler )
    143185!
    144 !--    Reset to the default carriage control
    145        OPEN ( OUTPUT_UNIT, CARRIAGECONTROL='LIST' )
     186!--       Reset to the default carriage control
     187          OPEN ( OUTPUT_UNIT, CARRIAGECONTROL='LIST' )
    146188#endif
    147189!
    148 !--    Line feed when simulation has finished
    149        WRITE ( OUTPUT_UNIT, '(1X)' )
     190!--       Line feed when simulation has finished
     191          WRITE ( OUTPUT_UNIT, '(1X)' )
     192
     193       ENDIF
    150194
    151195    END SUBROUTINE finish_progress_bar
    152196
     197
    153198 END MODULE progress_bar
  • palm/trunk/SOURCE/read_3d_binary.f90

    r1401 r1468  
    2020! Current revisions:
    2121! -----------------
    22 !
     22! Adapted for use on up to 6-digit processor cores
    2323!
    2424! Former revisions:
     
    108108    IMPLICIT NONE
    109109
    110     CHARACTER (LEN=5)  ::  myid_char_save
     110    CHARACTER (LEN=7)  ::  myid_char_save
    111111    CHARACTER (LEN=10) ::  binary_version
    112112    CHARACTER (LEN=10) ::  version_on_file
     
    296296!
    297297!--    Set the filename (underscore followed by four digit processor id)
    298        WRITE (myid_char,'(''_'',I4.4)')  j
     298       WRITE (myid_char,'(''_'',I6.6)')  j
    299299       WRITE (9,*) 'myid=',myid,' opening file "',myid_char,'"'
    300300       CALL local_flush( 9 )
    301301
    302302!
    303 !--    Open the restart file. If this file has been created by PE0 (_0000),
     303!--    Open the restart file. If this file has been created by PE0 (_000000),
    304304!--    the global variables at the beginning of the file have to be skipped
    305305!--    first.
Note: See TracChangeset for help on using the changeset viewer.