source: palm/trunk/SOURCE/palm.f90 @ 1482

Last change on this file since 1482 was 1482, checked in by raasch, 10 years ago

adjustments for using CUDA-aware MPI

  • Property svn:keywords set to Id
File size: 12.4 KB
RevLine 
[1]1 PROGRAM palm
2
[1036]3!--------------------------------------------------------------------------------!
4! This file is part of PALM.
5!
6! PALM is free software: you can redistribute it and/or modify it under the terms
7! of the GNU General Public License as published by the Free Software Foundation,
8! either version 3 of the License, or (at your option) any later version.
9!
10! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12! A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13!
14! You should have received a copy of the GNU General Public License along with
15! PALM. If not, see <http://www.gnu.org/licenses/>.
16!
[1310]17! Copyright 1997-2014 Leibniz Universitaet Hannover
[1036]18!--------------------------------------------------------------------------------!
19!
[484]20! Current revisions:
[1]21! -----------------
[1482]22! adjustments for using CUDA-aware OpenMPI
[1375]23!
[1321]24! Former revisions:
25! -----------------
26! $Id: palm.f90 1482 2014-10-18 12:34:45Z raasch $
27!
[1469]28! 1468 2014-09-24 14:06:57Z maronga
29! Adapted for use on up to 6-digit processor cores
30!
[1403]31! 1402 2014-05-09 14:25:13Z raasch
32! location messages added
33!
[1375]34! 1374 2014-04-25 12:55:07Z raasch
35! bugfix: various modules added
36!
[1321]37! 1320 2014-03-20 08:40:49Z raasch
[1320]38! ONLY-attribute added to USE-statements,
39! kind-parameters added to all INTEGER and REAL declaration statements,
40! kinds are defined in new module kinds,
41! old module precision_kind is removed,
42! revision history before 2012 removed,
43! comment fields (!:) to be used for variable explanations added to
44! all variable declaration statements
[77]45!
[1319]46! 1318 2014-03-17 13:35:16Z raasch
47! module interfaces removed
48!
[1242]49! 1241 2013-10-30 11:36:58Z heinze
50! initialization of nuding and large scale forcing from external file
51!
[1222]52! 1221 2013-09-10 08:59:13Z raasch
53! +wall_flags_00, rflags_invers, rflags_s_inner in copyin statement
54!
[1213]55! 1212 2013-08-15 08:46:27Z raasch
56! +tri in copyin statement
57!
[1182]58! 1179 2013-06-14 05:57:58Z raasch
59! ref_state added to copyin-list
60!
[1114]61! 1113 2013-03-10 02:48:14Z raasch
62! openACC statements modified
63!
[1112]64! 1111 2013-03-08 23:54:10Z raasch
65! openACC statements updated
66!
[1093]67! 1092 2013-02-02 11:24:22Z raasch
68! unused variables removed
69!
[1037]70! 1036 2012-10-22 13:43:42Z raasch
71! code put under GPL (PALM 3.9)
72!
[1017]73! 1015 2012-09-27 09:23:24Z raasch
[863]74! Version number changed from 3.8 to 3.8a.
[1017]75! OpenACC statements added + code changes required for GPU optimization
[863]76!
[850]77! 849 2012-03-15 10:35:09Z raasch
78! write_particles renamed lpm_write_restart_file
79!
[1]80! Revision 1.1  1997/07/24 11:23:35  raasch
81! Initial revision
82!
83!
84! Description:
85! ------------
86! Large-Eddy Simulation (LES) model for the convective boundary layer,
87! optimized for use on parallel machines (implementation realized using the
88! Message Passing Interface (MPI)). The model can also be run on vector machines
89! (less well optimized) and workstations. Versions for the different types of
90! machines are controlled via cpp-directives.
91! Model runs are only feasible using the ksh-script mrun.
92!------------------------------------------------------------------------------!
93
[1374]94    USE arrays_3d
[1]95
[1320]96    USE control_parameters,                                                    &
97        ONLY:  coupling_char, coupling_mode, do2d_at_begin, do3d_at_begin,     &
98               io_blocks, io_group, large_scale_forcing, nudging,              &
[1374]99               simulated_time, simulated_time_chr, version, wall_heatflux, write_binary
[1320]100
101    USE cpulog,                                                                &
102        ONLY:  cpu_log, log_point, cpu_statistics
103
[1374]104    USE grid_variables,                                                        &
105        ONLY:  fxm, fxp, fym, fyp, fwxm, fwxp, fwym, fwyp, wall_e_x, wall_e_y, &
106               wall_u, wall_v, wall_w_x, wall_w_y
107
108    USE indices,                                                               &
109        ONLY:  ngp_2dh, ngp_2dh_s_inner, nzb_diff_s_inner, nzb_diff_s_outer, nzb_diff_u, nzb_diff_v,     &
110               nzb_s_inner, nzb_s_outer, nzb_u_inner, nzb_u_outer, nzb_v_inner,&
111               nzb_v_outer, nzb_w_inner, nzb_w_outer, rflags_invers,           &
112               rflags_s_inner, wall_flags_0, wall_flags_00
113
[1320]114    USE kinds
115
116    USE ls_forcing_mod,                                                        &
117        ONLY:  init_ls_forcing
118
119    USE nudge_mod,                                                             &
120        ONLY:  init_nudge
121
122    USE particle_attributes,                                                   &
123        ONLY:  particle_advection
124
[1]125    USE pegrid
126
[1374]127    USE statistics,                                                            &
128        ONLY:  hom, rmask, weight_pres, weight_substep
129
[1015]130#if defined( __openacc )
131    USE OPENACC
132#endif
133
[1]134    IMPLICIT NONE
135
136!
137!-- Local variables
[1482]138    CHARACTER(LEN=9)  ::  time_to_string  !:
139    CHARACTER(LEN=10) ::  env_string      !: to store string of environment var
140    INTEGER(iwp)      ::  env_stat        !: to hold status of GET_ENV
141    INTEGER(iwp)      ::  i               !:
142    INTEGER(iwp)      ::  myid_openmpi    !: OpenMPI local rank for CUDA aware MPI
[1015]143#if defined( __openacc )
[1320]144    REAL(wp), DIMENSION(100) ::  acc_dum     !:
[1015]145#endif
[1]146
[1289]147    version = 'PALM 3.10'
[75]148
[1]149#if defined( __parallel )
150!
151!-- MPI initialisation. comm2d is preliminary set, because
152!-- it will be defined in init_pegrid but is used before in cpu_log.
153    CALL MPI_INIT( ierr )
154    CALL MPI_COMM_SIZE( MPI_COMM_WORLD, numprocs, ierr )
[206]155    CALL MPI_COMM_RANK( MPI_COMM_WORLD, myid, ierr )
[1]156    comm_palm = MPI_COMM_WORLD
157    comm2d    = MPI_COMM_WORLD
158
159!
[206]160!-- Initialize PE topology in case of coupled runs
161    CALL init_coupling
[102]162#endif
163
[1015]164#if defined( __openacc )
[102]165!
[1482]166!-- Get the local MPI rank in case of CUDA aware OpenMPI. Important, if there
167!-- is more than one accelerator board on the node
168    CALL GET_ENVIRONMENT_VARIABLE('OMPI_COMM_WORLD_LOCAL_RANK',                &
169         VALUE=env_string, STATUS=env_stat )
170    READ( env_string, '(I1)' )  myid_openmpi
171    PRINT*, '### local_rank = ', myid_openmpi, '  status=',env_stat
172!
[1015]173!-- Get the number of accelerator boards per node and assign the MPI processes
174!-- to these boards
[1092]175    PRINT*, '*** ACC_DEVICE_NVIDIA = ', ACC_DEVICE_NVIDIA
[1015]176    num_acc_per_node  = ACC_GET_NUM_DEVICES( ACC_DEVICE_NVIDIA )
[1092]177    IF ( numprocs == 1  .AND.  num_acc_per_node > 0 )  num_acc_per_node = 1
[1482]178    PRINT*, '*** myid = ', myid_openmpi, ' num_acc_per_node = ', num_acc_per_node
179    acc_rank = MOD( myid_openmpi, num_acc_per_node )
[1015]180    CALL ACC_SET_DEVICE_NUM ( acc_rank, ACC_DEVICE_NVIDIA )
181!
182!-- Test output (to be removed later)
[1482]183    WRITE (*,'(A,I6,A,I3,A,I3,A,I3)') '*** Connect MPI-Task ', myid_openmpi,   &
184                                      ' to CPU ', acc_rank, ' Devices: ',      &
185                                      num_acc_per_node, ' connected to:',      &
[1015]186                                      ACC_GET_DEVICE_NUM( ACC_DEVICE_NVIDIA )
187#endif
[1092]188
[1015]189!
190!-- Ensure that OpenACC first attaches the GPU devices by copying a dummy data
191!-- region
192    !$acc data copyin( acc_dum )
193
194!
[1]195!-- Initialize measuring of the CPU-time remaining to the run
196    CALL local_tremain_ini
197
198!
199!-- Start of total CPU time measuring.
200    CALL cpu_log( log_point(1), 'total', 'start' )
201    CALL cpu_log( log_point(2), 'initialisation', 'start' )
202
203!
[206]204!-- Open a file for debug output
[1468]205    WRITE (myid_char,'(''_'',I6.6)')  myid
[206]206    OPEN( 9, FILE='DEBUG'//TRIM( coupling_char )//myid_char, FORM='FORMATTED' )
207
208!
[1]209!-- Initialize dvrp logging. Also, one PE maybe split from the global
210!-- communicator for doing the dvrp output. In that case, the number of
211!-- PEs available for PALM is reduced by one and communicator comm_palm
212!-- is changed respectively.
213#if defined( __parallel )
214    CALL MPI_COMM_RANK( comm_palm, myid, ierr )
215!
[102]216!-- TEST OUTPUT (TO BE REMOVED)
217    WRITE(9,*) '*** coupling_mode = "', TRIM( coupling_mode ), '"'
218    CALL LOCAL_FLUSH( 9 )
[215]219    IF ( TRIM( coupling_mode ) /= 'uncoupled' )  THEN
220       PRINT*, '*** PE', myid, ' Global target PE:', target_id, &
221               TRIM( coupling_mode )
222    ENDIF
[102]223#endif
224
[108]225    CALL init_dvrp_logging
226
[102]227!
[108]228!-- Read control parameters from NAMELIST files and read environment-variables
229    CALL parin
230
231!
232!-- Determine processor topology and local array indices
233    CALL init_pegrid
234
235!
[1]236!-- Generate grid parameters
237    CALL init_grid
238
239!
[1241]240!-- Initialize nudging if required
241    IF ( nudging )  THEN
242       CALL init_nudge
243    ENDIF
244
245!
246!-- Initialize reading of large scale forcing from external file - if required
247    IF ( large_scale_forcing )  THEN
248       CALL init_ls_forcing
249    ENDIF
250
251!
[1]252!-- Check control parameters and deduce further quantities
253    CALL check_parameters
254
255!
256!-- Initialize all necessary variables
257    CALL init_3d_model
258
259!
260!-- Output of program header
261    IF ( myid == 0 )  CALL header
262
263    CALL cpu_log( log_point(2), 'initialisation', 'stop' )
264
265!
266!-- Set start time in format hh:mm:ss
267    simulated_time_chr = time_to_string( simulated_time )
268
269!
270!-- If required, output of initial arrays
271    IF ( do2d_at_begin )  THEN
272       CALL data_output_2d( 'xy', 0 )
273       CALL data_output_2d( 'xz', 0 )
274       CALL data_output_2d( 'yz', 0 )
275    ENDIF
276    IF ( do3d_at_begin )  THEN
277       CALL data_output_3d( 0 )
278    ENDIF
279
280!
[1015]281!-- Declare and initialize variables in the accelerator memory with their
282!-- host values
[1113]283    !$acc  data copyin( d, diss, e, e_p, kh, km, p, pt, pt_p, q, ql, tend, te_m, tpt_m, tu_m, tv_m, tw_m, u, u_p, v, vpt, v_p, w, w_p )          &
[1212]284    !$acc       copyin( tri, tric, dzu, ddzu, ddzw, dd2zu, l_grid, l_wall, ptdf_x, ptdf_y, pt_init, rdf, rdf_sc, ref_state, ug, u_init, vg, v_init, zu, zw )   &
[1015]285    !$acc       copyin( hom, qs, qsws, qswst, rif, rif_wall, shf, ts, tswst, us, usws, uswst, vsws, vswst, z0, z0h )      &
286    !$acc       copyin( fxm, fxp, fym, fyp, fwxm, fwxp, fwym, fwyp, nzb_diff_s_inner, nzb_diff_s_outer, nzb_diff_u )       &
287    !$acc       copyin( nzb_diff_v, nzb_s_inner, nzb_s_outer, nzb_u_inner )    &
288    !$acc       copyin( nzb_u_outer, nzb_v_inner, nzb_v_outer, nzb_w_inner )   &
[1221]289    !$acc       copyin( nzb_w_outer, rflags_invers, rflags_s_inner, rmask, wall_heatflux, wall_e_x, wall_e_y, wall_u, wall_v, wall_w_x, wall_w_y, wall_flags_0, wall_flags_00 )  &
290    !$acc       copyin( ngp_2dh, ngp_2dh_s_inner )  &
[1113]291    !$acc       copyin( weight_pres, weight_substep )
[1015]292!
[495]293!-- Integration of the model equations using timestep-scheme
[1]294    CALL time_integration
295
296!
[495]297!-- If required, write binary data for restart runs
298    IF ( write_binary(1:4) == 'true' )  THEN
[759]299
300       CALL cpu_log( log_point(22), 'write_3d_binary', 'start' )
301
[1402]302       CALL location_message( 'writing restart data', .FALSE. )
303
[759]304       CALL check_open( 14 )
305
306       DO  i = 0, io_blocks-1
307          IF ( i == io_group )  THEN
[1]308!
[759]309!--          Write flow field data
310             CALL write_3d_binary
311          ENDIF
312#if defined( __parallel )
313          CALL MPI_BARRIER( comm2d, ierr )
314#endif
315       ENDDO
316
[1402]317       CALL location_message( 'finished', .TRUE. )
318
[759]319       CALL cpu_log( log_point(22), 'write_3d_binary', 'stop' )
320
[495]321!
322!--    If required, write particle data
[849]323       IF ( particle_advection )  CALL lpm_write_restart_file
[495]324    ENDIF
[1]325
326!
327!-- If required, repeat output of header including the required CPU-time
328    IF ( myid == 0 )  CALL header
329!
330!-- If required, final user-defined actions, and
331!-- last actions on the open files and close files. Unit 14 was opened
332!-- in write_3d_binary but it is closed here, to allow writing on this
333!-- unit in routine user_last_actions.
334    CALL cpu_log( log_point(4), 'last actions', 'start' )
[759]335    DO  i = 0, io_blocks-1
336       IF ( i == io_group )  THEN
337          CALL user_last_actions
338          IF ( write_binary(1:4) == 'true' )  CALL close_file( 14 )
339       ENDIF
340#if defined( __parallel )
341       CALL MPI_BARRIER( comm2d, ierr )
342#endif
343    ENDDO
[1]344    CALL close_file( 0 )
345    CALL close_dvrp
346    CALL cpu_log( log_point(4), 'last actions', 'stop' )
347
[102]348#if defined( __mpi2 )
[1]349!
[206]350!-- Test exchange via intercommunicator in case of a MPI-2 coupling
[102]351    IF ( coupling_mode == 'atmosphere_to_ocean' )  THEN
352       i = 12345 + myid
353       CALL MPI_SEND( i, 1, MPI_INTEGER, myid, 11, comm_inter, ierr )
354    ELSEIF ( coupling_mode == 'ocean_to_atmosphere' )  THEN
355       CALL MPI_RECV( i, 1, MPI_INTEGER, myid, 11, comm_inter, status, ierr )
356       PRINT*, '### myid: ', myid, '   received from atmosphere:  i = ', i
357    ENDIF
358#endif
359
360!
[1015]361!-- Close the OpenACC dummy data region
362    !$acc end data
363    !$acc end data
364
365!
[1]366!-- Take final CPU-time for CPU-time analysis
367    CALL cpu_log( log_point(1), 'total', 'stop' )
368    CALL cpu_statistics
369
370#if defined( __parallel )
371    CALL MPI_FINALIZE( ierr )
372#endif
373
374 END PROGRAM palm
Note: See TracBrowser for help on using the repository browser.