source: palm/trunk/SOURCE/palm.f90 @ 1221

Last change on this file since 1221 was 1221, checked in by raasch, 8 years ago

New:


openACC porting of reduction operations
additional 3D-flag arrays for replacing the 2D-index arrays nzb_s_inner and nzb_diff_s_inner
(flow_statistics, init_grid, init_3d_model, modules, palm, pres, time_integration)

Changed:


for PGI/openACC performance reasons set default compile options for openACC to "-ta=nocache",
and set environment variable PGI_ACC_SYNCHRONOUS=1
(MAKE.inc.pgi.openacc, palm_simple_run)

wall_flags_0 changed to 32bit INTEGER, additional array wall_flags_00 introduced to hold
bits 32-63
(advec_ws, init_grid, modules, palm)

Errors:


dummy argument tri in 1d-routines replaced by tri_for_1d because of name
conflict with array tri in module arrays_3d
(tridia_solver)

  • Property svn:keywords set to Id
File size: 10.5 KB
Line 
PROGRAM palm

!--------------------------------------------------------------------------------!
! This file is part of PALM.
!
! PALM is free software: you can redistribute it and/or modify it under the terms
! of the GNU General Public License as published by the Free Software Foundation,
! either version 3 of the License, or (at your option) any later version.
!
! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
! A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
!
! You should have received a copy of the GNU General Public License along with
! PALM. If not, see <http://www.gnu.org/licenses/>.
!
! Copyright 1997-2012  Leibniz University Hannover
!--------------------------------------------------------------------------------!
!
! Current revisions:
! -----------------
! +wall_flags_00, rflags_invers, rflags_s_inner in copyin statement
!
! Former revisions:
! -----------------
! $Id: palm.f90 1221 2013-09-10 08:59:13Z raasch $
!
! 1212 2013-08-15 08:46:27Z raasch
! +tri in copyin statement
!
! 1179 2013-06-14 05:57:58Z raasch
! ref_state added to copyin-list
!
! 1113 2013-03-10 02:48:14Z raasch
! openACC statements modified
!
! 1111 2013-03-08 23:54:10Z raasch
! openACC statements updated
!
! 1092 2013-02-02 11:24:22Z raasch
! unused variables removed
!
! 1036 2012-10-22 13:43:42Z raasch
! code put under GPL (PALM 3.9)
!
! 1015 2012-09-27 09:23:24Z raasch
! Version number changed from 3.8 to 3.8a.
! OpenACC statements added + code changes required for GPU optimization
!
! 849 2012-03-15 10:35:09Z raasch
! write_particles renamed lpm_write_restart_file
!
! 759 2011-09-15 13:58:31Z raasch
! Splitting of parallel I/O, cpu measurement for write_3d_binary and opening
! of unit 14 moved to here
!
! 495 2010-03-02 00:40:15Z raasch
! Particle data for restart runs are only written if write_binary=.T..
!
! 215 2008-11-18 09:54:31Z raasch
! Initialization of coupled runs modified for MPI-1 and moved to external
! subroutine init_coupling
!
! 197 2008-09-16 15:29:03Z raasch
! Workaround for getting information about the coupling mode
!
! 108 2007-08-24 15:10:38Z letzel
! Get coupling mode from environment variable, change location of debug output
!
! 75 2007-03-22 09:54:05Z raasch
! __vtk directives removed, write_particles is called only in case of particle
! advection switched on, open unit 9 for debug output,
! setting of palm version moved from modules to here
!
! RCS Log replaced by Id keyword, revision history cleaned up
!
! Revision 1.10  2006/08/04 14:53:12  raasch
! Distribution of run description header removed, call of header moved behind
! init_3d_model
!
! Revision 1.2  2001/01/25 07:15:06  raasch
! Program name changed to PALM, module test_variables removed.
! Initialization of dvrp logging as well as exit of dvrp moved to new
! subroutines init_dvrp_logging and close_dvrp (file init_dvrp.f90)
!
! Revision 1.1  1997/07/24 11:23:35  raasch
! Initial revision
!
!
! Description:
! ------------
! Large-Eddy Simulation (LES) model for the convective boundary layer,
! optimized for use on parallel machines (implementation realized using the
! Message Passing Interface (MPI)). The model can also be run on vector machines
! (less well optimized) and workstations. Versions for the different types of
! machines are controlled via cpp-directives.
! Model runs are only feasible using the ksh-script mrun.
!
! Program flow of this main program:
!   1. MPI / coupling / accelerator setup (depending on cpp-directives)
!   2. Initialisation (parin, init_pegrid, init_grid, check_parameters,
!      init_3d_model) and output of the header
!   3. Optional output of the initial arrays
!   4. Time integration inside an OpenACC data region
!   5. Optional output of restart data, last actions, CPU-time statistics
!------------------------------------------------------------------------------!


    USE arrays_3d
    USE constants
    USE control_parameters
    USE cpulog
    USE dvrp_variables
    USE grid_variables
    USE indices
    USE interfaces
    USE model_1d
    USE particle_attributes
    USE pegrid
    USE spectrum
    USE statistics

#if defined( __openacc )
    USE OPENACC
#endif

    IMPLICIT NONE

!
!-- Local variables
!-- time_to_string is an external function, i is a loop index for the I/O
!-- blocks, acc_dum is only used to open a dummy OpenACC data region
    CHARACTER (LEN=9) ::  time_to_string
    INTEGER           ::  i
#if defined( __openacc )
    REAL, DIMENSION(100) ::  acc_dum
#endif

    version = 'PALM 3.9'

#if defined( __parallel )
!
!-- MPI initialisation. comm2d is preliminarily set, because
!-- it will be defined in init_pegrid but is used before in cpu_log.
    CALL MPI_INIT( ierr )
    CALL MPI_COMM_SIZE( MPI_COMM_WORLD, numprocs, ierr )
    CALL MPI_COMM_RANK( MPI_COMM_WORLD, myid, ierr )
    comm_palm = MPI_COMM_WORLD
    comm2d    = MPI_COMM_WORLD

!
!-- Initialize PE topology in case of coupled runs
    CALL init_coupling
#endif

#if defined( __openacc )
!
!-- Define the dummy array, so that no undefined values are transferred by the
!-- dummy data region which is opened below
    acc_dum = 0.0

!
!-- Get the number of accelerator boards per node and assign the MPI processes
!-- to these boards
    PRINT*, '*** ACC_DEVICE_NVIDIA = ', ACC_DEVICE_NVIDIA
    num_acc_per_node  = ACC_GET_NUM_DEVICES( ACC_DEVICE_NVIDIA )
    IF ( numprocs == 1  .AND.  num_acc_per_node > 0 )  num_acc_per_node = 1
    PRINT*, '*** myid = ', myid, ' num_acc_per_node = ', num_acc_per_node

!
!-- Without any available device the MOD operation below would be a division
!-- by zero. Terminate the run with a clear message instead.
    IF ( num_acc_per_node < 1 )  THEN
       PRINT*, '+++ palm: no NVIDIA accelerator device available, myid = ', myid
#if defined( __parallel )
       CALL MPI_ABORT( MPI_COMM_WORLD, 9999, ierr )
#endif
       STOP 1
    ENDIF

    acc_rank = MOD( myid, num_acc_per_node )
    CALL ACC_SET_DEVICE_NUM ( acc_rank, ACC_DEVICE_NVIDIA )
!
!-- Test output (to be removed later)
    WRITE (*,'(A,I4,A,I3,A,I3,A,I3)') '*** Connect MPI-Task ', myid,' to CPU ',&
                                      acc_rank, ' Devices: ', num_acc_per_node,&
                                      ' connected to:',                        &
                                      ACC_GET_DEVICE_NUM( ACC_DEVICE_NVIDIA )
#endif

!
!-- Ensure that OpenACC first attaches the GPU devices by copying a dummy data
!-- region. This region stays open until the end of the program (see the
!-- second "end data" directive below).
    !$acc data copyin( acc_dum )

!
!-- Initialize measuring of the CPU-time remaining to the run
    CALL local_tremain_ini

!
!-- Start of total CPU time measuring.
    CALL cpu_log( log_point(1), 'total', 'start' )
    CALL cpu_log( log_point(2), 'initialisation', 'start' )

!
!-- Open a file for debug output
    WRITE (myid_char,'(''_'',I4.4)')  myid
    OPEN( 9, FILE='DEBUG'//TRIM( coupling_char )//myid_char, FORM='FORMATTED' )

!
!-- Initialize dvrp logging. Also, one PE maybe split from the global
!-- communicator for doing the dvrp output. In that case, the number of
!-- PEs available for PALM is reduced by one and communicator comm_palm
!-- is changed respectively.
#if defined( __parallel )
    CALL MPI_COMM_RANK( comm_palm, myid, ierr )
!
!-- TEST OUTPUT (TO BE REMOVED)
    WRITE(9,*) '*** coupling_mode = "', TRIM( coupling_mode ), '"'
    CALL LOCAL_FLUSH( 9 )
    IF ( TRIM( coupling_mode ) /= 'uncoupled' )  THEN
       PRINT*, '*** PE', myid, ' Global target PE:', target_id, &
               TRIM( coupling_mode )
    ENDIF
#endif

    CALL init_dvrp_logging

!
!-- Read control parameters from NAMELIST files and read environment-variables
    CALL parin

!
!-- Determine processor topology and local array indices
    CALL init_pegrid

!
!-- Generate grid parameters
    CALL init_grid

!
!-- Check control parameters and deduce further quantities
    CALL check_parameters


!
!-- Initialize all necessary variables
    CALL init_3d_model

!
!-- Output of program header
    IF ( myid == 0 )  CALL header

    CALL cpu_log( log_point(2), 'initialisation', 'stop' )

!
!-- Set start time in format hh:mm:ss
    simulated_time_chr = time_to_string( simulated_time )

!
!-- If required, output of initial arrays
    IF ( do2d_at_begin )  THEN
       CALL data_output_2d( 'xy', 0 )
       CALL data_output_2d( 'xz', 0 )
       CALL data_output_2d( 'yz', 0 )
    ENDIF
    IF ( do3d_at_begin )  THEN
       CALL data_output_3d( 0 )
    ENDIF

!
!-- Declare and initialize variables in the accelerator memory with their
!-- host values. The clause list is split into short continuation lines in
!-- order to stay within the line length limit of free-form source code.
    !$acc  data copyin( d, diss, e, e_p, kh, km, p, pt, pt_p, q, ql, tend )     &
    !$acc       copyin( te_m, tpt_m, tu_m, tv_m, tw_m, u, u_p, v, vpt, v_p )    &
    !$acc       copyin( w, w_p )                                                &
    !$acc       copyin( tri, tric, dzu, ddzu, ddzw, dd2zu, l_grid, l_wall )     &
    !$acc       copyin( ptdf_x, ptdf_y, pt_init, rdf, rdf_sc, ref_state )       &
    !$acc       copyin( ug, u_init, vg, v_init, zu, zw )                        &
    !$acc       copyin( hom, qs, qsws, qswst, rif, rif_wall, shf, ts, tswst )   &
    !$acc       copyin( us, usws, uswst, vsws, vswst, z0, z0h )                 &
    !$acc       copyin( fxm, fxp, fym, fyp, fwxm, fwxp, fwym, fwyp )            &
    !$acc       copyin( nzb_diff_s_inner, nzb_diff_s_outer, nzb_diff_u )        &
    !$acc       copyin( nzb_diff_v, nzb_s_inner, nzb_s_outer, nzb_u_inner )     &
    !$acc       copyin( nzb_u_outer, nzb_v_inner, nzb_v_outer, nzb_w_inner )    &
    !$acc       copyin( nzb_w_outer, rflags_invers, rflags_s_inner, rmask )     &
    !$acc       copyin( wall_heatflux, wall_e_x, wall_e_y, wall_u, wall_v )     &
    !$acc       copyin( wall_w_x, wall_w_y, wall_flags_0, wall_flags_00 )       &
    !$acc       copyin( ngp_2dh, ngp_2dh_s_inner )                              &
    !$acc       copyin( weight_pres, weight_substep )
!
!-- Integration of the model equations using timestep-scheme
    CALL time_integration

!
!-- If required, write binary data for restart runs
    IF ( write_binary(1:4) == 'true' )  THEN

       CALL cpu_log( log_point(22), 'write_3d_binary', 'start' )

       CALL check_open( 14 )

!
!--    The PEs write in groups, one I/O block after the other
       DO  i = 0, io_blocks-1
          IF ( i == io_group )  THEN
!
!--          Write flow field data
             CALL write_3d_binary
          ENDIF
#if defined( __parallel )
          CALL MPI_BARRIER( comm2d, ierr )
#endif
       ENDDO

       CALL cpu_log( log_point(22), 'write_3d_binary', 'stop' )

!
!--    If required, write particle data
       IF ( particle_advection )  CALL lpm_write_restart_file
    ENDIF

!
!-- If required, repeat output of header including the required CPU-time
    IF ( myid == 0 )  CALL header
!
!-- If required, final user-defined actions, and
!-- last actions on the open files and close files. Unit 14 was opened
!-- above (check_open) before writing the restart data, but it is closed
!-- here, to allow writing on this unit in routine user_last_actions.
    CALL cpu_log( log_point(4), 'last actions', 'start' )
    DO  i = 0, io_blocks-1
       IF ( i == io_group )  THEN
          CALL user_last_actions
          IF ( write_binary(1:4) == 'true' )  CALL close_file( 14 )
       ENDIF
#if defined( __parallel )
       CALL MPI_BARRIER( comm2d, ierr )
#endif
    ENDDO
    CALL close_file( 0 )
    CALL close_dvrp
    CALL cpu_log( log_point(4), 'last actions', 'stop' )

#if defined( __mpi2 )
!
!-- Test exchange via intercommunicator in case of a MPI-2 coupling
    IF ( coupling_mode == 'atmosphere_to_ocean' )  THEN
       i = 12345 + myid
       CALL MPI_SEND( i, 1, MPI_INTEGER, myid, 11, comm_inter, ierr )
    ELSEIF ( coupling_mode == 'ocean_to_atmosphere' )  THEN
       CALL MPI_RECV( i, 1, MPI_INTEGER, myid, 11, comm_inter, status, ierr )
       PRINT*, '### myid: ', myid, '   received from atmosphere:  i = ', i
    ENDIF
#endif

!
!-- Close the OpenACC data regions: first the region holding the model arrays
!-- (opened before time_integration), then the dummy data region which was
!-- opened at the beginning of the program
    !$acc end data
    !$acc end data

!
!-- Take final CPU-time for CPU-time analysis
    CALL cpu_log( log_point(1), 'total', 'stop' )
    CALL cpu_statistics

#if defined( __parallel )
    CALL MPI_FINALIZE( ierr )
#endif

END PROGRAM palm
Note: See TracBrowser for help on using the repository browser.