source: palm/trunk/SOURCE/transpose.f90 @ 1257

Last change on this file since 1257 was 1257, checked in by raasch, 10 years ago

New:
---

openACC porting of timestep calculation
(modules, timestep, time_integration)

Changed:


openACC loop directives and vector clauses removed (because they do not give any performance improvement with PGI
compiler versions > 13.6)
(advec_ws, buoyancy, coriolis, diffusion_e, diffusion_s, diffusion_u, diffusion_v, diffusion_w, diffusivities, exchange_horiz, fft_xy, pres, production_e, transpose, tridia_solver, wall_fluxes)

openACC loop independent clauses added
(boundary_conds, prandtl_fluxes, pres)

openACC declare create statements moved after FORTRAN declaration statement
(diffusion_u, diffusion_v, diffusion_w, fft_xy, poisfft, production_e, tridia_solver)

openACC end parallel replaced by end parallel loop
(flow_statistics, pres)

openACC "kernels do" replaced by "kernels loop"
(prandtl_fluxes)

output format for theta* changed to avoid output of *
(run_control)

Errors:


bugfix for calculation of advective timestep (old version may cause wrong timesteps in case of
vertixcally stretched grids)
Attention: standard run-control output has changed!
(timestep)

  • Property svn:keywords set to Id
File size: 25.9 KB
Line 
1 SUBROUTINE resort_for_xy( f_in, f_inv )
2
3!--------------------------------------------------------------------------------!
4! This file is part of PALM.
5!
6! PALM is free software: you can redistribute it and/or modify it under the terms
7! of the GNU General Public License as published by the Free Software Foundation,
8! either version 3 of the License, or (at your option) any later version.
9!
10! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12! A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13!
14! You should have received a copy of the GNU General Public License along with
15! PALM. If not, see <http://www.gnu.org/licenses/>.
16!
17! Copyright 1997-2012  Leibniz University Hannover
18!--------------------------------------------------------------------------------!
19!
20! Current revisions:
21! -----------------
22! openacc loop and loop vector clauses removed
23!
24! Former revisions:
25! -----------------
26! $Id: transpose.f90 1257 2013-11-08 15:18:40Z raasch $
27!
28! 1216 2013-08-26 09:31:42Z raasch
29! re-sorting of the transposed / to be transposed arrays moved to separate
30! routines resort_for_...
31!
32! 1111 2013-03-08 23:54:10Z raasch
33! openACC directives added,
34! resorting data from/to work changed, work got 4 dimensions instead of 1
35!
36! 1106 2013-03-04 05:31:38Z raasch
37! preprocessor lines rearranged so that routines can also be used in serial
38! (non-parallel) mode
39!
40! 1092 2013-02-02 11:24:22Z raasch
41! unused variables removed
42!
43! 1036 2012-10-22 13:43:42Z raasch
44! code put under GPL (PALM 3.9)
45!
46! 1003 2012-09-14 14:35:53Z raasch
47! indices nxa, nya, etc. replaced by nx, ny, etc.
48!
49! 683 2011-02-09 14:25:15Z raasch
50! openMP parallelization of transpositions for 2d-domain-decomposition
51!
52! 622 2010-12-10 08:08:13Z raasch
53! optional barriers included in order to speed up collective operations
54!
55! 164 2008-05-15 08:46:15Z raasch
56! f_inv changed from subroutine argument to automatic array in order to do
57! re-ordering from f_in to f_inv in one step, one array work is needed instead
58! of work1 and work2
59!
60! February 2007
61! RCS Log replace by Id keyword, revision history cleaned up
62!
63! Revision 1.2  2004/04/30 13:12:17  raasch
64! Switched from mpi_alltoallv to the simpler mpi_alltoall,
65! all former transpose-routine files collected in this file, enlarged
66! transposition arrays introduced
67!
68! Revision 1.1  2004/04/30 13:08:16  raasch
69! Initial revision (collection of former routines transpose_xy, transpose_xz,
70!                   transpose_yx, transpose_yz, transpose_zx, transpose_zy)
71!
72! Revision 1.1  1997/07/24 11:25:18  raasch
73! Initial revision
74!
75!------------------------------------------------------------------------------!
76! Description:
77! ------------
78! Resorting data for the transposition from x to y. The transposition itself
79! is carried out in transpose_xy
80!------------------------------------------------------------------------------!
81
82     USE indices
83     USE transpose_indices
84
85     IMPLICIT NONE
86
87     REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
88     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
89
90
91     INTEGER ::  i, j, k
92
93!
94!-- Rearrange indices of input array in order to make data to be send
95!-- by MPI contiguous
96    !$OMP  PARALLEL PRIVATE ( i, j, k )
97    !$OMP  DO
98    !$acc kernels present( f_in, f_inv )
99     DO  i = 0, nx
100         DO  k = nzb_x, nzt_x
101             DO  j = nys_x, nyn_x
102                 f_inv(j,k,i) = f_in(i,j,k)
103             ENDDO
104         ENDDO
105     ENDDO
106     !$acc end kernels
107     !$OMP  END PARALLEL
108
109 END SUBROUTINE resort_for_xy
110
111
112 SUBROUTINE transpose_xy( f_inv, f_out )
113
114!------------------------------------------------------------------------------!
115! Description:
116! ------------
117! Transposition of input array (f_in) from x to y. For the input array, all
118! elements along x reside on the same PE, while after transposition, all
119! elements along y reside on the same PE.
120!------------------------------------------------------------------------------!
121
122    USE cpulog
123    USE indices
124    USE interfaces
125    USE pegrid
126    USE transpose_indices
127
128    IMPLICIT NONE
129
130    INTEGER ::  i, j, k, l, ys
131   
132    REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
133
134    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
135
136
137    IF ( numprocs /= 1 )  THEN
138
139#if defined( __parallel )
140!
141!--    Transpose array
142       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
143       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
144       !$acc update host( f_inv )
145       CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
146                          work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
147                          comm1dy, ierr )
148       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
149
150!
151!--    Reorder transposed array
152!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
153!$OMP  DO
154       !$acc data copyin( work )
155       DO  l = 0, pdims(2) - 1
156          ys = 0 + l * ( nyn_x - nys_x + 1 )
157          !$acc kernels present( f_out, work )
158          DO  i = nxl_y, nxr_y
159             DO  k = nzb_y, nzt_y
160                DO  j = ys, ys + nyn_x - nys_x
161                   f_out(j,i,k) = work(j-ys+1,k,i,l)
162                ENDDO
163             ENDDO
164          ENDDO
165          !$acc end kernels
166       ENDDO
167       !$acc end data
168!$OMP  END PARALLEL
169#endif
170
171    ELSE
172
173!
174!--    Reorder transposed array
175!$OMP  PARALLEL PRIVATE ( i, j, k )
176!$OMP  DO
177       !$acc kernels present( f_inv, f_out )
178       DO  k = nzb_y, nzt_y
179          DO  i = nxl_y, nxr_y
180             DO  j = 0, ny
181                f_out(j,i,k) = f_inv(j,k,i)
182             ENDDO
183          ENDDO
184       ENDDO
185       !$acc end kernels
186!$OMP  END PARALLEL
187
188    ENDIF
189
190 END SUBROUTINE transpose_xy
191
192
193 SUBROUTINE resort_for_xz( f_inv, f_out )
194
195!------------------------------------------------------------------------------!
196! Description:
197! ------------
198! Resorting data after the transposition from x to z. The transposition itself
199! is carried out in transpose_xz
200!------------------------------------------------------------------------------!
201
202     USE indices
203     USE transpose_indices
204
205     IMPLICIT NONE
206
207     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
208     REAL ::  f_out(1:nz,nys:nyn,nxl:nxr)
209
210
211     INTEGER ::  i, j, k
212
213!
214!-- Rearrange indices of input array in order to make data to be send
215!-- by MPI contiguous.
216!-- In case of parallel fft/transposition, scattered store is faster in
217!-- backward direction!!!
218    !$OMP  PARALLEL PRIVATE ( i, j, k )
219    !$OMP  DO
220    !$acc kernels present( f_inv, f_out )
221     DO  k = 1, nz
222         DO  i = nxl, nxr
223             DO  j = nys, nyn
224                 f_out(k,j,i) = f_inv(j,i,k)
225             ENDDO
226         ENDDO
227     ENDDO
228     !$acc end kernels
229     !$OMP  END PARALLEL
230
231 END SUBROUTINE resort_for_xz
232
233
234 SUBROUTINE transpose_xz( f_in, f_inv )
235
236!------------------------------------------------------------------------------!
237! Description:
238! ------------
239! Transposition of input array (f_in) from x to z. For the input array, all
240! elements along x reside on the same PE, while after transposition, all
241! elements along z reside on the same PE.
242!------------------------------------------------------------------------------!
243
244    USE cpulog
245    USE indices
246    USE interfaces
247    USE pegrid
248    USE transpose_indices
249
250    IMPLICIT NONE
251
252    INTEGER ::  i, j, k, l, xs
253   
254    REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_inv(nys:nyn,nxl:nxr,1:nz)
255
256    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
257
258
259!
260!-- If the PE grid is one-dimensional along y, the array has only to be
261!-- reordered locally and therefore no transposition has to be done.
262    IF ( pdims(1) /= 1 )  THEN
263
264#if defined( __parallel )
265!
266!--    Reorder input array for transposition
267!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
268!$OMP  DO
269       !$acc data copyout( work )
270       DO  l = 0, pdims(1) - 1
271          xs = 0 + l * nnx
272          !$acc kernels present( f_in, work )
273          DO  k = nzb_x, nzt_x
274             DO  i = xs, xs + nnx - 1
275                DO  j = nys_x, nyn_x
276                   work(j,i-xs+1,k,l) = f_in(i,j,k)
277                ENDDO
278             ENDDO
279          ENDDO
280          !$acc end kernels
281       ENDDO
282       !$acc end data
283!$OMP  END PARALLEL
284
285!
286!--    Transpose array
287       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
288       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
289       CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
290                          f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
291                          comm1dx, ierr )
292       !$acc update device( f_inv )
293       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
294#endif
295
296    ELSE
297
298!
299!--    Reorder the array in a way that the z index is in first position
300!$OMP  PARALLEL PRIVATE ( i, j, k )
301!$OMP  DO
302       !$acc kernels present( f_in, f_inv )
303       DO  i = nxl, nxr
304          DO  j = nys, nyn
305             DO  k = 1, nz
306                f_inv(j,i,k) = f_in(i,j,k)
307             ENDDO
308          ENDDO
309       ENDDO
310       !$acc end kernels
311!$OMP  END PARALLEL
312
313    ENDIF
314
315 END SUBROUTINE transpose_xz
316
317
318 SUBROUTINE resort_for_yx( f_inv, f_out )
319
320!------------------------------------------------------------------------------!
321! Description:
322! ------------
323! Resorting data after the transposition from y to x. The transposition itself
324! is carried out in transpose_yx
325!------------------------------------------------------------------------------!
326
327     USE indices
328     USE transpose_indices
329
330     IMPLICIT NONE
331
332     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
333     REAL ::  f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
334
335
336     INTEGER ::  i, j, k
337
338!
339!-- Rearrange indices of input array in order to make data to be send
340!-- by MPI contiguous
341    !$OMP  PARALLEL PRIVATE ( i, j, k )
342    !$OMP  DO
343    !$acc kernels present( f_inv, f_out )
344     DO  i = 0, nx
345         DO  k = nzb_x, nzt_x
346             DO  j = nys_x, nyn_x
347                 f_out(i,j,k) = f_inv(j,k,i)
348             ENDDO
349         ENDDO
350     ENDDO
351     !$acc end kernels
352     !$OMP  END PARALLEL
353
354 END SUBROUTINE resort_for_yx
355
356
357 SUBROUTINE transpose_yx( f_in, f_inv )
358
359!------------------------------------------------------------------------------!
360! Description:
361! ------------
362! Transposition of input array (f_in) from y to x. For the input array, all
363! elements along y reside on the same PE, while after transposition, all
364! elements along x reside on the same PE.
365!------------------------------------------------------------------------------!
366
367    USE cpulog
368    USE indices
369    USE interfaces
370    USE pegrid
371    USE transpose_indices
372
373    IMPLICIT NONE
374
375    INTEGER ::  i, j, k, l, ys
376   
377    REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
378
379    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
380
381
382    IF ( numprocs /= 1 )  THEN
383
384#if defined( __parallel )
385!
386!--    Reorder input array for transposition
387!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
388!$OMP  DO
389       !$acc data copyout( work )
390       DO  l = 0, pdims(2) - 1
391          ys = 0 + l * ( nyn_x - nys_x + 1 )
392          !$acc kernels present( f_in, work )
393          DO  i = nxl_y, nxr_y
394             DO  k = nzb_y, nzt_y
395                DO  j = ys, ys + nyn_x - nys_x
396                   work(j-ys+1,k,i,l) = f_in(j,i,k)
397                ENDDO
398             ENDDO
399          ENDDO
400          !$acc end kernels
401       ENDDO
402       !$acc end data
403!$OMP  END PARALLEL
404
405!
406!--    Transpose array
407       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
408       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
409       CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
410                          f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
411                          comm1dy, ierr )
412       !$acc update device( f_inv )
413       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
414#endif
415
416    ELSE
417
418!
419!--    Reorder array f_in the same way as ALLTOALL did it
420!$OMP  PARALLEL PRIVATE ( i, j, k )
421!$OMP  DO
422       !$acc kernels present( f_in, f_inv )
423       DO  i = nxl_y, nxr_y
424          DO  k = nzb_y, nzt_y
425             DO  j = 0, ny
426                f_inv(j,k,i) = f_in(j,i,k)
427             ENDDO
428          ENDDO
429       ENDDO
430       !$acc end kernels
431!$OMP  END PARALLEL
432
433    ENDIF
434
435 END SUBROUTINE transpose_yx
436
437
438 SUBROUTINE transpose_yxd( f_in, f_out )
439
440!------------------------------------------------------------------------------!
441! Description:
442! ------------
443! Transposition of input array (f_in) from y to x. For the input array, all
444! elements along y reside on the same PE, while after transposition, all
445! elements along x reside on the same PE.
446! This is a direct transposition for arrays with indices in regular order
447! (k,j,i) (cf. transpose_yx).
448!------------------------------------------------------------------------------!
449
450    USE cpulog
451    USE indices
452    USE interfaces
453    USE pegrid
454    USE transpose_indices
455
456    IMPLICIT NONE
457
458    INTEGER ::  i, j, k, l, m, xs
459
460    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
461             f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x),                     &
462             work(nnx*nny*nnz)
463
464#if defined( __parallel )
465
466!
467!-- Rearrange indices of input array in order to make data to be send
468!-- by MPI contiguous
469    DO  k = 1, nz
470       DO  j = nys, nyn
471          DO  i = nxl, nxr
472             f_inv(i,k,j) = f_in(k,j,i)
473          ENDDO
474       ENDDO
475    ENDDO
476
477!
478!-- Transpose array
479    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
480    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
481    CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
482                       work(1),          sendrecvcount_xy, MPI_REAL, &
483                       comm1dx, ierr )
484    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
485
486!
487!-- Reorder transposed array
488    m = 0
489    DO  l = 0, pdims(1) - 1
490       xs = 0 + l * nnx
491       DO  j = nys_x, nyn_x
492          DO  k = 1, nz
493             DO  i = xs, xs + nnx - 1
494                m = m + 1
495                f_out(i,j,k) = work(m)
496             ENDDO
497          ENDDO
498       ENDDO
499    ENDDO
500
501#endif
502
503 END SUBROUTINE transpose_yxd
504
505
506 SUBROUTINE resort_for_yz( f_in, f_inv )
507
508!------------------------------------------------------------------------------!
509! Description:
510! ------------
511! Resorting data for the transposition from y to z. The transposition itself
512! is carried out in transpose_yz
513!------------------------------------------------------------------------------!
514
515     USE indices
516     USE transpose_indices
517
518     IMPLICIT NONE
519
520     REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
521     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
522
523
524     INTEGER ::  i, j, k
525
526!
527!-- Rearrange indices of input array in order to make data to be send
528!-- by MPI contiguous
529    !$OMP  PARALLEL PRIVATE ( i, j, k )
530    !$OMP  DO
531    !$acc kernels present( f_in, f_inv )
532     DO  j = 0, ny
533         DO  k = nzb_y, nzt_y
534             DO  i = nxl_y, nxr_y
535                 f_inv(i,k,j) = f_in(j,i,k)
536             ENDDO
537         ENDDO
538     ENDDO
539     !$acc end kernels
540     !$OMP  END PARALLEL
541
542 END SUBROUTINE resort_for_yz
543
544
545 SUBROUTINE transpose_yz( f_inv, f_out )
546
547!------------------------------------------------------------------------------!
548! Description:
549! ------------
550! Transposition of input array (f_in) from y to z. For the input array, all
551! elements along y reside on the same PE, while after transposition, all
552! elements along z reside on the same PE.
553!------------------------------------------------------------------------------!
554
555    USE cpulog
556    USE indices
557    USE interfaces
558    USE pegrid
559    USE transpose_indices
560
561    IMPLICIT NONE
562
563    INTEGER ::  i, j, k, l, zs
564   
565    REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
566
567    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
568
569
570!
571!-- If the PE grid is one-dimensional along y, only local reordering
572!-- of the data is necessary and no transposition has to be done.
573    IF ( pdims(1) == 1 )  THEN
574
575!$OMP  PARALLEL PRIVATE ( i, j, k )
576!$OMP  DO
577       !$acc kernels present( f_inv, f_out )
578       DO  j = 0, ny
579          DO  k = nzb_y, nzt_y
580             DO  i = nxl_y, nxr_y
581                f_out(i,j,k) = f_inv(i,k,j)
582             ENDDO
583          ENDDO
584       ENDDO
585       !$acc end kernels
586!$OMP  END PARALLEL
587
588    ELSE
589
590#if defined( __parallel )
591!
592!--    Transpose array
593       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
594       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
595       !$acc update host( f_inv )
596       CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
597                          work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
598                          comm1dx, ierr )
599       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
600
601!
602!--    Reorder transposed array
603!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
604!$OMP  DO
605       !$acc data copyin( work )
606       DO  l = 0, pdims(1) - 1
607          zs = 1 + l * ( nzt_y - nzb_y + 1 )
608          !$acc kernels present( f_out )
609          DO  j = nys_z, nyn_z
610             DO  k = zs, zs + nzt_y - nzb_y
611                DO  i = nxl_z, nxr_z
612                   f_out(i,j,k) = work(i,k-zs+1,j,l)
613                ENDDO
614             ENDDO
615          ENDDO
616          !$acc end kernels
617       ENDDO
618       !$acc end data
619!$OMP  END PARALLEL
620#endif
621
622   ENDIF
623
624 END SUBROUTINE transpose_yz
625
626
627 SUBROUTINE resort_for_zx( f_in, f_inv )
628
629!------------------------------------------------------------------------------!
630! Description:
631! ------------
632! Resorting data for the transposition from z to x. The transposition itself
633! is carried out in transpose_zx
634!------------------------------------------------------------------------------!
635
636     USE indices
637     USE transpose_indices
638
639     IMPLICIT NONE
640
641     REAL ::  f_in(1:nz,nys:nyn,nxl:nxr)
642     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
643
644
645     INTEGER ::  i, j, k
646
647!
648!-- Rearrange indices of input array in order to make data to be send
649!-- by MPI contiguous
650    !$OMP  PARALLEL PRIVATE ( i, j, k )
651    !$OMP  DO
652    !$acc kernels present( f_in, f_inv )
653     DO  k = 1,nz
654         DO  i = nxl, nxr
655             DO  j = nys, nyn
656                 f_inv(j,i,k) = f_in(k,j,i)
657             ENDDO
658         ENDDO
659     ENDDO
660     !$acc end kernels
661     !$OMP  END PARALLEL
662
663 END SUBROUTINE resort_for_zx
664
665
666 SUBROUTINE transpose_zx( f_inv, f_out )
667
668!------------------------------------------------------------------------------!
669! Description:
670! ------------
671! Transposition of input array (f_in) from z to x. For the input array, all
672! elements along z reside on the same PE, while after transposition, all
673! elements along x reside on the same PE.
674!------------------------------------------------------------------------------!
675
676    USE cpulog
677    USE indices
678    USE interfaces
679    USE pegrid
680    USE transpose_indices
681
682    IMPLICIT NONE
683
684    INTEGER ::  i, j, k, l, xs
685   
686    REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
687
688    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
689
690
691!
692!-- If the PE grid is one-dimensional along y, only local reordering
693!-- of the data is necessary and no transposition has to be done.
694    IF ( pdims(1) == 1 )  THEN
695
696!$OMP  PARALLEL PRIVATE ( i, j, k )
697!$OMP  DO
698       !$acc kernels present( f_inv, f_out )
699       DO  k = 1, nz
700          DO  i = nxl, nxr
701             DO  j = nys, nyn
702                f_out(i,j,k) = f_inv(j,i,k)
703             ENDDO
704          ENDDO
705       ENDDO
706       !$acc end kernels
707!$OMP  END PARALLEL
708
709    ELSE
710
711#if defined( __parallel )
712!
713!--    Transpose array
714       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
715       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
716       !$acc update host( f_inv )
717       CALL MPI_ALLTOALL( f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
718                          work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
719                          comm1dx, ierr )
720       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
721
722!
723!--    Reorder transposed array
724!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
725!$OMP  DO
726       !$acc data copyin( work )
727       DO  l = 0, pdims(1) - 1
728          xs = 0 + l * nnx
729          !$acc kernels present( f_out )
730          DO  k = nzb_x, nzt_x
731             DO  i = xs, xs + nnx - 1
732                DO  j = nys_x, nyn_x
733                   f_out(i,j,k) = work(j,i-xs+1,k,l)
734                ENDDO
735             ENDDO
736          ENDDO
737          !$acc end kernels
738       ENDDO
739       !$acc end data
740!$OMP  END PARALLEL
741#endif
742
743    ENDIF
744
745 END SUBROUTINE transpose_zx
746
747
748 SUBROUTINE resort_for_zy( f_inv, f_out )
749
750!------------------------------------------------------------------------------!
751! Description:
752! ------------
753! Resorting data after the transposition from z to y. The transposition itself
754! is carried out in transpose_zy
755!------------------------------------------------------------------------------!
756
757     USE indices
758     USE transpose_indices
759
760     IMPLICIT NONE
761
762     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
763     REAL ::  f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
764
765
766     INTEGER ::  i, j, k
767
768!
769!-- Rearrange indices of input array in order to make data to be send
770!-- by MPI contiguous
771    !$OMP  PARALLEL PRIVATE ( i, j, k )
772    !$OMP  DO
773    !$acc kernels present( f_inv, f_out )
774     DO  k = nzb_y, nzt_y
775         DO  j = 0, ny
776             DO  i = nxl_y, nxr_y
777                 f_out(j,i,k) = f_inv(i,k,j)
778             ENDDO
779         ENDDO
780     ENDDO
781     !$acc end kernels
782     !$OMP  END PARALLEL
783
784 END SUBROUTINE resort_for_zy
785
786
787 SUBROUTINE transpose_zy( f_in, f_inv )
788
789!------------------------------------------------------------------------------!
790! Description:
791! ------------
792! Transposition of input array (f_in) from z to y. For the input array, all
793! elements along z reside on the same PE, while after transposition, all
794! elements along y reside on the same PE.
795!------------------------------------------------------------------------------!
796
797    USE cpulog
798    USE indices
799    USE interfaces
800    USE pegrid
801    USE transpose_indices
802
803    IMPLICIT NONE
804
805    INTEGER ::  i, j, k, l, zs
806   
807    REAL ::  f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
808
809    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
810
811
812!
813!-- If the PE grid is one-dimensional along y, the array has only to be
814!-- reordered locally and therefore no transposition has to be done.
815    IF ( pdims(1) /= 1 )  THEN
816
817#if defined( __parallel )
818!
819!--    Reorder input array for transposition
820!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
821!$OMP  DO
822       !$acc data copyout( work )
823       DO  l = 0, pdims(1) - 1
824          zs = 1 + l * ( nzt_y - nzb_y + 1 )
825          !$acc kernels present( f_in, work )
826          DO  j = nys_z, nyn_z
827             DO  k = zs, zs + nzt_y - nzb_y
828                DO  i = nxl_z, nxr_z
829                   work(i,k-zs+1,j,l) = f_in(i,j,k)
830                ENDDO
831             ENDDO
832          ENDDO
833          !$acc end kernels
834       ENDDO
835       !$acc end data
836!$OMP  END PARALLEL
837
838!
839!--    Transpose array
840       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
841       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
842       CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
843                          f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
844                          comm1dx, ierr )
845       !$acc update device( f_inv )
846       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
847#endif
848
849    ELSE
850!
851!--    Reorder the array in the same way like ALLTOALL did it
852!$OMP  PARALLEL PRIVATE ( i, j, k )
853!$OMP  DO
854       !$acc kernels present( f_in, f_inv )
855       DO  k = nzb_y, nzt_y
856          DO  j = 0, ny
857             DO  i = nxl_y, nxr_y
858                f_inv(i,k,j) = f_in(i,j,k)
859             ENDDO
860          ENDDO
861       ENDDO
862       !$acc end kernels
863!$OMP  END PARALLEL
864
865    ENDIF
866
867 END SUBROUTINE transpose_zy
868
869
870 SUBROUTINE transpose_zyd( f_in, f_out )
871
872!------------------------------------------------------------------------------!
873! Description:
874! ------------
875! Transposition of input array (f_in) from z to y. For the input array, all
876! elements along z reside on the same PE, while after transposition, all
877! elements along y reside on the same PE.
878! This is a direct transposition for arrays with indices in regular order
879! (k,j,i) (cf. transpose_zy).
880!------------------------------------------------------------------------------!
881
882    USE cpulog
883    USE indices
884    USE interfaces
885    USE pegrid
886    USE transpose_indices
887
888    IMPLICIT NONE
889
890    INTEGER ::  i, j, k, l, m, ys
891   
892    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
893             f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd),                 &
894             work(nnx*nny*nnz)
895
896#if defined( __parallel )
897
898!
899!-- Rearrange indices of input array in order to make data to be send
900!-- by MPI contiguous
901    DO  i = nxl, nxr
902       DO  j = nys, nyn
903          DO  k = 1, nz
904             f_inv(j,i,k) = f_in(k,j,i)
905          ENDDO
906       ENDDO
907    ENDDO
908
909!
910!-- Move data to different array, because memory location of work1 is
911!-- needed further below (work1 = work2).
912!-- If the PE grid is one-dimensional along x, only local reordering
913!-- of the data is necessary and no transposition has to be done.
914    IF ( pdims(2) == 1 )  THEN
915       DO  k = 1, nz
916          DO  i = nxl, nxr
917             DO  j = nys, nyn
918                f_out(j,i,k) = f_inv(j,i,k)
919             ENDDO
920          ENDDO
921       ENDDO
922       RETURN
923    ENDIF
924
925!
926!-- Transpose array
927    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
928    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
929    CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
930                       work(1),          sendrecvcount_zyd, MPI_REAL, &
931                       comm1dy, ierr )
932    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
933
934!
935!-- Reorder transposed array
936    m = 0
937    DO  l = 0, pdims(2) - 1
938       ys = 0 + l * nny
939       DO  k = nzb_yd, nzt_yd
940          DO  i = nxl_yd, nxr_yd
941             DO  j = ys, ys + nny - 1
942                m = m + 1
943                f_out(j,i,k) = work(m)
944             ENDDO
945          ENDDO
946       ENDDO
947    ENDDO
948
949#endif
950
951 END SUBROUTINE transpose_zyd
Note: See TracBrowser for help on using the repository browser.