source: palm/trunk/SOURCE/transpose.f90 @ 1256

Last change on this file since 1256 was 1217, checked in by raasch, 11 years ago

last commit documented

  • Property svn:keywords set to Id
File size: 26.8 KB
RevLine 
[1216]1 SUBROUTINE resort_for_xy( f_in, f_inv )
[1]2
[1036]3!--------------------------------------------------------------------------------!
4! This file is part of PALM.
5!
6! PALM is free software: you can redistribute it and/or modify it under the terms
7! of the GNU General Public License as published by the Free Software Foundation,
8! either version 3 of the License, or (at your option) any later version.
9!
10! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12! A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13!
14! You should have received a copy of the GNU General Public License along with
15! PALM. If not, see <http://www.gnu.org/licenses/>.
16!
17! Copyright 1997-2012  Leibniz University Hannover
18!--------------------------------------------------------------------------------!
19!
[484]20! Current revisions:
[1]21! -----------------
[198]22!
[1217]23!
[198]24! Former revisions:
25! -----------------
26! $Id: transpose.f90 1217 2013-08-26 11:30:05Z raasch $
27!
[1217]28! 1216 2013-08-26 09:31:42Z raasch
29! re-sorting of the transposed / to be transposed arrays moved to separate
30! routines resort_for_...
31!
[1112]32! 1111 2013-03-08 23:54:10Z raasch
33! openACC directives added,
34! resorting data from/to work changed, work got 4 dimensions instead of 1
35!
[1107]36! 1106 2013-03-04 05:31:38Z raasch
37! preprocessor lines rearranged so that routines can also be used in serial
38! (non-parallel) mode
39!
[1093]40! 1092 2013-02-02 11:24:22Z raasch
41! unused variables removed
42!
[1037]43! 1036 2012-10-22 13:43:42Z raasch
44! code put under GPL (PALM 3.9)
45!
[1004]46! 1003 2012-09-14 14:35:53Z raasch
47! indices nxa, nya, etc. replaced by nx, ny, etc.
48!
[684]49! 683 2011-02-09 14:25:15Z raasch
50! openMP parallelization of transpositions for 2d-domain-decomposition
51!
[623]52! 622 2010-12-10 08:08:13Z raasch
53! optional barriers included in order to speed up collective operations
54!
[198]55! 164 2008-05-15 08:46:15Z raasch
[164]56! f_inv changed from subroutine argument to automatic array in order to do
57! re-ordering from f_in to f_inv in one step, one array work is needed instead
58! of work1 and work2
[1]59!
[198]60! February 2007
[3]61! RCS Log replace by Id keyword, revision history cleaned up
62!
[1]63! Revision 1.2  2004/04/30 13:12:17  raasch
64! Switched from mpi_alltoallv to the simpler mpi_alltoall,
65! all former transpose-routine files collected in this file, enlarged
66! transposition arrays introduced
67!
68! Revision 1.1  2004/04/30 13:08:16  raasch
69! Initial revision (collection of former routines transpose_xy, transpose_xz,
70!                   transpose_yx, transpose_yz, transpose_zx, transpose_zy)
71!
72! Revision 1.1  1997/07/24 11:25:18  raasch
73! Initial revision
74!
[1216]75!------------------------------------------------------------------------------!
76! Description:
77! ------------
78! Resorting data for the transposition from x to y. The transposition itself
79! is carried out in transpose_xy
80!------------------------------------------------------------------------------!
81
82     USE indices
83     USE transpose_indices
84
85     IMPLICIT NONE
86
87     REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
88     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
89
90
91     INTEGER ::  i, j, k
92
[1]93!
[1216]94!-- Rearrange indices of input array in order to make data to be send
95!-- by MPI contiguous
96    !$OMP  PARALLEL PRIVATE ( i, j, k )
97    !$OMP  DO
98    !$acc kernels present( f_in, f_inv )
99    !$acc loop
100     DO  i = 0, nx
101         DO  k = nzb_x, nzt_x
102             !$acc loop vector( 32 )
103             DO  j = nys_x, nyn_x
104                 f_inv(j,k,i) = f_in(i,j,k)
105             ENDDO
106         ENDDO
107     ENDDO
108     !$acc end kernels
109     !$OMP  END PARALLEL
110
111 END SUBROUTINE resort_for_xy
112
113
114 SUBROUTINE transpose_xy( f_inv, f_out )
115
116!------------------------------------------------------------------------------!
[1]117! Description:
118! ------------
119! Transposition of input array (f_in) from x to y. For the input array, all
120! elements along x reside on the same PE, while after transposition, all
121! elements along y reside on the same PE.
122!------------------------------------------------------------------------------!
123
124    USE cpulog
125    USE indices
126    USE interfaces
127    USE pegrid
128    USE transpose_indices
129
130    IMPLICIT NONE
131
[1111]132    INTEGER ::  i, j, k, l, ys
[1]133   
[1216]134    REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
[1]135
[1111]136    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
137
138
[1106]139    IF ( numprocs /= 1 )  THEN
140
141#if defined( __parallel )
[1]142!
[1106]143!--    Transpose array
144       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
145       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]146       !$acc update host( f_inv )
147       CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
148                          work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]149                          comm1dy, ierr )
150       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]151
152!
[1106]153!--    Reorder transposed array
[1111]154!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
[683]155!$OMP  DO
[1216]156       !$acc data copyin( work )
[1106]157       DO  l = 0, pdims(2) - 1
158          ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]159          !$acc kernels present( f_out, work )
160          !$acc loop
[1106]161          DO  i = nxl_y, nxr_y
162             DO  k = nzb_y, nzt_y
[1111]163                !$acc loop vector( 32 )
[1106]164                DO  j = ys, ys + nyn_x - nys_x
[1111]165                   f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]166                ENDDO
[1]167             ENDDO
168          ENDDO
[1111]169          !$acc end kernels
[1]170       ENDDO
[1216]171       !$acc end data
[683]172!$OMP  END PARALLEL
[1]173#endif
174
[1106]175    ELSE
176
177!
178!--    Reorder transposed array
179!$OMP  PARALLEL PRIVATE ( i, j, k )
180!$OMP  DO
[1216]181       !$acc kernels present( f_inv, f_out )
[1111]182       !$acc loop
[1106]183       DO  k = nzb_y, nzt_y
184          DO  i = nxl_y, nxr_y
[1111]185             !$acc loop vector( 32 )
[1106]186             DO  j = 0, ny
187                f_out(j,i,k) = f_inv(j,k,i)
188             ENDDO
189          ENDDO
190       ENDDO
[1111]191       !$acc end kernels
[1106]192!$OMP  END PARALLEL
193
194    ENDIF
195
[1]196 END SUBROUTINE transpose_xy
197
198
[1216]199 SUBROUTINE resort_for_xz( f_inv, f_out )
[1]200
201!------------------------------------------------------------------------------!
202! Description:
203! ------------
[1216]204! Resorting data after the transposition from x to z. The transposition itself
205! is carried out in transpose_xz
206!------------------------------------------------------------------------------!
207
208     USE indices
209     USE transpose_indices
210
211     IMPLICIT NONE
212
213     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
214     REAL ::  f_out(1:nz,nys:nyn,nxl:nxr)
215
216
217     INTEGER ::  i, j, k
218
219!
220!-- Rearrange indices of input array in order to make data to be send
221!-- by MPI contiguous.
222!-- In case of parallel fft/transposition, scattered store is faster in
223!-- backward direction!!!
224    !$OMP  PARALLEL PRIVATE ( i, j, k )
225    !$OMP  DO
226    !$acc kernels present( f_inv, f_out )
227    !$acc loop
228     DO  k = 1, nz
229         DO  i = nxl, nxr
230             !$acc loop vector( 32 )
231             DO  j = nys, nyn
232                 f_out(k,j,i) = f_inv(j,i,k)
233             ENDDO
234         ENDDO
235     ENDDO
236     !$acc end kernels
237     !$OMP  END PARALLEL
238
239 END SUBROUTINE resort_for_xz
240
241
242 SUBROUTINE transpose_xz( f_in, f_inv )
243
244!------------------------------------------------------------------------------!
245! Description:
246! ------------
[1]247! Transposition of input array (f_in) from x to z. For the input array, all
248! elements along x reside on the same PE, while after transposition, all
249! elements along z reside on the same PE.
250!------------------------------------------------------------------------------!
251
252    USE cpulog
253    USE indices
254    USE interfaces
255    USE pegrid
256    USE transpose_indices
257
258    IMPLICIT NONE
259
[1111]260    INTEGER ::  i, j, k, l, xs
[1]261   
[1216]262    REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_inv(nys:nyn,nxl:nxr,1:nz)
[1]263
[1111]264    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
[1]265
[1111]266
[1]267!
268!-- If the PE grid is one-dimensional along y, the array has only to be
269!-- reordered locally and therefore no transposition has to be done.
270    IF ( pdims(1) /= 1 )  THEN
[1106]271
272#if defined( __parallel )
[1]273!
274!--    Reorder input array for transposition
[1111]275!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
[683]276!$OMP  DO
[1216]277       !$acc data copyout( work )
[1]278       DO  l = 0, pdims(1) - 1
279          xs = 0 + l * nnx
[1111]280          !$acc kernels present( f_in, work )
281          !$acc loop
[1003]282          DO  k = nzb_x, nzt_x
[164]283             DO  i = xs, xs + nnx - 1
[1111]284                !$acc loop vector( 32 )
[1003]285                DO  j = nys_x, nyn_x
[1111]286                   work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]287                ENDDO
288             ENDDO
289          ENDDO
[1111]290          !$acc end kernels
[1]291       ENDDO
[1216]292       !$acc end data
[683]293!$OMP  END PARALLEL
[1]294
295!
296!--    Transpose array
297       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]298       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]299       CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
300                          f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
[1]301                          comm1dx, ierr )
[1111]302       !$acc update device( f_inv )
[1]303       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]304#endif
305
[1]306    ELSE
[1106]307
[1]308!
309!--    Reorder the array in a way that the z index is in first position
[683]310!$OMP  PARALLEL PRIVATE ( i, j, k )
311!$OMP  DO
[1216]312       !$acc kernels present( f_in, f_inv )
[1111]313       !$acc loop
[1003]314       DO  i = nxl, nxr
315          DO  j = nys, nyn
[1111]316             !$acc loop vector( 32 )
[1003]317             DO  k = 1, nz
[164]318                f_inv(j,i,k) = f_in(i,j,k)
[1]319             ENDDO
320          ENDDO
321       ENDDO
[1111]322       !$acc end kernels
[683]323!$OMP  END PARALLEL
[1]324
[164]325    ENDIF
326
[1]327 END SUBROUTINE transpose_xz
328
329
[1216]330 SUBROUTINE resort_for_yx( f_inv, f_out )
[1]331
332!------------------------------------------------------------------------------!
333! Description:
334! ------------
[1216]335! Resorting data after the transposition from y to x. The transposition itself
336! is carried out in transpose_yx
337!------------------------------------------------------------------------------!
338
339     USE indices
340     USE transpose_indices
341
342     IMPLICIT NONE
343
344     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
345     REAL ::  f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
346
347
348     INTEGER ::  i, j, k
349
350!
351!-- Rearrange indices of input array in order to make data to be send
352!-- by MPI contiguous
353    !$OMP  PARALLEL PRIVATE ( i, j, k )
354    !$OMP  DO
355    !$acc kernels present( f_inv, f_out )
356    !$acc loop
357     DO  i = 0, nx
358         DO  k = nzb_x, nzt_x
359             !$acc loop vector( 32 )
360             DO  j = nys_x, nyn_x
361                 f_out(i,j,k) = f_inv(j,k,i)
362             ENDDO
363         ENDDO
364     ENDDO
365     !$acc end kernels
366     !$OMP  END PARALLEL
367
368 END SUBROUTINE resort_for_yx
369
370
371 SUBROUTINE transpose_yx( f_in, f_inv )
372
373!------------------------------------------------------------------------------!
374! Description:
375! ------------
[1]376! Transposition of input array (f_in) from y to x. For the input array, all
377! elements along y reside on the same PE, while after transposition, all
378! elements along x reside on the same PE.
379!------------------------------------------------------------------------------!
380
381    USE cpulog
382    USE indices
383    USE interfaces
384    USE pegrid
385    USE transpose_indices
386
387    IMPLICIT NONE
388
[1111]389    INTEGER ::  i, j, k, l, ys
[1]390   
[1216]391    REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
[1]392
[1111]393    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
394
395
[1106]396    IF ( numprocs /= 1 )  THEN
397
[1]398#if defined( __parallel )
399!
[1106]400!--    Reorder input array for transposition
[1111]401!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
[683]402!$OMP  DO
[1216]403       !$acc data copyout( work )
[1106]404       DO  l = 0, pdims(2) - 1
405          ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]406          !$acc kernels present( f_in, work )
407          !$acc loop
[1106]408          DO  i = nxl_y, nxr_y
409             DO  k = nzb_y, nzt_y
[1111]410                !$acc loop vector( 32 )
[1106]411                DO  j = ys, ys + nyn_x - nys_x
[1111]412                   work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]413                ENDDO
414             ENDDO
415          ENDDO
[1111]416          !$acc end kernels
[1106]417       ENDDO
[1216]418       !$acc end data
[1106]419!$OMP  END PARALLEL
420
421!
422!--    Transpose array
423       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
424       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]425       CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
426                          f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
[1106]427                          comm1dy, ierr )
[1111]428       !$acc update device( f_inv )
[1106]429       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
430#endif
431
432    ELSE
433
434!
435!--    Reorder array f_in the same way as ALLTOALL did it
436!$OMP  PARALLEL PRIVATE ( i, j, k )
437!$OMP  DO
[1216]438       !$acc kernels present( f_in, f_inv )
[1111]439       !$acc loop
[1003]440       DO  i = nxl_y, nxr_y
441          DO  k = nzb_y, nzt_y
[1111]442             !$acc loop vector( 32 )
[1106]443             DO  j = 0, ny
444                f_inv(j,k,i) = f_in(j,i,k)
[1]445             ENDDO
446          ENDDO
447       ENDDO
[1111]448       !$acc end kernels
[683]449!$OMP  END PARALLEL
[1]450
[1106]451    ENDIF
[1]452
453 END SUBROUTINE transpose_yx
454
455
[1216]456 SUBROUTINE transpose_yxd( f_in, f_out )
[1]457
458!------------------------------------------------------------------------------!
459! Description:
460! ------------
461! Transposition of input array (f_in) from y to x. For the input array, all
462! elements along y reside on the same PE, while after transposition, all
463! elements along x reside on the same PE.
464! This is a direct transposition for arrays with indices in regular order
465! (k,j,i) (cf. transpose_yx).
466!------------------------------------------------------------------------------!
467
468    USE cpulog
469    USE indices
470    USE interfaces
471    USE pegrid
472    USE transpose_indices
473
474    IMPLICIT NONE
475
[1092]476    INTEGER ::  i, j, k, l, m, xs
[1]477
[1003]478    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
479             f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x),                     &
[164]480             work(nnx*nny*nnz)
[1]481
482#if defined( __parallel )
483
484!
485!-- Rearrange indices of input array in order to make data to be send
486!-- by MPI contiguous
[1003]487    DO  k = 1, nz
488       DO  j = nys, nyn
489          DO  i = nxl, nxr
[164]490             f_inv(i,k,j) = f_in(k,j,i)
[1]491          ENDDO
492       ENDDO
493    ENDDO
494
495!
496!-- Transpose array
497    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]498    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1]499    CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]500                       work(1),          sendrecvcount_xy, MPI_REAL, &
[1]501                       comm1dx, ierr )
502    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
503
504!
505!-- Reorder transposed array
506    m = 0
507    DO  l = 0, pdims(1) - 1
508       xs = 0 + l * nnx
[1003]509       DO  j = nys_x, nyn_x
510          DO  k = 1, nz
[1]511             DO  i = xs, xs + nnx - 1
512                m = m + 1
[164]513                f_out(i,j,k) = work(m)
[1]514             ENDDO
515          ENDDO
516       ENDDO
517    ENDDO
518
519#endif
520
521 END SUBROUTINE transpose_yxd
522
523
[1216]524 SUBROUTINE resort_for_yz( f_in, f_inv )
[1]525
526!------------------------------------------------------------------------------!
527! Description:
528! ------------
[1216]529! Resorting data for the transposition from y to z. The transposition itself
530! is carried out in transpose_yz
531!------------------------------------------------------------------------------!
532
533     USE indices
534     USE transpose_indices
535
536     IMPLICIT NONE
537
538     REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
539     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
540
541
542     INTEGER ::  i, j, k
543
544!
545!-- Rearrange indices of input array in order to make data to be send
546!-- by MPI contiguous
547    !$OMP  PARALLEL PRIVATE ( i, j, k )
548    !$OMP  DO
549    !$acc kernels present( f_in, f_inv )
550    !$acc loop
551     DO  j = 0, ny
552         DO  k = nzb_y, nzt_y
553             !$acc loop vector( 32 )
554             DO  i = nxl_y, nxr_y
555                 f_inv(i,k,j) = f_in(j,i,k)
556             ENDDO
557         ENDDO
558     ENDDO
559     !$acc end kernels
560     !$OMP  END PARALLEL
561
562 END SUBROUTINE resort_for_yz
563
564
565 SUBROUTINE transpose_yz( f_inv, f_out )
566
567!------------------------------------------------------------------------------!
568! Description:
569! ------------
[1]570! Transposition of input array (f_in) from y to z. For the input array, all
571! elements along y reside on the same PE, while after transposition, all
572! elements along z reside on the same PE.
573!------------------------------------------------------------------------------!
574
575    USE cpulog
576    USE indices
577    USE interfaces
578    USE pegrid
579    USE transpose_indices
580
581    IMPLICIT NONE
582
[1111]583    INTEGER ::  i, j, k, l, zs
[1]584   
[1216]585    REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
[1]586
[1111]587    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
588
589
[1]590!
591!-- If the PE grid is one-dimensional along y, only local reordering
592!-- of the data is necessary and no transposition has to be done.
593    IF ( pdims(1) == 1 )  THEN
[1106]594
[683]595!$OMP  PARALLEL PRIVATE ( i, j, k )
596!$OMP  DO
[1216]597       !$acc kernels present( f_inv, f_out )
[1111]598       !$acc loop
[1003]599       DO  j = 0, ny
600          DO  k = nzb_y, nzt_y
[1111]601             !$acc loop vector( 32 )
[1003]602             DO  i = nxl_y, nxr_y
[164]603                f_out(i,j,k) = f_inv(i,k,j)
[1]604             ENDDO
605          ENDDO
606       ENDDO
[1111]607       !$acc end kernels
[683]608!$OMP  END PARALLEL
[1]609
[1106]610    ELSE
611
612#if defined( __parallel )
[1]613!
[1106]614!--    Transpose array
615       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
616       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]617       !$acc update host( f_inv )
618       CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
619                          work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]620                          comm1dx, ierr )
621       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]622
623!
[1106]624!--    Reorder transposed array
[1111]625!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
[683]626!$OMP  DO
[1216]627       !$acc data copyin( work )
[1106]628       DO  l = 0, pdims(1) - 1
629          zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1216]630          !$acc kernels present( f_out )
[1111]631          !$acc loop
[1106]632          DO  j = nys_z, nyn_z
633             DO  k = zs, zs + nzt_y - nzb_y
[1111]634                !$acc loop vector( 32 )
[1106]635                DO  i = nxl_z, nxr_z
[1111]636                   f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]637                ENDDO
[1]638             ENDDO
639          ENDDO
[1111]640          !$acc end kernels
[1]641       ENDDO
[1216]642       !$acc end data
[683]643!$OMP  END PARALLEL
[1]644#endif
645
[1106]646   ENDIF
647
[1]648 END SUBROUTINE transpose_yz
649
650
[1216]651 SUBROUTINE resort_for_zx( f_in, f_inv )
[1]652
653!------------------------------------------------------------------------------!
654! Description:
655! ------------
[1216]656! Resorting data for the transposition from z to x. The transposition itself
657! is carried out in transpose_zx
658!------------------------------------------------------------------------------!
659
660     USE indices
661     USE transpose_indices
662
663     IMPLICIT NONE
664
665     REAL ::  f_in(1:nz,nys:nyn,nxl:nxr)
666     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
667
668
669     INTEGER ::  i, j, k
670
671!
672!-- Rearrange indices of input array in order to make data to be send
673!-- by MPI contiguous
674    !$OMP  PARALLEL PRIVATE ( i, j, k )
675    !$OMP  DO
676    !$acc kernels present( f_in, f_inv )
677    !$acc loop
678     DO  k = 1,nz
679         DO  i = nxl, nxr
680             !$acc loop vector( 32 )
681             DO  j = nys, nyn
682                 f_inv(j,i,k) = f_in(k,j,i)
683             ENDDO
684         ENDDO
685     ENDDO
686     !$acc end kernels
687     !$OMP  END PARALLEL
688
689 END SUBROUTINE resort_for_zx
690
691
692 SUBROUTINE transpose_zx( f_inv, f_out )
693
694!------------------------------------------------------------------------------!
695! Description:
696! ------------
[1]697! Transposition of input array (f_in) from z to x. For the input array, all
698! elements along z reside on the same PE, while after transposition, all
699! elements along x reside on the same PE.
700!------------------------------------------------------------------------------!
701
702    USE cpulog
703    USE indices
704    USE interfaces
705    USE pegrid
706    USE transpose_indices
707
708    IMPLICIT NONE
709
[1111]710    INTEGER ::  i, j, k, l, xs
[1]711   
[1216]712    REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
[1]713
[1111]714    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
715
[1]716
717!
718!-- If the PE grid is one-dimensional along y, only local reordering
719!-- of the data is necessary and no transposition has to be done.
720    IF ( pdims(1) == 1 )  THEN
[1106]721
[683]722!$OMP  PARALLEL PRIVATE ( i, j, k )
723!$OMP  DO
[1216]724       !$acc kernels present( f_inv, f_out )
[1106]725       !$acc loop
[1003]726       DO  k = 1, nz
727          DO  i = nxl, nxr
[1106]728             !$acc loop vector( 32 )
[1003]729             DO  j = nys, nyn
[164]730                f_out(i,j,k) = f_inv(j,i,k)
[1]731             ENDDO
732          ENDDO
733       ENDDO
[1111]734       !$acc end kernels
[683]735!$OMP  END PARALLEL
[1]736
[1106]737    ELSE
738
739#if defined( __parallel )
[1]740!
[1106]741!--    Transpose array
742       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
743       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]744       !$acc update host( f_inv )
745       CALL MPI_ALLTOALL( f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
746                          work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]747                          comm1dx, ierr )
748       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]749
750!
[1106]751!--    Reorder transposed array
[1111]752!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
[683]753!$OMP  DO
[1216]754       !$acc data copyin( work )
[1106]755       DO  l = 0, pdims(1) - 1
756          xs = 0 + l * nnx
[1216]757          !$acc kernels present( f_out )
[1111]758          !$acc loop
[1106]759          DO  k = nzb_x, nzt_x
760             DO  i = xs, xs + nnx - 1
[1111]761                !$acc loop vector( 32 )
[1106]762                DO  j = nys_x, nyn_x
[1111]763                   f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]764                ENDDO
[1]765             ENDDO
766          ENDDO
[1111]767          !$acc end kernels
[1]768       ENDDO
[1216]769       !$acc end data
[683]770!$OMP  END PARALLEL
[1]771#endif
772
[1106]773    ENDIF
774
[1]775 END SUBROUTINE transpose_zx
776
777
[1216]778 SUBROUTINE resort_for_zy( f_inv, f_out )
[1]779
780!------------------------------------------------------------------------------!
781! Description:
782! ------------
[1216]783! Resorting data after the transposition from z to y. The transposition itself
784! is carried out in transpose_zy
785!------------------------------------------------------------------------------!
786
787     USE indices
788     USE transpose_indices
789
790     IMPLICIT NONE
791
792     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
793     REAL ::  f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
794
795
796     INTEGER ::  i, j, k
797
798!
799!-- Rearrange indices of input array in order to make data to be send
800!-- by MPI contiguous
801    !$OMP  PARALLEL PRIVATE ( i, j, k )
802    !$OMP  DO
803    !$acc kernels present( f_inv, f_out )
804    !$acc loop
805     DO  k = nzb_y, nzt_y
806         DO  j = 0, ny
807             !$acc loop vector( 32 )
808             DO  i = nxl_y, nxr_y
809                 f_out(j,i,k) = f_inv(i,k,j)
810             ENDDO
811         ENDDO
812     ENDDO
813     !$acc end kernels
814     !$OMP  END PARALLEL
815
816 END SUBROUTINE resort_for_zy
817
818
819 SUBROUTINE transpose_zy( f_in, f_inv )
820
821!------------------------------------------------------------------------------!
822! Description:
823! ------------
[1]824! Transposition of input array (f_in) from z to y. For the input array, all
825! elements along z reside on the same PE, while after transposition, all
826! elements along y reside on the same PE.
827!------------------------------------------------------------------------------!
828
829    USE cpulog
830    USE indices
831    USE interfaces
832    USE pegrid
833    USE transpose_indices
834
835    IMPLICIT NONE
836
[1111]837    INTEGER ::  i, j, k, l, zs
[1]838   
[1216]839    REAL ::  f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
[1]840
[1111]841    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
842
843
[1]844!
845!-- If the PE grid is one-dimensional along y, the array has only to be
846!-- reordered locally and therefore no transposition has to be done.
847    IF ( pdims(1) /= 1 )  THEN
[1106]848
849#if defined( __parallel )
[1]850!
851!--    Reorder input array for transposition
[1111]852!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
[683]853!$OMP  DO
[1216]854       !$acc data copyout( work )
[1]855       DO  l = 0, pdims(1) - 1
[1003]856          zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1111]857          !$acc kernels present( f_in, work )
858          !$acc loop
[1003]859          DO  j = nys_z, nyn_z
860             DO  k = zs, zs + nzt_y - nzb_y
[1111]861                !$acc loop vector( 32 )
[1003]862                DO  i = nxl_z, nxr_z
[1111]863                   work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]864                ENDDO
865             ENDDO
866          ENDDO
[1111]867          !$acc end kernels
[1]868       ENDDO
[1216]869       !$acc end data
[683]870!$OMP  END PARALLEL
[1]871
872!
873!--    Transpose array
874       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]875       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]876       CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
877                          f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
[1]878                          comm1dx, ierr )
[1111]879       !$acc update device( f_inv )
[1]880       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]881#endif
[1]882
883    ELSE
884!
[1106]885!--    Reorder the array in the same way like ALLTOALL did it
[683]886!$OMP  PARALLEL PRIVATE ( i, j, k )
887!$OMP  DO
[1216]888       !$acc kernels present( f_in, f_inv )
[1111]889       !$acc loop
[1003]890       DO  k = nzb_y, nzt_y
891          DO  j = 0, ny
[1111]892             !$acc loop vector( 32 )
[1003]893             DO  i = nxl_y, nxr_y
[164]894                f_inv(i,k,j) = f_in(i,j,k)
895             ENDDO
896          ENDDO
897       ENDDO
[1111]898       !$acc end kernels
[683]899!$OMP  END PARALLEL
[1106]900
901    ENDIF
902
[1]903 END SUBROUTINE transpose_zy
904
905
[1216]906 SUBROUTINE transpose_zyd( f_in, f_out )
[1]907
908!------------------------------------------------------------------------------!
909! Description:
910! ------------
911! Transposition of input array (f_in) from z to y. For the input array, all
912! elements along z reside on the same PE, while after transposition, all
913! elements along y reside on the same PE.
914! This is a direct transposition for arrays with indices in regular order
915! (k,j,i) (cf. transpose_zy).
916!------------------------------------------------------------------------------!
917
918    USE cpulog
919    USE indices
920    USE interfaces
921    USE pegrid
922    USE transpose_indices
923
924    IMPLICIT NONE
925
926    INTEGER ::  i, j, k, l, m, ys
927   
[1003]928    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
929             f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd),                 &
[164]930             work(nnx*nny*nnz)
[1]931
932#if defined( __parallel )
933
934!
935!-- Rearrange indices of input array in order to make data to be send
936!-- by MPI contiguous
[1003]937    DO  i = nxl, nxr
938       DO  j = nys, nyn
939          DO  k = 1, nz
[164]940             f_inv(j,i,k) = f_in(k,j,i)
[1]941          ENDDO
942       ENDDO
943    ENDDO
944
945!
946!-- Move data to different array, because memory location of work1 is
947!-- needed further below (work1 = work2).
948!-- If the PE grid is one-dimensional along x, only local reordering
949!-- of the data is necessary and no transposition has to be done.
950    IF ( pdims(2) == 1 )  THEN
[1003]951       DO  k = 1, nz
952          DO  i = nxl, nxr
953             DO  j = nys, nyn
[164]954                f_out(j,i,k) = f_inv(j,i,k)
[1]955             ENDDO
956          ENDDO
957       ENDDO
958       RETURN
959    ENDIF
960
961!
962!-- Transpose array
963    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]964    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1]965    CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]966                       work(1),          sendrecvcount_zyd, MPI_REAL, &
[1]967                       comm1dy, ierr )
968    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
969
970!
971!-- Reorder transposed array
972    m = 0
973    DO  l = 0, pdims(2) - 1
974       ys = 0 + l * nny
[1003]975       DO  k = nzb_yd, nzt_yd
976          DO  i = nxl_yd, nxr_yd
[1]977             DO  j = ys, ys + nny - 1
978                m = m + 1
[164]979                f_out(j,i,k) = work(m)
[1]980             ENDDO
981          ENDDO
982       ENDDO
983    ENDDO
984
985#endif
986
987 END SUBROUTINE transpose_zyd
Note: See TracBrowser for help on using the repository browser.