source: palm/trunk/SOURCE/transpose.f90 @ 1152

Last change on this file since 1152 was 1112, checked in by raasch, 12 years ago

last commit documented

  • Property svn:keywords set to Id
File size: 23.9 KB
RevLine 
[164]1 SUBROUTINE transpose_xy( f_in, work, f_out )
[1]2
[1036]3!--------------------------------------------------------------------------------!
4! This file is part of PALM.
5!
6! PALM is free software: you can redistribute it and/or modify it under the terms
7! of the GNU General Public License as published by the Free Software Foundation,
8! either version 3 of the License, or (at your option) any later version.
9!
10! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12! A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13!
14! You should have received a copy of the GNU General Public License along with
15! PALM. If not, see <http://www.gnu.org/licenses/>.
16!
17! Copyright 1997-2012  Leibniz University Hannover
18!--------------------------------------------------------------------------------!
19!
[484]20! Current revisions:
[1]21! -----------------
[1112]22!
[198]23!
24! Former revisions:
25! -----------------
26! $Id: transpose.f90 1112 2013-03-09 00:34:37Z witha $
27!
[1112]28! 1111 2013-03-08 23:54:10Z raasch
29! openACC directives added,
30! resorting data from/to work changed, work got 4 dimensions instead of 1
31!
[1107]32! 1106 2013-03-04 05:31:38Z raasch
33! preprocessor lines rearranged so that routines can also be used in serial
34! (non-parallel) mode
35!
[1093]36! 1092 2013-02-02 11:24:22Z raasch
37! unused variables removed
38!
[1037]39! 1036 2012-10-22 13:43:42Z raasch
40! code put under GPL (PALM 3.9)
41!
[1004]42! 1003 2012-09-14 14:35:53Z raasch
43! indices nxa, nya, etc. replaced by nx, ny, etc.
44!
[684]45! 683 2011-02-09 14:25:15Z raasch
46! openMP parallelization of transpositions for 2d-domain-decomposition
47!
[623]48! 622 2010-12-10 08:08:13Z raasch
49! optional barriers included in order to speed up collective operations
50!
[198]51! 164 2008-05-15 08:46:15Z raasch
[164]52! f_inv changed from subroutine argument to automatic array in order to do
53! re-ordering from f_in to f_inv in one step, one array work is needed instead
54! of work1 and work2
[1]55!
[198]56! February 2007
[3]57! RCS Log replace by Id keyword, revision history cleaned up
58!
[1]59! Revision 1.2  2004/04/30 13:12:17  raasch
60! Switched from mpi_alltoallv to the simpler mpi_alltoall,
61! all former transpose-routine files collected in this file, enlarged
62! transposition arrays introduced
63!
64! Revision 1.1  2004/04/30 13:08:16  raasch
65! Initial revision (collection of former routines transpose_xy, transpose_xz,
66!                   transpose_yx, transpose_yz, transpose_zx, transpose_zy)
67!
68! Revision 1.1  1997/07/24 11:25:18  raasch
69! Initial revision
70!
71!
72! Description:
73! ------------
74! Transposition of input array (f_in) from x to y. For the input array, all
75! elements along x reside on the same PE, while after transposition, all
76! elements along y reside on the same PE.
77!------------------------------------------------------------------------------!
78
79    USE cpulog
80    USE indices
81    USE interfaces
82    USE pegrid
83    USE transpose_indices
84
85    IMPLICIT NONE
86
[1111]87    INTEGER ::  i, j, k, l, ys
[1]88   
[1111]89    REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
[1]90
[1111]91    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
92
93    !$acc declare create( f_inv )
94    REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
95
96
[1]97!
98!-- Rearrange indices of input array in order to make data to be send
99!-- by MPI contiguous
[683]100!$OMP  PARALLEL PRIVATE ( i, j, k )
101!$OMP  DO
[1111]102    !$acc kernels present( f_in )
103    !$acc loop
[1003]104    DO  i = 0, nx
105       DO  k = nzb_x, nzt_x
[1111]106          !$acc loop vector( 32 )
[1003]107          DO  j = nys_x, nyn_x
[164]108             f_inv(j,k,i) = f_in(i,j,k)
[1]109          ENDDO
110       ENDDO
111    ENDDO
[1111]112    !$acc end kernels
[683]113!$OMP  END PARALLEL
[1]114
[1106]115    IF ( numprocs /= 1 )  THEN
116
117#if defined( __parallel )
[1]118!
[1106]119!--    Transpose array
120       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
121       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]122       !$acc update host( f_inv )
123       CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
124                          work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]125                          comm1dy, ierr )
[1111]126       !$acc update device( work )
[1106]127       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]128
129!
[1106]130!--    Reorder transposed array
[1111]131!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
[683]132!$OMP  DO
[1106]133       DO  l = 0, pdims(2) - 1
134          ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]135          !$acc kernels present( f_out, work )
136          !$acc loop
[1106]137          DO  i = nxl_y, nxr_y
138             DO  k = nzb_y, nzt_y
[1111]139                !$acc loop vector( 32 )
[1106]140                DO  j = ys, ys + nyn_x - nys_x
[1111]141                   f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]142                ENDDO
[1]143             ENDDO
144          ENDDO
[1111]145          !$acc end kernels
[1]146       ENDDO
[683]147!$OMP  END PARALLEL
[1]148#endif
149
[1106]150    ELSE
151
152!
153!--    Reorder transposed array
154!$OMP  PARALLEL PRIVATE ( i, j, k )
155!$OMP  DO
[1111]156       !$acc kernels present( f_out )
157       !$acc loop
[1106]158       DO  k = nzb_y, nzt_y
159          DO  i = nxl_y, nxr_y
[1111]160             !$acc loop vector( 32 )
[1106]161             DO  j = 0, ny
162                f_out(j,i,k) = f_inv(j,k,i)
163             ENDDO
164          ENDDO
165       ENDDO
[1111]166       !$acc end kernels
[1106]167!$OMP  END PARALLEL
168
169    ENDIF
170
[1]171 END SUBROUTINE transpose_xy
172
173
[164]174 SUBROUTINE transpose_xz( f_in, work, f_out )
[1]175
176!------------------------------------------------------------------------------!
177! Description:
178! ------------
179! Transposition of input array (f_in) from x to z. For the input array, all
180! elements along x reside on the same PE, while after transposition, all
181! elements along z reside on the same PE.
182!------------------------------------------------------------------------------!
183
184    USE cpulog
185    USE indices
186    USE interfaces
187    USE pegrid
188    USE transpose_indices
189
190    IMPLICIT NONE
191
[1111]192    INTEGER ::  i, j, k, l, xs
[1]193   
[1111]194    REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_out(1:nz,nys:nyn,nxl:nxr)
[1]195
[1111]196    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
[1]197
[1111]198    !$acc declare create( f_inv )
199    REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
200
201
[1]202!
203!-- If the PE grid is one-dimensional along y, the array has only to be
204!-- reordered locally and therefore no transposition has to be done.
205    IF ( pdims(1) /= 1 )  THEN
[1106]206
207#if defined( __parallel )
[1]208!
209!--    Reorder input array for transposition
[1111]210!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
[683]211!$OMP  DO
[1]212       DO  l = 0, pdims(1) - 1
213          xs = 0 + l * nnx
[1111]214          !$acc kernels present( f_in, work )
215          !$acc loop
[1003]216          DO  k = nzb_x, nzt_x
[164]217             DO  i = xs, xs + nnx - 1
[1111]218                !$acc loop vector( 32 )
[1003]219                DO  j = nys_x, nyn_x
[1111]220                   work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]221                ENDDO
222             ENDDO
223          ENDDO
[1111]224          !$acc end kernels
[1]225       ENDDO
[683]226!$OMP  END PARALLEL
[1]227
228!
229!--    Transpose array
230       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]231       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]232       !$acc update host( work )
233       CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
234                          f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
[1]235                          comm1dx, ierr )
[1111]236       !$acc update device( f_inv )
[1]237       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
238
239!
240!--    Reorder transposed array in a way that the z index is in first position
[683]241!$OMP  PARALLEL PRIVATE ( i, j, k )
242!$OMP  DO
[1111]243       !$acc kernels present( f_out )
244       !$acc loop
[1003]245       DO  k = 1, nz
246          DO  i = nxl, nxr
[1111]247             !$acc loop vector( 32 )
[1003]248             DO  j = nys, nyn
[164]249                f_out(k,j,i) = f_inv(j,i,k)
[1]250             ENDDO
251          ENDDO
252       ENDDO
[1111]253       !$acc end kernels
[683]254!$OMP  END PARALLEL
[1106]255#endif
256
[1]257    ELSE
[1106]258
[1]259!
260!--    Reorder the array in a way that the z index is in first position
[683]261!$OMP  PARALLEL PRIVATE ( i, j, k )
262!$OMP  DO
[1111]263       !$acc kernels present( f_in )
264       !$acc loop
[1003]265       DO  i = nxl, nxr
266          DO  j = nys, nyn
[1111]267             !$acc loop vector( 32 )
[1003]268             DO  k = 1, nz
[164]269                f_inv(j,i,k) = f_in(i,j,k)
[1]270             ENDDO
271          ENDDO
272       ENDDO
[1111]273       !$acc end kernels
[683]274!$OMP  END PARALLEL
[1]275
[683]276!$OMP  PARALLEL PRIVATE ( i, j, k )
277!$OMP  DO
[1111]278       !$acc kernels present( f_out )
279       !$acc loop
[1003]280       DO  k = 1, nz
281          DO  i = nxl, nxr
[1111]282             !$acc loop vector( 32 )
[1003]283             DO  j = nys, nyn
[164]284                f_out(k,j,i) = f_inv(j,i,k)
285             ENDDO
[1]286          ENDDO
287       ENDDO
[1111]288       !$acc end kernels
[683]289!$OMP  END PARALLEL
[1]290
[164]291    ENDIF
292
[1]293 END SUBROUTINE transpose_xz
294
295
[164]296 SUBROUTINE transpose_yx( f_in, work, f_out )
[1]297
298!------------------------------------------------------------------------------!
299! Description:
300! ------------
301! Transposition of input array (f_in) from y to x. For the input array, all
302! elements along y reside on the same PE, while after transposition, all
303! elements along x reside on the same PE.
304!------------------------------------------------------------------------------!
305
306    USE cpulog
307    USE indices
308    USE interfaces
309    USE pegrid
310    USE transpose_indices
311
312    IMPLICIT NONE
313
[1111]314    INTEGER ::  i, j, k, l, ys
[1]315   
[1111]316    REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
[1]317
[1111]318    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
319
320    !$acc declare create( f_inv )
321    REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
322
323
[1106]324    IF ( numprocs /= 1 )  THEN
325
[1]326#if defined( __parallel )
327!
[1106]328!--    Reorder input array for transposition
[1111]329!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
[683]330!$OMP  DO
[1106]331       DO  l = 0, pdims(2) - 1
332          ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]333          !$acc kernels present( f_in, work )
334          !$acc loop
[1106]335          DO  i = nxl_y, nxr_y
336             DO  k = nzb_y, nzt_y
[1111]337                !$acc loop vector( 32 )
[1106]338                DO  j = ys, ys + nyn_x - nys_x
[1111]339                   work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]340                ENDDO
341             ENDDO
342          ENDDO
[1111]343          !$acc end kernels
[1106]344       ENDDO
345!$OMP  END PARALLEL
346
347!
348!--    Transpose array
349       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
350       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]351       !$acc update host( work )
352       CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
353                          f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
[1106]354                          comm1dy, ierr )
[1111]355       !$acc update device( f_inv )
[1106]356       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
357#endif
358
359    ELSE
360
361!
362!--    Reorder array f_in the same way as ALLTOALL did it
363!$OMP  PARALLEL PRIVATE ( i, j, k )
364!$OMP  DO
[1111]365       !$acc kernels present( f_in )
366       !$acc loop
[1003]367       DO  i = nxl_y, nxr_y
368          DO  k = nzb_y, nzt_y
[1111]369             !$acc loop vector( 32 )
[1106]370             DO  j = 0, ny
371                f_inv(j,k,i) = f_in(j,i,k)
[1]372             ENDDO
373          ENDDO
374       ENDDO
[1111]375       !$acc end kernels
[683]376!$OMP  END PARALLEL
[1]377
[1106]378    ENDIF
[1]379
380!
381!-- Reorder transposed array in a way that the x index is in first position
[683]382!$OMP  PARALLEL PRIVATE ( i, j, k )
383!$OMP  DO
[1111]384    !$acc kernels present( f_out )
385    !$acc loop
[1003]386    DO  i = 0, nx
387       DO  k = nzb_x, nzt_x
[1111]388          !$acc loop vector( 32 )
[1003]389          DO  j = nys_x, nyn_x
[164]390             f_out(i,j,k) = f_inv(j,k,i)
[1]391          ENDDO
392       ENDDO
393    ENDDO
[1111]394    !$acc end kernels
[683]395!$OMP  END PARALLEL
[1]396
397 END SUBROUTINE transpose_yx
398
399
[164]400 SUBROUTINE transpose_yxd( f_in, work, f_out )
[1]401
402!------------------------------------------------------------------------------!
403! Description:
404! ------------
405! Transposition of input array (f_in) from y to x. For the input array, all
406! elements along y reside on the same PE, while after transposition, all
407! elements along x reside on the same PE.
408! This is a direct transposition for arrays with indices in regular order
409! (k,j,i) (cf. transpose_yx).
410!------------------------------------------------------------------------------!
411
412    USE cpulog
413    USE indices
414    USE interfaces
415    USE pegrid
416    USE transpose_indices
417
418    IMPLICIT NONE
419
[1092]420    INTEGER ::  i, j, k, l, m, xs
[1]421
[1003]422    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
423             f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x),                     &
[164]424             work(nnx*nny*nnz)
[1]425
426#if defined( __parallel )
427
428!
429!-- Rearrange indices of input array in order to make data to be send
430!-- by MPI contiguous
[1003]431    DO  k = 1, nz
432       DO  j = nys, nyn
433          DO  i = nxl, nxr
[164]434             f_inv(i,k,j) = f_in(k,j,i)
[1]435          ENDDO
436       ENDDO
437    ENDDO
438
439!
440!-- Transpose array
441    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]442    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1]443    CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]444                       work(1),          sendrecvcount_xy, MPI_REAL, &
[1]445                       comm1dx, ierr )
446    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
447
448!
449!-- Reorder transposed array
450    m = 0
451    DO  l = 0, pdims(1) - 1
452       xs = 0 + l * nnx
[1003]453       DO  j = nys_x, nyn_x
454          DO  k = 1, nz
[1]455             DO  i = xs, xs + nnx - 1
456                m = m + 1
[164]457                f_out(i,j,k) = work(m)
[1]458             ENDDO
459          ENDDO
460       ENDDO
461    ENDDO
462
463#endif
464
465 END SUBROUTINE transpose_yxd
466
467
[164]468 SUBROUTINE transpose_yz( f_in, work, f_out )
[1]469
470!------------------------------------------------------------------------------!
471! Description:
472! ------------
473! Transposition of input array (f_in) from y to z. For the input array, all
474! elements along y reside on the same PE, while after transposition, all
475! elements along z reside on the same PE.
476!------------------------------------------------------------------------------!
477
478    USE cpulog
479    USE indices
480    USE interfaces
481    USE pegrid
482    USE transpose_indices
483
484    IMPLICIT NONE
485
[1111]486    INTEGER ::  i, j, k, l, zs
[1]487   
[1111]488    REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
[1]489
[1111]490    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
491
492    !$acc declare create( f_inv )
493    REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
494
495
[1]496!
497!-- Rearrange indices of input array in order to make data to be send
498!-- by MPI contiguous
[683]499!$OMP  PARALLEL PRIVATE ( i, j, k )
500!$OMP  DO
[1111]501    !$acc kernels present( f_in )
502    !$acc loop
[1003]503    DO  j = 0, ny
504       DO  k = nzb_y, nzt_y
[1111]505          !$acc loop vector( 32 )
[1003]506          DO  i = nxl_y, nxr_y
[164]507             f_inv(i,k,j) = f_in(j,i,k)
[1]508          ENDDO
509       ENDDO
510    ENDDO
[1111]511    !$acc end kernels
[683]512!$OMP  END PARALLEL
[1]513
514!
515!-- Move data to different array, because memory location of work1 is
516!-- needed further below (work1 = work2).
517!-- If the PE grid is one-dimensional along y, only local reordering
518!-- of the data is necessary and no transposition has to be done.
519    IF ( pdims(1) == 1 )  THEN
[1106]520
[683]521!$OMP  PARALLEL PRIVATE ( i, j, k )
522!$OMP  DO
[1111]523       !$acc kernels present( f_out )
524       !$acc loop
[1003]525       DO  j = 0, ny
526          DO  k = nzb_y, nzt_y
[1111]527             !$acc loop vector( 32 )
[1003]528             DO  i = nxl_y, nxr_y
[164]529                f_out(i,j,k) = f_inv(i,k,j)
[1]530             ENDDO
531          ENDDO
532       ENDDO
[1111]533       !$acc end kernels
[683]534!$OMP  END PARALLEL
[1]535
[1106]536    ELSE
537
538#if defined( __parallel )
[1]539!
[1106]540!--    Transpose array
541       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
542       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]543       !$acc update host( f_inv )
544       CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
545                          work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]546                          comm1dx, ierr )
[1111]547       !$acc update device( work )
[1106]548       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]549
550!
[1106]551!--    Reorder transposed array
[1111]552!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
[683]553!$OMP  DO
[1106]554       DO  l = 0, pdims(1) - 1
555          zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1111]556          !$acc kernels present( f_out, work )
557          !$acc loop
[1106]558          DO  j = nys_z, nyn_z
559             DO  k = zs, zs + nzt_y - nzb_y
[1111]560                !$acc loop vector( 32 )
[1106]561                DO  i = nxl_z, nxr_z
[1111]562                   f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]563                ENDDO
[1]564             ENDDO
565          ENDDO
[1111]566          !$acc end kernels
[1]567       ENDDO
[683]568!$OMP  END PARALLEL
[1]569#endif
570
[1106]571   ENDIF
572
[1]573 END SUBROUTINE transpose_yz
574
575
[164]576 SUBROUTINE transpose_zx( f_in, work, f_out )
[1]577
578!------------------------------------------------------------------------------!
579! Description:
580! ------------
581! Transposition of input array (f_in) from z to x. For the input array, all
582! elements along z reside on the same PE, while after transposition, all
583! elements along x reside on the same PE.
584!------------------------------------------------------------------------------!
585
586    USE cpulog
587    USE indices
588    USE interfaces
589    USE pegrid
590    USE transpose_indices
591
592    IMPLICIT NONE
593
[1111]594    INTEGER ::  i, j, k, l, xs
[1]595   
[1111]596    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
[1]597
[1111]598    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
599
600    !$acc declare create( f_inv )
[1106]601    REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
[1]602
[1106]603
[1]604!
605!-- Rearrange indices of input array in order to make data to be send
606!-- by MPI contiguous
[683]607!$OMP  PARALLEL PRIVATE ( i, j, k )
608!$OMP  DO
[1106]609    !$acc kernels present( f_in )
610    !$acc loop
[1003]611    DO  k = 1,nz
612       DO  i = nxl, nxr
[1106]613          !$acc loop vector( 32 )
[1003]614          DO  j = nys, nyn
[164]615             f_inv(j,i,k) = f_in(k,j,i)
[1]616          ENDDO
617       ENDDO
618    ENDDO
[1111]619    !$acc end kernels
[683]620!$OMP  END PARALLEL
[1]621
622!
623!-- Move data to different array, because memory location of work1 is
624!-- needed further below (work1 = work2).
625!-- If the PE grid is one-dimensional along y, only local reordering
626!-- of the data is necessary and no transposition has to be done.
627    IF ( pdims(1) == 1 )  THEN
[1106]628
[683]629!$OMP  PARALLEL PRIVATE ( i, j, k )
630!$OMP  DO
[1106]631       !$acc kernels present( f_out )
632       !$acc loop
[1003]633       DO  k = 1, nz
634          DO  i = nxl, nxr
[1106]635             !$acc loop vector( 32 )
[1003]636             DO  j = nys, nyn
[164]637                f_out(i,j,k) = f_inv(j,i,k)
[1]638             ENDDO
639          ENDDO
640       ENDDO
[1111]641       !$acc end kernels
[683]642!$OMP  END PARALLEL
[1]643
[1106]644    ELSE
645
646#if defined( __parallel )
[1]647!
[1106]648!--    Transpose array
649       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
650       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]651       !$acc update host( f_inv )
652       CALL MPI_ALLTOALL( f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
653                          work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]654                          comm1dx, ierr )
[1111]655       !$acc update device( work )
[1106]656       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]657
658!
[1106]659!--    Reorder transposed array
[1111]660!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
[683]661!$OMP  DO
[1106]662       DO  l = 0, pdims(1) - 1
663          xs = 0 + l * nnx
[1111]664          !$acc kernels present( f_out, work )
665          !$acc loop
[1106]666          DO  k = nzb_x, nzt_x
667             DO  i = xs, xs + nnx - 1
[1111]668                !$acc loop vector( 32 )
[1106]669                DO  j = nys_x, nyn_x
[1111]670                   f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]671                ENDDO
[1]672             ENDDO
673          ENDDO
[1111]674          !$acc end kernels
[1]675       ENDDO
[683]676!$OMP  END PARALLEL
[1]677#endif
678
[1106]679    ENDIF
680
[1]681 END SUBROUTINE transpose_zx
682
683
[164]684 SUBROUTINE transpose_zy( f_in, work, f_out )
[1]685
686!------------------------------------------------------------------------------!
687! Description:
688! ------------
689! Transposition of input array (f_in) from z to y. For the input array, all
690! elements along z reside on the same PE, while after transposition, all
691! elements along y reside on the same PE.
692!------------------------------------------------------------------------------!
693
694    USE cpulog
695    USE indices
696    USE interfaces
697    USE pegrid
698    USE transpose_indices
699
700    IMPLICIT NONE
701
[1111]702    INTEGER ::  i, j, k, l, zs
[1]703   
[1111]704    REAL ::  f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
[1]705
[1111]706    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
707
708    !$acc declare create( f_inv )
709    REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
710
711
[1]712!
713!-- If the PE grid is one-dimensional along y, the array has only to be
714!-- reordered locally and therefore no transposition has to be done.
715    IF ( pdims(1) /= 1 )  THEN
[1106]716
717#if defined( __parallel )
[1]718!
719!--    Reorder input array for transposition
[1111]720!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
[683]721!$OMP  DO
[1]722       DO  l = 0, pdims(1) - 1
[1003]723          zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1111]724          !$acc kernels present( f_in, work )
725          !$acc loop
[1003]726          DO  j = nys_z, nyn_z
727             DO  k = zs, zs + nzt_y - nzb_y
[1111]728                !$acc loop vector( 32 )
[1003]729                DO  i = nxl_z, nxr_z
[1111]730                   work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]731                ENDDO
732             ENDDO
733          ENDDO
[1111]734          !$acc end kernels
[1]735       ENDDO
[683]736!$OMP  END PARALLEL
[1]737
738!
739!--    Transpose array
740       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]741       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]742       !$acc update host( work )
743       CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
744                          f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
[1]745                          comm1dx, ierr )
[1111]746       !$acc update device( f_inv )
[1]747       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]748#endif
[1]749
750    ELSE
751!
[1106]752!--    Reorder the array in the same way like ALLTOALL did it
[683]753!$OMP  PARALLEL PRIVATE ( i, j, k )
754!$OMP  DO
[1111]755       !$acc kernels present( f_in )
756       !$acc loop
[1003]757       DO  k = nzb_y, nzt_y
758          DO  j = 0, ny
[1111]759             !$acc loop vector( 32 )
[1003]760             DO  i = nxl_y, nxr_y
[164]761                f_inv(i,k,j) = f_in(i,j,k)
762             ENDDO
763          ENDDO
764       ENDDO
[1111]765       !$acc end kernels
[683]766!$OMP  END PARALLEL
[1106]767
768    ENDIF
769
[164]770!
[1106]771!-- Reorder transposed array in a way that the y index is in first position
[683]772!$OMP  PARALLEL PRIVATE ( i, j, k )
773!$OMP  DO
[1111]774    !$acc kernels present( f_out )
775    !$acc loop
[1106]776    DO  k = nzb_y, nzt_y
777       DO  i = nxl_y, nxr_y
[1111]778          !$acc loop vector( 32 )
[1106]779          DO  j = 0, ny
780             f_out(j,i,k) = f_inv(i,k,j)
[1]781          ENDDO
782       ENDDO
[1106]783    ENDDO
[1111]784    !$acc end kernels
[683]785!$OMP  END PARALLEL
[164]786
[1]787 END SUBROUTINE transpose_zy
788
789
[164]790 SUBROUTINE transpose_zyd( f_in, work, f_out )
[1]791
792!------------------------------------------------------------------------------!
793! Description:
794! ------------
795! Transposition of input array (f_in) from z to y. For the input array, all
796! elements along z reside on the same PE, while after transposition, all
797! elements along y reside on the same PE.
798! This is a direct transposition for arrays with indices in regular order
799! (k,j,i) (cf. transpose_zy).
800!------------------------------------------------------------------------------!
801
802    USE cpulog
803    USE indices
804    USE interfaces
805    USE pegrid
806    USE transpose_indices
807
808    IMPLICIT NONE
809
810    INTEGER ::  i, j, k, l, m, ys
811   
[1003]812    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
813             f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd),                 &
[164]814             work(nnx*nny*nnz)
[1]815
816#if defined( __parallel )
817
818!
819!-- Rearrange indices of input array in order to make data to be send
820!-- by MPI contiguous
[1003]821    DO  i = nxl, nxr
822       DO  j = nys, nyn
823          DO  k = 1, nz
[164]824             f_inv(j,i,k) = f_in(k,j,i)
[1]825          ENDDO
826       ENDDO
827    ENDDO
828
829!
830!-- Move data to different array, because memory location of work1 is
831!-- needed further below (work1 = work2).
832!-- If the PE grid is one-dimensional along x, only local reordering
833!-- of the data is necessary and no transposition has to be done.
834    IF ( pdims(2) == 1 )  THEN
[1003]835       DO  k = 1, nz
836          DO  i = nxl, nxr
837             DO  j = nys, nyn
[164]838                f_out(j,i,k) = f_inv(j,i,k)
[1]839             ENDDO
840          ENDDO
841       ENDDO
842       RETURN
843    ENDIF
844
845!
846!-- Transpose array
847    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]848    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1]849    CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]850                       work(1),          sendrecvcount_zyd, MPI_REAL, &
[1]851                       comm1dy, ierr )
852    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
853
854!
855!-- Reorder transposed array
856    m = 0
857    DO  l = 0, pdims(2) - 1
858       ys = 0 + l * nny
[1003]859       DO  k = nzb_yd, nzt_yd
860          DO  i = nxl_yd, nxr_yd
[1]861             DO  j = ys, ys + nny - 1
862                m = m + 1
[164]863                f_out(j,i,k) = work(m)
[1]864             ENDDO
865          ENDDO
866       ENDDO
867    ENDDO
868
869#endif
870
871 END SUBROUTINE transpose_zyd
Note: See TracBrowser for help on using the repository browser.