source: palm/trunk/SOURCE/transpose.f90 @ 1316

Last change on this file since 1316 was 1310, checked in by raasch, 11 years ago

update of GPL copyright

  • Property svn:keywords set to Id
File size: 25.9 KB
RevLine 
[1216]1 SUBROUTINE resort_for_xy( f_in, f_inv )
[1]2
[1036]3!--------------------------------------------------------------------------------!
4! This file is part of PALM.
5!
6! PALM is free software: you can redistribute it and/or modify it under the terms
7! of the GNU General Public License as published by the Free Software Foundation,
8! either version 3 of the License, or (at your option) any later version.
9!
10! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12! A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13!
14! You should have received a copy of the GNU General Public License along with
15! PALM. If not, see <http://www.gnu.org/licenses/>.
16!
[1310]17! Copyright 1997-2014 Leibniz Universitaet Hannover
[1036]18!--------------------------------------------------------------------------------!
19!
[484]20! Current revisions:
[1]21! -----------------
[198]22!
[1258]23!
[198]24! Former revisions:
25! -----------------
26! $Id: transpose.f90 1310 2014-03-14 08:01:56Z heinze $
27!
[1258]28! 1257 2013-11-08 15:18:40Z raasch
29! openacc loop and loop vector clauses removed
30!
[1217]31! 1216 2013-08-26 09:31:42Z raasch
32! re-sorting of the transposed / to be transposed arrays moved to separate
33! routines resort_for_...
34!
[1112]35! 1111 2013-03-08 23:54:10Z raasch
36! openACC directives added,
37! resorting data from/to work changed, work got 4 dimensions instead of 1
38!
[1107]39! 1106 2013-03-04 05:31:38Z raasch
40! preprocessor lines rearranged so that routines can also be used in serial
41! (non-parallel) mode
42!
[1093]43! 1092 2013-02-02 11:24:22Z raasch
44! unused variables removed
45!
[1037]46! 1036 2012-10-22 13:43:42Z raasch
47! code put under GPL (PALM 3.9)
48!
[1004]49! 1003 2012-09-14 14:35:53Z raasch
50! indices nxa, nya, etc. replaced by nx, ny, etc.
51!
[684]52! 683 2011-02-09 14:25:15Z raasch
53! openMP parallelization of transpositions for 2d-domain-decomposition
54!
[623]55! 622 2010-12-10 08:08:13Z raasch
56! optional barriers included in order to speed up collective operations
57!
[198]58! 164 2008-05-15 08:46:15Z raasch
[164]59! f_inv changed from subroutine argument to automatic array in order to do
60! re-ordering from f_in to f_inv in one step, one array work is needed instead
61! of work1 and work2
[1]62!
[198]63! February 2007
[3]64! RCS Log replace by Id keyword, revision history cleaned up
65!
[1]66! Revision 1.2  2004/04/30 13:12:17  raasch
67! Switched from mpi_alltoallv to the simpler mpi_alltoall,
68! all former transpose-routine files collected in this file, enlarged
69! transposition arrays introduced
70!
71! Revision 1.1  2004/04/30 13:08:16  raasch
72! Initial revision (collection of former routines transpose_xy, transpose_xz,
73!                   transpose_yx, transpose_yz, transpose_zx, transpose_zy)
74!
75! Revision 1.1  1997/07/24 11:25:18  raasch
76! Initial revision
77!
[1216]78!------------------------------------------------------------------------------!
79! Description:
80! ------------
81! Resorting data for the transposition from x to y. The transposition itself
82! is carried out in transpose_xy
83!------------------------------------------------------------------------------!
84
85     USE indices
86     USE transpose_indices
87
88     IMPLICIT NONE
89
90     REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
91     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
92
93
94     INTEGER ::  i, j, k
95
[1]96!
[1216]97!-- Rearrange indices of input array in order to make data to be send
98!-- by MPI contiguous
99    !$OMP  PARALLEL PRIVATE ( i, j, k )
100    !$OMP  DO
101    !$acc kernels present( f_in, f_inv )
102     DO  i = 0, nx
103         DO  k = nzb_x, nzt_x
104             DO  j = nys_x, nyn_x
105                 f_inv(j,k,i) = f_in(i,j,k)
106             ENDDO
107         ENDDO
108     ENDDO
109     !$acc end kernels
110     !$OMP  END PARALLEL
111
112 END SUBROUTINE resort_for_xy
113
114
115 SUBROUTINE transpose_xy( f_inv, f_out )
116
117!------------------------------------------------------------------------------!
[1]118! Description:
119! ------------
120! Transposition of input array (f_in) from x to y. For the input array, all
121! elements along x reside on the same PE, while after transposition, all
122! elements along y reside on the same PE.
123!------------------------------------------------------------------------------!
124
125    USE cpulog
126    USE indices
127    USE interfaces
128    USE pegrid
129    USE transpose_indices
130
131    IMPLICIT NONE
132
[1111]133    INTEGER ::  i, j, k, l, ys
[1]134   
[1216]135    REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
[1]136
[1111]137    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
138
139
[1106]140    IF ( numprocs /= 1 )  THEN
141
142#if defined( __parallel )
[1]143!
[1106]144!--    Transpose array
145       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
146       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]147       !$acc update host( f_inv )
148       CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
149                          work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]150                          comm1dy, ierr )
151       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]152
153!
[1106]154!--    Reorder transposed array
[1111]155!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
[683]156!$OMP  DO
[1216]157       !$acc data copyin( work )
[1106]158       DO  l = 0, pdims(2) - 1
159          ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]160          !$acc kernels present( f_out, work )
[1106]161          DO  i = nxl_y, nxr_y
162             DO  k = nzb_y, nzt_y
163                DO  j = ys, ys + nyn_x - nys_x
[1111]164                   f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]165                ENDDO
[1]166             ENDDO
167          ENDDO
[1111]168          !$acc end kernels
[1]169       ENDDO
[1216]170       !$acc end data
[683]171!$OMP  END PARALLEL
[1]172#endif
173
[1106]174    ELSE
175
176!
177!--    Reorder transposed array
178!$OMP  PARALLEL PRIVATE ( i, j, k )
179!$OMP  DO
[1216]180       !$acc kernels present( f_inv, f_out )
[1106]181       DO  k = nzb_y, nzt_y
182          DO  i = nxl_y, nxr_y
183             DO  j = 0, ny
184                f_out(j,i,k) = f_inv(j,k,i)
185             ENDDO
186          ENDDO
187       ENDDO
[1111]188       !$acc end kernels
[1106]189!$OMP  END PARALLEL
190
191    ENDIF
192
[1]193 END SUBROUTINE transpose_xy
194
195
[1216]196 SUBROUTINE resort_for_xz( f_inv, f_out )
[1]197
198!------------------------------------------------------------------------------!
199! Description:
200! ------------
[1216]201! Resorting data after the transposition from x to z. The transposition itself
202! is carried out in transpose_xz
203!------------------------------------------------------------------------------!
204
205     USE indices
206     USE transpose_indices
207
208     IMPLICIT NONE
209
210     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
211     REAL ::  f_out(1:nz,nys:nyn,nxl:nxr)
212
213
214     INTEGER ::  i, j, k
215
216!
217!-- Rearrange indices of input array in order to make data to be send
218!-- by MPI contiguous.
219!-- In case of parallel fft/transposition, scattered store is faster in
220!-- backward direction!!!
221    !$OMP  PARALLEL PRIVATE ( i, j, k )
222    !$OMP  DO
223    !$acc kernels present( f_inv, f_out )
224     DO  k = 1, nz
225         DO  i = nxl, nxr
226             DO  j = nys, nyn
227                 f_out(k,j,i) = f_inv(j,i,k)
228             ENDDO
229         ENDDO
230     ENDDO
231     !$acc end kernels
232     !$OMP  END PARALLEL
233
234 END SUBROUTINE resort_for_xz
235
236
237 SUBROUTINE transpose_xz( f_in, f_inv )
238
239!------------------------------------------------------------------------------!
240! Description:
241! ------------
[1]242! Transposition of input array (f_in) from x to z. For the input array, all
243! elements along x reside on the same PE, while after transposition, all
244! elements along z reside on the same PE.
245!------------------------------------------------------------------------------!
246
247    USE cpulog
248    USE indices
249    USE interfaces
250    USE pegrid
251    USE transpose_indices
252
253    IMPLICIT NONE
254
[1111]255    INTEGER ::  i, j, k, l, xs
[1]256   
[1216]257    REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_inv(nys:nyn,nxl:nxr,1:nz)
[1]258
[1111]259    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
[1]260
[1111]261
[1]262!
263!-- If the PE grid is one-dimensional along y, the array has only to be
264!-- reordered locally and therefore no transposition has to be done.
265    IF ( pdims(1) /= 1 )  THEN
[1106]266
267#if defined( __parallel )
[1]268!
269!--    Reorder input array for transposition
[1111]270!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
[683]271!$OMP  DO
[1216]272       !$acc data copyout( work )
[1]273       DO  l = 0, pdims(1) - 1
274          xs = 0 + l * nnx
[1111]275          !$acc kernels present( f_in, work )
[1003]276          DO  k = nzb_x, nzt_x
[164]277             DO  i = xs, xs + nnx - 1
[1003]278                DO  j = nys_x, nyn_x
[1111]279                   work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]280                ENDDO
281             ENDDO
282          ENDDO
[1111]283          !$acc end kernels
[1]284       ENDDO
[1216]285       !$acc end data
[683]286!$OMP  END PARALLEL
[1]287
288!
289!--    Transpose array
290       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]291       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]292       CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
293                          f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
[1]294                          comm1dx, ierr )
[1111]295       !$acc update device( f_inv )
[1]296       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]297#endif
298
[1]299    ELSE
[1106]300
[1]301!
302!--    Reorder the array in a way that the z index is in first position
[683]303!$OMP  PARALLEL PRIVATE ( i, j, k )
304!$OMP  DO
[1216]305       !$acc kernels present( f_in, f_inv )
[1003]306       DO  i = nxl, nxr
307          DO  j = nys, nyn
308             DO  k = 1, nz
[164]309                f_inv(j,i,k) = f_in(i,j,k)
[1]310             ENDDO
311          ENDDO
312       ENDDO
[1111]313       !$acc end kernels
[683]314!$OMP  END PARALLEL
[1]315
[164]316    ENDIF
317
[1]318 END SUBROUTINE transpose_xz
319
320
[1216]321 SUBROUTINE resort_for_yx( f_inv, f_out )
[1]322
323!------------------------------------------------------------------------------!
324! Description:
325! ------------
[1216]326! Resorting data after the transposition from y to x. The transposition itself
327! is carried out in transpose_yx
328!------------------------------------------------------------------------------!
329
330     USE indices
331     USE transpose_indices
332
333     IMPLICIT NONE
334
335     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
336     REAL ::  f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
337
338
339     INTEGER ::  i, j, k
340
341!
342!-- Rearrange indices of input array in order to make data to be send
343!-- by MPI contiguous
344    !$OMP  PARALLEL PRIVATE ( i, j, k )
345    !$OMP  DO
346    !$acc kernels present( f_inv, f_out )
347     DO  i = 0, nx
348         DO  k = nzb_x, nzt_x
349             DO  j = nys_x, nyn_x
350                 f_out(i,j,k) = f_inv(j,k,i)
351             ENDDO
352         ENDDO
353     ENDDO
354     !$acc end kernels
355     !$OMP  END PARALLEL
356
357 END SUBROUTINE resort_for_yx
358
359
360 SUBROUTINE transpose_yx( f_in, f_inv )
361
362!------------------------------------------------------------------------------!
363! Description:
364! ------------
[1]365! Transposition of input array (f_in) from y to x. For the input array, all
366! elements along y reside on the same PE, while after transposition, all
367! elements along x reside on the same PE.
368!------------------------------------------------------------------------------!
369
370    USE cpulog
371    USE indices
372    USE interfaces
373    USE pegrid
374    USE transpose_indices
375
376    IMPLICIT NONE
377
[1111]378    INTEGER ::  i, j, k, l, ys
[1]379   
[1216]380    REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
[1]381
[1111]382    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
383
384
[1106]385    IF ( numprocs /= 1 )  THEN
386
[1]387#if defined( __parallel )
388!
[1106]389!--    Reorder input array for transposition
[1111]390!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
[683]391!$OMP  DO
[1216]392       !$acc data copyout( work )
[1106]393       DO  l = 0, pdims(2) - 1
394          ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]395          !$acc kernels present( f_in, work )
[1106]396          DO  i = nxl_y, nxr_y
397             DO  k = nzb_y, nzt_y
398                DO  j = ys, ys + nyn_x - nys_x
[1111]399                   work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]400                ENDDO
401             ENDDO
402          ENDDO
[1111]403          !$acc end kernels
[1106]404       ENDDO
[1216]405       !$acc end data
[1106]406!$OMP  END PARALLEL
407
408!
409!--    Transpose array
410       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
411       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]412       CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
413                          f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
[1106]414                          comm1dy, ierr )
[1111]415       !$acc update device( f_inv )
[1106]416       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
417#endif
418
419    ELSE
420
421!
422!--    Reorder array f_in the same way as ALLTOALL did it
423!$OMP  PARALLEL PRIVATE ( i, j, k )
424!$OMP  DO
[1216]425       !$acc kernels present( f_in, f_inv )
[1003]426       DO  i = nxl_y, nxr_y
427          DO  k = nzb_y, nzt_y
[1106]428             DO  j = 0, ny
429                f_inv(j,k,i) = f_in(j,i,k)
[1]430             ENDDO
431          ENDDO
432       ENDDO
[1111]433       !$acc end kernels
[683]434!$OMP  END PARALLEL
[1]435
[1106]436    ENDIF
[1]437
438 END SUBROUTINE transpose_yx
439
440
[1216]441 SUBROUTINE transpose_yxd( f_in, f_out )
[1]442
443!------------------------------------------------------------------------------!
444! Description:
445! ------------
446! Transposition of input array (f_in) from y to x. For the input array, all
447! elements along y reside on the same PE, while after transposition, all
448! elements along x reside on the same PE.
449! This is a direct transposition for arrays with indices in regular order
450! (k,j,i) (cf. transpose_yx).
451!------------------------------------------------------------------------------!
452
453    USE cpulog
454    USE indices
455    USE interfaces
456    USE pegrid
457    USE transpose_indices
458
459    IMPLICIT NONE
460
[1092]461    INTEGER ::  i, j, k, l, m, xs
[1]462
[1003]463    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
464             f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x),                     &
[164]465             work(nnx*nny*nnz)
[1]466
467#if defined( __parallel )
468
469!
470!-- Rearrange indices of input array in order to make data to be send
471!-- by MPI contiguous
[1003]472    DO  k = 1, nz
473       DO  j = nys, nyn
474          DO  i = nxl, nxr
[164]475             f_inv(i,k,j) = f_in(k,j,i)
[1]476          ENDDO
477       ENDDO
478    ENDDO
479
480!
481!-- Transpose array
482    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]483    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1]484    CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]485                       work(1),          sendrecvcount_xy, MPI_REAL, &
[1]486                       comm1dx, ierr )
487    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
488
489!
490!-- Reorder transposed array
491    m = 0
492    DO  l = 0, pdims(1) - 1
493       xs = 0 + l * nnx
[1003]494       DO  j = nys_x, nyn_x
495          DO  k = 1, nz
[1]496             DO  i = xs, xs + nnx - 1
497                m = m + 1
[164]498                f_out(i,j,k) = work(m)
[1]499             ENDDO
500          ENDDO
501       ENDDO
502    ENDDO
503
504#endif
505
506 END SUBROUTINE transpose_yxd
507
508
[1216]509 SUBROUTINE resort_for_yz( f_in, f_inv )
[1]510
511!------------------------------------------------------------------------------!
512! Description:
513! ------------
[1216]514! Resorting data for the transposition from y to z. The transposition itself
515! is carried out in transpose_yz
516!------------------------------------------------------------------------------!
517
518     USE indices
519     USE transpose_indices
520
521     IMPLICIT NONE
522
523     REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
524     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
525
526
527     INTEGER ::  i, j, k
528
529!
530!-- Rearrange indices of input array in order to make data to be send
531!-- by MPI contiguous
532    !$OMP  PARALLEL PRIVATE ( i, j, k )
533    !$OMP  DO
534    !$acc kernels present( f_in, f_inv )
535     DO  j = 0, ny
536         DO  k = nzb_y, nzt_y
537             DO  i = nxl_y, nxr_y
538                 f_inv(i,k,j) = f_in(j,i,k)
539             ENDDO
540         ENDDO
541     ENDDO
542     !$acc end kernels
543     !$OMP  END PARALLEL
544
545 END SUBROUTINE resort_for_yz
546
547
548 SUBROUTINE transpose_yz( f_inv, f_out )
549
550!------------------------------------------------------------------------------!
551! Description:
552! ------------
[1]553! Transposition of input array (f_in) from y to z. For the input array, all
554! elements along y reside on the same PE, while after transposition, all
555! elements along z reside on the same PE.
556!------------------------------------------------------------------------------!
557
558    USE cpulog
559    USE indices
560    USE interfaces
561    USE pegrid
562    USE transpose_indices
563
564    IMPLICIT NONE
565
[1111]566    INTEGER ::  i, j, k, l, zs
[1]567   
[1216]568    REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
[1]569
[1111]570    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
571
572
[1]573!
574!-- If the PE grid is one-dimensional along y, only local reordering
575!-- of the data is necessary and no transposition has to be done.
576    IF ( pdims(1) == 1 )  THEN
[1106]577
[683]578!$OMP  PARALLEL PRIVATE ( i, j, k )
579!$OMP  DO
[1216]580       !$acc kernels present( f_inv, f_out )
[1003]581       DO  j = 0, ny
582          DO  k = nzb_y, nzt_y
583             DO  i = nxl_y, nxr_y
[164]584                f_out(i,j,k) = f_inv(i,k,j)
[1]585             ENDDO
586          ENDDO
587       ENDDO
[1111]588       !$acc end kernels
[683]589!$OMP  END PARALLEL
[1]590
[1106]591    ELSE
592
593#if defined( __parallel )
[1]594!
[1106]595!--    Transpose array
596       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
597       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]598       !$acc update host( f_inv )
599       CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
600                          work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]601                          comm1dx, ierr )
602       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]603
604!
[1106]605!--    Reorder transposed array
[1111]606!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
[683]607!$OMP  DO
[1216]608       !$acc data copyin( work )
[1106]609       DO  l = 0, pdims(1) - 1
610          zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1216]611          !$acc kernels present( f_out )
[1106]612          DO  j = nys_z, nyn_z
613             DO  k = zs, zs + nzt_y - nzb_y
614                DO  i = nxl_z, nxr_z
[1111]615                   f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]616                ENDDO
[1]617             ENDDO
618          ENDDO
[1111]619          !$acc end kernels
[1]620       ENDDO
[1216]621       !$acc end data
[683]622!$OMP  END PARALLEL
[1]623#endif
624
[1106]625   ENDIF
626
[1]627 END SUBROUTINE transpose_yz
628
629
[1216]630 SUBROUTINE resort_for_zx( f_in, f_inv )
[1]631
632!------------------------------------------------------------------------------!
633! Description:
634! ------------
[1216]635! Resorting data for the transposition from z to x. The transposition itself
636! is carried out in transpose_zx
637!------------------------------------------------------------------------------!
638
639     USE indices
640     USE transpose_indices
641
642     IMPLICIT NONE
643
644     REAL ::  f_in(1:nz,nys:nyn,nxl:nxr)
645     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
646
647
648     INTEGER ::  i, j, k
649
650!
651!-- Rearrange indices of input array in order to make data to be send
652!-- by MPI contiguous
653    !$OMP  PARALLEL PRIVATE ( i, j, k )
654    !$OMP  DO
655    !$acc kernels present( f_in, f_inv )
656     DO  k = 1,nz
657         DO  i = nxl, nxr
658             DO  j = nys, nyn
659                 f_inv(j,i,k) = f_in(k,j,i)
660             ENDDO
661         ENDDO
662     ENDDO
663     !$acc end kernels
664     !$OMP  END PARALLEL
665
666 END SUBROUTINE resort_for_zx
667
668
669 SUBROUTINE transpose_zx( f_inv, f_out )
670
671!------------------------------------------------------------------------------!
672! Description:
673! ------------
[1]674! Transposition of input array (f_in) from z to x. For the input array, all
675! elements along z reside on the same PE, while after transposition, all
676! elements along x reside on the same PE.
677!------------------------------------------------------------------------------!
678
679    USE cpulog
680    USE indices
681    USE interfaces
682    USE pegrid
683    USE transpose_indices
684
685    IMPLICIT NONE
686
[1111]687    INTEGER ::  i, j, k, l, xs
[1]688   
[1216]689    REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
[1]690
[1111]691    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
692
[1]693
694!
695!-- If the PE grid is one-dimensional along y, only local reordering
696!-- of the data is necessary and no transposition has to be done.
697    IF ( pdims(1) == 1 )  THEN
[1106]698
[683]699!$OMP  PARALLEL PRIVATE ( i, j, k )
700!$OMP  DO
[1216]701       !$acc kernels present( f_inv, f_out )
[1003]702       DO  k = 1, nz
703          DO  i = nxl, nxr
704             DO  j = nys, nyn
[164]705                f_out(i,j,k) = f_inv(j,i,k)
[1]706             ENDDO
707          ENDDO
708       ENDDO
[1111]709       !$acc end kernels
[683]710!$OMP  END PARALLEL
[1]711
[1106]712    ELSE
713
714#if defined( __parallel )
[1]715!
[1106]716!--    Transpose array
717       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
718       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]719       !$acc update host( f_inv )
720       CALL MPI_ALLTOALL( f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
721                          work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]722                          comm1dx, ierr )
723       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]724
725!
[1106]726!--    Reorder transposed array
[1111]727!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
[683]728!$OMP  DO
[1216]729       !$acc data copyin( work )
[1106]730       DO  l = 0, pdims(1) - 1
731          xs = 0 + l * nnx
[1216]732          !$acc kernels present( f_out )
[1106]733          DO  k = nzb_x, nzt_x
734             DO  i = xs, xs + nnx - 1
735                DO  j = nys_x, nyn_x
[1111]736                   f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]737                ENDDO
[1]738             ENDDO
739          ENDDO
[1111]740          !$acc end kernels
[1]741       ENDDO
[1216]742       !$acc end data
[683]743!$OMP  END PARALLEL
[1]744#endif
745
[1106]746    ENDIF
747
[1]748 END SUBROUTINE transpose_zx
749
750
[1216]751 SUBROUTINE resort_for_zy( f_inv, f_out )
[1]752
753!------------------------------------------------------------------------------!
754! Description:
755! ------------
[1216]756! Resorting data after the transposition from z to y. The transposition itself
757! is carried out in transpose_zy
758!------------------------------------------------------------------------------!
759
760     USE indices
761     USE transpose_indices
762
763     IMPLICIT NONE
764
765     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
766     REAL ::  f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
767
768
769     INTEGER ::  i, j, k
770
771!
772!-- Rearrange indices of input array in order to make data to be send
773!-- by MPI contiguous
774    !$OMP  PARALLEL PRIVATE ( i, j, k )
775    !$OMP  DO
776    !$acc kernels present( f_inv, f_out )
777     DO  k = nzb_y, nzt_y
778         DO  j = 0, ny
779             DO  i = nxl_y, nxr_y
780                 f_out(j,i,k) = f_inv(i,k,j)
781             ENDDO
782         ENDDO
783     ENDDO
784     !$acc end kernels
785     !$OMP  END PARALLEL
786
787 END SUBROUTINE resort_for_zy
788
789
790 SUBROUTINE transpose_zy( f_in, f_inv )
791
792!------------------------------------------------------------------------------!
793! Description:
794! ------------
[1]795! Transposition of input array (f_in) from z to y. For the input array, all
796! elements along z reside on the same PE, while after transposition, all
797! elements along y reside on the same PE.
798!------------------------------------------------------------------------------!
799
800    USE cpulog
801    USE indices
802    USE interfaces
803    USE pegrid
804    USE transpose_indices
805
806    IMPLICIT NONE
807
[1111]808    INTEGER ::  i, j, k, l, zs
[1]809   
[1216]810    REAL ::  f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
[1]811
[1111]812    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
813
814
[1]815!
816!-- If the PE grid is one-dimensional along y, the array has only to be
817!-- reordered locally and therefore no transposition has to be done.
818    IF ( pdims(1) /= 1 )  THEN
[1106]819
820#if defined( __parallel )
[1]821!
822!--    Reorder input array for transposition
[1111]823!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
[683]824!$OMP  DO
[1216]825       !$acc data copyout( work )
[1]826       DO  l = 0, pdims(1) - 1
[1003]827          zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1111]828          !$acc kernels present( f_in, work )
[1003]829          DO  j = nys_z, nyn_z
830             DO  k = zs, zs + nzt_y - nzb_y
831                DO  i = nxl_z, nxr_z
[1111]832                   work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]833                ENDDO
834             ENDDO
835          ENDDO
[1111]836          !$acc end kernels
[1]837       ENDDO
[1216]838       !$acc end data
[683]839!$OMP  END PARALLEL
[1]840
841!
842!--    Transpose array
843       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]844       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]845       CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
846                          f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
[1]847                          comm1dx, ierr )
[1111]848       !$acc update device( f_inv )
[1]849       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]850#endif
[1]851
852    ELSE
853!
[1106]854!--    Reorder the array in the same way like ALLTOALL did it
[683]855!$OMP  PARALLEL PRIVATE ( i, j, k )
856!$OMP  DO
[1216]857       !$acc kernels present( f_in, f_inv )
[1003]858       DO  k = nzb_y, nzt_y
859          DO  j = 0, ny
860             DO  i = nxl_y, nxr_y
[164]861                f_inv(i,k,j) = f_in(i,j,k)
862             ENDDO
863          ENDDO
864       ENDDO
[1111]865       !$acc end kernels
[683]866!$OMP  END PARALLEL
[1106]867
868    ENDIF
869
[1]870 END SUBROUTINE transpose_zy
871
872
[1216]873 SUBROUTINE transpose_zyd( f_in, f_out )
[1]874
875!------------------------------------------------------------------------------!
876! Description:
877! ------------
878! Transposition of input array (f_in) from z to y. For the input array, all
879! elements along z reside on the same PE, while after transposition, all
880! elements along y reside on the same PE.
881! This is a direct transposition for arrays with indices in regular order
882! (k,j,i) (cf. transpose_zy).
883!------------------------------------------------------------------------------!
884
885    USE cpulog
886    USE indices
887    USE interfaces
888    USE pegrid
889    USE transpose_indices
890
891    IMPLICIT NONE
892
893    INTEGER ::  i, j, k, l, m, ys
894   
[1003]895    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
896             f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd),                 &
[164]897             work(nnx*nny*nnz)
[1]898
899#if defined( __parallel )
900
901!
902!-- Rearrange indices of input array in order to make data to be send
903!-- by MPI contiguous
[1003]904    DO  i = nxl, nxr
905       DO  j = nys, nyn
906          DO  k = 1, nz
[164]907             f_inv(j,i,k) = f_in(k,j,i)
[1]908          ENDDO
909       ENDDO
910    ENDDO
911
912!
913!-- Move data to different array, because memory location of work1 is
914!-- needed further below (work1 = work2).
915!-- If the PE grid is one-dimensional along x, only local reordering
916!-- of the data is necessary and no transposition has to be done.
917    IF ( pdims(2) == 1 )  THEN
[1003]918       DO  k = 1, nz
919          DO  i = nxl, nxr
920             DO  j = nys, nyn
[164]921                f_out(j,i,k) = f_inv(j,i,k)
[1]922             ENDDO
923          ENDDO
924       ENDDO
925       RETURN
926    ENDIF
927
928!
929!-- Transpose array
930    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]931    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1]932    CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]933                       work(1),          sendrecvcount_zyd, MPI_REAL, &
[1]934                       comm1dy, ierr )
935    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
936
937!
938!-- Reorder transposed array
939    m = 0
940    DO  l = 0, pdims(2) - 1
941       ys = 0 + l * nny
[1003]942       DO  k = nzb_yd, nzt_yd
943          DO  i = nxl_yd, nxr_yd
[1]944             DO  j = ys, ys + nny - 1
945                m = m + 1
[164]946                f_out(j,i,k) = work(m)
[1]947             ENDDO
948          ENDDO
949       ENDDO
950    ENDDO
951
952#endif
953
954 END SUBROUTINE transpose_zyd
Note: See TracBrowser for help on using the repository browser.