source: palm/trunk/SOURCE/transpose.f90 @ 1319

Last change on this file since 1319 was 1319, checked in by raasch, 10 years ago

last commit documented

  • Property svn:keywords set to Id
File size: 26.1 KB
RevLine 
[1216]1 SUBROUTINE resort_for_xy( f_in, f_inv )
[1]2
[1036]3!--------------------------------------------------------------------------------!
4! This file is part of PALM.
5!
6! PALM is free software: you can redistribute it and/or modify it under the terms
7! of the GNU General Public License as published by the Free Software Foundation,
8! either version 3 of the License, or (at your option) any later version.
9!
10! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12! A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13!
14! You should have received a copy of the GNU General Public License along with
15! PALM. If not, see <http://www.gnu.org/licenses/>.
16!
[1310]17! Copyright 1997-2014 Leibniz Universitaet Hannover
[1036]18!--------------------------------------------------------------------------------!
19!
[484]20! Current revisions:
[1]21! -----------------
[198]22!
[1319]23!
[198]24! Former revisions:
25! -----------------
26! $Id: transpose.f90 1319 2014-03-17 15:08:44Z raasch $
27!
[1319]28! 1318 2014-03-17 13:35:16Z raasch
29! cpu_log_nowait parameter added to cpu measurements of the transpositions
30! required for solving the Poisson equation (poisfft),
31! module interfaces removed
32!
[1258]33! 1257 2013-11-08 15:18:40Z raasch
34! openacc loop and loop vector clauses removed
35!
[1217]36! 1216 2013-08-26 09:31:42Z raasch
37! re-sorting of the transposed / to be transposed arrays moved to separate
38! routines resort_for_...
39!
[1112]40! 1111 2013-03-08 23:54:10Z raasch
41! openACC directives added,
42! resorting data from/to work changed, work got 4 dimensions instead of 1
43!
[1107]44! 1106 2013-03-04 05:31:38Z raasch
45! preprocessor lines rearranged so that routines can also be used in serial
46! (non-parallel) mode
47!
[1093]48! 1092 2013-02-02 11:24:22Z raasch
49! unused variables removed
50!
[1037]51! 1036 2012-10-22 13:43:42Z raasch
52! code put under GPL (PALM 3.9)
53!
[1004]54! 1003 2012-09-14 14:35:53Z raasch
55! indices nxa, nya, etc. replaced by nx, ny, etc.
56!
[684]57! 683 2011-02-09 14:25:15Z raasch
58! openMP parallelization of transpositions for 2d-domain-decomposition
59!
[623]60! 622 2010-12-10 08:08:13Z raasch
61! optional barriers included in order to speed up collective operations
62!
[198]63! 164 2008-05-15 08:46:15Z raasch
[164]64! f_inv changed from subroutine argument to automatic array in order to do
65! re-ordering from f_in to f_inv in one step, one array work is needed instead
66! of work1 and work2
[1]67!
[198]68! February 2007
[3]69! RCS Log replace by Id keyword, revision history cleaned up
70!
[1]71! Revision 1.2  2004/04/30 13:12:17  raasch
72! Switched from mpi_alltoallv to the simpler mpi_alltoall,
73! all former transpose-routine files collected in this file, enlarged
74! transposition arrays introduced
75!
76! Revision 1.1  2004/04/30 13:08:16  raasch
77! Initial revision (collection of former routines transpose_xy, transpose_xz,
78!                   transpose_yx, transpose_yz, transpose_zx, transpose_zy)
79!
80! Revision 1.1  1997/07/24 11:25:18  raasch
81! Initial revision
82!
[1216]83!------------------------------------------------------------------------------!
84! Description:
85! ------------
86! Resorting data for the transposition from x to y. The transposition itself
87! is carried out in transpose_xy
88!------------------------------------------------------------------------------!
89
90     USE indices
91     USE transpose_indices
92
93     IMPLICIT NONE
94
95     REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
96     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
97
98
99     INTEGER ::  i, j, k
100
[1]101!
[1216]102!-- Rearrange indices of input array in order to make data to be send
103!-- by MPI contiguous
104    !$OMP  PARALLEL PRIVATE ( i, j, k )
105    !$OMP  DO
106    !$acc kernels present( f_in, f_inv )
107     DO  i = 0, nx
108         DO  k = nzb_x, nzt_x
109             DO  j = nys_x, nyn_x
110                 f_inv(j,k,i) = f_in(i,j,k)
111             ENDDO
112         ENDDO
113     ENDDO
114     !$acc end kernels
115     !$OMP  END PARALLEL
116
117 END SUBROUTINE resort_for_xy
118
119
120 SUBROUTINE transpose_xy( f_inv, f_out )
121
122!------------------------------------------------------------------------------!
[1]123! Description:
124! ------------
125! Transposition of input array (f_in) from x to y. For the input array, all
126! elements along x reside on the same PE, while after transposition, all
127! elements along y reside on the same PE.
128!------------------------------------------------------------------------------!
129
130    USE cpulog
131    USE indices
132    USE pegrid
133    USE transpose_indices
134
135    IMPLICIT NONE
136
[1111]137    INTEGER ::  i, j, k, l, ys
[1]138   
[1216]139    REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
[1]140
[1111]141    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
142
143
[1106]144    IF ( numprocs /= 1 )  THEN
145
146#if defined( __parallel )
[1]147!
[1106]148!--    Transpose array
[1318]149       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]150       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]151       !$acc update host( f_inv )
152       CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
153                          work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]154                          comm1dy, ierr )
155       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]156
157!
[1106]158!--    Reorder transposed array
[1111]159!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
[683]160!$OMP  DO
[1216]161       !$acc data copyin( work )
[1106]162       DO  l = 0, pdims(2) - 1
163          ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]164          !$acc kernels present( f_out, work )
[1106]165          DO  i = nxl_y, nxr_y
166             DO  k = nzb_y, nzt_y
167                DO  j = ys, ys + nyn_x - nys_x
[1111]168                   f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]169                ENDDO
[1]170             ENDDO
171          ENDDO
[1111]172          !$acc end kernels
[1]173       ENDDO
[1216]174       !$acc end data
[683]175!$OMP  END PARALLEL
[1]176#endif
177
[1106]178    ELSE
179
180!
181!--    Reorder transposed array
182!$OMP  PARALLEL PRIVATE ( i, j, k )
183!$OMP  DO
[1216]184       !$acc kernels present( f_inv, f_out )
[1106]185       DO  k = nzb_y, nzt_y
186          DO  i = nxl_y, nxr_y
187             DO  j = 0, ny
188                f_out(j,i,k) = f_inv(j,k,i)
189             ENDDO
190          ENDDO
191       ENDDO
[1111]192       !$acc end kernels
[1106]193!$OMP  END PARALLEL
194
195    ENDIF
196
[1]197 END SUBROUTINE transpose_xy
198
199
[1216]200 SUBROUTINE resort_for_xz( f_inv, f_out )
[1]201
202!------------------------------------------------------------------------------!
203! Description:
204! ------------
[1216]205! Resorting data after the transposition from x to z. The transposition itself
206! is carried out in transpose_xz
207!------------------------------------------------------------------------------!
208
209     USE indices
210     USE transpose_indices
211
212     IMPLICIT NONE
213
214     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
215     REAL ::  f_out(1:nz,nys:nyn,nxl:nxr)
216
217
218     INTEGER ::  i, j, k
219
220!
221!-- Rearrange indices of input array in order to make data to be send
222!-- by MPI contiguous.
223!-- In case of parallel fft/transposition, scattered store is faster in
224!-- backward direction!!!
225    !$OMP  PARALLEL PRIVATE ( i, j, k )
226    !$OMP  DO
227    !$acc kernels present( f_inv, f_out )
228     DO  k = 1, nz
229         DO  i = nxl, nxr
230             DO  j = nys, nyn
231                 f_out(k,j,i) = f_inv(j,i,k)
232             ENDDO
233         ENDDO
234     ENDDO
235     !$acc end kernels
236     !$OMP  END PARALLEL
237
238 END SUBROUTINE resort_for_xz
239
240
241 SUBROUTINE transpose_xz( f_in, f_inv )
242
243!------------------------------------------------------------------------------!
244! Description:
245! ------------
[1]246! Transposition of input array (f_in) from x to z. For the input array, all
247! elements along x reside on the same PE, while after transposition, all
248! elements along z reside on the same PE.
249!------------------------------------------------------------------------------!
250
251    USE cpulog
252    USE indices
253    USE pegrid
254    USE transpose_indices
255
256    IMPLICIT NONE
257
[1111]258    INTEGER ::  i, j, k, l, xs
[1]259   
[1216]260    REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_inv(nys:nyn,nxl:nxr,1:nz)
[1]261
[1111]262    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
[1]263
[1111]264
[1]265!
266!-- If the PE grid is one-dimensional along y, the array has only to be
267!-- reordered locally and therefore no transposition has to be done.
268    IF ( pdims(1) /= 1 )  THEN
[1106]269
270#if defined( __parallel )
[1]271!
272!--    Reorder input array for transposition
[1111]273!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
[683]274!$OMP  DO
[1216]275       !$acc data copyout( work )
[1]276       DO  l = 0, pdims(1) - 1
277          xs = 0 + l * nnx
[1111]278          !$acc kernels present( f_in, work )
[1003]279          DO  k = nzb_x, nzt_x
[164]280             DO  i = xs, xs + nnx - 1
[1003]281                DO  j = nys_x, nyn_x
[1111]282                   work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]283                ENDDO
284             ENDDO
285          ENDDO
[1111]286          !$acc end kernels
[1]287       ENDDO
[1216]288       !$acc end data
[683]289!$OMP  END PARALLEL
[1]290
291!
292!--    Transpose array
[1318]293       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[622]294       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]295       CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
296                          f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
[1]297                          comm1dx, ierr )
[1111]298       !$acc update device( f_inv )
[1]299       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]300#endif
301
[1]302    ELSE
[1106]303
[1]304!
305!--    Reorder the array in a way that the z index is in first position
[683]306!$OMP  PARALLEL PRIVATE ( i, j, k )
307!$OMP  DO
[1216]308       !$acc kernels present( f_in, f_inv )
[1003]309       DO  i = nxl, nxr
310          DO  j = nys, nyn
311             DO  k = 1, nz
[164]312                f_inv(j,i,k) = f_in(i,j,k)
[1]313             ENDDO
314          ENDDO
315       ENDDO
[1111]316       !$acc end kernels
[683]317!$OMP  END PARALLEL
[1]318
[164]319    ENDIF
320
[1]321 END SUBROUTINE transpose_xz
322
323
[1216]324 SUBROUTINE resort_for_yx( f_inv, f_out )
[1]325
326!------------------------------------------------------------------------------!
327! Description:
328! ------------
[1216]329! Resorting data after the transposition from y to x. The transposition itself
330! is carried out in transpose_yx
331!------------------------------------------------------------------------------!
332
333     USE indices
334     USE transpose_indices
335
336     IMPLICIT NONE
337
338     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
339     REAL ::  f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
340
341
342     INTEGER ::  i, j, k
343
344!
345!-- Rearrange indices of input array in order to make data to be send
346!-- by MPI contiguous
347    !$OMP  PARALLEL PRIVATE ( i, j, k )
348    !$OMP  DO
349    !$acc kernels present( f_inv, f_out )
350     DO  i = 0, nx
351         DO  k = nzb_x, nzt_x
352             DO  j = nys_x, nyn_x
353                 f_out(i,j,k) = f_inv(j,k,i)
354             ENDDO
355         ENDDO
356     ENDDO
357     !$acc end kernels
358     !$OMP  END PARALLEL
359
360 END SUBROUTINE resort_for_yx
361
362
363 SUBROUTINE transpose_yx( f_in, f_inv )
364
365!------------------------------------------------------------------------------!
366! Description:
367! ------------
[1]368! Transposition of input array (f_in) from y to x. For the input array, all
369! elements along y reside on the same PE, while after transposition, all
370! elements along x reside on the same PE.
371!------------------------------------------------------------------------------!
372
373    USE cpulog
374    USE indices
375    USE pegrid
376    USE transpose_indices
377
378    IMPLICIT NONE
379
[1111]380    INTEGER ::  i, j, k, l, ys
[1]381   
[1216]382    REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
[1]383
[1111]384    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
385
386
[1106]387    IF ( numprocs /= 1 )  THEN
388
[1]389#if defined( __parallel )
390!
[1106]391!--    Reorder input array for transposition
[1111]392!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
[683]393!$OMP  DO
[1216]394       !$acc data copyout( work )
[1106]395       DO  l = 0, pdims(2) - 1
396          ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]397          !$acc kernels present( f_in, work )
[1106]398          DO  i = nxl_y, nxr_y
399             DO  k = nzb_y, nzt_y
400                DO  j = ys, ys + nyn_x - nys_x
[1111]401                   work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]402                ENDDO
403             ENDDO
404          ENDDO
[1111]405          !$acc end kernels
[1106]406       ENDDO
[1216]407       !$acc end data
[1106]408!$OMP  END PARALLEL
409
410!
411!--    Transpose array
[1318]412       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]413       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]414       CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
415                          f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
[1106]416                          comm1dy, ierr )
[1111]417       !$acc update device( f_inv )
[1106]418       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
419#endif
420
421    ELSE
422
423!
424!--    Reorder array f_in the same way as ALLTOALL did it
425!$OMP  PARALLEL PRIVATE ( i, j, k )
426!$OMP  DO
[1216]427       !$acc kernels present( f_in, f_inv )
[1003]428       DO  i = nxl_y, nxr_y
429          DO  k = nzb_y, nzt_y
[1106]430             DO  j = 0, ny
431                f_inv(j,k,i) = f_in(j,i,k)
[1]432             ENDDO
433          ENDDO
434       ENDDO
[1111]435       !$acc end kernels
[683]436!$OMP  END PARALLEL
[1]437
[1106]438    ENDIF
[1]439
440 END SUBROUTINE transpose_yx
441
442
[1216]443 SUBROUTINE transpose_yxd( f_in, f_out )
[1]444
445!------------------------------------------------------------------------------!
446! Description:
447! ------------
448! Transposition of input array (f_in) from y to x. For the input array, all
449! elements along y reside on the same PE, while after transposition, all
450! elements along x reside on the same PE.
451! This is a direct transposition for arrays with indices in regular order
452! (k,j,i) (cf. transpose_yx).
453!------------------------------------------------------------------------------!
454
455    USE cpulog
456    USE indices
457    USE pegrid
458    USE transpose_indices
459
460    IMPLICIT NONE
461
[1092]462    INTEGER ::  i, j, k, l, m, xs
[1]463
[1003]464    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
465             f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x),                     &
[164]466             work(nnx*nny*nnz)
[1]467
468#if defined( __parallel )
469
470!
471!-- Rearrange indices of input array in order to make data to be send
472!-- by MPI contiguous
[1003]473    DO  k = 1, nz
474       DO  j = nys, nyn
475          DO  i = nxl, nxr
[164]476             f_inv(i,k,j) = f_in(k,j,i)
[1]477          ENDDO
478       ENDDO
479    ENDDO
480
481!
482!-- Transpose array
483    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]484    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1]485    CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]486                       work(1),          sendrecvcount_xy, MPI_REAL, &
[1]487                       comm1dx, ierr )
488    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
489
490!
491!-- Reorder transposed array
492    m = 0
493    DO  l = 0, pdims(1) - 1
494       xs = 0 + l * nnx
[1003]495       DO  j = nys_x, nyn_x
496          DO  k = 1, nz
[1]497             DO  i = xs, xs + nnx - 1
498                m = m + 1
[164]499                f_out(i,j,k) = work(m)
[1]500             ENDDO
501          ENDDO
502       ENDDO
503    ENDDO
504
505#endif
506
507 END SUBROUTINE transpose_yxd
508
509
[1216]510 SUBROUTINE resort_for_yz( f_in, f_inv )
[1]511
512!------------------------------------------------------------------------------!
513! Description:
514! ------------
[1216]515! Resorting data for the transposition from y to z. The transposition itself
516! is carried out in transpose_yz
517!------------------------------------------------------------------------------!
518
519     USE indices
520     USE transpose_indices
521
522     IMPLICIT NONE
523
524     REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
525     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
526
527
528     INTEGER ::  i, j, k
529
530!
531!-- Rearrange indices of input array in order to make data to be send
532!-- by MPI contiguous
533    !$OMP  PARALLEL PRIVATE ( i, j, k )
534    !$OMP  DO
535    !$acc kernels present( f_in, f_inv )
536     DO  j = 0, ny
537         DO  k = nzb_y, nzt_y
538             DO  i = nxl_y, nxr_y
539                 f_inv(i,k,j) = f_in(j,i,k)
540             ENDDO
541         ENDDO
542     ENDDO
543     !$acc end kernels
544     !$OMP  END PARALLEL
545
546 END SUBROUTINE resort_for_yz
547
548
549 SUBROUTINE transpose_yz( f_inv, f_out )
550
551!------------------------------------------------------------------------------!
552! Description:
553! ------------
[1]554! Transposition of input array (f_in) from y to z. For the input array, all
555! elements along y reside on the same PE, while after transposition, all
556! elements along z reside on the same PE.
557!------------------------------------------------------------------------------!
558
559    USE cpulog
560    USE indices
561    USE pegrid
562    USE transpose_indices
563
564    IMPLICIT NONE
565
[1111]566    INTEGER ::  i, j, k, l, zs
[1]567   
[1216]568    REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
[1]569
[1111]570    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
571
572
[1]573!
574!-- If the PE grid is one-dimensional along y, only local reordering
575!-- of the data is necessary and no transposition has to be done.
576    IF ( pdims(1) == 1 )  THEN
[1106]577
[683]578!$OMP  PARALLEL PRIVATE ( i, j, k )
579!$OMP  DO
[1216]580       !$acc kernels present( f_inv, f_out )
[1003]581       DO  j = 0, ny
582          DO  k = nzb_y, nzt_y
583             DO  i = nxl_y, nxr_y
[164]584                f_out(i,j,k) = f_inv(i,k,j)
[1]585             ENDDO
586          ENDDO
587       ENDDO
[1111]588       !$acc end kernels
[683]589!$OMP  END PARALLEL
[1]590
[1106]591    ELSE
592
593#if defined( __parallel )
[1]594!
[1106]595!--    Transpose array
[1318]596       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]597       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]598       !$acc update host( f_inv )
599       CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
600                          work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]601                          comm1dx, ierr )
602       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]603
604!
[1106]605!--    Reorder transposed array
[1111]606!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
[683]607!$OMP  DO
[1216]608       !$acc data copyin( work )
[1106]609       DO  l = 0, pdims(1) - 1
610          zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1216]611          !$acc kernels present( f_out )
[1106]612          DO  j = nys_z, nyn_z
613             DO  k = zs, zs + nzt_y - nzb_y
614                DO  i = nxl_z, nxr_z
[1111]615                   f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]616                ENDDO
[1]617             ENDDO
618          ENDDO
[1111]619          !$acc end kernels
[1]620       ENDDO
[1216]621       !$acc end data
[683]622!$OMP  END PARALLEL
[1]623#endif
624
[1106]625   ENDIF
626
[1]627 END SUBROUTINE transpose_yz
628
629
[1216]630 SUBROUTINE resort_for_zx( f_in, f_inv )
[1]631
632!------------------------------------------------------------------------------!
633! Description:
634! ------------
[1216]635! Resorting data for the transposition from z to x. The transposition itself
636! is carried out in transpose_zx
637!------------------------------------------------------------------------------!
638
639     USE indices
640     USE transpose_indices
641
642     IMPLICIT NONE
643
644     REAL ::  f_in(1:nz,nys:nyn,nxl:nxr)
645     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
646
647
648     INTEGER ::  i, j, k
649
650!
651!-- Rearrange indices of input array in order to make data to be send
652!-- by MPI contiguous
653    !$OMP  PARALLEL PRIVATE ( i, j, k )
654    !$OMP  DO
655    !$acc kernels present( f_in, f_inv )
656     DO  k = 1,nz
657         DO  i = nxl, nxr
658             DO  j = nys, nyn
659                 f_inv(j,i,k) = f_in(k,j,i)
660             ENDDO
661         ENDDO
662     ENDDO
663     !$acc end kernels
664     !$OMP  END PARALLEL
665
666 END SUBROUTINE resort_for_zx
667
668
669 SUBROUTINE transpose_zx( f_inv, f_out )
670
671!------------------------------------------------------------------------------!
672! Description:
673! ------------
[1]674! Transposition of input array (f_in) from z to x. For the input array, all
675! elements along z reside on the same PE, while after transposition, all
676! elements along x reside on the same PE.
677!------------------------------------------------------------------------------!
678
679    USE cpulog
680    USE indices
681    USE pegrid
682    USE transpose_indices
683
684    IMPLICIT NONE
685
[1111]686    INTEGER ::  i, j, k, l, xs
[1]687   
[1216]688    REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
[1]689
[1111]690    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
691
[1]692
693!
694!-- If the PE grid is one-dimensional along y, only local reordering
695!-- of the data is necessary and no transposition has to be done.
696    IF ( pdims(1) == 1 )  THEN
[1106]697
[683]698!$OMP  PARALLEL PRIVATE ( i, j, k )
699!$OMP  DO
[1216]700       !$acc kernels present( f_inv, f_out )
[1003]701       DO  k = 1, nz
702          DO  i = nxl, nxr
703             DO  j = nys, nyn
[164]704                f_out(i,j,k) = f_inv(j,i,k)
[1]705             ENDDO
706          ENDDO
707       ENDDO
[1111]708       !$acc end kernels
[683]709!$OMP  END PARALLEL
[1]710
[1106]711    ELSE
712
713#if defined( __parallel )
[1]714!
[1106]715!--    Transpose array
[1318]716       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]717       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]718       !$acc update host( f_inv )
719       CALL MPI_ALLTOALL( f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
720                          work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]721                          comm1dx, ierr )
722       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]723
724!
[1106]725!--    Reorder transposed array
[1111]726!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
[683]727!$OMP  DO
[1216]728       !$acc data copyin( work )
[1106]729       DO  l = 0, pdims(1) - 1
730          xs = 0 + l * nnx
[1216]731          !$acc kernels present( f_out )
[1106]732          DO  k = nzb_x, nzt_x
733             DO  i = xs, xs + nnx - 1
734                DO  j = nys_x, nyn_x
[1111]735                   f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]736                ENDDO
[1]737             ENDDO
738          ENDDO
[1111]739          !$acc end kernels
[1]740       ENDDO
[1216]741       !$acc end data
[683]742!$OMP  END PARALLEL
[1]743#endif
744
[1106]745    ENDIF
746
[1]747 END SUBROUTINE transpose_zx
748
749
[1216]750 SUBROUTINE resort_for_zy( f_inv, f_out )
[1]751
752!------------------------------------------------------------------------------!
753! Description:
754! ------------
[1216]755! Resorting data after the transposition from z to y. The transposition itself
756! is carried out in transpose_zy
757!------------------------------------------------------------------------------!
758
759     USE indices
760     USE transpose_indices
761
762     IMPLICIT NONE
763
764     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
765     REAL ::  f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
766
767
768     INTEGER ::  i, j, k
769
770!
771!-- Rearrange indices of input array in order to make data to be send
772!-- by MPI contiguous
773    !$OMP  PARALLEL PRIVATE ( i, j, k )
774    !$OMP  DO
775    !$acc kernels present( f_inv, f_out )
776     DO  k = nzb_y, nzt_y
777         DO  j = 0, ny
778             DO  i = nxl_y, nxr_y
779                 f_out(j,i,k) = f_inv(i,k,j)
780             ENDDO
781         ENDDO
782     ENDDO
783     !$acc end kernels
784     !$OMP  END PARALLEL
785
786 END SUBROUTINE resort_for_zy
787
788
789 SUBROUTINE transpose_zy( f_in, f_inv )
790
791!------------------------------------------------------------------------------!
792! Description:
793! ------------
[1]794! Transposition of input array (f_in) from z to y. For the input array, all
795! elements along z reside on the same PE, while after transposition, all
796! elements along y reside on the same PE.
797!------------------------------------------------------------------------------!
798
799    USE cpulog
800    USE indices
801    USE pegrid
802    USE transpose_indices
803
804    IMPLICIT NONE
805
[1111]806    INTEGER ::  i, j, k, l, zs
[1]807   
[1216]808    REAL ::  f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
[1]809
[1111]810    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
811
812
[1]813!
814!-- If the PE grid is one-dimensional along y, the array has only to be
815!-- reordered locally and therefore no transposition has to be done.
816    IF ( pdims(1) /= 1 )  THEN
[1106]817
818#if defined( __parallel )
[1]819!
820!--    Reorder input array for transposition
[1111]821!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
[683]822!$OMP  DO
[1216]823       !$acc data copyout( work )
[1]824       DO  l = 0, pdims(1) - 1
[1003]825          zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1111]826          !$acc kernels present( f_in, work )
[1003]827          DO  j = nys_z, nyn_z
828             DO  k = zs, zs + nzt_y - nzb_y
829                DO  i = nxl_z, nxr_z
[1111]830                   work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]831                ENDDO
832             ENDDO
833          ENDDO
[1111]834          !$acc end kernels
[1]835       ENDDO
[1216]836       !$acc end data
[683]837!$OMP  END PARALLEL
[1]838
839!
840!--    Transpose array
[1318]841       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[622]842       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]843       CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
844                          f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
[1]845                          comm1dx, ierr )
[1111]846       !$acc update device( f_inv )
[1]847       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]848#endif
[1]849
850    ELSE
851!
[1106]852!--    Reorder the array in the same way like ALLTOALL did it
[683]853!$OMP  PARALLEL PRIVATE ( i, j, k )
854!$OMP  DO
[1216]855       !$acc kernels present( f_in, f_inv )
[1003]856       DO  k = nzb_y, nzt_y
857          DO  j = 0, ny
858             DO  i = nxl_y, nxr_y
[164]859                f_inv(i,k,j) = f_in(i,j,k)
860             ENDDO
861          ENDDO
862       ENDDO
[1111]863       !$acc end kernels
[683]864!$OMP  END PARALLEL
[1106]865
866    ENDIF
867
[1]868 END SUBROUTINE transpose_zy
869
870
[1216]871 SUBROUTINE transpose_zyd( f_in, f_out )
[1]872
873!------------------------------------------------------------------------------!
874! Description:
875! ------------
876! Transposition of input array (f_in) from z to y. For the input array, all
877! elements along z reside on the same PE, while after transposition, all
878! elements along y reside on the same PE.
879! This is a direct transposition for arrays with indices in regular order
880! (k,j,i) (cf. transpose_zy).
881!------------------------------------------------------------------------------!
882
883    USE cpulog
884    USE indices
885    USE pegrid
886    USE transpose_indices
887
888    IMPLICIT NONE
889
890    INTEGER ::  i, j, k, l, m, ys
891   
[1003]892    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
893             f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd),                 &
[164]894             work(nnx*nny*nnz)
[1]895
896#if defined( __parallel )
897
898!
899!-- Rearrange indices of input array in order to make data to be send
900!-- by MPI contiguous
[1003]901    DO  i = nxl, nxr
902       DO  j = nys, nyn
903          DO  k = 1, nz
[164]904             f_inv(j,i,k) = f_in(k,j,i)
[1]905          ENDDO
906       ENDDO
907    ENDDO
908
909!
910!-- Move data to different array, because memory location of work1 is
911!-- needed further below (work1 = work2).
912!-- If the PE grid is one-dimensional along x, only local reordering
913!-- of the data is necessary and no transposition has to be done.
914    IF ( pdims(2) == 1 )  THEN
[1003]915       DO  k = 1, nz
916          DO  i = nxl, nxr
917             DO  j = nys, nyn
[164]918                f_out(j,i,k) = f_inv(j,i,k)
[1]919             ENDDO
920          ENDDO
921       ENDDO
922       RETURN
923    ENDIF
924
925!
926!-- Transpose array
927    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]928    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1]929    CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]930                       work(1),          sendrecvcount_zyd, MPI_REAL, &
[1]931                       comm1dy, ierr )
932    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
933
934!
935!-- Reorder transposed array
936    m = 0
937    DO  l = 0, pdims(2) - 1
938       ys = 0 + l * nny
[1003]939       DO  k = nzb_yd, nzt_yd
940          DO  i = nxl_yd, nxr_yd
[1]941             DO  j = ys, ys + nny - 1
942                m = m + 1
[164]943                f_out(j,i,k) = work(m)
[1]944             ENDDO
945          ENDDO
946       ENDDO
947    ENDDO
948
949#endif
950
951 END SUBROUTINE transpose_zyd
Note: See TracBrowser for help on using the repository browser.