source: palm/trunk/SOURCE/transpose.f90 @ 1216

Last change on this file since 1216 was 1216, checked in by raasch, 8 years ago

overlapping execution of fft and transpositions (MPI_ALLTOALL), but real overlapping is not activated so far,
fftw implemented for 1D-decomposition
resorting of arrays moved to separate routines resort_for_...
bugfix in mbuild concerning Makefile_check

  • Property svn:keywords set to Id
File size: 26.8 KB
RevLine 
[1216]1 SUBROUTINE resort_for_xy( f_in, f_inv )
[1]2
[1036]3!--------------------------------------------------------------------------------!
4! This file is part of PALM.
5!
6! PALM is free software: you can redistribute it and/or modify it under the terms
7! of the GNU General Public License as published by the Free Software Foundation,
8! either version 3 of the License, or (at your option) any later version.
9!
10! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12! A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13!
14! You should have received a copy of the GNU General Public License along with
15! PALM. If not, see <http://www.gnu.org/licenses/>.
16!
17! Copyright 1997-2012  Leibniz University Hannover
18!--------------------------------------------------------------------------------!
19!
[484]20! Current revisions:
[1]21! -----------------
[1216]22! re-sorting of the transposed / to be transposed arrays moved to separate
23! routines resort_for_...
[198]24!
25! Former revisions:
26! -----------------
27! $Id: transpose.f90 1216 2013-08-26 09:31:42Z raasch $
28!
[1112]29! 1111 2013-03-08 23:54:10Z raasch
30! openACC directives added,
31! resorting data from/to work changed, work got 4 dimensions instead of 1
32!
[1107]33! 1106 2013-03-04 05:31:38Z raasch
34! preprocessor lines rearranged so that routines can also be used in serial
35! (non-parallel) mode
36!
[1093]37! 1092 2013-02-02 11:24:22Z raasch
38! unused variables removed
39!
[1037]40! 1036 2012-10-22 13:43:42Z raasch
41! code put under GPL (PALM 3.9)
42!
[1004]43! 1003 2012-09-14 14:35:53Z raasch
44! indices nxa, nya, etc. replaced by nx, ny, etc.
45!
[684]46! 683 2011-02-09 14:25:15Z raasch
47! openMP parallelization of transpositions for 2d-domain-decomposition
48!
[623]49! 622 2010-12-10 08:08:13Z raasch
50! optional barriers included in order to speed up collective operations
51!
[198]52! 164 2008-05-15 08:46:15Z raasch
[164]53! f_inv changed from subroutine argument to automatic array in order to do
54! re-ordering from f_in to f_inv in one step, one array work is needed instead
55! of work1 and work2
[1]56!
[198]57! February 2007
[3]58! RCS Log replace by Id keyword, revision history cleaned up
59!
[1]60! Revision 1.2  2004/04/30 13:12:17  raasch
61! Switched from mpi_alltoallv to the simpler mpi_alltoall,
62! all former transpose-routine files collected in this file, enlarged
63! transposition arrays introduced
64!
65! Revision 1.1  2004/04/30 13:08:16  raasch
66! Initial revision (collection of former routines transpose_xy, transpose_xz,
67!                   transpose_yx, transpose_yz, transpose_zx, transpose_zy)
68!
69! Revision 1.1  1997/07/24 11:25:18  raasch
70! Initial revision
71!
[1216]72!------------------------------------------------------------------------------!
73! Description:
74! ------------
75! Resorting data for the transposition from x to y. The transposition itself
76! is carried out in transpose_xy
77!------------------------------------------------------------------------------!
78
79     USE indices
80     USE transpose_indices
81
82     IMPLICIT NONE
83
84     REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
85     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
86
87
88     INTEGER ::  i, j, k
89
[1]90!
[1216]91!-- Rearrange indices of input array in order to make data to be send
92!-- by MPI contiguous
93    !$OMP  PARALLEL PRIVATE ( i, j, k )
94    !$OMP  DO
95    !$acc kernels present( f_in, f_inv )
96    !$acc loop
97     DO  i = 0, nx
98         DO  k = nzb_x, nzt_x
99             !$acc loop vector( 32 )
100             DO  j = nys_x, nyn_x
101                 f_inv(j,k,i) = f_in(i,j,k)
102             ENDDO
103         ENDDO
104     ENDDO
105     !$acc end kernels
106     !$OMP  END PARALLEL
107
108 END SUBROUTINE resort_for_xy
109
110
111 SUBROUTINE transpose_xy( f_inv, f_out )
112
113!------------------------------------------------------------------------------!
[1]114! Description:
115! ------------
116! Transposition of input array (f_in) from x to y. For the input array, all
117! elements along x reside on the same PE, while after transposition, all
118! elements along y reside on the same PE.
119!------------------------------------------------------------------------------!
120
121    USE cpulog
122    USE indices
123    USE interfaces
124    USE pegrid
125    USE transpose_indices
126
127    IMPLICIT NONE
128
[1111]129    INTEGER ::  i, j, k, l, ys
[1]130   
[1216]131    REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
[1]132
[1111]133    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
134
135
[1106]136    IF ( numprocs /= 1 )  THEN
137
138#if defined( __parallel )
[1]139!
[1106]140!--    Transpose array
141       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
142       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]143       !$acc update host( f_inv )
144       CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
145                          work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]146                          comm1dy, ierr )
147       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]148
149!
[1106]150!--    Reorder transposed array
[1111]151!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
[683]152!$OMP  DO
[1216]153       !$acc data copyin( work )
[1106]154       DO  l = 0, pdims(2) - 1
155          ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]156          !$acc kernels present( f_out, work )
157          !$acc loop
[1106]158          DO  i = nxl_y, nxr_y
159             DO  k = nzb_y, nzt_y
[1111]160                !$acc loop vector( 32 )
[1106]161                DO  j = ys, ys + nyn_x - nys_x
[1111]162                   f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]163                ENDDO
[1]164             ENDDO
165          ENDDO
[1111]166          !$acc end kernels
[1]167       ENDDO
[1216]168       !$acc end data
[683]169!$OMP  END PARALLEL
[1]170#endif
171
[1106]172    ELSE
173
174!
175!--    Reorder transposed array
176!$OMP  PARALLEL PRIVATE ( i, j, k )
177!$OMP  DO
[1216]178       !$acc kernels present( f_inv, f_out )
[1111]179       !$acc loop
[1106]180       DO  k = nzb_y, nzt_y
181          DO  i = nxl_y, nxr_y
[1111]182             !$acc loop vector( 32 )
[1106]183             DO  j = 0, ny
184                f_out(j,i,k) = f_inv(j,k,i)
185             ENDDO
186          ENDDO
187       ENDDO
[1111]188       !$acc end kernels
[1106]189!$OMP  END PARALLEL
190
191    ENDIF
192
[1]193 END SUBROUTINE transpose_xy
194
195
[1216]196 SUBROUTINE resort_for_xz( f_inv, f_out )
[1]197
198!------------------------------------------------------------------------------!
199! Description:
200! ------------
[1216]201! Resorting data after the transposition from x to z. The transposition itself
202! is carried out in transpose_xz
203!------------------------------------------------------------------------------!
204
205     USE indices
206     USE transpose_indices
207
208     IMPLICIT NONE
209
210     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
211     REAL ::  f_out(1:nz,nys:nyn,nxl:nxr)
212
213
214     INTEGER ::  i, j, k
215
216!
217!-- Rearrange indices of input array in order to make data to be send
218!-- by MPI contiguous.
219!-- In case of parallel fft/transposition, scattered store is faster in
220!-- backward direction!!!
221    !$OMP  PARALLEL PRIVATE ( i, j, k )
222    !$OMP  DO
223    !$acc kernels present( f_inv, f_out )
224    !$acc loop
225     DO  k = 1, nz
226         DO  i = nxl, nxr
227             !$acc loop vector( 32 )
228             DO  j = nys, nyn
229                 f_out(k,j,i) = f_inv(j,i,k)
230             ENDDO
231         ENDDO
232     ENDDO
233     !$acc end kernels
234     !$OMP  END PARALLEL
235
236 END SUBROUTINE resort_for_xz
237
238
239 SUBROUTINE transpose_xz( f_in, f_inv )
240
241!------------------------------------------------------------------------------!
242! Description:
243! ------------
[1]244! Transposition of input array (f_in) from x to z. For the input array, all
245! elements along x reside on the same PE, while after transposition, all
246! elements along z reside on the same PE.
247!------------------------------------------------------------------------------!
248
249    USE cpulog
250    USE indices
251    USE interfaces
252    USE pegrid
253    USE transpose_indices
254
255    IMPLICIT NONE
256
[1111]257    INTEGER ::  i, j, k, l, xs
[1]258   
[1216]259    REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_inv(nys:nyn,nxl:nxr,1:nz)
[1]260
[1111]261    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
[1]262
[1111]263
[1]264!
265!-- If the PE grid is one-dimensional along y, the array has only to be
266!-- reordered locally and therefore no transposition has to be done.
267    IF ( pdims(1) /= 1 )  THEN
[1106]268
269#if defined( __parallel )
[1]270!
271!--    Reorder input array for transposition
[1111]272!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
[683]273!$OMP  DO
[1216]274       !$acc data copyout( work )
[1]275       DO  l = 0, pdims(1) - 1
276          xs = 0 + l * nnx
[1111]277          !$acc kernels present( f_in, work )
278          !$acc loop
[1003]279          DO  k = nzb_x, nzt_x
[164]280             DO  i = xs, xs + nnx - 1
[1111]281                !$acc loop vector( 32 )
[1003]282                DO  j = nys_x, nyn_x
[1111]283                   work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]284                ENDDO
285             ENDDO
286          ENDDO
[1111]287          !$acc end kernels
[1]288       ENDDO
[1216]289       !$acc end data
[683]290!$OMP  END PARALLEL
[1]291
292!
293!--    Transpose array
294       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]295       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]296       CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
297                          f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
[1]298                          comm1dx, ierr )
[1111]299       !$acc update device( f_inv )
[1]300       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]301#endif
302
[1]303    ELSE
[1106]304
[1]305!
306!--    Reorder the array in a way that the z index is in first position
[683]307!$OMP  PARALLEL PRIVATE ( i, j, k )
308!$OMP  DO
[1216]309       !$acc kernels present( f_in, f_inv )
[1111]310       !$acc loop
[1003]311       DO  i = nxl, nxr
312          DO  j = nys, nyn
[1111]313             !$acc loop vector( 32 )
[1003]314             DO  k = 1, nz
[164]315                f_inv(j,i,k) = f_in(i,j,k)
[1]316             ENDDO
317          ENDDO
318       ENDDO
[1111]319       !$acc end kernels
[683]320!$OMP  END PARALLEL
[1]321
[164]322    ENDIF
323
[1]324 END SUBROUTINE transpose_xz
325
326
[1216]327 SUBROUTINE resort_for_yx( f_inv, f_out )
[1]328
329!------------------------------------------------------------------------------!
330! Description:
331! ------------
[1216]332! Resorting data after the transposition from y to x. The transposition itself
333! is carried out in transpose_yx
334!------------------------------------------------------------------------------!
335
336     USE indices
337     USE transpose_indices
338
339     IMPLICIT NONE
340
341     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
342     REAL ::  f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
343
344
345     INTEGER ::  i, j, k
346
347!
348!-- Rearrange indices of input array in order to make data to be send
349!-- by MPI contiguous
350    !$OMP  PARALLEL PRIVATE ( i, j, k )
351    !$OMP  DO
352    !$acc kernels present( f_inv, f_out )
353    !$acc loop
354     DO  i = 0, nx
355         DO  k = nzb_x, nzt_x
356             !$acc loop vector( 32 )
357             DO  j = nys_x, nyn_x
358                 f_out(i,j,k) = f_inv(j,k,i)
359             ENDDO
360         ENDDO
361     ENDDO
362     !$acc end kernels
363     !$OMP  END PARALLEL
364
365 END SUBROUTINE resort_for_yx
366
367
368 SUBROUTINE transpose_yx( f_in, f_inv )
369
370!------------------------------------------------------------------------------!
371! Description:
372! ------------
[1]373! Transposition of input array (f_in) from y to x. For the input array, all
374! elements along y reside on the same PE, while after transposition, all
375! elements along x reside on the same PE.
376!------------------------------------------------------------------------------!
377
378    USE cpulog
379    USE indices
380    USE interfaces
381    USE pegrid
382    USE transpose_indices
383
384    IMPLICIT NONE
385
[1111]386    INTEGER ::  i, j, k, l, ys
[1]387   
[1216]388    REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
[1]389
[1111]390    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
391
392
[1106]393    IF ( numprocs /= 1 )  THEN
394
[1]395#if defined( __parallel )
396!
[1106]397!--    Reorder input array for transposition
[1111]398!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
[683]399!$OMP  DO
[1216]400       !$acc data copyout( work )
[1106]401       DO  l = 0, pdims(2) - 1
402          ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]403          !$acc kernels present( f_in, work )
404          !$acc loop
[1106]405          DO  i = nxl_y, nxr_y
406             DO  k = nzb_y, nzt_y
[1111]407                !$acc loop vector( 32 )
[1106]408                DO  j = ys, ys + nyn_x - nys_x
[1111]409                   work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]410                ENDDO
411             ENDDO
412          ENDDO
[1111]413          !$acc end kernels
[1106]414       ENDDO
[1216]415       !$acc end data
[1106]416!$OMP  END PARALLEL
417
418!
419!--    Transpose array
420       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
421       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]422       CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
423                          f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
[1106]424                          comm1dy, ierr )
[1111]425       !$acc update device( f_inv )
[1106]426       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
427#endif
428
429    ELSE
430
431!
432!--    Reorder array f_in the same way as ALLTOALL did it
433!$OMP  PARALLEL PRIVATE ( i, j, k )
434!$OMP  DO
[1216]435       !$acc kernels present( f_in, f_inv )
[1111]436       !$acc loop
[1003]437       DO  i = nxl_y, nxr_y
438          DO  k = nzb_y, nzt_y
[1111]439             !$acc loop vector( 32 )
[1106]440             DO  j = 0, ny
441                f_inv(j,k,i) = f_in(j,i,k)
[1]442             ENDDO
443          ENDDO
444       ENDDO
[1111]445       !$acc end kernels
[683]446!$OMP  END PARALLEL
[1]447
[1106]448    ENDIF
[1]449
450 END SUBROUTINE transpose_yx
451
452
[1216]453 SUBROUTINE transpose_yxd( f_in, f_out )
[1]454
455!------------------------------------------------------------------------------!
456! Description:
457! ------------
458! Transposition of input array (f_in) from y to x. For the input array, all
459! elements along y reside on the same PE, while after transposition, all
460! elements along x reside on the same PE.
461! This is a direct transposition for arrays with indices in regular order
462! (k,j,i) (cf. transpose_yx).
463!------------------------------------------------------------------------------!
464
465    USE cpulog
466    USE indices
467    USE interfaces
468    USE pegrid
469    USE transpose_indices
470
471    IMPLICIT NONE
472
[1092]473    INTEGER ::  i, j, k, l, m, xs
[1]474
[1003]475    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
476             f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x),                     &
[164]477             work(nnx*nny*nnz)
[1]478
479#if defined( __parallel )
480
481!
482!-- Rearrange indices of input array in order to make data to be send
483!-- by MPI contiguous
[1003]484    DO  k = 1, nz
485       DO  j = nys, nyn
486          DO  i = nxl, nxr
[164]487             f_inv(i,k,j) = f_in(k,j,i)
[1]488          ENDDO
489       ENDDO
490    ENDDO
491
492!
493!-- Transpose array
494    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]495    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1]496    CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]497                       work(1),          sendrecvcount_xy, MPI_REAL, &
[1]498                       comm1dx, ierr )
499    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
500
501!
502!-- Reorder transposed array
503    m = 0
504    DO  l = 0, pdims(1) - 1
505       xs = 0 + l * nnx
[1003]506       DO  j = nys_x, nyn_x
507          DO  k = 1, nz
[1]508             DO  i = xs, xs + nnx - 1
509                m = m + 1
[164]510                f_out(i,j,k) = work(m)
[1]511             ENDDO
512          ENDDO
513       ENDDO
514    ENDDO
515
516#endif
517
518 END SUBROUTINE transpose_yxd
519
520
[1216]521 SUBROUTINE resort_for_yz( f_in, f_inv )
[1]522
523!------------------------------------------------------------------------------!
524! Description:
525! ------------
[1216]526! Resorting data for the transposition from y to z. The transposition itself
527! is carried out in transpose_yz
528!------------------------------------------------------------------------------!
529
530     USE indices
531     USE transpose_indices
532
533     IMPLICIT NONE
534
535     REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
536     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
537
538
539     INTEGER ::  i, j, k
540
541!
542!-- Rearrange indices of input array in order to make data to be send
543!-- by MPI contiguous
544    !$OMP  PARALLEL PRIVATE ( i, j, k )
545    !$OMP  DO
546    !$acc kernels present( f_in, f_inv )
547    !$acc loop
548     DO  j = 0, ny
549         DO  k = nzb_y, nzt_y
550             !$acc loop vector( 32 )
551             DO  i = nxl_y, nxr_y
552                 f_inv(i,k,j) = f_in(j,i,k)
553             ENDDO
554         ENDDO
555     ENDDO
556     !$acc end kernels
557     !$OMP  END PARALLEL
558
559 END SUBROUTINE resort_for_yz
560
561
562 SUBROUTINE transpose_yz( f_inv, f_out )
563
564!------------------------------------------------------------------------------!
565! Description:
566! ------------
[1]567! Transposition of input array (f_in) from y to z. For the input array, all
568! elements along y reside on the same PE, while after transposition, all
569! elements along z reside on the same PE.
570!------------------------------------------------------------------------------!
571
572    USE cpulog
573    USE indices
574    USE interfaces
575    USE pegrid
576    USE transpose_indices
577
578    IMPLICIT NONE
579
[1111]580    INTEGER ::  i, j, k, l, zs
[1]581   
[1216]582    REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
[1]583
[1111]584    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
585
586
[1]587!
588!-- If the PE grid is one-dimensional along y, only local reordering
589!-- of the data is necessary and no transposition has to be done.
590    IF ( pdims(1) == 1 )  THEN
[1106]591
[683]592!$OMP  PARALLEL PRIVATE ( i, j, k )
593!$OMP  DO
[1216]594       !$acc kernels present( f_inv, f_out )
[1111]595       !$acc loop
[1003]596       DO  j = 0, ny
597          DO  k = nzb_y, nzt_y
[1111]598             !$acc loop vector( 32 )
[1003]599             DO  i = nxl_y, nxr_y
[164]600                f_out(i,j,k) = f_inv(i,k,j)
[1]601             ENDDO
602          ENDDO
603       ENDDO
[1111]604       !$acc end kernels
[683]605!$OMP  END PARALLEL
[1]606
[1106]607    ELSE
608
609#if defined( __parallel )
[1]610!
[1106]611!--    Transpose array
612       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
613       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]614       !$acc update host( f_inv )
615       CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
616                          work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]617                          comm1dx, ierr )
618       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]619
620!
[1106]621!--    Reorder transposed array
[1111]622!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
[683]623!$OMP  DO
[1216]624       !$acc data copyin( work )
[1106]625       DO  l = 0, pdims(1) - 1
626          zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1216]627          !$acc kernels present( f_out )
[1111]628          !$acc loop
[1106]629          DO  j = nys_z, nyn_z
630             DO  k = zs, zs + nzt_y - nzb_y
[1111]631                !$acc loop vector( 32 )
[1106]632                DO  i = nxl_z, nxr_z
[1111]633                   f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]634                ENDDO
[1]635             ENDDO
636          ENDDO
[1111]637          !$acc end kernels
[1]638       ENDDO
[1216]639       !$acc end data
[683]640!$OMP  END PARALLEL
[1]641#endif
642
[1106]643   ENDIF
644
[1]645 END SUBROUTINE transpose_yz
646
647
[1216]648 SUBROUTINE resort_for_zx( f_in, f_inv )
[1]649
650!------------------------------------------------------------------------------!
651! Description:
652! ------------
[1216]653! Resorting data for the transposition from z to x. The transposition itself
654! is carried out in transpose_zx
655!------------------------------------------------------------------------------!
656
657     USE indices
658     USE transpose_indices
659
660     IMPLICIT NONE
661
662     REAL ::  f_in(1:nz,nys:nyn,nxl:nxr)
663     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
664
665
666     INTEGER ::  i, j, k
667
668!
669!-- Rearrange indices of input array in order to make data to be send
670!-- by MPI contiguous
671    !$OMP  PARALLEL PRIVATE ( i, j, k )
672    !$OMP  DO
673    !$acc kernels present( f_in, f_inv )
674    !$acc loop
675     DO  k = 1,nz
676         DO  i = nxl, nxr
677             !$acc loop vector( 32 )
678             DO  j = nys, nyn
679                 f_inv(j,i,k) = f_in(k,j,i)
680             ENDDO
681         ENDDO
682     ENDDO
683     !$acc end kernels
684     !$OMP  END PARALLEL
685
686 END SUBROUTINE resort_for_zx
687
688
689 SUBROUTINE transpose_zx( f_inv, f_out )
690
691!------------------------------------------------------------------------------!
692! Description:
693! ------------
[1]694! Transposition of input array (f_in) from z to x. For the input array, all
695! elements along z reside on the same PE, while after transposition, all
696! elements along x reside on the same PE.
697!------------------------------------------------------------------------------!
698
699    USE cpulog
700    USE indices
701    USE interfaces
702    USE pegrid
703    USE transpose_indices
704
705    IMPLICIT NONE
706
[1111]707    INTEGER ::  i, j, k, l, xs
[1]708   
[1216]709    REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
[1]710
[1111]711    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
712
[1]713
714!
715!-- If the PE grid is one-dimensional along y, only local reordering
716!-- of the data is necessary and no transposition has to be done.
717    IF ( pdims(1) == 1 )  THEN
[1106]718
[683]719!$OMP  PARALLEL PRIVATE ( i, j, k )
720!$OMP  DO
[1216]721       !$acc kernels present( f_inv, f_out )
[1106]722       !$acc loop
[1003]723       DO  k = 1, nz
724          DO  i = nxl, nxr
[1106]725             !$acc loop vector( 32 )
[1003]726             DO  j = nys, nyn
[164]727                f_out(i,j,k) = f_inv(j,i,k)
[1]728             ENDDO
729          ENDDO
730       ENDDO
[1111]731       !$acc end kernels
[683]732!$OMP  END PARALLEL
[1]733
[1106]734    ELSE
735
736#if defined( __parallel )
[1]737!
[1106]738!--    Transpose array
739       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
740       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]741       !$acc update host( f_inv )
742       CALL MPI_ALLTOALL( f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
743                          work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]744                          comm1dx, ierr )
745       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]746
747!
[1106]748!--    Reorder transposed array
[1111]749!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
[683]750!$OMP  DO
[1216]751       !$acc data copyin( work )
[1106]752       DO  l = 0, pdims(1) - 1
753          xs = 0 + l * nnx
[1216]754          !$acc kernels present( f_out )
[1111]755          !$acc loop
[1106]756          DO  k = nzb_x, nzt_x
757             DO  i = xs, xs + nnx - 1
[1111]758                !$acc loop vector( 32 )
[1106]759                DO  j = nys_x, nyn_x
[1111]760                   f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]761                ENDDO
[1]762             ENDDO
763          ENDDO
[1111]764          !$acc end kernels
[1]765       ENDDO
[1216]766       !$acc end data
[683]767!$OMP  END PARALLEL
[1]768#endif
769
[1106]770    ENDIF
771
[1]772 END SUBROUTINE transpose_zx
773
774
[1216]775 SUBROUTINE resort_for_zy( f_inv, f_out )
[1]776
777!------------------------------------------------------------------------------!
778! Description:
779! ------------
[1216]780! Resorting data after the transposition from z to y. The transposition itself
781! is carried out in transpose_zy
782!------------------------------------------------------------------------------!
783
784     USE indices
785     USE transpose_indices
786
787     IMPLICIT NONE
788
789     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
790     REAL ::  f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
791
792
793     INTEGER ::  i, j, k
794
795!
796!-- Rearrange indices of input array in order to make data to be send
797!-- by MPI contiguous
798    !$OMP  PARALLEL PRIVATE ( i, j, k )
799    !$OMP  DO
800    !$acc kernels present( f_inv, f_out )
801    !$acc loop
802     DO  k = nzb_y, nzt_y
803         DO  j = 0, ny
804             !$acc loop vector( 32 )
805             DO  i = nxl_y, nxr_y
806                 f_out(j,i,k) = f_inv(i,k,j)
807             ENDDO
808         ENDDO
809     ENDDO
810     !$acc end kernels
811     !$OMP  END PARALLEL
812
813 END SUBROUTINE resort_for_zy
814
815
816 SUBROUTINE transpose_zy( f_in, f_inv )
817
818!------------------------------------------------------------------------------!
819! Description:
820! ------------
[1]821! Transposition of input array (f_in) from z to y. For the input array, all
822! elements along z reside on the same PE, while after transposition, all
823! elements along y reside on the same PE.
824!------------------------------------------------------------------------------!
825
826    USE cpulog
827    USE indices
828    USE interfaces
829    USE pegrid
830    USE transpose_indices
831
832    IMPLICIT NONE
833
[1111]834    INTEGER ::  i, j, k, l, zs
[1]835   
[1216]836    REAL ::  f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
[1]837
[1111]838    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
839
840
[1]841!
842!-- If the PE grid is one-dimensional along y, the array has only to be
843!-- reordered locally and therefore no transposition has to be done.
844    IF ( pdims(1) /= 1 )  THEN
[1106]845
846#if defined( __parallel )
[1]847!
848!--    Reorder input array for transposition
[1111]849!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
[683]850!$OMP  DO
[1216]851       !$acc data copyout( work )
[1]852       DO  l = 0, pdims(1) - 1
[1003]853          zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1111]854          !$acc kernels present( f_in, work )
855          !$acc loop
[1003]856          DO  j = nys_z, nyn_z
857             DO  k = zs, zs + nzt_y - nzb_y
[1111]858                !$acc loop vector( 32 )
[1003]859                DO  i = nxl_z, nxr_z
[1111]860                   work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]861                ENDDO
862             ENDDO
863          ENDDO
[1111]864          !$acc end kernels
[1]865       ENDDO
[1216]866       !$acc end data
[683]867!$OMP  END PARALLEL
[1]868
869!
870!--    Transpose array
871       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]872       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1111]873       CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
874                          f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
[1]875                          comm1dx, ierr )
[1111]876       !$acc update device( f_inv )
[1]877       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]878#endif
[1]879
880    ELSE
881!
[1106]882!--    Reorder the array in the same way like ALLTOALL did it
[683]883!$OMP  PARALLEL PRIVATE ( i, j, k )
884!$OMP  DO
[1216]885       !$acc kernels present( f_in, f_inv )
[1111]886       !$acc loop
[1003]887       DO  k = nzb_y, nzt_y
888          DO  j = 0, ny
[1111]889             !$acc loop vector( 32 )
[1003]890             DO  i = nxl_y, nxr_y
[164]891                f_inv(i,k,j) = f_in(i,j,k)
892             ENDDO
893          ENDDO
894       ENDDO
[1111]895       !$acc end kernels
[683]896!$OMP  END PARALLEL
[1106]897
898    ENDIF
899
[1]900 END SUBROUTINE transpose_zy
901
902
[1216]903 SUBROUTINE transpose_zyd( f_in, f_out )
[1]904
905!------------------------------------------------------------------------------!
906! Description:
907! ------------
908! Transposition of input array (f_in) from z to y. For the input array, all
909! elements along z reside on the same PE, while after transposition, all
910! elements along y reside on the same PE.
911! This is a direct transposition for arrays with indices in regular order
912! (k,j,i) (cf. transpose_zy).
913!------------------------------------------------------------------------------!
914
915    USE cpulog
916    USE indices
917    USE interfaces
918    USE pegrid
919    USE transpose_indices
920
921    IMPLICIT NONE
922
923    INTEGER ::  i, j, k, l, m, ys
924   
[1003]925    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
926             f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd),                 &
[164]927             work(nnx*nny*nnz)
[1]928
929#if defined( __parallel )
930
931!
932!-- Rearrange indices of input array in order to make data to be send
933!-- by MPI contiguous
[1003]934    DO  i = nxl, nxr
935       DO  j = nys, nyn
936          DO  k = 1, nz
[164]937             f_inv(j,i,k) = f_in(k,j,i)
[1]938          ENDDO
939       ENDDO
940    ENDDO
941
942!
943!-- Move data to different array, because memory location of work1 is
944!-- needed further below (work1 = work2).
945!-- If the PE grid is one-dimensional along x, only local reordering
946!-- of the data is necessary and no transposition has to be done.
947    IF ( pdims(2) == 1 )  THEN
[1003]948       DO  k = 1, nz
949          DO  i = nxl, nxr
950             DO  j = nys, nyn
[164]951                f_out(j,i,k) = f_inv(j,i,k)
[1]952             ENDDO
953          ENDDO
954       ENDDO
955       RETURN
956    ENDIF
957
958!
959!-- Transpose array
960    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]961    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1]962    CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]963                       work(1),          sendrecvcount_zyd, MPI_REAL, &
[1]964                       comm1dy, ierr )
965    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
966
967!
968!-- Reorder transposed array
969    m = 0
970    DO  l = 0, pdims(2) - 1
971       ys = 0 + l * nny
[1003]972       DO  k = nzb_yd, nzt_yd
973          DO  i = nxl_yd, nxr_yd
[1]974             DO  j = ys, ys + nny - 1
975                m = m + 1
[164]976                f_out(j,i,k) = work(m)
[1]977             ENDDO
978          ENDDO
979       ENDDO
980    ENDDO
981
982#endif
983
984 END SUBROUTINE transpose_zyd
Note: See TracBrowser for help on using the repository browser.