source: palm/trunk/SOURCE/transpose.f90 @ 1216

Last change on this file since 1216 was 1216, checked in by raasch, 11 years ago

overlapping execution of fft and transpositions (MPI_ALLTOALL), but real overlapping is not activated so far,
fftw implemented for 1D-decomposition
resorting of arrays moved to separate routines resort_for_...
bugfix in mbuild concerning Makefile_check

  • Property svn:keywords set to Id
File size: 26.8 KB
Line 
1 SUBROUTINE resort_for_xy( f_in, f_inv )
2
3!--------------------------------------------------------------------------------!
4! This file is part of PALM.
5!
6! PALM is free software: you can redistribute it and/or modify it under the terms
7! of the GNU General Public License as published by the Free Software Foundation,
8! either version 3 of the License, or (at your option) any later version.
9!
10! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12! A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13!
14! You should have received a copy of the GNU General Public License along with
15! PALM. If not, see <http://www.gnu.org/licenses/>.
16!
17! Copyright 1997-2012  Leibniz University Hannover
18!--------------------------------------------------------------------------------!
19!
20! Current revisions:
21! -----------------
22! re-sorting of the transposed / to be transposed arrays moved to separate
23! routines resort_for_...
24!
25! Former revisions:
26! -----------------
27! $Id: transpose.f90 1216 2013-08-26 09:31:42Z raasch $
28!
29! 1111 2013-03-08 23:54:10Z raasch
30! openACC directives added,
31! resorting data from/to work changed, work got 4 dimensions instead of 1
32!
33! 1106 2013-03-04 05:31:38Z raasch
34! preprocessor lines rearranged so that routines can also be used in serial
35! (non-parallel) mode
36!
37! 1092 2013-02-02 11:24:22Z raasch
38! unused variables removed
39!
40! 1036 2012-10-22 13:43:42Z raasch
41! code put under GPL (PALM 3.9)
42!
43! 1003 2012-09-14 14:35:53Z raasch
44! indices nxa, nya, etc. replaced by nx, ny, etc.
45!
46! 683 2011-02-09 14:25:15Z raasch
47! openMP parallelization of transpositions for 2d-domain-decomposition
48!
49! 622 2010-12-10 08:08:13Z raasch
50! optional barriers included in order to speed up collective operations
51!
52! 164 2008-05-15 08:46:15Z raasch
53! f_inv changed from subroutine argument to automatic array in order to do
54! re-ordering from f_in to f_inv in one step, one array work is needed instead
55! of work1 and work2
56!
57! February 2007
58! RCS Log replace by Id keyword, revision history cleaned up
59!
60! Revision 1.2  2004/04/30 13:12:17  raasch
61! Switched from mpi_alltoallv to the simpler mpi_alltoall,
62! all former transpose-routine files collected in this file, enlarged
63! transposition arrays introduced
64!
65! Revision 1.1  2004/04/30 13:08:16  raasch
66! Initial revision (collection of former routines transpose_xy, transpose_xz,
67!                   transpose_yx, transpose_yz, transpose_zx, transpose_zy)
68!
69! Revision 1.1  1997/07/24 11:25:18  raasch
70! Initial revision
71!
72!------------------------------------------------------------------------------!
73! Description:
74! ------------
75! Resorting data for the transposition from x to y. The transposition itself
76! is carried out in transpose_xy
77!------------------------------------------------------------------------------!
78
79     USE indices
80     USE transpose_indices
81
82     IMPLICIT NONE
83
84     REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
85     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
86
87
88     INTEGER ::  i, j, k
89
90!
91!-- Rearrange indices of input array in order to make data to be send
92!-- by MPI contiguous
93    !$OMP  PARALLEL PRIVATE ( i, j, k )
94    !$OMP  DO
95    !$acc kernels present( f_in, f_inv )
96    !$acc loop
97     DO  i = 0, nx
98         DO  k = nzb_x, nzt_x
99             !$acc loop vector( 32 )
100             DO  j = nys_x, nyn_x
101                 f_inv(j,k,i) = f_in(i,j,k)
102             ENDDO
103         ENDDO
104     ENDDO
105     !$acc end kernels
106     !$OMP  END PARALLEL
107
108 END SUBROUTINE resort_for_xy
109
110
111 SUBROUTINE transpose_xy( f_inv, f_out )
112
113!------------------------------------------------------------------------------!
114! Description:
115! ------------
116! Transposition of input array (f_in) from x to y. For the input array, all
117! elements along x reside on the same PE, while after transposition, all
118! elements along y reside on the same PE.
119!------------------------------------------------------------------------------!
120
121    USE cpulog
122    USE indices
123    USE interfaces
124    USE pegrid
125    USE transpose_indices
126
127    IMPLICIT NONE
128
129    INTEGER ::  i, j, k, l, ys
130   
131    REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
132
133    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
134
135
136    IF ( numprocs /= 1 )  THEN
137
138#if defined( __parallel )
139!
140!--    Transpose array
141       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
142       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
143       !$acc update host( f_inv )
144       CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
145                          work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
146                          comm1dy, ierr )
147       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
148
149!
150!--    Reorder transposed array
151!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
152!$OMP  DO
153       !$acc data copyin( work )
154       DO  l = 0, pdims(2) - 1
155          ys = 0 + l * ( nyn_x - nys_x + 1 )
156          !$acc kernels present( f_out, work )
157          !$acc loop
158          DO  i = nxl_y, nxr_y
159             DO  k = nzb_y, nzt_y
160                !$acc loop vector( 32 )
161                DO  j = ys, ys + nyn_x - nys_x
162                   f_out(j,i,k) = work(j-ys+1,k,i,l)
163                ENDDO
164             ENDDO
165          ENDDO
166          !$acc end kernels
167       ENDDO
168       !$acc end data
169!$OMP  END PARALLEL
170#endif
171
172    ELSE
173
174!
175!--    Reorder transposed array
176!$OMP  PARALLEL PRIVATE ( i, j, k )
177!$OMP  DO
178       !$acc kernels present( f_inv, f_out )
179       !$acc loop
180       DO  k = nzb_y, nzt_y
181          DO  i = nxl_y, nxr_y
182             !$acc loop vector( 32 )
183             DO  j = 0, ny
184                f_out(j,i,k) = f_inv(j,k,i)
185             ENDDO
186          ENDDO
187       ENDDO
188       !$acc end kernels
189!$OMP  END PARALLEL
190
191    ENDIF
192
193 END SUBROUTINE transpose_xy
194
195
196 SUBROUTINE resort_for_xz( f_inv, f_out )
197
198!------------------------------------------------------------------------------!
199! Description:
200! ------------
201! Resorting data after the transposition from x to z. The transposition itself
202! is carried out in transpose_xz
203!------------------------------------------------------------------------------!
204
205     USE indices
206     USE transpose_indices
207
208     IMPLICIT NONE
209
210     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
211     REAL ::  f_out(1:nz,nys:nyn,nxl:nxr)
212
213
214     INTEGER ::  i, j, k
215
216!
217!-- Rearrange indices of input array in order to make data to be send
218!-- by MPI contiguous.
219!-- In case of parallel fft/transposition, scattered store is faster in
220!-- backward direction!!!
221    !$OMP  PARALLEL PRIVATE ( i, j, k )
222    !$OMP  DO
223    !$acc kernels present( f_inv, f_out )
224    !$acc loop
225     DO  k = 1, nz
226         DO  i = nxl, nxr
227             !$acc loop vector( 32 )
228             DO  j = nys, nyn
229                 f_out(k,j,i) = f_inv(j,i,k)
230             ENDDO
231         ENDDO
232     ENDDO
233     !$acc end kernels
234     !$OMP  END PARALLEL
235
236 END SUBROUTINE resort_for_xz
237
238
239 SUBROUTINE transpose_xz( f_in, f_inv )
240
241!------------------------------------------------------------------------------!
242! Description:
243! ------------
244! Transposition of input array (f_in) from x to z. For the input array, all
245! elements along x reside on the same PE, while after transposition, all
246! elements along z reside on the same PE.
247!------------------------------------------------------------------------------!
248
249    USE cpulog
250    USE indices
251    USE interfaces
252    USE pegrid
253    USE transpose_indices
254
255    IMPLICIT NONE
256
257    INTEGER ::  i, j, k, l, xs
258   
259    REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_inv(nys:nyn,nxl:nxr,1:nz)
260
261    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
262
263
264!
265!-- If the PE grid is one-dimensional along y, the array has only to be
266!-- reordered locally and therefore no transposition has to be done.
267    IF ( pdims(1) /= 1 )  THEN
268
269#if defined( __parallel )
270!
271!--    Reorder input array for transposition
272!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
273!$OMP  DO
274       !$acc data copyout( work )
275       DO  l = 0, pdims(1) - 1
276          xs = 0 + l * nnx
277          !$acc kernels present( f_in, work )
278          !$acc loop
279          DO  k = nzb_x, nzt_x
280             DO  i = xs, xs + nnx - 1
281                !$acc loop vector( 32 )
282                DO  j = nys_x, nyn_x
283                   work(j,i-xs+1,k,l) = f_in(i,j,k)
284                ENDDO
285             ENDDO
286          ENDDO
287          !$acc end kernels
288       ENDDO
289       !$acc end data
290!$OMP  END PARALLEL
291
292!
293!--    Transpose array
294       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
295       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
296       CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
297                          f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
298                          comm1dx, ierr )
299       !$acc update device( f_inv )
300       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
301#endif
302
303    ELSE
304
305!
306!--    Reorder the array in a way that the z index is in first position
307!$OMP  PARALLEL PRIVATE ( i, j, k )
308!$OMP  DO
309       !$acc kernels present( f_in, f_inv )
310       !$acc loop
311       DO  i = nxl, nxr
312          DO  j = nys, nyn
313             !$acc loop vector( 32 )
314             DO  k = 1, nz
315                f_inv(j,i,k) = f_in(i,j,k)
316             ENDDO
317          ENDDO
318       ENDDO
319       !$acc end kernels
320!$OMP  END PARALLEL
321
322    ENDIF
323
324 END SUBROUTINE transpose_xz
325
326
327 SUBROUTINE resort_for_yx( f_inv, f_out )
328
329!------------------------------------------------------------------------------!
330! Description:
331! ------------
332! Resorting data after the transposition from y to x. The transposition itself
333! is carried out in transpose_yx
334!------------------------------------------------------------------------------!
335
336     USE indices
337     USE transpose_indices
338
339     IMPLICIT NONE
340
341     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
342     REAL ::  f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
343
344
345     INTEGER ::  i, j, k
346
347!
348!-- Rearrange indices of input array in order to make data to be send
349!-- by MPI contiguous
350    !$OMP  PARALLEL PRIVATE ( i, j, k )
351    !$OMP  DO
352    !$acc kernels present( f_inv, f_out )
353    !$acc loop
354     DO  i = 0, nx
355         DO  k = nzb_x, nzt_x
356             !$acc loop vector( 32 )
357             DO  j = nys_x, nyn_x
358                 f_out(i,j,k) = f_inv(j,k,i)
359             ENDDO
360         ENDDO
361     ENDDO
362     !$acc end kernels
363     !$OMP  END PARALLEL
364
365 END SUBROUTINE resort_for_yx
366
367
368 SUBROUTINE transpose_yx( f_in, f_inv )
369
370!------------------------------------------------------------------------------!
371! Description:
372! ------------
373! Transposition of input array (f_in) from y to x. For the input array, all
374! elements along y reside on the same PE, while after transposition, all
375! elements along x reside on the same PE.
376!------------------------------------------------------------------------------!
377
378    USE cpulog
379    USE indices
380    USE interfaces
381    USE pegrid
382    USE transpose_indices
383
384    IMPLICIT NONE
385
386    INTEGER ::  i, j, k, l, ys
387   
388    REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
389
390    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
391
392
393    IF ( numprocs /= 1 )  THEN
394
395#if defined( __parallel )
396!
397!--    Reorder input array for transposition
398!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
399!$OMP  DO
400       !$acc data copyout( work )
401       DO  l = 0, pdims(2) - 1
402          ys = 0 + l * ( nyn_x - nys_x + 1 )
403          !$acc kernels present( f_in, work )
404          !$acc loop
405          DO  i = nxl_y, nxr_y
406             DO  k = nzb_y, nzt_y
407                !$acc loop vector( 32 )
408                DO  j = ys, ys + nyn_x - nys_x
409                   work(j-ys+1,k,i,l) = f_in(j,i,k)
410                ENDDO
411             ENDDO
412          ENDDO
413          !$acc end kernels
414       ENDDO
415       !$acc end data
416!$OMP  END PARALLEL
417
418!
419!--    Transpose array
420       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
421       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
422       CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
423                          f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
424                          comm1dy, ierr )
425       !$acc update device( f_inv )
426       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
427#endif
428
429    ELSE
430
431!
432!--    Reorder array f_in the same way as ALLTOALL did it
433!$OMP  PARALLEL PRIVATE ( i, j, k )
434!$OMP  DO
435       !$acc kernels present( f_in, f_inv )
436       !$acc loop
437       DO  i = nxl_y, nxr_y
438          DO  k = nzb_y, nzt_y
439             !$acc loop vector( 32 )
440             DO  j = 0, ny
441                f_inv(j,k,i) = f_in(j,i,k)
442             ENDDO
443          ENDDO
444       ENDDO
445       !$acc end kernels
446!$OMP  END PARALLEL
447
448    ENDIF
449
450 END SUBROUTINE transpose_yx
451
452
453 SUBROUTINE transpose_yxd( f_in, f_out )
454
455!------------------------------------------------------------------------------!
456! Description:
457! ------------
458! Transposition of input array (f_in) from y to x. For the input array, all
459! elements along y reside on the same PE, while after transposition, all
460! elements along x reside on the same PE.
461! This is a direct transposition for arrays with indices in regular order
462! (k,j,i) (cf. transpose_yx).
463!------------------------------------------------------------------------------!
464
465    USE cpulog
466    USE indices
467    USE interfaces
468    USE pegrid
469    USE transpose_indices
470
471    IMPLICIT NONE
472
473    INTEGER ::  i, j, k, l, m, xs
474
475    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
476             f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x),                     &
477             work(nnx*nny*nnz)
478
479#if defined( __parallel )
480
481!
482!-- Rearrange indices of input array in order to make data to be send
483!-- by MPI contiguous
484    DO  k = 1, nz
485       DO  j = nys, nyn
486          DO  i = nxl, nxr
487             f_inv(i,k,j) = f_in(k,j,i)
488          ENDDO
489       ENDDO
490    ENDDO
491
492!
493!-- Transpose array
494    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
495    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
496    CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
497                       work(1),          sendrecvcount_xy, MPI_REAL, &
498                       comm1dx, ierr )
499    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
500
501!
502!-- Reorder transposed array
503    m = 0
504    DO  l = 0, pdims(1) - 1
505       xs = 0 + l * nnx
506       DO  j = nys_x, nyn_x
507          DO  k = 1, nz
508             DO  i = xs, xs + nnx - 1
509                m = m + 1
510                f_out(i,j,k) = work(m)
511             ENDDO
512          ENDDO
513       ENDDO
514    ENDDO
515
516#endif
517
518 END SUBROUTINE transpose_yxd
519
520
521 SUBROUTINE resort_for_yz( f_in, f_inv )
522
523!------------------------------------------------------------------------------!
524! Description:
525! ------------
526! Resorting data for the transposition from y to z. The transposition itself
527! is carried out in transpose_yz
528!------------------------------------------------------------------------------!
529
530     USE indices
531     USE transpose_indices
532
533     IMPLICIT NONE
534
535     REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
536     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
537
538
539     INTEGER ::  i, j, k
540
541!
542!-- Rearrange indices of input array in order to make data to be send
543!-- by MPI contiguous
544    !$OMP  PARALLEL PRIVATE ( i, j, k )
545    !$OMP  DO
546    !$acc kernels present( f_in, f_inv )
547    !$acc loop
548     DO  j = 0, ny
549         DO  k = nzb_y, nzt_y
550             !$acc loop vector( 32 )
551             DO  i = nxl_y, nxr_y
552                 f_inv(i,k,j) = f_in(j,i,k)
553             ENDDO
554         ENDDO
555     ENDDO
556     !$acc end kernels
557     !$OMP  END PARALLEL
558
559 END SUBROUTINE resort_for_yz
560
561
562 SUBROUTINE transpose_yz( f_inv, f_out )
563
564!------------------------------------------------------------------------------!
565! Description:
566! ------------
567! Transposition of input array (f_in) from y to z. For the input array, all
568! elements along y reside on the same PE, while after transposition, all
569! elements along z reside on the same PE.
570!------------------------------------------------------------------------------!
571
572    USE cpulog
573    USE indices
574    USE interfaces
575    USE pegrid
576    USE transpose_indices
577
578    IMPLICIT NONE
579
580    INTEGER ::  i, j, k, l, zs
581   
582    REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
583
584    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
585
586
587!
588!-- If the PE grid is one-dimensional along y, only local reordering
589!-- of the data is necessary and no transposition has to be done.
590    IF ( pdims(1) == 1 )  THEN
591
592!$OMP  PARALLEL PRIVATE ( i, j, k )
593!$OMP  DO
594       !$acc kernels present( f_inv, f_out )
595       !$acc loop
596       DO  j = 0, ny
597          DO  k = nzb_y, nzt_y
598             !$acc loop vector( 32 )
599             DO  i = nxl_y, nxr_y
600                f_out(i,j,k) = f_inv(i,k,j)
601             ENDDO
602          ENDDO
603       ENDDO
604       !$acc end kernels
605!$OMP  END PARALLEL
606
607    ELSE
608
609#if defined( __parallel )
610!
611!--    Transpose array
612       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
613       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
614       !$acc update host( f_inv )
615       CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
616                          work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
617                          comm1dx, ierr )
618       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
619
620!
621!--    Reorder transposed array
622!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
623!$OMP  DO
624       !$acc data copyin( work )
625       DO  l = 0, pdims(1) - 1
626          zs = 1 + l * ( nzt_y - nzb_y + 1 )
627          !$acc kernels present( f_out )
628          !$acc loop
629          DO  j = nys_z, nyn_z
630             DO  k = zs, zs + nzt_y - nzb_y
631                !$acc loop vector( 32 )
632                DO  i = nxl_z, nxr_z
633                   f_out(i,j,k) = work(i,k-zs+1,j,l)
634                ENDDO
635             ENDDO
636          ENDDO
637          !$acc end kernels
638       ENDDO
639       !$acc end data
640!$OMP  END PARALLEL
641#endif
642
643   ENDIF
644
645 END SUBROUTINE transpose_yz
646
647
648 SUBROUTINE resort_for_zx( f_in, f_inv )
649
650!------------------------------------------------------------------------------!
651! Description:
652! ------------
653! Resorting data for the transposition from z to x. The transposition itself
654! is carried out in transpose_zx
655!------------------------------------------------------------------------------!
656
657     USE indices
658     USE transpose_indices
659
660     IMPLICIT NONE
661
662     REAL ::  f_in(1:nz,nys:nyn,nxl:nxr)
663     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
664
665
666     INTEGER ::  i, j, k
667
668!
669!-- Rearrange indices of input array in order to make data to be send
670!-- by MPI contiguous
671    !$OMP  PARALLEL PRIVATE ( i, j, k )
672    !$OMP  DO
673    !$acc kernels present( f_in, f_inv )
674    !$acc loop
675     DO  k = 1,nz
676         DO  i = nxl, nxr
677             !$acc loop vector( 32 )
678             DO  j = nys, nyn
679                 f_inv(j,i,k) = f_in(k,j,i)
680             ENDDO
681         ENDDO
682     ENDDO
683     !$acc end kernels
684     !$OMP  END PARALLEL
685
686 END SUBROUTINE resort_for_zx
687
688
689 SUBROUTINE transpose_zx( f_inv, f_out )
690
691!------------------------------------------------------------------------------!
692! Description:
693! ------------
694! Transposition of input array (f_in) from z to x. For the input array, all
695! elements along z reside on the same PE, while after transposition, all
696! elements along x reside on the same PE.
697!------------------------------------------------------------------------------!
698
699    USE cpulog
700    USE indices
701    USE interfaces
702    USE pegrid
703    USE transpose_indices
704
705    IMPLICIT NONE
706
707    INTEGER ::  i, j, k, l, xs
708   
709    REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
710
711    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
712
713
714!
715!-- If the PE grid is one-dimensional along y, only local reordering
716!-- of the data is necessary and no transposition has to be done.
717    IF ( pdims(1) == 1 )  THEN
718
719!$OMP  PARALLEL PRIVATE ( i, j, k )
720!$OMP  DO
721       !$acc kernels present( f_inv, f_out )
722       !$acc loop
723       DO  k = 1, nz
724          DO  i = nxl, nxr
725             !$acc loop vector( 32 )
726             DO  j = nys, nyn
727                f_out(i,j,k) = f_inv(j,i,k)
728             ENDDO
729          ENDDO
730       ENDDO
731       !$acc end kernels
732!$OMP  END PARALLEL
733
734    ELSE
735
736#if defined( __parallel )
737!
738!--    Transpose array
739       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
740       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
741       !$acc update host( f_inv )
742       CALL MPI_ALLTOALL( f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
743                          work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
744                          comm1dx, ierr )
745       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
746
747!
748!--    Reorder transposed array
749!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
750!$OMP  DO
751       !$acc data copyin( work )
752       DO  l = 0, pdims(1) - 1
753          xs = 0 + l * nnx
754          !$acc kernels present( f_out )
755          !$acc loop
756          DO  k = nzb_x, nzt_x
757             DO  i = xs, xs + nnx - 1
758                !$acc loop vector( 32 )
759                DO  j = nys_x, nyn_x
760                   f_out(i,j,k) = work(j,i-xs+1,k,l)
761                ENDDO
762             ENDDO
763          ENDDO
764          !$acc end kernels
765       ENDDO
766       !$acc end data
767!$OMP  END PARALLEL
768#endif
769
770    ENDIF
771
772 END SUBROUTINE transpose_zx
773
774
775 SUBROUTINE resort_for_zy( f_inv, f_out )
776
777!------------------------------------------------------------------------------!
778! Description:
779! ------------
780! Resorting data after the transposition from z to y. The transposition itself
781! is carried out in transpose_zy
782!------------------------------------------------------------------------------!
783
784     USE indices
785     USE transpose_indices
786
787     IMPLICIT NONE
788
789     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
790     REAL ::  f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
791
792
793     INTEGER ::  i, j, k
794
795!
796!-- Rearrange indices of input array in order to make data to be send
797!-- by MPI contiguous
798    !$OMP  PARALLEL PRIVATE ( i, j, k )
799    !$OMP  DO
800    !$acc kernels present( f_inv, f_out )
801    !$acc loop
802     DO  k = nzb_y, nzt_y
803         DO  j = 0, ny
804             !$acc loop vector( 32 )
805             DO  i = nxl_y, nxr_y
806                 f_out(j,i,k) = f_inv(i,k,j)
807             ENDDO
808         ENDDO
809     ENDDO
810     !$acc end kernels
811     !$OMP  END PARALLEL
812
813 END SUBROUTINE resort_for_zy
814
815
816 SUBROUTINE transpose_zy( f_in, f_inv )
817
818!------------------------------------------------------------------------------!
819! Description:
820! ------------
821! Transposition of input array (f_in) from z to y. For the input array, all
822! elements along z reside on the same PE, while after transposition, all
823! elements along y reside on the same PE.
824!------------------------------------------------------------------------------!
825
826    USE cpulog
827    USE indices
828    USE interfaces
829    USE pegrid
830    USE transpose_indices
831
832    IMPLICIT NONE
833
834    INTEGER ::  i, j, k, l, zs
835   
836    REAL ::  f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
837
838    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
839
840
841!
842!-- If the PE grid is one-dimensional along y, the array has only to be
843!-- reordered locally and therefore no transposition has to be done.
844    IF ( pdims(1) /= 1 )  THEN
845
846#if defined( __parallel )
847!
848!--    Reorder input array for transposition
849!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
850!$OMP  DO
851       !$acc data copyout( work )
852       DO  l = 0, pdims(1) - 1
853          zs = 1 + l * ( nzt_y - nzb_y + 1 )
854          !$acc kernels present( f_in, work )
855          !$acc loop
856          DO  j = nys_z, nyn_z
857             DO  k = zs, zs + nzt_y - nzb_y
858                !$acc loop vector( 32 )
859                DO  i = nxl_z, nxr_z
860                   work(i,k-zs+1,j,l) = f_in(i,j,k)
861                ENDDO
862             ENDDO
863          ENDDO
864          !$acc end kernels
865       ENDDO
866       !$acc end data
867!$OMP  END PARALLEL
868
869!
870!--    Transpose array
871       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
872       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
873       CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
874                          f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
875                          comm1dx, ierr )
876       !$acc update device( f_inv )
877       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
878#endif
879
880    ELSE
881!
882!--    Reorder the array in the same way like ALLTOALL did it
883!$OMP  PARALLEL PRIVATE ( i, j, k )
884!$OMP  DO
885       !$acc kernels present( f_in, f_inv )
886       !$acc loop
887       DO  k = nzb_y, nzt_y
888          DO  j = 0, ny
889             !$acc loop vector( 32 )
890             DO  i = nxl_y, nxr_y
891                f_inv(i,k,j) = f_in(i,j,k)
892             ENDDO
893          ENDDO
894       ENDDO
895       !$acc end kernels
896!$OMP  END PARALLEL
897
898    ENDIF
899
900 END SUBROUTINE transpose_zy
901
902
903 SUBROUTINE transpose_zyd( f_in, f_out )
904
905!------------------------------------------------------------------------------!
906! Description:
907! ------------
908! Transposition of input array (f_in) from z to y. For the input array, all
909! elements along z reside on the same PE, while after transposition, all
910! elements along y reside on the same PE.
911! This is a direct transposition for arrays with indices in regular order
912! (k,j,i) (cf. transpose_zy).
913!------------------------------------------------------------------------------!
914
915    USE cpulog
916    USE indices
917    USE interfaces
918    USE pegrid
919    USE transpose_indices
920
921    IMPLICIT NONE
922
923    INTEGER ::  i, j, k, l, m, ys
924   
925    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
926             f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd),                 &
927             work(nnx*nny*nnz)
928
929#if defined( __parallel )
930
931!
932!-- Rearrange indices of input array in order to make data to be send
933!-- by MPI contiguous
934    DO  i = nxl, nxr
935       DO  j = nys, nyn
936          DO  k = 1, nz
937             f_inv(j,i,k) = f_in(k,j,i)
938          ENDDO
939       ENDDO
940    ENDDO
941
942!
943!-- Move data to different array, because memory location of work1 is
944!-- needed further below (work1 = work2).
945!-- If the PE grid is one-dimensional along x, only local reordering
946!-- of the data is necessary and no transposition has to be done.
947    IF ( pdims(2) == 1 )  THEN
948       DO  k = 1, nz
949          DO  i = nxl, nxr
950             DO  j = nys, nyn
951                f_out(j,i,k) = f_inv(j,i,k)
952             ENDDO
953          ENDDO
954       ENDDO
955       RETURN
956    ENDIF
957
958!
959!-- Transpose array
960    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
961    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
962    CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
963                       work(1),          sendrecvcount_zyd, MPI_REAL, &
964                       comm1dy, ierr )
965    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
966
967!
968!-- Reorder transposed array
969    m = 0
970    DO  l = 0, pdims(2) - 1
971       ys = 0 + l * nny
972       DO  k = nzb_yd, nzt_yd
973          DO  i = nxl_yd, nxr_yd
974             DO  j = ys, ys + nny - 1
975                m = m + 1
976                f_out(j,i,k) = work(m)
977             ENDDO
978          ENDDO
979       ENDDO
980    ENDDO
981
982#endif
983
984 END SUBROUTINE transpose_zyd
Note: See TracBrowser for help on using the repository browser.