source: palm/trunk/SOURCE/transpose.f90 @ 1319

Last change on this file since 1319 was 1319, checked in by raasch, 10 years ago

last commit documented

  • Property svn:keywords set to Id
File size: 26.1 KB
Line 
1 SUBROUTINE resort_for_xy( f_in, f_inv )
2
3!--------------------------------------------------------------------------------!
4! This file is part of PALM.
5!
6! PALM is free software: you can redistribute it and/or modify it under the terms
7! of the GNU General Public License as published by the Free Software Foundation,
8! either version 3 of the License, or (at your option) any later version.
9!
10! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12! A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13!
14! You should have received a copy of the GNU General Public License along with
15! PALM. If not, see <http://www.gnu.org/licenses/>.
16!
17! Copyright 1997-2014 Leibniz Universitaet Hannover
18!--------------------------------------------------------------------------------!
19!
20! Current revisions:
21! -----------------
22!
23!
24! Former revisions:
25! -----------------
26! $Id: transpose.f90 1319 2014-03-17 15:08:44Z raasch $
27!
28! 1318 2014-03-17 13:35:16Z raasch
29! cpu_log_nowait parameter added to cpu measurements of the transpositions
30! required for solving the Poisson equation (poisfft),
31! module interfaces removed
32!
33! 1257 2013-11-08 15:18:40Z raasch
34! openacc loop and loop vector clauses removed
35!
36! 1216 2013-08-26 09:31:42Z raasch
37! re-sorting of the transposed / to be transposed arrays moved to separate
38! routines resort_for_...
39!
40! 1111 2013-03-08 23:54:10Z raasch
41! openACC directives added,
42! resorting data from/to work changed, work got 4 dimensions instead of 1
43!
44! 1106 2013-03-04 05:31:38Z raasch
45! preprocessor lines rearranged so that routines can also be used in serial
46! (non-parallel) mode
47!
48! 1092 2013-02-02 11:24:22Z raasch
49! unused variables removed
50!
51! 1036 2012-10-22 13:43:42Z raasch
52! code put under GPL (PALM 3.9)
53!
54! 1003 2012-09-14 14:35:53Z raasch
55! indices nxa, nya, etc. replaced by nx, ny, etc.
56!
57! 683 2011-02-09 14:25:15Z raasch
58! openMP parallelization of transpositions for 2d-domain-decomposition
59!
60! 622 2010-12-10 08:08:13Z raasch
61! optional barriers included in order to speed up collective operations
62!
63! 164 2008-05-15 08:46:15Z raasch
64! f_inv changed from subroutine argument to automatic array in order to do
65! re-ordering from f_in to f_inv in one step, one array work is needed instead
66! of work1 and work2
67!
68! February 2007
69! RCS Log replace by Id keyword, revision history cleaned up
70!
71! Revision 1.2  2004/04/30 13:12:17  raasch
72! Switched from mpi_alltoallv to the simpler mpi_alltoall,
73! all former transpose-routine files collected in this file, enlarged
74! transposition arrays introduced
75!
76! Revision 1.1  2004/04/30 13:08:16  raasch
77! Initial revision (collection of former routines transpose_xy, transpose_xz,
78!                   transpose_yx, transpose_yz, transpose_zx, transpose_zy)
79!
80! Revision 1.1  1997/07/24 11:25:18  raasch
81! Initial revision
82!
83!------------------------------------------------------------------------------!
84! Description:
85! ------------
86! Resorting data for the transposition from x to y. The transposition itself
87! is carried out in transpose_xy
88!------------------------------------------------------------------------------!
89
90     USE indices
91     USE transpose_indices
92
93     IMPLICIT NONE
94
95     REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
96     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
97
98
99     INTEGER ::  i, j, k
100
101!
102!-- Rearrange indices of input array in order to make data to be send
103!-- by MPI contiguous
104    !$OMP  PARALLEL PRIVATE ( i, j, k )
105    !$OMP  DO
106    !$acc kernels present( f_in, f_inv )
107     DO  i = 0, nx
108         DO  k = nzb_x, nzt_x
109             DO  j = nys_x, nyn_x
110                 f_inv(j,k,i) = f_in(i,j,k)
111             ENDDO
112         ENDDO
113     ENDDO
114     !$acc end kernels
115     !$OMP  END PARALLEL
116
117 END SUBROUTINE resort_for_xy
118
119
120 SUBROUTINE transpose_xy( f_inv, f_out )
121
122!------------------------------------------------------------------------------!
123! Description:
124! ------------
125! Transposition of input array (f_in) from x to y. For the input array, all
126! elements along x reside on the same PE, while after transposition, all
127! elements along y reside on the same PE.
128!------------------------------------------------------------------------------!
129
130    USE cpulog
131    USE indices
132    USE pegrid
133    USE transpose_indices
134
135    IMPLICIT NONE
136
137    INTEGER ::  i, j, k, l, ys
138   
139    REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
140
141    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
142
143
144    IF ( numprocs /= 1 )  THEN
145
146#if defined( __parallel )
147!
148!--    Transpose array
149       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
150       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
151       !$acc update host( f_inv )
152       CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
153                          work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
154                          comm1dy, ierr )
155       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
156
157!
158!--    Reorder transposed array
159!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
160!$OMP  DO
161       !$acc data copyin( work )
162       DO  l = 0, pdims(2) - 1
163          ys = 0 + l * ( nyn_x - nys_x + 1 )
164          !$acc kernels present( f_out, work )
165          DO  i = nxl_y, nxr_y
166             DO  k = nzb_y, nzt_y
167                DO  j = ys, ys + nyn_x - nys_x
168                   f_out(j,i,k) = work(j-ys+1,k,i,l)
169                ENDDO
170             ENDDO
171          ENDDO
172          !$acc end kernels
173       ENDDO
174       !$acc end data
175!$OMP  END PARALLEL
176#endif
177
178    ELSE
179
180!
181!--    Reorder transposed array
182!$OMP  PARALLEL PRIVATE ( i, j, k )
183!$OMP  DO
184       !$acc kernels present( f_inv, f_out )
185       DO  k = nzb_y, nzt_y
186          DO  i = nxl_y, nxr_y
187             DO  j = 0, ny
188                f_out(j,i,k) = f_inv(j,k,i)
189             ENDDO
190          ENDDO
191       ENDDO
192       !$acc end kernels
193!$OMP  END PARALLEL
194
195    ENDIF
196
197 END SUBROUTINE transpose_xy
198
199
200 SUBROUTINE resort_for_xz( f_inv, f_out )
201
202!------------------------------------------------------------------------------!
203! Description:
204! ------------
205! Resorting data after the transposition from x to z. The transposition itself
206! is carried out in transpose_xz
207!------------------------------------------------------------------------------!
208
209     USE indices
210     USE transpose_indices
211
212     IMPLICIT NONE
213
214     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
215     REAL ::  f_out(1:nz,nys:nyn,nxl:nxr)
216
217
218     INTEGER ::  i, j, k
219
220!
221!-- Rearrange indices of input array in order to make data to be send
222!-- by MPI contiguous.
223!-- In case of parallel fft/transposition, scattered store is faster in
224!-- backward direction!!!
225    !$OMP  PARALLEL PRIVATE ( i, j, k )
226    !$OMP  DO
227    !$acc kernels present( f_inv, f_out )
228     DO  k = 1, nz
229         DO  i = nxl, nxr
230             DO  j = nys, nyn
231                 f_out(k,j,i) = f_inv(j,i,k)
232             ENDDO
233         ENDDO
234     ENDDO
235     !$acc end kernels
236     !$OMP  END PARALLEL
237
238 END SUBROUTINE resort_for_xz
239
240
241 SUBROUTINE transpose_xz( f_in, f_inv )
242
243!------------------------------------------------------------------------------!
244! Description:
245! ------------
246! Transposition of input array (f_in) from x to z. For the input array, all
247! elements along x reside on the same PE, while after transposition, all
248! elements along z reside on the same PE.
249!------------------------------------------------------------------------------!
250
251    USE cpulog
252    USE indices
253    USE pegrid
254    USE transpose_indices
255
256    IMPLICIT NONE
257
258    INTEGER ::  i, j, k, l, xs
259   
260    REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_inv(nys:nyn,nxl:nxr,1:nz)
261
262    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
263
264
265!
266!-- If the PE grid is one-dimensional along y, the array has only to be
267!-- reordered locally and therefore no transposition has to be done.
268    IF ( pdims(1) /= 1 )  THEN
269
270#if defined( __parallel )
271!
272!--    Reorder input array for transposition
273!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
274!$OMP  DO
275       !$acc data copyout( work )
276       DO  l = 0, pdims(1) - 1
277          xs = 0 + l * nnx
278          !$acc kernels present( f_in, work )
279          DO  k = nzb_x, nzt_x
280             DO  i = xs, xs + nnx - 1
281                DO  j = nys_x, nyn_x
282                   work(j,i-xs+1,k,l) = f_in(i,j,k)
283                ENDDO
284             ENDDO
285          ENDDO
286          !$acc end kernels
287       ENDDO
288       !$acc end data
289!$OMP  END PARALLEL
290
291!
292!--    Transpose array
293       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
294       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
295       CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
296                          f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
297                          comm1dx, ierr )
298       !$acc update device( f_inv )
299       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
300#endif
301
302    ELSE
303
304!
305!--    Reorder the array in a way that the z index is in first position
306!$OMP  PARALLEL PRIVATE ( i, j, k )
307!$OMP  DO
308       !$acc kernels present( f_in, f_inv )
309       DO  i = nxl, nxr
310          DO  j = nys, nyn
311             DO  k = 1, nz
312                f_inv(j,i,k) = f_in(i,j,k)
313             ENDDO
314          ENDDO
315       ENDDO
316       !$acc end kernels
317!$OMP  END PARALLEL
318
319    ENDIF
320
321 END SUBROUTINE transpose_xz
322
323
324 SUBROUTINE resort_for_yx( f_inv, f_out )
325
326!------------------------------------------------------------------------------!
327! Description:
328! ------------
329! Resorting data after the transposition from y to x. The transposition itself
330! is carried out in transpose_yx
331!------------------------------------------------------------------------------!
332
333     USE indices
334     USE transpose_indices
335
336     IMPLICIT NONE
337
338     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
339     REAL ::  f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
340
341
342     INTEGER ::  i, j, k
343
344!
345!-- Rearrange indices of input array in order to make data to be send
346!-- by MPI contiguous
347    !$OMP  PARALLEL PRIVATE ( i, j, k )
348    !$OMP  DO
349    !$acc kernels present( f_inv, f_out )
350     DO  i = 0, nx
351         DO  k = nzb_x, nzt_x
352             DO  j = nys_x, nyn_x
353                 f_out(i,j,k) = f_inv(j,k,i)
354             ENDDO
355         ENDDO
356     ENDDO
357     !$acc end kernels
358     !$OMP  END PARALLEL
359
360 END SUBROUTINE resort_for_yx
361
362
363 SUBROUTINE transpose_yx( f_in, f_inv )
364
365!------------------------------------------------------------------------------!
366! Description:
367! ------------
368! Transposition of input array (f_in) from y to x. For the input array, all
369! elements along y reside on the same PE, while after transposition, all
370! elements along x reside on the same PE.
371!------------------------------------------------------------------------------!
372
373    USE cpulog
374    USE indices
375    USE pegrid
376    USE transpose_indices
377
378    IMPLICIT NONE
379
380    INTEGER ::  i, j, k, l, ys
381   
382    REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
383
384    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
385
386
387    IF ( numprocs /= 1 )  THEN
388
389#if defined( __parallel )
390!
391!--    Reorder input array for transposition
392!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
393!$OMP  DO
394       !$acc data copyout( work )
395       DO  l = 0, pdims(2) - 1
396          ys = 0 + l * ( nyn_x - nys_x + 1 )
397          !$acc kernels present( f_in, work )
398          DO  i = nxl_y, nxr_y
399             DO  k = nzb_y, nzt_y
400                DO  j = ys, ys + nyn_x - nys_x
401                   work(j-ys+1,k,i,l) = f_in(j,i,k)
402                ENDDO
403             ENDDO
404          ENDDO
405          !$acc end kernels
406       ENDDO
407       !$acc end data
408!$OMP  END PARALLEL
409
410!
411!--    Transpose array
412       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
413       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
414       CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
415                          f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
416                          comm1dy, ierr )
417       !$acc update device( f_inv )
418       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
419#endif
420
421    ELSE
422
423!
424!--    Reorder array f_in the same way as ALLTOALL did it
425!$OMP  PARALLEL PRIVATE ( i, j, k )
426!$OMP  DO
427       !$acc kernels present( f_in, f_inv )
428       DO  i = nxl_y, nxr_y
429          DO  k = nzb_y, nzt_y
430             DO  j = 0, ny
431                f_inv(j,k,i) = f_in(j,i,k)
432             ENDDO
433          ENDDO
434       ENDDO
435       !$acc end kernels
436!$OMP  END PARALLEL
437
438    ENDIF
439
440 END SUBROUTINE transpose_yx
441
442
443 SUBROUTINE transpose_yxd( f_in, f_out )
444
445!------------------------------------------------------------------------------!
446! Description:
447! ------------
448! Transposition of input array (f_in) from y to x. For the input array, all
449! elements along y reside on the same PE, while after transposition, all
450! elements along x reside on the same PE.
451! This is a direct transposition for arrays with indices in regular order
452! (k,j,i) (cf. transpose_yx).
453!------------------------------------------------------------------------------!
454
455    USE cpulog
456    USE indices
457    USE pegrid
458    USE transpose_indices
459
460    IMPLICIT NONE
461
462    INTEGER ::  i, j, k, l, m, xs
463
464    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
465             f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x),                     &
466             work(nnx*nny*nnz)
467
468#if defined( __parallel )
469
470!
471!-- Rearrange indices of input array in order to make data to be send
472!-- by MPI contiguous
473    DO  k = 1, nz
474       DO  j = nys, nyn
475          DO  i = nxl, nxr
476             f_inv(i,k,j) = f_in(k,j,i)
477          ENDDO
478       ENDDO
479    ENDDO
480
481!
482!-- Transpose array
483    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
484    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
485    CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
486                       work(1),          sendrecvcount_xy, MPI_REAL, &
487                       comm1dx, ierr )
488    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
489
490!
491!-- Reorder transposed array
492    m = 0
493    DO  l = 0, pdims(1) - 1
494       xs = 0 + l * nnx
495       DO  j = nys_x, nyn_x
496          DO  k = 1, nz
497             DO  i = xs, xs + nnx - 1
498                m = m + 1
499                f_out(i,j,k) = work(m)
500             ENDDO
501          ENDDO
502       ENDDO
503    ENDDO
504
505#endif
506
507 END SUBROUTINE transpose_yxd
508
509
510 SUBROUTINE resort_for_yz( f_in, f_inv )
511
512!------------------------------------------------------------------------------!
513! Description:
514! ------------
515! Resorting data for the transposition from y to z. The transposition itself
516! is carried out in transpose_yz
517!------------------------------------------------------------------------------!
518
519     USE indices
520     USE transpose_indices
521
522     IMPLICIT NONE
523
524     REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
525     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
526
527
528     INTEGER ::  i, j, k
529
530!
531!-- Rearrange indices of input array in order to make data to be send
532!-- by MPI contiguous
533    !$OMP  PARALLEL PRIVATE ( i, j, k )
534    !$OMP  DO
535    !$acc kernels present( f_in, f_inv )
536     DO  j = 0, ny
537         DO  k = nzb_y, nzt_y
538             DO  i = nxl_y, nxr_y
539                 f_inv(i,k,j) = f_in(j,i,k)
540             ENDDO
541         ENDDO
542     ENDDO
543     !$acc end kernels
544     !$OMP  END PARALLEL
545
546 END SUBROUTINE resort_for_yz
547
548
549 SUBROUTINE transpose_yz( f_inv, f_out )
550
551!------------------------------------------------------------------------------!
552! Description:
553! ------------
554! Transposition of input array (f_in) from y to z. For the input array, all
555! elements along y reside on the same PE, while after transposition, all
556! elements along z reside on the same PE.
557!------------------------------------------------------------------------------!
558
559    USE cpulog
560    USE indices
561    USE pegrid
562    USE transpose_indices
563
564    IMPLICIT NONE
565
566    INTEGER ::  i, j, k, l, zs
567   
568    REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
569
570    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
571
572
573!
574!-- If the PE grid is one-dimensional along y, only local reordering
575!-- of the data is necessary and no transposition has to be done.
576    IF ( pdims(1) == 1 )  THEN
577
578!$OMP  PARALLEL PRIVATE ( i, j, k )
579!$OMP  DO
580       !$acc kernels present( f_inv, f_out )
581       DO  j = 0, ny
582          DO  k = nzb_y, nzt_y
583             DO  i = nxl_y, nxr_y
584                f_out(i,j,k) = f_inv(i,k,j)
585             ENDDO
586          ENDDO
587       ENDDO
588       !$acc end kernels
589!$OMP  END PARALLEL
590
591    ELSE
592
593#if defined( __parallel )
594!
595!--    Transpose array
596       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
597       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
598       !$acc update host( f_inv )
599       CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
600                          work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
601                          comm1dx, ierr )
602       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
603
604!
605!--    Reorder transposed array
606!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
607!$OMP  DO
608       !$acc data copyin( work )
609       DO  l = 0, pdims(1) - 1
610          zs = 1 + l * ( nzt_y - nzb_y + 1 )
611          !$acc kernels present( f_out )
612          DO  j = nys_z, nyn_z
613             DO  k = zs, zs + nzt_y - nzb_y
614                DO  i = nxl_z, nxr_z
615                   f_out(i,j,k) = work(i,k-zs+1,j,l)
616                ENDDO
617             ENDDO
618          ENDDO
619          !$acc end kernels
620       ENDDO
621       !$acc end data
622!$OMP  END PARALLEL
623#endif
624
625   ENDIF
626
627 END SUBROUTINE transpose_yz
628
629
630 SUBROUTINE resort_for_zx( f_in, f_inv )
631
632!------------------------------------------------------------------------------!
633! Description:
634! ------------
635! Resorting data for the transposition from z to x. The transposition itself
636! is carried out in transpose_zx
637!------------------------------------------------------------------------------!
638
639     USE indices
640     USE transpose_indices
641
642     IMPLICIT NONE
643
644     REAL ::  f_in(1:nz,nys:nyn,nxl:nxr)
645     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
646
647
648     INTEGER ::  i, j, k
649
650!
651!-- Rearrange indices of input array in order to make data to be send
652!-- by MPI contiguous
653    !$OMP  PARALLEL PRIVATE ( i, j, k )
654    !$OMP  DO
655    !$acc kernels present( f_in, f_inv )
656     DO  k = 1,nz
657         DO  i = nxl, nxr
658             DO  j = nys, nyn
659                 f_inv(j,i,k) = f_in(k,j,i)
660             ENDDO
661         ENDDO
662     ENDDO
663     !$acc end kernels
664     !$OMP  END PARALLEL
665
666 END SUBROUTINE resort_for_zx
667
668
669 SUBROUTINE transpose_zx( f_inv, f_out )
670
671!------------------------------------------------------------------------------!
672! Description:
673! ------------
674! Transposition of input array (f_in) from z to x. For the input array, all
675! elements along z reside on the same PE, while after transposition, all
676! elements along x reside on the same PE.
677!------------------------------------------------------------------------------!
678
679    USE cpulog
680    USE indices
681    USE pegrid
682    USE transpose_indices
683
684    IMPLICIT NONE
685
686    INTEGER ::  i, j, k, l, xs
687   
688    REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
689
690    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
691
692
693!
694!-- If the PE grid is one-dimensional along y, only local reordering
695!-- of the data is necessary and no transposition has to be done.
696    IF ( pdims(1) == 1 )  THEN
697
698!$OMP  PARALLEL PRIVATE ( i, j, k )
699!$OMP  DO
700       !$acc kernels present( f_inv, f_out )
701       DO  k = 1, nz
702          DO  i = nxl, nxr
703             DO  j = nys, nyn
704                f_out(i,j,k) = f_inv(j,i,k)
705             ENDDO
706          ENDDO
707       ENDDO
708       !$acc end kernels
709!$OMP  END PARALLEL
710
711    ELSE
712
713#if defined( __parallel )
714!
715!--    Transpose array
716       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
717       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
718       !$acc update host( f_inv )
719       CALL MPI_ALLTOALL( f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
720                          work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
721                          comm1dx, ierr )
722       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
723
724!
725!--    Reorder transposed array
726!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
727!$OMP  DO
728       !$acc data copyin( work )
729       DO  l = 0, pdims(1) - 1
730          xs = 0 + l * nnx
731          !$acc kernels present( f_out )
732          DO  k = nzb_x, nzt_x
733             DO  i = xs, xs + nnx - 1
734                DO  j = nys_x, nyn_x
735                   f_out(i,j,k) = work(j,i-xs+1,k,l)
736                ENDDO
737             ENDDO
738          ENDDO
739          !$acc end kernels
740       ENDDO
741       !$acc end data
742!$OMP  END PARALLEL
743#endif
744
745    ENDIF
746
747 END SUBROUTINE transpose_zx
748
749
750 SUBROUTINE resort_for_zy( f_inv, f_out )
751
752!------------------------------------------------------------------------------!
753! Description:
754! ------------
755! Resorting data after the transposition from z to y. The transposition itself
756! is carried out in transpose_zy
757!------------------------------------------------------------------------------!
758
759     USE indices
760     USE transpose_indices
761
762     IMPLICIT NONE
763
764     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
765     REAL ::  f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
766
767
768     INTEGER ::  i, j, k
769
770!
771!-- Rearrange indices of input array in order to make data to be send
772!-- by MPI contiguous
773    !$OMP  PARALLEL PRIVATE ( i, j, k )
774    !$OMP  DO
775    !$acc kernels present( f_inv, f_out )
776     DO  k = nzb_y, nzt_y
777         DO  j = 0, ny
778             DO  i = nxl_y, nxr_y
779                 f_out(j,i,k) = f_inv(i,k,j)
780             ENDDO
781         ENDDO
782     ENDDO
783     !$acc end kernels
784     !$OMP  END PARALLEL
785
786 END SUBROUTINE resort_for_zy
787
788
789 SUBROUTINE transpose_zy( f_in, f_inv )
790
791!------------------------------------------------------------------------------!
792! Description:
793! ------------
794! Transposition of input array (f_in) from z to y. For the input array, all
795! elements along z reside on the same PE, while after transposition, all
796! elements along y reside on the same PE.
797!------------------------------------------------------------------------------!
798
799    USE cpulog
800    USE indices
801    USE pegrid
802    USE transpose_indices
803
804    IMPLICIT NONE
805
806    INTEGER ::  i, j, k, l, zs
807   
808    REAL ::  f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
809
810    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
811
812
813!
814!-- If the PE grid is one-dimensional along y, the array has only to be
815!-- reordered locally and therefore no transposition has to be done.
816    IF ( pdims(1) /= 1 )  THEN
817
818#if defined( __parallel )
819!
820!--    Reorder input array for transposition
821!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
822!$OMP  DO
823       !$acc data copyout( work )
824       DO  l = 0, pdims(1) - 1
825          zs = 1 + l * ( nzt_y - nzb_y + 1 )
826          !$acc kernels present( f_in, work )
827          DO  j = nys_z, nyn_z
828             DO  k = zs, zs + nzt_y - nzb_y
829                DO  i = nxl_z, nxr_z
830                   work(i,k-zs+1,j,l) = f_in(i,j,k)
831                ENDDO
832             ENDDO
833          ENDDO
834          !$acc end kernels
835       ENDDO
836       !$acc end data
837!$OMP  END PARALLEL
838
839!
840!--    Transpose array
841       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
842       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
843       CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
844                          f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
845                          comm1dx, ierr )
846       !$acc update device( f_inv )
847       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
848#endif
849
850    ELSE
851!
852!--    Reorder the array in the same way like ALLTOALL did it
853!$OMP  PARALLEL PRIVATE ( i, j, k )
854!$OMP  DO
855       !$acc kernels present( f_in, f_inv )
856       DO  k = nzb_y, nzt_y
857          DO  j = 0, ny
858             DO  i = nxl_y, nxr_y
859                f_inv(i,k,j) = f_in(i,j,k)
860             ENDDO
861          ENDDO
862       ENDDO
863       !$acc end kernels
864!$OMP  END PARALLEL
865
866    ENDIF
867
868 END SUBROUTINE transpose_zy
869
870
871 SUBROUTINE transpose_zyd( f_in, f_out )
872
873!------------------------------------------------------------------------------!
874! Description:
875! ------------
876! Transposition of input array (f_in) from z to y. For the input array, all
877! elements along z reside on the same PE, while after transposition, all
878! elements along y reside on the same PE.
879! This is a direct transposition for arrays with indices in regular order
880! (k,j,i) (cf. transpose_zy).
881!------------------------------------------------------------------------------!
882
883    USE cpulog
884    USE indices
885    USE pegrid
886    USE transpose_indices
887
888    IMPLICIT NONE
889
890    INTEGER ::  i, j, k, l, m, ys
891   
892    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
893             f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd),                 &
894             work(nnx*nny*nnz)
895
896#if defined( __parallel )
897
898!
899!-- Rearrange indices of input array in order to make data to be send
900!-- by MPI contiguous
901    DO  i = nxl, nxr
902       DO  j = nys, nyn
903          DO  k = 1, nz
904             f_inv(j,i,k) = f_in(k,j,i)
905          ENDDO
906       ENDDO
907    ENDDO
908
909!
910!-- Move data to different array, because memory location of work1 is
911!-- needed further below (work1 = work2).
912!-- If the PE grid is one-dimensional along x, only local reordering
913!-- of the data is necessary and no transposition has to be done.
914    IF ( pdims(2) == 1 )  THEN
915       DO  k = 1, nz
916          DO  i = nxl, nxr
917             DO  j = nys, nyn
918                f_out(j,i,k) = f_inv(j,i,k)
919             ENDDO
920          ENDDO
921       ENDDO
922       RETURN
923    ENDIF
924
925!
926!-- Transpose array
927    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
928    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
929    CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
930                       work(1),          sendrecvcount_zyd, MPI_REAL, &
931                       comm1dy, ierr )
932    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
933
934!
935!-- Reorder transposed array
936    m = 0
937    DO  l = 0, pdims(2) - 1
938       ys = 0 + l * nny
939       DO  k = nzb_yd, nzt_yd
940          DO  i = nxl_yd, nxr_yd
941             DO  j = ys, ys + nny - 1
942                m = m + 1
943                f_out(j,i,k) = work(m)
944             ENDDO
945          ENDDO
946       ENDDO
947    ENDDO
948
949#endif
950
951 END SUBROUTINE transpose_zyd
Note: See TracBrowser for help on using the repository browser.