source: palm/trunk/SOURCE/transpose.f90 @ 1318

Last change on this file since 1318 was 1318, checked in by raasch, 10 years ago

former files/routines cpu_log and cpu_statistics combined to one module,
which also includes the former data module cpulog from the modules-file,
module interfaces removed

1 SUBROUTINE resort_for_xy( f_in, f_inv )
2
3!--------------------------------------------------------------------------------!
4! This file is part of PALM.
5!
6! PALM is free software: you can redistribute it and/or modify it under the terms
7! of the GNU General Public License as published by the Free Software Foundation,
8! either version 3 of the License, or (at your option) any later version.
9!
10! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12! A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13!
14! You should have received a copy of the GNU General Public License along with
15! PALM. If not, see <http://www.gnu.org/licenses/>.
16!
17! Copyright 1997-2014 Leibniz Universitaet Hannover
18!--------------------------------------------------------------------------------!
19!
20! Current revisions:
21! -----------------
22! cpu_log_nowait parameter added to cpu measurements of the transpositions
23! required for solving the Poisson equation (poisfft),
24! module interfaces removed
25!
26! Former revisions:
27! -----------------
28! $Id: transpose.f90 1318 2014-03-17 13:35:16Z raasch $
29!
30! 1257 2013-11-08 15:18:40Z raasch
31! openacc loop and loop vector clauses removed
32!
33! 1216 2013-08-26 09:31:42Z raasch
34! re-sorting of the transposed / to be transposed arrays moved to separate
35! routines resort_for_...
36!
37! 1111 2013-03-08 23:54:10Z raasch
38! openACC directives added,
39! resorting data from/to work changed, work got 4 dimensions instead of 1
40!
41! 1106 2013-03-04 05:31:38Z raasch
42! preprocessor lines rearranged so that routines can also be used in serial
43! (non-parallel) mode
44!
45! 1092 2013-02-02 11:24:22Z raasch
46! unused variables removed
47!
48! 1036 2012-10-22 13:43:42Z raasch
49! code put under GPL (PALM 3.9)
50!
51! 1003 2012-09-14 14:35:53Z raasch
52! indices nxa, nya, etc. replaced by nx, ny, etc.
53!
54! 683 2011-02-09 14:25:15Z raasch
55! openMP parallelization of transpositions for 2d-domain-decomposition
56!
57! 622 2010-12-10 08:08:13Z raasch
58! optional barriers included in order to speed up collective operations
59!
60! 164 2008-05-15 08:46:15Z raasch
61! f_inv changed from subroutine argument to automatic array in order to do
62! re-ordering from f_in to f_inv in one step, one array work is needed instead
63! of work1 and work2
64!
65! February 2007
66! RCS Log replaced by Id keyword, revision history cleaned up
67!
68! Revision 1.2  2004/04/30 13:12:17  raasch
69! Switched from mpi_alltoallv to the simpler mpi_alltoall,
70! all former transpose-routine files collected in this file, enlarged
71! transposition arrays introduced
72!
73! Revision 1.1  2004/04/30 13:08:16  raasch
74! Initial revision (collection of former routines transpose_xy, transpose_xz,
75!                   transpose_yx, transpose_yz, transpose_zx, transpose_zy)
76!
77! Revision 1.1  1997/07/24 11:25:18  raasch
78! Initial revision
79!
80!------------------------------------------------------------------------------!
81! Description:
82! ------------
83! Resorting data for the transposition from x to y. The transposition itself
84! is carried out in transpose_xy
85!------------------------------------------------------------------------------!
86
87     USE indices
88     USE transpose_indices
89
90     IMPLICIT NONE
91
92     REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
93     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
94
95
96     INTEGER ::  i, j, k
97
98!
99!-- Rearrange indices of input array in order to make data to be sent
100!-- by MPI contiguous
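!-- (The loops below store f_inv(j,k,i) = f_in(i,j,k), i.e. the x index
!-- becomes the slowest-varying dimension; transpose_xy then passes f_inv
!-- directly to MPI_ALLTOALL on communicator comm1dy.)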
101    !$OMP  PARALLEL PRIVATE ( i, j, k )
102    !$OMP  DO
103    !$acc kernels present( f_in, f_inv )
104     DO  i = 0, nx
105         DO  k = nzb_x, nzt_x
106             DO  j = nys_x, nyn_x
107                 f_inv(j,k,i) = f_in(i,j,k)
108             ENDDO
109         ENDDO
110     ENDDO
111     !$acc end kernels
112     !$OMP  END PARALLEL
113
114 END SUBROUTINE resort_for_xy
115
116
117 SUBROUTINE transpose_xy( f_inv, f_out )
118
119!------------------------------------------------------------------------------!
120! Description:
121! ------------
122! Transposition of input array (f_inv) from x to y. For the input array, all
123! elements along x reside on the same PE, while after transposition, all
124! elements along y reside on the same PE.
125!------------------------------------------------------------------------------!
126
127    USE cpulog
128    USE indices
129    USE pegrid
130    USE transpose_indices
131
132    IMPLICIT NONE
133
134    INTEGER ::  i, j, k, l, ys
135   
136    REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
137
138    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
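!
!-- work receives the data from MPI_ALLTOALL as one block per PE of the
!-- 1d communicator comm1dy; the last dimension counts these blocks.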
139
140
141    IF ( numprocs /= 1 )  THEN
142
143#if defined( __parallel )
144!
145!--    Transpose array
146       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
147       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
148       !$acc update host( f_inv )
149       CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
150                          work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
151                          comm1dy, ierr )
152       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
153
154!
155!--    Reorder transposed array
156!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
157!$OMP  DO
158       !$acc data copyin( work )
159       DO  l = 0, pdims(2) - 1
160          ys = 0 + l * ( nyn_x - nys_x + 1 )
161          !$acc kernels present( f_out, work )
162          DO  i = nxl_y, nxr_y
163             DO  k = nzb_y, nzt_y
164                DO  j = ys, ys + nyn_x - nys_x
165                   f_out(j,i,k) = work(j-ys+1,k,i,l)
166                ENDDO
167             ENDDO
168          ENDDO
169          !$acc end kernels
170       ENDDO
171       !$acc end data
172!$OMP  END PARALLEL
173#endif
174
175    ELSE
176
177!
178!--    Reorder transposed array
179!$OMP  PARALLEL PRIVATE ( i, j, k )
180!$OMP  DO
181       !$acc kernels present( f_inv, f_out )
182       DO  k = nzb_y, nzt_y
183          DO  i = nxl_y, nxr_y
184             DO  j = 0, ny
185                f_out(j,i,k) = f_inv(j,k,i)
186             ENDDO
187          ENDDO
188       ENDDO
189       !$acc end kernels
190!$OMP  END PARALLEL
191
192    ENDIF
193
194 END SUBROUTINE transpose_xy
195
196
197 SUBROUTINE resort_for_xz( f_inv, f_out )
198
199!------------------------------------------------------------------------------!
200! Description:
201! ------------
202! Resorting data after the transposition from x to z. The transposition itself
203! is carried out in transpose_xz
204!------------------------------------------------------------------------------!
205
206     USE indices
207     USE transpose_indices
208
209     IMPLICIT NONE
210
211     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
212     REAL ::  f_out(1:nz,nys:nyn,nxl:nxr)
213
214
215     INTEGER ::  i, j, k
216
217!
218!-- Rearrange indices of the array transposed in transpose_xz back into
219!-- the regular (k,j,i) order.
220!-- In case of parallel fft/transposition, scattered store is faster in
221!-- backward direction!!!
222    !$OMP  PARALLEL PRIVATE ( i, j, k )
223    !$OMP  DO
224    !$acc kernels present( f_inv, f_out )
225     DO  k = 1, nz
226         DO  i = nxl, nxr
227             DO  j = nys, nyn
228                 f_out(k,j,i) = f_inv(j,i,k)
229             ENDDO
230         ENDDO
231     ENDDO
232     !$acc end kernels
233     !$OMP  END PARALLEL
234
235 END SUBROUTINE resort_for_xz
236
237
238 SUBROUTINE transpose_xz( f_in, f_inv )
239
240!------------------------------------------------------------------------------!
241! Description:
242! ------------
243! Transposition of input array (f_in) from x to z. For the input array, all
244! elements along x reside on the same PE, while after transposition, all
245! elements along z reside on the same PE.
246!------------------------------------------------------------------------------!
247
248    USE cpulog
249    USE indices
250    USE pegrid
251    USE transpose_indices
252
253    IMPLICIT NONE
254
255    INTEGER ::  i, j, k, l, xs
256   
257    REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_inv(nys:nyn,nxl:nxr,1:nz)
258
259    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
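!
!-- work collects the data to be sent as one block per PE of the 1d
!-- communicator comm1dx; block l holds the x indices xs = l*nnx ... xs+nnx-1.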
260
261
262!
263!-- If the PE grid is one-dimensional along y, the array has only to be
264!-- reordered locally and therefore no transposition has to be done.
265    IF ( pdims(1) /= 1 )  THEN
266
267#if defined( __parallel )
268!
269!--    Reorder input array for transposition
270!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
271!$OMP  DO
272       !$acc data copyout( work )
273       DO  l = 0, pdims(1) - 1
274          xs = 0 + l * nnx
275          !$acc kernels present( f_in, work )
276          DO  k = nzb_x, nzt_x
277             DO  i = xs, xs + nnx - 1
278                DO  j = nys_x, nyn_x
279                   work(j,i-xs+1,k,l) = f_in(i,j,k)
280                ENDDO
281             ENDDO
282          ENDDO
283          !$acc end kernels
284       ENDDO
285       !$acc end data
286!$OMP  END PARALLEL
287
288!
289!--    Transpose array
290       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
291       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
292       CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
293                          f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
294                          comm1dx, ierr )
295       !$acc update device( f_inv )
296       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
297#endif
298
299    ELSE
300
301!
302!--    Reorder the array so that the z index is in the first position
303!$OMP  PARALLEL PRIVATE ( i, j, k )
304!$OMP  DO
305       !$acc kernels present( f_in, f_inv )
306       DO  i = nxl, nxr
307          DO  j = nys, nyn
308             DO  k = 1, nz
309                f_inv(j,i,k) = f_in(i,j,k)
310             ENDDO
311          ENDDO
312       ENDDO
313       !$acc end kernels
314!$OMP  END PARALLEL
315
316    ENDIF
317
318 END SUBROUTINE transpose_xz
319
320
321 SUBROUTINE resort_for_yx( f_inv, f_out )
322
323!------------------------------------------------------------------------------!
324! Description:
325! ------------
326! Resorting data after the transposition from y to x. The transposition itself
327! is carried out in transpose_yx
328!------------------------------------------------------------------------------!
329
330     USE indices
331     USE transpose_indices
332
333     IMPLICIT NONE
334
335     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
336     REAL ::  f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
337
338
339     INTEGER ::  i, j, k
340
341!
342!-- Rearrange indices of the array transposed in transpose_yx, making x
343!-- the fastest-varying dimension again
344    !$OMP  PARALLEL PRIVATE ( i, j, k )
345    !$OMP  DO
346    !$acc kernels present( f_inv, f_out )
347     DO  i = 0, nx
348         DO  k = nzb_x, nzt_x
349             DO  j = nys_x, nyn_x
350                 f_out(i,j,k) = f_inv(j,k,i)
351             ENDDO
352         ENDDO
353     ENDDO
354     !$acc end kernels
355     !$OMP  END PARALLEL
356
357 END SUBROUTINE resort_for_yx
358
359
360 SUBROUTINE transpose_yx( f_in, f_inv )
361
362!------------------------------------------------------------------------------!
363! Description:
364! ------------
365! Transposition of input array (f_in) from y to x. For the input array, all
366! elements along y reside on the same PE, while after transposition, all
367! elements along x reside on the same PE.
368!------------------------------------------------------------------------------!
369
370    USE cpulog
371    USE indices
372    USE pegrid
373    USE transpose_indices
374
375    IMPLICIT NONE
376
377    INTEGER ::  i, j, k, l, ys
378   
379    REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
380
381    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
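!
!-- work collects the data to be sent as one block per PE of comm1dy; block l
!-- holds the y indices ys = l*(nyn_x-nys_x+1) ... ys+nyn_x-nys_x of f_in.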
382
383
384    IF ( numprocs /= 1 )  THEN
385
386#if defined( __parallel )
387!
388!--    Reorder input array for transposition
389!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
390!$OMP  DO
391       !$acc data copyout( work )
392       DO  l = 0, pdims(2) - 1
393          ys = 0 + l * ( nyn_x - nys_x + 1 )
394          !$acc kernels present( f_in, work )
395          DO  i = nxl_y, nxr_y
396             DO  k = nzb_y, nzt_y
397                DO  j = ys, ys + nyn_x - nys_x
398                   work(j-ys+1,k,i,l) = f_in(j,i,k)
399                ENDDO
400             ENDDO
401          ENDDO
402          !$acc end kernels
403       ENDDO
404       !$acc end data
405!$OMP  END PARALLEL
406
407!
408!--    Transpose array
409       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
410       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
411       CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
412                          f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
413                          comm1dy, ierr )
414       !$acc update device( f_inv )
415       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
416#endif
417
418    ELSE
419
420!
421!--    Reorder array f_in in the same way as ALLTOALL would have done
422!$OMP  PARALLEL PRIVATE ( i, j, k )
423!$OMP  DO
424       !$acc kernels present( f_in, f_inv )
425       DO  i = nxl_y, nxr_y
426          DO  k = nzb_y, nzt_y
427             DO  j = 0, ny
428                f_inv(j,k,i) = f_in(j,i,k)
429             ENDDO
430          ENDDO
431       ENDDO
432       !$acc end kernels
433!$OMP  END PARALLEL
434
435    ENDIF
436
437 END SUBROUTINE transpose_yx
438
439
440 SUBROUTINE transpose_yxd( f_in, f_out )
441
442!------------------------------------------------------------------------------!
443! Description:
444! ------------
445! Transposition of input array (f_in) from y to x. For the input array, all
446! elements along y reside on the same PE, while after transposition, all
447! elements along x reside on the same PE.
448! This is a direct transposition for arrays with indices in regular order
449! (k,j,i) (cf. transpose_yx).
450!------------------------------------------------------------------------------!
451
452    USE cpulog
453    USE indices
454    USE pegrid
455    USE transpose_indices
456
457    IMPLICIT NONE
458
459    INTEGER ::  i, j, k, l, m, xs
460
461    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
462             f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x),                     &
463             work(nnx*nny*nnz)
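!
!-- In this direct version work is a 1d buffer; the blocks received by
!-- MPI_ALLTOALL are unpacked element by element via the running index m.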
464
465#if defined( __parallel )
466
467!
468!-- Rearrange indices of input array in order to make data to be sent
469!-- by MPI contiguous
470    DO  k = 1, nz
471       DO  j = nys, nyn
472          DO  i = nxl, nxr
473             f_inv(i,k,j) = f_in(k,j,i)
474          ENDDO
475       ENDDO
476    ENDDO
477
478!
479!-- Transpose array
480    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
481    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
482    CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
483                       work(1),          sendrecvcount_xy, MPI_REAL, &
484                       comm1dx, ierr )
485    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
486
487!
488!-- Reorder transposed array
489    m = 0
490    DO  l = 0, pdims(1) - 1
491       xs = 0 + l * nnx
492       DO  j = nys_x, nyn_x
493          DO  k = 1, nz
494             DO  i = xs, xs + nnx - 1
495                m = m + 1
496                f_out(i,j,k) = work(m)
497             ENDDO
498          ENDDO
499       ENDDO
500    ENDDO
501
502#endif
503
504 END SUBROUTINE transpose_yxd
505
506
507 SUBROUTINE resort_for_yz( f_in, f_inv )
508
509!------------------------------------------------------------------------------!
510! Description:
511! ------------
512! Resorting data for the transposition from y to z. The transposition itself
513! is carried out in transpose_yz
514!------------------------------------------------------------------------------!
515
516     USE indices
517     USE transpose_indices
518
519     IMPLICIT NONE
520
521     REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
522     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
523
524
525     INTEGER ::  i, j, k
526
527!
528!-- Rearrange indices of input array in order to make data to be sent
529!-- by MPI contiguous
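!-- (The loops below store f_inv(i,k,j) = f_in(j,i,k), i.e. the y index
!-- becomes the slowest-varying dimension; transpose_yz then passes f_inv
!-- directly to MPI_ALLTOALL on communicator comm1dx.)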
530    !$OMP  PARALLEL PRIVATE ( i, j, k )
531    !$OMP  DO
532    !$acc kernels present( f_in, f_inv )
533     DO  j = 0, ny
534         DO  k = nzb_y, nzt_y
535             DO  i = nxl_y, nxr_y
536                 f_inv(i,k,j) = f_in(j,i,k)
537             ENDDO
538         ENDDO
539     ENDDO
540     !$acc end kernels
541     !$OMP  END PARALLEL
542
543 END SUBROUTINE resort_for_yz
544
545
546 SUBROUTINE transpose_yz( f_inv, f_out )
547
548!------------------------------------------------------------------------------!
549! Description:
550! ------------
551! Transposition of input array (f_inv) from y to z. For the input array, all
552! elements along y reside on the same PE, while after transposition, all
553! elements along z reside on the same PE.
554!------------------------------------------------------------------------------!
555
556    USE cpulog
557    USE indices
558    USE pegrid
559    USE transpose_indices
560
561    IMPLICIT NONE
562
563    INTEGER ::  i, j, k, l, zs
564   
565    REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
566
567    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
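!
!-- work receives the data from MPI_ALLTOALL as one block per PE of comm1dx;
!-- block l contains the z indices zs = 1+l*(nzt_y-nzb_y+1) ... zs+nzt_y-nzb_y
!-- of f_out.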
568
569
570!
571!-- If the PE grid is one-dimensional along y, only local reordering
572!-- of the data is necessary and no transposition has to be done.
573    IF ( pdims(1) == 1 )  THEN
574
575!$OMP  PARALLEL PRIVATE ( i, j, k )
576!$OMP  DO
577       !$acc kernels present( f_inv, f_out )
578       DO  j = 0, ny
579          DO  k = nzb_y, nzt_y
580             DO  i = nxl_y, nxr_y
581                f_out(i,j,k) = f_inv(i,k,j)
582             ENDDO
583          ENDDO
584       ENDDO
585       !$acc end kernels
586!$OMP  END PARALLEL
587
588    ELSE
589
590#if defined( __parallel )
591!
592!--    Transpose array
593       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
594       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
595       !$acc update host( f_inv )
596       CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
597                          work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
598                          comm1dx, ierr )
599       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
600
601!
602!--    Reorder transposed array
603!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
604!$OMP  DO
605       !$acc data copyin( work )
606       DO  l = 0, pdims(1) - 1
607          zs = 1 + l * ( nzt_y - nzb_y + 1 )
608          !$acc kernels present( f_out )
609          DO  j = nys_z, nyn_z
610             DO  k = zs, zs + nzt_y - nzb_y
611                DO  i = nxl_z, nxr_z
612                   f_out(i,j,k) = work(i,k-zs+1,j,l)
613                ENDDO
614             ENDDO
615          ENDDO
616          !$acc end kernels
617       ENDDO
618       !$acc end data
619!$OMP  END PARALLEL
620#endif
621
622   ENDIF
623
624 END SUBROUTINE transpose_yz
625
626
627 SUBROUTINE resort_for_zx( f_in, f_inv )
628
629!------------------------------------------------------------------------------!
630! Description:
631! ------------
632! Resorting data for the transposition from z to x. The transposition itself
633! is carried out in transpose_zx
634!------------------------------------------------------------------------------!
635
636     USE indices
637     USE transpose_indices
638
639     IMPLICIT NONE
640
641     REAL ::  f_in(1:nz,nys:nyn,nxl:nxr)
642     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
643
644
645     INTEGER ::  i, j, k
646
647!
648!-- Rearrange indices of input array in order to make data to be sent
649!-- by MPI contiguous
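!-- (The loops below store f_inv(j,i,k) = f_in(k,j,i), i.e. the z index
!-- becomes the slowest-varying dimension; transpose_zx then passes f_inv
!-- directly to MPI_ALLTOALL on communicator comm1dx.)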
650    !$OMP  PARALLEL PRIVATE ( i, j, k )
651    !$OMP  DO
652    !$acc kernels present( f_in, f_inv )
653     DO  k = 1,nz
654         DO  i = nxl, nxr
655             DO  j = nys, nyn
656                 f_inv(j,i,k) = f_in(k,j,i)
657             ENDDO
658         ENDDO
659     ENDDO
660     !$acc end kernels
661     !$OMP  END PARALLEL
662
663 END SUBROUTINE resort_for_zx
664
665
666 SUBROUTINE transpose_zx( f_inv, f_out )
667
668!------------------------------------------------------------------------------!
669! Description:
670! ------------
671! Transposition of input array (f_inv) from z to x. For the input array, all
672! elements along z reside on the same PE, while after transposition, all
673! elements along x reside on the same PE.
674!------------------------------------------------------------------------------!
675
676    USE cpulog
677    USE indices
678    USE pegrid
679    USE transpose_indices
680
681    IMPLICIT NONE
682
683    INTEGER ::  i, j, k, l, xs
684   
685    REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
686
687    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
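!
!-- work receives the data from MPI_ALLTOALL as one block per PE of comm1dx;
!-- block l contains the x indices xs = l*nnx ... xs+nnx-1 of f_out.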
688
689
690!
691!-- If the PE grid is one-dimensional along y, only local reordering
692!-- of the data is necessary and no transposition has to be done.
693    IF ( pdims(1) == 1 )  THEN
694
695!$OMP  PARALLEL PRIVATE ( i, j, k )
696!$OMP  DO
697       !$acc kernels present( f_inv, f_out )
698       DO  k = 1, nz
699          DO  i = nxl, nxr
700             DO  j = nys, nyn
701                f_out(i,j,k) = f_inv(j,i,k)
702             ENDDO
703          ENDDO
704       ENDDO
705       !$acc end kernels
706!$OMP  END PARALLEL
707
708    ELSE
709
710#if defined( __parallel )
711!
712!--    Transpose array
713       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
714       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
715       !$acc update host( f_inv )
716       CALL MPI_ALLTOALL( f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
717                          work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
718                          comm1dx, ierr )
719       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
720
721!
722!--    Reorder transposed array
723!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
724!$OMP  DO
725       !$acc data copyin( work )
726       DO  l = 0, pdims(1) - 1
727          xs = 0 + l * nnx
728          !$acc kernels present( f_out )
729          DO  k = nzb_x, nzt_x
730             DO  i = xs, xs + nnx - 1
731                DO  j = nys_x, nyn_x
732                   f_out(i,j,k) = work(j,i-xs+1,k,l)
733                ENDDO
734             ENDDO
735          ENDDO
736          !$acc end kernels
737       ENDDO
738       !$acc end data
739!$OMP  END PARALLEL
740#endif
741
742    ENDIF
743
744 END SUBROUTINE transpose_zx
745
746
747 SUBROUTINE resort_for_zy( f_inv, f_out )
748
749!------------------------------------------------------------------------------!
750! Description:
751! ------------
752! Resorting data after the transposition from z to y. The transposition itself
753! is carried out in transpose_zy
754!------------------------------------------------------------------------------!
755
756     USE indices
757     USE transpose_indices
758
759     IMPLICIT NONE
760
761     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
762     REAL ::  f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
763
764
765     INTEGER ::  i, j, k
766
767!
768!-- Rearrange indices of the array transposed in transpose_zy, making y
769!-- the fastest-varying dimension again
770    !$OMP  PARALLEL PRIVATE ( i, j, k )
771    !$OMP  DO
772    !$acc kernels present( f_inv, f_out )
773     DO  k = nzb_y, nzt_y
774         DO  j = 0, ny
775             DO  i = nxl_y, nxr_y
776                 f_out(j,i,k) = f_inv(i,k,j)
777             ENDDO
778         ENDDO
779     ENDDO
780     !$acc end kernels
781     !$OMP  END PARALLEL
782
783 END SUBROUTINE resort_for_zy
784
785
786 SUBROUTINE transpose_zy( f_in, f_inv )
787
788!------------------------------------------------------------------------------!
789! Description:
790! ------------
791! Transposition of input array (f_in) from z to y. For the input array, all
792! elements along z reside on the same PE, while after transposition, all
793! elements along y reside on the same PE.
794!------------------------------------------------------------------------------!
795
796    USE cpulog
797    USE indices
798    USE pegrid
799    USE transpose_indices
800
801    IMPLICIT NONE
802
803    INTEGER ::  i, j, k, l, zs
804   
805    REAL ::  f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
806
807    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
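!
!-- work collects the data to be sent as one block per PE of comm1dx; block l
!-- holds the z indices zs = 1+l*(nzt_y-nzb_y+1) ... zs+nzt_y-nzb_y of f_in.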
808
809
810!
811!-- If the PE grid is one-dimensional along y, the array has only to be
812!-- reordered locally and therefore no transposition has to be done.
813    IF ( pdims(1) /= 1 )  THEN
814
815#if defined( __parallel )
816!
817!--    Reorder input array for transposition
818!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
819!$OMP  DO
820       !$acc data copyout( work )
821       DO  l = 0, pdims(1) - 1
822          zs = 1 + l * ( nzt_y - nzb_y + 1 )
823          !$acc kernels present( f_in, work )
824          DO  j = nys_z, nyn_z
825             DO  k = zs, zs + nzt_y - nzb_y
826                DO  i = nxl_z, nxr_z
827                   work(i,k-zs+1,j,l) = f_in(i,j,k)
828                ENDDO
829             ENDDO
830          ENDDO
831          !$acc end kernels
832       ENDDO
833       !$acc end data
834!$OMP  END PARALLEL
835
836!
837!--    Transpose array
838       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
839       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
840       CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
841                          f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
842                          comm1dx, ierr )
843       !$acc update device( f_inv )
844       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
845#endif
846
847    ELSE
848!
849!--    Reorder the array in the same way as ALLTOALL would have done
850!$OMP  PARALLEL PRIVATE ( i, j, k )
851!$OMP  DO
852       !$acc kernels present( f_in, f_inv )
853       DO  k = nzb_y, nzt_y
854          DO  j = 0, ny
855             DO  i = nxl_y, nxr_y
856                f_inv(i,k,j) = f_in(i,j,k)
857             ENDDO
858          ENDDO
859       ENDDO
860       !$acc end kernels
861!$OMP  END PARALLEL
862
863    ENDIF
864
865 END SUBROUTINE transpose_zy
866
867
868 SUBROUTINE transpose_zyd( f_in, f_out )
869
870!------------------------------------------------------------------------------!
871! Description:
872! ------------
873! Transposition of input array (f_in) from z to y. For the input array, all
874! elements along z reside on the same PE, while after transposition, all
875! elements along y reside on the same PE.
876! This is a direct transposition for arrays with indices in regular order
877! (k,j,i) (cf. transpose_zy).
878!------------------------------------------------------------------------------!
879
880    USE cpulog
881    USE indices
882    USE pegrid
883    USE transpose_indices
884
885    IMPLICIT NONE
886
887    INTEGER ::  i, j, k, l, m, ys
888   
889    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
890             f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd),                 &
891             work(nnx*nny*nnz)
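!
!-- In this direct version work is a 1d buffer; the blocks received by
!-- MPI_ALLTOALL are unpacked element by element via the running index m.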
892
893#if defined( __parallel )
894
895!
896!-- Rearrange indices of input array in order to make data to be sent
897!-- by MPI contiguous
898    DO  i = nxl, nxr
899       DO  j = nys, nyn
900          DO  k = 1, nz
901             f_inv(j,i,k) = f_in(k,j,i)
902          ENDDO
903       ENDDO
904    ENDDO
905
906!
907!-- If the PE grid is one-dimensional along x, only local reordering
908!-- of the data (copy from f_inv to f_out) is necessary, no transposition
909!-- has to be done, and the routine returns immediately after the copy
910!-- loop below.
911    IF ( pdims(2) == 1 )  THEN
912       DO  k = 1, nz
913          DO  i = nxl, nxr
914             DO  j = nys, nyn
915                f_out(j,i,k) = f_inv(j,i,k)
916             ENDDO
917          ENDDO
918       ENDDO
919       RETURN
920    ENDIF
921
922!
923!-- Transpose array
924    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
925    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
926    CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
927                       work(1),          sendrecvcount_zyd, MPI_REAL, &
928                       comm1dy, ierr )
929    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
930
931!
932!-- Reorder transposed array
933    m = 0
934    DO  l = 0, pdims(2) - 1
935       ys = 0 + l * nny
936       DO  k = nzb_yd, nzt_yd
937          DO  i = nxl_yd, nxr_yd
938             DO  j = ys, ys + nny - 1
939                m = m + 1
940                f_out(j,i,k) = work(m)
941             ENDDO
942          ENDDO
943       ENDDO
944    ENDDO
945
946#endif
947
948 END SUBROUTINE transpose_zyd