source: palm/trunk/SOURCE/transpose.f90 @ 1316

Last change on this file since 1316 was 1310, checked in by raasch, 11 years ago

update of GPL copyright

  • Property svn:keywords set to Id
File size: 25.9 KB
Line 
1 SUBROUTINE resort_for_xy( f_in, f_inv )
2
3!--------------------------------------------------------------------------------!
4! This file is part of PALM.
5!
6! PALM is free software: you can redistribute it and/or modify it under the terms
7! of the GNU General Public License as published by the Free Software Foundation,
8! either version 3 of the License, or (at your option) any later version.
9!
10! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12! A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13!
14! You should have received a copy of the GNU General Public License along with
15! PALM. If not, see <http://www.gnu.org/licenses/>.
16!
17! Copyright 1997-2014 Leibniz Universitaet Hannover
18!--------------------------------------------------------------------------------!
19!
20! Current revisions:
21! -----------------
22!
23!
24! Former revisions:
25! -----------------
26! $Id: transpose.f90 1310 2014-03-14 08:01:56Z heinze $
27!
28! 1257 2013-11-08 15:18:40Z raasch
29! openacc loop and loop vector clauses removed
30!
31! 1216 2013-08-26 09:31:42Z raasch
32! re-sorting of the transposed / to be transposed arrays moved to separate
33! routines resort_for_...
34!
35! 1111 2013-03-08 23:54:10Z raasch
36! openACC directives added,
37! resorting data from/to work changed, work got 4 dimensions instead of 1
38!
39! 1106 2013-03-04 05:31:38Z raasch
40! preprocessor lines rearranged so that routines can also be used in serial
41! (non-parallel) mode
42!
43! 1092 2013-02-02 11:24:22Z raasch
44! unused variables removed
45!
46! 1036 2012-10-22 13:43:42Z raasch
47! code put under GPL (PALM 3.9)
48!
49! 1003 2012-09-14 14:35:53Z raasch
50! indices nxa, nya, etc. replaced by nx, ny, etc.
51!
52! 683 2011-02-09 14:25:15Z raasch
53! openMP parallelization of transpositions for 2d-domain-decomposition
54!
55! 622 2010-12-10 08:08:13Z raasch
56! optional barriers included in order to speed up collective operations
57!
58! 164 2008-05-15 08:46:15Z raasch
59! f_inv changed from subroutine argument to automatic array in order to do
60! re-ordering from f_in to f_inv in one step, one array work is needed instead
61! of work1 and work2
62!
63! February 2007
64! RCS Log replace by Id keyword, revision history cleaned up
65!
66! Revision 1.2  2004/04/30 13:12:17  raasch
67! Switched from mpi_alltoallv to the simpler mpi_alltoall,
68! all former transpose-routine files collected in this file, enlarged
69! transposition arrays introduced
70!
71! Revision 1.1  2004/04/30 13:08:16  raasch
72! Initial revision (collection of former routines transpose_xy, transpose_xz,
73!                   transpose_yx, transpose_yz, transpose_zx, transpose_zy)
74!
75! Revision 1.1  1997/07/24 11:25:18  raasch
76! Initial revision
77!
78!------------------------------------------------------------------------------!
79! Description:
80! ------------
81! Resorting data for the transposition from x to y. The transposition itself
82! is carried out in transpose_xy
83!------------------------------------------------------------------------------!
84
85     USE indices
86     USE transpose_indices
87
88     IMPLICIT NONE
89
90     REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
91     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
92
93
94     INTEGER ::  i, j, k
95
96!
97!-- Rearrange indices of input array in order to make data to be send
98!-- by MPI contiguous
99    !$OMP  PARALLEL PRIVATE ( i, j, k )
100    !$OMP  DO
101    !$acc kernels present( f_in, f_inv )
102     DO  i = 0, nx
103         DO  k = nzb_x, nzt_x
104             DO  j = nys_x, nyn_x
105                 f_inv(j,k,i) = f_in(i,j,k)
106             ENDDO
107         ENDDO
108     ENDDO
109     !$acc end kernels
110     !$OMP  END PARALLEL
111
112 END SUBROUTINE resort_for_xy
113
114
115 SUBROUTINE transpose_xy( f_inv, f_out )
116
117!------------------------------------------------------------------------------!
118! Description:
119! ------------
120! Transposition of input array (f_in) from x to y. For the input array, all
121! elements along x reside on the same PE, while after transposition, all
122! elements along y reside on the same PE.
123!------------------------------------------------------------------------------!
124
125    USE cpulog
126    USE indices
127    USE interfaces
128    USE pegrid
129    USE transpose_indices
130
131    IMPLICIT NONE
132
133    INTEGER ::  i, j, k, l, ys
134   
135    REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
136
137    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
138
139
140    IF ( numprocs /= 1 )  THEN
141
142#if defined( __parallel )
143!
144!--    Transpose array
145       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
146       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
147       !$acc update host( f_inv )
148       CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
149                          work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
150                          comm1dy, ierr )
151       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
152
153!
154!--    Reorder transposed array
155!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
156!$OMP  DO
157       !$acc data copyin( work )
158       DO  l = 0, pdims(2) - 1
159          ys = 0 + l * ( nyn_x - nys_x + 1 )
160          !$acc kernels present( f_out, work )
161          DO  i = nxl_y, nxr_y
162             DO  k = nzb_y, nzt_y
163                DO  j = ys, ys + nyn_x - nys_x
164                   f_out(j,i,k) = work(j-ys+1,k,i,l)
165                ENDDO
166             ENDDO
167          ENDDO
168          !$acc end kernels
169       ENDDO
170       !$acc end data
171!$OMP  END PARALLEL
172#endif
173
174    ELSE
175
176!
177!--    Reorder transposed array
178!$OMP  PARALLEL PRIVATE ( i, j, k )
179!$OMP  DO
180       !$acc kernels present( f_inv, f_out )
181       DO  k = nzb_y, nzt_y
182          DO  i = nxl_y, nxr_y
183             DO  j = 0, ny
184                f_out(j,i,k) = f_inv(j,k,i)
185             ENDDO
186          ENDDO
187       ENDDO
188       !$acc end kernels
189!$OMP  END PARALLEL
190
191    ENDIF
192
193 END SUBROUTINE transpose_xy
194
195
196 SUBROUTINE resort_for_xz( f_inv, f_out )
197
198!------------------------------------------------------------------------------!
199! Description:
200! ------------
201! Resorting data after the transposition from x to z. The transposition itself
202! is carried out in transpose_xz
203!------------------------------------------------------------------------------!
204
205     USE indices
206     USE transpose_indices
207
208     IMPLICIT NONE
209
210     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
211     REAL ::  f_out(1:nz,nys:nyn,nxl:nxr)
212
213
214     INTEGER ::  i, j, k
215
216!
217!-- Rearrange indices of input array in order to make data to be send
218!-- by MPI contiguous.
219!-- In case of parallel fft/transposition, scattered store is faster in
220!-- backward direction!!!
221    !$OMP  PARALLEL PRIVATE ( i, j, k )
222    !$OMP  DO
223    !$acc kernels present( f_inv, f_out )
224     DO  k = 1, nz
225         DO  i = nxl, nxr
226             DO  j = nys, nyn
227                 f_out(k,j,i) = f_inv(j,i,k)
228             ENDDO
229         ENDDO
230     ENDDO
231     !$acc end kernels
232     !$OMP  END PARALLEL
233
234 END SUBROUTINE resort_for_xz
235
236
237 SUBROUTINE transpose_xz( f_in, f_inv )
238
239!------------------------------------------------------------------------------!
240! Description:
241! ------------
242! Transposition of input array (f_in) from x to z. For the input array, all
243! elements along x reside on the same PE, while after transposition, all
244! elements along z reside on the same PE.
245!------------------------------------------------------------------------------!
246
247    USE cpulog
248    USE indices
249    USE interfaces
250    USE pegrid
251    USE transpose_indices
252
253    IMPLICIT NONE
254
255    INTEGER ::  i, j, k, l, xs
256   
257    REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_inv(nys:nyn,nxl:nxr,1:nz)
258
259    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
260
261
262!
263!-- If the PE grid is one-dimensional along y, the array has only to be
264!-- reordered locally and therefore no transposition has to be done.
265    IF ( pdims(1) /= 1 )  THEN
266
267#if defined( __parallel )
268!
269!--    Reorder input array for transposition
270!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
271!$OMP  DO
272       !$acc data copyout( work )
273       DO  l = 0, pdims(1) - 1
274          xs = 0 + l * nnx
275          !$acc kernels present( f_in, work )
276          DO  k = nzb_x, nzt_x
277             DO  i = xs, xs + nnx - 1
278                DO  j = nys_x, nyn_x
279                   work(j,i-xs+1,k,l) = f_in(i,j,k)
280                ENDDO
281             ENDDO
282          ENDDO
283          !$acc end kernels
284       ENDDO
285       !$acc end data
286!$OMP  END PARALLEL
287
288!
289!--    Transpose array
290       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
291       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
292       CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
293                          f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
294                          comm1dx, ierr )
295       !$acc update device( f_inv )
296       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
297#endif
298
299    ELSE
300
301!
302!--    Reorder the array in a way that the z index is in first position
303!$OMP  PARALLEL PRIVATE ( i, j, k )
304!$OMP  DO
305       !$acc kernels present( f_in, f_inv )
306       DO  i = nxl, nxr
307          DO  j = nys, nyn
308             DO  k = 1, nz
309                f_inv(j,i,k) = f_in(i,j,k)
310             ENDDO
311          ENDDO
312       ENDDO
313       !$acc end kernels
314!$OMP  END PARALLEL
315
316    ENDIF
317
318 END SUBROUTINE transpose_xz
319
320
321 SUBROUTINE resort_for_yx( f_inv, f_out )
322
323!------------------------------------------------------------------------------!
324! Description:
325! ------------
326! Resorting data after the transposition from y to x. The transposition itself
327! is carried out in transpose_yx
328!------------------------------------------------------------------------------!
329
330     USE indices
331     USE transpose_indices
332
333     IMPLICIT NONE
334
335     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
336     REAL ::  f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
337
338
339     INTEGER ::  i, j, k
340
341!
342!-- Rearrange indices of input array in order to make data to be send
343!-- by MPI contiguous
344    !$OMP  PARALLEL PRIVATE ( i, j, k )
345    !$OMP  DO
346    !$acc kernels present( f_inv, f_out )
347     DO  i = 0, nx
348         DO  k = nzb_x, nzt_x
349             DO  j = nys_x, nyn_x
350                 f_out(i,j,k) = f_inv(j,k,i)
351             ENDDO
352         ENDDO
353     ENDDO
354     !$acc end kernels
355     !$OMP  END PARALLEL
356
357 END SUBROUTINE resort_for_yx
358
359
360 SUBROUTINE transpose_yx( f_in, f_inv )
361
362!------------------------------------------------------------------------------!
363! Description:
364! ------------
365! Transposition of input array (f_in) from y to x. For the input array, all
366! elements along y reside on the same PE, while after transposition, all
367! elements along x reside on the same PE.
368!------------------------------------------------------------------------------!
369
370    USE cpulog
371    USE indices
372    USE interfaces
373    USE pegrid
374    USE transpose_indices
375
376    IMPLICIT NONE
377
378    INTEGER ::  i, j, k, l, ys
379   
380    REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
381
382    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
383
384
385    IF ( numprocs /= 1 )  THEN
386
387#if defined( __parallel )
388!
389!--    Reorder input array for transposition
390!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
391!$OMP  DO
392       !$acc data copyout( work )
393       DO  l = 0, pdims(2) - 1
394          ys = 0 + l * ( nyn_x - nys_x + 1 )
395          !$acc kernels present( f_in, work )
396          DO  i = nxl_y, nxr_y
397             DO  k = nzb_y, nzt_y
398                DO  j = ys, ys + nyn_x - nys_x
399                   work(j-ys+1,k,i,l) = f_in(j,i,k)
400                ENDDO
401             ENDDO
402          ENDDO
403          !$acc end kernels
404       ENDDO
405       !$acc end data
406!$OMP  END PARALLEL
407
408!
409!--    Transpose array
410       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
411       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
412       CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
413                          f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
414                          comm1dy, ierr )
415       !$acc update device( f_inv )
416       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
417#endif
418
419    ELSE
420
421!
422!--    Reorder array f_in the same way as ALLTOALL did it
423!$OMP  PARALLEL PRIVATE ( i, j, k )
424!$OMP  DO
425       !$acc kernels present( f_in, f_inv )
426       DO  i = nxl_y, nxr_y
427          DO  k = nzb_y, nzt_y
428             DO  j = 0, ny
429                f_inv(j,k,i) = f_in(j,i,k)
430             ENDDO
431          ENDDO
432       ENDDO
433       !$acc end kernels
434!$OMP  END PARALLEL
435
436    ENDIF
437
438 END SUBROUTINE transpose_yx
439
440
441 SUBROUTINE transpose_yxd( f_in, f_out )
442
443!------------------------------------------------------------------------------!
444! Description:
445! ------------
446! Transposition of input array (f_in) from y to x. For the input array, all
447! elements along y reside on the same PE, while after transposition, all
448! elements along x reside on the same PE.
449! This is a direct transposition for arrays with indices in regular order
450! (k,j,i) (cf. transpose_yx).
451!------------------------------------------------------------------------------!
452
453    USE cpulog
454    USE indices
455    USE interfaces
456    USE pegrid
457    USE transpose_indices
458
459    IMPLICIT NONE
460
461    INTEGER ::  i, j, k, l, m, xs
462
463    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
464             f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x),                     &
465             work(nnx*nny*nnz)
466
467#if defined( __parallel )
468
469!
470!-- Rearrange indices of input array in order to make data to be send
471!-- by MPI contiguous
472    DO  k = 1, nz
473       DO  j = nys, nyn
474          DO  i = nxl, nxr
475             f_inv(i,k,j) = f_in(k,j,i)
476          ENDDO
477       ENDDO
478    ENDDO
479
480!
481!-- Transpose array
482    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
483    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
484    CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
485                       work(1),          sendrecvcount_xy, MPI_REAL, &
486                       comm1dx, ierr )
487    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
488
489!
490!-- Reorder transposed array
491    m = 0
492    DO  l = 0, pdims(1) - 1
493       xs = 0 + l * nnx
494       DO  j = nys_x, nyn_x
495          DO  k = 1, nz
496             DO  i = xs, xs + nnx - 1
497                m = m + 1
498                f_out(i,j,k) = work(m)
499             ENDDO
500          ENDDO
501       ENDDO
502    ENDDO
503
504#endif
505
506 END SUBROUTINE transpose_yxd
507
508
509 SUBROUTINE resort_for_yz( f_in, f_inv )
510
511!------------------------------------------------------------------------------!
512! Description:
513! ------------
514! Resorting data for the transposition from y to z. The transposition itself
515! is carried out in transpose_yz
516!------------------------------------------------------------------------------!
517
518     USE indices
519     USE transpose_indices
520
521     IMPLICIT NONE
522
523     REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
524     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
525
526
527     INTEGER ::  i, j, k
528
529!
530!-- Rearrange indices of input array in order to make data to be send
531!-- by MPI contiguous
532    !$OMP  PARALLEL PRIVATE ( i, j, k )
533    !$OMP  DO
534    !$acc kernels present( f_in, f_inv )
535     DO  j = 0, ny
536         DO  k = nzb_y, nzt_y
537             DO  i = nxl_y, nxr_y
538                 f_inv(i,k,j) = f_in(j,i,k)
539             ENDDO
540         ENDDO
541     ENDDO
542     !$acc end kernels
543     !$OMP  END PARALLEL
544
545 END SUBROUTINE resort_for_yz
546
547
548 SUBROUTINE transpose_yz( f_inv, f_out )
549
550!------------------------------------------------------------------------------!
551! Description:
552! ------------
553! Transposition of input array (f_in) from y to z. For the input array, all
554! elements along y reside on the same PE, while after transposition, all
555! elements along z reside on the same PE.
556!------------------------------------------------------------------------------!
557
558    USE cpulog
559    USE indices
560    USE interfaces
561    USE pegrid
562    USE transpose_indices
563
564    IMPLICIT NONE
565
566    INTEGER ::  i, j, k, l, zs
567   
568    REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
569
570    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
571
572
573!
574!-- If the PE grid is one-dimensional along y, only local reordering
575!-- of the data is necessary and no transposition has to be done.
576    IF ( pdims(1) == 1 )  THEN
577
578!$OMP  PARALLEL PRIVATE ( i, j, k )
579!$OMP  DO
580       !$acc kernels present( f_inv, f_out )
581       DO  j = 0, ny
582          DO  k = nzb_y, nzt_y
583             DO  i = nxl_y, nxr_y
584                f_out(i,j,k) = f_inv(i,k,j)
585             ENDDO
586          ENDDO
587       ENDDO
588       !$acc end kernels
589!$OMP  END PARALLEL
590
591    ELSE
592
593#if defined( __parallel )
594!
595!--    Transpose array
596       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
597       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
598       !$acc update host( f_inv )
599       CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
600                          work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
601                          comm1dx, ierr )
602       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
603
604!
605!--    Reorder transposed array
606!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
607!$OMP  DO
608       !$acc data copyin( work )
609       DO  l = 0, pdims(1) - 1
610          zs = 1 + l * ( nzt_y - nzb_y + 1 )
611          !$acc kernels present( f_out )
612          DO  j = nys_z, nyn_z
613             DO  k = zs, zs + nzt_y - nzb_y
614                DO  i = nxl_z, nxr_z
615                   f_out(i,j,k) = work(i,k-zs+1,j,l)
616                ENDDO
617             ENDDO
618          ENDDO
619          !$acc end kernels
620       ENDDO
621       !$acc end data
622!$OMP  END PARALLEL
623#endif
624
625   ENDIF
626
627 END SUBROUTINE transpose_yz
628
629
630 SUBROUTINE resort_for_zx( f_in, f_inv )
631
632!------------------------------------------------------------------------------!
633! Description:
634! ------------
635! Resorting data for the transposition from z to x. The transposition itself
636! is carried out in transpose_zx
637!------------------------------------------------------------------------------!
638
639     USE indices
640     USE transpose_indices
641
642     IMPLICIT NONE
643
644     REAL ::  f_in(1:nz,nys:nyn,nxl:nxr)
645     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
646
647
648     INTEGER ::  i, j, k
649
650!
651!-- Rearrange indices of input array in order to make data to be send
652!-- by MPI contiguous
653    !$OMP  PARALLEL PRIVATE ( i, j, k )
654    !$OMP  DO
655    !$acc kernels present( f_in, f_inv )
656     DO  k = 1,nz
657         DO  i = nxl, nxr
658             DO  j = nys, nyn
659                 f_inv(j,i,k) = f_in(k,j,i)
660             ENDDO
661         ENDDO
662     ENDDO
663     !$acc end kernels
664     !$OMP  END PARALLEL
665
666 END SUBROUTINE resort_for_zx
667
668
669 SUBROUTINE transpose_zx( f_inv, f_out )
670
671!------------------------------------------------------------------------------!
672! Description:
673! ------------
674! Transposition of input array (f_in) from z to x. For the input array, all
675! elements along z reside on the same PE, while after transposition, all
676! elements along x reside on the same PE.
677!------------------------------------------------------------------------------!
678
679    USE cpulog
680    USE indices
681    USE interfaces
682    USE pegrid
683    USE transpose_indices
684
685    IMPLICIT NONE
686
687    INTEGER ::  i, j, k, l, xs
688   
689    REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
690
691    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
692
693
694!
695!-- If the PE grid is one-dimensional along y, only local reordering
696!-- of the data is necessary and no transposition has to be done.
697    IF ( pdims(1) == 1 )  THEN
698
699!$OMP  PARALLEL PRIVATE ( i, j, k )
700!$OMP  DO
701       !$acc kernels present( f_inv, f_out )
702       DO  k = 1, nz
703          DO  i = nxl, nxr
704             DO  j = nys, nyn
705                f_out(i,j,k) = f_inv(j,i,k)
706             ENDDO
707          ENDDO
708       ENDDO
709       !$acc end kernels
710!$OMP  END PARALLEL
711
712    ELSE
713
714#if defined( __parallel )
715!
716!--    Transpose array
717       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
718       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
719       !$acc update host( f_inv )
720       CALL MPI_ALLTOALL( f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
721                          work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
722                          comm1dx, ierr )
723       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
724
725!
726!--    Reorder transposed array
727!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
728!$OMP  DO
729       !$acc data copyin( work )
730       DO  l = 0, pdims(1) - 1
731          xs = 0 + l * nnx
732          !$acc kernels present( f_out )
733          DO  k = nzb_x, nzt_x
734             DO  i = xs, xs + nnx - 1
735                DO  j = nys_x, nyn_x
736                   f_out(i,j,k) = work(j,i-xs+1,k,l)
737                ENDDO
738             ENDDO
739          ENDDO
740          !$acc end kernels
741       ENDDO
742       !$acc end data
743!$OMP  END PARALLEL
744#endif
745
746    ENDIF
747
748 END SUBROUTINE transpose_zx
749
750
751 SUBROUTINE resort_for_zy( f_inv, f_out )
752
753!------------------------------------------------------------------------------!
754! Description:
755! ------------
756! Resorting data after the transposition from z to y. The transposition itself
757! is carried out in transpose_zy
758!------------------------------------------------------------------------------!
759
760     USE indices
761     USE transpose_indices
762
763     IMPLICIT NONE
764
765     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
766     REAL ::  f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
767
768
769     INTEGER ::  i, j, k
770
771!
772!-- Rearrange indices of input array in order to make data to be send
773!-- by MPI contiguous
774    !$OMP  PARALLEL PRIVATE ( i, j, k )
775    !$OMP  DO
776    !$acc kernels present( f_inv, f_out )
777     DO  k = nzb_y, nzt_y
778         DO  j = 0, ny
779             DO  i = nxl_y, nxr_y
780                 f_out(j,i,k) = f_inv(i,k,j)
781             ENDDO
782         ENDDO
783     ENDDO
784     !$acc end kernels
785     !$OMP  END PARALLEL
786
787 END SUBROUTINE resort_for_zy
788
789
790 SUBROUTINE transpose_zy( f_in, f_inv )
791
792!------------------------------------------------------------------------------!
793! Description:
794! ------------
795! Transposition of input array (f_in) from z to y. For the input array, all
796! elements along z reside on the same PE, while after transposition, all
797! elements along y reside on the same PE.
798!------------------------------------------------------------------------------!
799
800    USE cpulog
801    USE indices
802    USE interfaces
803    USE pegrid
804    USE transpose_indices
805
806    IMPLICIT NONE
807
808    INTEGER ::  i, j, k, l, zs
809   
810    REAL ::  f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
811
812    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
813
814
815!
816!-- If the PE grid is one-dimensional along y, the array has only to be
817!-- reordered locally and therefore no transposition has to be done.
818    IF ( pdims(1) /= 1 )  THEN
819
820#if defined( __parallel )
821!
822!--    Reorder input array for transposition
823!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
824!$OMP  DO
825       !$acc data copyout( work )
826       DO  l = 0, pdims(1) - 1
827          zs = 1 + l * ( nzt_y - nzb_y + 1 )
828          !$acc kernels present( f_in, work )
829          DO  j = nys_z, nyn_z
830             DO  k = zs, zs + nzt_y - nzb_y
831                DO  i = nxl_z, nxr_z
832                   work(i,k-zs+1,j,l) = f_in(i,j,k)
833                ENDDO
834             ENDDO
835          ENDDO
836          !$acc end kernels
837       ENDDO
838       !$acc end data
839!$OMP  END PARALLEL
840
841!
842!--    Transpose array
843       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
844       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
845       CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
846                          f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
847                          comm1dx, ierr )
848       !$acc update device( f_inv )
849       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
850#endif
851
852    ELSE
853!
854!--    Reorder the array in the same way like ALLTOALL did it
855!$OMP  PARALLEL PRIVATE ( i, j, k )
856!$OMP  DO
857       !$acc kernels present( f_in, f_inv )
858       DO  k = nzb_y, nzt_y
859          DO  j = 0, ny
860             DO  i = nxl_y, nxr_y
861                f_inv(i,k,j) = f_in(i,j,k)
862             ENDDO
863          ENDDO
864       ENDDO
865       !$acc end kernels
866!$OMP  END PARALLEL
867
868    ENDIF
869
870 END SUBROUTINE transpose_zy
871
872
873 SUBROUTINE transpose_zyd( f_in, f_out )
874
875!------------------------------------------------------------------------------!
876! Description:
877! ------------
878! Transposition of input array (f_in) from z to y. For the input array, all
879! elements along z reside on the same PE, while after transposition, all
880! elements along y reside on the same PE.
881! This is a direct transposition for arrays with indices in regular order
882! (k,j,i) (cf. transpose_zy).
883!------------------------------------------------------------------------------!
884
885    USE cpulog
886    USE indices
887    USE interfaces
888    USE pegrid
889    USE transpose_indices
890
891    IMPLICIT NONE
892
893    INTEGER ::  i, j, k, l, m, ys
894   
895    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
896             f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd),                 &
897             work(nnx*nny*nnz)
898
899#if defined( __parallel )
900
901!
902!-- Rearrange indices of input array in order to make data to be send
903!-- by MPI contiguous
904    DO  i = nxl, nxr
905       DO  j = nys, nyn
906          DO  k = 1, nz
907             f_inv(j,i,k) = f_in(k,j,i)
908          ENDDO
909       ENDDO
910    ENDDO
911
912!
913!-- Move data to different array, because memory location of work1 is
914!-- needed further below (work1 = work2).
915!-- If the PE grid is one-dimensional along x, only local reordering
916!-- of the data is necessary and no transposition has to be done.
917    IF ( pdims(2) == 1 )  THEN
918       DO  k = 1, nz
919          DO  i = nxl, nxr
920             DO  j = nys, nyn
921                f_out(j,i,k) = f_inv(j,i,k)
922             ENDDO
923          ENDDO
924       ENDDO
925       RETURN
926    ENDIF
927
928!
929!-- Transpose array
930    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
931    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
932    CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
933                       work(1),          sendrecvcount_zyd, MPI_REAL, &
934                       comm1dy, ierr )
935    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
936
937!
938!-- Reorder transposed array
939    m = 0
940    DO  l = 0, pdims(2) - 1
941       ys = 0 + l * nny
942       DO  k = nzb_yd, nzt_yd
943          DO  i = nxl_yd, nxr_yd
944             DO  j = ys, ys + nny - 1
945                m = m + 1
946                f_out(j,i,k) = work(m)
947             ENDDO
948          ENDDO
949       ENDDO
950    ENDDO
951
952#endif
953
954 END SUBROUTINE transpose_zyd
Note: See TracBrowser for help on using the repository browser.