source: palm/trunk/SOURCE/transpose.f90 @ 1106

Last change on this file since 1106 was 1106, checked in by raasch, 11 years ago

New:
---

Porting of FFT-solver for serial runs to GPU using CUDA FFT,
preprocessor lines in transpose routines rearranged, so that routines can also
be used in serial (non-parallel) mode,
transpositions also carried out in serial mode, routines fftx, fftxp replaced
by calls of fft_x, fft_x replaced by fft_x_1d in the 1D-decomposition routines
(Makefile, Makefile_check, fft_xy, poisfft, poisfft_hybrid, transpose, new: cuda_fft_interfaces)

--stdin argument for mpiexec on lckyuh, -y and -Y settings output to header (mrun)

Changed:


Module array_kind renamed precision_kind
(check_open, data_output_3d, fft_xy, modules, user_data_output_3d)

some format changes for coupled atmosphere-ocean runs (header)
small changes in code formatting (microphysics, prognostic_equations)

Errors:


bugfix: default value (0) assigned to coupling_start_time (modules)
bugfix: initial time for preruns of coupled runs is output as -coupling_start_time (data_output_profiles)

  • Property svn:keywords set to Id
File size: 21.6 KB
RevLine 
[164]1 SUBROUTINE transpose_xy( f_in, work, f_out )
[1]2
[1036]3!--------------------------------------------------------------------------------!
4! This file is part of PALM.
5!
6! PALM is free software: you can redistribute it and/or modify it under the terms
7! of the GNU General Public License as published by the Free Software Foundation,
8! either version 3 of the License, or (at your option) any later version.
9!
10! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12! A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13!
14! You should have received a copy of the GNU General Public License along with
15! PALM. If not, see <http://www.gnu.org/licenses/>.
16!
17! Copyright 1997-2012  Leibniz University Hannover
18!--------------------------------------------------------------------------------!
19!
[484]20! Current revisions:
[1]21! -----------------
[1106]22! preprocessor lines rearranged so that routines can also be used in serial
23! (non-parallel) mode
[198]24!
25! Former revisions:
26! -----------------
27! $Id: transpose.f90 1106 2013-03-04 05:31:38Z raasch $
28!
[1093]29! 1092 2013-02-02 11:24:22Z raasch
30! unused variables removed
31!
[1037]32! 1036 2012-10-22 13:43:42Z raasch
33! code put under GPL (PALM 3.9)
34!
[1004]35! 1003 2012-09-14 14:35:53Z raasch
36! indices nxa, nya, etc. replaced by nx, ny, etc.
37!
[684]38! 683 2011-02-09 14:25:15Z raasch
39! openMP parallelization of transpositions for 2d-domain-decomposition
40!
[623]41! 622 2010-12-10 08:08:13Z raasch
42! optional barriers included in order to speed up collective operations
43!
[198]44! 164 2008-05-15 08:46:15Z raasch
[164]45! f_inv changed from subroutine argument to automatic array in order to do
46! re-ordering from f_in to f_inv in one step, one array work is needed instead
47! of work1 and work2
[1]48!
[198]49! February 2007
[3]50! RCS Log replace by Id keyword, revision history cleaned up
51!
[1]52! Revision 1.2  2004/04/30 13:12:17  raasch
53! Switched from mpi_alltoallv to the simpler mpi_alltoall,
54! all former transpose-routine files collected in this file, enlarged
55! transposition arrays introduced
56!
57! Revision 1.1  2004/04/30 13:08:16  raasch
58! Initial revision (collection of former routines transpose_xy, transpose_xz,
59!                   transpose_yx, transpose_yz, transpose_zx, transpose_zy)
60!
61! Revision 1.1  1997/07/24 11:25:18  raasch
62! Initial revision
63!
64!
65! Description:
66! ------------
67! Transposition of input array (f_in) from x to y. For the input array, all
68! elements along x reside on the same PE, while after transposition, all
69! elements along y reside on the same PE.
70!------------------------------------------------------------------------------!
71
72    USE cpulog
73    USE indices
74    USE interfaces
75    USE pegrid
76    USE transpose_indices
77
78    IMPLICIT NONE
79
80    INTEGER ::  i, j, k, l, m, ys
81   
[1003]82    REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x),   &
83             f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx),  &
84             f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y),  &
[164]85             work(nnx*nny*nnz)
[1]86
87!
88!-- Rearrange indices of input array in order to make data to be send
89!-- by MPI contiguous
[683]90!$OMP  PARALLEL PRIVATE ( i, j, k )
91!$OMP  DO
[1003]92    DO  i = 0, nx
93       DO  k = nzb_x, nzt_x
94          DO  j = nys_x, nyn_x
[164]95             f_inv(j,k,i) = f_in(i,j,k)
[1]96          ENDDO
97       ENDDO
98    ENDDO
[683]99!$OMP  END PARALLEL
[1]100
[1106]101    IF ( numprocs /= 1 )  THEN
102
103#if defined( __parallel )
[1]104!
[1106]105!--    Transpose array
106       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
107       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
108       CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
109                          work(1),              sendrecvcount_xy, MPI_REAL, &
110                          comm1dy, ierr )
111       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]112
113!
[1106]114!--    Reorder transposed array
[683]115!$OMP  PARALLEL PRIVATE ( i, j, k, l, m, ys )
116!$OMP  DO
[1106]117       DO  l = 0, pdims(2) - 1
118          m  = l * ( nxr_y - nxl_y + 1 ) * ( nzt_y - nzb_y + 1 ) * &
119                   ( nyn_x - nys_x + 1 )
120          ys = 0 + l * ( nyn_x - nys_x + 1 )
121          DO  i = nxl_y, nxr_y
122             DO  k = nzb_y, nzt_y
123                DO  j = ys, ys + nyn_x - nys_x
124                   m = m + 1
125                   f_out(j,i,k) = work(m)
126                ENDDO
[1]127             ENDDO
128          ENDDO
129       ENDDO
[683]130!$OMP  END PARALLEL
[1]131#endif
132
[1106]133    ELSE
134
135!
136!--    Reorder transposed array
137!$OMP  PARALLEL PRIVATE ( i, j, k )
138!$OMP  DO
139       DO  k = nzb_y, nzt_y
140          DO  i = nxl_y, nxr_y
141             DO  j = 0, ny
142                f_out(j,i,k) = f_inv(j,k,i)
143             ENDDO
144          ENDDO
145       ENDDO
146!$OMP  END PARALLEL
147
148    ENDIF
149
[1]150 END SUBROUTINE transpose_xy
151
152
[164]153 SUBROUTINE transpose_xz( f_in, work, f_out )
[1]154
155!------------------------------------------------------------------------------!
156! Description:
157! ------------
158! Transposition of input array (f_in) from x to z. For the input array, all
159! elements along x reside on the same PE, while after transposition, all
160! elements along z reside on the same PE.
161!------------------------------------------------------------------------------!
162
163    USE cpulog
164    USE indices
165    USE interfaces
166    USE pegrid
167    USE transpose_indices
168
169    IMPLICIT NONE
170
171    INTEGER ::  i, j, k, l, m, xs
172   
[1003]173    REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x),  &
174             f_inv(nys:nyn,nxl:nxr,1:nz),         &
175             f_out(1:nz,nys:nyn,nxl:nxr),         &
[164]176             work(nnx*nny*nnz)
[1]177
178
179!
180!-- If the PE grid is one-dimensional along y, the array has only to be
181!-- reordered locally and therefore no transposition has to be done.
182    IF ( pdims(1) /= 1 )  THEN
[1106]183
184#if defined( __parallel )
[1]185!
186!--    Reorder input array for transposition
[683]187!$OMP  PARALLEL PRIVATE ( i, j, k, l, m, xs )
188!$OMP  DO
[1]189       DO  l = 0, pdims(1) - 1
[1003]190          m  = l * ( nzt_x - nzb_x + 1 ) * nnx * ( nyn_x - nys_x + 1 )
[1]191          xs = 0 + l * nnx
[1003]192          DO  k = nzb_x, nzt_x
[164]193             DO  i = xs, xs + nnx - 1
[1003]194                DO  j = nys_x, nyn_x
[1]195                   m = m + 1
[164]196                   work(m) = f_in(i,j,k)
[1]197                ENDDO
198             ENDDO
199          ENDDO
200       ENDDO
[683]201!$OMP  END PARALLEL
[1]202
203!
204!--    Transpose array
205       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]206       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[164]207       CALL MPI_ALLTOALL( work(1),          sendrecvcount_zx, MPI_REAL, &
208                          f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]209                          comm1dx, ierr )
210       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
211
212!
213!--    Reorder transposed array in a way that the z index is in first position
[683]214!$OMP  PARALLEL PRIVATE ( i, j, k )
215!$OMP  DO
[1003]216       DO  k = 1, nz
217          DO  i = nxl, nxr
218             DO  j = nys, nyn
[164]219                f_out(k,j,i) = f_inv(j,i,k)
[1]220             ENDDO
221          ENDDO
222       ENDDO
[683]223!$OMP  END PARALLEL
[1106]224#endif
225
[1]226    ELSE
[1106]227
[1]228!
229!--    Reorder the array in a way that the z index is in first position
[683]230!$OMP  PARALLEL PRIVATE ( i, j, k )
231!$OMP  DO
[1003]232       DO  i = nxl, nxr
233          DO  j = nys, nyn
234             DO  k = 1, nz
[164]235                f_inv(j,i,k) = f_in(i,j,k)
[1]236             ENDDO
237          ENDDO
238       ENDDO
[683]239!$OMP  END PARALLEL
[1]240
[683]241!$OMP  PARALLEL PRIVATE ( i, j, k )
242!$OMP  DO
[1003]243       DO  k = 1, nz
244          DO  i = nxl, nxr
245             DO  j = nys, nyn
[164]246                f_out(k,j,i) = f_inv(j,i,k)
247             ENDDO
[1]248          ENDDO
249       ENDDO
[683]250!$OMP  END PARALLEL
[1]251
[164]252    ENDIF
253
[1]254 END SUBROUTINE transpose_xz
255
256
[164]257 SUBROUTINE transpose_yx( f_in, work, f_out )
[1]258
259!------------------------------------------------------------------------------!
260! Description:
261! ------------
262! Transposition of input array (f_in) from y to x. For the input array, all
263! elements along y reside on the same PE, while after transposition, all
264! elements along x reside on the same PE.
265!------------------------------------------------------------------------------!
266
267    USE cpulog
268    USE indices
269    USE interfaces
270    USE pegrid
271    USE transpose_indices
272
273    IMPLICIT NONE
274
275    INTEGER ::  i, j, k, l, m, ys
276   
[1003]277    REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y),  &
278             f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), &
279             f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x), &
[164]280             work(nnx*nny*nnz)
[1]281
[1106]282    IF ( numprocs /= 1 )  THEN
283
[1]284#if defined( __parallel )
285!
[1106]286!--    Reorder input array for transposition
[683]287!$OMP  PARALLEL PRIVATE ( i, j, k, l, m, ys )
288!$OMP  DO
[1106]289       DO  l = 0, pdims(2) - 1
290          m  = l * ( nxr_y - nxl_y + 1 ) * ( nzt_y - nzb_y + 1 ) * &
291                   ( nyn_x - nys_x + 1 )
292          ys = 0 + l * ( nyn_x - nys_x + 1 )
293          DO  i = nxl_y, nxr_y
294             DO  k = nzb_y, nzt_y
295                DO  j = ys, ys + nyn_x - nys_x
296                   m = m + 1
297                   work(m) = f_in(j,i,k)
298                ENDDO
299             ENDDO
300          ENDDO
301       ENDDO
302!$OMP  END PARALLEL
303
304!
305!--    Transpose array
306       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
307       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
308       CALL MPI_ALLTOALL( work(1),              sendrecvcount_xy, MPI_REAL, &
309                          f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
310                          comm1dy, ierr )
311       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
312#endif
313
314    ELSE
315
316!
317!--    Reorder array f_in the same way as ALLTOALL did it
318!$OMP  PARALLEL PRIVATE ( i, j, k )
319!$OMP  DO
[1003]320       DO  i = nxl_y, nxr_y
321          DO  k = nzb_y, nzt_y
[1106]322             DO  j = 0, ny
323                f_inv(j,k,i) = f_in(j,i,k)
[1]324             ENDDO
325          ENDDO
326       ENDDO
[683]327!$OMP  END PARALLEL
[1]328
[1106]329    ENDIF
[1]330
331!
332!-- Reorder transposed array in a way that the x index is in first position
[683]333!$OMP  PARALLEL PRIVATE ( i, j, k )
334!$OMP  DO
[1003]335    DO  i = 0, nx
336       DO  k = nzb_x, nzt_x
337          DO  j = nys_x, nyn_x
[164]338             f_out(i,j,k) = f_inv(j,k,i)
[1]339          ENDDO
340       ENDDO
341    ENDDO
[683]342!$OMP  END PARALLEL
[1]343
344 END SUBROUTINE transpose_yx
345
346
[164]347 SUBROUTINE transpose_yxd( f_in, work, f_out )
[1]348
349!------------------------------------------------------------------------------!
350! Description:
351! ------------
352! Transposition of input array (f_in) from y to x. For the input array, all
353! elements along y reside on the same PE, while after transposition, all
354! elements along x reside on the same PE.
355! This is a direct transposition for arrays with indices in regular order
356! (k,j,i) (cf. transpose_yx).
357!------------------------------------------------------------------------------!
358
359    USE cpulog
360    USE indices
361    USE interfaces
362    USE pegrid
363    USE transpose_indices
364
365    IMPLICIT NONE
366
[1092]367    INTEGER ::  i, j, k, l, m, xs
[1]368
[1003]369    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
370             f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x),                     &
[164]371             work(nnx*nny*nnz)
[1]372
373#if defined( __parallel )
374
375!
376!-- Rearrange indices of input array in order to make data to be send
377!-- by MPI contiguous
[1003]378    DO  k = 1, nz
379       DO  j = nys, nyn
380          DO  i = nxl, nxr
[164]381             f_inv(i,k,j) = f_in(k,j,i)
[1]382          ENDDO
383       ENDDO
384    ENDDO
385
386!
387!-- Transpose array
388    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]389    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1]390    CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]391                       work(1),          sendrecvcount_xy, MPI_REAL, &
[1]392                       comm1dx, ierr )
393    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
394
395!
396!-- Reorder transposed array
397    m = 0
398    DO  l = 0, pdims(1) - 1
399       xs = 0 + l * nnx
[1003]400       DO  j = nys_x, nyn_x
401          DO  k = 1, nz
[1]402             DO  i = xs, xs + nnx - 1
403                m = m + 1
[164]404                f_out(i,j,k) = work(m)
[1]405             ENDDO
406          ENDDO
407       ENDDO
408    ENDDO
409
410#endif
411
412 END SUBROUTINE transpose_yxd
413
414
[164]415 SUBROUTINE transpose_yz( f_in, work, f_out )
[1]416
417!------------------------------------------------------------------------------!
418! Description:
419! ------------
420! Transposition of input array (f_in) from y to z. For the input array, all
421! elements along y reside on the same PE, while after transposition, all
422! elements along z reside on the same PE.
423!------------------------------------------------------------------------------!
424
425    USE cpulog
426    USE indices
427    USE interfaces
428    USE pegrid
429    USE transpose_indices
430
431    IMPLICIT NONE
432
433    INTEGER ::  i, j, k, l, m, zs
434   
[1003]435    REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y),  &
436             f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), &
437             f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz), &
[164]438             work(nnx*nny*nnz)
[1]439
440!
441!-- Rearrange indices of input array in order to make data to be send
442!-- by MPI contiguous
[683]443!$OMP  PARALLEL PRIVATE ( i, j, k )
444!$OMP  DO
[1003]445    DO  j = 0, ny
446       DO  k = nzb_y, nzt_y
447          DO  i = nxl_y, nxr_y
[164]448             f_inv(i,k,j) = f_in(j,i,k)
[1]449          ENDDO
450       ENDDO
451    ENDDO
[683]452!$OMP  END PARALLEL
[1]453
454!
455!-- Move data to different array, because memory location of work1 is
456!-- needed further below (work1 = work2).
457!-- If the PE grid is one-dimensional along y, only local reordering
458!-- of the data is necessary and no transposition has to be done.
459    IF ( pdims(1) == 1 )  THEN
[1106]460
[683]461!$OMP  PARALLEL PRIVATE ( i, j, k )
462!$OMP  DO
[1003]463       DO  j = 0, ny
464          DO  k = nzb_y, nzt_y
465             DO  i = nxl_y, nxr_y
[164]466                f_out(i,j,k) = f_inv(i,k,j)
[1]467             ENDDO
468          ENDDO
469       ENDDO
[683]470!$OMP  END PARALLEL
[1]471
[1106]472    ELSE
473
474#if defined( __parallel )
[1]475!
[1106]476!--    Transpose array
477       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
478       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
479       CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
480                          work(1),              sendrecvcount_yz, MPI_REAL, &
481                          comm1dx, ierr )
482       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]483
484!
[1106]485!--    Reorder transposed array
[683]486!$OMP  PARALLEL PRIVATE ( i, j, k, l, m, zs )
487!$OMP  DO
[1106]488       DO  l = 0, pdims(1) - 1
489          m  = l * ( nyn_z - nys_z + 1 ) * ( nzt_y - nzb_y + 1 ) * &
490                   ( nxr_z - nxl_z + 1 )
491          zs = 1 + l * ( nzt_y - nzb_y + 1 )
492          DO  j = nys_z, nyn_z
493             DO  k = zs, zs + nzt_y - nzb_y
494                DO  i = nxl_z, nxr_z
495                   m = m + 1
496                   f_out(i,j,k) = work(m)
497                ENDDO
[1]498             ENDDO
499          ENDDO
500       ENDDO
[683]501!$OMP  END PARALLEL
[1]502#endif
503
[1106]504   ENDIF
505
[1]506 END SUBROUTINE transpose_yz
507
508
[164]509 SUBROUTINE transpose_zx( f_in, work, f_out )
[1]510
511!------------------------------------------------------------------------------!
512! Description:
513! ------------
514! Transposition of input array (f_in) from z to x. For the input array, all
515! elements along z reside on the same PE, while after transposition, all
516! elements along x reside on the same PE.
517!------------------------------------------------------------------------------!
518
519    USE cpulog
520    USE indices
521    USE interfaces
522    USE pegrid
523    USE transpose_indices
524
525    IMPLICIT NONE
526
527    INTEGER ::  i, j, k, l, m, xs
528   
[1106]529    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x), &
[164]530             work(nnx*nny*nnz)
[1]531
[1106]532    !$acc declare create ( f_inv )
533    REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
[1]534
[1106]535
[1]536!
537!-- Rearrange indices of input array in order to make data to be send
538!-- by MPI contiguous
[683]539!$OMP  PARALLEL PRIVATE ( i, j, k )
540!$OMP  DO
[1106]541    !$acc kernels present( f_in )
542    !$acc loop
[1003]543    DO  k = 1,nz
544       DO  i = nxl, nxr
[1106]545          !$acc loop vector( 32 )
[1003]546          DO  j = nys, nyn
[164]547             f_inv(j,i,k) = f_in(k,j,i)
[1]548          ENDDO
549       ENDDO
550    ENDDO
[683]551!$OMP  END PARALLEL
[1]552
553!
554!-- Move data to different array, because memory location of work1 is
555!-- needed further below (work1 = work2).
556!-- If the PE grid is one-dimensional along y, only local reordering
557!-- of the data is necessary and no transposition has to be done.
558    IF ( pdims(1) == 1 )  THEN
[1106]559
[683]560!$OMP  PARALLEL PRIVATE ( i, j, k )
561!$OMP  DO
[1106]562       !$acc kernels present( f_out )
563       !$acc loop
[1003]564       DO  k = 1, nz
565          DO  i = nxl, nxr
[1106]566             !$acc loop vector( 32 )
[1003]567             DO  j = nys, nyn
[164]568                f_out(i,j,k) = f_inv(j,i,k)
[1]569             ENDDO
570          ENDDO
571       ENDDO
[683]572!$OMP  END PARALLEL
[1]573
[1106]574    ELSE
575
576#if defined( __parallel )
[1]577!
[1106]578!--    Transpose array
579       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
580       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
581       CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
582                          work(1),          sendrecvcount_zx, MPI_REAL, &
583                          comm1dx, ierr )
584       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]585
586!
[1106]587!--    Reorder transposed array
[683]588!$OMP  PARALLEL PRIVATE ( i, j, k, l, m, xs )
589!$OMP  DO
[1106]590       DO  l = 0, pdims(1) - 1
591          m  = l * ( nzt_x - nzb_x + 1 ) * nnx * ( nyn_x - nys_x + 1 )
592          xs = 0 + l * nnx
593          DO  k = nzb_x, nzt_x
594             DO  i = xs, xs + nnx - 1
595                DO  j = nys_x, nyn_x
596                   m = m + 1
597                   f_out(i,j,k) = work(m)
598                ENDDO
[1]599             ENDDO
600          ENDDO
601       ENDDO
[683]602!$OMP  END PARALLEL
[1]603#endif
604
[1106]605    ENDIF
606
[1]607 END SUBROUTINE transpose_zx
608
609
[164]610 SUBROUTINE transpose_zy( f_in, work, f_out )
[1]611
612!------------------------------------------------------------------------------!
613! Description:
614! ------------
615! Transposition of input array (f_in) from z to y. For the input array, all
616! elements along z reside on the same PE, while after transposition, all
617! elements along y reside on the same PE.
618!------------------------------------------------------------------------------!
619
620    USE cpulog
621    USE indices
622    USE interfaces
623    USE pegrid
624    USE transpose_indices
625
626    IMPLICIT NONE
627
628    INTEGER ::  i, j, k, l, m, zs
629   
[1003]630    REAL ::  f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz),  &
631             f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), &
632             f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
[164]633             work(nnx*nny*nnz)
[1]634
635!
636!-- If the PE grid is one-dimensional along y, the array has only to be
637!-- reordered locally and therefore no transposition has to be done.
638    IF ( pdims(1) /= 1 )  THEN
[1106]639
640#if defined( __parallel )
[1]641!
642!--    Reorder input array for transposition
[683]643!$OMP  PARALLEL PRIVATE ( i, j, k, l, m, zs )
644!$OMP  DO
[1]645       DO  l = 0, pdims(1) - 1
[1003]646          m  = l * ( nyn_z - nys_z + 1 ) * ( nzt_y - nzb_y + 1 ) * &
647                   ( nxr_z - nxl_z + 1 )
648          zs = 1 + l * ( nzt_y - nzb_y + 1 )
649          DO  j = nys_z, nyn_z
650             DO  k = zs, zs + nzt_y - nzb_y
651                DO  i = nxl_z, nxr_z
[1]652                   m = m + 1
[164]653                   work(m) = f_in(i,j,k)
[1]654                ENDDO
655             ENDDO
656          ENDDO
657       ENDDO
[683]658!$OMP  END PARALLEL
[1]659
660!
661!--    Transpose array
662       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]663       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[164]664       CALL MPI_ALLTOALL( work(1),              sendrecvcount_yz, MPI_REAL, &
[1]665                          f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
666                          comm1dx, ierr )
667       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]668#endif
[1]669
670    ELSE
671!
[1106]672!--    Reorder the array in the same way like ALLTOALL did it
[683]673!$OMP  PARALLEL PRIVATE ( i, j, k )
674!$OMP  DO
[1003]675       DO  k = nzb_y, nzt_y
676          DO  j = 0, ny
677             DO  i = nxl_y, nxr_y
[164]678                f_inv(i,k,j) = f_in(i,j,k)
679             ENDDO
680          ENDDO
681       ENDDO
[683]682!$OMP  END PARALLEL
[1106]683
684    ENDIF
685
[164]686!
[1106]687!-- Reorder transposed array in a way that the y index is in first position
[683]688!$OMP  PARALLEL PRIVATE ( i, j, k )
689!$OMP  DO
[1106]690    DO  k = nzb_y, nzt_y
691       DO  i = nxl_y, nxr_y
692          DO  j = 0, ny
693             f_out(j,i,k) = f_inv(i,k,j)
[1]694          ENDDO
695       ENDDO
[1106]696    ENDDO
[683]697!$OMP  END PARALLEL
[164]698
[1]699 END SUBROUTINE transpose_zy
700
701
[164]702 SUBROUTINE transpose_zyd( f_in, work, f_out )
[1]703
704!------------------------------------------------------------------------------!
705! Description:
706! ------------
707! Transposition of input array (f_in) from z to y. For the input array, all
708! elements along z reside on the same PE, while after transposition, all
709! elements along y reside on the same PE.
710! This is a direct transposition for arrays with indices in regular order
711! (k,j,i) (cf. transpose_zy).
712!------------------------------------------------------------------------------!
713
714    USE cpulog
715    USE indices
716    USE interfaces
717    USE pegrid
718    USE transpose_indices
719
720    IMPLICIT NONE
721
722    INTEGER ::  i, j, k, l, m, ys
723   
[1003]724    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
725             f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd),                 &
[164]726             work(nnx*nny*nnz)
[1]727
728#if defined( __parallel )
729
730!
731!-- Rearrange indices of input array in order to make data to be send
732!-- by MPI contiguous
[1003]733    DO  i = nxl, nxr
734       DO  j = nys, nyn
735          DO  k = 1, nz
[164]736             f_inv(j,i,k) = f_in(k,j,i)
[1]737          ENDDO
738       ENDDO
739    ENDDO
740
741!
742!-- Move data to different array, because memory location of work1 is
743!-- needed further below (work1 = work2).
744!-- If the PE grid is one-dimensional along x, only local reordering
745!-- of the data is necessary and no transposition has to be done.
746    IF ( pdims(2) == 1 )  THEN
[1003]747       DO  k = 1, nz
748          DO  i = nxl, nxr
749             DO  j = nys, nyn
[164]750                f_out(j,i,k) = f_inv(j,i,k)
[1]751             ENDDO
752          ENDDO
753       ENDDO
754       RETURN
755    ENDIF
756
757!
758!-- Transpose array
759    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]760    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
[1]761    CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]762                       work(1),          sendrecvcount_zyd, MPI_REAL, &
[1]763                       comm1dy, ierr )
764    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
765
766!
767!-- Reorder transposed array
768    m = 0
769    DO  l = 0, pdims(2) - 1
770       ys = 0 + l * nny
[1003]771       DO  k = nzb_yd, nzt_yd
772          DO  i = nxl_yd, nxr_yd
[1]773             DO  j = ys, ys + nny - 1
774                m = m + 1
[164]775                f_out(j,i,k) = work(m)
[1]776             ENDDO
777          ENDDO
778       ENDDO
779    ENDDO
780
781#endif
782
783 END SUBROUTINE transpose_zyd
Note: See TracBrowser for help on using the repository browser.