Ignore:
Timestamp:
Aug 26, 2013 9:31:42 AM (8 years ago)
Author:
raasch
Message:

overlapping execution of fft and transpositions (MPI_ALLTOALL), but real overlapping is not activated so far,
fftw implemented for 1D-decomposition
resorting of arrays moved to separate routines resort_for_...
bugfix in mbuild concerning Makefile_check

File:
1 edited

Legend:

Unmodified
Added
Removed
  • palm/trunk/SOURCE/transpose.f90

    r1112 r1216  
    1  SUBROUTINE transpose_xy( f_in, work, f_out )
     1 SUBROUTINE resort_for_xy( f_in, f_inv )
    22
    33!--------------------------------------------------------------------------------!
     
    2020! Current revisions:
    2121! -----------------
    22 !
     22! re-sorting of the transposed / to be transposed arrays moved to separate
     23! routines resort_for_...
    2324!
    2425! Former revisions:
     
    6970! Initial revision
    7071!
    71 !
     72!------------------------------------------------------------------------------!
     73! Description:
     74! ------------
     75! Resorting data for the transposition from x to y. The transposition itself
     76! is carried out in transpose_xy
     77!------------------------------------------------------------------------------!
     78
     79     USE indices
     80     USE transpose_indices
     81
     82     IMPLICIT NONE
     83
     84     REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
     85     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
     86
     87
     88     INTEGER ::  i, j, k
     89
     90!
     91!-- Rearrange indices of input array in order to make data to be sent
     92!-- by MPI contiguous
     93    !$OMP  PARALLEL PRIVATE ( i, j, k )
     94    !$OMP  DO
     95    !$acc kernels present( f_in, f_inv )
     96    !$acc loop
     97     DO  i = 0, nx
     98         DO  k = nzb_x, nzt_x
     99             !$acc loop vector( 32 )
     100             DO  j = nys_x, nyn_x
     101                 f_inv(j,k,i) = f_in(i,j,k)
     102             ENDDO
     103         ENDDO
     104     ENDDO
     105     !$acc end kernels
     106     !$OMP  END PARALLEL
     107
     108 END SUBROUTINE resort_for_xy
     109
     110
     111 SUBROUTINE transpose_xy( f_inv, f_out )
     112
     113!------------------------------------------------------------------------------!
    72114! Description:
    73115! ------------
     
    87129    INTEGER ::  i, j, k, l, ys
    88130   
    89     REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
     131    REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
    90132
    91133    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
    92134
    93     !$acc declare create( f_inv )
    94     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
    95 
    96 
    97 !
    98 !-- Rearrange indices of input array in order to make data to be send
    99 !-- by MPI contiguous
    100 !$OMP  PARALLEL PRIVATE ( i, j, k )
    101 !$OMP  DO
    102     !$acc kernels present( f_in )
    103     !$acc loop
    104     DO  i = 0, nx
    105        DO  k = nzb_x, nzt_x
    106           !$acc loop vector( 32 )
    107           DO  j = nys_x, nyn_x
    108              f_inv(j,k,i) = f_in(i,j,k)
    109           ENDDO
    110        ENDDO
    111     ENDDO
    112     !$acc end kernels
    113 !$OMP  END PARALLEL
    114135
    115136    IF ( numprocs /= 1 )  THEN
     
    124145                          work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
    125146                          comm1dy, ierr )
    126        !$acc update device( work )
    127147       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
    128148
     
    131151!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
    132152!$OMP  DO
     153       !$acc data copyin( work )
    133154       DO  l = 0, pdims(2) - 1
    134155          ys = 0 + l * ( nyn_x - nys_x + 1 )
     
    145166          !$acc end kernels
    146167       ENDDO
     168       !$acc end data
    147169!$OMP  END PARALLEL
    148170#endif
     
    154176!$OMP  PARALLEL PRIVATE ( i, j, k )
    155177!$OMP  DO
    156        !$acc kernels present( f_out )
     178       !$acc kernels present( f_inv, f_out )
    157179       !$acc loop
    158180       DO  k = nzb_y, nzt_y
     
    172194
    173195
    174  SUBROUTINE transpose_xz( f_in, work, f_out )
     196 SUBROUTINE resort_for_xz( f_inv, f_out )
     197
     198!------------------------------------------------------------------------------!
     199! Description:
     200! ------------
     201! Resorting data after the transposition from x to z. The transposition itself
     202! is carried out in transpose_xz
     203!------------------------------------------------------------------------------!
     204
     205     USE indices
     206     USE transpose_indices
     207
     208     IMPLICIT NONE
     209
     210     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
     211     REAL ::  f_out(1:nz,nys:nyn,nxl:nxr)
     212
     213
     214     INTEGER ::  i, j, k
     215
     216!
     217!-- Reorder the transposed array in a way that the z index is in first
     218!-- position.
     219!-- In case of parallel fft/transposition, scattered store is faster in
     220!-- backward direction!!!
     221    !$OMP  PARALLEL PRIVATE ( i, j, k )
     222    !$OMP  DO
     223    !$acc kernels present( f_inv, f_out )
     224    !$acc loop
     225     DO  k = 1, nz
     226         DO  i = nxl, nxr
     227             !$acc loop vector( 32 )
     228             DO  j = nys, nyn
     229                 f_out(k,j,i) = f_inv(j,i,k)
     230             ENDDO
     231         ENDDO
     232     ENDDO
     233     !$acc end kernels
     234     !$OMP  END PARALLEL
     235
     236 END SUBROUTINE resort_for_xz
     237
     238
     239 SUBROUTINE transpose_xz( f_in, f_inv )
    175240
    176241!------------------------------------------------------------------------------!
     
    192257    INTEGER ::  i, j, k, l, xs
    193258   
    194     REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_out(1:nz,nys:nyn,nxl:nxr)
     259    REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_inv(nys:nyn,nxl:nxr,1:nz)
    195260
    196261    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
    197 
    198     !$acc declare create( f_inv )
    199     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
    200262
    201263
     
    210272!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
    211273!$OMP  DO
     274       !$acc data copyout( work )
    212275       DO  l = 0, pdims(1) - 1
    213276          xs = 0 + l * nnx
     
    224287          !$acc end kernels
    225288       ENDDO
     289       !$acc end data
    226290!$OMP  END PARALLEL
    227291
     
    230294       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
    231295       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
    232        !$acc update host( work )
    233296       CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
    234297                          f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
     
    236299       !$acc update device( f_inv )
    237300       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
    238 
    239 !
    240 !--    Reorder transposed array in a way that the z index is in first position
    241 !$OMP  PARALLEL PRIVATE ( i, j, k )
    242 !$OMP  DO
    243        !$acc kernels present( f_out )
    244        !$acc loop
    245        DO  k = 1, nz
    246           DO  i = nxl, nxr
    247              !$acc loop vector( 32 )
    248              DO  j = nys, nyn
    249                 f_out(k,j,i) = f_inv(j,i,k)
    250              ENDDO
    251           ENDDO
    252        ENDDO
    253        !$acc end kernels
    254 !$OMP  END PARALLEL
    255301#endif
    256302
     
    261307!$OMP  PARALLEL PRIVATE ( i, j, k )
    262308!$OMP  DO
    263        !$acc kernels present( f_in )
     309       !$acc kernels present( f_in, f_inv )
    264310       !$acc loop
    265311       DO  i = nxl, nxr
     
    274320!$OMP  END PARALLEL
    275321
    276 !$OMP  PARALLEL PRIVATE ( i, j, k )
    277 !$OMP  DO
    278        !$acc kernels present( f_out )
    279        !$acc loop
    280        DO  k = 1, nz
    281           DO  i = nxl, nxr
    282              !$acc loop vector( 32 )
    283              DO  j = nys, nyn
    284                 f_out(k,j,i) = f_inv(j,i,k)
    285              ENDDO
    286           ENDDO
    287        ENDDO
    288        !$acc end kernels
    289 !$OMP  END PARALLEL
    290 
    291322    ENDIF
    292323
     
    294325
    295326
    296  SUBROUTINE transpose_yx( f_in, work, f_out )
     327 SUBROUTINE resort_for_yx( f_inv, f_out )
     328
     329!------------------------------------------------------------------------------!
     330! Description:
     331! ------------
     332! Resorting data after the transposition from y to x. The transposition itself
     333! is carried out in transpose_yx
     334!------------------------------------------------------------------------------!
     335
     336     USE indices
     337     USE transpose_indices
     338
     339     IMPLICIT NONE
     340
     341     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
     342     REAL ::  f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
     343
     344
     345     INTEGER ::  i, j, k
     346
     347!
     348!-- Reorder the transposed array in a way that the x index is in first
     349!-- position
     350    !$OMP  PARALLEL PRIVATE ( i, j, k )
     351    !$OMP  DO
     352    !$acc kernels present( f_inv, f_out )
     353    !$acc loop
     354     DO  i = 0, nx
     355         DO  k = nzb_x, nzt_x
     356             !$acc loop vector( 32 )
     357             DO  j = nys_x, nyn_x
     358                 f_out(i,j,k) = f_inv(j,k,i)
     359             ENDDO
     360         ENDDO
     361     ENDDO
     362     !$acc end kernels
     363     !$OMP  END PARALLEL
     364
     365 END SUBROUTINE resort_for_yx
     366
     367
     368 SUBROUTINE transpose_yx( f_in, f_inv )
    297369
    298370!------------------------------------------------------------------------------!
     
    314386    INTEGER ::  i, j, k, l, ys
    315387   
    316     REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
     388    REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
    317389
    318390    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
    319 
    320     !$acc declare create( f_inv )
    321     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
    322391
    323392
     
    329398!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
    330399!$OMP  DO
     400       !$acc data copyout( work )
    331401       DO  l = 0, pdims(2) - 1
    332402          ys = 0 + l * ( nyn_x - nys_x + 1 )
     
    343413          !$acc end kernels
    344414       ENDDO
     415       !$acc end data
    345416!$OMP  END PARALLEL
    346417
     
    349420       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
    350421       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
    351        !$acc update host( work )
    352422       CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
    353423                          f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
     
    363433!$OMP  PARALLEL PRIVATE ( i, j, k )
    364434!$OMP  DO
    365        !$acc kernels present( f_in )
     435       !$acc kernels present( f_in, f_inv )
    366436       !$acc loop
    367437       DO  i = nxl_y, nxr_y
     
    378448    ENDIF
    379449
    380 !
    381 !-- Reorder transposed array in a way that the x index is in first position
    382 !$OMP  PARALLEL PRIVATE ( i, j, k )
    383 !$OMP  DO
    384     !$acc kernels present( f_out )
    385     !$acc loop
    386     DO  i = 0, nx
    387        DO  k = nzb_x, nzt_x
    388           !$acc loop vector( 32 )
    389           DO  j = nys_x, nyn_x
    390              f_out(i,j,k) = f_inv(j,k,i)
    391           ENDDO
    392        ENDDO
    393     ENDDO
    394     !$acc end kernels
    395 !$OMP  END PARALLEL
    396 
    397450 END SUBROUTINE transpose_yx
    398451
    399452
    400  SUBROUTINE transpose_yxd( f_in, work, f_out )
     453 SUBROUTINE transpose_yxd( f_in, f_out )
    401454
    402455!------------------------------------------------------------------------------!
     
    466519
    467520
    468  SUBROUTINE transpose_yz( f_in, work, f_out )
     521 SUBROUTINE resort_for_yz( f_in, f_inv )
     522
     523!------------------------------------------------------------------------------!
     524! Description:
     525! ------------
     526! Resorting data for the transposition from y to z. The transposition itself
     527! is carried out in transpose_yz
     528!------------------------------------------------------------------------------!
     529
     530     USE indices
     531     USE transpose_indices
     532
     533     IMPLICIT NONE
     534
     535     REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
     536     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
     537
     538
     539     INTEGER ::  i, j, k
     540
     541!
     542!-- Rearrange indices of input array in order to make data to be sent
     543!-- by MPI contiguous
     544    !$OMP  PARALLEL PRIVATE ( i, j, k )
     545    !$OMP  DO
     546    !$acc kernels present( f_in, f_inv )
     547    !$acc loop
     548     DO  j = 0, ny
     549         DO  k = nzb_y, nzt_y
     550             !$acc loop vector( 32 )
     551             DO  i = nxl_y, nxr_y
     552                 f_inv(i,k,j) = f_in(j,i,k)
     553             ENDDO
     554         ENDDO
     555     ENDDO
     556     !$acc end kernels
     557     !$OMP  END PARALLEL
     558
     559 END SUBROUTINE resort_for_yz
     560
     561
     562 SUBROUTINE transpose_yz( f_inv, f_out )
    469563
    470564!------------------------------------------------------------------------------!
     
    486580    INTEGER ::  i, j, k, l, zs
    487581   
    488     REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
     582    REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
    489583
    490584    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
    491585
    492     !$acc declare create( f_inv )
    493     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
    494 
    495 
    496 !
    497 !-- Rearrange indices of input array in order to make data to be send
    498 !-- by MPI contiguous
    499 !$OMP  PARALLEL PRIVATE ( i, j, k )
    500 !$OMP  DO
    501     !$acc kernels present( f_in )
    502     !$acc loop
    503     DO  j = 0, ny
    504        DO  k = nzb_y, nzt_y
    505           !$acc loop vector( 32 )
    506           DO  i = nxl_y, nxr_y
    507              f_inv(i,k,j) = f_in(j,i,k)
    508           ENDDO
    509        ENDDO
    510     ENDDO
    511     !$acc end kernels
    512 !$OMP  END PARALLEL
    513 
    514 !
    515 !-- Move data to different array, because memory location of work1 is
    516 !-- needed further below (work1 = work2).
     586
     587!
    517588!-- If the PE grid is one-dimensional along y, only local reordering
    518589!-- of the data is necessary and no transposition has to be done.
     
    521592!$OMP  PARALLEL PRIVATE ( i, j, k )
    522593!$OMP  DO
    523        !$acc kernels present( f_out )
     594       !$acc kernels present( f_inv, f_out )
    524595       !$acc loop
    525596       DO  j = 0, ny
     
    545616                          work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
    546617                          comm1dx, ierr )
    547        !$acc update device( work )
    548618       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
    549619
     
    552622!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
    553623!$OMP  DO
     624       !$acc data copyin( work )
    554625       DO  l = 0, pdims(1) - 1
    555626          zs = 1 + l * ( nzt_y - nzb_y + 1 )
    556           !$acc kernels present( f_out, work )
     627          !$acc kernels present( f_out )
    557628          !$acc loop
    558629          DO  j = nys_z, nyn_z
     
    566637          !$acc end kernels
    567638       ENDDO
     639       !$acc end data
    568640!$OMP  END PARALLEL
    569641#endif
     
    574646
    575647
    576  SUBROUTINE transpose_zx( f_in, work, f_out )
     648 SUBROUTINE resort_for_zx( f_in, f_inv )
     649
     650!------------------------------------------------------------------------------!
     651! Description:
     652! ------------
     653! Resorting data for the transposition from z to x. The transposition itself
     654! is carried out in transpose_zx
     655!------------------------------------------------------------------------------!
     656
     657     USE indices
     658     USE transpose_indices
     659
     660     IMPLICIT NONE
     661
     662     REAL ::  f_in(1:nz,nys:nyn,nxl:nxr)
     663     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
     664
     665
     666     INTEGER ::  i, j, k
     667
     668!
     669!-- Rearrange indices of input array in order to make data to be sent
     670!-- by MPI contiguous
     671    !$OMP  PARALLEL PRIVATE ( i, j, k )
     672    !$OMP  DO
     673    !$acc kernels present( f_in, f_inv )
     674    !$acc loop
     675     DO  k = 1,nz
     676         DO  i = nxl, nxr
     677             !$acc loop vector( 32 )
     678             DO  j = nys, nyn
     679                 f_inv(j,i,k) = f_in(k,j,i)
     680             ENDDO
     681         ENDDO
     682     ENDDO
     683     !$acc end kernels
     684     !$OMP  END PARALLEL
     685
     686 END SUBROUTINE resort_for_zx
     687
     688
     689 SUBROUTINE transpose_zx( f_inv, f_out )
    577690
    578691!------------------------------------------------------------------------------!
     
    594707    INTEGER ::  i, j, k, l, xs
    595708   
    596     REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
     709    REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
    597710
    598711    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
    599712
    600     !$acc declare create( f_inv )
    601     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
    602 
    603 
    604 !
    605 !-- Rearrange indices of input array in order to make data to be send
    606 !-- by MPI contiguous
    607 !$OMP  PARALLEL PRIVATE ( i, j, k )
    608 !$OMP  DO
    609     !$acc kernels present( f_in )
    610     !$acc loop
    611     DO  k = 1,nz
    612        DO  i = nxl, nxr
    613           !$acc loop vector( 32 )
    614           DO  j = nys, nyn
    615              f_inv(j,i,k) = f_in(k,j,i)
    616           ENDDO
    617        ENDDO
    618     ENDDO
    619     !$acc end kernels
    620 !$OMP  END PARALLEL
    621 
    622 !
    623 !-- Move data to different array, because memory location of work1 is
    624 !-- needed further below (work1 = work2).
     713
     714!
    625715!-- If the PE grid is one-dimensional along y, only local reordering
    626716!-- of the data is necessary and no transposition has to be done.
     
    629719!$OMP  PARALLEL PRIVATE ( i, j, k )
    630720!$OMP  DO
    631        !$acc kernels present( f_out )
     721       !$acc kernels present( f_inv, f_out )
    632722       !$acc loop
    633723       DO  k = 1, nz
     
    653743                          work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
    654744                          comm1dx, ierr )
    655        !$acc update device( work )
    656745       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
    657746
     
    660749!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
    661750!$OMP  DO
     751       !$acc data copyin( work )
    662752       DO  l = 0, pdims(1) - 1
    663753          xs = 0 + l * nnx
    664           !$acc kernels present( f_out, work )
     754          !$acc kernels present( f_out )
    665755          !$acc loop
    666756          DO  k = nzb_x, nzt_x
     
    674764          !$acc end kernels
    675765       ENDDO
     766       !$acc end data
    676767!$OMP  END PARALLEL
    677768#endif
     
    682773
    683774
    684  SUBROUTINE transpose_zy( f_in, work, f_out )
     775 SUBROUTINE resort_for_zy( f_inv, f_out )
     776
     777!------------------------------------------------------------------------------!
     778! Description:
     779! ------------
     780! Resorting data after the transposition from z to y. The transposition itself
     781! is carried out in transpose_zy
     782!------------------------------------------------------------------------------!
     783
     784     USE indices
     785     USE transpose_indices
     786
     787     IMPLICIT NONE
     788
     789     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
     790     REAL ::  f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
     791
     792
     793     INTEGER ::  i, j, k
     794
     795!
     796!-- Reorder the transposed array in a way that the y index is in first
     797!-- position
     798    !$OMP  PARALLEL PRIVATE ( i, j, k )
     799    !$OMP  DO
     800    !$acc kernels present( f_inv, f_out )
     801    !$acc loop
     802     DO  k = nzb_y, nzt_y
     803         DO  j = 0, ny
     804             !$acc loop vector( 32 )
     805             DO  i = nxl_y, nxr_y
     806                 f_out(j,i,k) = f_inv(i,k,j)
     807             ENDDO
     808         ENDDO
     809     ENDDO
     810     !$acc end kernels
     811     !$OMP  END PARALLEL
     812
     813 END SUBROUTINE resort_for_zy
     814
     815
     816 SUBROUTINE transpose_zy( f_in, f_inv )
    685817
    686818!------------------------------------------------------------------------------!
     
    702834    INTEGER ::  i, j, k, l, zs
    703835   
    704     REAL ::  f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
     836    REAL ::  f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
    705837
    706838    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
    707 
    708     !$acc declare create( f_inv )
    709     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
    710839
    711840
     
    720849!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
    721850!$OMP  DO
     851       !$acc data copyout( work )
    722852       DO  l = 0, pdims(1) - 1
    723853          zs = 1 + l * ( nzt_y - nzb_y + 1 )
     
    734864          !$acc end kernels
    735865       ENDDO
     866       !$acc end data
    736867!$OMP  END PARALLEL
    737868
     
    740871       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
    741872       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
    742        !$acc update host( work )
    743873       CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
    744874                          f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
     
    753883!$OMP  PARALLEL PRIVATE ( i, j, k )
    754884!$OMP  DO
    755        !$acc kernels present( f_in )
     885       !$acc kernels present( f_in, f_inv )
    756886       !$acc loop
    757887       DO  k = nzb_y, nzt_y
     
    768898    ENDIF
    769899
    770 !
    771 !-- Reorder transposed array in a way that the y index is in first position
    772 !$OMP  PARALLEL PRIVATE ( i, j, k )
    773 !$OMP  DO
    774     !$acc kernels present( f_out )
    775     !$acc loop
    776     DO  k = nzb_y, nzt_y
    777        DO  i = nxl_y, nxr_y
    778           !$acc loop vector( 32 )
    779           DO  j = 0, ny
    780              f_out(j,i,k) = f_inv(i,k,j)
    781           ENDDO
    782        ENDDO
    783     ENDDO
    784     !$acc end kernels
    785 !$OMP  END PARALLEL
    786 
    787900 END SUBROUTINE transpose_zy
    788901
    789902
    790  SUBROUTINE transpose_zyd( f_in, work, f_out )
     903 SUBROUTINE transpose_zyd( f_in, f_out )
    791904
    792905!------------------------------------------------------------------------------!
Note: See TracChangeset for help on using the changeset viewer.